def save_histogram(samples, param_names=None, info_path=InfoPath(),
                   histogram_params=HistogramParams(),
                   summary_params=SummaryParams()):
    """
    Make histograms for the parameters from posterior distribution
    and save them via `save_histogram_from_summary`.

    Parameters
    ----------
    samples : Pandas DataFrame
        Each column contains samples from posterior distribution.
    param_names : list of str
        Names of the parameters for plotting. If None, all will be plotted.
    info_path : InfoPath
        Path information, forwarded to `save_histogram_from_summary`.
    histogram_params : HistogramParams
        Histogram settings, forwarded to `save_histogram_from_summary`.
    summary_params : SummaryParams
        Settings used both to compute the summary and to draw it.
    """

    info_path.set_codefile()

    # Compute the summary with the same `summary_params` that are handed to
    # the plotting code: the histogram code looks up HPDI columns named after
    # `summary_params.hpdi_percent()`, so a summary built with other settings
    # may not contain them.
    df_summary, _ = sample_summary(df=samples, params=summary_params)

    save_histogram_from_summary(samples, df_summary,
                                param_names=param_names,
                                info_path=info_path,
                                histogram_params=histogram_params,
                                summary_params=summary_params)
def print_summary(fit, param_names=None, summary_params=SummaryParams()):
    """
    Print the statistical summary of the samples (mean, std, mode, hpdi)
    and return it together with the samples.

    Parameters
    ----------
    fit : cmdstanpy.stanfit.CmdStanMCMC
        Contains the samples from cmdstanpy.
    param_names : list of str
        Names of parameters to be included in the summary.
        Include all if None.
    summary_params : SummaryParams
        Forwarded to `make_summary`.

    Returns
    -------
    dict:
        df: Pandas data frame containing the summary
        table: str
            Summary table in text format (the text that is printed).
        samples: Pandas data frame
            Combined samples from all chains.
    """

    made = make_summary(fit, param_names=param_names,
                        summary_params=summary_params)
    df_summary, table, samples = made

    print(table)

    result = {}
    result["df"] = df_summary
    result["table"] = table
    result["samples"] = samples
    return result
def save_analysis(samples, param_names=None, info_path=InfoPath(),
                  summary_params=SummaryParams()):
    """
    Create all analysis outputs at once: summary, tree plot,
    histograms and pair plot.

    Parameters
    ----------
    samples : Pandas DataFrame
        Each column contains samples from posterior distribution.
    param_names : list of str
        Names of parameters to plot. Plot all parameters if None.
    info_path : InfoPath
        Path information, forwarded to every step.
    summary_params : SummaryParams
        Forwarded to the summary, tree plot and histogram steps.
    """

    info_path.set_codefile()

    # Keyword arguments shared by every step below
    common = {"param_names": param_names, "info_path": info_path}

    summary = save_summary(samples, summary_params=summary_params, **common)
    df_summary = summary['df']

    make_tree_plot(df_summary, summary_params=summary_params, **common)

    save_histogram_from_summary(samples, df_summary,
                                summary_params=summary_params, **common)

    save_pair_plot(samples, **common)
def save_histogram(fit, param_names=None, info_path=InfoPath(),
                   summary_params=SummaryParams(),
                   histogram_params=HistogramParams()):
    """
    Save histograms of parameter distributions for a cmdstanpy fit.

    Parameters
    ----------
    fit : cmdstanpy.stanfit.CmdStanMCMC
        Samples from cmdstanpy.
    param_names : list of str
        Names of parameters to be included in the summary.
        Include all if None.
    info_path : InfoPath
        Path information for creating summaries.
    summary_params : SummaryParams
        Forwarded to `make_summary` and to the histogram code.
    histogram_params : HistogramParams
        Forwarded to `save_histogram_from_summary`.
    """

    info_path.set_codefile()

    # The text table (second item) is not needed for plotting
    df_summary, _, samples = make_summary(fit,
                                          param_names=param_names,
                                          summary_params=summary_params)

    plot_options = {
        "param_names": param_names,
        "info_path": info_path,
        "summary_params": summary_params,
        "histogram_params": histogram_params,
    }

    save_histogram_from_summary(samples, df_summary, **plot_options)
def save_compare_parameters(
        models,
        labels,
        extra_values=[],
        type: CompareParametersType = CompareParametersType.TEXT,
        param_names=None,
        info_path=InfoPath(),
        summary_params=SummaryParams()):
    """
    Write a text table comparing model parameters to a file.

    Parameters
    ----------
    models : list of Pandas data frames
        One data frame per model, containing sample values for multiple
        parameters (one parameter is one data frame column).
        Supply multiple data frames to compare parameter distributions.
    labels : list of str
        Names of the models in `models` list.
    extra_values : list of dict
        Additional values to be shown in the table:
        [
            {
                "mu": 2.3,
                "sigma": 3.3
            }
        ]
    type : CompareParametersType
        Format of values in the text table.
    param_names : list of str
        Names of parameters. Include all if None.
    info_path : InfoPath
        Path information for creating summaries. The file base name
        defaults to "parameters_compared"; the extension is always 'txt'.
    summary_params : SummaryParams
        Forwarded to `compare_parameters`.
    """

    info_path.set_codefile()

    # Only the text form of the comparison is written out
    _, table = compare_parameters(
        models=models,
        labels=labels,
        extra_values=extra_values,
        type=type,
        param_names=param_names,
        summary_params=summary_params)

    # Work on a copy so the caller's InfoPath object keeps its settings
    info_path = InfoPath(**info_path.__dict__)
    info_path.base_name = info_path.base_name or "parameters_compared"
    info_path.extension = 'txt'

    with open(get_info_path(info_path), "w") as out_file:
        out_file.write(f"{table}\n")
def save_tree_plot(models, extra_values=[], param_names=None,
                   info_path=InfoPath(),
                   summary_params=SummaryParams(),
                   tree_params=TreePlotParams()):
    """
    Save a tree plot that summarises parameter distributions.
    Can compare summaries from multiple models, when multiple samples are
    supplied. One can also supply additional markers to be compared with,
    using the `extra_values` parameter.

    Parameters
    ----------
    models : list of Pandas data frames
        One data frame per model, containing sample values for multiple
        parameters (one parameter is one data frame column).
        Supply multiple data frames to see their distribution summaries
        compared on the tree plot.
    extra_values : list of dict
        Additional markers to be shown on tree plot, without error bars:
        [
            {
                "mu": 2.3,
                "sigma": 3.3
            }
        ]
    param_names : list of str
        Names of parameters. Include all if None.
    info_path : InfoPath
        Path information for creating summaries.
    summary_params : SummaryParams
        Forwarded to `sample_summary`.
    tree_params : TreePlotParams
        Forwarded to `make_comparative_tree_plot`.
    """

    info_path.set_codefile()
    summaries = []

    for samples in models:
        # Resolve the requested names against this model's columns and keep
        # only those columns, so that `param_names` actually limits what is
        # summarised and plotted. A local name is used so the caller's
        # filter is applied unchanged to every model.
        selected = filter_param_names(list(samples), param_names)
        summary, _ = sample_summary(samples[selected], params=summary_params)
        summaries.append(summary)

    # Extra markers are appended after the model summaries
    for values in extra_values:
        summaries.append(summary_from_dict(values))

    make_comparative_tree_plot(summaries, info_path=info_path,
                               tree_params=tree_params)
def save_tree_plot(fits, extra_values=[], param_names=None,
                   info_path=InfoPath(),
                   summary_params=SummaryParams(),
                   tree_params=TreePlotParams()):
    """
    Save a tree plot summarising parameter distributions of cmdstanpy fits.

    Several fits can be supplied to compare their summaries on one plot.
    Additional markers to compare against are given with `extra_values`.

    Parameters
    ----------
    fits : list of cmdstanpy.stanfit.CmdStanMCMC
        Contains the samples from cmdstanpy.
    extra_values : list of dict
        Additional markers to be shown on tree plot, without error bars:
        [
            {
                "mu": 2.3,
                "sigma": 3.3
            }
        ]
    param_names : list of str
        Names of parameters. Include all if None.
    info_path : InfoPath
        Path information for creating summaries.
    summary_params : SummaryParams
        Forwarded to `sample_summary`.
    tree_params : TreePlotParams
        Forwarded to `make_comparative_tree_plot`.
    """

    info_path.set_codefile()

    summaries = []

    for fit in fits:
        # NOTE(review): `param_names` is rebound on every iteration, so the
        # names resolved for one fit become the filter for the next one.
        # Confirm this is intended.
        param_names = filter_param_names(fit.column_names, param_names)
        draws = fit.get_drawset(params=param_names)
        df_summary, _ = sample_summary(draws, params=summary_params)
        summaries.append(df_summary)

    # Extra markers come after the summaries of the fits
    summaries.extend(summary_from_dict(marker) for marker in extra_values)

    make_comparative_tree_plot(
        summaries,
        info_path=info_path,
        tree_params=tree_params)
def make_histograms(samples, summary, param_names=None,
                    params=HistogramParams(),
                    summary_params=SummaryParams()):
    """
    Make one or more pages of histograms for the parameters
    from posterior distribution.

    Parameters
    ----------
    samples : Pandas DataFrame
        Each column contains samples from posterior distribution.
    summary : Pandas DataFrame
        Summary information about each column.
    param_names : list of str
        Names of the parameters for plotting. If None, all will be plotted.
    params : HistogramParams
        Page layout: `ncols`, `num_plot_rows` and `max_plot_pages` are
        read here; the object is also forwarded to each page.
    summary_params : SummaryParams
        Forwarded to `make_histogram_one_page`.

    Returns
    -------
    list of [fig, ax]
        One entry per page, as returned by `make_histogram_one_page`.
    """

    param_names = filter_param_names(samples.columns, param_names)

    # Number of pages: rows needed for all parameters, split into pages
    total_rows = math.ceil(len(param_names) / params.ncols)
    n_pages = math.ceil(total_rows / params.num_plot_rows)

    if n_pages > params.max_plot_pages:
        message = (
            f'Showing only first {params.max_plot_pages} '
            f'pages out of {n_pages} of histogram.'
            'Consider specifying "param_names".')
        print(message)
        n_pages = params.max_plot_pages

    # Always produce at least one page
    n_pages = max(n_pages, 1)

    params_per_page = params.num_plot_rows * params.ncols
    pages = []

    for page_index in range(n_pages):
        fig, ax = make_histogram_one_page(
            i_start=page_index * params_per_page,
            samples=samples,
            summary=summary,
            param_names=param_names,
            params=params,
            summary_params=summary_params)

        pages.append([fig, ax])

    return pages
def test_print_summary(capsys):
    """
    `print_summary` prints the table and returns the data frame,
    the table text and the samples.
    """
    fit = get_fit()
    custom_hpdis = SummaryParams(hpdis=[0.05, 0.99])

    result = print_summary(fit,
                           param_names=["mu", "tau", 'eta.1'],
                           summary_params=custom_hpdis)

    printed = capsys.readouterr().out
    assert "8.05" in printed

    df_summary = result["df"]
    assert df_summary.shape == (3, 11)

    assert "8.05" in result["table"]

    samples = result["samples"]
    assert samples.shape == (4000, 3)
def save_summary(fit, param_names=None, info_path=InfoPath(),
                 summary_params=SummaryParams()):
    """
    Save the statistical summary of the samples (mean, std, mode, hpdi)
    to disk and return it.

    Parameters
    ----------
    fit : cmdstanpy.stanfit.CmdStanMCMC
        Contains the samples from cmdstanpy.
    param_names : list of str
        Names of parameters to be included in the summary.
        Include all if None.
    info_path : InfoPath
        Path information for creating summaries.
    summary_params : SummaryParams
        Forwarded to `make_summary`.

    Returns
    -------
    dict:
        df: Pandas data frame containing the summary
        table: str
            Summary table in text format.
        samples: Pandas data frame
            Combined samples from all chains
        path_txt: str
            Path to the text summary
        path_csv: str
            Path to summary in CSV format
    """

    info_path.set_codefile()

    # Hand a copy to the saving code, leaving the caller's object alone
    path_copy = InfoPath(**info_path.__dict__)

    df_summary, table, samples = make_summary(
        fit,
        param_names=param_names,
        summary_params=summary_params)

    saved = save_summary_to_disk(df_summary, table, info_path=path_copy)

    result = {"df": df_summary, "table": table, "samples": samples}
    result["path_txt"] = saved["path_txt"]
    result["path_csv"] = saved["path_csv"]
    return result
def test_save_summary_specify_hpdi():
    """
    `save_summary` with custom HPDI values creates both the text
    and the CSV summary files.
    """
    fit = get_fit()

    outdir = "tarpan/cmdstanpy/model_info/summary_test"

    # Start from a clean state so the checks below see fresh files
    if os.path.isdir(outdir):
        shutil.rmtree(outdir)

    custom_hpdis = SummaryParams(hpdis=[0.05, 0.99])
    save_summary(fit, param_names=["mu", "tau", 'eta.1'],
                 summary_params=custom_hpdis)

    txt_path = os.path.join(outdir, "summary.txt")
    assert os.path.isfile(txt_path)

    csv_path = os.path.join(outdir, "summary.csv")
    assert os.path.isfile(csv_path)
def run_model():
    """
    Sample the eight schools model and save a summary of
    `mu`, `tau` and `eta.1` with custom HPDI values.
    """
    model = CmdStanModel(stan_file="eight_schools.stan")

    n_schools = 8
    effects = [28, 8, -3, 7, -1, 1, 18, 12]
    std_errors = [15, 10, 16, 11, 9, 11, 10, 18]
    data = {"J": n_schools, "y": effects, "sigma": std_errors}

    sampler_settings = {
        "chains": 4,
        "cores": 4,
        "seed": 1,
        "sampling_iters": 1000,
        "warmup_iters": 1000,
    }

    fit = model.sample(data=data, **sampler_settings)

    # Make summary with custom HPDI values
    custom_hpdis = SummaryParams(hpdis=[0.05, 0.99])
    save_summary(fit, param_names=['mu', 'tau', 'eta.1'],
                 summary_params=custom_hpdis)
def save_analysis(fit, param_names=None, info_path=InfoPath(),
                  summary_params=SummaryParams()):
    """
    Create all analysis outputs at once: diagnostic, summary, tree plot,
    trace plot, histograms and pair plot.

    Parameters
    ----------
    fit : cmdstanpy.stanfit.CmdStanMCMC
        Contains the samples from cmdstanpy.
    param_names : list of str
        Names of parameters to plot. Plot all parameters if None.
    info_path : InfoPath
        Path information, forwarded to every step.
    summary_params : SummaryParams
        Forwarded to the summary, tree plot and histogram steps.
    """

    info_path.set_codefile()

    # Keyword arguments shared by the parameter-specific steps
    common = {"param_names": param_names, "info_path": info_path}

    save_diagnostic(fit, info_path=info_path)

    summary = save_summary(fit, summary_params=summary_params, **common)
    df_summary = summary['df']
    samples = summary['samples']

    make_tree_plot(df_summary, summary_params=summary_params, **common)

    save_traceplot(fit, **common)

    save_histogram_from_summary(samples, df_summary,
                                summary_params=summary_params, **common)

    save_pair_plot(samples, **common)
def make_tree_plot(df_summary, param_names=None, info_path=InfoPath(),
                   tree_params: TreePlotParams = TreePlotParams(),
                   summary_params=SummaryParams()):
    """
    Make a tree plot of parameters from a summary and save it to a file.

    Parameters
    ----------
    df_summary : Pandas data frame
        Summary of the parameters, passed to `extract_tree_plot_data`.
    param_names : list of str
        Names of parameters to plot, passed to `extract_tree_plot_data`.
    info_path : InfoPath
        Path information. The base name defaults to 'summary' and the
        extension to 'pdf' when they are not set.
    tree_params : TreePlotParams
        Forwarded to `tree_plot`.
    summary_params : SummaryParams
        Forwarded to `extract_tree_plot_data`.
    """

    # Defaults are filled in on a copy, not on the caller's object
    out_path = InfoPath(**info_path.__dict__)

    plot_data = extract_tree_plot_data(
        df_summary,
        param_names=param_names,
        summary_params=summary_params)

    fig, _ = tree_plot(plot_data, params=tree_params)

    out_path.base_name = out_path.base_name or 'summary'
    out_path.extension = out_path.extension or 'pdf'

    fig.savefig(get_info_path(out_path), dpi=out_path.dpi)

    # Release the figure once it is written
    plt.close(fig)
def make_summary(fit, param_names=None, summary_params=SummaryParams()):
    """
    Returns statistical summary table for parameters:
    mean, std, mode, hpdi, together with N_Eff and R_hat taken
    from the fit's own summary.

    Parameters
    ----------
    fit : cmdstanpy.stanfit.CmdStanMCMC
        Contains the samples from cmdstanpy.
    param_names : list of str
        Names of parameters to be included in the summary.
        Include all if None.
    summary_params : SummaryParams
        Forwarded to `sample_summary`.

    Returns
    -------
    tuple (df_summary, table, samples)
        df_summary, table : the two values returned by `sample_summary`.
        samples : Pandas data frame
            Draws returned by `fit.get_drawset` for the selected parameters.
    """

    param_names = filter_param_names(fit.column_names, param_names)
    samples = fit.get_drawset(params=param_names)

    # Get N_Eff and R_hat values from the fit's summary
    # --------

    # Index names like 'eta[1]' are converted to 'eta.1'.
    # A new frame is returned instead of renaming in place, so the object
    # handed out by `fit.summary()` is left untouched.
    diagnostics = fit.summary().rename(
        index=lambda name: name.replace('[', '.').replace(']', ''))

    # Take an explicit copy of the two columns: assigning to a column of a
    # frame sliced from another frame triggers pandas' chained-assignment
    # warning.
    diagnostics = diagnostics[['N_Eff', 'R_hat']].copy()
    diagnostics['N_Eff'] = np.round(diagnostics['N_Eff']).astype(int)

    # Get the summary
    df_summary, table = sample_summary(df=samples,
                                       extra_values=diagnostics,
                                       params=summary_params)

    return df_summary, table, samples
def save_histogram_from_summary(samples, summary, param_names=None,
                                info_path=InfoPath(),
                                histogram_params=HistogramParams(),
                                summary_params=SummaryParams()):
    """
    Make histograms for the parameters from posterior distribution
    and save each page to its own file.

    Parameters
    ----------
    samples : Pandas DataFrame
        Each column contains samples from posterior distribution.
    summary : Pandas DataFrame
        Summary information about each column.
    param_names : list of str
        Names of the parameters for plotting. If None, all will be plotted.
    info_path : InfoPath
        Path information. The base name defaults to "histogram" and gets
        a two-digit page suffix ("_01", "_02", ...); the extension
        defaults to 'pdf'.
    histogram_params : HistogramParams
        Forwarded to `make_histograms`.
    summary_params : SummaryParams
        Forwarded to `make_histograms`.
    """

    # Page-specific names are written to a copy of the caller's object
    out_path = InfoPath(**info_path.__dict__)

    pages = make_histograms(
        samples, summary,
        param_names=param_names,
        params=histogram_params,
        summary_params=summary_params)

    name_prefix = out_path.base_name or "histogram"
    out_path.extension = out_path.extension or 'pdf'

    for page_number, page in enumerate(pages, start=1):
        out_path.base_name = f'{name_prefix}_{page_number:02d}'
        plot_path = get_info_path(out_path)

        # Each page is a [fig, ax] pair; only the figure is needed here
        figure = page[0]
        figure.savefig(plot_path, dpi=out_path.dpi)
        plt.close(figure)
def save_compare_parameters(
        fits,
        labels,
        extra_values=[],
        param_names=None,
        type: CompareParametersType = CompareParametersType.TEXT,
        info_path=InfoPath(),
        summary_params=SummaryParams()):
    """
    Save a text table that compares parameters of cmdstanpy fits.

    Parameters
    ----------
    fits : list of cmdstanpy.stanfit.CmdStanMCMC
        Contains the samples from cmdstanpy.
    labels : list of str
        Names of the models in `fits` list.
    extra_values : list of dict
        Additional values to be shown in the table:
        [
            {
                "mu": 2.3,
                "sigma": 3.3
            }
        ]
    param_names : list of str
        Names of parameters. Include all if None.
    type : CompareParametersType
        Format of values in the text table.
    info_path : InfoPath
        Path information for creating summaries.
    summary_params : SummaryParams
        Forwarded to `shared_save_compare_parameters`.
    """

    info_path.set_codefile()

    models = []

    for fit in fits:
        # `param_names` is rebound on each pass: the names resolved for one
        # fit are the filter for the next, and the final value is what gets
        # forwarded below.
        param_names = filter_param_names(fit.column_names, param_names)
        models.append(fit.get_drawset(params=param_names))

    shared_save_compare_parameters(
        models,
        labels=labels,
        extra_values=extra_values,
        type=type,
        param_names=param_names,
        info_path=info_path,
        summary_params=summary_params)
def compare_parameters(
        models,
        labels,
        extra_values=[],
        type: CompareParametersType = CompareParametersType.TEXT,
        param_names=None,
        summary_params=SummaryParams()):
    """
    Create a table that compares model parameters.

    Parameters
    ----------
    models : list of Pandas data frames
        One data frame per model, containing sample values for multiple
        parameters (one parameter is one data frame column).
        Supply multiple data frames to compare parameter distributions.
    labels : list of str
        Row labels: first the names of the models in `models`,
        followed by one name for each item of `extra_values`.
    extra_values : list of dict
        Additional values to be shown in the table:
        [
            {
                "mu": 2.3,
                "sigma": 3.3
            }
        ]
    type : CompareParametersType
        Format of values in the text table.
    param_names : list of str
        Names of parameters. Include all if None.
    summary_params : SummaryParams
        Forwarded to `sample_summary`.

    Returns
    -------
    df : Pandas data frame
        Table in Pandas format
    txt : str
        Table in text format

    Raises
    ------
    ValueError
        If `models` is empty, or if the number of labels differs from
        the number of models plus extra values.
    """

    if len(models) == 0:
        raise ValueError('Models list is empty')

    if (len(models) + len(extra_values)) != len(labels):
        raise ValueError('Models list length is different from labels')

    # The parameter list is resolved against the columns of the first model
    # and then used for every model and for the extra values.
    first_columns = list(models[0])
    param_names = filter_param_names(first_columns, param_names)
    df = pd.DataFrame(index=labels, columns=param_names)
    param_names_filtered = filter_param_names(first_columns, param_names)

    # One row per model
    for samples, label in zip(models, labels):
        df_summary, _ = sample_summary(samples[param_names_filtered],
                                       params=summary_params)

        df.loc[label] = [
            format_parameter(df_summary.loc[name], type)
            for name in param_names_filtered
        ]

    # Add extra values
    # ---------------

    # The labels after the model labels belong to the extra values
    extra_labels = labels[len(models):]

    for data, label in zip(extra_values, extra_labels):
        # Parameters missing from the dictionary are shown as empty cells
        df.loc[label] = [
            format_value(data[name], type) if name in data else ''
            for name in param_names_filtered
        ]

    table = tabulate(df, headers=param_names_filtered,
                     tablefmt="pipe", stralign="right")

    return df, table
def make_histogram_one_page(i_start, samples, summary, param_names,
                            params: HistogramParams,
                            summary_params=SummaryParams()):
    """
    Make a single figure (one page) with histograms for the parameters
    from posterior distribution.

    Parameters
    ----------
    i_start : int
        Index in `param_names` of the first parameter shown on this page.
    samples : Pandas DataFrame
        Each column contains samples from posterior distribution.
    summary : Pandas DataFrame
        Summary information about each column. For each parameter the
        'Mean' and 'Mode' values and the '<hpdi>CI-' / '<hpdi>CI+' values
        are read from it.
    param_names : list of str
        Names of all parameters being plotted, across all pages.
    params : HistogramParams
        Layout and colors of the page.
    summary_params : SummaryParams
        Supplies the HPDI percentages via `hpdi_percent()`.

    Returns
    -------
    tuple (fig, ax)
        The figure and the last Axes object of the page's grid
        (a single Axes, not the whole grid).
    """

    nrows = math.ceil((len(param_names) - i_start) / params.ncols)

    if nrows > params.num_plot_rows:
        nrows = params.num_plot_rows

    ncols = params.ncols
    fig_height = 4 * nrows
    fig_width = 12

    # Special case: if there is just one parameter show a plot with one column
    if len(param_names) == 1:
        ncols = 1
        fig_width /= 2

    fig, axes_grid = plt.subplots(
        nrows=nrows,
        ncols=ncols,
        figsize=(fig_width, fig_height),
        squeeze=False)

    axes = axes_grid.flatten()

    # Number of axes on this page that actually show a parameter
    n_used = max(0, min(len(axes), len(param_names) - i_start))

    # Widest interval first, so that the narrowest one is drawn last
    hpdis = sorted(summary_params.hpdi_percent(), reverse=True)

    for i_axis, axis in enumerate(axes[:n_used]):
        parameter = param_names[i_start + i_axis]
        param_samples = samples[parameter]
        data = summary.loc[parameter]

        # Exclude extreme outliers from the samples
        # to avoid the blow-up of the x-axis range
        inner_range = np.percentile(param_samples, [0.5, 99.5])

        samples_for_kde = param_samples[(param_samples > inner_range[0]) &
                                        (param_samples < inner_range[1])]

        sns.distplot(samples_for_kde, kde=False, norm_hist=True, ax=axis,
                     hist_kws={
                         "color": params.hist_color,
                         "zorder": 1,
                         "edgecolor": params.hist_edge_color,
                         "linewidth": 1,
                         "alpha": 1})

        # Show KDEs for the error bars (HPDIs)
        # -----------

        for i, hpdi in enumerate(hpdis):
            start = f'{hpdi}CI-'
            end = f'{hpdi}CI+'

            # KDE plot
            sns.kdeplot(samples_for_kde, shade=False,
                        clip=[data[start], data[end]],
                        label=f'{hpdi}% HPDI', ax=axis, legend=None,
                        color=params.kde_colors[i],
                        linestyle=params.kde_line_styles[i],
                        linewidth=2)

            if i == len(hpdis) - 1:
                # Show shade under KDE for the last error bar
                sns.kdeplot(samples_for_kde, shade=True,
                            clip=[data[start], data[end]],
                            color="#000000",
                            label='_nolegend_', alpha=0.2,
                            zorder=10,
                            ax=axis, legend=None,
                            linewidth=2)

        axis.axvline(x=data['Mean'], label='Mean', linewidth=1.5,
                     linestyle='dashed', color='#33AA66')

        axis.axvline(x=data['Mode'], label='Mode', linewidth=1.5,
                     color='#FF66AA')

        axis.set_xlabel(parameter)

    # Do not draw the axes for non-existing plots. The slice is relative to
    # this page, so it works on every page and not only on the first one.
    for unused_axis in axes[n_used:]:
        unused_axis.axis('off')

    # One shared legend for the page, built from the first plot
    handles, labels = axes[0].get_legend_handles_labels()

    fig.legend(handles, labels, loc='upper center', mode='expand',
               ncol=len(labels))

    # Leave room at the top of the figure for the legend
    fig.tight_layout(rect=[0, 0, 1, 0.95])

    # NOTE(review): a single Axes is returned to keep the return shape
    # callers already receive; callers in this file use only the figure.
    return (fig, axes[-1])
def extract_tree_plot_data(df, param_names=None, groups=None,
                           summary_params=SummaryParams()):
    """
    Extract the data used by the tree plot function from a summary
    data frame.

    Parameters
    ----------
    df : Pandas data frame
        Data frame containing summary: one row per parameter, with a
        "Mode" value and '<hpdi>CI-' / '<hpdi>CI+' values.
    param_names : list of str
        List of parameters to plot. If None, plot all.
        A name 'a' also selects rows named 'a.1', 'a.2' etc.
    groups : list
        Tree plot data. If passed, the data frame's data
        will be added to it.
    summary_params : SummaryParams
        Supplies the HPDI percentages via `hpdi_percent()`.

    Returns
    -------
    Array of dictionaries that will be used to make tree plot.
    For example, here we plot values of two variables "temperature" and
    "pressure" from two observations. Each value has multiple error bars,
    the 95% and 68% bars, for example.

    [
        {
            "name": "temperature"
            "values": [
                {
                    value: 10,
                    error_bars: [[6, 16], [9, 11]]
                },
                {
                    value: 40,
                    error_bars: [[10, 80], [38, 42]]
                },
            ]
        },
        {
            "name": "pressure"
            "values": [
                {
                    value: 1.1,
                    error_bars: [[0.1, 2.3], [0.9, 1.2]]
                },
                {
                    value: 1.6,
                    error_bars: [[0.6, 2.7], [1.1, 1.9]]
                }
            ]
        }
    ]
    """

    if groups is None:
        groups = []

    for name, row in df.iterrows():
        if param_names is not None:
            # A row is wanted when its name is listed directly, or when
            # its name without a trailing '.<number>' is listed
            wanted = (name in param_names or
                      re.sub(r'\.[0-9]+\Z', '', name) in param_names)

            if not wanted:
                continue

        # Reuse the group with this name if `groups` already has one
        group = next(
            (item for item in groups if item['name'] == name), None)

        if group is None:
            group = {'name': name, 'values': []}
            groups.append(group)

        # Add new value
        entry = {'value': row["Mode"]}
        group['values'].append(entry)

        # Add error bars from the HPDI values, widest interval first.
        # Intervals missing from the row are skipped.
        # ---------

        percents = sorted(summary_params.hpdi_percent(), reverse=True)

        entry['error_bars'] = [
            [row[f'{percent}CI-'], row[f'{percent}CI+']]
            for percent in percents
            if f'{percent}CI-' in row
        ]

    return groups