Beispiel #1
0
def save_histogram(samples, param_names=None,
                   info_path=InfoPath(),
                   histogram_params=HistogramParams(),
                   summary_params=SummaryParams()):
    """
    Make histograms for the parameters from posterior distribution
    and save them to files.

    Parameters
    ----------

    samples : Panda's DataFrame

        Each column contains samples from posterior distribution.

    param_names : list of str

        Names of the parameters for plotting. If None, all will be plotted.

    info_path : InfoPath

        Path information for creating the plot files.

    histogram_params : HistogramParams

        Settings passed to the histogram plotting code.

    summary_params : SummaryParams

        Settings of the summary. Used both for calculating the summary
        and for plotting the histograms.
    """

    info_path.set_codefile()

    # Calculate the summary with the same `summary_params` that are passed
    # to the plotting code: the histograms look up the HPDI columns of the
    # summary using these settings, so the two must agree.
    df_summary, _ = sample_summary(df=samples, params=summary_params)

    save_histogram_from_summary(samples, df_summary,
                                param_names=param_names,
                                info_path=info_path,
                                histogram_params=histogram_params,
                                summary_params=summary_params)
Beispiel #2
0
def print_summary(fit, param_names=None, summary_params=SummaryParams()):
    """
    Prints the statistical summary of the samples and returns it
    together with the samples.

    Parameters
    ----------

    fit : cmdstanpy.stanfit.CmdStanMCMC

        Contains the samples from cmdstanpy.

    param_names : list of str

        Names of parameters to be included in the summary. Include all if None.

    summary_params : SummaryParams

        Settings passed to `make_summary`.


    Returns
    -------
    dict:
        df:
            Panda's data frame containing the summary
        table: str
            Summary table in text format.
        samples: Panda's data frame
            Combined samples from all chains.
    """

    summary_df, text_table, draws = make_summary(
        fit, param_names=param_names, summary_params=summary_params)

    # Show the text version of the summary to the user
    print(text_table)

    return dict(df=summary_df, table=text_table, samples=draws)
Beispiel #3
0
def save_analysis(samples, param_names=None, info_path=InfoPath(),
                  summary_params=SummaryParams()):
    """
    Creates all analysis files at once: summary, tree plot,
    histograms and pair plot.

    Parameters
    ----------

    samples : Panda's DataFrame

        Each column contains samples from posterior distribution.

    param_names : list of str

        Names of parameters to plot. Plot all parameters if None.

    info_path : InfoPath

        Path information, passed to each of the saving functions.

    summary_params : SummaryParams

        Settings of the summary, passed to the summary, tree plot
        and histogram functions.
    """

    info_path.set_codefile()

    # Keyword arguments that every call below receives
    common = {"param_names": param_names, "info_path": info_path}

    summary = save_summary(samples, summary_params=summary_params, **common)
    df_summary = summary['df']

    make_tree_plot(df_summary, summary_params=summary_params, **common)

    save_histogram_from_summary(samples, df_summary,
                                summary_params=summary_params, **common)

    save_pair_plot(samples, **common)
Beispiel #4
0
def save_histogram(fit,
                   param_names=None,
                   info_path=InfoPath(),
                   summary_params=SummaryParams(),
                   histogram_params=HistogramParams()):
    """
    Make histograms of parameter distributions and save them to files.

    Parameters
    ----------

    fit : cmdstanpy.stanfit.CmdStanMCMC
        Samples from cmdstanpy.

    param_names : list of str
        Names of parameters to be included in the summary.
        Include all if None.

    info_path : InfoPath
        Path information for creating the plot files.

    summary_params : SummaryParams
        Settings of the summary.

    histogram_params : HistogramParams
        Settings passed to the histogram plotting code.

    """

    info_path.set_codefile()

    # The text table returned by `make_summary` is not needed here
    df_summary, _, samples = make_summary(fit,
                                          param_names=param_names,
                                          summary_params=summary_params)

    save_histogram_from_summary(samples,
                                df_summary,
                                param_names=param_names,
                                info_path=info_path,
                                histogram_params=histogram_params,
                                summary_params=summary_params)
Beispiel #5
0
def save_compare_parameters(
    models,
    labels,
    extra_values=[],
    type: CompareParametersType = CompareParametersType.TEXT,
    param_names=None,
    info_path=InfoPath(),
    summary_params=SummaryParams()):
    """
    Saves a text table that compares model parameters.

    Parameters
    ----------

    models : list Panda's data frames
        List of data frames for each model, containing sample values for
        multiple parameters (one parameter is one data frame column).
        Supply multiple data frames to compare parameter distributions.

    labels : list of str
        Names of the models in `models` list.

    extra_values : list of dict
        Additional values to be shown in the table:

        [
            {
                "mu": 2.3,
                "sigma": 3.3
            }
        ]

    type : CompareParametersType
        Format of values in the text table.

    param_names : list of str
        Names of parameters. Include all if None.

    info_path : InfoPath
        Path information for creating the text file. The file's base name
        is "parameters_compared" unless `info_path.base_name` is set,
        and its extension is always "txt".

    summary_params : SummaryParams
        Settings of the summary, passed to `compare_parameters`.
    """

    info_path.set_codefile()

    # Only the text version of the table is written to the file
    _, table = compare_parameters(models=models,
                                  labels=labels,
                                  extra_values=extra_values,
                                  type=type,
                                  param_names=param_names,
                                  summary_params=summary_params)

    # Build a new InfoPath from the attributes of the supplied one;
    # the file name settings below are made on that new object.
    output_info = InfoPath(**info_path.__dict__)

    if not output_info.base_name:
        output_info.base_name = "parameters_compared"

    output_info.extension = 'txt'

    with open(get_info_path(output_info), "w") as text_file:
        print(table, file=text_file)
Beispiel #6
0
def save_tree_plot(models,
                   extra_values=[],
                   param_names=None,
                   info_path=InfoPath(),
                   summary_params=SummaryParams(),
                   tree_params=TreePlotParams()):
    """
    Save a tree plot that summarises parameter distributions.
    Can compare summaries from multiple models, when multiple samples are
    supplied. One can also supply additional markers
    to be compared with using `extra_values` parameter.

    Parameters
    ----------

    models : list Panda's data frames

        List of data frames for each model, containing sample values for
        multiple parameters (one parameter is one data frame column).
        Supply multiple data frames to see their distribution summaries
        compared on the tree plot.

    extra_values : list of dict
        Additional markers to be shown on tree plot, without error bars:

        [
            {
                "mu": 2.3,
                "sigma": 3.3
            }
        ]

    param_names : list of str

        Names of parameters. Include all if None.

    info_path : InfoPath

        Path information for creating summaries.

    summary_params : SummaryParams

        Settings of the summary calculated for each model.

    tree_params : TreePlotParams

        Settings passed to the tree plot.

    """

    info_path.set_codefile()
    summaries = []

    for samples in models:
        # Filter the columns of each model against the names requested by
        # the caller. A separate variable is used so that `param_names`
        # is not overwritten by the result for the previous model.
        selected_names = filter_param_names(list(samples), param_names)

        # Summarise only the selected columns, otherwise `param_names`
        # would have no effect on the plot.
        summary, _ = sample_summary(samples[selected_names],
                                    params=summary_params)

        summaries.append(summary)

    # Extra markers are appended after the summaries of the models
    summaries.extend(summary_from_dict(values) for values in extra_values)

    make_comparative_tree_plot(summaries,
                               info_path=info_path,
                               tree_params=tree_params)
Beispiel #7
0
def save_tree_plot(fits,
                   extra_values=[],
                   param_names=None,
                   info_path=InfoPath(),
                   summary_params=SummaryParams(),
                   tree_params=TreePlotParams()):
    """
    Save a tree plot that summarises parameter distributions.
    Can compare summaries from multiple models, when multiple fits are
    supplied. One can also supply additional markers
    to be compared with using `extra_values` parameter.

    Parameters
    ----------

    fits : list of cmdstanpy.stanfit.CmdStanMCMC

        Contains the samples from cmdstanpy.

    extra_values : list of dict
        Additional markers to be shown on tree plot, without error bars:

        [
            {
                "mu": 2.3,
                "sigma": 3.3
            }
        ]

    param_names : list of str

        Names of parameters. Include all if None.

    info_path : InfoPath

        Path information for creating summaries.

    summary_params : SummaryParams

        Settings of the summary calculated for each fit.

    tree_params : TreePlotParams

        Settings passed to the tree plot.

    """

    info_path.set_codefile()
    summaries = []

    for fit in fits:
        # Filter the columns of each fit against the names requested by
        # the caller. A separate variable is used so that `param_names`
        # is not overwritten by the result for the previous fit.
        selected_names = filter_param_names(fit.column_names, param_names)
        samples = fit.get_drawset(params=selected_names)
        summary, _ = sample_summary(samples, params=summary_params)
        summaries.append(summary)

    # Extra markers are appended after the summaries of the fits
    summaries.extend(summary_from_dict(values) for values in extra_values)

    make_comparative_tree_plot(summaries,
                               info_path=info_path,
                               tree_params=tree_params)
Beispiel #8
0
def make_histograms(samples, summary, param_names=None,
                    params=HistogramParams(),
                    summary_params=SummaryParams()):
    """
    Make multiple figures (pages) with
    histograms for the parameters from posterior distribution.

    Parameters
    ----------

    samples : Panda's DataFrame

        Each column contains samples from posterior distribution.

    summary : Panda's DataFrame

        Summary information about each column.

    param_names : list of str

        Names of the parameters for plotting. If None, all will be plotted.

    params : HistogramParams

        Plot settings: number of columns and rows on a page
        and the maximum number of pages.

    summary_params : SummaryParams

        Settings of the summary, passed to the page plotting function.

    Returns
    -------
    list of [fig, ax]
        One item for each page, as returned by `make_histogram_one_page`.
    """
    param_names = filter_param_names(samples.columns, param_names)

    # Total number of pages: rows needed to show all parameters,
    # divided by the number of rows that fit on one page
    n_rows_total = math.ceil(len(param_names) / params.ncols)
    n_plots = math.ceil(n_rows_total / params.num_plot_rows)

    if n_plots > params.max_plot_pages:
        print(
            f'Showing only first {params.max_plot_pages} '
            f'pages out of {n_plots} of histogram. '
            'Consider specifying "param_names".')

        n_plots = params.max_plot_pages

    # Always make at least one page
    n_plots = max(n_plots, 1)

    plots_per_page = params.num_plot_rows * params.ncols
    figures_and_axes = []

    # Make one figure with histograms for each page
    for i_plot in range(n_plots):
        fig, ax = make_histogram_one_page(
            i_start=i_plot * plots_per_page,
            samples=samples,
            summary=summary,
            param_names=param_names,
            params=params,
            summary_params=summary_params)

        figures_and_axes.append([fig, ax])

    return figures_and_axes
Beispiel #9
0
def test_print_summary(capsys):
    """
    Check the printed text and the returned values of `print_summary`
    for three parameters and custom HPDI values.
    """
    requested = ["mu", "tau", 'eta.1']

    result = print_summary(get_fit(),
                           param_names=requested,
                           summary_params=SummaryParams(hpdis=[0.05, 0.99]))

    # The summary must be printed to the standard output...
    printed = capsys.readouterr().out
    assert "8.05" in printed

    # ...and returned to the caller
    assert result["df"].shape == (3, 11)
    assert "8.05" in result["table"]
    assert result["samples"].shape == (4000, 3)
Beispiel #10
0
def save_summary(fit,
                 param_names=None,
                 info_path=InfoPath(),
                 summary_params=SummaryParams()):
    """
    Saves statistical summary of the samples using mean, std, mode, hpdi.

    Parameters
    ----------

    fit : cmdstanpy.stanfit.CmdStanMCMC

        Contains the samples from cmdstanpy.

    param_names : list of str

        Names of parameters to be included in the summary. Include all if None.

    info_path : InfoPath

        Path information for creating summaries.

    summary_params : SummaryParams

        Settings passed to `make_summary`.

    Returns
    -------
    dict:
        df:
            Panda's data frame containing the summary
        table: str
            Summary table in text format.
        samples: Panda's data frame
            Combined samples from all chains
        path_txt: str
            Path to the text summary
        path_csv: str
            Path to summary in CSV format
    """

    info_path.set_codefile()

    # A new InfoPath built from the attributes of the supplied one
    # is passed on to the function that writes the files.
    output_info = InfoPath(**info_path.__dict__)

    summary_df, text_table, draws = make_summary(
        fit, param_names=param_names, summary_params=summary_params)

    saved = save_summary_to_disk(summary_df, text_table,
                                 info_path=output_info)

    result = {"df": summary_df, "table": text_table, "samples": draws}
    result["path_txt"] = saved["path_txt"]
    result["path_csv"] = saved["path_csv"]

    return result
Beispiel #11
0
def test_save_summary_specify_hpdi():
    """
    Check that `save_summary` called with custom HPDI values
    creates both the text and the CSV summary files.
    """
    fit = get_fit()
    outdir = "tarpan/cmdstanpy/model_info/summary_test"

    # Start without output files left from previous runs
    if os.path.isdir(outdir):
        shutil.rmtree(outdir)

    custom_hpdis = SummaryParams(hpdis=[0.05, 0.99])
    save_summary(fit, param_names=["mu", "tau", 'eta.1'],
                 summary_params=custom_hpdis)

    for file_name in ("summary.txt", "summary.csv"):
        assert os.path.isfile(os.path.join(outdir, file_name))
Beispiel #12
0
def run_model():
    """
    Sample from the model in "eight_schools.stan" and save the summary
    of three parameters using custom HPDI values.
    """
    schools_data = dict(
        J=8,
        y=[28,  8, -3,  7, -1,  1, 18, 12],
        sigma=[15, 10, 16, 11,  9, 11, 10, 18])

    model = CmdStanModel(stan_file="eight_schools.stan")

    fit = model.sample(data=schools_data,
                       chains=4,
                       cores=4,
                       seed=1,
                       sampling_iters=1000,
                       warmup_iters=1000)

    # Make summary with custom HPDI values
    custom_hpdis = SummaryParams(hpdis=[0.05, 0.99])
    save_summary(fit, param_names=['mu', 'tau', 'eta.1'],
                 summary_params=custom_hpdis)
Beispiel #13
0
def save_analysis(fit,
                  param_names=None,
                  info_path=InfoPath(),
                  summary_params=SummaryParams()):
    """
    Create all analysis files at once: diagnostic, summary, tree plot,
    trace plot, histograms and pair plot.

    Parameters
    ----------

    fit : cmdstanpy.stanfit.CmdStanMCMC

        Contains the samples from cmdstanpy.

    param_names : list of str

        Names of parameters to plot. Plot all parameters if None.

    info_path : InfoPath

        Path information, passed to each of the saving functions.

    summary_params : SummaryParams

        Settings of the summary, passed to the summary, tree plot
        and histogram functions.
    """

    info_path.set_codefile()
    save_diagnostic(fit, info_path=info_path)

    # Keyword arguments shared by the remaining calls
    common = {"param_names": param_names, "info_path": info_path}

    summary = save_summary(fit, summary_params=summary_params, **common)
    df_summary = summary['df']

    make_tree_plot(df_summary, summary_params=summary_params, **common)

    save_traceplot(fit, **common)

    # Histograms and pair plot use the samples returned with the summary
    samples = summary['samples']

    save_histogram_from_summary(samples, df_summary,
                                summary_params=summary_params, **common)

    save_pair_plot(samples, **common)
Beispiel #14
0
def make_tree_plot(df_summary,
                   param_names=None,
                   info_path=InfoPath(),
                   tree_params: TreePlotParams = TreePlotParams(),
                   summary_params=SummaryParams()):
    """
    Make tree plot of parameters and save it to a file.

    Parameters
    ----------

    df_summary : Panda's data frame

        Summary of the parameters.

    param_names : list of str

        Names of parameters to plot, passed to `extract_tree_plot_data`.

    info_path : InfoPath

        Path information for creating the plot file. The base name
        defaults to 'summary' and the extension to 'pdf' when not set.

    tree_params : TreePlotParams

        Settings passed to the tree plot.

    summary_params : SummaryParams

        Settings of the summary, passed to `extract_tree_plot_data`.
    """

    # Build a new InfoPath from the attributes of the supplied one;
    # the defaults below are set on that new object.
    output_info = InfoPath(**info_path.__dict__)

    plot_data = extract_tree_plot_data(df_summary,
                                       param_names=param_names,
                                       summary_params=summary_params)

    fig, _ = tree_plot(plot_data, params=tree_params)

    if not output_info.base_name:
        output_info.base_name = 'summary'

    if not output_info.extension:
        output_info.extension = 'pdf'

    fig.savefig(get_info_path(output_info), dpi=output_info.dpi)
    plt.close(fig)
Beispiel #15
0
def make_summary(fit, param_names, summary_params=SummaryParams()):
    """
    Returns statistical summary table for parameters:
    mean, std, mode, hpdi.

    Parameters
    ----------

    fit : cmdstanpy.stanfit.CmdStanMCMC

        Contains the samples from cmdstanpy.

    param_names : list of str

        Names of parameters to be included in the summary.
        Include all if None.

    summary_params : SummaryParams

        Settings passed to `sample_summary`.

    Returns
    -------
    df_summary : Panda's data frame
        The summary, as returned by `sample_summary`.
    table : str
        Summary table in text format.
    samples : Panda's data frame
        Samples of the selected parameters.
    """

    param_names = filter_param_names(fit.column_names, param_names)
    samples = fit.get_drawset(params=param_names)

    # Get N_Eff and R_hat values from the summary of the fit
    # --------

    # Rename parameters like 'eta[1]' to 'eta.1'
    fit_summary = fit.summary().rename(
        index=(lambda name: name.replace('[', '.').replace(']', '')))

    # Take an explicit copy of the two columns: assigning a column on
    # a slice of a data frame triggers Panda's SettingWithCopyWarning.
    extra_values = fit_summary[['N_Eff', 'R_hat']].copy()
    extra_values['N_Eff'] = np.round(extra_values['N_Eff']).astype(int)

    # Get the summary
    df_summary, table = sample_summary(df=samples,
                                       extra_values=extra_values,
                                       params=summary_params)

    return df_summary, table, samples
Beispiel #16
0
def save_histogram_from_summary(samples, summary, param_names=None,
                                info_path=InfoPath(),
                                histogram_params=HistogramParams(),
                                summary_params=SummaryParams()):
    """
    Make histograms for the parameters from posterior distribution
    and save each page to a separate file.

    Parameters
    ----------

    samples : Panda's DataFrame

        Each column contains samples from posterior distribution.

    summary : Panda's DataFrame

        Summary information about each column.

    param_names : list of str

        Names of the parameters for plotting. If None, all will be plotted.

    info_path : InfoPath

        Path information for creating the plot files. The base name
        defaults to "histogram" and the extension to 'pdf' when not set.
        The page number ("_01", "_02" etc.) is added to the base name.

    histogram_params : HistogramParams

        Settings passed to `make_histograms`.

    summary_params : SummaryParams

        Settings of the summary, passed to `make_histograms`.
    """

    # Build a new InfoPath from the attributes of the supplied one;
    # the file name settings below are made on that new object.
    output_info = InfoPath(**info_path.__dict__)

    pages = make_histograms(samples,
                            summary,
                            param_names=param_names,
                            params=histogram_params,
                            summary_params=summary_params)

    name_prefix = output_info.base_name or "histogram"

    if not output_info.extension:
        output_info.extension = 'pdf'

    for page_number, page in enumerate(pages, start=1):
        output_info.base_name = f'{name_prefix}_{page_number:02d}'

        # The first item of the page is the figure
        fig = page[0]
        fig.savefig(get_info_path(output_info), dpi=output_info.dpi)
        plt.close(fig)
Beispiel #17
0
def save_compare_parameters(
    fits,
    labels,
    extra_values=[],
    param_names=None,
    type: CompareParametersType = CompareParametersType.TEXT,
    info_path=InfoPath(),
    summary_params=SummaryParams()):
    """
    Saves a text table that compares model parameters

    Parameters
    ----------

    fits : list of cmdstanpy.stanfit.CmdStanMCMC

        Contains the samples from cmdstanpy.

    labels : list of str

        Names of the models in `fits` list.

    extra_values : list of dict
        Additional values to be shown in the table:

        [
            {
                "mu": 2.3,
                "sigma": 3.3
            }
        ]

    param_names : list of str

        Names of parameters. Include all if None.

    type : CompareParametersType

        Format of values in the text table.

    info_path : InfoPath

        Path information for creating summaries.

    summary_params : SummaryParams

        Settings of the summary, passed to the shared function.

    """

    info_path.set_codefile()
    models = []

    for fit in fits:
        # Filter the columns of each fit against the names requested by
        # the caller. A separate variable is used so that `param_names`
        # is not overwritten by the result for the previous fit.
        selected_names = filter_param_names(fit.column_names, param_names)
        models.append(fit.get_drawset(params=selected_names))

    # Pass the names as requested by the caller; the shared function
    # filters the columns of the samples on its own.
    shared_save_compare_parameters(models,
                                   labels=labels,
                                   extra_values=extra_values,
                                   type=type,
                                   param_names=param_names,
                                   info_path=info_path,
                                   summary_params=summary_params)
Beispiel #18
0
def compare_parameters(
    models,
    labels,
    extra_values=[],
    type: CompareParametersType = CompareParametersType.TEXT,
    param_names=None,
    summary_params=SummaryParams()):
    """
    Create a table that compares parameters of the models.

    Parameters
    ----------

    models : list Panda's data frames
        List of data frames for each model, containing sample values for
        multiple parameters (one parameter is one data frame column).
        Supply multiple data frames to compare parameter distributions.

    labels : list of str
        Names of the models in `models` list, followed by the names
        of the items in `extra_values`.

    extra_values : list of dict
        Additional values to be shown in the table:

        [
            {
                "mu": 2.3,
                "sigma": 3.3
            }
        ]

    type : CompareParametersType
        Format of values in the text table.

    param_names : list of str
        Names of parameters. Include all if None.

    summary_params : SummaryParams
        Settings of the summary calculated for each model.

    Returns
    -------
    df: Panda's data frame
        Table in Panda's format
    txt : str
        Table in text format

    Raises
    ------
    ValueError
        If `models` is empty, or if the number of labels is different from
        the total number of models and extra values.
    """

    if len(models) == 0:
        raise ValueError('Models list is empty')

    if (len(models) + len(extra_values)) != len(labels):
        raise ValueError('Models list length is different from labels')

    # The parameters shown in the table are selected using
    # the columns of the first model
    first_model_columns = list(models[0])
    param_names = filter_param_names(first_model_columns, param_names)

    df = pd.DataFrame(index=labels, columns=param_names)

    # NOTE(review): the names are filtered a second time, as before this
    # change. Presumably this gives the same names - confirm and remove.
    param_names_filtered = filter_param_names(first_model_columns,
                                              param_names)

    # Add one row for each model
    for samples, label in zip(models, labels):
        df_summary, _ = sample_summary(samples[param_names_filtered],
                                       params=summary_params)

        df.loc[label] = [
            format_parameter(df_summary.loc[name], type)
            for name in param_names_filtered
        ]

    # Add extra values
    # ---------------

    # Labels of the extra values follow the labels of the models
    extra_labels = labels[len(models):]

    for data, label in zip(extra_values, extra_labels):
        # Parameters missing from the extra values are shown as empty text
        df.loc[label] = [
            format_value(data[name], type) if name in data else ''
            for name in param_names_filtered
        ]

    table = tabulate(df,
                     headers=param_names_filtered,
                     tablefmt="pipe",
                     stralign="right")

    return df, table
Beispiel #19
0
def make_histogram_one_page(i_start, samples, summary, param_names,
                            params: HistogramParams,
                            summary_params=SummaryParams()):
    """
    Make a single figure (page) with histograms for the parameters
    from posterior distribution.

    Parameters
    ----------

    i_start : int

        Index in `param_names` of the first parameter shown on this page.

    samples : Panda's DataFrame

        Each column contains samples from posterior distribution.

    summary : Panda's DataFrame

        Summary information about each column. The 'Mean' and 'Mode'
        values and the HPDI columns ('<hpdi>CI-' and '<hpdi>CI+')
        of each parameter are used.

    param_names : list of str

        Names of all parameters that are plotted, on all pages.

    params : HistogramParams

        Plot settings: number of columns and rows, colors and line styles.

    summary_params : SummaryParams

        Settings of the summary; supplies the HPDI values to be shown.

    Returns
    -------
    tuple (fig, axes)
        The figure and the two-dimensional array of its axes.
    """

    # Rows needed for the remaining parameters, limited to one page
    nrows = math.ceil((len(param_names) - i_start) / params.ncols)
    nrows = min(nrows, params.num_plot_rows)

    ncols = params.ncols
    fig_height = 4 * nrows
    fig_width = 12

    # Special case: if there is just one parameter show a plot with one column
    if len(param_names) == 1:
        ncols = 1
        fig_width /= 2

    # Keep the grid of axes in its own variable: it is returned
    # to the caller and must not be overwritten by the loops below.
    fig, axes_grid = plt.subplots(
        nrows=nrows,
        ncols=ncols, figsize=(fig_width, fig_height),
        squeeze=False)

    axes = axes_grid.flatten()

    # HPDI values are the same for all parameters, widest first
    hpdis = sorted(summary_params.hpdi_percent(), reverse=True)

    for i_axis, axis in enumerate(axes):
        i_param = i_start + i_axis

        if i_param >= len(param_names):
            break

        parameter = param_names[i_param]
        param_samples = samples[parameter]
        data = summary.loc[parameter]

        # Exclude extreme outliers from the samples
        # to avoid the blow-up of the x-axis range
        inner_range = np.percentile(param_samples, [0.5, 99.5])

        samples_for_kde = param_samples[(param_samples > inner_range[0])
                                        & (param_samples < inner_range[1])]

        sns.distplot(samples_for_kde, kde=False, norm_hist=True, ax=axis,
                     hist_kws={
                        "color": params.hist_color,
                        "zorder": 1,
                        "edgecolor": params.hist_edge_color,
                        "linewidth": 1,
                        "alpha": 1})

        # Show KDEs for the error bars (HPDIs)
        # -----------

        for i, hpdi in enumerate(hpdis):
            start = f'{hpdi}CI-'
            end = f'{hpdi}CI+'

            # KDE plot
            sns.kdeplot(samples_for_kde, shade=False,
                        clip=[data[start], data[end]],
                        label=f'{hpdi}% HPDI', ax=axis, legend=None,
                        color=params.kde_colors[i],
                        linestyle=params.kde_line_styles[i],
                        linewidth=2)

            if i == len(hpdis) - 1:
                # Show shade under KDE for the last error bar
                sns.kdeplot(samples_for_kde, shade=True,
                            clip=[data[start], data[end]],
                            color="#000000",
                            label='_nolegend_', alpha=0.2,
                            zorder=10,
                            ax=axis, legend=None,
                            linewidth=2)

        axis.axvline(x=data['Mean'], label='Mean', linewidth=1.5,
                     linestyle='dashed', color='#33AA66')

        axis.axvline(x=data['Mode'], label='Mode', linewidth=1.5,
                     color='#FF66AA')

        axis.set_xlabel(parameter)

    # Do not draw the axes for non-existing plots. The number of parameters
    # shown on this page is counted from `i_start`, otherwise the empty
    # axes are not hidden on the pages after the first one.
    n_shown = max(len(param_names) - i_start, 0)

    for axis in axes[n_shown:]:
        axis.axis('off')

    # One legend for the whole figure, taken from the first plot
    handles, labels = axes[0].get_legend_handles_labels()
    fig.legend(handles, labels, loc='upper center', mode='expand',
               ncol=len(labels))

    # Leave space for the legend at the top of the figure
    fig.tight_layout(rect=[0, 0, 1, 0.95])

    return (fig, axes_grid)
Beispiel #20
0
def extract_tree_plot_data(df,
                           param_names=None,
                           groups=None,
                           summary_params=SummaryParams()):
    """
    Extract data used by the tree plot function from a data frame.

    Parameters
    ----------

    df : Panda's data frame
        Data frame containing summary, one row for each parameter.

    param_names: list of str

        List of parameters to plot. If None, plot all.
        A name 'a' also selects the parameters 'a.1', 'a.2' etc.

    groups : list
        Tree plot data. If passed, the data frame's data will be added to it.

    summary_params : SummaryParams
        Settings of the summary; supplies the HPDI values
        used for the error bars.

    Returns
    -------

    Array of dictionaries that will be used to make tree plot.

    For example, here we plot values of two variables "temperature"
    and "pressure" from two observations.
    Each value has multiple error bars, the 95% and 68% bars, for example.

    [
        {
            "name": "temperature"
            "values": [
                {
                    value: 10,
                    error_bars: [[6, 16], [9, 11]]
                },
                {
                    value: 40,
                    error_bars: [[10, 80], [38, 42]]
                },
            ]
        },
        {
            "name": "pressure"
            "values": [
                {
                    value: 1.1,
                    error_bars: [[0.1, 2.3], [0.9, 1.2]]
                },
                {
                    value: 1.6,
                    error_bars: [[0.6, 2.7], [1.1, 1.9]]
                }
            ]
        }
    ]
    """

    groups = [] if groups is None else groups

    for column_name, column_summary in df.iterrows():
        if param_names is not None and column_name not in param_names:
            # If param_names contains 'a', we will also plot
            # parameters named 'a.1', 'a.2' etc.
            base_name = re.sub(r'\.[0-9]+\Z', '', column_name)

            if base_name not in param_names:
                continue

        # Use the group that `groups` already has for the column, if any
        group_data = next(
            (group for group in groups if group['name'] == column_name),
            None)

        # Have not found group data - create one
        if group_data is None:
            group_data = {'name': column_name, 'values': []}
            groups.append(group_data)

        # Add new value
        value = {'value': column_summary["Mode"]}
        group_data['values'].append(value)

        # Add error bars from the HPDI values, widest first
        # ---------

        hpdis = sorted(summary_params.hpdi_percent(), reverse=True)
        column_pairs = ((f'{hpdi}CI-', f'{hpdi}CI+') for hpdi in hpdis)

        # HPDIs that are missing from the summary are skipped
        value['error_bars'] = [
            [column_summary[start], column_summary[end]]
            for start, end in column_pairs
            if start in column_summary
        ]

    return groups