Example #1
    def plot_ind(self,
                 var_names: Optional[Union[str, List[str]]] = None,
                 show_density: bool = True,
                 credible_interval: float = 0.94):
        """Plots individual posterior distributions, using ArviZ.

        Parameters
        ----------
        var_names
            Parameter(s) to plot. If not specified, show all model parameters.
        show_density
            Whether to show densities (ridgeplot) instead of intervals (forestplot). Defaults to True.
        credible_interval
            Credible interval to plot. Defaults to 0.94.
        """
        if var_names is None:
            var_names = list(self.parameters_desc)

        if show_density:
            kind = 'ridgeplot'
        else:
            kind = 'forestplot'

        az.plot_forest(self.fit,
                       kind=kind,
                       var_names=var_names,
                       credible_interval=credible_interval,
                       combined=True,
                       colors='gray',
                       ridgeplot_alpha=0.8)
        plt.show()
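A minimal usage sketch (assuming `model` is a fitted instance of the class above and 'mu' is one of its sampled parameters; both names are illustrative):

model.plot_ind(var_names='mu', show_density=False, credible_interval=0.9)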
Example #2
    def forest(self, query='opsin=="chr2" & delay_length==60'):
        trace_post_query = utils.query_posterior(trace=self.trace, posterior=self.posterior, query=query) if query else \
            self.trace.posterior['mu_per_condition']
        az.plot_forest(trace_post_query,
                       combined=True,
                       kind='ridgeplot',
                       ridgeplot_alpha=.5)
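Example #3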
def run(n_samples=1000):
    model = build_model()
    with model:
        trace = pm.sample(draws=n_samples, tune=1000, target_accept=0.99)

    az.plot_trace(trace)
    az.plot_forest(trace)
Example #4
def plot_model_comparison_CIs(model_res_dict):
    var_names = [
        'remr_lnVR', 'rema_lnVR', 'fema_lnVR', 'rema_lnCVR', 'fema_lnCVR'
    ]
    data = [
        az.convert_to_dataset(
            {model: np.exp(model_res_dict[model].posterior.mu.values)})
        for model in var_names
    ]
    _ = az.plot_forest(data,
                       combined=True,
                       hdi_prob=0.95,
                       quartiles=True,
                       colors='black',
                       figsize=(10, 4),
                       var_names=var_names,
                       model_names=len(var_names) * [''])
    plt.xlim(0.78, 1.23)
    plt.title('95% HDI for meta-analytic direct effect $e^\\mu$')
    plt.grid()
    plt.savefig(os.path.join(parent_dir_name,
                             'output/hdi_model_comparison.tiff'),
                format='tiff',
                dpi=500,
                bbox_inches="tight")
Example #5
def plot_forest(trace, variable, var_name=None):
    fig, ax = az.plot_forest(trace, var_names=variable, credible_interval=0.95)
    ax[0].set_title('')
    ax[0].set_title('95% credible intervals', size=15, loc="left")
    ax[0].spines['left'].set_visible(True)
    if var_name is not None:
        ax[0].set_yticklabels(var_name)
        ax[0].tick_params()
    return fig, ax
Example #6
    def plot_forest(self):
        if not (self.mcmc_ and self.data_):
            raise AttributeError('Object needs to be fit first.')
        else:
            _ = az.plot_forest(  # NOQA
                self.data_,
                var_names=['mu', 'sigma', 'log_nu'],
                credible_interval=0.95,
                figsize=(10, 10))
            plt.show()
Example #7
def hdi_param(m_idata, model_type, prior_level):

    fig, ax = plt.subplots(figsize=(10, 7))

    az.plot_forest(
        m_idata,
        var_names=["alpha", "beta", "sigma"],
        combined=True,  # combine chains 
        kind='ridgeplot',  # instead of default which does not show distribution
        ridgeplot_truncate=False,  # do show the tails 
        hdi_prob=.8,  # hdi prob .8 here. 
        ridgeplot_alpha=0.5,  # looks prettier
        ridgeplot_quantiles=[0.5],  # show the median
        ax=ax  # add to our axis
    )

    fig.suptitle("Python/pyMC3: HDI intervals for parameters")
    fig.tight_layout()

    plt.savefig(f"../plots_python/{model_type}_{prior_level}_HDI_param.jpeg",
                dpi=300)
Example #8
def analyze_post(post, method):
    print_summary(post, 0.95, False)
    fig, ax = plt.subplots()
    az.plot_forest(post, hdi_prob=0.95, figsize=(10, 4), ax=ax)
    plt.title(method)
    pml.savefig(f'multicollinear_forest_plot_{method}.pdf')
    plt.show()

    # post = m6_1.sample_posterior(random.PRNGKey(1), p6_1, (1000,))
    fig, ax = plt.subplots()
    az.plot_pair(post, var_names=["br", "bl"],
                 scatter_kwargs={"alpha": 0.1}, ax=ax)
    plt.title(method)
    pml.savefig(f'multicollinear_joint_post_{method}.pdf')
    plt.show()

    sum_blbr = post["bl"] + post["br"]
    fig, ax = plt.subplots()
    az.plot_kde(sum_blbr, label="sum of bl and br", ax=ax)
    plt.title(method)
    pml.savefig(f'multicollinear_sum_post_{method}.pdf')
    plt.show()
Example #9
def plot_param_diagnostics(mod,
                           incl_noise_params=False,
                           incl_trend_params=False,
                           incl_smooth_params=False,
                           which='trace',
                           **kwargs):
    """
    Parameters
    ----------
    mod : orbit model object
    which : str, {'density', 'trace', 'pair', 'autocorr', 'posterior', 'forest'}
    incl_noise_params : bool
        if plot noise parameters; default False
    incl_trend_params : bool
        if plot trend parameters; default False
    incl_smooth_params : bool
        if plot smoothing parameters; default False
    **kwargs :
        other parameters passed to arviz functions

    Returns
    -------
        matplotlib axes object
    """
    posterior_samples = get_arviz_plot_dict(
        mod,
        incl_noise_params=incl_noise_params,
        incl_trend_params=incl_trend_params,
        incl_smooth_params=incl_smooth_params)

    if which == "trace":
        axes = az.plot_trace(posterior_samples, **kwargs)
    elif which == "density":
        axes = az.plot_density(posterior_samples, **kwargs)
    elif which == "posterior":
        axes = az.plot_posterior(posterior_samples, **kwargs)
    elif which == "pair":
        axes = az.plot_pair(posterior_samples, **kwargs)
    elif which == "autocorr":
        axes = az.plot_autocorr(posterior_samples, **kwargs)
    elif which == "forest":
        axes = az.plot_forest(posterior_samples, **kwargs)
    else:
        raise ValueError(
            "please use one of 'trace', 'density', 'posterior', 'pair', 'autocorr', 'forest' for `which`."
        )

    return axes
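A usage sketch, assuming `mod` is a fitted orbit model; extra keyword arguments such as `combined` are forwarded to the selected ArviZ function:

axes = plot_param_diagnostics(mod, incl_trend_params=True, which='forest', combined=True)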
Example #10
def plot_model_comparison_CIs(model_res_dict):
    fig, ax = plt.subplots(nrows=1)
    datasets = [
        az.convert_to_dataset({
            drug_class:
            np.exp(model_res_dict[drug_class].posterior.mu.values)
        }) for drug_class in DRUG_CLASSES
    ]
    _ = az.plot_forest(datasets,
                       combined=True,
                       credible_interval=0.95,
                       quartiles=True,
                       colors='black',
                       var_names=DRUG_CLASSES,
                       model_names=len(DRUG_CLASSES) * [''],
                       ax=ax)
    ax.set_title('95% HDI $e^\\mu$')
    plt.tight_layout()
    plt.savefig(os.path.join(parent_dir_name,
                             'output/hdi_drug_class_comparison.tiff'),
                format='tiff',
                dpi=500,
                bbox_inches="tight")
    return plt
Example #11
def run_model(month=7,
              n_samples=1000,
              interp_type='ncs',
              binary=True,
              spike=0.9,
              hdi_prob=0.95,
              zero_inf=0.7):

    # preprocessing
    binary_str = 'binary' if binary else 'nonbinary'
    df = pd.read_csv('../data/' + interp_type + '-pop-deaths-and-' +
                     binary_str + '-mandates.csv',
                     index_col=0)
    df = df.rename(columns={
        "Age Group": "Age_Group",
        "COVID-19 Deaths": "covid_19_deaths"
    })
    test_df = df[df["Month"] == month]
    sex = np.array(test_df["Sex"])
    mandates = test_df.iloc[:,
                            -4:]  # takes all of the 4 mandate columns that currently exist
    age = test_df["Age_Group"]
    covid_deaths = test_df["covid_19_deaths"]
    population = test_df[
        "Population"] / 1000000  # makes the population in units of millions
    n = len(test_df["Age_Group"].unique()
            )  # should decrease by 1 after proper age filtering

    age_data = pd.get_dummies(test_df["Age_Group"]).drop("Under 1 year",
                                                         axis=1)
    sex_data = pd.get_dummies(test_df["Sex"], drop_first=True)

    # run the model

    with pm.Model() as model:

        # spike and slab prior
        tau = pm.InverseGamma('tau', alpha=20, beta=20)
        xi = pm.Bernoulli('xi', p=spike, shape=len(mandates.columns))
        beta_mandates = pm.MvNormal('beta_mandate',
                                    mu=0,
                                    cov=tau * np.eye(len(mandates.columns)),
                                    shape=len(mandates.columns))

        # age prior
        mu_age_mean = np.linspace(-5, 5, len(age_data.columns))
        cov = pm.HalfNormal('cov', sigma=2)
        mu_age = pm.MvNormal('mu_age',
                             mu=mu_age_mean,
                             cov=np.identity(len(age_data.columns)),
                             shape=(1, 10))
        beta_age = pm.MvNormal('beta_age',
                               mu=mu_age,
                               cov=(cov**2) * np.identity(10),
                               shape=(1, 10))

        # sex prior
        mu_sex = pm.Normal('mu_sex', mu=0, sigma=1)
        sigma_sex = pm.HalfNormal('sigma_sex', sigma=2)
        beta_sex = pm.Normal('beta_sex', mu=mu_sex, sigma=sigma_sex)

        # intercept prior
        mu_intercept = pm.Normal('mu_intercept', mu=0, sigma=1)
        sigma_intercept = pm.HalfNormal('sigma_intercept', sigma=2)
        beta_intercept = pm.Normal('beta_intercept',
                                   mu=mu_intercept,
                                   sigma=sigma_intercept)

        # mean setup for likelihood
        mandates = np.array(mandates).astype(theano.config.floatX)
        population = np.array(population).astype(theano.config.floatX)
        sex = np.array(sex_data).astype(theano.config.floatX)
        age = np.array(age_data).astype(theano.config.floatX)
        w_mandates = theano.shared(mandates, 'w_mandate')
        w_sex = theano.shared(sex, 'w_sex')
        w_age = theano.shared(age, 'w_age')
        mean = beta_intercept + pm.math.matrix_dot(w_mandates, xi*beta_mandates) \
                            + pm.math.matrix_dot(w_sex, beta_sex).T \
                            + pm.math.matrix_dot(w_age, beta_age.T).T

        # likelihood
        obs = pm.ZeroInflatedPoisson('y_obs',
                                     psi=zero_inf,
                                     theta=population * tt.exp(mean),
                                     observed=covid_deaths)
        # obs = pm.Normal('crap', mu=mean, sigma=3, observed=covid_deaths)

        # sample from posterior
        trace = pm.sample(n_samples,
                          tune=n_samples,
                          nuts={'target_accept': 0.98})

    # posterior hdis
    mandates = test_df.iloc[:, -4:]
    x = az.summary(trace, var_names=["beta_mandate"], hdi_prob=hdi_prob)
    x.index = mandates.columns
    x.to_csv('../images/posteriors/mandate_' + interp_type + '_' + binary_str +
             '_' + 'summary.csv')
    x = az.summary(trace, var_names=["beta_sex"], hdi_prob=hdi_prob)
    x.index = sex_data.columns
    x.to_csv('../images/posteriors/sex_' + interp_type + '_' + binary_str +
             '_' + 'summary.csv')
    x = az.summary(trace, var_names=["beta_age"], hdi_prob=hdi_prob)
    x.index = age_data.columns
    x.to_csv('../images/posteriors/age_' + interp_type + '_' + binary_str +
             '_' + 'summary.csv')
    x = az.summary(trace, var_names=["beta_intercept"], hdi_prob=hdi_prob)
    x.to_csv('../images/posteriors/intercept_' + interp_type + '_' +
             binary_str + '_' + 'summary.csv')

    # posterior distributions
    ax = az.plot_forest(trace,
                        'ridgeplot',
                        var_names=["beta_intercept"],
                        combined=True,
                        hdi_prob=0.99999)
    ax[0].set_title(r'Posterior Distribution of $\beta_0$')
    plt.savefig('../images/posteriors/intercept_posteriors_' + interp_type +
                '_' + binary_str + '.png')

    ax = az.plot_forest(trace,
                        'ridgeplot',
                        var_names=["beta_age"],
                        combined=True,
                        hdi_prob=0.99999)
    ax[0].set_yticklabels(reversed(age_data.columns))
    ax[0].set_title(r'Posterior Distribution of $\beta_{age}$')
    plt.savefig('../images/posteriors/age_posteriors_' + interp_type + '_' +
                binary_str + '.png')

    ax = az.plot_forest(trace,
                        'ridgeplot',
                        var_names=["beta_sex"],
                        combined=True,
                        hdi_prob=0.99999)
    ax[0].set_yticklabels(reversed(sex_data.columns))
    ax[0].set_title(r'Posterior Distribution of $\beta_{sex}$')
    plt.savefig('../images/posteriors/sex_posteriors_' + interp_type + '_' +
                binary_str + '.png')

    ax = az.plot_forest(trace,
                        'ridgeplot',
                        var_names=["beta_mandate"],
                        combined=True,
                        hdi_prob=0.99999)
    ax[0].set_yticklabels(reversed(mandates.columns))
    ax[0].set_title(r'Posterior Distribution of $\beta_{mandate}$')
    plt.savefig('../images/posteriors/mandate_posteriors_' + interp_type +
                '_' + binary_str + '.png')

    # ESS Plots
    ax = az.plot_ess(trace, var_names=["beta_intercept"])
    ax.set_title(r'$\beta_0$  ESS')
    plt.savefig('../images/ess/' + interp_type + '_' + binary_str +
                '_interceptESS.png')

    ax = az.plot_ess(trace, var_names=["beta_age"])
    ax[0, 0].set_title(r'$\beta_{age[1-4]}$  ESS', fontsize=18)
    ax[0, 1].set_title(r'$\beta_{age[15-24]}$  ESS', fontsize=18)
    ax[0, 2].set_title(r'$\beta_{age[25-34]}$  ESS', fontsize=18)
    ax[1, 0].set_title(r'$\beta_{age[35-44]}$  ESS', fontsize=18)
    ax[1, 1].set_title(r'$\beta_{age[45-54]}$  ESS', fontsize=18)
    ax[1, 2].set_title(r'$\beta_{age[5-14]}$  ESS', fontsize=18)
    ax[2, 0].set_title(r'$\beta_{age[55-64]}$  ESS', fontsize=18)
    ax[2, 1].set_title(r'$\beta_{age[65-74]}$  ESS', fontsize=18)
    ax[2, 2].set_title(r'$\beta_{age[75-84]}$  ESS', fontsize=18)
    ax[3, 0].set_title(r'$\beta_{age[85+]}$  ESS', fontsize=18)
    plt.savefig('../images/ess/' + interp_type + '_' + binary_str +
                '_ageESS.png')

    ax = az.plot_ess(trace, var_names=["beta_sex"])
    ax.set_title(r'$\beta_{sex}$  ESS')
    plt.savefig('../images/ess/' + interp_type + '_' + binary_str +
                '_sexESS.png')

    ax = az.plot_ess(trace, var_names=["beta_mandate"])
    ax[0].set_title(r'$\beta_{mandate[April]}$  ESS', fontsize=18)
    ax[1].set_title(r'$\beta_{mandate[May]}$  ESS', fontsize=18)
    ax[2].set_title(r'$\beta_{mandate[June]}$  ESS', fontsize=18)
    ax[3].set_title(r'$\beta_{mandate[July]}$  ESS', fontsize=18)
    plt.savefig('../images/ess/' + interp_type + '_' + binary_str +
                '_mandateESS.png')

    # posterior predictive checking
    with model:
        ppc = pm.sample_posterior_predictive(trace, var_names=["y_obs"])
    az.plot_ppc(az.from_pymc3(posterior_predictive=ppc, model=model))
    plt.savefig('../images/posterior_predictive/' + interp_type + '_' +
                binary_str + '.png')

    # return trace so that user can work with posterior data directly
    return trace
Example #12
def conduct_bayesian(observations_file_path, mu_init, beta_init):
    df = pd.read_csv(observations_file_path)
    # Get list of unique damage state values:
    ds_list = df['DS Number'].unique()
    for ds in range(0, len(ds_list)):
        df_sub = df.loc[df['DS Number'] == ds_list[ds]]
        xj = np.array(df_sub['demand'])
        zj = np.array(df_sub['fail'])
        nj = np.array(df_sub['total'])
        mu_ds = mu_init[ds]
        beta_ds = beta_init[ds]
        with pm.Model() as model:
            # Set up the prior:
            mu = pm.Normal('mu', mu_ds, 2.71)
            beta = pm.Normal('beta', beta_ds, 0.03)

            # Define fragility function equation:
            def normal_cdf(mu, beta, xj):
                """Compute the lognormal CDF: the normal CDF evaluated at the log of demand."""
                return 0.5 * (1 + tt.erf(
                    (tt.log(xj) - mu) / (beta * tt.sqrt(2))))

            # Define likelihood:
            # like = pm.Binomial('like', p=p, observed=zj, n=nj)
            like = pm.Binomial('like',
                               p=normal_cdf(mu, beta, xj),
                               observed=zj,
                               n=nj)
            for RV in model.basic_RVs:
                print(RV.name, RV.logp(model.test_point))
            # Determine the posterior
            trace = pm.sample(2000, cores=1, return_inferencedata=True)
            # Posterior predictive checks are a great way to validate a model:
            # Generate data from the model using parameter draws from the posterior:
            ppc = pm.sample_posterior_predictive(
                trace, var_names=['mu', 'beta', 'like'])
        # Calculate failure probabilities using samples:
        im = np.arange(70, 200, 5)
        pf_ppc = []
        for i in range(0, len(ppc['mu'])):
            y = pf(im, ppc['mu'][i], ppc['beta'][i])
            pf_ppc.append(y)
        # Plot the HPD:
        _, ax = plt.subplots()
        az.plot_hdi(im,
                    pf_ppc,
                    fill_kwargs={
                        'alpha': 0.2,
                        'color': 'blue',
                        'label': 'bounds of prediction: 94% HPD'
                    })
        # Calculate and plot the mean outcome:
        pf_mean = pf(im, ppc['mu'].mean(), ppc['beta'].mean())
        ax.plot(im,
                pf_mean,
                label='mean of prediction',
                color='r',
                linestyle='dashed')
        # Plot the mean of the simulation-based fragility:
        pf_sim = pf(im, mu_ds, beta_ds)
        ax.plot(im, pf_sim, label='simulation-based', color='k')
        # Plot the observations:
        ax.scatter(xj, zj / nj, color='r', marker='^', label='observations')
        ax.legend()
        plt.show()
        # Looking at the difference between the prior of the parameters and updated distributions:
        new_mu_mean, new_mu_std = norm.fit(ppc['mu'])
        plt.hist(ppc['mu'], bins=25, density=True, alpha=0.4, color='b')
        xmin, xmax = plt.xlim()
        x = np.linspace(xmin, xmax, 100)
        p_prior = norm.pdf(x, mu_ds, 2.71)
        p_new = norm.pdf(x, new_mu_mean, new_mu_std)
        plt.plot(x, p_prior, 'k', linewidth=2, label='prior distribution')
        plt.plot(x,
                 p_new,
                 'r',
                 linewidth=2,
                 label='updated distribution',
                 linestyle='dashed')
        # Note az.plot_violin(trace, var_names=['mu']) can be helpful for seeing distribution of parameter values
        # Plot the posterior distributions of each RV
        fig, ax = plt.subplots()
        az.plot_trace(trace, chain_prop={'color': ['blue', 'red']})
        az.plot_posterior(trace)
        az.plot_forest(trace, var_names=['mu', 'beta'])
        plt.show()
        print(az.summary(trace))
Example #13
def main(args):
    print("Loading data...")
    teams, df = load_data()
    nt = len(teams)
    train = df[df["split"] == "train"]

    print("Starting inference...")
    with pm.Model() as model:
        # priors
        alpha = pm.Normal("alpha", mu=0, sigma=1)
        sd_att = pm.HalfStudentT("sd_att", nu=3, sigma=2.5)
        sd_def = pm.HalfStudentT("sd_def", nu=3, sigma=2.5)

        home = pm.Normal("home", mu=0, sigma=1)  # home advantage

        # team-specific model parameters
        attack = pm.Normal("attack", mu=0, sigma=sd_att, shape=nt)
        defend = pm.Normal("defend", mu=0, sigma=sd_def, shape=nt)

        # data
        home_id = pm.Data("home_data", train["Home_id"])
        away_id = pm.Data("away_data", train["Away_id"])

        # likelihood
        theta1 = tt.exp(alpha + home + attack[home_id] - defend[away_id])
        theta2 = tt.exp(alpha + attack[away_id] - defend[home_id])

        pm.Poisson("s1", mu=theta1, observed=train["score1"])
        pm.Poisson("s2", mu=theta2, observed=train["score2"])

    with model:
        fit = pm.sample(
            draws=args.num_samples,
            tune=args.num_warmup,
            chains=args.num_chains,
            cores=args.num_cores,
            random_seed=args.rng_seed,
        )

    print("Analyse posterior...")
    az.plot_forest(
        fit,
        var_names=("alpha", "home", "sd_att", "sd_def"),
        backend="bokeh",
    )

    az.plot_trace(
        fit,
        var_names=("alpha", "home", "sd_att", "sd_def"),
        backend="bokeh",
    )

    # Attack and defence
    quality = teams.copy()
    quality = quality.assign(
        attack=fit["attack"].mean(axis=0),
        attacksd=fit["attack"].std(axis=0),
        defend=fit["defend"].mean(axis=0),
        defendsd=fit["defend"].std(axis=0),
    )
    quality = quality.assign(
        attack_low=quality["attack"] - quality["attacksd"],
        attack_high=quality["attack"] + quality["attacksd"],
        defend_low=quality["defend"] - quality["defendsd"],
        defend_high=quality["defend"] + quality["defendsd"],
    )

    plot_quality(quality)

    # Predicted goals and table
    predict = df[df["split"] == "predict"]

    with model:
        pm.set_data({"home_data": predict["Home_id"]})
        pm.set_data({"away_data": predict["Away_id"]})

        predicted_score = pm.sample_posterior_predictive(
            fit, var_names=["s1", "s2"], random_seed=1)

    predicted_full = predict.copy()
    predicted_full = predicted_full.assign(
        score1=predicted_score["s1"].mean(axis=0).round(),
        score1error=predicted_score["s1"].std(axis=0),
        score2=predicted_score["s2"].mean(axis=0).round(),
        score2error=predicted_score["s2"].std(axis=0),
    )

    predicted_full = train.append(
        predicted_full.drop(columns=["score1error", "score2error"]))

    print(score_table(df))
    print(score_table(predicted_full))
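Example #14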
# models. Here we'll just use the [arviz](https://arviz-
# devs.github.io/arviz/index.html) package to explore the credible intervals
# of each of the covariance and variance parameters, although it makes
# available a much wider set of tools for analysis.

import arviz as az

# Collect the observation error covariance parameters
az_obs_cov = az.convert_to_inference_data({
    ('Var[%s]' % mod.endog_names[i] if i == j else 'Cov[%s, %s]' %
     (mod.endog_names[i], mod.endog_names[j])): store_obs_cov[nburn + 1:, i, j]
    for i in range(mod.k_endog) for j in range(i, mod.k_endog)
})

# Plot the credible intervals
az.plot_forest(az_obs_cov, figsize=(8, 7))

# Collect the state innovation variance parameters
az_state_cov = az.convert_to_inference_data({
    r'$\sigma^2$[%s]' % mod.state_names[i]: store_state_cov[nburn + 1:, i]
    for i in range(mod.k_states)
})

# Plot the credible intervals
az.plot_forest(az_state_cov, figsize=(8, 7))

# ### Appendix: performance
#
# Finally, we run a few simple tests to compare the performance of the KFS
# and CFA simulation smoothers by using the `%timeit` Jupyter notebook
# magic.
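# A sketch of what that comparison might look like (an assumption, not the
# original benchmark: `mod` is the state space model from above, both smoothers
# come from `mod.simulation_smoother()`, and `%timeit` only runs in
# IPython/Jupyter):
#
# sim_kfs = mod.simulation_smoother()               # Kalman filter/smoother
# sim_cfa = mod.simulation_smoother(method='cfa')   # Cholesky factor algorithm
# %timeit sim_kfs.simulate()
# %timeit sim_cfa.simulate()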
Example #15
"""
Ridgeplot
=========

_thumb: .8, .5
"""
import matplotlib.pyplot as plt
import arviz as az

az.style.use("arviz-darkgrid")

rugby_data = az.load_arviz_data("rugby")
axes = az.plot_forest(
    rugby_data,
    kind="ridgeplot",
    var_names=["defs"],
    linewidth=4,
    combined=True,
    ridgeplot_overlap=1.5,
    colors="blue",
    figsize=(9, 4),
)
axes[0].set_title("Relative defensive strength\nof Six Nation rugby teams")

plt.show()
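Example #16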
def main(args):
    print("Loading data...")
    teams, df = load_data()
    train = df[df["split"] == "train"]

    print("Starting inference...")
    rng_key = random.PRNGKey(args.rng_seed)
    mcmc = run_inference(
        model,
        train["Home_id"].values,
        train["Away_id"].values,
        train["score1"].values,
        train["score2"].values,
        rng_key,
        args,
    )

    fit = az.from_numpyro(mcmc)

    print("Analyse posterior...")
    az.plot_forest(
        fit,
        var_names=("alpha", "home", "sd_att", "sd_def"),
        backend="bokeh",
    )

    az.plot_trace(
        fit,
        var_names=("alpha", "home", "sd_att", "sd_def"),
        backend="bokeh",
    )

    fit = mcmc.get_samples()

    # Attack and defence
    quality = teams.copy()
    quality = quality.assign(
        attack=fit["attack"].mean(axis=0),
        attacksd=fit["attack"].std(axis=0),
        defend=fit["defend"].mean(axis=0),
        defendsd=fit["defend"].std(axis=0),
    )
    quality = quality.assign(
        attack_low=quality["attack"] - quality["attacksd"],
        attack_high=quality["attack"] + quality["attacksd"],
        defend_low=quality["defend"] - quality["defendsd"],
        defend_high=quality["defend"] + quality["defendsd"],
    )

    plot_quality(quality)

    # Predicted goals and table
    predict = df[df["split"] == "predict"]

    predictive = Predictive(model, fit, return_sites=["s1", "s2"])

    predicted_score = predictive(
        random.PRNGKey(0),
        home_id=predict["Home_id"].values,
        away_id=predict["Away_id"].values,
    )

    predicted_full = predict.copy()
    predicted_full = predicted_full.assign(
        score1=predicted_score["s1"].mean(axis=0).round(),
        score1error=predicted_score["s1"].std(axis=0),
        score2=predicted_score["s2"].mean(axis=0).round(),
        score2error=predicted_score["s2"].std(axis=0),
    )

    predicted_full = train.append(
        predicted_full.drop(columns=["score1error", "score2error"]))

    print(score_table(df))
    print(score_table(predicted_full))
Example #17
    prior = pm.sample_prior_predictive(samples=30)
    posterior_1 = pm.sample()
    posterior_pred_1 = pm.sample_posterior_predictive(posterior_1)

pm.traceplot(posterior_1);

data = az.from_pymc3(trace=posterior_1,
                    prior=prior,
                    posterior_predictive=posterior_pred_1)
data

az.style.use('arviz-darkgrid')

fig, axes = az.plot_forest(data,
                           kind='ridgeplot',
                           combined=False,
                           ridgeplot_overlap=2,
                           colors='white',
                           figsize=(10, 3))
axes[0].set_title('model_1 posteriors parameters distributions');

df = pd.DataFrame()
df = df.assign(alpha=pd.Series(prior['alpha']),
               sigma=pd.Series(prior['sigma']),
               beta=pd.Series(prior['beta']))
df.head()

priors = pd.DataFrame()

for i in range(df.shape[0]):
    priors['prior_'+str(i)] = df.loc[i,'alpha'] + df.loc[i,'beta'] * d['area']
Example #18
beta_mean = np.mean(betas)
hyper_mean = alpha_mean / (alpha_mean + beta_mean)
print('hyper mean')
print(hyper_mean)
hyper_mean2 = np.mean(alphas / (alphas + betas))
print(hyper_mean2)

mle = G_samples / N_samples
pooled_mle = np.sum(G_samples) / np.sum(N_samples)

print('pooled mle')
print(pooled_mle)

axes = az.plot_forest(trace_h,
                      var_names='θ',
                      hdi_prob=0.95,
                      combined=False,
                      colors='cycle')
y_lims = axes[0].get_ylim()
#axes[0].vlines(post_hyper_mean, *y_lims)
pml.savefig('hbayes_binom_covid_forest.png', dpi=300)

fig, axs = plt.subplots(4, 1, figsize=(8, 8))
axs = np.reshape(axs, 4)
xs = np.arange(J)
ax = axs[0]
ax.bar(xs, G_samples)
ax.set_ylim(0, 5)
ax.set_title('number of cases (truncated at 5)')
ax = axs[1]
ax.bar(xs, N_samples)
Example #19
def main():
    parser = argparse.ArgumentParser(
        description='Train PMF on CSV-formatted count matrix')
    parser.add_argument(
        '-f', '--csv-file', nargs='?', type=str,
        help="Enter the CSV file"
    )
    parser.add_argument(
        '-e', '--epoch', nargs='?', type=int, default=300,
        help='Enter Epoch value: Default: 300'
    )
    parser.add_argument(
        '-d', '--dimension', nargs='?', type=int, default=2,
        help='Enter embedding dimension. Default: 2'
    )
    parser.add_argument(
        '-b', '--batch-size', nargs='?', type=int, default=5000,
        help='Enter batch size. Default: 5000'
    )

    parser.add_argument(
        '-lr', '--learning-rate', nargs='?', type=float, default=0.01,
        help='Enter float. Default: 0.01'
    )
    
    parser.add_argument(
        '-c', '--clip-value', nargs='?', type=float, default=3.,
        help='Gradient clip value. Default: 3.0'
    )

    parser.add_argument(
        '-lt', '--log-transform',
        help='Log-transform?', action='store_true'
    )

    parser.add_argument(
        '-rn', '--row-normalize',
        help='Row normalize based on counts?', action='store_true'
    )

    args = parser.parse_args(sys.argv[1:])
    if args.csv_file is None:
        sys.exit("You need to specify a csv file")
    elif not os.path.exists(args.csv_file):
        sys.exit("File doesn't exist")
    else:
        _FILENAME = args.csv_file

    _BATCH_SIZE = args.batch_size
    _LOG_TRANSFORM = args.log_transform
    _EPOCH_NUMBER = args.epoch
    _DIMENSION = args.dimension
    _LEARNING_RATE = args.learning_rate
    _ROW_NORMALIZE = args.row_normalize
    _CLIP_VALUE = args.clip_value

    with open(_FILENAME) as f:
        csv_file = csv.reader(f)
        columns = len(next(csv_file))

    csv_data0 = tf.data.experimental.CsvDataset(
        _FILENAME, [tf.float64]*columns)
    csv_data0 = csv_data0.enumerate()

    csv_data = csv_data0.map(
        lambda j, *x: {
            'indices': j,
            'counts': tf.squeeze(tf.stack(x, axis=-1))
        })

    # Iterate over the batches to accumulate column sums and the row count
    colsums = []
    N = 0
    for batch in iter(csv_data.batch(_BATCH_SIZE, drop_remainder=False)):
        colsums += [tf.reduce_sum(batch['counts'], axis=0, keepdims=True)]
        N += batch['counts'].shape[0]

    colsums = tf.add_n(colsums)
    colmeans = colsums/N
    rowmean = tf.reduce_sum(colmeans)

    if _ROW_NORMALIZE:
        csv_data = csv_data0.map(
            lambda j, *x: {
                'indices': j,
                'counts': tf.squeeze(tf.stack(x, axis=-1)),
                'normalization': tf.reduce_max([
                    tf.reduce_sum(x), 1.])/rowmean
            })

    csv_data_batched = csv_data.batch(_BATCH_SIZE, drop_remainder=True)
    csv_data_batched = csv_data_batched.prefetch(
        tf.data.experimental.AUTOTUNE)

    factor = PoissonMatrixFactorization(
        csv_data_batched, latent_dim=_DIMENSION, strategy=None,
        scale_columns=True, log_transform=_LOG_TRANSFORM,
        column_norms=colmeans,
        u_tau_scale=1.0/np.sqrt(columns*N),
        dtype=tf.float64)

    factor.calibrate_advi(
        num_epochs=_EPOCH_NUMBER,
        rel_tol=1e-4, clip_value=_CLIP_VALUE,
        learning_rate=_LEARNING_RATE)

    print("Saving the encoding matrix")

    filename = f"{_FILENAME}_{_DIMENSION}D_encoding"
    filename += f"_lt_{_LOG_TRANSFORM}_rn_{_ROW_NORMALIZE}.csv"
    with open(filename, "w") as f:
        writer = csv.writer(f)
        encoding = factor.encoding_matrix().numpy().T
        for row in range(encoding.shape[0]):
            writer.writerow(encoding[row, :])

    print("Saving the trained model object")
    filename = f"{_FILENAME}_{_DIMENSION}D_model"
    filename += f"_lt_{_LOG_TRANSFORM}_rn_{_ROW_NORMALIZE}.pkl"
    factor.save(filename)

    print("Saving figure with the encodings")

    fig, ax = plt.subplots(1, 2, figsize=(14, 8))
    D = factor.feature_dim
    pcm = ax[0].imshow(
        factor.encoding_matrix().numpy()[::-1, :],
        vmin=0, cmap="Blues")
    ax[0].set_yticks(np.arange(factor.feature_dim))
    ax[0].set_yticklabels(np.arange(factor.feature_dim))
    ax[0].set_ylabel("item")
    ax[0].set_xlabel("factor dimension")
    ax[0].set_xticks(np.arange(_DIMENSION))
    ax[0].set_xticklabels(np.arange(_DIMENSION))

    surrogate_samples = factor.surrogate_distribution.sample(250)
    if 's' in surrogate_samples.keys():
        weights = surrogate_samples['s'] / \
            tf.reduce_sum(surrogate_samples['s'], -2, keepdims=True)
        intercept_data = az.convert_to_inference_data(
            {
                r"":
                    (
                        tf.squeeze(surrogate_samples['w'])
                        * weights[:, -1, :]
                        * factor.eta_i
                    ).numpy().T})
    else:
        intercept_data = az.convert_to_inference_data(
            {
                r"":
                    (
                        tf.squeeze(surrogate_samples['w'])
                        * factor.eta_i).numpy().T})

    fig.colorbar(pcm, ax=ax[0], orientation="vertical")
    az.plot_forest(intercept_data, ax=ax[1])
    ax[1].set_xlabel("background rate")
    ax[1].set_ylim((-0.014, .466))
    ax[1].set_title("65% and 95% CI")
    ax[1].axvline(1.0, linestyle='dashed', color="black")
    filename = f"{_FILENAME}_{_DIMENSION}D_encoding_"
    filename += f"lt_{_LOG_TRANSFORM}_rn_{_ROW_NORMALIZE}.pdf"
    plt.savefig(
        filename,
        bbox_inches='tight')

    print("Generating representations")
    filename = f"{_FILENAME}_{_DIMENSION}D_representation"
    filename += f"_lt_{_LOG_TRANSFORM}_rn_{_ROW_NORMALIZE}.csv"

    csv_data_batched = csv_data.batch(_BATCH_SIZE, drop_remainder=False)
    with open(filename, 'w') as f:
        writer = csv.writer(f)
        for record in iter(csv_data_batched):
            z = factor.encode(tf.cast(record['counts'], factor.dtype)).numpy()
            if _ROW_NORMALIZE:
                z *= (record['normalization'].numpy())[:, np.newaxis]
            ind = record['indices'].numpy()
            for row in range(z.shape[0]):
                writer.writerow(np.concatenate([[ind[row]], z[row, :]]))
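Example #20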
    # set the priors on scale and df
    sigma = pm.HalfCauchy("sigma", 5)
    df = pm.Exponential("df", 1 / 30)
    # specify the likelihood of the data
    y_obs = pm.StudentT("y_obs",
                        mu=alpha_temp[idx] + beta[idx] * x_centered,
                        sd=sigma,
                        nu=df,
                        observed=y_m)
    # inference step
    trace_unp = pm.sample(2000)

# -------------- analyse the posterior -------------------------------------- #

with unpooled_model:
    az.plot_forest(trace_unp, var_names=["alpha", "beta"], combined=True)

# ---------------- specify a hierarchical probabilistic model ----------------------------- #

with pm.Model() as hierarchical_model:
    # specify a set of hyper-priors
    alpha_m_temp = pm.Normal("alpha_m_temp", mu=0, sd=10)
    alpha_s_temp = pm.HalfNormal("alpha_s_temp", sd=10)
    beta_m = pm.Normal("beta_m", mu=0, sd=10)
    beta_s = pm.HalfNormal("beta_s", sd=10)
    # set the priors on parameters
    alpha_temp = pm.Normal("alpha_temp",
                           mu=alpha_m_temp,
                           sd=alpha_s_temp,
                           shape=M)
    beta = pm.Normal("beta", mu=beta_m, sd=beta_s, shape=M)
Example #21
    β = pm.Normal('β', mu=0, sd=10, shape=M)
    ϵ = pm.HalfCauchy('ϵ', 5)
    ν = pm.Exponential('ν', 1/30)

    y_pred = pm.StudentT('y_pred', mu=α_tmp[idx] + β[idx] * x_centered,
                         sd=ϵ, nu=ν, observed=y_m)

    α = pm.Deterministic('α', α_tmp - β * x_m.mean())

    trace_up = pm.sample(2000)


# In[28]:


az.plot_forest(trace_up, var_names=['α', 'β'], combined=True)
plt.savefig('B11197_03_14.png', dpi=300)


#
#  <img src='B11197_03_15.png' width="700">

# In[29]:


with pm.Model() as hierarchical_model:
    # hyper-priors
    α_μ_tmp = pm.Normal('α_μ_tmp', mu=0, sd=10)
    α_σ_tmp = pm.HalfNormal('α_σ_tmp', 10)
    β_μ = pm.Normal('β_μ', mu=0, sd=10)
    β_σ = pm.HalfNormal('β_σ', sd=10)
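Example #22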
with pm.Model() as cs_nh:
    mu = pm.Normal('mu', mu=0, sd=10, shape=groups)
    sigma = pm.HalfNormal('sigma', sd=10, shape=groups)

    y = pm.Normal('y', mu=mu[idx], sd=sigma[idx], observed=diff)

    trace_cs_nh = pm.sample(1000)

# In[37]:
with pm.Model() as cs_h:
    # hyper_priors
    mu_mu = pm.Normal('mu_mu', mu=0, sd=10)
    sigma_mu = pm.HalfNormal('sigma_mu', 10)

    # priors
    mu = pm.Normal('mu', mu=mu_mu, sd=sigma_mu, shape=groups)
    sigma = pm.HalfNormal('sigma', sd=10, shape=groups)

    y = pm.Normal('y', mu=mu[idx], sd=sigma[idx], observed=diff)

    trace_cs_h = pm.sample(1000)

# In[38]:

_, axes = az.plot_forest([trace_cs_nh, trace_cs_h],
                         model_names=['n_h', 'h'],
                         var_names='mu',
                         combined=False,
                         colors='cycle')
y_lims = axes[0].get_ylim()
axes[0].vlines(trace_cs_h['mu_mu'].mean(), *y_lims)

plt.savefig('B11197_02_22.png', dpi=300)
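Example #23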
ax.axvline(height.mean())
ax.set(title='Posterior predictive of the mean',
       xlabel='mean(x)',
       ylabel='Frequency')

_, ax = plt.subplots(figsize=(12, 6))
ax.hist([h.mean() for h in no_collinear_ppc['h']])
ax.axvline(height.mean())
ax.set(title='Posterior predictive of the mean',
       xlabel='mean(x)',
       ylabel='Frequency')

# Plot posterior density for models
#%%
az.plot_density([trace_collinear, trace_no_collinear],
                data_labels=['collinear', 'no collinear'],
                var_names=['br'],
                shade=0.1)

# Compare plots
#%%
az.plot_forest([trace_collinear, trace_no_collinear],
               model_names=['collinear', 'no collinear'],
               var_names=['br', 'a', 'sigma'])

#%%
az.plot_forest([trace_collinear, trace_no_collinear],
               model_names=['collinear', 'no collinear'],
               var_names=['br', 'a', 'sigma'],
               kind='ridgeplot')
Example #24
    alpha = pm.Normal('alpha', mu=0, sd=10)
    beta = pm.Normal('beta', mu=0, sd=2, shape=len(x_n))

    mu = alpha + pm.math.dot(x_1, beta)
    theta = pm.Deterministic('theta', 1 / (1 + pm.math.exp(-mu)))
    bd = pm.Deterministic('bd',
                          -alpha / beta[1] - beta[0] / beta[1] * x_1[:, 0])

    yl = pm.Bernoulli('yl', p=theta, observed=y_1)

    trace_1 = pm.sample(2000)

# In[13]:

varnames = ['alpha', 'beta']
az.plot_forest(trace_1, var_names=varnames)

# In[14]:

idx = np.argsort(x_1[:, 0])
bd = trace_1['bd'].mean(0)[idx]
plt.scatter(x_1[:, 0], x_1[:, 1], c=[f'C{x}' for x in y_1])
plt.plot(x_1[:, 0][idx], bd, color='k')

az.plot_hpd(x_1[:, 0], trace_1['bd'], color='k')

plt.xlabel(x_n[0])
plt.ylabel(x_n[1])
plt.savefig('B11197_04_05.png', dpi=300)

# ## Interpreting the coefficients of a logistic regression
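Example #25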
    mu = pm.MvNormal("ab_dept",
                     mu=tt.stack([a, bm]),
                     chol=chol,
                     shape=(Ndept, 2))

    a_dept = pm.Deterministic("a_dept", mu[:, 0])
    bm_dept = pm.Deterministic("bm_dept", mu[:, 1])

    p = pm.math.invlogit(mu[Dept_id, 0] + mu[Dept_id, 1] * d_ad["male"])
    admit = pm.Binomial("admit", p=p, n=d_ad.applications, observed=d_ad.admit)

    trace_13_3 = pm.sample(5000, tune=1000)

# %%
az.plot_forest(trace_13_3,
               var_names=["bm_dept", "a_dept"],
               credible_interval=0.89)

# %%
with pm.Model() as m_13_4:
    a = pm.Normal("a", 0, 10)
    sigma_dept = pm.HalfCauchy("sigma_dept", 2)
    a_dept = pm.Normal("a_dept", a, sigma_dept, shape=Ndept)
    p = pm.math.invlogit(a_dept[Dept_id])
    admit = pm.Binomial("admit", p=p, n=d_ad.applications, observed=d_ad.admit)

    trace_13_4 = pm.sample(4500, tune=500)

comp_df = az.compare({
    "m13_2": trace_13_2,
    "m13_3": trace_13_3,
Example #26
az.summary(trace_h)



J = len(N_samples)
samples = trace_h['θ']
post_mean = np.mean(samples, axis=0)
post_hyper_mean = trace_h['μ'].mean()

mle = G_samples / N_samples
pooled_mle = np.sum(G_samples) / np.sum(N_samples)


axes = az.plot_forest(
    trace_h, var_names='θ', combined=False, colors='cycle')
y_lims = axes[0].get_ylim()
axes[0].vlines(post_hyper_mean, *y_lims)

axes = az.plot_forest(
    trace_h, var_names='θ', combined=True, colors='cycle',
    kind='ridgeplot')



# Show posterior over hparans
fig, ax = plt.subplots(1, 1)
x = np.linspace(0, 1, 100)
for i in np.random.randint(0, len(trace_h), size=100):
    u = trace_h['μ'][i]
    k = trace_h['κ'][i]
Example #27
    def make_plots(self,
                   run_params,
                   run_data=None,
                   energy_data=None,
                   runs_np=True,
                   out_dir=None):
        """Create trace + KDE plots of lattice observables and energy data."""
        type_str = 'figures_np' if runs_np else 'figures_tf'
        figs_dir = os.path.join(self._log_dir, type_str)
        fig_dir = os.path.join(figs_dir, run_params['run_str'])
        io.check_else_make_dir(fig_dir)

        dataset = None
        energy_dataset = None
        try:
            fname, title_str = self._plot_setup(run_params)
        except FileNotFoundError:
            return dataset, energy_dataset

        tp_fname = f'{fname}_traceplot'
        pp_fname = f'{fname}_posterior'
        rp_fname = f'{fname}_ridgeplot'

        dataset = self.build_dataset(run_data, run_params)

        tp_out_file = os.path.join(fig_dir, f'{tp_fname}.pdf')
        pp_out_file = os.path.join(fig_dir, f'{pp_fname}.pdf')

        var_names = ['tunneling_rate', 'plaqs_diffs']
        if hasattr(dataset, 'dx'):
            var_names.append('dx')
        var_names.extend(['accept_prob', 'charges_squared', 'charges'])

        tp_out_file1 = None
        pp_out_file1 = None
        if out_dir is not None:
            io.check_else_make_dir(out_dir)
            tp_out_file1 = os.path.join(out_dir, f'{tp_fname}.pdf')
            pp_out_file1 = os.path.join(out_dir, f'{pp_fname}.pdf')

        ###################################################
        # Create traceplot + posterior plot of observables
        ###################################################
        self._plot_trace(dataset,
                         tp_out_file,
                         var_names=var_names,
                         out_file1=tp_out_file1)

        self._plot_posterior(dataset,
                             pp_out_file,
                             var_names=var_names,
                             out_file1=pp_out_file1)

        # * * * * * * * * * * * * * * * * *
        # Create ridgeplot of plaq diffs  *
        # * * * * * * * * * * * * * * * * *
        rp_out_file = os.path.join(fig_dir, f'{rp_fname}.pdf')
        _ = az.plot_forest(dataset,
                           kind='ridgeplot',
                           var_names=['plaqs_diffs'],
                           ridgeplot_alpha=0.4,
                           ridgeplot_overlap=0.1,
                           combined=False)
        fig = plt.gcf()
        fig.suptitle(title_str, fontsize='x-large', y=1.025)
        self._savefig(fig, rp_out_file)
        if out_dir is not None:
            rp_out_file1 = os.path.join(out_dir, f'{rp_fname}.pdf')
            self._savefig(fig, rp_out_file1)

        # * * * * * * * * * * * * * * * * * * * * * * * * * *
        # Create traceplot + posterior plot of energy data  *
        # * * * * * * * * * * * * * * * * * * * * * * * * * *
        if energy_data is not None:
            energy_dataset = self.energy_plots(energy_data,
                                               run_params,
                                               fname,
                                               out_dir=out_dir)

        return dataset, energy_dataset
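Example #28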
def main(args):
    print("Loading data...")
    teams, df = load_data()
    train = df[df["split"] == "train"]
    nt = len(teams)

    print("Starting inference...")
    mcmc = run_inference(
        num_chains=args.num_chains,
        num_results=args.num_samples,
        num_burnin_steps=args.num_warmup,
        nt=nt,
    )

    samples = dict(
        zip(
            ["alpha", "home", "sd_att", "sd_def", "attack", "defend"],
            [np.swapaxes(sample, 0, 1) for sample in mcmc],
        )
    )

    fit = az.from_dict(samples)

    print("Analyse posterior...")
    az.plot_forest(
        fit,
        var_names=("alpha", "home", "sd_att", "sd_def"),
        backend="bokeh",
    )

    az.plot_trace(
        fit,
        var_names=("alpha", "home", "sd_att", "sd_def"),
        backend="bokeh",
    )

    # Attack and defence
    quality = teams.copy()
    quality = quality.assign(
        attack=samples["attack"].mean(axis=(0, 1)),
        attacksd=samples["attack"].std(axis=(0, 1)),
        defend=samples["defend"].mean(axis=(0, 1)),
        defendsd=samples["defend"].std(axis=(0, 1)),
    )
    quality = quality.assign(
        attack_low=quality["attack"] - quality["attacksd"],
        attack_high=quality["attack"] + quality["attacksd"],
        defend_low=quality["defend"] - quality["defendsd"],
        defend_high=quality["defend"] + quality["defendsd"],
    )

    plot_quality(quality)

    # Predicted goals and table
    predict = df[df["split"] == "predict"]

    theta1 = (
        samples["alpha"].flatten()[..., np.newaxis]
        + samples["home"].flatten()[..., np.newaxis]
        + tf.gather(
            samples["attack"].reshape(-1, samples["attack"].shape[-1]),
            predict["Home_id"],
            axis=-1,
        )
        - tf.gather(
            samples["defend"].reshape(-1, samples["defend"].shape[-1]),
            predict["Away_id"],
            axis=-1,
        )
    )

    theta2 = (
        samples["alpha"].flatten()[..., np.newaxis]
        + tf.gather(
            samples["attack"].reshape(-1, samples["attack"].shape[-1]),
            predict["Away_id"],
            axis=-1,
        )
        - tf.gather(
            samples["defend"].reshape(-1, samples["defend"].shape[-1]),
            predict["Home_id"],
            axis=-1,
        )
    )

    s1 = np.array(tfd.Poisson(log_rate=theta1).sample())
    s2 = np.array(tfd.Poisson(log_rate=theta2).sample())

    predicted_full = predict.copy()
    predicted_full = predicted_full.assign(
        score1=s1.mean(axis=0).round(),
        score1error=s1.std(axis=0),
        score2=s2.mean(axis=0).round(),
        score2error=s2.std(axis=0),
    )

    predicted_full = train.append(
        predicted_full.drop(columns=["score1error", "score2error"])
    )

    print(score_table(df))
    print(score_table(predicted_full))
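Example #29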
from collections import defaultdict
import arviz as az

#https://github.com/probml/pmtk3/blob/master/demos/cancerRatesEb.m

data_y = np.array([0, 0, 2, 0, 1, 1, 0, 2, 1, 3, 0, 1, 1, 1, 54, 0, 0, 1, 3, 0])
data_n = np.array([1083, 855, 3461, 657, 1208, 1025, 527, 1668, 583, 582, 917, 857,
                   680, 917, 53637, 874, 395, 581, 588, 383])
N = len(data_n)

# We put a prior on the mean (mu) and precision (kappa) of the Beta distribution,
# instead of on the alpha and beta parameters
with pm.Model() as model_h:
    mu = pm.Beta('mu', 1., 1.)
    kappa = pm.HalfNormal('kappa', 500)
    alpha = pm.Deterministic('alpha', mu*kappa)
    beta = pm.Deterministic('beta', (1.0-mu)*kappa)
    theta = pm.Beta('theta', alpha=alpha, beta=beta, shape=N)
    y = pm.Binomial('y', p=theta, observed=data_y, n=data_n)



np.random.seed(0)
with model_h:
  trace_h = pm.sample(1000, chains=4)
  
az.summary(trace_h).round(4)

az.plot_forest(trace_h, var_names=["theta"], combined=True, credible_interval=0.95);

az.plot_forest(trace_h, var_names=["theta"], combined=True, kind='ridgeplot');
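Example #30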
#https://github.com/probml/pmtk3/blob/master/demos/cancerRatesEb.m

data_y = np.array(
    [0, 0, 2, 0, 1, 1, 0, 2, 1, 3, 0, 1, 1, 1, 54, 0, 0, 1, 3, 0])
data_n = np.array([
    1083, 855, 3461, 657, 1208, 1025, 527, 1668, 583, 582, 917, 857, 680, 917,
    53637, 874, 395, 581, 588, 383
])
N = len(data_n)

# We put a prior on the mean (mu) and precision (kappa) of the Beta distribution,
# instead of on the alpha and beta parameters
with pm.Model() as model_h:
    mu = pm.Beta('mu', 1., 1.)
    kappa = pm.HalfNormal('kappa', 500)
    alpha = pm.Deterministic('alpha', mu * kappa)
    beta = pm.Deterministic('beta', (1.0 - mu) * kappa)
    theta = pm.Beta('theta', alpha=alpha, beta=beta, shape=N)
    y = pm.Binomial('y', p=theta, observed=data_y, n=data_n)

np.random.seed(0)
with model_h:
    trace_h = pm.sample(1000, chains=2, cores=1)

az.summary(trace_h).round(4)

az.plot_forest(trace_h, var_names=["theta"], combined=True, hdi_prob=0.95)

az.plot_forest(trace_h, var_names=["theta"], combined=True, kind='ridgeplot')

plt.show()