Exemple #1
0
def plot_ppc_and_score(trace, data, ax=None, title='PPC', paras=None):
    """Plot a posterior predictive check against observed proportions.

    Posterior predictive samples of ``y`` are averaged within every
    (SOA, PROBE_SALIENT) cell and drawn as 95% HDI bands plus their means.
    The observed mean of ``PROBE_FIRST_RESPONSE`` per cell is overlaid with
    error bars, and the LOO score is written onto the axes.

    Parameters
    ----------
    trace
        Posterior samples; passed to ``pm.sample_posterior_predictive``,
        ``az.loo`` and ``az.summary``.
    data : pandas.DataFrame
        Must provide the columns ``SOA_IN_FRAMES``, ``SOA_IN_MS``,
        ``PROBE_SALIENT`` (compared against 0 and 1) and
        ``PROBE_FIRST_RESPONSE``.
    ax : matplotlib axes, optional
        Axes to draw on; a new figure is created when omitted.
    title : str
        Axes title.
    paras : iterable of str, optional
        Variable names whose posterior mean and 95% HDI are printed on the
        axes. Each variable may contribute at most two summary rows because
        the text colour is looked up in a two-element list.

    Notes
    -----
    ``prop_ci`` is defined elsewhere; presumably it returns the lower and
    upper confidence bounds of a proportion -- confirm against its definition.
    """
    # Sample PPC
    ppc_trace = pm.sample_posterior_predictive(trace=trace, var_names=['y'])

    # Calculate LOO score
    loo = az.loo(trace).loo
    loo_text = "LOO = %.2f" % loo

    # Aggregate binary responses: one column per (SOA, salience) cell,
    # ordered by SOA and then non-salient (0) before salient (1).
    new_trace = []
    for soa in sorted(set(data.SOA_IN_FRAMES)):
        at_soa = data.SOA_IN_FRAMES == soa
        for salient in (0, 1):
            cell = at_soa & (data.PROBE_SALIENT == salient)
            new_trace.append(ppc_trace['y'][:, cell].mean(axis=1))
    ppc_trace = {'y': np.array(new_trace).T}

    # Prepare axes if none provided
    if ax is None:
        _, ax = plt.subplots()

    # Get SOAs and condition mask from data
    cell_keys = ['SOA_IN_MS', 'PROBE_SALIENT']
    SOAs = sorted(set(data['SOA_IN_MS']))
    cond = data.groupby(cell_keys)['PROBE_SALIENT'].min().values

    # Plot the predictive bands and their means for both conditions
    for value, colour in ((0, 'k'), (1, 'g')):
        az.plot_hdi(y=ppc_trace['y'][:, cond == value], x=SOAs, color=colour,
                    ax=ax, hdi_prob=0.95, fill_kwargs={'alpha': 0.23})
    for value, colour in ((0, 'k'), (1, 'g')):
        ax.plot(SOAs, np.mean(ppc_trace['y'][:, cond == value], axis=0),
                color=colour)

    # Select the response column *before* aggregating: aggregating the whole
    # frame fails on non-numeric columns in recent pandas and does needless work.
    responses = data.groupby(cell_keys)['PROBE_FIRST_RESPONSE']
    pf_mean = responses.mean()
    pf_count = responses.sum()
    pf_obs = responses.count()
    pf_ci = abs(np.array(prop_ci(pf_count.values, pf_obs.values)) - pf_mean.values)

    # Observed proportions; error bars are nudged +/-0.5 on x so the two
    # conditions do not overlap.
    ax.plot(SOAs, pf_mean.values[::2], 'k.')
    ax.errorbar(np.array(SOAs) - 0.5, pf_mean.values[::2],
                pf_ci[:, ::2], fmt='none', color='k', alpha=0.5)
    ax.plot(SOAs, pf_mean.values[1::2], 'g.')
    ax.errorbar(np.array(SOAs) + 0.5, pf_mean.values[1::2],
                pf_ci[:, 1::2], fmt='none', color='g', alpha=0.5)
    ax.axvline(0, linestyle='dashed')
    ax.axhline(0.5, linestyle='dashed')
    ax.text(-20, 0, loo_text)

    if paras is not None:
        for i, varname in enumerate(paras):
            stats = az.summary(trace, var_names=[varname], hdi_prob=.95)
            lower = stats['hdi_2.5%']
            upper = stats['hdi_97.5%']
            for j, s in enumerate(stats['mean']):
                text = r'$' + varname + r'$: %.2f [%.2f, %.2f]'
                # The summary is indexed by variable label, so rows have to be
                # addressed by position explicitly.
                text = text % (s, lower.iloc[j], upper.iloc[j])
                posx, posy = .1 + .5 - (1 - j) * .5, 0.95 - (.05 * i) - ((1 - j) * .5)
                ax.text(posx, posy, text, transform=ax.transAxes, color=['k', 'g'][j])
    ax.set_title(title)
Exemple #2
0
def make_plot(trace):
    """Draw the training data, the mean logistic curve and the decision boundary.

    Reads ``trace['θ']`` and ``trace['bd']``; the predictor ``x_c`` and the
    helper ``plot_training_data`` come from the enclosing module.
    """
    plot_training_data()

    # Logistic curve: posterior mean of θ in ascending x order, plus its HDI band.
    theta_samples = trace['θ']
    order = np.argsort(x_c)
    plt.plot(x_c[order], theta_samples.mean(axis=0)[order], color='C2', lw=3)
    az.plot_hdi(x_c, theta_samples, color='C2')

    # Decision boundary: posterior mean as a vertical line, HDI as a shaded strip.
    boundary_samples = trace['bd']
    plt.vlines(boundary_samples.mean(), 0, 1, color='k')
    boundary_hdi = az.hdi(boundary_samples)
    plt.fill_betweenx([0, 1],
                      boundary_hdi[0],
                      boundary_hdi[1],
                      color='k',
                      alpha=0.5)
Exemple #3
0
def conduct_bayesian(observations_file_path, mu_init, beta_init):
    """Run a Bayesian update of a fragility curve for every damage state in a CSV.

    For each unique value of the ``DS Number`` column a PyMC3 model with Normal
    priors on ``mu`` and ``beta`` and a Binomial likelihood is sampled, and
    diagnostic figures plus a summary table are shown/printed.

    :param observations_file_path: Path to a CSV file with the columns
        ``DS Number``, ``demand``, ``fail`` and ``total``.
    :param mu_init: Sequence of prior means for ``mu``, indexed by the position
        of the damage state in order of first appearance in the file.
    :param beta_init: Sequence of prior means for ``beta``, indexed the same way.
    :return: None. Results are plotted and printed only.

    ``pf`` is defined elsewhere in this file; presumably it evaluates the
    fragility curve on a grid of intensity measures -- confirm there.
    """
    # Prior standard deviations, named once so the model and the prior plot
    # below cannot drift apart.
    mu_prior_sd = 2.71
    beta_prior_sd = 0.03

    df = pd.read_csv(observations_file_path)
    # Get list of unique damage state values:
    ds_list = df['DS Number'].unique()
    for ds, ds_value in enumerate(ds_list):
        df_sub = df.loc[df['DS Number'] == ds_value]
        xj = np.array(df_sub['demand'])
        zj = np.array(df_sub['fail'])
        nj = np.array(df_sub['total'])
        mu_ds = mu_init[ds]
        beta_ds = beta_init[ds]
        with pm.Model() as model:
            # Set up the prior:
            mu = pm.Normal('mu', mu_ds, mu_prior_sd)
            beta = pm.Normal('beta', beta_ds, beta_prior_sd)

            # Define fragility function equation:
            def normal_cdf(mu, beta, xj):
                """Return the standard normal CDF of ``(log(xj) - mu) / beta``."""
                return 0.5 * (1 + tt.erf(
                    (tt.log(xj) - mu) / (beta * tt.sqrt(2))))

            # Define likelihood (registered on the model by construction):
            pm.Binomial('like',
                        p=normal_cdf(mu, beta, xj),
                        observed=zj,
                        n=nj)
            for RV in model.basic_RVs:
                print(RV.name, RV.logp(model.test_point))
            # Determine the posterior
            trace = pm.sample(2000, cores=1, return_inferencedata=True)
            # Posterior predictive checks are a great way to validate a model:
            # generate data from the model using parameters drawn from the posterior.
            ppc = pm.sample_posterior_predictive(
                trace, var_names=['mu', 'beta', 'like'])
        # Calculate failure probabilities using samples:
        im = np.arange(70, 200, 5)
        pf_ppc = [
            pf(im, mu_draw, beta_draw)
            for mu_draw, beta_draw in zip(ppc['mu'], ppc['beta'])
        ]
        # Plot the HPD:
        _, ax = plt.subplots()
        az.plot_hdi(im,
                    pf_ppc,
                    ax=ax,
                    fill_kwargs={
                        'alpha': 0.2,
                        'color': 'blue',
                        'label': 'bounds of prediction: 94% HPD'
                    })
        # Calculate and plot the mean outcome:
        pf_mean = pf(im, ppc['mu'].mean(), ppc['beta'].mean())
        ax.plot(im,
                pf_mean,
                label='mean of prediction',
                color='r',
                linestyle='dashed')
        # Plot the mean of the simulation-based fragility:
        pf_sim = pf(im, mu_ds, beta_ds)
        ax.plot(im, pf_sim, label='simulation-based', color='k')
        # Plot the observations:
        ax.scatter(xj, zj / nj, color='r', marker='^', label='observations')
        ax.legend()
        plt.show()
        # Looking at the difference between the prior of the parameters and updated distributions:
        new_mu_mean, new_mu_std = norm.fit(ppc['mu'])
        plt.hist(ppc['mu'], bins=25, density=True, alpha=0.4, color='b')
        xmin, xmax = plt.xlim()
        x = np.linspace(xmin, xmax, 100)
        p_prior = norm.pdf(x, mu_ds, mu_prior_sd)
        p_new = norm.pdf(x, new_mu_mean, new_mu_std)
        plt.plot(x, p_prior, 'k', linewidth=2, label='prior distribution')
        plt.plot(x,
                 p_new,
                 'r',
                 linewidth=2,
                 label='updated distribution',
                 linestyle='dashed')
        # The two curves carry labels, so render the legend that shows them.
        plt.legend()
        # Note az.plot_violin(trace, var_names=['mu']) can be helpful for seeing distribution of parameter values
        # Plot the posterior distributions of each RV. The ArviZ helpers create
        # their own figures, so no figure is opened here (doing so would only
        # leave a blank window).
        az.plot_trace(trace, chain_prop={'color': ['blue', 'red']})
        az.plot_posterior(trace)
        az.plot_forest(trace, var_names=['mu', 'beta'])
        plt.show()
        print(az.summary(trace))
Exemple #4
0
# One-predictor Bayesian logistic regression on two iris species, using the
# mean-centred first feature column as the predictor.
df = sns.load_dataset('iris')
iris = df.query("species == ('setosa', 'versicolor')")
y = pd.Categorical(iris['species']).codes
x = iris[iris.columns[:-1]].values
x = x[:, 0] - x[:, 0].mean()
print(x)
with pm.Model() as model:
    alpha = pm.Normal('alpha', 0, 10)
    beta = pm.Normal('beta', 0, 10)
    mu = alpha + pm.math.dot(x, beta)
    p = pm.Deterministic('p', pm.math.sigmoid(mu))
    y_lik = pm.Bernoulli('y_lik', p=p, observed=y)
    # Decision boundary: the x value at which mu == 0, i.e. p == 0.5.
    b = pm.Deterministic('b', -alpha / beta)
    trace_m = pm.sample(draws=1000, cores=1, chains=3, random_seed=1)
    pp = pm.sample_posterior_predictive(trace_m)

_, ax = plt.subplots(figsize=(12, 8))
xs = np.linspace(x.min(), x.max(), 1000)
theta = trace_m['p'].mean(axis=0)
# seaborn's plotting functions take x and y as keyword arguments; passing them
# positionally is rejected by recent seaborn releases.
sns.lineplot(x=xs,
             y=1 / (1 + np.exp(-(trace_m['alpha'].mean(axis=0) +
                                 trace_m['beta'].mean(axis=0) * xs))),
             ax=ax)
plt.vlines(trace_m['b'].mean(axis=0), 0, 1)
az.plot_hdi(x, trace_m['p'], ax=ax)
# Shade the 98% HDI of the decision boundary.
hdi = az.hdi(trace_m['b'], hdi_prob=0.98)
plt.fill_betweenx([0, 1], hdi[0], hdi[1], color='k', alpha=0.5)
sns.scatterplot(x=x, y=y, ax=ax)
plt.xlabel('sepal_length')
plt.show()
Exemple #5
0
from sklearn import preprocessing
lb = preprocessing.LabelBinarizer()
import seaborn as sns
import arviz as az
from sklearn.preprocessing import scale

# Two-predictor Bayesian logistic regression on two iris species; the decision
# boundary is tracked as a deterministic function of the first feature.
df = sns.load_dataset('iris')
print(df.head())
iris = df.query("species == ('setosa', 'versicolor')")
y = pd.Categorical(iris['species']).codes
feature_names = iris.columns[:-1]
x = iris[feature_names].values
first_feature = x[:, 0]
second_feature = x[:, 1]

with pm.Model() as Model:
    alpha = pm.Normal('alpha', mu=0, sigma=100)
    beta = pm.Normal('beta', mu=0, sigma=2, shape=(2))
    mu = alpha + pm.math.dot(x[:, 0:2], beta)
    p = pm.Deterministic('p', pm.math.sigmoid(mu))
    # Boundary in the plane of the two features: the second-feature value at
    # which mu == 0, expressed per observed first-feature value.
    db = pm.Deterministic(
        'db',
        -(alpha / beta[1]) - (beta[0] / beta[1]) * first_feature,
    )
    pm.Bernoulli('p-lik', p=p, observed=y)
    trace_m = pm.sample(2000, cores=1)
    #pp = pm.sample_posterior_predictive(trace_m)

_, ax = plt.subplots(figsize=(12, 8))
theta = trace_m['db'].mean(axis=0)
point_colours = [f'C{k}' for k in y]
ax.scatter(first_feature, second_feature, c=point_colours)
# Draw the mean boundary in ascending order of the first feature.
ix = np.argsort(first_feature)
ax.plot(first_feature[ix], theta[ix])
az.plot_hdi(first_feature, trace_m['db'], color='k', ax=ax)
plt.show()
Exemple #6
0
    y = pm.Data('y', y_obs)
    theta_0 = pm.Normal('intercept', mu=0, sigma=2)
    theta_1 = pm.Normal('coefx', mu=0, sigma=2)
    theta_2 = pm.Normal('coefxSqd', mu=0, sigma=2)
    theta = pm.Deterministic('theta', theta_0 + theta_1*xs + theta_2*xs**2)
    sigma = pm.HalfCauchy('sigma', 100)
    y_lik = pm.Normal('y_lik', mu=theta, sigma=sigma, observed=y)
    trace_linear = pm.sample(tune=2000, chains=1, cores=1)
    pp_samples = pm.sample_posterior_predictive(trace=trace_linear, random_seed=123)

# Posterior predictive mean per observation.
y_pred = pp_samples['y_lik'].mean(axis=0)

_, axi = plt.subplots(1, 4, figsize=(8, 5))
# seaborn's plotting functions take x and y as keyword arguments; passing them
# positionally is rejected by recent seaborn releases.
sns.scatterplot(x=x, y=y_obs, ax=axi[0]).set_title("Data")
sns.lineplot(x=x, y=y_pred, ax=axi[0])
az.plot_hdi(x, trace_linear['theta'], hdi_prob=0.98, ax=axi[0], color='gray')
# NOTE(review): two variables are requested here but a single axis is
# supplied -- confirm which variable was meant for this panel.
az.plot_posterior(trace_linear, var_names=['intercept', 'coefx'], ax=axi[1])
az.plot_posterior(trace_linear, var_names=['coefx'], ax=axi[2])
az.plot_posterior(trace_linear, var_names=['coefxSqd'], ax=axi[3])
plt.show()


# Swap in new predictor values and draw predictions for them.
with linear_Model:
    pm.set_data({'xs': [1, 5.6, 4]})
    y_test = pm.sample_posterior_predictive(trace=trace_linear)
print(y_test['y_lik'].mean(axis=0))
# Hand-computed reference value printed next to the prediction for comparison.
print(1 + 3.2 * 1 + 4 * 1**2)



Exemple #7
0
"""
Plot HDI
========

_thumb: .8, .8
"""
import bokeh.plotting as bkp
import numpy as np
import arviz as az

# Synthetic data: a straight line plus a stack of noisy replicates of it.
n_points = 100
n_replicates = 200
x_data = np.random.normal(0, 1, n_points)
y_data = 2 + x_data * 0.5
y_data_rep = np.random.normal(y_data, 0.5, (n_replicates, n_points))
x_data_sorted = np.sort(x_data)

# HDI band of the replicates, with the noiseless line drawn on top.
ax = az.plot_hdi(
    x_data,
    y_data_rep,
    color="red",
    backend="bokeh",
    show=False,
)
ax.line(
    x_data_sorted,
    2 + x_data_sorted * 0.5,
    line_color="black",
    line_width=3,
)

# Only display when ArviZ is configured to show bokeh figures.
if az.rcParams["plot.bokeh.show"]:
    bkp.show(ax)
Exemple #8
0
    bd = pm.Deterministic('bd', -α / β)  # decision boundary

    yl = pm.Bernoulli('yl', p=θ, observed=y_0)

    trace_0 = pm.sample(1000)

varnames = ['α', 'β', 'bd']
az.summary(trace_0, varnames)

# Posterior mean of θ per observation and the ordering that sorts x_c.
theta = trace_0['θ'].mean(axis=0)
idx = np.argsort(x_c)

plt.figure()
# Logistic curve (posterior mean) with its HDI band.
plt.plot(x_c[idx], theta[idx], color='C2', lw=3)
az.plot_hdi(x_c, trace_0['θ'], color='C2')

# Decision boundary: mean as a vertical line, HDI as a shaded strip.
bd_samples = trace_0['bd']
plt.vlines(bd_samples.mean(), 0, 1, color='k')
bd_hpd = az.hdi(bd_samples)
plt.fill_betweenx([0, 1], bd_hpd[0], bd_hpd[1], color='k', alpha=0.5)

# Observations, jittered vertically so overlapping points stay visible.
jittered_y = np.random.normal(y_0, 0.02)
class_colours = [f'C{label}' for label in y_0]
plt.scatter(x_c, jittered_y, marker='.', color=class_colours)

plt.xlabel(x_n)
plt.ylabel('p(y=1)', rotation=0)
# use original scale for xticks
Exemple #9
0
    y_ = pm.Bernoulli('y', p=pm.math.sigmoid(f), observed=space_flu)
    trace_space_flu = pm.sample(1000,
                                chains=1,
                                compute_convergence_checks=False)

# Prediction grid as a single-column matrix.
X_new = np.linspace(0, 80, 200)[:, None]

with model_space_flu:
    f_pred = gp.conditional('f_pred', X_new)
    pred_samples = pm.sample_posterior_predictive(
        trace_space_flu,
        var_names=['f_pred'],
        samples=1000,
    )

_, ax = plt.subplots(figsize=(10, 6))

# Map the latent samples through `logistic` and average them, ignoring NaNs.
fp = logistic(pred_samples['f_pred'])
fp_mean = np.nanmean(fp, 0)
grid = X_new[:, 0]

# Observations, jittered vertically so overlapping points stay visible.
jittered_status = np.random.normal(space_flu, 0.02)
status_colours = [f'C{ci}' for ci in space_flu]
ax.scatter(age, jittered_status, marker='.', color=status_colours)

ax.plot(grid, fp_mean, 'C2', lw=3)

az.plot_hdi(grid, fp, color='C2')
ax.set_yticks([0, 1])
ax.set_yticklabels(['healthy', 'sick'])
ax.set_xlabel('age')
pml.savefig('gp_classify_spaceflu.pdf', dpi=300)
def plot_hdi(t,
             y,
             n_idx,
             m_idata,
             model_type,
             prior_level,
             kind="all",
             hdi_prob=(.95, .8)):
    """Plot observations, a mean line and two nested HDI bands, then save the figure.

    Parameters
    ----------
    t, y
        Observed time points and outcomes; drawn as a scatter plot.
    n_idx
        No longer needed: the number of sample rows is inferred from the data.
        Kept so that existing callers continue to work.
    m_idata
        Object exposing ``posterior_predictive`` (for ``kind="full"`` and
        ``kind="fixed"``) or ``predictions`` (for ``kind="predictions"``).
    model_type, prior_level
        Only used to build the output file name.
    kind : {"full", "fixed", "predictions"}
        Which quantity to plot. The default ``"all"`` is not a supported kind
        and is rejected.
    hdi_prob : tuple
        ``(high, low)`` interval probabilities.

    Raises
    ------
    ValueError
        If ``kind`` is not one of the supported values. Previously this
        surfaced later as an unbound-variable error, after a figure had
        already been created.
    """
    supported_kinds = ("full", "fixed", "predictions")
    if kind not in supported_kinds:
        raise ValueError(
            f"unsupported kind {kind!r}; expected one of {supported_kinds}")

    # unpack tuple & get unique t.
    high, low = hdi_prob
    t_unique = np.unique(t)
    n_time = len(t_unique)

    if kind == "fixed":
        # should be the same as just posterior here I think.
        ppc = m_idata.posterior_predictive
        alpha = ppc.alpha.values
        beta = ppc.beta.values
        # Broadcast the parameter draws against the time grid, then put the
        # time axis last.
        outcome = (alpha + beta * t_unique[:, None]).T
        y_mean = outcome.mean(axis=0)
    else:
        # "full" and "predictions" differ only in the group they read from.
        ppc = (m_idata.posterior_predictive
               if kind == "full" else m_idata.predictions)
        y_pred = ppc["y_pred"].mean(axis=0).values
        y_mean = y_pred.mean(axis=(0, 1))
        # Collapse every leading axis into rows; letting numpy infer the row
        # count avoids hard-coding the number of draws.
        outcome = y_pred.reshape((-1, n_time))

    # set up plot
    fig, ax = plt.subplots(figsize=(10, 7))

    # plot data
    ax.scatter(t, y, color="darkorange", alpha=0.5)

    # plot mean
    ax.plot(t_unique, y_mean, color="darkorange")

    # plot lower interval
    az.plot_hdi(t_unique,
                outcome,
                ax=ax,
                fill_kwargs={
                    'alpha': 0.4,
                    "label": f"{low*100}% HPD intervals"
                },
                hdi_prob=low)

    # plot higher interval
    az.plot_hdi(t_unique,
                outcome,
                ax=ax,
                fill_kwargs={
                    'alpha': 0.3,
                    "label": f"{high*100}% HDI intervals"
                },
                hdi_prob=high)

    # add legend, title and formatting.
    ax.legend()
    fig.suptitle(f"Python/pyMC3: Prediction Intervals ({kind})")
    fig.tight_layout()
    plt.savefig(f"../plots_python/{model_type}_{prior_level}_HDI_{kind}.jpeg",
                dpi=300)
    )

# NOTE(review): this fragment starts mid-snippet; `ax`, `fig`, `t_unique`,
# `y_mean` and `outcome` are defined in a part that is not visible here.

# Draw the mean line over the time grid.
ax.plot(
    t_unique, 
    y_mean,
    color = "darkorange" # aesthetics
    )

# Interval probabilities: wider band first, narrower band second.
high, low = (.95, .8) 

# Narrower (low-probability) interval band.
az.plot_hdi(
    t_unique,
    outcome,
    ax = ax,
    fill_kwargs= {'alpha': 0.4, "label": "80% HPD intervals"},
    hdi_prob = low)

# Wider (high-probability) interval band.
az.plot_hdi(
    t_unique,
    outcome,
    ax = ax,
    fill_kwargs = {'alpha': 0.3, "label": "95% HDI intervals"},
    hdi_prob = high)

# Add the legend and title, then tighten the layout.
ax.legend()
fig.suptitle("Python/pyMC3: Prediction Intervals (fixed)")
fig.tight_layout()
Exemple #12
0
"""
Plot HDI
========

_thumb: .8, .8
"""
import matplotlib.pyplot as plt
import numpy as np
import arviz as az

az.style.use("arviz-darkgrid")

# Synthetic data: a straight line plus a stack of noisy replicates of it.
n_points = 100
n_replicates = 200
x_data = np.random.normal(0, 1, n_points)
y_data = 2 + x_data * 0.5
y_data_rep = np.random.normal(y_data, 0.5, (n_replicates, n_points))

# Noiseless line first, then the dashed HDI band of the replicates.
plt.plot(x_data, y_data, "C6")
az.plot_hdi(
    x_data,
    y_data_rep,
    color="k",
    plot_kwargs={"ls": "--"},
)

plt.show()
Exemple #13
0
    μ = α + pm.math.dot(x_1, β)
    θ = pm.Deterministic('θ', 1 / (1 + pm.math.exp(-μ)))
    bd = pm.Deterministic('bd', -α / β[1] - β[0] / β[1] * x_1[:, 0])

    yl = pm.Bernoulli('yl', p=θ, observed=y_1)

    trace_1 = pm.sample(2000, cores=1, chains=2)

varnames = ['α', 'β']
#az.plot_forest(trace_1, var_names=varnames);

# Order observations by the first feature and take the mean boundary in that order.
first_feature = x_1[:, 0]
idx = np.argsort(first_feature)
bd = trace_1['bd'].mean(0)[idx]

plt.figure()
class_colours = [f'C{label}' for label in y_1]
plt.scatter(first_feature, x_1[:, 1], c=class_colours)
plt.plot(first_feature[idx], bd, color='k')

# HDI band of the boundary across posterior samples.
az.plot_hdi(first_feature, trace_1['bd'], color='k')

plt.xlabel(x_n[0])
plt.ylabel(x_n[1])

plt.tight_layout()
# The output file name depends on which data set variant was fitted.
figure_name = ('logreg_iris_bayes_2d_unbalanced.pdf'
               if unbalanced else 'logreg_iris_bayes_2d.pdf')
pml.savefig(figure_name, dpi=300)

plt.show()
Exemple #14
0
def conduct_bayesian_norm(xj,
                          zj,
                          nj,
                          mu_init,
                          beta_init,
                          draws,
                          target_accept,
                          plot_flag=True):
    """
    Conduct Bayesian updating of a fragility model and summarise the MCMC run.

    Notes:
    The intensity measure is divided by its maximum observed value before
    sampling to improve numerical stability; theta1 results are scaled back
    afterwards.

    Prior distributions are designated according to the assumption that the Bayesian analysis will utilize
    wind fragility functions from HAZUS (see priors for theta1 and theta2). See De Bruijn et al. (2020) for
    more details.

    De Bruijn J. et al. (2020). "Using rapid damage observations from social media for Bayesian updating of hurricane
    vulnerability functions: A case study of Hurricane Dorian." Nat.Hazards Earth Syst. Sci. Discuss. [preprint],
    https://doi.org/10.5194/nhess-2020-282.

    The likelihood function is modeled using a Binomial distribution see Lallemant et al. (2015) for more details.

    :param xj: Array of observed intensity measure values. Must support element-wise division
                (e.g. a numpy array).
    :param zj: Array of failure observations (number of failed buildings/components) for each intensity measure.
    :param nj: Array of the total number of buildings for each intensity measure.
    :param mu_init: The mean of the prior distribution for theta1, in the same units as xj.
    :param beta_init: The mean of the prior distribution for theta2.
    :param draws: Number of draws passed to ``pm.sample``.
    :param target_accept: Target acceptance value passed to ``pm.sample``.
    :param plot_flag: (Optional) Produce the trace and autocorrelation plots, the prior/updated parameter
                distributions and the fragility curves. Default: True
    :return: df_summary: A single-row DataFrame with one column per (parameter, summary statistic) pair plus
                the ``draws`` and ``target_accept`` settings.

    ``pf`` is defined elsewhere in this file; presumably it evaluates the
    fragility curve on a grid of intensity measures -- confirm there.
    """
    # Divisor applied to every wind speed before plotting; the fragility plot
    # labels its axis in m/s, so this is presumably mph per m/s -- TODO confirm.
    speed_divisor = 2.237

    # Step 1: Normalize the intensity measure:
    norm_analysis = True
    beta_std_dev = 0.03
    if norm_analysis:
        norm_factor = max(xj)
        xj = xj / norm_factor
        mu_init = mu_init / norm_factor
        mu_std_dev = 15 / norm_factor
    else:
        mu_std_dev = 15
    # Step 2: Build the Bayesian model in PyMC3:
    with pm.Model() as model:
        # Step 3a: Set up the prior
        # Both parameters get Normal priors truncated at zero.
        # Note: Parameters for theta1 are also normalized for compatibility with intensity measure values.
        # See De Bruijn et al. for more information regarding the initialization of prior distributions.
        BoundedNormal = pm.Bound(pm.Normal, lower=0.0)
        theta1 = BoundedNormal('theta1', mu_init, mu_std_dev)
        theta2 = BoundedNormal('theta2', beta_init, beta_std_dev)

        # Step 3b: Set up the likelihood function:
        # The likelihood in this model is represented via a Binomial distribution.
        # See Lallemant et al. (2015) for MLE derivation.
        # Define fragility function equation:
        def normal_cdf(theta1, theta2, xj):
            """Return the standard normal CDF of ``log(xj / theta1) / theta2``."""
            return 0.5 * (1 + tt.erf(
                (tt.log(xj / theta1)) / (theta2 * tt.sqrt(2))))

        # Define the likelihood (registered on the model by construction):
        pm.Binomial('like',
                    p=normal_cdf(theta1, theta2, xj),
                    observed=zj,
                    n=nj)
        # Uncomment to do an initial check of parameter values (lookout for like = +/-inf)
        #for RV in model.basic_RVs:
        #   print(RV.name, RV.logp(model.test_point))
        # Step 3c: Determine the posterior
        # Note: can manually change number of cores if more computational power is available.
        trace_idata = pm.sample(draws,
                                cores=1,
                                return_inferencedata=True,
                                random_seed=72,
                                target_accept=target_accept)
        # (Optional): Plotting the trace and updated distributions for parameters:
        if plot_flag:
            from matplotlib import rcParams
            # NOTE: these settings are global and stay in effect after the call.
            rcParams['font.family'] = "Times New Roman"
            rcParams.update({'font.size': 16})
            az.plot_trace(trace_idata, chain_prop={'color': ['blue', 'red']})
            # Plot the autocorrelation to check convergence:
            ax_corr = az.plot_autocorr(trace_idata, combined=True)
            ax_corr[0].set_title(r'$\theta_1$')
            ax_corr[1].set_title(r'$\theta_2$')
            ax_corr[0].set_ylabel('Autocorrelation')
            ax_corr[0].set_xlabel('Lag')
            ax_corr[1].set_xlabel('Lag')
        # Step 4: Generate summary statistics for the MCMC:
        print('Summary statistics for the MCMC:')
        print(az.summary(
            trace_idata))  # Note: can output this DataFrame if needed
        df = az.summary(trace_idata, hdi_prob=0.95)
        # Step 5: Sample from the posterior and save updated values for mean and std. dev:
        ppc = pm.sample_posterior_predictive(
            trace_idata, var_names=['theta1', 'theta2', 'like'])
        # Re-scale values for theta1:
        if norm_analysis:
            # Write through .loc: chained indexing (df[col][row] = ...) assigns
            # to a temporary and is a silent no-op under pandas Copy-on-Write.
            df.loc['theta1', 'mean'] = df.loc['theta1', 'mean'] * norm_factor
            df.loc['theta1', 'sd'] = df.loc['theta1', 'sd'] * norm_factor
            ppc['theta1'] = ppc['theta1'] * norm_factor
        # Export analysis results: flatten the summary table into one row
        # keyed by "<parameter><statistic>".
        summary_dict = {}
        for row in df.index:
            for col in df.columns:
                new_key = row + col
                new_val = df.loc[row, col]
                # mean/sd of theta1 were re-scaled above; its HDI bounds are
                # re-scaled here.
                if 'hdi' in new_key and 'theta1' in new_key and norm_analysis:
                    new_val = new_val * norm_factor
                summary_dict[new_key] = new_val
        # Export MCMC details:
        summary_dict['draws'] = draws
        summary_dict['target_accept'] = target_accept
        df_summary = pd.DataFrame(summary_dict, index=[0], dtype='object')
        # Sample directly from the posterior to create figures:
        updated_values = {
            'theta1': {
                'mean': ppc['theta1'].mean(),
                'std dev': np.std(ppc['theta1'])
            },
            'theta2': {
                'mean': ppc['theta2'].mean(),
                'std dev': np.std(ppc['theta2'])
            }
        }
        if plot_flag:
            # Prior parameters and observations back on the original scale.
            if norm_analysis:
                prior_mean = mu_init * norm_factor
                prior_std = mu_std_dev * norm_factor
                xj_plot = xj * norm_factor
            else:
                prior_mean = mu_init
                prior_std = mu_std_dev
                xj_plot = xj
            # Plot prior and updated distributions for parameters:
            # theta1
            _, ax = plt.subplots()
            ax.hist(ppc['theta1'] / speed_divisor,
                    bins=25,
                    density=True,
                    alpha=0.4,
                    color='cornflowerblue',
                    label='posterior samples')
            ax.set_xlim(50 / speed_divisor, 200 / speed_divisor)
            xmin, xmax = ax.get_xlim()
            x = np.linspace(xmin, xmax, 100)
            p_prior = norm.pdf(x, prior_mean / speed_divisor,
                               prior_std / speed_divisor)
            p_new = norm.pdf(x,
                             updated_values['theta1']['mean'] / speed_divisor,
                             updated_values['theta1']['std dev'] / speed_divisor)
            ax.plot(x, p_prior, 'k', linewidth=2, label='prior')
            ax.plot(x,
                    p_new,
                    'r',
                    linewidth=2,
                    label='updated',
                    linestyle='dashed')
            ax.set_title('Prior and updated distributions for ' +
                         r'$\theta_1$')
            ax.set_xlabel(r'$\theta_1$')
            ax.set_ylabel('Probability')
            ax.legend()
            plt.show()
            # theta2
            _, ax2 = plt.subplots()
            ax2.hist(ppc['theta2'],
                     bins=25,
                     density=True,
                     alpha=0.4,
                     color='cornflowerblue',
                     label='posterior samples')
            ax2.set_xlim(0, 0.3)
            xmin2, xmax2 = ax2.get_xlim()
            x2 = np.linspace(xmin2, xmax2, 100)
            # Use the same prior standard deviation the model was built with.
            p_prior2 = norm.pdf(x2, beta_init, beta_std_dev)
            p_new2 = norm.pdf(x2, updated_values['theta2']['mean'],
                              updated_values['theta2']['std dev'])
            ax2.plot(x2, p_prior2, 'k', linewidth=2, label='prior')
            ax2.plot(x2,
                     p_new2,
                     'r',
                     linewidth=2,
                     label='updated',
                     linestyle='dashed')
            ax2.set_title('Prior and updated distributions for ' +
                          r'$\theta_2$')
            ax2.set_xlabel(r'$\theta_2$')
            ax2.set_ylabel('Probability')
            ax2.legend()
            plt.show()
            # Calculate failure probabilities for prior, updated:
            im = np.arange(70, 200, 2)
            # Mean of simulation-based fragility:
            pf_sim = pf(im, prior_mean, beta_init)
            # Mean of updated fragility:
            pf_mean = pf(im, ppc['theta1'].mean(), ppc['theta2'].mean())
            # Calculate entire distribution of pfs using posterior samples:
            pf_ppc = [
                pf(im, theta1_draw, theta2_draw)
                for theta1_draw, theta2_draw in zip(ppc['theta1'],
                                                    ppc['theta2'])
            ]
            # Plot the credible intervals, mean outcome of prediction, mean of simulation-based:
            _, ax3 = plt.subplots()
            ax3.set_clip_on(False)
            ax3.set_ylim(0, 1.2)
            ax3.spines['right'].set_visible(False)
            ax3.spines['top'].set_visible(False)
            # Target the axes explicitly instead of relying on the current axes.
            az.plot_hdi(im / speed_divisor,
                        pf_ppc,
                        hdi_prob=0.95,
                        ax=ax3,
                        fill_kwargs={
                            'alpha': 0.1,
                            'color': 'paleturquoise',
                            'label': '95% credible interval'
                        })
            ax3.plot(im / speed_divisor,
                     pf_mean,
                     label='mean of prediction',
                     color='r',
                     linestyle='dashed')
            ax3.plot(im / speed_divisor,
                     pf_sim,
                     label='mean of simulation-based',
                     color='k')
            # Plot the observations:
            ax3.scatter(xj_plot / speed_divisor,
                        zj / nj,
                        color='darkviolet',
                        label='observations',
                        zorder=5,
                        s=70)
            ax3.set_xlabel('Wind Speed [m/s]')
            ax3.set_ylabel('Probability of Failure')
            ax3.legend()
            plt.show()
    return df_summary
Exemple #15
0
def plot_dependence(
    idata,
    X=None,
    Y=None,
    kind="pdp",
    xs_interval="linear",
    xs_values=None,
    var_idx=None,
    var_discrete=None,
    samples=50,
    instances=10,
    random_seed=None,
    sharey=True,
    rug=True,
    smooth=True,
    indices=None,
    grid="long",
    color="C0",
    color_mean="C0",
    alpha=0.1,
    figsize=None,
    smooth_kwargs=None,
    ax=None,
):
    """
    Partial dependence or individual conditional expectation plot

    Parameters
    ----------
    idata: InferenceData
        InferenceData containing a collection of BART_trees in sample_stats group
    X : array-like
        The covariate matrix. A pandas DataFrame is accepted; its column names are then used as
        x-axis labels.
    Y : array-like
        The response vector. Only used to label the figure: a named pandas Series (or a
        single-column DataFrame) yields "Predicted <name>", anything else "Predicted Y".
    kind : str
        Whether to plot a partial dependence plot ("pdp") or an individual conditional expectation
        plot ("ice"). Defaults to pdp.
    xs_interval : str
        Method used to compute the values X used to evaluate the predicted function. "linear",
        evenly spaced values in the range of X. "quantiles", the evaluation is done at the specified
        quantiles of X. "insample", the evaluation is done at the values of X.
        For discrete variables these options are omitted.
    xs_values : int or list
        Values of X used to evaluate the predicted function. If ``xs_interval="linear"`` number of
        points in the evenly spaced grid (defaults to 10). If ``xs_interval="quantiles"`` quantile
        or sequence of quantiles to compute, which must be between 0 and 1 inclusive.
        Ignored when ``xs_interval="insample"``.
    var_idx : list
        List of the indices of the covariate for which to compute the pdp or ice.
        Defaults to all covariates.
    var_discrete : list
        List of the indices of the covariate treated as discrete.
    samples : int
        Number of posterior samples used in the predictions. Defaults to 50
    instances : int
        Number of instances of X to plot. Only relevant if ice ``kind="ice"`` plots.
    random_seed : int
        random_seed used to sample from the posterior and to pick the ice instances.
        Defaults to None.
    sharey : bool
        Controls sharing of properties among y-axes. Defaults to True.
    rug : bool
        Whether to include a rugplot. Defaults to True.
    smooth : bool
        If True the result will be smoothed by first computing a linear interpolation of the data
        over a regular grid and then applying the Savitzky-Golay filter to the interpolated data.
        Defaults to True.
    indices : None
        Ignored. The value is always replaced by the list of all covariate indices; the parameter
        is kept only so existing callers keep working.
    grid : str or tuple
        How to arrange the subplots. Defaults to "long", one subplot below the other.
        Other options are "wide", one subplot next to each other or a tuple indicating the number
        of rows and columns. Ignored when ``ax`` is given.
    color : matplotlib valid color
        Color used to plot the pdp or ice. Defaults to "C0"
    color_mean : matplotlib valid color
        Color used to plot the mean pdp or ice. Defaults to "C0",
    alpha : float
        Transparency level, should in the interval [0, 1].
    figsize : tuple
        Figure size. If None it will be defined automatically.
    smooth_kwargs : dict
        Additional keywords modifying the Savitzky-Golay filter.
        See scipy.signal.savgol_filter() for details. The dict is copied, never modified.
    ax : axes
        Matplotlib axes. When given, only the first variable of ``var_idx`` is drawn on it.

    Returns
    -------
    axes: matplotlib axes

    Raises
    ------
    ValueError
        If ``kind``, ``xs_interval`` or (when ``ax`` is None) ``grid`` is not a supported option.
    """
    if kind not in ["pdp", "ice"]:
        raise ValueError(f"kind={kind} is not suported. Available option are 'pdp' or 'ice'")

    if xs_interval not in ["insample", "linear", "quantiles"]:
        raise ValueError(
            f"""{xs_interval} is not suported.
                          Available option are 'insample', 'linear' or 'quantiles'"""
        )

    # Reject an unusable layout up front: otherwise the failure only shows up after all the
    # predictions have been computed, as an unbound ``fig``/``axes``.
    if ax is None and not isinstance(grid, tuple) and grid not in ("long", "wide"):
        raise ValueError(
            f"grid={grid} is not supported. "
            "Available options are 'long', 'wide' or a tuple (rows, columns)"
        )

    rng = RandomState(seed=random_seed)

    if isinstance(X, pd.DataFrame):
        X_names = list(X.columns)
        X = X.values
    else:
        X_names = []

    # Only Series carry a ``name``; a DataFrame has column labels instead.
    Y_label = "Predicted Y"
    if isinstance(Y, pd.Series):
        if Y.name is not None:
            Y_label = f"Predicted {Y.name}"
    elif isinstance(Y, pd.DataFrame) and Y.shape[1] == 1:
        Y_label = f"Predicted {Y.columns[0]}"

    num_covariates = X.shape[1]

    # NOTE: this deliberately discards the ``indices`` argument (see docstring).
    indices = list(range(num_covariates))

    if var_idx is None:
        var_idx = indices
    if var_discrete is None:
        var_discrete = []

    if X_names:
        X_labels = [X_names[idx] for idx in var_idx]
    else:
        X_labels = [f"X_{idx}" for idx in var_idx]

    if xs_interval == "linear" and xs_values is None:
        xs_values = 10

    if xs_interval == "quantiles" and xs_values is None:
        xs_values = [0.05, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.95]

    if kind == "ice":
        # Draw from the seeded generator so that ``random_seed`` makes ice plots reproducible.
        instances = rng.choice(X.shape[0], replace=False, size=instances)

    # Work on a private copy so the caller's dict is never modified.
    smooth_kwargs = {} if smooth_kwargs is None else dict(smooth_kwargs)
    smooth_kwargs.setdefault("window_length", 55)
    smooth_kwargs.setdefault("polyorder", 2)

    new_Y = []
    new_X_target = []
    y_mins = []

    new_X = np.zeros_like(X)
    idx_s = list(range(X.shape[0]))
    for i in var_idx:
        # Indices of every covariate except the one under study.
        indices_mi = indices[:]
        indices_mi.pop(i)
        y_pred = []
        if kind == "pdp":
            if i in var_discrete:
                new_X_i = np.unique(X[:, i])
            elif xs_interval == "linear":
                new_X_i = np.linspace(np.nanmin(X[:, i]), np.nanmax(X[:, i]), xs_values)
            elif xs_interval == "quantiles":
                new_X_i = np.quantile(X[:, i], q=xs_values)
            else:  # "insample", guaranteed by the validation above
                new_X_i = X[:, i]

            # The other covariates keep their observed values for every evaluation point.
            new_X[:, indices_mi] = X[:, indices_mi]
            for x_i in new_X_i:
                new_X[:, i] = x_i
                # One averaged prediction per posterior sample
                # (assumes predict returns (samples, observations) — TODO confirm).
                y_pred.append(np.mean(predict(idata, rng, X_new=new_X, size=samples), 1))
            new_X_target.append(new_X_i)
        else:
            for instance in instances:
                # Indexing with a list copies X, so the original data is left untouched.
                new_X = X[idx_s]
                new_X[:, indices_mi] = X[:, indices_mi][instance]
                y_pred.append(np.mean(predict(idata, rng, X_new=new_X, size=samples), 0))
            new_X_target.append(new_X[:, i])
        y_mins.append(np.min(y_pred))
        new_Y.append(np.array(y_pred).T)

    if ax is None:
        if grid == "long":
            fig, axes = plt.subplots(len(var_idx), sharey=sharey, figsize=figsize)
        elif grid == "wide":
            fig, axes = plt.subplots(1, len(var_idx), sharey=sharey, figsize=figsize)
        else:  # tuple (rows, columns), guaranteed by the validation above
            fig, axes = plt.subplots(grid[0], grid[1], sharey=sharey, figsize=figsize)
        axes = np.ravel(axes)
    else:
        axes = [ax]
        fig = ax.get_figure()

    for i, axi in enumerate(axes):
        if i >= len(var_idx):
            # A tuple grid may hold more panels than variables: drop the spare ones.
            axi.set_axis_off()
            fig.delaxes(axi)
            continue

        var = var_idx[i]
        x_target = new_X_target[i]
        y_target = new_Y[i]

        if var in var_discrete:
            if kind == "pdp":
                y_means = y_target.mean(0)
                hdi = az.hdi(y_target)
                axi.errorbar(
                    x_target,
                    y_means,
                    (y_means - hdi[:, 0], hdi[:, 1] - y_means),
                    fmt=".",
                    color=color,
                )
            else:
                axi.plot(x_target, y_target, ".", color=color, alpha=alpha)
                axi.plot(x_target, y_target.mean(1), "o", color=color_mean)
            axi.set_xticks(x_target)
        elif smooth:
            x_data = np.linspace(np.nanmin(x_target), np.nanmax(x_target), 200)
            # Nudge the first grid point inwards
            # (presumably to keep the interpolation off the exact boundary — confirm).
            x_data[0] = (x_data[0] + x_data[1]) / 2
            if kind == "pdp":
                interp = griddata(x_target, y_target.mean(0), x_data)
            else:
                interp = griddata(x_target, y_target, x_data)

            y_data = savgol_filter(interp, axis=0, **smooth_kwargs)

            if kind == "pdp":
                az.plot_hdi(
                    x_target, y_target, color=color, fill_kwargs={"alpha": alpha}, ax=axi
                )
                axi.plot(x_data, y_data, color=color_mean)
            else:
                axi.plot(x_data, y_data.mean(1), color=color_mean)
                axi.plot(x_data, y_data, color=color, alpha=alpha)
        else:
            idx = np.argsort(x_target)
            if kind == "pdp":
                az.plot_hdi(
                    x_target,
                    y_target,
                    smooth=smooth,
                    color=color,
                    fill_kwargs={"alpha": alpha},
                    ax=axi,
                )
                # For pdp the evaluation points run along the columns (the other pdp branches
                # average over axis 0), so the sort order must be applied to the columns.
                axi.plot(x_target[idx], y_target[:, idx].mean(0), color=color_mean)
            else:
                axi.plot(x_target[idx], y_target[idx], color=color, alpha=alpha)
                axi.plot(x_target[idx], y_target[idx].mean(1), color=color_mean)

        if rug:
            # Draw the rug at the lowest prediction over all panels.
            lb = np.min(y_mins)
            axi.plot(X[:, var], np.full_like(X[:, var], lb), "k|")

        axi.set_xlabel(X_labels[i])

    fig.text(-0.05, 0.5, Y_label, va="center", rotation="vertical", fontsize=15)
    return axes
Exemple #16
0
# Evaluation grid 0..179, labelled as days on the x-axis below.
t_eval = np.arange(0, 180)

# Posterior draws read from the CmdStan CSV output files.
inference_data = az.from_cmdstan('../results/outputs/*.csv')
posterior_vars = inference_data.posterior.data_vars

# NOTE(review): 18 chains x 20000 draws are hard-coded; presumably they match
# the CSV files above — confirm before changing the sampler settings.
chains = list(range(18))
samples = list(range(20000))

# Simulate the model for 5000 randomly chosen posterior draws.
# `TimeVaryingSLAPIR`, `init` and `daily_cases` are defined outside this snippet.
incidence = []
for _ in range(5000):
    # Pick the chain first, then the draw within it.
    chain = np.random.choice(chains)
    sample = np.random.choice(samples)
    beta_start = posterior_vars['beta_start'][chain, sample].data
    beta_end = posterior_vars['beta_end'][chain, sample].data
    k = posterior_vars['k'][chain, sample].data
    seir = TimeVaryingSLAPIR(
        t_eval=t_eval,
        beta_start=beta_start,
        beta_end=beta_end,
        k=k,
        m=90,
        init=init,
    )
    solution = seir.jit_solve()
    # Row 18 of the solution is what gets plotted as incidence
    # (presumably the incidence compartment — TODO confirm against the model).
    incidence.append(solution.y[18, :])

# 95% HDI band of the simulated trajectories, with the observed counts on top.
az.plot_hdi(t_eval, incidence, hdi_prob=0.95)
plt.plot(t_eval, daily_cases.newcountconfirmed[20:200])
plt.xlabel('Time (days)')
plt.ylabel('Incidence')
plt.tight_layout()
#plt.show()
plt.savefig('../results/plots/model_fit.pdf')
Exemple #17
0
import arviz as az
from scipy import stats as st
import matplotlib.pyplot as plt
import numpy as np
import graphviz

# Synthetic data: a straight line with Gaussian noise.
n = 50
theta_0 = 2
theta_1 = 0.5
xs = st.uniform(0, 30).rvs(n)
y_true = theta_0 + theta_1 * xs
y_obs = y_true + st.norm(0, 0.5).rvs(n)

# NOTE(review): `pm` is not imported by this snippet's import block; the
# dict-style trace indexing below suggests `import pymc3 as pm` — confirm.
with pm.Model() as BRegression:
    # From here on theta_0/theta_1 name the model's random variables and no
    # longer the true values used to generate the data above.
    theta_0 = pm.Normal('theta_0', mu=0, sigma=10)
    theta_1 = pm.Normal('theta_1', mu=0, sigma=10)
    sigma = pm.HalfCauchy('sigma', 10)
    mu = pm.Deterministic('mu', theta_0 + theta_1 * xs)
    pm.Normal('y_lik', mu=mu, sigma=sigma, observed=y_obs)
    model_trace = pm.sample(draws=5000, tune=2000, cores=1, chains=4)
    pp = pm.sample_posterior_predictive(trace=model_trace)

# xs is a random, unsorted sample: order the points by x before drawing the
# mean as a line, otherwise the line runs back and forth across the plot.
order = np.argsort(xs)
_, axi = plt.subplots(figsize=(12, 5))
axi.plot(xs[order], pp['y_lik'].mean(axis=0)[order], c='k')
az.plot_hdi(xs, model_trace['mu'], hdi_prob=0.98, ax=axi, color='gray')
axi.scatter(xs, y_obs)
axi.set_ylabel('y_observed', rotation=0, labelpad=30)
az.plot_posterior(model_trace, var_names=['theta_0', 'theta_1'])
plt.show()
#pm.model_to_graphviz(BRegression).view()