def plot_model_comparison_waic(model_res_dict):
    """Plot a WAIC-based model comparison and save it as a TIFF image.

    The entries of ``model_res_dict`` are ranked with ``az.compare`` (WAIC on
    the log scale, fixed seed) and drawn with ``az.plot_compare``. The figure
    is written to ``<parent_dir_name>/output/waic_model_comparison.tiff``;
    the ``output`` directory is created when it does not exist yet.

    Args:
        model_res_dict: Mapping handed unchanged to ``az.compare``
            (presumably model name -> fitted result; confirm against callers).
    """
    model_compare = az.compare(model_res_dict, seed=1, scale='log', ic='waic')
    az.plot_compare(
        model_compare,
        plot_ic_diff=False,
        plot_standard_error=True,
        insample_dev=False,
    )
    plt.title('Model comparison based on WAIC with log scale')
    plt.subplots_adjust(top=0.9, bottom=0.15)

    # savefig does not create missing directories, so make sure the target exists.
    output_dir = os.path.join(parent_dir_name, 'output')
    os.makedirs(output_dir, exist_ok=True)
    plt.savefig(
        os.path.join(output_dir, 'waic_model_comparison.tiff'),
        format='tiff',
        dpi=500,
        bbox_inches="tight",
    )
def compareplot(*args, **kwargs):
    """Call ``az.plot_compare`` with a comparison table using lower-case columns.

    The table is taken from the ``comp_df`` keyword when present, otherwise
    from the first positional argument. A copy of it is renamed, so the
    caller's frame is left untouched. Tables with a ``WAIC`` column get the
    WAIC renames, tables with a ``LOO`` column the LOO renames; any other
    table is forwarded as copied. All remaining arguments pass through
    unchanged.

    Returns:
        Whatever ``az.plot_compare`` returns.
    """
    waic_names = {
        'WAIC': 'waic',
        'pWAIC': 'p_waic',
        'dWAIC': 'd_waic',
        'SE': 'se',
        'dSE': 'dse',
        'var_warn': 'warning',
    }
    loo_names = {
        'LOO': 'loo',
        'pLOO': 'p_loo',
        'dLOO': 'd_loo',
        'SE': 'se',
        'dSE': 'dse',
        'shape_warn': 'warning',
    }

    by_keyword = 'comp_df' in kwargs
    if not by_keyword:
        # Positional arguments arrive as a tuple; a list lets us swap in the copy.
        args = list(args)
    frame = (kwargs['comp_df'] if by_keyword else args[0]).copy()

    columns = frame.columns
    if 'WAIC' in columns:
        frame = frame.rename(index=str, columns=waic_names)
    elif 'LOO' in columns:
        frame = frame.rename(index=str, columns=loo_names)

    # Put the (possibly renamed) copy back where the original table came from.
    if by_keyword:
        kwargs['comp_df'] = frame
    else:
        args[0] = frame
    return az.plot_compare(*args, **kwargs)
def compareplot(*args, **kwargs):
    """Wrap ``az.plot_compare``, renaming upper-case comparison columns first.

    The comparison table is read from ``kwargs["comp_df"]`` if given, else
    from ``args[0]``. It is copied before renaming, so the caller's object is
    not modified. The first marker column found (``"WAIC"``, then ``"LOO"``)
    selects which set of renames is applied; the index labels are passed
    through ``str`` in the same ``rename`` call. Everything else is forwarded
    to ``az.plot_compare`` untouched, and its return value is returned.
    """
    waic_renames = {
        "WAIC": "waic",
        "pWAIC": "p_waic",
        "dWAIC": "d_waic",
        "SE": "se",
        "dSE": "dse",
        "var_warn": "warning",
    }
    loo_renames = {
        "LOO": "loo",
        "pLOO": "p_loo",
        "dLOO": "d_loo",
        "SE": "se",
        "dSE": "dse",
        "shape_warn": "warning",
    }

    if "comp_df" in kwargs:
        table = kwargs["comp_df"].copy()
    else:
        args = list(args)
        table = args[0].copy()

    # WAIC takes precedence over LOO; only one set of renames is ever applied.
    for marker, renames in (("WAIC", waic_renames), ("LOO", loo_renames)):
        if marker in table.columns:
            table = table.rename(index=str, columns=renames)
            break

    if "comp_df" in kwargs:
        kwargs["comp_df"] = table
    else:
        args[0] = table

    return az.plot_compare(*args, **kwargs)
def compare_plot(data):
    """
    Create the WAIC compare plot for the supplied models.

    The WAIC data is computed here from the dictionary of models provided.
    The plot doesn't need to be dynamically redrawn with interactive functions,
    so latency is not really an issue, which is why it is easier to create the
    WAIC data here. The plot is only redrawn each time a new model is added.

    Args:
        data: Mapping of model name -> object exposing ``model_arviz_data``.

    Returns:
        The bokeh plot produced by ``az.plot_compare`` with a manually added
        legend, or a ``pn.widgets.StaticText`` message when the comparison
        contains missing/infinite values or fewer than two models.
    """
    model_data = {key: value.model_arviz_data for key, value in data.items()}
    comp = az.compare(
        model_data,
        ic='waic',
        scale='log',
    )
    # DataFrame.replace returns a new frame; keep it so that infinite values
    # are caught by the null check below.
    comp = comp.replace([np.inf, -np.inf], np.nan)

    if comp.isnull().values.any():
        # if null values present then - in the cases ive seen - it means the model is using data
        # with missing values. Therefore notify the user that this feature is not available.
        return pn.widgets.StaticText(name='', value='Data contains missing values so can\'t compute WAIC')

    if comp.shape[0] < 2:
        # for some reason this plot creates an error that will stop the whole app from loading
        # if only one model is plotted. Therefore notify the user that a second configuration
        # is required before this feature is enabled.
        return pn.widgets.StaticText(name='', value='Add another configuration to compare models')

    kwg = dict(height=450, width=650, toolbar_location='right')
    plot = az.plot_compare(
        comp,
        backend='bokeh',
        show=False,
        backend_kwargs=kwg,
        order_by_rank=True,
    )
    # plot does not generate a legend automatically so create one manually here by capturing the
    # plot features and giving them labels
    li1 = LegendItem(label='WAIC', renderers=[plot.renderers[2]])
    li2 = LegendItem(label='Standard Error', renderers=[plot.renderers[3]])
    li3 = LegendItem(label='In-Sample Deviance', renderers=[plot.renderers[4]])
    legend = Legend(items=[li1, li2, li3])
    legend.location = (10, -10)
    plot.add_layout(legend, place='right')
    plot.width = 800
    return plot
# WAIC for trace_l; the bare name on the next line displays the result when
# this runs as a notebook cell.
waic_l = az.waic(trace_l)
waic_l

# In[11]:

# Compare both traces using the 'BB-pseudo-BMA' weighting method.
cmp_df = az.compare(
    {'model_l': trace_l, 'model_p': trace_p},
    method='BB-pseudo-BMA',
)
cmp_df

# In[12]:

# Draw the comparison and save the figure.
az.plot_compare(cmp_df)
plt.savefig('B11197_05_08.png', dpi=300)

# ## Model Averaging

# In[13]:

# Weighted posterior predictive samples; w is the weight of the first trace.
w = 0.5
y_lp = pm.sample_posterior_predictive_w(
    [trace_l, trace_p],
    samples=1000,
    models=[model_l, model_p],
    weights=[w, 1 - w],
)

# In[14]:

_, ax = plt.subplots(figsize=(10, 6))
p_value = np.mean(T_sim >= T_obs) az.plot_kde(T_sim, ax=ax[idx]) ax[idx].axvline(T_obs, 0, 1, color='k', ls='--') ax[idx].set_title(f'K = {clusters[idx]} \n p-value {p_value:.2f}') #ax[idx].set_yticks([]) ax[idx].set_xlabel('iqr') pml.savefig('gmm_chooseK_pymc3_pval.pdf') # Compute information criteria for the 4 models # Use Bayesian Bootstrapping together with # pseudo Bayes Model Averaging # See "Bayesian analysis with Python" p201 comp = az.compare(dict(zip(clusters, traces)), method='BB-pseudo-BMA') comp az.plot_compare(comp) pml.savefig('gmm_chooseK_pymc3_waic.pdf') # Gaussian mixture model using PyMC3 # Based on https://github.com/aloctavodia/BAP/blob/master/code/Chp6/06_mixture_models.ipynb import pymc3 as pm import numpy as np import scipy.stats as stats import pandas as pd import theano.tensor as tt import matplotlib.pyplot as plt import arviz as az np.random.seed(42)
az.waic(trace_m6_14, m6_14)

# %%
# Rank the four models; weights are computed with the "pseudo-BMA" method.
compare_df = az.compare(
    {
        "m6_11": trace_m6_11,
        "m6_12": trace_m6_12,
        "m6_13": trace_m6_13,
        "m6_14": trace_m6_14,
    },
    method="pseudo-BMA",
)
compare_df

# %%
az.plot_compare(compare_df)

# %%
# Fraction of simulated differences below zero. np.mean reduces the boolean
# mask in native code and removes the duplicated sample-size constant.
diff = np.random.normal(loc=6.7, scale=7.26, size=100000)
np.mean(diff < 0)

# %%
# Posterior means of every model side by side, one column per model.
coeftab = pd.DataFrame(
    {
        name: pm.summary(trace)["mean"]
        for name, trace in (
            ("m6_11", trace_m6_11),
            ("m6_12", trace_m6_12),
            ("m6_13", trace_m6_13),
            ("m6_14", trace_m6_14),
        )
    }
)
coeftab
"""
Compare Plot
============

_thumb: .5, .5
"""
import matplotlib.pyplot as plt

import arviz as az

az.style.use("arviz-darkgrid")

# Load the two bundled example datasets, then compare them.
centered = az.load_arviz_data("centered_eight")
non_centered = az.load_arviz_data("non_centered_eight")

model_compare = az.compare(
    {
        "Centered 8 schools": centered,
        "Non-centered 8 schools": non_centered,
    }
)

az.plot_compare(model_compare, figsize=(12, 4))

plt.show()
"""
Compare Plot
============

_thumb: .5, .5
"""
import arviz as az

# Example datasets keyed by the label shown in the comparison plot.
datasets = {
    "Centered 8 schools": az.load_arviz_data("centered_eight"),
    "Non-centered 8 schools": az.load_arviz_data("non_centered_eight"),
}
model_compare = az.compare(datasets)

# Same comparison plot as the default example, rendered with the bokeh backend.
ax = az.plot_compare(model_compare, figsize=(12, 4), backend="bokeh")
p = pm.math.invlogit(a + (bp + bpC * d.condition) * d.prosoc_left) pulled_left = pm.Binomial("pulled_left", 1, p, observed=d.pulled_left) trace_10_3 = pm.sample(1000, tune=1000) # %% comp_df = az.compare({ "m10.1": trace_10_1, "m10.2": trace_10_2, "m10.3": trace_10_3 }) comp_df # %% az.plot_compare(comp_df) # %% az.summary(trace_10_3, credible_interval=0.89, round_to=2) # %% np.exp(0.61) # %% logistic(4) # %% logistic(4 + 0.61) # %% d_pred = pd.DataFrame({"prosoc_left": [0, 1, 0, 1], "condition": [0, 0, 1, 1]})
3) **p_waic**, the value of the penalization term. We can roughly think of this value as the estimated effective number of parameters (but do not take that too seriously). 4) **d_waic**, the relative difference between the value of WAIC/LOO for the top-ranked model and the value of WAIC/LOO for each model. For this reason we will always get a value of 0 for the first model. 5) **weight**, the weights assigned to each model. These weights can be loosely interpreted as the probability of each model (among the compared models) given the data. See the model averaging section for more details. 6) **se**, the standard error for the WAIC/LOO computations. The standard error can be useful to assess the uncertainty of the WAIC/LOO estimates. By default these errors are computed using bootstrapping. 7) **dse**, the standard errors of the difference between two values of WAIC/LOO. In the same way that we can compute the standard error for each value of WAIC/LOO, we can compute the standard error of the differences between two values of WAIC/LOO. Notice that the two quantities are not necessarily the same; the reason is that the uncertainty about WAIC/LOO is correlated between models. This quantity is always 0 for the top-ranked model. 8) **warning**, a flag raised when computing WAIC/LOO; the possible values are `True` or `False`. If `True`, the computation of WAIC/LOO may not be reliable. This warning for WAIC is based on an empirically determined cutoff value and needs to be interpreted with caution. The warning for LOO has better empirical and theoretical support. 9) **waic_scale**, the scale of the reported values. The default is the deviance scale; as previously mentioned, this is obtained by multiplying the log-score by -2. Other options are log -- this is the log-score multiplied by 1 (this reverses the order: a higher WAIC/LOO will be better) -- and negative-log -- this is the log-score multiplied by -1 (as with the deviance scale, a lower value is better). 
## The plot_compare function ArviZ also provides another convenience function that takes the output of `compare(.)` and produces a summary plot in the style of the one used in the book Statistical Rethinking by Richard McElreath. az.plot_compare(cmp); The empty circles represent the values of WAIC/LOO, and the black error bars associated with them are the standard deviations of WAIC/LOO. The value of the best WAIC/LOO is also indicated with a vertical dashed grey line to ease comparison with the other WAIC/LOO values. The filled black dots are the in-sample deviance of each model, i.e. the log-score without the penalty term. For all models except the top-ranked one we also get a triangle indicating the value of the difference in WAIC/LOO between that model and the top model, and a grey error bar indicating the standard error of the difference between the top-ranked WAIC/LOO and the WAIC/LOO of each model. ## Point-wise model comparison Comparing models is a good way to get a better understanding of them...
def compare_models(df, models: dict,
                   extra_model_args: list = None,
                   parallel=False,
                   plotose=False,
                   **kwargs):
    """Fit several models to the same data and compare them with ArviZ.

    Every model in ``models`` is fitted through ``split_train_predict``. The
    resulting traces are compared with ``az.compare``, the comparison is
    drawn with ``az.plot_compare``, and the comparison table is returned.

    Args:
        df: Data handed unchanged to ``split_train_predict`` for every model.
        models: Mapping of display name -> model given to
            ``split_train_predict``.
        extra_model_args: Optional list holding one dict of extra keyword
            arguments per model, in the same order as ``models``. ``None`` or
            an empty list means no extra arguments.
        parallel: If true, fit the models concurrently via
            ``Parallel``/``delayed`` with ``num_chains=1`` for each fit.
        plotose: If true, also show a posterior-predictive plot per model,
            call ``r2`` on its trace and print its WAIC.
        **kwargs: Forwarded to split_train_predict->fit_numpyro for every
            model, in both the parallel and the serial path.

    Returns:
        The comparison table produced by ``az.compare``.

    Example::

        compare_models(df,
                       models={'Hier': bayes.Numpyro.model_hier,
                               'Hier+covariance': bayes.Numpyro.model_hier_covar,
                               'Twostep Exponential': bayes.TwoStep.model_twostep,
                               'Twostep Gamma': bayes.TwoStep.model_twostep,
                               },
                       extra_model_args=[{}, {}, {'prior': 'Exponential'}, {'prior': 'Gamma'}])
    """
    import warnings

    # TODO save all model args in BayesWindow in self
    # Calculate
    if extra_model_args is None or len(extra_model_args) == 0:
        extra_model_args = [{} for _ in models]

    if parallel:
        # os.cpu_count() may return None; fall back to a single worker then.
        n_jobs = min(os.cpu_count() or 1, len(models))
        traces = Parallel(n_jobs=n_jobs)(
            delayed(split_train_predict)(df, model, num_chains=1, **kwargs, **extra_model_arg)
            for model, extra_model_arg in zip(models.values(), extra_model_args))
    else:
        # Forward all keyword arguments, exactly as the parallel path does.
        traces = [split_train_predict(df, model, **kwargs, **extra_model_arg)
                  for model, extra_model_arg in zip(tqdm(models.values()), extra_model_args)]

    # save to dict, keyed by model name
    traces_dict = dict(zip(models, traces))

    # Plot
    if plotose:
        for trace_name, trace in traces_dict.items():
            # Plot PPC
            az.plot_ppc(trace,
                        # flatten=[treatment],
                        # flatten_pp=data_cols[2],
                        mean=False,
                        # num_pp_samples=1000,
                        # kind='cumulative'
                        )
            plt.title(trace_name)
            plt.show()
            r2(trace)
            # Weird that r2=1

            # Waic
            try:
                print('======= WAIC (higher is better): =========')
                print(az.waic(trace, pointwise=True))
                print(az.waic(trace, var_name='y'))
            except TypeError:
                pass

    # NOTE(review): the nesting of this diagnostics section was reconstructed
    # from a whitespace-collapsed source; confirm it is meant to run
    # regardless of `plotose`.
    for trace_name, trace in traces_dict.items():
        # Diagnostics are best-effort: a failure for one trace is reported and
        # must not prevent the remaining traces or the comparison from running.
        try:
            # Print diagnostics and effect size
            print(f"n(Divergences) = {trace.sample_stats.diverging.sum(['chain', 'draw']).values}")
            try:
                slope = trace.posterior['v_mu'].sel({'v_mu_dim_0': 1}).mean(['chain']).values
            except Exception:
                # Fall back to 'b' when 'v_mu' cannot be selected.
                slope = trace.posterior['b'].mean(['chain']).values
            print(f'Effect size={(slope.mean() / slope.std()).round(2)} == {trace_name}')
        except Exception as err:
            warnings.warn(f'Could not print diagnostics for {trace_name}: {err!r}')

    model_compare = az.compare(traces_dict)  # , var_name='y')
    az.plot_compare(model_compare, textsize=12, show=True)
    return model_compare