def loo(trace, model=None):
    """Approximate leave-one-out (LOO) cross-validation using PSIS.

    Calculates leave-one-out cross-validation for out of sample predictive
    model fit, following Vehtari et al. (2015). Cross-validation is computed
    using Pareto-smoothed importance sampling (PSIS).

    Parameters
    ----------
    trace : result of MCMC run
    model : PyMC Model
        Optional model. Default None, taken from context.

    Returns
    -------
    Log pointwise predictive density calculated via approximated LOO
    cross-validation, summed over observations.
    """
    model = modelcontext(model)

    # NOTE(review): assumed to be a 2-D array of pointwise log-likelihoods,
    # rows = posterior samples, columns = observations -- confirm against
    # log_post_trace.
    log_py = log_post_trace(trace, model)

    # Importance ratios
    r = 1. / np.exp(log_py)
    r_sorted = np.sort(r, axis=0)

    # Extract the largest 20% of importance ratios and fit a Pareto to each
    # column (fit returns a tuple with shape, location, scale)
    q80 = int(len(log_py) * 0.8)
    pareto_fit = np.apply_along_axis(
        lambda x: pareto.fit(x, floc=0), 0, r_sorted[q80:])

    if np.any(pareto_fit[0] > 0.5):
        warnings.warn(
            "Estimated shape parameter of Pareto distribution "
            "is for one or more samples is greater than 0.5. "
            "This may indicate that the variance of the Pareto "
            "smoothed importance sampling estimate is very large.")

    # Calculate expected values of the order statistics of the fitted Pareto
    S = len(r_sorted)
    M = S - q80
    z = (np.arange(M) + 0.5) / M
    # Materialise a list: np.vstack requires a real sequence, and map() is a
    # lazy iterator on Python 3.
    expvals = [pareto.ppf(z, fit[0], scale=fit[2]) for fit in pareto_fit.T]

    # Replace importance ratios with order statistics of fitted Pareto
    r_sorted[q80:] = np.vstack(expvals).T

    # Unsort ratios (within columns) before using them as weights. Iterate
    # over columns (hence the transposes): argsort of the sort order is the
    # rank of each original entry, which indexes back into the sorted column.
    r_new = np.array([
        col_sorted[np.argsort(order)]
        for col_sorted, order in zip(r_sorted.T, np.argsort(r.T, axis=1))
    ]).T

    # Truncate weights to guarantee finite variance
    w = np.minimum(r_new, r_new.mean(axis=0) * S**0.75)

    loo_lppd = np.sum(
        np.log(np.sum(w * np.exp(log_py), axis=0) / np.sum(w, axis=0)))

    return loo_lppd
def loo(trace, model=None):
    """Approximate leave-one-out (LOO) cross-validation using PSIS.

    Calculates leave-one-out cross-validation for out of sample predictive
    model fit, following Vehtari et al. (2015). Cross-validation is computed
    using Pareto-smoothed importance sampling (PSIS).

    Parameters
    ----------
    trace : result of MCMC run
    model : PyMC Model
        Optional model. Default None, taken from context.

    Returns
    -------
    Log pointwise predictive density calculated via approximated LOO
    cross-validation, summed over observations.
    """
    model = modelcontext(model)

    # NOTE(review): assumed to be a 2-D array of pointwise log-likelihoods,
    # rows = posterior samples, columns = observations -- confirm against
    # log_post_trace.
    log_py = log_post_trace(trace, model)

    # Importance ratios
    r = 1. / np.exp(log_py)
    r_sorted = np.sort(r, axis=0)

    # Extract the largest 20% of importance ratios and fit a Pareto to each
    # column (fit returns a tuple with shape, location, scale)
    q80 = int(len(log_py) * 0.8)
    tail = r_sorted[q80:]
    pareto_fit = np.apply_along_axis(
        lambda x: pareto.fit(x, floc=0), 0, tail)

    if np.any(pareto_fit[0] > 0.5):
        warnings.warn(
            "Estimated shape parameter of Pareto distribution "
            "is for one or more samples is greater than 0.5. "
            "This may indicate that the variance of the Pareto "
            "smoothed importance sampling estimate is very large.")

    # Calculate expected values of the order statistics of the fitted Pareto
    S = len(r_sorted)
    M = S - q80
    z = (np.arange(M) + 0.5) / M
    # A list (not map()) so np.vstack receives a genuine sequence on Python 3.
    expvals = [pareto.ppf(z, fit[0], scale=fit[2]) for fit in pareto_fit.T]

    # Replace importance ratios with order statistics of fitted Pareto
    r_sorted[q80:] = np.vstack(expvals).T

    # Unsort ratios (within columns) before using them as weights. The
    # transposes make the loop run per observation column; iterating over the
    # untransposed arrays would shuffle values across observations instead.
    sort_order = np.argsort(r.T, axis=1)
    r_new = np.array([
        col_sorted[np.argsort(order)]
        for col_sorted, order in zip(r_sorted.T, sort_order)
    ]).T

    # Truncate weights to guarantee finite variance
    w = np.minimum(r_new, r_new.mean(axis=0) * S**0.75)

    loo_lppd = np.sum(
        np.log(np.sum(w * np.exp(log_py), axis=0) / np.sum(w, axis=0)))

    return loo_lppd
def loo(trace, model=None, pointwise=False):
    """Calculates leave-one-out (LOO) cross-validation for out of sample
    predictive model fit, following Vehtari et al. (2015). Cross-validation
    is computed using Pareto-smoothed importance sampling (PSIS).

    Parameters
    ----------
    trace : result of MCMC run
    model : PyMC Model
        Optional model. Default None, taken from context.
    pointwise: bool
        if True the pointwise predictive accuracy will be returned.
        Default False

    Returns
    -------
    namedtuple with the following elements:
    LOO: approximated leave-one-out cross-validation, computed as -2 times
        the summed log pointwise predictive density
    LOO_se: standard error of LOO
    p_LOO: effective number of parameters
    LOO_i: an array of the pointwise predictive accuracy (same -2 scale),
        only if pointwise True
    """
    model = modelcontext(model)

    # NOTE(review): assumed to be a 2-D array of pointwise log-likelihoods,
    # rows = posterior samples, columns = observations -- confirm against
    # log_post_trace.
    log_py = log_post_trace(trace, model)

    # Importance ratios
    r = np.exp(-log_py)
    r_sorted = np.sort(r, axis=0)

    # Extract the largest 20% of importance ratios and fit a Pareto to each
    # column (fit returns a tuple with shape, location, scale)
    q80 = int(len(log_py) * 0.8)
    pareto_fit = np.apply_along_axis(
        lambda x: pareto.fit(x, floc=0), 0, r_sorted[q80:])

    if np.any(pareto_fit[0] > 0.7):
        warnings.warn(
            "Estimated shape parameter of Pareto distribution is "
            "greater than 0.7 for one or more samples. "
            "You should consider using a more robust model, this is "
            "because importance sampling is less likely to work well "
            "if the marginal posterior and LOO posterior are very "
            "different. This is more likely to happen with a "
            "non-robust model and highly influential observations.")
    elif np.any(pareto_fit[0] > 0.5):
        warnings.warn(
            "Estimated shape parameter of Pareto distribution is "
            "greater than 0.5 for one or more samples. \n"
            "This may indicate that the variance of the Pareto "
            "smoothed importance sampling estimate is very large.")

    # Calculate expected values of the order statistics of the fitted Pareto
    S = len(r_sorted)
    M = S - q80
    z = (np.arange(M) + 0.5) / M
    # Materialise a list: np.vstack requires a real sequence, and map() is a
    # lazy iterator on Python 3.
    expvals = [pareto.ppf(z, fit[0], scale=fit[2]) for fit in pareto_fit.T]

    # Replace importance ratios with order statistics of fitted Pareto
    r_sorted[q80:] = np.vstack(expvals).T

    # Unsort ratios (within columns) before using them as weights. Loop
    # variables are named so they do not shadow the ratio matrix `r`.
    r_new = np.array([
        col_sorted[np.argsort(order)]
        for col_sorted, order in zip(r_sorted.T, np.argsort(r.T, axis=1))
    ]).T

    # Truncate weights to guarantee finite variance
    w = np.minimum(r_new, r_new.mean(axis=0) * S**0.75)

    # Pointwise LOO on the -2 * log scale, using normalised weights
    loo_lppd_i = - 2. * logsumexp(log_py, axis=0, b=w / np.sum(w, axis=0))

    loo_lppd_se = np.sqrt(len(loo_lppd_i) * np.var(loo_lppd_i))

    loo_lppd = np.sum(loo_lppd_i)

    # In-sample log pointwise predictive density (uniform weights)
    lppd = np.sum(logsumexp(log_py, axis=0, b=1. / log_py.shape[0]))

    # loo_lppd is on the -2 scale, so this is lppd minus the LOO estimate
    p_loo = lppd + (0.5 * loo_lppd)

    if pointwise:
        LOO_r = namedtuple('LOO_r', 'LOO, LOO_se, p_LOO, LOO_i')
        return LOO_r(loo_lppd, loo_lppd_se, p_loo, loo_lppd_i)
    else:
        LOO_r = namedtuple('LOO_r', 'LOO, LOO_se, p_LOO')
        return LOO_r(loo_lppd, loo_lppd_se, p_loo)
def loo(trace, model=None, pointwise=False, progressbar=False):
    """Calculates leave-one-out (LOO) cross-validation for out of sample
    predictive model fit, following Vehtari et al. (2015). Cross-validation
    is computed using Pareto-smoothed importance sampling (PSIS).

    Parameters
    ----------
    trace : result of MCMC run
    model : PyMC Model
        Optional model. Default None, taken from context.
    pointwise: bool
        if True the pointwise predictive accuracy will be returned.
        Default False
    progressbar: bool
        Whether or not to display a progress bar in the command line. The
        bar shows the percentage of completion, the evaluation speed, and
        the estimated time to completion

    Returns
    -------
    namedtuple with the following elements:
    LOO: approximated leave-one-out cross-validation, computed as -2 times
        the summed log pointwise predictive density
    LOO_se: standard error of LOO
    p_LOO: effective number of parameters
    LOO_i: an array of the pointwise predictive accuracy (same -2 scale),
        only if pointwise True

    Raises
    ------
    ValueError
        If the pointwise log-likelihood array is empty, i.e. the model
        does not contain observed values.
    """
    model = modelcontext(model)

    # NOTE(review): assumed to be a 2-D array of pointwise log-likelihoods,
    # rows = posterior samples, columns = observations -- confirm against
    # _log_post_trace.
    log_py = _log_post_trace(trace, model, progressbar=progressbar)
    if log_py.size == 0:
        raise ValueError('The model does not contain observed values.')

    # Importance ratios
    r = np.exp(-log_py)
    r_sorted = np.sort(r, axis=0)

    # Extract the largest 20% of importance ratios and fit a Pareto to each
    # column (fit returns a tuple with shape, location, scale)
    q80 = int(len(log_py) * 0.8)
    pareto_fit = np.apply_along_axis(
        lambda x: pareto.fit(x, floc=0), 0, r_sorted[q80:])

    if np.any(pareto_fit[0] > 0.7):
        warnings.warn(
            "Estimated shape parameter of Pareto distribution is "
            "greater than 0.7 for one or more samples. "
            "You should consider using a more robust model, this is "
            "because importance sampling is less likely to work well "
            "if the marginal posterior and LOO posterior are very "
            "different. This is more likely to happen with a "
            "non-robust model and highly influential observations.")
    elif np.any(pareto_fit[0] > 0.5):
        warnings.warn(
            "Estimated shape parameter of Pareto distribution is "
            "greater than 0.5 for one or more samples. \n"
            "This may indicate that the variance of the Pareto "
            "smoothed importance sampling estimate is very large.")

    # Calculate expected values of the order statistics of the fitted Pareto
    S = len(r_sorted)
    M = S - q80
    z = (np.arange(M) + 0.5) / M
    # A list (not map()) so np.vstack receives a genuine sequence on Python 3.
    expvals = [pareto.ppf(z, fit[0], scale=fit[2]) for fit in pareto_fit.T]

    # Replace importance ratios with order statistics of fitted Pareto
    r_sorted[q80:] = np.vstack(expvals).T

    # Unsort ratios (within columns) before using them as weights. Loop
    # variables are named so they do not shadow the ratio matrix `r`.
    sort_order = np.argsort(r.T, axis=1)
    r_new = np.array([
        col_sorted[np.argsort(order)]
        for col_sorted, order in zip(r_sorted.T, sort_order)
    ]).T

    # Truncate weights to guarantee finite variance
    w = np.minimum(r_new, r_new.mean(axis=0) * S**0.75)

    # Pointwise LOO on the -2 * log scale, using normalised weights
    loo_lppd_i = - 2. * logsumexp(log_py, axis=0, b=w / np.sum(w, axis=0))

    loo_lppd_se = np.sqrt(len(loo_lppd_i) * np.var(loo_lppd_i))

    loo_lppd = np.sum(loo_lppd_i)

    # In-sample log pointwise predictive density (uniform weights)
    lppd = np.sum(logsumexp(log_py, axis=0, b=1. / log_py.shape[0]))

    # loo_lppd is on the -2 scale, so this is lppd minus the LOO estimate
    p_loo = lppd + (0.5 * loo_lppd)

    if pointwise:
        LOO_r = namedtuple('LOO_r', 'LOO, LOO_se, p_LOO, LOO_i')
        return LOO_r(loo_lppd, loo_lppd_se, p_loo, loo_lppd_i)
    else:
        LOO_r = namedtuple('LOO_r', 'LOO, LOO_se, p_LOO')
        return LOO_r(loo_lppd, loo_lppd_se, p_loo)
def loo(trace, model=None, n_eff=False):
    """Approximate leave-one-out (LOO) cross-validation using PSIS.

    Calculates leave-one-out cross-validation for out of sample predictive
    model fit, following Vehtari et al. (2015). Cross-validation is computed
    using Pareto-smoothed importance sampling (PSIS).

    Parameters
    ----------
    trace : result of MCMC run
    model : PyMC Model
        Optional model. Default None, taken from context.
    n_eff: bool
        if True the effective number parameters will be computed and
        returned. Default False

    Returns
    -------
    loo: -2 times the log pointwise predictive density calculated via
        approximated LOO cross-validation
    p_loo: effective number parameters, only if n_eff True (the result is
        then the tuple ``(loo, p_loo)``)

    Raises
    ------
    ValueError
        If any fitted Pareto shape parameter is greater than 0.7.
    """
    model = modelcontext(model)

    # NOTE(review): assumed to be a 2-D array of pointwise log-likelihoods,
    # rows = posterior samples, columns = observations -- confirm against
    # log_post_trace.
    log_py = log_post_trace(trace, model)

    # Importance ratios
    py = np.exp(log_py)
    r = 1. / py
    r_sorted = np.sort(r, axis=0)

    # Extract the largest 20% of importance ratios and fit a Pareto to each
    # column (fit returns a tuple with shape, location, scale)
    q80 = int(len(log_py) * 0.8)
    pareto_fit = np.apply_along_axis(
        lambda x: pareto.fit(x, floc=0), 0, r_sorted[q80:])

    if np.any(pareto_fit[0] > 0.7):
        raise ValueError(
            "Estimated shape parameter of Pareto distribution "
            "is for one or more samples is greater than 0.7. "
            "You should consider using a more robust model, this is "
            "because importance sampling is less likely to work well "
            "if the marginal posterior and LOO posterior are very "
            "different. This is more likely to happen with a "
            "non-robust model and highly influential observations.")

    if np.any(pareto_fit[0] > 0.5):
        warnings.warn(
            "Estimated shape parameter of Pareto distribution "
            "is for one or more samples is greater than 0.5. \n"
            "This may indicate that the variance of the Pareto "
            "smoothed importance sampling estimate is very large.")

    # Calculate expected values of the order statistics of the fitted Pareto
    S = len(r_sorted)
    M = S - q80
    z = (np.arange(M) + 0.5) / M
    # Materialise a list: np.vstack requires a real sequence, and map() is a
    # lazy iterator on Python 3.
    expvals = [pareto.ppf(z, fit[0], scale=fit[2]) for fit in pareto_fit.T]

    # Replace importance ratios with order statistics of fitted Pareto
    r_sorted[q80:] = np.vstack(expvals).T

    # Unsort ratios (within columns) before using them as weights. Iterate
    # over columns (hence the transposes): argsort of the sort order is the
    # rank of each original entry, which indexes back into the sorted column.
    r_new = np.array([
        col_sorted[np.argsort(order)]
        for col_sorted, order in zip(r_sorted.T, np.argsort(r.T, axis=1))
    ]).T

    # Truncate weights to guarantee finite variance
    w = np.minimum(r_new, r_new.mean(axis=0) * S**0.75)

    loo_lppd = np.sum(np.log(np.sum(w * py, axis=0) / np.sum(w, axis=0)))

    if n_eff:
        # In-sample lppd minus the LOO estimate
        p_loo = np.sum(np.log(np.mean(py, axis=0))) - loo_lppd
        return -2 * loo_lppd, p_loo
    else:
        return -2 * loo_lppd
def loo(trace, model=None, n_eff=False):
    """Approximate leave-one-out (LOO) cross-validation using PSIS.

    Calculates leave-one-out cross-validation for out of sample predictive
    model fit, following Vehtari et al. (2015). Cross-validation is computed
    using Pareto-smoothed importance sampling (PSIS).

    Parameters
    ----------
    trace : result of MCMC run
    model : PyMC Model
        Optional model. Default None, taken from context.
    n_eff: bool
        if True the effective number parameters will be computed and
        returned. Default False

    Returns
    -------
    loo: -2 times the log pointwise predictive density calculated via
        approximated LOO cross-validation
    p_loo: effective number parameters, only if n_eff True (the result is
        then the tuple ``(loo, p_loo)``)

    Raises
    ------
    ValueError
        If any fitted Pareto shape parameter is greater than 0.7.
    """
    model = modelcontext(model)

    # NOTE(review): assumed to be a 2-D array of pointwise log-likelihoods,
    # rows = posterior samples, columns = observations -- confirm against
    # log_post_trace.
    log_py = log_post_trace(trace, model)

    # Importance ratios
    py = np.exp(log_py)
    r = 1. / py
    r_sorted = np.sort(r, axis=0)

    # Extract the largest 20% of importance ratios and fit a Pareto to each
    # column (fit returns a tuple with shape, location, scale)
    q80 = int(len(log_py) * 0.8)
    tail = r_sorted[q80:]
    pareto_fit = np.apply_along_axis(
        lambda x: pareto.fit(x, floc=0), 0, tail)

    if np.any(pareto_fit[0] > 0.7):
        raise ValueError(
            "Estimated shape parameter of Pareto distribution "
            "is for one or more samples is greater than 0.7. "
            "You should consider using a more robust model, this is "
            "because importance sampling is less likely to work well "
            "if the marginal posterior and LOO posterior are very "
            "different. This is more likely to happen with a "
            "non-robust model and highly influential observations.")

    if np.any(pareto_fit[0] > 0.5):
        warnings.warn(
            "Estimated shape parameter of Pareto distribution "
            "is for one or more samples is greater than 0.5. \n"
            "This may indicate that the variance of the Pareto "
            "smoothed importance sampling estimate is very large.")

    # Calculate expected values of the order statistics of the fitted Pareto
    S = len(r_sorted)
    M = S - q80
    z = (np.arange(M) + 0.5) / M
    # A list (not map()) so np.vstack receives a genuine sequence on Python 3.
    expvals = [pareto.ppf(z, fit[0], scale=fit[2]) for fit in pareto_fit.T]

    # Replace importance ratios with order statistics of fitted Pareto
    r_sorted[q80:] = np.vstack(expvals).T

    # Unsort ratios (within columns) before using them as weights. The
    # transposes make the loop run per observation column; iterating over the
    # untransposed arrays would shuffle values across observations instead.
    sort_order = np.argsort(r.T, axis=1)
    r_new = np.array([
        col_sorted[np.argsort(order)]
        for col_sorted, order in zip(r_sorted.T, sort_order)
    ]).T

    # Truncate weights to guarantee finite variance
    w = np.minimum(r_new, r_new.mean(axis=0) * S**0.75)

    loo_lppd = np.sum(np.log(np.sum(w * py, axis=0) / np.sum(w, axis=0)))

    if n_eff:
        # In-sample lppd minus the LOO estimate
        p_loo = np.sum(np.log(np.mean(py, axis=0))) - loo_lppd
        return -2 * loo_lppd, p_loo
    else:
        return -2 * loo_lppd