def df_from_model(model):
    """Summarise a model's trace as a DataFrame indexed by (region, date).

    Reads the ``r_t``, ``approx_positives`` and ``cases`` entries of
    ``model.trace`` and reports, per row of ``model.trace_index``:

    * mean, median, 90% and 50% HDI of ``r_t``;
    * median and 90% HDI of ``approx_positives``;
    * median and 90% HDI of ``cases``.

    Returns
    -------
    pandas.DataFrame
        Twelve summary columns with a ``(region, date)`` MultiIndex built
        from ``model.region`` and ``model.trace_index``.
    """
    trace = model.trace
    r_t_draws = trace['r_t']
    positives_draws = trace['approx_positives']
    cases_draws = trace['cases']

    # Order matters: it must line up with ``column_names`` below.  Each HDI
    # entry contributes two columns (lower, upper).
    pieces = [
        np.mean(r_t_draws, axis=0),
        np.median(r_t_draws, axis=0),
        az.hdi(r_t_draws, .9),
        az.hdi(r_t_draws, .5),
        np.median(positives_draws, axis=0),
        az.hdi(positives_draws, .9),
        np.median(cases_draws, axis=0),
        az.hdi(cases_draws, .9),
    ]
    column_names = [
        'mean', 'median', 'lower_90', 'upper_90', 'lower_50', 'upper_50',
        'median_approx_pos', 'lower_90_approx_pos', 'upper_90_approx_pos',
        'median_cases', 'lower_90_cases', 'upper_90_cases',
    ]
    row_index = pd.MultiIndex.from_product(
        [[model.region], model.trace_index], names=['region', 'date'])
    return pd.DataFrame(data=np.column_stack(pieces),
                        index=row_index,
                        columns=column_names)
def reportar_intervalos(self, discard, thin, hdi=True):
    """Display each parameter's mean with its 1- and 2-sigma uncertainties.

    For every sampled parameter the mean is rendered as LaTeX (via
    ``display(Math(...))``) together with the 68% interval and, in
    parentheses, the 95% interval.  When the lower end of the 68% interval
    is within 0.01 of zero an upper limit is reported instead.

    Parameters
    ----------
    discard : int
        Number of initial steps dropped when ``self.sampler`` is a sampler
        object (forwarded to ``get_chain``).  Ignored for ndarray chains.
    thin : int
        Thinning factor forwarded to ``get_chain``.  Ignored for ndarray
        chains.
    hdi : bool, default True
        If true, use highest-density intervals (``az.hdi``); otherwise use
        central (equal-tailed) percentile intervals.
    """
    sns.set(style='darkgrid', palette="muted", color_codes=True)
    sns.set_context("paper", font_scale=1.2, rc={
        "font.size": 10,
        "axes.labelsize": 12
    })

    if isinstance(self.sampler, np.ndarray):
        # Already-processed chain of shape (n_samples, ndim).
        samples = self.sampler
    else:
        samples = self.sampler.get_chain(discard=discard, flat=True, thin=thin)
    # The flattened chain has one column per parameter, so there is no need
    # to fetch the full un-flattened chain just to read ndim.
    ndim = samples.shape[1]
    labels = self.labels

    one_s = 68
    two_s = 95

    if hdi:
        # Computed once for all parameters; each result has shape (ndim, 2).
        one_sigma_all = az.hdi(samples, hdi_prob=one_s / 100)
        two_sigma_all = az.hdi(samples, hdi_prob=two_s / 100)

    for i in range(ndim):
        column = samples[:, i]
        mean = np.mean(column)
        if hdi:
            one_sigma = one_sigma_all[i]
            two_sigma = two_sigma_all[i]
        else:
            # A central interval holding `level` percent of the mass leaves
            # (100 - level) / 2 percent in each tail.
            one_tail = (100 - one_s) / 2
            two_tail = (100 - two_s) / 2
            one_sigma = [
                scoreatpercentile(column, one_tail),
                scoreatpercentile(column, 100 - one_tail)
            ]
            two_sigma = [
                scoreatpercentile(column, two_tail),
                scoreatpercentile(column, 100 - two_tail)
            ]

        # q[0] = mean - lower bound, q[1] = upper bound - mean.
        q1 = np.diff([one_sigma[0], mean, one_sigma[1]])
        q2 = np.diff([two_sigma[0], mean, two_sigma[1]])

        if np.abs(one_sigma[0]) < 10**(-2):
            # Lower bound is compatible with zero: report upper limits only.
            txt = r"\mathrm{{{0}}} < {1:.3f}({2:.3f})"
            txt = txt.format(labels[i], mean + q1[1], mean + q2[1])
        else:
            txt = r"\mathrm{{{3}}} = {0:.3f}_{{-{1:.3f}({4:.3f})}}^{{+{2:.3f}({5:.3f})}}"
            txt = txt.format(mean, q1[0], q1[1], labels[i], q2[0], q2[1])
        display(Math(txt))
def mcmc_stats(runs, burnin, prob, batch):
    """Compute posterior summary statistics from Monte Carlo draws.

    Parameters
    ----------
    runs : ndarray
        Monte Carlo sample, one row per draw and one column per parameter.
    burnin : int
        Number of initial draws to discard.
    prob : float
        Interval probability (0 < prob < 1).
    batch : int
        Number of pieces the draw sequence is split into for the Monte
        Carlo standard error and R-hat computations.

    Returns
    -------
    pandas.DataFrame
        Posterior statistics, one row per parameter.  The row labels are
        hard-coded for exactly two parameters.

    Raises
    ------
    ValueError
        If fewer than ``batch`` draws remain after burn-in.
    """
    traces = runs[burnin:, :]
    n = traces.shape[0] // batch
    k = traces.shape[1]
    if n == 0:
        raise ValueError(
            "need at least `batch` draws after burn-in to form batches")
    # Only the first n * batch draws can be arranged into equal batches;
    # without this truncation the reshape fails whenever the number of
    # retained draws is not a multiple of `batch`.
    usable = n * batch
    alpha = 100 * (1.0 - prob)

    def batched(i):
        # NOTE(review): ArviZ documents 2-D arrays as (chain, draw); this
        # keeps the original (n, batch) layout -- confirm it is intended.
        return traces[:usable, i].reshape((n, batch), order='F')

    post_mean = np.mean(traces, axis=0)
    post_median = np.median(traces, axis=0)
    post_sd = np.std(traces, axis=0)
    mc_err = [az.mcse(batched(i)).item(0) for i in range(k)]
    ci_lower = np.percentile(traces, 0.5 * alpha, axis=0)
    ci_upper = np.percentile(traces, 100 - 0.5 * alpha, axis=0)
    hpdi = az.hdi(traces, prob)
    rhat = [az.rhat(batched(i)).item(0) for i in range(k)]
    stats = np.vstack((post_mean, post_median, post_sd, mc_err, ci_lower,
                       ci_upper, hpdi.T, rhat)).T
    stats_string = [
        '平均', '中央値', '標準偏差', '近似誤差', '信用区間(下限)', '信用区間(上限)',
        'HPDI(下限)', 'HPDI(上限)', '$\\hat R$'
    ]
    param_string = ['平均 $\\mu$', '分散 $\\sigma^2$']
    return pd.DataFrame(stats, index=param_string, columns=stats_string)
def hdi(self, var_name: str, credible_mass: float = 0.95):
    """Calculate the highest posterior density interval (HDI)

    This function calculates a *credible interval* which contains the
    ``credible_mass`` most likely values of the parameter, given the data.
    Also known as an HPD interval.

    Parameters
    ----------
    var_name : str
        Name of variable.
    credible_mass : float
        The HDI will cover credible_mass * 100% of the probability mass.
        Default: 0.95, i.e. a 95% HDI.

    Returns
    -------
    (float, float)
        The endpoints of the HPD
    """
    check_credible_mass(credible_mass)
    samples = self.trace[var_name]

    # Feature-detect instead of parsing ``arviz.__version__``: int() on a
    # version component such as "0rc1" raises ValueError.  ``arviz.hdi`` is
    # the name used from ArviZ 0.8 on; older releases only provide ``hpd``.
    if hasattr(arviz, 'hdi'):
        return tuple(arviz.hdi(samples, hdi_prob=credible_mass))
    return tuple(arviz.hpd(samples, credible_interval=credible_mass))
def _contour_two_detectors(self, levels=[0.68], colors=["green"], ax=None, **kwargs):
    """Draw one time-delay annulus per credible level for the first two detectors.

    For every ``(level, color)`` pair the HDI of ``self._dt[0]`` at that
    level gives two time delays, which are passed (in seconds) to
    ``compute_annulus_from_time_delay`` together with the first two
    detectors of ``self._universe.detectors``.

    ``levels`` and ``colors`` must have the same length.  The default lists
    are only read, never mutated.  Extra keyword arguments are forwarded to
    ``compute_annulus_from_time_delay``.
    """
    time_delays = self._dt[0]
    assert len(levels) == len(colors)

    detectors = self._universe.detectors
    names = list(detectors.keys())
    first_detector = detectors[names[0]]
    second_detector = detectors[names[1]]

    for level, color in zip(levels, colors):
        lower_delay, upper_delay = av.hdi(time_delays, hdi_prob=level)
        compute_annulus_from_time_delay(lower_delay * u.s,
                                        upper_delay * u.s,
                                        first_detector,
                                        second_detector,
                                        color=color,
                                        ax=ax,
                                        **kwargs)
def summarize_inference_data(inference_data: az.InferenceData):
    """
    Summarizes an inference_data object into the form that we publish on
    rt.live

    The result is a DataFrame indexed by ``date`` holding the mean, median
    and 80% HDI of ``r_t``, the posterior means of ``infections`` and
    ``test_adjusted_positive`` rescaled to the level of the observed
    positives, a raw test-adjusted positive series, and the observed
    positives and tests themselves.
    """
    posterior = inference_data.posterior
    constant_data = inference_data.constant_data
    r_t = posterior.r_t

    hdi_mass = 80
    hpdi = az.hdi(r_t, hdi_prob=hdi_mass / 100).r_t

    observed_positive = constant_data.observed_positive.to_series()
    tests = constant_data.tests.to_series()
    # Test counts are floored at 10% of their maximum before dividing.
    normalized_positive = observed_positive / tests.clip(0.1 * tests.max())

    def scale_to_positives(data):
        # Rescale ``data`` so that its mean equals the mean observed positives.
        return observed_positive.mean() / np.mean(data) * data

    columns = {
        "mean": r_t.mean(["draw", "chain"]),
        "median": r_t.median(["chain", "draw"]),
        f"lower_{hdi_mass}": hpdi[:, 0],
        f"upper_{hdi_mass}": hpdi[:, 1],
        "infections": scale_to_positives(
            posterior.infections.mean(["draw", "chain"])),
        "test_adjusted_positive": scale_to_positives(
            posterior.test_adjusted_positive.mean(["draw", "chain"])),
        "test_adjusted_positive_raw": scale_to_positives(normalized_positive),
        "positive": observed_positive,
        "tests": tests,
    }
    date_index = pd.Index(posterior.date.values, name="date")
    return pd.DataFrame(data=columns, index=date_index)
def _calculate_hdi_and_map(self, d, mean, interval):
    """Return ``[hdi_lower, MAP, hdi_upper]`` for ``d``, optionally plus the mean.

    The HDI uses probability ``interval``; the MAP estimate comes from
    ``_calculate_map``.  The sample mean is appended as a fourth element
    only when ``mean`` is exactly ``True``.
    """
    bounds = az.hdi(d, hdi_prob=interval)
    mode = _calculate_map(d)
    summary = np.array([bounds[0], mode, bounds[1]])
    return np.append(summary, np.mean(d)) if mean is True else summary
def _get_intervals(draws, method, bounds): # Derives HDI or credible intervals for plotting purposes if method == "hdi": il = az.hdi(draws, bounds) elif method == "quantile": il = np.quantile(draws, bounds) x, kde_density = _calculate_kde(draws, num=100) subx = x[(x > il[0]) & (x < il[1])] kde_density = kde_density[(x > il[0]) & (x < il[1])] return {"x": subx, "y": kde_density}
def make_plot(trace):
    """Plot the training data, the mean logistic curve and the decision boundary.

    Uses the module-level ``x_c`` as x values.  The curve is the posterior
    mean of ``trace['θ']`` with its HDI band; the decision boundary is the
    mean of ``trace['bd']`` drawn as a vertical line with its HDI shaded.
    """
    plot_training_data()

    # plot logistic curve
    theta_draws = trace['θ']
    order = np.argsort(x_c)
    mean_theta = theta_draws.mean(axis=0)
    plt.plot(x_c[order], mean_theta[order], color='C2', lw=3)
    az.plot_hdi(x_c, theta_draws, color='C2')

    # plot decision boundary
    boundary_draws = trace['bd']
    plt.vlines(boundary_draws.mean(), 0, 1, color='k')
    boundary_hdi = az.hdi(boundary_draws)
    plt.fill_betweenx([0, 1], boundary_hdi[0], boundary_hdi[1],
                      color='k', alpha=0.5)
def mk_hpd(obs_positive_trace, data):
    """Add incidence-based positive estimates and 68% HDI bounds to ``data``.

    ``data`` is modified in place (and also returned) with four new columns:
    ``true_incidence`` (mean of column ``i`` of the ``pi`` samples for every
    index label ``i`` of ``data``), ``true_positive`` (``total`` times that
    incidence) and ``true_positive_low`` / ``true_positive_high`` (``total``
    times the lower / upper 68% HDI bound of ``pi``).
    """
    pi_draws = obs_positive_trace["pi"]
    bounds = az.hdi(pi_draws, hdi_prob=0.68)
    totals = data.total.values

    data['true_incidence'] = [pi_draws[:, i].mean() for i in data.index]
    data['true_positive'] = data.total * data.true_incidence
    data['true_positive_low'] = totals * bounds[:, 0]
    data['true_positive_high'] = totals * bounds[:, 1]
    return data
def summarize_median_and_hdi(
    samples,
    prefix: str,
    hdi_prob: typing.Union[float, typing.Iterable[float]] = 0.9
) -> typing.Dict[str, float]:
    """
    Extract median, lower and upper bound and return it as a dict.

    Parameters
    ----------
    samples : array-like
        samples to summarize (N_samples,) or (N_dates, N_samples)
    prefix : str
        name of the quantity
    hdi_prob : float, iterable
        see arviz.hdi
        A scalar yields the keys ``{prefix}_lower`` / ``{prefix}_upper``.
        An iterable yields one ``{prefix}_lower_{p}`` / ``{prefix}_upper_{p}``
        pair per probability ``p``.

    Returns
    -------
    summary : dict
        a dict with median, lower and upper HDI(s); array-valued entries
        are converted to lists to avoid problems in JSON serialization.
    """
    draws = numpy.array(samples).T

    # Pair every requested probability with the suffix of its dict keys.
    if numpy.isscalar(hdi_prob):
        requested = [('', hdi_prob)]
    else:
        requested = [(f'_{hp}', hp) for hp in hdi_prob]

    summary = {prefix: numpy.median(draws, axis=0)}
    for suffix, probability in requested:
        bounds = arviz.hdi(draws, hdi_prob=probability).T
        summary[f'{prefix}_lower{suffix}'] = bounds[0]
        summary[f'{prefix}_upper{suffix}'] = bounds[1]

    # Anything with a non-empty shape is an array: turn it into a list.
    return {
        key: list(value) if numpy.shape(value) else value
        for key, value in summary.items()
    }
def cal_HPD_CI(df, burn_in=2000):
    """Compute the 95% highest-density interval of every column of ``df``.

    Works on the sampled values directly (e.g. an ``mcmc.txt`` table loaded
    into a DataFrame) instead of parsing a log/out file.  Only the 95%
    interval is calculated.

    Args:
        df (pd.DataFrame): one column per quantity, one row per draw.
        burn_in (int, optional): number of leading rows discarded from each
            column before the interval is computed. Defaults to 2000.

    Returns:
        dict: maps each column name to the result of ``az.hdi`` for that
        column.  Empty when ``df`` has no columns.
    """
    # ``DataFrame.items`` replaces ``iteritems``, which pandas 2.0 removed.
    return {
        colname: az.hdi(col.values[burn_in:], hdi_prob=.95)
        for colname, col in df.items()
    }
def plot_simulated_vs_actual_histogram_test(
    observed_histograms: np.array,
    posterior_samples: np.array,
    products_to_test: np.array,
    plot_histograms: bool = False,
    return_raw_simulations: bool = True,
) -> np.ndarray:
    """Simulate review histograms from posterior samples of selected products.

    The posterior samples of the products in ``products_to_test`` are
    flattened into rows of two parameters, one histogram is simulated per
    row with a ``DoubleRhoSimulator``, and the results are optionally
    plotted against the observed histograms.

    Parameters
    ----------
    observed_histograms
        Observed histograms, indexed as ``[product, rating]`` with five
        ratings.
    posterior_samples
        Indexed as ``[sample, product, parameter]``; reshaped below into
        rows of 2 parameters.
    products_to_test
        Product indices to simulate; cast to int.
    plot_histograms
        If true, one figure per tested product shows the normalised
        observed histogram and the 95% HDI band of the simulated ones.
    return_raw_simulations
        If true return the raw simulator output as an array, otherwise the
        normalised histograms of shape ``(n_samples, n_products, 5)``.
    """
    print(posterior_samples.shape)
    products_to_test = products_to_test.astype("int")
    simulated_histograms = np.zeros((posterior_samples.shape[0], len(products_to_test), 5))
    # Get the total number of reviews of the products we want to test
    # We will simulate as many reviews for each products as exist in their observed histograms
    # (currently disabled:)
    # total_reviews = np.sum(observed_histograms[products_to_test, :], axis=1)
    params = {"review_prior": np.ones(5), "tendency_to_rate": 0.05, "simulation_type": "histogram"}
    simulator = simulator_class.DoubleRhoSimulator(params)
    # Take posterior samples of the products we want to test
    # We will simulate distributions using these posterior samples as parameters
    # After swapaxes the rows are grouped by product: all samples of the
    # first product, then all samples of the second, and so on.
    parameters = np.swapaxes(posterior_samples[:, products_to_test, :], 0, 1).reshape((-1, 2))
    # We need to expand total reviews to be same number as the number of simulations to be run
    # (currently disabled:)
    # total_reviews = np.tile(total_reviews[:, None], (1, posterior_samples.shape[0])).flatten()
    simulator.simulation_parameters = {"rho": parameters}
    # One simulation per parameter row, run in parallel on all CPU cores.
    with tqdm_joblib(tqdm(desc="Simulations", total=parameters.shape[0])) as progress_bar:
        simulations = Parallel(n_jobs=mp.cpu_count())(
            delayed(simulator.simulate_review_histogram)(i) for i in range(parameters.shape[0])
        )
    simulations = np.array(simulations)
    # Fortran-order reshape undoes the product-major row ordering produced
    # by swapaxes above, giving [sample, product, rating].
    simulated_histograms[:, :, :] = simulations.reshape((-1, len(products_to_test), 5), order="F")
    # Normalise every histogram so its five entries sum to 1.
    simulated_histograms /= np.sum(simulated_histograms, axis=-1)[:, :, None]
    if plot_histograms:
        for i in range(len(products_to_test)):
            plt.figure()
            # NOTE(review): observed_histograms is indexed by the loop
            # position i, not by products_to_test[i] -- confirm callers pass
            # observed histograms already restricted to the tested products.
            plt.plot(
                np.arange(5) + 1,
                observed_histograms[i, :] / np.sum(observed_histograms[i, :]),
                linewidth=4.0,
                color="black",
            )
            # Get the HPDs of the simulated histograms
            hpd = arviz.hdi(simulated_histograms[:, i, :], hdi_prob=0.95)
            plt.fill_between(np.arange(5) + 1, hpd[:, 0], hpd[:, 1], color="black", alpha=0.4)
            plt.ylim([0, 1])
    if return_raw_simulations:
        return simulations
    else:
        return simulated_histograms
def review_histogram_correlation(
        observed_histograms: np.ndarray, simulated_histograms: np.ndarray) -> np.ndarray:
    """Correlate observed review histograms with their simulated counterparts.

    Parameters
    ----------
    observed_histograms
        Shape ``(n_products, 5)``.
    simulated_histograms
        Shape ``(n_simulations, n_products, 5)``.

    Returns
    -------
    np.ndarray
        One row per product with three Pearson coefficients, in the order
        lower 95% HPD limit, mean, upper 95% HPD limit of the simulated
        histograms.  The p-values are computed but discarded.

    Raises
    ------
    AssertionError
        If the input shapes are inconsistent with the above.
    """
    # Calculates the pearson/linear correlation between observed and simulated review histograms
    # Each histogram is 5 numbers (1 for each rating) - this calculates the correlation between those 5
    # numbers in the observed and simulated histograms
    # Calculates 3 corr. coeffs. in each comparison, using the mean, and the 95% HPD limits of the
    # simulated histograms respectively
    assert (
        observed_histograms.shape[0] == simulated_histograms.shape[1]), f"""
        Observed histograms have {observed_histograms.shape[0]} products
        while simulated histograms have {simulated_histograms.shape[1]} products. Need to be equal
        """
    assert (
        observed_histograms.shape[1] == 5
    ), f"Observed review histograms need to be 5D, found shape {observed_histograms.shape} instead"
    assert (
        simulated_histograms.shape[2] == 5
    ), f"Simulated review histograms need to be 5D, found shape {simulated_histograms.shape} instead"
    # Calculate mean and 95% HPD of the simulated histograms
    simulation_mean = np.mean(simulated_histograms, axis=0)
    assert (observed_histograms.shape == simulation_mean.shape), f"""
        Mean of all simulated histograms for the products should have the same shape
        as the set of observed histograms of products
        """
    # One (5, 2) array of [lower, upper] limits per product.
    hpd = np.array([
        arviz.hdi(simulated_histograms[:, i, :], hdi_prob=0.95)
        for i in range(observed_histograms.shape[0])
    ])
    assert hpd.shape == observed_histograms.shape + tuple((2, )), f"""
        Shape of hpd array should be {observed_histograms.shape + (2,)}, found {hpd.shape} instead
        """
    # Will store correlations in the order of HPD_0, mean, HPD_1
    correlations = []
    for product in range(hpd.shape[0]):
        r_0, p_0 = pearsonr(observed_histograms[product, :], hpd[product, :, 0])
        r_mean, p_mean = pearsonr(observed_histograms[product, :], simulation_mean[product, :])
        r_1, p_1 = pearsonr(observed_histograms[product, :], hpd[product, :, 1])
        correlations.append([r_0, r_mean, r_1])
    return np.array(correlations)
def pystan_adjust(self, model_params: Dict, execution_params: Dict = {}) -> Union[Tuple, pystan.StanModel]:
    """Fit PyStan models until one is bounded and passes the HMC diagnostics.

    Parameters
    ----------
    model_params : Dict
        Model inputs, validated with ``ModelParamsSchema``.  Must contain
        ``y_prev_obs`` and ``n_prev_obs``.
    execution_params : Dict
        Optional settings; only ``credible_interval_size`` (default 0.95)
        is read.  The default dict is never mutated.

    Returns
    -------
    tuple or fit object
        ``(lower, median, upper)`` where ``lower``/``upper`` are the HDI
        bounds of the samples, or the fit object when ``self.return_fit``
        is set.  ``(None, None, None)`` is returned when validation fails,
        the trial limit is reached, or a ``ZeroDivisionError`` /
        ``ValueError`` occurs while fitting or checking the result.
    """
    credible_interval_size = execution_params.get('credible_interval_size', 0.95)

    # Validate model_params using marshmallow
    try:
        ModelParamsSchema().load(model_params)
    except ValidationError as err:
        print("Error: ", err.messages)
        return None, None, None

    satisfactory_model_found = False
    n_trials = 0
    while not satisfactory_model_found:
        # If number of attempts exceeded, return Nones
        # NOTE(review): the limit is checked before fitting, so at most
        # ``trials_lim - 1`` models are actually fitted -- confirm intended.
        n_trials += 1
        if n_trials >= self.trials_lim:
            # The original message lacked the f-prefix and named an
            # undefined variable, so the placeholder was printed verbatim.
            print(f'no models met the HMC diagnostics in {self.trials_lim} trials')
            return None, None, None

        # Attempt to fit a model
        try:
            summary_df_parsed, hmc_diagnostics_passed = self.fit_one_pystan_model(model_params)
        except ZeroDivisionError:
            return None, None, None

        # get model result
        model_result = summary_df_parsed['50%']
        lower, upper = arviz.hdi(summary_df_parsed['samples'], credible_interval_size)
        best_fit = summary_df_parsed['fit']

        try:
            raw_prev = model_params['y_prev_obs'] / model_params['n_prev_obs']
            bounded = result_is_bounded(model_result, raw_prev)
        except (ZeroDivisionError, ValueError):
            return None, None, None

        satisfactory_model_found = bounded and hmc_diagnostics_passed

    if self.return_fit:
        return best_fit
    return lower, model_result, upper
# load idata # m_idata = az.from_netcdf("../models_python/idata_covariation_generic.nc") ### meta-data ### model_type = "covariation" prior_level = "generic" kind = "individual" ID = 0 for ID in idx_unique: # only relevant idx ID_tmp = m_idata.posterior_predictive.sel(idx=ID) # small and large small = az.hdi(ID_tmp, hdi_prob=0.8)["y_pred"] large = az.hdi(ID_tmp, hdi_prob=0.95)["y_pred"] # y values for the right idx y = train[train["idx"] == ID].y.values # plot fig, ax = plt.subplots(figsize=(10, 7)) ax.scatter(t_unique, y, color="darkorange", s=50) ax.vlines(t_unique, small.sel(hdi="lower"), small.sel(hdi="higher"), color="orange", alpha=0.5, linewidth=15) ax.vlines(t_unique,
# Summary attributes of the posterior sample.  These are bare expressions:
# their values are only shown when the script is run interactively.
post_summary.minmax
post_summary.mean
post_summary.variance

# Sum of the grid posterior over grid points below 0.5 ...
sum(posterior[p_grid < 0.5])
# ... and the fraction of posterior samples below 0.5.
sum(post_sample < 0.5)/len(post_sample)
# Fraction of samples strictly between 0.5 and 0.75.
len(post_sample[(post_sample>.5) & (post_sample<.75)])/len(post_sample)

# 20th, 50th and 80th percentiles of the samples.
np.percentile(post_sample, q = (20, 50, 80))

# Central 50% percentile interval (25th to 75th percentile), shaded under
# the posterior density curve in steps of 0.01.
interval = np.percentile(post_sample, q = (25, 75))
section = np.arange(interval[0], interval[1], 1/100)
plt.plot(p_grid, post_density(p_grid), 'k-')
plt.fill_between(section, post_density(section))
plt.title('50% Percentile Interval')
plt.xlabel('proportion water')
plt.ylabel('Density')
plt.show()

# Index of the largest grid posterior value, and the grid point at that index.
np.argmax(posterior)
p_grid[np.argmax(posterior)]

# highest posterior density interval, delegated to a library helper
# NOTE(review): which `hdi` this is, and what `alpha` means for it, is not
# visible in this fragment -- confirm against the import.
hdi(post_sample, alpha=0.5)

# TODO: maybe include an example with a loss function

### 3.3: Sampling to simulate prediction
# (nothing new here)
def _mean_with_hdi(trace, pairs):
    """Map each label in ``pairs`` to ``(hdi_lower, mean, hdi_upper)`` of its trace variable."""
    summary = {}
    for label, var_name in pairs:
        draws = trace[var_name]
        bounds = az.hdi(draws)
        summary[label] = (bounds[0], draws.mean(), bounds[1])
    return summary


def statistical_analysis(serial, bl, opt):
    """Bayesian comparison of serial, baseline and optimised measurements.

    Inspired by https://docs.pymc.io/notebooks/BEST.html

    Every group (``serial``, ``bl`` and each entry of ``opt``) is modelled
    with a Student-t likelihood that shares one degrees-of-freedom
    parameter, with a Normal prior on the group mean and a Uniform prior on
    the group standard deviation.

    Parameters
    ----------
    serial, bl
        Observed values of the serial and baseline variants.
    opt : mapping
        Observed values of each optimised variant, keyed by variant name.

    Returns
    -------
    tuple of dict
        ``(runtime, speedup, improv, prob)``.  The first three map a label
        (``'serial'``, ``'bl'``, ``'opt_<name>'``) to
        ``(hdi_lower, mean, hdi_upper)`` of, respectively, the group mean,
        ``serial_mean / group_mean`` and ``bl_mean / opt_mean``.  ``prob``
        maps a label to the fraction of draws in which the corresponding
        mean difference (``serial - bl`` or ``bl - opt``) is positive.
    """
    y = create_pd_data_frame(serial, bl, opt)
    μ_m = y.value.mean()
    μ_s = y.value.std()
    # Very wide priors: three orders of magnitude around the pooled std.
    σ_low = μ_s / 1000
    σ_high = μ_s * 1000

    with pm.Model() as model:
        serial_mean = pm.Normal('serial_mean', mu=μ_m, sd=σ_high)
        serial_std = pm.Uniform('serial_std', lower=σ_low, upper=σ_high)
        λ_serial = serial_std**-2

        bl_mean = pm.Normal('bl_mean', mu=μ_m, sd=σ_high)
        bl_std = pm.Uniform('bl_std', lower=σ_low, upper=σ_high)
        λ_bl = bl_std**-2

        opt_mean = {}
        opt_std = {}
        λ_opt = {}
        for f in opt:
            opt_mean[f] = pm.Normal('opt_{}_mean'.format(f), mu=μ_m, sd=σ_high)
            opt_std[f] = pm.Uniform('opt_{}_std'.format(f), lower=σ_low, upper=σ_high)
            λ_opt[f] = opt_std[f]**-2

        ν = pm.Exponential('ν_minus_one', 1 / 29.) + 1

        dist_serial = pm.StudentT('serial', nu=ν, mu=serial_mean, lam=λ_serial, observed=serial)
        dist_bl = pm.StudentT('bl', nu=ν, mu=bl_mean, lam=λ_bl, observed=bl)
        dist_opt = {}
        for f in opt:
            dist_opt[f] = pm.StudentT('opt_{}'.format(f), nu=ν, mu=opt_mean[f],
                                      lam=λ_opt[f], observed=opt[f])

        dmean_serial_bl = pm.Deterministic('dmean_serial_bl', serial_mean - bl_mean)
        dmean_bl_opt = {}
        for f in opt:
            dmean_bl_opt[f] = pm.Deterministic('dmean_bl_opt_{}'.format(f),
                                               bl_mean - opt_mean[f])

        speedup_bl = pm.Deterministic('speedup_bl', serial_mean / bl_mean)
        speedup_opt = {}
        improv_opt = {}
        for f in opt:
            # Store per variant; the original rebound the dict names to the
            # latest node on every iteration.
            speedup_opt[f] = pm.Deterministic('speedup_opt_{}'.format(f),
                                              serial_mean / opt_mean[f])
            improv_opt[f] = pm.Deterministic('improv_opt_{}'.format(f),
                                             bl_mean / opt_mean[f])

        trace = pm.sample(draws=3000, tune=2000)

    # (label, trace variable) pairs for each family of results.
    opt_labels = [(f, 'opt_{}'.format(f)) for f in opt]
    res1 = [('serial', 'serial_mean'), ('bl', 'bl_mean')]
    res1 += [(label, 'opt_{}_mean'.format(f)) for f, label in opt_labels]
    res2 = [('bl', 'speedup_bl')]
    res2 += [(label, 'speedup_opt_{}'.format(f)) for f, label in opt_labels]
    res3 = [(label, 'improv_opt_{}'.format(f)) for f, label in opt_labels]
    res4 = [('bl', 'dmean_serial_bl')]
    res4 += [(label, 'dmean_bl_opt_{}'.format(f)) for f, label in opt_labels]

    runtime = _mean_with_hdi(trace, res1)
    speedup = _mean_with_hdi(trace, res2)
    improv = _mean_with_hdi(trace, res3)

    prob = {}
    for label, var_name in res4:
        tr = trace[var_name]
        prob[label] = (tr > 0).sum() / len(tr)

    return (runtime, speedup, improv, prob)
def test_time_varying_model():
    """Fit the time-varying HMM to simulated data and check its estimates.

    Data are simulated from known ``xis`` values, the model is fitted on
    the first 70% of the series, the 95% HDI of ``xis`` is checked against
    the true values, and posterior-predictive states for the held-out part
    are compared with the simulated ones.
    """
    np.random.seed(1039)
    data = gen_toy_data()
    formula_str = "1 + C(weekday)"
    X_df = patsy.dmatrix(formula_str, data, return_type="dataframe")
    X_np = X_df.values
    xi_shape = X_np.shape[1]
    # True coefficients, one column vector per entry stacked along axis 1.
    xi_0_true = np.array([2.0, -2.0, 2.0, -2.0, 2.0, -2.0, 2.0]).reshape(xi_shape, 1)
    xi_1_true = np.array([2.0, -2.0, 2.0, -2.0, 2.0, -2.0, 2.0]).reshape(xi_shape, 1)
    xis_rv_true = np.stack([xi_0_true, xi_1_true], axis=1)
    # Simulate one realisation from the model with the true coefficients.
    with pm.Model(**TV_CONFIG) as sim_model:
        _ = create_dirac_zero_hmm(X_np, mu=1000, xis=xis_rv_true, observed=np.zeros(X_np.shape[0]))
    sim_point = pm.sample_prior_predictive(samples=1, model=sim_model)
    y_t = sim_point["Y_t"].squeeze().astype(int)
    # First 70% for training; the remaining simulated states are the test target.
    split = int(len(y_t) * 0.7)
    train_y, test_V = y_t[:split], sim_point["V_t"].squeeze()[split:]
    train_X, test_X = X_np[:split, :], X_np[split:, :]
    # Shared variables so the data can be swapped for the test split later.
    X = shared(train_X, name="X", borrow=True)
    Y = shared(train_y, name="y_t", borrow=True)
    with pm.Model() as model:
        xis_rv = pm.Normal("xis", 0, 10, shape=xis_rv_true.shape)
        _ = create_dirac_zero_hmm(X, 1000, xis_rv, Y)
    number_of_draws = 500
    with model:
        steps = [
            FFBSStep([model.V_t]),
            pm.NUTS(
                vars=[
                    model.gamma_0,
                    model.Gamma,
                ],
                target_accept=0.90,
            ),
        ]
    with model:
        posterior_trace = pm.sample(
            draws=number_of_draws,
            step=steps,
            random_seed=100,
            return_inferencedata=True,
            chains=1,
            cores=1,
            progressbar=True,
            idata_kwargs={"dims": {
                "Y_t": ["date"],
                "V_t": ["date"]
            }},
        )
    # Update the shared variable values
    Y.set_value(np.ones(test_X.shape[0], dtype=Y.dtype))
    X.set_value(test_X)
    model.V_t.distribution.shape = (test_X.shape[0], )
    hdi_data = az.hdi(posterior_trace, hdi_prob=0.95, var_names=["xis"]).to_dataframe()
    hdi_data = hdi_data.unstack(level="hdi")
    xis_true_flat = xis_rv_true.squeeze().flatten()
    # Check every flattened coefficient except indices 3, 7, 11, ...
    # NOTE(review): why those indices are excluded is not visible here.
    check_idx = ~np.in1d(np.arange(len(xis_true_flat)), np.arange(3, len(xis_true_flat), step=4))
    assert np.all(
        xis_true_flat[check_idx] <= hdi_data["xis", "higher"].values[check_idx])
    assert np.all(
        xis_true_flat[check_idx] >= hdi_data["xis", "lower"].values[check_idx])
    # Drop the sampled Gamma and V_t so they are re-drawn for the test split.
    trace = posterior_trace.posterior.drop_vars(["Gamma", "V_t"])
    with aesara.config.change_flags(compute_test_value="off"):
        adds_pois_ppc = pm.sample_posterior_predictive(
            trace, var_names=["V_t", "Y_t", "Gamma"], model=model)
    # Mean absolute state error, scaled by the test length, must be below 1%.
    assert (np.abs(adds_pois_ppc["V_t"] - test_V) / test_V.shape[0]).mean() < 1e-2
# Logistic regression of iris species (setosa vs. versicolor) on the centred
# first feature column, with the decision boundary b = -alpha / beta.
df = sns.load_dataset('iris')
iris = df.query("species == ('setosa', 'versicolor')")
y = pd.Categorical(iris['species']).codes
x = iris[iris.columns[:-1]].values
# Keep only the first feature column and centre it.
x = x[:, 0] - x[:, 0].mean()
print(x)

with pm.Model() as model:
    alpha = pm.Normal('alpha', 0, 10)
    beta = pm.Normal('beta', 0, 10)
    mu = alpha + pm.math.dot(x, beta)
    p = pm.Deterministic('p', pm.math.sigmoid(mu))
    y_lik = pm.Bernoulli('y_lik', p=p, observed=y)
    # Decision boundary: the x value at which p = 0.5.
    b = pm.Deterministic('b', -alpha / beta)
    trace_m = pm.sample(draws=1000, cores=1, chains=3, random_seed=1)
    pp = pm.sample_posterior_predictive(trace_m)

_, ax = plt.subplots(figsize=(12, 8))
xs = np.linspace(x.min(), x.max(), 1000)
theta = trace_m['p'].mean(axis=0)
# Logistic curve evaluated at the posterior means of alpha and beta.
# x/y are passed by keyword: seaborn >= 0.12 no longer accepts them
# positionally.
sns.lineplot(x=xs,
             y=1 / (1 + np.exp(-(trace_m['alpha'].mean(axis=0) +
                                 trace_m['beta'].mean(axis=0) * xs))),
             ax=ax)
plt.vlines(trace_m['b'].mean(axis=0), 0, 1)
az.plot_hdi(x, trace_m['p'], ax=ax)
# 98% HDI of the decision boundary, shaded as a vertical band.
hdi = az.hdi(trace_m['b'], hdi_prob=0.98)
plt.fill_betweenx([0, 1], hdi[0], hdi[1], color='k', alpha=0.5)
sns.scatterplot(x=x, y=y, ax=ax)
plt.xlabel('sepal_length')
plt.show()
# Overall HDI plot of the predictions for the test data.
fh.plot_hdi(t=t_test, y=y_test, n_idx=n_idx_test, m_idata=m_idata, model_type="covariation", prior_level="generic", kind="predictions")

model_type = "covariation"
prior_level = "generic"

# plot hdi for individual aliens
for ID in idx_unique_test:
    # only the predictions belonging to this idx
    ID_tmp = m_idata.predictions.sel(idx=ID)
    # narrow (80%) and wide (95%) hdi interval of y_pred
    hdi1 = az.hdi(ID_tmp, hdi_prob=0.8)["y_pred"]
    hdi2 = az.hdi(ID_tmp, hdi_prob=0.95)["y_pred"]
    # observed y values for this idx
    y = test[test["idx"] == ID].y.values
    fh.hdi_ID(t_unique=t_unique_test, y=y, hdi1=hdi1, hdi2=hdi2, model_type=model_type, prior_level=prior_level, type="test", ID=ID)
"fs_cos_2": daypart_fs_cos_2[test_index], "fs_cos_3": daypart_fs_cos_3[test_index], "fs_cos_4": daypart_fs_cos_4[test_index], "fs_cos_5": daypart_fs_cos_5[test_index], "cooling_temp": outdoor_temp_c[test_index], "heating_temp": outdoor_temp_h[test_index] }) posterior_hdi = pm.sample_posterior_predictive(partial_pooling_trace, keep_size=True) posterior = pm.sample_posterior_predictive(partial_pooling_trace) # Calculate predictions and HDI predictions = np.exp(posterior['y'].mean(0)) hdi_data = az.hdi(posterior_hdi) lower_bound = np.array(np.exp( hdi_data.to_array().sel(hdi='lower'))).flatten() higher_bound = np.array(np.exp( hdi_data.to_array().sel(hdi='higher'))).flatten() # Calculate cvrmse and coverage of the HDI mse = mean_squared_error(df.total_electricity[test_index], predictions) rmse = sqrt(mse) cvrmse = rmse / df.total_electricity.mean() coverage = sum((lower_bound <= df.total_electricity[test_index]) & (df.total_electricity[test_index] <= higher_bound) ) * 100 / len(test_index) partial_pooling_cv_accuracy.append(cvrmse) coverage_list.append(coverage)
def optimum_intervals(
    self,
    hdi_prob=0.95,
    multimodal=True,
    opt_samples=200,
    space_samples=500,
    only_mean=True,
    random_state=None,
):
    """Estimate highest density intervals for the optimum.

    Thompson sampling is used to draw samples from the distribution of the
    optimum: random points of the search space are evaluated on sampled
    functions and the minimiser of each sampled function is kept.  Highest
    density intervals are then estimated for each dimension separately.

    Parameters
    ----------
    hdi_prob : float, default=0.95
        The total probability each interval should cover.
    multimodal : bool, default=True
        If True, more than one interval can be returned for one parameter.
    opt_samples : int, default=200
        Number of samples to generate from the optimum distribution.
    space_samples : int, default=500
        Number of samples to cover the optimization space with.
    only_mean : bool, default=True
        If True, it will only sample optima from the mean Gaussian process.
        This is usually faster, but can underestimate the uncertainty.
        If False, it will also sample the hyperposterior of the kernel
        parameters.
    random_state : int, RandomState instance or None, optional (default: None)
        Passed both to the space sampler and to the Gaussian process
        sampler.  If int, it is the seed used by the random number
        generator; if RandomState instance, it is the random number
        generator; if None, the generator is the RandomState instance used
        by `np.random`.

    Returns
    -------
    intervals : list of ndarray
        Outputs an array of size (n_modes, 2) for each dimension in the
        optimization space.

    Raises
    ------
    NotImplementedError
        If the user calls the function on an optimizer containing at least
        one categorical parameter.
    """
    if self.space.is_partly_categorical:
        raise NotImplementedError(
            "Highest density interval not implemented for categorical parameters."
        )

    # Random candidate points, mapped into the transformed search space.
    candidates = self.space.transform(
        self.space.rvs(n_samples=space_samples, random_state=random_state))
    sampled_values = self.gp.sample_y(candidates,
                                      sample_mean=only_mean,
                                      n_samples=opt_samples,
                                      random_state=random_state)
    # For every sampled function keep the candidate where it is smallest.
    optima = candidates[np.argmin(sampled_values, axis=0)]

    # Intervals are computed per dimension in transformed space and mapped
    # back to the original scale of that dimension.
    return [
        self.space.dimensions[dim].inverse_transform(
            hdi(column, hdi_prob=hdi_prob, multimodal=multimodal))
        for dim, column in enumerate(optima.T)
    ]
def get_predictions(self, trace, empirical=True, return_summary_stats=True, thin=5):
    """Predict ``chose_risky_mean`` on the test matrix from posterior draws.

    The model's mean prediction is evaluated on the matrix returned by
    ``self.create_test_matrix``, joined with that matrix, and thinned by
    keeping every ``thin``-th entry of the second index level.

    Returns
    -------
    pandas.DataFrame
        If ``return_summary_stats`` is true: mean and HDI (columns
        ``lower`` / ``higher``) of ``chose_risky_mean`` per
        ``(subject, x, risky_first, n_safe)`` group.  Otherwise the thinned
        draw-level predictions.
    """
    test_data = self.create_test_matrix(empirical=empirical)
    test_data.index.name = 'test_values'

    prediction = self.model.predict(trace, 'mean', test_data, inplace=False)
    pred = prediction['posterior']['chose_risky_mean'].to_dataframe()
    # Name the last index level so it can be joined with the test matrix.
    pred.index = pred.index.set_names('test_values', -1)

    every_nth = (slice(None), slice(None, None, thin))
    pred = pred.join(test_data).loc[every_nth, :]

    if not return_summary_stats:
        return pred

    def hdi_bounds(values):
        return pd.Series(az.hdi(values.values), index=['lower', 'higher'])

    grouped = pred.groupby(['subject', 'x', 'risky_first', 'n_safe'])
    m = grouped[['chose_risky_mean']].mean()
    ci = grouped['chose_risky_mean'].apply(hdi_bounds).unstack()
    return m.join(ci)
def plot_test_parameter_recovery(
    parameters: np.array,
    num_posterior_samples: int,
    simulator_type: str,
    simulation_type: str,
    plot_posteriors: bool = False,
    get_stats: bool = False,
    param_posterior_prob_band: Optional[float] = None,
) -> np.ndarray:
    """Simulate data from known parameters and check how well they are recovered.

    Review histograms are simulated from ``parameters``, posterior samples
    are inferred for them with a previously saved inference object, and the
    posteriors are optionally plotted and/or summarised in
    ``stats_parameter_recovery.txt`` under ``ARTIFACT_PATH``.

    Parameters
    ----------
    parameters
        Array with one row per trial and two columns (rho-, rho+).
    num_posterior_samples
        Number of posterior samples to draw per trial.
    simulator_type, simulation_type
        Forwarded to the simulator / the loading of the saved simulator.
    plot_posteriors
        If true, plot one histogram panel per trial (up to 4 per row).
    get_stats
        If true, write recovery statistics to file and plot the posterior
        probability placed in a band around the true values.
    param_posterior_prob_band
        Half-width of that band; required when ``get_stats`` is true.

    Returns
    -------
    np.ndarray
        The posterior samples, indexed as ``[sample, trial, parameter]``.
    """
    # Simulate review histograms using provided parameters
    params = {"review_prior": np.ones(5), "tendency_to_rate": 0.05, "simulation_type": simulation_type}
    simulator = simulator_class.DoubleRhoSimulator(params)
    simulator.simulation_parameters = {"rho": parameters}
    with tqdm_joblib(tqdm(desc="Simulations", total=parameters.shape[0])) as progress_bar:
        simulations = Parallel(n_jobs=mp.cpu_count())(
            delayed(simulator.simulate_review_histogram)(i) for i in range(parameters.shape[0])
        )
    simulations = np.array(simulations)
    # The parameter prior doesn't matter here as it will be overridden by that of the loaded inference object
    parameter_prior = sbi.utils.BoxUniform(
        low=torch.tensor([0.0, 0.0]).type(torch.FloatTensor), high=torch.tensor([4.0, 4.0]).type(torch.FloatTensor)
    )
    inferrer = inference_class.HistogramInference(parameter_prior=parameter_prior)
    inferrer.load_simulator(dirname=ARTIFACT_PATH, simulator_type=simulator_type, simulation_type=simulation_type)
    inferrer.load_inference(dirname=ARTIFACT_PATH)
    posterior_samples = inferrer.get_posterior_samples(simulations, num_samples=num_posterior_samples)
    # Plot the posterior samples inferred for the simulated data
    # We will plot upto 4 plots in one row of the panel
    if plot_posteriors:
        if len(parameters) <= 4:
            fig, ax = plt.subplots(1, len(parameters), squeeze=False)
        else:
            fig, ax = plt.subplots((len(parameters) + 1) // 4, 4, squeeze=False)
        row_index = 0
        for i in range(len(parameters)):
            if len(parameters) > 4:
                row_index = i // 4
            # Black: posterior of rho- with the true value as a dashed line.
            ax[row_index, i % 4].hist(
                posterior_samples[:, i, 0], color="black", alpha=0.5, bins=10, label=r"$\rho_{-}$"
            )
            ax[row_index, i % 4].axvline(x=parameters[i, 0], linewidth=3.0, color="black", linestyle="--")
            # Red: posterior of rho+ with the true value as a dashed line.
            ax[row_index, i % 4].hist(posterior_samples[:, i, 1], color="red", alpha=0.5, bins=10, label=r"$\rho_{+}$")
            ax[row_index, i % 4].axvline(x=parameters[i, 1], linewidth=3.0, color="red", linestyle="--")
            ax[row_index, i % 4].set_xlim([0, 4])
            ax[row_index, i % 4].set_xticks([0, 1, 2, 3, 4])
            ax[row_index, i % 4].legend()
        # add a big axis, hide frame
        fig.add_subplot(111, frameon=False)
        # hide tick and tick label of the big axis
        plt.tick_params(labelcolor="none", top=False, bottom=False, left=False, right=False)
        plt.xlabel(r"$\rho_{-}, \rho_{+}$")
        plt.ylabel("Number of samples")
    # If asked, print how many of the provided parameters are recovered by the inference engine
    # i.e, how often do the supplied parameters lie within the 95% HPD of the posterior
    if get_stats:
        # NOTE(review): the file is closed only by the f.close() further
        # down, so it stays open if one of the assertions below fails.
        f = open(ARTIFACT_PATH / "stats_parameter_recovery.txt", "w")
        assert (
            posterior_samples.shape == (num_posterior_samples,) + parameters.shape
        ), f"""
        Expected shape {(num_posterior_samples,) + parameters.shape} for array of posterior samples,
        but got {posterior_samples.shape} instead
        """
        # First get the HPD of each recovered posterior distribution
        hpd = np.array([arviz.hdi(posterior_samples[:, i, :], hdi_prob=0.95) for i in range(parameters.shape[0])])
        assert hpd.shape == parameters.shape + (2,), f"Found shape {hpd.shape} for hpd"
        # See how many of the supplied rho_- and rho_+ are contained in these HPDs
        contained_rho_0 = [
            True if (parameters[i, 0] < hpd[i, 0, 1] and parameters[i, 0] > hpd[i, 0, 0]) else False
            for i in range(parameters.shape[0])
        ]
        contained_rho_1 = [
            True if (parameters[i, 1] < hpd[i, 1, 1] and parameters[i, 1] > hpd[i, 1, 0]) else False
            for i in range(parameters.shape[0])
        ]
        print(
            f"""
        rho- is recovered {np.sum(contained_rho_0)} times out of {parameters.shape[0]}
        = {100*(np.sum(contained_rho_0) / parameters.shape[0]):0.2f}%"
        """,
            file=f,
        )
        print(
            f"""
        rho+ is recovered {np.sum(contained_rho_1)} times out of {parameters.shape[0]}
        = {100*(np.sum(contained_rho_1) / parameters.shape[0]):0.2f}%"
        """,
            file=f,
        )
        print("=======================================================", file=f)
        # Now get the probability that the posterior distribution puts in a band/region around
        # the passed parameter values. For good parameter recovery, this number should be high
        assert (
            param_posterior_prob_band is not None
        ), f"""
        Posterior probability band around parameter values need to be passed if stats are needed
        """
        param_band_low = parameters - param_posterior_prob_band
        param_band_high = parameters + param_posterior_prob_band
        # Fraction of posterior samples inside the band, per trial.
        rho_0_probs = (posterior_samples[:, :, 0] >= param_band_low[None, :, 0]) * (
            posterior_samples[:, :, 0] <= param_band_high[None, :, 0]
        )
        rho_0_probs = np.mean(rho_0_probs, axis=0)
        rho_1_probs = (posterior_samples[:, :, 1] >= param_band_low[None, :, 1]) * (
            posterior_samples[:, :, 1] <= param_band_high[None, :, 1]
        )
        rho_1_probs = np.mean(rho_1_probs, axis=0)
        print(
            f"""
        In {100*np.mean(rho_0_probs>=0.5):0.2f}% of cases, the inferred posterior places more than 50% probability
        in a band of {2*param_posterior_prob_band} around the true value of rho-
        """,
            file=f,
        )
        print(
            f"""
        In {100*np.mean(rho_1_probs>=0.5):0.2f}% of cases, the inferred posterior places more than 50% probability
        in a band of {2*param_posterior_prob_band} around the true value of rho+
        """,
            file=f,
        )
        f.close()
        # Finally, plot the distribution of the posterior probability the inference engine places in a
        # band around the true value of rho- and rho+
        plt.figure()
        plt.hist(rho_0_probs, alpha=0.5, label=r"$\rho_{-}$")
        plt.hist(rho_1_probs, alpha=0.5, label=r"$\rho_{+}$")
        plt.legend()
        plt.title(
            f"Posterior probability placed by inference engine in a band of {2*param_posterior_prob_band}"
            + f"\n around the true value of the parameters ({parameters.shape[0]} trials)",
            fontsize=24.0,
        )
    return posterior_samples
# Draw 1000 posterior samples and summarise the listed variables.
trace_0 = pm.sample(1000)
varnames = ['α', 'β', 'bd']
az.summary(trace_0, varnames)

# Posterior mean of θ, plotted against x_c in increasing x order.
theta = trace_0['θ'].mean(axis=0)
idx = np.argsort(x_c)
plt.figure()
# plot logistic curve
plt.plot(x_c[idx], theta[idx], color='C2', lw=3)
az.plot_hdi(x_c, trace_0['θ'], color='C2')
# plot decision boundary: mean of bd as a line, its HDI as a shaded band
plt.vlines(trace_0['bd'].mean(), 0, 1, color='k')
bd_hpd = az.hdi(trace_0['bd'])
plt.fill_betweenx([0, 1], bd_hpd[0], bd_hpd[1], color='k', alpha=0.5)
# plot jittered data (Gaussian noise with sd 0.02 on y), coloured by class
plt.scatter(x_c, np.random.normal(y_0, 0.02), marker='.', color=[f'C{x}' for x in y_0])
plt.xlabel(x_n)
plt.ylabel('p(y=1)', rotation=0)
# use original scale for xticks: tick labels are shifted by xmean
locs, _ = plt.xticks()
plt.xticks(locs, np.round(locs + xmean, 1))
#plt.xticks(x_c[idx], np.round(x_0[idx], 1))
plt.tight_layout()
fp = logistic(pred_samples['f_pred']) fp_mean = np.mean(fp, 0) ax.plot(X_new[:, 0], fp_mean) # plot the data (with some jitter) and the true latent function ax.scatter(x_1, np.random.normal(y, 0.02), marker='.', color=[f'C{x}' for x in y]) az.plot_hdi(X_new[:, 0], fp, color='C2') db = np.array([find_midpoint(f, X_new[:, 0], 0.5) for f in fp]) db_mean = db.mean() db_hpd = az.hdi(db) ax.vlines(db_mean, 0, 1, color='k') ax.fill_betweenx([0, 1], db_hpd[0], db_hpd[1], color='k', alpha=0.5) ax.set_xlabel('sepal_length') ax.set_ylabel('θ', rotation=0) pml.savefig('gp_classify_iris1.pdf', dpi=300) # Change kernel to be sum of SE and linear, to improve tail behavior with pm.Model() as model_iris2: #ℓ = pm.HalfCauchy("ℓ", 1) ℓ = pm.Gamma('ℓ', 2, 0.5) c = pm.Normal('c', x_1.min()) τ = pm.HalfNormal('τ', 5) cov = (pm.gp.cov.ExpQuad(1, ℓ) + τ * pm.gp.cov.Linear(1, c) + pm.gp.cov.WhiteNoise(1E-5))
def _build_pooling_model(pooling, coords, train_data, log_y_train):
    """Build one regression model on the training fold.

    Parameters
    ----------
    pooling : str
        ``"partial"`` (cluster-level coefficients share hyperpriors),
        ``"no"`` (cluster-level coefficients with fixed Normal(0, 1) priors)
        or ``"complete"`` (one scalar coefficient per term).
    coords : dict
        Coordinates handed to ``pm.Model``.
    train_data : dict
        Maps ``pm.Data`` container names to 1-D training arrays. Every entry
        is registered on the model with ``dims="obs_id"``.
    log_y_train : array-like
        Observed log electricity for the training fold.

    Returns
    -------
    pm.Model
        The model, with a Normal likelihood named ``"y"``.

    Raises
    ------
    ValueError
        If ``pooling`` is not one of the three supported values.
    """
    if pooling not in ("partial", "no", "complete"):
        raise ValueError(f"unknown pooling scheme: {pooling!r}")

    # (coefficient name, data container name) for the six Fourier terms.
    fourier_terms = (
        ("bs1", "fs_sin_1"), ("bs2", "fs_sin_2"), ("bs3", "fs_sin_3"),
        ("bc1", "fs_cos_1"), ("bc2", "fs_cos_2"), ("bc3", "fs_cos_3"),
    )

    with pm.Model(coords=coords) as model:
        data = {
            name: pm.Data(name, values, dims="obs_id")
            for name, values in train_data.items()
        }

        if pooling == "complete":
            # A single scalar coefficient per term, shared by all observations.
            mu = pm.Normal("a", mu=0.0, sigma=1.0)
            cooling_slope = pm.Normal("btclp", mu=0.0, sigma=1.0)
            heating_slope = pm.Normal("bthlp", mu=0.0, sigma=1.0)
            for coef_name, feature in fourier_terms:
                coef = pm.Normal(coef_name, mu=0.0, sigma=1.0)
                mu = mu + coef * data[feature]
        else:
            cluster = data["profile_cluster_idx"]
            daypart = data["daypart"]

            if pooling == "partial":
                # Hyperpriors shared by the cluster-level coefficients.
                bf = pm.Normal("bf", mu=0.0, sigma=1.0)
                sigma_bf = pm.Exponential("sigma_bf", 1.0)
                a = pm.Normal("a", mu=0.0, sigma=1.0)
                sigma_a = pm.Exponential("sigma_a", 1.0)
                intercept_prior = {"mu": a, "sigma": sigma_a}
                slope_prior = {"mu": bf, "sigma": sigma_bf}
            else:
                intercept_prior = {"mu": 0.0, "sigma": 1.0}
                slope_prior = {"mu": 0.0, "sigma": 1.0}

            btclp = pm.Normal("btclp", mu=0.0, sigma=1.0, dims="daypart")
            bthlp = pm.Normal("bthlp", mu=0.0, sigma=1.0, dims="daypart")

            # Varying intercepts, one per (daypart, cluster) pair.
            a_cluster = pm.Normal(
                "a_cluster", dims=("daypart", "profile_cluster"),
                **intercept_prior)
            mu = a_cluster[daypart, cluster]

            # Varying Fourier slopes, one per cluster.
            for coef_name, feature in fourier_terms:
                coef = pm.Normal(coef_name, dims="profile_cluster",
                                 **slope_prior)
                mu = mu + coef[cluster] * data[feature]

            cooling_slope = btclp[daypart]
            heating_slope = bthlp[daypart]

        mu = (mu
              + cooling_slope * data["cooling_temp_lp"]
              + heating_slope * data["heating_temp_lp"])

        # Model error and likelihood.
        sigma = pm.Exponential("sigma", 1.0)
        pm.Normal("y", mu, sigma=sigma, observed=log_y_train, dims="obs_id")

    return model


def _fit_and_score(model, test_data, y_test, overall_mean):
    """Fit ``model`` with full-rank ADVI and score it on the held-out fold.

    Parameters
    ----------
    model : pm.Model
        Model built on the training fold.
    test_data : dict
        Values passed to ``pm.set_data`` before predicting.
    y_test : numpy.ndarray
        Held-out electricity values on the original (non-log) scale.
    overall_mean : float
        Denominator used to turn the RMSE into a CV(RMSE).

    Returns
    -------
    tuple
        ``(cvrmse, coverage)`` where ``coverage`` is the percentage of
        held-out values lying inside the HDI returned by ``az.hdi``.
    """
    with model:
        approx = pm.fit(
            n=50000,
            method='fullrank_advi',
            callbacks=[CheckParametersConvergence(tolerance=0.01)])
        trace = approx.sample(1000)

    # Swap in the test data so the posterior predictive is for unseen rows.
    with model:
        pm.set_data(test_data)
        posterior_hdi = pm.sample_posterior_predictive(trace, keep_size=True)
        posterior = pm.sample_posterior_predictive(trace)

    # The model is on the log scale; exponentiate to compare with y_test.
    predictions = np.exp(posterior['y'].mean(0))
    hdi_data = az.hdi(posterior_hdi)
    lower_bound = np.array(
        np.exp(hdi_data.to_array().sel(hdi='lower'))).flatten()
    higher_bound = np.array(
        np.exp(hdi_data.to_array().sel(hdi='higher'))).flatten()

    rmse = sqrt(mean_squared_error(y_test, predictions))
    cvrmse = rmse / overall_mean
    inside = (lower_bound <= y_test) & (y_test <= higher_bound)
    coverage = np.count_nonzero(inside) * 100 / len(y_test)
    return cvrmse, coverage


def bayesian_model_comparison(df):
    """Compare partial, no and complete pooling with 5-fold cross-validation.

    For each fold the three models are fitted on the training rows and scored
    on the held-out rows. The CV(RMSE) and HDI coverage of each model are
    averaged over the folds.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``total_electricity``, ``t``, ``s``,
        ``weekday``, ``daypart``, ``outdoor_temp_lp_c``,
        ``outdoor_temp_lp_h`` and ``daypart_fs_{sin,cos}_{1,2,3}``.

    Returns
    -------
    pandas.DataFrame
        One row with the columns ``<model>_pooling_cvrmse`` and
        ``<model>_pooling_coverage`` for ``partial``, ``no`` and ``complete``.

    Notes
    -----
    ``df`` is modified in place, exactly as before: a ``log_v`` column is
    added, ``t`` is converted to datetime, and ``s`` and ``weekday`` are
    shifted by -1 so they start from 0. Calling this twice on the same frame
    therefore shifts those two columns twice.

    Fold indices produced by ``KFold.split`` are positional, so every column
    is converted to a NumPy array before being indexed with them. Indexing
    the pandas Series directly would look the indices up as labels, which is
    only equivalent when ``df`` has a default ``RangeIndex``.
    """
    # Preprocess
    df["log_v"] = log_electricity = np.log(df["total_electricity"]).values
    total_electricity = df.total_electricity.values
    overall_mean = df.total_electricity.mean()

    # Daypart, cluster and weekday values need to start from 0 because they
    # are used as indices into the coefficient arrays.
    df.t = pd.to_datetime(pd.Series(df.t))
    df.s = df.s - 1
    df.weekday = df.weekday - 1

    unique_clusters = df.s.unique()
    unique_dayparts = df.daypart.unique()
    unique_weekdays = df.weekday.unique()

    # Predictors as plain arrays so fold indices are applied by position.
    group_columns = {
        "profile_cluster_idx": df.s.values,
        "daypart": df.daypart.values,
    }
    weekdays = df.weekday.values
    shared_columns = {
        "fs_sin_1": df.daypart_fs_sin_1.values,
        "fs_sin_2": df.daypart_fs_sin_2.values,
        "fs_sin_3": df.daypart_fs_sin_3.values,
        "fs_cos_1": df.daypart_fs_cos_1.values,
        "fs_cos_2": df.daypart_fs_cos_2.values,
        "fs_cos_3": df.daypart_fs_cos_3.values,
        "cooling_temp_lp": df.outdoor_temp_lp_c.values,
        "heating_temp_lp": df.outdoor_temp_lp_h.values,
    }

    poolings = ("partial", "no", "complete")
    cvrmse_by_model = {pooling: [] for pooling in poolings}
    coverage_by_model = {pooling: [] for pooling in poolings}

    kf = KFold(n_splits=5)
    for train_index, test_index in kf.split(df):
        coords = {
            "obs_id": np.arange(train_index.size),
            "profile_cluster": unique_clusters,
            "daypart": unique_dayparts,
            "weekday": unique_weekdays,
        }
        log_y_train = log_electricity[train_index]
        y_test = total_electricity[test_index]

        for pooling in poolings:
            if pooling == "complete":
                columns = shared_columns
            else:
                columns = {**group_columns, **shared_columns}

            train_data = {
                name: values[train_index] for name, values in columns.items()
            }
            if pooling != "complete":
                # Registered on the model but not replaced at prediction
                # time, matching the original behaviour.
                train_data["weekday"] = weekdays[train_index]
            test_data = {
                name: values[test_index] for name, values in columns.items()
            }

            model = _build_pooling_model(
                pooling, coords, train_data, log_y_train)
            cvrmse, coverage = _fit_and_score(
                model, test_data, y_test, overall_mean)
            cvrmse_by_model[pooling].append(cvrmse)
            coverage_by_model[pooling].append(coverage)

    # Export results: all CV(RMSE) columns first, then all coverage columns.
    export_data = {}
    for pooling in poolings:
        export_data[f"{pooling}_pooling_cvrmse"] = [
            np.mean(cvrmse_by_model[pooling])]
    for pooling in poolings:
        export_data[f"{pooling}_pooling_coverage"] = [
            np.mean(coverage_by_model[pooling])]
    return pd.DataFrame(data=export_data)
def create_and_run_models(args):
    """Run one MCMC model per day, feeding each day's result into the next.

    The CSV at ``args.infile`` is read and truncated so it starts at the
    first row whose ``P_t`` is at least ``args.cutoff``. Starting from the
    initial ``R_t`` guess, every following day fits an ``MCMCModel`` whose
    priors are the mean and standard deviation of the previous day's
    ``R_t_1`` samples and the mean of the previous day's ``dI_t_1`` samples.

    Parameters
    ----------
    args : namespace
        Must provide ``infile``, ``cutoff``, ``rt_init_mu``,
        ``rt_init_sigma``, ``window`` (``-1`` means "use every row"),
        ``R_t_drift``, ``real_nt``, ``verbose``, ``chains``, ``tune``,
        ``draw`` and ``cores``.

    Returns
    -------
    pandas.DataFrame
        Columns ``R_t_mean``, ``R_t_low``, ``R_t_high``, ``I_t_mean``,
        ``I_t_low`` and ``I_t_high``, indexed like the processed rows of the
        input. The first row holds the initial values: ``R_t`` mean
        +/- 1.96 sigma and an unbounded ``I_t`` interval. Later rows hold
        the posterior mean and the 95% HDI.

    Raises
    ------
    IndexError
        If no row of the input has ``P_t >= args.cutoff``.
    """
    verbose = args.verbose
    data = pd.read_csv(args.infile)

    rows_above_cutoff = data[data.P_t >= args.cutoff].index
    if len(rows_above_cutoff) == 0:
        # Same exception type as the bare ``.index[0]`` lookup would raise,
        # but with a message that names the actual problem.
        raise IndexError(
            f"no row of the input has P_t >= cutoff ({args.cutoff})")
    data = data.loc[rows_above_cutoff[0]:]

    # Now, from the start date, we will project forward and
    # compute our Rts and Its.
    R_t_mu, R_t_sigma = args.rt_init_mu, args.rt_init_sigma
    I_t_mu = data.iloc[0].P_t

    # A window longer than the data used to run past the last row and raise
    # IndexError after the earlier models had already been fitted; clamp it.
    if args.window == -1:
        n_days = len(data)
    else:
        n_days = min(args.window, len(data))

    R_t_mus = [R_t_mu]
    R_t_lows = [R_t_mu - R_t_sigma * 1.96]
    R_t_highs = [R_t_mu + R_t_sigma * 1.96]
    I_t_mus, I_t_lows, I_t_highs = [I_t_mu], [-np.inf], [np.inf]

    for i in range(1, n_days):
        day = data.iloc[i]
        model = MCMCModel(args.infile,
                          R_t_drift=args.R_t_drift,
                          num_positive=day.P_t,
                          num_tests=day.T_t,
                          dI_t_mu=I_t_mu,
                          N_t=day.N_t,
                          use_real_nt=args.real_nt,
                          R_t_mu=R_t_mu,
                          R_t_sigma=R_t_sigma,
                          verbose=args.verbose).run(chains=args.chains,
                                                    tune=args.tune,
                                                    draws=args.draw,
                                                    cores=args.cores)
        I_t_1 = model.trace['dI_t_1']
        R_t_1 = model.trace['R_t_1']

        # These summaries become the priors of the next day's model.
        R_t_mu = np.mean(R_t_1)
        R_t_sigma = np.std(R_t_1)
        I_t_mu = np.mean(I_t_1)

        R_t_bounds = az.hdi(R_t_1, 0.95)
        R_t_low, R_t_high = R_t_bounds[0], R_t_bounds[1]
        I_t_bounds = az.hdi(I_t_1, 0.95)
        I_t_low, I_t_high = I_t_bounds[0], I_t_bounds[1]

        if verbose:
            print(i)
            print(f'R_t: {(R_t_mu, R_t_low, R_t_high)}')
            print(f'I_t: {(I_t_mu, I_t_low, I_t_high)}')
        if verbose > 1:
            print(f'R_t_sigma: {(R_t_sigma)}')
            print('Skew, kurtosis: ', skew(R_t_1), kurtosis(R_t_1))

        R_t_mus.append(R_t_mu)
        R_t_highs.append(R_t_high)
        R_t_lows.append(R_t_low)
        I_t_mus.append(I_t_mu)
        I_t_highs.append(I_t_high)
        I_t_lows.append(I_t_low)

    results = pd.DataFrame({
        'R_t_mean': np.array(R_t_mus),
        'R_t_low': np.array(R_t_lows),
        'R_t_high': np.array(R_t_highs),
        'I_t_mean': np.array(I_t_mus),
        'I_t_low': np.array(I_t_lows),
        'I_t_high': np.array(I_t_highs),
    })
    results.index = data.index[:n_days]
    return results
def plot_dependence(
    idata,
    X=None,
    Y=None,
    kind="pdp",
    xs_interval="linear",
    xs_values=None,
    var_idx=None,
    var_discrete=None,
    samples=50,
    instances=10,
    random_seed=None,
    sharey=True,
    rug=True,
    smooth=True,
    indices=None,
    grid="long",
    color="C0",
    color_mean="C0",
    alpha=0.1,
    figsize=None,
    smooth_kwargs=None,
    ax=None,
):
    """
    Partial dependence or individual conditional expectation plot

    Parameters
    ----------
    idata: InferenceData
        InferenceData containing a collection of BART_trees in sample_stats group
    X : array-like
        The covariate matrix.
    Y : array-like
        The response vector. Only used to label the y-axis: a named
        ``pandas.Series`` or a single-column ``pandas.DataFrame`` provides the
        label, anything else falls back to "Predicted Y".
    kind : str
        Whether to plot a partial dependence plot ("pdp") or an individual conditional expectation
        plot ("ice"). Defaults to pdp.
    xs_interval : str
        Method used to compute the values X used to evaluate the predicted function. "linear",
        evenly spaced values in the range of X. "quantiles", the evaluation is done at the specified
        quantiles of X. "insample", the evaluation is done at the values of X.
        For discrete variables these options are omitted.
    xs_values : int or list
        Values of X used to evaluate the predicted function. If ``xs_interval="linear"`` number of
        points in the evenly spaced grid. If ``xs_interval="quantiles"`` quantile or sequence of
        quantiles to compute, which must be between 0 and 1 inclusive.
        Ignored when ``xs_interval="insample"``.
    var_idx : list
        List of the indices of the covariate for which to compute the pdp or ice.
    var_discrete : list
        List of the indices of the covariate treated as discrete.
    samples : int
        Number of posterior samples used in the predictions. Defaults to 50
    instances : int
        Number of instances of X to plot. Only relevant if ``kind="ice"``.
    random_seed : int
        Seed of the random state used to sample from the posterior and, for
        ``kind="ice"``, to choose the instances. Defaults to None.
    sharey : bool
        Controls sharing of properties among y-axes. Defaults to True.
    rug : bool
        Whether to include a rugplot. Defaults to True.
    smooth : bool
        If True the result will be smoothed by first computing a linear interpolation of the data
        over a regular grid and then applying the Savitzky-Golay filter to the interpolated data.
        Defaults to True.
    indices : object
        Currently ignored; kept only so existing calls keep working.
    grid : str or tuple
        How to arrange the subplots. Defaults to "long", one subplot below the other.
        Other options are "wide", one subplot next to each other or a tuple indicating the number of
        rows and columns. Only used when ``ax`` is None.
    color : matplotlib valid color
        Color used to plot the pdp or ice. Defaults to "C0"
    color_mean : matplotlib valid color
        Color used to plot the mean pdp or ice. Defaults to "C0",
    alpha : float
        Transparency level, should be in the interval [0, 1].
    figsize : tuple
        Figure size. If None it will be defined automatically.
    smooth_kwargs : dict
        Additional keywords modifying the Savitzky-Golay filter.
        See scipy.signal.savgol_filter() for details. The dict is copied, not modified.
    ax : axes
        Matplotlib axes.

    Returns
    -------
    axes: matplotlib axes

    Raises
    ------
    ValueError
        If ``kind`` or ``xs_interval`` is not a supported option, or if ``ax``
        is None and ``grid`` is not "long", "wide" or a tuple.
    """
    if kind not in ["pdp", "ice"]:
        raise ValueError(f"kind={kind} is not suported. Available option are 'pdp' or 'ice'")

    if xs_interval not in ["insample", "linear", "quantiles"]:
        raise ValueError(
            f"{xs_interval} is not suported. "
            "Available option are 'insample', 'linear' or 'quantiles'"
        )

    # Reject a bad layout before the (expensive) predictions are computed;
    # previously this surfaced afterwards as an unbound ``axes`` variable.
    if ax is None and not (grid in ("long", "wide") or isinstance(grid, tuple)):
        raise ValueError(
            f"grid={grid!r} is not supported. "
            "Available options are 'long', 'wide' or a tuple (rows, columns)"
        )

    rng = RandomState(seed=random_seed)

    if isinstance(X, pd.DataFrame):
        X_names = list(X.columns)
        X = X.values
    else:
        X_names = []

    # A DataFrame has no ``name`` attribute, so take the label from a named
    # Series or from the only column of a single-column DataFrame.
    if isinstance(Y, pd.Series) and Y.name is not None:
        Y_label = f"Predicted {Y.name}"
    elif isinstance(Y, pd.DataFrame) and Y.shape[1] == 1:
        Y_label = f"Predicted {Y.columns[0]}"
    else:
        Y_label = "Predicted Y"

    num_covariates = X.shape[1]
    all_columns = list(range(num_covariates))

    if var_idx is None:
        var_idx = all_columns
    if var_discrete is None:
        var_discrete = []

    if X_names:
        X_labels = [X_names[idx] for idx in var_idx]
    else:
        X_labels = [f"X_{idx}" for idx in var_idx]

    if xs_interval == "linear" and xs_values is None:
        xs_values = 10

    if xs_interval == "quantiles" and xs_values is None:
        xs_values = [0.05, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.95]

    if kind == "ice":
        # Draw from the seeded state so ``random_seed`` makes the chosen
        # instances reproducible as well.
        instances = rng.choice(range(X.shape[0]), replace=False, size=instances)

    new_Y = []
    new_X_target = []
    y_mins = []

    new_X = np.zeros_like(X)
    idx_s = list(range(X.shape[0]))
    for i in var_idx:
        # Every column except the one being varied.
        indices_mi = all_columns[:]
        indices_mi.pop(i)
        y_pred = []
        if kind == "pdp":
            if i in var_discrete:
                new_X_i = np.unique(X[:, i])
            else:
                if xs_interval == "linear":
                    new_X_i = np.linspace(np.nanmin(X[:, i]), np.nanmax(X[:, i]), xs_values)
                elif xs_interval == "quantiles":
                    new_X_i = np.quantile(X[:, i], q=xs_values)
                elif xs_interval == "insample":
                    new_X_i = X[:, i]

            # The other columns keep their observed values for every grid point.
            new_X[:, indices_mi] = X[:, indices_mi]
            for x_i in new_X_i:
                new_X[:, i] = x_i
                # Average over axis 1 of the predictions: one value per
                # posterior sample for this grid point.
                y_pred.append(np.mean(predict(idata, rng, X_new=new_X, size=samples), 1))
            new_X_target.append(new_X_i)
        else:
            for instance in instances:
                new_X = X[idx_s]
                new_X[:, indices_mi] = X[:, indices_mi][instance]
                # Average over axis 0 of the predictions: one curve per instance.
                y_pred.append(np.mean(predict(idata, rng, X_new=new_X, size=samples), 0))
            new_X_target.append(new_X[:, i])
        y_mins.append(np.min(y_pred))
        new_Y.append(np.array(y_pred).T)

    if ax is None:
        if grid == "long":
            fig, axes = plt.subplots(len(var_idx), sharey=sharey, figsize=figsize)
        elif grid == "wide":
            fig, axes = plt.subplots(1, len(var_idx), sharey=sharey, figsize=figsize)
        elif isinstance(grid, tuple):
            fig, axes = plt.subplots(grid[0], grid[1], sharey=sharey, figsize=figsize)
        axes = np.ravel(axes)
    else:
        axes = [ax]
        fig = ax.get_figure()

    # Work on a copy so the caller's dict is not modified.
    savgol_kwargs = dict(smooth_kwargs or {})
    savgol_kwargs.setdefault("window_length", 55)
    savgol_kwargs.setdefault("polyorder", 2)

    for i, ax in enumerate(axes):
        if i >= len(var_idx):
            # More subplots than variables: drop the spare axes.
            ax.set_axis_off()
            fig.delaxes(ax)
            continue

        var = var_idx[i]
        if var in var_discrete:
            if kind == "pdp":
                y_means = new_Y[i].mean(0)
                hdi = az.hdi(new_Y[i])
                ax.errorbar(
                    new_X_target[i],
                    y_means,
                    (y_means - hdi[:, 0], hdi[:, 1] - y_means),
                    fmt=".",
                    color=color,
                )
            else:
                ax.plot(new_X_target[i], new_Y[i], ".", color=color, alpha=alpha)
                ax.plot(new_X_target[i], new_Y[i].mean(1), "o", color=color_mean)
            ax.set_xticks(new_X_target[i])
        elif smooth:
            x_data = np.linspace(np.nanmin(new_X_target[i]), np.nanmax(new_X_target[i]), 200)
            x_data[0] = (x_data[0] + x_data[1]) / 2
            if kind == "pdp":
                interp = griddata(new_X_target[i], new_Y[i].mean(0), x_data)
            else:
                interp = griddata(new_X_target[i], new_Y[i], x_data)

            y_data = savgol_filter(interp, axis=0, **savgol_kwargs)

            if kind == "pdp":
                az.plot_hdi(
                    new_X_target[i], new_Y[i], color=color, fill_kwargs={"alpha": alpha}, ax=ax
                )
                ax.plot(x_data, y_data, color=color_mean)
            else:
                ax.plot(x_data, y_data.mean(1), color=color_mean)
                ax.plot(x_data, y_data, color=color, alpha=alpha)
        else:
            idx = np.argsort(new_X_target[i])
            if kind == "pdp":
                az.plot_hdi(
                    new_X_target[i],
                    new_Y[i],
                    smooth=smooth,
                    fill_kwargs={"alpha": alpha},
                    ax=ax,
                )
                # For a pdp, axis 0 holds the posterior samples and axis 1
                # the grid points, so the sort order applies to axis 1.
                ax.plot(new_X_target[i][idx], new_Y[i][:, idx].mean(0), color=color)
            else:
                ax.plot(new_X_target[i][idx], new_Y[i][idx], color=color, alpha=alpha)
                ax.plot(new_X_target[i][idx], new_Y[i][idx].mean(1), color=color_mean)

        if rug:
            # Draw the observed values along the lowest predicted value.
            lb = np.min(y_mins)
            ax.plot(X[:, var], np.full_like(X[:, var], lb), "k|")

        ax.set_xlabel(X_labels[i])

    fig.text(-0.05, 0.5, Y_label, va="center", rotation="vertical", fontsize=15)
    return axes