def build_model(historic_prices: pd.DataFrame, base_days: int, vault: Vault, config: Config) -> pd.DataFrame:
    pct_changes = historic_prices.price.pct_change().dropna().values[-base_days * 24:]

    with pm.Model() as model:
        mu = pm.Normal("mu", mu=0, sigma=0.1)
        sd = pm.HalfNormal("sd", sigma=0.1)
        obs = pm.Normal("obs", mu=mu, sigma=sd, observed=pct_changes)
        trace = pm.sample(5000, cores=config.SAMPLING_CORES, tune=5000)

    # Draw posterior samples of the hourly return distribution and accumulate them into price paths.
    mus = np.random.choice(trace.get_values('mu'), size=config.N_POSTERIOR_SAMPLES, replace=True)
    sds = np.random.choice(trace.get_values('sd'), size=config.N_POSTERIOR_SAMPLES, replace=True)
    posterior_samples = np.random.normal(mus, sds, size=(config.MODEL_HORIZON_DAYS * 24, config.N_POSTERIOR_SAMPLES))
    posterior_samples = np.transpose(posterior_samples)
    posterior_growths = np.cumsum(posterior_samples, axis=1)

    latest_known_price = historic_prices.loc[historic_prices.index.max()].price
    price_projections = to_float((1 + posterior_growths) * latest_known_price, vault.decimals)

    hpd_95 = pm.hpd(price_projections, hdi_prob=0.95)
    hpd_50 = pm.hpd(price_projections, hdi_prob=0.5)
    # Renamed from `model` to avoid shadowing the pm.Model above.
    projections = pd.DataFrame.from_dict({
        'hpd_95_lower': hpd_95[:, 0],
        'hpd_95_upper': hpd_95[:, 1],
        'hpd_50_lower': hpd_50[:, 0],
        'hpd_50_upper': hpd_50[:, 1],
    })
    index = pd.DatetimeIndex(pd.date_range(historic_prices.index.max() + pd.DateOffset(hours=1),
                                           periods=config.MODEL_HORIZON_DAYS * 24, freq='H'))
    projections = projections.set_index(index).resample('1D').nearest()
    projections['vault'] = vault.name
    projections['base_days'] = base_days
    return projections
def generate_ebbinghaus_data_figure_3():
    fig, (ax1, ax2) = plt.subplots(ncols=1, nrows=2)
    fig.set_size_inches(5.5, 4.5)
    # plot 1
    ax1.plot(delay, savings, marker='o', linestyle='--')
    ax1.plot(delay, np.median(np.exp(mu), axis=0), color='red', linestyle='-')
    ax1.set_title('c. Log-log plot (blue) and power law model estimates (red)')
    ax1.set_xlabel('Delay (log hours)')
    ax1.set_xscale('log', basex=10)
    ax1.set_ylabel('Savings (log \\%)')
    ax1.set_yscale('log', basey=10)
    ax1.grid(b=True, which='minor', color='w', linewidth=1.0)
    # plot 2
    yerr = [np.median(np.exp(mu), axis=0) - pm.hpd(np.exp(mu))[:, 0],
            pm.hpd(np.exp(mu))[:, 1] - np.median(np.exp(mu), axis=0)]
    ax2.errorbar(savings, np.median(np.exp(mu), axis=0), yerr=yerr,
                 marker='o', linestyle='')
    ax2.plot(np.linspace(0, 100, 10), np.linspace(0, 100, 10),
             color='red', linestyle=':')
    ax2.set_title('Power law model: Observed vs. predicted savings')
    ax2.set_xlabel('Observed savings (\\%)')
    ax2.set_ylabel('Predicted savings (\\%)')
    ax2.grid(b=True, which='minor', color='w', linewidth=1.0)
    # clean up and save
    plt.tight_layout(pad=0.5, w_pad=0.2, h_pad=0.7)
    plt.savefig('./figures/ebbinghaus_data_3.eps')
    plt.savefig('./figures/ebbinghaus_data_3.png')
    plt.savefig('./figures/ebbinghaus_data_3.pdf')
def generate_log_freq_figure():
    fig, (ax1, ax2) = plt.subplots(ncols=1, nrows=2)
    fig.set_size_inches(5.5, 5.5)
    # plot 1
    ax1.plot(freq, rt, marker='o', linestyle='')
    ax1.plot(freq, mu.mean(axis=0), color='red', linestyle='-')
    ax1.set_title('Observed (blue) \& predicted (red) RTs against log frequency')
    ax1.set_xlabel('Log frequency (log of \# tokens/1 million words)')
    ax1.set_xscale('log', basex=10)
    ax1.set_ylabel('RTs (s)')
    ax1.grid(b=True, which='minor', color='w', linewidth=1.0)
    # plot 2
    yerr = [mu.mean(axis=0) - pm.hpd(mu)[:, 0],
            pm.hpd(mu)[:, 1] - mu.mean(axis=0)]
    ax2.errorbar(rt, mu.mean(axis=0), yerr=yerr, marker='o', linestyle='')
    ax2.plot(np.linspace(0.5, 0.7, 10), np.linspace(0.5, 0.7, 10),
             color='red', linestyle=':')
    ax2.set_title('Log frequency model: Observed vs. predicted RTs')
    ax2.set_xlabel('Observed RTs (s)')
    ax2.set_ylabel('Predicted RTs (s)')
    ax2.grid(b=True, which='minor', color='w', linewidth=1.0)
    # clean up and save
    plt.tight_layout(pad=0.5, w_pad=0.2, h_pad=1.9)
    plt.savefig('./figures/log_freq_model_figure.eps')
    plt.savefig('./figures/log_freq_model_figure.png')
    plt.savefig('./figures/log_freq_model_figure.pdf')
def plot_fits_w_estimates(y_obs, ppc, ax=None, legend=False):
    """Plot fits with uncertainty estimates."""
    iy = np.argsort(y_obs)
    ix = np.arange(iy.size)
    lik_mean = ppc.mean(axis=0)
    lik_hpd = pm.hpd(ppc)
    lik_hpd_05 = pm.hpd(ppc, alpha=0.5)
    r2 = r2_score(y_obs, lik_mean)
    mae = mean_absolute_error(y_obs, lik_mean)

    if ax is None:
        _, ax = pl.subplots(figsize=(12, 8))

    ax.scatter(ix, y_obs.values[iy], label='observed',
               edgecolor='k', s=40, color='w', marker='d', zorder=2)
    ax.scatter(ix, lik_mean[iy],
               label='model mean -- $r^2$=%.2f -- mae=%.2f' % (r2, mae),
               edgecolor='k', s=40, color='w', zorder=3)
    ax.fill_between(ix, y1=lik_hpd_05[iy, 0], y2=lik_hpd_05[iy, 1],
                    color='gray', label='model output 50%CI', zorder=1,
                    linestyle='-', lw=2, edgecolor='k')
    ax.fill_between(ix, y1=lik_hpd[iy, 0], y2=lik_hpd[iy, 1],
                    color='k', alpha=0.75, label='model output 95%CI', zorder=0)
    if legend:
        ax.legend(loc='upper left')
    return ax
def mcmc_stats(runs, burnin, prob, batch):
    """
    Inputs
        runs:   Monte Carlo samples
        burnin: number of burn-in iterations
        prob:   interval probability (0 < prob < 1)
        batch:  number of batches to split the chains into
    Output
        DataFrame of posterior summary statistics
    """
    traces = runs[burnin:, :]
    n = traces.shape[0] // batch
    k = traces.shape[1]
    alpha = 100 * (1.0 - prob)
    post_mean = np.mean(traces, axis=0)
    post_median = np.median(traces, axis=0)
    post_sd = np.std(traces, axis=0)
    mc_err = pm.mc_error(traces, batches=batch)
    ci_lower = np.percentile(traces, 0.5 * alpha, axis=0)
    ci_upper = np.percentile(traces, 100 - 0.5 * alpha, axis=0)
    hpdi = pm.hpd(traces, 1.0 - prob)
    rhat = [pm.gelman_rubin(traces[:, i].reshape((n, batch), order='F'))
            for i in range(k)]
    stats = np.vstack((post_mean, post_median, post_sd, mc_err,
                       ci_lower, ci_upper, hpdi.T, rhat)).T
    stats_string = ['Mean', 'Median', 'SD', 'MC error',
                    'CI (lower)', 'CI (upper)',
                    'HPDI (lower)', 'HPDI (upper)', '$\\hat R$']
    param_string = ['Mean $\\mu$', 'Variance $\\sigma^2$']
    return pd.DataFrame(stats, index=param_string, columns=stats_string)
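# --- Usage sketch (illustrative, not from the original source) ---
# mcmc_stats expects `runs` as a 2-D array of shape (n_iterations, 2) holding
# draws of the mean and the variance. The draws below are simulated with NumPy
# purely to show the call signature; this assumes an older PyMC3 that still
# exposes pm.mc_error, pm.gelman_rubin and pm.hpd for plain arrays.
import numpy as np

rng = np.random.RandomState(0)
fake_runs = np.column_stack([rng.normal(0.0, 1.0, size=6000),   # draws of the mean
                             rng.gamma(2.0, 1.0, size=6000)])   # draws of the variance
print(mcmc_stats(fake_runs, burnin=1000, prob=0.95, batch=5))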
def plot_model_ppc_stats(self, ppc, y_obs, alpha_level1=0.05, alpha_level2=0.5, ax=None):
    if ax is None:
        _, ax = pl.subplots()
    iy = np.argsort(y_obs)
    ix = np.arange(iy.size)
    ppc_mean = ppc.mean(axis=0)
    ax.scatter(ix, y_obs.values[iy], label='observed',
               edgecolor='k', s=50, color='steelblue')
    ax.scatter(ix, ppc_mean[iy], label='prediction mean',
               edgecolor='k', s=50, color='red')
    if alpha_level2:
        lik_hpd_2 = pm.hpd(ppc, alpha=alpha_level2)
        ax.fill_between(ix, y1=lik_hpd_2[iy, 0], y2=lik_hpd_2[iy, 1],
                        alpha=0.5, color='k',
                        label=f'prediction {(1 - alpha_level2) * 100:.0f}%CI')
    if alpha_level1:
        lik_hpd_1 = pm.hpd(ppc, alpha=alpha_level1)
        ax.fill_between(ix, y1=lik_hpd_1[iy, 0], y2=lik_hpd_1[iy, 1],
                        alpha=0.5, color='k',
                        label=f'prediction {(1 - alpha_level1) * 100:.0f}%CI')
    ax.legend(loc='best')
    return ax
def compare_parameters_hierarchical(model,
                                    parameters=['v', 'gamma', 's', 'tau'],
                                    comparisons=None):
    """Compute comparisons of group level parameters between groups / conditions.

    Args:
        model (glambox.GLAM): Fitted GLAM instance
        parameters (list, optional): List of parameters.
            Defaults to ['v', 'gamma', 's', 'tau'].
        comparisons (list of tuples, optional): List of comparisons to perform.
            Must be a list of tuples, e.g., `[('A', 'B'), ('A', 'C')]`.
            Defaults to None.

    Returns:
        pandas.DataFrame: Distribution statistics of group level parameter differences.
    """
    if comparisons is None:
        comparisons = []
    n_params = len(parameters)
    n_comps = len(comparisons)

    comparison_df = []
    for p, parameter in enumerate(parameters):
        # Comparisons
        for c, comparison in enumerate(comparisons):
            comparison_string = '{}-{}'.format(*comparison)
            df_pc = pd.DataFrame(dict(parameter=parameter,
                                      comparison=comparison_string),
                                 index=[0])
            # Check if parameter has dependence
            if model.design[parameter]['dependence'] is not None:
                # Then, if both conditions are present, compute posterior of the difference
                c0_present = (comparison[0] in model.design[parameter]['conditions'])
                c1_present = (comparison[1] in model.design[parameter]['conditions'])
                if c0_present & c1_present:
                    difference = (
                        model.trace[0].get_values(parameter + '_' + comparison[0] + '_mu') -
                        model.trace[0].get_values(parameter + '_' + comparison[1] + '_mu'))
                    hpd_lower, hpd_upper = hpd(difference, alpha=0.05)
                    df_pc['hpd_2.5'] = hpd_lower
                    df_pc['hpd_97.5'] = hpd_upper
                    df_pc['mean'] = np.mean(difference)
                    df_pc['p>0'] = np.mean(difference > 0)
                else:
                    # Otherwise, state that at least one condition is not present.
                    df_pc['warning'] = 'At least one condition is missing.'
            else:
                # Or that the parameter has no dependencies.
                df_pc['warning'] = 'Parameter has no dependencies.'
            comparison_df.append(df_pc)

    comparison_df = pd.concat(comparison_df, sort=False).reset_index(drop=True)
    return comparison_df
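# --- Usage sketch (illustrative, not from the original source) ---
# Assumes `fitted_glam` is a hierarchically fitted glambox.GLAM instance whose
# design codes hypothetical 'speed' and 'accuracy' conditions.
group_differences = compare_parameters_hierarchical(
    fitted_glam,
    parameters=['v', 'gamma'],
    comparisons=[('speed', 'accuracy')])
print(group_differences)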
def getAngelRate(data, n_sample=10000, n_chain=3, ax=None):
    # Organize the data
    data_0 = data.query('campaign != 1')
    data_1 = data.query('campaign == 1')
    d = np.array([[sum(data_0['angel'] == 0),
                   sum(data_0['angel'] == 1),
                   sum(data_0['angel'] == 2)],
                  [sum(data_1['angel'] == 0),
                   sum(data_1['angel'] == 1),
                   sum(data_1['angel'] == 2)]])
    weight = np.array([[1.0, 1.0, 1.0], [1.0, 0.0, 2.0]])

    # Parameter estimation
    with pm.Model() as model:
        alpha = [1., 1., 1.]  # hyper-parameter of DirichletDist.
        pi = pm.Dirichlet('pi', a=np.array(alpha))
        for i in np.arange(d.shape[0]):
            piw = pi * weight[i]
            m = pm.Multinomial('m_%s' % (i), n=np.sum(d[i]), p=piw, observed=d[i])
        trace = pm.sample(n_sample, chains=n_chain)

    np.savetxt('trace_pi.csv', trace['pi'], delimiter=',')

    # Silver
    hpd_l, hpd_u = pm.hpd(trace['pi'][:, 1])
    print('Silver : 95% HPD : {}-{}'.format(hpd_l, hpd_u))
    print('Silver ExpectedValue : {}'.format(trace['pi'][:, 1].mean()))
    # Gold
    hpd_l, hpd_u = pm.hpd(trace['pi'][:, 2])
    print('Gold : 95% HPD : {}-{}'.format(hpd_l, hpd_u))
    print('Gold ExpectedValue : {}'.format(trace['pi'][:, 2].mean()))

    # save fig
    if ax is not None:
        pm.plot_posterior(trace['pi'][:, 0], ax=ax[0])
        pm.plot_posterior(trace['pi'][:, 1], ax=ax[1])
        pm.plot_posterior(trace['pi'][:, 2], ax=ax[2])
        ax[0].set_title('Nothing')
        ax[1].set_title('SilverAngel')
        ax[2].set_title('GoldAngel')
    return trace
def compare_parameters_individual(model, parameters, comparisons=None):
    if comparisons is None:
        comparisons = []
    n_params = len(parameters)
    n_comps = len(comparisons)
    subjects = model.data['subject'].unique().astype(int)
    summaries = [summary(trace) for trace in model.trace]

    comparison_df = []
    for p, parameter in enumerate(parameters):
        # Comparisons
        for c, comparison in enumerate(comparisons):
            comparison_string = '{}-{}'.format(*comparison)
            df_pc = pd.DataFrame(dict(subject=subjects,
                                      parameter=parameter,
                                      comparison=comparison_string),
                                 index=subjects)
            # Check if parameter has dependence
            if model.design[parameter]['dependence'] is not None:
                # Then, if both conditions are present, plot posterior of the difference
                c0_present = (comparison[0] in model.design[parameter]['conditions'])
                c1_present = (comparison[1] in model.design[parameter]['conditions'])
                if c0_present & c1_present:
                    differences = np.array(
                        [(model.trace[i].get_values(parameter + '_' + comparison[0]) -
                          model.trace[i].get_values(parameter + '_' + comparison[1]))
                         for i in subjects])[:, :, 0, 0]
                    means = np.mean(differences, axis=1)
                    hpdlower, hpdupper = hpd(differences.T, alpha=0.05).T
                    plarger0 = np.mean(differences > 0, axis=1)
                    df_pc['mean'] = means
                    df_pc['hpd_2.5'] = hpdlower
                    df_pc['hpd_97.5'] = hpdupper
                    df_pc['p>0'] = plarger0
                else:
                    # Otherwise, state that at least one condition is not present.
                    df_pc['warning'] = 'At least one condition is missing.'
            else:
                # Or that the parameter has no dependencies.
                df_pc['warning'] = 'Parameter has no dependencies.'
            comparison_df.append(df_pc)

    comparison_df = pd.concat(comparison_df, sort=False).sort_values('subject').reset_index(drop=True)
    return comparison_df
def visualize_posteriors(samples, param_names):
    '''
    Visualize the approximate posteriors for all parameters based on the
    generated samples. Returns posterior means and HDIs for all parameters.
    '''
    fig, axes = plt.subplots(1, samples.shape[-1], figsize=(15, 6))
    hdi = []
    means = []
    for c, vals in enumerate(samples.reshape(-1, samples.shape[-1]).T):
        # compute mean and HDI of the parameter
        mean = np.round(np.mean(vals), 3)
        hdi_low, hdi_high = np.round(pymc3.hpd(vals), 3)
        means.append(mean)
        hdi.append([hdi_low, hdi_high])
        axes[c].axvline(x=hdi_low, c='r')
        axes[c].axvline(x=hdi_high, c='r')
        # histogram with approximate density plot for the parameter
        sns.distplot(vals, bins=50, ax=axes[c])
        axes[c].set_title("{0} mean: {1} \n HDI_low: {2} - HDI_high: {3}".format(
            param_names[c], np.round(mean, 3), np.round(hdi_low, 3), np.round(hdi_high, 3)))
    fig.suptitle('Posterior Distributions', fontsize=20)
    fig.tight_layout(rect=[0, 0.03, 1, 0.90])
    return np.asarray(means), np.asarray(hdi)
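# --- Usage sketch (illustrative, not from the original source) ---
# `samples` is assumed to be an array of MCMC draws whose last axis indexes the
# parameters, e.g. shape (n_chains, n_draws, n_params); the draws are simulated here.
import numpy as np

rng = np.random.RandomState(42)
fake_samples = np.stack([rng.normal(0.0, 1.0, size=(2, 2000)),
                         rng.normal(5.0, 0.5, size=(2, 2000))], axis=-1)
post_means, post_hdis = visualize_posteriors(fake_samples, param_names=['alpha', 'beta'])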
def plot_counterfactual(data, trace, variables, parameters, intercept='a',
                        hpdi=0.10, xlab='', ylab=''):
    """
    x = predictor of interest
    intercept = The string used to denote the intercept (i.e., alpha, or beta_0).
        Defaults to 'a'.
    variables and parameters are lists that must be in the same order
    (corresponding data column + coefficient). The first value in each list
    should be the predictor of interest. Do not include the intercept term.
    """
    # Calculate value of x term (predictor of interest)
    x = data[variables[0]]
    x_coef = parameters[0]
    x_space = np.linspace(x.min(), x.max(), 50)
    x_value = trace[x_coef] * x_space[:, None]

    # Calculate value of other variables, holding them to the mean value.
    controls = np.array(data[variables[1]].mean())
    for item in variables[2:]:
        controls = np.hstack((controls, [data[item].mean()]))
    control_coefficients = []
    for item in parameters[1:]:
        control_coefficients.append(trace[item])
    control_values = np.dot(controls, control_coefficients)

    # Calculate the predicted mean.
    mu_predicted = trace[intercept] + x_value + control_values
    mu_hpd = pm.hpd(mu_predicted.T, alpha=hpdi)
    plt.plot(x_space, mu_predicted.mean(1), 'k')
    plt.plot(x_space, mu_hpd[:, 0], 'k--')
    plt.plot(x_space, mu_hpd[:, 1], 'k--')
    plt.xlabel(xlab)
    plt.ylabel(ylab)
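# --- Usage sketch (illustrative, not from the original source) ---
# Assumes a trace from a linear regression with intercept 'a', a coefficient
# 'b_age' for the predictor of interest ('age'), and a control coefficient
# 'b_income' for 'income'; all of these names are hypothetical.
plot_counterfactual(data=df, trace=trace,
                    variables=['age', 'income'],
                    parameters=['b_age', 'b_income'],
                    intercept='a', hpdi=0.10,
                    xlab='Age (standardized)', ylab='Predicted outcome')
plt.show()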
def plot_model_fits2(self, y_obs, y_pred=None, title=None, ax=None, ci=0.95):
    if y_pred is None:
        y_pred = self.trace_.get_values('mu')
    y_obs = y_obs.values
    mask = np.logical_not(np.isnan(y_obs))
    y_obs = y_obs[mask]
    y_pred_mean = np.mean(y_pred, axis=0)[mask]
    y_pred_hpd = pm.hpd(y_pred, alpha=1 - ci)[mask]
    xi = np.arange(y_obs.size)
    iy = np.argsort(y_obs)

    if ax is None:
        _, ax = pl.subplots(figsize=(12, 8))
    ax.set_title(title)
    ax.plot(xi, y_obs[iy], marker='.', ls='', markeredgecolor='darkblue',
            markersize=13, label='observed')
    ax.plot(xi, y_pred_mean[iy], marker='o', color='indigo', ls='',
            markeredgecolor='k', alpha=0.5, label='predicted avg.')
    ax.fill_between(xi, y_pred_hpd[iy, 0], y_pred_hpd[iy, 1], color='k',
                    alpha=0.5, label=f'{ci*100}%CI on pred.')
    ax.legend(loc='best')
    return ax
def _summarize_one_variable(ppc_samples, variable):
    """
    Provide mean and HPD summaries of a given variable.

    :param ppc_samples: PyMC3 PPC samples
    :param variable: key of dict ppc_samples
    :return: DataFrame with columns (variable_samples, variable_hpd_lower,
        variable_hpd_upper, variable_mean); n rows = n columns in
        ppc_samples[variable], which should correspond to the number of
        input data points
    """
    # row = sample, column = original data point
    sample_array = ppc_samples[variable]
    hpds = pm.hpd(sample_array, alpha=.3)
    d = dict()
    # collect all samples into one field per input row
    # more elegant way?
    d[f'{variable}_samples'] = list(sample_array.T)
    d[f'{variable}_hpd_lower'] = hpds[:, 0]
    d[f'{variable}_hpd_upper'] = hpds[:, 1]
    d[f'{variable}_mean'] = np.mean(sample_array, axis=0)
    return pd.DataFrame(d)
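# --- Usage sketch (illustrative, not from the original source) ---
# Assumes posterior predictive samples were drawn for an observed variable
# named 'y'; the variable name and the surrounding trace/model are hypothetical.
ppc_samples = pm.sample_posterior_predictive(trace, samples=1000, model=model)
y_summary = _summarize_one_variable(ppc_samples, 'y')
print(y_summary[['y_mean', 'y_hpd_lower', 'y_hpd_upper']].head())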
def hdi(self, var_name: str, credible_mass: float = 0.95):
    """Calculate the highest posterior density interval (HDI)

    This function calculates a *credible interval* which contains the
    ``credible_mass`` most likely values of the parameter, given the data.
    Also known as an HPD interval.

    Parameters
    ----------
    var_name : str
        Name of variable.
    credible_mass : float
        The HDI will cover credible_mass * 100% of the probability mass.
        Default: 0.95, i.e. a 95% HDI.

    Returns
    -------
    (float, float)
        The endpoints of the HPD
    """
    check_credible_mass(credible_mass)
    return tuple(pm.hpd(self.trace[var_name], alpha=(1 - credible_mass)))
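# --- Usage sketch (illustrative, not from the original source) ---
# Assumes `result` is an instance of the class defining hdi() above and that
# its trace contains a variable named 'difference_means'; both names are hypothetical.
lower, upper = result.hdi('difference_means', credible_mass=0.95)
print('95% HDI for difference_means: [{:.3f}, {:.3f}]'.format(lower, upper))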
def plot_regression_line(x, y, mu, hdpi=0.1, xlab='', ylab=''):
    '''
    x: The predictor variable
    y: The response variable
    mu: The mu value from the PyMC3 model trace (e.g., trace['mu'])
    hdpi: The alpha value for the HPDI. 0.1 corresponds to the 90% interval

    Plots a scatter plot of the data and then the regression line with the
    HPDI interval shaded.
    '''
    mu_hpd = pm.hpd(mu, alpha=hdpi)
    plt.scatter(x, y, alpha=0.5)
    plt.plot(x, mu.mean(0), 'C2')  # MAP line (column-wise mean of mu)
    # HPDI fill-in
    index = np.argsort(x)
    plt.fill_between(x[index], mu_hpd[:, 0][index], mu_hpd[:, 1][index],
                     color='C2', alpha=0.25)
    plt.xlabel(str(xlab))
    plt.ylabel(str(ylab))
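# --- Usage sketch (illustrative, not from the original source) ---
# Fits a toy linear regression with a Deterministic 'mu' of shape
# (n_samples, len(x)) and hands trace['mu'] to plot_regression_line; assumes a
# PyMC3 version where pm.hpd accepts the alpha= keyword.
import numpy as np
import pymc3 as pm
import matplotlib.pyplot as plt

rng = np.random.RandomState(0)
x = np.linspace(0, 10, 50)
y = 2.0 + 0.5 * x + rng.normal(0, 1, size=x.size)

with pm.Model():
    alpha = pm.Normal('alpha', mu=0, sd=10)
    beta = pm.Normal('beta', mu=0, sd=10)
    sigma = pm.HalfNormal('sigma', sd=5)
    mu = pm.Deterministic('mu', alpha + beta * x)
    pm.Normal('obs', mu=mu, sd=sigma, observed=y)
    trace = pm.sample(1000, tune=1000)

plot_regression_line(x, y, trace['mu'], hdpi=0.1, xlab='x', ylab='y')
plt.show()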
def stats(self) -> pd.DataFrame:
    """Stats task result."""
    import pymc3 as pm

    idxes = [col for col in self.trace.columns if not col.startswith('RAW')]
    means = []
    sds = []
    hpd_2_5s = []
    hpd_97_5s = []
    cols = ['mean', 'sd', 'hpd_2.5', 'hpd_97.5']
    data = [means, sds, hpd_2_5s, hpd_97_5s]
    for var_name in idxes:
        d_i = self.trace[var_name]
        means.append(np.mean(d_i))
        sds.append(np.std(d_i))
        hpd_2_5, hpd_97_5 = pm.hpd(d_i)
        hpd_2_5s.append(hpd_2_5)
        hpd_97_5s.append(hpd_97_5)
    df = pd.DataFrame({col: data[i] for i, col in enumerate(cols)},
                      index=idxes, columns=cols)
    return df
# out_pai = pm.Deterministic('out_pai', mu + beta * dag_time + gamma1 * z1 + gamma2 * z2 + gamma3 * z3 + gamma4 * z4)
Observed = pm.Binomial("Observed", dag_sum, out_pai, observed=dag_fault)  # observed values
# start = pm.find_MAP()
step = pm.Metropolis()
trace = pm.sample(10000, step=step)
chain = trace
# logistic(chain, locals())
varnames = ['gamma1', 'mu', 'beta', 'out_pai']
varnames1 = ['out_pai']
pm.traceplot(chain, varnames)
plt.show()

sig0 = pm.hpd(trace['out_pai'], alpha=0.6)[0]
sig = pm.hpd(trace['out_pai'], alpha=0.6)[1]
sig1 = pm.hpd(trace['out_pai'], alpha=0.6)[2]
sig2 = pm.hpd(trace['out_pai'], alpha=0.6)[3]
sig3 = pm.hpd(trace['out_pai'], alpha=0.6)[4]
sig4 = pm.hpd(trace['out_pai'], alpha=0.6)[5]

plt.figure()
ax = sns.distplot(sig0)
ax = sns.distplot(sig1)
ax = sns.distplot(sig2)
ax = sns.distplot(sig3)
ax = sns.distplot(sig4)
ax = sns.distplot(sig)
plt.show()
# Make a summary dataframe
# Because this is pretty gaussian, median is pretty much the same
# as the mean, although we might as well compute it since we have the distribution
# one standard deviation covers 68% of the distribution, might be better to
# keep 95% HPD

# get strain names first
strains = [*df_mcmc.columns]

# dataframe to store summary statistics
df_summary = pd.DataFrame(index=['median', '_hpd', 'hpd_', 'mean', 'std'],
                          columns=strains)

for strain in strains:
    # median
    df_summary.loc['median', strain] = np.median(df_mcmc[strain])
    # 95% highest posterior density
    df_summary.loc[['_hpd', 'hpd_'], strain] = pm.hpd(df_mcmc[strain], alpha=0.05)
    # mean
    df_summary.loc['mean', strain] = np.mean(df_mcmc[strain])
    # standard deviation
    df_summary.loc['std', strain] = np.std(df_mcmc[strain])

# plot sample histograms
for strain in strains:
    plt.hist(df_mcmc[strain], bins=100, normed=True, histtype='step', linewidth=2)

plt.xlabel('prob. of dauer, $p$')
plt.ylabel(r'$P(p\mid d, n)$')
plt.legend(strains, loc='upper center')
sns.despine()
plt.tight_layout()
# plt.savefig('./output/probdistrib_dauer.pdf', transparent=True, bbox_inches='tight')
plt.close('all')
trace_0 = pm.sample(5000)
chain_0 = trace_0[1000:]
varnames = ['alpha', 'beta', 'bd']
pm.traceplot(chain_0, varnames)
plt.savefig('img505.png', dpi=300, figsize=(5.5, 5.5))

plt.figure()
pm.summary(chain_0, varnames)

theta = chain_0['theta'].mean(axis=0)
idx = np.argsort(x_0)
plt.plot(x_0[idx], theta[idx], color='b', lw=3)
plt.axvline(chain_0['bd'].mean(), ymax=1, color='r')
bd_hpd = pm.hpd(chain_0['bd'])
plt.fill_betweenx([0, 1], bd_hpd[0], bd_hpd[1], color='r', alpha=0.5)
plt.plot(x_0, y_0, 'o', color='k')
theta_hpd = pm.hpd(chain_0['theta'])[idx]
plt.fill_between(x_0[idx], theta_hpd[:, 0], theta_hpd[:, 1], color='b', alpha=0.5)
plt.xlabel(x_n, fontsize=16)
plt.ylabel(r'$\theta$', rotation=0, fontsize=16)
plt.savefig('img506.png', dpi=300, figsize=(5.5, 5.5))
plt.figure()
plt.savefig('img408.png')
plt.clf()

plt.plot(x, alpha_m + beta_m * x, c='k',
         label='y = {:.2f} + {:.2f} * x'.format(alpha_m, beta_m))
idx = np.argsort(x)
x_ord = x[idx]
sig = pm.hpd(trace_n['mu'], alpha=0.02)[idx]
plt.fill_between(x_ord, sig[:, 0], sig[:, 1], color='gray')
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$y$', fontsize=16, rotation=0)
plt.savefig('img409.png')

ppc = pm.sample_ppc(trace_n, samples=100000, model=model)
idx = np.argsort(x)
x_ord = x[idx]
# plt.plot(x, y, 'b.')
plt.plot(x, alpha_m + beta_m * x, c='k',
         label='y = {:.2f} + {:.2f} * x'.format(alpha_m, beta_m))
sig0 = pm.hpd(ppc['y_pred'], alpha=0.5)[idx]
sig1 = pm.hpd(ppc['y_pred'], alpha=0.05)[idx]
plt.fill_between(x_ord, sig0[:, 0], sig0[:, 1], color='gray', alpha=1)
plt.fill_between(x_ord, sig1[:, 0], sig1[:, 1], color='gray', alpha=0.5)
plt.xlabel('$x$', fontsize=16)
plt.ylabel('$y$', fontsize=16, rotation=0)
plt.savefig('img410.png')
def hpd_range(x):
    hpd = pm.hpd(x)
    return hpd[1] - hpd[0]
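# --- Usage sketch (illustrative, not from the original source) ---
# hpd_range returns the width of pm.hpd's default interval (95% in older
# PyMC3, 94% in arviz-backed versions) for a 1-D array of posterior draws;
# applied here to simulated draws from a standard normal.
import numpy as np

draws = np.random.RandomState(1).normal(loc=0.0, scale=1.0, size=5000)
print(hpd_range(draws))  # roughly 2 * 1.96 for a standard normal at 95%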
num_already_found = np.mean(
    [num_true_hits - i.shape[0] for i in normalized_ranks_holder[idx]])
num_remaining = n_hits_pulled - num_already_found
num_needed = desired_num_hits - num_already_found
fraction_required = num_needed / num_remaining

# expected performance on undocked ligands:
trace = estimate_holder[idx][0]
mu = trace['mu']
nu = trace['nu']
sig = trace['sig']
samples = t(nu, mu, sig).ppf(fraction_required)

# this is the fraction of remaining ligands we need to dock to reach the goal.
hpd = expit(pm.hpd(samples))
time_hpd = hpd * (n_ligands_to_pull - size) + size
time_days = time_hpd / 60 / 60 / 24
time_mean = expit(samples.mean()) * (n_ligands_to_pull - size) + size
time_mean = time_mean / 60 / 60 / 24
print(time_mean)
df.loc[count] = [time_mean, time_hpd[0], time_hpd[1], size, desired_num_hits]
count += 1

import altair as alt
# now plot :)
yl = pm.Bernoulli('yl', p=theta, observed=y_0)
trace_0 = pm.sample(5000)

chain_0 = trace_0[1000:]
varnames = ['alpha', 'beta', 'bd']
pm.traceplot(chain_0, varnames)
plt.savefig('img505b.png')

pm.summary(trace_0, varnames)
# print(chain_0['theta'])

plt.figure()
theta = chain_0['theta'].mean(axis=0)
idx = np.argsort(x_0)
plt.plot(x_0[idx], theta[idx], color='b', lw=3)
plt.axvline(chain_0['bd'].mean(), ymax=1, color='r')
bd_hpd = pm.hpd(chain_0['bd'])
plt.fill_betweenx([0, 1], bd_hpd[0], bd_hpd[1], color='r', alpha=0.5)
plt.plot(x_0, y_0, 'o', color='k')
theta_hpd = pm.hpd(chain_0['theta'])[idx]
plt.fill_between(x_0[idx], theta_hpd[:, 0], theta_hpd[:, 1], color='b', alpha=0.5)
plt.xlabel(x_n, fontsize=16)
plt.ylabel(r'$\theta$', rotation=0, fontsize=16)
plt.savefig('img506b.png')
alpha = pm.Normal(name='alpha', mu=mean_data, sd=std_data)
beta = pm.Normal(name='beta', mu=0, sd=10, shape=4)
sigma = pm.Uniform(name='sigma', lower=0, upper=std_data)
mu = pm.Deterministic('mu', alpha + beta[0] * data_s + beta[1] * data_s2
                      + beta[2] * data_s3 + beta[3] * data_s4)
ret = pm.Normal(name='returns', mu=mu, sd=sigma, observed=ror)
trace_model = pm.sample(1000, tune=2000)

print(pm.summary(trace_model, ['alpha', 'beta', 'sigma']))
pm.traceplot(trace_model, varnames=['alpha', 'beta', 'sigma'])
plt.title('model parameters')
plt.show()

mu_pred = trace_model['mu']
idx = np.argsort(data_s)
mu_hpd = pm.hpd(mu_pred, alpha=0.11)[idx]
ret_pred = pm.sample_ppc(trace_model, 10000, model)
ret_pred_hpd = pm.hpd(ret_pred['returns'], alpha=0.11)[idx]

for r in ror:
    plt.plot(r)
plt.plot(ret_pred_hpd)
plt.show()

for r in ror:
    # plt.scatter(data_s, r, c='C0', alpha=0.3)
    plt.plot(data_s[idx], r, c='C0', alpha=0.3)
plt.fill_between(data_s[idx], mu_hpd[:, 0], mu_hpd[:, 1], color='C2', alpha=0.25)
plt.fill_between(data_s[idx], ret_pred_hpd[:, 0], ret_pred_hpd[:, 1], color='C2', alpha=0.25)
plt.show()
f.close()

# Save the trace to the output folder as a numpy array, for later reference
# Save every 10th sample from the trace, to avoid any autocorrelation issues
np.save("palatability_regression_trace.npy", tr[::10]["coeff_pal"])

# Convert the trace to a dataframe, and save that too
# Save every 10th sample from the trace, to avoid any autocorrelation issues
tr_df = pm.trace_to_dataframe(tr[::10])
tr_df.to_csv("palatability_regression_trace.csv")

# Plot the results of the palatability regression analysis
# First just plot the mean regression coefficients for every laser condition, across time
fig = plt.figure()
mean_coeff = np.mean(tr[::10]["coeff_pal"], axis=0)
hpd_coeff = pm.hpd(tr[::10]["coeff_pal"], alpha=0.05)
for condition in range(unique_lasers[0].shape[0]):
    plt.plot(x[analyze_indices], mean_coeff[:, condition], linewidth=3.0,
             label="Dur:{}ms, Lag:{}ms".format(unique_lasers[0][condition][0],
                                               unique_lasers[0][condition][1]))
plt.legend()
plt.xlabel("Time post taste delivery (ms)")
plt.ylabel("Mean posterior regression coefficient")
fig.savefig("palatability_regression_coefficients_mean.png", bbox_inches="tight")
plt.close("all")

# Now plot the mean and SD of the regression coefficients for every laser condition, across time
fig = plt.figure()
for condition in range(unique_lasers[0].shape[0]):
    plt.plot(x[analyze_indices], np.mean(tr[::10]["coeff_pal"], axis=0)[:, condition],
             linewidth=3.0,
             label="Dur:{}ms, Lag:{}ms".format(unique_lasers[0][condition][0],
                                               unique_lasers[0][condition][1]))
    plt.fill_between(x[analyze_indices], hpd_coeff[:, condition, 0],
                     hpd_coeff[:, condition, 1], alpha=0.5)
plt.legend()
plt.xlabel("Time post taste delivery (ms)")
plt.ylabel("Mean posterior regression coefficient")
from sklearn.model_selection import train_test_split
import os
import pandas as pd

data_file = 'btc.data.csv'
if not os.path.isfile(data_file):
    data = data_reader.get_data_yahoo('BTC-USD')
    data.to_csv(data_file)

data = pd.read_csv(data_file)
print(data.Open.head())
print(data.Open.tail())
print(data.Open.iloc[-1])

hpd = pm.hpd(data.Open, alpha=0.05)
print('[%f %f]' % (hpd[0], hpd[1]))

# _, (ax0, ax1) = plt.subplots(2, 1)
# sns.kdeplot(data.Open, ax=ax0)
# ax1.plot(data.Open)
# plt.show()

data = data.as_matrix(columns=['Open'])
print(data.shape)

standard_scaler = StandardScaler()
standard_scaler.fit(data)
d = standard_scaler.transform(data)
# d = standard_scaler.inverse_transform(d)
start = pm.find_MAP()
step = pm.NUTS(scaling=start)
trace_1 = pm.sample(5000, step=step, start=start)

chain_1 = trace_1[100:]
varnames = ['alpha', 'beta']
pm.traceplot(chain_1)
plt.show()

idx = np.argsort(x_1[:, 0])
bd = chain_1['bd'].mean(0)[idx]
plt.scatter(x_1[:, 0], x_1[:, 1], c=y_1)
plt.plot(x_1[:, 0][idx], bd, color='r')
bd_hpd = pm.hpd(chain_1['bd'])[idx]
plt.fill_between(x_1[:, 0][idx], bd_hpd[:, 0], bd_hpd[:, 1], color='r', alpha=0.5)
plt.xlabel(x_n[0], fontsize=16)
plt.ylabel(x_n[1], fontsize=16)
plt.show()

corr = iris[iris['species'] != 'virginica'].corr()
mask = np.tri(*corr.shape).T
sns.heatmap(corr.abs(), mask=mask, annot=True)
plt.show()
ax2.set_xlabel('Observed probabilities')
ax2.set_ylabel('Predicted probabilities')
ax2.grid(b=True, which='minor', color='w', linewidth=1.0)
# clean up and save
plt.tight_layout(pad=0.5, w_pad=0.2, h_pad=0.7)
plt.savefig('../figures/lex_dec_model_pyactr_no_imaginal.eps')
plt.savefig('../figures/lex_dec_model_pyactr_no_imaginal.png')
plt.savefig('../figures/lex_dec_model_pyactr_no_imaginal.pdf')
# plt.show()

generate_lex_dec_pyactr_no_imaginal_figure()

decay_posterior = trace["decay"]
decay_posterior.mean()
pm.hpd(decay_posterior)

threshold_posterior = trace["threshold"]
threshold_posterior.mean()
pm.hpd(threshold_posterior)

noise_posterior = trace["noise"]
noise_posterior.mean()
pm.hpd(noise_posterior)

latency_factor_posterior = trace["lf"]
latency_factor_posterior.mean()
pm.hpd(latency_factor_posterior)

latency_exponent_posterior = trace["le"]
latency_exponent_posterior.mean()
# Save these findings in a table specific to this unit
unit_table = hf5.create_table('/laser_effects_bayesian/unit_summaries',
                              'unit{:d}'.format(chosen_units[unit]),
                              description=laser_effects)

# Now run through the tastes and laser conditions
for laser in range(diff.shape[0]):
    for taste in range(diff.shape[1]):
        # Get a new row for this taste and laser condition
        this_condition_results = unit_table.row
        # Fill in the taste and laser conditions
        this_condition_results['laser'] = laser + 1
        this_condition_results['taste'] = taste + 1

        # First check if the control firing was close to zero for this taste/laser combo
        # (comparing it to a sufficiently small number because the control firing rate
        # is always > 0 by definition)
        if pm.hpd(bayesian_results[laser, taste, :, 0], alpha=sig_level)[0] <= 1e-4:
            this_condition_results['control_zero'] = 1.0
        else:
            this_condition_results['control_zero'] = 0.0

        # Then check if the laser condition has no effect on firing
        # (the diff HPD will overlap zero)
        diff_hpd = pm.hpd(diff[laser, taste, :], alpha=sig_level)
        if diff_hpd[0] * diff_hpd[1] < 0:
            this_condition_results['unchanged'] = 1.0
            this_condition_results['enhanced'] = 0.0
            this_condition_results['suppressed'] = 0.0
        # Firing is enhanced if the diff (control-laser) lies consistently below zero
        elif diff_hpd[0] < 0 and diff_hpd[1] < 0:
            this_condition_results['unchanged'] = 0.0
            this_condition_results['enhanced'] = 1.0
            this_condition_results['suppressed'] = 0.0
org_beta111 = post_beta111 * faults_sd / year_std
org_beta00 = post_beta00 * faults_sd + faults_m - (
    post_beta111 * faults_sd * year_m / year_std) - (post_beta2 * faults_sd * tem_m / tem_std)
# beta_plot = chain2['beta'][:, 0]
beta1_plot = chain2['beta1'][:, 0]
beta2_plot = chain2['beta2']

# posterior
plt.figure(figsize=(10, 10))
idx = np.argsort(elec_year)
x_ord = elec_year[idx]
ppc = pm.sample_ppc(chain2, samples=500, model=unpooled_model)
sig_y = pm.hpd(ppc['Observed'][0:42], alpha=0.05)[idx]
sig_y1 = pm.hpd(ppc['Observed'][42:91], alpha=0.05)[idx]
# plt.fill_between(x_ord, sig_y[:, 0], sig_y[:, 1], color='gray', alpha=0.4)
# plt.fill_between(x_ord, sig_y1[:, 0], sig_y1[:, 1], color='red', alpha=0.3)

# sig_y0 = pm.hpd(ppc['Observed'][1], alpha=0.5)[idx]
# sig_y11 = pm.hpd(ppc['Observed'][1], alpha=0.05)[idx]
# plt.fill_between(x_ord, sig_y[:, 0], sig_y[:, 1], color='gray', alpha=1)
# plt.fill_between(x_ord, sig_y1[:, 0], sig_y1[:, 1], color='gray', alpha=0.5)

idd = range(0, len(chain2['beta2']), 100)
plt.figure(figsize=(5, 3), facecolor=(1, 1, 1))
ax = plt.subplot(1, 1, 1)
# j, k1 = 0, 6
# for jx in range(7):
#     k1 = 6
def process_one_etf(top, result_df):
    print(top)
    bah_investor = compute_bah([top], period, cash_sum)
    print('invested:' + str(bah_investor.invested_history[-1]))
    print('value gained:' + str(bah_investor.history[-1]))
    print('returns:' + str(bah_investor.ror_history[-1]))

    investors = []
    while len(investors) < MAX_RUNS:
        investor = run_bah_sim([top], period, cash_sum)
        if len(investor.ror_history) == 0:
            continue
        investors.append(investor)
        print('%d:%f:%f:%f' % (len(investors), investor.invested,
                               investor.history[-1], investor.ror_history[-1]))

    returns_bah = np.array([investor.ror_history[-1] for investor in investors])
    # returns_bah = np.sort(returns_bah)
    print('original:%f' % bah_investor.ror_history[-1])
    print('observed:%f +/- %f' % (np.mean(returns_bah), np.std(returns_bah)))

    with pm.Model() as model:
        mu = pm.Normal('mu', mu=np.mean(returns_bah), sd=np.std(returns_bah))
        sigma = pm.Uniform('sigma', lower=0., upper=np.std(returns_bah))
        mean_returns = pm.Normal('mean_returns', mu=mu, sd=sigma, observed=np.array(returns_bah))
        trace_model = pm.sample(1000, tune=2000)

    samples_bah = pm.sample_ppc(trace_model, size=10000, model=model)
    hpd89_bah = pm.hpd(samples_bah['mean_returns'], alpha=0.11)
    print('mean 89 percentile:' + str(np.mean(hpd89_bah)))

    investors = []
    while len(investors) < MAX_RUNS:
        investor = compute_one_etf([top], period, cash_sum)
        if investor.cash == investor.invested:
            continue
        if len(investor.ror_history) == 0:
            continue
        investors.append(investor)
        print('%d:%f:%f:%f' % (len(investors), investor.invested,
                               investor.history[-1], investor.ror_history[-1]))

    returns_chaos = np.array([investor.ror_history[-1] for investor in investors])
    # returns_chaos = np.sort(returns_chaos)
    print('original:%f' % (bah_investor.ror_history[-1]))
    print('observed:%f +/- %f' % (np.mean(returns_chaos), np.std(returns_chaos)))

    with pm.Model() as model:
        mu = pm.Normal('mu', mu=np.mean(returns_chaos), sd=np.std(returns_chaos))
        sigma = pm.Uniform('sigma', lower=0., upper=np.std(returns_chaos))
        mean_returns = pm.Normal('mean_returns', mu=mu, sd=sigma, observed=np.array(returns_chaos))
        trace_model = pm.sample(1000, tune=2000)

    samples_chaos = pm.sample_ppc(trace_model, size=10000, model=model)
    hpd89_chaos = pm.hpd(samples_chaos['mean_returns'], alpha=0.11)
    print('mean 89 percentile:' + str(np.mean(hpd89_chaos)))

    validity_chaos = np.count_nonzero(np.abs(returns_chaos - bah_investor.ror_history[-1]) < 0.05) / len(returns_chaos) * 100.
    validity_bah = np.count_nonzero(np.abs(returns_bah - bah_investor.ror_history[-1]) < 0.05) / len(returns_bah) * 100.

    result_df = result_df.append({
        'ticket': top,
        'original_returns': bah_investor.ror_history[-1],
        'hpd89_bah': np.mean(hpd89_bah),
        'hpd89_chaos': np.mean(hpd89_chaos),
        'validity_bah': validity_bah,
        'validity_chaos': validity_chaos}, ignore_index=True)

    report = Report(top, bah_investor, returns_bah, hpd89_bah, samples_bah['mean_returns'],
                    returns_chaos, hpd89_chaos, samples_chaos['mean_returns'])
    report.gen_report()
    result_df.to_csv(result_csv, index=False)
    return result_df