def main():
    with pm.Model() as model:
        # Using a strong prior: the mean is closer to 0 than to 1
        prior = pm.Beta('prior', 0.5, 3)
        output = pm.Binomial('output', n=100, observed=50, p=prior)
        step = pm.Metropolis()
        trace = pm.sample(1000, step=step)

        pm.traceplot(trace)
        pm.plot_posterior(trace, figsize=(5, 5), kde_plot=True,
                          rope=[0.45, 0.55])
        # The ROPE is an interval that you define around the value you expect.
        # Check whether the ROPE falls within the HPD: if it does, our value is
        # inside the HPD and increasing the sample size may give a better mean
        # estimate.

        # Gelman-Rubin
        pm.gelman_rubin(trace)
        # forest plot
        pm.forestplot(trace, varnames=['prior'])
        # summary [look at mc_error here: this is the standard error, it should be low]
        pm.df_summary(trace)
        # autocorrelation
        pm.autocorrplot(trace)
        # effective sample size
        pm.effective_n(trace)['prior']
def summarize(best_result, kde=True, plot=True):
    trace, model = best_result
    if plot:
        ax = pm.plot_posterior(trace[100:],
                               varnames=[r"group1_mean", r"group2_mean",
                                         r"group1_std", "group2_std",
                                         r"ν_minus_one"],
                               kde_plot=kde, color="C0")
        if kde:
            for a in (1, 3):
                ax[a].lines[0].set_color("C1")
        plt.figure()
        pm.plot_posterior(trace[1000:],
                          varnames=["difference of means", "difference of stds",
                                    "effect size"],
                          ref_val=0, kde_plot=True, color="C2")
        plt.figure()
        pm.forestplot(trace[1000:], varnames=[v.name for v in model.vars[:2]])
        plt.figure()
        pm.forestplot(trace[1000:], varnames=[v.name for v in model.vars[2:]])
    pm.summary(trace[1000:],
               varnames=["difference of means", "difference of stds", "effect size"])
def trial1():
    radon = pd.read_csv('data/radon.csv')[['county', 'floor', 'log_radon']]
    # print(radon.head())
    county = pd.Categorical(radon['county']).codes
    # print(county)
    niter = 1000
    with pm.Model() as hm:
        # County hyperpriors
        mu_a = pm.Normal('mu_a', mu=0, sd=10)
        sigma_a = pm.HalfCauchy('sigma_a', beta=1)
        mu_b = pm.Normal('mu_b', mu=0, sd=10)
        sigma_b = pm.HalfCauchy('sigma_b', beta=1)

        # County slopes and intercepts
        a = pm.Normal('slope', mu=mu_a, sd=sigma_a, shape=len(set(county)))
        b = pm.Normal('intercept', mu=mu_b, sd=sigma_b, shape=len(set(county)))

        # Household errors
        sigma = pm.Gamma("sigma", alpha=10, beta=1)

        # Model prediction of radon level
        mu = a[county] + b[county] * radon.floor.values

        # Data likelihood
        y = pm.Normal('y', mu=mu, sd=sigma, observed=radon.log_radon)

        start = pm.find_MAP()
        step = pm.NUTS(scaling=start)
        hm_trace = pm.sample(niter, step, start=start)

    plt.figure(figsize=(8, 60))
    pm.forestplot(hm_trace, varnames=['slope', 'intercept'])
def _plot_changepoints(self, alpha, plot_kwargs):
    plt.figure(**plot_kwargs)
    pm.forestplot(self.trace, varnames=['changepoints_%s' % self.name],
                  ylabels=self.changepoints.astype(str))
    plt.grid()
    plt.title("Growth Change Points")
    plt.show()
def _plot_holidays(self, alpha, plot_kwargs):
    plt.figure(**plot_kwargs)
    pm.forestplot(self.trace[self.skip_first // self.chains:], alpha=alpha,
                  varnames=[self.priors_names['holidays']],
                  ylabels=self.holidays)
    plt.grid()
    plt.show()
def _plot_intercept(self, alpha: float, plot_kwargs: Dict):
    plt.figure(**plot_kwargs)
    pm.forestplot(
        self.trace[self.skip_first // self.chains:],
        var_names=[self.priors_names["intercept"]],
        ridgeplot_alpha=alpha,
    )
    plt.show()
def _plot_changepoints(self, alpha, plot_kwargs):
    plt.figure(**plot_kwargs)
    pm.forestplot(self.trace[self.skip_first // self.chains:], alpha=alpha,
                  varnames=[self.priors_names['changepoints']],
                  ylabels=self.changepoints.astype(str))
    plt.grid()
    plt.title("Growth Change Points")
    plt.show()
def _plot_regressors(self, alpha: float, plot_kwargs: Dict):
    plt.figure(**plot_kwargs)
    pm.forestplot(
        self.trace[self.skip_first // self.chains:],
        alpha=alpha,
        varnames=[self.priors_names["regressors"]],
        ylabels=self.regressors,
    )
    plt.grid()
    plt.show()
def _plot_changepoints(self, alpha: float, plot_kwargs: Dict):
    plt.figure(**plot_kwargs)
    pm.forestplot(
        self.trace[self.skip_first // self.chains:],
        alpha=alpha,
        varnames=[self.priors_names["changepoints"]],
        ylabels=np.array(self.changepoints).astype(str),
    )
    plt.grid()
    plt.title("Growth Change Points")
    plt.show()
def forestplot(self, varnames=None):
    """Generate a forestplot with 95% credible intervals and the R-hat statistic.

    Parameters
    ----------
    varnames : iterable of str or None, optional
        The model variables to generate plots for (default None).
        If None, defaults to all variables.
    """
    varnames = varnames or self.model_variables
    pm.forestplot(self.trace, varnames=varnames, color='#8BCAF1')
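A minimal usage sketch for the method above, assuming it lives on a fitted model wrapper exposing `trace` and `model_variables`; the `fitted` instance below is hypothetical and only illustrates the two calling conventions (all variables vs. an explicit subset):

# Hypothetical usage of the forestplot() method defined above; `fitted` stands
# in for an instance of the wrapper class after sampling has completed.
fitted.forestplot()                          # plot every variable in self.model_variables
fitted.forestplot(varnames=['mu', 'sigma'])  # restrict the plot to selected variables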
def main():
    data = np.array([
        51.06, 55.12, 53.73, 50.24, 52.05, 56.40, 48.45, 52.34, 55.65, 51.49,
        51.86, 63.43, 53.00, 56.09, 51.93, 52.31, 52.33, 57.48, 57.44, 55.14,
        53.93, 54.62, 56.09, 68.58, 51.36, 55.47, 50.73, 51.94, 54.95, 50.39,
        52.91, 51.5, 52.68, 47.72, 49.73, 51.82, 54.99, 52.84, 53.19, 54.52,
        51.46, 53.73, 51.61, 49.81, 52.42, 54.3, 53.84, 53.16
    ])
    # Look at the distribution of the data
    sns.kdeplot(data)

    # All of these distributions can be used to model a standard deviation:
    # it is safe to use an Exponential; a Half-Cauchy has a fat tail; a high
    # Exponential rate (lambda) gives a steep decay; an Inverse-Gamma also works.
    with pm.Model() as model:
        mu = pm.Uniform('mu', 30, 80)
        sigma = pm.HalfNormal('sigma', sd=10)
        df = pm.Exponential('df', 1.5)  # lambda = 1.5 decays more steeply than 0.5
        output = pm.StudentT('output', mu=mu, sigma=sigma, nu=df, observed=data)
        trace = pm.sample(1000)

        # Gelman-Rubin
        pm.gelman_rubin(trace)
        # forest plot
        pm.forestplot(trace)
        # summary [look at mc_error here: this is the standard error, it should be low]
        pm.summary(trace)
        # autocorrelation
        pm.autocorrplot(trace)
        # effective sample size
        pm.effective_n(trace)
def _plot_holidays(self, alpha: float, plot_kwargs: dict):
    plt.figure(**plot_kwargs)
    ax = pm.forestplot(
        self.trace[self.skip_first // self.chains:],
        ridgeplot_alpha=alpha,
        var_names=[self.priors_names["holidays"]],
    )
    ax[0].set_yticklabels(self.holidays[::-1])
    plt.grid()
    plt.show()
def main():
    X, Y = generate_sample()
    with pm.Model() as model:
        alpha = pm.Normal('alpha', mu=0, sd=20)
        beta = pm.Normal('beta', mu=0, sd=20)
        sigma = pm.Uniform('sigma', lower=0)  # note: upper bound defaults to 1
        y = pm.Normal('y', mu=beta * X + alpha, sd=sigma, observed=Y)
        start = pm.find_MAP()
        step = pm.NUTS(scaling=start)

    with model:
        if multicore:
            trace = pm.sample(itenum, step, start=start,
                              njobs=chainnum, random_seed=range(chainnum),
                              progressbar=progress)
        else:
            ts = [pm.sample(itenum, step, chain=i, progressbar=progress)
                  for i in range(chainnum)]
            trace = merge_traces(ts)

    if saveimage:
        pm.traceplot(trace)
        plt.savefig("simple_linear_trace.png")
    print("Rhat = {0}".format(pm.gelman_rubin(trace)))

    t1 = time.clock()
    print("elapsed time = {0}".format(t1 - t0))

    # trace
    if not multicore:
        trace = ts[0]
    with model:
        pm.traceplot(trace, model.vars)
        pm.forestplot(trace)

    # pickle the model and trace, then reload them
    with open("simplelinearregression_model.pkl", "wb") as fpw:
        pkl.dump(model, fpw)
    with open("simplelinearregression_trace.pkl", "wb") as fpw:
        pkl.dump(trace, fpw)
    with open("simplelinearregression_model.pkl", "rb") as fp:
        model = pkl.load(fp)
    with open("simplelinearregression_trace.pkl", "rb") as fp:
        trace = pkl.load(fp)
def show_forest(self, show_feats, feat_labels=None):
    g = pm.forestplot(self.trace_, varnames=show_feats, ylabels=feat_labels)
    f = pl.gcf()
    try:
        ax = f.get_axes()[1]
    except IndexError:
        ax = f.get_axes()[0]
    ax.grid(axis='y')
    return g
def plot_model_diagnostics(model, save_dir, file_id, export=True):
    """Generate and export a range of diagnostic plots for a given model."""
    # ensure folder exists
    if export is True:
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

    model_name = model.__class__.__name__

    trace_df = pm.trace_to_dataframe(model.trace, varnames=model.df_params)
    sns.pairplot(trace_df)
    if export is True:
        plt.savefig(save_dir + f'{model_name}_{file_id}_pairplot.pdf',
                    format='pdf', bbox_inches='tight')
        plt.cla()

    pm.traceplot(model.trace, varnames=model.df_params)
    if export is True:
        plt.savefig(save_dir + f'{model_name}_{file_id}_traceplot.pdf',
                    format='pdf', bbox_inches='tight')
        plt.cla()

    pm.autocorrplot(model.trace, varnames=model.df_params)
    if export is True:
        plt.savefig(save_dir + f'{model_name}_{file_id}_autocorrplot.pdf',
                    format='pdf', bbox_inches='tight')
        plt.cla()

    pm.forestplot(model.trace, varnames=model.df_params)
    if export is True:
        plt.savefig(save_dir + f'{model_name}_{file_id}_forestplot.pdf',
                    format='pdf', bbox_inches='tight')
        plt.cla()

    # close all figs, otherwise we can run out of memory
    plt.close("all")
def plot_forest_plot(trace, name1, name2):
    """
    Plots a forest plot

    @param trace a trace object
    @param name1 the name of the first group
    @param name2 the name of the second group
    @returns a forestplot on a gridspec
    """
    fp1 = pm.forestplot(trace, varnames=[name1, name2], rhat=False)
    return fp1
def _plot_changepoints(self, alpha: float, plot_kwargs: Dict):
    plt.figure(**plot_kwargs)
    _, ax = pm.forestplot(
        self.trace[self.skip_first // self.chains:],
        ridgeplot_alpha=alpha,
        var_names=[self.priors_names["changepoints"]],
    )
    ax[0].set_yticklabels(list(np.array(self.changepoints).astype(str))[::-1])
    plt.grid()
    plt.title("Growth Change Points")
    plt.show()
def plot_forestplots(trace):
    fig = plt.figure()
    pm.forestplot(trace, vline=1, varnames=['fold'])
    fig = plt.figure()
    pm.forestplot(trace, varnames=['z_factor', 'zp_factor'],
                  xrange=(-1, 1), vline=0.5)
    fig = plt.figure()
    pm.forestplot(trace, varnames=['sigma'])
    plt.figure()
    pm.forestplot(trace, varnames=['fold_changes'], vline=1)
def forestplot(model, bambi=False, transform=np.array, vline_label=None,
               rhat=False, **kwargs):
    """Modified forestplot function

    Forestplot function from PyMC3, adapted to automatically plot only the
    relevant effects for a bambi or PyMC3 model and to add a vertical
    no-effect line to aid with interpreting coefficients

    :param model: bambi or PyMC3 model object
    :param transform: function to transform the trace (pass np.exp for logistic regression)
    :param kwargs: keyword args for the PyMC3 forestplot function
    :returns: matplotlib subplot object with forestplot for the trace
    """
    if bambi:
        trace = model.backend.trace
        varnames = sorted(model.fixed_terms.keys())
    else:
        trace = model
        varnames = sorted(trace.varnames)

    pm.forestplot(trace, varnames=varnames, transform=transform, rhat=rhat,
                  **kwargs)
    g = plt.gca()
    # g.set(xlim=(None, None))
    if vline_label is not None:
        no_effect = float(transform(0))
        g.axes.axvline(no_effect, color='red')
        g.axes.annotate(vline_label, [no_effect, -.5], rotation=90,
                        va='center', ha='right', color='red')
    return g
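A hedged usage sketch for the wrapper above: `logit_trace` is a hypothetical PyMC3 trace from a logistic-regression model; passing `np.exp` follows the docstring's suggestion so coefficients are shown as odds ratios, and the vertical line lands at exp(0) = 1.

# Hypothetical call on a logistic-regression trace (logit_trace is not defined
# in this snippet): transform coefficients to odds ratios and mark "no effect".
ax = forestplot(logit_trace, transform=np.exp, vline_label='no effect (OR = 1)')
plt.savefig('forestplot_odds_ratios.png', bbox_inches='tight')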
def cmt_example():
    obs = {
        'n1_Hp_Rp': 519,
        'n1_Hp': 10473,
        'P_Hp': 0.1561185895315763,
        'n_Hp_Rp': 42,
        'n_Hn_Rp': 2,
        'n_Hp_Rn': 687,
        'n_Hn_Rn': 3624
    }
    trace = sample_heuristic_precision(obs, {'draws': 10000, 'tune': 5000})
    pm.plot_posterior(trace, credible_interval=0.94)
    pm.plot_posterior(trace, credible_interval=0.99)
    help(pm.plot_posterior)
    pm.traceplot(trace)
    pm.forestplot(trace)
    q_samples = trace['q']
    np.average(q_samples < 0.03)
# diagnostics: Gelman-Rubin
print(pm.diagnostics.gelman_rubin(trace))

# In[40]:

# diagnostics: n effective
print(pm.diagnostics.effective_n(trace))

# In[41]:

pm.forestplot(trace);

# In[42]:

pm.plot_posterior(trace);

# PyMC3 offers a variety of other samplers, found in pm.step_methods.

# In[43]:

list(filter(lambda x: x[0].isupper(), dir(pm.step_methods)))
def posterior_forestplot(self, **kwargs):
    return pm.forestplot(self.posterior_, **kwargs)
del model, posterior_samples, model_summary_logscale

#%%
###############################################################################
# Print results from all models
###############################################################################
import matplotlib.pyplot as plt

# Model 0
pm.traceplot(collect_results['posterior_samples'])
print(collect_results['model_summary_logscale'])

plt.figure(figsize=(4, 8))
pm.forestplot(collect_results['posterior_samples'], var_names=['beta'],
              credible_interval=0.95)
pm.forestplot(collect_results['posterior_samples'], var_names=['beta_day'],
              credible_interval=0.95)
#pm.forestplot(collect_results['0']['posterior_samples'], var_names=['alpha'], credible_interval=0.95)

# %%
filename = os.path.join(os.path.realpath(dir_picklejar), 'rjmcmc_models')
outfile = open(filename, 'wb')
pickle.dump(collect_results, outfile)
outfile.close()

# %% Residual code for safekeeping
# # Y_hat_latent = pm.Deterministic(of Y_diff_latent)
step = pm.Metropolis()
trace = pm.sample(1000, step=step, start=start)

burnin = 0  # no burn-in
chain = trace[burnin:]
pm.traceplot(chain, lines={'theta': theta_true})

with beta_binomial:
    step = pm.Metropolis()
    multi_trace = pm.sample(1000, step=step, njobs=4)

burnin = 0  # no burn-in
multi_chain = multi_trace[burnin:]
pm.traceplot(multi_chain, lines={'theta': theta_true})

# convergence
pm.gelman_rubin(multi_chain)
pm.forestplot(multi_chain, varnames=['theta'])

# summary
pm.summary(multi_chain)

# autocorrelation
pm.autocorrplot(chain)

# effective size
pm.effective_n(multi_chain)['theta']

# Summarize the posterior
pm.plot_posterior(chain, kde_plot=True)
plt.show()
def mixed_effects(): le = preprocessing.LabelEncoder() # Convert categorical variables to integer # participants_idx = le.fit_transform(messages['prev_sender']) classes = 'FF49_industry' # classes = 'underwriter_tier' # classes = 'amends' print("Grouping by: {}".format(classes)) FF49_industry = le.fit_transform(df['FF49_industry']) class_idx = le.fit_transform(df[classes]) n_classes = len(le.classes_) NSamples = 50000 burn = NSamples/10 thin = 2 covariates = [ 'Intercept', '#Syndicate Members', '#Lead Underwriters', 'Underwriter Rank', # 'FF49 Industry', 'Amends Down', '#S1A Amendments', 'Share Overhang', 'log(1+Sales)', 'log(Proceeds)', 'CASI', # 'media_1st_pricing', # 'VC', 'IPO Market Returns', 'Industry Returns', 'BAA Spread', ] y = df['days_to_first_price_update'].values # y = np.ma.masked_values(list(df.days_to_first_price_update), value=-999) with pm.Model() as model: # Parameters: intercept = pm.Gamma('Intercept', alpha=.1, beta=.1, shape=n_classes) beta_underwriter_syndicate_size = pm.Normal('#Syndicate Members', mu=0, sd=20) beta_underwriter_num_leads = pm.Normal('#Lead Underwriters', mu=0, sd=20) beta_underwriter_rank_avg = pm.Normal('Underwriter Rank', mu=0, sd=20) beta_num_SEC_amendments = pm.Normal('#S1A Amendments', mu=0, sd=20) # beta_FF49_industry = pm.Normal('FF49 Industry', mu=0, sd=20) beta_amends_down = pm.Normal('Amends Down', mu=0, sd=20) beta_share_overhang = pm.Normal('Share Overhang', mu=0, sd=20) beta_log_sales = pm.Normal('log(1+Sales)', mu=0, sd=20) beta_log_proceeds = pm.Normal('log(Proceeds)', mu=0, sd=20) beta_CASI = pm.Normal('CASI', mu=0, sd=20) # beta_media_1st_pricing = pm.Normal('media_1st_pricing', mu=0, sd=20) # beta_VC = pm.Normal('VC', mu=0, sd=20) beta_BAA_spread = pm.Normal('BAA Spread', mu=0, sd=20) beta_M3_initial_returns = pm.Normal('IPO Market Returns', mu=0, sd=20) beta_M3_indust_rets = pm.Normal('Industry Returns', mu=0, sd=20) # Hyperparameters ## alpha: hyperparameters for neg-binom distribution alpha = pm.Gamma('alpha', alpha=.1, beta=.1) # #Poisson Model Formula mu = 1 + tt.exp( intercept[class_idx] + beta_underwriter_syndicate_size * df.underwriter_syndicate_size + beta_underwriter_num_leads * df.underwriter_num_leads + beta_underwriter_rank_avg * df.underwriter_rank_avg # + beta_FF49_industry * FF49_industry + beta_amends_down * df['Amends Down'] + beta_num_SEC_amendments * df.num_SEC_amendments + beta_share_overhang * df['Share Overhang'] + beta_log_sales * df['log(1+Sales)'] + beta_CASI * df['CASI'] + beta_log_proceeds * df['log(Proceeds)'] # + beta_media_1st_pricing * df.media_1st_pricing # + beta_VC * df.VC + beta_BAA_spread * df['BAA Spread'] + beta_M3_initial_returns * df.M3_initial_returns + beta_M3_indust_rets * df.M3_indust_rets ) # Dependent Variable BoundedNegativeBinomial = pm.Bound(pm.NegativeBinomial, lower=1) y_est = BoundedNegativeBinomial('y_est', mu=mu, alpha=alpha, observed=y) y_pred = BoundedNegativeBinomial('y_pred', mu=mu, alpha=alpha, shape=y.shape) # y_est = pm.NegativeBinomial('y_est', mu=mu, alpha=alpha, observed=y) # y_pred = pm.NegativeBinomial('y_pred', mu=mu, alpha=alpha, shape=y.shape) # y_est = pm.Poisson('y_est', mu=mu, observed=data) # y_pred = pm.Poisson('y_pred', mu=mu, shape=data.shape) start = pm.find_MAP() step = pm.Metropolis(start=start) # step = pm.NUTS() # backend = pm.backends.Text('test') # trace = pm.sample(NSamples, step, start=start, chain=1, njobs=2, progressbar=True, trace=backend) trace = pm.sample(NSamples, step, start=start, njobs=1, progressbar=True) trace2 = trace trace = 
trace[-burn::thin] # waic = pm.waic(trace) # dic = pm.dic(trace) # with pm.Model() as model: # trace_loaded = pm.backends.sqlite.load('FF49_industry.sqlite') # y_pred.dump('FF49_industry_missing/y_pred') ## POSTERIOR PREDICTIVE CHECKS y_pred = trace.get_values('y_pred') pm.summary(trace, vars=covariates) # PARAMETER POSTERIORS anno_kwargs = {'xycoords': 'data', 'textcoords': 'offset points', 'rotation': 90, 'va': 'bottom', 'fontsize': 'large'} anno_kwargs2 = {'xycoords': 'data', 'textcoords': 'offset points', 'rotation': 0, 'va': 'bottom', 'fontsize': 'large'} n0, n1, n2, n3 = 1, 5, 9, 14 # numbering for posterior plots # intercepts # mn = pm.df_summary(trace)['mean']['Intercept_log__0'] # ax[0,0].annotate('{:.3f}'.format(mn), xy=(mn,0), xytext=(0,15), color=blue, **anno_kwargs2) # mn = pm.df_summary(trace)['mean']['Intercept_log__1'] # ax[0,0].annotate('{:.3f}'.format(mn), xy=(mn,0), xytext=(0,15), color=purple, **anno_kwargs2) # coeffs # mn = pm.df_summary(trace)['mean'][2] # ax[1,0].annotate('{:.3f}'.format(mn), xy=(mn,0), xytext=(5, 10), color=red, **anno_kwargs) # mn = pm.df_summary(trace)['mean'][3] # ax[2,0].annotate('{:.3f}'.format(mn), xy=(mn,0), xytext=(5,10), color=red, **anno_kwargs) # mn = pm.df_summary(trace)['mean'][4] # ax[3,0].annotate('{:.3f}'.format(mn), xy=(mn,0), xytext=(5,10), color=red, **anno_kwargs) # plt.savefig('figure1_mixed.png') ax = pm.traceplot(trace, vars=['Intercept']+trace.varnames[n0:n1], lines={k: v['mean'] for k, v in pm.df_summary(trace).iterrows()} ) for i, mn in enumerate(pm.df_summary(trace)['mean'][n0:n1]): # +1 because up and down intercept ax[i,0].annotate('{:.3f}'.format(mn), xy=(mn,0), xytext=(5,10), color=red, **anno_kwargs) plt.savefig('figure1_mixed.png') ax2 = pm.traceplot(trace, trace.varnames[n1:n2], lines={k: v['mean'] for k, v in pm.df_summary(trace).iterrows()} ) for i, mn in enumerate(pm.df_summary(trace)['mean'][n1:n2]): # +1 because up and down intercept ax2[i,0].annotate('{:.3f}'.format(mn), xy=(mn,0), xytext=(5,10), color=red, **anno_kwargs) plt.savefig('figure2_mixed.png') ax3 = pm.traceplot(trace, trace.varnames[n2:n3], lines={k: v['mean'] for k, v in pm.df_summary(trace).iterrows()} ) for i, mn in enumerate(pm.df_summary(trace)['mean'][n2:n3]): # +1 because up and down intercept ax3[i,0].annotate('{:.3f}'.format(mn), xy=(mn,0), xytext=(5,10), color=red, **anno_kwargs) plt.savefig('figure3_mixed.png') # _ = plt.figure(figsize=(5, 6)) _ = pm.forestplot(trace, vars=['Intercept'], ylabels=le.classes_) plt.savefig('forestplot_intercepts.png') _ = pm.forestplot(trace, vars=covariates[1:], ylabels=covariates[1:]) plt.savefig('forestplot_mixed.png') # pm.traceplot(trace, vars=['alpha', 'y_pred']) # def participant_y_pred(entity_name, burn=1000, hierarchical_trace=trace): # """Return posterior predictive for person""" # ix = np.where(le.classes_ == entity_name)[0][0] # return hierarchical_trace['y_pred'][burn:, ix] def participant_y_pred(entity_name, burn=1000, ypred=y_pred): """Return posterior predictive for person""" ix = np.where(le.classes_ == entity_name)[0][0] return ypred[burn:, ix] days = 7 fig = plt.figure(figsize=(16,10)) fig.add_subplot(221) entity_plotA('Up', days=days) fig.add_subplot(222) entity_plotB('Up') fig.add_subplot(223) entity_plotA('Down', days=days) fig.add_subplot(224) entity_plotB('Down') plt.savefig("figure4-postpreddist-updown")
def plot_CMReduction(self):
    assert self.trace is not None
    return pm.forestplot(self.trace,
                         var_names=[self.prefix + "CMReduction"],
                         credible_interval=0.9)
sigma = pm.Uniform('sigma', lower=0, upper=10)
    mu = pm.Deterministic('mu', a + br * leg_right)
    h = pm.Normal('h', mu=mu, sigma=sigma, observed=height)
    trace_no_collinear = pm.sample(cores=2)

#%%
model_collinear.name = 'collinear'
model_no_collinear.name = 'no-collinear'
df_comp_models = pm.compare({
    model_collinear: trace_collinear,
    model_no_collinear: trace_no_collinear
})
df_comp_models

#%%
pm.forestplot(trace_collinear, var_names=['a', 'bl', 'br', 'sigma'])
pm.forestplot(trace_no_collinear, var_names=['a', 'br', 'sigma'])

# Posterior predictive
#%%
collinear_ppc = pm.sample_posterior_predictive(trace_collinear, samples=500,
                                               model=model_collinear)
no_collinear_ppc = pm.sample_posterior_predictive(trace_no_collinear, samples=500,
                                                  model=model_no_collinear)

_, ax = plt.subplots(figsize=(12, 6))
ax.hist([h.mean() for h in collinear_ppc['h']])
ax.axvline(height.mean())
ax.set(title='Posterior predictive of the mean',
print("MODEL BUILT! READY TO FIND MAP")
start = pm.find_MAP()
step = pm.Slice()
# step = pm.NUTS(scaling=start)
niter = 300
trace = pm.sample(niter, step, start, progressbar=True)

pm.traceplot(trace)  # , vars=['muA'])
# plt.savefig("data1.png")
plt.show()

pm.forestplot(trace)
plt.show()

# # print ppc['Y_obs']
# print ppc['distributionA']
# print ppc['distributionB']

# tau = pm.Uniform('tau', lower=0, upper=1000)
# lam = pm.Uniform('lam', lower=0, upper=1000)
# alpha = pm.Uniform('alpha', lower=0.0000000000000001, upper=100)

# p_weekend = float(len(b)) / (len(b) + len(c))
# print "Got here"
# # weekend = pm.Bernoulli('weekend', p_weekend, observed=weekend_observed)
# print b[:3000]
# print c[:3000]
# startTimeWeekend = pm.Normal('a', mu=muA, sd=sigmaA, observed=b[:30000])
mean = intercept + slope * X.loc[:, 'Duration']

# Observed values
Y_obs = pm.Normal('Y_obs', mu=mean, sd=sigma, observed=y.values)

# Sampler
step = pm.NUTS()

# Posterior distribution
linear_trace = pm.sample(1000, step)

pm.traceplot(linear_trace, figsize=(12, 12))
# plt.show()

pm.plot_posterior(linear_trace, figsize=(12, 10), text_size=20)
# plt.show()

pm.forestplot(linear_trace)
# plt.show()

plt.figure(figsize=(8, 8))
pm.plot_posterior_predictive_glm(
    linear_trace, samples=100, eval=np.linspace(2, 30, 100),
    linewidth=1, color='red', alpha=0.8,
    label='Bayesian Posterior Fits',
    lm=lambda x, sample: sample['Intercept'] + sample['slope'] * x)
plt.scatter(X['Duration'], y.values, s=12,
def main(input_dir, output_dir, dataset, model_type, n_samples, n_tune, target_accept, n_cores, seed, init, profile): '''Fit log-parabola model to DATASET. Parameters ---------- input_dir : [type] input directory containing subdirs for each instrument with dl3 data output_dir : [type] where to save the results. traces and two plots dataset : string telescope name model_type : string whether to use the profile likelihood ('wstat' or 'profile') or not ('full') n_samples : int number of samples to draw n_tune : int number of tuning steps target_accept : float target accept fraction for the pymc sampler n_cores : int number of cpu cores to use seed : int random seed init : string pymc init string profile : bool whether to output debugging/profiling information to the console Raises ------ NotImplementedError This does not yet work on the joint dataset. but thats good enough for me. ''' np.random.seed(seed) if dataset == 'joint': #TODO need to calculate mu_b for each observation independently. raise NotImplementedError('This is not implemented for the joint dataset yet.') # observations, lo, hi = load_joint_spectrum_observation(input_dir) else: p = os.path.join(input_dir, dataset) observations, lo, hi = load_spectrum_observations(p) prepare_output(output_dir) # TODO: this has to happen for every observation independently exposure_ratio = observations[0].alpha[0] # print(exposure_ratio) on_data, off_data = get_observed_counts(observations) integrator = init_integrators(observations) print('On Data') display_data(on_data) print('Off Data') display_data(off_data) print('--' * 30) print(f'Fitting data for {dataset} in {len(observations)} observations. ') print(f'Using {len(on_data)} bins with { on_data.sum()} counts in on region and {off_data.sum()} counts in off region.') print(f'Fit range is: {(lo, hi) * u.TeV}.') model = pm.Model(theano_config={'compute_test_value': 'ignore'}) with model: # amplitude = pm.TruncatedNormal('amplitude', mu=4, sd=1, lower=0.01, testval=4) # alpha = pm.TruncatedNormal('alpha', mu=2.5, sd=1, lower=0.00, testval=2.5) # beta = pm.TruncatedNormal('beta', mu=0.5, sd=0.5, lower=0.00000, testval=0.5) amplitude = pm.HalfFlat('amplitude', testval=4) alpha = pm.HalfFlat('alpha', testval=2.5) beta = pm.HalfFlat('beta', testval=0.5) mu_s = forward_fold_log_parabola_symbolic(integrator, amplitude, alpha, beta, observations) # mu_s = forward_fold_log_parabola_analytic(amplitude, alpha, beta, observations) if model_type == 'wstat': print('Building profiled likelihood model') mu_b = pm.Deterministic('mu_b', calc_mu_b(mu_s, on_data, off_data, exposure_ratio)) else: print('Building full likelihood model') mu_b = pm.HalfFlat('mu_b', shape=len(off_data)) pm.Poisson('background', mu=mu_b, observed=off_data, shape=len(off_data)) pm.Poisson('signal', mu=mu_s + exposure_ratio * mu_b, observed=on_data, shape=len(on_data)) print('--' * 30) print('Model debug information:') for RV in model.basic_RVs: print(RV.name, RV.logp(model.test_point)) if profile: model.profile(model.logpt).summary() print(model.check_test_point()) print('--' * 30) print('Plotting landscape:') fig, _ = plot_landscape(model, off_data) fig.savefig(os.path.join(output_dir, 'landscape.pdf')) print('--' * 30) print('Printing graphs:') theano.printing.pydotprint(mu_s, outfile=os.path.join(output_dir, 'graph_mu_s.pdf'), format='pdf', var_with_name_simple=True) theano.printing.pydotprint(mu_s + exposure_ratio * mu_b, outfile=os.path.join(output_dir, 'graph_n_on.pdf'), format='pdf', var_with_name_simple=True) print('--' * 30) 
print('Sampling likelihood:')
with model:
    trace = pm.sample(n_samples, cores=n_cores, tune=n_tune, init=init,
                      seed=[seed] * n_cores)

print('--' * 30)
print(f'Fit results for {dataset}')
print(trace['amplitude'].mean(), trace['alpha'].mean(), trace['beta'].mean())
print(np.median(trace['amplitude']), np.median(trace['alpha']), np.median(trace['beta']))
print('--' * 30)

# print('Plotting traces')
# plt.figure()
# varnames = ['amplitude', 'alpha', 'beta'] if model_type != 'full' else ['amplitude', 'alpha', 'beta', 'mu_b']
# pm.traceplot(trace, varnames=varnames)
# plt.savefig(os.path.join(output_dir, 'traces.pdf'))

p = os.path.join(output_dir, 'num_samples.txt')
with open(p, "w") as text_file:
    text_file.write(f'\\num{{{n_samples}}}')

p = os.path.join(output_dir, 'num_chains.txt')
with open(p, "w") as text_file:
    text_file.write(f'\\num{{{n_cores}}}')

p = os.path.join(output_dir, 'num_tune.txt')
with open(p, "w") as text_file:
    text_file.write(f'\\num{{{n_tune}}}')

plt.figure()
pm.energyplot(trace)
plt.savefig(os.path.join(output_dir, 'energy.pdf'))

# plt.figure()
# pm.autocorrplot(trace, burn=n_tune)
# plt.savefig(os.path.join(output_dir, 'autocorr.pdf'))

plt.figure()
pm.forestplot(trace, varnames=['amplitude', 'alpha', 'beta'])
plt.savefig(os.path.join(output_dir, 'forest.pdf'))

trace_output = os.path.join(output_dir, 'traces')
print(f'Saving traces to {trace_output}')
with model:
    pm.save_trace(trace, trace_output, overwrite=True)
def _fit_model(): # load data df_a = pd.read_csv(os.path.join(path, 'paper_results', 'analogies.tsv'), sep='\t')[['lang', 'vecs', 'source', 'adjusted score']] df_s = pd.read_csv(os.path.join(path, 'paper_results', 'similarities.tsv'), sep='\t')[['lang', 'vecs', 'source', 'adjusted rank r']] df_n = pd.read_csv(os.path.join(path, 'paper_results', 'norms.tsv'), sep='\t')[['lang', 'vecs', 'norm', 'adjusted r']] df_b = pd.read_csv(os.path.join(path, 'paper_results', 'binder.tsv'), sep='\t')[['lang', 'vecs', 'norm', 'adjusted r']] # keep track of different evaluation tasks df_a['kind'] = 'analogies' df_s['kind'] = 'similarities' df_n['kind'] = 'norms' df_b['kind'] = 'norms' # rename different metrics to score, and various dataset origins to task df_a = df_a.rename(columns={'source': 'task', 'adjusted score': 'score'}) df_s = df_s.rename(columns={'source': 'task', 'adjusted rank r': 'score'}) df_n = df_n.rename(columns={'norm': 'task', 'adjusted r': 'score'}) df_b = df_b.rename(columns={'norm': 'task', 'adjusted r': 'score'}) # stack datasets df = pd.concat([df_a, df_s, df_n, df_b]) # merge in corpus word counts df_corpus = pd.read_csv(os.path.join(path, 'paper_results', 'table_data.tsv'), sep='\t') df = df.merge(df_corpus[['lang', 'vecs', 'words']], how='inner', on=['lang', 'vecs']) df.to_csv('model_data.tsv', sep='\t', index=False) # store merged data for record keeping df['log10_wordcount'] = np.log10(df['words']) # log-transform word counts df['log10_wordcount_z'] = standardize(df['log10_wordcount']) # standardize word counts # create sum-coded contrasts df['wiki'] = df['vecs'].apply(lambda x: sum_contrast(x, 'wiki', 'wiki+subs')) df['subs'] = df['vecs'].apply(lambda x: sum_contrast(x, 'subs', 'wiki+subs')) df['analogies'] = df['kind'].apply(lambda x: sum_contrast(x, 'analogies', 'similarities')) df['norms'] = df['kind'].apply(lambda x: sum_contrast(x, 'norms', 'similarities')) # define PyMC3 model for statistical inference with pm.Model() as beta_model: # define centered Normal priors for all the betas, sd = 1 (mild shrinkage prior) intercept = pm.Normal('μ', mu=0, sd=1) b_wordcount = pm.Normal('β log corpus word count', mu=0, sd=1) b_wiki = pm.Normal('β wiki vs. mean', mu=0, sd=1) b_subs = pm.Normal('β subs vs. mean', mu=0, sd=1) b_norms = pm.Normal('β norms vs. mean', mu=0, sd=1) b_analogies = pm.Normal('β analogies vs. mean', mu=0, sd=1) b_wiki_norms = pm.Normal('β wiki vs. mean:norms vs. mean', mu=0, sd=1) b_wiki_analogies = pm.Normal('β wiki vs. mean:analogies vs. mean', mu=0, sd=1) b_subs_norms = pm.Normal('β subs vs. mean:norms vs. mean', mu=0, sd=1) b_subs_analogies = pm.Normal('β subs vs. mean:analogies vs. mean', mu=0, sd=1) b_wikisubs = pm.Deterministic('β wiki+subs vs. mean', -1 * (b_subs + b_wiki)) b_similarities = pm.Deterministic('β similarities vs. mean', -1 * (b_analogies + b_norms)) b_wikisubs_norms = pm.Deterministic('β wiki+subs vs. mean:norms vs. mean', -1 * (b_subs_norms + b_wiki_norms)) b_wikisubs_analogies = pm.Deterministic('β wiki+subs vs. mean:analogies vs. mean', -1 * (b_subs_analogies + b_wiki_analogies)) b_subs_similarities = pm.Deterministic('β subs vs. mean:similarities vs. mean', -1 * (b_subs_analogies + b_subs_norms)) b_wiki_similarities = pm.Deterministic('β wiki vs. mean:similarities vs. mean', -1 * (b_wiki_analogies + b_wiki_norms)) # given the above, there are two ways to compute the interaction wiki+subs vs.mean:similarities vs. 
mean # both methods are given below, but we only need to use one # they give the exact same answer though, you can uncomment the second line to verify b_wikisubs_similarities = pm.Deterministic('β wiki+subs vs. mean:similarities vs. mean', -1 * (b_wiki_similarities + b_subs_similarities)) # b_wikisubs_similarities2 = pm.Deterministic('β wiki+subs vs. mean:similarities vs. mean (2)', -1 * (b_wikisubs_analogies + b_wikisubs_norms)) # non-centered parametrization for task-level random intercepts task_codes, task_uniques = df['task'].factorize() # get number of unique groups and code them mu_tilde_task = pm.Normal('μ\u0303 task', mu=0, sd=1, shape=len(task_uniques)) # prior for task group offsets sigma_task = pm.HalfNormal('σ task', sd=1) # prior for task group sigma mu_task = pm.Deterministic('μ task', sigma_task * mu_tilde_task) # task group means (random intercepts) # non-centered parametrization for language-level random intercepts lang_codes, lang_uniques = df['lang'].factorize() # get number of unique groups and code them mu_tilde_lang = pm.Normal('μ\u0303 lang', mu=0, sd=1, shape=len(lang_uniques)) # prior for lang group offsets sigma_lang = pm.HalfNormal('σ lang', sd=1) # prior for lang group sigma mu_lang = pm.Deterministic('μ lang', sigma_lang * mu_tilde_lang) # lang group means (random intercepts) # compute predictions for y, using logit link function y_hat = pm.Deterministic('ŷ', pm.math.invlogit( intercept + b_wordcount * df['log10_wordcount_z'] + b_wiki * df['wiki'] + b_subs * df['subs'] + b_norms * df['norms'] + b_analogies * df['analogies'] + b_wiki_norms * df['wiki'] * df['norms'] + b_wiki_analogies * df['wiki'] * df['analogies'] + b_subs_norms * df['subs'] * df['norms'] + b_subs_analogies * df['subs'] * df['analogies'] + mu_lang[lang_codes] + mu_task[task_codes] )) # define likelihood invphi = pm.HalfNormal('1 / φ', sd=1) # prior for phi, for Beta(mu, phi) parametrization of the likelihood distribution phi = pm.Deterministic('φ', 1 / invphi) y = pm.Beta('y', alpha=y_hat * phi, beta=(1 - y_hat) * phi, observed=df['score']) # sample with 3 chains, 2000 warmup + 4000 posterior samples per chain trace = pm.sample(2500, tune=2500, chains=4, target_accept=.9) # store trace summary as tsv and LaTeX table df_summary = pm.summary(trace, credible_interval=.9) df_summary.to_csv('trace_summary.tsv', sep='\t') with open('trace_summary_latex.txt', 'w') as latextable: latextable.write(df_summary.round(2).to_latex()) # draw and store model graph graph = pm.model_to_graphviz(beta_model) graph.graph_attr['rankdir'] = 'LR' # change graph orientation to left-right (from top-down) graph.render(filename='model', format='pdf', cleanup=True) # draw and store forest plot varnames = [ 'μ', 'β log corpus word count', 'β subs vs. mean', 'β wiki vs. mean', 'β wiki+subs vs. mean', 'β analogies vs. mean', 'β norms vs. mean', 'β similarities vs. mean', 'β subs vs. mean:analogies vs. mean', 'β subs vs. mean:norms vs. mean', 'β subs vs. mean:similarities vs. mean', 'β wiki vs. mean:analogies vs. mean', 'β wiki vs. mean:norms vs. mean', 'β wiki vs. mean:similarities vs. mean', 'β wiki+subs vs. mean:analogies vs. mean', 'β wiki+subs vs. mean:norms vs. mean', 'β wiki+subs vs. mean:similarities vs. 
mean', ] axes = pm.forestplot(trace, var_names=varnames, credible_interval=.9, combined=True, figsize=(4, 6)) axes[0].set(title='90% credible intervals', xlabel='coefficient (in log-odds)') plt.savefig('forestplot.pdf') plt.savefig('forestplot.png', dpi=600) plt.clf() # draw and store trace plot pm.traceplot(trace) plt.savefig('traceplot.png', dpi=300) # the traceplot is huge, so we lower the resolution and don't store it as pdf plt.clf() return df_summary
y_pred = pm.Normal('y_pred', mu=mu, sd=epsilon, observed=y)
    start = pm.find_MAP()
    step = pm.NUTS(scaling=start)
    trace_red = pm.sample(5000, step=step, start=start)

pm.traceplot(trace_red)
plt.show()

sns.kdeplot(trace_red['beta'][:, 0], trace_red['beta'][:, 1])
plt.xlabel(r'$\beta_1$', fontsize=16)
plt.ylabel(r'$\beta_2$', fontsize=16, rotation=0)
plt.show()

pm.forestplot(trace_red, varnames=['beta'])
plt.show()

# Masking effect variables
N = 100
r = 0.8
x_0 = np.random.normal(size=N)
x_1 = np.random.normal(loc=x_0 * r, scale=(1 - r**2)**0.5)
y = np.random.normal(loc=x_0 - x_1)
X = np.vstack((x_0, x_1))
scatter_plot(X, y)
plt.show()

with pm.Model() as model_ma:
    alpha = pm.Normal('alpha', mu=0, sd=10)
    beta = pm.Normal('beta', mu=0, sd=10, shape=2)
mc.traceplot(trace, vars=['mean', 'sigma'], ax=axes)

fig.tight_layout()
fig.savefig("ch16-posterior-sample-trace.png")
fig.savefig("ch16-posterior-sample-trace.pdf")

# In[43]:

mu, trace.get_values('mean').mean()

# In[44]:

s, trace.get_values('sigma').mean()

# In[45]:

gs = mc.forestplot(trace, vars=['mean', 'sigma'])
plt.savefig("ch16-forestplot.pdf")

# In[46]:

help(mc.summary)

# In[47]:

mc.summary(trace, vars=['mean', 'sigma'])

# ## Linear regression

# In[48]:

dataset = sm.datasets.get_rdataset("Davis", "car")
effect_size = pm.Deterministic(
        'effect size',
        diff_of_means / np.sqrt((group1_std**2 + group2_std**2) / 2))

    # RUN
    # trace = pm.sample(2000, cores=2)
    # Nota Bene: https://github.com/pymc-devs/pymc3/issues/3388
    trace = pm.sample(1000, tune=1000, cores=1)

pm.kdeplot(np.random.exponential(30, size=10000), shade=0.5)

pm.plot_posterior(trace,
                  varnames=['group1_mean', 'group2_mean', 'group1_std',
                            'group2_std', 'ν_minus_one'],
                  color='#87ceeb')
pm.plot_posterior(trace,
                  varnames=['difference of means', 'difference of stds',
                            'effect size'],
                  ref_val=0, color='#87ceeb')

pm.forestplot(trace, varnames=['group1_mean', 'group2_mean'])
pm.forestplot(trace, varnames=['group1_std', 'group2_std', 'ν_minus_one'])

pm.summary(trace,
           varnames=['difference of means', 'difference of stds', 'effect size'])
    # Linear combinations of parameters
    theta = invlogit(alpha + beta * dose)

    # Model likelihood
    deaths = Binomial('deaths', n=n, p=theta, observed=y)

with bioassay_model:
    # Draw samples
    trace = sample(1000, njobs=2)

# Plot two parameters
forestplot(trace, varnames=['alpha', 'beta'])

### MOTIVATING EXAMPLE -- LINEAR REGRESSION

import numpy as np
import matplotlib.pyplot as plt

# Initialize random number generator
np.random.seed(123)

# True parameter values
alpha, sigma = 1, 1
beta = [1, 2.5]