Example #1
def main():

    with pm.Model() as model:
        # Using a strong prior: Beta(0.5, 3) pulls the mean towards zero rather than towards 1
        prior = pm.Beta('prior', 0.5, 3)

        output = pm.Binomial('output', n=100, observed=50, p=prior)

        step = pm.Metropolis()
        trace = pm.sample(1000, step=step)
        pm.traceplot(trace)

    pm.plot_posterior(trace, figsize=(5, 5), kde_plot=True,
                      rope=[0.45, 0.55])  # The ROPE is an interval you define
    # around the value you expect. You can then check whether
    # the ROPE falls inside the HPD interval. If it does, the
    # expected value is consistent with the posterior, and
    # increasing the sample size may sharpen the mean estimate.
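    # Illustrative sketch (not from the original source): one way to check
    # numerically whether the ROPE lies inside the 95% HPD interval, using
    # the old PyMC3 API (pm.hpd) consistent with the rest of these examples.
    hpd_low, hpd_high = pm.hpd(trace['prior'])
    rope_low, rope_high = 0.45, 0.55
    if hpd_low <= rope_low and rope_high <= hpd_high:
        print('ROPE lies entirely inside the 95% HPD interval')
    else:
        print('ROPE is (partly) outside the 95% HPD interval')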

    # gelman rubin
    pm.gelman_rubin(trace)

    # forestplot
    pm.forestplot(trace, varnames=['prior'])

    # summary [look at mc_error here: the Monte Carlo standard error, which should be low]
    pm.df_summary(trace)
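    # Illustrative follow-up (not from the original source), assuming the
    # PyMC3 3.x summary DataFrame, which exposes an 'mc_error' column:
    summary_df = pm.df_summary(trace)
    print(summary_df['mc_error'])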

    #autocorrelation
    pm.autocorrplot(trace)

    # effective size
    pm.effective_n(trace)['prior']
Example #2
def summarize(best_result, kde=True, plot=True):
    trace, model = best_result
    if plot:
        ax = pm.plot_posterior(trace[100:],
                               varnames=[
                                   r"group1_mean", r"group2_mean",
                                   r"group1_std", "group2_std", r"ν_minus_one"
                               ],
                               kde_plot=kde,
                               color="C0")
        if kde:
            for a in (1, 3):
                ax[a].lines[0].set_color("C1")
        plt.figure()
        pm.plot_posterior(trace[1000:],
                          varnames=[
                              "difference of means", "difference of stds",
                              "effect size"
                          ],
                          ref_val=0,
                          kde_plot=True,
                          color="C2")
        plt.figure()
        pm.forestplot(trace[1000:], varnames=[v.name for v in model.vars[:2]])
        plt.figure()
        pm.forestplot(trace[1000:], varnames=[v.name for v in model.vars[2:]])

    pm.summary(
        trace[1000:],
        varnames=["difference of means", "difference of stds", "effect size"])
Example #3
def trial1():
    radon = pd.read_csv('data/radon.csv')[['county', 'floor', 'log_radon']]
    # print(radon.head())
    county = pd.Categorical(radon['county']).codes
    # print(county)

    niter = 1000
    with pm.Model() as hm:
        # County hyperpriors
        mu_a = pm.Normal('mu_a', mu=0, sd=10)
        sigma_a = pm.HalfCauchy('sigma_a', beta=1)
        mu_b = pm.Normal('mu_b', mu=0, sd=10)
        sigma_b = pm.HalfCauchy('sigma_b', beta=1)

        # County slopes and intercepts
        a = pm.Normal('slope', mu=mu_a, sd=sigma_a, shape=len(set(county)))
        b = pm.Normal('intercept', mu=mu_b, sd=sigma_b, shape=len(set(county)))

        # Household errors
        sigma = pm.Gamma("sigma", alpha=10, beta=1)

        # Model prediction of radon level
        mu = a[county] + b[county] * radon.floor.values

        # Data likelihood
        y = pm.Normal('y', mu=mu, sd=sigma, observed=radon.log_radon)

        start = pm.find_MAP()
        step = pm.NUTS(scaling=start)
        hm_trace = pm.sample(niter, step, start=start)

        plt.figure(figsize=(8, 60))
        pm.forestplot(hm_trace, varnames=['slope', 'intercept'])
Example #4
 def _plot_changepoints(self, alpha, plot_kwargs):
     plt.figure(**plot_kwargs)
     pm.forestplot(self.trace,
                   varnames=['changepoints_%s' % self.name],
                   ylabels=self.changepoints.astype(str))
     plt.grid()
     plt.title("Growth Change Points")
     plt.show()
Example #5
 def _plot_holidays(self, alpha, plot_kwargs):
     plt.figure(**plot_kwargs)
     pm.forestplot(self.trace[self.skip_first // self.chains:],
                   alpha=alpha,
                   varnames=[self.priors_names['holidays']],
                   ylabels=self.holidays)
     plt.grid()
     plt.show()
Example #6
 def _plot_intercept(self, alpha: float, plot_kwargs: Dict):
     plt.figure(**plot_kwargs)
     pm.forestplot(
         self.trace[self.skip_first // self.chains:],
         var_names=[self.priors_names["intercept"]],
         ridgeplot_alpha=alpha,
     )
     plt.show()
Example #7
 def _plot_changepoints(self, alpha, plot_kwargs):
     plt.figure(**plot_kwargs)
     pm.forestplot(self.trace[self.skip_first // self.chains:], alpha=alpha,
                   varnames=[self.priors_names['changepoints']],
                   ylabels=self.changepoints.astype(str))
     plt.grid()
     plt.title("Growth Change Points")
     plt.show()
Example #8
 def _plot_regressors(self, alpha: float, plot_kwargs: Dict):
     plt.figure(**plot_kwargs)
     pm.forestplot(
         self.trace[self.skip_first // self.chains:],
         alpha=alpha,
         varnames=[self.priors_names["regressors"]],
         ylabels=self.regressors,
     )
     plt.grid()
     plt.show()
Example #9
 def _plot_changepoints(self, alpha: float, plot_kwargs: Dict):
     plt.figure(**plot_kwargs)
     pm.forestplot(
         self.trace[self.skip_first // self.chains:],
         alpha=alpha,
         varnames=[self.priors_names["changepoints"]],
         ylabels=np.array(self.changepoints).astype(str),
     )
     plt.grid()
     plt.title("Growth Change Points")
     plt.show()
Example #10
    def forestplot(self, varnames=None):
        """Generate a forestplot with 95% credible intervals and R hat statistic.
        
           Parameters
           ----------
           varnames : iterable of str or None, optional
               The model variables to generate plots for (default None).
               If None, defaults to all variables.
        """

        varnames = varnames or self.model_variables
        pm.forestplot(self.trace, varnames=varnames, color='#8BCAF1')
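Example #11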
def main():

    data = np.array([
        51.06, 55.12, 53.73, 50.24, 52.05, 56.40, 48.45, 52.34, 55.65, 51.49,
        51.86, 63.43, 53.00, 56.09, 51.93, 52.31, 52.33, 57.48, 57.44, 55.14,
        53.93, 54.62, 56.09, 68.58, 51.36, 55.47, 50.73, 51.94, 54.95, 50.39,
        52.91, 51.5, 52.68, 47.72, 49.73, 51.82, 54.99, 52.84, 53.19, 54.52,
        51.46, 53.73, 51.61, 49.81, 52.42, 54.3, 53.84, 53.16
    ])

    # look at the distribution of the data
    sns.kdeplot(data)

    # Several distributions are commonly used to model a standard deviation:
    # the Exponential is a safe default,
    # the HalfCauchy has a fat tail,
    # a higher Exponential rate (lambda) gives a steeper, more concentrated prior,
    # and the Inverse-Gamma is another option.
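    # Illustrative sketch (not part of the original example): comparing the
    # tails of these candidate priors for the standard deviation with SciPy.
    import matplotlib.pyplot as plt
    from scipy import stats
    xs = np.linspace(0.01, 30, 300)
    plt.figure()
    plt.plot(xs, stats.expon(scale=1 / 1.5).pdf(xs), label='Exponential(lam=1.5)')
    plt.plot(xs, stats.halfcauchy(scale=5).pdf(xs), label='HalfCauchy(beta=5)')
    plt.plot(xs, stats.invgamma(a=3).pdf(xs), label='InverseGamma(alpha=3)')
    plt.legend()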
    with pm.Model() as model:
        mu = pm.Uniform('mu', 30, 80)
        sigma = pm.HalfNormal('sigma', sd=10)
        df = pm.Exponential(
            'df', 1.5)  # lambda = 1.5 gives a steeper prior; 0.5 would be flatter
        output = pm.StudentT('output',
                             mu=mu,
                             sigma=sigma,
                             nu=df,
                             observed=data)

        trace = pm.sample(1000)

        # gelman rubin
        pm.gelman_rubin(trace)

        # forestplot
        pm.forestplot(trace)

        # summary [look at mc_error here: the Monte Carlo standard error, which should be low]
        pm.summary(trace)

        #autocorrelation
        pm.autocorrplot(trace)

        # effective size
        pm.effective_n(trace)
Example #12
 def _plot_holidays(self, alpha: float, plot_kwargs: dict):
     plt.figure(**plot_kwargs)
     ax = pm.forestplot(
         self.trace[self.skip_first // self.chains:],
         ridgeplot_alpha=alpha,
         var_names=[self.priors_names["holidays"]],
     )
     ax[0].set_yticklabels(self.holidays[::-1])
     plt.grid()
     plt.show()
Example #13
def main():
  X, Y = generate_sample()

  with pm.Model() as model:
    alpha = pm.Normal('alpha', mu=0, sd=20)
    beta = pm.Normal('beta', mu=0, sd=20)
    sigma = pm.Uniform('sigma', lower=0)
    y = pm.Normal('y', mu=beta*X+alpha, sd=sigma, observed=Y)
    start = pm.find_MAP()
    step = pm.NUTS(state=start)

  with model:
    if (multicore):
      trace = pm.sample(itenum, step, start=start,
        njobs=chainnum, random_seed=range(chainnum), progressbar=progress)
    else:
      ts = [pm.sample(itenum, step, chain=i, progressbar=progress)
            for i in range(chainnum)]
      trace = merge_traces(ts)

    if (saveimage):
      pm.traceplot(trace).savefig("simple_linear_trace.png")
    print "Rhat = {0}".format(pm.gelman_rubin(trace))

  t1 = time.clock()
  print "elapsed time = {0}".format(t1 - t0)

  # trace
  if not multicore:
    trace = ts[0]
  with model:
    pm.traceplot(trace, model.vars)

  pm.forestplot(trace)

  with open("simplelinearregression_model.pkl","w") as fpw:
  	pkl.dump(model,fpw)
  with open("simplelinearregression_trace.pkl","w") as fpw:
  	pkl.dump(trace,fpw)
  with open("simplelinearregression_model.pkl") as fp:
  	model=pkl.load(fp)
  with open("simplelinearregression_trace.pkl") as fp:
  	trace=pkl.load(fp)
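Example #14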
 def show_forest(self, show_feats, feat_labels=None):
     g = pm.forestplot(self.trace_, varnames=show_feats,
                          ylabels=feat_labels)
     f = pl.gcf()
     try:
         ax = f.get_axes()[1]
     except IndexError:
         ax = f.get_axes()[0]
     ax.grid(axis='y')
     return g
Example #15
def plot_model_diagnostics(model, save_dir, file_id, export=True):
    """generate and export a range of diagnostic plots for a given model"""

    # ensure folder exists
    if export is True:
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

    model_name = model.__class__.__name__

    trace_df = pm.trace_to_dataframe(model.trace, varnames=model.df_params)

    sns.pairplot(trace_df)
    if export is True:
        plt.savefig(save_dir + f'{model_name}_{file_id}_pairplot.pdf',
                    format='pdf',
                    bbox_inches='tight')
        plt.cla()

    pm.traceplot(model.trace, varnames=model.df_params)
    if export is True:
        plt.savefig(save_dir + f'{model_name}_{file_id}_traceplot.pdf',
                    format='pdf',
                    bbox_inches='tight')
        plt.cla()

    pm.autocorrplot(model.trace, varnames=model.df_params)
    if export is True:
        plt.savefig(save_dir + f'{model_name}_{file_id}_autocorrplot.pdf',
                    format='pdf',
                    bbox_inches='tight')
        plt.cla()

    pm.forestplot(model.trace, varnames=model.df_params)
    if export is True:
        plt.savefig(save_dir + f'{model_name}_{file_id}_forestplot.pdf',
                    format='pdf',
                    bbox_inches='tight')
        plt.cla()

    # close all figs, otherwise we can run out of memory
    plt.close("all")
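Example #16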
def plot_forest_plot(trace, name1, name2):
    """
    Plots a forest plot

    @param trace a trace object
    @param name1 the name of the first group
    @param name2 the name of the second group
    @returns a forestplot on a gridspec
    """
    fp1 = pm.forestplot(trace, varnames=[name1, name2], rhat=False)
    return fp1
Example #17
 def _plot_changepoints(self, alpha: float, plot_kwargs: Dict):
     plt.figure(**plot_kwargs)
     _, ax = pm.forestplot(
         self.trace[self.skip_first // self.chains :],
         ridgeplot_alpha=alpha,
         var_names=[self.priors_names["changepoints"]],
     )
     ax[0].set_yticklabels(list(np.array(self.changepoints).astype(str))[::-1])
     plt.grid()
     plt.title("Growth Change Points")
     plt.show()
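Example #18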
def plot_forestplots(trace):
    fig = plt.figure()
    pm.forestplot(trace, vline=1, varnames=['fold'])

    fig = plt.figure()
    pm.forestplot(trace,
                  varnames=['z_factor', 'zp_factor'],
                  xrange=(-1, 1),
                  vline=0.5)

    fig = plt.figure()
    pm.forestplot(trace, varnames=['sigma'])

    plt.figure()
    pm.forestplot(trace, varnames=['fold_changes'], vline=1)
Example #19
def forestplot(model,
               bambi=False,
               transform=np.array,
               vline_label=None,
               rhat=False,
               **kwargs):
    """Modified forestplot function

    Forestplot function from PyMC3, adapted to automatically plot only relevant effects
    for a BAMBI or PyMC3 model and to add a vertical no effect line to aid with interpreting coefficients

    :param model: BAMBI or PyMC3 model object
    :param transform: function to transform trace (pass np.exp for logistic regression)
    :param kwargs: keyword args for PyMC3 forestplot function
    :returns: matplotlib subplot object with forestplot for trace
    """
    if bambi:
        trace = model.backend.trace
        varnames = sorted(model.fixed_terms.keys())
    else:
        trace = model
        varnames = sorted(trace.varnames)
    pm.forestplot(trace,
                  varnames=varnames,
                  transform=transform,
                  rhat=rhat,
                  **kwargs)
    g = plt.gca()
    #g.set(xlim=(None, None))
    if vline_label is not None:
        no_effect = float(transform(0))
        g.axes.axvline(no_effect, color='red')
        g.axes.annotate(vline_label, [no_effect, -.5],
                        rotation=90,
                        va='center',
                        ha='right',
                        color='red')
    return g
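Example #20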
def cmt_example():
    obs = {
        'n1_Hp_Rp': 519,
        'n1_Hp': 10473,
        'P_Hp': 0.1561185895315763,
        'n_Hp_Rp': 42,
        'n_Hn_Rp': 2,
        'n_Hp_Rn': 687,
        'n_Hn_Rn': 3624
    }

    trace = sample_heuristic_precision(obs, {'draws': 10000, 'tune': 5000})

    pm.plot_posterior(trace, credible_interval=0.94)
    pm.plot_posterior(trace, credible_interval=0.99)

    help(pm.plot_posterior)

    pm.traceplot(trace)
    pm.forestplot(trace)

    q_samples = trace['q']
    np.average(q_samples < 0.03)
Example #21
# diagnostics: Gelman-Rubin
print(pm.diagnostics.gelman_rubin(trace))


# In[40]:


# diagnostics: n effective
print(pm.diagnostics.effective_n(trace))


# In[41]:


pm.forestplot(trace);


# In[42]:


pm.plot_posterior(trace);


# PyMC3 offers a variety of other samplers, found in pm.step_methods.

# In[43]:


list(filter(lambda x: x[0].isupper(), dir(pm.step_methods)))
Example #22
 def posterior_forestplot(self, **kwargs):
     return pm.forestplot(self.posterior_, **kwargs)
Example #23
del model, posterior_samples, model_summary_logscale

#%%

###############################################################################
# Print results from all models
###############################################################################
import matplotlib.pyplot as plt

# Model 0
pm.traceplot(collect_results['posterior_samples'])
print(collect_results['model_summary_logscale'])

plt.figure(figsize=(4, 8))
pm.forestplot(collect_results['posterior_samples'],
              var_names=['beta'],
              credible_interval=0.95)
pm.forestplot(collect_results['posterior_samples'],
              var_names=['beta_day'],
              credible_interval=0.95)
#pm.forestplot(collect_results['0']['posterior_samples'], var_names=['alpha'], credible_interval=0.95)

# %%
filename = os.path.join(os.path.realpath(dir_picklejar), 'rjmcmc_models')
outfile = open(filename, 'wb')
pickle.dump(collect_results, outfile)
outfile.close()

# %% Residual code for safekeeping

#    # Y_hat_latent = pm.Determinist(of Y_diff_latent)
        step = pm.Metropolis()
        trace = pm.sample(1000, step=step, start=start)

    burnin = 0  # no burnin
    chain = trace[burnin:]
    pm.traceplot(chain, lines={'theta': theta_true})

    with beta_binomial:
        step = pm.Metropolis()
        multi_trace = pm.sample(1000, step=step, njobs=4)

    burnin = 0  # no burnin
    multi_chain = multi_trace[burnin:]
    pm.traceplot(multi_chain, lines={'theta': theta_true})

    # convergence
    pm.gelman_rubin(multi_chain)
    pm.forestplot(multi_chain, varnames=['theta'])

    # summary
    pm.summary(multi_chain)

    # autocorrelation
    pm.autocorrplot(chain)

    # effective size
    pm.effective_n(multi_chain)['theta']

    # Summarize the posterior
    pm.plot_posterior(chain, kde_plot=True)
    plt.show()
def mixed_effects():


    le = preprocessing.LabelEncoder()
    # Convert categorical variables to integer
    # participants_idx = le.fit_transform(messages['prev_sender'])

    classes = 'FF49_industry'
    # classes = 'underwriter_tier'
    # classes = 'amends'

    print("Grouping by: {}".format(classes))

    FF49_industry = le.fit_transform(df['FF49_industry'])
    class_idx = le.fit_transform(df[classes])
    n_classes = len(le.classes_)


    NSamples = 50000
    burn = NSamples // 10  # integer, used later as a slice index
    thin = 2

    covariates = [
            'Intercept',
            '#Syndicate Members',
            '#Lead Underwriters',
            'Underwriter Rank',
            # 'FF49 Industry',
            'Amends Down',
            '#S1A Amendments',
            'Share Overhang',
            'log(1+Sales)',
            'log(Proceeds)',
            'CASI',
            # 'media_1st_pricing',
            # 'VC',
            'IPO Market Returns',
            'Industry Returns',
            'BAA Spread',
            ]

    y = df['days_to_first_price_update'].values
    # y = np.ma.masked_values(list(df.days_to_first_price_update), value=-999)



    with pm.Model() as model:

        # Parameters:
        intercept = pm.Gamma('Intercept', alpha=.1, beta=.1, shape=n_classes)

        beta_underwriter_syndicate_size = pm.Normal('#Syndicate Members', mu=0, sd=20)
        beta_underwriter_num_leads = pm.Normal('#Lead Underwriters', mu=0, sd=20)
        beta_underwriter_rank_avg = pm.Normal('Underwriter Rank', mu=0, sd=20)
        beta_num_SEC_amendments = pm.Normal('#S1A Amendments', mu=0, sd=20)
        # beta_FF49_industry = pm.Normal('FF49 Industry', mu=0, sd=20)
        beta_amends_down = pm.Normal('Amends Down', mu=0, sd=20)
        beta_share_overhang = pm.Normal('Share Overhang', mu=0, sd=20)
        beta_log_sales = pm.Normal('log(1+Sales)', mu=0, sd=20)
        beta_log_proceeds = pm.Normal('log(Proceeds)', mu=0, sd=20)
        beta_CASI = pm.Normal('CASI', mu=0, sd=20)
        # beta_media_1st_pricing = pm.Normal('media_1st_pricing', mu=0, sd=20)
        # beta_VC = pm.Normal('VC', mu=0, sd=20)
        beta_BAA_spread = pm.Normal('BAA Spread', mu=0, sd=20)
        beta_M3_initial_returns = pm.Normal('IPO Market Returns', mu=0, sd=20)
        beta_M3_indust_rets = pm.Normal('Industry Returns', mu=0, sd=20)

        # Hyperparameters
        ## alpha: hyperparameters for neg-binom distribution
        alpha = pm.Gamma('alpha', alpha=.1, beta=.1)



        # #Poisson Model Formula
        mu = 1 + tt.exp(
                intercept[class_idx]
                + beta_underwriter_syndicate_size * df.underwriter_syndicate_size
                + beta_underwriter_num_leads * df.underwriter_num_leads
                + beta_underwriter_rank_avg * df.underwriter_rank_avg
                # + beta_FF49_industry * FF49_industry
                + beta_amends_down * df['Amends Down']
                + beta_num_SEC_amendments * df.num_SEC_amendments
                + beta_share_overhang * df['Share Overhang']
                + beta_log_sales * df['log(1+Sales)']
                + beta_CASI * df['CASI']
                + beta_log_proceeds * df['log(Proceeds)']
                # + beta_media_1st_pricing * df.media_1st_pricing
                # + beta_VC * df.VC
                + beta_BAA_spread * df['BAA Spread']
                + beta_M3_initial_returns * df.M3_initial_returns
                + beta_M3_indust_rets * df.M3_indust_rets
                    )

        # Dependent Variable
        BoundedNegativeBinomial = pm.Bound(pm.NegativeBinomial, lower=1)
        y_est = BoundedNegativeBinomial('y_est', mu=mu, alpha=alpha, observed=y)
        y_pred = BoundedNegativeBinomial('y_pred', mu=mu, alpha=alpha, shape=y.shape)
        # y_est = pm.NegativeBinomial('y_est', mu=mu, alpha=alpha, observed=y)
        # y_pred = pm.NegativeBinomial('y_pred', mu=mu, alpha=alpha, shape=y.shape)
        # y_est = pm.Poisson('y_est', mu=mu, observed=data)
        # y_pred = pm.Poisson('y_pred', mu=mu, shape=data.shape)

        start = pm.find_MAP()
        step = pm.Metropolis(start=start)
        # step = pm.NUTS()
        # backend = pm.backends.Text('test')
        # trace = pm.sample(NSamples, step, start=start, chain=1, njobs=2, progressbar=True, trace=backend)
        trace = pm.sample(NSamples, step, start=start, njobs=1, progressbar=True)

        trace2 = trace
        trace = trace[-burn::thin]

        # waic = pm.waic(trace)
        # dic = pm.dic(trace)



    # with pm.Model() as model:
    #     trace_loaded = pm.backends.sqlite.load('FF49_industry.sqlite')
        # y_pred.dump('FF49_industry_missing/y_pred')


    ## POSTERIOR PREDICTIVE CHECKS
    y_pred = trace.get_values('y_pred')
    pm.summary(trace, vars=covariates)


    # PARAMETER POSTERIORS
    anno_kwargs = {'xycoords': 'data', 'textcoords': 'offset points',
                    'rotation': 90, 'va': 'bottom', 'fontsize': 'large'}
    anno_kwargs2 = {'xycoords': 'data', 'textcoords': 'offset points',
                    'rotation': 0, 'va': 'bottom', 'fontsize': 'large'}


    n0, n1, n2, n3 = 1, 5, 9, 14 # numbering for posterior plots
    # intercepts
    # mn = pm.df_summary(trace)['mean']['Intercept_log__0']
    # ax[0,0].annotate('{:.3f}'.format(mn), xy=(mn,0), xytext=(0,15), color=blue, **anno_kwargs2)
    # mn = pm.df_summary(trace)['mean']['Intercept_log__1']
    # ax[0,0].annotate('{:.3f}'.format(mn), xy=(mn,0), xytext=(0,15), color=purple, **anno_kwargs2)
    # coeffs
    # mn = pm.df_summary(trace)['mean'][2]
    # ax[1,0].annotate('{:.3f}'.format(mn), xy=(mn,0), xytext=(5, 10), color=red, **anno_kwargs)
    # mn = pm.df_summary(trace)['mean'][3]
    # ax[2,0].annotate('{:.3f}'.format(mn), xy=(mn,0), xytext=(5,10), color=red, **anno_kwargs)
    # mn = pm.df_summary(trace)['mean'][4]
    # ax[3,0].annotate('{:.3f}'.format(mn), xy=(mn,0), xytext=(5,10), color=red, **anno_kwargs)
    # plt.savefig('figure1_mixed.png')

    ax = pm.traceplot(trace, vars=['Intercept']+trace.varnames[n0:n1],
            lines={k: v['mean'] for k, v in pm.df_summary(trace).iterrows()}
            )

    for i, mn in enumerate(pm.df_summary(trace)['mean'][n0:n1]): # +1 because up and down intercept
        ax[i,0].annotate('{:.3f}'.format(mn), xy=(mn,0), xytext=(5,10), color=red, **anno_kwargs)
    plt.savefig('figure1_mixed.png')


    ax2 = pm.traceplot(trace, trace.varnames[n1:n2],
            lines={k: v['mean'] for k, v in pm.df_summary(trace).iterrows()}
            )
    for i, mn in enumerate(pm.df_summary(trace)['mean'][n1:n2]): # +1 because up and down intercept
        ax2[i,0].annotate('{:.3f}'.format(mn), xy=(mn,0), xytext=(5,10), color=red, **anno_kwargs)
    plt.savefig('figure2_mixed.png')



    ax3 = pm.traceplot(trace, trace.varnames[n2:n3],
            lines={k: v['mean'] for k, v in pm.df_summary(trace).iterrows()}
            )
    for i, mn in enumerate(pm.df_summary(trace)['mean'][n2:n3]): # +1 because up and down intercept
        ax3[i,0].annotate('{:.3f}'.format(mn), xy=(mn,0), xytext=(5,10), color=red, **anno_kwargs)
    plt.savefig('figure3_mixed.png')


    # _ = plt.figure(figsize=(5, 6))
    _ = pm.forestplot(trace, vars=['Intercept'], ylabels=le.classes_)
    plt.savefig('forestplot_intercepts.png')
    _ = pm.forestplot(trace, vars=covariates[1:], ylabels=covariates[1:])
    plt.savefig('forestplot_mixed.png')

    # pm.traceplot(trace, vars=['alpha', 'y_pred'])



    # def participant_y_pred(entity_name, burn=1000, hierarchical_trace=trace):
    #     """Return posterior predictive for person"""
    #     ix = np.where(le.classes_ == entity_name)[0][0]
    #     return hierarchical_trace['y_pred'][burn:, ix]

    def participant_y_pred(entity_name, burn=1000, ypred=y_pred):
        """Return posterior predictive for person"""
        ix = np.where(le.classes_ == entity_name)[0][0]
        return ypred[burn:, ix]

    days = 7

    fig = plt.figure(figsize=(16,10))
    fig.add_subplot(221)
    entity_plotA('Up', days=days)
    fig.add_subplot(222)
    entity_plotB('Up')

    fig.add_subplot(223)
    entity_plotA('Down', days=days)
    fig.add_subplot(224)
    entity_plotB('Down')
    plt.savefig("figure4-postpreddist-updown")
Example #26
 def plot_CMReduction(self):
     assert self.trace is not None
     return pm.forestplot(self.trace,
                          var_names=[self.prefix + "CMReduction"],
                          credible_interval=0.9)
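Example #27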
    sigma = pm.Uniform('sigma', lower=0, upper=10)
    mu = pm.Deterministic('mu', a + br * leg_right)
    h = pm.Normal('h', mu=mu, sigma=sigma, observed=height)
    trace_no_collinear = pm.sample(cores=2)

#%%
model_collinear.name = 'collinear'
model_no_collinear.name = 'no-collinear'
df_comp_models = pm.compare({
    model_collinear: trace_collinear,
    model_no_collinear: trace_no_collinear
})
df_comp_models

#%%
pm.forestplot(trace_collinear, var_names=['a', 'bl', 'br', 'sigma'])
pm.forestplot(trace_no_collinear, var_names=['a', 'br', 'sigma'])

# Posterior predictive
#%%
collinear_ppc = pm.sample_posterior_predictive(trace_collinear,
                                               samples=500,
                                               model=model_collinear)
no_collinear_ppc = pm.sample_posterior_predictive(trace_no_collinear,
                                                  samples=500,
                                                  model=model_no_collinear)

_, ax = plt.subplots(figsize=(12, 6))
ax.hist([h.mean() for h in collinear_ppc['h']])
ax.axvline(height.mean())
ax.set(title='Posterior predictive of the mean')
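Example #28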
   
    print "MODEL BUILT! READY TO FIND MAP"
    start = pm.find_MAP()
    step = pm.Slice()
    # step = pm.NUTS(scaling=start)

    niter = 300
    trace = pm.sample(niter, step, start, progressbar=True)

    pm.traceplot(trace)
        # , vars=['muA'])
    # plt.savefig("data1.png")
    plt.show()

    pm.forestplot(trace)
    plt.show()

    # # print ppc['Y_obs']
    # print ppc['distributionA']
    # print ppc['distributionB']
    # tau = pm.Uniform('tau', lower=0, upper=1000)
    # lam = pm.Uniform('lam', lower = 0, upper = 1000)
    # alpha = pm.Uniform('alpha', lower = 0.0000000000000001, upper = 100)
    
    # p_weekend = float(len(b)) / (len(b) + len(c))
    # print "Got here"
    # # weekend = pm.Bernoulli('weekend', p_weekend, observed = weekend_observed)
    # print b[:3000]
    # print c[:3000]
    # startTimeWeekend = pm.Normal('a', mu = muA, sd = sigmaA, observed = b[:30000])
Example #29
    mean = intercept + slope * X.loc[:, 'Duration']

    # Observed values
    Y_obs = pm.Normal('Y_obs', mu=mean, sd=sigma, observed=y.values)

    # Sampler
    step = pm.NUTS()

    # Posterior distribution
    linear_trace = pm.sample(1000, step)

pm.traceplot(linear_trace, figsize=(12, 12))
# plt.show()
pm.plot_posterior(linear_trace, figsize=(12, 10), text_size=20)
# plt.show()
pm.forestplot(linear_trace)
# plt.show()

plt.figure(figsize=(8, 8))
pm.plot_posterior_predictive_glm(
    linear_trace,
    samples=100,
    eval=np.linspace(2, 30, 100),
    linewidth=1,
    color='red',
    alpha=0.8,
    label='Bayesian Posterior Fits',
    lm=lambda x, sample: sample['Intercept'] + sample['slope'] * x)
plt.scatter(X['Duration'],
            y.values,
            s=12)
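Example #30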
def main(input_dir, output_dir, dataset, model_type, n_samples, n_tune, target_accept, n_cores, seed, init, profile):
    '''Fit log-parabola model to DATASET. 

    Parameters
    ----------
    input_dir : [type]
        input directory containing subdirs for each instrument with dl3 data
    output_dir : [type]
        where to save the results. traces and two plots
    dataset : string
        telescope name
    model_type : string
        whether to use the profile likelihood ('wstat' or 'profile') or not ('full')
    n_samples : int
        number of samples to draw
    n_tune : int
        number of tuning steps
    target_accept : float
        target accept fraction for the pymc sampler
    n_cores : int
        number of cpu cores to use
    seed : int
        random seed
    init : string
        pymc init string
    profile : bool
        whether to output debugging/profiling information to the console
    Raises
    ------
    NotImplementedError
        This does not yet work on the joint dataset, but that's good enough for me.
    '''
    np.random.seed(seed)

    if dataset == 'joint':
        #TODO need to calculate mu_b for each observation independently.
        raise NotImplementedError('This is not implemented for the joint dataset yet.')
        # observations, lo, hi = load_joint_spectrum_observation(input_dir)
    else:
        p = os.path.join(input_dir, dataset)
        observations, lo, hi = load_spectrum_observations(p)

    prepare_output(output_dir)

    # TODO: this has to happen for every observation independently
    exposure_ratio = observations[0].alpha[0]
    # print(exposure_ratio)
    on_data, off_data = get_observed_counts(observations)

    integrator = init_integrators(observations)

    print('On Data')
    display_data(on_data)

    print('Off Data')
    display_data(off_data)
    
    print('--' * 30)
    print(f'Fitting data for {dataset} in {len(observations)} observations.  ')
    print(f'Using {len(on_data)} bins with { on_data.sum()} counts in on region and {off_data.sum()} counts in off region.')
    print(f'Fit range is: {(lo, hi) * u.TeV}.')
    model = pm.Model(theano_config={'compute_test_value': 'ignore'})
    with model:
        # amplitude = pm.TruncatedNormal('amplitude', mu=4, sd=1, lower=0.01, testval=4)
        # alpha = pm.TruncatedNormal('alpha', mu=2.5, sd=1, lower=0.00, testval=2.5)
        # beta = pm.TruncatedNormal('beta', mu=0.5, sd=0.5, lower=0.00000, testval=0.5)
        amplitude = pm.HalfFlat('amplitude', testval=4)
        alpha = pm.HalfFlat('alpha', testval=2.5)
        beta = pm.HalfFlat('beta', testval=0.5)

        mu_s = forward_fold_log_parabola_symbolic(integrator, amplitude, alpha, beta, observations)
        # mu_s = forward_fold_log_parabola_analytic(amplitude, alpha, beta, observations)

        if model_type == 'wstat':
            print('Building profiled likelihood model')
            mu_b = pm.Deterministic('mu_b', calc_mu_b(mu_s, on_data, off_data, exposure_ratio))
        else:
            print('Building full likelihood model')
            mu_b = pm.HalfFlat('mu_b', shape=len(off_data))

        pm.Poisson('background', mu=mu_b, observed=off_data, shape=len(off_data))
        pm.Poisson('signal', mu=mu_s + exposure_ratio * mu_b, observed=on_data, shape=len(on_data))


    print('--' * 30)
    print('Model debug information:')
    for RV in model.basic_RVs:
        print(RV.name, RV.logp(model.test_point))

    if profile:
        model.profile(model.logpt).summary()

    print(model.check_test_point())

    print('--' * 30)
    print('Plotting landscape:')
    fig, _ = plot_landscape(model, off_data)
    fig.savefig(os.path.join(output_dir, 'landscape.pdf'))

    print('--' * 30)
    print('Printing  graphs:')
    theano.printing.pydotprint(mu_s, outfile=os.path.join(output_dir, 'graph_mu_s.pdf'), format='pdf', var_with_name_simple=True)  
    theano.printing.pydotprint(mu_s + exposure_ratio * mu_b, outfile=os.path.join(output_dir, 'graph_n_on.pdf'), format='pdf', var_with_name_simple=True)  


    print('--' * 30)
    print('Sampling likelihood:')
    with model:
        trace = pm.sample(n_samples, cores=n_cores, tune=n_tune, init=init, seed=[seed] * n_cores)

    print('--' * 30)
    print(f'Fit results for {dataset}')
    print(trace['amplitude'].mean(), trace['alpha'].mean(), trace['beta'].mean())
    print(np.median(trace['amplitude']), np.median(trace['alpha']), np.median(trace['beta']))

    print('--' * 30)
    # print('Plotting traces')
    # plt.figure()
    # varnames = ['amplitude', 'alpha', 'beta'] if model_type != 'full' else ['amplitude', 'alpha', 'beta', 'mu_b']
    # pm.traceplot(trace, varnames=varnames)
    # plt.savefig(os.path.join(output_dir, 'traces.pdf'))

    p = os.path.join(output_dir, 'num_samples.txt')
    with open(p, "w") as text_file:
        text_file.write(f'\\num{{{n_samples}}}')

    p = os.path.join(output_dir, 'num_chains.txt')
    with open(p, "w") as text_file:
        text_file.write(f'\\num{{{n_cores}}}')
    
    p = os.path.join(output_dir, 'num_tune.txt')
    with open(p, "w") as text_file:
        text_file.write(f'\\num{{{n_tune}}}')

    plt.figure()
    pm.energyplot(trace)
    plt.savefig(os.path.join(output_dir, 'energy.pdf'))

    # plt.figure()
    # pm.autocorrplot(trace, burn=n_tune)
    # plt.savefig(os.path.join(output_dir, 'autocorr.pdf'))
    
    plt.figure()
    pm.forestplot(trace, varnames=['amplitude', 'alpha', 'beta'])
    plt.savefig(os.path.join(output_dir, 'forest.pdf'))
    

    trace_output = os.path.join(output_dir, 'traces')
    print(f'Saving traces to {trace_output}')
    with model:
        pm.save_trace(trace, trace_output, overwrite=True)
Example #31
def _fit_model():
    # load data
    df_a = pd.read_csv(os.path.join(path, 'paper_results', 'analogies.tsv'), sep='\t')[['lang', 'vecs', 'source', 'adjusted score']]
    df_s = pd.read_csv(os.path.join(path, 'paper_results', 'similarities.tsv'), sep='\t')[['lang', 'vecs', 'source', 'adjusted rank r']]
    df_n = pd.read_csv(os.path.join(path, 'paper_results', 'norms.tsv'), sep='\t')[['lang', 'vecs', 'norm', 'adjusted r']]
    df_b = pd.read_csv(os.path.join(path, 'paper_results', 'binder.tsv'), sep='\t')[['lang', 'vecs', 'norm', 'adjusted r']]

    # keep track of different evaluation tasks
    df_a['kind'] = 'analogies'
    df_s['kind'] = 'similarities'
    df_n['kind'] = 'norms'
    df_b['kind'] = 'norms'

    # rename different metrics to score, and various dataset origins to task
    df_a = df_a.rename(columns={'source': 'task', 'adjusted score': 'score'})
    df_s = df_s.rename(columns={'source': 'task', 'adjusted rank r': 'score'})
    df_n = df_n.rename(columns={'norm': 'task', 'adjusted r': 'score'})
    df_b = df_b.rename(columns={'norm': 'task', 'adjusted r': 'score'})

    # stack datasets
    df = pd.concat([df_a, df_s, df_n, df_b])

    # merge in corpus word counts
    df_corpus = pd.read_csv(os.path.join(path, 'paper_results', 'table_data.tsv'), sep='\t')
    df = df.merge(df_corpus[['lang', 'vecs', 'words']], how='inner', on=['lang', 'vecs'])

    df.to_csv('model_data.tsv', sep='\t', index=False)  # store merged data for record keeping

    df['log10_wordcount'] = np.log10(df['words'])  # log-transform word counts
    df['log10_wordcount_z'] = standardize(df['log10_wordcount'])  # standardize word counts

    # create sum-coded contrasts
    df['wiki'] = df['vecs'].apply(lambda x: sum_contrast(x, 'wiki', 'wiki+subs'))
    df['subs'] = df['vecs'].apply(lambda x: sum_contrast(x, 'subs', 'wiki+subs'))
    df['analogies'] = df['kind'].apply(lambda x: sum_contrast(x, 'analogies', 'similarities'))
    df['norms'] = df['kind'].apply(lambda x: sum_contrast(x, 'norms', 'similarities'))

    # define PyMC3 model for statistical inference
    with pm.Model() as beta_model:
        # define centered Normal priors for all the betas, sd = 1 (mild shrinkage prior)
        intercept = pm.Normal('μ', mu=0, sd=1)
        b_wordcount = pm.Normal('β log corpus word count', mu=0, sd=1)
        b_wiki = pm.Normal('β wiki vs. mean', mu=0, sd=1)
        b_subs = pm.Normal('β subs vs. mean', mu=0, sd=1)
        b_norms = pm.Normal('β norms vs. mean', mu=0, sd=1)
        b_analogies = pm.Normal('β analogies vs. mean', mu=0, sd=1)
        b_wiki_norms = pm.Normal('β wiki vs. mean:norms vs. mean', mu=0, sd=1)
        b_wiki_analogies = pm.Normal('β wiki vs. mean:analogies vs. mean', mu=0, sd=1)
        b_subs_norms = pm.Normal('β subs vs. mean:norms vs. mean', mu=0, sd=1)
        b_subs_analogies = pm.Normal('β subs vs. mean:analogies vs. mean', mu=0, sd=1)

        b_wikisubs = pm.Deterministic('β wiki+subs vs. mean', -1 * (b_subs + b_wiki))
        b_similarities = pm.Deterministic('β similarities vs. mean', -1 * (b_analogies + b_norms))
        b_wikisubs_norms = pm.Deterministic('β wiki+subs vs. mean:norms vs. mean', -1 * (b_subs_norms + b_wiki_norms))
        b_wikisubs_analogies = pm.Deterministic('β wiki+subs vs. mean:analogies vs. mean', -1 * (b_subs_analogies + b_wiki_analogies))
        b_subs_similarities = pm.Deterministic('β subs vs. mean:similarities vs. mean', -1 * (b_subs_analogies + b_subs_norms))
        b_wiki_similarities = pm.Deterministic('β wiki vs. mean:similarities vs. mean', -1 * (b_wiki_analogies + b_wiki_norms))

        # given the above, there are two ways to compute the interaction wiki+subs vs.mean:similarities vs. mean
        # both methods are given below, but we only need to use one
        # they give the exact same answer though, you can uncomment the second line to verify
        b_wikisubs_similarities = pm.Deterministic('β wiki+subs vs. mean:similarities vs. mean', -1 * (b_wiki_similarities + b_subs_similarities))
        # b_wikisubs_similarities2 = pm.Deterministic('β wiki+subs vs. mean:similarities vs. mean (2)', -1 * (b_wikisubs_analogies + b_wikisubs_norms))

        # non-centered parametrization for task-level random intercepts
        task_codes, task_uniques = df['task'].factorize()  # get number of unique groups and code them
        mu_tilde_task = pm.Normal('μ\u0303 task', mu=0, sd=1, shape=len(task_uniques))  # prior for task group offsets
        sigma_task = pm.HalfNormal('σ task', sd=1)  # prior for task group sigma
        mu_task = pm.Deterministic('μ task', sigma_task * mu_tilde_task)  # task group means (random intercepts)

        # non-centered parametrization for language-level random intercepts
        lang_codes, lang_uniques = df['lang'].factorize()  # get number of unique groups and code them
        mu_tilde_lang = pm.Normal('μ\u0303 lang', mu=0, sd=1, shape=len(lang_uniques))  # prior for lang group offsets
        sigma_lang = pm.HalfNormal('σ lang', sd=1)  # prior for lang group sigma
        mu_lang = pm.Deterministic('μ lang', sigma_lang * mu_tilde_lang)  # lang group means (random intercepts)

        # compute predictions for y, using logit link function
        y_hat = pm.Deterministic('ŷ', pm.math.invlogit(
            intercept
            + b_wordcount * df['log10_wordcount_z']
            + b_wiki * df['wiki']
            + b_subs * df['subs']
            + b_norms * df['norms']
            + b_analogies * df['analogies']
            + b_wiki_norms * df['wiki'] * df['norms']
            + b_wiki_analogies * df['wiki'] * df['analogies']
            + b_subs_norms * df['subs'] * df['norms']
            + b_subs_analogies * df['subs'] * df['analogies']
            + mu_lang[lang_codes]
            + mu_task[task_codes]
        ))

        # define likelihood
        invphi = pm.HalfNormal('1 / φ', sd=1)  # prior for phi, for Beta(mu, phi) parametrization of the likelihood distribution
        phi = pm.Deterministic('φ', 1 / invphi)
        y = pm.Beta('y', alpha=y_hat * phi, beta=(1 - y_hat) * phi, observed=df['score'])

        # sample with 4 chains, 2500 warmup + 2500 posterior samples per chain
        trace = pm.sample(2500, tune=2500, chains=4, target_accept=.9)

    # store trace summary as tsv and LaTeX table
    df_summary = pm.summary(trace, credible_interval=.9)
    df_summary.to_csv('trace_summary.tsv', sep='\t')
    with open('trace_summary_latex.txt', 'w') as latextable:
        latextable.write(df_summary.round(2).to_latex())

    # draw and store model graph
    graph = pm.model_to_graphviz(beta_model)
    graph.graph_attr['rankdir'] = 'LR'  # change graph orientation to left-right (from top-down)
    graph.render(filename='model', format='pdf', cleanup=True)

    # draw and store forest plot
    varnames = [
        'μ',
        'β log corpus word count',
        'β subs vs. mean',
        'β wiki vs. mean',
        'β wiki+subs vs. mean',
        'β analogies vs. mean',
        'β norms vs. mean',
        'β similarities vs. mean',
        'β subs vs. mean:analogies vs. mean',
        'β subs vs. mean:norms vs. mean',
        'β subs vs. mean:similarities vs. mean',
        'β wiki vs. mean:analogies vs. mean',
        'β wiki vs. mean:norms vs. mean',
        'β wiki vs. mean:similarities vs. mean',
        'β wiki+subs vs. mean:analogies vs. mean',
        'β wiki+subs vs. mean:norms vs. mean',
        'β wiki+subs vs. mean:similarities vs. mean',
    ]
    axes = pm.forestplot(trace, var_names=varnames, credible_interval=.9, combined=True, figsize=(4, 6))
    axes[0].set(title='90% credible intervals', xlabel='coefficient (in log-odds)')
    plt.savefig('forestplot.pdf')
    plt.savefig('forestplot.png', dpi=600)
    plt.clf()

    # draw and store trace plot
    pm.traceplot(trace)
    plt.savefig('traceplot.png', dpi=300)  # the traceplot is huge, so we lower the resolution and don't store it as pdf
    plt.clf()

    return df_summary
Example #32
    y_pred = pm.Normal('y_pred', mu=mu, sd=epsilon, observed=y)

    start = pm.find_MAP()
    step = pm.NUTS(scaling=start)
    trace_red = pm.sample(5000, step=step, start=start)

pm.traceplot(trace_red)
plt.show()

sns.kdeplot(trace_red['beta'][:, 0], trace_red['beta'][:, 1])
plt.xlabel(r'$\beta_1$', fontsize=16)
plt.ylabel(r'$\beta_2$', fontsize=16, rotation=0)
plt.show()

pm.forestplot(trace_red, varnames=['beta'])
plt.show()

# Masking effect variables
N = 100
r = 0.8
x_0 = np.random.normal(size=N)
x_1 = np.random.normal(loc=x_0 * r, scale=(1 - r**2)**0.5)
y = np.random.normal(loc=x_0 - x_1)
X = np.vstack((x_0, x_1))
scatter_plot(X, y)
plt.show()

with pm.Model() as model_ma:
    alpha = pm.Normal('alpha', mu=0, sd=10)
    beta = pm.Normal('beta', mu=0, sd=10, shape=2)
Example #33
mc.traceplot(trace, vars=['mean', 'sigma'], ax=axes)
fig.tight_layout()
fig.savefig("ch16-posterior-sample-trace.png")
fig.savefig("ch16-posterior-sample-trace.pdf")

# In[43]:

mu, trace.get_values('mean').mean()

# In[44]:

s, trace.get_values('sigma').mean()

# In[45]:

gs = mc.forestplot(trace, vars=['mean', 'sigma'])
plt.savefig("ch16-forestplot.pdf")

# In[46]:

help(mc.summary)

# In[47]:

mc.summary(trace, vars=['mean', 'sigma'])

# ## Linear regression

# In[48]:

dataset = sm.datasets.get_rdataset("Davis", "car")
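Example #34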
    effect_size = pm.Deterministic(
        'effect size', diff_of_means / np.sqrt(
            (group1_std**2 + group2_std**2) / 2))

    # RUN
    #trace = pm.sample(2000, cores=2)  #  Nota Bene: https://github.com/pymc-devs/pymc3/issues/3388
    trace = pm.sample(1000, tune=1000, cores=1)

pm.kdeplot(np.random.exponential(30, size=10000), shade=0.5)

pm.plot_posterior(trace,
                  varnames=[
                      'group1_mean', 'group2_mean', 'group1_std', 'group2_std',
                      'ν_minus_one'
                  ],
                  color='#87ceeb')

pm.plot_posterior(
    trace,
    varnames=['difference of means', 'difference of stds', 'effect size'],
    ref_val=0,
    color='#87ceeb')

pm.forestplot(trace, varnames=['group1_mean', 'group2_mean'])

pm.forestplot(trace, varnames=['group1_std', 'group2_std', 'ν_minus_one'])

pm.summary(
    trace,
    varnames=['difference of means', 'difference of stds', 'effect size'])
Example #35
    # Linear combinations of parameters
    theta = invlogit(alpha + beta*dose)

    # Model likelihood
    deaths = Binomial('deaths', n=n, p=theta, observed=y)
  




with bioassay_model:

    # Draw samples
    trace = sample(1000, njobs=2)
    # Plot two parameters
    forestplot(trace, varnames=['alpha', 'beta'])
  



### MOTIVATING EXAMPLE -- LINEAR REGRESSION 
import numpy as np
import matplotlib.pyplot as plt

# Initialize random number generator
np.random.seed(123)

# True parameter values
alpha, sigma = 1, 1
beta = [1, 2.5]