Example #1
def convergence_diagnose_birt(m, a):
    # m is a fitted pymc.MCMC model; `a` names the variable to diagnose
    node = getattr(m, a)

    pymc.raftery_lewis(node, q=0.025, r=0.01)
    scores = pymc.geweke(node, intervals=20)
    pymc.Matplot.geweke_plot(scores)
    pymc.gelman_rubin(m)
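A minimal, self-contained driver for the calls above (a sketch, assuming the classic PyMC2 API; the toy model is a stand-in):

import numpy as np
import pymc

mu = pymc.Normal('mu', mu=0.0, tau=1.0)
obs = pymc.Normal('obs', mu=mu, tau=1.0,
                  value=np.random.randn(100), observed=True)
m = pymc.MCMC([mu, obs])

# gelman_rubin needs at least two chains of equal length
for _ in range(2):
    m.sample(iter=5000, burn=1000, progress_bar=False)

pymc.raftery_lewis(m.mu, q=0.025, r=0.01)   # run-length diagnostic
scores = pymc.geweke(m.mu, intervals=20)    # within-chain stationarity
pymc.Matplot.geweke_plot(scores)
print(pymc.gelman_rubin(m))                 # between-chain R-hat per variable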
Example #2
def bayes_model(data, savedir, num_jobs=1):
    mu, alpha, beta = param_selector(data)

    @pm.stochastic(observed=True)
    def custom_stochastic(value=data, mu=mu, alpha=alpha, beta=beta):
        r = np.zeros(len(value))
        for i in range(1, len(value)):
            r[i] = math.exp(-beta * (value[i] - value[i - 1])) * (1 + r[i - 1])
        # Calculate the log-likelihood
        loglik = -value[-1] * mu
        loglik = loglik + alpha / beta * sum(
            np.exp(-beta * (value[-1] - value)) - 1)
        loglik = loglik + np.sum(np.log(mu + alpha * r))
        return loglik

    model = pm.MCMC([mu, alpha, beta, custom_stochastic])

    srfpass = False
    while not srfpass:
        for i in range(num_jobs):
            model.sample(300000, 60000, 7)
            model.write_csv('{0}full_params{1}.csv'.format(savedir, i))
        srfpass = srf_checker(pm.gelman_rubin(model))
    with open('{0}srf.csv'.format(savedir), 'w') as wfile:
        print('Parameter,SRF', file=wfile)
        srfdict = pm.gelman_rubin(model)
        for k, v in srfdict.items():
            print('{0},{1}'.format(k, v), file=wfile)
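The `srf_checker` helper above is external and not shown; one plausible implementation (hypothetical) accepts the run only when every scale reduction factor is below a common threshold:

def srf_checker(srfdict, threshold=1.1):
    # srfdict maps parameter names to Gelman-Rubin R values
    return all(np.max(v) < threshold for v in srfdict.values())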
Example #3
def gelman_rubin(chain, labels, plt_label, nchunk=10):
    # chain has shape (nchains, niter, nparameters)
    nparameters = chain.shape[2]
    niter = chain.shape[1]
    gelman_rubin_r = np.zeros(shape=(nchunk, nparameters))
    niter_plot = np.zeros(nchunk, dtype='int')
    for i in range(nchunk):
        niter_plot[i] = int(niter / nchunk) * i + int(niter / nchunk)
        gelman_rubin_r[i, :] = pymc.gelman_rubin(chain[:, :niter_plot[i], :])

    ### plotting
    fig, ax = plt.subplots(1, 1, figsize=(8, 8))
    cmap = get_cmap(nparameters)
    for i in range(nparameters):
        ax.plot(niter_plot, gelman_rubin_r[:, i], 'o', label=labels[i],
                color=cmap(i), lw=1.5, linestyle='-', alpha=0.6)

    ax.set_ylabel('Gelman-Rubin R')
    ax.set_xlabel('iteration')

    ### limits (avoid shadowing the min/max builtins)
    rmin, rmax = gelman_rubin_r.min(), gelman_rubin_r.max()
    ax.set_ylim(rmin * 0.95, rmax * 1.05)

    if rmax > 10:
        ax.set_yscale('log', nonposy='clip', subsy=(1, 2, 4))
        ax.yaxis.set_minor_formatter(minorFormatter)
        ax.yaxis.set_major_formatter(majorFormatter)

    ax.axhline(1.2, linestyle='--', color='red', lw=1, zorder=-1)

    ax.legend(prop={'size': 10}, ncol=max(1, nparameters // 5),
              numpoints=1, markerscale=0.7)

    fig.tight_layout()
    plt.savefig('gelman_rubin_' + plt_label + '.png', dpi=150)
    plt.close()
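A usage sketch for the function above (assuming pymc 2.x plus the module-level helpers it references: plt, get_cmap, minorFormatter, majorFormatter):

chain = np.random.randn(4, 10000, 3)  # (nchains, niter, nparameters)
gelman_rubin(chain, labels=['a', 'b', 'c'], plt_label='demo', nchunk=10)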
Example #4
    def test_simple(self):

        S2 = copy.copy(S)
        S2.sample(iterations, burnin, progress_bar=0)

        gr = pymc.gelman_rubin(S2)

        for i in gr:
            assert_approx_equal(gr[i], 1., 2)
Example #5
    def test_simple(self):

        S2 = copy.copy(S)
        S2.sample(10000, 2000, progress_bar=0)

        gr = pymc.gelman_rubin(S2)

        for i in gr:
            assert_approx_equal(gr[i], 1., 2)
Example #6
def fit_two_mcmc(time, 
                 signal, 
                 height_th,
                 one_pulse,
                 sigma0,    # signal noise
                 sum_mu, 
                 sum_tau, 
                 sum_a,
                 sum_b,
                 diff_tau,
                 diff_a,
                 diff_b, 
                 sampling, 
                 burn, 
                 thin,
                 Plot=False, 
                 debug=False, 
                 auto=False):
    
    # LIMIT SEARCH FOR OFFSETS
    _t_initial=time[pd.srlatch_rev(signal,0,height_th)][0] 
    _t_final=time[pd.srlatch_rev(signal,0,height_th)][-1] 
    
    def model(x, f): 
        # PRIORS
        y_err = sigma0
        # print (_t_initial,_t_final, one_x_offset_init)
        one_x_offset = pymc.Uniform("one_x_offset", _t_initial, time[np.argmax(signal)], value=_t_initial)
        two_x_offset = pymc.Uniform("two_x_offset", _t_initial, _t_final, value=_t_final)
        sum_of_amps = pymc.TruncatedNormal("sum_amps", 
                                           mu=sum_mu, 
                                           tau=sum_tau, 
                                           a=sum_a, 
                                           b=sum_b, 
                                           value=sum_mu) #sigma/mu is the n=1 std deviation in units of n=1 amplitude
        diff_of_amps = pymc.TruncatedNormal("diff_amps", 
                                            mu=0, 
                                            tau=diff_tau, 
                                            a=diff_a, 
                                            b=diff_b, 
                                            value=0)
        one_x_amplitude = (sum_of_amps+diff_of_amps)/2
        two_x_amplitude = (sum_of_amps-diff_of_amps)/2
        # MODEL
        @pymc.deterministic(plot=False)
        def mod_two_pulse(x=time, 
                          one_x_offset=one_x_offset, 
                          two_x_offset=two_x_offset, 
                          one_x_amplitude=one_x_amplitude, 
                          two_x_amplitude=two_x_amplitude):
            return (one_pulse(x, x_offset=one_x_offset, amplitude=one_x_amplitude) +
                    one_pulse(x, x_offset=two_x_offset, amplitude=two_x_amplitude))

        # likelihood
        y = pymc.Normal("y", mu=mod_two_pulse, tau= 1.0/y_err**2, value=signal, observed=True)
        return locals()

    MDL = pymc.MCMC(model(time,signal), db='pickle') # The sample is stored in a Python serialization (pickle) database
    # MDL.use_step_method(pymc.AdaptiveMetropolis, 
    #     [MDL.sum_of_amps, MDL.diff_of_amps],
    #     scales={MDL.sum_of_amps:np.sqrt(1/sum_tau), 
    #             MDL.diff_of_amps:np.sqrt(1/diff_tau)}, 
    #     )
    if auto: 
        # uses Raftery Lewis to determine fit Parameters per trace: 
        # https://pymc-devs.github.io/pymc/modelchecking.html#convergence-diagnostics
        
        # pilot run
        InitSamples = 4*len(time)
        InitMDL = MDL
        InitMDL.sample(iter=InitSamples, burn=int(InitSamples*.5), thin=10)
        pymc_diagnostic = pymc.raftery_lewis(InitMDL, q=0.025, r=0.02, verbose=0)
        [EstBurn, EstSampling, EstThin] = np.max(
            np.array(
                [pymc_diagnostic[i] for i in list(pymc_diagnostic.keys())[1:]]  # skip first key: mod_two_pulse irrelevant
            ),
            axis=0)[2:]  # skip first 2 diagnostics: 1st-order Markov chain irrelevant
        # print [EstBurn, EstSampling, EstThin]
        # actual run
        MDL.sample(iter=EstSampling, burn=EstBurn, thin=EstThin, verbose=0)
    else:
        MDL.sample(iter=sampling, burn=burn, thin=thin, verbose=-1)  
    # thin: consider every 'thin' samples
    # burn: number of samples to discard: decide by num of samples to run till parameters stabilise at desired precision
    if Plot:
        y_fit = MDL.mod_two_pulse.value #get mcmc fitted values
        plt.plot(time, signal, 'b', marker='o', ls='-', lw=1, label='Observed')
        plt.plot(time,y_fit,'k', marker='+', ls='--', ms=5, mew=2, label='Bayesian Fit Values')
        plt.legend()
        pymc.Matplot.plot(MDL)      
    if debug:
        for i in np.arange(10):
            MDL.sample(iter=sampling, burn=burn, thin=thin, verbose=0)
            pymc.gelman_rubin(MDL)
            pymc.Matplot.summary_plot(MDL)
    return MDL #usage: MDL.one_x_offset.value for fitted result
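A hypothetical driver for the function above (all input values are illustrative; `one_pulse` and the `pd.srlatch_rev` helper come from the surrounding module):

MDL = fit_two_mcmc(t_axis, v_trace,  # hypothetical time/voltage arrays
                   height_th=0.1, one_pulse=one_pulse, sigma0=0.01,
                   sum_mu=1.0, sum_tau=100.0, sum_a=0.5, sum_b=1.5,
                   diff_tau=100.0, diff_a=-0.5, diff_b=0.5,
                   sampling=50000, burn=10000, thin=10, auto=True)
print(MDL.one_x_offset.value)  # fitted arrival time of the first pulse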
Example #7
def get_Bayes(measurements=[], chunksize=5, Ndp=5, iter=50000, burn=5000):

    sc = pymc.Uniform('sc', 0.1, 2.0, value=0.24)
    tau = pymc.Uniform('tau', 0.0, 1.0, value=0.5)

    concinit = 1.0
    conclo = 0.1
    conchi = 10.0
    concentration = pymc.Uniform('concentration',
                                 lower=conclo,
                                 upper=conchi,
                                 value=concinit)

    # The stick-breaking construction: requires Ndp beta draws dependent on the
    # concentration, before the probability mass function is actually constructed.
    #betas = pymc.Beta('betas', alpha=1, beta=concentration, size=Ndp)
    betas = pymc.Beta('betas', alpha=1, beta=1, size=Ndp - 1)

    @pymc.deterministic
    def pmf(betas=betas):
        "Construct a probability mass function for the truncated Dirichlet process"
        # prod = lambda x: np.exp(np.sum(np.log(x))) # Slow but more accurate(?)
        prod = np.prod
        value = [u * prod(1.0 - betas[:i]) for i, u in enumerate(betas)]
        value.append(1.0 - sum(value))  # force the weights to sum to 1
        return value

    # The cluster assignments: each data point's estimated cluster ID.
    # Remove idinit to allow clusterid to be randomly initialized:
    Ndata = len(measurements)
    idinit = np.zeros(Ndata, dtype=np.int64)
    clusterid = pymc.Categorical('clusterid', p=pmf, size=Ndata, value=idinit)

    @pymc.deterministic(name='clustermean')
    def clustermean(clusterid=clusterid, sc=sc, Ndp=Ndp):
        return sc * np.arange(1, Ndp + 1)[clusterid]

    @pymc.deterministic(name='clusterprec')
    def clusterprec(clusterid=clusterid, sc=sc, tau=tau, Ndp=Ndp):
        return 1.0 / (sc * sc * tau * tau * (np.arange(1, Ndp + 1)[clusterid]))

    y = pymc.Normal('y',
                    mu=clustermean,
                    tau=clusterprec,
                    observed=True,
                    value=measurements)

    ## for predictive posterior simulation
    @pymc.deterministic(name='y_sim')
    def y_sim(value=[0], sc=sc, tau=tau, clusterid=clusterid, Ndp=Ndp):
        n = np.arange(1, Ndp + 1)[np.random.choice(clusterid)]
        return np.random.normal(loc=sc * n, scale=sc * tau * n)

    m = pymc.Model({
        "scale": sc,
        "tau": tau,
        "betas": betas,
        "clusterid": clusterid,
        "normal": y,
        "pred": y_sim
    })

    sc_samples = []
    modes = []
    simulations = []

    for i in range(0, chunksize):
        mc = pymc.MCMC(m)
        mc.sample(iter=iter, burn=burn)
        plot(mc)

        sc_sample = mc.trace('sc')[:]
        sc_samples.append(sc_sample)

        simulation = mc.trace('y_sim')[:]
        simulations.append(simulation)

        plt.hist(measurements,
                 50,
                 fc='gray',
                 histtype='stepfilled',
                 alpha=0.3,
                 normed=False)
        plt.hist(simulation,
                 30,
                 fc='blue',
                 histtype='stepfilled',
                 alpha=0.3,
                 normed=True)
        hist, edges = np.histogram(
            measurements,
            bins=100,
            range=[np.min(measurements) - 0.25,
                   np.max(measurements) + 0.25])

        argm = hist.argmax()
        modes.append((edges[argm] + edges[argm + 1]) / 2)

    if chunksize <= 1:
        gr = np.nan
    else:
        gr = pymc.gelman_rubin(sc_samples)

    dic = {
        'gelman_rubin': gr,
        'modes': modes,
        'simulations': simulations,
        'sc_samples': sc_samples
    }
    return dic
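A pure-NumPy illustration of the stick-breaking weights built by `pmf` (a sketch; `betas` would normally be Beta draws rather than fixed numbers):

import numpy as np

betas = np.array([0.5, 0.5, 0.5, 0.5])  # Ndp - 1 draws
value = [u * np.prod(1.0 - betas[:i]) for i, u in enumerate(betas)]
value.append(1.0 - sum(value))          # remaining mass goes to the last atom
print(value, sum(value))                # [0.5, 0.25, 0.125, 0.0625, 0.0625] 1.0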
Example #8
    def sample(self,
               dbname,
               n_runs=1,
               iter=10000,
               burn=1000,
               thin=10,
               gelman_rubin=False,
               progress_bar=False,
               **kwargs):
        """
        Sample from the posteriors using MCMC.

        Implementation note:
        This method sets the attribute `mcmc`.

        Parameters
        ----------
        dbname : str
            Path to the file the pickled MCMC
            object is written to. If the path exists,
            the existing database is updated.
        n_runs : int, optional (default: 1)
            The number of times MCMC is run.
            Must be > 1 if the Gelman-Rubin
            statistic is used.
        iter : int, optional (default: 10000)
            The number of iterations per MCMC run.
        burn : int, optional (default: 1000)
            The number of samples discarded from
            the beginning of a parameter's trace.
        thin : int, optional (default: 10)
            Only every `thin`-th sample is kept,
            to reduce auto-correlation.
        gelman_rubin : bool, optional (default: False)
            If True, compute the Gelman-Rubin statistic
            for each sampled parameter and print to stdout.
            n_runs must be >1.
        progress_bar : bool, optional (default: False)
            If True, show progress bar while MCMC
            samples.
        **kwargs
            Additional keyword arguments passed
            to PyMC's sample call.
        """
        if self.model is None:
            error_msg = ("Model doesn't exist in sampling stage. "
                         "Please create the model before sampling.")
            sys.exit(error_msg)

        db = "pickle"
        if os.path.isfile(dbname):
            db = pymc.database.pickle.load(dbname)

        # init MCMC sampling object
        self.mcmc = pymc.MCMC(self.model, db=db, dbname=dbname)

        # sample using MCMC
        for _ in range(n_runs):
            self.mcmc.sample(iter=iter,
                             burn=burn,
                             thin=thin,
                             progress_bar=progress_bar,
                             **kwargs)

        # close the database file
        self.mcmc.db.close()

        # compute Gelman-Rubin statistic
        if gelman_rubin:
            if n_runs < 2:
                print("The Gelman-Rubin statistic requires",
                      "multiple MCMC runs.",
                      file=sys.stderr)
            else:
                print("Gelman-Rubin statistics:")
                gr = pymc.gelman_rubin(self.mcmc)
                for param in self.parameters:
                    print(f"\t{param} : {gr[param]}")
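A hedged usage sketch (the owning class is not shown; `estimator` stands in for an instance with `model` and `parameters` set up):

estimator.sample('posterior.pickle',
                 n_runs=3,           # > 1 so the Gelman-Rubin check is meaningful
                 iter=20000,
                 burn=2000,
                 thin=10,
                 gelman_rubin=True)  # prints R-hat for each parameter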
Example #9
    mcmc = pymc.MCMC(make_poisson(5, 1., 20.), db='pickle')
    # Run 3 chains:
    for i in range(3):
        mcmc.sample(iter=10000, burn=5000, thin=1)
    print()  # handle the missing newline from the progress bar

    # Generate a dict of Geweke test z scores for each RV, here using early
    # segments 10% of the chain length, a final segment 50% of the length,
    # and producing scores for 10 early intervals.
    scores = pymc.geweke(mcmc, first=0.1, last=0.5, intervals=10)

    # The Matplot functions automatically produce new figures for each plot.
    pymc.Matplot.geweke_plot(scores['rate'], 'rate')
    pymc.Matplot.geweke_plot(scores['mu'], 'mu')

    print('Rhat values:', pymc.gelman_rubin(mcmc))

    # Plot credible regions and R values:
    pymc.Matplot.summary_plot(mcmc)
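A quick textual pass over the same Geweke scores (a sketch): each dict entry maps a variable name to (interval start, z-score) pairs, so anything outside roughly +/-2 deserves a closer look.

    for name, pairs in scores.items():
        if any(abs(z) > 2 for _, z in pairs):
            print('%s may not have converged' % name)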


def make_on_off(n_off, expo_off, n_on, expo_on, mean0):
    """
    Make a PyMC model for inferring a Poisson signal rate parameter, `s`, for
    'on-off' observations with uncertain background rate, `b`.

    Parameters
    ----------

    n_off, n_on : int
        Event counts off-source and on-source
Example #10
def plot_summary(
    sol,
    save=False,
    draw=True,
    save_as_png=False,
    dpi=None,
    ignore=subplots_to_ignore,
    fig_nb="",
):
    """
    Plots a parameter summary and 
    Gelman-Rubin R-hat for multiple chains
    """

    ext = 'png' if save_as_png else 'pdf'
    ch_nb = sol.mcmc["nb_chain"]

    keys = sorted([k for k in sol.var_dict.keys() if k not in ignore])
    trac = [[sol.var_dict[x].trace(chain=n).mean(axis=0) for x in keys]
            for n in range(ch_nb)]
    deps = [var_depth(sol.var_dict[x]) for x in keys]
    lbls = list(
        reversed(
            flatten([[k + '%s' % (x + 1) for x in range(d)] if d > 1 else k
                     for k, d in zip(keys, deps)])))

    if ch_nb >= 2:
        rhat = [
            gelman_rubin([
                sol.MDL.trace(var, -x)[:] for x in range(sol.mcmc['nb_chain'])
            ]) for var in keys
        ]
        R = np.array(flatten(rhat))
        R[R > 5] = 5
    else:
        print(
            "\nTwo or more chains of equal length required for Gelman-Rubin convergence"
        )
        R = len(lbls) * [None]

    fig, axes = plt.subplots(figsize=(6, 4))
    gs2 = gridspec.GridSpec(3, 3)
    ax1 = plt.subplot(gs2[:, :-1])
    ax2 = plt.subplot(gs2[:, -1], sharey=ax1)
    for i in range(len(lbls)):
        for c in range(ch_nb):
            val_m = np.array(flatten(trac[c]))
            ax1.scatter(val_m[i],
                        len(val_m) - (i + 1),
                        color="C0",
                        marker=".",
                        s=50,
                        facecolor='k',
                        edgecolors='k',
                        alpha=1)
        ax2.scatter(R[i], i, color="C3", marker="<", s=50, alpha=1)

    ax1.set_ylim([-1, len(lbls)])
    ax1.set_yticks(list(range(0, len(lbls))))
    ax1.set_yticklabels([parlbl_dic[l] for l in lbls])
    ax1.set_axisbelow(True)
    ax1.yaxis.grid(True)
    ax1.xaxis.grid(False)
    ax1.set_xlim(ax1.get_xlim())
    ax1.set_xlabel(r'Parameter value')

    plt.setp(ax2.get_yticklabels(), visible=False)
    ax2.set_xlim([0.5, 5.5])
    ax2.set_xticklabels(["", "1", "2", "3", "4", "5+"])
    ax2.set_xticks([
        0.5,
        1,
        2,
        3,
        4,
        5,
    ])
    ax2.set_axisbelow(True)
    ax2.yaxis.grid(True)
    ax2.xaxis.grid(False)
    ax2.set_xlabel(r'$\hat{R}$')
    ax2.axvline(1, ls='--', color='C0', zorder=0)

    plt.tight_layout()
    plt.close(fig)

    if save:
        fn = '%sSUM-%s-%s.%s' % (fig_nb, sol.model_type_str, sol.filename, ext)
        save_figure(fig, subfolder='Summaries', fname=fn, dpi=dpi)

    if draw: return fig
    else: return None
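Both this example and Example #12 lean on a module-level `flatten` helper that is not shown (as do `var_depth` and `parlbl_dic`). A plausible recursive definition, purely for reference (hypothetical):

def flatten(items):
    # recursively flatten nested lists/tuples/arrays into a flat list
    out = []
    for item in items:
        if isinstance(item, (list, tuple, np.ndarray)):
            out.extend(flatten(item))
        else:
            out.append(item)
    return out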
Example #11
# 
# It is simple to run multiple chains sequentially in DisMod-MR, although I worry that this gives a false sense of security about the convergence.

# <codecell>

# setup a model and run the chain once

dm = new_model(data)
dm.setup_model('p', rate_model='neg_binom')
%time dm.fit(how='mcmc', iter=2000, burn=1000, thin=1)

# <codecell>

# to run it more times, use the sample method of the dm.mcmc object
# use the same iter/burn/thin settings for future convenience

for i in range(4):
    dm.mcmc.sample(iter=2000, burn=1000, thin=1)

# <codecell>

# calculate Gelman-Rubin statistic for all model variables
R_hat = pm.gelman_rubin(dm.mcmc)

# examine for gamma_p_100
R_hat['gamma_p_100']
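
# <codecell>

# a quick convergence check (a sketch, assuming numpy is imported as np):
# flag any variable whose R-hat exceeds the conventional 1.1 threshold
# (R_hat values may be scalars or arrays)
bad = {k: v for k, v in R_hat.items() if np.atleast_1d(v).max() > 1.1}
bad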

# <codecell>


Example #12
def plot_summary(sol, save=False, draw=True, save_as_png=False, dpi=None,
                 ignore=default_ignore,
                 fig_nb="",
                 ):
    """
    Plots a parameter summary and 
    Gelman-Rubin R-hat for multiple chains
    """
    
    ext = 'png' if save_as_png else 'pdf'
    ch_nb = sol.mcmc["nb_chain"]

    keys = sorted([k for k in sol.var_dict.keys() if k not in ignore])        
    trac = [[sol.var_dict[x].trace(chain=n).mean(axis=0) for x in keys] for n in range(ch_nb)]
    deps = [var_depth(sol.var_dict[x]) for x in keys]
    lbls = list(reversed(flatten([[k+'%s'%(x+1) for x in range(d)] if d > 1 else k for k, d in zip(keys,deps)])))
    
    if ch_nb >= 2:
        rhat = [gelman_rubin([sol.MDL.trace(var, -x)[:] for x in range(sol.mcmc['nb_chain'])]) for var in keys]
        R = np.array(flatten(rhat))
        R[R > 5] = 5 
    else:
        print("\nTwo or more chains of equal length required for Gelman-Rubin convergence")
        R = len(lbls)*[None]
        
    fig, axes = plt.subplots(figsize=(6,4))
    gs2 = gridspec.GridSpec(3, 3)
    ax1 = plt.subplot(gs2[:, :-1])
    ax2 = plt.subplot(gs2[:, -1], sharey = ax1)
    for i in range(len(lbls)):
        for c in range(ch_nb):
            val_m = np.array(flatten(trac[c]))
            ax1.scatter(val_m[i], len(val_m)-(i+1) , color="C0", marker=".", 
                        s=50, facecolor='k', edgecolors='k',alpha=1)
        ax2.scatter(R[i], i, color="C3", marker="<", s=50, alpha=1)

    ax1.set_ylim([-1, len(lbls)])
    ax1.set_yticks(list(range(0,len(lbls))))
    ax1.set_yticklabels([parlbl_dic[l] for l in lbls])
    ax1.set_axisbelow(True)
    ax1.yaxis.grid(True)
    ax1.xaxis.grid(False)
    ax1.set_xlim(ax1.get_xlim())
    ax1.set_xlabel(r'Parameter value')

    plt.setp(ax2.get_yticklabels(), visible=False)
    ax2.set_xlim([0.5, 5.5])
    ax2.set_xticklabels(["","1","2","3","4","5+"])
    ax2.set_xticks([0.5, 1, 2, 3, 4, 5, ])
    ax2.set_axisbelow(True)
    ax2.yaxis.grid(True)
    ax2.xaxis.grid(False)
    ax2.set_xlabel(r'$\hat{R}$')
    ax2.axvline(1, ls='--', color='C0', zorder=0)

    plt.tight_layout()
    plt.close(fig)        

    if save: 
        fn = '%sSUM-%s-%s.%s'%(fig_nb,sol.model_type_str,sol.filename,ext)
        save_figure(fig, subfolder='Summaries', fname=fn, dpi=dpi)

    if draw:    return fig
    else:       return None
Example #13
def plot_summary(sol, save=False, save_as_png=True, fig_dpi=144):
    if save_as_png:
        save_as = 'png'
    else:
        save_as = 'pdf'
    MDL, ch_n = sol.MDL, sol.mcmc["nb_chain"]
    model = get_model_type(sol)
    filename = sol.filename.replace("\\", "/").split("/")[-1].split(".")[0]
    keys = sorted([x.__name__ for x in MDL.deterministics]) + sorted(
        [x.__name__ for x in MDL.stochastics])
    for name in ("zmod", "log_m_i", "log_tau_i", "cond"):
        if name in keys:
            keys.remove(name)
    for (i, k) in enumerate(keys):
        vect = old_div((MDL.trace(k)[:].size), (len(MDL.trace(k)[:])))
        if vect > 1:
            keys[i] = [k + "%d" % n for n in range(1, vect + 1)]
    keys = list(reversed(sorted(flatten(keys))))
    try:
        r_hat = gelman_rubin(MDL)
    except Exception:
        r_hat = None
        print(
            "\nTwo or more chains of equal length required for Gelman-Rubin convergence"
        )
    fig, axes = plt.subplots(figsize=(6, 4))
    gs2 = gridspec.GridSpec(3, 3)
    ax1 = plt.subplot(gs2[:, :-1])
    ax2 = plt.subplot(gs2[:, -1], sharey=ax1)
    ax2.set_xlabel("R-hat")
    ax2.plot([1, 1], [-1, len(keys)], "--", color="C7", zorder=0)
    for (i, k) in enumerate(keys):
        test = k[-1] not in ["%d" % d for d in range(1, 8)] or k == "R0"
        for c in range(ch_n):
            if test:
                imp = None
                val_m = MDL.stats(k[:imp], chain=c)[k[:imp]]['mean']
                hpd_h = MDL.stats(k[:imp],
                                  chain=c)[k[:imp]]['95% HPD interval'][0]
                hpd_l = MDL.stats(k[:imp],
                                  chain=c)[k[:imp]]['95% HPD interval'][1]
            else:
                imp = -1
                val_m = MDL.stats(k[:imp],
                                  chain=c)[k[:imp]]['mean'][int(k[-1]) - 1]
                hpd_h = MDL.stats(
                    k[:imp],
                    chain=c)[k[:imp]]['95% HPD interval'][0][int(k[-1]) - 1]
                hpd_l = MDL.stats(
                    k[:imp],
                    chain=c)[k[:imp]]['95% HPD interval'][1][int(k[-1]) - 1]
            val = val_m
            err = [[abs(hpd_h - val_m)], [abs(hpd_l - val_m)]]
            o_s = 0 if ch_n % 2 != 0 else 0.5
            ax1.scatter(val,
                        i - (old_div(ch_n, 2)) * (1. / ch_n / 1.4) +
                        (1. / ch_n / 1.4) * (c + o_s),
                        color="C0",
                        marker="o",
                        s=50,
                        edgecolors='C7',
                        alpha=0.7)
            ax1.errorbar(val,
                         i - (old_div(ch_n, 2)) * (1. / ch_n / 1.4) +
                         (1. / ch_n / 1.4) * (c + o_s),
                         xerr=err,
                         color="C7",
                         fmt=" ",
                         zorder=0)
        if ch_n >= 2:
            R = np.array(r_hat[k[:imp]])
            R[R > 3] = 3
            if test:
                ax2.scatter(R, i, color="C1", marker="<", s=50, alpha=0.7)
            else:
                ax2.scatter(R[int(k[-1]) - 1],
                            i,
                            color="C1",
                            marker="<",
                            s=50,
                            alpha=0.7)

    ax1.set_ylim([-1, len(keys)])
    ax1.set_yticks(list(range(0, len(keys))))
    ax1.set_yticklabels(keys)
    plt.setp(ax2.get_yticklabels(), visible=False)
    ax2.set_xlim([0.5, 3.5])
    ax2.set_xticklabels(["", "1", "2", "3+"])
    ax2.set_xticks([0.5, 1, 2, 3])
    ax1.set_xlabel("Parameter values")
    plt.tight_layout()

    if save:
        save_where = '/Figures/Summaries/'
        working_path = getcwd().replace("\\", "/") + "/"
        save_path = working_path + save_where
        print("\nSaving summary figure in:\n", save_path)
        if not path.exists(save_path):
            makedirs(save_path)
        fig.savefig(save_path + 'Summary-%s-%s.%s' %
                    (model, filename, save_as),
                    dpi=fig_dpi,
                    bbox_inches='tight')
    plt.close(fig)

    return fig
Example #14
def main(mcmc_args=None):

    print('Setting up parameters and priors...')

    params = Params()
    # Set up location here with command line arguments in a list.
    params.cmd_line_chg(['--kalbar'])
    assert params.site_name + 'fields.txt' == 'data/kalbarfields.txt'
    # Set parameters specific to Bayesian runs
    params.PLOT = False
    params.OUTPUT = False

    # This sends a message to CalcSol on whether or not to use CUDA
    if params.CUDA:
        globalvars.cuda = True
    else:
        globalvars.cuda = False
    # get wind data and day labels
    wind_data, days = PM.get_wind_data(*params.get_wind_params())
    params.ndays = len(days)

    # reduce domain
    params.domain_info = (10000.0, 400)  #25 m sided cells
    domain_res = params.domain_info[0] / params.domain_info[1]
    cell_area = domain_res**2

    locinfo = LocInfo(params.dataset, params.coord, params.domain_info)

    ######################################################################
    #####                        Model Priors                        #####
    ######################################################################
    lam = pm.Beta("lam", 5, 1, value=0.95)
    f_a1 = pm.TruncatedNormal("f_a1", 6, 0.3, 0, 9, value=6)
    f_a2 = pm.TruncatedNormal("f_a2", 20, 0.3, 15, 24, value=20)
    f_b1_p = pm.Gamma("fb1_p", 2, 1, value=1.5, trace=False,
                      plot=False)  #alpha,beta parameterization

    @pm.deterministic(trace=True, plot=True)
    def f_b1(f_b1_p=f_b1_p):
        return f_b1_p + 1

    f_b2_p = pm.Gamma("fb2_p", 2, 1, value=1.5, trace=False, plot=False)

    @pm.deterministic(trace=True, plot=True)
    def f_b2(f_b2_p=f_b2_p):
        return f_b2_p + 1

    g_aw = pm.Gamma("g_aw", 2.2, 1, value=1.0)
    g_bw = pm.Gamma("g_bw", 5, 1, value=3.8)
    # flight diffusion parameters. note: mean is average over flight advection
    sig_x = pm.Gamma("sig_x", 26, 0.15, value=180)
    sig_y = pm.Gamma("sig_y", 15, 0.15, value=150)
    corr_p = pm.Beta("corr_p", 5, 5, value=0.5, trace=False, plot=False)

    @pm.deterministic(trace=True, plot=True)
    def corr(corr_p=corr_p):
        return corr_p * 2 - 1

    # local spread parameters
    sig_x_l = pm.Gamma("sig_xl", 2, 0.08, value=10)
    sig_y_l = pm.Gamma("sig_yl", 2, 0.14, value=10)
    corr_l_p = pm.Beta("corr_l_p", 5, 5, value=0.5, trace=False, plot=False)

    @pm.deterministic(trace=True, plot=True)
    def corr_l(corr_l_p=corr_l_p):
        return corr_l_p * 2 - 1

    mu_r = pm.Normal("mu_r", 1., 1, value=1)
    n_periods = pm.Poisson("n_periods", 30, value=30)
    #alpha_pow = prev. time exponent in ParasitoidModel.h_flight_prob
    xi = pm.Gamma("xi", 1, 1,
                  value=0.75)  # presence to oviposition/emergence factor
    em_obs_prob = pm.Beta("em_obs_prob", 1, 1, value=0.05)  # per-wasp prob of
    # observing emergence in release field grid given max leaf collection
    # this is dependent on the size of the cell surrounding the grid point
    # ...not much to be done about this.
    grid_obs_prob = pm.Beta("grid_obs_prob", 1, 1,
                            value=0.005)  # probability of
    # observing a wasp present in the grid cell given max leaf sampling

    #card_obs_prob = pm.Beta("card_obs_prob",1,1,value=0.5) # probability of
    # observing a wasp present in the grid cell given max leaf sampling

    #### Data collection model background for sentinel fields ####
    # Need to fix linear units for area. Meters would be best.
    # Effective collection area (constant between fields) is very uncertain
    with warnings.catch_warnings():
        # squelch a pymc-internal warning we don't need to worry about
        warnings.simplefilter("ignore", RuntimeWarning)
        A_collected = pm.TruncatedNormal("A_collected",
                                         2500,
                                         1 / 2500,
                                         0,
                                         min(locinfo.field_sizes.values()) *
                                         cell_area,
                                         value=2500)  # in m**2
    # Each field has its own binomial probability.
    # Probabilities are likely to be small, and pm.Beta cannot handle small
    #   parameter values. So we will use TruncatedNormal again.
    N = len(locinfo.sent_ids)
    sent_obs_probs = np.empty(N, dtype=object)
    # fix beta for the Beta distribution
    sent_beta = 40
    # mean of Beta distribution will be A_collected/field size
    for n, key in enumerate(locinfo.sent_ids):
        sent_obs_probs[n] = pm.Beta(
            "sent_obs_probs_{}".format(key),
            A_collected / (locinfo.field_sizes[key] * cell_area) * sent_beta /
            (1 - A_collected / (locinfo.field_sizes[key] * cell_area)),
            sent_beta,
            value=0.1 * 3600 / (locinfo.field_sizes[key] * cell_area))

    sent_obs_probs = pm.Container(sent_obs_probs)

    # Maximum a posteriori estimates have consistently returned a value near
    #   zero for sprd_factor, so these sections are commented out.
    # if params.dataset == 'kalbar':
    #     # factor for kalbar initial spread
    #     sprd_factor = pm.Uniform("sprd_factor",0,1,value=0.3)
    # else:
    #     sprd_factor = None
    sprd_factor = None

    #### Collect variables and setup block update ####
    params_ary = pm.Container(
        np.array([
            g_aw, g_bw, f_a1, f_b1, f_a2, f_b2, sig_x, sig_y, corr, sig_x_l,
            sig_y_l, corr_l, lam, n_periods, mu_r
        ],
                 dtype=object))
    # The stochastic variables in this list (and the stochastics behind the
    #   deterministic ones) should be block updated in order to avoid the large
    #   computational expense of evaluating the model multiple times for each
    #   MCMC iteration. To do this, starting step variances must be defined
    #   for each variable. This is done via a scaling dict.
    stoc_vars = [
        g_aw, g_bw, f_a1, f_b1_p, f_a2, f_b2_p, sig_x, sig_y, corr_p, sig_x_l,
        sig_y_l, corr_l_p, lam, n_periods, mu_r
    ]
    step_scales = {
        g_aw: 0.04,
        g_bw: 0.08,
        f_a1: 0.25,
        f_b1_p: 0.05,
        f_a2: 0.25,
        f_b2_p: 0.05,
        sig_x: 2,
        sig_y: 2,
        corr_p: 0.0005,
        sig_x_l: 2,
        sig_y_l: 2,
        corr_l_p: 0.0005,
        lam: 0.0005,
        n_periods: 1,
        mu_r: 0.005
    }

    print('Getting initial model values...')

    ######################################################################
    #####                          Run Model                         #####
    ######################################################################
    @pm.deterministic(plot=False, trace=False)
    def pop_model(params=params,
                  params_ary=params_ary,
                  locinfo=locinfo,
                  wind_data=wind_data,
                  days=days,
                  sprd_factor=sprd_factor):
        '''This function acts as an interface between PyMC and the model.
        Not only does it run the model, but it provides an emergence potential
        based on the population model result projected forward from feasible
        oviposition dates. To modify how this projection happens, edit
        popdensity_to_emergence. Returned values from this function should be
        nearly ready to compare to data.
        '''
        modeltic = time.time()
        ### Alter params with stochastic variables ###

        # g wind function parameters
        params.g_params = tuple(params_ary[0:2])
        # f time of day function parameters
        params.f_params = tuple(params_ary[2:6])
        # Diffusion coefficients
        params.Dparams = tuple(params_ary[6:9])
        params.Dlparams = tuple(params_ary[9:12])
        # Probability of any flight during the day under ideal circumstances
        params.lam = params_ary[12]

        # TRY BOTH SCALINGS - VARYING mu_r and n_periods
        # scaling flight advection to wind advection
        # number of time periods (based on interp_num) in one flight
        params.n_periods = params_ary[
            13]  # if interp_num = 30, this is # of minutes
        params.mu_r = params_ary[14]

        ### PHASE ONE ###
        # First, get spread probability for each day as a coo sparse matrix
        max_shape = np.array([0, 0])
        pm_args = [(days[0], wind_data, *params.get_model_params(),
                    params.r_start)]
        pm_args.extend([(day, wind_data, *params.get_model_params())
                        for day in days[1:params.ndays]])

        ##### Kalbar wind started recording a day late. Spread the population
        #####   locally before running full model.
        if sprd_factor is not None:
            res = params.domain_info[0] / params.domain_info[1]
            mean_drift = np.array([-25., 15.])
            xdrift_int = int(mean_drift[0] // res)
            xdrift_r = mean_drift[0] % res
            ydrift_int = int(mean_drift[1] // res)
            ydrift_r = mean_drift[1] % res
            longsprd = PM.get_mvn_cdf_values(
                res, np.array([xdrift_r, ydrift_r]),
                PM.Dmat(params_ary[6], params_ary[7], params_ary[8]))
            shrtsprd = PM.get_mvn_cdf_values(
                res, np.array([0., 0.]),
                PM.Dmat(params_ary[9], params_ary[10], params_ary[11]))

            mlen = int(
                max(longsprd.shape[0], shrtsprd.shape[0]) +
                max(abs(xdrift_int), abs(ydrift_int)) * 2)
            sprd = np.zeros((mlen, mlen))
            lbds = [
                int(mlen // 2 - longsprd.shape[0] // 2),
                int(mlen // 2 + longsprd.shape[0] // 2 + 1)
            ]
            sprd[lbds[0] - ydrift_int:lbds[1] - ydrift_int, lbds[0] +
                 xdrift_int:lbds[1] + xdrift_int] = longsprd * sprd_factor
            sbds = [
                int(mlen // 2 - shrtsprd.shape[0] // 2),
                int(mlen // 2 + shrtsprd.shape[0] // 2 + 1)
            ]
            sprd[sbds[0]:sbds[1],
                 sbds[0]:sbds[1]] += shrtsprd * (1 - sprd_factor)

            sprd[int(sprd.shape[0] // 2),
                 int(sprd.shape[0] // 2)] += max(0, 1 - sprd.sum())
            pmf_list = [sparse.coo_matrix(sprd)]
        else:
            pmf_list = []

        ###################### Get pmf_list from multiprocessing
        pmf_list.extend(pool.starmap(PM.prob_mass, pm_args))

        for pmf in pmf_list:
            for dim in range(2):
                if pmf.shape[dim] > max_shape[dim]:
                    max_shape[dim] = pmf.shape[dim]

        r_spread = []  # holds the one-day spread for each release day.

        # Reshape the prob. mass function of each release day into solution form
        for ii in range(params.r_dur):
            offset = params.domain_info[1] - pmf_list[ii].shape[0] // 2
            dom_len = params.domain_info[1] * 2 + 1
            r_spread.append(
                sparse.coo_matrix(
                    (pmf_list[ii].data,
                     (pmf_list[ii].row + offset, pmf_list[ii].col + offset)),
                    shape=(dom_len, dom_len)).tocsr())

        ### PHASE TWO ###
        # Pass the probability list, pmf_list, and other info to convolution solver.
        #   This will return the finished population model.
        with Capturing() as output:
            if sprd_factor is not None:
                # extend day count by one
                days_ext = [days[0] - 1]
                days_ext.extend(days)
                modelsol = get_populations(r_spread, pmf_list, days_ext,
                                           params.ndays + 1, dom_len,
                                           max_shape, params.r_dur,
                                           params.r_number, params.r_mthd())
                # remove the first one and start where wind started.
                modelsol = modelsol[1:]
            else:
                modelsol = get_populations(r_spread, pmf_list, days,
                                           params.ndays, dom_len, max_shape,
                                           params.r_dur, params.r_number,
                                           params.r_mthd())

        # modelsol now holds the model results for this run as CSR sparse arrays

        # get emergence potential (measured in expected number of wasps previously
        #   present whose oviposition would result in emergence on the given date)
        #   from the model result
        release_emerg, sentinel_emerg = popdensity_to_emergence(
            modelsol, locinfo)

        # get the expected wasp populations at grid points on sample days
        grid_counts = popdensity_grid(modelsol, locinfo)

        # get the expected wasp populations in cardinal directions
        '''card_counts = popdensity_card(modelsol,locinfo,params.domain_info)'''

        ## For the lists release_emerg and sentinel_emerg:
        ##    Each list entry corresponds to a data collection day (one array)
        ##    In each array:
        ##    Each column corresponds to an emergence observation day (as in data)
        ##    Each row corresponds to a grid point or sentinel field, respectively
        ## For the array grid_counts:
        ##    Each column corresponds to an observation day
        ##    Each row corresponds to a grid point
        ## For the list card_counts:
        ##    Each list entry corresponds to a sampling day (one array)
        ##    Each column corresponds to a step in a cardinal direction
        ##    Each row corresponds to a cardinal direction
        # print('{:03.1f} sec./model at {}'.format(time.time() - modeltic,
        #     time.strftime("%H:%M:%S %d/%m/%Y")),end='\r')
        # sys.stdout.flush()
        return (release_emerg, sentinel_emerg, grid_counts)  #,card_counts)

    print('Parsing model output and connecting to Bayesian model...')

    ######################################################################
    #####                   Connect Model to Data                    #####
    ######################################################################

    ### Parse the results of pop_model into separate deterministic variables ###
    '''Get Poisson probabilities for sentinel field emergence. Parameters:
        xi is constant, emerg is a list of ndarrays, betas is a 1D array of
        field probabilities'''
    Ncollections = len(locinfo.sent_DataFrames)
    sent_poi_rates = []
    for ii in range(Ncollections):
        s_ndays = len(locinfo.sent_DataFrames[ii]['datePR'].unique())
        sent_poi_rates.append(
            pm.Lambda('sent_poi_rate_{}'.format(ii),
                      lambda xi=xi, ndays=s_ndays, betas=sent_obs_probs,
                      emerg_model=pop_model[1][ii]: xi * emerg_model * np.tile(
                          betas, (ndays, 1)).T,
                      trace=False))
    sent_poi_rates = pm.Container(sent_poi_rates)
    '''Return Poisson probabilities for release field grid emergence. Parameters:
        xi is constant, emerg is a list of ndarrays. collection effort is
        specified in locinfo.'''
    Ncollections = len(locinfo.release_DataFrames)
    rel_poi_rates = []
    for ii in range(Ncollections):
        r_effort = locinfo.release_collection[ii]  #fraction of max collection
        r_ndays = len(locinfo.release_DataFrames[ii]['datePR'].unique())
        rel_poi_rates.append(
            pm.Lambda('rel_poi_rate_{}'.format(ii),
                      lambda xi=xi, ndays=r_ndays, r_effort=r_effort, beta=
                      em_obs_prob, emerg_model=pop_model[0][ii]: xi *
                      emerg_model * np.tile(r_effort * beta, (ndays, 1)).T,
                      trace=False))
    rel_poi_rates = pm.Container(rel_poi_rates)

    @pm.deterministic(plot=False, trace=False)
    def grid_poi_rates(locinfo=locinfo,
                       beta=grid_obs_prob,
                       obs_model=pop_model[2]):
        '''Return Poisson probabilities for grid sampling
        obs_model is an ndarray, sampling effort is specified in locinfo.'''
        return beta * locinfo.grid_samples * obs_model

    '''Return Poisson probabilities for cardinal direction sampling
        obs_model is a list of ndarrays, sampling effort is assumed constant'''
    '''
    card_poi_rates = []
    for ii,obs in enumerate(pop_model[3]):
        card_poi_rates.append(pm.Lambda('card_poi_rate_{}'.format(ii),
            lambda beta=card_obs_prob, obs=obs: beta*obs))
    card_poi_rates = pm.Container(card_poi_rates)
    '''

    # Given the expected wasp densities from pop_model, actual wasp densities
    #   are modeled as a thinned Poisson random variable about that mean.
    # Each wasp in the area then has a small probability of being seen.

    ### Connect sentinel emergence data to model ###
    N_sent_collections = len(locinfo.sent_DataFrames)
    # Create list of collection variables
    sent_collections = []
    for ii in range(N_sent_collections):
        # Apparently, pymc does not play well with 2D array parameters
        sent_collections.append(
            np.empty(sent_poi_rates[ii].value.shape, dtype=object))
        for n in range(sent_collections[ii].shape[0]):
            for m in range(sent_collections[ii].shape[1]):
                sent_collections[ii][n, m] = pm.Poisson(
                    "sent_em_obs_{}_{}_{}".format(ii, n, m),
                    sent_poi_rates[ii][n, m],
                    value=float(locinfo.sentinel_emerg[ii][n, m]),
                    observed=True)
    sent_collections = pm.Container(sent_collections)

    ### Connect release-field emergence data to model ###
    N_release_collections = len(locinfo.release_DataFrames)
    # Create list of collection variables
    rel_collections = []
    for ii in range(N_release_collections):
        rel_collections.append(
            np.empty(rel_poi_rates[ii].value.shape, dtype=object))
        for n in range(rel_collections[ii].shape[0]):
            for m in range(rel_collections[ii].shape[1]):
                rel_collections[ii][n, m] = pm.Poisson(
                    "rel_em_obs_{}_{}_{}".format(ii, n, m),
                    rel_poi_rates[ii][n, m],
                    value=float(locinfo.release_emerg[ii][n, m]),
                    observed=True)
    rel_collections = pm.Container(rel_collections)

    ### Connect grid sampling data to model ###
    grid_obs = np.empty(grid_poi_rates.value.shape, dtype=object)
    for n in range(grid_obs.shape[0]):
        for m in range(grid_obs.shape[1]):
            grid_obs[n, m] = pm.Poisson("grid_obs_{}_{}".format(n, m),
                                        grid_poi_rates[n, m],
                                        value=float(locinfo.grid_obs[n, m]),
                                        observed=True)
    grid_obs = pm.Container(grid_obs)

    ### Connect cardinal direction data to model ###
    '''
    N_card_collections = len(locinfo.card_obs_DataFrames)
    # Create list of sampling variables
    card_collections = []
    for ii in range(N_card_collections):
        card_collections.append(np.empty(card_poi_rates[ii].value.shape,
                                         dtype=object))
        for n in range(card_collections[ii].shape[0]):
            for m in range(card_collections[ii].shape[1]):
                card_collections[ii][n,m] = pm.Poisson(
                    "card_obs_{}_{}_{}".format(ii,n,m),
                    card_poi_rates[ii][n,m],
                    value=locinfo.card_obs[ii][n,m],
                    observed=True, plot=False)
    card_collections = pm.Container(card_collections)
    '''

    ######################################################################
    #####                   Collect Model and Run                    #####
    ######################################################################

    ### Collect model ###
    if sprd_factor is not None:
        Bayes_model = pm.Model([
            lam, f_a1, f_a2, f_b1_p, f_b2_p, f_b1, f_b2, g_aw, g_bw, sig_x,
            sig_y, corr_p, corr, sig_x_l, sig_y_l, corr_l_p, corr_l, n_periods,
            mu_r, sprd_factor, grid_obs_prob, xi, em_obs_prob, A_collected,
            sent_obs_probs, params_ary, pop_model, grid_poi_rates,
            rel_poi_rates, sent_poi_rates, grid_obs, rel_collections,
            sent_collections
        ])
    else:
        Bayes_model = pm.Model([
            lam, f_a1, f_a2, f_b1_p, f_b2_p, f_b1, f_b2, g_aw, g_bw, sig_x,
            sig_y, corr_p, corr, sig_x_l, sig_y_l, corr_l_p, corr_l, n_periods,
            mu_r, grid_obs_prob, xi, em_obs_prob, A_collected, sent_obs_probs,
            params_ary, pop_model, grid_poi_rates, rel_poi_rates,
            sent_poi_rates, grid_obs, rel_collections, sent_collections
        ])

    ### Run if parameters were passed in ###
    if mcmc_args is not None:
        if len(mcmc_args) == 3:
            # New run
            nsamples = int(mcmc_args[0])
            burn = int(mcmc_args[1])
            fname = mcmc_args[2]
            if fname[-3:] != '.h5':
                fname += '.h5'
            mcmc = pm.MCMC(Bayes_model,
                           db='hdf5',
                           dbname=fname,
                           dbmode='a',
                           dbcomplevel=0)
            mcmc.use_step_method(pm.AdaptiveMetropolis,
                                 stoc_vars,
                                 scales=step_scales,
                                 interval=500,
                                 shrink_if_necessary=True)
            try:
                tic = time.time()
                print('Sampling...')
                mcmc.sample(nsamples, burn)
                # sampling finished. commit to database and continue
                print('Sampling finished.')
                print('Time elapsed: {}'.format(time.time() - tic))
                print('Saving...')
                #mcmc.save_state()
                mcmc.commit()
                print('Closing...')
                mcmc.db.close()
            except:
                print('Exception: database closing...')
                mcmc.db.close()
                raise
            return
        elif len(mcmc_args) == 2:
            # Resume run
            fname = mcmc_args[0]
            nsamples = int(mcmc_args[1])
            fname = fname.strip()
            if fname[-3:] != '.h5':
                fname += '.h5'
            if os.path.isfile(fname):
                db = pm.database.hdf5.load(fname)
                mcmc = pm.MCMC(Bayes_model, db=db)
                mcmc.use_step_method(pm.AdaptiveMetropolis,
                                     stoc_vars,
                                     scales=step_scales,
                                     interval=500,
                                     shrink_if_necessary=True)
                # database loaded.
            else:
                print('File not found: {}'.format(fname))
                return
            try:
                tic = time.time()
                print('Sampling...')
                mcmc.sample(nsamples)
                # sampling finished. commit to database and continue
                print('Sampling finished.')
                print('Time elapsed: {}'.format(time.time() - tic))
                print('Saving...')
                #mcmc.save_state()
                mcmc.commit()
                print('Closing...')
                mcmc.db.close()
            except:
                print('Exception: database closing...')
                mcmc.db.close()
                raise
            return

    ######################################################################
    #####                   Start Interactive Menu                   #####
    ######################################################################
    print('--------------- MCMC MAIN MENU ---------------')
    print(" 'new': Start a new MCMC chain from the beginning.")
    print("'cont': Continue a previous MCMC chain from an hdf5 file.")
    #print("'plot': Plot traces/distribution from an hdf5 file.")
    print("'quit': Quit.")
    cmd = input('Enter: ')
    cmd = cmd.strip().lower()
    if cmd == 'new':
        print('\n\n')
        print('--------------- New MCMC Chain ---------------')
        while True:
            val = input("Enter number of realizations or 'quit' to quit:")
            val = val.strip()
            if val == 'q' or val == 'quit':
                return
            else:
                try:
                    nsamples = int(val)
                    val2 = input("Enter number of realizations to discard:")
                    val2 = val2.strip()
                    if val2 == 'q' or val2 == 'quit':
                        return
                    else:
                        burn = int(val2)
                    fname = input(
                        "Enter filename to save or 'back' to cancel:")
                    fname = fname.strip()
                    if fname == 'q' or fname == 'quit':
                        return
                    elif fname == 'b' or fname == 'back':
                        continue
                    else:
                        fname = fname + '.h5'
                        break  # BREAK LOOP AND RUN MCMC WITH GIVEN VALUES
                except ValueError:
                    print('Unrecognized input.')
                    continue
        ##### RUN FIRST MCMC HERE #####
        mcmc = pm.MCMC(Bayes_model,
                       db='hdf5',
                       dbname=fname,
                       dbmode='a',
                       dbcomplevel=0)
        mcmc.use_step_method(pm.AdaptiveMetropolis,
                             stoc_vars,
                             scales=step_scales,
                             interval=500,
                             shrink_if_necessary=True)
        try:
            tic = time.time()
            print('Sampling...')
            mcmc.sample(nsamples, burn)
            # sampling finished. commit to database and continue
            print('Sampling finished.')
            print('Time elapsed: {}'.format(time.time() - tic))
            print('Saving...')
            #mcmc.save_state()
            mcmc.commit()
        except:
            print('Exception: database closing...')
            mcmc.db.close()
            raise

    elif cmd == 'cont':
        # Load db and continue
        print('\n')
        while True:
            fname = input("Enter path to database to load, or 'q' to quit:")
            fname = fname.strip()
            if fname.lower() == 'q' or fname.lower() == 'quit':
                return
            else:
                if fname[-3:] != '.h5':
                    fname += '.h5'
                if os.path.isfile(fname):
                    db = pm.database.hdf5.load(fname)
                    mcmc = pm.MCMC(Bayes_model, db=db)
                    mcmc.use_step_method(pm.AdaptiveMetropolis,
                                         stoc_vars,
                                         scales=step_scales,
                                         interval=500,
                                         shrink_if_necessary=True)
                    break  # database loaded
                else:
                    print('File not found.')
                    #continue

    elif cmd == 'plot':
        # Get filename and pass to plotting routine.
        pass
        # return
    elif cmd == 'quit' or cmd == 'q':
        return
    else:
        print('Command not recognized.')
        print('Quitting....')
        return

    ##### MCMC Loop #####
    # This should be reached only by cmd == 'new' or 'cont' with a database.
    # It resumes sampling of a previously sampled chain.
    print('\n')
    while True:
        print('--------------- MCMC ---------------')
        print(" 'report': generate report on traces")
        print("'inspect': launch IPython to inspect state")
        print("    'run': conduct further sampling")
        print("   'quit': Quit")
        cmd = input('Enter: ')
        cmd = cmd.strip()
        cmd = cmd.lower()
        if cmd == 'inspect':
            try:
                import IPython
                IPython.embed()
            except ImportError:
                print('IPython not found.')
            except:
                print('Exception: database closing...')
                mcmc.db.close()
                raise
        elif cmd == 'run':
            val = input("Enter number of realizations or 'back':")
            val = val.strip()
            if val == 'back' or val == 'b':
                continue
            else:
                try:
                    nsamples = int(val)
                except ValueError:
                    print('Unrecognized input.')
                    continue
            # Run chain
            try:
                tic = time.time()
                print('Sampling...')
                mcmc.sample(nsamples)
                # sampling finished. commit to database and continue
                print('Sampling finished.')
                print('Time elapsed: {}'.format(time.time() - tic))
                print('Saving...')
                #mcmc.save_state()
                mcmc.commit()
            except:
                print('Exception: database closing...')
                mcmc.db.close()
                raise
        elif cmd == 'report':
            try:
                import Bayes_Plot
                Bayes_Plot.plot_traces(db=db)
                print('Gelman-Rubin statistics')
                gr = pm.gelman_rubin(mcmc)
                print(gr)
                with open('./diagnostics/gelman-rubin.txt', 'w') as f:
                    f.write('Variable        R_hat\n')
                    f.write('---------------------\n')
                    for key, val in gr.items():
                        f.write(key + ': {}\n'.format(val))
            except:
                print('Exception: database closing...')
                mcmc.db.close()
                raise
        elif cmd == 'quit' or cmd == 'q':
            mcmc.db.close()
            print('Database closed.')
            break
        else:
            print('Command not recognized.')
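The reusable pattern in this example is the block update: a single AdaptiveMetropolis step method registered over all free stochastics, with per-variable starting proposal scales, so the expensive pop_model deterministic is evaluated once per joint proposal instead of once per variable. A distilled sketch of that wiring on a toy model, assuming pymc 2.x:

import pymc as pm

x = pm.Normal('x', 0, 1)
y = pm.Normal('y', 0, 1)
obs = pm.Normal('obs', mu=x + y, tau=1.0, value=0.5, observed=True)

mcmc = pm.MCMC([x, y, obs])
mcmc.use_step_method(pm.AdaptiveMetropolis, [x, y],
                     scales={x: 0.1, y: 0.1},  # starting proposal variances
                     interval=500,             # adapt the covariance every 500 steps
                     shrink_if_necessary=True)
mcmc.sample(iter=10000, burn=2000)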
Example #15
	beta = pm.Normal('beta', mu=0, sd=20)
	sigma = pm.Uniform('sigma', lower=0)
	y = pm.Normal('y', mu=beta*X + alpha, sd=sigma, observed=Y)
	start = pm.find_MAP()
	step = pm.NUTS(state=start)

with model:
	if(multicore):
		trace = pm.sample(itenum, step, start=start,
					njobs=chainnum, random_seed=range(chainnum), progress_bar=False)
	else:
		ts=[pm.sample(itenum, step, chain=i, progressbar=False) for i in range(chainnum)] 
		trace=merge_traces(ts)
	if(saveimage): 
		pm.traceplot(trace).savefig("simple_linear_trace.png")
	print("Rhat=" + str(pm.gelman_rubin(trace)))
		
t1 = time.clock()
print("elapsed time=" + str(t1 - t0))

#trace
if(not multicore):
	trace=ts[0] 
with model:
	pm.traceplot(trace,model.vars)

pm.forestplot(trace)

import pickle as pkl
with open("simplelinearregression_model.pkl", "wb") as fpw:
	pkl.dump(model, fpw)
Example #16
def bodelike_plot(pbproject=HS_PROJECT,
                  model_id='gpa3',
                  varname='phase',
                  control_genotype='VT37804_TNTin',
                  blocked_genotype='VT37804_TNTE',
                  num_chains=4, takelast=10000,
                  alpha=0.05,
                  plot_control=True, plot_silenced=True, img_format='png',
                  show=False):

    def varnames(result, varname):
        hyperfly, hyperfly_postfix, hyperfly_variables, flies, flies_variables = flies_and_variables(result)
        hvar = varname + hyperfly_postfix if varname in set(hyperfly_variables) else None
        fvars = [varname + '_' + fly for fly in flies] if varname in set(flies_variables) else None
        return hvar, fvars

    def mix_chains(chains):
        # assert len(chains) >= num_chains
        mixed = np.array([np.nan] * (num_chains * takelast))
        for i, chain in enumerate(chains):
            mixed[i * takelast: (i+1) * takelast] = chain[-takelast:]
        return mixed

    # Available results
    results = all_computed_results(pbproject.mcmc_dir)
    results = results[results.model_id == model_id]
    ctraces = {}
    straces = {}
    results.genotype = results.genotype.apply(lambda gen: gen.partition('__')[0])
    # Collect and mix traces for all frequencies
    for (model_id, freq), data in results.groupby(('model_id', 'freq')):
        print('\t\t\tCollecting traces for frequency %g' % freq)
        control = MCMCRunManager(data[data.genotype == control_genotype].iloc[0]['path'])  # ad-hoc
        silenced = MCMCRunManager(data[data.genotype == blocked_genotype].iloc[0]['path'])  # ad-hoc
        chvar, _ = varnames(control, varname)   # control hierarchical var, fly vars
        shvar, _ = varnames(silenced, varname)  # silenced hierarchical var, fly vars
        ctraces[freq] = mix_chains(control.traces(chvar))
        straces[freq] = mix_chains(silenced.traces(shvar))
    # The frequencies we are interested in...
    freqs = (0.5, 1, 2, 4, 8, 16, 32, 40)
    # Compute HPDs. Compute the ROPE too; see Kruschke.
    chpds = [hpd(ctraces[freq], alpha) for freq in freqs]
    shpds = [hpd(straces[freq], alpha) for freq in freqs]
    # Plot the traces
    if plot_control:
        plt.plot(np.hstack([ctraces[freq] for freq in freqs]), color='b', label=control_genotype.replace('_', 'x'))
    if plot_silenced:
        plt.plot(np.hstack([straces[freq] for freq in freqs]), color='r', label=blocked_genotype.replace('_', 'x'))
    # Plot the HPD regions + setup ticks
    xticklocations = []
    xticklabels = []
    for i, freq in enumerate(freqs):
        xmin = num_chains * takelast * i
        xmax = num_chains * takelast * (i + 1)
        plt.axvline(x=xmax, color='k')
        plt.plot((xmin, xmax), [chpds[i][0]] * 2, color='c', linewidth=4)
        plt.plot((xmin, xmax), [chpds[i][1]] * 2, color='c', linewidth=4)
        plt.plot((xmin, xmax), [shpds[i][0]] * 2, color='m', linewidth=4)
        plt.plot((xmin, xmax), [shpds[i][1]] * 2, color='m', linewidth=4)
        # Gelman-Rubin R-hat (Geweke and autocorrelation could also be shown graphically in the plot)
        cgr = gelman_rubin(ctraces[freq].reshape(num_chains, -1))
        print('\t%s %s control freq %.1f; GR=%.2f' % (model_id, varname, freq, cgr))
        sgr = gelman_rubin(straces[freq].reshape(num_chains, -1))
        print('\t%s %s blocked freq %.1f; GR=%.2f' % (model_id, varname, freq, sgr))
        # xticks
        xticklocations.append(xmin + (xmax - xmin) / 2.)
        xticklabels.append('%g\nbgr=%.2f\ncgr=%.2f' % (freq, sgr, cgr))
    plt.title('Model: %s; Variable: %s' % (model_id, varname))
    plt.xlabel(r'$\omega$')
    plt.ylabel('%s' % varname)
    plt.tick_params(axis='x',           # changes apply to the x-axis
                    which='both',       # both major and minor ticks are affected
                    top='off',          # ticks along the top edge are off
                    bottom='on',        # ticks along the bottom edge are on
                    labelbottom='on')   # labels along the bottom edge are on
    plt.xticks(xticklocations, xticklabels)
    plt.legend()
    plt.tight_layout()
    # Save
    dest_dir = op.join(pbproject.plots_dir, 'bbode', model_id, '%s-vs-%s' % (control_genotype, blocked_genotype))
    ensure_dir(dest_dir)
    plt.savefig(op.join(dest_dir, '%s-vs-%s-%s-%s.%s' % (control_genotype,
                                                         blocked_genotype,
                                                         model_id,
                                                         varname,
                                                         img_format)))
    # Show
    if show:
        plt.show()
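Example #16 feeds `gelman_rubin` a (num_chains, n) array directly. For reference, a self-contained NumPy sketch of the classic potential scale reduction factor (the textbook formula, not necessarily PyMC's exact implementation):

import numpy as np

def rhat(chains):
    """chains: shape (m, n) -- m chains of n samples each."""
    m, n = chains.shape
    chain_means = chains.mean(axis=1)
    B = n * chain_means.var(ddof=1)        # between-chain variance
    W = chains.var(axis=1, ddof=1).mean()  # mean within-chain variance
    var_hat = (n - 1) / n * W + B / n      # pooled variance estimate
    return np.sqrt(var_hat / W)

print(rhat(np.random.randn(4, 10000)))    # ~1.0 for well-mixed chains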