Example #1
def plotPosteriorCr(mdl, trc, rawdata, xlims, npoints=1000):
    #Plot the posterior predictions from model given traces

    #Extract traces - samples that were collected
    trc_mu = pm.trace_to_dataframe(trc)[['Intercept', 'x']]
    trc_sd = pm.trace_to_dataframe(trc)['sd']

    #Recreate the likelihood
    x = np.linspace(xlims[0], xlims[1], npoints).reshape((npoints, 1))
    # Design matrix [1, x]: intercept column plus slope column
    # (parentheses matter here, since `**` binds tighter than `*`)
    X = x ** (np.ones((npoints, 2)) * np.arange(2))
    like_mu = np.dot(X,trc_mu.T)
    like_sd = np.tile(trc_sd.T,(npoints,1))
    like = np.random.normal(like_mu, like_sd)

    #Calculate credible regions and plot over the datapoints
    dfp = pd.DataFrame(np.percentile(like, [2.5, 25, 50, 75, 97.5], axis=1).T
                       , columns=['025', '250', '500', '750', '975'])
    dfp['x'] = x

    pal = sns.color_palette('Purples')
    f, ax1d = plt.subplots(1, 1, figsize=(7, 7))
    ax1d.fill_between(dfp['x'], dfp['025'], dfp['975'], alpha=0.5
                      , color=pal[1], label='CR 95%')
    ax1d.fill_between(dfp['x'], dfp['250'], dfp['750'], alpha=0.4
                      , color=pal[4], label='CR 50%')
    ax1d.plot(dfp['x'], dfp['500'], alpha=0.5, color=pal[5], label='Median')
    _ = plt.legend()
    _ = ax1d.set_xlim(xlims)
    _ = sns.regplot(x='x', y='y', data=rawdata, fit_reg=False
                    , scatter_kws={'alpha': 0.8, 's': 80, 'lw': 2, 'edgecolor': 'w'}, ax=ax1d)
Example #2
def run_sampling(pm_model,
                 output_dir,
                 ncores=1,
                 nchains=2,
                 max_attempts=2,
                 filename="trace"):
    # Log file output
    logging.basicConfig(
        filename=output_dir + "/sampling.log",
        filemode="w",
        format="%(name)s - %(levelname)s - %(message)s",
    )

    # Sample the model
    divperc = 20

    with pm_model:
        # Run initial chain
        try:
            trace = pm.sample(
                tune=1000,
                draws=4000,
                cores=ncores,
                chains=nchains,
                step=xo.get_dense_nuts_step(),
            )

        except pm.exceptions.SamplingError:
            logging.error("Sampling failed, model misspecified")
            return None

        # Check for divergences, restart sampling if necessary
        divergent = trace["diverging"]
        divperc = divergent.nonzero()[0].size / len(trace) * 100

        n_attempts = 1
        while divperc > 15.0 and n_attempts <= max_attempts:
            # Run sampling
            trace = pm.sample(
                tune=2000,
                draws=n_attempts * 10000,
                cores=ncores,
                chains=nchains,
                step=xo.get_dense_nuts_step(target_accept=0.9),
            )

            # Re-check the divergence rate before deciding whether to retry
            divergent = trace["diverging"]
            divperc = divergent.nonzero()[0].size / len(trace) * 100
            n_attempts += 1

        if divperc > 15:
            logging.warning(f"{divperc} of samples are diverging.")
            df = pm.trace_to_dataframe(trace, include_transformed=True)
            df.to_csv(output_dir + f"/{filename}.csv")
            return None

        else:
            df = pm.trace_to_dataframe(trace, include_transformed=True)
            df.to_csv(output_dir + f"/{filename}.csv")

    return trace
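For orientation, a hypothetical call; the toy model is a stand-in, and the exoplanet package (imported as xo) plus a writable output directory are assumed, as in the snippet above.

import os
import pymc3 as pm

os.makedirs("results", exist_ok=True)

with pm.Model() as toy_model:
    mu = pm.Normal("mu", 0.0, 1.0)
    pm.Normal("obs", mu, 1.0, observed=[0.1, -0.3, 0.2])

# run_sampling enters the model context itself, so the model is passed in directly
trace = run_sampling(toy_model, output_dir="results", ncores=1, nchains=2)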
Example #3
def trace_to_dataframe(trace, model=None, log_post=False):
    """
    Convert a PyMC3 trace to a Pandas DataFrame

    Parameters
    ----------
    trace : PyMC3 trace
        Trace returned from pm.sample()
    model : PyMC3 model, default None
        Model returned from pm.Model()
    log_post : bool, default False
        If True, also compute the log posterior.

    Returns
    -------
    output : Pandas DataFrame
        DataFrame with samples and various sampling statistics.
    """
    df = pm.trace_to_dataframe(trace, chains=[0])
    for stat in trace.stat_names:
        if stat in df.columns:
            warnings.warn('`' + stat + '` is in the variable names.'
                          ' Not adding this statistic.')
        else:
            df[stat] = trace.get_sampler_stats(stat, chains=[0])
    if 'chain' in df.columns:
        warnings.warn('`chain` is in the variable names.'
                      ' Not adding this statistic.')
    else:
        df['chain'] = np.array([0] * len(df), dtype=int)

    if trace.nchains > 1:
        for chain in trace.chains[1:]:
            df_app = pm.trace_to_dataframe(trace, chains=[chain])
            for stat in trace.stat_names:
                if stat not in df_app.columns:
                    df_app[stat] = trace.get_sampler_stats(stat,
                                                           chains=[chain])
            if 'chain' not in df_app.columns:
                df_app['chain'] = np.array([chain] * len(df_app))

            df = df.append(df_app, ignore_index=True)

    if log_post:
        # Extract the model from context if necessary
        model = pm.modelcontext(model)

        df['log_likelihood'] = _log_like_trace(trace, model).sum(axis=1)
        df['log_prior'] = _log_prior_trace(trace, model).sum(axis=1)
        df['log_posterior'] = df['log_likelihood'] + df['log_prior']

    return df
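A short usage sketch, assuming a NUTS-sampled toy model; the exact sampler-stat column names depend on the PyMC3 version.

import numpy as np
import pymc3 as pm

with pm.Model():
    mu = pm.Normal('mu', 0.0, 1.0)
    pm.Normal('obs', mu, 1.0, observed=np.random.randn(20))
    trace = pm.sample(500, tune=500, chains=2)

df = trace_to_dataframe(trace)
print(df.groupby('chain')['mu'].describe())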
Example #4
def test_regional_model(surveys_data):
    # 15% regions more than 5% difference
    # Mean diff by region is 3%
    # Median diff is 2%
    data = surveys_data
    data['net'] = 0
    data.loc[data.response >= 6, 'net'] = 1
    data.loc[data.response < 5, 'net'] = -1

    exp = data.groupby('Region').mean()

    result = run_regional_model(data)
    reg_list = result['heads'][0]['values']
    exp_by_reg = exp.loc[reg_list].net
    nets = compute_net(pm.trace_to_dataframe(result['trace']))

    net_r = {reg_list[i]: nets[str(i)] for i in range(len(reg_list))}

    recs = list()
    for reg, vals in net_r.items():
        recs.append((reg, (vals < exp.loc[reg, 'net']).sum()/len(vals)))

    ptile = pd.DataFrame.from_records(recs, columns=['region', 'ptile'])

    assert np.sum((ptile.ptile < 0.95) & (ptile.ptile > 0.05))/len(ptile) > 0.9
Example #5
def test_pymc3_gets_reasonable_results():

    # Load data.
    df0 = pd.read_csv("../data/gaia_mc5_velocities.csv")
    m = df0.radial_velocity.values != 0
    m &= np.isfinite(df0.basic_vx.values)
    m &= np.isfinite(df0.basic_vy.values)
    m &= np.isfinite(df0.basic_vz.values)
    df1 = df0.iloc[m]
    df = df1.iloc[0]

    pos = [df["ra"], df["dec"], df["parallax"]]
    pos_err = [df["ra_error"], df["dec_error"], df["parallax_error"]]
    proper = [df["pmra"], df["pmdec"]]
    proper_err = [df["pmra_error"], df["pmdec_error"]]

    # Now move star so it's near the Galactic pole.
    c = SkyCoord('12h51.4m', '+27.13',  unit=(u.hourangle, u.deg),
                 frame='icrs')
    pos[0] = c.ra.value
    pos[1] = c.dec.value

    mu, cov = av.get_prior()
    trace = av.run_pymc3_model(pos, pos_err, proper, proper_err, mu, cov)
    samples = pm.trace_to_dataframe(trace)
Example #6
    def test_deterministic_of_observed(self):
        meas_in_1 = pm.theanof.floatX(2 + 4 * np.random.randn(100))
        meas_in_2 = pm.theanof.floatX(5 + 4 * np.random.randn(100))
        with pm.Model() as model:
            mu_in_1 = pm.Normal("mu_in_1", 0, 1)
            sigma_in_1 = pm.HalfNormal("sd_in_1", 1)
            mu_in_2 = pm.Normal("mu_in_2", 0, 1)
            sigma_in_2 = pm.HalfNormal("sd__in_2", 1)

            in_1 = pm.Normal("in_1", mu_in_1, sigma_in_1, observed=meas_in_1)
            in_2 = pm.Normal("in_2", mu_in_2, sigma_in_2, observed=meas_in_2)
            out_diff = in_1 + in_2
            pm.Deterministic("out", out_diff)

            trace = pm.sample(100)
            ppc_trace = pm.trace_to_dataframe(
                trace, varnames=[n for n in trace.varnames if n != "out"]
            ).to_dict("records")
            ppc = pm.sample_posterior_predictive(
                model=model,
                trace=ppc_trace,
                samples=len(ppc_trace),
                vars=(model.deterministics + model.basic_RVs),
            )

            rtol = 1e-5 if theano.config.floatX == "float64" else 1e-3
            assert np.allclose(ppc["in_1"] + ppc["in_2"], ppc["out"], rtol=rtol)
Example #7
def sample_from_posterior(approx, varnames, n_subs, responses, last):
    #sample from posterior
    nsample = 1000
    trace = approx.sample(nsample, include_transformed = True)
    
    sample = trace_to_dataframe(trace, include_transformed=True, 
                              varnames=varnames)
        
    #get posterior samples of the predicted response value in the post-reversal phase
    gs = np.zeros((10000, -last, n_subs, 2))  # `last` is a negative index marking the start of the post-reversal phase
    for i in range(10):
        trace = approx.sample(nsample, include_transformed = False)
        gs[i*1000:(i+1)*1000] = trace['G'][:,last:]
    
    post_like = np.exp(gs-gs.max(axis=-1)[:,:,:,None])
    post_like /= post_like.sum(axis = -1)[:,:,:,None]
    
    #measured responses in the post-reversal phase
    res = responses[None,:]
    
    #compute observation likelihood for each posterior sample
    post_ol = post_like[:,:,:,0]**(1-res)*post_like[:,:,:,1]**res
    
    #get posterior predictive log model evidence
    pplme = np.log(post_ol.prod(axis = 1).mean(axis = 0))
    
    #get per-subject mean probability of selecting stimulus 2
    plike2 = post_like[:,:,:,1].mean(axis = 0)
    
    return sample, pplme, plike2 
Example #8
def likelihood_datatrace_mp(sp, traces, index):
    strace_mp = traces._straces[index]
    strace_mp.chain = 0
    trace_mp = pm.backends.base.MultiTrace([strace_mp])
    datatraces = pm.trace_to_dataframe(trace_mp, hide_transformed_vars=False)
    likelihood_datatrace(sp, datatraces, trace_mp)
    return datatraces
Example #9
    def test_deterministic_of_observed(self):
        meas_in_1 = pm.theanof.floatX(2 + 4 * np.random.randn(100))
        meas_in_2 = pm.theanof.floatX(5 + 4 * np.random.randn(100))
        with pm.Model() as model:
            mu_in_1 = pm.Normal('mu_in_1', 0, 1)
            sigma_in_1 = pm.HalfNormal('sd_in_1', 1)
            mu_in_2 = pm.Normal('mu_in_2', 0, 1)
            sigma_in_2 = pm.HalfNormal('sd__in_2', 1)

            in_1 = pm.Normal('in_1', mu_in_1, sigma_in_1, observed=meas_in_1)
            in_2 = pm.Normal('in_2', mu_in_2, sigma_in_2, observed=meas_in_2)
            out_diff = in_1 + in_2
            pm.Deterministic('out', out_diff)

            trace = pm.sample(100)
            ppc_trace = pm.trace_to_dataframe(
                trace,
                varnames=[n for n in trace.varnames
                          if n != 'out']
            ).to_dict('records')
            ppc = pm.sample_posterior_predictive(model=model,
                                                 trace=ppc_trace,
                                                 samples=len(ppc_trace),
                                                 vars=(model.deterministics +
                                                       model.basic_RVs))

            rtol = 1e-5 if theano.config.floatX == 'float64' else 1e-3
            assert np.allclose(ppc['in_1'] + ppc['in_2'],
                               ppc['out'],
                               rtol=rtol)
Example #10
    def trace_to_samples(self, trace, data, names=None):
        """
        Create a ``JokerSamples`` instance from a pymc3 trace object.

        Parameters
        ----------
        trace : `~pymc3.backends.base.MultiTrace`
        """
        import pymc3 as pm
        import exoplanet.units as xu

        df = pm.trace_to_dataframe(trace)

        data, *_ = validate_prepare_data(data, self.prior.poly_trend,
                                         self.prior.n_offsets)

        samples = JokerSamples(poly_trend=self.prior.poly_trend,
                               n_offsets=self.prior.n_offsets,
                               t0=data.t0)

        if names is None:
            names = self.prior.par_names

        for name in names:
            par = self.prior.pars[name]
            unit = getattr(par, xu.UNIT_ATTR_NAME)
            samples[name] = df[name].values * unit

        return samples
Example #11
    def test_deterministic_of_observed_modified_interface(self):
        meas_in_1 = pm.theanof.floatX(2 + 4 * np.random.randn(100))
        meas_in_2 = pm.theanof.floatX(5 + 4 * np.random.randn(100))
        with pm.Model() as model:
            mu_in_1 = pm.Normal("mu_in_1", 0, 1)
            sigma_in_1 = pm.HalfNormal("sd_in_1", 1)
            mu_in_2 = pm.Normal("mu_in_2", 0, 1)
            sigma_in_2 = pm.HalfNormal("sd__in_2", 1)

            in_1 = pm.Normal("in_1", mu_in_1, sigma_in_1, observed=meas_in_1)
            in_2 = pm.Normal("in_2", mu_in_2, sigma_in_2, observed=meas_in_2)
            out_diff = in_1 + in_2
            pm.Deterministic("out", out_diff)

            trace = pm.sample(100)
            ppc_trace = pm.trace_to_dataframe(
                trace, varnames=[n for n in trace.varnames
                                 if n != "out"]).to_dict("records")
            ppc = pm.sample_posterior_predictive(
                model=model,
                trace=ppc_trace,
                samples=len(ppc_trace),
                var_names=[
                    x.name for x in (model.deterministics + model.basic_RVs)
                ],
            )

            rtol = 1e-5 if theano.config.floatX == "float64" else 1e-3
            assert np.allclose(ppc["in_1"] + ppc["in_2"],
                               ppc["out"],
                               rtol=rtol)
Example #12
def single_ucb_model_all(X,
                         fmax,
                         steepness_alpha=1.,
                         steepness_beta=1.,
                         x_midpoint_prec=4.,
                         yscale_alpha=1.,
                         yscale_beta=.5,
                         temperature_alpha=1.,
                         temperature_beta=1.,
                         samples=200):

    df_list = []
    actions = X[0].argmax(axis=1)
    for i in range(X.shape[3]):
        trace = single_ucb_model(X[:, :, :, i], steepness_alpha,
                                 steepness_beta, x_midpoint_prec, yscale_alpha,
                                 yscale_beta, temperature_alpha,
                                 temperature_beta, samples)
        df = pm.trace_to_dataframe(trace)
        df['participant'] = i
        df_list.append(df)
    all_dfs = pd.concat(df_list)
    all_dfs['actions'] = all_dfs['participant'].apply(lambda x: actions[:, x])
    all_dfs['fmax'] = all_dfs['participant'].apply(lambda x: fmax[x])
    return all_dfs
Example #13
 def plot_corner(self, *args, **kwargs):
     if not self.hasmcmc:
         raise Exception("Must first run mcmc by calling mcmc()\
         or compute(mcmc=True) with mcmc=True")
     samples = pm.trace_to_dataframe(self.trace,
                                     varnames=[
                                         "mix", "logdeltaQ", "logQ0",
                                         "logperiod", "logamp", "logs2"
                                     ])
     columns = self.build_mcmc_summary()
     corn = corner.corner(samples, *args, **kwargs)
     for i in range(len(columns)):
         plt.annotate(columns[i],
                      xy=(0.38 + 0.08 * i, 0.7),
                      xycoords="figure fraction",
                      fontsize=12)
     columns = self.build_det_summary()
     for i in range(len(columns)):
         plt.annotate(columns[i],
                      xy=(0.55 + 0.08 * i, 0.6),
                      xycoords="figure fraction",
                      fontsize=12)
     plt.annotate("EPIC {0}".format(self.ident),
                  xy=(0.4, 0.95),
                  xycoords="figure fraction",
                  fontsize=30)
     return corn
Example #14
def parametric_Neiswanger(traces):
    '''Approximate each subposterior as a Gaussian; then construct the combined
    posterior density (also approximated as a Gaussian).
    
    Args:
        traces: list of traces, each produced by a different machine for a different batch of the data.
    
    Returns:
        means_total, cov_total: Means and covariance matrix of the approximated combined posterior.
    '''

    means = []
    covs = []

    for trace in traces:
        df = pm.trace_to_dataframe(trace)
        means.append(np.mean(df, axis=0))
        covs.append(np.cov(df, rowvar=0))

    cov_total = np.linalg.inv(
        np.sum(np.array([np.linalg.inv(cov) for cov in covs]), axis=0))
    #return means, covs, cov_total
    means_total = np.dot(
        np.sum(np.array([
            np.dot(np.linalg.inv(cov), np.array(mean))
            for cov, mean in zip(covs, means)
        ]),
               axis=0), cov_total)

    return means_total, cov_total
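An illustrative two-batch run; this assumes a PyMC3 toy model with two free parameters (so each subposterior covariance is an invertible matrix), and it glosses over the fractionated prior a full Neiswanger-style split would use.

import numpy as np
import pymc3 as pm

full_data = np.random.normal(1.0, 2.0, size=400)
traces = []
for batch in np.array_split(full_data, 2):
    with pm.Model():
        mu = pm.Normal('mu', 0.0, 10.0)
        sd = pm.HalfNormal('sd', 5.0)
        pm.Normal('obs', mu, sd, observed=batch)
        traces.append(pm.sample(1000, tune=1000, chains=1))

means_total, cov_total = parametric_Neiswanger(traces)
print(means_total, cov_total)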
Example #15
def main(output_trace_path, X_path, main_cities, output_path):
    # loading data
    with open(output_trace_path, 'rb') as buff:
        data = pickle.load(buff)
        hierarchical_model, hierarchical_trace, scaler, degree_index, \
        response_variable, predictor_variables = data['inference'], data['trace'], data['scaler'], \
                                                 data['city_index_df'], data['response_variable'],\
                                                 data['predictor_variables']

    #list of cities
    degree_index.set_index("CITY", inplace=True)

    # get data of traces
    data = pm.trace_to_dataframe(hierarchical_trace)

    # DO CALCULATION FOR EVERY CLASS IN THE MODEL (CITIES)
    for city in main_cities:
        # get input data and scale if necessary
        X = pd.read_csv(os.path.join(X_path, city + ".csv"))
        prediction, response_variable_real = calc_prediction(
            X, city, data, degree_index, predictor_variables,
            response_variable, scaler)

        # create groups
        df = prediction.groupby("IPCC_SCENARIO")[[response_variable_real
                                                  ]].sum()

        # save results per city
        df.to_csv(os.path.join(output_path, city + ".csv"),
                  index_label="IPCC_SCENARIO")
Example #16
def recoverSubject(data, subject, estimates, n_repeats=1, n_tries=1, overwrite=False, progressbar=True):
    """
    Perform parameter recovery for a single subject.
    1) Predict using GLAM with given estimates
    2) Fit GLAM
    3) Save estimates
    """

    print("Processing subject {}...".format(subject))

    # Subset data
    subject_data = data[data['subject'] == subject].copy()
    n_items = subject_data['n_items'].values[0]
    if n_items == 2:
        subject_data = subject_data.drop(['item_value_2', 'gaze_2'], axis=1)
    subject_data['subject'] = 0

    if (overwrite) or (not os.path.isfile(os.path.join('results', 'parameter_recovery', 'recovered_estimates_{}_ins.csv'.format(subject)))):

        parameters = ['v', 's', 'tau', 'gamma']
        # Set up model, supply it with parameter estimates
        generating = glam.GLAM(subject_data, drift='multiplicative')
        generating.make_model('individual', gamma_bounds=(-10, 1), t0_val=0)
        estimates_dict = {parameter: estimates.loc[parameter + '__0_0', 'MAP'] for parameter in parameters}
        estimates_dict['t0'] = 0
        estimates_dict['subject'] = 0
        generating.estimates = pd.DataFrame(estimates_dict, index=np.zeros(1))
        generating.predict(n_repeats=n_repeats)
        generated = generating.prediction
        recovering = glam.GLAM(generated)
        recovering.make_model('individual', gamma_bounds=(-10, 1), t0_val=0)
        recovering = fitModel(recovering, relevant_parameters=parameters, n_tries_max=n_tries, progressbar=progressbar)
        summary = pm.summary(recovering.trace[0])
        for parameter in parameters:
            summary.loc[parameter + '__0_0', 'MAP'] = recovering.estimates[parameter].values[0]
            summary.loc[parameter + '__0_0', 'generating'] = estimates_dict[parameter]
        summary.to_csv(os.path.join('results', 'parameter_recovery', 'recovered_estimates_{}_multiplicative_ins.csv'.format(subject)))

        pm.trace_to_dataframe(recovering.trace[0]).to_csv(os.path.join('results', 'traces', 'parameter_recovery', 'trace_{}_parameter_recovery_ins.csv'.format(subject)))
        pm.traceplot(recovering.trace[0])
        plt.savefig(os.path.join('results', 'traces', 'parameter_recovery', 'plots', 'traceplot_{}_parameter_recovery_ins.png'.format(subject)))
        plt.close()

    else:
        print("Previous recovery results found (Subject {}). Skipping...".format(subject))

    return
Example #17
def main(trainkey,
         predkey,
         outputkey,
         inference_method='',
         ncores='',
         nchains=1,
         niters='1500',
         redishost='10.42.72.93'):
    panthera = redishost
    conn = redis.StrictRedis(host=panthera, password='******')
    #predkey = 'p-50x50-guerrero-4'
    #trainkey = 't-luca-guerrero-4'
    #outputkey = 'test-model'
    PDF = preparePredictors(loadDataFrameFromRedis(predkey, conn))
    TDF = loadDataFrameFromRedis(trainkey, conn)

    formula = 'LUCA ~ Longitude + Latitude + Q("Dist.to.road_m") + Population_m + name'

    TM, PM = splitByFormula(formula, TDF, PDF['clean'])
    logger.info("Start modelling inference")

    model = ModelSamplingEffort(TM, PM)
    trace = SampleModel(model,
                        inference_method=inference_method,
                        ncores=ncores,
                        nchains=nchains,
                        niters=niters)
    logger.info("Saving trace")
    try:
        pm.save_trace(
            trace,
            directory='/storage/users/escamill/presence-only-model/output/rawtrace',
            overwrite=True)
    except Exception:
        logger.error("not possible to save trace")
    tracedf = pm.trace_to_dataframe(trace)
    tracedf.to_csv(
        '/storage/users/escamill/presence-only-model/output/trace%s.csv' %
        outputkey,
        encoding='utf8')

    try:
        pred_sample = SamplePredictions(model, TM, PM, trace)
        # pred_sample is a dictionary of predictions
        pred_sample.to_csv(
            '/storage/users/escamill/presence-only-model/output/pred_cond-%s.csv' %
            outputkey,
            encoding='utf8')
        with open(
                '/storage/users/escamill/presence-only-model/output/pred%s.pickle' %
                outputkey, 'wb') as f:
            pickle.dump(pred_sample, f)
    except Exception:
        logger.error("something went wrong")
    #conn.set(outputkey+'-df',pickle.dumps(tracedf))
    #conn.set(outputkey+'-trace',pickle.dumps(pred_sample))
    logger.info("Finished!")
Example #18
def cornerplot(lc, trace, catalog, **kwargs):
    truths = pm.summary(trace)['mean']
    samples = pm.trace_to_dataframe(trace)
    cornerplot = corner.corner(samples, truths=truths, **kwargs)
    pl.annotate("{0} {1}".format(catalog, lc.id),
                xy=(0.4, 0.95),
                xycoords="figure fraction",
                fontsize=30)
    return cornerplot
Example #19
 def write_summary_line(self):
     if not self.hasmcmc:
         raise Exception("Must first run mcmc by calling mcmc()\
         or compute(mcmc=True) with mcmc=True")
     samples = pm.trace_to_dataframe(self.trace,
                                     varnames=[
                                         "mix", "logdeltaQ", "logQ0",
                                         "logperiod", "logamp", "logs2"
                                     ])
Example #20
def coin_flip_route():
    draws = max(1, min(10000, int(request.args.get('draws', 500))))
    alpha = max(0.01, float(request.args.get('alpha', 1)))
    beta = max(0.01, float(request.args.get('beta', 1)))
    tails = max(0, float(request.args.get('tails', 100)))
    heads = max(0, float(request.args.get('heads', 100)))
    with flip_coins(alpha, beta, tails, heads):
        trace = pm.sample(draws)
    return jsonify(pm.trace_to_dataframe(trace).to_dict(orient='records'))
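The flip_coins helper is not shown; one plausible (hypothetical) reading of its parameters is a Beta prior on the heads probability with a Binomial likelihood, which would make the route return one record per posterior draw.

import pymc3 as pm

def flip_coins(alpha, beta, tails, heads):
    # Hypothetical helper: Beta(alpha, beta) prior on p, Binomial likelihood for the observed heads
    model = pm.Model()
    with model:
        p = pm.Beta('p', alpha=alpha, beta=beta)
        pm.Binomial('obs', n=int(heads + tails), p=p, observed=int(heads))
    return model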
Example #21
def gevtr(calibration_data):

    #Preparing input data
    serie_lista = list(calibration_data)
    #    ano_lista=calibration_data.index.year.tolist()
    ano_lista = calibration_data.index.tolist()
    ano_resetado = pd.Series(list(range(len(calibration_data))))

    #First regression

    slope, intercept, r_value, pvalue_regress, std_err = ss.linregress(
        ano_resetado, calibration_data.values)
    numerator = ano_resetado.map(lambda x: x * slope)

    #Second regression and calculation of detrended series

    dists = []
    for i in range(len(serie_lista)):
        dists.append(
            abs(serie_lista[i] - (slope * ano_resetado[i] + intercept)))
    slope2, intercept2, r_value2, pvalue_regress2, std_err2 = ss.linregress(
        ano_resetado, dists)
    data_reset_index = pd.Series(calibration_data.values)
    denominator = ano_resetado.map(lambda x: x * slope2 + intercept2)
    detrended_series = (data_reset_index - numerator) / denominator

    locm = detrended_series.mean()
    locs = detrended_series.std() / (np.sqrt(len(calibration_data)))
    scalem = detrended_series.std()
    scales = detrended_series.std() / (np.sqrt(2 *
                                               (len(calibration_data) - 1)))
    with pm.Model() as model:
        # Priors for unknown model parameters
        c = pm.Beta(
            'c', alpha=6, beta=9
        )  # c = x - 0.5: the shift in gev_logp is required because the Beta domain is (0, 1)
        loc = pm.Normal('loc', mu=locm, sd=locs)
        scale = pm.Normal('scale', mu=scalem, sd=scales)

        # Likelihood (sampling distribution) of observations. GEV is not implemented in PyMC3, so a custom log-likelihood is defined
        def gev_logp(value):
            scaled = (value - loc) / scale
            logp = -(tt.log(scale) + (((c - 0.5) + 1) / (c - 0.5) * tt.log1p(
                (c - 0.5) * scaled) + (1 + (c - 0.5) * scaled)**(-1 /
                                                                 (c - 0.5))))
            bound1 = loc - scale / (c - 0.5)
            bounds = tt.switch((c - 0.5) > 0, value > bound1, value < bound1)
            return bound(logp, bounds, c != 0)

        gev = pm.DensityDist('gev', gev_logp, observed=detrended_series)
        trace = pm.sample(5000, chains=3, cores=1, progressbar=True)
    pm.traceplot(trace)
    # geweke_plot=pm.geweke(trace, 0.05, 0.5, 20)
    # gelman_and_rubin=pm.diagnostics.gelman_rubin(trace)
    posterior = pm.trace_to_dataframe(trace)
    summary = pm.summary(trace)
    return posterior, summary
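The snippet relies on several names that are not imported above; under PyMC3 3.x they would presumably be:

import numpy as np
import pandas as pd
import scipy.stats as ss
import theano.tensor as tt
import pymc3 as pm
from pymc3.distributions.dist_math import bound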
Example #22
def trace_to_dataframe(trace,
                       model=None,
                       varnames=None,
                       include_transformed=False,
                       log_post=False):
    """
    Convert a PyMC3 trace to a Pandas DataFrame

    To add: Compute logp for each point using model.logp().
    """
    df = pm.trace_to_dataframe(trace, chains=[0])
    for stat in trace.stat_names:
        if stat in df.columns:
            warnings.warn('`' + stat + '` is in the variable names.'
                          ' Not adding this statistic.')
        else:
            df[stat] = trace.get_sampler_stats(stat, chains=[0])
    if 'chain' in df.columns:
        warnings.warn('`chain` is in the variable names.'
                      ' Not adding this statistic.')
    else:
        df['chain'] = np.array([0] * len(df), dtype=int)

    if len(trace.chains) == 1:
        return df

    for chain in trace.chains[1:]:
        df_app = pm.trace_to_dataframe(trace, chains=[chain])
        for stat in trace.stat_names:
            if stat not in df_app.columns:
                df_app[stat] = trace.get_sampler_stats(stat, chains=[chain])
        if 'chain' not in df_app.columns:
            df_app['chain'] = np.array([chain] * len(df_app))

        df = df.append(df_app, ignore_index=True)

    if log_post:
        # Extract the model from context if necessary
        model = pm.modelcontext(model)

        logp = pymc3.stats._log_post_trace(trace, model).sum(axis=1)
        df['log_posterior'] = logp

    return df
Example #23
def corner(trace, **kwargs):
    """
    Make a corner plot from a trace

    Parameters
    ----------
    trace : `~pymc3.backends.base.MultiTrace`
        Trace from SMC/NUTS
    """
    return dfm_corner(pm.trace_to_dataframe(trace), **kwargs)
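Usage sketch: dfm_corner is assumed to be corner.corner from the corner package, the toy model is illustrative, and `corner` below refers to the wrapper defined above.

import pymc3 as pm

with pm.Model():
    pm.Normal('a', 0.0, 1.0)
    pm.Normal('b', 0.0, 1.0)
    trace = pm.sample(1000, tune=1000)

fig = corner(trace, bins=30)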
Example #24
def excel_posterior(trace, filename):

    #Need to read the data again to set activity number and names
    prj = project_reader(filename)
    WP_NAMES = np.array(prj[1][:, 0])
    WP_NUMBER = prj[1][:, 0].shape[0]

    PV_names = list()
    PVpartial_names = list()
    EV_names = list()
    COMP_names = list()
    SPI_names = list()
    CPI_names = list()
    Index_names = ["SPI_PROJECT", "CPI_PROJECT", "ETC", "EAC", "TEAC"]

    RISK_names = list()
    projectDefinition = prj[1]

    for x in range(WP_NUMBER):
        for y in range(2):
            if (projectDefinition[x][y + 1] != 0):
                rname = projectDefinition[x][0] + "_Risk_%d" % (y + 1)
                RISK_names.append(rname)

    for x in range(WP_NUMBER):
        PV_names.append("PV_%s" % WP_NAMES[x])
        PVpartial_names.append("Partial_PV_%s" % WP_NAMES[x])
        EV_names.append("EV_%s" % WP_NAMES[x])
        COMP_names.append("COMPLETION_%s" % WP_NAMES[x])
        SPI_names.append("SPI_%s" % WP_NAMES[x])
        CPI_names.append("CPI_%s" % WP_NAMES[x])
    all_names = RISK_names + PV_names + PVpartial_names + EV_names + COMP_names + SPI_names + CPI_names + Index_names

    outputName = filename + "Output.xlsx"
    traceName = filename + "Trace.xlsx"
    pm.summary(trace,
               varnames=all_names,
               stat_funcs=[trace_mean, trace_sd,
                           trace_quantiles]).to_excel(outputName,
                                                      sheet_name="Summary")
    pm.plot_posterior(trace, varnames=all_names)
    pm.trace_to_dataframe(trace).to_excel(traceName, sheet_name="Trace")
Example #25
def coef(m):
    """
    Return posterior mean of parameters necessary to compute mu, i.e. the predicted mean

    :param m: fitted Model instance
    :return: pandas series, index = parameter names, values = parameter values
    """
    tdf = pm.trace_to_dataframe(m.trace)
    # filter for 'a', i.e. intercept, or for 'b.*', i.e. coefficients
    # together, specify formula for mean
    return tdf.filter(regex=r'^a$|^b.*', axis=1).mean()
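A sketch of the expected input, assuming any fitted object that exposes a .trace attribute and whose intercept is named 'a' with coefficients named 'b*' (names inferred from the regex above); FittedStub is a hypothetical stand-in.

import numpy as np
import pymc3 as pm

class FittedStub:
    # Hypothetical stand-in for a fitted Model instance
    pass

m = FittedStub()
with pm.Model():
    a = pm.Normal('a', 0.0, 1.0)
    b_x1 = pm.Normal('b_x1', 0.0, 1.0)
    pm.Normal('y', a + b_x1, 1.0, observed=np.random.randn(30))
    m.trace = pm.sample(500, tune=500)

print(coef(m))  # posterior means of 'a' and 'b_x1'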
Example #26
def toy_model(v_samp,
              z_samp,
              logp_prior,
              size=500,
              samples=50,
              steps=1000,
              tune=1000,
              a_true=1.2,
              b_true=-0.5,
              width_true=0.05,
              extratext='_true'):
    ''' The pymc3 linear model z(v) = a*v+b with natural width in log space log_width. The prior contribution to likelihood is an argument to the function.'''
    with pm.Model() as model:
        a = pm.Normal("a", mu=0, sigma=10, testval=a_true)
        b = pm.Normal("b", mu=0, sigma=10, testval=b_true)
        log_width = pm.Normal("log_width",
                              mu=np.log(width_true),
                              sigma=2.0,
                              testval=np.log(width_true))

        mu = a * v_samp + b

        # The line has some width: we're calling it a Gaussian in n
        logp_hyper = -0.5 * (z_samp - mu)**2 * pm.math.exp(
            -2 * log_width) - log_width
        # Here we account for the intermediate prior
        logp = logp_hyper - logp_prior

        # Compute the marginalized likelihood
        max_logp = tt.max(logp, axis=1)
        # max_logp = np.zeros(len(logM_samp))
        marg_logp = max_logp + pm.math.log(
            pm.math.sum(pm.math.exp(logp - max_logp[:, None]), axis=1))
        pm.Potential('marg_logp', marg_logp)

        trace = pm.sample(draws=steps,
                          tune=tune,
                          target_accept=0.9,
                          init='adapt_full',
                          return_inferencedata=False)
        # az.plot_trace(trace)
        print(az.summary(trace, round_to=2))
        print(a_true, b_true, np.log(width_true))
        corner.corner(pm.trace_to_dataframe(trace),
                      truths=[a_true] + [b_true] +
                      [np.log(width_true)])  # Corner plot!
        plt.savefig("PriorToy/Corner_N1000_vfcomplex_prior_samp_mixed%s.png" %
                    (extratext),
                    bbox_inches='tight',
                    dpi=150)
    return
Example #27
def run_model(steps=10000):
    model = pymc.Model()
    with model:
        α = 1 / count_data.mean()
        λ1 = pymc.Exponential("λ1", α)
        λ2 = pymc.Exponential("λ2", α)
        τ = pymc.DiscreteUniform("τ", lower=0.0, upper=len(count_data))
        process_mean = mean(τ, λ1, λ2)
        observation = pymc.Poisson("observation", process_mean, observed=count_data)
        start = {"λ1": 10.0, "λ2": 30.0}
        step1 = pymc.Slice([λ1, λ2])
        step2 = pymc.Metropolis([τ])
        trace = pymc.sample(steps, tune=500, start=start, step=[step1, step2], cores=2)
    return pymc.trace_to_dataframe(trace)
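The mean helper and count_data are not defined in this snippet; a standard switchpoint construction they presumably mirror (hypothetical, with stand-in data) is:

import numpy as np
import pymc3 as pymc  # the snippet above uses `pymc` as its alias

count_data = np.random.poisson(15, size=74)  # stand-in for the real count series

def mean(tau, lam1, lam2):
    # Rate lam1 before the switchpoint tau, lam2 after it
    idx = np.arange(len(count_data))
    return pymc.math.switch(tau > idx, lam1, lam2)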
Example #28
def gev0_shift_2(dataset):
    locm = dataset.mean()
    locs = dataset.std() / (np.sqrt(len(dataset)))
    scalem = dataset.std()
    scales = dataset.std() / (np.sqrt(2 * (len(dataset) - 1)))
    with pm.Model() as model:
        # Priors for unknown model parameters
        c1 = pm.Beta(
            'c1', alpha=6, beta=9
        )  # c = x - 0.5: the shift in gev_logp is required because the Beta domain is (0, 1)
        loc1 = pm.Normal('loc1', mu=locm, sd=locs)
        scale1 = pm.Normal('scale1', mu=scalem, sd=scales)

        c2 = pm.Beta('c2', alpha=6, beta=9)
        loc2 = pm.Normal('loc2', mu=locm, sd=locs)
        scale2 = pm.Normal('scale2', mu=scalem, sd=scales)

        c3 = pm.Beta('c3', alpha=6, beta=9)
        loc3 = pm.Normal('loc3', mu=locm, sd=locs)
        scale3 = pm.Normal('scale3', mu=scalem, sd=scales)

        def gev_logp(value):
            scaled = (value - loc_) / scale_
            logp = -(tt.log(scale_) +
                     (((c_ - 0.5) + 1) / (c_ - 0.5) * tt.log1p(
                         (c_ - 0.5) * scaled) +
                      (1 + (c_ - 0.5) * scaled)**(-1 / (c_ - 0.5))))
            bound1 = loc_ - scale_ / (c_ - 0.5)
            bounds = tt.switch((c_ - 0.5) > 0, value > bound1, value < bound1)
            return bound(logp, bounds, c_ != 0)

        tau1 = pm.DiscreteUniform("tau1", lower=0, upper=n_count_data - 2)
        tau2 = pm.DiscreteUniform("tau2",
                                  lower=tau1 + 1,
                                  upper=n_count_data - 1)
        idx = np.arange(n_count_data)

        c_ = pm.math.switch(tau2 >= idx, pm.math.switch(tau1 >= idx, c1, c2),
                            c3)
        loc_ = pm.math.switch(tau2 >= idx,
                              pm.math.switch(tau1 >= idx, loc1, loc2), loc3)
        scale_ = pm.math.switch(tau2 >= idx,
                                pm.math.switch(tau1 >= idx, scale1, scale2),
                                scale3)
        gev = pm.DensityDist('gev', gev_logp, observed=dataset)
        trace = pm.sample(2000, chains=1, progressbar=True)
        posterior = pm.trace_to_dataframe(trace)
        summary = pm.summary(trace)
        #        geweke_plot = pm.geweke(trace, 0.05, 0.5, 20)
        return summary, posterior
Example #29
def plot_model_diagnostics(model, save_dir, file_id, export=True):
    """generate and export a range of diagnostic plots for a given model"""

    # ensure folder exists
    if export is True:
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

    model_name = model.__class__.__name__

    trace_df = pm.trace_to_dataframe(model.trace, varnames=model.df_params)

    sns.pairplot(trace_df)
    if export is True:
        plt.savefig(save_dir + f'{model_name}_{file_id}_pairplot.pdf',
                    format='pdf',
                    bbox_inches='tight')
        plt.cla()

    pm.traceplot(model.trace, varnames=model.df_params)
    if export is True:
        plt.savefig(save_dir + f'{model_name}_{file_id}_traceplot.pdf',
                    format='pdf',
                    bbox_inches='tight')
        plt.cla()

    pm.autocorrplot(model.trace, varnames=model.df_params)
    if export is True:
        plt.savefig(save_dir + f'{model_name}_{file_id}_autocorrplot.pdf',
                    format='pdf',
                    bbox_inches='tight')
        plt.cla()

    pm.forestplot(model.trace, varnames=model.df_params)
    if export is True:
        plt.savefig(save_dir + f'{model_name}_{file_id}_forestplot.pdf',
                    format='pdf',
                    bbox_inches='tight')
        plt.cla()

    # close all figs, otherwise we can run out of memory
    plt.close("all")
Example #30
def visualizing_fitting_process(days, trace, visual='yes'):
    varnames = ["period", "t0", "r", "b"]
    if visual == 'yes':
        pm.traceplot(trace, varnames=varnames)
    labels = [
        "period [days]", "transit time [days]", "radius ratio",
        "impact parameter"
    ]
    samples = pm.trace_to_dataframe(trace, varnames=varnames)
    if visual == 'yes':
        corner.corner(samples[["period", "t0", "r__0", "b__0"]], labels=labels)
    # Compute the posterior parameters
    median_radius_ratio = np.median(trace["r"])
    median_impact_parameter = np.median(trace["b"])
    median_period = np.median(trace["period"])
    median_t0 = np.median(trace["t0"])
    median_x_fold = (days - median_t0 +
                     0.5 * median_period) % median_period - 0.5 * median_period
    median_inds = np.argsort(median_x_fold)
    return median_x_fold, median_t0, median_period, median_radius_ratio, median_impact_parameter
Example #31
def divergence_plot(nm, ylim=None):

    if nm.hbr.configs['n_chains'] > 1 and nm.hbr.model_type != 'nn':
        a = pm.summary(nm.hbr.trace).round(2)
        plt.figure()
        plt.hist(a['r_hat'], 10)
        plt.title('Gelman-Rubin diagnostic for divergence')

    divergent = nm.hbr.trace['diverging']

    tracedf = pm.trace_to_dataframe(nm.hbr.trace)

    _, ax = plt.subplots(2, 1, figsize=(15, 4), sharex=True, sharey=True)
    ax[0].plot(tracedf.values[divergent == 0].T, color='k', alpha=.05)
    ax[0].set_title('No Divergences', fontsize=10)
    ax[1].plot(tracedf.values[divergent == 1].T, color='C2', lw=.5, alpha=.5)
    ax[1].set_title('Divergences', fontsize=10)
    plt.ylim(ylim)
    plt.xticks(range(tracedf.shape[1]), list(tracedf.columns))
    plt.xticks(rotation=90, fontsize=7)
    plt.tight_layout()
    plt.show()
Example #32
        trace = sampler.sample(draws=2000, trace=trace, chains=chains, cores=1,
                               progressbar=False)
        tottime += time.time() - strt

        samples = np.array(trace.get_values("P", combine=False))
        samples = np.moveaxis(samples, 0, 1)
        flag, n_eff = check_convergence(samples)
        if flag:
            break

    time_pymc = tottime
    time_ind_pymc = tottime / n_eff
    n_eff_pymc = n_eff

# Save the trace file
df = pm.trace_to_dataframe(trace)
df.to_hdf(os.path.join(dirname, "pymc-trace.h5"), "trace")

# Make the plots
for n, letter in enumerate(string.ascii_lowercase[1:N_pl+1]):
    fig = plt.figure()

    # Get the posterior median orbital parameters
    p = np.median(trace["P"][:, n])
    t0 = np.median(trace["t0"][:, n])

    # Plot the folded data
    x_fold = (x - t0 + 0.5*p) % p - 0.5*p
    plt.errorbar(x_fold, y, yerr=yerr, fmt=".k")

    plt.annotate("period = {0:.4f} +/- {1:.4f} d"
Example #33
	# Metropolis sampling works best!
	tr = pm.sample(tune = 10000, draws = 50000, cores = 4, start = pm.find_MAP(), step = pm.Metropolis())

# Print the Gelman-Rubin rhat convergence statistics to a file
f = open("palatability_regression_convergence.txt", "w")
print(str(pm.gelman_rubin(tr)), file = f)
f.close()

# Save the trace to the output folder as a numpy array, for later reference
# Save every 10th sample from the trace, to avoid any autocorrelation issues
np.save("palatability_regression_trace.npy", tr[::10]["coeff_pal"])

# Convert the trace to a dataframe, and save that too
# Save every 10th sample from the trace, to avoid any autocorrelation issues
tr_df = pm.trace_to_dataframe(tr[::10])
tr_df.to_csv("palatability_regression_trace.csv")

# Plot the results of the palatability regression analysis
# First just plot the mean regression coefficients for every laser condition, across time
fig = plt.figure()
mean_coeff = np.mean(tr[::10]["coeff_pal"], axis = 0)
hpd_coeff = pm.hpd(tr[::10]["coeff_pal"], alpha = 0.05)
for condition in range(unique_lasers[0].shape[0]):
	plt.plot(x[analyze_indices], mean_coeff[:, condition], linewidth = 3.0, label = "Dur:{}ms, Lag:{}ms".format(unique_lasers[0][condition][0], unique_lasers[0][condition][1]))
plt.legend()
plt.xlabel("Time post taste delivery (ms)")
plt.ylabel("Mean posterior regression coefficient")
fig.savefig("palatability_regression_coefficients_mean.png", bbox_inches = "tight")
plt.close("all")
# Now plot the mean and SD of the regression coefficients for every laser condition, across time