def plotPosteriorCr(mdl, trc, rawdata, xlims, npoints=1000):
    # Plot the posterior predictions from model given traces
    # Extract traces - samples that were collected
    trc_mu = pm.trace_to_dataframe(trc)[['Intercept', 'x']]
    trc_sd = pm.trace_to_dataframe(trc)['sd']

    # Recreate the likelihood
    x = np.linspace(xlims[0], xlims[1], npoints).reshape((npoints, 1))
    # Design matrix [1, x] (i.e. x**0, x**1); the parentheses are needed so the
    # array acts as the exponents [0, 1] rather than multiplying the result
    X = x ** (np.ones((npoints, 2)) * np.arange(2))
    like_mu = np.dot(X, trc_mu.T)
    like_sd = np.tile(trc_sd.T, (npoints, 1))
    like = np.random.normal(like_mu, like_sd)

    # Calculate credible regions and plot over the datapoints
    dfp = pd.DataFrame(np.percentile(like, [2.5, 25, 50, 75, 97.5], axis=1).T,
                       columns=['025', '250', '500', '750', '975'])
    dfp['x'] = x

    pal = sns.color_palette('Purples')
    f, ax1d = plt.subplots(1, 1, figsize=(7, 7))
    ax1d.fill_between(dfp['x'], dfp['025'], dfp['975'], alpha=0.5,
                      color=pal[1], label='CR 95%')
    ax1d.fill_between(dfp['x'], dfp['250'], dfp['750'], alpha=0.4,
                      color=pal[4], label='CR 50%')
    ax1d.plot(dfp['x'], dfp['500'], alpha=0.5, color=pal[5], label='Median')
    _ = plt.legend()
    _ = ax1d.set_xlim(xlims)
    _ = sns.regplot(x='x', y='y', data=rawdata, fit_reg=False,
                    scatter_kws={'alpha': 0.8, 's': 80, 'lw': 2, 'edgecolor': 'w'},
                    ax=ax1d)
def run_sampling(pm_model, output_dir, ncores=1, nchains=2, max_attempts=2,
                 filename="trace"):
    # Log file output
    logging.basicConfig(
        filename=output_dir + "/sampling.log",
        filemode="w",
        format="%(name)s - %(levelname)s - %(message)s",
    )

    # Sample the model
    with pm_model:
        # Run initial chain
        try:
            trace = pm.sample(
                tune=1000,
                draws=4000,
                cores=ncores,
                chains=nchains,
                step=xo.get_dense_nuts_step(),
            )
        except pm.exceptions.SamplingError:
            logging.error("Sampling failed, model misspecified")
            return None

        # Check for divergences, restart sampling if necessary
        divergent = trace["diverging"]
        divperc = divergent.nonzero()[0].size / len(trace) * 100

        n_attempts = 1
        while divperc > 15.0 and n_attempts <= max_attempts:
            # Re-run sampling with longer tuning, more draws, and a higher
            # target acceptance rate
            trace = pm.sample(
                tune=2000,
                draws=n_attempts * 10000,
                cores=ncores,
                chains=nchains,
                step=xo.get_dense_nuts_step(target_accept=0.9),
            )
            # Recompute the divergence percentage so the loop can stop as
            # soon as sampling is clean
            divergent = trace["diverging"]
            divperc = divergent.nonzero()[0].size / len(trace) * 100
            n_attempts += 1

        # Save the trace in either case
        df = pm.trace_to_dataframe(trace, include_transformed=True)
        df.to_csv(output_dir + f"/{filename}.csv")

        if divperc > 15:
            logging.warning(f"{divperc:.1f}% of samples are diverging.")
            return None
        return trace
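# Hypothetical usage of run_sampling (the model body below is illustrative,
# not from the original source; run_sampling only assumes a pymc3 model that
# is compatible with exoplanet's dense NUTS step):
#
#   with pm.Model() as model:
#       mu = pm.Normal("mu", 0.0, 1.0)
#       pm.Normal("obs", mu, 1.0, observed=data)
#
#   trace = run_sampling(model, "./out", ncores=2, nchains=4)
#   if trace is None:
#       print("sampling failed or kept diverging; see ./out/sampling.log")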
def trace_to_dataframe(trace, model=None, log_post=False):
    """
    Convert a PyMC3 trace to a Pandas DataFrame

    Parameters
    ----------
    trace : PyMC3 trace
        Trace returned from pm.sample()
    model : PyMC3 model, default None
        Model returned from pm.Model()
    log_post : bool, default False
        If True, also compute the log posterior.

    Returns
    -------
    output : Pandas DataFrame
        DataFrame with samples and various sampling statistics.
    """
    df = pm.trace_to_dataframe(trace, chains=[0])

    for stat in trace.stat_names:
        if stat in df.columns:
            warnings.warn('`' + stat + '` is in the variable names.'
                          + ' Not adding this statistic.')
        else:
            df[stat] = trace.get_sampler_stats(stat, chains=[0])

    if 'chain' in df.columns:
        warnings.warn('`chain` is in the variable names.'
                      + ' Not adding this statistic.')
    else:
        df['chain'] = np.array([0] * len(df), dtype=int)

    if trace.nchains > 1:
        for chain in trace.chains[1:]:
            df_app = pm.trace_to_dataframe(trace, chains=[chain])

            for stat in trace.stat_names:
                if stat not in df_app.columns:
                    df_app[stat] = trace.get_sampler_stats(stat, chains=[chain])

            if 'chain' not in df_app.columns:
                df_app['chain'] = np.array([chain] * len(df_app))

            df = df.append(df_app, ignore_index=True)

    if log_post:
        # Extract the model from context if necessary
        model = pm.modelcontext(model)

        df['log_likelihood'] = _log_like_trace(trace, model).sum(axis=1)
        df['log_prior'] = _log_prior_trace(trace, model).sum(axis=1)
        df['log_posterior'] = df['log_likelihood'] + df['log_prior']

    return df
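# A minimal usage sketch for the wrapper above (the model and data here are
# illustrative assumptions, not part of the original source):
#
#   import numpy as np
#   import pymc3 as pm
#
#   y_obs = np.random.randn(50)
#   with pm.Model() as model:
#       mu = pm.Normal('mu', 0, 1)
#       pm.Normal('obs', mu, 1, observed=y_obs)
#       trace = pm.sample(1000, chains=2)
#
#   df = trace_to_dataframe(trace, model=model, log_post=True)
#   # df holds one row per draw: parameter columns, sampler statistics,
#   # a 'chain' column, and log_likelihood/log_prior/log_posterior columns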
def test_regional_model(surveys_data):
    # 15% of regions differ by more than 5%
    # Mean diff by region is 3%
    # Median diff is 2%
    data = surveys_data
    data['net'] = 0
    # .loc replaces the long-removed pandas .ix indexer
    data.loc[data.response >= 6, 'net'] = 1
    data.loc[data.response < 5, 'net'] = -1
    exp = data.groupby('Region').mean()

    result = run_regional_model(data)
    reg_list = result['heads'][0]['values']
    exp_by_reg = exp.loc[reg_list].net
    nets = compute_net(pm.trace_to_dataframe(result['trace']))
    net_r = {reg_list[i]: nets[str(i)] for i in range(len(reg_list))}

    recs = list()
    for reg, vals in net_r.items():
        recs.append((reg, (vals < exp.loc[reg, 'net']).sum() / len(vals)))
    ptile = pd.DataFrame.from_records(recs, columns=['region', 'ptile'])

    assert np.sum((ptile.ptile < 0.95) & (ptile.ptile > 0.05)) / len(ptile) > 0.9
def test_pymc3_gets_reasonable_results():
    # Load data.
    df0 = pd.read_csv("../data/gaia_mc5_velocities.csv")
    m = df0.radial_velocity.values != 0
    m &= np.isfinite(df0.basic_vx.values)
    m &= np.isfinite(df0.basic_vy.values)
    m &= np.isfinite(df0.basic_vz.values)
    df1 = df0.iloc[m]
    df = df1.iloc[0]

    pos = [df["ra"], df["dec"], df["parallax"]]
    pos_err = [df["ra_error"], df["dec_error"], df["parallax_error"]]
    proper = [df["pmra"], df["pmdec"]]
    proper_err = [df["pmra_error"], df["pmdec_error"]]

    # Now move the star so it's near the Galactic pole.
    c = SkyCoord('12h51.4m', '+27.13', unit=(u.hourangle, u.deg), frame='icrs')
    pos[0] = c.ra.value
    pos[1] = c.dec.value

    mu, cov = av.get_prior()
    trace = av.run_pymc3_model(pos, pos_err, proper, proper_err, mu, cov)
    samples = pm.trace_to_dataframe(trace)
def test_deterministic_of_observed(self):
    meas_in_1 = pm.theanof.floatX(2 + 4 * np.random.randn(100))
    meas_in_2 = pm.theanof.floatX(5 + 4 * np.random.randn(100))
    with pm.Model() as model:
        mu_in_1 = pm.Normal("mu_in_1", 0, 1)
        sigma_in_1 = pm.HalfNormal("sd_in_1", 1)
        mu_in_2 = pm.Normal("mu_in_2", 0, 1)
        sigma_in_2 = pm.HalfNormal("sd__in_2", 1)

        in_1 = pm.Normal("in_1", mu_in_1, sigma_in_1, observed=meas_in_1)
        in_2 = pm.Normal("in_2", mu_in_2, sigma_in_2, observed=meas_in_2)
        out_diff = in_1 + in_2
        pm.Deterministic("out", out_diff)

        trace = pm.sample(100)
        ppc_trace = pm.trace_to_dataframe(
            trace, varnames=[n for n in trace.varnames if n != "out"]
        ).to_dict("records")
        ppc = pm.sample_posterior_predictive(
            model=model,
            trace=ppc_trace,
            samples=len(ppc_trace),
            vars=(model.deterministics + model.basic_RVs),
        )

        rtol = 1e-5 if theano.config.floatX == "float64" else 1e-3
        assert np.allclose(ppc["in_1"] + ppc["in_2"], ppc["out"], rtol=rtol)
def sample_from_posterior(approx, varnames, n_subs, responses, last):
    # sample from posterior
    nsample = 1000
    trace = approx.sample(nsample, include_transformed=True)
    sample = trace_to_dataframe(trace, include_transformed=True,
                                varnames=varnames)

    # get posterior samples of the predicted response value in the
    # post-reversal phase; `last` is a negative offset, so -last is the
    # number of post-reversal trials
    gs = np.zeros((10000, -last, n_subs, 2))
    for i in range(10):
        trace = approx.sample(nsample, include_transformed=False)
        gs[i * 1000:(i + 1) * 1000] = trace['G'][:, last:]

    post_like = np.exp(gs - gs.max(axis=-1)[:, :, :, None])
    post_like /= post_like.sum(axis=-1)[:, :, :, None]

    # measured responses in the post-reversal phase
    res = responses[None, :]

    # compute observation likelihood for each posterior sample
    post_ol = post_like[:, :, :, 0]**(1 - res) * post_like[:, :, :, 1]**res

    # get posterior predictive log model evidence
    pplme = np.log(post_ol.prod(axis=1).mean(axis=0))

    # get per-subject mean probability of selecting stimulus 2
    plike2 = post_like[:, :, :, 1].mean(axis=0)

    return sample, pplme, plike2
def likelihood_datatrace_mp(sp, traces, index):
    strace_mp = traces._straces[index]
    strace_mp.chain = 0
    trace_mp = pm.backends.base.MultiTrace([strace_mp])
    datatraces = pm.trace_to_dataframe(trace_mp, hide_transformed_vars=False)
    likelihood_datatrace(sp, datatraces, trace_mp)
    return datatraces
def trace_to_samples(self, trace, data, names=None):
    """
    Create a ``JokerSamples`` instance from a pymc3 trace object.

    Parameters
    ----------
    trace : `~pymc3.backends.base.MultiTrace`
        Trace returned from ``pymc3.sample()``.
    data : radial velocity data
        Passed through ``validate_prepare_data`` to recover ``t0``.
    names : list of str, optional
        Parameter names to extract; defaults to the prior's parameter names.
    """
    import pymc3 as pm
    import exoplanet.units as xu

    df = pm.trace_to_dataframe(trace)

    data, *_ = validate_prepare_data(data, self.prior.poly_trend,
                                     self.prior.n_offsets)

    samples = JokerSamples(poly_trend=self.prior.poly_trend,
                           n_offsets=self.prior.n_offsets,
                           t0=data.t0)

    if names is None:
        names = self.prior.par_names

    for name in names:
        par = self.prior.pars[name]
        unit = getattr(par, xu.UNIT_ATTR_NAME)
        samples[name] = df[name].values * unit

    return samples
def test_deterministic_of_observed_modified_interface(self):
    meas_in_1 = pm.theanof.floatX(2 + 4 * np.random.randn(100))
    meas_in_2 = pm.theanof.floatX(5 + 4 * np.random.randn(100))
    with pm.Model() as model:
        mu_in_1 = pm.Normal("mu_in_1", 0, 1)
        sigma_in_1 = pm.HalfNormal("sd_in_1", 1)
        mu_in_2 = pm.Normal("mu_in_2", 0, 1)
        sigma_in_2 = pm.HalfNormal("sd__in_2", 1)

        in_1 = pm.Normal("in_1", mu_in_1, sigma_in_1, observed=meas_in_1)
        in_2 = pm.Normal("in_2", mu_in_2, sigma_in_2, observed=meas_in_2)
        out_diff = in_1 + in_2
        pm.Deterministic("out", out_diff)

        trace = pm.sample(100)
        ppc_trace = pm.trace_to_dataframe(
            trace, varnames=[n for n in trace.varnames if n != "out"]
        ).to_dict("records")
        ppc = pm.sample_posterior_predictive(
            model=model,
            trace=ppc_trace,
            samples=len(ppc_trace),
            var_names=[
                x.name for x in (model.deterministics + model.basic_RVs)
            ],
        )

        rtol = 1e-5 if theano.config.floatX == "float64" else 1e-3
        assert np.allclose(ppc["in_1"] + ppc["in_2"], ppc["out"], rtol=rtol)
def single_ucb_model_all(X, fmax, steepness_alpha=1., steepness_beta=1.,
                         x_midpoint_prec=4., yscale_alpha=1., yscale_beta=.5,
                         temperature_alpha=1., temperature_beta=1.,
                         samples=200):
    df_list = []
    actions = X[0].argmax(axis=1)
    for i in range(X.shape[3]):
        trace = single_ucb_model(X[:, :, :, i], steepness_alpha,
                                 steepness_beta, x_midpoint_prec,
                                 yscale_alpha, yscale_beta,
                                 temperature_alpha, temperature_beta,
                                 samples)
        df = pm.trace_to_dataframe(trace)
        df['participant'] = i
        df_list.append(df)
    all_dfs = pd.concat(df_list)
    all_dfs['actions'] = all_dfs['participant'].apply(lambda x: actions[:, x])
    all_dfs['fmax'] = all_dfs['participant'].apply(lambda x: fmax[x])
    return all_dfs
def plot_corner(self, *args, **kwargs):
    if not self.hasmcmc:
        raise Exception("Must first run mcmc by calling mcmc() "
                        "or compute(mcmc=True)")
    samples = pm.trace_to_dataframe(self.trace, varnames=[
        "mix", "logdeltaQ", "logQ0", "logperiod", "logamp", "logs2"
    ])
    columns = self.build_mcmc_summary()
    corn = corner.corner(samples, *args, **kwargs)
    for i in range(len(columns)):
        plt.annotate(columns[i], xy=(0.38 + 0.08 * i, 0.7),
                     xycoords="figure fraction", fontsize=12)
    columns = self.build_det_summary()
    for i in range(len(columns)):
        plt.annotate(columns[i], xy=(0.55 + 0.08 * i, 0.6),
                     xycoords="figure fraction", fontsize=12)
    plt.annotate("EPIC {0}".format(self.ident), xy=(0.4, 0.95),
                 xycoords="figure fraction", fontsize=30)
    return corn
def parametric_Neiswanger(traces):
    '''Approximate each subposterior as a Gaussian, then construct the
    combined posterior density (also approximated as a Gaussian).

    Args:
        traces: list of traces, each produced by a different machine
            for a different batch of the data.

    Returns:
        means_total, cov_total: Mean vector and covariance matrix of the
            approximated combined posterior.
    '''
    means = []
    covs = []
    for trace in traces:
        df = pm.trace_to_dataframe(trace)
        means.append(np.mean(df, axis=0))
        covs.append(np.cov(df, rowvar=0))
    # Combined precision is the sum of the subposterior precisions
    cov_total = np.linalg.inv(
        np.sum(np.array([np.linalg.inv(cov) for cov in covs]), axis=0))
    # Combined mean is the precision-weighted average of subposterior means
    means_total = np.dot(
        np.sum(np.array([
            np.dot(np.linalg.inv(cov), np.array(mean))
            for cov, mean in zip(covs, means)
        ]), axis=0), cov_total)
    return means_total, cov_total
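# A minimal sketch (not from the original source) checking the combination
# rule above on synthetic 1-D Gaussian "subposteriors": the combined
# precision is the sum of the subposterior precisions, and the combined mean
# is the precision-weighted average of the subposterior means.
import numpy as np

rng = np.random.default_rng(0)
sub_means, sub_vars = [1.0, 2.0, 3.0], [0.5, 1.0, 2.0]
fake_traces = [rng.normal(m, np.sqrt(v), size=100_000)
               for m, v in zip(sub_means, sub_vars)]

precisions = np.array([1.0 / np.var(s) for s in fake_traces])
var_total = 1.0 / precisions.sum()
mean_total = var_total * sum(p * np.mean(s)
                             for p, s in zip(precisions, fake_traces))
print(mean_total, var_total)  # ~1.571, ~0.2857 (analytic: 5.5/3.5, 1/3.5)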
def main(output_trace_path, X_path, main_cities, output_path):
    # loading data
    with open(output_trace_path, 'rb') as buff:
        data = pickle.load(buff)
    hierarchical_model = data['inference']
    hierarchical_trace = data['trace']
    scaler = data['scaler']
    degree_index = data['city_index_df']
    response_variable = data['response_variable']
    predictor_variables = data['predictor_variables']

    # list of cities
    degree_index.set_index("CITY", inplace=True)

    # get data of traces
    data = pm.trace_to_dataframe(hierarchical_trace)

    # DO CALCULATION FOR EVERY CLASS IN THE MODEL (CITIES)
    for city in main_cities:
        # get input data and scale if necessary
        X = pd.read_csv(os.path.join(X_path, city + ".csv"))
        prediction, response_variable_real = calc_prediction(
            X, city, data, degree_index, predictor_variables,
            response_variable, scaler)

        # create groups
        df = prediction.groupby("IPCC_SCENARIO")[[response_variable_real]].sum()

        # save results per city
        df.to_csv(os.path.join(output_path, city + ".csv"),
                  index_label="IPCC_SCENARIO")
def recoverSubject(data, subject, estimates, n_repeats=1, n_tries=1,
                   overwrite=False, progressbar=True):
    """
    Perform parameter recovery for a single subject.
    1) Predict using GLAM with given estimates
    2) Fit GLAM
    3) Save estimates
    """
    print("Processing subject {}...".format(subject))

    # Subset data
    subject_data = data[data['subject'] == subject].copy()
    n_items = subject_data['n_items'].values[0]
    if n_items == 2:
        subject_data = subject_data.drop(['item_value_2', 'gaze_2'], axis=1)
    subject_data['subject'] = 0

    if (overwrite) or (not os.path.isfile(os.path.join(
            'results', 'parameter_recovery',
            'recovered_estimates_{}_ins.csv'.format(subject)))):
        parameters = ['v', 's', 'tau', 'gamma']

        # Set up model, supply it with parameter estimates
        generating = glam.GLAM(subject_data, drift='multiplicative')
        generating.make_model('individual', gamma_bounds=(-10, 1), t0_val=0)

        estimates_dict = {parameter: estimates.loc[parameter + '__0_0', 'MAP']
                          for parameter in parameters}
        estimates_dict['t0'] = 0
        estimates_dict['subject'] = 0
        generating.estimates = pd.DataFrame(estimates_dict, index=np.zeros(1))

        generating.predict(n_repeats=n_repeats)
        generated = generating.prediction

        recovering = glam.GLAM(generated)
        recovering.make_model('individual', gamma_bounds=(-10, 1), t0_val=0)
        recovering = fitModel(recovering, relevant_parameters=parameters,
                              n_tries_max=n_tries, progressbar=progressbar)

        summary = pm.summary(recovering.trace[0])
        for parameter in parameters:
            summary.loc[parameter + '__0_0', 'MAP'] = \
                recovering.estimates[parameter].values[0]
            summary.loc[parameter + '__0_0', 'generating'] = \
                estimates_dict[parameter]

        summary.to_csv(os.path.join(
            'results', 'parameter_recovery',
            'recovered_estimates_{}_multiplicative_ins.csv'.format(subject)))
        pm.trace_to_dataframe(recovering.trace[0]).to_csv(os.path.join(
            'results', 'traces', 'parameter_recovery',
            'trace_{}_parameter_recovery_ins.csv'.format(subject)))

        pm.traceplot(recovering.trace[0])
        plt.savefig(os.path.join(
            'results', 'traces', 'parameter_recovery', 'plots',
            'traceplot_{}_parameter_recovery_ins.png'.format(subject)))
        plt.close()
    else:
        print("Previous recovery results found (Subject {}). Skipping..."
              .format(subject))
    return
def main(trainkey, predkey, outputkey, inference_method='', ncores='',
         nchains=1, niters='1500', redishost='10.42.72.93'):
    panthera = redishost
    conn = redis.StrictRedis(host=panthera, password='******')
    # predkey = 'p-50x50-guerrero-4'
    # trainkey = 't-luca-guerrero-4'
    # outputkey = 'test-model'
    PDF = preparePredictors(loadDataFrameFromRedis(predkey, conn))
    TDF = loadDataFrameFromRedis(trainkey, conn)

    formula = 'LUCA ~ Longitude + Latitude + Q("Dist.to.road_m") + Population_m + name'
    TM, PM = splitByFormula(formula, TDF, PDF['clean'])

    logger.info("Start modelling inference")
    model = ModelSamplingEffort(TM, PM)
    trace = SampleModel(model, inference_method=inference_method,
                        ncores=ncores, nchains=nchains, niters=niters)

    logger.info("Saving trace")
    try:
        pm.save_trace(
            trace,
            directory='/storage/users/escamill/presence-only-model/output/rawtrace',
            overwrite=True)
    except Exception:
        logger.error("not possible to save trace")

    tracedf = pm.trace_to_dataframe(trace)
    tracedf.to_csv(
        '/storage/users/escamill/presence-only-model/output/trace%s.csv'
        % outputkey, encoding='utf8')

    try:
        pred_sample = SamplePredictions(model, TM, PM, trace)
    except Exception:
        logger.error("something went wrong")
        return

    pred_sample.to_csv(
        '/storage/users/escamill/presence-only-model/output/pred_cond-%s.csv'
        % outputkey, encoding='utf8')
    # pred_sample is a dictionary; pickle.dump takes the object first and an
    # open file handle second
    with open('/storage/users/escamill/presence-only-model/output/pred%s.pickle'
              % outputkey, 'wb') as f:
        pickle.dump(pred_sample, f)
    # conn.set(outputkey + '-df', pickle.dumps(tracedf))
    # conn.set(outputkey + '-trace', pickle.dumps(pred_sample))
    logger.info("Finished!")
def cornerplot(lc, trace, catalog, **kwargs):
    truths = pm.summary(trace)['mean']
    samples = pm.trace_to_dataframe(trace)
    cornerplot = corner.corner(samples, truths=truths, **kwargs)
    pl.annotate("{0} {1}".format(catalog, lc.id),
                xy=(0.4, 0.95), xycoords="figure fraction", fontsize=30)
    return cornerplot
def write_summary_line(self):
    if not self.hasmcmc:
        raise Exception("Must first run mcmc by calling mcmc() "
                        "or compute(mcmc=True)")
    samples = pm.trace_to_dataframe(self.trace, varnames=[
        "mix", "logdeltaQ", "logQ0", "logperiod", "logamp", "logs2"
    ])
def coin_flip_route():
    draws = max(1, min(10000, int(request.args.get('draws', 500))))
    alpha = max(0.01, float(request.args.get('alpha', 1)))
    beta = max(0.01, float(request.args.get('beta', 1)))
    tails = max(0, float(request.args.get('tails', 100)))
    heads = max(0, float(request.args.get('heads', 100)))
    with flip_coins(alpha, beta, tails, heads):
        trace = pm.sample(draws)
    return jsonify(pm.trace_to_dataframe(trace).to_dict(orient='records'))
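# Hypothetical request against the route above (the URL path and the name of
# the posterior variable depend on the route decorator and on flip_coins,
# neither of which is shown in this excerpt):
#
#   GET /coin_flip?draws=1000&alpha=2&beta=2&heads=60&tails=40
#
# returns one JSON record per posterior draw, e.g.
#   [{"p": 0.58}, {"p": 0.61}, ...]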
def gevtr(calibration_data):
    # Preparing input data
    serie_lista = list(calibration_data)
    # ano_lista = calibration_data.index.year.tolist()
    ano_lista = calibration_data.index.tolist()
    ano_resetado = pd.Series(list(range(len(calibration_data))))

    # First regression
    slope, intercept, r_value, pvalue_regress, std_err = ss.linregress(
        ano_resetado, calibration_data.values)
    numerator = ano_resetado.map(lambda x: x * slope)

    # Second regression and calculation of the detrended series
    dists = []
    for i in range(len(serie_lista)):
        dists.append(
            abs(serie_lista[i] - (slope * ano_resetado[i] + intercept)))
    slope2, intercept2, r_value2, pvalue_regress2, std_err2 = ss.linregress(
        ano_resetado, dists)
    data_reset_index = pd.Series(calibration_data.values)
    denominator = ano_resetado.map(lambda x: x * slope2 + intercept2)
    detrended_series = (data_reset_index - numerator) / denominator

    locm = detrended_series.mean()
    locs = detrended_series.std() / (np.sqrt(len(calibration_data)))
    scalem = detrended_series.std()
    scales = detrended_series.std() / (np.sqrt(2 * (len(calibration_data) - 1)))

    with pm.Model() as model:
        # Priors for unknown model parameters. c = x - 0.5: the shift in
        # gev_logp is required because the Beta domain is (0, 1).
        c = pm.Beta('c', alpha=6, beta=9)
        loc = pm.Normal('loc', mu=locm, sd=locs)
        scale = pm.Normal('scale', mu=scalem, sd=scales)

        # Likelihood (sampling distribution) of observations. Since the GEV
        # is not implemented in pymc3, a custom log-likelihood is used.
        def gev_logp(value):
            scaled = (value - loc) / scale
            logp = -(tt.log(scale) +
                     (((c - 0.5) + 1) / (c - 0.5) * tt.log1p((c - 0.5) * scaled) +
                      (1 + (c - 0.5) * scaled)**(-1 / (c - 0.5))))
            bound1 = loc - scale / (c - 0.5)
            bounds = tt.switch((c - 0.5) > 0, value > bound1, value < bound1)
            return bound(logp, bounds, c != 0)

        gev = pm.DensityDist('gev', gev_logp, observed=detrended_series)
        trace = pm.sample(5000, chains=3, cores=1, progressbar=True)

        pm.traceplot(trace)
        # geweke_plot = pm.geweke(trace, 0.05, 0.5, 20)
        # gelman_and_rubin = pm.diagnostics.gelman_rubin(trace)
        posterior = pm.trace_to_dataframe(trace)
        summary = pm.summary(trace)
    return posterior, summary
def trace_to_dataframe(trace, model=None, varnames=None,
                       include_transformed=False, log_post=False):
    """
    Convert a PyMC3 trace to a Pandas DataFrame

    To add: Compute logp for each point using model.logp().
    """
    # Forward the variable-selection arguments to pymc3's converter
    df = pm.trace_to_dataframe(trace, chains=[0], varnames=varnames,
                               include_transformed=include_transformed)

    for stat in trace.stat_names:
        if stat in df.columns:
            warnings.warn('`' + stat + '` is in the variable names.'
                          + ' Not adding this statistic.')
        else:
            df[stat] = trace.get_sampler_stats(stat, chains=[0])

    if 'chain' in df.columns:
        warnings.warn('`chain` is in the variable names.'
                      + ' Not adding this statistic.')
    else:
        df['chain'] = np.array([0] * len(df), dtype=int)

    if len(trace.chains) > 1:
        for chain in trace.chains[1:]:
            df_app = pm.trace_to_dataframe(
                trace, chains=[chain], varnames=varnames,
                include_transformed=include_transformed)

            for stat in trace.stat_names:
                if stat not in df_app.columns:
                    df_app[stat] = trace.get_sampler_stats(stat, chains=[chain])

            if 'chain' not in df_app.columns:
                df_app['chain'] = np.array([chain] * len(df_app))

            df = df.append(df_app, ignore_index=True)

    if log_post:
        # Extract the model from context if necessary
        model = pm.modelcontext(model)

        logp = pymc3.stats._log_post_trace(trace, model).sum(axis=1)
        df['log_posterior'] = logp

    return df
def corner(trace, **kwargs):
    """
    Make a corner plot from a trace

    Parameters
    ----------
    trace : `~pymc3.backends.base.MultiTrace`
        Trace from SMC/NUTS
    """
    return dfm_corner(pm.trace_to_dataframe(trace), **kwargs)
def excel_posterior(trace, filename):
    # Need to read the data again to set activity number and names
    prj = project_reader(filename)
    WP_NAMES = np.array(prj[1][:, 0])
    WP_NUMBER = prj[1][:, 0].shape[0]
    PV_names = list()
    PVpartial_names = list()
    EV_names = list()
    COMP_names = list()
    SPI_names = list()
    CPI_names = list()
    Index_names = ["SPI_PROJECT", "CPI_PROJECT", "ETC", "EAC", "TEAC"]
    RISK_names = list()
    projectDefinition = prj[1]

    for x in range(WP_NUMBER):
        for y in range(2):
            if (projectDefinition[x][y + 1] != 0):
                rname = projectDefinition[x][0] + "_Risk_%d" % (y + 1)
                RISK_names.append(rname)

    for x in range(WP_NUMBER):
        PV_names.append("PV_%s" % WP_NAMES[x])
        PVpartial_names.append("Partial_PV_%s" % WP_NAMES[x])
        EV_names.append("EV_%s" % WP_NAMES[x])
        COMP_names.append("COMPLETION_%s" % WP_NAMES[x])
        SPI_names.append("SPI_%s" % WP_NAMES[x])
        CPI_names.append("CPI_%s" % WP_NAMES[x])

    all_names = (RISK_names + PV_names + PVpartial_names + EV_names +
                 COMP_names + SPI_names + CPI_names + Index_names)

    outputName = filename + "Output.xlsx"
    traceName = filename + "Trace.xlsx"
    pm.summary(trace, varnames=all_names,
               stat_funcs=[trace_mean, trace_sd, trace_quantiles]
               ).to_excel(outputName, sheet_name="Summary")
    pm.plot_posterior(trace, varnames=all_names)
    pm.trace_to_dataframe(trace).to_excel(traceName, sheet_name="Trace")
def coef(m):
    """
    Return posterior mean of parameters necessary to compute mu,
    i.e. the predicted mean.

    :param m: fitted Model instance
    :return: pandas series, index = parameter names, values = parameter values
    """
    tdf = pm.trace_to_dataframe(m.trace)
    # filter for 'a', i.e. the intercept, or for 'b.*', i.e. coefficients;
    # together these specify the formula for the mean
    return tdf.filter(regex=r'^a$|^b.*', axis=1).mean()
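# Hypothetical usage of coef (the names are illustrative, not from this
# source); any object exposing a pymc3 trace as `.trace` works:
#
#   params = coef(fitted_model)        # e.g. Series({'a': 1.2, 'b_x': 0.8})
#   mu_hat = params['a'] + params['b_x'] * x_new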
def toy_model(v_samp, z_samp, logp_prior, size=500, samples=50, steps=1000,
              tune=1000, a_true=1.2, b_true=-0.5, width_true=0.05,
              extratext='_true'):
    '''The pymc3 linear model z(v) = a*v + b with natural width in log space
    log_width. The prior contribution to the likelihood is an argument to
    the function.'''
    with pm.Model() as model:
        a = pm.Normal("a", mu=0, sigma=10, testval=a_true)
        b = pm.Normal("b", mu=0, sigma=10, testval=b_true)
        log_width = pm.Normal("log_width", mu=np.log(width_true), sigma=2.0,
                              testval=np.log(width_true))

        mu = a * v_samp + b

        # The line has some width: we're calling it a Gaussian in n
        logp_hyper = (-0.5 * (z_samp - mu)**2 * pm.math.exp(-2 * log_width)
                      - log_width)

        # Here we account for the intermediate prior
        logp = logp_hyper - logp_prior

        # Compute the marginalized likelihood (log-sum-exp over samples)
        max_logp = tt.max(logp, axis=1)
        # max_logp = np.zeros(len(logM_samp))
        marg_logp = max_logp + pm.math.log(
            pm.math.sum(pm.math.exp(logp - max_logp[:, None]), axis=1))
        pm.Potential('marg_logp', marg_logp)

        trace = pm.sample(draws=steps, tune=tune, target_accept=0.9,
                          init='adapt_full', return_inferencedata=False)

        # az.plot_trace(trace)
        print(az.summary(trace, round_to=2))
        print(a_true, b_true, np.log(width_true))

        # Corner plot!
        corner.corner(pm.trace_to_dataframe(trace),
                      truths=[a_true] + [b_true] + [np.log(width_true)])
        plt.savefig("PriorToy/Corner_N1000_vfcomplex_prior_samp_mixed%s.png"
                    % (extratext), bbox_inches='tight', dpi=150)
    return
def run_model(steps=10000):
    model = pymc.Model()
    with model:
        α = 1 / count_data.mean()
        λ1 = pymc.Exponential("λ1", α)
        λ2 = pymc.Exponential("λ2", α)
        τ = pymc.DiscreteUniform("τ", lower=0.0, upper=len(count_data))
        process_mean = mean(τ, λ1, λ2)
        observation = pymc.Poisson("observation", process_mean,
                                   observed=count_data)
        start = {"λ1": 10.0, "λ2": 30.0}
        step1 = pymc.Slice([λ1, λ2])
        step2 = pymc.Metropolis([τ])
        trace = pymc.sample(steps, tune=500, start=start,
                            step=[step1, step2], cores=2)
    return pymc.trace_to_dataframe(trace)
def gev0_shift_2(dataset):
    locm = dataset.mean()
    locs = dataset.std() / (np.sqrt(len(dataset)))
    scalem = dataset.std()
    scales = dataset.std() / (np.sqrt(2 * (len(dataset) - 1)))
    with pm.Model() as model:
        # Priors for unknown model parameters. c = x - 0.5: the shift in
        # gev_logp is required because the Beta domain is (0, 1).
        c1 = pm.Beta('c1', alpha=6, beta=9)
        loc1 = pm.Normal('loc1', mu=locm, sd=locs)
        scale1 = pm.Normal('scale1', mu=scalem, sd=scales)

        c2 = pm.Beta('c2', alpha=6, beta=9)
        loc2 = pm.Normal('loc2', mu=locm, sd=locs)
        scale2 = pm.Normal('scale2', mu=scalem, sd=scales)

        c3 = pm.Beta('c3', alpha=6, beta=9)
        loc3 = pm.Normal('loc3', mu=locm, sd=locs)
        scale3 = pm.Normal('scale3', mu=scalem, sd=scales)

        def gev_logp(value):
            scaled = (value - loc_) / scale_
            logp = -(tt.log(scale_) +
                     (((c_ - 0.5) + 1) / (c_ - 0.5) * tt.log1p((c_ - 0.5) * scaled) +
                      (1 + (c_ - 0.5) * scaled)**(-1 / (c_ - 0.5))))
            bound1 = loc_ - scale_ / (c_ - 0.5)
            bounds = tt.switch((c_ - 0.5) > 0, value > bound1, value < bound1)
            return bound(logp, bounds, c_ != 0)

        # Two change points split the record into three GEV regimes;
        # n_count_data is assumed defined at module scope (record length)
        tau1 = pm.DiscreteUniform("tau1", lower=0, upper=n_count_data - 2)
        tau2 = pm.DiscreteUniform("tau2", lower=tau1 + 1, upper=n_count_data - 1)
        idx = np.arange(n_count_data)
        c_ = pm.math.switch(tau2 >= idx,
                            pm.math.switch(tau1 >= idx, c1, c2), c3)
        loc_ = pm.math.switch(tau2 >= idx,
                              pm.math.switch(tau1 >= idx, loc1, loc2), loc3)
        scale_ = pm.math.switch(tau2 >= idx,
                                pm.math.switch(tau1 >= idx, scale1, scale2),
                                scale3)

        gev = pm.DensityDist('gev', gev_logp, observed=dataset)
        trace = pm.sample(2000, chains=1, progressbar=True)

        posterior = pm.trace_to_dataframe(trace)
        summary = pm.summary(trace)
        # geweke_plot = pm.geweke(trace, 0.05, 0.5, 20)
    return summary, posterior
def plot_model_diagnostics(model, save_dir, file_id, export=True):
    """generate and export a range of diagnostic plots for a given model"""

    # ensure folder exists
    if export is True:
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)

    model_name = model.__class__.__name__

    trace_df = pm.trace_to_dataframe(model.trace, varnames=model.df_params)

    sns.pairplot(trace_df)
    if export is True:
        plt.savefig(save_dir + f'{model_name}_{file_id}_pairplot.pdf',
                    format='pdf', bbox_inches='tight')
    plt.cla()

    pm.traceplot(model.trace, varnames=model.df_params)
    if export is True:
        plt.savefig(save_dir + f'{model_name}_{file_id}_traceplot.pdf',
                    format='pdf', bbox_inches='tight')
    plt.cla()

    pm.autocorrplot(model.trace, varnames=model.df_params)
    if export is True:
        plt.savefig(save_dir + f'{model_name}_{file_id}_autocorrplot.pdf',
                    format='pdf', bbox_inches='tight')
    plt.cla()

    pm.forestplot(model.trace, varnames=model.df_params)
    if export is True:
        plt.savefig(save_dir + f'{model_name}_{file_id}_forestplot.pdf',
                    format='pdf', bbox_inches='tight')
    plt.cla()

    # close all figs, otherwise we can run out of memory
    plt.close("all")
def visualizing_fitting_process(days, trace, visual='yes'):
    varnames = ["period", "t0", "r", "b"]
    if visual == 'yes':
        pm.traceplot(trace, varnames=varnames)

    labels = [
        "period [days]", "transit time [days]", "radius ratio",
        "impact parameter"
    ]
    samples = pm.trace_to_dataframe(trace, varnames=varnames)
    if visual == 'yes':
        corner.corner(samples[["period", "t0", "r__0", "b__0"]], labels=labels)

    # Compute the posterior parameters
    median_radius_ratio = np.median(trace["r"])
    median_impact_parameter = np.median(trace["b"])
    median_period = np.median(trace["period"])
    median_t0 = np.median(trace["t0"])

    median_x_fold = ((days - median_t0 + 0.5 * median_period) % median_period
                     - 0.5 * median_period)
    median_inds = np.argsort(median_x_fold)

    return (median_x_fold, median_t0, median_period, median_radius_ratio,
            median_impact_parameter)
def divergence_plot(nm, ylim=None):
    if nm.hbr.configs['n_chains'] > 1 and nm.hbr.model_type != 'nn':
        a = pm.summary(nm.hbr.trace).round(2)
        plt.figure()
        plt.hist(a['r_hat'], 10)
        plt.title('Gelman-Rubin diagnostic for divergence')

    divergent = nm.hbr.trace['diverging']

    tracedf = pm.trace_to_dataframe(nm.hbr.trace)

    _, ax = plt.subplots(2, 1, figsize=(15, 4), sharex=True, sharey=True)
    ax[0].plot(tracedf.values[divergent == 0].T, color='k', alpha=.05)
    ax[0].set_title('No Divergences', fontsize=10)
    ax[1].plot(tracedf.values[divergent == 1].T, color='C2', lw=.5, alpha=.5)
    ax[1].set_title('Divergences', fontsize=10)
    plt.ylim(ylim)
    plt.xticks(range(tracedf.shape[1]), list(tracedf.columns))
    plt.xticks(rotation=90, fontsize=7)
    plt.tight_layout()
    plt.show()
    # (inside the convergence loop of the calling script)
    trace = sampler.sample(draws=2000, trace=trace, chains=chains, cores=1,
                           progressbar=False)
    tottime += time.time() - strt

    samples = np.array(trace.get_values("P", combine=False))
    samples = np.moveaxis(samples, 0, 1)
    flag, n_eff = check_convergence(samples)
    if flag:
        break

time_pymc = tottime
time_ind_pymc = tottime / n_eff
n_eff_pymc = n_eff

# Save the trace file
df = pm.trace_to_dataframe(trace)
df.to_hdf(os.path.join(dirname, "pymc-trace.h5"), "trace")

# Make the plots
for n, letter in enumerate(string.ascii_lowercase[1:N_pl + 1]):
    fig = plt.figure()

    # Get the posterior median orbital parameters
    p = np.median(trace["P"][:, n])
    t0 = np.median(trace["t0"][:, n])

    # Plot the folded data
    x_fold = (x - t0 + 0.5 * p) % p - 0.5 * p
    plt.errorbar(x_fold, y, yerr=yerr, fmt=".k")

    plt.annotate("period = {0:.4f} +/- {1:.4f} d"
# Metropolis sampling works best!
tr = pm.sample(tune=10000, draws=50000, cores=4, start=pm.find_MAP(),
               step=pm.Metropolis())

# Print the Gelman-Rubin rhat convergence statistics to a file
f = open("palatability_regression_convergence.txt", "w")
print(str(pm.gelman_rubin(tr)), file=f)
f.close()

# Save the trace to the output folder as a numpy array, for later reference.
# Save every 10th sample from the trace, to avoid any autocorrelation issues.
np.save("palatability_regression_trace.npy", tr[::10]["coeff_pal"])

# Convert the trace to a dataframe, and save that too.
# Save every 10th sample from the trace, to avoid any autocorrelation issues.
tr_df = pm.trace_to_dataframe(tr[::10])
tr_df.to_csv("palatability_regression_trace.csv")

# Plot the results of the palatability regression analysis.
# First just plot the mean regression coefficients for every laser condition,
# across time.
fig = plt.figure()
mean_coeff = np.mean(tr[::10]["coeff_pal"], axis=0)
hpd_coeff = pm.hpd(tr[::10]["coeff_pal"], alpha=0.05)
for condition in range(unique_lasers[0].shape[0]):
    plt.plot(x[analyze_indices], mean_coeff[:, condition], linewidth=3.0,
             label="Dur:{}ms, Lag:{}ms".format(unique_lasers[0][condition][0],
                                               unique_lasers[0][condition][1]))
plt.legend()
plt.xlabel("Time post taste delivery (ms)")
plt.ylabel("Mean posterior regression coefficient")
fig.savefig("palatability_regression_coefficients_mean.png",
            bbox_inches="tight")
plt.close("all")

# Now plot the mean and SD of the regression coefficients for every laser
# condition, across time