def test_nested_model_to_netcdf(self, tmp_path):
    with pm.Model("scope") as model:
        b = pm.Normal("var")
        trace = pm.sample(100, tune=0)
        az.to_netcdf(trace, tmp_path / "trace.nc")
    trace1 = az.from_netcdf(tmp_path / "trace.nc")
    assert "scope::var" in trace1.posterior
def bayes_multiple_detector_each_sigma(t, s, n):
    scala = 1000
    with pm.Model() as abrupt_model:
        sigma = pm.Normal('sigma', mu=0.02 * scala, sigma=0.015 * scala)
        # sigma = pm.Uniform('sigma', 5, 15)
        mu = pm.Uniform("mu1", -1.5 * scala, -1.4 * scala)
        tau = pm.DiscreteUniform("tau1", t.min(), t.max())
        for i in np.arange(2, n + 2):
            _mu = pm.Uniform("mu" + str(i), -1.6 * scala, -1.4 * scala)
            mu = T.switch(tau >= t, mu, _mu)
            if i < (n + 1):
                tau = pm.DiscreteUniform("tau" + str(i), tau, t.max())
        tau1 = abrupt_model["tau1"]
        tau2 = abrupt_model["tau2"]
        dtau = pm.DiscreteUniform('dtau', tau1 + 500, tau2)
        s_obs = pm.Normal("s_obs", mu=mu, sigma=sigma, observed=s)

    g = pm.model_to_graphviz(abrupt_model)
    g.view()

    with abrupt_model:
        # pm.find_MAP()
        trace = pm.sample(20000, tune=5000)

    az.plot_trace(trace)
    az.to_netcdf(trace, getpath('tracepath') + 'bd9_4_add_new_rule')
    plt.show()
    pm.summary(trace)
    return trace
def bayes_multiple_detector_I(t, s, n, tracename):
    with pm.Model() as abrupt_model:
        sigma = pm.Normal('sigma', mu=30, sigma=5)
        # sigma = pm.Uniform('sigma', 5, 15)
        mu = pm.Uniform("mu1", -30, 30)
        tau = pm.DiscreteUniform("tau1", t.min(), t.max())
        for i in np.arange(2, n + 2):
            _mu = pm.Uniform("mu" + str(i), -100, 0)
            mu = T.switch(tau >= t, mu, _mu)
            if i < (n + 1):
                tau = pm.DiscreteUniform("tau" + str(i), tau, t.max())
        # add random walk
        # sigma_rw = pm.Uniform("sigma_rw", 0, 10)
        g_rw = pm.GaussianRandomWalk("g_rw", tau=1, shape=len(s))
        s_obs = pm.Normal("s_obs", mu=g_rw + mu, sigma=sigma, observed=s)

    # g = pm.model_to_graphviz(abrupt_model)
    # g.view()

    with abrupt_model:
        pm.find_MAP()
        trace = pm.sample(5000, tune=1000)

    az.plot_trace(trace)
    plt.show()
    az.plot_autocorr(trace)
    plt.show()
    az.to_netcdf(trace, getpath('tracepath') + tracename)
    pm.summary(trace)
    return trace
def test_io_function(self, data, eight_schools_params):
    # create inference data and assert all attributes are present
    inference_data = self.get_inference_data(  # pylint: disable=W0612
        data, eight_schools_params)
    test_dict = {
        "posterior": ["eta", "theta", "mu", "tau"],
        "posterior_predictive": ["eta", "theta", "mu", "tau"],
        "sample_stats": ["eta", "theta", "mu", "tau"],
        "prior": ["eta", "theta", "mu", "tau"],
        "prior_predictive": ["eta", "theta", "mu", "tau"],
        "sample_stats_prior": ["eta", "theta", "mu", "tau"],
        "observed_data": ["J", "y", "sigma"],
    }
    fails = check_multiple_attrs(test_dict, inference_data)
    assert not fails

    # check filename does not exist and save InferenceData
    here = os.path.dirname(os.path.abspath(__file__))
    data_directory = os.path.join(here, "..", "saved_models")
    filepath = os.path.join(data_directory, "io_function_testfile.nc")
    # arviz function
    to_netcdf(inference_data, filepath)

    # assert InferenceData has been saved correctly
    assert os.path.exists(filepath)
    assert os.path.getsize(filepath) > 0
    inference_data2 = from_netcdf(filepath)
    fails = check_multiple_attrs(test_dict, inference_data2)
    assert not fails
    os.remove(filepath)
    assert not os.path.exists(filepath)
def test_io_function(self, data, eight_schools_params):
    inference_data = self.get_inference_data(  # pylint: disable=W0612
        data, eight_schools_params)
    assert hasattr(inference_data, "posterior")

    here = os.path.dirname(os.path.abspath(__file__))
    data_directory = os.path.join(here, "saved_models")
    filepath = os.path.join(data_directory, "io_function_testfile.nc")
    # arviz function
    to_netcdf(inference_data, filepath)

    assert os.path.exists(filepath)
    assert os.path.getsize(filepath) > 0
    inference_data2 = from_netcdf(filepath)
    assert hasattr(inference_data2, "posterior")
    os.remove(filepath)
    assert not os.path.exists(filepath)
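# The two tests above depend on the suite's fixtures (`data`,
# `eight_schools_params`). A minimal, self-contained sketch of the same
# save/load round trip, assuming only arviz and numpy:
import numpy as np
import arviz as az

# build a toy InferenceData: 2 chains, 100 draws of a scalar "mu"
idata = az.from_dict(posterior={"mu": np.random.randn(2, 100)})
az.to_netcdf(idata, "toy_trace.nc")
idata2 = az.from_netcdf("toy_trace.nc")
assert "mu" in idata2.posterior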
def hierarchical_reg_reference(samples=2000, target_df=None):
    """Runs a hierarchical model over the reference data set."""
    _, _, dataframe = load_data()
    # record whether we run on the reference set only, since target_df is
    # replaced by an empty DataFrame below and can no longer be None when the
    # trace is saved at the end
    is_reference_run = target_df is None
    if target_df is None:
        target_df = pd.DataFrame({})
    else:
        del target_df['bmrb_code']
    dataframe = dataframe[dataframe.protein != '1UBQ']
    dataframe = pd.concat([dataframe, target_df], ignore_index=True)

    mean_teo = dataframe["ca_teo"].mean()
    mean_exp = dataframe["ca_exp"].mean()
    std_teo = dataframe["ca_teo"].std()
    std_exp = dataframe["ca_exp"].std()
    ca_exp = (dataframe.ca_exp - mean_exp) / std_exp
    ca_teo = (dataframe.ca_teo - mean_teo) / std_teo
    categories = pd.Categorical(dataframe["res"])
    index = categories.codes
    N = len(np.unique(index))

    with pm.Model() as model:
        # hyper-priors
        alpha_sd = pm.HalfNormal("alpha_sd", 1.0)
        beta_sd = pm.HalfNormal("beta_sd", 1.0)
        sigma_sd = pm.HalfNormal("sigma_sd", 1.0)
        # priors
        α = pm.Normal("α", 0, alpha_sd, shape=N)
        β = pm.HalfNormal("β", beta_sd, shape=N)
        σ = pm.HalfNormal("σ", sigma_sd, shape=N)
        # linear model
        μ = pm.Deterministic("μ", α[index] + β[index] * ca_teo)
        # likelihood
        cheshift = pm.Normal("cheshift", mu=μ, sigma=σ[index], observed=ca_exp)
        idata = pm.sample(samples, tune=2000, random_seed=18759,
                          target_accept=0.9, return_inferencedata=True)
        pps = pm.sample_posterior_predictive(
            idata,
            samples=samples * idata.posterior.dims["chain"],
            random_seed=18759)

    idata.add_groups(
        {"posterior_predictive": {"cheshift": pps["cheshift"][None, :, :]}})

    if is_reference_run:
        az.to_netcdf(idata,
                     os.path.join("data", "trace_reference_structures.nc"))
    return dataframe, idata
def sample_model(model, data, outprefix=None, **kwargs):
    """
    Sample a Stan model and write the posterior samples to a file.

    :param model: Stan model to sample from
    :param data: data to pass to the model
    :param outprefix: prefix for the output file; samples are written to
        '<outprefix>fit.hdf5' (a NetCDF file, which is HDF5-based)

    All other keyword arguments are passed to model.sampling(). The result
    is cached.
    """
    fit = _sample_model(model, data, **kwargs)
    if outprefix is not None:
        arviz.to_netcdf(fit, outprefix + 'fit.hdf5')
    return fit
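# A hypothetical call (sketch only: `model` and `data` stand in for a compiled
# Stan model and its data dict, and `_sample_model` is the cached wrapper
# assumed above; the extra kwargs are forwarded to model.sampling()):
#
#   fit = sample_model(model, data, outprefix='run1_', iter=2000, chains=4)
#   idata = arviz.from_netcdf('run1_fit.hdf5')  # read the samples back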
def main(model_label, session='7t2', bids_folder='/data'):
    if model_label not in ['model1', 'certainty', 'certainty_full']:
        raise NotImplementedError(f'Not implemented {model_label}')

    df = get_all_behavior(sessions=session, bids_folder=bids_folder)
    print(df)

    if model_label == 'model1':
        model = EvidenceModel(df)

    if model_label.startswith('certainty'):
        from scipy.stats import zscore
        df['z_certainty'] = df.groupby(['subject']).certainty.apply(zscore)
        df['z_certainty'] = df['z_certainty'].fillna(0.0)

        if model_label == 'certainty':
            model = EvidenceModelRegression(df, regressors={
                'evidence_sd1': 'z_certainty',
                'evidence_sd2': 'z_certainty'})
        elif model_label == 'certainty_full':
            model = EvidenceModelRegression(df, regressors={
                'evidence_sd1': 'z_certainty',
                'evidence_sd2': 'z_certainty',
                'risky_prior_mu': 'z_certainty',
                'risky_prior_sd': 'z_certainty'})

    model.build_model()
    trace = model.sample(500, 500)

    target_folder = op.join(bids_folder, 'derivatives', 'evidence_models')
    if not op.exists(target_folder):
        os.makedirs(target_folder)

    az.to_netcdf(
        trace,
        op.join(target_folder, f'evidence_ses-{session}_model-{model_label}.nc'))
def main(model_type, session='7t2', bids_folder='/data'):
    if model_type not in ['model1', 'model2']:
        raise NotImplementedError(f'Not implemented {model_type}')

    df = get_all_behavior(sessions=session, bids_folder=bids_folder)
    model = ProbitModel(df, model_type, bids_folder)
    model.build_model()
    trace = model.sample(500, 500)

    target_folder = op.join(bids_folder, 'derivatives', 'probit_models')
    if not op.exists(target_folder):
        os.makedirs(target_folder)

    az.to_netcdf(
        trace,
        op.join(target_folder, f'evidence_ses-{session}_model-{model_type}.nc'))
def bayes_multiple_detector(t, s, n):
    scala = 1000
    with pm.Model() as abrupt_model:
        sigma = pm.Normal('sigma', mu=0.02 * scala, sigma=0.015 * scala)
        # sigma = pm.Uniform('sigma', 5, 15)
        mu = pm.Uniform("mu1", -1.5 * scala, -1.4 * scala)
        tau = pm.DiscreteUniform("tau1", t.min(), t.max())
        for i in np.arange(2, n + 2):
            _mu = pm.Uniform("mu" + str(i), -1.6 * scala, -1.4 * scala)
            mu = T.switch(tau >= t, mu, _mu)
            if i < (n + 1):
                tau = pm.DiscreteUniform("tau" + str(i), tau, t.max())
        s_obs = pm.Normal("s_obs", mu=mu, sigma=sigma, observed=s)

    with abrupt_model:
        pm.find_MAP()
        trace = pm.sample(20000, tune=5000)

    az.plot_trace(trace)
    az.to_netcdf(trace, getpath('tracepath') + 'bd9_4')
    plt.show()
    pm.summary(trace)
    return trace
def run(i, bin_list, runname, niter, nchain, adapt_delta, max_treedepth,
        verbose, save_chains, save_plots):

    idx = bin_list[i]
    stridx = str(idx)
    misc.printRUNNING(runname + " - Bin: " + stridx)

    try:
        # Checking the desired bin exists
        input_file = "../results/" + runname + "/" + runname + "_results.hdf5"
        struct = h5py.File(input_file, 'r+')
        check_bin = struct.get('out/' + stridx)
        if check_bin is None:
            misc.printFAILED("Bin " + stridx + " does not exist in " + input_file)
            return 'ERROR'

        # Defining the version of the code to use
        codefile = 'stan_model/bayes-losvd_ghfit.stan'
        if not os.path.exists(codefile):
            misc.printFAILED(codefile + " does not exist.")
            sys.exit()

        # Defining output names and directories
        outdir = "../results/" + runname
        pdf_filename = outdir + "/" + runname + "_gh_diagnostics_bin" + stridx + ".pdf"
        summary_filename = outdir + "/" + runname + "_gh_Stan_summary_bin" + stridx + ".txt"
        arviz_filename = outdir + "/" + runname + "_gh_chains_bin" + stridx + ".netcdf"
        sample_filename = outdir + "/" + runname + "_gh_progress_bin" + stridx + ".csv"
        outhdf5 = outdir + "/" + runname + "_gh_results_bin" + stridx + ".hdf5"

        # Creating the structure with the data for Stan
        # -------
        # NOTE: losvd_obs, sigma_losvd is what goes into the GH fit
        #       losvd is the processed output of bayes_losvd_run.py
        #       losvd_obs = losvd[2,:]
        #       sigma_losvd is an averaged version of the true 1sigma
        #       uncertainties from the bayes_losvd_run.py fit
        # -------
        losvd = struct['out/' + stridx + '/losvd'][2, :]
        sigma = np.zeros((len(losvd), 2))
        sigma[:, 0] = np.fabs(struct['out/' + stridx + '/losvd'][1, :] - losvd)
        sigma[:, 1] = np.fabs(struct['out/' + stridx + '/losvd'][3, :] - losvd)
        sigma_losvd = np.mean(sigma, axis=1)

        data = {'nvel': struct['in/nvel'],
                'xvel': struct['in/xvel'],
                'losvd_obs': losvd,
                'sigma_losvd': sigma_losvd}

        # Creating a temporary file adding the input data to the input HDF5 file info
        temp = tempfile.NamedTemporaryFile()
        struct2 = h5py.File(temp.name, 'w')
        struct.copy('in', struct2)
        struct2.create_dataset("out/" + stridx + "/losvd",
                               data=np.array(struct['out/' + stridx + '/losvd']),
                               compression="gzip")

        # Running the model
        with open(codefile, 'r') as myfile:
            code = myfile.read()
        model = stan_cache(model_code=code, codefile=codefile)
        fit = model.sampling(data=data, iter=niter, chains=nchain,
                             control={'adapt_delta': adapt_delta,
                                      'max_treedepth': max_treedepth},
                             sample_file=sample_filename,
                             check_hmc_diagnostics=True)
        samples = fit.extract(permuted=True)
        diag_pars = fit.get_sampler_params()

        # If requested, saving sample chains
        if save_chains:
            print("")
            print("# Saving chains in Arviz (NETCDF) format: " + arviz_filename)
            arviz_data = az.from_pystan(posterior=fit)
            az.to_netcdf(arviz_data, arviz_filename)

        # Saving Stan's summary of main parameters on disk
        print("")
        print("# Saving Stan summary: " + summary_filename)
        unwanted = {'losvd_mod'}
        misc.save_stan_summary(fit, unwanted=unwanted, verbose=verbose,
                               summary_filename=summary_filename)

        # Processing output and saving results
        print("")
        print("# Processing and saving results: " + outhdf5)
        misc.process_stan_output_hdp(struct2, samples, outhdf5, stridx)

        # Creating diagnostic plots
        if save_plots:
            if os.path.exists(pdf_filename):
                os.remove(pdf_filename)
            print("")
            print("# Saving diagnostic plots: " + pdf_filename)
            create_diagnostic_plots(idx, pdf_filename, fit, diag_pars,
                                    niter, nchain)

        # Removing progress files
        print("")
        print("# Deleting progress files")
        misc.delete_files(sample_filename, 'csv')
        misc.delete_files(sample_filename, 'png')

        # If we are here, we are DONE!
        struct.close()
        struct2.close()
        misc.printDONE(runname + " - Bin: " + stridx)

        return 'OK'

    except Exception:
        misc.printFAILED()
        traceback.print_exc()
        return 'ERROR'
y_op = Deterministic('y_op', r * (1 / mu - 1))

# phi
phi = Deterministic(
    'phi',
    (s * mu + n_sim * r) / (s + n_sim * r + n_sim * y_mean))  # , shape=n_sample)

# Define likelihood
likelihood = NegativeBinomial("y",
                              alpha=r,
                              mu=r * (1 / phi - 1),
                              observed=y_shared)  # attention

# Inference!
idata = sample(1000, cores=4, progressbar=True, chains=4, tune=2000,
               return_inferencedata=False)
az.to_netcdf(idata, filename)
print(az.summary(idata, var_names=['r', 'gam', 's']))

'''
with model:
    idata = az.from_netcdf(filename)
    # az.plot_trace(idata, var_names=['r','gam','s','beta0'])
    print(az.summary(idata, var_names=['r', 'gam', 's']))
    # print('')
'''
def plot_reference_densities(residue_list, text_size=12, figsize=None, save=False):
    """Plot the reference densities of CS differences for high quality protein structures."""
    remainder = len(residue_list) % 3
    if remainder == 0:
        plot_length = len(residue_list) // 3
    else:
        plot_length = len(residue_list) // 3 + 1

    if not figsize:
        figsize = (13, plot_length * 2)

    _, ax = plt.subplots(
        plot_length,
        3,
        figsize=figsize,
        sharex=False,
        sharey=True,
        constrained_layout=True,
    )
    ax = ax.ravel()

    if os.path.isfile(os.path.join("data", "dataframe_reference_structures.csv")):
        dataframe_all = pd.read_csv(
            os.path.join("data", "dataframe_reference_structures.csv"))
    else:
        # hierarchical_reg_reference already returns an InferenceData
        dataframe_all, trace_all = hierarchical_reg_reference()
        az.to_netcdf(trace_all,
                     os.path.join("data", "trace_reference_structures.nc"))
        dataframe_all.to_csv(
            os.path.join("data", "dataframe_reference_structures.csv"))

    categories_all = pd.Categorical(dataframe_all["res"])
    index_all = categories_all.codes
    perct_dict = {}
    if "CYS" in residue_list:
        dataframe_all = dataframe_all[dataframe_all.res != "CYS"]

    for i, residue in enumerate(residue_list):
        ca_teo = dataframe_all[dataframe_all.res == residue].y_pred.values
        ca_exp = dataframe_all[dataframe_all.res == residue].ca_exp.values
        difference_dist = ca_teo - ca_exp
        _, density = az.stats.density_utils.kde(difference_dist)
        x0, x1 = np.min(difference_dist), np.max(difference_dist)
        x_range = np.linspace(x0, x1, len(density))
        perct = np.percentile(difference_dist, [0, 5, 20, 80, 95, 100])
        perct_dict[residue] = perct
        idx0 = 0
        for index, p in enumerate(perct):
            ax[i].tick_params(labelsize=16)
            idx1 = np.argsort(np.abs(x_range - p))[0]
            ax[i].fill_between(
                x_range[idx0:idx1],
                density[idx0:idx1],
                color="C0",
                zorder=0,
                alpha=0.3,
            )
            idx0 = idx1
        ax[i].set_title(residue, fontsize=text_size)

    [
        ax[idy].spines[position].set_visible(False)
        for position in ["left", "top", "right"]
        for idy in range(len(ax))
    ]
    [ax_.set_yticks([]) for ax_ in ax]
    [ax_.set_xlim(-6, 6) for ax_ in ax]
    for i in range(1, len(ax) - len(residue_list) + 1):
        ax[-i].axis("off")
    if save:
        plt.savefig("reference.png", dpi=300, transparent=True)
    return _, ax, perct_dict
["left palm", "right palm"])], "site", baseline_index=None) for n in range(n_chains): result_temp = model_palms.sample_hmc(num_results=int(20000), n_burnin=5000) results.append(result_temp) #%% res_all = az.concat(results, dim="chain") print(res_all.posterior) #%% az.to_netcdf(res_all, write_path + "/multi_chain_50_len20000_all") #%% acc_probs = pd.DataFrame( pd.concat([r.effect_df.loc[:, "Inclusion probability"] for r in results])) acc_probs["chain_no"] = np.concatenate( [np.repeat(i + 1, 21) for i in range(n_chains)]) acc_probs.index = acc_probs.index.droplevel(0) acc_probs = acc_probs.reset_index() print(acc_probs)
     'gamma': gamma,
     'xs': randn(nobs, 3)}

f = m.sampling(data=d, iter=2 * args.iter, thin=args.thin, init=init)
fit = az.convert_to_inference_data(f)

print(f)

# Now that we're done with sampling, let's draw some pretty lines.
lines = (('H0', {}, true_params['H0']),
         ('Om', {}, true_params['Om']),
         ('w0', {}, true_params['w']),
         ('R0_30', {}, true_params['R0_30']),
         ('MMin', {}, true_params['MMin']),
         ('MMax', {}, true_params['MMax']),
         ('smooth_min', {}, true_params['smooth_min']),
         ('smooth_max', {}, true_params['smooth_max']),
         ('alpha', {}, true_params['alpha']),
         ('beta', {}, true_params['beta']),
         ('gamma', {}, true_params['gamma']),
         ('neff_det', {}, 4 * nobs))

az.plot_trace(fit,
              var_names=['H0', 'Om', 'w0', 'R0_30', 'MMax', 'smooth_max',
                         'alpha', 'beta', 'gamma', 'neff_det'],
              lines=lines)
savefig(args.tracefile)

az.to_netcdf(fit, args.chainfile)
                  f0, fdot, fddot, sigma, hbin,
                  log(args.Amin), log(args.Amax),
                  N, start_pt=start_pt)

rstate = np.random.get_state()

with model:
    trace = pm.sample(draws=args.draws,
                      tune=n_tune,
                      chains=args.chains,
                      cores=args.cores,
                      step=pm.NUTS(
                          potential=QuadPotentialFullAdapt(
                              model.ndim, zeros(model.ndim)),
                          target_accept=args.target_accept),
                      start=start_pt,
                      init=init)

fit = az.from_pymc3(trace)

# write atomically: save to a temporary file, then rename over the target
ofile = args.outfile + '.tmp'
if op.exists(ofile):
    os.remove(ofile)
az.to_netcdf(fit, ofile)
os.rename(ofile, args.outfile)
                  verbose=True,
                  iter=2000,
                  chains=4,
                  n_jobs=-1,
                  sample_file='stanwound_sample_file.csv',
                  init='random',
                  init_r=0.1)

print(fit.stansummary(pars=('phif', 'phif_sigma', 'b', 'mu',
                            'von_mises_prob_sigma', 'kv', 'k0', 'kf', 'k2',
                            'stress_sigma')))

data = az.from_pystan(posterior=fit)
az.to_netcdf(data, 'save_arviz_data_stanwound')

with open('stanwound_model_pickle.pkl', 'wb') as f:
    pickle.dump(multilevel_model, f, protocol=pickle.HIGHEST_PROTOCOL)

# pandas
dataframe = fit.to_dataframe(pars=('kv', 'k0', 'kf', 'k2', 'b', 'mu',
                                   'phif', 'phif_scaled'),
                             permuted=True)
dataframe.to_csv('stanwound_fit_permuted.csv')

predictive_dataframe = fit.to_dataframe(pars=(
    'stress_mean_predicted_phif_4',
    'stress_predicted_phif_4',
    'stress_mean_predicted_phif_20',
    'stress_predicted_phif_20',
with model:
    check = pm.sample_prior_predictive(samples=3)

plt.plot(x.flatten(), y, label="data", color='red')
for i in range(check['pr'].shape[0]):
    plt.plot(x.flatten(), check['pr'][i], alpha=0.3)
plt.legend()
plt.savefig("experiments/plots/gp_time_prior.png", format='png')
plt.show()

with model:
    y_ = pm.Normal("y", mu=pr, sigma=sigma, observed=y)

with model:
    mp = pm.find_MAP(maxeval=300)
    trace = pm.sample(
        200,
        n_init=200,
        tune=100,
        chains=2,
        cores=2,
        return_inferencedata=True,
    )

arviz.to_netcdf(trace, 'experiments/results/gp_time_trace')

n_nonconverged = int(
    np.sum(arviz.rhat(trace)[["sigma", "pr_rotated_"]].to_array() > 1.03).values)
print("%i variables' MCMC chains appear not to have converged." % n_nonconverged)
def fit_multilevel_model(data_df: pd.DataFrame, op_dir: pl.Path, n_trials: int = 200):
    """
    Constructs the model from Gillan et al. eLife 2016;5:e11305.
    DOI: 10.7554/eLife.11305, pg. 19-20, using implementational details from
    the supplementary information of Otto et al. PNAS 2013;
    DOI: 10.1073/pnas.1312011110. Runs MAP estimation and NUTS sampling.
    Results are written into <op_dir>. Output of NUTS sampling is stored in a
    NetCDF file that can be read and analyzed using the package `arviz`.

    :param pandas.DataFrame data_df: concatenation of DataFrames returned by
        input_related.read_single_data_file
    :param pathlib.Path op_dir: path of output folder; must exist
    :param int n_trials: number of trials to fit
    :return: None
    """
    unique_subjects = pd.unique(data_df["subject_id"])
    n_subjects = unique_subjects.shape[0]
    data_df.sort_values(by=["subject_id", "trial_number"], inplace=True)
    print(f"Using data of {n_subjects} subjects: {unique_subjects.tolist()}")

    with pm.Model() as multilevel_model:
        # Priors for group-level params
        mu_alpha = pm.Uniform(name='mu_alpha', lower=0, upper=1, transform=None)
        sigma_alpha_log = pm.Exponential(name="sigma_alpha_log", lam=1.5,
                                         transform=None)
        sigma_alpha = pm.Deterministic(name='sigma_alpha',
                                       var=pm.math.exp(sigma_alpha_log))
        mu_beta = pm.Normal(name="mu_beta", mu=0, sigma=100)
        sigma_beta_log = pm.Exponential(name="sigma_beta_log", lam=1.5,
                                        transform=None)
        sigma_beta = pm.Deterministic(name='sigma_beta',
                                      var=pm.math.exp(sigma_beta_log))

        for subject_ind, (subject_id, subject_df) in enumerate(
                data_df.groupby("subject_id")):
            alpha = pm.Beta(name=f'alpha_{subject_id}',
                            alpha=mu_alpha * sigma_alpha,
                            beta=(1 - mu_alpha) * sigma_alpha)
            beta = pm.Normal(name=f"beta_{subject_id}", mu=mu_beta,
                             sigma=sigma_beta, shape=5)
            beta2, beta_mb, beta_mf0, beta_mf1, beta_st = (
                beta[0], beta[1], beta[2], beta[3], beta[4])

            print(f"{datetime.datetime.now()} Doing {subject_id}")

            choice1_repeated = subject_df["choice1"].astype(
                bool).diff().fillna(False).astype(int)
            data_df.loc[subject_df.index.values,
                        "choice1_repeated"] = choice1_repeated

            # Value function: first dimension state2, second dimension choice2
            q2_arr = np.zeros((2, 2), dtype=object)
            # dimension is choice1
            qmb_arr = np.zeros((2,), dtype=object)
            qmf0_arr = np.zeros((2,), dtype=object)
            qmf1_arr = np.zeros((2,), dtype=object)

            for trial_number, trial_df in subject_df.groupby("trial_number"):
                if trial_number > n_trials:
                    continue
                subject_trial_row = trial_df.iloc[0]
                st = subject_trial_row["state2"]
                c2t = subject_trial_row["choice2"]
                c1t = subject_trial_row["choice1"]
                rt = subject_trial_row["reward"]
                repeated_choice = choice1_repeated[trial_df.index.values[0]]

                p1 = logisitic(beta2 * get_value_diff_0m1(q2_arr[st, :]))
                # observations
                c2t_rv = pm.Bernoulli(name=f'c2_{trial_number}_{subject_ind}',
                                      observed=c2t, p=p1)
                q2_arr[st, c2t] = (1 - alpha) * q2_arr[st, c2t] + rt

                p1 = logisitic(beta_mb * get_value_diff_0m1(qmb_arr) +
                               beta_mf0 * get_value_diff_0m1(qmf0_arr) +
                               beta_mf1 * get_value_diff_0m1(qmf1_arr) +
                               beta_st * repeated_choice)
                c1t_rv = pm.Bernoulli(name=f'c1_{trial_number}_{subject_ind}',
                                      observed=c1t, p=p1)

                qmb_arr[0] = pm.math.maximum(q2_arr[0, 0], q2_arr[0, 1])
                qmb_arr[1] = pm.math.maximum(q2_arr[1, 0], q2_arr[1, 1])
                qmf0_arr[c1t] = (1 - alpha) * qmf0_arr[c1t] + q2_arr[st, c2t]
                qmf1_arr[c1t] = (1 - alpha) * qmf1_arr[c1t] + rt

            gc.collect()

    start = {"mu_alpha": 0.5, "sigma_alpha_log": 1,
             "mu_beta": 0.5, "sigma_beta_log": 1}

    print(f"{datetime.datetime.now()} MAP estimation started")
    map_estimate = pm.find_MAP(model=multilevel_model, progressbar=False,
                               start=start)
    print(f"{datetime.datetime.now()} MAP estimation done")
    print(map_estimate)
    with open(
            op_dir /
            f"map_estimate_multilevel_{n_subjects}subjects_{n_trials}trials.json",
            "w") as fp:
        json.dump({k: v.tolist() for k, v in map_estimate.items()}, fp)

    print(f"{datetime.datetime.now()} sampling started")
    trace = pm.sample(draws=2000, tune=1000, return_inferencedata=False,
                      compute_convergence_checks=True, progressbar=False,
                      cores=4, start=start, model=multilevel_model)
    print(f"{datetime.datetime.now()} sampling done")
    print(arviz.summary(trace, round_to=2))
    arviz.to_netcdf(
        data=trace,
        filename=op_dir /
        f"sampling_results_multilevel_{n_subjects}subjects_{n_trials}trials.nc")
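# The docstring above notes that the NUTS output can be re-read with arviz.
# A minimal sketch of that downstream step (the filename mirrors the pattern
# used above; the subject/trial counts here are placeholders for your run):
import arviz
idata = arviz.from_netcdf(
    "sampling_results_multilevel_10subjects_200trials.nc")
print(arviz.summary(idata, var_names=["mu_alpha", "mu_beta"], round_to=2))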
data_file = 'DATA/processed/dataset.csv'
x, y, x_new = get_data(data_file)

with pm.Model() as model:
    mt = pm.gp.cov.Matern32(2, ls=0.1)
    gp = pm.gp.Latent(cov_func=mt)
    pr = gp.prior('pr', X=x)
    sigma = pm.HalfNormal('sigma', sigma=2)
    f_star = gp.conditional("f_star", x_new)

with model:
    check = pm.sample_prior_predictive(samples=1)

with model:
    y_ = pm.Normal("y", mu=pr, sigma=sigma, observed=y)

with model:
    trace = pm.sample(200, n_init=100, tune=100, chains=2, cores=2,
                      return_inferencedata=True)

arviz.to_netcdf(trace, 'experiments/results/gp_spatial_trace')

f, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 8))
ax1.hexbin(x_new[:, 0], x_new[:, 1],
           C=trace.posterior['f_star'][0, :, :].mean(axis=0),
           gridsize=30, cmap='rainbow')
plot = ax1.scatter(x[:, 0], x[:, 1], c=y, s=300, cmap='rainbow')
ax2.scatter(x[:, 0], x[:, 1], c=check['pr'], s=300, cmap='rainbow')
ax1.set_title("Data + Posterior")
ax2.set_title("Prior")
plt.colorbar(plot)
plt.show()
def save_posterior(self, filename: str) -> None:
    posterior = self.posterior
    arviz.to_netcdf(posterior, filename)
    logger.info(f"Saved posterior samples to {filename}.")
def main():
    parser = argparse.ArgumentParser(
        description='Combine the individual posteriors for each S value.')
    parser.add_argument('datafile', type=str,
                        help='path to csv containing beta values')
    parser.add_argument('patientinfofile', type=str,
                        help='path to csv containing patientinfo')
    parser.add_argument('outputdir', type=str, default='~',
                        help='path to folder in which to store output')
    parser.add_argument('sample', type=str,
                        help='samplename of beta array (must be a column in '
                             'datafile and an index in patientinfo)')

    args = parser.parse_args()
    datafile = args.datafile
    patientinfofile = args.patientinfofile
    outputdir = args.outputdir
    sample = args.sample

    outsamplesdir = os.path.join(outputdir, sample, 'posterior')
    outfinaldir = os.path.join(outputdir, sample, 'outfinal')
    os.makedirs(outfinaldir, exist_ok=True)

    beta_values = pd.read_csv(datafile, index_col=0)
    patientinfo = pd.read_csv(patientinfofile, keep_default_na=False,
                              index_col=0)
    beta = beta_values[sample].dropna().values
    age = patientinfo.loc[sample, 'age']

    outsampleslist = glob.glob(os.path.join(outsamplesdir, 'sample_*.pkl'))
    S = list()
    results = dict()
    for outsamples in outsampleslist:
        s = int(outsamples.split('.pkl')[0].split('_')[-1])
        try:
            with open(outsamples, 'rb') as f:
                res = joblib.load(f)
            results[s] = res
            S.append(s)
            print(s)
        except EOFError:
            print('sample_{}.pkl is not a correctly formatted pickle file'.format(s))
    S.sort()

    n = len(beta)
    logZs = np.empty(len(S))
    logZerrs = np.empty(len(S))
    Nsamples = np.empty(len(S), dtype=int)
    for index, s in enumerate(S):
        try:
            logZs[index] = results[s].logz[-1]
            logZerrs[index] = results[s].logzerr[-1]
            Nsamples[index] = results[s].niter
        except AttributeError:
            # some runs store plain dicts instead of dynesty result objects
            logZs[index] = results[s]['logz']
            logZerrs[index] = results[s]['logzerr']
            Nsamples[index] = results[s]['niter']

    logZs_bootstrap = np.random.normal(loc=logZs, scale=logZerrs,
                                       size=(10000, len(logZs)))
    prob_s = softmax(logZs)
    prob_s_bootstrap = softmax(logZs_bootstrap, axis=1)
    prob_s_err = np.std(prob_s_bootstrap, axis=0)

    print('\nS:P(S)')
    for i, s in enumerate(S):
        print('{}:{:.3e}'.format(s, prob_s[i]))

    df = pd.DataFrame({'S': S, 'prob': prob_s, 'prob_err': prob_s_err})
    df['S'] = df.S.astype(int)
    df.to_csv(os.path.join(outfinaldir, "prob_of_S.csv"), index=False)

    sns.set_style('white')
    sns.set_context("paper", font_scale=1.6)
    fig, ax = plt.subplots()
    ax.bar(S, prob_s, yerr=prob_s_err, color=sns.xkcd_rgb["denim blue"])
    sns.despine()
    plt.xlabel("Stem Cell Number (S)")
    plt.ylabel("Probability")
    plt.tight_layout()
    plt.savefig(os.path.join(outfinaldir, "probability_S.png"), dpi=300)
    plt.close()

    Ndraws = 3000
    Ssamples = np.random.choice(S, size=Ndraws, p=prob_s)
    final_posterior = np.empty((Ndraws, 8))
    final_posterior[:, -1] = Ssamples
    beta_hat = np.empty((1, Ndraws, n))
    LL = np.empty((1, Ndraws, n))

    progress_ints = (np.arange(0.1, 1.1, 0.1) * Ndraws - 1).astype(int)
    counter = 10
    for i in range(Ndraws):
        if i in progress_ints:
            print('{}% complete'.format(counter))
            counter += 10
        s = Ssamples[i]
        try:
            posterior = dynesty.utils.resample_equal(
                results[s].samples, softmax(results[s].logwt))
        except AttributeError:
            posterior = results[s]['samples']
        random_row = np.random.randint(posterior.shape[0])
        final_posterior[i, :7] = posterior[random_row, :7]
        lamsample, musample, gammasample, deltasample, etasample = \
            final_posterior[i, :5]
        kappasample = posterior[random_row, 7:]
        LL[0, i, :] = flipflop.loglikelihood_perpoint(
            posterior[random_row, :], beta, s, age)
        ProbDist = flipflop.runModel(s, lamsample, musample, gammasample, age)
        k_sample = np.random.choice(np.arange(0, 2 * s + 1), size=n, p=ProbDist)
        beta_sample = k_sample / (2 * s)
        beta_sample = flipflop.rescale_beta(beta_sample, deltasample, etasample)
        beta_hat[0, i, :] = flipflop.beta_rvs(beta_sample, kappasample[k_sample])

    with open(os.path.join(outfinaldir, "finalposterior.pkl"), 'wb') as f:
        joblib.dump(final_posterior, f)

    df = pd.DataFrame({'lam': final_posterior[:, 0],
                       'mu': final_posterior[:, 1],
                       'gamma': final_posterior[:, 2],
                       'delta': final_posterior[:, 3],
                       'eta': final_posterior[:, 4],
                       'kappamean': final_posterior[:, 5],
                       'kappadisp': final_posterior[:, 6],
                       'S': Ssamples})
    df.to_csv(os.path.join(outfinaldir, "finalposterior.csv"), index=False)

    fig, ax = plt.subplots()
    plt.hist(beta, np.linspace(0, 1, 100), density=True, alpha=0.4, linewidth=0)
    plt.hist(np.ravel(beta_hat), np.linspace(0, 1, 100), density=True,
             alpha=0.4, linewidth=0)
    plt.legend(("Data", "Posterior predictive"))
    plt.xlabel("Fraction Methylated (Beta)")
    plt.ylabel("Probability density")
    sns.despine()
    plt.tight_layout()
    plt.savefig("{}/posterior_predictive.png".format(outfinaldir), dpi=300)
    plt.close()

    inference = az.from_dict(
        posterior={'lam': final_posterior[:, 0],
                   'mu': final_posterior[:, 1],
                   'gamma': final_posterior[:, 2],
                   'delta': final_posterior[:, 3],
                   'eta': final_posterior[:, 4],
                   'kappamean': final_posterior[:, 5],
                   'kappadisp': final_posterior[:, 6],
                   'S': Ssamples},
        observed_data={'beta': beta},
        posterior_predictive={'beta_hat': beta_hat},
        sample_stats={"log_likelihood": LL})
    az.to_netcdf(inference, "{}/inference.nc".format(outfinaldir))

    pairs = az.plot_pair(inference,
                         var_names=('lam', 'mu', 'gamma', 'delta', 'eta',
                                    'kappamean', 'kappadisp'))
    plt.savefig('{}/plot_pairs.png'.format(outfinaldir), dpi=300)
    plt.close()

    az.plot_loo_pit(inference, y='beta', y_hat='beta_hat', ecdf=True)
    plt.savefig('{}/plot_loo_pit_ecdf.png'.format(outfinaldir), dpi=300)
    plt.close()

    sns.set_context("paper", font_scale=1.0)
    az.plot_loo_pit(inference, y='beta', y_hat='beta_hat')
    plt.ylabel('Leave One Out - Probability Integral Transform')
    plt.xlabel('Cumulative Density Function')
    plt.savefig('{}/plot_loo_pit.png'.format(outfinaldir), dpi=300)
    plt.close()
with pm.Model() as model:
    l_ = pm.Gamma("l", alpha=2, beta=1)
    eta = pm.HalfCauchy("eta", beta=1)

    cov = eta ** 2 * pm.gp.cov.Matern52(1, l_)
    gp = pm.gp.Latent(cov_func=cov)
    f = gp.prior("f", X=X)

    sigma = pm.HalfCauchy("sigma", beta=5)
    nu = pm.Gamma("nu", alpha=2, beta=0.1)
    y_ = pm.StudentT("y", mu=f, lam=1.0 / sigma, nu=nu, observed=y)

    trace = pm.sample(200, n_init=100, tune=100, chains=2, cores=2,
                      return_inferencedata=True)

az.to_netcdf(trace, 'src/experiments/results/lat_gp_trace')

# check Rhat, values above 1 may indicate convergence issues
n_nonconverged = int(
    np.sum(az.rhat(trace)[["eta", "l", "f_rotated_"]].to_array() > 1.03).values)
print("%i variables' MCMC chains appear not to have converged." % n_nonconverged)

# plot the results
fig = plt.figure(figsize=(12, 5))
ax = fig.gca()

# plot the samples from the gp posterior with samples and shading
from pymc3.gp.util import plot_gp_dist
plot_gp_dist(ax, trace.posterior["f"][0, :, :], X)

# plot the data and the true latent function
def run_model(
    model_func,
    data,
    ep,
    num_samples=500,
    num_warmup=500,
    num_chains=4,
    target_accept=0.75,
    max_tree_depth=15,
    save_results=True,
    output_fname=None,
    model_kwargs=None,
    save_json=False,
    chain_method="parallel",
    heuristic_step_size=True,
):
    """
    Model run utility.

    :param model_func: numpyro model
    :param data: PreprocessedData object
    :param ep: EpidemiologicalParameters object
    :param num_samples: number of samples
    :param num_warmup: number of warmup samples
    :param num_chains: number of chains
    :param target_accept: target acceptance probability
    :param max_tree_depth: maximum tree depth
    :param save_results: whether to save full results
    :param output_fname: output filename
    :param model_kwargs: extra keyword arguments for the model function
    :param save_json: whether to save json
    :param chain_method: Numpyro chain method to use
    :param heuristic_step_size: whether to find a heuristic step size
    :return: posterior_samples, warmup_samples, info_dict (dict with assorted
        diagnostics), Numpyro mcmc object
    """
    print(
        f"Running {num_chains} chains, {num_samples} per chain with "
        f"{num_warmup} warmup steps")

    nuts_kernel = NUTS(
        model_func,
        init_strategy=init_to_median,
        target_accept_prob=target_accept,
        max_tree_depth=max_tree_depth,
        find_heuristic_step_size=heuristic_step_size,
    )
    mcmc = MCMC(
        nuts_kernel,
        num_samples=num_samples,
        num_warmup=num_warmup,
        num_chains=num_chains,
        chain_method=chain_method,
    )
    rng_key = random.PRNGKey(0)

    # hmcstate = nuts_kernel.init(rng_key, 1, model_args=(data, ep))
    # nRVs = hmcstate.adapt_state.inverse_mass_matrix.size
    # inverse_mass_matrix = init_diag_inv_mass_mat * jnp.ones(nRVs)
    # mass_matrix_sqrt_inv = np.sqrt(inverse_mass_matrix)
    # mass_matrix_sqrt = 1. / mass_matrix_sqrt_inv
    # hmcstate = hmcstate._replace(adapt_state=hmcstate.adapt_state._replace(inverse_mass_matrix=inverse_mass_matrix))
    # hmcstate = hmcstate._replace(adapt_state=hmcstate.adapt_state._replace(mass_matrix_sqrt_inv=mass_matrix_sqrt_inv))
    # hmcstate = hmcstate._replace(adapt_state=hmcstate.adapt_state._replace(mass_matrix_sqrt=mass_matrix_sqrt))
    # mcmc.post_warmup_state = hmcstate

    info_dict = {
        "model_name": model_func.__name__,
    }

    start = time.time()

    if model_kwargs is None:
        model_kwargs = {}
    info_dict["model_kwargs"] = model_kwargs

    # also collect some extra information for better diagnostics!
    print(f"Warmup Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    mcmc.warmup(
        rng_key,
        data,
        ep,
        **model_kwargs,
        collect_warmup=True,
        extra_fields=["num_steps", "mean_accept_prob", "adapt_state"],
    )
    mcmc.get_extra_fields()["num_steps"].block_until_ready()

    info_dict["warmup"] = {}
    info_dict["warmup"]["num_steps"] = np.array(
        mcmc.get_extra_fields()["num_steps"]).tolist()
    info_dict["warmup"]["step_size"] = np.array(
        mcmc.get_extra_fields()["adapt_state"].step_size).tolist()
    info_dict["warmup"]["inverse_mass_matrix"] = {}

    all_mass_mats = jnp.array(
        jnp.array_split(
            mcmc.get_extra_fields()["adapt_state"].inverse_mass_matrix,
            num_chains,
            axis=0,
        ))
    print(all_mass_mats.shape)
    for i in range(num_chains):
        info_dict["warmup"]["inverse_mass_matrix"][
            f"chain_{i}"] = all_mass_mats[i, -1, :].tolist()

    info_dict["warmup"]["mean_accept_prob"] = np.array(
        mcmc.get_extra_fields()["mean_accept_prob"]).tolist()

    warmup_samples = mcmc.get_samples()

    print(f"Sample Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    mcmc.run(
        rng_key,
        data,
        ep,
        **model_kwargs,
        extra_fields=["num_steps", "mean_accept_prob", "adapt_state"],
    )
    posterior_samples = mcmc.get_samples()
    # if you don't block this, the timer won't quite work properly
    posterior_samples[list(posterior_samples.keys())[0]].block_until_ready()
    print(f"Sample Finished: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    end = time.time()
    time_per_sample = float(end - start) / num_samples
    divergences = int(mcmc.get_extra_fields()["diverging"].sum())

    info_dict["time_per_sample"] = time_per_sample
    info_dict["total_runtime"] = float(end - start)
    info_dict["divergences"] = divergences
    info_dict["sample"] = {}
    info_dict["sample"]["num_steps"] = np.array(
        mcmc.get_extra_fields()["num_steps"]).tolist()
    info_dict["sample"]["mean_accept_prob"] = np.array(
        mcmc.get_extra_fields()["mean_accept_prob"]).tolist()
    info_dict["sample"]["step_size"] = np.array(
        mcmc.get_extra_fields()["adapt_state"].step_size).tolist()

    print(f"Sampling {num_samples} samples per chain took {end - start:.2f}s")
    print(f"There were {divergences} divergences.")

    grouped_posterior_samples = mcmc.get_samples(True)

    all_ess = np.array([])
    for k in grouped_posterior_samples.keys():
        ess = numpyro.diagnostics.effective_sample_size(
            np.asarray(grouped_posterior_samples[k]))
        all_ess = np.append(all_ess, ess)
    print(f"{np.sum(np.isnan(all_ess))} ESS were nan")
    all_ess = all_ess[np.logical_not(np.isnan(all_ess))]

    info_dict["ess"] = {
        "med": float(np.percentile(all_ess, 50)),
        "lower": float(np.percentile(all_ess, 2.5)),
        "upper": float(np.percentile(all_ess, 97.5)),
        "min": float(np.min(all_ess)),
        "max": float(np.max(all_ess)),
    }
    print(
        f"Mean ESS: {info_dict['ess']['med']:.2f} "
        f"[{info_dict['ess']['lower']:.2f} ... {info_dict['ess']['upper']:.2f}]")

    if num_chains > 1:
        all_rhat = np.array([])
        for k in grouped_posterior_samples.keys():
            rhat = numpyro.diagnostics.gelman_rubin(
                np.asarray(grouped_posterior_samples[k]))
            all_rhat = np.append(all_rhat, rhat)
        print(f"{np.sum(np.isnan(all_rhat))} Rhat were nan")
        all_rhat = all_rhat[np.logical_not(np.isnan(all_rhat))]

        info_dict["rhat"] = {
            "med": float(np.percentile(all_rhat, 50)),
            "upper": float(np.percentile(all_rhat, 97.5)),
            "lower": float(np.percentile(all_rhat, 2.5)),
            "min": float(np.min(all_rhat)),
            "max": float(np.max(all_rhat)),
        }
        print(
            f"Rhat: {info_dict['rhat']['med']:.2f} "
            f"[{info_dict['rhat']['lower']:.2f} ... {info_dict['rhat']['upper']:.2f}]")

    if save_results:
        print("Saving .netcdf")
        try:
            inf_data = az.from_numpyro(mcmc)
            if output_fname is None:
                output_fname = (
                    f'{model_func.__name__}-'
                    f'{datetime.now(tz=None).strftime("%d-%m;%H-%M-%S")}.netcdf')
            az.to_netcdf(inf_data, output_fname)
            json_fname = output_fname.replace(".netcdf", ".json")
            if save_json:
                print("Saving Json")
                with open(json_fname, "w") as f:
                    json.dump(info_dict, f, ensure_ascii=False, indent=4)
        except Exception as e:
            print(e)

    return posterior_samples, warmup_samples, info_dict, mcmc
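# A hypothetical downstream read of what run_model saves (sketch only: the
# filenames are whatever run_model printed, or whatever was passed as
# output_fname with save_json=True):
import json
import arviz as az

inf_data = az.from_netcdf("my_model-01-01;12-00-00.netcdf")
print(az.summary(inf_data))
with open("my_model-01-01;12-00-00.json") as f:
    info = json.load(f)
print(info["divergences"], info["ess"]["med"])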
def run(i, bin_list, runname, niter, nchain, adapt_delta, max_treedepth,
        verbose=False, save_chains=False, save_plots=False, fit_type=None):

    idx = bin_list[i]
    stridx = str(idx)
    misc.printRUNNING(runname + " - Bin: " + stridx + " - Fit type: " + fit_type)

    try:
        # Defining the version of the code to use
        codefile, extrapars = misc.read_code(fit_type)

        # Defining output names and directories
        rootname = runname + "-" + fit_type
        outdir = "../results/" + rootname
        pdf_filename = outdir + "/" + rootname + "_diagnostics_bin" + stridx + ".pdf"
        summary_filename = outdir + "/" + rootname + "_Stan_summary_bin" + stridx + ".txt"
        arviz_filename = outdir + "/" + rootname + "_chains_bin" + stridx + ".netcdf"
        sample_filename = outdir + "/" + rootname + "_progress_bin" + stridx + ".csv"
        outhdf5 = outdir + "/" + rootname + "_results_bin" + stridx + ".hdf5"

        # Creating the basic structure with the data for Stan
        struct = h5py.File("../preproc_data/" + runname + ".hdf5", "r")
        data = {'npix_obs': np.array(struct['in/npix_obs']),
                'ntemp': np.array(struct['in/ntemp']),
                'nvel': np.array(struct['in/nvel']),
                'npix_temp': np.array(struct['in/npix_temp']),
                'mask': np.array(struct['in/mask']),
                'nmask': np.array(struct['in/nmask']),
                'porder': np.array(struct['in/porder']),
                'spec_obs': np.array(struct['in/spec_obs'][:, idx]),
                'sigma_obs': np.array(struct['in/sigma_obs'][:, idx]),
                'templates': np.array(struct['in/templates']),
                'mean_template': np.array(struct['in/mean_template']),
                'velscale': np.array(struct['in/velscale']),
                'xvel': np.array(struct['in/xvel'])}

        # Adding any extra parameter needed for that particular fit_type
        for key, val in extrapars.items():
            data[key] = val

        # Running the model
        with open(codefile, 'r') as myfile:
            code = myfile.read()
        model = stan_cache(model_code=code, codefile=codefile)
        fit = model.sampling(data=data, iter=niter, chains=nchain,
                             control={'adapt_delta': adapt_delta,
                                      'max_treedepth': max_treedepth},
                             sample_file=sample_filename,
                             check_hmc_diagnostics=True)
        samples = fit.extract(permuted=True)   # Extracting parameter samples
        diag_pars = fit.get_sampler_params()   # Getting sampler diagnostic params

        # If requested, saving sample chains
        if save_chains:
            print("")
            print("# Saving chains in Arviz (NETCDF) format: " + arviz_filename)
            arviz_data = az.from_pystan(
                posterior=fit,
                observed_data=['mask', 'spec_obs', 'sigma_obs'])
            az.to_netcdf(arviz_data, arviz_filename)

        # Saving Stan's summary of main parameters on disk
        print("")
        print("# Saving Stan summary: " + summary_filename)
        unwanted = {'spec', 'conv_spec', 'poly', 'bestfit', 'a', 'losvd_'}
        misc.save_stan_summary(fit, unwanted=unwanted, verbose=verbose,
                               summary_filename=summary_filename)

        # Processing output and saving results
        print("")
        print("# Processing and saving results: " + outhdf5)
        misc.process_stan_output_hdp(struct, samples, outhdf5, stridx)

        # Creating diagnostic plots
        if save_plots:
            if os.path.exists(pdf_filename):
                os.remove(pdf_filename)
            print("")
            print("# Saving diagnostic plots: " + pdf_filename)
            create_diagnostic_plots(idx, pdf_filename, fit, diag_pars,
                                    niter, nchain)

        # Removing progress files
        print("")
        print("# Deleting progress files")
        misc.delete_files(sample_filename, 'csv')
        misc.delete_files(sample_filename, 'png')

        # If we are here, we are DONE!
        struct.close()
        misc.printDONE(runname + " - Bin: " + stridx + " - Fit type: " + fit_type)

        return 'OK'

    except Exception:
        misc.printFAILED()
        traceback.print_exc()
        return 'ERROR'