def get_model_results_dict():
    """Fit the meta-analysis Stan models and collect their ArviZ results.

    The fixed effects (``fema``) and random effects (``rema``) models are
    each fitted for both ``lnVR`` and ``lnCVR``; the ``remr`` model is
    fitted for ``lnVR`` only.

    Returns
    -------
    dict
        Maps ``'<model>_<effect_statistic>'`` to the object returned by
        ``az.from_pystan`` for that fit.
    """
    df = get_model_input_df()

    # One row per Stan model:
    # (model name, effect statistics to fit, observed-data variable names,
    #  whether pystan.check_hmc_diagnostics is called explicitly afterwards)
    run_specs = (
        ('fema', ('lnVR', 'lnCVR'), ('Y',), False),
        ('rema', ('lnVR', 'lnCVR'), ('Y',), False),
        ('remr', ('lnVR',), ('Y_meas', 'X_meas'), True),
    )

    model_res_dict = {}
    for model, effect_statistics, observed_vars, explicit_check in run_specs:
        stan_file = os.path.join(stan_model_path, f'{model}.stan')
        stan_model = compile_model(stan_file, model_name=model)
        for effect_statistic in effect_statistics:
            data_dict = get_data_dict(df, effect_statistic)
            fit = stan_model.sampling(
                data=data_dict,
                iter=4000,
                warmup=1000,
                chains=3,
                control={'adapt_delta': 0.99},
                check_hmc_diagnostics=True,
                seed=1,
            )
            if explicit_check:
                pystan.check_hmc_diagnostics(fit)
            result_key = f'{model}_{effect_statistic}'
            model_res_dict[result_key] = az.from_pystan(
                posterior=fit,
                posterior_predictive=['Y_pred'],
                observed_data=list(observed_vars),
                log_likelihood='log_lik',
            )
    return model_res_dict
def fit_models(stan_data, stan_models, **hmc_args):
    """Fit each named pickled Stan model and compare the fits with ArviZ.

    Parameters
    ----------
    stan_data
        Passed as ``data=`` to every model's ``sampling`` call.
    stan_models
        Iterable of model names. Each model is unpickled from
        ``CODE_DIR + 'assignment_errors/<name>_model.stan'``.
    **hmc_args
        Extra keyword arguments forwarded to ``sampling``.

    Returns
    -------
    tuple
        ``(comps, fits)``: ``comps`` is the result of ``az.compare`` over
        the converted fits keyed by model name, ``fits`` is the list of raw
        fit objects in the order the models were given.
    """
    model_params = {
        'observed_data': 'y',
        'log_likelihood': {'y': 'log_lik'},
        'posterior_predictive': 'err_hat',
    }
    fits = []
    # The original pre-filled this dict by iterating over the function
    # object ``fit_models`` itself, which raises TypeError; every requested
    # model is inserted in the loop below, so start empty.
    az_fits = {}
    for fit_model in stan_models:
        print('Fitting %s model' % fit_model)
        model_file = CODE_DIR + 'assignment_errors/%s_model.stan' % fit_model
        # NOTE: unpickling executes arbitrary code from the file; only load
        # model files that come from a trusted location.
        with open(model_file, 'rb') as handle:
            model = pkl.load(handle)
        fit = model.sampling(data=stan_data, **hmc_args)
        fits.append(fit)
        az_fits[fit_model] = az.from_pystan(posterior=fit, **model_params)
    comps = az.compare(az_fits)
    return comps, fits
def get_varying_intercept_model_results():
    """Fit the ``varying_intercept_regression`` Stan model.

    ``lnSD`` is used as the measured outcome and ``lnMean`` as the measured
    predictor, with the square roots of their variance columns as standard
    deviations and ``scale_rank`` as the grouping index.

    Returns
    -------
    The object returned by ``az.from_pystan`` for the fit.
    """
    # read in Cipriani data
    df = get_model_input_df()

    stan_input = dict(
        N=df.shape[0],
        Y_meas=df['lnSD'].values,
        X_meas=df['lnMean'].values,
        SD_Y=np.sqrt(df['var_lnSD'].values),
        SD_X=np.sqrt(df['var_lnMean'].values),
        K=len(df.scale.unique()),
        scale_group=df.scale_rank.values,
    )

    stan_file = os.path.join(stan_model_path, 'varying_intercept_regression.stan')
    varying_intercept_stan_model = compile_model(
        stan_file, model_name='varying_intercept_regression')

    fit = varying_intercept_stan_model.sampling(
        data=stan_input,
        iter=4000,
        warmup=1000,
        chains=3,
        control={'adapt_delta': 0.99},
        check_hmc_diagnostics=True,
        seed=1,
    )
    pystan.check_hmc_diagnostics(fit)

    return az.from_pystan(
        posterior=fit,
        posterior_predictive=['Y_pred'],
        observed_data=['X_meas', 'Y_meas'],
        log_likelihood='log_lik',
    )
def print_fit(*args: TaskModel, ic: str = 'looic') -> pd.DataFrame:
    """Compare hbayesdm model fits by LOOIC or WAIC.

    Parameters
    ----------
    args
        Output instances of running hbayesdm model functions. Each one's
        ``model`` attribute is used as its key and its ``fit`` attribute is
        converted with ``az.from_pystan``.
    ic
        Information criterion, ``'looic'`` (default) or ``'waic'``.

    Returns
    -------
    pd.DataFrame
        The result of ``az.compare`` over the given models.

    Raises
    ------
    RuntimeError
        If ``ic`` is not one of the supported options.
    """
    ic_options = ('looic', 'waic')
    if ic not in ic_options:
        raise RuntimeError(
            'Information Criterion (ic) must be one of ' + repr(ic_options))

    dataset_dict = {}
    for model_data in args:
        dataset_dict[model_data.model] = az.from_pystan(
            model_data.fit, log_likelihood='log_lik')

    # ArviZ names the LOO criterion 'loo' rather than 'looic'.
    arviz_ic = {'looic': 'loo', 'waic': 'waic'}[ic]
    return az.compare(dataset_dict=dataset_dict, ic=arviz_ic)
def test_empty_parameter(self):
    """Convert a PyStan 2 fit whose model declares a zero-length vector.

    Checks that ``y``, ``x`` and ``z`` are present in the posterior group
    and ``diverging`` in the sample stats. Does nothing unless
    ``pystan_version()`` is 2.
    """
    if pystan_version() != 2:
        return

    from pystan import StanModel  # pylint: disable=import-error

    model_code = """ parameters { real y; vector[3] x; vector[0] a; vector[2] z; } model { y ~ normal(0,1); } """
    model = StanModel(model_code=model_code)
    fit = model.sampling(iter=10, chains=2, check_hmc_diagnostics=False)
    posterior = from_pystan(posterior=fit)

    expected_attrs = {
        "posterior": ["y", "x", "z"],
        "sample_stats": ["diverging"],
    }
    fails = check_multiple_attrs(expected_attrs, posterior)
    assert not fails
def get_waic_and_loo(fit):
    """Return the LOO and WAIC results for ``fit`` merged into one dict.

    The fit is converted with ``log_likelihood="llx"`` and both criteria
    are computed on the deviance scale. Where both results share a key,
    the WAIC entry wins.
    """
    idata = az.from_pystan(fit, log_likelihood="llx")
    loo_stats = dict(az.loo(idata, scale='deviance'))
    waic_stats = dict(az.waic(idata, scale='deviance'))
    return {**loo_stats, **waic_stats}
def test_index_order(self, data, eight_schools_params): """Test 0-indexed data.""" # Skip test if pystan not installed pystan = importorskip("pystan") # pylint: disable=import-error fit = data.model.sampling(data=eight_schools_params) if pystan.__version__ >= "2.18": # make 1-indexed to 0-indexed for holder in fit.sim["samples"]: new_chains = OrderedDict() for i, (key, values) in enumerate(holder.chains.items()): if "[" in key: name, *shape = key.replace("]", "").split("[") shape = [ str(int(item) - 1) for items in shape for item in items.split(",") ] key = name + "[{}]".format(",".join(shape)) new_chains[key] = np.full_like(values, fill_value=float(i)) setattr(holder, "chains", new_chains) fit.sim["fnames_oi"] = list(fit.sim["samples"][0].chains.keys()) idata = from_pystan(posterior=fit) assert idata is not None for j, fpar in enumerate(fit.sim["fnames_oi"]): if fpar == "lp__": continue par, *shape = fpar.replace("]", "").split("[") assert hasattr(idata.posterior, par) if shape: shape = [slice(None), slice(None)] + list(map(int, shape)) assert idata.posterior[par][tuple( shape)].values.mean() == float(j) else: assert idata.posterior[par].values.mean() == float(j)
def test_invalid_fit(self, data):
    """``from_pystan`` raises ``AttributeError`` for two unusable fits.

    The first fit is produced with ``test_grad=True``; the second is a
    regular fit whose ``sim["samples"]`` entry has been deleted. Does
    nothing unless ``pystan_version()`` is 2.
    """
    if pystan_version() != 2:
        return

    model = data.model
    model_data = {
        "J": 8,
        "y": np.array([28.0, 8.0, -3.0, 7.0, -1.0, 1.0, 18.0, 12.0]),
        "sigma": np.array([15.0, 10.0, 16.0, 11.0, 9.0, 11.0, 10.0, 18.0]),
    }

    grad_only_fit = model.sampling(
        data=model_data, test_grad=True, check_hmc_diagnostics=False
    )
    with pytest.raises(AttributeError):
        _ = from_pystan(posterior=grad_only_fit)

    stripped_fit = model.sampling(
        data=model_data, iter=100, chains=1, check_hmc_diagnostics=False
    )
    del stripped_fit.sim["samples"]
    with pytest.raises(AttributeError):
        _ = from_pystan(posterior=stripped_fit)
def __init__(self, data, dataset, ci=95.):
    """Store the results, converting a raw PyStan fit to ArviZ first.

    Parameters
    ----------
    data
        Stored as ``self.data``. An object whose class is named
        ``StanFit4Model`` is first converted with ``az.from_pystan``.
    dataset
        Stored as ``self.dataset``.
    ci
        Stored as ``self.ci`` (default 95.).

    Raises
    ------
    ValueError
        If the module-level ``az`` is ``None``.
    """
    if az is None:
        raise ValueError("ArviZ package must be installed in order to work"
                         " with the BayesianMetaRegressionResults class.")

    is_raw_stan_fit = data.__class__.__name__ == 'StanFit4Model'
    self.data = az.from_pystan(data) if is_raw_stan_fit else data
    self.dataset = dataset
    self.ci = ci
def fit_bhv_model(data, model_path=bmp, targ_field='LABthetaTarget',
                  dist_field='LABthetaDist', resp_field='LABthetaResp',
                  prior_dict=None, stan_iters=2000, stan_chains=4,
                  arviz=mixture_arviz, adapt_delta=.9, diagnostics=True,
                  **stan_params):
    """Fit the pickled behavioural Stan model to per-session trial data.

    Parameters
    ----------
    data
        Table indexed by field name. ``data[targ_field]``,
        ``data[dist_field]`` and ``data[resp_field]`` each yield one
        sequence per session; ``data[['animal', 'date']]`` labels sessions.
    model_path
        Path of the pickled Stan model.
    targ_field, dist_field, resp_field
        Field names for target, distractor and response values.
    prior_dict
        Extra entries merged into the Stan data; ``default_prior_dict`` is
        used when ``None``.
    stan_iters, stan_chains
        Passed to ``sampling`` as ``iter`` and ``chains``.
    arviz
        Keyword arguments forwarded to ``az.from_pystan``.
    adapt_delta
        Sampler ``adapt_delta`` control value.
    diagnostics
        When true, run ``check_hmc_diagnostics`` on the fit.
    **stan_params
        Remaining keyword arguments for ``sampling``; ``max_treedepth``
        (default 10) is moved into the ``control`` dict.

    Returns
    -------
    tuple
        ``(fit, diag, fit_av, stan_data, mapping_dict)``. ``diag`` is
        ``None`` when ``diagnostics`` is false. ``mapping_dict`` maps each
        flattened trial position to
        ``(original index, animal string, date string)``.
    """
    if prior_dict is None:
        prior_dict = default_prior_dict
    targs_is = data[targ_field]
    session_list = np.array(data[['animal', 'date']])

    # Collect per-session pieces and join once rather than re-concatenating
    # inside the loop. The leading empty int array keeps the result an
    # integer array even when there are no sessions.
    mapping_list = []
    session_parts = [np.array([], dtype=int)]
    for i, x in enumerate(targs_is):
        n_trials = len(x)
        session_parts.append(np.ones(n_trials, dtype=int) * (i + 1))
        sess_info0 = (str(session_list[i, 0]),) * n_trials
        sess_info1 = (str(session_list[i, 1]),) * n_trials
        mapping_list.extend(zip(x.index, sess_info0, sess_info1))
    session_nums = np.concatenate(session_parts)
    mapping_dict = dict(enumerate(mapping_list))

    targs = np.concatenate(targs_is, axis=0)
    dists = np.concatenate(data[dist_field], axis=0)
    resps = np.concatenate(data[resp_field], axis=0)
    errs = u.normalize_periodic_range(targs - resps)
    dist_errs = u.normalize_periodic_range(dists - resps)
    dists_per = u.normalize_periodic_range(dists - targs)
    stan_data = dict(T=dist_errs.shape[0], S=len(targs_is), err=errs,
                     dist_err=dist_errs, run_ind=session_nums,
                     dist_loc=dists_per, **prior_dict)

    # ``adapt_delta`` is a named parameter, so it can never appear in
    # ``stan_params``; the original popped it from there and therefore
    # always used 0.8, ignoring the caller's value.
    control = {
        'adapt_delta': adapt_delta,
        'max_treedepth': stan_params.pop('max_treedepth', 10),
    }

    # NOTE: unpickling executes arbitrary code from the file; only load
    # model files that come from a trusted location.
    with open(model_path, 'rb') as handle:
        sm = pickle.load(handle)
    fit = sm.sampling(data=stan_data, iter=stan_iters, chains=stan_chains,
                      control=control, **stan_params)
    diag = ps.diagnostics.check_hmc_diagnostics(fit) if diagnostics else None
    fit_av = az.from_pystan(posterior=fit, **arviz)
    return fit, diag, fit_av, stan_data, mapping_dict
def get_inference_data4(self, data):
    """multiple vars as lists.

    Uses ``data.obj`` as both posterior and prior, with ``"y"`` as the
    observed data and no predictive groups, coords or dims.
    """
    fit = data.obj
    conversion_kwargs = {
        "posterior": fit,
        "posterior_predictive": None,
        "prior": fit,
        "prior_predictive": None,
        "observed_data": "y",
        "coords": None,
        "dims": None,
    }
    return from_pystan(**conversion_kwargs)
def get_inference_data4(self, data):
    """minimal input.

    Uses ``data.obj`` as both posterior and prior and ``data.model`` as
    both the posterior and prior model; everything else is ``None``.
    """
    fit = data.obj
    stan_model = data.model
    conversion_kwargs = {
        "posterior": fit,
        "posterior_predictive": None,
        "prior": fit,
        "prior_predictive": None,
        "coords": None,
        "dims": None,
        "posterior_model": stan_model,
        "prior_model": stan_model,
    }
    return from_pystan(**conversion_kwargs)
def get_inference_data3(self, data, eight_schools_params):
    """multiple vars as lists.

    Both predictive groups are given as the list ``["y_hat", "log_lik"]``,
    and the ``school`` coordinate runs over ``eight_schools_params["J"]``.
    """
    school_coords = {"school": np.arange(eight_schools_params["J"])}
    school_dims = {
        "theta": ["school"],
        "y": ["school"],
        "y_hat": ["school"],
        "eta": ["school"],
    }
    idata = from_pystan(
        posterior=data.obj,
        posterior_predictive=["y_hat", "log_lik"],
        prior=data.obj,
        prior_predictive=["y_hat", "log_lik"],
        observed_data="y",
        coords=school_coords,
        dims=school_dims,
        posterior_model=data.model,
        prior_model=data.model,
    )
    return idata
def get_inference_data(self):
    """Convert ``self.obj`` with every group given as a variable name.

    The ``school`` coordinate runs over ``self.data['J']`` and is attached
    to ``theta``, ``y``, ``log_lik``, ``y_hat`` and ``theta_tilde``.
    """
    school_vars = ('theta', 'y', 'log_lik', 'y_hat', 'theta_tilde')
    dims = {var_name: ['school'] for var_name in school_vars}
    coords = {'school': np.arange(self.data['J'])}
    return from_pystan(
        fit=self.obj,
        posterior_predictive='y_hat',
        observed_data=['y'],
        log_likelihood='log_lik',
        coords=coords,
        dims=dims,
    )
def create_diagnostic_plots(idx, pdf_filename, fit, diag_pars, niter, nchain):
    """Write the sampler diagnostic plots for ``fit`` to a multi-page PDF.

    Parameters
    ----------
    idx
        Passed through to ``plot_sampler_params``.
    pdf_filename
        Path of the PDF file to create.
    fit
        Stan fit object; it is both extracted (``permuted=True``) and
        converted with ``az.from_pystan``.
    diag_pars
        Indexable by chain number; each entry is indexable by the sampler
        diagnostic names (``accept_stat__``, ``stepsize__``,
        ``treedepth__``, ``n_leapfrog__``, ``divergent__``, ``energy__``).
    niter, nchain
        Shape ``(niter, nchain)`` of the diagnostic arrays that are filled.

    Returns
    -------
    None
    """
    # Converting the Stan FIT object to ArviZ InferenceData
    samples = fit.extract(permuted=True)  # Extracting parameter samples
    data = az.from_pystan(fit)
    var_names = list(data.posterior.data_vars)

    # Filtering the list of parameters to plot
    unwanted = {'losvd', 'spec', 'conv_spec', 'poly', 'bestfit', 'losvd_',
                'losvd_mod', 'spec_pred', 'log_likelihood'}
    vars_main = [e for e in var_names if e not in unwanted]

    # Reading diagnostic parameters: one (niter, nchain) array per
    # diagnostic, filled column by column from each chain's entry.
    diag_names = ('accept_stat__', 'stepsize__', 'treedepth__',
                  'n_leapfrog__', 'divergent__', 'energy__')
    diag_arrays = {name: np.zeros((niter, nchain)) for name in diag_names}
    for j in range(nchain):
        for name in diag_names:
            diag_arrays[name][:, j] = diag_pars[j][name]

    # Creating the plot in multiple PDF pages. The context manager closes
    # the PDF even if one of the plotting calls raises, so the file handle
    # is not leaked.
    with PdfPages(pdf_filename) as pdf_pages:
        print(" - Sampler params")
        plot_sampler_params(idx,
                            diag_arrays['accept_stat__'],
                            diag_arrays['stepsize__'],
                            diag_arrays['treedepth__'],
                            diag_arrays['n_leapfrog__'],
                            diag_arrays['divergent__'],
                            diag_arrays['energy__'])
        pdf_pages.savefig()

        print(" - Chains")
        plot_chains(samples, vars_main)
        pdf_pages.savefig()

        # Trace plots are currently disabled:
        # print(" - Trace plot [Main params]")
        # az.plot_trace(data, var_names=vars_main)
        # pdf_pages.savefig()
        # print(" - Trace plot [LOSVD]")
        # az.plot_trace(data, var_names=['losvd'])
        # pdf_pages.savefig()

        print(" - Pair plot")
        az.plot_pair(data, var_names=vars_main, divergences=True,
                     kind='kde', fill_last=False)
        pdf_pages.savefig()

        print(" - Autocorr plot")
        az.plot_autocorr(data, var_names=vars_main)
        pdf_pages.savefig()

        print(" - Energy plot")
        az.plot_energy(data)
        pdf_pages.savefig()
def get_inference_data5(self, data):
    """minimal input.

    Same conversion as the minimal case, with ``log_likelihood=False`` and
    warmup draws saved only when ``pystan_version()`` is 2.
    """
    fit = data.obj
    stan_model = data.model
    keep_warmup = pystan_version() == 2
    idata = from_pystan(
        posterior=fit,
        posterior_predictive=None,
        prior=fit,
        prior_predictive=None,
        coords=None,
        dims=None,
        posterior_model=stan_model,
        log_likelihood=False,
        prior_model=stan_model,
        save_warmup=keep_warmup,
    )
    return idata
def get_inference_data(self):
    """Convert ``self.obj`` using variable names for every group.

    ``y_hat`` is the posterior predictive, ``y`` the observed data and
    ``log_lik`` the log likelihood; all listed variables share the
    ``school`` dimension of length ``self.data["J"]``.
    """
    n_schools = self.data["J"]
    conversion_kwargs = {
        "fit": self.obj,
        "posterior_predictive": "y_hat",
        "observed_data": ["y"],
        "log_likelihood": "log_lik",
        "coords": {"school": np.arange(n_schools)},
        "dims": {
            "theta": ["school"],
            "y": ["school"],
            "log_lik": ["school"],
            "y_hat": ["school"],
            "theta_tilde": ["school"],
        },
    }
    return from_pystan(**conversion_kwargs)
def fit(self, y1, y2, w1=None, w2=None):
    """Run the sampler on the two samples and store the results.

    The raw output of ``get_mcmc`` is kept as ``self.mcmc_`` and its ArviZ
    conversion as ``self.data_``. ``mu`` and ``sigma`` are labelled with
    the coordinates ``'Group 1'`` and ``'Group 2'``.

    Parameters
    ----------
    y1, y2
        Forwarded to ``get_mcmc`` together with ``self.model``.
    w1, w2
        Optional; forwarded to ``get_mcmc`` (default ``None``).
    """
    group_labels = ('Group 1', 'Group 2')
    coords = {
        'group_mu': list(group_labels),
        'group_sigma': list(group_labels),
    }
    dims = {'mu': ['group_mu'], 'sigma': ['group_sigma']}

    self.mcmc_ = get_mcmc(self.model, y1, y2, w1, w2, **self.kwargs)
    self.data_ = az.from_pystan(
        posterior=self.mcmc_,
        posterior_predictive=['y1_pred', 'y2_pred'],
        observed_data=['y1', 'y2'],
        log_likelihood='log_lik',
        coords=coords,
        dims=dims,
    )
def get_inference_data2(self):
    """Convert ``self.obj`` with dict observed data and array log likelihood.

    The observed data is passed as a dictionary keyed ``'y_hat'`` and the
    log likelihood as the unpermuted ``log_lik`` extract of the fit.
    """
    stan_fit = self.obj
    # ndarray
    pointwise_log_lik = stan_fit.extract('log_lik', permuted=False)['log_lik']
    # dictionary
    observed = {'y_hat': self.data['y']}

    school_vars = ('theta', 'y', 'log_lik', 'y_hat', 'theta_tilde')
    return from_pystan(
        fit=stan_fit,
        posterior_predictive='y_hat',
        observed_data=observed,
        log_likelihood=pointwise_log_lik,
        coords={'school': np.arange(self.data['J'])},
        dims={var_name: ['school'] for var_name in school_vars},
    )
def get_subgroup_models():
    """Fit the ``remr`` model on ``lnVR`` separately for each drug class.

    For every entry of ``DRUG_CLASSES`` the data is restricted to studies
    containing that class, to rows of that class or ``'placebo'``, and then
    to studies that have both an ``is_active == 1`` and an
    ``is_active == 0`` row.

    Returns
    -------
    dict
        Maps each drug class to the object returned by ``az.from_pystan``.
    """
    df = get_formatted_data()
    model = 'remr'
    rank_columns = ['study_id', 'scale', 'drug_class']

    # drug class subgroup analysis
    model_res_dict = {}
    for drug_class in DRUG_CLASSES:
        class_study_ids = df.query(f'drug_class == "{drug_class}"').study_id.unique()
        in_class_study = df.study_id.isin(class_study_ids)
        class_or_placebo = df.drug_class.isin([drug_class, 'placebo'])
        df_sub = df[in_class_study & class_or_placebo].copy()

        active_study_ids = set(df_sub.query('is_active == 1')['study_id'])
        inactive_study_ids = df_sub.query('is_active == 0')['study_id']
        placebo_controlled_study_ids = active_study_ids.intersection(inactive_study_ids)
        df_sub = df_sub[df_sub.study_id.isin(placebo_controlled_study_ids)]

        for column in rank_columns:
            df_sub = add_rank_column(df_sub, column)
        df_sub = get_variability_effect_sizes(aggregate_treatment_arms(df_sub))

        stan_file = os.path.join(stan_model_path, f'{model}.stan')
        stan_model = compile_model(stan_file, model_name=model)
        fit = stan_model.sampling(
            data=get_data_dict(df_sub, 'lnVR'),
            iter=4000,
            warmup=1000,
            chains=3,
            control={'adapt_delta': 0.99},
            check_hmc_diagnostics=True,
            seed=1,
        )
        pystan.check_hmc_diagnostics(fit)

        model_res_dict[drug_class] = az.from_pystan(
            posterior=fit,
            posterior_predictive=['Y_pred'],
            observed_data=['Y_meas', 'X_meas'],
            log_likelihood='log_lik',
        )
    return model_res_dict
def get_inference_data3(self, data, eight_schools_params):
    """log_likelihood as a ndarray.

    The log likelihood is extracted from the fit with
    ``pystan_extract_normal`` and passed to ``from_pystan`` as an array
    rather than as a variable name.
    """
    stan_fit = data.obj
    # ndarray
    pointwise_log_lik = pystan_extract_normal(stan_fit, "log_lik")["log_lik"]

    school_vars = ("theta", "y", "log_lik", "y_hat", "theta_tilde")
    idata = from_pystan(
        fit=stan_fit,
        posterior_predictive=["y_hat"],
        observed_data=["y"],
        log_likelihood=pointwise_log_lik,
        coords={"school": np.arange(eight_schools_params["J"])},
        dims={var_name: ["school"] for var_name in school_vars},
    )
    return idata
def get_inference_data(self, data, eight_school_params):
    """vars as str.

    ``data.obj`` serves as both posterior and prior, and every group is
    named by a single string.
    """
    stan_fit = data.obj
    conversion_kwargs = {
        "posterior": stan_fit,
        "posterior_predictive": "y_hat",
        "prior": stan_fit,
        "prior_predictive": "y_hat",
        "observed_data": "y",
        "log_likelihood": "log_lik",
        "coords": {"school": np.arange(eight_school_params["J"])},
        "dims": {
            "theta": ["school"],
            "y": ["school"],
            "log_lik": ["school"],
            "y_hat": ["school"],
            "theta_tilde": ["school"],
        },
    }
    return from_pystan(**conversion_kwargs)
def get_inference_data2(self):
    """Convert ``self.obj`` passing observed data and log likelihood by value.

    Observed data is a dictionary keyed ``"y_hat"``; the log likelihood is
    the unpermuted ``log_lik`` extract of the fit.
    """
    n_schools = self.data["J"]
    # dictionary
    observed = {"y_hat": self.data["y"]}
    # ndarray
    pointwise_log_lik = self.obj.extract("log_lik", permuted=False)["log_lik"]

    idata = from_pystan(
        fit=self.obj,
        posterior_predictive="y_hat",
        observed_data=observed,
        log_likelihood=pointwise_log_lik,
        coords={"school": np.arange(n_schools)},
        dims={
            "theta": ["school"],
            "y": ["school"],
            "log_lik": ["school"],
            "y_hat": ["school"],
            "theta_tilde": ["school"],
        },
    )
    return idata
def get_inference_data(self, data, eight_school_params):
    """log_likelihood as a var.

    The prior is supplied as a dictionary holding the fit's unpermuted
    ``theta`` draws under the key ``"theta_test"``.
    """
    stan_fit = data.obj
    unpermuted = pystan_extract_unpermuted(stan_fit)
    prior_draws = {"theta_test": unpermuted["theta"]}

    school_vars = ("theta", "y", "log_lik", "y_hat", "theta_tilde")
    return from_pystan(
        fit=stan_fit,
        prior=prior_draws,
        posterior_predictive="y_hat",
        observed_data=["y"],
        log_likelihood="log_lik",
        coords={"school": np.arange(eight_school_params["J"])},
        dims={var_name: ["school"] for var_name in school_vars},
    )
def run(self, samples=1000, chains=1, **kwargs):  # pylint: disable=arguments-differ
    """Run the Stan sampler and return the result as ArviZ data.

    The raw fit is also kept on the instance as ``self.fit``.

    Parameters
    ----------
    samples : int
        Passed to ``sampling`` as ``iter`` (default 1000).
    chains : int
        Number of chains to use (default 1).
    kwargs : dict
        Optional keyword arguments passed on to the PyStan
        ``StanModel.sampling()`` call.

    Returns
    -------
    The object returned by ``from_pystan`` for the new fit.
    """
    sampling_kwargs = dict(data=self.X, iter=samples, chains=chains)
    self.fit = self.stan_model.sampling(**sampling_kwargs, **kwargs)
    idata = from_pystan(self.fit)
    return idata
def test_empty_parameter(self):
    """A zero-length parameter does not appear in the converted posterior.

    The model declares a scalar ``y`` and a ``vector[0] z``; after
    conversion the posterior must expose ``y`` but not ``z``. Does nothing
    unless ``pystan_version()`` is 2.
    """
    if pystan_version() != 2:
        return

    from pystan import StanModel

    model_code = """ parameters { real y; vector[0] z; } model { y ~ normal(0,1); } """
    stan_model = StanModel(model_code=model_code)
    fit = stan_model.sampling(iter=10, chains=2, check_hmc_diagnostics=False)
    idata = from_pystan(posterior=fit)

    assert hasattr(idata, "posterior")
    posterior_group = idata.posterior
    assert hasattr(posterior_group, "y")
    assert not hasattr(posterior_group, "z")
def get_baseline_severity_model():
    """Fit the ``remr_bs`` Stan model with one row per study.

    ``lnVR`` is the measured outcome and ``lnRR`` the measured predictor;
    each is taken from the first row of its study. ``X0`` is the
    ``N``-weighted mean of ``baseline`` within each study.

    Returns
    -------
    The object returned by ``az.from_pystan`` for the fit.
    """
    df = prepare_data()
    effect_statistic = 'lnVR'

    def first_per_study(column):
        # Value of `column` taken with the 'first' aggregation per study_id.
        per_study = df.groupby(['study_id']).agg({column: 'first'})
        return per_study.reset_index()[column].values

    def weighted_baseline(study_rows):
        # Mean of `baseline` weighted by the `N` column.
        weights = study_rows['N']
        return np.sum(study_rows['baseline'] * weights) / np.sum(weights)

    data_dict = {
        'N': len(df.study_id.unique()),
        'Y_meas': first_per_study(effect_statistic),
        'X_meas': first_per_study('lnRR'),
        'SD_Y': np.sqrt(first_per_study(f'var_{effect_statistic}')),
        'SD_X': np.sqrt(first_per_study('var_lnRR')),
        'X0': df.groupby(['study_id']).apply(weighted_baseline).reset_index()[0].values,
        'run_estimation': 1,
    }

    stan_file = os.path.join(stan_model_path, 'remr_bs.stan')
    stan_model = compile_model(stan_file, model_name='remr_bs')
    fit = stan_model.sampling(
        data=data_dict,
        iter=4000,
        warmup=1000,
        chains=3,
        control={'adapt_delta': 0.99},
        check_hmc_diagnostics=True,
        seed=1,
    )
    pystan.check_hmc_diagnostics(fit)

    return az.from_pystan(
        posterior=fit,
        posterior_predictive=['Y_pred'],
        observed_data=['Y_meas', 'X_meas', 'X0'],
        log_likelihood='log_lik',
    )
def get_simulation_results():
    """Run the ``simulation`` Stan model with fixed inputs.

    Returns
    -------
    The object returned by ``az.from_pystan`` for the fit.
    """
    stan_file = os.path.join(stan_model_path, 'simulation.stan')
    simulation_stan_model = compile_model(stan_file, model_name='simulation')

    simulation_inputs = {
        'N': 1000,
        'rho': -0.4,
        'sd_te': 6.5,
        'sd_m': 0.001,
        'lambda': 0.2,
        'theta': 0.9,
    }
    fit = simulation_stan_model.sampling(
        data=simulation_inputs,
        warmup=500,
        iter=2500,
        chains=2,
        check_hmc_diagnostics=True,
        seed=1,
    )
    pystan.check_hmc_diagnostics(fit)
    return az.from_pystan(posterior=fit)
def get_inference_data3(self, data, eight_schools_params):
    """multiple vars as lists.

    Every group that accepts variable names is given a two-element list.
    The chosen variables are not semantically right for their groups; they
    only exercise the conversion.
    """
    stan_fit = data.obj
    stan_model = data.model
    school_vars = ("theta", "y", "sigma", "y_hat", "eta")

    idata = from_pystan(
        posterior=stan_fit,
        posterior_predictive=["y_hat", "log_lik"],  # wrong, but fine for testing
        predictions=["y_hat", "log_lik"],  # wrong, but fine for testing
        prior=stan_fit,
        prior_predictive=["y_hat", "log_lik"],  # wrong, but fine for testing
        constant_data=["sigma", "y"],  # wrong, but fine for testing
        predictions_constant_data=["sigma", "y"],  # wrong, but fine for testing
        coords={"school": np.arange(eight_schools_params["J"])},
        dims={var_name: ["school"] for var_name in school_vars},
        posterior_model=stan_model,
        prior_model=stan_model,
    )
    return idata
def run_model(rankings, survey_head_to_heads, stan_file=STAN_FILE, model_config='combined'):
    """Build the Stan input from rankings and survey data and fit the model.

    Parameters
    ----------
    rankings
        Table with ``episode_id``, ``contestant_id``, ``rank`` and the
        ``PREDICTORS`` columns.
    survey_head_to_heads
        Table with ``own``, ``opp``, ``count`` and ``wins`` columns; ``own``
        and ``opp`` are contestant ids.
    stan_file
        Stan program passed to ``StanModel_cache``.
    model_config
        Mapping of extra entries merged into the Stan data, overriding the
        generated entries on key clashes.
        NOTE(review): the default ``'combined'`` is a string and cannot be
        merged with ``**``; presumably a lookup of a named configuration
        was intended -- confirm. Until then callers must pass a mapping.

    Returns
    -------
    The object returned by ``arviz.from_pystan``, with ``ability`` indexed
    by contestant and ``beta`` indexed by predictor.

    Raises
    ------
    TypeError
        If ``model_config`` cannot be unpacked as a mapping.
    """
    # ``**model_config`` below needs an object with ``keys()``. Check before
    # the data preparation and model compilation so a bad value fails early
    # with a clear message (same exception type as the late failure).
    if not hasattr(model_config, 'keys'):
        raise TypeError(
            'model_config must be a mapping of Stan data entries, got %r'
            % (model_config,))

    n_episode_contestant = rankings.groupby('episode_id')['contestant_id'].nunique()
    # NOTE(review): computed but never used below -- confirm whether it
    # should be part of the Stan input.
    episode_rank_counts = (
        rankings.groupby(['episode_id', 'rank']).size().unstack().fillna(0).astype(int))

    # One row per contestant, numbered from 1 for Stan.
    contestants = rankings.groupby('contestant_id').first()
    contestants['id_stan'] = range(1, len(contestants) + 1)
    rankings = rankings.join(contestants['id_stan'], on='contestant_id')
    survey_head_to_heads = (
        survey_head_to_heads
        .join(contestants['id_stan'].rename('id_stan_own'), on='own')
        .join(contestants['id_stan'].rename('id_stan_opp'), on='opp'))

    input_data = {
        'N': len(rankings),
        'K': len(PREDICTORS),
        'C': len(contestants),
        'E': rankings['episode_id'].nunique(),
        'X': contestants[PREDICTORS].fillna(0).values,
        'N_episode_contestant': n_episode_contestant.values,
        'episode_rank': rankings['rank'].values,
        'contestant': rankings['id_stan'].values,
        'N_survey': len(survey_head_to_heads),
        'survey_contestant': survey_head_to_heads['id_stan_own'].values,
        'survey_opponent': survey_head_to_heads['id_stan_opp'].values,
        'survey_count': survey_head_to_heads['count'].values,
        'survey_wins': survey_head_to_heads['wins'].values,
    }

    model = StanModel_cache(file=stan_file)
    fit = model.sampling(data={**input_data, **model_config})
    return arviz.from_pystan(
        fit,
        coords={'contestant': contestants.index, 'predictor': PREDICTORS},
        dims={'ability': ['contestant'], 'beta': ['predictor']},
    )