def run_and_plot_models(segmentDF, adjacencyMatrix, iters, warmup):
    tobit_dict = get_tobit_dict(segmentDF)

    # TOBIT MODEL:
    t_c_params = {'adapt_delta': 0.95, 'max_treedepth': 15}
    tobit_model, tobit_fit = run_or_load_model('tobit', tobit_dict, iters, warmup, t_c_params)
    check_hmc_diagnostics(tobit_fit)
    plt.hist(tobit_fit['sigma'], bins=int(iters * 4 / 100))
    plt.title('tobit')
    tob_vars = ['sigma', 'beta_zero', 'theta']
    az.plot_trace(tobit_fit, tob_vars)

    # SPATIAL TOBIT MODEL:
    c_c_params = {'adapt_delta': 0.95, 'max_treedepth': 15}
    car_dict = add_car_info_to_dict(tobit_dict, adjacencyMatrix)
    car_model, car_fit = run_or_load_model('car', car_dict, iters, warmup, c_c_params)
    check_hmc_diagnostics(car_fit)
    plt.hist(car_fit['sigma'], bins=int(iters * 4 / 100))
    plt.title('car')
    car_vars = ['sigma', 'beta_zero', 'theta', 'alpha', 'tau']
    az.plot_trace(car_fit, compact=False, var_names=car_vars)
    az.plot_pair(car_fit, ['tau', 'alpha', 'sigma'], divergences=True)
    plt.scatter(car_fit['lp__'], car_fit['sigma'])
    plt.hist(car_fit['phi'].mean(axis=0), bins=50)
def get_varying_intercept_model_results():
    # read in Cipriani data
    df = get_model_input_df()
    data_dict = {
        'N': df.shape[0],
        'Y_meas': df['lnSD'].values,
        'X_meas': df['lnMean'].values,
        'SD_Y': np.sqrt(df['var_lnSD'].values),
        'SD_X': np.sqrt(df['var_lnMean'].values),
        'K': len(df.scale.unique()),
        'scale_group': df.scale_rank.values
    }
    varying_intercept_stan_model = compile_model(
        os.path.join(stan_model_path, 'varying_intercept_regression.stan'),
        model_name='varying_intercept_regression')
    fit = varying_intercept_stan_model.sampling(data=data_dict, iter=4000, warmup=1000, chains=3,
                                                control={'adapt_delta': 0.99},
                                                check_hmc_diagnostics=True, seed=1)
    pystan.check_hmc_diagnostics(fit)
    data = az.from_pystan(
        posterior=fit,
        posterior_predictive=['Y_pred'],
        observed_data=['X_meas', 'Y_meas'],
        log_likelihood='log_lik',
    )
    return data
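# (Illustrative addition, not part of the original script.) A minimal sketch of how the
# InferenceData returned by get_varying_intercept_model_results() could be inspected with
# ArviZ; the variable name `idata` and the choice of diagnostics are assumptions here.
import arviz as az

idata = get_varying_intercept_model_results()
print(az.summary(idata))                              # posterior means, sd, HDI, ESS, R-hat
az.plot_ppc(idata, data_pairs={'Y_meas': 'Y_pred'})   # posterior predictive check for Y_meas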
def get_model_results_dict():
    df = get_model_input_df()
    model_res_dict = {}
    # fixed and random effects meta analyses (lnVR and lnCVR)
    for model in ['fema', 'rema']:
        stan_model = compile_model(os.path.join(stan_model_path, f'{model}.stan'),
                                   model_name=model)
        for effect_statistic in ['lnVR', 'lnCVR']:
            data_dict = get_data_dict(df, effect_statistic)
            fit = stan_model.sampling(data=data_dict, iter=4000, warmup=1000, chains=3,
                                      control={'adapt_delta': 0.99},
                                      check_hmc_diagnostics=True, seed=1)
            data = az.from_pystan(
                posterior=fit,
                posterior_predictive=['Y_pred'],
                observed_data=['Y'],
                log_likelihood='log_lik',
            )
            model_res_dict[f'{model}_{effect_statistic}'] = data

    # random effects meta regression (remr, lnVR)
    model = 'remr'
    stan_model = compile_model(os.path.join(stan_model_path, f'{model}.stan'), model_name=model)
    effect_statistic = 'lnVR'
    data_dict = get_data_dict(df, effect_statistic)
    fit = stan_model.sampling(data=data_dict, iter=4000, warmup=1000, chains=3,
                              control={'adapt_delta': 0.99},
                              check_hmc_diagnostics=True, seed=1)
    pystan.check_hmc_diagnostics(fit)
    data = az.from_pystan(
        posterior=fit,
        posterior_predictive=['Y_pred'],
        observed_data=['Y_meas', 'X_meas'],
        log_likelihood='log_lik',
    )
    model_res_dict[f'{model}_{effect_statistic}'] = data
    return model_res_dict
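# (Illustrative addition, not part of the original script.) Because every entry in
# model_res_dict carries a log_lik group, fits of the same outcome could be compared with
# ArviZ's LOO-based model comparison; restricting to the lnVR fixed- vs random-effects fits
# here is an assumption about which comparison is of interest.
import arviz as az

model_res_dict = get_model_results_dict()
print(az.compare({k: model_res_dict[k] for k in ['fema_lnVR', 'rema_lnVR']}))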
def check_fit(row, root='.'):
    output_dict = {}
    sample_loc = root + '/output/posteriors/' + row['event_name'] + '_raw.p'
    data_loc = root + '/data/timeseries/aggregated/' + row['incident_name'] + '_raw.csv'
    samples = pickle.load(open(sample_loc, 'rb'))
    dat = pd.read_csv(data_loc).iloc[row['start']:row['end']]
    y = dat['total_tweets'].values
    print('--------')
    print(row['event_name'])
    output_dict.update(pystan.check_hmc_diagnostics(samples,
                                                    pars=['alpha', 'beta', 'decay', 'lambda', 'phi']))
    observed_engagement = np.sum(y)
    output_dict['observed_engagement'] = observed_engagement
    output_dict['y_0'] = y[0]
    # 89% posterior interval for the predicted cumulative engagement
    low, high = np.percentile(np.cumsum(samples['y_hat'], axis=1)[:, -1], q=[5.5, 94.5])
    cumulative_fit = low < np.cumsum(y)[-1] < high
    output_dict['final_predicted'] = cumulative_fit
    output_dict.update(row.to_dict())
    output_dict['sample_loc'] = sample_loc
    output_dict['data_loc'] = data_loc
    output_dict['lower_predicted'] = low
    output_dict['upper_predicted'] = high
    return output_dict
def get_subgroup_models():
    df = get_formatted_data()
    # drug class subgroup analysis
    model_res_dict = {}
    for drug_class in DRUG_CLASSES:
        study_ids = df.query(f'drug_class == "{drug_class}"').study_id.unique()
        df_sub = df[(df.study_id.isin(study_ids))
                    & (df.drug_class.isin([drug_class, 'placebo']))].copy()
        placebo_controlled_study_ids = set(df_sub.query('is_active == 1')['study_id']) \
            .intersection(df_sub.query('is_active == 0')['study_id'])
        df_sub = df_sub[df_sub.study_id.isin(placebo_controlled_study_ids)]
        for column in ['study_id', 'scale', 'drug_class']:
            df_sub = add_rank_column(df_sub, column)
        df_sub = aggregate_treatment_arms(df_sub)
        df_sub = get_variability_effect_sizes(df_sub)
        model = 'remr'
        stan_model = compile_model(os.path.join(stan_model_path, f'{model}.stan'),
                                   model_name=model)
        data_dict = get_data_dict(df_sub, 'lnVR')
        fit = stan_model.sampling(data=data_dict, iter=4000, warmup=1000, chains=3,
                                  control={'adapt_delta': 0.99},
                                  check_hmc_diagnostics=True, seed=1)
        pystan.check_hmc_diagnostics(fit)
        data = az.from_pystan(
            posterior=fit,
            posterior_predictive=['Y_pred'],
            observed_data=['Y_meas', 'X_meas'],
            log_likelihood='log_lik',
        )
        model_res_dict[drug_class] = data
    return model_res_dict
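# (Illustrative addition, not part of the original script.) One way the per-drug-class fits
# could be displayed side by side is an ArviZ forest plot of the pooled effect; the
# parameter name 'mu' is an assumption about what remr.stan exposes.
import arviz as az

subgroup_fits = get_subgroup_models()
az.plot_forest(list(subgroup_fits.values()),
               model_names=list(subgroup_fits.keys()),
               var_names=['mu'],        # assumed name of the pooled lnVR effect
               combined=True)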
def get_baseline_severity_model():
    df = prepare_data()
    effect_statistic = 'lnVR'
    data_dict = {
        'N': len(df.study_id.unique()),
        'Y_meas': df.groupby(['study_id']).agg(
            {effect_statistic: 'first'}).reset_index()[effect_statistic].values,
        'X_meas': df.groupby(['study_id']).agg(
            {'lnRR': 'first'}).reset_index()['lnRR'].values,
        'SD_Y': np.sqrt(df.groupby(['study_id']).agg(
            {f'var_{effect_statistic}': 'first'}).reset_index()[f'var_{effect_statistic}'].values),
        'SD_X': np.sqrt(df.groupby(['study_id']).agg(
            {'var_lnRR': 'first'}).reset_index()['var_lnRR'].values),
        # sample-size weighted mean baseline severity per study
        'X0': df.groupby(['study_id']).apply(
            lambda x: np.sum(x['baseline'] * x['N']) / np.sum(x['N'])
        ).reset_index()[0].values,
        'run_estimation': 1
    }
    stan_model = compile_model(
        os.path.join(stan_model_path, 'remr_bs.stan'),
        model_name='remr_bs'
    )
    fit = stan_model.sampling(
        data=data_dict, iter=4000, warmup=1000, chains=3,
        control={'adapt_delta': 0.99},
        check_hmc_diagnostics=True, seed=1
    )
    pystan.check_hmc_diagnostics(fit)
    data = az.from_pystan(
        posterior=fit,
        posterior_predictive=['Y_pred'],
        observed_data=['Y_meas', 'X_meas', 'X0'],
        log_likelihood='log_lik',
    )
    return data
def get_simulation_results():
    data_dict = {
        'N': 1000,
        'rho': -0.4,
        'sd_te': 6.5,
        'sd_m': 0.001,
        'lambda': 0.2,
        'theta': 0.9
    }
    simulation_stan_model = compile_model(os.path.join(stan_model_path, 'simulation.stan'),
                                          model_name='simulation')
    fit = simulation_stan_model.sampling(data=data_dict, warmup=500, iter=2500, chains=2,
                                         check_hmc_diagnostics=True, seed=1)
    pystan.check_hmc_diagnostics(fit)
    data = az.from_pystan(posterior=fit)
    return data
def get_prior_comparison():
    df = get_model_input_df()
    model_res_dict = {}
    model = 'remr_prior'
    stan_model = compile_model(os.path.join(stan_model_path, f'{model}.stan'), model_name=model)
    effect_statistic = 'lnVR'
    data_dict = get_data_dict(df, effect_statistic)
    prior_dict = {'reference': (0, 1),
                  'optimistic': (np.log(2), 0.43)}
    # the optimistic prior puts ~5% mass below zero:
    # from scipy import stats
    # stats.norm.cdf(0, loc=np.log(2), scale=0.43)  # 0.05348421366569122
    for prior, (mu_prior_loc, mu_prior_scale) in prior_dict.items():
        data_dict_prior = data_dict.copy()
        data_dict_prior['mu_prior_loc'] = mu_prior_loc
        data_dict_prior['mu_prior_scale'] = mu_prior_scale
        fit = stan_model.sampling(data=data_dict_prior, iter=4000, warmup=1000, chains=3,
                                  control={'adapt_delta': 0.99},
                                  check_hmc_diagnostics=True, seed=1)
        pystan.check_hmc_diagnostics(fit)
        data = az.from_pystan(
            posterior=fit,
            posterior_predictive=['Y_pred'],
            observed_data=['Y_meas', 'X_meas'],
            log_likelihood='log_lik',
        )
        model_res_dict[f'{model}_{effect_statistic}_{prior}'] = data
    return model_res_dict
def fit_model(row, model, keep=True, root='.'):
    if keep and os.path.isfile(root + '/output/posteriors/' + row['event_name'] + '_raw.p'):
        # already fitted and cached on disk
        pass
    else:
        raw_df = pd.read_csv(root + '/data/timeseries/aggregated/' + row['incident_name'] + '_raw.csv')
        raw_df = raw_df.iloc[row['start']:row['end']]
        # We add one here because it avoids a divide by zero error in log.
        # Should not impact inference.
        stan_data = dict(y=raw_df['total_tweets'].values.astype('int')[1:],
                         N=raw_df.shape[0],
                         x=np.log(raw_df['user_followers_count'].values.astype('int') + 1))
        samples = model.sampling(data=stan_data, check_hmc_diagnostics=False, refresh=0,
                                 control={'adapt_delta': .99})
        pickle.dump(samples.extract(inc_warmup=False),
                    open(root + '/output/posteriors/' + row['event_name'] + '_extracted.p', 'wb'))
        pickle.dump(samples,
                    open(root + '/output/posteriors/' + row['event_name'] + '_raw.p', 'wb'))
        return pystan.check_hmc_diagnostics(samples, pars={'alpha', 'beta', 'lambda', 'decay', 'phi'})
for d, nm in [(logistic_data_synth, 'synth'), (logistic_data_ds1, 'ds1'),
              (logistic_data_phish, 'phish')]:
    if not os.path.exists('results/logistic_samples_' + nm + '.npy'):
        t0 = time.process_time()
        fit = sm.sampling(data=d, iter=N_samples * 2, chains=1,
                          control={'adapt_delta': 0.9, 'max_treedepth': 15},
                          verbose=True)
        # fit.extract has 3 dims: iterations, chains, parameters
        f = open('results/logistic_params_' + nm + '.log', 'w')
        f.write(str(pystan.check_hmc_diagnostics(fit)) + '\n')
        f.write(str(fit.model_pars) + '\n')
        f.write(str(fit.par_dims) + '\n')
        f.close()
        np.save('results/logistic_samples_' + nm + '.npy', fit.extract(permuted=False))
        tf = time.process_time()
        np.save('results/' + nm + '_mcmc_time.npy', tf - t0)

############################## (U)BVI / ADVI Params ######################################
N = 10
diag = True
n_samples = 2000
n_logfg_samples = 10000
adam_learning_rate = lambda itr: 1. / np.sqrt(itr + 1)
def fit_stan(fname_fitres_comb: str, dir_output: str, sample: str,
             screen=False, lowz=True, bias=True, plot=False):
    """Fit Stan model for w.

    Parameters
    ----------
    fname_fitres_comb: str
        Complete path to fitres (with lowz, if requested).
    dir_output: str
        Complete path to output root folder.
    sample: str
        Sample to be fitted.
    screen: bool (optional)
        If True, print Stan results to screen. Default is False.
    lowz: bool (optional)
        If True, add low-z sample. Default is True.
    bias: bool (optional)
        If True, add bias correction. Default is True.
    plot: bool (optional)
        If True, generate chains plot. Default is False.
    """

    # read data for Bayesian model
    fitres_final = pd.read_csv(fname_fitres_comb, index_col=False, comment="#",
                               skip_blank_lines=True, delim_whitespace=True)

    # set initial conditions
    z0 = 0
    E0 = 0
    c = 3e5
    H0 = 70

    # add small offset to duplicate redshifts
    fitres_final = remove_duplicated_z(fitres_final)

    # order data according to redshift
    indx = np.argsort(fitres_final['SIM_ZCMB'].values)

    # create input for stan model
    stan_input = {}
    stan_input['nobs'] = fitres_final.shape[0]
    stan_input['z'] = fitres_final['SIM_ZCMB'].values[indx]
    stan_input['mu'] = fitres_final['MU'].values[indx]
    stan_input['muerr'] = fitres_final['MUERR'].values[indx]
    stan_input['z0'] = z0
    stan_input['H0'] = H0
    stan_input['c'] = c
    stan_input['E0'] = np.array([E0])
    stan_input['ompri'] = om_pri[0]
    stan_input['dompri'] = om_pri[1]
    stan_input['wmin'] = w_pri[0]
    stan_input['wmax'] = w_pri[1]

    # save only stan input to file
    fname_stan_input = dir_output + 'stan_input/stan_input_salt2mu_lowz_withbias_' + sample + '.csv'

    stan_input2 = {}
    stan_input2['z'] = stan_input['z']
    stan_input2['mu'] = stan_input['mu']
    stan_input2['muerr'] = stan_input['muerr']

    stan_input_tofile = pd.DataFrame(stan_input2)
    stan_input_tofile.to_csv(fname_stan_input, index=False)

    # fit Bayesian model
    model = pystan.StanModel(file=dir_input + '/cosmo.stan')
    fit = model.sampling(data=stan_input, iter=12000, chains=5, warmup=10000,
                         control={'adapt_delta': 0.99})

    # get summary
    res = fit.stansummary(pars=["om", "w"])
    check = str(pystan.check_hmc_diagnostics(fit))

    if screen:
        print(res)
        print(' ******* ')
        print(check)

    if lowz and bias:
        summ_fname = dir_output + 'stan_summary/stan_summary_' + sample + '_lowz_withbias.dat'
        chains_fname = dir_output + 'posteriors/pkl/chains_' + sample + '_lowz_withbias.pkl'
        trace_fname = dir_output + 'posteriors/trace/trace_plot_' + sample + '_lowz_withbias.png'
    elif lowz and not bias:
        summ_fname = dir_output + 'stan_summary/stan_summary_' + sample + '_lowz_nobias.dat'
        chains_fname = dir_output + 'posteriors/pkl/chains_' + sample + '_lowz_nobias.pkl'
        trace_fname = dir_output + 'posteriors/trace/trace_plot_' + sample + '_lowz_nobias.png'
    else:
        summ_fname = dir_output + 'stan_summary/stan_summary_' + sample + '.dat'
        chains_fname = dir_output + 'posteriors/pkl/chains_' + sample + '.pkl'
        trace_fname = dir_output + 'posteriors/trace/trace_plot_' + sample + '.png'

    op2 = open(summ_fname, 'w')
    op2.write(res)
    op2.write('\n ************* \n')
    op2.write(check)
    op2.close()

    samples = fit.extract(pars=['om', 'w'], permuted=True, inc_warmup=False)
    pickle.dump(samples, open(chains_fname, "wb"))

    pystan.check_hmc_diagnostics(fit)

    ### plot chains
    if plot:
        arviz.plot_trace(fit, ['om', 'w'])
        plt.savefig(trace_fname)

    if lowz and bias:
        data = pd.read_pickle(chains_fname)
        data2 = pd.DataFrame(data)
        data2.to_csv(dir_output + 'posteriors/csv/' +
                     'chains_' + sample + '_lowz_withbias.csv.gz', index=False)
    distanceToParent.get(x, 10) for x in hiddenLanguages + observedLanguages
]
dat["prior_only"] = 0
dat["Components"] = 2
print(dat)

sm = pystan.StanModel(file=f'{__file__[:-3]}.stan')
fit = sm.sampling(data=dat, iter=2000, chains=4)
la = fit.extract(permuted=True)  # return a dictionary of arrays

with open(f"fits/{__file__}.txt", "w") as outFile:
    print(fit, file=outFile)

import sys
with open(f"fits/{__file__}_diagnostic.txt", "w") as outFile:
    sys.stdout = outFile
    sys.stderr = outFile
    print(pystan.check_hmc_diagnostics(fit), file=outFile)

# print(la, file=outFile)
# print("Inferred logits", la["LogitsAll"].mean(axis=0))
# print("Inferred hidden traits", la["TraitHidden"].mean(axis=0))
# print("alpha", la["alpha"].mean(axis=0))
# print("corr_Sigma", la["corr_Sigma"].mean(axis=0))
# print("sigma_B", la["sigma_B"].mean(axis=0))
# print("Lrescor_B", la["Lrescor_B"].mean(axis=0))
dlt = DLTFull(
    response_col='pct_chg_in_sales_from_prev_mnth',
    regressor_col=regressors,
    date_col='date',
    seasonality=1,
    seed=2020,
    level_sm_input=0.3,  # recommended for higher frequency data
    regressor_sigma_prior=[0.5] * len(regressors),
    regression_penalty='lasso',
    period=365,
    prediction_percentiles=[5, 95])

dlt.fit(df=train_df)
pystan.check_hmc_diagnostics(dlt)

density_plot = plot_posterior_params(dlt, kind='density',
                                     incl_trend_params=True,
                                     incl_smooth_params=True)
trace_plot = plot_posterior_params(dlt, kind='trace',
                                   incl_trend_params=True,
                                   incl_smooth_params=True)
pair_plot = plot_posterior_params(dlt, kind='pair', pair_type='reg',
                                  incl_trend_params=False,
# all int?
data = df[['i1', 'i2', 'i3', 'Response']].values
s = df[['s1', 's2', 's3']].values
stimulus = np.unique(s.flatten())

# how to call the functions?
fit = bds.bds(data, lapses=False)
# gives warning
# WARNING:pystan:Maximum (flat) parameter count (1000) exceeded: skipping diagnostic tests for n_eff and Rhat.
# To run all diagnostics call pystan.check_hmc_diagnostics(fit)
pystan.check_hmc_diagnostics(fit.stanfit)

# how to read out results?
scale = fit.get_scale_values()
CIl, CIh = fit.get_scale_credible_interval()

# how to plot results?
import matplotlib.pyplot as plt
plt.plot(stimulus, scale, 'o')
# errorbar expects (lower, upper) offsets from the plotted points
yerr = [scale - CIl, CIh - scale]
plt.errorbar(stimulus, scale, yerr=yerr, fmt='none', ecolor='b', capsize=0)

# others:
stan_input['H0'] = H0
stan_input['c'] = c
stan_input['E0'] = np.array([E0])
stan_input['ompri'] = om_pri[0]
stan_input['dompri'] = om_pri[1]
stan_input['wmin'] = w_pri[0]
stan_input['wmax'] = w_pri[1]

# save only stan input to file
fname_stan_input = dir_output + 'stan_input/stan_input_salt2mu_lowz_withbias_' + case + '.csv'

stan_input2 = {}
stan_input2['z'] = stan_input['z']
stan_input2['mu'] = stan_input['mu']
stan_input2['muerr'] = stan_input['muerr']

stan_input_tofile = pd.DataFrame(stan_input2)
stan_input_tofile.to_csv(fname_stan_input, index=False)

# fit Bayesian model
model = pystan.StanModel(file=dir_input + '/cosmo.stan')
fit = model.sampling(data=stan_input, iter=12000, chains=5, warmup=10000,
                     control={'adapt_delta': 0.99})

# get summary
res = fit.stansummary(pars=["om", "w"])
check = str(pystan.check_hmc_diagnostics(fit))