Example #1
def run_and_plot_models(segmentDF, adjacencyMatrix, iters, warmup):

    tobit_dict = get_tobit_dict(segmentDF)

    # TOBIT MODEL:
    t_c_params = {'adapt_delta': 0.95, 'max_treedepth': 15}
    tobit_model, tobit_fit = run_or_load_model('tobit', tobit_dict, iters,
                                               warmup, t_c_params)
    pystan.check_hmc_diagnostics(tobit_fit)

    plt.hist(tobit_fit['sigma'], bins=int(iters * 4 / 100))
    plt.title('tobit')
    tob_vars = ['sigma', 'beta_zero', 'theta']
    az.plot_trace(tobit_fit, var_names=tob_vars)

    # SPATIAL TOBIT MODEL:
    c_c_params = {'adapt_delta': 0.95, 'max_treedepth': 15}
    car_dict = add_car_info_to_dict(tobit_dict, adjacencyMatrix)
    car_model, car_fit = run_or_load_model('car', car_dict, iters, warmup,
                                           c_c_params)
    pystan.check_hmc_diagnostics(car_fit)

    plt.hist(car_fit['sigma'], bins=int(iters * 4 / 100))
    plt.title('car')
    car_vars = ['sigma', 'beta_zero', 'theta', 'alpha', 'tau']
    az.plot_trace(car_fit, compact=False, var_names=car_vars)

    az.plot_pair(car_fit, var_names=['tau', 'alpha', 'sigma'], divergences=True)
    plt.scatter(car_fit['lp__'], car_fit['sigma'])
    plt.hist(car_fit['phi'].mean(axis=0), bins=50)
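
# Usage sketch for the function above: assumes `matplotlib.pyplot as plt` and
# `arviz as az` are imported, and `load_segments` / `build_adjacency` are
# hypothetical placeholders for however the inputs are actually produced.
import matplotlib.pyplot as plt

segmentDF = load_segments()                   # hypothetical data loader
adjacencyMatrix = build_adjacency(segmentDF)  # hypothetical adjacency builder

run_and_plot_models(segmentDF, adjacencyMatrix, iters=4000, warmup=1000)
plt.show()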
Example #2
def get_varying_intercept_model_results():
    # read in Cipriani data
    df = get_model_input_df()
    data_dict = {
        'N': df.shape[0],
        'Y_meas': df['lnSD'].values,
        'X_meas': df['lnMean'].values,
        'SD_Y': np.sqrt(df['var_lnSD'].values),
        'SD_X': np.sqrt(df['var_lnMean'].values),
        'K': len(df.scale.unique()),
        'scale_group': df.scale_rank.values
    }
    varying_intercept_stan_model = compile_model(
        os.path.join(stan_model_path, 'varying_intercept_regression.stan'),
        model_name='varying_intercept_regression')
    fit = varying_intercept_stan_model.sampling(data=data_dict,
                                                iter=4000,
                                                warmup=1000,
                                                chains=3,
                                                control={'adapt_delta': 0.99},
                                                check_hmc_diagnostics=True,
                                                seed=1)
    pystan.check_hmc_diagnostics(fit)
    data = az.from_pystan(
        posterior=fit,
        posterior_predictive=['Y_pred'],
        observed_data=['X_meas', 'Y_meas'],
        log_likelihood='log_lik',
    )
    return data
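
# Possible follow-up (a sketch): the function returns an ArviZ InferenceData
# object, so the standard convergence summaries and posterior predictive
# checks apply directly.
import arviz as az

idata = get_varying_intercept_model_results()
print(az.summary(idata))                             # R-hat, ESS, posterior stats
az.plot_ppc(idata, data_pairs={'Y_meas': 'Y_pred'})  # PPC against observed data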
Example #3
def get_model_results_dict():
    df = get_model_input_df()
    model_res_dict = {}

    # fixed effects ('fema') and random effects ('rema') meta-analyses,
    # each fitted for both lnVR and lnCVR
    for model in ['fema', 'rema']:
        stan_model = compile_model(os.path.join(stan_model_path,
                                                f'{model}.stan'),
                                   model_name=model)
        for effect_statistic in ['lnVR', 'lnCVR']:
            data_dict = get_data_dict(df, effect_statistic)

            fit = stan_model.sampling(data=data_dict,
                                      iter=4000,
                                      warmup=1000,
                                      chains=3,
                                      control={'adapt_delta': 0.99},
                                      check_hmc_diagnostics=True,
                                      seed=1)

            data = az.from_pystan(
                posterior=fit,
                posterior_predictive=['Y_pred'],
                observed_data=['Y'],
                log_likelihood='log_lik',
            )

            model_res_dict[f'{model}_{effect_statistic}'] = data

    model = 'remr'
    stan_model = compile_model(os.path.join(stan_model_path, f'{model}.stan'),
                               model_name=model)
    effect_statistic = 'lnVR'
    data_dict = get_data_dict(df, effect_statistic)

    fit = stan_model.sampling(data=data_dict,
                              iter=4000,
                              warmup=1000,
                              chains=3,
                              control={'adapt_delta': 0.99},
                              check_hmc_diagnostics=True,
                              seed=1)
    pystan.check_hmc_diagnostics(fit)

    data = az.from_pystan(
        posterior=fit,
        posterior_predictive=['Y_pred'],
        observed_data=['Y_meas', 'X_meas'],
        log_likelihood='log_lik',
    )
    model_res_dict[f'{model}_{effect_statistic}'] = data
    return model_res_dict
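
# Since every entry stores pointwise log_lik, the returned dictionary can be
# fed straight into ArviZ model comparison (a sketch, assuming arviz as az):
import arviz as az

results = get_model_results_dict()
print(az.compare(results, ic='loo'))  # approximate LOO-CV ranking of the models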
Example #4
def check_fit(row, root='.'):
    output_dict = {}
    sample_loc = root+'/output/posteriors/' + row['event_name'] + '_raw.p'
    data_loc = root+'/data/timeseries/aggregated/' + row['incident_name'] + '_raw.csv'
    samples = pickle.load(open(sample_loc, 'rb'))
    dat = pd.read_csv(data_loc).iloc[row['start']:row['end']]
    y = dat['total_tweets'].values
    
    print('--------')
    print(row['event_name'])
    output_dict.update(pystan.check_hmc_diagnostics(
        samples, pars=['alpha', 'beta', 'decay', 'lambda', 'phi']))
    

    observed_engagement = np.sum(y)
    output_dict['observed_engagement'] = observed_engagement
    output_dict['y_0'] = y[0]
    
    # does the observed cumulative total fall inside the 89% posterior interval?
    low, high = np.percentile(np.cumsum(samples['y_hat'], axis=1)[:, -1], q=[5.5, 94.5])
    cumulative_fit = low < np.cumsum(y)[-1] < high


    output_dict['final_predicted'] = cumulative_fit
    
    output_dict.update(row.to_dict())
    output_dict['sample_loc'] = sample_loc
    output_dict['data_loc'] = data_loc
    output_dict['lower_predicted'] = low
    output_dict['upper_predicted'] = high

    return output_dict
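
# Hypothetical driver for check_fit: 'events.csv' is a placeholder table with
# event_name, incident_name, start and end columns.
import pandas as pd

events = pd.read_csv('events.csv')
checks = pd.DataFrame([check_fit(row) for _, row in events.iterrows()])
print(checks[['event_name', 'final_predicted']])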
Example #5
def get_subgroup_models():
    df = get_formatted_data()

    # drug class subgroup analysis
    model_res_dict = {}

    for drug_class in DRUG_CLASSES:
        study_ids = df.query(f'drug_class == "{drug_class}"').study_id.unique()
        df_sub = df[(df.study_id.isin(study_ids))
                    & (df.drug_class.isin([drug_class, 'placebo']))].copy()
        placebo_controlled_study_ids = set(df_sub.query('is_active == 1')['study_id']) \
            .intersection(df_sub.query('is_active == 0')['study_id'])
        df_sub = df_sub[df_sub.study_id.isin(placebo_controlled_study_ids)]

        for column in ['study_id', 'scale', 'drug_class']:
            df_sub = add_rank_column(df_sub, column)

        df_sub = aggregate_treatment_arms(df_sub)
        df_sub = get_variability_effect_sizes(df_sub)

        model = 'remr'
        stan_model = compile_model(os.path.join(stan_model_path,
                                                f'{model}.stan'),
                                   model_name=model)

        data_dict = get_data_dict(df_sub, 'lnVR')

        fit = stan_model.sampling(data=data_dict,
                                  iter=4000,
                                  warmup=1000,
                                  chains=3,
                                  control={'adapt_delta': 0.99},
                                  check_hmc_diagnostics=True,
                                  seed=1)
        pystan.check_hmc_diagnostics(fit)

        data = az.from_pystan(
            posterior=fit,
            posterior_predictive=['Y_pred'],
            observed_data=['Y_meas', 'X_meas'],
            log_likelihood='log_lik',
        )

        model_res_dict[drug_class] = data
    return model_res_dict
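
# Sketch of a subgroup comparison with the returned dictionary; the parameter
# name 'mu' is an assumption about what remr.stan calls its pooled effect.
import arviz as az

subgroup_res = get_subgroup_models()
az.plot_forest(list(subgroup_res.values()),
               model_names=list(subgroup_res.keys()),
               var_names=['mu'])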
Example #6
def get_baseline_severity_model():
    df = prepare_data()

    effect_statistic = 'lnVR'

    def first_per_study(col):
        # one value per study: rows within a study share these study-level columns
        return df.groupby(['study_id']).agg({col: 'first'}).reset_index()[col].values

    data_dict = {
        'N': len(df.study_id.unique()),
        'Y_meas': first_per_study(effect_statistic),
        'X_meas': first_per_study('lnRR'),
        'SD_Y': np.sqrt(first_per_study(f'var_{effect_statistic}')),
        'SD_X': np.sqrt(first_per_study('var_lnRR')),
        # N-weighted mean baseline severity per study
        'X0': df.groupby(['study_id']).apply(
            lambda x: np.sum(x['baseline'] * x['N']) / np.sum(x['N'])
        ).reset_index()[0].values,
        'run_estimation': 1
    }

    stan_model = compile_model(
        os.path.join(stan_model_path, 'remr_bs.stan'),
        model_name='remr_bs'
    )

    fit = stan_model.sampling(
        data=data_dict,
        iter=4000,
        warmup=1000,
        chains=3,
        control={'adapt_delta': 0.99},
        check_hmc_diagnostics=True,
        seed=1
    )
    pystan.check_hmc_diagnostics(fit)

    data = az.from_pystan(
        posterior=fit,
        posterior_predictive=['Y_pred'],
        observed_data=['Y_meas', 'X_meas', 'X0'],
        log_likelihood='log_lik',
    )
    return data
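
# The run_estimation flag suggests remr_bs.stan gates its likelihood, so a
# prior-predictive variant inside the function could look like this sketch
# (assuming the Stan program follows the common run_estimation pattern):
prior_dict = dict(data_dict, run_estimation=0)
prior_fit = stan_model.sampling(data=prior_dict, iter=2000, warmup=1000,
                                chains=1, seed=1)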
Example #7
def get_simulation_results():
    data_dict = {
        'N': 1000,
        'rho': -0.4,
        'sd_te': 6.5,
        'sd_m': 0.001,
        'lambda': 0.2,
        'theta': 0.9
    }
    simulation_stan_model = compile_model(os.path.join(stan_model_path,
                                                       'simulation.stan'),
                                          model_name='simulation')
    fit = simulation_stan_model.sampling(data=data_dict,
                                         warmup=500,
                                         iter=2500,
                                         chains=2,
                                         check_hmc_diagnostics=True,
                                         seed=1)
    pystan.check_hmc_diagnostics(fit)
    data = az.from_pystan(posterior=fit)
    return data
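
# Quick follow-up (a sketch, assuming arviz as az is imported): the returned
# InferenceData works with the usual ArviZ convergence summaries.
sim = get_simulation_results()
print(az.summary(sim))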
Example #8
def get_prior_comparison():
    df = get_model_input_df()
    model_res_dict = {}

    model = 'remr_prior'
    stan_model = compile_model(os.path.join(stan_model_path, f'{model}.stan'),
                               model_name=model)
    effect_statistic = 'lnVR'
    data_dict = get_data_dict(df, effect_statistic)

    prior_dict = {'reference': (0, 1), 'optimistic': (np.log(2), 0.43)}
    # the optimistic prior puts roughly 5% of its mass below zero:
    # from scipy import stats
    # stats.norm.cdf(0, loc=np.log(2), scale=0.43)  # 0.0535
    for prior, (mu_prior_loc, mu_prior_scale) in prior_dict.items():
        data_dict_prior = data_dict.copy()

        data_dict_prior['mu_prior_loc'] = mu_prior_loc
        data_dict_prior['mu_prior_scale'] = mu_prior_scale

        fit = stan_model.sampling(data=data_dict_prior,
                                  iter=4000,
                                  warmup=1000,
                                  chains=3,
                                  control={'adapt_delta': 0.99},
                                  check_hmc_diagnostics=True,
                                  seed=1)
        pystan.check_hmc_diagnostics(fit)

        data = az.from_pystan(
            posterior=fit,
            posterior_predictive=['Y_pred'],
            observed_data=['Y_meas', 'X_meas'],
            log_likelihood='log_lik',
        )
        model_res_dict[f'{model}_{effect_statistic}_{prior}'] = data
    return model_res_dict
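
# Sketch comparing the two priors' posteriors; 'mu' is an assumed parameter
# name in remr_prior.stan (the one the mu_prior_* inputs refer to).
import arviz as az

res = get_prior_comparison()
az.plot_density([res['remr_prior_lnVR_reference'], res['remr_prior_lnVR_optimistic']],
                data_labels=['reference', 'optimistic'],
                var_names=['mu'])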
Example #9
def fit_model(row, model, keep=True, root='.'):
    raw_path = root + '/output/posteriors/' + row['event_name'] + '_raw.p'
    if keep and os.path.isfile(raw_path):
        return None

    raw_df = pd.read_csv(root
                         + '/data/timeseries/aggregated/'
                         + row['incident_name'] + '_raw.csv')
    raw_df = raw_df.iloc[row['start']:row['end']]

    # add 1 before taking the log so users with zero followers do not
    # produce log(0); this should not impact inference
    stan_data = dict(y=raw_df['total_tweets'].values.astype('int')[1:],
                     N=raw_df.shape[0],
                     x=np.log(raw_df['user_followers_count'].values.astype('int') + 1))
    samples = model.sampling(data=stan_data,
                             check_hmc_diagnostics=False,
                             refresh=0,
                             control={'adapt_delta': .99})
    pickle.dump(samples.extract(inc_warmup=False),
                open(root + '/output/posteriors/' + row['event_name'] + '_extracted.p', 'wb'))
    pickle.dump(samples, open(raw_path, 'wb'))
    return pystan.check_hmc_diagnostics(samples,
                                        pars=['alpha', 'beta', 'lambda', 'decay', 'phi'])
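
# Hypothetical usage of fit_model: compile the Stan program once, then loop
# over an events table ('model.stan' and 'events.csv' are placeholders).
import pandas as pd
import pystan

model = pystan.StanModel(file='model.stan')
events = pd.read_csv('events.csv')
diagnostics = [fit_model(row, model) for _, row in events.iterrows()]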
Example #10
for d, nm in [(logistic_data_synth, 'synth'), (logistic_data_ds1, 'ds1'),
              (logistic_data_phish, 'phish')]:
    if not os.path.exists('results/logistic_samples_' + nm + '.npy'):
        t0 = time.process_time()
        fit = sm.sampling(data=d,
                          iter=N_samples * 2,
                          chains=1,
                          control={
                              'adapt_delta': 0.9,
                              'max_treedepth': 15
                          },
                          verbose=True)
        # fit.extract(permuted=False) has 3 dims: iterations, chains, parameters
        with open('results/logistic_params_' + nm + '.log', 'w') as f:
            f.write(str(pystan.check_hmc_diagnostics(fit)) + '\n')
            f.write(str(fit.model_pars) + '\n')
            f.write(str(fit.par_dims) + '\n')
        np.save('results/logistic_samples_' + nm + '.npy',
                fit.extract(permuted=False))
        tf = time.process_time()
        np.save('results/' + nm + '_mcmc_time.npy', tf - t0)
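
# Reading the saved draws back (a sketch): fit.extract(permuted=False) was
# saved as an (iterations, chains, parameters) array, so np.load recovers it.
samples_synth = np.load('results/logistic_samples_synth.npy')
print(samples_synth.shape)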

############################## (U)BVI / ADVI Params ######################################

N = 10
diag = True
n_samples = 2000
n_logfg_samples = 10000
adam_learning_rate = lambda itr: 1. / np.sqrt(itr + 1)
Example #11
def fit_stan(fname_fitres_comb: str,
             dir_output: str,
             sample: str,
             screen=False,
             lowz=True,
             bias=True,
             plot=False):
    """
    Fit Stan model for w.
    
    Parameters
    ----------
    fname_fitres_comb: str
        Complete path to fitres (with lowz, if requested).
    dir_output: str
        Complete path to output root folder.
    sample: str
        Sample to be fitted.
    screen: bool (optional)
        If True, print Stan results to screen. Default is False.     
    lowz: bool (optional)
        If True, add low-z sample. Default is True.
    bias: bool (optional)
        If True, add bias correction. Default is True. 
    plot: bool (optional)
        If True, generate chains plot. Default is False.
    """

    # read data for Bayesian model
    fitres_final = pd.read_csv(fname_fitres_comb,
                               index_col=False,
                               comment="#",
                               skip_blank_lines=True,
                               delim_whitespace=True)

    # set initial conditions
    z0 = 0
    E0 = 0
    c = 3e5
    H0 = 70

    # add small offset to duplicate redshifts
    fitres_final = remove_duplicated_z(fitres_final)

    # order data according to redshift
    indx = np.argsort(fitres_final['SIM_ZCMB'].values)

    # create input for stan model
    stan_input = {}
    stan_input['nobs'] = fitres_final.shape[0]
    stan_input['z'] = fitres_final['SIM_ZCMB'].values[indx]
    stan_input['mu'] = fitres_final['MU'].values[indx]
    stan_input['muerr'] = fitres_final['MUERR'].values[indx]
    stan_input['z0'] = z0
    stan_input['H0'] = H0
    stan_input['c'] = c
    stan_input['E0'] = np.array([E0])
    stan_input['ompri'] = om_pri[0]
    stan_input['dompri'] = om_pri[1]
    stan_input['wmin'] = w_pri[0]
    stan_input['wmax'] = w_pri[1]

    # save only stan input to file
    fname_stan_input = dir_output + 'stan_input/stan_input_salt2mu_lowz_withbias_' + sample + '.csv'

    stan_input2 = {}
    stan_input2['z'] = stan_input['z']
    stan_input2['mu'] = stan_input['mu']
    stan_input2['muerr'] = stan_input['muerr']
    stan_input_tofile = pd.DataFrame(stan_input2)
    stan_input_tofile.to_csv(fname_stan_input, index=False)

    # fit Bayesian model
    model = pystan.StanModel(file=dir_input + '/cosmo.stan')
    fit = model.sampling(data=stan_input,
                         iter=12000,
                         chains=5,
                         warmup=10000,
                         control={'adapt_delta': 0.99})

    # get summary
    res = fit.stansummary(pars=["om", "w"])
    check = str(pystan.check_hmc_diagnostics(fit))

    if screen:
        print(res)
        print(' ******* ')
        print(check)

    if lowz and bias:
        summ_fname = dir_output + 'stan_summary/stan_summary_' + sample + '_lowz_withbias.dat'
        chains_fname = dir_output + 'posteriors/pkl/chains_' + sample + '_lowz_withbias.pkl'
        trace_fname = dir_output + 'posteriors/trace/trace_plot_' + sample + '_lowz_withbias.png'
    elif lowz and not bias:
        summ_fname = dir_output + 'stan_summary/stan_summary_' + sample + '_lowz_nobias.dat'
        chains_fname = dir_output + 'posteriors/pkl/chains_' + sample + '_lowz_nobias.pkl'
        trace_fname = dir_output + 'posteriors/trace/trace_plot_' + sample + '_lowz_nobias.png'
    else:
        summ_fname = dir_output + 'stan_summary/stan_summary_' + sample + '.dat'
        chains_fname = dir_output + 'posteriors/pkl/chains_' + sample + '.pkl'
        trace_fname = dir_output + 'posteriors/trace/trace_plot_' + sample + '.png'

    with open(summ_fname, 'w') as op2:
        op2.write(res)
        op2.write('\n ************* \n')
        op2.write(check)

    samples = fit.extract(pars=['om', 'w'], permuted=True, inc_warmup=False)

    pickle.dump(samples, open(chains_fname, "wb"))

    # plot chains (the diagnostics were already captured in `check` above)
    if plot:
        arviz.plot_trace(fit, var_names=['om', 'w'])
        plt.savefig(trace_fname)

    if lowz and bias:
        data = pd.read_pickle(chains_fname)
        data2 = pd.DataFrame(data)
        data2.to_csv(dir_output + 'posteriors/csv/' + \
                     'chains_'  + sample + '_lowz_withbias.csv.gz', index=False)
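
# Hypothetical call (the fitres path and sample label are placeholders;
# om_pri, w_pri and dir_input must be defined at module level first):
fit_stan('fitres/combined_lowz.csv', 'output/', 'sample1', screen=True, plot=True)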
Example #12
    distanceToParent.get(x, 10) for x in hiddenLanguages + observedLanguages
]
dat["prior_only"] = 0
dat["Components"] = 2

print(dat)

sm = pystan.StanModel(file=f'{__file__[:-3]}.stan')

fit = sm.sampling(data=dat, iter=2000, chains=4)
la = fit.extract(permuted=True)  # return a dictionary of arrays

with open(f"fits/{__file__}.txt", "w") as outFile:
    print(fit, file=outFile)

import sys

# redirect stdout/stderr so the messages printed by check_hmc_diagnostics
# are captured in the diagnostic file, then restore the original streams
with open(f"fits/{__file__}_diagnostic.txt", "w") as outFile:
    old_out, old_err = sys.stdout, sys.stderr
    sys.stdout, sys.stderr = outFile, outFile
    try:
        print(pystan.check_hmc_diagnostics(fit), file=outFile)
    finally:
        sys.stdout, sys.stderr = old_out, old_err

#   print(la, file=outFile)
#print("Inferred logits", la["LogitsAll"].mean(axis=0))
#print("Inferred hidden traits", la["TraitHidden"].mean(axis=0))
#print("alpha", la["alpha"].mean(axis=0))
#print("corr_Sigma", la["corr_Sigma"].mean(axis=0))
#print("sigma_B", la["sigma_B"].mean(axis=0))
#print("Lrescor_B", la["Lrescor_B"].mean(axis=0))
#
Example #13
    dlt = DLTFull(
        response_col='pct_chg_in_sales_from_prev_mnth',
        regressor_col=regressors,
        date_col='date',
        seasonality=1,
        seed=2020,
        level_sm_input=0.3,  # recommend for higher frequency data
        regressor_sigma_prior=[0.5] * len(regressors),
        regression_penalty='lasso',
        period=365,
        prediction_percentiles=[5, 95])

    dlt.fit(df=train_df)

    # note: pystan.check_hmc_diagnostics expects a PyStan fit object; dlt is an
    # orbit DLTFull model here, so this call likely needs the underlying stanfit
    pystan.check_hmc_diagnostics(dlt)

    density_plot = plot_posterior_params(dlt,
                                         kind='density',
                                         incl_trend_params=True,
                                         incl_smooth_params=True)

    trace_plot = plot_posterior_params(dlt,
                                       kind='trace',
                                       incl_trend_params=True,
                                       incl_smooth_params=True)

    pair_plot = plot_posterior_params(dlt,
                                      kind='pair',
                                      pair_type='reg',
                                      incl_trend_params=False)
Example #14
# all int?

data = df[['i1', 'i2', 'i3', 'Response']].values

s = df[['s1', 's2', 's3']].values
stimulus = np.unique(s.flatten())

# how to call the functions?

fit = bds.bds(data, lapses=False)

# gives warning
# WARNING:pystan:Maximum (flat) parameter count (1000) exceeded: skipping diagnostic tests for n_eff and Rhat.
# To run all diagnostics call pystan.check_hmc_diagnostics(fit)

pystan.check_hmc_diagnostics(fit.stanfit)

# how to read out results?
scale = fit.get_scale_values()

CIl, CIh = fit.get_scale_credible_interval()

# how to plot results?
import matplotlib.pyplot as plt

plt.plot(stimulus, scale, 'o')
# matplotlib expects yerr rows as [lower errors, upper errors]
yerr = [scale - CIl, CIh - scale]
plt.errorbar(stimulus, scale, yerr=yerr, fmt='none', ecolor='b', capsize=0)

# others:
Example #15
stan_input['H0'] = H0
stan_input['c'] = c
stan_input['E0'] = np.array([E0])
stan_input['ompri'] = om_pri[0]
stan_input['dompri'] = om_pri[1]
stan_input['wmin'] = w_pri[0]
stan_input['wmax'] = w_pri[1]

# save only stan input to file
fname_stan_input = dir_output + 'stan_input/stan_input_salt2mu_lowz_withbias_' + case + '.csv'

stan_input2 = {}
stan_input2['z'] = stan_input['z']
stan_input2['mu'] = stan_input['mu']
stan_input2['muerr'] = stan_input['muerr']
stan_input_tofile = pd.DataFrame(stan_input2)
stan_input_tofile.to_csv(fname_stan_input, index=False)

# fit Bayesian model
model = pystan.StanModel(file=dir_input + '/cosmo.stan')
fit = model.sampling(data=stan_input, iter=12000, chains=5,
                     warmup=10000, control={'adapt_delta': 0.99})

# get summary
res = fit.stansummary(pars=["om", "w"])
check = str(pystan.check_hmc_diagnostics(fit))