def test_empty_parameter(self):
    model_code = """
        parameters {
            real y;
            vector[3] x;
            vector[0] a;
            vector[2] z;
        }
        model {
            y ~ normal(0,1);
        }
    """
    if pystan_version() == 2:
        from pystan import StanModel  # pylint: disable=import-error
        model = StanModel(model_code=model_code)
        fit = model.sampling(iter=500, chains=2, check_hmc_diagnostics=False)
    else:
        import stan  # pylint: disable=import-error
        model = stan.build(model_code)
        fit = model.sample(num_samples=500, num_chains=2)
    posterior = from_pystan(posterior=fit)
    test_dict = {"posterior": ["y", "x", "z", "~a"],
                 "sample_stats": ["diverging"]}
    fails = check_multiple_attrs(test_dict, posterior)
    assert not fails
def run_or_load_model(m_type, m_dict, iters, warmup, c_params):
    if m_type not in ['car', 'tobit']:
        raise ValueError('Invalid model type!')
    name = 'crash_{}_{}-{}_delta_{}_max_{}'.format(
        m_type, iters, warmup, c_params['adapt_delta'], c_params['max_treedepth'])
    try:
        model = load(Path('cache/' + name + '_model.joblib'))
    except FileNotFoundError:
        model = StanModel(file=Path('models/crash_{}.stan'.format(m_type)).open(),
                          extra_compile_args=["-w"],
                          model_name=name.split('-')[0])
        dump(model, Path('cache/' + name + '_model.joblib'))
    try:
        fit = load(Path('cache/' + name + '_fit.joblib'))
    except FileNotFoundError:
        fit = model.sampling(data=m_dict, iter=iters, warmup=warmup,
                             control=c_params, check_hmc_diagnostics=True)
        info = fit.stansummary()
        with open(Path('logs/' + name + '.log'), 'w') as c_log:
            c_log.write(info)
        dump(fit, Path('cache/' + name + '_fit.joblib'))
    return model, fit
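# Usage sketch for run_or_load_model above. The data dict and control settings
# are illustrative only, and it is assumed that the cache/, logs/, and models/
# directories exist and that get_datadict() (used elsewhere in this codebase)
# returns the Stan data dict for the chosen model.
control = {'adapt_delta': 0.9, 'max_treedepth': 10}
model, fit = run_or_load_model('tobit', get_datadict(), iters=2000,
                               warmup=500, c_params=control)
print(fit.stansummary())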
def simple_car_model(tobit_data: pd.DataFrame, ad_matrix):
    """
    In the researchers' model, phi is distributed around phi_bar.
    Is this handled by multi_normal_prec? Need to understand the docs and
    adjust if not.
    - Seems to be legitimate: the WinBUGS documentation does it in a similar way.
      https://mc-stan.org/docs/2_19/functions-reference/multivariate-normal-distribution-precision-parameterization.html
    - Find out what the CAR prior in car.normal is. Right now I just have 2/-2.
    - Unfortunately, there is no information available; just need to set
      something that works.
    """
    car_model = StanModel(file=Path('models/tobit_car_students.stan').open(),
                          extra_compile_args=["-w"])
    car_dict = get_datadict()
    car_dict['W'] = ad_matrix
    car_dict['U'] = 800
    # This smaller run still took 25 minutes to sample,
    # and still produced E-BFMI values that are too low.
    car_fit = car_model.sampling(data=car_dict, iter=2000, warmup=500, chains=4)
    dump(car_fit, Path('data/car_students_2000.joblib'))
    car_res = car_fit.extract()
    print('β_0: {}'.format(car_res['beta_zero'][501:].mean()))
    print('β: {}'.format(car_res['beta'][501:].mean(axis=0)))
    # Getting many rejections - bad? Phi is a bit like a covariance matrix.
    # -> Only in the beginning; after 200 iterations all is fine.
    # Result from the run: chains have not mixed, might need to re-parametrize.
    # Am I constraining the variables too much? Need to center somehow?
    return car_fit, car_model
def compile(self):
    """Compile the Stan model."""
    # Note: we deliberately use a centered parameterization for the
    # thetas at the moment. This is sub-optimal in terms of estimation,
    # but allows us to avoid having to add extra logic to detect and
    # handle intercepts in X.
    spec = f"""
    data {{
        int<lower=1> N;
        int<lower=1> K;
        vector[N] y;
        int<lower=1,upper=K> id[N];
        int<lower=1> C;
        matrix[K, C] X;
        vector[N] sigma;
    }}
    parameters {{
        vector[C] beta;
        vector[K] theta;
        real<lower=0> tau2;
    }}
    transformed parameters {{
        vector[N] mu;
        mu = theta[id] + X * beta;
    }}
    model {{
        y ~ normal(mu, sigma);
        theta ~ normal(0, tau2);
    }}
    """
    from pystan import StanModel
    self.model = StanModel(model_code=spec)
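# For reference, a minimal sketch of the non-centered variant that the comment
# above alludes to (not the model actually compiled here): theta is rewritten
# as a standard-normal theta_raw scaled by tau2, which often samples better in
# hierarchical models. The data block is assumed to be the same as above.
noncentered_spec_sketch = """
parameters {
    vector[C] beta;
    vector[K] theta_raw;
    real<lower=0> tau2;
}
transformed parameters {
    vector[K] theta;
    vector[N] mu;
    theta = tau2 * theta_raw;     // implies theta ~ normal(0, tau2)
    mu = theta[id] + X * beta;
}
model {
    theta_raw ~ normal(0, 1);
    y ~ normal(mu, sigma);
}
"""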
def fit(self, iterations=5000, control=default_control):
    self.stan_model = StanModel('../stan/player_scoring.stan')
    self.samples = self.stan_model.sampling(self.stan_data, iter=iterations,
                                            chains=4, refresh=1,
                                            control=control)
def scaled_spare_car(tobit_data: pd.DataFrame, ad_matrix):
    """
    Will try with values closer to 0 now. sigma was 67.3 with stdev 3.74.
    Even worse - E-BFMI is still small, but now there is also much treedepth
    saturation (OK) and chain divergence (bad!).
    Would need to check energy plots and what correlates...
    TODO: if I scale, I am in danger of not hitting the condition for U...
    -> should not be a problem if I have zeros there as the lower bound.
    """
    tobit_data['ones'] = np.ones(tobit_data.shape[0])
    trans = MaxAbsScaler().fit_transform(tobit_data[new_preds + ['apt']])
    data_centered = pd.DataFrame(trans, columns=new_preds + ['apt'])
    is_800 = tobit_data['apt'] == 800
    not_800 = tobit_data['apt'] != 800
    ii_obs = tobit_data[not_800]['id']
    ii_cens = tobit_data[is_800]['id']
    # After using vectorisation: gradient takes 0.0003 seconds.
    c_sparse_dict = {'X': data_centered[new_preds],
                     'n': tobit_data.shape[0],
                     'n_obs': not_800.sum(),
                     'n_cens': is_800.sum(),
                     'y_obs': data_centered[not_800]['apt'],
                     'ii_obs': ii_obs,
                     'ii_cens': ii_cens,
                     'p': len(new_preds),
                     'y_cens': data_centered[is_800]['apt'],
                     'W': ad_matrix,
                     'U': 1,
                     'W_n': ad_matrix.sum() // 2}
    # or just 'models/sparse_tcar_students_without_QR.stan'
    c_sp_model = StanModel(file=Path('sparse_tobitcar_students.stan').open(),
                           verbose=False, extra_compile_args=["-w"])
    c_params = {'adapt_delta': 0.95, 'max_treedepth': 12}
    # No more saturation, but still divergence...
    # Trying to constrain the model: α <= 0.99 instead of <= 1, σ >= 0.001.
    c_sp_fit = c_sp_model.sampling(c_sparse_dict, iter=4000, warmup=500,
                                   control=c_params)
    c_sp_res = c_sp_fit.extract()
    print(c_sp_fit.stansummary())
    dump(c_sp_fit, 'data/c_sp_4000.joblib')
    plt.scatter(c_sp_fit['lp__'], c_sp_fit['sigma'])  # sigma looks very correlated.
    simpler_csp = c_sp_res.copy()
    del simpler_csp['phi']
    del simpler_csp['y_cens']
    del simpler_csp['beta']
    del simpler_csp['y']
    if 'theta' in simpler_csp:
        del simpler_csp['theta']
    c_sp_df = pd.DataFrame.from_dict(simpler_csp)
    sns.pairplot(c_sp_df)
    return c_sp_fit, c_sp_model
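# The E-BFMI / divergence worries noted above can also be checked
# programmatically. A minimal sketch using pystan's built-in diagnostics
# (available in pystan >= 2.18; it logs warnings and returns a dict of
# per-check booleans covering divergences, treedepth, E-BFMI, n_eff and Rhat):
import pystan

def report_diagnostics(fit):
    """Run the standard HMC diagnostic checks on a fitted model."""
    return pystan.check_hmc_diagnostics(fit)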
def _fit_stan_model(self, vb: bool, sm: StanModel, data_dict: Dict,
                    pars: List, gen_init: Union[str, Callable],
                    nchain: int, niter: int, nwarmup: int, nthin: int,
                    adapt_delta: float, stepsize: float,
                    max_treedepth: int, ncore: int) -> Any:
    """Fit the stan model.

    Parameters
    ----------
    vb
        Whether to perform variational Bayesian analysis.
    sm
        The StanModel object to use to fit the model.
    data_dict
        Dict holding the data to pass to Stan.
    pars
        List specifying the parameters of interest.
    gen_init
        String or function to specify how to generate the initial values.
    nchain
        Number of chains to run.
    niter
        Number of iterations per chain.
    nwarmup
        Number of warm-up iterations.
    nthin
        Use every `i == nthin` sample to generate posterior distribution.
    adapt_delta
        Advanced control argument for sampler.
    stepsize
        Advanced control argument for sampler.
    max_treedepth
        Advanced control argument for sampler.
    ncore
        Argument for parallel computing while sampling multiple chains.

    Returns
    -------
    fit
        The fitted result returned by the `vb` or `sampling` function.
    """
    if vb:
        return sm.vb(data=data_dict, pars=pars, init=gen_init)
    else:
        return sm.sampling(data=data_dict, pars=pars, init=gen_init,
                           chains=nchain, iter=niter, warmup=nwarmup,
                           thin=nthin,
                           control={'adapt_delta': adapt_delta,
                                    'stepsize': stepsize,
                                    'max_treedepth': max_treedepth},
                           n_jobs=ncore)
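# Illustrative call of _fit_stan_model from inside the owning class. The
# compiled model `sm` and `data_dict` are assumed to exist already, and the
# parameter names in `pars` are placeholders, not names from any real model:
fit = self._fit_stan_model(vb=False, sm=sm, data_dict=data_dict,
                           pars=['mu', 'sigma'], gen_init='random',
                           nchain=4, niter=2000, nwarmup=1000, nthin=1,
                           adapt_delta=0.95, stepsize=1.0, max_treedepth=10,
                           ncore=4)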
def verify_stan():
    """ Simplest model to verify the Stan installation. """
    model_code = 'parameters {real y;} model {y ~ normal(0,1);}'
    model = StanModel(model_code=model_code)
    y = model.sampling().extract()['y']
    print('If this worked, you will see a value near 0 now:')
    print(y.mean())
def _bayes_sampling(x, y, distribution='normal'):
    """
    Helper function.

    Args:
        x (array_like): sample of a treatment group
        y (array_like): sample of a control group
        distribution: name of the KPI distribution model, which assumes a
            Stan model file with the same name exists

    Returns:
        tuple:
            - the posterior samples
            - sample size of x
            - sample size of y
            - absolute mean of x
            - absolute mean of y
    """
    # Check that data was provided
    if x is None or y is None:
        raise ValueError('Please provide two non-None samples.')

    # Coerce the samples (including missing values) to float arrays
    _x = np.array(x, dtype=float)
    _y = np.array(y, dtype=float)
    mu_x = np.nanmean(_x)
    mu_y = np.nanmean(_y)
    n_x = statx.sample_size(_x)
    n_y = statx.sample_size(_y)

    if distribution == 'normal':
        fit_data = {'Nc': n_y, 'Nt': n_x, 'x': _x, 'y': _y}
    elif distribution == 'poisson':
        fit_data = {'Nc': n_y, 'Nt': n_x,
                    'x': _x.astype(int), 'y': _y.astype(int)}
    else:
        raise NotImplementedError

    model_file = __location__ + '/../models/' + distribution + '_kpi.stan'
    sm = StanModel(file=model_file)

    fit = sm.sampling(data=fit_data, iter=25000, chains=4, n_jobs=1, seed=1,
                      control={'stepsize': 0.01, 'adapt_delta': 0.99})
    traces = fit.extract()

    return traces, n_x, n_y, mu_x, mu_y
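# Usage sketch for _bayes_sampling (toy samples; the keys of the returned
# `traces` dict depend on the parameters declared in the chosen *_kpi.stan
# model file, so they are not spelled out here):
traces, n_x, n_y, mu_x, mu_y = _bayes_sampling([1.2, 0.8, 1.1, 1.0],
                                               [1.0, 0.9, 0.95, 1.05],
                                               distribution='normal')
print('observed absolute means:', mu_x, mu_y)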
def main():
    schools_dat = {'J': 8,
                   'y': [28, 8, -3, 7, -1, 1, 18, 12],
                   'sigma': [15, 10, 16, 11, 9, 11, 10, 18]}
    sm = StanModel(file='model.stan')
    fit = sm.sampling(data=schools_dat, iter=1000, chains=4, seed=555)
    with open(DATA_FILE_NAME, 'wb') as f:
        pickle.dump({'model': sm, 'fit': fit}, f)
def sparse_car_model(tobit_data: pd.DataFrame, ad_matrix):
    sparse_dict = get_sparse_modeldict(tobit_data, ad_matrix)
    sparse_model = StanModel(
        file=Path('models/sparse_tobitcar_students.stan').open(),
        extra_compile_args=["-w"])
    sparse_fit = sparse_model.sampling(sparse_dict, iter=4000, warmup=500,
                                       chains=4)
    print(sparse_fit.stansummary())
    return sparse_fit, sparse_model
def coin_model():
    """ Example from "Kruschke: Doing Bayesian Data Analysis". """
    coin_model = StanModel(file=Path('models/bernoulli_example.stan').open())
    # generate some data
    N = 50
    z = 10
    y = [1] * z + [0] * (N - z)
    coin_data = {'y': y, 'N': N}
    # warmup is the same as burn-in in JAGS
    return coin_model.sampling(data=coin_data, chains=3, iter=1000, warmup=200)
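# Quick posterior check for coin_model above (a sketch: it assumes the file
# models/bernoulli_example.stan exposes the heads probability as a parameter
# named `theta`, which is not confirmed by the snippet itself):
coin_fit = coin_model()
theta = coin_fit.extract()['theta']
print('posterior mean of theta: {:.3f}'.format(theta.mean()))  # should land near z/N = 0.2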
def stanTopkl():
    if os.path.isfile('log_normal.pkl'):
        os.remove('log_normal.pkl')
    sm = StanModel(file='log_normal.stan')
    with open('log_normal.pkl', 'wb') as f:
        pickle.dump(sm, f)
    if os.path.isfile('log_t.pkl'):
        os.remove('log_t.pkl')
    sm = StanModel(file='log_t.stan')
    with open('log_t.pkl', 'wb') as f:
        pickle.dump(sm, f)
def test_init_zero_exception_inf_grad(self):
    code = """
    parameters { real x; }
    model { target += 1 / log(x); }
    """
    sm = StanModel(model_code=code)
    with self.assertRaises(RuntimeError):
        sm.sampling(init='0', iter=1, chains=1)
def test_init_zero_exception_inf_grad(self):
    code = """
    parameters { real x; }
    model { lp__ <- 1 / log(x); }
    """
    sm = StanModel(model_code=code)
    assertRaisesRegex = self.assertRaisesRegexp if PY2 else self.assertRaisesRegex
    with assertRaisesRegex(RuntimeError, 'divergent gradient'):
        sm.sampling(init='0', iter=1)
def linear_model():
    """ First example from the Stan User's Guide. """
    linear_model = StanModel(file=Path('models/linear_example.stan').open(),
                             extra_compile_args=["-w"])
    x = list(range(10))
    y = [1.1, 2.04, 3.07, 3.88, 4.95, 6.11, 7.03, 7.89, 8.91, 10]
    linear_data = {'x': x, 'y': y, 'N': 10}
    linear_fit = linear_model.sampling(data=linear_data)
    linear_res = linear_fit.extract()
    print('α : {}'.format(np.mean(linear_res['alpha'])))
    print('β : {}'.format(np.mean(linear_res['beta'])))
    return linear_fit
def bnb_stan(dataset, oos_dataset, warmup=20000, n_iter=25000):
    Y_data, ratings_data, expectations_data, team_dummies_data, pct = (
        extract_data(dataset))
    _, oos_ratings_data, oos_expectations_data, oos_team_dummies_data, oos_pct = \
        extract_data(oos_dataset)

    ratings_data = ratings_data.squeeze()
    oos_ratings_data = oos_ratings_data.squeeze()
    ratings_data, oos_ratings_data = normalize(ratings_data, oos_ratings_data)
    ratings_data = np.stack((ratings_data, np.square(ratings_data)), axis=1)
    oos_ratings_data = np.stack(
        (oos_ratings_data,
         np.sign(oos_ratings_data) * np.square(oos_ratings_data)),
        axis=1)
    pct, oos_pct = normalize(pct, oos_pct)
    expectations_data, oos_expectations_data = normalize(
        expectations_data, oos_expectations_data)

    home_team_dummies = team_dummies_data[:, 0, :]
    away_team_dummies = team_dummies_data[:, 1, :]
    stan_data = {
        'n_rows': Y_data.shape[0],
        'n_teams': team_dummies_data.shape[2],
        'm_ratings': ratings_data.shape[1],
        'max_goals': 10,
        'home_team_dummies': home_team_dummies,
        'away_team_dummies': away_team_dummies,
        'expectations': expectations_data,
        'pct': pct,
        'ratings': ratings_data,
        'Y': Y_data.astype(np.int16),
        'oos_n_rows': oos_ratings_data.shape[0],
        'oos_home_team_dummies': oos_team_dummies_data[:, 0, :],
        'oos_away_team_dummies': oos_team_dummies_data[:, 1, :],
        'oos_expectations': oos_expectations_data,
        'oos_ratings': oos_ratings_data,
        'oos_pct': oos_pct
    }
    stan_model = StanModel('../stan/games.stan')
    samples = stan_model.sampling(stan_data, warmup=warmup, iter=n_iter,
                                  chains=4, refresh=1,
                                  control={'adapt_delta': 0.99,
                                           'max_treedepth': 15})
    preds = samples['predicted_probabilities']
    mean_preds = np.mean(preds, axis=0)
    return samples, mean_preds
def stanTopkl():
    """
    Compile the Stan models once and pickle them, avoiding re-compiling
    the models later.
    """
    if os.path.isfile('log_normal.pkl'):
        os.remove('log_normal.pkl')
    sm = StanModel(file='log_normal.stan')
    with open('log_normal.pkl', 'wb') as f:
        pickle.dump(sm, f)
    if os.path.isfile('log_t.pkl'):
        os.remove('log_t.pkl')
    sm = StanModel(file='log_t.stan')
    with open('log_t.pkl', 'wb') as f:
        pickle.dump(sm, f)
def compile_stan_model(stan_model_name):
    """
    Compile a Stan model and save it as a pickle.
    """
    source_model = pkg_resources.resource_filename(
        'orbit', 'stan/{}.stan'.format(stan_model_name))
    compiled_model = pkg_resources.resource_filename(
        'orbit', 'stan_compiled/{}.pkl'.format(stan_model_name))
    # updated for py3
    os.makedirs(os.path.dirname(compiled_model), exist_ok=True)
    # compile only if the stan source has changed
    if not os.path.isfile(compiled_model) or \
            os.path.getmtime(compiled_model) < os.path.getmtime(source_model):
        with open(source_model, encoding="utf-8") as f:
            model_code = f.read()
        sm = StanModel(model_code=model_code)
        with open(compiled_model, 'wb') as f:
            pickle.dump(sm, f, protocol=pickle.HIGHEST_PROTOCOL)

    return None
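# Companion loader sketch (a hypothetical helper, not part of orbit's public
# API): read back the pickle written by compile_stan_model instead of
# re-compiling the Stan source.
import pickle
import pkg_resources

def load_compiled_stan_model(stan_model_name):
    """Load the cached, pre-compiled Stan model for `stan_model_name`."""
    compiled_model = pkg_resources.resource_filename(
        'orbit', 'stan_compiled/{}.pkl'.format(stan_model_name))
    with open(compiled_model, 'rb') as f:
        return pickle.load(f)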
def get_or_compile_stan_model(model_file, distribution):
    """
    Create a Stan model. Compiles the model and saves it as a .pkl file in the
    folder selected by the tempfile module if no compiled file exists yet;
    otherwise loads the precompiled model from the temporary directory.

    Args:
        model_file: model file location
        distribution: name of the KPI distribution model, which assumes a
            Stan model file with the same name exists

    Returns:
        returns the compiled Stan model for the selected distribution, or the
        normal distribution as the default option

    Note:
        compiled_model_file is a hardcoded file path, which may cause issues
        in the future. There are alternative implementations for handling
        Stan models:
        1. Using global variables.
        2. Pre-compiling Stan models and shipping them as part of the expan
           project.
        3. Using temporary files with the tempfile module is not currently
           possible, since it generates a unique file name which is difficult
           to track. However, compiled models are saved in a temporary
           directory chosen via the tempfile module, which varies based on
           the current platform and settings. Cleaning up the temp dir is
           done on boot.
    """
    python_version = '{0[0]}.{0[1]}'.format(sys.version_info)
    compiled_model_file = tempfile.gettempdir() \
        + '/expan_early_stop_compiled_stan_model_' \
        + distribution + '_' + python_version + '.pkl'

    if os.path.isfile(compiled_model_file):
        sm = pickle.load(open(compiled_model_file, 'rb'))
    else:
        sm = StanModel(file=model_file)
        with open(compiled_model_file, 'wb') as f:
            pickle.dump(sm, f)
    return sm
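# Usage sketch (the model file path is illustrative; the compiled pickle lands
# in the platform's temp dir, keyed by distribution name and Python version):
sm = get_or_compile_stan_model('../models/normal_kpi.stan', distribution='normal')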
def test_bernoulli_compile_time(self):
    model_code = self.bernoulli_model_code
    t0 = time.time()
    model = StanModel(model_code=model_code)
    self.assertIsNotNone(model)
    msg = "Compile time: {}s (vs. RStan 28s)\n".format(int(time.time() - t0))
    logging.info(msg)
def test_stanc_exception(self):
    model_code = 'parameters {real z;} model {z ~ no_such_distribution();}'
    assertRaisesRegex = self.assertRaisesRegexp if PY2 else self.assertRaisesRegex
    with assertRaisesRegex(ValueError, 'unknown distribution'):
        stanc(model_code=model_code)
    with assertRaisesRegex(ValueError, 'unknown distribution'):
        StanModel(model_code=model_code)
def get_stan_model(model_name, recompile=False, **model_params):
    """
    Compile a Stan probabilistic model, or load the pre-compiled model from
    the cache, if available.

    Parameters
    ----------
    model_name : str
        The name of the model to use:
        "bernoulli", "b": Bernoulli likelihood, flat prior on probabilities
        "beta-binomial", "bb": Binomial likelihood and Beta prior
    recompile : boolean
        If set to True, always recompile the model; otherwise try to use the
        cached pickle of the model.
    """
    python_version = 'python{0[0]}.{0[1]}'.format(sys.version_info)
    filename = "-".join([_f for _f in [model_name, python_version] if _f])
    compiled_model_file = os.path.join(STAN_MODEL_CACHE, filename + ".pickle")

    if os.path.isfile(compiled_model_file) and not recompile:
        with open(compiled_model_file, 'rb') as m:
            model = pickle.load(m)
    else:
        model_code = get_stan_model_code(model_name)
        model = StanModel(model_code=model_code)
        logging.info('Saving model to {}'.format(compiled_model_file))
        with open(compiled_model_file, 'wb') as f:
            pickle.dump(model, f)
    return model
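# Usage sketch for get_stan_model:
model = get_stan_model('beta-binomial')                    # cached after the first compile
fresh_model = get_stan_model('bernoulli', recompile=True)  # force a rebuild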
def __init__(self, context, rdd, prepare_data_callback, stan_file):
    self.rdd = rdd
    self.prepare_data_callback = prepare_data_callback
    self.n_partitions = self.rdd.getNumPartitions()
    sm = StanModel(file=stan_file)
    pickle.dump(sm, open(PICKLE_FILENAME, "wb"))
    context.addFile(PICKLE_FILENAME)
def tobit_vec_QR(tobit_data: pd.DataFrame, scaled: bool = False):
    """
    Vectorised version of the tobit model that combines the parameters for
    the censored values with the uncensored values into a transformed y, for
    more efficiency. (See the Stan fragment sketched after this function.)
    """
    vec_model = StanModel(
        file=Path('models/tobit_students_vec_qr.stan').open(),
        extra_compile_args=["-w"])
    not_800 = tobit_data['apt'] != 800
    is_800 = tobit_data['apt'] == 800
    ii_obs = tobit_data[not_800]['id']
    ii_cens = tobit_data[is_800]['id']
    if not scaled:
        vec_dict = {'X': tobit_data[new_preds],
                    'n_obs': not_800.sum(),
                    'n_cens': is_800.sum(),
                    'U': 800,
                    'y_obs': tobit_data[not_800]['apt'],
                    'p': len(new_preds),
                    'ii_obs': ii_obs,
                    'ii_cens': ii_cens}
    else:
        trans = MaxAbsScaler().fit_transform(tobit_data[new_preds + ['apt']])
        data_centered = pd.DataFrame(trans, columns=new_preds + ['apt'])
        vec_dict = {'X': data_centered[new_preds],
                    'n_obs': not_800.sum(),
                    'n_cens': is_800.sum(),
                    'U': 800,
                    'y_obs': data_centered[not_800]['apt'],
                    'p': len(new_preds),
                    'ii_obs': ii_obs,
                    'ii_cens': ii_cens,
                    'X_cens': data_centered[is_800][new_preds]}
    vec_fit = vec_model.sampling(data=vec_dict, iter=10000, chains=4,
                                 warmup=2000, control=c_params)
    print('β: {}'.format(vec_fit['beta'][501:].mean(axis=0)))
    print(vec_fit.stansummary())
    return vec_fit, vec_model
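# The "transformed y" trick the docstring above refers to, sketched as a Stan
# fragment. This mirrors the censored-data pattern from the Stan User's Guide;
# the actual file models/tobit_students_vec_qr.stan may differ in detail.
# n_obs, n_cens, U, y_obs, ii_obs and ii_cens are assumed to come from the
# data block, matching the keys of vec_dict above.
tobit_merge_sketch = """
parameters {
    vector<lower=U>[n_cens] y_cens;   // censored scores treated as parameters
}
transformed parameters {
    vector[n_obs + n_cens] y;
    y[ii_obs] = y_obs;                // observed values slotted in
    y[ii_cens] = y_cens;              // sampled censored values slotted in
}
"""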
def tobit_simple_model(tobit_data: pd.DataFrame, scaled: bool = False):
    """
    2) Using a censored model. It has the same sigma - in the paper, the
    distinction between ε_{it} ~ normal(0, σ²) and θ^m_{it} ~ normal(0, δ²_m)
    is clearly made.
    This looks quite close to the values from the tutorial:
    Intercept: 209.5488, mydata$read: 2.6980, mydata$math: 5.9148
    """
    censored_model = StanModel(
        file=Path('models/tobit_students_split.stan').open(),
        extra_compile_args=["-w"])
    not_800 = tobit_data['apt'] != 800
    is_800 = tobit_data['apt'] == 800
    if not scaled:
        cens_dict_ex = {'X': tobit_data[not_800][new_preds],
                        'n': tobit_data.shape[0] - is_800.sum(),
                        'y': tobit_data[not_800]['apt'],
                        'n_cens': is_800.sum(),
                        'p': len(new_preds),
                        'X_cens': tobit_data[is_800][new_preds],
                        'y_cens': tobit_data[is_800]['apt'],
                        'U': 800}
    else:
        trans = MaxAbsScaler().fit_transform(tobit_data[new_preds + ['apt']])
        data_centered = pd.DataFrame(trans, columns=new_preds + ['apt'])
        cens_dict_ex = {'X': data_centered[not_800][new_preds],
                        'n': tobit_data.shape[0] - is_800.sum(),
                        'y': data_centered[not_800]['apt'],
                        'n_cens': is_800.sum(),
                        'p': len(new_preds),
                        'y_cens': data_centered[is_800]['apt'],
                        'U': 1,
                        'X_cens': tobit_data[is_800][new_preds]}
    censored_fit = censored_model.sampling(data=cens_dict_ex, iter=2000,
                                           chains=4, warmup=500,
                                           control=c_params)
    censored_res = censored_fit.extract()
    print('β: {}'.format(censored_res['beta'][501:].mean(axis=0)))
    return censored_fit, censored_model
def build_stan_model(target_dir, model_dir=MODEL_DIR):
    from pystan import StanModel
    model_name = 'prophet.stan'
    target_name = 'prophet_model.pkl'
    with open(os.path.join(model_dir, model_name)) as f:
        model_code = f.read()
    sm = StanModel(model_code=model_code)
    with open(os.path.join(target_dir, target_name), 'wb') as f:
        pickle.dump(sm, f, protocol=pickle.HIGHEST_PROTOCOL)
def run_inference():
    df = pd.read_csv('3gaussians-10k.csv')
    X = np.array(df[['XX', 'YY']].values)
    K = 3
    data = {'D': 2,
            'K': K,
            'N': 10000,
            'Omega0': np.identity(2),
            'alpha': K * [0.1],
            'beta0': 0.1,
            'dof0': 1.1,
            'm0': np.zeros(2),
            'x': X}
    model = StanModel(file='finite_gaussian_mixture.stan')
    return model.sampling(data=data, warmup=200, iter=700)
def compile_stan_models(target_dir, model_dir=MODEL_DIR):
    """Pre-compile the Stan models that are used by the module."""
    from pystan import StanModel
    names = ["simple_model.stan", "model_with_prior.stan"]
    targets = ["simple_model.pkl", "prior_model.pkl"]
    for name, target in zip(names, targets):
        sm = StanModel(file=os.path.join(model_dir, name))
        with open(os.path.join(target_dir, target), "wb") as f:
            pickle.dump(sm, f, protocol=pickle.HIGHEST_PROTOCOL)
def compile_stan_models(target_dir, model_dir=MODEL_DIR):
    """Pre-compile the Stan models that are used by the module."""
    from pystan import StanModel
    print("Compiling Stan player model, and putting pickle in {}".format(
        target_dir))
    sm = StanModel(file=os.path.join(model_dir, "player_forecasts.stan"))
    with open(os.path.join(target_dir, "player_forecasts.pkl"), "wb") as f_stan:
        pickle.dump(sm, f_stan, protocol=pickle.HIGHEST_PROTOCOL)
def test_matrix_param_order_optimizing():
    model_code = """
    data {
        int<lower=2> K;
    }
    parameters {
        matrix[K,2] beta;
    }
    model {
        for (k in 1:K)
            beta[k,1] ~ normal(0,1);
        for (k in 1:K)
            beta[k,2] ~ normal(100,1);
    }"""
    sm = StanModel(model_code=model_code)
    op = sm.optimizing(data=dict(K=3))
    beta = op['par']['beta']
    assert beta.shape == (3, 2)
    beta_colmeans = np.mean(beta, axis=0)
    assert beta_colmeans[0] < 4
    assert beta_colmeans[1] > 100 - 4
import pandas as pd
from pystan import StanModel
import matplotlib.pyplot as plt
import pickle

d = pd.read_csv('input/data-attendance-1.txt')
d.Score /= 200
data = d.to_dict('list')
data.update({'N': len(d)})
stanmodel = StanModel(file='model/model5-3.stan')

# NUTS (No U-Turn Sampler)
fit_nuts = stanmodel.sampling(data=data, n_jobs=1)
mcmc_sample = fit_nuts.extract()
mu_est = mcmc_sample['mu']

# ADVI (Automatic Differentiation Variational Inference)
fit_vb = stanmodel.vb(data=data)
vb_sample = pd.read_csv(fit_vb['args']['sample_file'].decode('utf-8'),
                        comment='#')
vb_sample = vb_sample.drop([0, 1])
mu_est = vb_sample.filter(regex=r'mu\.\d+')

with open('output/model_and_result.pkl', 'wb') as f:
    pickle.dump(stanmodel, f)
    pickle.dump(fit_nuts, f)
def test8schools():
    model_name = "_8schools"
    sfile = os.path.join(
        os.path.dirname(__file__),
        "../stan/src/models/misc/eight_schools/eight_schools.stan")
    m = StanModel(file=sfile, model_name=model_name, verbose=True)
    m.dso
    yam = StanModel(file=sfile, model_name=model_name, save_dso=False,
                    verbose=True)
    yam.dso
    dat = dict(J=8,
               y=(28, 8, -3, 7, -1, 1, 18, 12),
               sigma=(15, 10, 16, 11, 9, 11, 10, 18))
    iter = 5020

    # HMC
    ss1 = m.sampling(data=dat, iter=iter, chains=4, algorithm='HMC',
                     refresh=100)
    ss1son = stan(fit=ss1, data=dat, init_r=0.0001)
    ss1son = stan(fit=ss1, data=dat, init_r=0)
    ainfo1 = ss1.get_adaptation_info()
    lp1 = ss1.get_logposterior()
    yalp1 = ss1.get_logposterior(inc_warmup=False)
    sp1 = ss1.get_sampler_params()
    yasp1 = ss1.get_sampler_params(inc_warmup=False)
    gm1 = ss1.get_posterior_mean()
    print(gm1)

    # NUTS 1
    ss2 = m.sampling(data=dat, iter=iter, chains=4, refresh=100,
                     control=dict(metric="unit_e"))
    ainfo2 = ss2.get_adaptation_info()
    lp2 = ss2.get_logposterior()
    yalp2 = ss2.get_logposterior(inc_warmup=False)
    sp2 = ss2.get_sampler_params()
    yasp2 = ss2.get_sampler_params(inc_warmup=False)
    gm2 = ss2.get_posterior_mean()
    print(gm2)

    # NUTS 2
    ss3 = m.sampling(data=dat, iter=iter, chains=4, refresh=100)
    ainfo3 = ss3.get_adaptation_info()
    lp3 = ss3.get_logposterior()
    yalp3 = ss3.get_logposterior(inc_warmup=False)
    sp3 = ss3.get_sampler_params()
    yasp3 = ss3.get_sampler_params(inc_warmup=False)
    gm3 = ss3.get_posterior_mean()
    print(gm3)

    # Non-diagonal metric
    ss4 = m.sampling(data=dat, iter=iter, chains=4,
                     control=dict(metric='dense_e'), refresh=100)
    ainfo4 = ss4.get_adaptation_info()
    lp4 = ss4.get_logposterior()
    yalp4 = ss4.get_logposterior(inc_warmup=False)
    sp4 = ss4.get_sampler_params()
    yasp4 = ss4.get_sampler_params(inc_warmup=False)
    gm4 = ss4.get_posterior_mean()
    print(gm4)

    print(ss1)
    print(ss2)
    print(ss3)
    ss1.plot()
    ss1.traceplot()
    ss9 = m.sampling(data=dat, iter=iter, chains=4, refresh=10)

    iter = 52012
    ss = stan(sfile, data=dat, iter=iter, chains=4, sample_file='8schools.csv')
    print(ss)
    ss_inits = ss.inits
    ss_same = stan(sfile, data=dat, iter=iter, chains=4,
                   seed=ss.stan_args[0]['seed'], init=ss_inits,
                   sample_file='ya8schools.csv')
    b = np.allclose(ss.extract(permuted=False), ss_same.extract(permuted=False))
    # b is not True, as ss is initialized randomly while ss_same is not.
    s = ss_same.summary(pars="mu", probs=(.3, .8))
    # not in python: print(ss_same, pars='theta', probs=c(.4, .8))
    print(ss_same)
N = len(x_data)

# Introduce noise
x_data = np.random.normal(x_data, 7)
y_data = np.random.normal(y_data, 8)

# Plot the data
pyplot.plot(x_data, y_data, 'o')

stan_data_mappings = {'x': x_data, 'y': y_data, 'N': N}

model = StanModel(file='models/univariate_regression.stan')
fit = model.sampling(data=stan_data_mappings)
params = fit.extract()
a_pred = params['a']
b_pred = params['b']
sigma_pred = params['sigma']

# Draw 100 points spanning the range of x_data.
xfit = np.linspace(-10 + min(x_data), 10 + max(x_data), 100)
# Number of samples.
M = len(a_pred)
yfit = a_pred.reshape((M, 1)) + b_pred.reshape((M, 1)) * xfit
    'sc_mean_vec': sc_mean_vec,
    'sc_var_vec': sc_var_vec,
    'mp_mean_prior_mean': 1,
    'mp_mean_prior_var': 2,
    'mp_var_prior_shape': 3,
    'mp_var_prior_scale': 4,
    'mp_corr_prior_conc': 3,
    'exponent_prior_mean': exponent,
    'base_rate_prior_mean': base_rate,
    'threshold_prior_mean': threshold,
    'n_samples': n_samp_est,
    'stdnorm_samples': stdnorm_samples
}

if recompile:
    sm = StanModel(file='corr_rate.stan', verbose=False)
    with open('corr_rate.pkl', 'wb') as f:
        pickle.dump(sm, f)
else:
    sm = pickle.load(open('corr_rate.pkl', 'rb'))

fit = sm.sampling(data=corr_dat, iter=2000, chains=3)
estimation = fit.extract(permuted=True)
cm = estimation['mp_corr_mat']
pickle.dump(cm, open('corr_mat_samples.pkl', 'wb'))
savemat('corr_mat_samples.mat', {'cm': cm})

mp_col = 'r'
sc_true_col = 'y'
sc_obs_col = 'g'
import numpy as np
import pickle
import pystan
from matplotlib import pyplot as plt
from pystan import StanModel
import scipy as sc

print('test')

sm = StanModel(file='model.stan')
with open('model.pkl', 'wb') as f:
    pickle.dump(sm, f)

def phi(x, mu=0, sd=1):
    return 0.5 * (1 + sc.special.erf((x - mu) / (sd * np.sqrt(2))))

N = 300
df = 2 * (np.random.random(N) - 0.5) * 0.6
d1 = 2 * (np.random.random(N) - 0.5)
A = lambda sc, de: lambda d: sc * d * np.exp(-np.abs(d) / de)
alpha = A(5, 1)
sigma = 0.06
y = (np.random.rand(N) < phi(df / sigma - alpha(d1))).astype(int)

plt.plot(d1, alpha(d1), 'rx')
plt.plot(d1, df / sigma, 'gx')

sm = pickle.load(open('model.pkl', 'rb'))
def test_optimizing_basic():
    sm = StanModel(model_code='parameters {real y;} model {y ~ normal(0,1);}')
    op = sm.optimizing()
    assert op['par']['y'].shape == ()
    assert abs(op['par']['y']) < 1
    // We will do this in separate dimensions (teff, logg) because we are not
    // considering the case that the outliers are covariant in the same way
    // we believe the measurements are
    increment_log_prob(log_sum_exp(
        log1m_alpha + normal_log(sp_vector[1], outlier_teff_mu, outlier_teff_sigma),
        log1m_alpha + normal_log(sp_vector[2], outlier_teff_mu, outlier_teff_sigma)
    ));
}
}"""

# Ok, here is our toy data:
with open("toy.data", "r") as fp:
    data = json.load(fp)

model = StanModel(model_code=model_code)

print("Optimizing...")
op = model.optimizing(data=data)

print("Fitting...")
fit = model.sampling(data=data, pars=op["par"], iter=20000)

subplots_adjust = {"left": 0.10, "bottom": 0.05, "right": 0.95,
                   "top": 0.95, "wspace": 0.20, "hspace": 0.45}

nodes = range(2)
dimensions = ("teff", "logg")

# Plot the m, b parameters for each node
    'C': C,
    'z_shape': z_shape,
    'z_scale': z_scale,
    'g_shape': g_shape,
    'g_scale': g_scale,
    'Var_u': Var_u
}

if model_type == 1:
    fname = "csm"
elif model_type == 2:
    fname = "csm2"
elif model_type == 3:
    fname = "msm"

if recompile:
    sm = StanModel(file=fname + '_inference.stan')
    with open(fname + '_inference.pkl', 'wb') as f:
        pickle.dump(sm, f)
else:
    sm = pickle.load(open(fname + '_inference.pkl', 'rb'))

fit = sm.sampling(data=gsm_dat, iter=2000, chains=2)
estimation = fit.extract(permuted=True)
g_est_mean = np.mean(estimation["g"], 0)
print('g est', g_est_mean)
print('g true', g_synth)
z_est_mean = np.mean(estimation["z"], 0).T
print('z est', z_est_mean)
print('z true', z_synth)
parameters {
    real mu;
    real<lower=0> tau;
    real eta[J];
}
transformed parameters {
    real theta[J];
    for (j in 1:J)
        theta[j] <- mu + tau * eta[j];
}
model {
    eta ~ normal(0, 1);
    y ~ normal(theta, sigma);
}
'''

m = StanModel(model_code=schools_code, model_name=model_name, verbose=True)

J = 8
y = (28, 8, -3, 7, -1, 1, 18, 12)
sigma = (15, 10, 16, 11, 9, 11, 10, 18)
iter = 1000
dat = dict(J=J, y=y, sigma=sigma)

ss1 = m.sampling(data=dat, iter=iter, chains=4, refresh=100)
print(ss1)
ss1.traceplot()

ss = stan(model_code=schools_code, data=dat, iter=iter, chains=4,
          sample_file='8schools.csv')
print(ss)
        yield obsid, odata

standata = StanData(datafile, samplefile, obssel)
N = standata.Noutcomes()
K = standata.Npredictors()
matrix = standata.predictors()
print("Predictor matrix (", K, "X", N, "):")
for i in range(min(K, 5)):
    print("\t".join([str(matrix[j, i]) for j in range(min(N, 10))]))

from pystan import StanModel

smfile = modelfile + ".pkl"
if not os.path.exists(smfile):
    modelname = (os.path.splitext(os.path.basename(modelfile))[0] + "_" +
                 os.path.splitext(os.path.basename(datafile))[0])
    stanmodel = StanModel(file=modelfile, model_name=modelname)
    with open(smfile, 'wb') as f:
        pickle.dump(stanmodel, f)
else:
    stanmodel = pickle.load(open(smfile, 'rb'))

def get_median(sample_array):
    sample_array.sort()
    n = len(sample_array)
    # For even n, average the two middle elements; otherwise take the middle one.
    if n % 2 == 0:
        medianvalue = (sample_array[n // 2 - 1] + sample_array[n // 2]) / 2.0
    else:
        medianvalue = sample_array[n // 2]
    return medianvalue

pdf = bp.PdfPages(betapdffile)
results = {}
for obsid, odata in standata.iter_observations():
    sample_outfile = (os.path.join(sampledir, obsid + "_samples.txt")
                      if sampledir != "" else None)
import matplotlib.pyplot as pl

# x = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
x = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
# x = [1, 1, 1]

cf_dat = {'N': len(x), 'x': x, 'prior_width': 0.2}

recompile = False
if recompile:
    sm = StanModel(file='coinflip.stan')
    with open('coinflip.pkl', 'wb') as f:
        pickle.dump(sm, f)
else:
    sm = pickle.load(open('coinflip.pkl', 'rb'))

# fit = sm.sampling(data=cf_dat, iter=2000, chains=2)
fit = sm.sampling(data=cf_dat, iter=20, chains=1, seed='random',
                  init=[{'beta': 0.5}], warmup=10)
estimation = fit.extract(permuted=True)
print(estimation['beta'])

pl.hist(estimation['beta'], bins=40)
pl.xlim([0, 1])
pl.show()
for i in range(N):
    act_u = u_synth[i, :].T
    act_mean = z_synth[i] * A.dot(act_u)
    x_synth[i, :] = multivariate_normal(act_mean, sigma_x)

gsm_dat = {
    "N": N,
    "d_x": d_x,
    "d_u": d_u,
    "sigma_x": sigma_x,
    "x": x_synth,
    "A": A,
    "C": C,
    "z_shape": z_shape,
    "z_scale": z_scale,
}

if recompile:
    sm = StanModel(file="gsm_inference.stan")
    with open("gsm_inference.pkl", "wb") as f:
        pickle.dump(sm, f)
else:
    sm = pickle.load(open("gsm_inference.pkl", "rb"))

fit = sm.sampling(data=gsm_dat, iter=100, chains=8)
estimation = fit.extract(permuted=True)
z_est_mean = mean(estimation["z"], 0).T
print(z_est_mean)
print(z_synth)