def stanTopkl():
    """Recompile both Stan models and pickle each compiled model.

    For every model, a pickle left over from an earlier run is deleted
    before the model is compiled, and the freshly compiled model is then
    written to that same pickle path.
    """
    model_files = (
        ('log_normal.stan', 'log_normal.pkl'),
        ('log_t.stan', 'log_t.pkl'),
    )
    for stan_path, pickle_path in model_files:
        # Drop the old pickle first so it is gone even if compilation fails.
        if os.path.isfile(pickle_path):
            os.remove(pickle_path)
        compiled = StanModel(file=stan_path)
        with open(pickle_path, 'wb') as handle:
            pickle.dump(compiled, handle)
def stanTopkl():
    """Compile the Stan models and store the compiled objects as pickles.

    Every call removes an existing pickle and compiles the model again; the
    resulting pickles can then be loaded elsewhere instead of compiling.
    """

    def _rebuild(stan_file, pkl_file):
        # Remove the previous pickle, compile the model, then save it.
        if os.path.isfile(pkl_file):
            os.remove(pkl_file)
        fresh_model = StanModel(file=stan_file)
        with open(pkl_file, 'wb') as out:
            pickle.dump(fresh_model, out)

    _rebuild('log_normal.stan', 'log_normal.pkl')
    _rebuild('log_t.stan', 'log_t.pkl')
def get_or_compile_stan_model(model_file, distribution):
    """Return a compiled Stan model, reusing a pickled copy when one exists.

    The compiled model is cached as a pickle inside the directory reported
    by ``tempfile.gettempdir()``. The cache file name is built from a fixed
    prefix, the distribution name and the major.minor version of the running
    interpreter. When that file exists it is unpickled and returned;
    otherwise the model is compiled from ``model_file`` and the cache file
    is written.

    Args:
        model_file: location of the Stan source file; only read when no
            cached pickle is found.
        distribution: name of the KPI distribution model; used here only to
            build the cache file name.

    Returns:
        The compiled Stan model, either loaded from the cache or freshly
        compiled.

    Note:
        The cache path is derived from hardcoded name parts, which may cause
        issues in the future. Alternatives that were considered: keeping the
        models in global variables, or shipping pre-compiled models with the
        project. ``tempfile`` temporary files are not used because their
        unique names are hard to track; only the temporary directory is
        taken from ``tempfile``.
    """
    python_version = '{0[0]}.{0[1]}'.format(sys.version_info)
    compiled_model_file = os.path.join(
        tempfile.gettempdir(),
        'expan_early_stop_compiled_stan_model_'
        + distribution + '_' + python_version + '.pkl')

    if os.path.isfile(compiled_model_file):
        # NOTE(review): the temp directory may be writable by other users and
        # unpickling executes code from the file -- confirm this is acceptable.
        # The context manager closes the handle, which the previous
        # ``pickle.load(open(...))`` form left open.
        with open(compiled_model_file, 'rb') as f:
            sm = pickle.load(f)
    else:
        sm = StanModel(file=model_file)
        with open(compiled_model_file, 'wb') as f:
            pickle.dump(sm, f)
    return sm
def simple_car_model(tobit_data: pd.DataFrame, ad_matrix):
    """Compile and sample the tobit CAR model for the student data.

    The data dictionary comes from ``get_datadict()``; ``ad_matrix`` is
    added as ``W`` and the constant 800 as ``U``. The fit is dumped to
    ``data/car_students_2000.joblib`` and summary means of ``beta_zero`` and
    ``beta`` are printed.

    Open questions kept from the original notes:
      * In the researchers' model phi is distributed around phi_bar; this
        appears to be handled by ``multi_normal_prec`` (WinBUGS documents a
        similar approach), see
        https://mc-stan.org/docs/2_19/functions-reference/multivariate-normal-distribution-precision-parameterization.html
      * The CAR prior used by ``car.normal`` is unknown (currently 2/-2);
        no information is available, so a value that works has to be set.

    Args:
        tobit_data: not read inside this function (the data dictionary is
            built by ``get_datadict()``).
        ad_matrix: stored in the data dictionary under ``'W'``.

    Returns:
        tuple: ``(car_fit, car_model)``.
    """
    # Close the Stan source once the model is built; the previous code passed
    # an open handle that was never closed.
    with Path('models/tobit_car_students.stan').open() as stan_source:
        car_model = StanModel(file=stan_source, extra_compile_args=["-w"])

    car_dict = get_datadict()
    car_dict['W'] = ad_matrix
    car_dict['U'] = 800

    # this smaller run still took 25 mins to sample...
    # And still getting too low E-BFMI values
    car_fit = car_model.sampling(data=car_dict, iter=2000, warmup=500,
                                 chains=4)
    dump(car_fit, Path('data/car_students_2000.joblib'))

    car_res = car_fit.extract()
    # NOTE(review): extract() may already exclude warmup draws, in which case
    # [501:] discards additional post-warmup samples -- confirm.
    print('β_0: {}'.format(car_res['beta_zero'][501:].mean()))
    print('β: {}'.format(car_res['beta'][501:].mean(axis=0)))

    # getting many rejections - bad? Phi is a bit like a covariance matrix
    # -> only in the beginning, after 200 iterations all fine.
    # result from the run: chains have not mixed, might need to
    # re-parametrize... am I constraining the variables too much?
    # Need to center somehow?
    return car_fit, car_model
def compile_stan_model(stan_model_name):
    """Compile a packaged Stan model and pickle it when the pickle is stale.

    The Stan source and the pickle location are both resolved inside the
    ``orbit`` package. The model is compiled only when the pickle is missing
    or older than the source file.

    Args:
        stan_model_name: base name (without extension) of the Stan model.

    Returns:
        None
    """
    stan_path = pkg_resources.resource_filename(
        'orbit', 'stan/{}.stan'.format(stan_model_name))
    pickle_path = pkg_resources.resource_filename(
        'orbit', 'stan_compiled/{}.pkl'.format(stan_model_name))

    # Make sure the output directory exists (py3 ``exist_ok``).
    os.makedirs(os.path.dirname(pickle_path), exist_ok=True)

    # Nothing to do when the pickle exists and is at least as new as the
    # Stan source.
    if os.path.isfile(pickle_path):
        if os.path.getmtime(pickle_path) >= os.path.getmtime(stan_path):
            return None

    with open(stan_path, encoding="utf-8") as source:
        stan_code = source.read()
    compiled = StanModel(model_code=stan_code)
    with open(pickle_path, 'wb') as sink:
        pickle.dump(compiled, sink, protocol=pickle.HIGHEST_PROTOCOL)
    return None
def test_stanc_exception(self):
    """stanc and StanModel both raise ValueError for an unknown distribution."""
    bad_program = 'parameters {real z;} model {z ~ no_such_distribution();}'
    # Pick the assertion helper by Python major version.
    if PY2:
        expect_error = self.assertRaisesRegexp
    else:
        expect_error = self.assertRaisesRegex
    for compile_fn in (stanc, StanModel):
        with expect_error(ValueError, 'unknown distribution'):
            compile_fn(model_code=bad_program)
def get_stan_model(model_name, recompile=False, **model_params):
    """
    Return a compiled Stan model, loading it from the pickle cache if possible.

    The cache file lives in ``STAN_MODEL_CACHE`` and is named after the model
    name and the running Python major.minor version. A freshly compiled
    model is written back to that file.

    Parameters
    ----------
    model_name : str
        Name of the model; passed to ``get_stan_model_code`` to obtain the
        Stan program. (Earlier documentation listed "bernoulli"/"b" and
        "beta-binomial"/"bb" -- verify against ``get_stan_model_code``.)
    recompile : boolean
        If True, always compile the model again; otherwise use the cached
        pickle when it exists.
    **model_params
        Accepted but not used inside this function.

    Returns
    -------
    The compiled (or unpickled) Stan model.
    """
    interpreter_tag = 'python{0[0]}.{0[1]}'.format(sys.version_info)
    # Empty parts (e.g. an empty model name) are left out of the file name.
    name_parts = [part for part in (model_name, interpreter_tag) if part]
    cache_path = os.path.join(STAN_MODEL_CACHE,
                              "-".join(name_parts) + ".pickle")

    cache_hit = os.path.isfile(cache_path) and not recompile
    if not cache_hit:
        stan_code = get_stan_model_code(model_name)
        model = StanModel(model_code=stan_code)
        logging.info('Saving model to {}'.format(cache_path))
        with open(cache_path, 'wb') as sink:
            pickle.dump(model, sink)
        return model

    with open(cache_path, 'rb') as cached:
        return pickle.load(cached)
def run_or_load_model(m_type, m_dict, iters, warmup, c_params):
    """Load a cached crash model and fit, compiling/sampling when missing.

    Cache and log file names are derived from the model type, the iteration
    and warmup counts and the ``adapt_delta`` / ``max_treedepth`` entries of
    ``c_params``. The compiled model and the fit are cached separately under
    ``cache/``; when a fit has to be sampled its ``stansummary()`` is also
    written to ``logs/<name>.log``.

    Args:
        m_type: ``'car'`` or ``'tobit'``; selects ``models/crash_<m_type>.stan``.
        m_dict: data passed to ``model.sampling``.
        iters: value for the ``iter`` argument of ``sampling``.
        warmup: value for the ``warmup`` argument of ``sampling``.
        c_params: sampler control dict; must contain ``'adapt_delta'`` and
            ``'max_treedepth'``.

    Returns:
        tuple: ``(model, fit)``.

    Raises:
        ValueError: if ``m_type`` is not one of the supported model types.
    """
    if m_type not in ('car', 'tobit'):
        raise ValueError('Invalid model type!')

    name = 'crash_{}_{}-{}_delta_{}_max_{}'.format(
        m_type, iters, warmup, c_params['adapt_delta'],
        c_params['max_treedepth'])
    model_cache = Path('cache/' + name + '_model.joblib')
    fit_cache = Path('cache/' + name + '_fit.joblib')

    try:
        model = load(model_cache)
    except Exception:
        # Any load failure (missing or unreadable cache) triggers a rebuild.
        # ``Exception`` rather than a bare ``except`` lets KeyboardInterrupt
        # and SystemExit propagate.
        with Path('models/crash_{}.stan'.format(m_type)).open() as stan_source:
            model = StanModel(file=stan_source,
                              extra_compile_args=["-w"],
                              model_name=name.split('-')[0])
        dump(model, model_cache)

    try:
        fit = load(fit_cache)
    except Exception:
        fit = model.sampling(data=m_dict, iter=iters, warmup=warmup,
                             control=c_params, check_hmc_diagnostics=True)
        info = fit.stansummary()
        with open(Path('logs/' + name + '.log'), 'w') as c_log:
            c_log.write(info)
        dump(fit, fit_cache)

    return model, fit
def fit(self, iterations=5000, control=default_control):
    """Compile the player-scoring Stan model and sample from it.

    The compiled model is stored on ``self.stan_model`` and the sampling
    result on ``self.samples``. Sampling uses ``self.stan_data`` with four
    chains and a refresh interval of 1.

    Args:
        iterations: value passed as ``iter`` to ``sampling``.
        control: value passed as ``control`` to ``sampling``.
    """
    self.stan_model = StanModel('../stan/player_scoring.stan')
    sampling_options = {
        'iter': iterations,
        'chains': 4,
        'refresh': 1,
        'control': control,
    }
    self.samples = self.stan_model.sampling(self.stan_data,
                                            **sampling_options)
def test_empty_parameter(self):
    """Convert a fit whose model declares a zero-length vector ``a``.

    The model is built with whichever PyStan major version is installed and
    the converted result is checked against the expected attribute lists
    (``"~a"`` presumably marks ``a`` as expected to be absent -- see
    ``check_multiple_attrs``).
    """
    stan_program = (
        " parameters {"
        " real y;"
        " vector[3] x;"
        " vector[0] a;"
        " vector[2] z;"
        " }"
        " model {"
        " y ~ normal(0,1);"
        " } "
    )
    if pystan_version() == 2:
        from pystan import StanModel  # pylint: disable=import-error

        compiled = StanModel(model_code=stan_program)
        draws = compiled.sampling(iter=500, chains=2,
                                  check_hmc_diagnostics=False)
    else:
        import stan  # pylint: disable=import-error

        compiled = stan.build(stan_program)
        draws = compiled.sample(num_samples=500, num_chains=2)

    converted = from_pystan(posterior=draws)
    expected = {
        "posterior": ["y", "x", "z", "~a"],
        "sample_stats": ["diverging"],
    }
    assert not check_multiple_attrs(expected, converted)
def compile(self):
    """Compile the Stan model and store it on ``self.model``."""
    # Note: the thetas deliberately use a centered parameterization for now.
    # That is sub-optimal for estimation, but it avoids extra logic to detect
    # and handle intercepts in X.
    # The program text has no interpolated values, so plain string literals
    # produce the same text as the former f-string with doubled braces.
    spec = (
        " data {"
        " int<lower=1> N;"
        " int<lower=1> K;"
        " vector[N] y;"
        " int<lower=1,upper=K> id[N];"
        " int<lower=1> C;"
        " matrix[K, C] X;"
        " vector[N] sigma;"
        " }"
        " parameters {"
        " vector[C] beta;"
        " vector[K] theta;"
        " real<lower=0> tau2;"
        " }"
        " transformed parameters {"
        " vector[N] mu;"
        " mu = theta[id] + X * beta;"
        " }"
        " model {"
        " y ~ normal(mu, sigma);"
        " theta ~ normal(0, tau2);"
        " } "
    )
    from pystan import StanModel

    self.model = StanModel(model_code=spec)
def test_bernoulli_compile_time(self):
    """Compile the Bernoulli model and log how long compilation took."""
    program = self.bernoulli_model_code
    started = time.time()
    compiled = StanModel(model_code=program)
    self.assertIsNotNone(compiled)
    elapsed_seconds = int(time.time() - started)
    logging.info(
        "Compile time: {}s (vs. RStan 28s)\n".format(elapsed_seconds))
def __init__(self, context, rdd, prepare_data_callback, stan_file):
    """Compile the Stan model, pickle it and register the pickle with ``context``.

    Args:
        context: object providing ``addFile``; presumably a SparkContext --
            confirm against the caller.
        rdd: object providing ``getNumPartitions()``; stored on ``self.rdd``.
        prepare_data_callback: stored on ``self.prepare_data_callback``;
            not called here.
        stan_file: Stan source passed to ``StanModel(file=...)``.
    """
    self.rdd = rdd
    self.prepare_data_callback = prepare_data_callback
    self.n_partitions = self.rdd.getNumPartitions()

    sm = StanModel(file=stan_file)
    # Close (and thereby flush) the pickle before handing the file name to
    # ``context.addFile``; the previous code never closed the handle.
    with open(PICKLE_FILENAME, "wb") as pickle_file:
        pickle.dump(sm, pickle_file)
    context.addFile(PICKLE_FILENAME)
def scaled_spare_car(tobit_data: pd.DataFrame, ad_matrix):
    """Fit the sparse tobit-CAR model on max-abs-scaled data and plot results.

    The predictors in ``new_preds`` and the ``apt`` column are scaled with
    ``MaxAbsScaler``; rows with ``apt == 800`` are treated as censored. The
    fit is printed, dumped to ``data/c_sp_4000.joblib``, and a scatter plot
    plus a pair plot of the remaining extracted parameters are drawn.

    Notes kept from earlier experiments:
      * Trying values closer to 0; sigma was 67.3 with stdev 3.74.
      * Even worse: E-BFMI is still small, now also much treedepth
        saturation (OK) and chain divergence (bad!). Energy plots and
        correlations would need checking.
      * TODO: with scaling there is a danger of not hitting the condition
        for U -> should not be a problem with zeros as the lower bound.

    Args:
        tobit_data: input frame; a column ``'ones'`` is added to it in
            place. Must contain ``'apt'``, ``'id'`` and the ``new_preds``
            columns.
        ad_matrix: passed to Stan as ``W``; ``ad_matrix.sum() // 2`` is
            passed as ``W_n``.

    Returns:
        tuple: ``(c_sp_fit, c_sp_model)``.
    """
    tobit_data['ones'] = np.ones(tobit_data.shape[0])
    scaled_columns = new_preds + ['apt']
    trans = MaxAbsScaler().fit_transform(tobit_data[scaled_columns])
    # Reuse the caller's index so the boolean masks built from ``tobit_data``
    # below align with ``data_centered`` even when the index is not 0..n-1.
    data_centered = pd.DataFrame(trans, columns=scaled_columns,
                                 index=tobit_data.index)

    is_800 = tobit_data['apt'] == 800
    not_800 = tobit_data['apt'] != 800
    ii_obs = tobit_data[not_800]['id']
    ii_cens = tobit_data[is_800]['id']

    # After using vectorisation: Gradient takes 0.0003 seconds.
    c_sparse_dict = {
        'X': data_centered[new_preds],
        'n': tobit_data.shape[0],
        'n_obs': not_800.sum(),
        'n_cens': is_800.sum(),
        'y_obs': data_centered[not_800]['apt'],
        'ii_obs': ii_obs,
        'ii_cens': ii_cens,
        'p': len(new_preds),
        'y_cens': data_centered[is_800]['apt'],
        'W': ad_matrix,
        'U': 1,  # presumably the censoring bound on the scaled axis -- confirm
        'W_n': ad_matrix.sum() // 2,
    }

    # or just 'models/sparse_tcar_students_without_QR.stan'
    # The source handle is closed once the model is built; the previous code
    # left it open.
    with Path('sparse_tobitcar_students.stan').open() as stan_source:
        c_sp_model = StanModel(file=stan_source, verbose=False,
                               extra_compile_args=["-w"])

    c_params = {'adapt_delta': 0.95, 'max_treedepth': 12}
    # no more saturation, but still divergence...
    # trying to constrain the model: α <= 0.99 instead <=1, σ >= 0.001
    c_sp_fit = c_sp_model.sampling(c_sparse_dict, iter=4000, warmup=500,
                                   control=c_params)
    c_sp_res = c_sp_fit.extract()
    print(c_sp_fit.stansummary())
    dump(c_sp_fit, 'data/c_sp_4000.joblib')

    plt.scatter(c_sp_fit['lp__'], c_sp_fit['sigma'])
    # sigma looks very correlated.

    # Drop these entries before building the pair plot.
    simpler_csp = c_sp_res.copy()
    del simpler_csp['phi']
    del simpler_csp['y_cens']
    del simpler_csp['beta']
    del simpler_csp['y']
    if 'theta' in simpler_csp:
        del simpler_csp['theta']
    c_sp_df = pd.DataFrame.from_dict(simpler_csp)
    sns.pairplot(c_sp_df)
    return c_sp_fit, c_sp_model
def _bayes_sampling(x, y, distribution='normal'): """ Helper function. Args: x (array_like): sample of a treatment group y (array_like): sample of a control group distribution: name of the KPI distribution model, which assumes a Stan model file with the same name exists Returns: tuple: - the posterior samples - sample size of x - sample size of y - absolute mean of x - absolute mean of y """ # Checking if data was provided if x is None or y is None: raise ValueError('Please provide two non-None samples.') # Coercing missing values to right format _x = np.array(x, dtype=float) _y = np.array(y, dtype=float) mu_x = np.nanmean(_x) mu_y = np.nanmean(_y) n_x = statx.sample_size(_x) n_y = statx.sample_size(_y) if distribution == 'normal': fit_data = {'Nc': n_y, 'Nt': n_x, 'x': _x, 'y': _y} elif distribution == 'poisson': fit_data = { 'Nc': n_y, 'Nt': n_x, 'x': _x.astype(int), 'y': _y.astype(int) } else: raise NotImplementedError model_file = __location__ + '/../models/' + distribution + '_kpi.stan' sm = StanModel(file=model_file) fit = sm.sampling(data=fit_data, iter=25000, chains=4, n_jobs=1, seed=1, control={ 'stepsize': 0.01, 'adapt_delta': 0.99 }) traces = fit.extract() return traces, n_x, n_y, mu_x, mu_y
def build_stan_model(target_dir, model_dir=MODEL_DIR):
    """Compile ``prophet.stan`` from ``model_dir`` and pickle it.

    Args:
        target_dir: directory that receives ``prophet_model.pkl``.
        model_dir: directory containing ``prophet.stan``.
    """
    from pystan import StanModel

    source_name = 'prophet.stan'
    pickle_name = 'prophet_model.pkl'

    with open(os.path.join(model_dir, source_name)) as source:
        stan_code = source.read()
    compiled = StanModel(model_code=stan_code)

    with open(os.path.join(target_dir, pickle_name), 'wb') as sink:
        pickle.dump(compiled, sink, protocol=pickle.HIGHEST_PROTOCOL)
def verify_stan():
    """Check the Stan installation by sampling a standard normal variable.

    Compiles a one-parameter model, samples it with default settings and
    prints the mean of the draws of ``y``.
    """
    sanity_model = StanModel(
        model_code='parameters {real y;} model {y ~ normal(0,1);}')
    draws = sanity_model.sampling().extract()
    y_draws = draws['y']
    print('If this worked, you will see a value near 0 now:')
    print(y_draws.mean())
def compile_stan_models(target_dir, model_dir=MODEL_DIR):
    """Pre-compile the Stan models used by the module and pickle them.

    Args:
        target_dir: directory that receives the pickles.
        model_dir: directory containing the Stan sources.
    """
    from pystan import StanModel

    # (Stan source file, pickle file) pairs, processed in this order.
    source_and_pickle = (
        ("simple_model.stan", "simple_model.pkl"),
        ("model_with_prior.stan", "prior_model.pkl"),
    )
    for source_name, pickle_name in source_and_pickle:
        compiled = StanModel(file=os.path.join(model_dir, source_name))
        destination = os.path.join(target_dir, pickle_name)
        with open(destination, "wb") as sink:
            pickle.dump(compiled, sink, protocol=pickle.HIGHEST_PROTOCOL)
def main():
    """Fit ``model.stan`` to the eight-schools data and pickle model and fit.

    The model and the fit are stored together in one dictionary (model
    first) in the file named by ``DATA_FILE_NAME``.
    """
    effects = [28, 8, -3, 7, -1, 1, 18, 12]
    std_errors = [15, 10, 16, 11, 9, 11, 10, 18]
    schools_dat = dict(J=8, y=effects, sigma=std_errors)

    compiled = StanModel(file='model.stan')
    result = compiled.sampling(data=schools_dat, iter=1000, chains=4,
                               seed=555)

    payload = {'model': compiled, 'fit': result}
    with open(DATA_FILE_NAME, 'wb') as sink:
        pickle.dump(payload, sink)
def build_model(stan_file):
    """Return the compiled model for ``stan_file``, cached as a pickle.

    The pickle sits next to the Stan source: a trailing ``.stan`` extension
    is replaced by ``.pkl``; for any other file name ``.pkl`` is appended.
    When the pickle exists it is loaded with ``pickle_from_file``; otherwise
    the model is compiled and stored with ``pickle_to_file``.

    Args:
        stan_file: path of the Stan source file.

    Returns:
        The compiled (or unpickled) model.
    """
    # Local import: the previous code only relied on a bare ``listdir`` name,
    # so the ``os`` module itself may not be in scope.
    import os

    # Only the final extension is swapped. The previous ``str.replace``
    # rewrote every '.stan' in the path (directories included) and left the
    # path unchanged when there was none, making the pickle overwrite the
    # source.
    root, ext = os.path.splitext(stan_file)
    pkl_file = (root if ext == '.stan' else stan_file) + '.pkl'

    # Test the actual path. The previous ``pkl_file not in listdir()`` only
    # matched bare names in the current directory, so a source in any other
    # directory was recompiled on every call.
    if os.path.isfile(pkl_file):
        return pickle_from_file(pkl_file)

    model = StanModel(file=stan_file)
    pickle_to_file(model, pkl_file)
    return model
def build_stan_models(target_dir, models_dir=MODELS_DIR):
    """Compile the linear and logistic growth models and pickle them.

    Args:
        target_dir: directory that receives ``<type>_growth.pkl``.
        models_dir: directory containing ``prophet_<type>_growth.stan``.
    """
    from pystan import StanModel

    for growth in ('linear', 'logistic'):
        source_path = os.path.join(
            models_dir, 'prophet_{}_growth.stan'.format(growth))
        pickle_name = '{}_growth.pkl'.format(growth)

        with open(source_path) as source:
            stan_code = source.read()
        compiled = StanModel(model_code=stan_code)

        with open(os.path.join(target_dir, pickle_name), 'wb') as sink:
            pickle.dump(compiled, sink, protocol=pickle.HIGHEST_PROTOCOL)
def compile_stan_models(target_dir, model_dir=MODEL_DIR):
    """Pre-compile the player-forecast Stan model and pickle it.

    Args:
        target_dir: directory that receives ``player_forecasts.pkl``.
        model_dir: directory containing ``player_forecasts.stan``.
    """
    from pystan import StanModel

    announcement = "Compiling Stan player model, and putting pickle in {}"
    print(announcement.format(target_dir))

    source_path = os.path.join(model_dir, "player_forecasts.stan")
    compiled = StanModel(file=source_path)

    pickle_path = os.path.join(target_dir, "player_forecasts.pkl")
    with open(pickle_path, "wb") as sink:
        pickle.dump(compiled, sink, protocol=pickle.HIGHEST_PROTOCOL)
def main():
    """Command-line entry point: compile a Stan model file and pickle it.

    Takes two positional command-line arguments: the Stan model file and
    the pickle output file.
    """
    cli = argparse.ArgumentParser(
        description="This script compiles a stan model and pickles it to disc",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    cli.add_argument("stan", help="The stan model file")
    cli.add_argument("out", help="The python3 pickle output file")
    options = cli.parse_args()

    compiled = StanModel(file=options.stan)
    with open(options.out, 'wb') as sink:
        pickle.dump(compiled, sink)
def build_stan_models(self):
    """Compile the growth models into ``<build_lib>/fbprophet/stan_models``.

    The target directory is created with ``self.mkpath``; the Stan sources
    are read from the relative ``stan/`` directory.
    """
    from pystan import StanModel

    output_dir = os.path.join(self.build_lib, 'fbprophet/stan_models')
    self.mkpath(output_dir)

    for growth in ('linear', 'logistic'):
        source_path = 'stan/prophet_{}_growth.stan'.format(growth)
        with open(source_path) as source:
            stan_code = source.read()
        compiled = StanModel(model_code=stan_code)

        pickle_path = os.path.join(output_dir,
                                   '{}_growth.pkl'.format(growth))
        with open(pickle_path, 'wb') as sink:
            pickle.dump(compiled, sink)
def compile_stan_model(input_stan_filepath, output_model_filepath=None):
    """Compile a Stan source file and pickle the compiled model.

    Args:
        input_stan_filepath: path of the Stan source file.
        output_model_filepath: where to write the pickle. When None, the
            pickle is written next to the source: a trailing ``.stan``
            extension is replaced by ``.pkl``; for any other file name
            ``.pkl`` is appended.
    """
    import os  # local import; not guaranteed to be imported by the module

    with open(input_stan_filepath) as f:
        model_code = f.read()
    model = StanModel(model_code=model_code, verbose=True)

    if output_model_filepath is None:
        # Swap only the final extension. The previous ``str.replace`` also
        # rewrote '.stan' inside directory names and, for a path without
        # '.stan', returned the input path itself, so the pickle overwrote
        # the Stan source.
        root, ext = os.path.splitext(input_stan_filepath)
        stem = root if ext == '.stan' else input_stan_filepath
        output_model_filepath = stem + '.pkl'

    with open(output_model_filepath, 'wb') as f:
        pickle.dump(model, f)
def sparse_car_model(tobit_data: pd.DataFrame, ad_matrix):
    """Compile and sample the sparse tobit-CAR student model.

    The data dictionary is built by ``get_sparse_modeldict``; the fit
    summary is printed.

    Args:
        tobit_data: passed to ``get_sparse_modeldict``.
        ad_matrix: passed to ``get_sparse_modeldict``.

    Returns:
        tuple: ``(sparse_fit, sparse_model)``.
    """
    sparse_dict = get_sparse_modeldict(tobit_data, ad_matrix)

    # Close the Stan source once the model is built; the previous code passed
    # an open handle that was never closed.
    with Path('models/sparse_tobitcar_students.stan').open() as stan_source:
        sparse_model = StanModel(file=stan_source,
                                 extra_compile_args=["-w"])

    sparse_fit = sparse_model.sampling(sparse_dict, iter=4000, warmup=500,
                                       chains=4)
    print(sparse_fit.stansummary())
    return sparse_fit, sparse_model
def test_stanc_exception(self):
    """stanc and StanModel both raise ValueError for a bad sampling statement."""
    bad_program = 'parameters {real z;} model {z ~ no_such_distribution();}'
    # distribution not found error
    expected_message = (
        r'Probability function must end in _lpdf or _lpmf\. Found')
    # Pick the assertion helper by Python major version.
    expect_error = (
        self.assertRaisesRegexp if PY2 else self.assertRaisesRegex)

    with expect_error(ValueError, expected_message):
        stanc(model_code=bad_program)
    with expect_error(ValueError, expected_message):
        StanModel(model_code=bad_program)
def compile_stan_models(target_dir, models_dir=MODELS_DIR):
    """Compile every ``*.stan`` file in ``models_dir`` and pickle the result.

    Each model is compiled with its file stem as the model name and written
    to ``<target_dir>/<stem>.pkl``.

    Args:
        target_dir: destination directory; must support the ``/`` path
            operator (e.g. ``pathlib.Path``).
        models_dir: directory searched for ``*.stan`` files.
    """
    from pathlib import Path

    from pystan import StanModel

    # Honour the ``models_dir`` argument. The previous code always globbed
    # the module-level MODELS_DIR and silently ignored the parameter.
    # ``Path(...)`` also accepts a plain string directory.
    for model_path in Path(models_dir).glob("*.stan"):
        model_type = model_path.stem
        target_path = target_dir / (model_type + ".pkl")

        with open(model_path) as f:
            model_code = f.read()
        model = StanModel(model_code=model_code, model_name=model_type)

        with open(target_path, "wb") as f:
            pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)
def test_init_zero_exception_inf_grad(self):
    """Sampling with ``init='0'`` must raise RuntimeError for this model.

    The target adds ``1 / log(x)``; the test name indicates an infinite
    gradient at the zero initial value.
    """
    program = (
        " parameters {"
        " real x;"
        " }"
        " model {"
        " target += 1 / log(x);"
        " } "
    )
    compiled = StanModel(model_code=program)
    with self.assertRaises(RuntimeError):
        compiled.sampling(init='0', iter=1, chains=1)
def setUpClass(cls):
    """Compile the ``cov_exp_quad`` test model once and keep it on the class."""
    program = (
        " data {"
        " real rx1[5];"
        " }"
        " model {"
        " matrix[5,5] a;"
        " a = cov_exp_quad(rx1, 1, 1);"
        " } "
    )
    cls.model = StanModel(model_code=program)