Example #1
0
    def test_empty_parameter(self):
        """Convert a fit whose model declares a zero-length parameter.

        The Stan program declares ``vector[0] a`` next to the non-empty
        parameters ``y``, ``x`` and ``z``.  The model is sampled with PyStan 2
        or with the ``stan`` package (depending on ``pystan_version()``),
        converted with ``from_pystan`` and checked with
        ``check_multiple_attrs``.
        """
        stan_program = """
            parameters {
                real y;
                vector[3] x;
                vector[0] a;
                vector[2] z;
            }
            model {
                y ~ normal(0,1);
            }
        """
        if pystan_version() != 2:
            import stan  # pylint: disable=import-error

            built = stan.build(stan_program)
            fit = built.sample(num_samples=500, num_chains=2)
        else:
            from pystan import StanModel  # pylint: disable=import-error

            compiled = StanModel(model_code=stan_program)
            fit = compiled.sampling(
                iter=500, chains=2, check_hmc_diagnostics=False
            )

        inference_data = from_pystan(posterior=fit)
        # "~a" presumably means "must be absent" -- confirm against
        # check_multiple_attrs.
        expected_attrs = {
            "posterior": ["y", "x", "z", "~a"],
            "sample_stats": ["diverging"],
        }
        assert not check_multiple_attrs(expected_attrs, inference_data)
Example #2
0
def run_or_load_model(m_type, m_dict, iters, warmup, c_params):
    """Return a compiled crash model and its fit, reusing joblib caches.

    The compiled model and the fit are looked up under ``cache/``; whatever
    cannot be loaded is recomputed and written back.  A freshly computed fit
    also has its ``stansummary()`` written to ``logs/<name>.log``.

    Parameters
    ----------
    m_type : str
        ``'car'`` or ``'tobit'``; selects ``models/crash_<m_type>.stan``.
    m_dict : dict
        Data passed to ``sampling``.
    iters, warmup : int
        Passed to ``sampling`` as ``iter`` and ``warmup``.
    c_params : dict
        Sampler ``control`` settings.  ``'adapt_delta'`` and
        ``'max_treedepth'`` must be present because they are part of the
        cache name.

    Returns
    -------
    tuple
        ``(model, fit)``.

    Raises
    ------
    ValueError
        If ``m_type`` is not one of the supported model types.
    """
    if m_type not in ('car', 'tobit'):
        raise ValueError('Invalid model type!')
    name = 'crash_{}_{}-{}_delta_{}_max_{}'.format(m_type, iters, warmup,
                                                   c_params['adapt_delta'],
                                                   c_params['max_treedepth'])
    model_cache = Path('cache/' + name + '_model.joblib')
    fit_cache = Path('cache/' + name + '_fit.joblib')

    try:
        model = load(model_cache)
    except Exception:
        # Best-effort cache: a missing or unreadable cache entry triggers a
        # recompile, but KeyboardInterrupt/SystemExit are no longer swallowed.
        stan_path = Path('models/crash_{}.stan'.format(m_type))
        with stan_path.open() as stan_file:
            model = StanModel(file=stan_file,
                              extra_compile_args=["-w"],
                              model_name=name.split('-')[0])
        dump(model, model_cache)

    try:
        fit = load(fit_cache)
    except Exception:
        fit = model.sampling(data=m_dict,
                             iter=iters,
                             warmup=warmup,
                             control=c_params,
                             check_hmc_diagnostics=True)
        with open(Path('logs/' + name + '.log'), 'w') as c_log:
            c_log.write(fit.stansummary())
        dump(fit, fit_cache)
    return model, fit
Example #3
0
def simple_car_model(tobit_data: pd.DataFrame, ad_matrix):
    """Compile and sample ``models/tobit_car_students.stan``.

    ``tobit_data`` is accepted but not used in this function; the data
    dictionary comes from ``get_datadict()`` and is extended with the
    adjacency matrix (``'W'``) and the upper bound ``'U' = 800``.  The fit is
    written to ``data/car_students_2000.joblib`` and posterior means of
    ``beta_zero`` and ``beta`` are printed.

    Author's notes (carried over):
      - In the researchers' model phi is distributed around phi_bar; whether
        multi_normal_prec covers this was checked against the docs and looks
        legitimate (WinBUGS documents a similar approach):
        https://mc-stan.org/docs/2_19/functions-reference/multivariate-normal-distribution-precision-parameterization.html
      - The CAR prior used by car.normal is not documented; 2/-2 is a
        placeholder that works.

    Returns
    -------
    tuple
        ``(car_fit, car_model)``.
    """
    # Close the model source once compilation has read it.
    with Path('models/tobit_car_students.stan').open() as stan_file:
        car_model = StanModel(file=stan_file, extra_compile_args=["-w"])
    car_dict = get_datadict()
    car_dict['W'] = ad_matrix
    car_dict['U'] = 800

    # this smaller run still took 25 mins to sample...
    # And still getting too low E-BFMI values
    car_fit = car_model.sampling(data=car_dict,
                                 iter=2000,
                                 warmup=500,
                                 chains=4)
    dump(car_fit, Path('data/car_students_2000.joblib'))
    car_res = car_fit.extract()
    # NOTE(review): the first 501 extracted draws are dropped here although
    # warmup=500 was already given to sampling() -- confirm this is intended.
    print('β_0: {}'.format(car_res['beta_zero'][501:].mean()))
    print('β:   {}'.format(car_res['beta'][501:].mean(axis=0)))

    # getting many rejections - bad? Phi is a bit like a covariance matrix
    # -> only in the beginning, after 200 iterations all fine.
    # result from the run: chains have not mixed, might need to re-parametrize...
    # am I constraining the variables too much??? Need to center somehow?
    return car_fit, car_model
Example #4
0
 def compile(self):
     """Compile the Stan program below and store it as ``self.model``.

     The program models ``y`` as normal around ``theta[id] + X * beta`` with
     known ``sigma``, and puts a ``normal(0, tau2)`` prior on ``theta``.
     """
     from pystan import StanModel

     # Note: the thetas deliberately use a centered parameterization for
     # now. That is sub-optimal for estimation, but it avoids extra logic
     # for detecting and handling intercepts in X.
     # NOTE(review): tau2 is passed as the second argument of normal();
     # confirm whether a variance or a standard deviation is intended.
     stan_program = """
     data {
         int<lower=1> N;
         int<lower=1> K;
         vector[N] y;
         int<lower=1,upper=K> id[N];
         int<lower=1> C;
         matrix[K, C] X;
         vector[N] sigma;
     }
     parameters {
         vector[C] beta;
         vector[K] theta;
         real<lower=0> tau2;
     }
     transformed parameters {
         vector[N] mu;
         mu = theta[id] + X * beta;
     }
     model {
         y ~ normal(mu, sigma);
         theta ~ normal(0, tau2);
     }
     """
     self.model = StanModel(model_code=stan_program)
Example #5
0
 def fit(self, iterations=5000, control=default_control):
     """Compile the player-scoring model and sample from it.

     Stores the compiled model on ``self.stan_model`` and the sampling
     result on ``self.samples``.  ``self.stan_data`` is passed as the data,
     ``iterations`` as ``iter`` and ``control`` as the sampler control
     settings; four chains are run with ``refresh=1``.
     """
     sampling_options = {
         'iter': iterations,
         'chains': 4,
         'refresh': 1,
         'control': control,
     }
     self.stan_model = StanModel('../stan/player_scoring.stan')
     self.samples = self.stan_model.sampling(self.stan_data,
                                             **sampling_options)
Example #6
0
def scaled_spare_car(tobit_data: pd.DataFrame, ad_matrix):
    """Fit the sparse tobit CAR model on max-abs-scaled data.

    The predictors in ``new_preds`` and the ``apt`` column are rescaled with
    ``MaxAbsScaler``.  Rows with ``apt == 800`` are treated as censored and
    the upper bound passed to Stan is ``U = 1``.

    Side effects: adds a ``'ones'`` column to ``tobit_data`` in place, prints
    the Stan summary, writes the fit to ``data/c_sp_4000.joblib`` and draws a
    scatter plot (``lp__`` vs. ``sigma``) and a seaborn pair plot.

    Author's notes (carried over):
      - Trying values closer to 0; sigma was 67.3 with stdev 3.74.
      - Even worse: E-BFMI is still small, and now there is also much
        treedepth saturation (OK) and chain divergence (bad!); energy plots
        and correlations would need checking.
      - TODO: scaling risks missing the condition for U; should be fine with
        zeros as the lower bound.

    Returns
    -------
    tuple
        ``(c_sp_fit, c_sp_model)``.
    """
    tobit_data['ones'] = np.ones(tobit_data.shape[0])
    scaled_columns = new_preds + ['apt']
    trans = MaxAbsScaler().fit_transform(tobit_data[scaled_columns])
    # NOTE(review): data_centered gets a fresh default index while the masks
    # below carry tobit_data's index -- assumes tobit_data uses the default
    # RangeIndex; confirm.
    data_centered = pd.DataFrame(trans, columns=scaled_columns)
    is_800 = tobit_data['apt'] == 800
    not_800 = tobit_data['apt'] != 800
    ii_obs = tobit_data[not_800]['id']
    ii_cens = tobit_data[is_800]['id']
    # After using vectorisation: Gradient takes 0.0003  seconds.
    c_sparse_dict = {
        'X': data_centered[new_preds],
        'n': tobit_data.shape[0],
        'n_obs': not_800.sum(),
        'n_cens': is_800.sum(),
        'y_obs': data_centered[not_800]['apt'],
        'ii_obs': ii_obs,
        'ii_cens': ii_cens,
        'p': len(new_preds),
        'y_cens': data_centered[is_800]['apt'],
        'W': ad_matrix,
        'U': 1,
        'W_n': ad_matrix.sum() // 2
    }
    # or just 'models/sparse_tcar_students_without_QR.stan'
    # Close the model source once compilation has read it.
    with Path('sparse_tobitcar_students.stan').open() as stan_file:
        c_sp_model = StanModel(file=stan_file,
                               verbose=False,
                               extra_compile_args=["-w"])
    c_params = {'adapt_delta': 0.95, 'max_treedepth': 12}
    # no more saturation, but still divergence...
    # trying to constrain the model: α <= 0.99 instead <=1, σ >= 0.001
    c_sp_fit = c_sp_model.sampling(c_sparse_dict,
                                   iter=4000,
                                   warmup=500,
                                   control=c_params)
    c_sp_res = c_sp_fit.extract()
    print(c_sp_fit.stansummary())
    dump(c_sp_fit, 'data/c_sp_4000.joblib')
    plt.scatter(c_sp_fit['lp__'], c_sp_fit['sigma'])

    # sigma looks very correlated.
    # Drop the listed entries before building the pair plot.
    simpler_csp = c_sp_res.copy()
    for required_key in ('phi', 'y_cens', 'beta', 'y'):
        del simpler_csp[required_key]
    simpler_csp.pop('theta', None)  # only present in some model variants
    c_sp_df = pd.DataFrame.from_dict(simpler_csp)
    sns.pairplot(c_sp_df)
    return c_sp_fit, c_sp_model
Example #7
0
    def _fit_stan_model(self, vb: bool, sm: StanModel, data_dict: Dict,
                        pars: List, gen_init: Union[str, Callable],
                        nchain: int, niter: int, nwarmup: int, nthin: int,
                        adapt_delta: float, stepsize: float,
                        max_treedepth: int, ncore: int) -> Any:
        """Fit the stan model.

        Parameters
        ----------
        vb
            Whether to perform variational Bayesian analysis.
        sm
            The StanModel object to use to fit the model.
        data_dict
            Dict holding the data to pass to Stan.
        pars
            List specifying the parameters of interest.
        gen_init
            String or function to specify how to generate the initial values.
        nchain
            Number of chains to run.
        niter
            Number of iterations per chain.
        nwarmup
            Number of warm-up iterations.
        nthin
            Use every `i == nthin` sample to generate posterior distribution.
        adapt_delta
            Advanced control argument for sampler.
        stepsize
            Advanced control argument for sampler.
        max_treedepth
            Advanced control argument for sampler.
        ncore
            Argument for parallel computing while sampling multiple chains.

        Returns
        -------
        fit
            The fitted result returned by `vb` or `sampling` function.
        """
        if vb:
            return sm.vb(data=data_dict, pars=pars, init=gen_init)
        else:
            return sm.sampling(data=data_dict,
                               pars=pars,
                               init=gen_init,
                               chains=nchain,
                               iter=niter,
                               warmup=nwarmup,
                               thin=nthin,
                               control={
                                   'adapt_delta': adapt_delta,
                                   'stepsize': stepsize,
                                   'max_treedepth': max_treedepth
                               },
                               n_jobs=ncore)
Example #8
0
def verify_stan():
    """Smoke-test the Stan installation.

    Compiles a one-parameter model (``y ~ normal(0,1)``), samples it with the
    default settings and prints the mean of the draws of ``y``.
    """
    program = 'parameters {real y;} model {y ~ normal(0,1);}'
    draws = StanModel(model_code=program).sampling().extract()['y']
    print('If this worked, you will see a value near 0 now:')
    print(draws.mean())
Example #9
0
def _bayes_sampling(x, y, distribution='normal'):
    """
	Helper function.

	Args:
		x (array_like): sample of a treatment group
		y (array_like): sample of a control group
		distribution: name of the KPI distribution model, which assumes a
			Stan model file with the same name exists

	Returns:
		tuple:
			- the posterior samples
			- sample size of x
			- sample size of y
			- absolute mean of x
			- absolute mean of y
	"""
    # Checking if data was provided
    if x is None or y is None:
        raise ValueError('Please provide two non-None samples.')

    # Coercing missing values to right format
    _x = np.array(x, dtype=float)
    _y = np.array(y, dtype=float)

    mu_x = np.nanmean(_x)
    mu_y = np.nanmean(_y)
    n_x = statx.sample_size(_x)
    n_y = statx.sample_size(_y)

    if distribution == 'normal':
        fit_data = {'Nc': n_y, 'Nt': n_x, 'x': _x, 'y': _y}
    elif distribution == 'poisson':
        fit_data = {
            'Nc': n_y,
            'Nt': n_x,
            'x': _x.astype(int),
            'y': _y.astype(int)
        }
    else:
        raise NotImplementedError
    model_file = __location__ + '/../models/' + distribution + '_kpi.stan'
    sm = StanModel(file=model_file)

    fit = sm.sampling(data=fit_data,
                      iter=25000,
                      chains=4,
                      n_jobs=1,
                      seed=1,
                      control={
                          'stepsize': 0.01,
                          'adapt_delta': 0.99
                      })
    traces = fit.extract()

    return traces, n_x, n_y, mu_x, mu_y
def main():
    """Fit ``model.stan`` to the schools data and pickle model and fit.

    The compiled model and the fit are written together, as a dict with the
    keys ``'model'`` and ``'fit'``, to ``DATA_FILE_NAME``.
    """
    effects = [28, 8, -3, 7, -1, 1, 18, 12]
    std_errors = [15, 10, 16, 11, 9, 11, 10, 18]
    schools_dat = {'J': 8, 'y': effects, 'sigma': std_errors}

    compiled = StanModel(file='model.stan')
    result = compiled.sampling(data=schools_dat, iter=1000, chains=4,
                               seed=555)
    payload = {'model': compiled, 'fit': result}
    with open(DATA_FILE_NAME, 'wb') as out_file:
        pickle.dump(payload, out_file)
Example #11
0
def sparse_car_model(tobit_data: pd.DataFrame, ad_matrix):
    """Compile and sample ``models/sparse_tobitcar_students.stan``.

    The data dictionary is built by ``get_sparse_modeldict(tobit_data,
    ad_matrix)``.  Sampling runs 4 chains of 4000 iterations with 500 warm-up
    iterations, and the Stan summary is printed.

    Returns
    -------
    tuple
        ``(sparse_fit, sparse_model)``.
    """
    sparse_dict = get_sparse_modeldict(tobit_data, ad_matrix)
    # Close the model source once compilation has read it.
    with Path('models/sparse_tobitcar_students.stan').open() as stan_file:
        sparse_model = StanModel(file=stan_file, extra_compile_args=["-w"])
    sparse_fit = sparse_model.sampling(sparse_dict,
                                       iter=4000,
                                       warmup=500,
                                       chains=4)
    print(sparse_fit.stansummary())
    return sparse_fit, sparse_model
Example #12
0
def coin_model():
    """Fit the Bernoulli coin example and return the fit.

    Example from Kruschke, "Doing Bayesian Data Analysis".  The data are
    N = 50 observations of which z = 10 are ones; the model is read from
    ``models/bernoulli_example.stan``.
    """
    # Close the model source once compilation has read it; the local name no
    # longer shadows this function.
    with Path('models/bernoulli_example.stan').open() as stan_file:
        compiled = StanModel(file=stan_file)
    # generate some data
    N = 50
    z = 10
    y = [1] * z + [0] * (N - z)
    coin_data = {'y': y, 'N': N}
    # warmup is the same as burnin in JAGS
    return compiled.sampling(data=coin_data, chains=3, iter=1000, warmup=200)
Example #13
0
def stanTopkl():
    """Compile ``log_normal.stan`` and ``log_t.stan`` and pickle each one.

    For each model any existing ``<name>.pkl`` is removed first, then the
    model is compiled and written to ``<name>.pkl``.
    """
    for stem in ('log_normal', 'log_t'):
        pickle_path = stem + '.pkl'
        if os.path.isfile(pickle_path):
            os.remove(pickle_path)
        compiled = StanModel(file=stem + '.stan')
        with open(pickle_path, 'wb') as out_file:
            pickle.dump(compiled, out_file)
 def test_init_zero_exception_inf_grad(self):
     """Sampling with ``init='0'`` must raise ``RuntimeError``.

     The model adds ``1 / log(x)`` to the target; the test name indicates
     the expected failure is an infinite gradient at the zero
     initialisation.
     """
     program = """
     parameters {
         real x;
     }
     model {
         target += 1 / log(x);
     }
     """
     compiled = StanModel(model_code=program)
     self.assertRaises(RuntimeError, compiled.sampling,
                       init='0', iter=1, chains=1)
Example #15
0
 def test_init_zero_exception_inf_grad(self):
     """Expect ``RuntimeError`` when sampling starts from ``init='0'``.

     The Stan program increments the target by ``1 / log(x)``; one chain
     with a single iteration is requested.
     """
     stan_source = """
     parameters {
         real x;
     }
     model {
         target += 1 / log(x);
     }
     """
     model = StanModel(model_code=stan_source)
     sampling_args = {'init': '0', 'iter': 1, 'chains': 1}
     with self.assertRaises(RuntimeError):
         model.sampling(**sampling_args)
Example #16
0
 def test_init_zero_exception_inf_grad(self):
     """Sampling from ``init='0'`` must fail with 'divergent gradient'.

     The model assigns ``1 / log(x)`` to ``lp__``.  The regex-assertion
     helper is chosen by the ``PY2`` flag.
     """
     program = """
     parameters {
         real x;
     }
     model {
         lp__ <- 1 / log(x);
     }
     """
     compiled = StanModel(model_code=program)
     if PY2:
         expect_raises = self.assertRaisesRegexp
     else:
         expect_raises = self.assertRaisesRegex
     with expect_raises(RuntimeError, 'divergent gradient'):
         compiled.sampling(init='0', iter=1)
Example #17
0
 def test_init_zero_exception_inf_grad(self):
     """Expect a 'divergent gradient' ``RuntimeError`` for ``init='0'``.

     The Stan program sets ``lp__`` to ``1 / log(x)``; sampling is run with
     a single iteration.
     """
     stan_source = """
     parameters {
         real x;
     }
     model {
         lp__ <- 1 / log(x);
     }
     """
     model = StanModel(model_code=stan_source)
     # Pick the regex assertion by the PY2 flag.
     helper_name = 'assertRaisesRegexp' if PY2 else 'assertRaisesRegex'
     raises_regex = getattr(self, helper_name)
     with raises_regex(RuntimeError, 'divergent gradient'):
         model.sampling(init='0', iter=1)
Example #18
0
def linear_model():
    """Fit the linear regression example and return the fit.

    First example from the Stan User's Guide.  Ten (x, y) pairs are fitted
    with ``models/linear_example.stan`` using the default sampling settings,
    and the posterior means of ``alpha`` and ``beta`` are printed.
    """
    # Close the model source once compilation has read it; the local name no
    # longer shadows this function.
    with Path('models/linear_example.stan').open() as stan_file:
        compiled = StanModel(file=stan_file, extra_compile_args=["-w"])
    x = list(range(10))
    y = [1.1, 2.04, 3.07, 3.88, 4.95, 6.11, 7.03, 7.89, 8.91, 10]
    linear_data = {'x': x, 'y': y, 'N': 10}
    linear_fit = compiled.sampling(data=linear_data)
    linear_res = linear_fit.extract()
    print('α : {}'.format(np.mean(linear_res['alpha'])))
    print('β : {}'.format(np.mean(linear_res['beta'])))
    return linear_fit
Example #19
0
def bnb_stan(dataset, oos_dataset, warmup=20000, n_iter=25000):
    """Fit ``../stan/games.stan`` and return samples and mean predictions.

    Both datasets are unpacked with ``extract_data``; ratings, ``pct`` and
    expectations are passed pairwise (in-sample, out-of-sample) through
    ``normalize``.  The ratings are expanded to two columns before being
    handed to Stan.

    Returns
    -------
    tuple
        ``(samples, mean_preds)`` where ``mean_preds`` is the mean over the
        first axis of ``samples['predicted_probabilities']``.
    """
    goals, ratings, expectations, team_dummies, pct = extract_data(dataset)
    (_, oos_ratings, oos_expectations,
     oos_team_dummies, oos_pct) = extract_data(oos_dataset)

    ratings = ratings.squeeze()
    oos_ratings = oos_ratings.squeeze()
    ratings, oos_ratings = normalize(ratings, oos_ratings)

    # NOTE(review): the in-sample second column is the plain square while
    # the out-of-sample one is the signed square -- confirm the asymmetry
    # is intended.
    ratings = np.stack((ratings, np.square(ratings)), axis=1)
    signed_oos_square = np.sign(oos_ratings) * np.square(oos_ratings)
    oos_ratings = np.stack((oos_ratings, signed_oos_square), axis=1)

    pct, oos_pct = normalize(pct, oos_pct)
    expectations, oos_expectations = normalize(expectations,
                                               oos_expectations)

    # Axis 1 of the dummy arrays is indexed 0 for home and 1 for away.
    stan_data = {
        'n_rows': goals.shape[0],
        'n_teams': team_dummies.shape[2],
        'm_ratings': ratings.shape[1],
        'max_goals': 10,
        'home_team_dummies': team_dummies[:, 0, :],
        'away_team_dummies': team_dummies[:, 1, :],
        'expectations': expectations,
        'pct': pct,
        'ratings': ratings,
        'Y': goals.astype(np.int16),
        'oos_n_rows': oos_ratings.shape[0],
        'oos_home_team_dummies': oos_team_dummies[:, 0, :],
        'oos_away_team_dummies': oos_team_dummies[:, 1, :],
        'oos_expectations': oos_expectations,
        'oos_ratings': oos_ratings,
        'oos_pct': oos_pct,
    }

    sampler_control = {'adapt_delta': 0.99, 'max_treedepth': 15}
    compiled = StanModel('../stan/games.stan')
    samples = compiled.sampling(stan_data,
                                warmup=warmup,
                                iter=n_iter,
                                chains=4,
                                refresh=1,
                                control=sampler_control)
    mean_preds = np.mean(samples['predicted_probabilities'], axis=0)
    return samples, mean_preds
Example #20
0
def stanTopkl():
    """Compile the Stan models up front so they need not be recompiled later.

    ``log_normal.stan`` and ``log_t.stan`` are compiled in that order; each
    compiled model replaces any existing ``<name>.pkl``.
    """

    def _rebuild(stan_path, pickle_path):
        # Remove the previous pickle, compile, then write the new pickle.
        if os.path.isfile(pickle_path):
            os.remove(pickle_path)
        compiled = StanModel(file=stan_path)
        with open(pickle_path, 'wb') as out_file:
            pickle.dump(compiled, out_file)

    _rebuild('log_normal.stan', 'log_normal.pkl')
    _rebuild('log_t.stan', 'log_t.pkl')
Example #21
0
def compile_stan_model(stan_model_name):
    """Compile a packaged Stan model and cache it as a pickle.

    The source is ``stan/<name>.stan`` and the pickle is
    ``stan_compiled/<name>.pkl``, both resolved inside the ``orbit``
    package.  Compilation is skipped when the pickle exists and is not older
    than the source.  Always returns ``None``.
    """
    stan_path = pkg_resources.resource_filename(
        'orbit', 'stan/{}.stan'.format(stan_model_name))
    pickle_path = pkg_resources.resource_filename(
        'orbit', 'stan_compiled/{}.pkl'.format(stan_model_name))

    # updated for py3
    os.makedirs(os.path.dirname(pickle_path), exist_ok=True)

    # Nothing to do when an up-to-date pickle is already there.
    if os.path.isfile(pickle_path) and \
            os.path.getmtime(pickle_path) >= os.path.getmtime(stan_path):
        return None

    with open(stan_path, encoding="utf-8") as source_file:
        stan_code = source_file.read()

    compiled = StanModel(model_code=stan_code)

    with open(pickle_path, 'wb') as out_file:
        pickle.dump(compiled, out_file, protocol=pickle.HIGHEST_PROTOCOL)

    return None
Example #22
0
def get_or_compile_stan_model(model_file, distribution):
    """Return a compiled Stan model, caching it as a pickle in the temp dir.

    If the pickle for ``distribution`` and the running Python version exists
    in ``tempfile.gettempdir()`` it is loaded; otherwise ``model_file`` is
    compiled and the result is pickled there.

    Args:
        model_file: model file location
        distribution: name of the KPI distribution model; used to build the
            cache file name
    Returns:
        the compiled (or unpickled) Stan model
    Note: the cache path is hard-coded, which may cause issues in future.
    Alternatives that were considered:
        1. Using global variables
        2. Pre-compiling the Stan models and shipping them with the project
        (3). Temporary files created by ``tempfile`` have unique names that
            are hard to track, so only the temp *directory* is used.
    """
    python_version = '{0[0]}.{0[1]}'.format(sys.version_info)
    compiled_model_file = tempfile.gettempdir() + '/expan_early_stop_compiled_stan_model_' \
                          + distribution + '_' + python_version + '.pkl'

    if os.path.isfile(compiled_model_file):
        # NOTE(review): unpickling from a shared temp directory trusts
        # whoever can write there -- confirm this is acceptable.
        with open(compiled_model_file, 'rb') as cached:
            sm = pickle.load(cached)
    else:
        sm = StanModel(file=model_file)
        with open(compiled_model_file, 'wb') as f:
            pickle.dump(sm, f)
    return sm
Example #23
0
 def test_bernoulli_compile_time(self):
     """Compile the Bernoulli model and log how long compilation took."""
     source = self.bernoulli_model_code
     started = time.time()
     compiled = StanModel(model_code=source)
     self.assertIsNotNone(compiled)
     elapsed_seconds = int(time.time() - started)
     logging.info(
         "Compile time: {}s (vs. RStan 28s)\n".format(elapsed_seconds))
Example #24
0
 def test_stanc_exception(self):
     """An unknown distribution must raise ``ValueError`` from both APIs.

     The same invalid program is passed first to ``stanc`` and then to
     ``StanModel``; each must raise with a message matching
     'unknown distribution'.
     """
     bad_program = 'parameters {real z;} model {z ~ no_such_distribution();}'
     if PY2:
         expect_raises = self.assertRaisesRegexp
     else:
         expect_raises = self.assertRaisesRegex
     for compile_fn in (stanc, StanModel):
         with expect_raises(ValueError, 'unknown distribution'):
             compile_fn(model_code=bad_program)
Example #25
0
def get_stan_model(model_name, recompile=False, **model_params):
    """
    Return a compiled Stan model, using the pickle cache when possible.

    Parameters
    ----------
    model_name : str
        The name of the model to use
            "bernoulli", "b": Bernoulli likelihood flat prior on probabilities
            "beta-binomial", "bb": Binomial likelihood and Beta prior
    recompile : boolean
        If True, always recompile the model; otherwise use the cached pickle
        when it exists.
    **model_params
        Accepted but not used in this function.

    The cache file lives in ``STAN_MODEL_CACHE`` and its name combines the
    model name with the running Python version.
    """
    version_tag = 'python{0[0]}.{0[1]}'.format(sys.version_info)
    # Empty name parts are left out of the file name.
    stem = "-".join(filter(None, (model_name, version_tag)))
    cache_path = os.path.join(STAN_MODEL_CACHE, stem + ".pickle")

    # Cache hit: load and return straight away.
    if not recompile and os.path.isfile(cache_path):
        with open(cache_path, 'rb') as cached:
            return pickle.load(cached)

    model = StanModel(model_code=get_stan_model_code(model_name))
    logging.info('Saving model to {}'.format(cache_path))
    with open(cache_path, 'wb') as out_file:
        pickle.dump(model, out_file)
    return model
Example #26
0
    def __init__(self, context, rdd, prepare_data_callback, stan_file):
        """Compile the Stan model and register its pickle with ``context``.

        Parameters
        ----------
        context
            Object providing ``addFile``; presumably a SparkContext --
            confirm with the caller.
        rdd
            Dataset providing ``getNumPartitions``; stored as ``self.rdd``.
        prepare_data_callback
            Stored as ``self.prepare_data_callback``; not called here.
        stan_file
            Path of the Stan program to compile.
        """
        self.rdd = rdd
        self.prepare_data_callback = prepare_data_callback
        self.n_partitions = self.rdd.getNumPartitions()

        sm = StanModel(file=stan_file)
        # Close (and thereby flush) the pickle before handing the path to
        # addFile, so a fully written file is what gets registered.
        with open(PICKLE_FILENAME, "wb") as pickle_file:
            pickle.dump(sm, pickle_file)
        context.addFile(PICKLE_FILENAME)
Example #27
0
def tobit_vec_QR(tobit_data: pd.DataFrame, scaled: bool = False):
    """Fit the vectorised tobit model (``tobit_students_vec_qr.stan``).

    Vectorised version of the tobit model that combines the parameters for
    the censored values with the uncensored values into a transformed y for
    more efficiency.  Rows with ``apt == 800`` are treated as censored.

    Parameters
    ----------
    tobit_data
        Frame containing the ``new_preds`` columns plus ``apt`` and ``id``.
    scaled
        If true, predictors and ``apt`` are rescaled with ``MaxAbsScaler``
        and the scaled predictors of the censored rows are passed as
        ``X_cens``.

    Returns
    -------
    tuple
        ``(vec_fit, vec_model)``.
    """
    # Close the model source once compilation has read it.
    with Path('models/tobit_students_vec_qr.stan').open() as stan_file:
        vec_model = StanModel(file=stan_file, extra_compile_args=["-w"])
    not_800 = tobit_data['apt'] != 800
    is_800 = tobit_data['apt'] == 800
    ii_obs = tobit_data[not_800]['id']
    ii_cens = tobit_data[is_800]['id']

    if scaled:
        trans = MaxAbsScaler().fit_transform(tobit_data[new_preds + ['apt']])
        # NOTE(review): the scaled frame gets a fresh default index while the
        # masks carry tobit_data's index -- assumes a default RangeIndex.
        source = pd.DataFrame(trans, columns=new_preds + ['apt'])
    else:
        source = tobit_data

    vec_dict = {
        'X': source[new_preds],
        'n_obs': not_800.sum(),
        'n_cens': is_800.sum(),
        # NOTE(review): U stays 800 even when the data are scaled -- confirm
        # this matches the scaled range of 'apt'.
        'U': 800,
        'y_obs': source[not_800]['apt'],
        'p': len(new_preds),
        'ii_obs': ii_obs,
        'ii_cens': ii_cens
    }
    if scaled:
        vec_dict['X_cens'] = source[is_800][new_preds]

    vec_fit = vec_model.sampling(data=vec_dict,
                                 iter=10000,
                                 chains=4,
                                 warmup=2000,
                                 control=c_params)
    print('β: {}'.format(vec_fit['beta'][501:].mean(axis=0)))
    print(vec_fit.stansummary())
    return vec_fit, vec_model
Example #28
0
def tobit_simple_model(tobit_data: pd.DataFrame, scaled: bool = False):
    """Fit the split censored tobit model (``tobit_students_split.stan``).

    Rows with ``apt == 800`` are passed as censored data (``X_cens``,
    ``y_cens``), all other rows as observed data (``X``, ``y``).

    Author's notes (carried over): this is variant 2), using a censored
    model with a shared sigma; the paper clearly distinguishes
    ε_{it} ~ normal(0, σ^2) from θ^m_{it} ~ normal(0, δ^2_m).  Results look
    close to the tutorial values (Intercept 209.5488, read 2.6980,
    math 5.9148).

    Parameters
    ----------
    tobit_data
        Frame containing the ``new_preds`` columns plus ``apt``.
    scaled
        If true, predictors and ``apt`` are rescaled with ``MaxAbsScaler``
        and the upper bound ``U`` is 1 instead of 800.

    Returns
    -------
    tuple
        ``(censored_fit, censored_model)``.
    """
    # Close the model source once compilation has read it.
    with Path('models/tobit_students_split.stan').open() as stan_file:
        censored_model = StanModel(file=stan_file, extra_compile_args=["-w"])
    not_800 = tobit_data['apt'] != 800
    is_800 = tobit_data['apt'] == 800

    if scaled:
        trans = MaxAbsScaler().fit_transform(tobit_data[new_preds + ['apt']])
        # NOTE(review): the scaled frame gets a fresh default index while the
        # masks carry tobit_data's index -- assumes a default RangeIndex.
        source = pd.DataFrame(trans, columns=new_preds + ['apt'])
        upper_bound = 1
    else:
        source = tobit_data
        upper_bound = 800

    cens_dict_ex = {
        'X': source[not_800][new_preds],
        'n': tobit_data.shape[0] - is_800.sum(),
        'y': source[not_800]['apt'],
        'n_cens': is_800.sum(),
        'p': len(new_preds),
        # Censored predictors now come from the same (possibly scaled) frame
        # as X; previously the scaled branch mixed scaled X with raw X_cens.
        'X_cens': source[is_800][new_preds],
        'y_cens': source[is_800]['apt'],
        'U': upper_bound
    }
    censored_fit = censored_model.sampling(data=cens_dict_ex,
                                           iter=2000,
                                           chains=4,
                                           warmup=500,
                                           control=c_params)
    censored_res = censored_fit.extract()
    print('β: {}'.format(censored_res['beta'][501:].mean(axis=0)))
    return censored_fit, censored_model
Example #29
0
def build_stan_model(target_dir, model_dir=MODEL_DIR):
    """Compile ``prophet.stan`` and pickle it as ``prophet_model.pkl``.

    The source is read from ``model_dir`` and the pickle is written to
    ``target_dir``.
    """
    from pystan import StanModel

    source_path = os.path.join(model_dir, 'prophet.stan')
    pickle_path = os.path.join(target_dir, 'prophet_model.pkl')

    with open(source_path) as source_file:
        stan_code = source_file.read()
    compiled = StanModel(model_code=stan_code)
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(compiled, out_file, protocol=pickle.HIGHEST_PROTOCOL)
Example #30
0
def run_inference():
    """Sample the finite Gaussian mixture model on the 3-Gaussians data.

    Reads the ``XX`` and ``YY`` columns of ``3gaussians-10k.csv``, compiles
    ``finite_gaussian_mixture.stan`` and returns the result of ``sampling``
    (200 warm-up iterations, 700 iterations in total).
    """
    df = pd.read_csv('3gaussians-10k.csv')
    X = np.array(df[['XX', 'YY']].values)
    K = 3
    # Take the sizes from the data instead of repeating them as literals, so
    # the declared dimensions always match the array handed to Stan.
    n_points, n_dims = X.shape

    data = {'D': n_dims,
            'K': K,
            'N': n_points,
            'Omega0': np.identity(n_dims),
            'alpha': K * [0.1],
            'beta0': 0.1,
            'dof0': 1.1,
            'm0': np.zeros(n_dims),
            'x': X}

    model = StanModel(file='finite_gaussian_mixture.stan')

    return model.sampling(data=data, warmup=200, iter=700)
Example #31
0
def compile_stan_models(target_dir, model_dir=MODEL_DIR):
    """Pre-compile the Stan models used by the module.

    Each model is read from ``model_dir``, compiled and pickled into
    ``target_dir``.
    """
    from pystan import StanModel

    # (Stan source file, pickle file) pairs, processed in this order.
    jobs = (
        ("simple_model.stan", "simple_model.pkl"),
        ("model_with_prior.stan", "prior_model.pkl"),
    )
    for stan_name, pickle_name in jobs:
        compiled = StanModel(file=os.path.join(model_dir, stan_name))
        pickle_path = os.path.join(target_dir, pickle_name)
        with open(pickle_path, "wb") as out_file:
            pickle.dump(compiled, out_file, protocol=pickle.HIGHEST_PROTOCOL)
Example #32
0
def compile_stan_models(target_dir, model_dir=MODEL_DIR):
    """Pre-compile the Stan player model used by the module.

    ``player_forecasts.stan`` is read from ``model_dir``; the compiled model
    is pickled to ``player_forecasts.pkl`` in ``target_dir``.
    """
    from pystan import StanModel

    announcement = "Compiling Stan player model, and putting pickle in {}"
    print(announcement.format(target_dir))

    source_path = os.path.join(model_dir, "player_forecasts.stan")
    pickle_path = os.path.join(target_dir, "player_forecasts.pkl")
    compiled = StanModel(file=source_path)
    with open(pickle_path, "wb") as out_file:
        pickle.dump(compiled, out_file, protocol=pickle.HIGHEST_PROTOCOL)
Example #33
0
def test_matrix_param_order_optimizing():
    """Check the layout of a matrix parameter returned by ``optimizing``.

    Column 1 of ``beta`` has a ``normal(0,1)`` prior and column 2 a
    ``normal(100,1)`` prior, so the column means of the optimum show whether
    the K x 2 matrix was filled in the right order.
    """
    program = """
    data {
    int<lower=2> K;
    }
    parameters {
    matrix[K,2] beta;
    }
    model {
    for (k in 1:K)
      beta[k,1] ~ normal(0,1);
    for (k in 1:K)
      beta[k,2] ~ normal(100,1);
    }"""
    compiled = StanModel(model_code=program)
    optimum = compiled.optimizing(data={'K': 3})
    estimate = optimum['par']['beta']
    assert estimate.shape == (3, 2)
    column_means = np.mean(estimate, axis=0)
    assert column_means[0] < 4
    assert column_means[1] > 100 - 4
Example #34
0
import pandas as pd
from pystan import StanModel
import matplotlib.pyplot as plt
import pickle

# Load the attendance data, rescale Score by 200 and build the Stan data
# dictionary (one list per column plus the row count N).
d = pd.read_csv('input/data-attendance-1.txt')
d.Score /= 200
data = d.to_dict('list')
data.update({'N':len(d)})

stanmodel = StanModel(file='model/model5-3.stan')

# NUTS (No U-Turn Sampler)
fit_nuts = stanmodel.sampling(data=data, n_jobs=1)
mcmc_sample = fit_nuts.extract()
mu_est = mcmc_sample['mu']

# ADVI (Automatic Differentiation Variational Inference)
# The draws are read back from the sample file that vb() reports.
fit_vb = stanmodel.vb(data=data)
vb_sample = pd.read_csv(fit_vb['args']['sample_file'].decode('utf-8'), comment='#')
vb_sample = vb_sample.drop([0,1])
# Raw string: '\.' and '\d' are regex escapes, not Python string escapes.
# Note that this rebinds mu_est, replacing the NUTS estimate above.
mu_est = vb_sample.filter(regex=r'mu\.\d+')

# The model and the NUTS fit are written one after the other into the same
# file, so they must be read back with two consecutive pickle.load calls.
with open('output/model_and_result.pkl', 'wb') as f:
    pickle.dump(stanmodel, f)
    pickle.dump(fit_nuts, f)
Example #35
0
def test8schools():
    """Smoke-test the eight-schools model across several sampler settings.

    Compiles the model twice (once with ``save_dso=False``), runs HMC and
    three NUTS configurations through ``StanModel.sampling``, calls the
    accessor methods of each fit, and finally drives the ``stan`` convenience
    function with and without explicit inits. The function contains no
    assertions: it only checks that none of these calls raises.
    """

    model_name = "_8chools"
    sfile = os.path.join(os.path.dirname(__file__),
                         "../stan/src/models/misc/eight_schools/eight_schools.stan")
    m = StanModel(file=sfile, model_name=model_name, verbose=True)
    # Bare attribute access: only verifies that the attribute exists.
    m.dso

    # Same model again, this time asking not to save the DSO.
    yam = StanModel(file=sfile, model_name=model_name, save_dso=False, verbose=True)
    yam.dso

    dat = dict(J=8, y=(28,  8, -3,  7, -1,  1, 18, 12),
               sigma=(15, 10, 16, 11,  9, 11, 10, 18))

    # NOTE: this local shadows the builtin ``iter`` for the rest of the function.
    iter = 5020

    # HMC
    ss1 = m.sampling(data=dat, iter=iter, chains=4, algorithm='HMC', refresh=100)
    # Re-use the existing fit through the stan() wrapper with two init radii.
    ss1son = stan(fit=ss1, data=dat, init_r=0.0001)
    ss1son = stan(fit=ss1, data=dat, init_r=0)
    ainfo1 = ss1.get_adaptation_info()
    lp1 = ss1.get_logposterior()
    yalp1 = ss1.get_logposterior(inc_warmup=False)
    sp1 = ss1.get_sampler_params()
    yasp1 = ss1.get_sampler_params(inc_warmup=False)
    gm1 = ss1.get_posterior_mean()
    print(gm1)

    # NUTS 1 (metric="unit_e")
    ss2 = m.sampling(data=dat, iter=iter, chains=4, refresh=100,
                     control=dict(metric="unit_e"))
    ainfo2 = ss2.get_adaptation_info()
    lp2 = ss2.get_logposterior()
    yalp2 = ss2.get_logposterior(inc_warmup=False)
    sp2 = ss2.get_sampler_params()
    yasp2 = ss2.get_sampler_params(inc_warmup=False)
    gm2 = ss2.get_posterior_mean()
    print(gm2)

    # NUTS 2 (no control argument passed)
    ss3 = m.sampling(data=dat, iter=iter, chains=4, refresh=100)
    ainfo3 = ss3.get_adaptation_info()
    lp3 = ss3.get_logposterior()
    yalp3 = ss3.get_logposterior(inc_warmup=False)
    sp3 = ss3.get_sampler_params()
    yasp3 = ss3.get_sampler_params(inc_warmup=False)

    gm3 = ss3.get_posterior_mean()
    print(gm3)

    # Non-diag (metric='dense_e')
    ss4 = m.sampling(data=dat, iter=iter, chains=4,
                     control=dict(metric='dense_e'), refresh=100)
    ainfo4 = ss4.get_adaptation_info()
    lp4 = ss4.get_logposterior()
    yalp4 = ss4.get_logposterior(inc_warmup=False)
    sp4 = ss4.get_sampler_params()
    yasp4 = ss4.get_sampler_params(inc_warmup=False)

    gm4 = ss4.get_posterior_mean()
    print(gm4)

    # Exercise the fit objects' string representation and plotting helpers.
    print(ss1)
    print(ss2)
    print(ss3)

    ss1.plot()
    ss1.traceplot()

    # Same sampler call with a smaller refresh value; the result is unused.
    ss9 = m.sampling(data=dat, iter=iter, chains=4, refresh=10)

    iter = 52012

    # Go through the stan() convenience function, writing draws to a CSV file.
    ss = stan(sfile, data=dat, iter=iter, chains=4, sample_file='8schools.csv')

    print(ss)

    # Re-run with the seed and inits recorded by the previous fit.
    ss_inits = ss.inits
    ss_same = stan(sfile, data=dat, iter=iter, chains=4,
                   seed=ss.stan_args[0]['seed'], init=ss_inits,
                   sample_file='ya8schools.csv')

    # The comparison result is computed but never asserted.
    b = np.allclose(ss.extract(permuted=False), ss_same.extract(permuted=False))
    # b is not true as ss is initialized randomly while ss.same is not.

    s = ss_same.summary(pars="mu", probs=(.3, .8))
    # not in python: print(ss.same, pars='theta', probs=c(.4, .8))
    print(ss_same)
# Number of observations; the jitter applied below does not change it.
N = len(x_data)

# Introduce noise: redraw every point from a normal distribution centred on
# its current value (standard deviation 7 for x, 8 for y).
x_data = np.random.normal(x_data, 7)
y_data = np.random.normal(y_data, 8)

# Plot the noisy data as individual points.
pyplot.plot(x_data, y_data, 'o')

# Data dictionary passed to the Stan model's sampling call.
stan_data_mappings = {
  'x': x_data,
  'y': y_data,
  'N': N,
}

model = StanModel(file='models/univariate_regression.stan')

fit = model.sampling(data=stan_data_mappings)

# Posterior draws of the three model parameters.
params = fit.extract()
a_pred = params['a']
b_pred = params['b']
sigma_pred = params['sigma']

# 100 evenly spaced x values, extending 10 units beyond the observed range
# on either side.
xfit = np.linspace(-10 + min(x_data), 10 + max(x_data), 100)

# Number of posterior draws.
M = len(a_pred)

# One fitted line per draw: the (M, 1) columns broadcast against xfit
# (assumes a_pred and b_pred are 1-D of length M -- TODO confirm).
yfit = a_pred.reshape((M, 1)) + b_pred.reshape((M, 1)) * xfit
Example #37
0
    'sc_mean_vec': sc_mean_vec,
    'sc_var_vec': sc_var_vec,
    'mp_mean_prior_mean': 1,
    'mp_mean_prior_var': 2,
    'mp_var_prior_shape': 3,
    'mp_var_prior_scale': 4,
    'mp_corr_prior_conc': 3,
    'exponent_prior_mean': exponent,
    'base_rate_prior_mean': base_rate,
    'threshold_prior_mean': threshold,
    'n_samples': n_samp_est,
    'stdnorm_samples': stdnorm_samples
}

# Compile the Stan model only on request; otherwise reuse the cached pickle.
if recompile:
    sm = StanModel(file='corr_rate.stan', verbose=False)
    with open('corr_rate.pkl', 'wb') as f:
        pickle.dump(sm, f)
else:
    # Context manager so the cache file handle is closed deterministically.
    with open('corr_rate.pkl', 'rb') as f:
        sm = pickle.load(f)

fit = sm.sampling(data=corr_dat, iter=2000, chains=3)
estimation = fit.extract(permuted=True)
cm = estimation['mp_corr_mat']
# Persist the correlation-matrix draws both as a pickle and as a .mat file.
# The pickle is written inside a `with` block so it is flushed and closed
# before the script moves on.
with open('corr_mat_samples.pkl', 'wb') as f:
    pickle.dump(cm, f)
savemat('corr_mat_samples.mat', {'cm': cm})


# Single-letter colour codes (presumably matplotlib colours for the plots
# that follow -- TODO confirm).
mp_col = 'r'
sc_true_col = 'y'
sc_obs_col = 'g'
import numpy as np
import pickle
import pystan
from matplotlib import pyplot as plt
# Function-call form of print: valid on Python 3 and prints the same text on
# Python 2 (the original statement form is a SyntaxError on Python 3).
print('test')


from pystan import StanModel

# Compile the model once and cache the compiled object on disk.
sm = StanModel(file='model.stan')
with open('model.pkl','wb') as f:
    pickle.dump(sm, f)

import scipy as sc
def phi(x,mu=0,sd=1):
    """Return the normal cumulative distribution function evaluated at ``x``.

    Args:
        x: Scalar or NumPy array of evaluation points.
        mu: Mean of the distribution.
        sd: Standard deviation of the distribution.

    Returns:
        ``0.5 * (1 + erf((x - mu) / (sd * sqrt(2))))``, with the same shape
        as ``x``.
    """
    # Import the submodule explicitly: a bare ``import scipy`` is not
    # guaranteed to expose ``scipy.special`` on every SciPy release.
    from scipy.special import erf

    return 0.5 * (1 + erf((x - mu) / (sd * np.sqrt(2))))

# Synthetic data set of N trials.
N = 300
# df is uniform on [-0.6, 0.6) and d1 is uniform on [-1, 1).
df = 2*(np.random.random(N)-0.5)*0.6
d1 = 2*(np.random.random(N)-0.5)
# A(sc, de) builds the curve d -> sc * d * exp(-|d| / de). The parameter
# named ``sc`` shadows the scipy alias only inside the lambda.
A = lambda sc,de : lambda d : sc*d*np.exp(-np.abs(d)/de)
alpha = A(5,1)
sigma = 0.06
# Binary outcomes: 1 with probability phi(df/sigma - alpha(d1)), else 0.
y = (np.random.rand(N)<phi(df/sigma-alpha(d1))).astype(int)

plt.plot(d1,alpha(d1),'rx',)
plt.plot(d1,df/sigma,'gx',)


# Reload the compiled model from its pickle; the context manager closes the
# file handle instead of leaving it to the garbage collector.
with open('model.pkl','rb') as f:
    sm = pickle.load(f)
Example #39
0
def test_optimizing_basic():
    """Optimizing a scalar standard-normal model yields a 0-d value near zero."""
    program = 'parameters {real y;} model {y ~ normal(0,1);}'
    compiled = StanModel(model_code=program)
    result = compiled.optimizing()
    # A scalar parameter comes back with an empty shape.
    assert result['par']['y'].shape == ()
    assert abs(result['par']['y']) < 1
Example #40
0
File: toy.py Project: andycasey/ges
		// We will do this in separate dimensions (teff, logg) because we are not considering
		// the case that the outliers are covariant in the same way we believe the measurements
		// are

		increment_log_prob(log_sum_exp(
			log1m_alpha + normal_log(sp_vector[1], outlier_teff_mu, outlier_teff_sigma),
			log1m_alpha + normal_log(sp_vector[2], outlier_teff_mu, outlier_teff_sigma)
		));
    }
}"""

# Ok, here is our toy data: a JSON file loaded into the dict handed to Stan.
with open("toy.data", "r") as fp:
	data = json.load(fp)

model = StanModel(model_code=model_code)

# Point estimate first; its "par" entry is reused by the sampling call below.
print("Optimizing...")
op = model.optimizing(data=data)

print("Fitting...")
# NOTE(review): the optimizer result is passed as ``pars`` (the parameters of
# interest), not as ``init``. If the intent was to start the chains at the
# optimum this should probably be ``init`` -- confirm against the PyStan docs.
fit = model.sampling(data=data, pars=op["par"], iter=20000)

# Keyword values for figure layout (presumably passed to matplotlib's
# subplots_adjust further down -- TODO confirm).
subplots_adjust = { "left": 0.10, "bottom": 0.05, "right": 0.95, "top": 0.95,
	"wspace": 0.20, "hspace": 0.45
	}

# Two nodes and the two stellar-parameter dimensions used by the plots.
nodes = range(2)
dimensions = ("teff", "logg")

# Plot the m, b parameters for each node
Example #41
0
    'C': C,
    'z_shape': z_shape,
    'z_scale': z_scale,
    'g_shape': g_shape,
    'g_scale': g_scale,
    'Var_u': Var_u
}

# Choose the file-name stem of the Stan model variant to run.
# NOTE(review): no branch handles other model_type values, so ``fname`` must
# already be defined in that case -- confirm against the code above.
if model_type == 1:
    fname = "csm"
elif model_type == 2:
    fname = "csm2"
elif model_type == 3:
    fname = "msm"
# Compile the Stan model only on request; otherwise reuse the cached pickle.
if recompile:
    sm = StanModel(file=fname + '_inference.stan')
    with open(fname + '_inference.pkl', 'wb') as f:
        pickle.dump(sm, f)
else:
    # Context manager so the cache file handle is closed deterministically.
    with open(fname + '_inference.pkl', 'rb') as f:
        sm = pickle.load(f)

fit = sm.sampling(data=gsm_dat, iter=2000, chains=2)
estimation = fit.extract(permuted=True)

# Compare posterior means (averaged over axis 0 of the draws) with the
# synthetic values.
g_est_mean = np.mean(estimation["g"], 0)
print('g est', g_est_mean)
print('g true', g_synth)

z_est_mean = np.mean(estimation["z"], 0).T
print('z est', z_est_mean)
print('z true', z_synth)
Example #42
0
  parameters {
    real mu;
    real<lower=0> tau;
    real eta[J];
  }
  transformed parameters {
    real theta[J];
    for (j in 1:J)
      theta[j] <- mu + tau * eta[j];
  }
  model {
    eta ~ normal(0, 1);
    y ~ normal(theta, sigma);
  }
'''
# Compile the schools model from the inline Stan program.
m = StanModel(model_code=schools_code, model_name=model_name, verbose=True)

# Data for the eight schools: J values of y and sigma.
J = 8
y = (28,  8, -3,  7, -1,  1, 18, 12)
sigma = (15, 10, 16, 11,  9, 11, 10, 18)

# NOTE: this module-level name shadows the builtin ``iter``.
iter = 1000
dat = dict(J=J, y=y, sigma=sigma)
ss1 = m.sampling(data=dat, iter=iter, chains=4, refresh=100)

print(ss1)
ss1.traceplot()

# Same program through the stan() convenience function, writing the draws to
# a CSV sample file.
ss = stan(model_code=schools_code, data=dat, iter=iter, chains=4,
          sample_file='8schools.csv')
print(ss)
Example #43
0
            yield obsid, odata


standata = StanData(datafile, samplefile, obssel)
N = standata.Noutcomes()
K = standata.Npredictors()
matrix = standata.predictors()
# Preview the predictors: at most 5 lines of at most 10 tab-separated values.
# The print calls take a single string, so the output is the same on
# Python 2 and Python 3.
print("Predictor matix ( {} X {} ):".format(K, N))
for i in range(min(K, 5)):
    print("\t".join( [str(matrix[j,i]) for j in range(min(N, 10))] ))

from pystan import StanModel
# Cache the compiled model next to the model file; compile only when the
# pickle does not exist yet.
smfile = modelfile + ".pkl"
if not os.path.exists(smfile):
    # Model name: "<model file stem>_<data file stem>".
    modelname = os.path.splitext(os.path.basename(modelfile))[0] + "_" + os.path.splitext(os.path.basename(datafile))[0]
    stanmodel = StanModel(file=modelfile, model_name=modelname)
    with open(smfile, 'wb') as f:
        pickle.dump(stanmodel, f)
else:
    # Context manager so the cache file handle is closed deterministically.
    with open(smfile, 'rb') as f:
        stanmodel = pickle.load(f)

def get_median(sample_array):
    """Return the median of ``sample_array``.

    The sequence is sorted in place first, as before, so the caller's
    object is left in ascending order.

    Args:
        sample_array: Mutable sequence of numbers that supports ``.sort()``.

    Returns:
        The middle element when the length is odd, otherwise the mean of
        the two middle elements as a float.

    Raises:
        IndexError: If ``sample_array`` is empty.
    """
    sample_array.sort()
    n = len(sample_array)
    if n == 0:
        raise IndexError("cannot take the median of an empty sequence")
    # Floor division keeps the index an int on both Python 2 and Python 3.
    mid = n // 2
    if n % 2:
        # Odd length: there is a single middle element.
        return sample_array[mid]
    # Even length: average the two elements either side of the centre.
    return (sample_array[mid - 1] + sample_array[mid]) / 2.0

pdf = bp.PdfPages(betapdffile)
results = {}
for obsid, odata in standata.iter_observations():
    sample_outfile = os.path.join(sampledir, obsid + "_samples.txt") if sampledir != "" else None
Example #44
0
import matplotlib.pyplot as pl

# Observed coin flips; the commented lines are alternative data sets.
# x = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
x = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
# x = [1, 1, 1]

# Data dictionary handed to the Stan model.
cf_dat = {
    'N': len(x),
    'x': x,
    'prior_width': 0.2
}

# Set to True to rebuild the model instead of loading the cached pickle.
recompile = False

if recompile:
    sm = StanModel(file='coinflip.stan')
    with open('coinflip.pkl', 'wb') as f:
        pickle.dump(sm, f)
else:
    # Context manager so the cache file handle is closed deterministically.
    with open('coinflip.pkl', 'rb') as f:
        sm = pickle.load(f)

# fit = sm.sampling(data=cf_dat, iter=2000, chains=2)
# Short single-chain run (20 iterations, 10 of them warmup) started at
# beta = 0.5.
fit = sm.sampling(data=cf_dat, iter=20, chains=1, seed='random', init=[{'beta': 0.5}], warmup=10)
estimation = fit.extract(permuted=True)

print(estimation['beta'])

# Histogram of the beta draws, with the x axis limited to [0, 1].
pl.hist(estimation['beta'], bins=40)
pl.xlim([0, 1])
pl.show()
Example #45
0
# Generate the synthetic observations row by row: row i of x_synth is drawn
# by multivariate_normal with mean z_synth[i] * A.dot(u_synth[i, :].T) and
# sigma_x as its second argument (presumably the covariance, as in
# numpy.random.multivariate_normal -- TODO confirm the import).
for i in range(N):
    act_u = u_synth[i, :].T
    act_mean = z_synth[i] * A.dot(act_u)
    x_synth[i, :] = multivariate_normal(act_mean, sigma_x)

# Data dictionary handed to the Stan model's sampling call.
gsm_dat = {
    "N": N,
    "d_x": d_x,
    "d_u": d_u,
    "sigma_x": sigma_x,
    "x": x_synth,
    "A": A,
    "C": C,
    "z_shape": z_shape,
    "z_scale": z_scale,
}

# Compile the Stan model only on request; otherwise reuse the cached pickle.
if recompile:
    sm = StanModel(file="gsm_inference.stan")
    with open("gsm_inference.pkl", "wb") as f:
        pickle.dump(sm, f)
else:
    # Context manager so the cache file handle is closed deterministically.
    with open("gsm_inference.pkl", "rb") as f:
        sm = pickle.load(f)

fit = sm.sampling(data=gsm_dat, iter=100, chains=8)
estimation = fit.extract(permuted=True)
# Posterior mean of z (averaged over axis 0 of the draws), printed next to
# the synthetic values it should recover.
z_est_mean = mean(estimation["z"], 0).T
print(z_est_mean)

print(z_synth)