Ejemplo n.º 1
0
 def test_variables(self):
     # construct fit using existing sampler output
     exe = os.path.join(DATAFILES_PATH, 'lotka-volterra' + EXTENSION)
     jdata = os.path.join(DATAFILES_PATH, 'lotka-volterra.data.json')
     sampler_args = SamplerArgs(iter_sampling=20)
     cmdstan_args = CmdStanArgs(
         model_name='lotka-volterra',
         model_exe=exe,
         chain_ids=[1],
         seed=12345,
         data=jdata,
         output_dir=DATAFILES_PATH,
         method_args=sampler_args,
     )
     runset = RunSet(args=cmdstan_args, chains=1)
     runset._csv_files = [
         os.path.join(DATAFILES_PATH, 'lotka-volterra.csv')
     ]
     runset._set_retcode(0, 0)
     fit = CmdStanMCMC(runset)
     self.assertEqual(20, fit.num_draws)
     self.assertEqual(8, len(fit._stan_variable_dims))
     self.assertTrue('z' in fit._stan_variable_dims)
     self.assertEqual(fit._stan_variable_dims['z'], (20, 2))
     vars = fit.stan_variables()
     self.assertEqual(len(vars), len(fit._stan_variable_dims))
     self.assertTrue('z' in vars)
     self.assertEqual(vars['z'].shape, (20, 20, 2))
     self.assertTrue('theta' in vars)
     self.assertEqual(vars['theta'].shape, (20, 4))
Ejemplo n.º 2
0
 def test_diagnose_divergences(self):
     exe = os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION)
     sampler_args = SamplerArgs()
     cmdstan_args = CmdStanArgs(
         model_name='bernoulli',
         model_exe=exe,
         chain_ids=[1],
         output_dir=DATAFILES_PATH,
         method_args=sampler_args,
     )
     runset = RunSet(args=cmdstan_args, chains=1)
     runset._csv_files = [
         os.path.join(DATAFILES_PATH, 'diagnose-good',
                      'corr_gauss_depth8-1.csv')
     ]
     fit = CmdStanMCMC(runset)
     # TODO - use cmdstan test files instead
     expected = '\n'.join([
         'Checking sampler transitions treedepth.',
         '424 of 1000 (42%) transitions hit the maximum '
         'treedepth limit of 8, or 2^8 leapfrog steps.',
         'Trajectories that are prematurely terminated '
         'due to this limit will result in slow exploration.',
         'For optimal performance, increase this limit.',
     ])
     self.assertIn(expected, fit.diagnose().replace('\r\n', '\n'))
Ejemplo n.º 3
0
 def test_instantiate(self):
     stan = os.path.join(DATAFILES_PATH, 'variational',
                         'eta_should_be_big.stan')
     model = CmdStanModel(stan_file=stan)
     no_data = {}
     args = VariationalArgs(algorithm='meanfield')
     cmdstan_args = CmdStanArgs(
         model_name=model.name,
         model_exe=model.exe_file,
         chain_ids=None,
         data=no_data,
         method_args=args,
     )
     runset = RunSet(args=cmdstan_args, chains=1)
     runset._csv_files = [
         os.path.join(DATAFILES_PATH, 'variational', 'eta_big_output.csv')
     ]
     variational = CmdStanVB(runset)
     self.assertIn('CmdStanVB: model=eta_should_be_big',
                   variational.__repr__())
     self.assertIn('method=variational', variational.__repr__())
     self.assertEqual(
         variational.column_names,
         ('lp__', 'log_p__', 'log_g__', 'mu[1]', 'mu[2]'),
     )
     self.assertAlmostEqual(variational.variational_params_dict['mu[1]'],
                            31.0299,
                            places=2)
     self.assertAlmostEqual(variational.variational_params_dict['mu[2]'],
                            28.8141,
                            places=2)
     self.assertEqual(variational.variational_sample.shape, (1000, 5))
Ejemplo n.º 4
0
 def test_validate_big_run(self):
     exe = os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION)
     sampler_args = SamplerArgs(iter_warmup=1500, iter_sampling=1000)
     cmdstan_args = CmdStanArgs(
         model_name='bernoulli',
         model_exe=exe,
         chain_ids=[1, 2],
         seed=12345,
         output_dir=DATAFILES_PATH,
         method_args=sampler_args,
     )
     runset = RunSet(args=cmdstan_args, chains=2)
     runset._csv_files = [
         os.path.join(DATAFILES_PATH, 'runset-big',
                      'output_icar_nyc-1.csv'),
         os.path.join(DATAFILES_PATH, 'runset-big',
                      'output_icar_nyc-1.csv'),
     ]
     fit = CmdStanMCMC(runset)
     phis = ['phi[{}]'.format(str(x + 1)) for x in range(2095)]
     column_names = SAMPLER_STATE + phis
     self.assertEqual(fit.num_draws_sampling, 1000)
     self.assertEqual(fit.column_names, tuple(column_names))
     self.assertEqual(fit.metric_type, 'diag_e')
     self.assertEqual(fit.step_size.shape, (2, ))
     self.assertEqual(fit.metric.shape, (2, 2095))
     self.assertEqual((1000, 2, 2102), fit.draws().shape)
     phis = fit.draws_pd(params=['phi'])
     self.assertEqual((2000, 2095), phis.shape)
     with self.assertRaisesRegex(ValueError, r'unknown parameter: gamma'):
         fit.draws_pd(params=['gamma'])
Ejemplo n.º 5
0
    def test_good(self):
        # construct fit using existing sampler output
        exe = os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION)
        jdata = os.path.join(DATAFILES_PATH, 'bernoulli.data.json')
        sampler_args = SamplerArgs(iter_sampling=100,
                                   max_treedepth=11,
                                   adapt_delta=0.95)
        cmdstan_args = CmdStanArgs(
            model_name='bernoulli',
            model_exe=exe,
            chain_ids=[1, 2, 3, 4],
            seed=12345,
            data=jdata,
            output_dir=DATAFILES_PATH,
            method_args=sampler_args,
        )
        runset = RunSet(args=cmdstan_args)
        runset._csv_files = [
            os.path.join(DATAFILES_PATH, 'runset-good', 'bern-1.csv'),
            os.path.join(DATAFILES_PATH, 'runset-good', 'bern-2.csv'),
            os.path.join(DATAFILES_PATH, 'runset-good', 'bern-3.csv'),
            os.path.join(DATAFILES_PATH, 'runset-good', 'bern-4.csv'),
        ]
        retcodes = runset._retcodes
        for i in range(len(retcodes)):
            runset._set_retcode(i, 0)
        config = check_sampler_csv(
            path=runset.csv_files[i],
            is_fixed_param=False,
            iter_sampling=100,
            iter_warmup=1000,
            save_warmup=False,
            thin=1,
        )
        expected = 'Metadata:\n{}\n'.format(config)
        metadata = InferenceMetadata(config)
        actual = '{}'.format(metadata)
        self.assertEqual(expected, actual)
        self.assertEqual(config, metadata.cmdstan_config)

        hmc_vars = {
            'lp__',
            'accept_stat__',
            'stepsize__',
            'treedepth__',
            'n_leapfrog__',
            'divergent__',
            'energy__',
        }

        sampler_vars_cols = metadata.sampler_vars_cols
        self.assertEqual(hmc_vars, sampler_vars_cols.keys())
        bern_model_vars = {'theta'}
        self.assertEqual(bern_model_vars, metadata.stan_vars_dims.keys())
        self.assertEqual((), metadata.stan_vars_dims['theta'])
        self.assertEqual(bern_model_vars, metadata.stan_vars_cols.keys())
        self.assertEqual((7, ), metadata.stan_vars_cols['theta'])
Ejemplo n.º 6
0
    def test_validate_good_run(self):
        # construct fit using existing sampler output
        exe = os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION)
        jdata = os.path.join(DATAFILES_PATH, 'bernoulli.data.json')
        sampler_args = SamplerArgs(iter_sampling=100,
                                   max_treedepth=11,
                                   adapt_delta=0.95)
        cmdstan_args = CmdStanArgs(
            model_name='bernoulli',
            model_exe=exe,
            chain_ids=[1, 2, 3, 4],
            seed=12345,
            data=jdata,
            output_dir=DATAFILES_PATH,
            method_args=sampler_args,
        )
        runset = RunSet(args=cmdstan_args, chains=4)
        runset._csv_files = [
            os.path.join(DATAFILES_PATH, 'runset-good', 'bern-1.csv'),
            os.path.join(DATAFILES_PATH, 'runset-good', 'bern-2.csv'),
            os.path.join(DATAFILES_PATH, 'runset-good', 'bern-3.csv'),
            os.path.join(DATAFILES_PATH, 'runset-good', 'bern-4.csv'),
        ]
        self.assertEqual(4, runset.chains)
        retcodes = runset._retcodes
        for i in range(len(retcodes)):
            runset._set_retcode(i, 0)
        self.assertTrue(runset._check_retcodes())

        fit = CmdStanMCMC(runset)
        self.assertEqual(100, fit.num_draws)
        self.assertEqual(len(BERNOULLI_COLS), len(fit.column_names))
        self.assertEqual('lp__', fit.column_names[0])

        drawset = fit.get_drawset()
        self.assertEqual(
            drawset.shape,
            (fit.runset.chains * fit.num_draws, len(fit.column_names)),
        )
        _ = fit.summary()
        self.assertTrue(True)

        # TODO - use cmdstan test files instead
        expected = '\n'.join([
            'Checking sampler transitions treedepth.',
            'Treedepth satisfactory for all transitions.',
            '\nChecking sampler transitions for divergences.',
            'No divergent transitions found.',
            '\nChecking E-BFMI - sampler transitions HMC potential energy.',
            'E-BFMI satisfactory for all transitions.',
            '\nEffective sample size satisfactory.',
        ])
        self.assertIn(expected, fit.diagnose().replace('\r\n', '\n'))
Ejemplo n.º 7
0
 def test_validate_big_run(self):
     exe = os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION)
     sampler_args = SamplerArgs()
     cmdstan_args = CmdStanArgs(
         model_name='bernoulli',
         model_exe=exe,
         chain_ids=[1, 2],
         seed=12345,
         output_dir=DATAFILES_PATH,
         method_args=sampler_args,
     )
     runset = RunSet(args=cmdstan_args, chains=2)
     runset._csv_files = [
         os.path.join(DATAFILES_PATH, 'runset-big',
                      'output_icar_nyc-1.csv'),
         os.path.join(DATAFILES_PATH, 'runset-big',
                      'output_icar_nyc-1.csv'),
     ]
     fit = CmdStanMCMC(runset)
     fit._validate_csv_files()
     sampler_state = [
         'lp__',
         'accept_stat__',
         'stepsize__',
         'treedepth__',
         'n_leapfrog__',
         'divergent__',
         'energy__',
     ]
     phis = ['phi.{}'.format(str(x + 1)) for x in range(2095)]
     column_names = sampler_state + phis
     self.assertEqual(fit.columns, len(column_names))
     self.assertEqual(fit.column_names, tuple(column_names))
     self.assertEqual(fit.metric_type, 'diag_e')
     self.assertEqual(fit.stepsize.shape, (2, ))
     self.assertEqual(fit.metric.shape, (2, 2095))
     self.assertEqual((1000, 2, 2102), fit.sample.shape)
     phis = fit.get_drawset(params=['phi'])
     self.assertEqual((2000, 2095), phis.shape)
     phi1 = fit.get_drawset(params=['phi.1'])
     self.assertEqual((2000, 1), phi1.shape)
     mo_phis = fit.get_drawset(params=['phi.1', 'phi.10', 'phi.100'])
     self.assertEqual((2000, 3), mo_phis.shape)
     phi2095 = fit.get_drawset(params=['phi.2095'])
     self.assertEqual((2000, 1), phi2095.shape)
     with self.assertRaises(Exception):
         fit.get_drawset(params=['phi.2096'])
     with self.assertRaises(Exception):
         fit.get_drawset(params=['ph'])
Ejemplo n.º 8
0
    def test_validate_summary_sig_figs(self):
        # construct CmdStanMCMC from logistic model output, config
        exe = os.path.join(DATAFILES_PATH, 'logistic' + EXTENSION)
        rdata = os.path.join(DATAFILES_PATH, 'logistic.data.R')
        sampler_args = SamplerArgs(iter_sampling=100)
        cmdstan_args = CmdStanArgs(
            model_name='logistic',
            model_exe=exe,
            chain_ids=[1, 2, 3, 4],
            seed=12345,
            data=rdata,
            output_dir=DATAFILES_PATH,
            sig_figs=17,
            method_args=sampler_args,
        )
        runset = RunSet(args=cmdstan_args)
        runset._csv_files = [
            os.path.join(DATAFILES_PATH, 'logistic_output_1.csv'),
            os.path.join(DATAFILES_PATH, 'logistic_output_2.csv'),
            os.path.join(DATAFILES_PATH, 'logistic_output_3.csv'),
            os.path.join(DATAFILES_PATH, 'logistic_output_4.csv'),
        ]
        retcodes = runset._retcodes
        for i in range(len(retcodes)):
            runset._set_retcode(i, 0)
        fit = CmdStanMCMC(runset)

        sum_default = fit.summary()
        beta1_default = format(sum_default.iloc[1, 0], '.18g')
        self.assertTrue(beta1_default.startswith('1.3'))

        if cmdstan_version_at(2, 25):
            sum_17 = fit.summary(sig_figs=17)
            beta1_17 = format(sum_17.iloc[1, 0], '.18g')
            self.assertTrue(beta1_17.startswith('1.345767078273'))

            sum_10 = fit.summary(sig_figs=10)
            beta1_10 = format(sum_10.iloc[1, 0], '.18g')
            self.assertTrue(beta1_10.startswith('1.34576707'))

        with self.assertRaises(ValueError):
            fit.summary(sig_figs=20)
        with self.assertRaises(ValueError):
            fit.summary(sig_figs=-1)
Ejemplo n.º 9
0
 def test_instantiate(self):
     stan = os.path.join(DATAFILES_PATH, 'optimize', 'rosenbrock.stan')
     model = CmdStanModel(stan_file=stan)
     no_data = {}
     args = OptimizeArgs(algorithm='Newton')
     cmdstan_args = CmdStanArgs(
         model_name=model.name,
         model_exe=model.exe_file,
         chain_ids=None,
         data=no_data,
         method_args=args,
     )
     runset = RunSet(args=cmdstan_args, chains=1)
     runset._csv_files = [
         os.path.join(DATAFILES_PATH, 'optimize', 'rosenbrock_mle.csv')
     ]
     mle = CmdStanMLE(runset)
     self.assertIn('CmdStanMLE: model=rosenbrock', mle.__repr__())
     self.assertIn('method=optimize', mle.__repr__())
     self.assertEqual(mle.column_names, ('lp__', 'x', 'y'))
     self.assertAlmostEqual(mle.optimized_params_dict['x'], 1, places=3)
     self.assertAlmostEqual(mle.optimized_params_dict['y'], 1, places=3)
Ejemplo n.º 10
0
 def test_variables_3d(self):
     # construct fit using existing sampler output
     exe = os.path.join(DATAFILES_PATH, 'multidim_vars' + EXTENSION)
     jdata = os.path.join(DATAFILES_PATH, 'logistic.data.R')
     sampler_args = SamplerArgs(iter_sampling=20)
     cmdstan_args = CmdStanArgs(
         model_name='multidim_vars',
         model_exe=exe,
         chain_ids=[1],
         seed=12345,
         data=jdata,
         output_dir=DATAFILES_PATH,
         method_args=sampler_args,
     )
     runset = RunSet(args=cmdstan_args, chains=1)
     runset._csv_files = [os.path.join(DATAFILES_PATH, 'multidim_vars.csv')]
     runset._set_retcode(0, 0)
     fit = CmdStanMCMC(runset)
     self.assertEqual(20, fit.num_draws_sampling)
     self.assertEqual(3, len(fit.stan_vars_dims))
     self.assertTrue('y_rep' in fit.stan_vars_dims)
     self.assertEqual(fit.stan_vars_dims['y_rep'], (5, 4, 3))
     var_y_rep = fit.stan_variable(name='y_rep')
     self.assertEqual(var_y_rep.shape, (20, 5, 4, 3))
     var_beta = fit.stan_variable(name='beta')
     self.assertEqual(var_beta.shape, (20, 2))
     var_frac_60 = fit.stan_variable(name='frac_60')
     self.assertEqual(var_frac_60.shape, (20, ))
     vars = fit.stan_variables()
     self.assertEqual(len(vars), len(fit.stan_vars_dims))
     self.assertTrue('y_rep' in vars)
     self.assertEqual(vars['y_rep'].shape, (20, 5, 4, 3))
     self.assertTrue('beta' in vars)
     self.assertEqual(vars['beta'].shape, (20, 2))
     self.assertTrue('frac_60' in vars)
     self.assertEqual(vars['frac_60'].shape, (20, ))
Ejemplo n.º 11
0
    def test_validate_bad_run(self):
        exe = os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION)
        jdata = os.path.join(DATAFILES_PATH, 'bernoulli.data.json')
        sampler_args = SamplerArgs(max_treedepth=11, adapt_delta=0.95)

        # some chains had errors
        cmdstan_args = CmdStanArgs(
            model_name='bernoulli',
            model_exe=exe,
            chain_ids=[1, 2, 3, 4],
            seed=12345,
            data=jdata,
            output_dir=DATAFILES_PATH,
            method_args=sampler_args,
        )
        runset = RunSet(args=cmdstan_args, chains=4)
        for i in range(4):
            runset._set_retcode(i, 0)
        self.assertTrue(runset._check_retcodes())

        # errors reported
        runset._stderr_files = [
            os.path.join(DATAFILES_PATH, 'runset-bad',
                         'bad-transcript-bern-1.txt'),
            os.path.join(DATAFILES_PATH, 'runset-bad',
                         'bad-transcript-bern-2.txt'),
            os.path.join(DATAFILES_PATH, 'runset-bad',
                         'bad-transcript-bern-3.txt'),
            os.path.join(DATAFILES_PATH, 'runset-bad',
                         'bad-transcript-bern-4.txt'),
        ]
        self.assertEqual(len(runset._get_err_msgs()), 4)

        # csv file headers inconsistent
        runset._csv_files = [
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-hdr-bern-1.csv'),
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-hdr-bern-2.csv'),
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-hdr-bern-3.csv'),
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-hdr-bern-4.csv'),
        ]
        with self.assertRaisesRegex(ValueError, 'header mismatch'):
            CmdStanMCMC(runset)

        # bad draws
        runset._csv_files = [
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-draws-bern-1.csv'),
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-draws-bern-2.csv'),
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-draws-bern-3.csv'),
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-draws-bern-4.csv'),
        ]
        with self.assertRaisesRegex(ValueError, 'draws'):
            CmdStanMCMC(runset)

        # mismatch - column headers, draws
        runset._csv_files = [
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-cols-bern-1.csv'),
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-cols-bern-2.csv'),
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-cols-bern-3.csv'),
            os.path.join(DATAFILES_PATH, 'runset-bad', 'bad-cols-bern-4.csv'),
        ]
        with self.assertRaisesRegex(ValueError,
                                    'bad draw, expecting 9 items, found 8'):
            CmdStanMCMC(runset)
Ejemplo n.º 12
0
    def test_validate_good_run(self):
        # construct fit using existing sampler output
        exe = os.path.join(DATAFILES_PATH, 'bernoulli' + EXTENSION)
        jdata = os.path.join(DATAFILES_PATH, 'bernoulli.data.json')
        sampler_args = SamplerArgs(iter_sampling=100,
                                   max_treedepth=11,
                                   adapt_delta=0.95)
        cmdstan_args = CmdStanArgs(
            model_name='bernoulli',
            model_exe=exe,
            chain_ids=[1, 2, 3, 4],
            seed=12345,
            data=jdata,
            output_dir=DATAFILES_PATH,
            method_args=sampler_args,
        )
        runset = RunSet(args=cmdstan_args)
        runset._csv_files = [
            os.path.join(DATAFILES_PATH, 'runset-good', 'bern-1.csv'),
            os.path.join(DATAFILES_PATH, 'runset-good', 'bern-2.csv'),
            os.path.join(DATAFILES_PATH, 'runset-good', 'bern-3.csv'),
            os.path.join(DATAFILES_PATH, 'runset-good', 'bern-4.csv'),
        ]
        self.assertEqual(4, runset.chains)
        retcodes = runset._retcodes
        for i in range(len(retcodes)):
            runset._set_retcode(i, 0)
        self.assertTrue(runset._check_retcodes())

        fit = CmdStanMCMC(runset)
        self.assertEqual(100, fit.num_draws)
        self.assertEqual(len(BERNOULLI_COLS), len(fit.column_names))
        self.assertEqual('lp__', fit.column_names[0])

        drawset = fit.get_drawset()
        self.assertEqual(
            drawset.shape,
            (fit.runset.chains * fit.num_draws, len(fit.column_names)),
        )

        summary = fit.summary()
        self.assertIn('5%', list(summary.columns))
        self.assertIn('50%', list(summary.columns))
        self.assertIn('95%', list(summary.columns))
        self.assertNotIn('1%', list(summary.columns))
        self.assertNotIn('99%', list(summary.columns))

        summary = fit.summary(percentiles=[1, 45, 99])
        self.assertIn('1%', list(summary.columns))
        self.assertIn('45%', list(summary.columns))
        self.assertIn('99%', list(summary.columns))
        self.assertNotIn('5%', list(summary.columns))
        self.assertNotIn('50%', list(summary.columns))
        self.assertNotIn('95%', list(summary.columns))

        with self.assertRaises(ValueError):
            fit.summary(percentiles=[])

        with self.assertRaises(ValueError):
            fit.summary(percentiles=[-1])

        diagnostics = fit.diagnose()
        self.assertIn('Treedepth satisfactory for all transitions.',
                      diagnostics)
        self.assertIn('No divergent transitions found.', diagnostics)
        self.assertIn('E-BFMI satisfactory for all transitions.', diagnostics)
        self.assertIn('Effective sample size satisfactory.', diagnostics)
Ejemplo n.º 13
0
    def generate_quantities(
        self,
        data: Union[Dict, str] = None,
        mcmc_sample: Union[CmdStanMCMC, List[str]] = None,
        seed: int = None,
        gq_output_dir: str = None,
    ) -> CmdStanGQ:
        """
        Run CmdStan's generate_quantities method which runs the generated
        quantities block of a model given an existing sample.

        This function takes a CmdStanMCMC object and the dataset used to
        generate that sample and calls to the CmdStan ``generate_quantities``
        method to generate additional quantities of interest.

        The ``CmdStanGQ`` object records the command, the return code,
        and the paths to the generate method output csv and console files.
        The output files are written either to a specified output directory
        or to a temporary directory which is deleted upon session exit.

        Output files are either written to a temporary directory or to the
        specified output directory.  Output filenames correspond to the template
        '<model_name>-<YYYYMMDDHHMM>-<chain_id>' plus the file suffix which is
        either '.csv' for the CmdStan output or '.txt' for
        the console messages, e.g. 'bernoulli-201912081451-1.csv'.
        Output files written to the temporary directory contain an additional
        8-character random string, e.g. 'bernoulli-201912081451-1-5nm6as7u.csv'.

        :param data: Values for all data variables in the model, specified
            either as a dictionary with entries matching the data variables,
            or as the path of a data file in JSON or Rdump format.

        :param mcmc_sample: Can be either a ``CmdStanMCMC`` object returned by
            the ``sample`` method or a list of stan-csv files generated
            by fitting the model to the data using any Stan interface.

        :param seed: The seed for random number generator. Must be an integer
            between 0 and 2^32 - 1. If unspecified,
            ``numpy.random.RandomState()``
            is used to generate a seed which will be used for all chains.
            *NOTE: Specifying the seed will guarantee the same result for
            multiple invocations of this method with the same inputs.  However
            this will not reproduce results from the sample method given
            the same inputs because the RNG will be in a different state.*

        :param gq_output_dir:  Name of the directory in which the CmdStan output
            files are saved.  If unspecified, files will be written to a
            temporary directory which is deleted upon session exit.

        :return: CmdStanGQ object
        """
        sample_csv_files = []
        sample_drawset = None
        chains = 0

        if isinstance(mcmc_sample, CmdStanMCMC):
            sample_csv_files = mcmc_sample.runset.csv_files
            sample_drawset = mcmc_sample.draws_pd()
            chains = mcmc_sample.chains
            chain_ids = mcmc_sample.chain_ids
        elif isinstance(mcmc_sample, list):
            if len(mcmc_sample) < 1:
                raise ValueError('MCMC sample cannot be empty list')
            sample_csv_files = mcmc_sample
            chains = len(sample_csv_files)
            chain_ids = [x + 1 for x in range(chains)]
        else:
            raise ValueError('MCMC sample must be either CmdStanMCMC object'
                             ' or list of paths to sample csv_files.')
        try:
            if sample_drawset is None:  # assemble sample from csv files
                config = {}
                # scan 1st csv file to get config
                try:
                    config = scan_sampler_csv(sample_csv_files[0])
                except ValueError:
                    config = scan_sampler_csv(sample_csv_files[0], True)
                conf_iter_sampling = None
                if 'num_samples' in config:
                    conf_iter_sampling = int(config['num_samples'])
                conf_iter_warmup = None
                if 'num_warmup' in config:
                    conf_iter_warmup = int(config['num_warmup'])
                conf_thin = None
                if 'thin' in config:
                    conf_thin = int(config['thin'])
                sampler_args = SamplerArgs(
                    iter_sampling=conf_iter_sampling,
                    iter_warmup=conf_iter_warmup,
                    thin=conf_thin,
                )
                args = CmdStanArgs(
                    self._name,
                    self._exe_file,
                    chain_ids=chain_ids,
                    method_args=sampler_args,
                )
                runset = RunSet(args=args, chains=chains, chain_ids=chain_ids)
                runset._csv_files = sample_csv_files
                sample_fit = CmdStanMCMC(runset)
                sample_drawset = sample_fit.draws_pd()
        except ValueError as exc:
            raise ValueError('Invalid mcmc_sample, error:\n\t{}\n\t'
                             ' while processing files\n\t{}'.format(
                                 repr(exc),
                                 '\n\t'.join(sample_csv_files))) from exc

        generate_quantities_args = GenerateQuantitiesArgs(
            csv_files=sample_csv_files)
        generate_quantities_args.validate(chains)
        with MaybeDictToFilePath(data, None) as (_data, _inits):
            args = CmdStanArgs(
                self._name,
                self._exe_file,
                chain_ids=chain_ids,
                data=_data,
                seed=seed,
                output_dir=gq_output_dir,
                method_args=generate_quantities_args,
            )
            runset = RunSet(args=args, chains=chains, chain_ids=chain_ids)

            parallel_chains_avail = cpu_count()
            parallel_chains = max(min(parallel_chains_avail - 2, chains), 1)
            with ThreadPoolExecutor(max_workers=parallel_chains) as executor:
                for i in range(chains):
                    executor.submit(self._run_cmdstan, runset, i)

            if not runset._check_retcodes():
                msg = 'Error during generate_quantities.\n{}'.format(
                    runset.get_err_msgs())
                raise RuntimeError(msg)
            quantities = CmdStanGQ(runset=runset, mcmc_sample=sample_drawset)
        return quantities
Ejemplo n.º 14
0
    def test_metadata(self):
        # construct CmdStanMCMC from logistic model output, config
        exe = os.path.join(DATAFILES_PATH, 'logistic' + EXTENSION)
        rdata = os.path.join(DATAFILES_PATH, 'logistic.data.R')
        sampler_args = SamplerArgs(iter_sampling=100)
        cmdstan_args = CmdStanArgs(
            model_name='logistic',
            model_exe=exe,
            chain_ids=[1, 2, 3, 4],
            seed=12345,
            data=rdata,
            output_dir=DATAFILES_PATH,
            sig_figs=17,
            method_args=sampler_args,
        )
        runset = RunSet(args=cmdstan_args)
        runset._csv_files = [
            os.path.join(DATAFILES_PATH, 'logistic_output_1.csv'),
            os.path.join(DATAFILES_PATH, 'logistic_output_2.csv'),
            os.path.join(DATAFILES_PATH, 'logistic_output_3.csv'),
            os.path.join(DATAFILES_PATH, 'logistic_output_4.csv'),
        ]
        retcodes = runset._retcodes
        for i in range(len(retcodes)):
            runset._set_retcode(i, 0)
        fit = CmdStanMCMC(runset)

        col_names = tuple([
            'lp__',
            'accept_stat__',
            'stepsize__',
            'treedepth__',
            'n_leapfrog__',
            'divergent__',
            'energy__',
            'beta[1]',
            'beta[2]',
        ])

        self.assertEqual(fit.chains, 4)
        self.assertEqual(fit.chain_ids, [1, 2, 3, 4])
        self.assertEqual(fit.num_draws_warmup, 1000)
        self.assertEqual(fit.num_draws_sampling, 100)
        self.assertEqual(fit.column_names, col_names)
        self.assertEqual(fit.num_unconstrained_params, 2)
        self.assertEqual(fit.metric_type, 'diag_e')

        self.assertEqual(fit.sampler_config['num_samples'], 100)
        self.assertEqual(fit.sampler_config['thin'], 1)
        self.assertEqual(fit.sampler_config['algorithm'], 'hmc')
        self.assertEqual(fit.sampler_config['metric'], 'diag_e')
        self.assertAlmostEqual(fit.sampler_config['delta'], 0.80)

        self.assertTrue('n_leapfrog__' in fit.sampler_vars_cols)
        self.assertTrue('energy__' in fit.sampler_vars_cols)
        self.assertTrue('beta' not in fit.sampler_vars_cols)
        self.assertTrue('energy__' not in fit.stan_vars_dims)
        self.assertTrue('beta' in fit.stan_vars_dims)
        self.assertTrue('beta' in fit.stan_vars_cols)
        self.assertEqual(fit.stan_vars_dims['beta'], tuple([2]))
        self.assertEqual(fit.stan_vars_cols['beta'], tuple([7, 8]))
Ejemplo n.º 15
0
    def generate_quantities(
        self,
        data: Union[Dict, str] = None,
        mcmc_sample: Union[CmdStanMCMC, List[str]] = None,
        seed: int = None,
        gq_output_dir: str = None,
    ) -> CmdStanGQ:
        """
        Run CmdStan's generate_quantities method which runs the generated
        quantities block of a model given an existing sample.

        This function takes a CmdStanMCMC object and the dataset used to
        generate that sample and calls to the CmdStan ``generate_quantities``
        method to generate additional quantities of interest.

        The ``CmdStanGQ`` object records the command, the return code,
        and the paths to the generate method output csv and console files.
        The output files are written either to a specified output directory
        or to a temporary directory which is deleted upon session exit.

        Output filenames are composed of the model name, a timestamp
        in the form YYYYMMDDhhmm and the chain id, plus the corresponding
        filetype suffix, either '.csv' for the CmdStan output or '.txt' for
        the console messages, e.g. `bernoulli_ppc-201912081451-1.csv`. Output
        files  written to the temporary directory contain an additional
        8-character random string, e.g.
        `bernoulli_ppc-201912081451-1-5nm6as7u.csv`.

        :param data: Values for all data variables in the model, specified
            either as a dictionary with entries matching the data variables,
            or as the path of a data file in JSON or Rdump format.

        :param mcmc_sample: Can be either a CmdStanMCMC object returned by
            CmdStanPy's `sample` method or a list of stan-csv files generated
            by fitting the model to the data using any Stan interface.

        :param seed: The seed for random number generator. Must be an integer
            between ``0`` and ``2^32 - 1``. If unspecified,
            ``numpy.random.RandomState()``
            is used to generate a seed which will be used for all chains.
            *NOTE: Specifying the seed will guarantee the same result for
            multiple invocations of this method with the same inputs.  However
            this will not reproduce results from the sample method given
            the same inputs because the RNG will be in a different state.*

        :param gq_output_dir:  Name of the directory in which the CmdStan output
            files are saved.  If unspecified, files will be written to a
            temporary directory which is deleted upon session exit.

        :return: CmdStanGQ object
        """
        sample_csv_files = []
        sample_drawset = None
        chains = 0

        if isinstance(mcmc_sample, CmdStanMCMC):
            sample_csv_files = mcmc_sample.runset.csv_files
            sample_drawset = mcmc_sample.get_drawset()
            chains = mcmc_sample.chains
        elif isinstance(mcmc_sample, list):
            sample_csv_files = mcmc_sample
        else:
            raise ValueError(
                'mcmc_sample must be either CmdStanMCMC object'
                ' or list of paths to sample csv_files'
            )

        try:
            chains = len(sample_csv_files)
            if sample_drawset is None:  # assemble sample from csv files
                sampler_args = SamplerArgs()
                args = CmdStanArgs(
                    self._name,
                    self._exe_file,
                    chain_ids=[x + 1 for x in range(chains)],
                    method_args=sampler_args,
                )
                runset = RunSet(args=args, chains=chains)
                runset._csv_files = sample_csv_files
                sample_fit = CmdStanMCMC(runset)
                sample_fit._validate_csv_files()
                sample_drawset = sample_fit.get_drawset()
        except ValueError as e:
            raise ValueError(
                'Invalid mcmc_sample, error:\n\t{}\n\t'
                ' while processing files\n\t{}'.format(
                    repr(e), '\n\t'.join(sample_csv_files)
                )
            )

        generate_quantities_args = GenerateQuantitiesArgs(
            csv_files=sample_csv_files
        )
        generate_quantities_args.validate(chains)
        with MaybeDictToFilePath(data, None) as (_data, _inits):
            args = CmdStanArgs(
                self._name,
                self._exe_file,
                chain_ids=[x + 1 for x in range(chains)],
                data=_data,
                seed=seed,
                output_dir=gq_output_dir,
                method_args=generate_quantities_args,
            )
            runset = RunSet(args=args, chains=chains)

            cores_avail = cpu_count()
            cores = max(min(cores_avail - 2, chains), 1)
            with ThreadPoolExecutor(max_workers=cores) as executor:
                for i in range(chains):
                    executor.submit(self._run_cmdstan, runset, i)

            if not runset._check_retcodes():
                msg = 'Error during generate_quantities'
                for i in range(chains):
                    if runset._retcode(i) != 0:
                        msg = '{}, chain {} returned error code {}'.format(
                            msg, i, runset._retcode(i)
                        )
                raise RuntimeError(msg)
            quantities = CmdStanGQ(runset=runset, mcmc_sample=sample_drawset)
            quantities._set_attrs_gq_csv_files(sample_csv_files[0])
        return quantities
Ejemplo n.º 16
0
    def generate_quantities(
        self,
        data: Union[Dict, str] = None,
        mcmc_sample: Union[CmdStanMCMC, List[str]] = None,
        seed: int = None,
        gq_csv_basename: str = None,
    ) -> CmdStanGQ:
        """
        Wrapper for generated quantities call.  Given a CmdStanMCMC object
        containing a sample from the fitted model, along with the
        corresponding dataset for that fit, run just the generated quantities
        block of the model in order to get additional quantities of interest.

        :param data: Values for all data variables in the model, specified
            either as a dictionary with entries matching the data variables,
            or as the path of a data file in JSON or Rdump format.

        :param mcmc_sample: Can be either a CmdStanMCMC object returned by
            CmdStanPy's `sample` method or a list of stan-csv files generated
            by fitting the model to the data using any Stan interface.

        :param seed: The seed for random number generator. Must be an integer
            between ``0`` and ``2^32 - 1``. If unspecified,
            ``numpy.random.RandomState()``
            is used to generate a seed which will be used for all chains.
            *NOTE: Specifying the seed will guarantee the same result for
            multiple invocations of this method with the same inputs.  However
            this will not reproduce results from the sample method given
            the same inputs because the RNG will be in a different state.*

        :param gq_csv_basename: A path or file name which will be used as the
            basename for the sampler output files.  The csv output files
            for each chain are written to file ``<basename>-<chain_id>.csv``
            and the console output and error messages are written to file
            ``<basename>-<chain_id>.txt``.

        :return: CmdStanGQ object
        """
        sample_csv_files = []
        sample_drawset = None
        chains = 0

        if isinstance(mcmc_sample, CmdStanMCMC):
            sample_csv_files = mcmc_sample.runset.csv_files
            sample_drawset = mcmc_sample.get_drawset()
            chains = mcmc_sample.chains
        elif isinstance(mcmc_sample, list):
            sample_csv_files = mcmc_sample
        else:
            raise ValueError(
                'mcmc_sample must be either CmdStanMCMC object'
                ' or list of paths to sample csv_files'
            )

        try:
            chains = len(sample_csv_files)
            if sample_drawset is None:  # assemble sample from csv files
                sampler_args = SamplerArgs()
                args = CmdStanArgs(
                    self._name,
                    self._exe_file,
                    chain_ids=[x + 1 for x in range(chains)],
                    method_args=sampler_args,
                )
                runset = RunSet(args=args, chains=chains)
                runset._csv_files = sample_csv_files
                sample_fit = CmdStanMCMC(runset)
                sample_fit._validate_csv_files()
                sample_drawset = sample_fit.get_drawset()
        except ValueError as e:
            raise ValueError(
                'Invalid mcmc_sample, error:\n\t{}\n\t'
                ' while processing files\n\t{}'.format(
                    repr(e), '\n\t'.join(sample_csv_files))
            )

        generate_quantities_args = GenerateQuantitiesArgs(
            csv_files=sample_csv_files
        )
        generate_quantities_args.validate(chains)
        with MaybeDictToFilePath(data, None) as (_data, _inits):
            args = CmdStanArgs(
                self._name,
                self._exe_file,
                chain_ids=[x + 1 for x in range(chains)],
                data=_data,
                seed=seed,
                output_basename=gq_csv_basename,
                method_args=generate_quantities_args,
            )
            runset = RunSet(args=args, chains=chains)

            cores_avail = cpu_count()
            cores = max(min(cores_avail - 2, chains), 1)
            with ThreadPoolExecutor(max_workers=cores) as executor:
                for i in range(chains):
                    executor.submit(self._run_cmdstan, runset, i)

            if not runset._check_retcodes():
                msg = 'Error during generate_quantities'
                for i in range(chains):
                    if runset._retcode(i) != 0:
                        msg = '{}, chain {} returned error code {}'.format(
                            msg, i, runset._retcode(i)
                        )
                raise RuntimeError(msg)
            quantities = CmdStanGQ(runset=runset, mcmc_sample=sample_drawset)
            quantities._set_attrs_gq_csv_files(sample_csv_files[0])
        return quantities