Example #1
 def summary(self) -> pd.DataFrame:
     """
     Run cmdstan/bin/stansummary over all output csv files.
     Echo stansummary stdout/stderr to console.
     Assemble csv tempfile contents into a pandas DataFrame.
     """
     names = self.column_names
     cmd_path = os.path.join(cmdstan_path(), 'bin',
                             'stansummary' + EXTENSION)
     tmp_csv_file = 'stansummary-{}-{}-chain-'.format(
         self.runset._args.model_name, self.runset.chains)
     tmp_csv_path = create_named_text_file(dir=TMPDIR,
                                           prefix=tmp_csv_file,
                                           suffix='.csv')
     cmd = [
         cmd_path,
         '--csv_file={}'.format(tmp_csv_path),
     ] + self.runset.csv_files
     do_command(cmd, logger=self.runset._logger)
     with open(tmp_csv_path, 'rb') as fd:
         summary_data = pd.read_csv(fd,
                                    delimiter=',',
                                    header=0,
                                    index_col=0,
                                    comment='#')
     mask = [
         x == 'lp__' or not x.endswith('__') for x in summary_data.index
     ]
     return summary_data[mask]
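
For context, this is a minimal sketch of how the summary above is usually reached through CmdStanPy's public API; the model file name and data dict are illustrative placeholders, not files shipped with this example.

from cmdstanpy import CmdStanModel

# 'bernoulli.stan' and the data dict below are illustrative placeholders.
model = CmdStanModel(stan_file='bernoulli.stan')
fit = model.sample(
    data={'N': 10, 'y': [0, 1, 0, 0, 0, 0, 0, 0, 0, 1]}, chains=4
)

# summary() runs bin/stansummary over the per-chain CSV files and returns
# a DataFrame with one row for 'lp__' and one per model variable.
summary_df = fit.summary()
print(summary_df.loc['lp__', 'Mean'])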
Example #2
 def __init__(self,
              args: CmdStanArgs,
              chains: int = 4,
              logger: logging.Logger = None) -> None:
     """Initialize object."""
     self._args = args
     self._chains = chains
     self._logger = logger or get_logger()
     if chains < 1:
         raise ValueError('chains must be positive integer value, '
                          'found {}'.format(chains))
     self._csv_files = []
     if args.output_basename is None:
         csv_basename = 'stan-{}-{}'.format(args.model_name, args.method)
         for i in range(chains):
             fd_name = create_named_text_file(
                 dir=TMPDIR,
                 prefix='{}-{}-'.format(csv_basename, i + 1),
                 suffix='.csv',
             )
             self._csv_files.append(fd_name)
     else:
         for i in range(chains):
             self._csv_files.append('{}-{}.csv'.format(
                 args.output_basename, i + 1))
     self._console_files = []
     for i in range(chains):
         txt_file = ''.join(
             [os.path.splitext(self._csv_files[i])[0], '.txt'])
         self._console_files.append(txt_file)
     self._cmds = [
         args.compose_command(i, self._csv_files[i]) for i in range(chains)
     ]
     self._retcodes = [-1 for _ in range(chains)]
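
The per-chain CSV naming above relies on cmdstanpy's create_named_text_file helper; the following standalone sketch mimics that behavior with the standard library (the _named_text_file helper, model name, and method string are hypothetical stand-ins).

import os
import tempfile

def _named_text_file(dir: str, prefix: str, suffix: str) -> str:
    # Stand-in for cmdstanpy's create_named_text_file: create a uniquely
    # named file under `dir` and return its path.
    fd, path = tempfile.mkstemp(dir=dir, prefix=prefix, suffix=suffix, text=True)
    os.close(fd)
    return path

chains = 4
csv_basename = 'stan-{}-{}'.format('bernoulli', 'sample')   # model name, method
csv_files = [
    _named_text_file(
        dir=tempfile.gettempdir(),
        prefix='{}-{}-'.format(csv_basename, i + 1),
        suffix='.csv',
    )
    for i in range(chains)
]
# e.g. .../stan-bernoulli-sample-1-<random>.csv, one file per chain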
Example #3
    def summary(self, percentiles: List[int] = None) -> pd.DataFrame:
        """
        Run cmdstan/bin/stansummary over all output csv files.
        Echo stansummary stdout/stderr to console.
        Assemble csv tempfile contents into a pandas DataFrame.

        :param percentiles: Ordered non-empty list of percentiles to report.
            Must be integers between 1 and 99, inclusive.
        """
        percentiles_str = '--percentiles=5,50,95'
        if percentiles is not None:
            if len(percentiles) == 0:
                raise ValueError(
                    'invalid percentiles argument, must be ordered'
                    ' non-empty list from (1, 99), inclusive.'
                )

            cur_pct = 0
            for pct in percentiles:
                if pct > 99 or not pct > cur_pct:
                    raise ValueError(
                        'invalid percentiles spec, must be ordered'
                        ' non-empty list from (1, 99), inclusive.'
                    )
                cur_pct = pct
            percentiles_str = '='.join(
                ['--percentiles', ','.join([str(x) for x in percentiles])]
            )
        cmd_path = os.path.join(
            cmdstan_path(), 'bin', 'stansummary' + EXTENSION
        )
        tmp_csv_file = 'stansummary-{}-{}-chain-'.format(
            self.runset._args.model_name, self.runset.chains
        )
        tmp_csv_path = create_named_text_file(
            dir=_TMPDIR, prefix=tmp_csv_file, suffix='.csv'
        )
        cmd = [
            cmd_path,
            percentiles_str,
            '--csv_file={}'.format(tmp_csv_path),
        ] + self.runset.csv_files
        do_command(cmd, logger=self.runset._logger)
        with open(tmp_csv_path, 'rb') as fd:
            summary_data = pd.read_csv(
                fd,
                delimiter=',',
                header=0,
                index_col=0,
                comment='#',
                float_precision='high',
            )
        mask = [x == 'lp__' or not x.endswith('__') for x in summary_data.index]
        return summary_data[mask]
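
Isolated from the method above, the percentile handling reduces to a strictly increasing list of integers in [1, 99] joined into a single stansummary flag; a small self-contained sketch:

from typing import List

def percentiles_flag(percentiles: List[int]) -> str:
    # Mirrors the check above: non-empty, strictly increasing, each in 1..99.
    if not percentiles:
        raise ValueError('percentiles must be a non-empty list')
    cur_pct = 0
    for pct in percentiles:
        if pct > 99 or not pct > cur_pct:
            raise ValueError('percentiles must be strictly increasing ints in 1..99')
        cur_pct = pct
    return '--percentiles=' + ','.join(str(x) for x in percentiles)

print(percentiles_flag([5, 50, 95]))   # --percentiles=5,50,95
# percentiles_flag([50, 5])            # raises ValueError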
Example #4
 def __init__(self,
              args: CmdStanArgs,
              chains: int = 4,
              logger: logging.Logger = None) -> None:
     """Initialize object."""
     self._args = args
     self._is_optimizing = isinstance(self._args.method_args, OptimizeArgs)
     self._is_sampling = isinstance(self._args.method_args, SamplerArgs)
     self._chains = chains
     self._logger = logger or get_logger()
     if chains < 1:
         raise ValueError('chains must be positive integer value, '
                          'found {}'.format(chains))
     self._csv_files = []
     # per-chain sample csv files.
     if args.output_basename is None:
         csv_basename = 'stan-{}-draws'.format(args.model_name)
         for i in range(chains):
             fd_name = create_named_text_file(
                 dir=TMPDIR,
                 prefix='{}-{}-'.format(csv_basename, i + 1),
                 suffix='.csv',
             )
             self._csv_files.append(fd_name)
     else:
         for i in range(chains):
             self._csv_files.append('{}-{}.csv'.format(
                 args.output_basename, i + 1))
     self.console_files = []
     # per-chain sample console output files.
     for i in range(chains):
         txt_file = ''.join(
             [os.path.splitext(self._csv_files[i])[0], '.txt'])
         self.console_files.append(txt_file)
     self.cmds = [
         args.compose_command(i, self._csv_files[i]) for i in range(chains)
     ]
     # per-chain sampler command.
     self._retcodes = [-1 for _ in range(chains)]
     self._draws = None
     self._column_names = None
     self._num_params = None  # metric dim(s)
     self._metric_type = None
     self._metric = None
     self._stepsize = None
     self._sample = None
     self._first_draw = None
     self._generated_quantities = None
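
The private fields initialized at the end of this constructor (draws, column names, metric, step size) surface as read-only accessors on the fitted-model object in CmdStanPy's public API. A hedged sketch, continuing from the fit object in the snippet after Example #1; exact attribute names depend on the CmdStanPy release:

# Continuing from the `fit = model.sample(...)` sketch after Example #1.
draws = fit.draws()          # ndarray: (iter_sampling, chains, num columns)
print(draws.shape)
print(fit.column_names[:3])  # sampler columns first, e.g. 'lp__', ...
print(fit.metric_type)       # 'diag_e' or 'dense_e'
print(fit.step_size)         # per-chain step sizes after adaptation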
Example #5
    def __init__(self,
                 args: CmdStanArgs,
                 chains: int = 4,
                 logger: logging.Logger = None) -> None:
        """Initialize object."""
        self._args = args
        self._chains = chains
        self._logger = logger or get_logger()
        if chains < 1:
            raise ValueError('chains must be positive integer value, '
                             'found {}'.format(chains))

        self._retcodes = [-1 for _ in range(chains)]

        # output and console messages are written to a text file:
        # ``<model_name>-<YYYYMMDDHHMM>-<chain_id>.txt``
        now = datetime.now()
        now_str = now.strftime('%Y%m%d%H%M')
        file_basename = '-'.join([args.model_name, now_str])
        if args.output_dir is not None:
            output_dir = args.output_dir
        else:
            output_dir = TMPDIR

        self._csv_files = []
        self._diagnostic_files = [None for _ in range(chains)]
        self._console_files = []
        self._cmds = []
        for i in range(chains):
            if args.output_dir is None:
                csv_file = create_named_text_file(
                    dir=output_dir,
                    prefix='{}-{}-'.format(file_basename, i + 1),
                    suffix='.csv',
                )
            else:
                csv_file = os.path.join(
                    output_dir, '{}-{}.{}'.format(file_basename, i + 1, 'csv'))
            self._csv_files.append(csv_file)
            txt_file = ''.join([os.path.splitext(csv_file)[0], '.txt'])
            self._console_files.append(txt_file)
            if args.save_diagnostics:
                if args.output_dir is None:
                    diag_file = create_named_text_file(
                        dir=TMPDIR,
                        prefix='{}-diagnostic-{}-'.format(
                            file_basename, i + 1),
                        suffix='.csv',
                    )
                else:
                    diag_file = os.path.join(
                        output_dir,
                        '{}-diagnostic-{}.{}'.format(file_basename, i + 1,
                                                     'csv'),
                    )
                self._diagnostic_files[i] = diag_file
                self._cmds.append(
                    args.compose_command(i, self._csv_files[i],
                                         self._diagnostic_files[i]))
            else:
                self._cmds.append(args.compose_command(i, self._csv_files[i]))
Example #6
    def __init__(
        self,
        args: CmdStanArgs,
        chains: int = 4,
        chain_ids: List[int] = None,
        logger: logging.Logger = None,
    ) -> None:
        """Initialize object."""
        self._args = args
        self._chains = chains
        self._logger = logger or get_logger()
        if chains < 1:
            raise ValueError('chains must be positive integer value, '
                             'found {}'.format(chains))
        if chain_ids is None:
            chain_ids = [x + 1 for x in range(chains)]
        elif len(chain_ids) != chains:
            raise ValueError(
                'mismatch between number of chains and chain_ids, '
                'found {} chains, but {} chain_ids'.format(
                    chains, len(chain_ids)))
        self._chain_ids = chain_ids
        self._retcodes = [-1 for _ in range(chains)]

        # stdout, stderr are written to text files
        # prefix: ``<model_name>-<YYYYMMDDHHMM>-<chain_id>``
        # suffixes: ``-stdout.txt``, ``-stderr.txt``
        now = datetime.now()
        now_str = now.strftime('%Y%m%d%H%M')
        file_basename = '-'.join([args.model_name, now_str])
        if args.output_dir is not None:
            output_dir = args.output_dir
        else:
            output_dir = _TMPDIR
        self._csv_files = [None for _ in range(chains)]
        self._diagnostic_files = [None for _ in range(chains)]
        self._stdout_files = [None for _ in range(chains)]
        self._stderr_files = [None for _ in range(chains)]
        self._cmds = []
        for i in range(chains):
            if args.output_dir is None:
                csv_file = create_named_text_file(
                    dir=output_dir,
                    prefix='{}-{}-'.format(file_basename, str(chain_ids[i])),
                    suffix='.csv',
                )
            else:
                csv_file = os.path.join(
                    output_dir,
                    '{}-{}.{}'.format(file_basename, str(chain_ids[i]), 'csv'),
                )
            self._csv_files[i] = csv_file
            stdout_file = ''.join(
                [os.path.splitext(csv_file)[0], '-stdout.txt'])
            self._stdout_files[i] = stdout_file
            stderr_file = ''.join(
                [os.path.splitext(csv_file)[0], '-stderr.txt'])
            self._stderr_files[i] = stderr_file
            if args.save_diagnostics:
                if args.output_dir is None:
                    diag_file = create_named_text_file(
                        dir=_TMPDIR,
                        prefix='{}-diagnostic-{}-'.format(
                            file_basename, str(chain_ids[i])),
                        suffix='.csv',
                    )
                else:
                    diag_file = os.path.join(
                        output_dir,
                        '{}-diagnostic-{}.{}'.format(file_basename,
                                                     str(chain_ids[i]), 'csv'),
                    )
                self._diagnostic_files[i] = diag_file
                self._cmds.append(
                    args.compose_command(i, self._csv_files[i],
                                         self._diagnostic_files[i]))
            else:
                self._cmds.append(args.compose_command(i, self._csv_files[i]))
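
The comments in the last two constructors describe a <model_name>-<YYYYMMDDHHMM>-<chain_id> naming scheme for per-chain outputs; a standalone sketch of that convention (model name and chain ids are placeholders):

import os
from datetime import datetime

model_name = 'bernoulli'                 # placeholder model name
chain_ids = [1, 2, 3, 4]
now_str = datetime.now().strftime('%Y%m%d%H%M')
file_basename = '-'.join([model_name, now_str])

csv_files = ['{}-{}.csv'.format(file_basename, cid) for cid in chain_ids]
stdout_files = [
    ''.join([os.path.splitext(f)[0], '-stdout.txt']) for f in csv_files
]
# e.g. bernoulli-202401011230-1.csv and bernoulli-202401011230-1-stdout.txt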
Example #7
    def validate(self, chains: Optional[int]) -> None:
        """
        Check arguments correctness and consistency.

        * adaptation and warmup args are consistent
        * if file(s) for metric are supplied, check contents.
        * length of per-chain lists equals specified # of chains
        """
        if not isinstance(chains, int) or chains < 1:
            raise ValueError(
                'Sampler expects number of chains to be greater than 0.')
        if not (self.adapt_delta is None and self.adapt_init_phase is None
                and self.adapt_metric_window is None
                and self.adapt_step_size is None):
            if self.adapt_engaged is False:
                msg = 'Conflicting arguments: adapt_engaged: False'
                if self.adapt_delta is not None:
                    msg = '{}, adapt_delta: {}'.format(msg, self.adapt_delta)
                if self.adapt_init_phase is not None:
                    msg = '{}, adapt_init_phase: {}'.format(
                        msg, self.adapt_init_phase)
                if self.adapt_metric_window is not None:
                    msg = '{}, adapt_metric_window: {}'.format(
                        msg, self.adapt_metric_window)
                if self.adapt_step_size is not None:
                    msg = '{}, adapt_step_size: {}'.format(
                        msg, self.adapt_step_size)
                raise ValueError(msg)

        if self.iter_warmup is not None:
            if self.iter_warmup < 0 or not isinstance(self.iter_warmup, int):
                raise ValueError(
                    'Value for iter_warmup must be a non-negative integer,'
                    ' found {}.'.format(self.iter_warmup))
            if self.iter_warmup > 0 and not self.adapt_engaged:
                raise ValueError('Argument "adapt_engaged" is False, '
                                 'cannot specify warmup iterations.')
        if self.iter_sampling is not None:
            if self.iter_sampling < 0 or not isinstance(
                    self.iter_sampling, int):
                raise ValueError(
                    'Argument "iter_sampling" must be a non-negative integer,'
                    ' found {}.'.format(self.iter_sampling))
        if self.thin is not None:
            if self.thin < 1 or not isinstance(self.thin, int):
                raise ValueError('Argument "thin" must be a positive integer,'
                                 ' found {}.'.format(self.thin))
        if self.max_treedepth is not None:
            if self.max_treedepth < 1 or not isinstance(
                    self.max_treedepth, int):
                raise ValueError(
                    'Argument "max_treedepth" must be a positive integer,'
                    ' found {}.'.format(self.max_treedepth))
        if self.step_size is not None:
            if isinstance(self.step_size, (float, int)):
                if self.step_size <= 0:
                    raise ValueError('Argument "step_size" must be > 0, '
                                     'found {}.'.format(self.step_size))
            else:
                if len(self.step_size) != chains:
                    raise ValueError(
                        'Expecting {} per-chain step_size specifications, '
                        ' found {}.'.format(chains, len(self.step_size)))
                for i, step_size in enumerate(self.step_size):
                    if step_size <= 0:
                        raise ValueError('Argument "step_size" must be > 0, '
                                         'chain {}, found {}.'.format(
                                             i + 1, step_size))
        if self.metric is not None:
            if isinstance(self.metric, str):
                if self.metric in ['diag', 'diag_e']:
                    self.metric_type = 'diag_e'
                elif self.metric in ['dense', 'dense_e']:
                    self.metric_type = 'dense_e'
                elif self.metric in ['unit', 'unit_e']:
                    self.metric_type = 'unit_e'
                else:
                    if not os.path.exists(self.metric):
                        raise ValueError('no such file {}'.format(self.metric))
                    dims = read_metric(self.metric)
                    if len(dims) == 1:
                        self.metric_type = 'diag_e'
                    else:
                        self.metric_type = 'dense_e'
                    self.metric_file = self.metric
            elif isinstance(self.metric, dict):
                if 'inv_metric' not in self.metric:
                    raise ValueError(
                        'Entry "inv_metric" not found in metric dict.')
                dims = list(np.asarray(self.metric['inv_metric']).shape)
                if len(dims) == 1:
                    self.metric_type = 'diag_e'
                else:
                    self.metric_type = 'dense_e'
                dict_file = create_named_text_file(dir=_TMPDIR,
                                                   prefix="metric",
                                                   suffix=".json")
                write_stan_json(dict_file, self.metric)
                self.metric_file = dict_file
            elif isinstance(self.metric, (list, tuple)):
                if len(self.metric) != chains:
                    raise ValueError(
                        'Number of metric files must match number of chains,'
                        ' found {} metric files for {} chains.'.format(
                            len(self.metric), chains))
                if all(isinstance(elem, dict) for elem in self.metric):
                    metric_files: List[str] = []
                    for i, metric in enumerate(self.metric):
                        assert isinstance(metric,
                                          dict)  # make the typechecker happy
                        metric_dict: Dict[str, Any] = metric
                        if 'inv_metric' not in metric_dict:
                            raise ValueError(
                                'Entry "inv_metric" not found in metric dict '
                                'for chain {}.'.format(i + 1))
                        if i == 0:
                            dims = list(
                                np.asarray(metric_dict['inv_metric']).shape)
                        else:
                            dims2 = list(
                                np.asarray(metric_dict['inv_metric']).shape)
                            if dims != dims2:
                                raise ValueError(
                                    'Found inconsistent "inv_metric" entry '
                                    'for chain {}: entry has dims '
                                    '{}, expected {}.'.format(
                                        i + 1, dims2, dims))
                        dict_file = create_named_text_file(dir=_TMPDIR,
                                                           prefix="metric",
                                                           suffix=".json")
                        write_stan_json(dict_file, metric_dict)
                        metric_files.append(dict_file)
                    if len(dims) == 1:
                        self.metric_type = 'diag_e'
                    else:
                        self.metric_type = 'dense_e'
                    self.metric_file = metric_files
                elif all(isinstance(elem, str) for elem in self.metric):
                    metric_files = []
                    for i, metric in enumerate(self.metric):
                        assert isinstance(metric, str)  # typecheck
                        if not os.path.exists(metric):
                            raise ValueError('no such file {}'.format(metric))
                        if i == 0:
                            dims = read_metric(metric)
                        else:
                            dims2 = read_metric(metric)
                            if len(dims) != len(dims2):
                                raise ValueError(
                                    'Metrics files {}, {},'
                                    ' inconsistent metrics'.format(
                                        self.metric[0], metric))
                            if dims != dims2:
                                raise ValueError(
                                    'Metrics files {}, {},'
                                    ' inconsistent metrics'.format(
                                        self.metric[0], metric))
                        metric_files.append(metric)
                    if len(dims) == 1:
                        self.metric_type = 'diag_e'
                    else:
                        self.metric_type = 'dense_e'
                    self.metric_file = metric_files
                else:
                    raise ValueError(
                        'Argument "metric" must be a list of pathnames or '
                        'Python dicts, found list of {}.'.format(
                            type(self.metric[0])))
            else:
                raise ValueError(
                    'Invalid metric specified, not a recognized metric type, '
                    'must be either a metric type name, a filepath, dict, '
                    'or list of per-chain filepaths or dicts.  Found '
                    'an object of type {}.'.format(type(self.metric)))

        if self.adapt_delta is not None:
            if not 0 < self.adapt_delta < 1:
                raise ValueError(
                    'Argument "adapt_delta" must be between 0 and 1,'
                    ' found {}'.format(self.adapt_delta))
        if self.adapt_init_phase is not None:
            if self.adapt_init_phase < 0 or not isinstance(
                    self.adapt_init_phase, int):
                raise ValueError(
                    'Argument "adapt_init_phase" must be a non-negative '
                    'integer, found {}'.format(self.adapt_init_phase))
        if self.adapt_metric_window is not None:
            if self.adapt_metric_window < 0 or not isinstance(
                    self.adapt_metric_window, int):
                raise ValueError(
                    'Argument "adapt_metric_window" must be a non-negative '
                    'integer, found {}'.format(self.adapt_metric_window))
        if self.adapt_step_size is not None:
            if self.adapt_step_size < 0 or not isinstance(
                    self.adapt_step_size, int):
                raise ValueError(
                    'Argument "adapt_step_size" must be a non-negative integer,'
                    ' found {}'.format(self.adapt_step_size))

        if self.fixed_param and (
                self.max_treedepth is not None or self.metric is not None
                or self.step_size is not None or
                not (self.adapt_delta is None and self.adapt_init_phase is None
                     and self.adapt_metric_window is None
                     and self.adapt_step_size is None)):
            raise ValueError(
                'When fixed_param=True, cannot specify adaptation parameters.')
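
In practice the metric argument validated above arrives via CmdStanModel.sample(); a hedged sketch of the accepted forms (the two-parameter shape, the file path, and the commented-out calls are illustrative assumptions):

import json

# 1. Named metric type:
#     fit = model.sample(data=data, metric='diag_e')

# 2. Python dict with an 'inv_metric' entry; a 1-D array selects diag_e,
#    a 2-D array selects dense_e.  The 2-parameter shape here is illustrative.
inv_metric = {'inv_metric': [1.0, 1.0]}
#     fit = model.sample(data=data, metric=inv_metric)

# 3. One or more JSON files on disk (path is illustrative):
with open('inv_metric.json', 'w') as fd:
    json.dump(inv_metric, fd)
#     fit = model.sample(data=data, metric='inv_metric.json')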
Example #8
    def summary(self,
                percentiles: List[int] = None,
                sig_figs: int = None) -> pd.DataFrame:
        """
        Run cmdstan/bin/stansummary over all output csv files, assemble
        summary into DataFrame object; first row contains summary statistics
        for total joint log probability `lp__`, remaining rows contain summary
        statistics for all parameters, transformed parameters, and generated
        quantities variables listed in the order in which they were declared
        in the Stan program.

        :param percentiles: Ordered non-empty list of percentiles to report.
            Must be integers between 1 and 99, inclusive.

        :param sig_figs: Number of significant figures to report.
            Must be an integer between 1 and 18.  If unspecified, the default
            precision for the system file I/O is used; the usual value is 6.
            If precision above 6 is requested, the sample must have been
            produced by CmdStan version 2.25 or later, and the sampler output
            precision must be equal to or greater than the requested summary
            precision.

        :return: pandas.DataFrame
        """
        percentiles_str = '--percentiles=5,50,95'
        if percentiles is not None:
            if len(percentiles) == 0:
                raise ValueError(
                    'invalid percentiles argument, must be ordered'
                    ' non-empty list from (1, 99), inclusive.')
            cur_pct = 0
            for pct in percentiles:
                if pct > 99 or not pct > cur_pct:
                    raise ValueError(
                        'invalid percentiles spec, must be ordered'
                        ' non-empty list from (1, 99), inclusive.')
                cur_pct = pct
            percentiles_str = '='.join(
                ['--percentiles', ','.join([str(x) for x in percentiles])])
        sig_figs_str = '--sig_figs=2'
        if sig_figs is not None:
            if not isinstance(sig_figs, int) or sig_figs < 1 or sig_figs > 18:
                raise ValueError(
                    'sig_figs must be an integer between 1 and 18,'
                    ' found {}'.format(sig_figs))
            csv_sig_figs = self._sig_figs or 6
            if sig_figs > csv_sig_figs:
                self._logger.warning(
                    'Requesting %d significant digits of output, but CSV files'
                    ' only have %d digits of precision.',
                    sig_figs,
                    csv_sig_figs,
                )
            sig_figs_str = '--sig_figs=' + str(sig_figs)
        cmd_path = os.path.join(cmdstan_path(), 'bin',
                                'stansummary' + EXTENSION)
        tmp_csv_file = 'stansummary-{}-'.format(self.runset._args.model_name)
        tmp_csv_path = create_named_text_file(dir=_TMPDIR,
                                              prefix=tmp_csv_file,
                                              suffix='.csv',
                                              name_only=True)
        csv_str = '--csv_filename={}'.format(tmp_csv_path)
        if not cmdstan_version_at(2, 24):
            csv_str = '--csv_file={}'.format(tmp_csv_path)
        cmd = [
            cmd_path,
            percentiles_str,
            sig_figs_str,
            csv_str,
        ] + self.runset.csv_files
        do_command(cmd, logger=self.runset._logger)
        with open(tmp_csv_path, 'rb') as fd:
            summary_data = pd.read_csv(
                fd,
                delimiter=',',
                header=0,
                index_col=0,
                comment='#',
                float_precision='high',
            )
        mask = [
            x == 'lp__' or not x.endswith('__') for x in summary_data.index
        ]
        return summary_data[mask]
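
The row mask at the end of each summary() keeps 'lp__' and the model variables while dropping the sampler diagnostics; a toy illustration of that filter on a stand-in DataFrame:

import pandas as pd

# Index mimicking stansummary rows: 'lp__', sampler diagnostics
# (names ending in '__'), then model variables.
idx = ['lp__', 'accept_stat__', 'stepsize__', 'treedepth__', 'theta', 'mu[1]']
df = pd.DataFrame({'Mean': [0.0] * len(idx)}, index=idx)

mask = [x == 'lp__' or not x.endswith('__') for x in df.index]
print(df[mask])   # keeps 'lp__', 'theta', 'mu[1]'; drops the diagnostics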