def summary(self) -> pd.DataFrame:
    """
    Run cmdstan/bin/stansummary over all output csv files.
    Echo stansummary stdout/stderr to console.
    Assemble csv tempfile contents into pandas DataFrame.

    :return: pandas.DataFrame of summary statistics, with sampler
        diagnostic columns (names ending in ``__``) removed,
        except for ``lp__``.
    """
    # fix: removed dead local ``names = self.column_names`` (never used)
    cmd_path = os.path.join(cmdstan_path(), 'bin',
                            'stansummary' + EXTENSION)
    tmp_csv_file = 'stansummary-{}-{}-chain-'.format(
        self.runset._args.model_name, self.runset.chains)
    tmp_csv_path = create_named_text_file(dir=TMPDIR,
                                          prefix=tmp_csv_file,
                                          suffix='.csv')
    cmd = [
        cmd_path,
        '--csv_file={}'.format(tmp_csv_path),
    ] + self.runset.csv_files
    do_command(cmd, logger=self.runset._logger)
    with open(tmp_csv_path, 'rb') as fd:
        summary_data = pd.read_csv(fd, delimiter=',', header=0,
                                   index_col=0, comment='#')
    # keep lp__ plus all model variables; drop other sampler diagnostics
    mask = [
        x == 'lp__' or not x.endswith('__') for x in summary_data.index
    ]
    return summary_data[mask]
def __init__(self, args: CmdStanArgs, chains: int = 4,
             logger: logging.Logger = None) -> None:
    """
    Initialize object: record args, create per-chain output csv and
    console filenames, and compose the per-chain command lines.

    :param args: CmdStan argument object for this run
    :param chains: number of chains; must be a positive integer
    :param logger: logger instance; a default is obtained when None
    :raises ValueError: if ``chains`` < 1
    """
    self._args = args
    self._chains = chains
    self._logger = logger or get_logger()
    if chains < 1:
        # bug fix: format string was 'found {i]}' — an invalid format
        # spec that itself raised, masking the intended message
        raise ValueError('chains must be positive integer value, '
                         'found {}'.format(chains))
    self._csv_files = []
    if args.output_basename is None:
        # no basename given: use named tempfiles in the session tmpdir
        csv_basename = 'stan-{}-{}'.format(args.model_name, args.method)
        for i in range(chains):
            fd_name = create_named_text_file(
                dir=TMPDIR,
                prefix='{}-{}-'.format(csv_basename, i + 1),
                suffix='.csv',
            )
            self._csv_files.append(fd_name)
    else:
        for i in range(chains):
            self._csv_files.append('{}-{}.csv'.format(
                args.output_basename, i + 1))
    # console output file per chain: csv name with '.txt' extension
    self._console_files = []
    for i in range(chains):
        txt_file = ''.join(
            [os.path.splitext(self._csv_files[i])[0], '.txt'])
        self._console_files.append(txt_file)
    self._cmds = [
        args.compose_command(i, self._csv_files[i]) for i in range(chains)
    ]
    # per-chain return codes; -1 means "not yet run"
    self._retcodes = [-1 for _ in range(chains)]
def summary(self, percentiles: List[int] = None) -> pd.DataFrame:
    """
    Run cmdstan/bin/stansummary over all output csv files.
    Echo stansummary stdout/stderr to console.
    Assemble csv tempfile contents into pandas DataFrame.

    :param percentiles: Ordered non-empty list of percentiles to report.
        Must be integers from (1, 99), inclusive.
    :raises ValueError: on an empty, unordered, out-of-range, or
        non-integer ``percentiles`` specification.
    """
    percentiles_str = '--percentiles=5,50,95'
    if percentiles is not None:
        if len(percentiles) == 0:
            raise ValueError(
                'invalid percentiles argument, must be ordered'
                ' non-empty list from (1, 99), inclusive.'
            )
        cur_pct = 0
        for pct in percentiles:
            # bug fix: also reject non-integer entries, per the
            # documented contract (previously e.g. 2.5 was accepted)
            if not isinstance(pct, int) or pct > 99 or not pct > cur_pct:
                raise ValueError(
                    'invalid percentiles spec, must be ordered'
                    ' non-empty list from (1, 99), inclusive.'
                )
            cur_pct = pct
        percentiles_str = '='.join(
            ['--percentiles', ','.join([str(x) for x in percentiles])]
        )
    cmd_path = os.path.join(
        cmdstan_path(), 'bin', 'stansummary' + EXTENSION
    )
    tmp_csv_file = 'stansummary-{}-{}-chain-'.format(
        self.runset._args.model_name, self.runset.chains
    )
    tmp_csv_path = create_named_text_file(
        dir=_TMPDIR, prefix=tmp_csv_file, suffix='.csv'
    )
    cmd = [
        cmd_path,
        percentiles_str,
        '--csv_file={}'.format(tmp_csv_path),
    ] + self.runset.csv_files
    do_command(cmd, logger=self.runset._logger)
    with open(tmp_csv_path, 'rb') as fd:
        summary_data = pd.read_csv(
            fd,
            delimiter=',',
            header=0,
            index_col=0,
            comment='#',
            float_precision='high',
        )
    # keep lp__ plus all model variables; drop other sampler diagnostics
    mask = [x == 'lp__' or not x.endswith('__') for x in summary_data.index]
    return summary_data[mask]
def __init__(self, args: CmdStanArgs, chains: int = 4,
             logger: logging.Logger = None) -> None:
    """
    Initialize object: record args and run-type flags, create per-chain
    output csv/console filenames and command lines, and initialize the
    result slots that are filled in after the run.

    :param args: CmdStan argument object for this run
    :param chains: number of chains; must be a positive integer
    :param logger: logger instance; a default is obtained when None
    :raises ValueError: if ``chains`` < 1
    """
    self._args = args
    self._is_optimizing = isinstance(self._args.method_args, OptimizeArgs)
    self._is_sampling = isinstance(self._args.method_args, SamplerArgs)
    self._chains = chains
    self._logger = logger or get_logger()
    if chains < 1:
        # bug fix: format string was 'found {i]}' — an invalid format
        # spec that itself raised, masking the intended message
        raise ValueError('chains must be positive integer value, '
                         'found {}'.format(chains))
    self._csv_files = []  # per-chain sample csv files.
    if args.output_basename is None:
        csv_basename = 'stan-{}-draws'.format(args.model_name)
        for i in range(chains):
            fd_name = create_named_text_file(
                dir=TMPDIR,
                prefix='{}-{}-'.format(csv_basename, i + 1),
                suffix='.csv',
            )
            self._csv_files.append(fd_name)
    else:
        for i in range(chains):
            self._csv_files.append('{}-{}.csv'.format(
                args.output_basename, i + 1))
    self.console_files = []  # per-chain sample console output files.
    for i in range(chains):
        txt_file = ''.join(
            [os.path.splitext(self._csv_files[i])[0], '.txt'])
        self.console_files.append(txt_file)
    self.cmds = [
        args.compose_command(i, self._csv_files[i]) for i in range(chains)
    ]  # per-chain sampler command.
    # per-chain return codes; -1 means "not yet run"
    self._retcodes = [-1 for _ in range(chains)]
    # result slots, None until populated elsewhere (not in this chunk)
    self._draws = None
    self._column_names = None
    self._num_params = None  # metric dim(s)
    self._metric_type = None
    self._metric = None
    self._stepsize = None
    self._sample = None
    self._first_draw = None
    self._generated_quantities = None
def __init__(self, args: CmdStanArgs, chains: int = 4,
             logger: logging.Logger = None) -> None:
    """
    Initialize object: record args and build per-chain csv, console,
    and (optionally) diagnostic filenames plus the command lines.

    :param args: CmdStan argument object for this run
    :param chains: number of chains; must be a positive integer
    :param logger: logger instance; a default is obtained when None
    :raises ValueError: if ``chains`` < 1
    """
    self._args = args
    self._chains = chains
    self._logger = logger or get_logger()
    if chains < 1:
        raise ValueError('chains must be positive integer value, '
                         'found {}'.format(chains))
    # per-chain return codes; -1 means "not yet run"
    self._retcodes = [-1 for _ in range(chains)]
    # output and console messages are written to a text file:
    # ``<model_name>-<YYYYMMDDHHMM>-<chain_id>.txt``
    now = datetime.now()
    now_str = now.strftime('%Y%m%d%H%M')
    file_basename = '-'.join([args.model_name, now_str])
    if args.output_dir is not None:
        output_dir = args.output_dir
    else:
        output_dir = TMPDIR
    self._csv_files = []
    self._diagnostic_files = [None for _ in range(chains)]
    self._console_files = []
    self._cmds = []
    for i in range(chains):
        if args.output_dir is None:
            # tempfile with unique suffix, so concurrent runs don't clash
            csv_file = create_named_text_file(
                dir=output_dir,
                prefix='{}-{}-'.format(file_basename, i + 1),
                suffix='.csv',
            )
        else:
            csv_file = os.path.join(
                output_dir,
                '{}-{}.{}'.format(file_basename, i + 1, 'csv'))
        self._csv_files.append(csv_file)
        txt_file = ''.join([os.path.splitext(csv_file)[0], '.txt'])
        self._console_files.append(txt_file)
        if args.save_diagnostics:
            if args.output_dir is None:
                diag_file = create_named_text_file(
                    dir=TMPDIR,
                    prefix='{}-diagnostic-{}-'.format(
                        file_basename, i + 1),
                    suffix='.csv',
                )
            else:
                diag_file = os.path.join(
                    output_dir,
                    '{}-diagnostic-{}.{}'.format(file_basename, i + 1,
                                                 'csv'),
                )
            # bug fix: assign by index — the old code appended, which
            # grew the list past the pre-filled None slots so that
            # self._diagnostic_files[i] below was always None and the
            # composed command never received the diagnostic filename
            self._diagnostic_files[i] = diag_file
            self._cmds.append(
                args.compose_command(i, self._csv_files[i],
                                     self._diagnostic_files[i]))
        else:
            self._cmds.append(args.compose_command(i, self._csv_files[i]))
def __init__(
    self,
    args: CmdStanArgs,
    chains: int = 4,
    chain_ids: List[int] = None,
    logger: logging.Logger = None,
) -> None:
    """
    Initialize object.

    Records args, validates chains/chain_ids consistency, and builds
    the per-chain csv, stdout, stderr, and (optional) diagnostic
    filenames together with the per-chain command lines.
    """
    self._args = args
    self._chains = chains
    self._logger = logger or get_logger()
    if chains < 1:
        raise ValueError('chains must be positive integer value, '
                         'found {}'.format(chains))
    if chain_ids is None:
        # default chain ids are 1-based: 1, 2, ..., chains
        chain_ids = [x + 1 for x in range(chains)]
    elif len(chain_ids) != chains:
        raise ValueError(
            'mismatch between number of chains and chain_ids, '
            'found {} chains, but {} chain_ids'.format(
                chains, len(chain_ids)))
    self._chain_ids = chain_ids
    # per-chain return codes; -1 means "not yet run"
    self._retcodes = [-1 for _ in range(chains)]
    # stdout, stderr are written to text files
    # prefix: ``<model_name>-<YYYYMMDDHHMM>-<chain_id>``
    # suffixes: ``-stdout.txt``, ``-stderr.txt``
    now = datetime.now()
    now_str = now.strftime('%Y%m%d%H%M')
    file_basename = '-'.join([args.model_name, now_str])
    if args.output_dir is not None:
        output_dir = args.output_dir
    else:
        output_dir = _TMPDIR
    self._csv_files = [None for _ in range(chains)]
    self._diagnostic_files = [None for _ in range(chains)]
    self._stdout_files = [None for _ in range(chains)]
    self._stderr_files = [None for _ in range(chains)]
    self._cmds = []
    for i in range(chains):
        if args.output_dir is None:
            # tempfile with unique suffix, so concurrent runs don't clash
            csv_file = create_named_text_file(
                dir=output_dir,
                prefix='{}-{}-'.format(file_basename, str(chain_ids[i])),
                suffix='.csv',
            )
        else:
            # user-specified output dir: use deterministic filenames
            csv_file = os.path.join(
                output_dir,
                '{}-{}.{}'.format(file_basename, str(chain_ids[i]),
                                  'csv'),
            )
        self._csv_files[i] = csv_file
        stdout_file = ''.join(
            [os.path.splitext(csv_file)[0], '-stdout.txt'])
        self._stdout_files[i] = stdout_file
        stderr_file = ''.join(
            [os.path.splitext(csv_file)[0], '-stderr.txt'])
        self._stderr_files[i] = stderr_file
        if args.save_diagnostics:
            if args.output_dir is None:
                diag_file = create_named_text_file(
                    dir=_TMPDIR,
                    prefix='{}-diagnostic-{}-'.format(
                        file_basename, str(chain_ids[i])),
                    suffix='.csv',
                )
            else:
                diag_file = os.path.join(
                    output_dir,
                    '{}-diagnostic-{}.{}'.format(file_basename,
                                                 str(chain_ids[i]),
                                                 'csv'),
                )
            self._diagnostic_files[i] = diag_file
            self._cmds.append(
                args.compose_command(i, self._csv_files[i],
                                     self._diagnostic_files[i]))
        else:
            self._cmds.append(args.compose_command(i, self._csv_files[i]))
def validate(self, chains: Optional[int]) -> None:
    """
    Check arguments correctness and consistency.

    * adaptation and warmup args are consistent
    * if file(s) for metric are supplied, check contents.
    * length of per-chain lists equals specified # of chains

    Sets ``self.metric_type`` / ``self.metric_file`` as a side effect
    when a metric is specified.

    :param chains: number of chains; must be a positive integer
    :raises ValueError: on any invalid or inconsistent argument
    """
    if not isinstance(chains, int) or chains < 1:
        raise ValueError(
            'Sampler expects number of chains to be greater than 0.')
    # adaptation sub-arguments require adaptation to be engaged
    if not (self.adapt_delta is None and self.adapt_init_phase is None
            and self.adapt_metric_window is None
            and self.adapt_step_size is None):
        if self.adapt_engaged is False:
            msg = 'Conflicting arguments: adapt_engaged: False'
            if self.adapt_delta is not None:
                msg = '{}, adapt_delta: {}'.format(msg, self.adapt_delta)
            if self.adapt_init_phase is not None:
                msg = '{}, adapt_init_phase: {}'.format(
                    msg, self.adapt_init_phase)
            if self.adapt_metric_window is not None:
                msg = '{}, adapt_metric_window: {}'.format(
                    msg, self.adapt_metric_window)
            if self.adapt_step_size is not None:
                msg = '{}, adapt_step_size: {}'.format(
                    msg, self.adapt_step_size)
            raise ValueError(msg)
    # bug fix (throughout): test isinstance before comparing, so a
    # non-numeric value raises the intended ValueError rather than a
    # TypeError from the '<' comparison
    if self.iter_warmup is not None:
        if not isinstance(self.iter_warmup, int) or self.iter_warmup < 0:
            raise ValueError(
                'Value for iter_warmup must be a non-negative integer,'
                ' found {}.'.format(self.iter_warmup))
        if self.iter_warmup > 0 and not self.adapt_engaged:
            raise ValueError('Argument "adapt_engaged" is False, '
                             'cannot specify warmup iterations.')
    if self.iter_sampling is not None:
        if not isinstance(self.iter_sampling,
                          int) or self.iter_sampling < 0:
            raise ValueError(
                'Argument "iter_sampling" must be a non-negative integer,'
                ' found {}.'.format(self.iter_sampling))
    if self.thin is not None:
        if not isinstance(self.thin, int) or self.thin < 1:
            raise ValueError('Argument "thin" must be a positive integer,'
                             'found {}.'.format(self.thin))
    if self.max_treedepth is not None:
        if not isinstance(self.max_treedepth,
                          int) or self.max_treedepth < 1:
            raise ValueError(
                'Argument "max_treedepth" must be a positive integer,'
                ' found {}.'.format(self.max_treedepth))
    if self.step_size is not None:
        if isinstance(self.step_size, (float, int)):
            if self.step_size <= 0:
                raise ValueError('Argument "step_size" must be > 0, '
                                 'found {}.'.format(self.step_size))
        else:
            if len(self.step_size) != chains:
                raise ValueError(
                    'Expecting {} per-chain step_size specifications, '
                    ' found {}.'.format(chains, len(self.step_size)))
            for i, step_size in enumerate(self.step_size):
                # bug fix: use <= so that 0 is rejected, matching both
                # the scalar branch above and the error message
                if step_size <= 0:
                    raise ValueError('Argument "step_size" must be > 0, '
                                     'chain {}, found {}.'.format(
                                         i + 1, step_size))
    if self.metric is not None:
        if isinstance(self.metric, str):
            # either a metric type name or a path to a metric file
            if self.metric in ['diag', 'diag_e']:
                self.metric_type = 'diag_e'
            elif self.metric in ['dense', 'dense_e']:
                self.metric_type = 'dense_e'
            elif self.metric in ['unit', 'unit_e']:
                self.metric_type = 'unit_e'
            else:
                if not os.path.exists(self.metric):
                    raise ValueError('no such file {}'.format(self.metric))
                dims = read_metric(self.metric)
                if len(dims) == 1:
                    self.metric_type = 'diag_e'
                else:
                    self.metric_type = 'dense_e'
                self.metric_file = self.metric
        # fix: isinstance against builtin dict, not typing.Dict
        # (deprecated as an isinstance target; same runtime behavior)
        elif isinstance(self.metric, dict):
            if 'inv_metric' not in self.metric:
                raise ValueError(
                    'Entry "inv_metric" not found in metric dict.')
            dims = list(np.asarray(self.metric['inv_metric']).shape)
            if len(dims) == 1:
                self.metric_type = 'diag_e'
            else:
                self.metric_type = 'dense_e'
            dict_file = create_named_text_file(dir=_TMPDIR,
                                               prefix="metric",
                                               suffix=".json")
            write_stan_json(dict_file, self.metric)
            self.metric_file = dict_file
        elif isinstance(self.metric, (list, tuple)):
            if len(self.metric) != chains:
                raise ValueError(
                    'Number of metric files must match number of chains,'
                    ' found {} metric files for {} chains.'.format(
                        len(self.metric), chains))
            if all(isinstance(elem, dict) for elem in self.metric):
                # per-chain metric dicts: all must have the same dims
                metric_files: List[str] = []
                for i, metric in enumerate(self.metric):
                    assert isinstance(metric, dict)  # typechecker aid
                    metric_dict: Dict[str, Any] = metric
                    if 'inv_metric' not in metric_dict:
                        raise ValueError(
                            'Entry "inv_metric" not found in metric dict '
                            'for chain {}.'.format(i + 1))
                    if i == 0:
                        dims = list(
                            np.asarray(metric_dict['inv_metric']).shape)
                    else:
                        dims2 = list(
                            np.asarray(metric_dict['inv_metric']).shape)
                        if dims != dims2:
                            # bug fix: format args were (dims, dims2),
                            # reporting chain 1's dims as this entry's
                            raise ValueError(
                                'Found inconsistent "inv_metric" entry '
                                'for chain {}: entry has dims '
                                '{}, expected {}.'.format(
                                    i + 1, dims2, dims))
                    dict_file = create_named_text_file(dir=_TMPDIR,
                                                       prefix="metric",
                                                       suffix=".json")
                    write_stan_json(dict_file, metric_dict)
                    metric_files.append(dict_file)
                if len(dims) == 1:
                    self.metric_type = 'diag_e'
                else:
                    self.metric_type = 'dense_e'
                self.metric_file = metric_files
            elif all(isinstance(elem, str) for elem in self.metric):
                # per-chain metric files: all must have the same dims
                metric_files = []
                for i, metric in enumerate(self.metric):
                    assert isinstance(metric, str)  # typechecker aid
                    if not os.path.exists(metric):
                        raise ValueError('no such file {}'.format(metric))
                    if i == 0:
                        dims = read_metric(metric)
                    else:
                        dims2 = read_metric(metric)
                        if len(dims) != len(dims2):
                            raise ValueError(
                                'Metrics files {}, {},'
                                ' inconsistent metrics'.format(
                                    self.metric[0], metric))
                        if dims != dims2:
                            raise ValueError(
                                'Metrics files {}, {},'
                                ' inconsistent metrics'.format(
                                    self.metric[0], metric))
                    metric_files.append(metric)
                if len(dims) == 1:
                    self.metric_type = 'diag_e'
                else:
                    self.metric_type = 'dense_e'
                self.metric_file = metric_files
            else:
                raise ValueError(
                    'Argument "metric" must be a list of pathnames or '
                    'Python dicts, found list of {}.'.format(
                        type(self.metric[0])))
        else:
            raise ValueError(
                'Invalid metric specified, not a recognized metric type, '
                'must be either a metric type name, a filepath, dict, '
                'or list of per-chain filepaths or dicts. Found '
                'an object of type {}.'.format(type(self.metric)))
    if self.adapt_delta is not None:
        if not 0 < self.adapt_delta < 1:
            raise ValueError(
                'Argument "adapt_delta" must be between 0 and 1,'
                ' found {}'.format(self.adapt_delta))
    if self.adapt_init_phase is not None:
        if not isinstance(self.adapt_init_phase,
                          int) or self.adapt_init_phase < 0:
            raise ValueError(
                'Argument "adapt_init_phase" must be a non-negative '
                'integer, found {}'.format(self.adapt_init_phase))
    if self.adapt_metric_window is not None:
        if not isinstance(self.adapt_metric_window,
                          int) or self.adapt_metric_window < 0:
            raise ValueError(
                'Argument "adapt_metric_window" must be a non-negative '
                ' integer, found {}'.format(self.adapt_metric_window))
    if self.adapt_step_size is not None:
        if not isinstance(self.adapt_step_size,
                          int) or self.adapt_step_size < 0:
            raise ValueError(
                'Argument "adapt_step_size" must be a non-negative integer,'
                'found {}'.format(self.adapt_step_size))
    # fixed-param sampler takes no NUTS/HMC adaptation arguments at all
    if self.fixed_param and (
            self.max_treedepth is not None or self.metric is not None
            or self.step_size is not None
            or not (self.adapt_delta is None
                    and self.adapt_init_phase is None
                    and self.adapt_metric_window is None
                    and self.adapt_step_size is None)):
        raise ValueError(
            'When fixed_param=True, cannot specify adaptation parameters.')
def summary(self, percentiles: List[int] = None,
            sig_figs: int = None) -> pd.DataFrame:
    """
    Run cmdstan/bin/stansummary over all output csv files, assemble
    summary into DataFrame object; first row contains summary statistics
    for total joint log probability `lp__`, remaining rows contain
    summary statistics for all parameters, transformed parameters, and
    generated quantities variables listed in the order in which they
    were declared in the Stan program.

    :param percentiles: Ordered non-empty list of percentiles to report.
        Must be integers from (1, 99), inclusive.

    :param sig_figs: Number of significant figures to report.
        Must be an integer between 1 and 18.  If unspecified, the default
        precision for the system file I/O is used; the usual value is 6.
        If precision above 6 is requested, sample must have been produced
        by CmdStan version 2.25 or later and sampler output precision
        must equal to or greater than the requested summary precision.

    :return: pandas.DataFrame
    """
    percentiles_str = '--percentiles=5,50,95'
    if percentiles is not None:
        if len(percentiles) == 0:
            raise ValueError(
                'invalid percentiles argument, must be ordered'
                ' non-empty list from (1, 99), inclusive.')
        cur_pct = 0
        for pct in percentiles:
            # bug fix: also reject non-integer entries, per the
            # documented contract (previously e.g. 2.5 was accepted)
            if not isinstance(pct, int) or pct > 99 or not pct > cur_pct:
                raise ValueError(
                    'invalid percentiles spec, must be ordered'
                    ' non-empty list from (1, 99), inclusive.')
            cur_pct = pct
        percentiles_str = '='.join(
            ['--percentiles', ','.join([str(x) for x in percentiles])])
    sig_figs_str = '--sig_figs=2'
    if sig_figs is not None:
        if not isinstance(sig_figs, int) or sig_figs < 1 or sig_figs > 18:
            raise ValueError(
                'sig_figs must be an integer between 1 and 18,'
                ' found {}'.format(sig_figs))
        # warn (don't fail) when csv precision can't support the request
        csv_sig_figs = self._sig_figs or 6
        if sig_figs > csv_sig_figs:
            self._logger.warning(
                'Requesting %d significant digits of output, but CSV files'
                ' only have %d digits of precision.',
                sig_figs,
                csv_sig_figs,
            )
        sig_figs_str = '--sig_figs=' + str(sig_figs)
    cmd_path = os.path.join(cmdstan_path(), 'bin',
                            'stansummary' + EXTENSION)
    tmp_csv_file = 'stansummary-{}-'.format(self.runset._args.model_name)
    tmp_csv_path = create_named_text_file(dir=_TMPDIR,
                                          prefix=tmp_csv_file,
                                          suffix='.csv',
                                          name_only=True)
    # stansummary option was renamed in CmdStan 2.24
    csv_str = '--csv_filename={}'.format(tmp_csv_path)
    if not cmdstan_version_at(2, 24):
        csv_str = '--csv_file={}'.format(tmp_csv_path)
    cmd = [
        cmd_path,
        percentiles_str,
        sig_figs_str,
        csv_str,
    ] + self.runset.csv_files
    do_command(cmd, logger=self.runset._logger)
    with open(tmp_csv_path, 'rb') as fd:
        summary_data = pd.read_csv(
            fd,
            delimiter=',',
            header=0,
            index_col=0,
            comment='#',
            float_precision='high',
        )
    # keep lp__ plus all model variables; drop other sampler diagnostics
    mask = [
        x == 'lp__' or not x.endswith('__') for x in summary_data.index
    ]
    return summary_data[mask]