Example #1
0
 def test_mk_new_dir(self):
     self.assertFalse(os.path.exists(self.path))
     functions.mk_new_dir(self.path)
     self.assertTrue(os.path.exists(self.path))
     self.assertTrue(os.path.isdir(self.path))
     for i in range(10):
         functions.mk_new_dir(self.path)
         p = self.path + '-' + str(i)
         self.assertTrue(os.path.exists(p))
         self.assertTrue(os.path.isdir(p))
         self.register_dir(p)
Example #2
0
 def test_mk_new_dir(self):
     self.assertFalse(os.path.exists(self.path))
     functions.mk_new_dir(self.path)
     self.assertTrue(os.path.exists(self.path))
     self.assertTrue(os.path.isdir(self.path))
     for i in range(10):
         functions.mk_new_dir(self.path)
         p = self.path + '-' + str(i)
         self.assertTrue(os.path.exists(p))
         self.assertTrue(os.path.isdir(p))
         self.register_dir(p)
Example #3
0
def main_cli(argv = sys.argv):
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description,
            formatter_class = argparse_utils.SmartHelpFormatter)
    parser.add_argument('-o', '--observed-configs',
            nargs = '+',
            type = argparse_utils.arg_is_config,
            required = True,
            help = ('One or more msBayes config files to be used to either '
                    'calculate or simulate observed summary statistics. If '
                    'used in combination with `-r` each config will be used to '
                    'simulate pseudo-observed data. If analyzing real data, do '
                    'not use the `-r` option, and the fasta files specified '
                    'within the config must exist and contain the sequence '
                    'data.'))
    parser.add_argument('-p', '--prior-configs',
            nargs = '+',
            type = argparse_utils.arg_is_path,
            required = True,
            help = ('One or more config files to be used to generate prior '
                    'samples. If more than one config is specified, they '
                    'should be separated by spaces. '
                    'This option can also be used to specify the path to a '
                    'directory containing the prior samples and summary '
                    'statistic means and standard deviations generated by a '
                    'previous run using the `generate-samples-only` option. '
                    'These files should be found in the directory '
                    '`pymsbayes-output/prior-stats-summaries`. The'
                    '`pymsbayes-output/model-key.txt` also needs to be present.'
                    ' If specifying this directory, it should be the only '
                    'argument (i.e., no other directories or config files can '
                    'be provided).'))
    parser.add_argument('-r', '--reps',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('This option has two effects. First, it signifies that '
                    'the analysis will be simulation based (i.e., no real '
                    'data will be used). Second, it specifies how many '
                    'simulation replicates to perform (i.e., how many data '
                    'sets to simulate and analyze).'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 1000000,
            help = ('The number of prior samples to simulate for each prior '
                    'config specified with `-p`.'))
    parser.add_argument('--prior-batch-size',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 10000,
            help = ('The number of prior samples to simulate for each batch.'))
    parser.add_argument('--generate-samples-only',
            action = 'store_true',
            help = ('Only generate samples from models as requested. I.e., '
                    'No analyses are performed to approximate posteriors. '
                    'This option can be useful if you want the prior samples '
                    'for other purposes.'))
    parser.add_argument('--num-posterior-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 1000,
            help = ('The number of posterior samples desired for each '
                    'analysis. Default: 1000.'))
    parser.add_argument('--num-standardizing-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 10000,
            help = ('The number of prior samples desired to use for '
                    'standardizing statistics. Default: 10000.'))
    parser.add_argument('--np',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('--output-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('--staging-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('A directory to temporarily stage prior files. This option '
                    'can be useful on clusters to speed up I/O while '
                    'generating prior samples. You can designate a local temp '
                    'directory on a compute node to avoid constant writing to '
                    'a shared drive. The default is to use the `temp-dir`.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi wattTheta pi.net tajD.denom`.'))
    parser.add_argument('-b', '--bandwidth',
            action = 'store',
            type = float,
            help = ('Smoothing parameter for the posterior kernal density '
                    'estimation. This option is used for the `glm` '
                    'regression method. The default is 2 / '
                    '`num-posterior-samples`.'))
    parser.add_argument('-q', '--num-posterior-quantiles',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 1000,
            help = ('The number of equally spaced quantiles at which to '
                    'evaluate the GLM-estimated posterior density. '
                    'Default: 1000.'))
    parser.add_argument('--reporting-frequency',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('Suggested frequency (in number of prior samples) for '
                    'running regression and reporting current results. '
                    'Default: 0 (only report final results). '
                    'If a value is given, it may be adjusted so that the '
                    'reporting frequency is a multiple of the multi-processed '
                    'batch size.'))
    parser.add_argument('--sort-index',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            choices = range(12),
            help = argparse_utils.get_sort_index_help_message())
    parser.add_argument('--no-global-estimate',
            action = 'store_true',
            help = ('If multiple prior models are specified, by default a '
                    'global estimate is performed averaging over all models. '
                    'This option prevents the global estimation (i.e., only '
                    'inferences for each model are made).'))
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress large results files.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--output-prefix',
            action = 'store',
            type = str,
            default = '',
            help = ('Prefix to use at beginning of output files. The default '
                    'is no prefix.'))
    parser.add_argument('--data-key-path',
            action = 'store',
            type = argparse_utils.arg_is_file,
            help = ('The path to a `data-key.txt` file generated by a previous '
                    'run. This file should be found in the directory '
                    '`pymsbayes-output/data-key.txt`. This option '
                    'will override the `-o`/`--observed-configs` option, and '
                    'is intended to be used in combination with the '
                    '`--start-from` option to restart an analysis.'))
    parser.add_argument('--start-from-simulation-index',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('The simulation index at which to begin analyses. Must be '
                    'used in combination with either the number of simulation '
                    'replicates (`-r`/`--reps`) or the `--data-key-path` '
                    'option, and must be a positive '
                    'integer that is less than the number of simulation '
                    'replicates. This option can be useful if an analysis '
                    'needs to be restarted.'))
    parser.add_argument('--start-from-observed-index',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('The observed config index at which to begin analyses. '
                    'Can be used in combination with the `--data-key-path` '
                    'option to restart long-running, multi-observed-config '
                    'analyses'))
    parser.add_argument('--dry-run',
            action = 'store_true',
            help = 'Do not run analyses; only process settings')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    if argv == sys.argv:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import (MsBayesWorker, merge_prior_files,
            ObsSumStatsWorker)
    from pymsbayes.teams import ABCTeam
    from pymsbayes.utils.functions import (is_file, is_dir, long_division,
            mk_new_dir)
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, DIV_MODEL_PATTERNS, MODEL_PATTERNS,
            PSI_PATTERNS, MEAN_TAU_PATTERNS, OMEGA_PATTERNS, CV_PATTERNS,
            line_count)
    from pymsbayes.utils import sumresults, errors
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import (GLOBAL_RNG, set_memory_trace,
            MSBAYES_SORT_INDEX, ToolPathManager)

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    if len(args.observed_configs) != len(set(args.observed_configs)):
        raise ValueError('All paths to observed config files must be unique')

    if args.num_standardizing_samples > args.num_prior_samples:
        args.num_standardizing_samples = args.num_prior_samples
    
    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')
    eureject_path = ToolPathManager.get_tool_full_path('eureject')
    abctb_path = ToolPathManager.get_tool_full_path('ABCestimator')

    # vet prior-configs option
    using_previous_priors = False
    previous_prior_dir = None
    if (len(args.prior_configs) == 1) and (is_dir(args.prior_configs[0])):
        previous_prior_dir = args.prior_configs.pop(0)
        previous_priors = glob.glob(os.path.join(previous_prior_dir,
                '*-prior-sample.txt'))
        previous_sums = glob.glob(os.path.join(previous_prior_dir,
                '*-means-and-std-devs.txt'))
        if (not previous_priors) or (not previous_sums):
            raise ValueError('directory {0!r} specified with `prior-configs` '
                    'option does not contain necessary prior and summary '
                    'files'.format(args.prior_configs[0]))
        using_previous_priors = True
    else:
        for path in args.prior_configs:
            if not is_file(path):
                raise ValueError('prior config {0!r} is not a file'.format(
                        path))
    if len(args.prior_configs) != len(set(args.prior_configs)):
        raise ValueError('All paths to prior config files must be unique') 
    if not args.output_dir:
        args.output_dir = os.path.dirname(args.observed_configs[0])
    base_dir = mk_new_dir(os.path.join(args.output_dir, 'pymsbayes-results'))
    if not args.temp_dir:
        args.temp_dir = base_dir
    info_path = os.path.join(base_dir, args.output_prefix + \
            'pymsbayes-info.txt')
    info = InfoLogger(info_path)
    info.write('[pymsbayes]'.format(base_dir))
    info.write('\tversion = {version}'.format(**_program_info))
    info.write('\toutput_directory = {0}'.format(base_dir))
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    base_temp_dir = temp_fs.base_dir
    info.write('\ttemp_directory = {0}'.format(base_temp_dir))
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()))
    info.write('\tsimulation_reps = {0}'.format(args.reps))
    stat_patterns = DEFAULT_STAT_PATTERNS
    if args.stat_prefixes:
        for i in range(len(args.stat_prefixes)):
            if not args.stat_prefixes[i].endswith('.'):
                args.stat_prefixes[i] += '.'
        stat_patterns = get_patterns_from_prefixes(
                args.stat_prefixes,
                ignore_case=True)
    if not args.bandwidth:
        args.bandwidth = 2 / float(args.num_posterior_samples)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    if args.data_key_path:
        observed_map = sumresults.parse_data_key_file(args.data_key_path)
        observed_paths = [observed_map[k] for k in sorted(observed_map.keys())]
    else:
        observed_dir = mk_new_dir(os.path.join(base_dir,
                'observed-summary-stats'))
        observed_paths = [os.path.join(observed_dir, args.output_prefix + \
            'observed-{0}.txt'.format(i+1)) for i in range(len(
                    args.observed_configs))]
    info.write('\tseed = {0}'.format(args.seed))
    info.write('\tnum_processors = {0}'.format(args.np))
    info.write('\tnum_prior_samples = {0}'.format(
            args.num_prior_samples))
    info.write('\tnum_standardizing_samples = {0}'.format(
            args.num_standardizing_samples))
    info.write('\tbandwidth = {0}'.format(args.bandwidth))
    info.write('\tposterior_quantiles = {0}'.format(
            args.num_posterior_quantiles))
    info.write('\tposterior_sample_size = {0}'.format(
            args.num_posterior_samples))
    info.write('\tstat_patterns = {0}'.format(
            ', '.join([p.pattern for p in stat_patterns])))

    # vet observed configs
    ref_config_path = args.observed_configs[0]
    ref_config = MsBayesConfig(ref_config_path) 
    all_config_paths = []
    num_taxon_pairs = ref_config.npairs
    assert num_taxon_pairs > 0
    for config in args.observed_configs:
        all_config_paths.append(config)
        if not ref_config.equal_sample_table(config):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                    'sample tables in config {0!r} and {1!r} differ; '
                    'all sample tables must be the same.'.format(
                            ref_config_path, config))

    info.write('\tnum_taxon_pairs = {0}'.format(num_taxon_pairs))
    info.write('\tdry_run = {0}'.format(args.dry_run))
    info.write('\t[[tool_paths]]')
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path))
    info.write('\t\tmsbayes = {0}'.format(msbayes_path))
    info.write('\t\teureject = {0}'.format(eureject_path))
    info.write('\t\tabcestimator = {0}'.format(abctb_path))
    info.write('\t[[observed_configs]]')
    for i, cfg in enumerate(args.observed_configs):
        info.write('\t\t{0} = {1}'.format(i + 1, os.path.relpath(cfg,
                os.path.dirname(info_path))))

    abc_team = ABCTeam(
            temp_fs = temp_fs,
            observed_stats_files = observed_paths,
            num_taxon_pairs = num_taxon_pairs,
            config_paths = args.prior_configs,
            previous_prior_dir = previous_prior_dir,
            num_prior_samples = args.num_prior_samples,
            num_processors = args.np,
            num_standardizing_samples = args.num_standardizing_samples,
            num_posterior_samples = args.num_posterior_samples,
            num_posterior_density_quantiles = args.num_posterior_quantiles,
            batch_size = args.prior_batch_size,
            output_dir = base_dir,
            output_prefix = args.output_prefix,
            prior_temp_dir = args.staging_dir,
            rng = GLOBAL_RNG,
            report_parameters = True,
            stat_patterns = stat_patterns,
            eureject_exe_path = eureject_path,
            abctoolbox_exe_path = abctb_path,
            msbayes_exe_path = None,
            abctoolbox_bandwidth = args.bandwidth,
            omega_threshold = 0.01,
            cv_threshold = 0.01,
            compress = args.compress,
            reporting_frequency = args.reporting_frequency,
            keep_temps = args.keep_temps,
            global_estimate_only = False,
            global_estimate = not args.no_global_estimate,
            generate_prior_samples_only = args.generate_samples_only,
            start_from_simulation_index = args.start_from_simulation_index,
            start_from_observed_index = args.start_from_observed_index)

    models_to_configs = {}
    configs_to_models = {}
    for k, v in abc_team.models.iteritems():
        models_to_configs[k] = v
        configs_to_models[v] = k
        cfg = MsBayesConfig(v)
        all_config_paths.append(v)
        # vet prior configs
        if not ref_config.equal_sample_table(cfg):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                    'sample tables in config {0!r} and {1!r} differ; '
                    'all sample tables must be the same.'.format(
                            ref_config_path, v))

    info.write('\t[[observed_paths]]')
    for i in sorted(abc_team.observed_stats_paths.iterkeys()):
        info.write('\t\t{0} = {1}'.format(i, os.path.relpath(
                abc_team.observed_stats_paths[i],
                os.path.dirname(info_path))))
    info.write('\t[[prior_configs]]')
    for i in sorted(abc_team.models.iterkeys()):
        info.write('\t\t{0} = {1}'.format(i, os.path.relpath(
                abc_team.models[i],
                os.path.dirname(info_path))))

    ##########################################################################
    ## begin analysis --- get observed summary stats

    set_memory_trace() # start logging memory profile
    start_time = datetime.datetime.now()

    if args.data_key_path:
        log.info('Using provided summary statitics...')
    elif not args.dry_run:
        obs_temp_dir = base_temp_dir
        if args.staging_dir:
            obs_temp_dir = args.staging_dir
        observed_temp_fs = TempFileSystem(parent = obs_temp_dir,
                prefix = 'observed-temps-')

        if args.reps < 1:
            log.info('Calculating summary statistics from sequence data...')
            obs_workers = []
            for i, cfg in enumerate(args.observed_configs):
                ss_worker = ObsSumStatsWorker(
                        temp_fs = observed_temp_fs,
                        config_path = cfg,
                        output_path = observed_paths[i],
                        schema = 'abctoolbox',
                        stat_patterns = stat_patterns)
                obs_workers.append(ss_worker)

            obs_workers = Manager.run_workers(
                workers = obs_workers,
                num_processors = args.np)

            # re-vet all configs to see if some were changed by obsSumStats.pl
            new_ref_config = ref_config
            ref_modified = False
            # new ref because if all configs were updated all is good
            if not ref_config.equal_sample_table(ref_config_path):
                ref_modified = True
                new_ref_config = MsBayesConfig(ref_config_path)
                log.warning("""
The alignment lengths in config
{0!r}
have been corrected for sites with *any* ambiguous bases and/or gaps by
obsSumStats.pl.
                    """.format(ref_config_path))
            for config in all_config_paths:
                if not new_ref_config.equal_sample_table(config):
                    corrected_config = config
                    if ref_modified:
                        corrected_config = ref_config_path
                    if not args.keep_temps:
                        observed_temp_fs.purge()
                        temp_fs.purge()
                    raise errors.SampleTableError("""
The sample tables in configs
{0!r}
and
{1!r}
differ because obsSumStats.pl modified alignment lengths in config
{2!r}
to correct for sites in the alignments with *any* ambiguous bases and/or gaps.
Please make sure the sample tables in all configs will be the same after
correcting alignment lengths for sites that contain *any* ambiguous bases
and/or gaps. You can do this by copying and pasting the sample table in
{2!r}
that has been corrected by obsSumStats.pl into the other configs that were not
corrected.
                        """.format(ref_config_path, config, corrected_config))

        else:
            log.info('Simulating summary statistics from observed configs...')
            num_observed_workers = min([args.reps, args.np])
            if args.reps <= args.np:
                observed_batch_size = 1
                remainder = 0
            else:
                observed_batch_size, remainder = long_division(args.reps,
                        args.np)
            msbayes_workers = []
            for idx, cfg in enumerate(args.observed_configs):
                observed_model_idx = configs_to_models.get(cfg,
                        None)
                schema = 'abctoolbox'
                for i in range(num_observed_workers):
                    worker = MsBayesWorker(
                            temp_fs = observed_temp_fs,
                            sample_size = observed_batch_size,
                            config_path = cfg,
                            model_index = observed_model_idx,
                            report_parameters = True,
                            schema = schema,
                            include_header = True,
                            stat_patterns = stat_patterns,
                            write_stats_file = False,
                            staging_dir = None,
                            tag = idx)
                    msbayes_workers.append(worker)
                if remainder > 0:
                    worker = MsBayesWorker(
                            temp_fs = observed_temp_fs,
                            sample_size = remainder,
                            config_path = cfg,
                            model_index = observed_model_idx,
                            report_parameters = True,
                            schema = schema,
                            include_header = True,
                            stat_patterns = stat_patterns,
                            write_stats_file = False,
                            staging_dir = None,
                            tag = idx)
                    msbayes_workers.append(worker)

            # run parallel msbayes processes
            msbayes_workers = Manager.run_workers(
                workers = msbayes_workers,
                num_processors = args.np)

            workers = dict(zip(range(len(args.observed_configs)),
                    [[] for i in range(len(args.observed_configs))]))
            for w in msbayes_workers:
                workers[w.tag].append(w)

            # merge simulated observed data into one file
            for i in range(len(args.observed_configs)):
                merge_prior_files([w.prior_path for w in workers[i]],
                        observed_paths[i])
                lc = line_count(observed_paths[i], ignore_headers=True)
                if lc != args.reps:
                    if not args.keep_temps:
                        temp_fs.purge()
                    raise Exception('The number of observed simulations ({0}) '
                            'generated for observed config {1!r} and output to '
                            'file {2!r} does not match the number of reps '
                            '({3})'.format(lc, args.observed_configs[i],
                                observed_paths[i], args.reps))
        if not args.keep_temps:
            log.debug('purging observed temps...')
            observed_temp_fs.purge()

    ##########################################################################
    ## Begin ABC analyses

    if not args.dry_run:
        abc_team.run()

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
Example #4
0
def main_cli(argv=sys.argv):
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse_utils.SmartHelpFormatter)
    parser.add_argument(
        '-o',
        '--observed-configs',
        nargs='+',
        type=argparse_utils.arg_is_config,
        required=True,
        help=('One or more msBayes config files to be used to either '
              'calculate or simulate observed summary statistics. If '
              'used in combination with `-r` each config will be used to '
              'simulate pseudo-observed data. If analyzing real data, do '
              'not use the `-r` option, and the fasta files specified '
              'within the config must exist and contain the sequence '
              'data.'))
    parser.add_argument(
        '-p',
        '--prior-configs',
        nargs='+',
        type=argparse_utils.arg_is_path,
        required=True,
        help=('One or more config files to be used to generate prior '
              'samples. If more than one config is specified, they '
              'should be separated by spaces. '
              'This option can also be used to specify the path to a '
              'directory containing the prior samples and summary '
              'statistic means and standard deviations generated by a '
              'previous run using the `generate-samples-only` option. '
              'These files should be found in the directory '
              '`pymsbayes-output/prior-stats-summaries`. The'
              '`pymsbayes-output/model-key.txt` also needs to be present.'
              ' If specifying this directory, it should be the only '
              'argument (i.e., no other directories or config files can '
              'be provided).'))
    parser.add_argument(
        '-r',
        '--reps',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('This option has two effects. First, it signifies that '
              'the analysis will be simulation based (i.e., no real '
              'data will be used). Second, it specifies how many '
              'simulation replicates to perform (i.e., how many data '
              'sets to simulate and analyze).'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=1000000,
        help=('The number of prior samples to simulate for each prior '
              'config specified with `-p`.'))
    parser.add_argument(
        '--prior-batch-size',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=10000,
        help=('The number of prior samples to simulate for each batch.'))
    parser.add_argument(
        '--generate-samples-only',
        action='store_true',
        help=('Only generate samples from models as requested. I.e., '
              'No analyses are performed to approximate posteriors. '
              'This option can be useful if you want the prior samples '
              'for other purposes.'))
    parser.add_argument(
        '--num-posterior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=1000,
        help=('The number of posterior samples desired for each '
              'analysis. Default: 1000.'))
    parser.add_argument('--num-standardizing-samples',
                        action='store',
                        type=argparse_utils.arg_is_positive_int,
                        default=10000,
                        help=('The number of prior samples desired to use for '
                              'standardizing statistics. Default: 10000.'))
    parser.add_argument(
        '--np',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '--output-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the first observed '
              'config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '--staging-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage prior files. This option '
              'can be useful on clusters to speed up I/O while '
              'generating prior samples. You can designate a local temp '
              'directory on a compute node to avoid constant writing to '
              'a shared drive. The default is to use the `temp-dir`.'))
    parser.add_argument(
        '-s',
        '--stat-prefixes',
        nargs='*',
        type=str,
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi wattTheta pi.net tajD.denom`.'))
    parser.add_argument(
        '-b',
        '--bandwidth',
        action='store',
        type=float,
        help=('Smoothing parameter for the posterior kernal density '
              'estimation. This option is used for the `glm` '
              'regression method. The default is 2 / '
              '`num-posterior-samples`.'))
    parser.add_argument(
        '-q',
        '--num-posterior-quantiles',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=1000,
        help=('The number of equally spaced quantiles at which to '
              'evaluate the GLM-estimated posterior density. '
              'Default: 1000.'))
    parser.add_argument(
        '--reporting-frequency',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('Suggested frequency (in number of prior samples) for '
              'running regression and reporting current results. '
              'Default: 0 (only report final results). '
              'If a value is given, it may be adjusted so that the '
              'reporting frequency is a multiple of the multi-processed '
              'batch size.'))
    parser.add_argument('--sort-index',
                        action='store',
                        type=argparse_utils.arg_is_nonnegative_int,
                        default=0,
                        choices=range(12),
                        help=argparse_utils.get_sort_index_help_message())
    parser.add_argument(
        '--no-global-estimate',
        action='store_true',
        help=('If multiple prior models are specified, by default a '
              'global estimate is performed averaging over all models. '
              'This option prevents the global estimation (i.e., only '
              'inferences for each model are made).'))
    parser.add_argument('--compress',
                        action='store_true',
                        help='Compress large results files.')
    parser.add_argument('--keep-temps',
                        action='store_true',
                        help='Keep all temporary files.')
    parser.add_argument('--seed',
                        action='store',
                        type=int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument(
        '--output-prefix',
        action='store',
        type=str,
        default='',
        help=('Prefix to use at beginning of output files. The default '
              'is no prefix.'))
    parser.add_argument(
        '--data-key-path',
        action='store',
        type=argparse_utils.arg_is_file,
        help=('The path to a `data-key.txt` file generated by a previous '
              'run. This file should be found in the directory '
              '`pymsbayes-output/data-key.txt`. This option '
              'will override the `-o`/`--observed-configs` option, and '
              'is intended to be used in combination with the '
              '`--start-from` option to restart an analysis.'))
    parser.add_argument(
        '--start-from-simulation-index',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('The simulation index at which to begin analyses. Must be '
              'used in combination with either the number of simulation '
              'replicates (`-r`/`--reps`) or the `--data-key-path` '
              'option, and must be a positive '
              'integer that is less than the number of simulation '
              'replicates. This option can be useful if an analysis '
              'needs to be restarted.'))
    parser.add_argument(
        '--start-from-observed-index',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('The observed config index at which to begin analyses. '
              'Can be used in combination with the `--data-key-path` '
              'option to restart long-running, multi-observed-config '
              'analyses'))
    parser.add_argument('--dry-run',
                        action='store_true',
                        help='Do not run analyses; only process settings')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    if argv == sys.argv:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import (MsBayesWorker, merge_prior_files,
                                   ObsSumStatsWorker)
    from pymsbayes.teams import ABCTeam
    from pymsbayes.utils.functions import (is_file, is_dir, long_division,
                                           mk_new_dir)
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         DIV_MODEL_PATTERNS, MODEL_PATTERNS,
                                         PSI_PATTERNS, MEAN_TAU_PATTERNS,
                                         OMEGA_PATTERNS, CV_PATTERNS,
                                         line_count)
    from pymsbayes.utils import sumresults, errors
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import (GLOBAL_RNG, set_memory_trace,
                                 MSBAYES_SORT_INDEX, ToolPathManager)

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    if len(args.observed_configs) != len(set(args.observed_configs)):
        raise ValueError('All paths to observed config files must be unique')

    if args.num_standardizing_samples > args.num_prior_samples:
        args.num_standardizing_samples = args.num_prior_samples

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')
    eureject_path = ToolPathManager.get_tool_full_path('eureject')
    abctb_path = ToolPathManager.get_tool_full_path('ABCestimator')

    # vet prior-configs option
    using_previous_priors = False
    previous_prior_dir = None
    if (len(args.prior_configs) == 1) and (is_dir(args.prior_configs[0])):
        previous_prior_dir = args.prior_configs.pop(0)
        previous_priors = glob.glob(
            os.path.join(previous_prior_dir, '*-prior-sample.txt'))
        previous_sums = glob.glob(
            os.path.join(previous_prior_dir, '*-means-and-std-devs.txt'))
        if (not previous_priors) or (not previous_sums):
            raise ValueError(
                'directory {0!r} specified with `prior-configs` '
                'option does not contain necessary prior and summary '
                'files'.format(args.prior_configs[0]))
        using_previous_priors = True
    else:
        for path in args.prior_configs:
            if not is_file(path):
                raise ValueError(
                    'prior config {0!r} is not a file'.format(path))
    if len(args.prior_configs) != len(set(args.prior_configs)):
        raise ValueError('All paths to prior config files must be unique')
    if not args.output_dir:
        args.output_dir = os.path.dirname(args.observed_configs[0])
    base_dir = mk_new_dir(os.path.join(args.output_dir, 'pymsbayes-results'))
    if not args.temp_dir:
        args.temp_dir = base_dir
    info_path = os.path.join(base_dir, args.output_prefix + \
            'pymsbayes-info.txt')
    info = InfoLogger(info_path)
    info.write('[pymsbayes]'.format(base_dir))
    info.write('\tversion = {version}'.format(**_program_info))
    info.write('\toutput_directory = {0}'.format(base_dir))
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    base_temp_dir = temp_fs.base_dir
    info.write('\ttemp_directory = {0}'.format(base_temp_dir))
    info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()))
    info.write('\tsimulation_reps = {0}'.format(args.reps))
    stat_patterns = DEFAULT_STAT_PATTERNS
    if args.stat_prefixes:
        for i in range(len(args.stat_prefixes)):
            if not args.stat_prefixes[i].endswith('.'):
                args.stat_prefixes[i] += '.'
        stat_patterns = get_patterns_from_prefixes(args.stat_prefixes,
                                                   ignore_case=True)
    if not args.bandwidth:
        args.bandwidth = 2 / float(args.num_posterior_samples)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    if args.data_key_path:
        observed_map = sumresults.parse_data_key_file(args.data_key_path)
        observed_paths = [observed_map[k] for k in sorted(observed_map.keys())]
    else:
        observed_dir = mk_new_dir(
            os.path.join(base_dir, 'observed-summary-stats'))
        observed_paths = [os.path.join(observed_dir, args.output_prefix + \
            'observed-{0}.txt'.format(i+1)) for i in range(len(
                    args.observed_configs))]
    info.write('\tseed = {0}'.format(args.seed))
    info.write('\tnum_processors = {0}'.format(args.np))
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples))
    info.write('\tnum_standardizing_samples = {0}'.format(
        args.num_standardizing_samples))
    info.write('\tbandwidth = {0}'.format(args.bandwidth))
    info.write('\tposterior_quantiles = {0}'.format(
        args.num_posterior_quantiles))
    info.write('\tposterior_sample_size = {0}'.format(
        args.num_posterior_samples))
    info.write('\tstat_patterns = {0}'.format(', '.join(
        [p.pattern for p in stat_patterns])))

    # vet observed configs
    ref_config_path = args.observed_configs[0]
    ref_config = MsBayesConfig(ref_config_path)
    all_config_paths = []
    num_taxon_pairs = ref_config.npairs
    assert num_taxon_pairs > 0
    for config in args.observed_configs:
        all_config_paths.append(config)
        if not ref_config.equal_sample_table(config):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                'sample tables in config {0!r} and {1!r} differ; '
                'all sample tables must be the same.'.format(
                    ref_config_path, config))

    info.write('\tnum_taxon_pairs = {0}'.format(num_taxon_pairs))
    info.write('\tdry_run = {0}'.format(args.dry_run))
    info.write('\t[[tool_paths]]')
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path))
    info.write('\t\tmsbayes = {0}'.format(msbayes_path))
    info.write('\t\teureject = {0}'.format(eureject_path))
    info.write('\t\tabcestimator = {0}'.format(abctb_path))
    info.write('\t[[observed_configs]]')
    for i, cfg in enumerate(args.observed_configs):
        info.write('\t\t{0} = {1}'.format(
            i + 1, os.path.relpath(cfg, os.path.dirname(info_path))))

    abc_team = ABCTeam(
        temp_fs=temp_fs,
        observed_stats_files=observed_paths,
        num_taxon_pairs=num_taxon_pairs,
        config_paths=args.prior_configs,
        previous_prior_dir=previous_prior_dir,
        num_prior_samples=args.num_prior_samples,
        num_processors=args.np,
        num_standardizing_samples=args.num_standardizing_samples,
        num_posterior_samples=args.num_posterior_samples,
        num_posterior_density_quantiles=args.num_posterior_quantiles,
        batch_size=args.prior_batch_size,
        output_dir=base_dir,
        output_prefix=args.output_prefix,
        prior_temp_dir=args.staging_dir,
        rng=GLOBAL_RNG,
        report_parameters=True,
        stat_patterns=stat_patterns,
        eureject_exe_path=eureject_path,
        abctoolbox_exe_path=abctb_path,
        msbayes_exe_path=None,
        abctoolbox_bandwidth=args.bandwidth,
        omega_threshold=0.01,
        cv_threshold=0.01,
        compress=args.compress,
        reporting_frequency=args.reporting_frequency,
        keep_temps=args.keep_temps,
        global_estimate_only=False,
        global_estimate=not args.no_global_estimate,
        generate_prior_samples_only=args.generate_samples_only,
        start_from_simulation_index=args.start_from_simulation_index,
        start_from_observed_index=args.start_from_observed_index)

    models_to_configs = {}
    configs_to_models = {}
    for k, v in abc_team.models.iteritems():
        models_to_configs[k] = v
        configs_to_models[v] = k
        cfg = MsBayesConfig(v)
        all_config_paths.append(v)
        # vet prior configs
        if not ref_config.equal_sample_table(cfg):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                'sample tables in config {0!r} and {1!r} differ; '
                'all sample tables must be the same.'.format(
                    ref_config_path, v))

    info.write('\t[[observed_paths]]')
    for i in sorted(abc_team.observed_stats_paths.iterkeys()):
        info.write('\t\t{0} = {1}'.format(
            i,
            os.path.relpath(abc_team.observed_stats_paths[i],
                            os.path.dirname(info_path))))
    info.write('\t[[prior_configs]]')
    for i in sorted(abc_team.models.iterkeys()):
        info.write('\t\t{0} = {1}'.format(
            i, os.path.relpath(abc_team.models[i],
                               os.path.dirname(info_path))))

    ##########################################################################
    ## begin analysis --- get observed summary stats

    set_memory_trace()  # start logging memory profile
    start_time = datetime.datetime.now()

    if args.data_key_path:
        log.info('Using provided summary statitics...')
    elif not args.dry_run:
        obs_temp_dir = base_temp_dir
        if args.staging_dir:
            obs_temp_dir = args.staging_dir
        observed_temp_fs = TempFileSystem(parent=obs_temp_dir,
                                          prefix='observed-temps-')

        if args.reps < 1:
            log.info('Calculating summary statistics from sequence data...')
            obs_workers = []
            for i, cfg in enumerate(args.observed_configs):
                ss_worker = ObsSumStatsWorker(temp_fs=observed_temp_fs,
                                              config_path=cfg,
                                              output_path=observed_paths[i],
                                              schema='abctoolbox',
                                              stat_patterns=stat_patterns)
                obs_workers.append(ss_worker)

            obs_workers = Manager.run_workers(workers=obs_workers,
                                              num_processors=args.np)

            # re-vet all configs to see if some were changed by obsSumStats.pl
            new_ref_config = ref_config
            ref_modified = False
            # new ref because if all configs were updated all is good
            if not ref_config.equal_sample_table(ref_config_path):
                ref_modified = True
                new_ref_config = MsBayesConfig(ref_config_path)
                log.warning("""
The alignment lengths in config
{0!r}
have been corrected for sites with *any* ambiguous bases and/or gaps by
obsSumStats.pl.
                    """.format(ref_config_path))
            for config in all_config_paths:
                if not new_ref_config.equal_sample_table(config):
                    corrected_config = config
                    if ref_modified:
                        corrected_config = ref_config_path
                    if not args.keep_temps:
                        observed_temp_fs.purge()
                        temp_fs.purge()
                    raise errors.SampleTableError("""
The sample tables in configs
{0!r}
and
{1!r}
differ because obsSumStats.pl modified alignment lengths in config
{2!r}
to correct for sites in the alignments with *any* ambiguous bases and/or gaps.
Please make sure the sample tables in all configs will be the same after
correcting alignment lengths for sites that contain *any* ambiguous bases
and/or gaps. You can do this by copying and pasting the sample table in
{2!r}
that has been corrected by obsSumStats.pl into the other configs that were not
corrected.
                        """.format(ref_config_path, config, corrected_config))

        else:
            log.info('Simulating summary statistics from observed configs...')
            num_observed_workers = min([args.reps, args.np])
            if args.reps <= args.np:
                observed_batch_size = 1
                remainder = 0
            else:
                observed_batch_size, remainder = long_division(
                    args.reps, args.np)
            msbayes_workers = []
            for idx, cfg in enumerate(args.observed_configs):
                observed_model_idx = configs_to_models.get(cfg, None)
                schema = 'abctoolbox'
                for i in range(num_observed_workers):
                    worker = MsBayesWorker(temp_fs=observed_temp_fs,
                                           sample_size=observed_batch_size,
                                           config_path=cfg,
                                           model_index=observed_model_idx,
                                           report_parameters=True,
                                           schema=schema,
                                           include_header=True,
                                           stat_patterns=stat_patterns,
                                           write_stats_file=False,
                                           staging_dir=None,
                                           tag=idx)
                    msbayes_workers.append(worker)
                if remainder > 0:
                    worker = MsBayesWorker(temp_fs=observed_temp_fs,
                                           sample_size=remainder,
                                           config_path=cfg,
                                           model_index=observed_model_idx,
                                           report_parameters=True,
                                           schema=schema,
                                           include_header=True,
                                           stat_patterns=stat_patterns,
                                           write_stats_file=False,
                                           staging_dir=None,
                                           tag=idx)
                    msbayes_workers.append(worker)

            # run parallel msbayes processes
            msbayes_workers = Manager.run_workers(workers=msbayes_workers,
                                                  num_processors=args.np)

            workers = dict(
                zip(range(len(args.observed_configs)),
                    [[] for i in range(len(args.observed_configs))]))
            for w in msbayes_workers:
                workers[w.tag].append(w)

            # merge simulated observed data into one file
            for i in range(len(args.observed_configs)):
                merge_prior_files([w.prior_path for w in workers[i]],
                                  observed_paths[i])
                lc = line_count(observed_paths[i], ignore_headers=True)
                if lc != args.reps:
                    if not args.keep_temps:
                        temp_fs.purge()
                    raise Exception(
                        'The number of observed simulations ({0}) '
                        'generated for observed config {1!r} and output to '
                        'file {2!r} does not match the number of reps '
                        '({3})'.format(lc, args.observed_configs[i],
                                       observed_paths[i], args.reps))
        if not args.keep_temps:
            log.debug('purging observed temps...')
            observed_temp_fs.purge()

    ##########################################################################
    ## Begin ABC analyses

    if not args.dry_run:
        abc_team.run()

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()