def arg_is_file(path):
    """Argparse ``type`` callback that accepts only paths to existing files.

    Args:
        path: The raw command-line value to validate.

    Returns:
        The result of ``expand_path(path)`` when ``is_file(path)`` is true.

    Raises:
        argparse.ArgumentTypeError: If ``is_file(path)`` is false or raises
            an ordinary exception while checking the path.
    """
    try:
        valid = is_file(path)
    except Exception:
        # Any failure while probing the path is reported as "not a file".
        # Unlike a bare ``except:``, this lets KeyboardInterrupt and
        # SystemExit propagate.
        valid = False
    if not valid:
        msg = '{0!r} is not a file'.format(path)
        raise argparse.ArgumentTypeError(msg)
    return expand_path(path)
def test_is_file(self):
    # is_file must be falsy for None and for self.bogus_file ...
    for not_a_file in (None, self.bogus_file):
        self.assertFalse(functions.is_file(not_a_file))
    # ... and truthy for self.file.
    self.assertTrue(functions.is_file(self.file))
def main_cli(argv=sys.argv):
    """Command-line entry point: parse options, vet configs, and run ABCTeam.

    The function proceeds in four stages:

    1. Build the argument parser and parse ``argv``.
    2. Vet the options (unique config paths, prior configs vs. a directory
       of previous priors, matching sample tables) and record the settings
       through an ``InfoLogger`` at ``<output-dir>/pymsbayes-results/
       <output-prefix>pymsbayes-info.txt``.
    3. Unless ``--data-key-path`` or ``--dry-run`` is given, produce the
       observed summary statistics: calculated by ``ObsSumStatsWorker``
       when ``--reps`` is less than 1, otherwise simulated by
       ``MsBayesWorker`` and merged into one file per observed config.
    4. Unless ``--dry-run`` is given, call ``ABCTeam.run()``; then write the
       run timing and purge temporary files (unless ``--keep-temps``).

    Args:
        argv: Argument list. When it compares equal to ``sys.argv`` argparse
            reads ``sys.argv`` itself; any other value is handed to
            ``parse_args`` as-is, so it must not include the program name.

    Raises:
        ValueError: If observed or prior config paths are not unique, a
            prior config is not a file, or a previous-prior directory lacks
            the expected prior-sample / means-and-std-devs files.
        errors.SampleTableError: If sample tables differ among configs.
        Exception: If the number of simulated observed data sets written
            for a config does not equal ``--reps``.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
            description=description,
            formatter_class=argparse_utils.SmartHelpFormatter)
    parser.add_argument('-o', '--observed-configs',
            nargs='+',
            type=argparse_utils.arg_is_config,
            required=True,
            help=('One or more msBayes config files to be used to either '
                  'calculate or simulate observed summary statistics. If '
                  'used in combination with `-r` each config will be used to '
                  'simulate pseudo-observed data. If analyzing real data, do '
                  'not use the `-r` option, and the fasta files specified '
                  'within the config must exist and contain the sequence '
                  'data.'))
    parser.add_argument('-p', '--prior-configs',
            nargs='+',
            type=argparse_utils.arg_is_path,
            required=True,
            help=('One or more config files to be used to generate prior '
                  'samples. If more than one config is specified, they '
                  'should be separated by spaces. '
                  'This option can also be used to specify the path to a '
                  'directory containing the prior samples and summary '
                  'statistic means and standard deviations generated by a '
                  'previous run using the `generate-samples-only` option. '
                  'These files should be found in the directory '
                  '`pymsbayes-output/prior-stats-summaries`. The '
                  '`pymsbayes-output/model-key.txt` also needs to be present.'
                  ' If specifying this directory, it should be the only '
                  'argument (i.e., no other directories or config files can '
                  'be provided).'))
    parser.add_argument('-r', '--reps',
            action='store',
            type=argparse_utils.arg_is_nonnegative_int,
            default=0,
            help=('This option has two effects. First, it signifies that '
                  'the analysis will be simulation based (i.e., no real '
                  'data will be used). Second, it specifies how many '
                  'simulation replicates to perform (i.e., how many data '
                  'sets to simulate and analyze).'))
    parser.add_argument('-n', '--num-prior-samples',
            action='store',
            type=argparse_utils.arg_is_positive_int,
            default=1000000,
            help=('The number of prior samples to simulate for each prior '
                  'config specified with `-p`.'))
    parser.add_argument('--prior-batch-size',
            action='store',
            type=argparse_utils.arg_is_positive_int,
            default=10000,
            help=('The number of prior samples to simulate for each batch.'))
    parser.add_argument('--generate-samples-only',
            action='store_true',
            help=('Only generate samples from models as requested. I.e., '
                  'No analyses are performed to approximate posteriors. '
                  'This option can be useful if you want the prior samples '
                  'for other purposes.'))
    parser.add_argument('--num-posterior-samples',
            action='store',
            type=argparse_utils.arg_is_positive_int,
            default=1000,
            help=('The number of posterior samples desired for each '
                  'analysis. Default: 1000.'))
    parser.add_argument('--num-standardizing-samples',
            action='store',
            type=argparse_utils.arg_is_positive_int,
            default=10000,
            help=('The number of prior samples desired to use for '
                  'standardizing statistics. Default: 10000.'))
    parser.add_argument('--np',
            action='store',
            type=argparse_utils.arg_is_positive_int,
            default=multiprocessing.cpu_count(),
            help=('The maximum number of processes to run in parallel. The '
                  'default is the number of CPUs available on the machine.'))
    parser.add_argument('--output-dir',
            action='store',
            type=argparse_utils.arg_is_dir,
            help=('The directory in which all output files will be written. '
                  'The default is to use the directory of the first observed '
                  'config file.'))
    parser.add_argument('--temp-dir',
            action='store',
            type=argparse_utils.arg_is_dir,
            help=('A directory to temporarily stage files. The default is to '
                  'use the output directory.'))
    parser.add_argument('--staging-dir',
            action='store',
            type=argparse_utils.arg_is_dir,
            help=('A directory to temporarily stage prior files. This option '
                  'can be useful on clusters to speed up I/O while '
                  'generating prior samples. You can designate a local temp '
                  'directory on a compute node to avoid constant writing to '
                  'a shared drive. The default is to use the `temp-dir`.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs='*',
            type=str,
            help=('Prefixes of summary statistics to use in the analyses. '
                  'The prefixes should be separated by spaces. '
                  'Default: `-s pi wattTheta pi.net tajD.denom`.'))
    parser.add_argument('-b', '--bandwidth',
            action='store',
            type=float,
            help=('Smoothing parameter for the posterior kernal density '
                  'estimation. This option is used for the `glm` '
                  'regression method. The default is 2 / '
                  '`num-posterior-samples`.'))
    parser.add_argument('-q', '--num-posterior-quantiles',
            action='store',
            type=argparse_utils.arg_is_positive_int,
            default=1000,
            help=('The number of equally spaced quantiles at which to '
                  'evaluate the GLM-estimated posterior density. '
                  'Default: 1000.'))
    parser.add_argument('--reporting-frequency',
            action='store',
            type=argparse_utils.arg_is_nonnegative_int,
            default=0,
            help=('Suggested frequency (in number of prior samples) for '
                  'running regression and reporting current results. '
                  'Default: 0 (only report final results). '
                  'If a value is given, it may be adjusted so that the '
                  'reporting frequency is a multiple of the multi-processed '
                  'batch size.'))
    parser.add_argument('--sort-index',
            action='store',
            type=argparse_utils.arg_is_nonnegative_int,
            default=0,
            choices=range(12),
            help=argparse_utils.get_sort_index_help_message())
    parser.add_argument('--no-global-estimate',
            action='store_true',
            help=('If multiple prior models are specified, by default a '
                  'global estimate is performed averaging over all models. '
                  'This option prevents the global estimation (i.e., only '
                  'inferences for each model are made).'))
    parser.add_argument('--compress',
            action='store_true',
            help='Compress large results files.')
    parser.add_argument('--keep-temps',
            action='store_true',
            help='Keep all temporary files.')
    parser.add_argument('--seed',
            action='store',
            type=int,
            help='Random number seed to use for the analysis.')
    parser.add_argument('--output-prefix',
            action='store',
            type=str,
            default='',
            help=('Prefix to use at beginning of output files. The default '
                  'is no prefix.'))
    parser.add_argument('--data-key-path',
            action='store',
            type=argparse_utils.arg_is_file,
            help=('The path to a `data-key.txt` file generated by a previous '
                  'run. This file should be found in the directory '
                  '`pymsbayes-output/data-key.txt`. This option '
                  'will override the `-o`/`--observed-configs` option, and '
                  'is intended to be used in combination with the '
                  '`--start-from` option to restart an analysis.'))
    parser.add_argument('--start-from-simulation-index',
            action='store',
            type=argparse_utils.arg_is_nonnegative_int,
            default=0,
            help=('The simulation index at which to begin analyses. Must be '
                  'used in combination with either the number of simulation '
                  'replicates (`-r`/`--reps`) or the `--data-key-path` '
                  'option, and must be a positive '
                  'integer that is less than the number of simulation '
                  'replicates. This option can be useful if an analysis '
                  'needs to be restarted.'))
    parser.add_argument('--start-from-observed-index',
            action='store',
            type=argparse_utils.arg_is_nonnegative_int,
            default=0,
            help=('The observed config index at which to begin analyses. '
                  'Can be used in combination with the `--data-key-path` '
                  'option to restart long-running, multi-observed-config '
                  'analyses'))
    parser.add_argument('--dry-run',
            action='store_true',
            help='Do not run analyses; only process settings')
    parser.add_argument('--version',
            action='version',
            version='%(prog)s ' + _program_info['version'],
            help='Report version and exit.')
    parser.add_argument('--quiet',
            action='store_true',
            help='Run without verbose messaging.')
    parser.add_argument('--debug',
            action='store_true',
            help='Run in debugging mode.')

    # With the default argument, let argparse read sys.argv itself (it skips
    # the program name); an explicit list is parsed exactly as given.
    if argv == sys.argv:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    ##########################################################################
    ## handle args

    # Project imports are done here, after parsing, and logging levels are
    # set before the remaining pymsbayes modules are imported.
    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import (MsBayesWorker, merge_prior_files,
            ObsSumStatsWorker)
    from pymsbayes.teams import ABCTeam
    from pymsbayes.utils.functions import (is_file, is_dir, long_division,
            mk_new_dir)
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, DIV_MODEL_PATTERNS, MODEL_PATTERNS,
            PSI_PATTERNS, MEAN_TAU_PATTERNS, OMEGA_PATTERNS, CV_PATTERNS,
            line_count)
    from pymsbayes.utils import sumresults, errors
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import (GLOBAL_RNG, set_memory_trace,
            MSBAYES_SORT_INDEX, ToolPathManager)

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    if len(args.observed_configs) != len(set(args.observed_configs)):
        raise ValueError('All paths to observed config files must be unique')

    # Never standardize with more samples than will be simulated.
    if args.num_standardizing_samples > args.num_prior_samples:
        args.num_standardizing_samples = args.num_prior_samples

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')
    eureject_path = ToolPathManager.get_tool_full_path('eureject')
    abctb_path = ToolPathManager.get_tool_full_path('ABCestimator')

    # vet prior-configs option
    using_previous_priors = False
    previous_prior_dir = None
    if (len(args.prior_configs) == 1) and (is_dir(args.prior_configs[0])):
        # A single directory argument means "reuse priors from a previous
        # run"; popping it leaves args.prior_configs empty.
        previous_prior_dir = args.prior_configs.pop(0)
        previous_priors = glob.glob(os.path.join(previous_prior_dir,
                '*-prior-sample.txt'))
        previous_sums = glob.glob(os.path.join(previous_prior_dir,
                '*-means-and-std-devs.txt'))
        if (not previous_priors) or (not previous_sums):
            # Report the popped directory; args.prior_configs is empty at
            # this point, so indexing it would raise IndexError instead.
            raise ValueError('directory {0!r} specified with `prior-configs` '
                    'option does not contain necessary prior and summary '
                    'files'.format(previous_prior_dir))
        using_previous_priors = True
    else:
        for path in args.prior_configs:
            if not is_file(path):
                raise ValueError('prior config {0!r} is not a file'.format(
                        path))
    if len(args.prior_configs) != len(set(args.prior_configs)):
        raise ValueError('All paths to prior config files must be unique')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.observed_configs[0])
    base_dir = mk_new_dir(os.path.join(args.output_dir, 'pymsbayes-results'))
    if not args.temp_dir:
        args.temp_dir = base_dir
    info_path = os.path.join(base_dir,
            args.output_prefix + 'pymsbayes-info.txt')
    info = InfoLogger(info_path)
    info.write('[pymsbayes]')
    info.write('\tversion = {version}'.format(**_program_info))
    info.write('\toutput_directory = {0}'.format(base_dir))
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    base_temp_dir = temp_fs.base_dir
    info.write('\ttemp_directory = {0}'.format(base_temp_dir))
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()))
    info.write('\tsimulation_reps = {0}'.format(args.reps))
    stat_patterns = DEFAULT_STAT_PATTERNS
    if args.stat_prefixes:
        # Make sure every prefix ends with '.' before building patterns.
        for i in range(len(args.stat_prefixes)):
            if not args.stat_prefixes[i].endswith('.'):
                args.stat_prefixes[i] += '.'
        stat_patterns = get_patterns_from_prefixes(
                args.stat_prefixes,
                ignore_case=True)
    if not args.bandwidth:
        args.bandwidth = 2 / float(args.num_posterior_samples)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    if args.data_key_path:
        # Reuse the observed stats files listed in the data key, in key order.
        observed_map = sumresults.parse_data_key_file(args.data_key_path)
        observed_paths = [observed_map[k] for k in sorted(observed_map.keys())]
    else:
        observed_dir = mk_new_dir(os.path.join(base_dir,
                'observed-summary-stats'))
        observed_paths = [os.path.join(observed_dir,
                args.output_prefix + 'observed-{0}.txt'.format(i + 1))
                for i in range(len(args.observed_configs))]
    info.write('\tseed = {0}'.format(args.seed))
    info.write('\tnum_processors = {0}'.format(args.np))
    info.write('\tnum_prior_samples = {0}'.format(
            args.num_prior_samples))
    info.write('\tnum_standardizing_samples = {0}'.format(
            args.num_standardizing_samples))
    info.write('\tbandwidth = {0}'.format(args.bandwidth))
    info.write('\tposterior_quantiles = {0}'.format(
            args.num_posterior_quantiles))
    info.write('\tposterior_sample_size = {0}'.format(
            args.num_posterior_samples))
    info.write('\tstat_patterns = {0}'.format(
            ', '.join([p.pattern for p in stat_patterns])))

    # vet observed configs
    # The first observed config is the reference every other config's
    # sample table is compared against.
    ref_config_path = args.observed_configs[0]
    ref_config = MsBayesConfig(ref_config_path)
    all_config_paths = []
    num_taxon_pairs = ref_config.npairs
    assert num_taxon_pairs > 0
    for config in args.observed_configs:
        all_config_paths.append(config)
        if not ref_config.equal_sample_table(config):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                    'sample tables in config {0!r} and {1!r} differ; '
                    'all sample tables must be the same.'.format(
                            ref_config_path, config))

    info.write('\tnum_taxon_pairs = {0}'.format(num_taxon_pairs))
    info.write('\tdry_run = {0}'.format(args.dry_run))
    info.write('\t[[tool_paths]]')
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path))
    info.write('\t\tmsbayes = {0}'.format(msbayes_path))
    info.write('\t\teureject = {0}'.format(eureject_path))
    info.write('\t\tabcestimator = {0}'.format(abctb_path))
    info.write('\t[[observed_configs]]')
    for i, cfg in enumerate(args.observed_configs):
        info.write('\t\t{0} = {1}'.format(i + 1, os.path.relpath(cfg,
                os.path.dirname(info_path))))

    abc_team = ABCTeam(
            temp_fs=temp_fs,
            observed_stats_files=observed_paths,
            num_taxon_pairs=num_taxon_pairs,
            config_paths=args.prior_configs,
            previous_prior_dir=previous_prior_dir,
            num_prior_samples=args.num_prior_samples,
            num_processors=args.np,
            num_standardizing_samples=args.num_standardizing_samples,
            num_posterior_samples=args.num_posterior_samples,
            num_posterior_density_quantiles=args.num_posterior_quantiles,
            batch_size=args.prior_batch_size,
            output_dir=base_dir,
            output_prefix=args.output_prefix,
            prior_temp_dir=args.staging_dir,
            rng=GLOBAL_RNG,
            report_parameters=True,
            stat_patterns=stat_patterns,
            eureject_exe_path=eureject_path,
            abctoolbox_exe_path=abctb_path,
            msbayes_exe_path=None,
            abctoolbox_bandwidth=args.bandwidth,
            omega_threshold=0.01,
            cv_threshold=0.01,
            compress=args.compress,
            reporting_frequency=args.reporting_frequency,
            keep_temps=args.keep_temps,
            global_estimate_only=False,
            global_estimate=not args.no_global_estimate,
            generate_prior_samples_only=args.generate_samples_only,
            start_from_simulation_index=args.start_from_simulation_index,
            start_from_observed_index=args.start_from_observed_index)

    models_to_configs = {}
    configs_to_models = {}
    for k, v in abc_team.models.iteritems():
        models_to_configs[k] = v
        configs_to_models[v] = k
        cfg = MsBayesConfig(v)
        all_config_paths.append(v)
        # vet prior configs
        if not ref_config.equal_sample_table(cfg):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                    'sample tables in config {0!r} and {1!r} differ; '
                    'all sample tables must be the same.'.format(
                            ref_config_path, v))

    info.write('\t[[observed_paths]]')
    for i in sorted(abc_team.observed_stats_paths.iterkeys()):
        info.write('\t\t{0} = {1}'.format(i, os.path.relpath(
                abc_team.observed_stats_paths[i],
                os.path.dirname(info_path))))
    info.write('\t[[prior_configs]]')
    for i in sorted(abc_team.models.iterkeys()):
        info.write('\t\t{0} = {1}'.format(i, os.path.relpath(
                abc_team.models[i],
                os.path.dirname(info_path))))

    ##########################################################################
    ## begin analysis --- get observed summary stats

    set_memory_trace()  # start logging memory profile
    start_time = datetime.datetime.now()

    if args.data_key_path:
        log.info('Using provided summary statitics...')
    elif not args.dry_run:
        # Observed-data temp files go under the staging dir when one is
        # given, otherwise under the main temp directory.
        obs_temp_dir = base_temp_dir
        if args.staging_dir:
            obs_temp_dir = args.staging_dir
        observed_temp_fs = TempFileSystem(parent=obs_temp_dir,
                prefix='observed-temps-')

        if args.reps < 1:
            log.info('Calculating summary statistics from sequence data...')
            obs_workers = []
            for i, cfg in enumerate(args.observed_configs):
                ss_worker = ObsSumStatsWorker(
                        temp_fs=observed_temp_fs,
                        config_path=cfg,
                        output_path=observed_paths[i],
                        schema='abctoolbox',
                        stat_patterns=stat_patterns)
                obs_workers.append(ss_worker)

            obs_workers = Manager.run_workers(
                    workers=obs_workers,
                    num_processors=args.np)

            # re-vet all configs to see if some were changed by obsSumStats.pl
            new_ref_config = ref_config
            ref_modified = False
            # new ref because if all configs were updated all is good
            if not ref_config.equal_sample_table(ref_config_path):
                ref_modified = True
                new_ref_config = MsBayesConfig(ref_config_path)
                log.warning("""
The alignment lengths in config
{0!r}
have been corrected for sites with *any* ambiguous bases and/or gaps by
obsSumStats.pl.
                    """.format(ref_config_path))
            for config in all_config_paths:
                if not new_ref_config.equal_sample_table(config):
                    corrected_config = config
                    if ref_modified:
                        corrected_config = ref_config_path
                    if not args.keep_temps:
                        observed_temp_fs.purge()
                        temp_fs.purge()
                    raise errors.SampleTableError("""
The sample tables in configs
{0!r}
and
{1!r}
differ because obsSumStats.pl modified alignment lengths in config
{2!r}
to correct for sites in the alignments with *any* ambiguous bases and/or gaps.
Please make sure the sample tables in all configs will be the same after
correcting alignment lengths for sites that contain *any* ambiguous bases
and/or gaps. You can do this by copying and pasting the sample table in
{2!r}
that has been corrected by obsSumStats.pl into the other configs that were not
corrected.
                        """.format(ref_config_path, config, corrected_config))
        else:
            log.info('Simulating summary statistics from observed configs...')
            # Split the reps across at most `np` workers per config; any
            # remainder gets one extra worker per config.
            num_observed_workers = min([args.reps, args.np])
            if args.reps <= args.np:
                observed_batch_size = 1
                remainder = 0
            else:
                observed_batch_size, remainder = long_division(args.reps,
                        args.np)
            msbayes_workers = []
            for idx, cfg in enumerate(args.observed_configs):
                observed_model_idx = configs_to_models.get(cfg, None)
                schema = 'abctoolbox'
                for i in range(num_observed_workers):
                    worker = MsBayesWorker(
                            temp_fs=observed_temp_fs,
                            sample_size=observed_batch_size,
                            config_path=cfg,
                            model_index=observed_model_idx,
                            report_parameters=True,
                            schema=schema,
                            include_header=True,
                            stat_patterns=stat_patterns,
                            write_stats_file=False,
                            staging_dir=None,
                            tag=idx)
                    msbayes_workers.append(worker)
                if remainder > 0:
                    worker = MsBayesWorker(
                            temp_fs=observed_temp_fs,
                            sample_size=remainder,
                            config_path=cfg,
                            model_index=observed_model_idx,
                            report_parameters=True,
                            schema=schema,
                            include_header=True,
                            stat_patterns=stat_patterns,
                            write_stats_file=False,
                            staging_dir=None,
                            tag=idx)
                    msbayes_workers.append(worker)

            # run parallel msbayes processes
            msbayes_workers = Manager.run_workers(
                    workers=msbayes_workers,
                    num_processors=args.np)

            # Group finished workers by the observed-config index stored in
            # their tag.
            workers = dict(zip(range(len(args.observed_configs)),
                    [[] for i in range(len(args.observed_configs))]))
            for w in msbayes_workers:
                workers[w.tag].append(w)

            # merge simulated observed data into one file
            for i in range(len(args.observed_configs)):
                merge_prior_files([w.prior_path for w in workers[i]],
                        observed_paths[i])
                lc = line_count(observed_paths[i], ignore_headers=True)
                if lc != args.reps:
                    if not args.keep_temps:
                        # Purge both temp file systems, as the sample-table
                        # error path above does; the observed one may live
                        # under a separate staging directory.
                        observed_temp_fs.purge()
                        temp_fs.purge()
                    raise Exception('The number of observed simulations ({0}) '
                            'generated for observed config {1!r} and output to '
                            'file {2!r} does not match the number of reps '
                            '({3})'.format(lc, args.observed_configs[i],
                                    observed_paths[i], args.reps))
        if not args.keep_temps:
            log.debug('purging observed temps...')
            observed_temp_fs.purge()

    ##########################################################################
    ## Begin ABC analyses

    if not args.dry_run:
        abc_team.run()

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
def main_cli(argv=sys.argv): description = '{name} {version}'.format(**_program_info) parser = argparse.ArgumentParser( description=description, formatter_class=argparse_utils.SmartHelpFormatter) parser.add_argument( '-o', '--observed-configs', nargs='+', type=argparse_utils.arg_is_config, required=True, help=('One or more msBayes config files to be used to either ' 'calculate or simulate observed summary statistics. If ' 'used in combination with `-r` each config will be used to ' 'simulate pseudo-observed data. If analyzing real data, do ' 'not use the `-r` option, and the fasta files specified ' 'within the config must exist and contain the sequence ' 'data.')) parser.add_argument( '-p', '--prior-configs', nargs='+', type=argparse_utils.arg_is_path, required=True, help=('One or more config files to be used to generate prior ' 'samples. If more than one config is specified, they ' 'should be separated by spaces. ' 'This option can also be used to specify the path to a ' 'directory containing the prior samples and summary ' 'statistic means and standard deviations generated by a ' 'previous run using the `generate-samples-only` option. ' 'These files should be found in the directory ' '`pymsbayes-output/prior-stats-summaries`. The' '`pymsbayes-output/model-key.txt` also needs to be present.' ' If specifying this directory, it should be the only ' 'argument (i.e., no other directories or config files can ' 'be provided).')) parser.add_argument( '-r', '--reps', action='store', type=argparse_utils.arg_is_nonnegative_int, default=0, help=('This option has two effects. First, it signifies that ' 'the analysis will be simulation based (i.e., no real ' 'data will be used). 
Second, it specifies how many ' 'simulation replicates to perform (i.e., how many data ' 'sets to simulate and analyze).')) parser.add_argument( '-n', '--num-prior-samples', action='store', type=argparse_utils.arg_is_positive_int, default=1000000, help=('The number of prior samples to simulate for each prior ' 'config specified with `-p`.')) parser.add_argument( '--prior-batch-size', action='store', type=argparse_utils.arg_is_positive_int, default=10000, help=('The number of prior samples to simulate for each batch.')) parser.add_argument( '--generate-samples-only', action='store_true', help=('Only generate samples from models as requested. I.e., ' 'No analyses are performed to approximate posteriors. ' 'This option can be useful if you want the prior samples ' 'for other purposes.')) parser.add_argument( '--num-posterior-samples', action='store', type=argparse_utils.arg_is_positive_int, default=1000, help=('The number of posterior samples desired for each ' 'analysis. Default: 1000.')) parser.add_argument('--num-standardizing-samples', action='store', type=argparse_utils.arg_is_positive_int, default=10000, help=('The number of prior samples desired to use for ' 'standardizing statistics. Default: 10000.')) parser.add_argument( '--np', action='store', type=argparse_utils.arg_is_positive_int, default=multiprocessing.cpu_count(), help=('The maximum number of processes to run in parallel. The ' 'default is the number of CPUs available on the machine.')) parser.add_argument( '--output-dir', action='store', type=argparse_utils.arg_is_dir, help=('The directory in which all output files will be written. ' 'The default is to use the directory of the first observed ' 'config file.')) parser.add_argument( '--temp-dir', action='store', type=argparse_utils.arg_is_dir, help=('A directory to temporarily stage files. 
The default is to ' 'use the output directory.')) parser.add_argument( '--staging-dir', action='store', type=argparse_utils.arg_is_dir, help=('A directory to temporarily stage prior files. This option ' 'can be useful on clusters to speed up I/O while ' 'generating prior samples. You can designate a local temp ' 'directory on a compute node to avoid constant writing to ' 'a shared drive. The default is to use the `temp-dir`.')) parser.add_argument( '-s', '--stat-prefixes', nargs='*', type=str, help=('Prefixes of summary statistics to use in the analyses. ' 'The prefixes should be separated by spaces. ' 'Default: `-s pi wattTheta pi.net tajD.denom`.')) parser.add_argument( '-b', '--bandwidth', action='store', type=float, help=('Smoothing parameter for the posterior kernal density ' 'estimation. This option is used for the `glm` ' 'regression method. The default is 2 / ' '`num-posterior-samples`.')) parser.add_argument( '-q', '--num-posterior-quantiles', action='store', type=argparse_utils.arg_is_positive_int, default=1000, help=('The number of equally spaced quantiles at which to ' 'evaluate the GLM-estimated posterior density. ' 'Default: 1000.')) parser.add_argument( '--reporting-frequency', action='store', type=argparse_utils.arg_is_nonnegative_int, default=0, help=('Suggested frequency (in number of prior samples) for ' 'running regression and reporting current results. ' 'Default: 0 (only report final results). ' 'If a value is given, it may be adjusted so that the ' 'reporting frequency is a multiple of the multi-processed ' 'batch size.')) parser.add_argument('--sort-index', action='store', type=argparse_utils.arg_is_nonnegative_int, default=0, choices=range(12), help=argparse_utils.get_sort_index_help_message()) parser.add_argument( '--no-global-estimate', action='store_true', help=('If multiple prior models are specified, by default a ' 'global estimate is performed averaging over all models. 
' 'This option prevents the global estimation (i.e., only ' 'inferences for each model are made).')) parser.add_argument('--compress', action='store_true', help='Compress large results files.') parser.add_argument('--keep-temps', action='store_true', help='Keep all temporary files.') parser.add_argument('--seed', action='store', type=int, help='Random number seed to use for the analysis.') parser.add_argument( '--output-prefix', action='store', type=str, default='', help=('Prefix to use at beginning of output files. The default ' 'is no prefix.')) parser.add_argument( '--data-key-path', action='store', type=argparse_utils.arg_is_file, help=('The path to a `data-key.txt` file generated by a previous ' 'run. This file should be found in the directory ' '`pymsbayes-output/data-key.txt`. This option ' 'will override the `-o`/`--observed-configs` option, and ' 'is intended to be used in combination with the ' '`--start-from` option to restart an analysis.')) parser.add_argument( '--start-from-simulation-index', action='store', type=argparse_utils.arg_is_nonnegative_int, default=0, help=('The simulation index at which to begin analyses. Must be ' 'used in combination with either the number of simulation ' 'replicates (`-r`/`--reps`) or the `--data-key-path` ' 'option, and must be a positive ' 'integer that is less than the number of simulation ' 'replicates. This option can be useful if an analysis ' 'needs to be restarted.')) parser.add_argument( '--start-from-observed-index', action='store', type=argparse_utils.arg_is_nonnegative_int, default=0, help=('The observed config index at which to begin analyses. 
' 'Can be used in combination with the `--data-key-path` ' 'option to restart long-running, multi-observed-config ' 'analyses')) parser.add_argument('--dry-run', action='store_true', help='Do not run analyses; only process settings') parser.add_argument('--version', action='version', version='%(prog)s ' + _program_info['version'], help='Report version and exit.') parser.add_argument('--quiet', action='store_true', help='Run without verbose messaging.') parser.add_argument('--debug', action='store_true', help='Run in debugging mode.') if argv == sys.argv: args = parser.parse_args() else: args = parser.parse_args(argv) ########################################################################## ## handle args from pymsbayes.utils.messaging import (LoggingControl, InfoLogger) LoggingControl.set_logging_level("INFO") if args.quiet: LoggingControl.set_logging_level("WARNING") if args.debug: LoggingControl.set_logging_level("DEBUG") log = LoggingControl.get_logger(__name__) from pymsbayes.workers import (MsBayesWorker, merge_prior_files, ObsSumStatsWorker) from pymsbayes.teams import ABCTeam from pymsbayes.utils.functions import (is_file, is_dir, long_division, mk_new_dir) from pymsbayes.utils.parsing import (get_patterns_from_prefixes, DEFAULT_STAT_PATTERNS, DIV_MODEL_PATTERNS, MODEL_PATTERNS, PSI_PATTERNS, MEAN_TAU_PATTERNS, OMEGA_PATTERNS, CV_PATTERNS, line_count) from pymsbayes.utils import sumresults, errors from pymsbayes.manager import Manager from pymsbayes.utils.tempfs import TempFileSystem from pymsbayes.config import MsBayesConfig from pymsbayes.utils import (GLOBAL_RNG, set_memory_trace, MSBAYES_SORT_INDEX, ToolPathManager) MSBAYES_SORT_INDEX.set_index(args.sort_index) if len(args.observed_configs) != len(set(args.observed_configs)): raise ValueError('All paths to observed config files must be unique') if args.num_standardizing_samples > args.num_prior_samples: args.num_standardizing_samples = args.num_prior_samples # get full paths to tools msbayes_path = 
ToolPathManager.get_tool_full_path('msbayes.pl') dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl') eureject_path = ToolPathManager.get_tool_full_path('eureject') abctb_path = ToolPathManager.get_tool_full_path('ABCestimator') # vet prior-configs option using_previous_priors = False previous_prior_dir = None if (len(args.prior_configs) == 1) and (is_dir(args.prior_configs[0])): previous_prior_dir = args.prior_configs.pop(0) previous_priors = glob.glob( os.path.join(previous_prior_dir, '*-prior-sample.txt')) previous_sums = glob.glob( os.path.join(previous_prior_dir, '*-means-and-std-devs.txt')) if (not previous_priors) or (not previous_sums): raise ValueError( 'directory {0!r} specified with `prior-configs` ' 'option does not contain necessary prior and summary ' 'files'.format(args.prior_configs[0])) using_previous_priors = True else: for path in args.prior_configs: if not is_file(path): raise ValueError( 'prior config {0!r} is not a file'.format(path)) if len(args.prior_configs) != len(set(args.prior_configs)): raise ValueError('All paths to prior config files must be unique') if not args.output_dir: args.output_dir = os.path.dirname(args.observed_configs[0]) base_dir = mk_new_dir(os.path.join(args.output_dir, 'pymsbayes-results')) if not args.temp_dir: args.temp_dir = base_dir info_path = os.path.join(base_dir, args.output_prefix + \ 'pymsbayes-info.txt') info = InfoLogger(info_path) info.write('[pymsbayes]'.format(base_dir)) info.write('\tversion = {version}'.format(**_program_info)) info.write('\toutput_directory = {0}'.format(base_dir)) temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-') base_temp_dir = temp_fs.base_dir info.write('\ttemp_directory = {0}'.format(base_temp_dir)) info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value())) info.write('\tsimulation_reps = {0}'.format(args.reps)) stat_patterns = DEFAULT_STAT_PATTERNS if args.stat_prefixes: for i in range(len(args.stat_prefixes)): if not 
args.stat_prefixes[i].endswith('.'): args.stat_prefixes[i] += '.' stat_patterns = get_patterns_from_prefixes(args.stat_prefixes, ignore_case=True) if not args.bandwidth: args.bandwidth = 2 / float(args.num_posterior_samples) if not args.seed: args.seed = random.randint(1, 999999999) GLOBAL_RNG.seed(args.seed) if args.data_key_path: observed_map = sumresults.parse_data_key_file(args.data_key_path) observed_paths = [observed_map[k] for k in sorted(observed_map.keys())] else: observed_dir = mk_new_dir( os.path.join(base_dir, 'observed-summary-stats')) observed_paths = [os.path.join(observed_dir, args.output_prefix + \ 'observed-{0}.txt'.format(i+1)) for i in range(len( args.observed_configs))] info.write('\tseed = {0}'.format(args.seed)) info.write('\tnum_processors = {0}'.format(args.np)) info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples)) info.write('\tnum_standardizing_samples = {0}'.format( args.num_standardizing_samples)) info.write('\tbandwidth = {0}'.format(args.bandwidth)) info.write('\tposterior_quantiles = {0}'.format( args.num_posterior_quantiles)) info.write('\tposterior_sample_size = {0}'.format( args.num_posterior_samples)) info.write('\tstat_patterns = {0}'.format(', '.join( [p.pattern for p in stat_patterns]))) # vet observed configs ref_config_path = args.observed_configs[0] ref_config = MsBayesConfig(ref_config_path) all_config_paths = [] num_taxon_pairs = ref_config.npairs assert num_taxon_pairs > 0 for config in args.observed_configs: all_config_paths.append(config) if not ref_config.equal_sample_table(config): if not args.keep_temps: temp_fs.purge() raise errors.SampleTableError( 'sample tables in config {0!r} and {1!r} differ; ' 'all sample tables must be the same.'.format( ref_config_path, config)) info.write('\tnum_taxon_pairs = {0}'.format(num_taxon_pairs)) info.write('\tdry_run = {0}'.format(args.dry_run)) info.write('\t[[tool_paths]]') info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path)) info.write('\t\tmsbayes = 
{0}'.format(msbayes_path)) info.write('\t\teureject = {0}'.format(eureject_path)) info.write('\t\tabcestimator = {0}'.format(abctb_path)) info.write('\t[[observed_configs]]') for i, cfg in enumerate(args.observed_configs): info.write('\t\t{0} = {1}'.format( i + 1, os.path.relpath(cfg, os.path.dirname(info_path)))) abc_team = ABCTeam( temp_fs=temp_fs, observed_stats_files=observed_paths, num_taxon_pairs=num_taxon_pairs, config_paths=args.prior_configs, previous_prior_dir=previous_prior_dir, num_prior_samples=args.num_prior_samples, num_processors=args.np, num_standardizing_samples=args.num_standardizing_samples, num_posterior_samples=args.num_posterior_samples, num_posterior_density_quantiles=args.num_posterior_quantiles, batch_size=args.prior_batch_size, output_dir=base_dir, output_prefix=args.output_prefix, prior_temp_dir=args.staging_dir, rng=GLOBAL_RNG, report_parameters=True, stat_patterns=stat_patterns, eureject_exe_path=eureject_path, abctoolbox_exe_path=abctb_path, msbayes_exe_path=None, abctoolbox_bandwidth=args.bandwidth, omega_threshold=0.01, cv_threshold=0.01, compress=args.compress, reporting_frequency=args.reporting_frequency, keep_temps=args.keep_temps, global_estimate_only=False, global_estimate=not args.no_global_estimate, generate_prior_samples_only=args.generate_samples_only, start_from_simulation_index=args.start_from_simulation_index, start_from_observed_index=args.start_from_observed_index) models_to_configs = {} configs_to_models = {} for k, v in abc_team.models.iteritems(): models_to_configs[k] = v configs_to_models[v] = k cfg = MsBayesConfig(v) all_config_paths.append(v) # vet prior configs if not ref_config.equal_sample_table(cfg): if not args.keep_temps: temp_fs.purge() raise errors.SampleTableError( 'sample tables in config {0!r} and {1!r} differ; ' 'all sample tables must be the same.'.format( ref_config_path, v)) info.write('\t[[observed_paths]]') for i in sorted(abc_team.observed_stats_paths.iterkeys()): info.write('\t\t{0} = 
{1}'.format( i, os.path.relpath(abc_team.observed_stats_paths[i], os.path.dirname(info_path)))) info.write('\t[[prior_configs]]') for i in sorted(abc_team.models.iterkeys()): info.write('\t\t{0} = {1}'.format( i, os.path.relpath(abc_team.models[i], os.path.dirname(info_path)))) ########################################################################## ## begin analysis --- get observed summary stats set_memory_trace() # start logging memory profile start_time = datetime.datetime.now() if args.data_key_path: log.info('Using provided summary statitics...') elif not args.dry_run: obs_temp_dir = base_temp_dir if args.staging_dir: obs_temp_dir = args.staging_dir observed_temp_fs = TempFileSystem(parent=obs_temp_dir, prefix='observed-temps-') if args.reps < 1: log.info('Calculating summary statistics from sequence data...') obs_workers = [] for i, cfg in enumerate(args.observed_configs): ss_worker = ObsSumStatsWorker(temp_fs=observed_temp_fs, config_path=cfg, output_path=observed_paths[i], schema='abctoolbox', stat_patterns=stat_patterns) obs_workers.append(ss_worker) obs_workers = Manager.run_workers(workers=obs_workers, num_processors=args.np) # re-vet all configs to see if some were changed by obsSumStats.pl new_ref_config = ref_config ref_modified = False # new ref because if all configs were updated all is good if not ref_config.equal_sample_table(ref_config_path): ref_modified = True new_ref_config = MsBayesConfig(ref_config_path) log.warning(""" The alignment lengths in config {0!r} have been corrected for sites with *any* ambiguous bases and/or gaps by obsSumStats.pl. 
""".format(ref_config_path)) for config in all_config_paths: if not new_ref_config.equal_sample_table(config): corrected_config = config if ref_modified: corrected_config = ref_config_path if not args.keep_temps: observed_temp_fs.purge() temp_fs.purge() raise errors.SampleTableError(""" The sample tables in configs {0!r} and {1!r} differ because obsSumStats.pl modified alignment lengths in config {2!r} to correct for sites in the alignments with *any* ambiguous bases and/or gaps. Please make sure the sample tables in all configs will be the same after correcting alignment lengths for sites that contain *any* ambiguous bases and/or gaps. You can do this by copying and pasting the sample table in {2!r} that has been corrected by obsSumStats.pl into the other configs that were not corrected. """.format(ref_config_path, config, corrected_config)) else: log.info('Simulating summary statistics from observed configs...') num_observed_workers = min([args.reps, args.np]) if args.reps <= args.np: observed_batch_size = 1 remainder = 0 else: observed_batch_size, remainder = long_division( args.reps, args.np) msbayes_workers = [] for idx, cfg in enumerate(args.observed_configs): observed_model_idx = configs_to_models.get(cfg, None) schema = 'abctoolbox' for i in range(num_observed_workers): worker = MsBayesWorker(temp_fs=observed_temp_fs, sample_size=observed_batch_size, config_path=cfg, model_index=observed_model_idx, report_parameters=True, schema=schema, include_header=True, stat_patterns=stat_patterns, write_stats_file=False, staging_dir=None, tag=idx) msbayes_workers.append(worker) if remainder > 0: worker = MsBayesWorker(temp_fs=observed_temp_fs, sample_size=remainder, config_path=cfg, model_index=observed_model_idx, report_parameters=True, schema=schema, include_header=True, stat_patterns=stat_patterns, write_stats_file=False, staging_dir=None, tag=idx) msbayes_workers.append(worker) # run parallel msbayes processes msbayes_workers = 
Manager.run_workers(workers=msbayes_workers, num_processors=args.np) workers = dict( zip(range(len(args.observed_configs)), [[] for i in range(len(args.observed_configs))])) for w in msbayes_workers: workers[w.tag].append(w) # merge simulated observed data into one file for i in range(len(args.observed_configs)): merge_prior_files([w.prior_path for w in workers[i]], observed_paths[i]) lc = line_count(observed_paths[i], ignore_headers=True) if lc != args.reps: if not args.keep_temps: temp_fs.purge() raise Exception( 'The number of observed simulations ({0}) ' 'generated for observed config {1!r} and output to ' 'file {2!r} does not match the number of reps ' '({3})'.format(lc, args.observed_configs[i], observed_paths[i], args.reps)) if not args.keep_temps: log.debug('purging observed temps...') observed_temp_fs.purge() ########################################################################## ## Begin ABC analyses if not args.dry_run: abc_team.run() stop_time = datetime.datetime.now() log.info('Done!') info.write('\t[[run_stats]]', log.info) info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info) info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info) info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)), log.info) if not args.keep_temps: log.debug('purging temps...') temp_fs.purge()