def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description)
    parser.add_argument('-c', '--config',
            type = arg_is_config,
            required = True,
            help = ('msBayes config file to be used to generate saturation '
                    'plot.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = int,
            default = 1000,
            help = ('The number of prior samples to simulate for the '
                    'saturation plot.'))
    parser.add_argument('--np',
            action = 'store',
            type = int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('-o', '--output-dir',
            action = 'store',
            type = arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            default = ['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--vertical-lines',
            nargs = '*',
            type = float,
            default = [],
            help = ('Positions along x-axis where vertical lines are to be '
                    'drawn. Default is to draw no vertical lines.'))
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress plot data file.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, get_stats_by_time, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes.plotting import MATPLOTLIB_AVAILABLE, SaturationPlotGrid

    MSBAYES_SORT_INDEX.set_index(0)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    stats_by_time_path = os.path.join(args.output_dir, 'stats-by-time.txt')
    if args.compress:
        stats_by_time_path += '.gz'
    plot_path = os.path.join(args.output_dir, 'saturation-plot.pdf')

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs
    cfg.div_model_prior = 'constrained'
    cfg.psi = probability.DiscreteUniformDistribution(num_taxon_pairs,
            num_taxon_pairs)
    config_path = temp_fs.get_file_path(prefix='cfg-')
    cfg.write(config_path)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
            ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
    info.write('\tstats_by_time_path = {0!r}'.format(stats_by_time_path),
            log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(
                temp_fs = temp_fs,
                sample_size = sample_size,
                config_path = config_path,
                report_parameters = True,
                schema = schema,
                include_header = True,
                stat_patterns = stat_patterns,
                write_stats_file = False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(
            workers = workers,
            num_processors = args.np)

    log.info('Parsing samples...')
    stats_by_time = get_stats_by_time([w.prior_path for w in workers])
    stat_keys = stats_by_time.keys()
    stat_keys.remove('PRI.t')
    for prefix in args.stat_prefixes:
        if not prefix in stat_keys:
            raise Exception('stat prefix {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(prefix, ', '.join(stat_keys)))
    header = ['PRI.t'] + args.stat_prefixes

    log.info('Writing stats-by-time matrix...')
    out, close = process_file_arg(stats_by_time_path, 'w',
            compresslevel = compress_level)
    for row in dict_line_iter(stats_by_time, sep = '\t', header = header):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')

    if not MATPLOTLIB_AVAILABLE:
        log.warning(
                '`matplotlib` could not be imported, so the plot can not be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(stats_by_time_path))
    else:
        y_labels = {'pi': r'$\pi$',
                'pi.net': r'$\pi_{net}$',
                'wattTheta': r'$\theta_W$',
                'tajD.denom': r'$SD(\pi - \theta_W)$'}
        spg = SaturationPlotGrid(stats_by_time,
                x_key = 'PRI.t',
                y_keys = args.stat_prefixes,
                y_labels = y_labels,
                num_columns = 2,
                vertical_line_positions = args.vertical_lines)
        fig = spg.create_grid()
        fig.savefig(plot_path)

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
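# Illustrative entry-point wiring (an assumption, not part of the original
# source): a minimal sketch of how a script built around this main_cli() is
# typically executed. The script file name and paths in the example invocation
# below are hypothetical; the flags match the parser defined above.
#
#     python plot_saturation.py -c dpp-msbayes.cfg -n 2000 -o results/ --seed 1234
#
if __name__ == '__main__':
    main_cli()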
class PyMsBayesTestCase(unittest.TestCase):
    def set_up(self):
        self.temp_fs = TempFileSystem(parent=package_paths.test_path(),
                prefix='PyMsBayesTestTemp-')
        self.test_id = 'pymsbayes-' + random_str()

    def tear_down(self):
        self.register_file_system()
        self.temp_fs.purge()
        self.assertEqual(FileStream.open_files, set())

    def get_test_path(self, parent=None, prefix='temp'):
        return self.temp_fs.get_file_path(parent=parent, prefix=prefix)

    def get_test_subdir(self, parent=None, prefix='temp'):
        return self.temp_fs.create_subdir(parent=parent, prefix=prefix)

    def register_file(self, path):
        self.temp_fs._register_file(path)

    def register_dir(self, path):
        self.temp_fs._register_dir(path)

    def register_file_system(self):
        _LOG.debug('registering test file system...')
        for path, dirs, files, in os.walk(self.temp_fs.base_dir):
            for f in files:
                if f.startswith(self.test_id):
                    self.register_file(os.path.join(path, f))
            for d in dirs:
                if d.startswith(self.test_id):
                    self.register_dir(os.path.join(path, d))

    def _exe_script(self, script_name, args, stdout=None, stderr=None,
            return_code=0):
        script_path = package_paths.script_path(script_name)
        if isinstance(args, str):
            arg_list = args.split()
        else:
            arg_list = args
        arg_list = [str(x) for x in arg_list]
        cmd = [sys.executable, script_path] + arg_list
        _LOG.debug('Invocation:\n\t{0}'.format(' '.join(cmd)))
        p = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
        o, e = p.communicate()
        exit_code = p.wait()
        if exit_code != return_code:
            _LOG.error("exit code {0} did not match {1}".format(
                    exit_code, return_code))
            _LOG.error("here is the stdout:\n{0}".format(o))
            _LOG.error("here is the stderr:\n{0}".format(e))
        self.assertEqual(exit_code, return_code)
        if stdout != None:
            if o != stdout:
                _LOG.error("std out did not match expected:\n{0}".format(o))
            self.assertEqual(o, stdout)
        if stderr != None:
            if e != stderr:
                _LOG.error("std error did not match expected:\n{0}".format(e))
            self.assertEqual(e, stderr)

    def get_expected_indices(self, num_pairs, dummy_column=True,
            parameters_reported=True):
        num_summary_params = 4
        if _CV_INCLUDED:
            num_summary_params += 1
        num_params = 4 * num_pairs
        num_default_stats = 4 * num_pairs
        start = 0
        if dummy_column:
            start = 1
        param_indices = range(start, start + num_summary_params)
        start += num_summary_params
        if parameters_reported:
            param_indices += range(start, start + num_params)
            start += num_params
        stat_indices = range(start, start + num_default_stats)
        return param_indices, stat_indices

    def prior_file_is_valid(self, prior_path, num_of_samples,
            num_of_columns=None):
        try:
            prior_file = open(prior_path, 'rU')
        except:
            _LOG.error('prior invalid: could not open prior path {0}'.format(
                    prior_path))
            return False
        nrows = 0
        for i, line in enumerate(prior_file):
            if nrows == 0 and HEADER_PATTERN.match(line):
                pass
            else:
                nrows += 1
            if not num_of_columns:
                num_of_columns = len(line.strip().split())
            ncols = len(line.strip().split())
            if num_of_columns != ncols:
                _LOG.error('prior invalid: num of columns at line {0} is {1} '
                        'NOT {2}'.format(i + 1, ncols, num_of_columns))
                return False
        prior_file.close()
        if num_of_samples != nrows:
            _LOG.error('prior invalid: num of rows is {0} NOT {1}'.format(
                    nrows, num_of_samples))
            return False
        return True

    def get_number_of_lines(self, path):
        f, close = process_file_arg(path)
        count = 0
        for l in f:
            count += 1
        if close:
            f.close()
        return count

    def get_number_of_header_lines(self, path):
        f, close = process_file_arg(path)
        count = 0
        for l in f:
            if HEADER_PATTERN.match(l.strip()):
                count += 1
        if close:
            f.close()
        return count

    def parse_python_config(self, path):
        return ConfigObj(path)

    def get_config_from_msbayes_workers(self, msbayes_workers):
        cfgs = [MsBayesConfig(w.config_path) for w in msbayes_workers]
        self.assertSameConfigs(cfgs)
        return cfgs[0]

    def assertSameConfigs(self, cfgs):
        configs = list(cfgs)
        c1 = configs.pop(0)
        for c2 in cfgs:
            self.assertEqual(c1.time_in_subs_per_site,
                    c2.time_in_subs_per_site)
            self.assertEqual(c1.npairs, c2.npairs)
            self.assertEqual(c1.implementation, c2.implementation)
            self.assertEqual(c1.div_model_prior, c2.div_model_prior)
            self.assertEqual(c1.bottle_proportion_shared,
                    c2.bottle_proportion_shared)
            self.assertEqual(c1.theta_parameters, c2.theta_parameters)
            self.assertEqual(c1.taxa, c2.taxa)
            self.assertTrue(c1.sample_table.equals(c2.sample_table))
            if c1.psi:
                self.assertSameDistributions(c1.psi, c2.psi)
            else:
                self.assertEqual(c1.psi, c2.psi)
            if c1.tau:
                self.assertSameDistributions(c1.tau, c2.tau)
            else:
                self.assertEqual(c1.tau, c2.tau)
            if c1.theta:
                self.assertSameDistributions(c1.theta, c2.theta)
            else:
                self.assertEqual(c1.theta, c2.theta)
            if c1.a_theta:
                self.assertSameDistributions(c1.a_theta, c2.a_theta)
            else:
                self.assertEqual(c1.a_theta, c2.a_theta)
            if c1.d_theta:
                self.assertSameDistributions(c1.d_theta, c2.d_theta)
            else:
                self.assertEqual(c1.d_theta, c2.d_theta)
            if c1.recombination:
                self.assertSameDistributions(c1.recombination,
                        c2.recombination)
            else:
                self.assertEqual(c1.recombination, c2.recombination)
            if c1.migration:
                self.assertSameDistributions(c1.migration, c2.migration)
            else:
                self.assertEqual(c1.migration, c2.migration)
            if c1.dpp_concentration:
                self.assertSameDistributions(c1.dpp_concentration,
                        c2.dpp_concentration)
            else:
                self.assertEqual(c1.dpp_concentration, c2.dpp_concentration)
            if c1.bottle_proportion:
                self.assertSameDistributions(c1.bottle_proportion,
                        c2.bottle_proportion)
            else:
                self.assertEqual(c1.bottle_proportion, c2.bottle_proportion)

    def get_parameter_summaries_from_msbayes_workers(self, msbayes_workers,
            shuffle_taus=True):
        msbayes_workers = list(msbayes_workers)
        s = dict(zip(
                [i for i in msbayes_workers[0].parameter_indices],
                [SampleSummarizer(tag=msbayes_workers[0].header[i])
                        for i in msbayes_workers[0].parameter_indices]))
        ncols = None
        header = msbayes_workers[0].header
        pi = msbayes_workers[0].parameter_indices
        for w in msbayes_workers:
            self.assertEqual(w.header, header)
            self.assertEqual(w.parameter_indices, pi)
            f = open(w.prior_path, 'rU')
            for line_idx, row in enumerate(f):
                if not ncols:
                    ncols = len(row.strip().split())
                if HEADER_PATTERN.match(row.strip()):
                    continue
                r = row.strip().split()
                assert len(r) == ncols
                if shuffle_taus: # because taus are sorted in prior files
                    psi_index = get_indices_of_patterns(w.header,
                            PSI_PATTERNS)[0]
                    tau_indices = get_indices_of_patterns(w.header,
                            TAU_PATTERNS)
                    psi = int(r[psi_index])
                    taus = [float(r[i]) for i in tau_indices]
                    self.assertEqual(psi, len(set(taus)))
                    random.shuffle(taus)
                    for n, i in enumerate(tau_indices):
                        s[i].add_sample(taus[n])
                    p_set = set(w.parameter_indices) - set(tau_indices)
                    p = sorted(list(p_set))
                    for i in p:
                        s[i].add_sample(float(r[i]))
                else:
                    for i in w.parameter_indices:
                        s[i].add_sample(float(r[i]))
            f.close()
        return s

    def assertPriorIsPrecise(self, msbayes_workers, places=2):
        msbayes_workers = list(msbayes_workers)
        self.assertWorkersFinished(msbayes_workers)
        param_sums = self.get_parameter_summaries_from_msbayes_workers(
                msbayes_workers)
        sample_size = 0
        for w in msbayes_workers:
            sample_size += w.sample_size
        for s in param_sums.itervalues():
            self.assertEqual(s.n, sample_size)
        cfg = self.get_config_from_msbayes_workers(msbayes_workers)
        psi_indices = get_indices_of_patterns(msbayes_workers[0].header,
                PSI_PATTERNS)
        self.assertEqual(len(psi_indices), 1)
        model_indices = get_indices_of_patterns(msbayes_workers[0].header,
                MODEL_PATTERNS)
        if not msbayes_workers[0].model_index is None:
            self.assertEqual(len(model_indices), 1)
        else:
            self.assertEqual(len(model_indices), 0)
        tau_indices = get_indices_of_patterns(msbayes_workers[0].header,
                TAU_PATTERNS)
        a_theta_indices = get_indices_of_patterns(msbayes_workers[0].header,
                A_THETA_PATTERNS)
        d_theta_indices = get_indices_of_patterns(msbayes_workers[0].header,
                D_THETA_PATTERNS)
        if msbayes_workers[0].report_parameters:
            self.assertEqual(len(tau_indices), cfg.npairs)
            self.assertEqual(len(a_theta_indices), cfg.npairs)
            self.assertEqual(len(d_theta_indices), 2 * cfg.npairs)
        else:
            self.assertEqual(len(tau_indices), 0)
            self.assertEqual(len(a_theta_indices), 0)
            self.assertEqual(len(d_theta_indices), 0)
        _LOG.debug('\n{0}\n'.format('\n'.join(
                [str(param_sums[i]) for i in sorted(param_sums.iterkeys())])))
        for i in psi_indices:
            self.assertSampleIsFromDistribution(param_sums[i], cfg.psi,
                    places=places)
        for i in tau_indices:
            self.assertSampleIsFromDistribution(param_sums[i], cfg.tau,
                    places=places)
        for i in a_theta_indices:
            self.assertSampleIsFromDistribution(param_sums[i], cfg.a_theta,
                    places=places)
        for i in d_theta_indices:
            self.assertSampleIsFromDistribution(param_sums[i], cfg.d_theta,
                    mean_adj=cfg.theta.mean,
                    max_adj=cfg.theta.maximum,
                    compare_variance=False,
                    places=places)

    def assertPriorIsAccurate(self, msbayes_workers, places=2):
        msbayes_workers = list(msbayes_workers)
        self.assertWorkersFinished(msbayes_workers)
        pass

    def assertPriorIsValid(self, msbayes_workers, places=2):
        msbayes_workers = list(msbayes_workers)
        self.assertWorkersFinished(msbayes_workers)
        self.assertPriorIsPrecise(msbayes_workers, places=places)
        self.assertPriorIsAccurate(msbayes_workers, places=places)

    def assertWorkersFinished(self, msbayes_workers):
        for w in msbayes_workers:
            self.assertTrue(w.finished)

    def assertSampleIsFromDistribution(self, sample_sum, dist, places=2,
            mean_adj=1, max_adj=1, compare_variance=True):
        if isinstance(dist, probability.DiscreteUniformDistribution):
            self.assertEqual(sample_sum.minimum, dist.minimum)
            self.assertEqual(sample_sum.maximum, dist.maximum)
        else:
            if dist.minimum != float('-inf') or dist.minimum != float('inf'):
                self.assertAlmostEqual(sample_sum.minimum, dist.minimum,
                        places)
            if dist.maximum != float('-inf') or dist.maximum != float('inf'):
                self.assertAlmostEqual(sample_sum.maximum,
                        dist.maximum * max_adj, places)
        self.assertAlmostEqual(sample_sum.mean, dist.mean * mean_adj, places)
        if compare_variance:
            self.assertAlmostEqual(sample_sum.variance, dist.variance, places)

    def assertApproxEqual(self, x, y, percent_tol=1e-6):
        eq = (((abs(x - y) / ((abs(x) + abs(y)) / 2)) * 100) < percent_tol)
        if not eq:
            _LOG.error('x ({0}) and y ({1}) are not equal'.format(x, y))
        self.assertTrue(eq)

    def files_equal(self, f1, f2, exclude_line_endings=False):
        equal = True
        diffs = []
        f1, c1 = process_file_arg(f1)
        f2, c2 = process_file_arg(f2)
        line = 0
        f1_end = False
        f2_end = False
        lines_left = True
        while True:
            line += 1
            if f1_end == False:
                try:
                    l1 = f1.next()
                except (StopIteration, EOFError):
                    f1_end = line
                    pass
            if f2_end == False:
                try:
                    l2 = f2.next()
                except (StopIteration, EOFError):
                    f2_end = line
                    pass
            if f1_end != False and f2_end != False:
                break
            if exclude_line_endings:
                l1 = l1.strip()
                l2 = l2.strip()
            if f1_end == False and f2_end == False and l1 != l2:
                diffs.append(line)
                equal = False
        if f1_end != f2_end:
            mn = min([f1_end, f2_end])
            mx = max([f1_end, f2_end])
            diffs.extend(range(mn, mx + 1))
            equal = False
        assert len(diffs) == len(set(diffs))
        if c1:
            f1.close()
        if c2:
            f2.close()
        return equal, diffs

    def assertSameFiles(self, files, exclude_line_endings=False):
        files = list(files)
        all_equal = True
        diffs = StringIO()
        f1 = files.pop(0)
        for f2 in files:
            equal, diff_list = self.files_equal(f1, f2, exclude_line_endings)
            if not equal:
                all_equal = False
                n1 = f1
                if not isinstance(n1, str):
                    n1 = f1.name
                n2 = f2
                if not isinstance(n2, str):
                    n2 = f2.name
                diffs.write('{0} and {1} differ at lines:\n\t{2}\n'.format(
                        n1, n2, ','.join([str(i) for i in diff_list])))
        if not all_equal:
            _LOG.error('files are not equal:\n{0}\n'.format(diffs.getvalue()))
        self.assertTrue(all_equal)

    def assertSameUnsortedFiles(self, files):
        files = list(files)
        all_equal = True
        diffs = StringIO()
        f1, close = process_file_arg(files.pop(0))
        lines1 = sorted(f1.readlines())
        for f in files:
            f2, close2 = process_file_arg(f)
            lines2 = sorted(f2.readlines())
            if len(lines1) != len(lines2):
                all_equal = False
                diffs.write('{0} ({1}) and {2} ({3}) have different '
                        'number of lines\n'.format(f1.name, len(lines1),
                                f2.name, len(lines2)))
            for i in range(min(len(lines1), len(lines2))):
                if lines1[i].strip().split() != lines2[i].strip().split():
                    all_equal = False
                    diffs.write('{0} and {1} differ at sorted index '
                            '{2}\n'.format(f1.name, f2.name, i))
            if close2:
                f2.close()
        if not all_equal:
            _LOG.error('files are not equal after sorting:\n{0}\n'.format(
                    diffs.getvalue()))
        self.assertTrue(all_equal)
        if close:
            f1.close()

    def same_samples(self, sample1, sample2, places=4, num_mismatches=0):
        if len(sample1) != len(sample2):
            return False
        for i in range(len(sample1)):
            if round(float(sample1[i]) - float(sample2[i]), places) != 0:
                if num_mismatches < 1:
                    return False
                num_mismatches -= 1
        return True

    def assertSameSamples(self, files, columns_to_ignore=[], header=True,
            places=5, num_mismatches_per_sample=0, num_sample_mismatches=0):
        files = list(files)
        all_equal = True
        diffs = StringIO()
        f1, close = process_file_arg(files.pop(0))
        f1_lines = f1.readlines()
        indices = [i for i in range(len(f1_lines[0].strip().split()))
                if i not in columns_to_ignore]
        h1 = []
        if header:
            head = f1_lines.pop(0).strip().split()
            h1 = [head[i] for i in indices]
        lines1 = sorted(f1_lines)
        for f in files:
            f2, close2 = process_file_arg(f)
            f2_lines = f2.readlines()
            h2 = []
            if header:
                head = f2_lines.pop(0).strip().split()
                h2 = [head[i] for i in indices]
            if h1 != h2:
                all_equal = False
                diffs.write('{0} and {1} have different headers; not '
                        'comparing further\n'.format(f1.name, f2.name))
                continue
            lines2 = sorted(f2_lines)
            if len(lines1) != len(lines2):
                all_equal = False
                diffs.write('{0} ({1}) and {2} ({3}) have different '
                        'number of lines\n'.format(f1.name, len(lines1),
                                f2.name, len(lines2)))
            n_matches = 0
            n_mismatches = 0
            for l1 in lines1:
                found = False
                for l2 in lines2:
                    values1 = l1.strip().split()
                    values2 = l2.strip().split()
                    v1 = [float(values1[x]) for x in indices]
                    v2 = [float(values2[x]) for x in indices]
                    if self.same_samples(v1, v2, places=places,
                            num_mismatches=num_mismatches_per_sample):
                        found = True
                if found:
                    n_matches += 1
                else:
                    n_mismatches += 1
            if n_mismatches > 0:
                if n_mismatches > num_sample_mismatches:
                    all_equal = False
                diffs.write('{0} and {1}\nhave {2} mismatching samples and '
                        'share {3} samples\n'.format(
                                f1.name, f2.name, n_mismatches, n_matches))
            if close2:
                f2.close()
        if diffs.getvalue() != '':
            _LOG.error('files are not equal after sorting:\n{0}\n'.format(
                    diffs.getvalue()))
        self.assertTrue(all_equal)
        if close:
            f1.close()

    def assertSameDistributions(self, d1, d2):
        self.assertEqual(d1.name, d2.name)
        self.assertEqual(str(d1), str(d2))
        self.assertEqual(d1.minimum, d2.minimum)
        self.assertEqual(d1.maximum, d2.maximum)
        self.assertEqual(d1.mean, d2.mean)
        self.assertEqual(d1.variance, d2.variance)

    def assertSameIntegerPartitions(self, integer_partitions):
        ips = list(integer_partitions)
        ip1 = ips.pop(0)
        for ip2 in ips:
            self.assertEqual(ip1._initialized, ip2._initialized)
            self.assertEqual(ip1.n, ip2.n)
            self.assertEqual(ip1.key, ip2.key)
            self.assertEqual(ip1.integer_partition, ip2.integer_partition)
            self.assertEqual(ip1._items, ip2._items)

    def assertSamePartitions(self, partitions):
        ps = list(partitions)
        p1 = ps.pop(0)
        for p2 in ps:
            self.assertEqual(p1._initialized, p2._initialized)
            self.assertEqual(p1.n, p2.n)
            self.assertEqual(p1.key, p2.key)
            self.assertEqual(p1.partition, p2.partition)
            self.assertEqual(p1.values, p2.values)
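# A minimal usage sketch (not from the original source): test classes derive
# from PyMsBayesTestCase and route unittest's setUp/tearDown hooks to the
# set_up/tear_down helpers so each test gets an isolated TempFileSystem that
# is purged afterwards. The subclass name and the script name passed to
# _exe_script below are hypothetical.
class ExampleScriptTestCase(PyMsBayesTestCase):
    def setUp(self):
        self.set_up()

    def tearDown(self):
        self.tear_down()

    def test_help_exits_cleanly(self):
        # hypothetical script name; _exe_script asserts on the exit code
        self._exe_script('dpp_msbayes.py', ['--help'], return_code=0)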
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description,
            formatter_class = argparse_utils.SmartHelpFormatter)
    parser.add_argument('-c', '--config',
            type = argparse_utils.arg_is_config,
            required = True,
            help = ('msBayes config file to be used to generate saturation '
                    'plot.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = int,
            default = 1000,
            help = ('The number of prior samples to simulate for the '
                    'saturation plot.'))
    parser.add_argument('--np',
            action = 'store',
            type = int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('-o', '--output-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            default = ['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--sort-index',
            action = 'store',
            type = int,
            default = 0,
            choices = range(12),
            help = argparse_utils.get_sort_index_help_message())
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress plot data file.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, get_dict_from_spreadsheets, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability, stats
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes import plotting

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    sample_path = os.path.join(args.output_dir, 'prior-sample.txt')
    if args.compress:
        sample_path += '.gz'

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
            ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
    info.write('\tsample_path = {0!r}'.format(sample_path), log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(
                temp_fs = temp_fs,
                sample_size = sample_size,
                config_path = args.config,
                report_parameters = True,
                schema = schema,
                include_header = True,
                stat_patterns = stat_patterns,
                write_stats_file = False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(
            workers = workers,
            num_processors = args.np)

    log.info('Parsing samples...')
    sample = get_dict_from_spreadsheets([w.prior_path for w in workers])

    log.info('Writing prior samples...')
    out, close = process_file_arg(sample_path, 'w',
            compresslevel = compress_level)
    for row in dict_line_iter(sample, sep = '\t'):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')

    if not plotting.MATPLOTLIB_AVAILABLE:
        log.warning(
                '`matplotlib` could not be imported, so the plot can not be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(sample_path))
        sys.exit(1)

    for stat_pattern in stat_patterns:
        found = False
        for stat, values in sample.iteritems():
            if stat_pattern.match(stat):
                values = [float(v) for v in values]
                found = True
                plot_path = os.path.join(args.output_dir,
                        'plot-{0}.pdf'.format(stat))
                summary = stats.get_summary(values)
                s = r'mean = {0:.4f} ({1:.4f}-{2:.4f})'.format(
                        summary['mean'],
                        summary['qi_95'][0],
                        summary['qi_95'][1])
                hd = plotting.HistData(x = values,
                        normed = True,
                        bins = 20,
                        histtype = 'bar',
                        align = 'mid',
                        orientation = 'vertical',
                        zorder = 0)
                hist = plotting.ScatterPlot(hist_data_list = [hd],
                        right_text = s)
                hist.left_text_size = 12.0
                hist.right_text_size = 12.0
                xticks = [i for i in hist.ax.get_xticks()]
                xtick_labels = [i for i in xticks]
                yticks = [i for i in hist.ax.get_yticks()]
                ytick_labels = [i for i in yticks]
                if len(xtick_labels) >= 8:
                    for i in range(1, len(xtick_labels), 2):
                        xtick_labels[i] = ''
                if len(ytick_labels) >= 8:
                    for i in range(1, len(ytick_labels), 2):
                        ytick_labels[i] = ''
                xticks_obj = plotting.Ticks(ticks = xticks,
                        labels = xtick_labels,
                        horizontalalignment = 'center')
                yticks_obj = plotting.Ticks(ticks = yticks,
                        labels = ytick_labels)
                hist.xticks_obj = xticks_obj
                hist.yticks_obj = yticks_obj
                plot_grid = plotting.PlotGrid(subplots = [hist],
                        num_columns = 1,
                        label_schema = None,
                        title = stat,
                        title_size = 14.0,
                        title_top = False,
                        y_title = 'Density',
                        y_title_position = 0.001,
                        y_title_size = 14.0,
                        height = 4.0,
                        width = 6.0,
                        auto_height = False)
                plot_grid.auto_adjust_margins = False
                plot_grid.margin_left = 0.04
                plot_grid.margin_bottom = 0.04
                plot_grid.margin_right = 1.0
                plot_grid.margin_top = 0.97
                plot_grid.reset_figure()
                plot_grid.savefig(plot_path)
        if not found:
            raise Exception('stat pattern {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(stat_pattern, ', '.join(sample.keys())))

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
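# As with the saturation-plot script above, an illustrative (assumed, not from
# the original source) entry point; the script file name and config path in the
# example invocation are hypothetical, while the flags match the parser defined
# above.
#
#     python plot_prior_histograms.py -c dpp-msbayes.cfg -n 5000 --sort-index 0
#
if __name__ == '__main__':
    main_cli()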
def main_cli(argv = sys.argv):
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description,
            formatter_class = argparse_utils.SmartHelpFormatter)
    parser.add_argument('-o', '--observed-configs',
            nargs = '+',
            type = argparse_utils.arg_is_config,
            required = True,
            help = ('One or more msBayes config files to be used to either '
                    'calculate or simulate observed summary statistics. If '
                    'used in combination with `-r` each config will be used to '
                    'simulate pseudo-observed data. If analyzing real data, do '
                    'not use the `-r` option, and the fasta files specified '
                    'within the config must exist and contain the sequence '
                    'data.'))
    parser.add_argument('-p', '--prior-configs',
            nargs = '+',
            type = argparse_utils.arg_is_path,
            required = True,
            help = ('One or more config files to be used to generate prior '
                    'samples. If more than one config is specified, they '
                    'should be separated by spaces. '
                    'This option can also be used to specify the path to a '
                    'directory containing the prior samples and summary '
                    'statistic means and standard deviations generated by a '
                    'previous run using the `generate-samples-only` option. '
                    'These files should be found in the directory '
                    '`pymsbayes-output/prior-stats-summaries`. The '
                    '`pymsbayes-output/model-key.txt` also needs to be present.'
                    ' If specifying this directory, it should be the only '
                    'argument (i.e., no other directories or config files can '
                    'be provided).'))
    parser.add_argument('-r', '--reps',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('This option has two effects. First, it signifies that '
                    'the analysis will be simulation based (i.e., no real '
                    'data will be used). Second, it specifies how many '
                    'simulation replicates to perform (i.e., how many data '
                    'sets to simulate and analyze).'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 1000000,
            help = ('The number of prior samples to simulate for each prior '
                    'config specified with `-p`.'))
    parser.add_argument('--prior-batch-size',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 10000,
            help = ('The number of prior samples to simulate for each batch.'))
    parser.add_argument('--generate-samples-only',
            action = 'store_true',
            help = ('Only generate samples from models as requested. I.e., '
                    'No analyses are performed to approximate posteriors. '
                    'This option can be useful if you want the prior samples '
                    'for other purposes.'))
    parser.add_argument('--num-posterior-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 1000,
            help = ('The number of posterior samples desired for each '
                    'analysis. Default: 1000.'))
    parser.add_argument('--num-standardizing-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 10000,
            help = ('The number of prior samples desired to use for '
                    'standardizing statistics. Default: 10000.'))
    parser.add_argument('--np',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('--output-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('--staging-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('A directory to temporarily stage prior files. This option '
                    'can be useful on clusters to speed up I/O while '
                    'generating prior samples. You can designate a local temp '
                    'directory on a compute node to avoid constant writing to '
                    'a shared drive. The default is to use the `temp-dir`.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi wattTheta pi.net tajD.denom`.'))
    parser.add_argument('-b', '--bandwidth',
            action = 'store',
            type = float,
            help = ('Smoothing parameter for the posterior kernel density '
                    'estimation. This option is used for the `glm` '
                    'regression method. The default is 2 / '
                    '`num-posterior-samples`.'))
    parser.add_argument('-q', '--num-posterior-quantiles',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 1000,
            help = ('The number of equally spaced quantiles at which to '
                    'evaluate the GLM-estimated posterior density. '
                    'Default: 1000.'))
    parser.add_argument('--reporting-frequency',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('Suggested frequency (in number of prior samples) for '
                    'running regression and reporting current results. '
                    'Default: 0 (only report final results). '
                    'If a value is given, it may be adjusted so that the '
                    'reporting frequency is a multiple of the multi-processed '
                    'batch size.'))
    parser.add_argument('--sort-index',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            choices = range(12),
            help = argparse_utils.get_sort_index_help_message())
    parser.add_argument('--no-global-estimate',
            action = 'store_true',
            help = ('If multiple prior models are specified, by default a '
                    'global estimate is performed averaging over all models. '
                    'This option prevents the global estimation (i.e., only '
                    'inferences for each model are made).'))
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress large results files.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--output-prefix',
            action = 'store',
            type = str,
            default = '',
            help = ('Prefix to use at beginning of output files. The default '
                    'is no prefix.'))
    parser.add_argument('--data-key-path',
            action = 'store',
            type = argparse_utils.arg_is_file,
            help = ('The path to a `data-key.txt` file generated by a previous '
                    'run. This file should be found in the directory '
                    '`pymsbayes-output/data-key.txt`. This option '
                    'will override the `-o`/`--observed-configs` option, and '
                    'is intended to be used in combination with the '
                    '`--start-from` option to restart an analysis.'))
    parser.add_argument('--start-from-simulation-index',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('The simulation index at which to begin analyses. Must be '
                    'used in combination with either the number of simulation '
                    'replicates (`-r`/`--reps`) or the `--data-key-path` '
                    'option, and must be a positive '
                    'integer that is less than the number of simulation '
                    'replicates. This option can be useful if an analysis '
                    'needs to be restarted.'))
    parser.add_argument('--start-from-observed-index',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('The observed config index at which to begin analyses. '
                    'Can be used in combination with the `--data-key-path` '
                    'option to restart long-running, multi-observed-config '
                    'analyses.'))
    parser.add_argument('--dry-run',
            action = 'store_true',
            help = 'Do not run analyses; only process settings')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    if argv == sys.argv:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import (MsBayesWorker, merge_prior_files,
            ObsSumStatsWorker)
    from pymsbayes.teams import ABCTeam
    from pymsbayes.utils.functions import (is_file, is_dir, long_division,
            mk_new_dir)
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, DIV_MODEL_PATTERNS, MODEL_PATTERNS,
            PSI_PATTERNS, MEAN_TAU_PATTERNS, OMEGA_PATTERNS, CV_PATTERNS,
            line_count)
    from pymsbayes.utils import sumresults, errors
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import (GLOBAL_RNG, set_memory_trace,
            MSBAYES_SORT_INDEX, ToolPathManager)

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    if len(args.observed_configs) != len(set(args.observed_configs)):
        raise ValueError('All paths to observed config files must be unique')

    if args.num_standardizing_samples > args.num_prior_samples:
        args.num_standardizing_samples = args.num_prior_samples

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')
    eureject_path = ToolPathManager.get_tool_full_path('eureject')
    abctb_path = ToolPathManager.get_tool_full_path('ABCestimator')

    # vet prior-configs option
    using_previous_priors = False
    previous_prior_dir = None
    if (len(args.prior_configs) == 1) and (is_dir(args.prior_configs[0])):
        previous_prior_dir = args.prior_configs.pop(0)
        previous_priors = glob.glob(os.path.join(previous_prior_dir,
                '*-prior-sample.txt'))
        previous_sums = glob.glob(os.path.join(previous_prior_dir,
                '*-means-and-std-devs.txt'))
        if (not previous_priors) or (not previous_sums):
            raise ValueError('directory {0!r} specified with `prior-configs` '
                    'option does not contain necessary prior and summary '
                    'files'.format(args.prior_configs[0]))
        using_previous_priors = True
    else:
        for path in args.prior_configs:
            if not is_file(path):
                raise ValueError('prior config {0!r} is not a file'.format(
                        path))
        if len(args.prior_configs) != len(set(args.prior_configs)):
            raise
ValueError('All paths to prior config files must be unique') if not args.output_dir: args.output_dir = os.path.dirname(args.observed_configs[0]) base_dir = mk_new_dir(os.path.join(args.output_dir, 'pymsbayes-results')) if not args.temp_dir: args.temp_dir = base_dir info_path = os.path.join(base_dir, args.output_prefix + \ 'pymsbayes-info.txt') info = InfoLogger(info_path) info.write('[pymsbayes]'.format(base_dir)) info.write('\tversion = {version}'.format(**_program_info)) info.write('\toutput_directory = {0}'.format(base_dir)) temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-') base_temp_dir = temp_fs.base_dir info.write('\ttemp_directory = {0}'.format(base_temp_dir)) info.write('\tsort_index = {0}'.format( MSBAYES_SORT_INDEX.current_value())) info.write('\tsimulation_reps = {0}'.format(args.reps)) stat_patterns = DEFAULT_STAT_PATTERNS if args.stat_prefixes: for i in range(len(args.stat_prefixes)): if not args.stat_prefixes[i].endswith('.'): args.stat_prefixes[i] += '.' stat_patterns = get_patterns_from_prefixes( args.stat_prefixes, ignore_case=True) if not args.bandwidth: args.bandwidth = 2 / float(args.num_posterior_samples) if not args.seed: args.seed = random.randint(1, 999999999) GLOBAL_RNG.seed(args.seed) if args.data_key_path: observed_map = sumresults.parse_data_key_file(args.data_key_path) observed_paths = [observed_map[k] for k in sorted(observed_map.keys())] else: observed_dir = mk_new_dir(os.path.join(base_dir, 'observed-summary-stats')) observed_paths = [os.path.join(observed_dir, args.output_prefix + \ 'observed-{0}.txt'.format(i+1)) for i in range(len( args.observed_configs))] info.write('\tseed = {0}'.format(args.seed)) info.write('\tnum_processors = {0}'.format(args.np)) info.write('\tnum_prior_samples = {0}'.format( args.num_prior_samples)) info.write('\tnum_standardizing_samples = {0}'.format( args.num_standardizing_samples)) info.write('\tbandwidth = {0}'.format(args.bandwidth)) info.write('\tposterior_quantiles = {0}'.format( args.num_posterior_quantiles)) info.write('\tposterior_sample_size = {0}'.format( args.num_posterior_samples)) info.write('\tstat_patterns = {0}'.format( ', '.join([p.pattern for p in stat_patterns]))) # vet observed configs ref_config_path = args.observed_configs[0] ref_config = MsBayesConfig(ref_config_path) all_config_paths = [] num_taxon_pairs = ref_config.npairs assert num_taxon_pairs > 0 for config in args.observed_configs: all_config_paths.append(config) if not ref_config.equal_sample_table(config): if not args.keep_temps: temp_fs.purge() raise errors.SampleTableError( 'sample tables in config {0!r} and {1!r} differ; ' 'all sample tables must be the same.'.format( ref_config_path, config)) info.write('\tnum_taxon_pairs = {0}'.format(num_taxon_pairs)) info.write('\tdry_run = {0}'.format(args.dry_run)) info.write('\t[[tool_paths]]') info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path)) info.write('\t\tmsbayes = {0}'.format(msbayes_path)) info.write('\t\teureject = {0}'.format(eureject_path)) info.write('\t\tabcestimator = {0}'.format(abctb_path)) info.write('\t[[observed_configs]]') for i, cfg in enumerate(args.observed_configs): info.write('\t\t{0} = {1}'.format(i + 1, os.path.relpath(cfg, os.path.dirname(info_path)))) abc_team = ABCTeam( temp_fs = temp_fs, observed_stats_files = observed_paths, num_taxon_pairs = num_taxon_pairs, config_paths = args.prior_configs, previous_prior_dir = previous_prior_dir, num_prior_samples = args.num_prior_samples, num_processors = args.np, num_standardizing_samples = 
args.num_standardizing_samples, num_posterior_samples = args.num_posterior_samples, num_posterior_density_quantiles = args.num_posterior_quantiles, batch_size = args.prior_batch_size, output_dir = base_dir, output_prefix = args.output_prefix, prior_temp_dir = args.staging_dir, rng = GLOBAL_RNG, report_parameters = True, stat_patterns = stat_patterns, eureject_exe_path = eureject_path, abctoolbox_exe_path = abctb_path, msbayes_exe_path = None, abctoolbox_bandwidth = args.bandwidth, omega_threshold = 0.01, cv_threshold = 0.01, compress = args.compress, reporting_frequency = args.reporting_frequency, keep_temps = args.keep_temps, global_estimate_only = False, global_estimate = not args.no_global_estimate, generate_prior_samples_only = args.generate_samples_only, start_from_simulation_index = args.start_from_simulation_index, start_from_observed_index = args.start_from_observed_index) models_to_configs = {} configs_to_models = {} for k, v in abc_team.models.iteritems(): models_to_configs[k] = v configs_to_models[v] = k cfg = MsBayesConfig(v) all_config_paths.append(v) # vet prior configs if not ref_config.equal_sample_table(cfg): if not args.keep_temps: temp_fs.purge() raise errors.SampleTableError( 'sample tables in config {0!r} and {1!r} differ; ' 'all sample tables must be the same.'.format( ref_config_path, v)) info.write('\t[[observed_paths]]') for i in sorted(abc_team.observed_stats_paths.iterkeys()): info.write('\t\t{0} = {1}'.format(i, os.path.relpath( abc_team.observed_stats_paths[i], os.path.dirname(info_path)))) info.write('\t[[prior_configs]]') for i in sorted(abc_team.models.iterkeys()): info.write('\t\t{0} = {1}'.format(i, os.path.relpath( abc_team.models[i], os.path.dirname(info_path)))) ########################################################################## ## begin analysis --- get observed summary stats set_memory_trace() # start logging memory profile start_time = datetime.datetime.now() if args.data_key_path: log.info('Using provided summary statitics...') elif not args.dry_run: obs_temp_dir = base_temp_dir if args.staging_dir: obs_temp_dir = args.staging_dir observed_temp_fs = TempFileSystem(parent = obs_temp_dir, prefix = 'observed-temps-') if args.reps < 1: log.info('Calculating summary statistics from sequence data...') obs_workers = [] for i, cfg in enumerate(args.observed_configs): ss_worker = ObsSumStatsWorker( temp_fs = observed_temp_fs, config_path = cfg, output_path = observed_paths[i], schema = 'abctoolbox', stat_patterns = stat_patterns) obs_workers.append(ss_worker) obs_workers = Manager.run_workers( workers = obs_workers, num_processors = args.np) # re-vet all configs to see if some were changed by obsSumStats.pl new_ref_config = ref_config ref_modified = False # new ref because if all configs were updated all is good if not ref_config.equal_sample_table(ref_config_path): ref_modified = True new_ref_config = MsBayesConfig(ref_config_path) log.warning(""" The alignment lengths in config {0!r} have been corrected for sites with *any* ambiguous bases and/or gaps by obsSumStats.pl. """.format(ref_config_path)) for config in all_config_paths: if not new_ref_config.equal_sample_table(config): corrected_config = config if ref_modified: corrected_config = ref_config_path if not args.keep_temps: observed_temp_fs.purge() temp_fs.purge() raise errors.SampleTableError(""" The sample tables in configs {0!r} and {1!r} differ because obsSumStats.pl modified alignment lengths in config {2!r} to correct for sites in the alignments with *any* ambiguous bases and/or gaps. 
Please make sure the sample tables in all configs will be the same after correcting alignment lengths for sites that contain *any* ambiguous bases and/or gaps. You can do this by copying and pasting the sample table in {2!r} that has been corrected by obsSumStats.pl into the other configs that were not corrected. """.format(ref_config_path, config, corrected_config)) else: log.info('Simulating summary statistics from observed configs...') num_observed_workers = min([args.reps, args.np]) if args.reps <= args.np: observed_batch_size = 1 remainder = 0 else: observed_batch_size, remainder = long_division(args.reps, args.np) msbayes_workers = [] for idx, cfg in enumerate(args.observed_configs): observed_model_idx = configs_to_models.get(cfg, None) schema = 'abctoolbox' for i in range(num_observed_workers): worker = MsBayesWorker( temp_fs = observed_temp_fs, sample_size = observed_batch_size, config_path = cfg, model_index = observed_model_idx, report_parameters = True, schema = schema, include_header = True, stat_patterns = stat_patterns, write_stats_file = False, staging_dir = None, tag = idx) msbayes_workers.append(worker) if remainder > 0: worker = MsBayesWorker( temp_fs = observed_temp_fs, sample_size = remainder, config_path = cfg, model_index = observed_model_idx, report_parameters = True, schema = schema, include_header = True, stat_patterns = stat_patterns, write_stats_file = False, staging_dir = None, tag = idx) msbayes_workers.append(worker) # run parallel msbayes processes msbayes_workers = Manager.run_workers( workers = msbayes_workers, num_processors = args.np) workers = dict(zip(range(len(args.observed_configs)), [[] for i in range(len(args.observed_configs))])) for w in msbayes_workers: workers[w.tag].append(w) # merge simulated observed data into one file for i in range(len(args.observed_configs)): merge_prior_files([w.prior_path for w in workers[i]], observed_paths[i]) lc = line_count(observed_paths[i], ignore_headers=True) if lc != args.reps: if not args.keep_temps: temp_fs.purge() raise Exception('The number of observed simulations ({0}) ' 'generated for observed config {1!r} and output to ' 'file {2!r} does not match the number of reps ' '({3})'.format(lc, args.observed_configs[i], observed_paths[i], args.reps)) if not args.keep_temps: log.debug('purging observed temps...') observed_temp_fs.purge() ########################################################################## ## Begin ABC analyses if not args.dry_run: abc_team.run() stop_time = datetime.datetime.now() log.info('Done!') info.write('\t[[run_stats]]', log.info) info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info) info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info) info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)), log.info) if not args.keep_temps: log.debug('purging temps...') temp_fs.purge()
def main_cli(argv=sys.argv): description = '{name} {version}'.format(**_program_info) parser = argparse.ArgumentParser( description=description, formatter_class=argparse_utils.SmartHelpFormatter) parser.add_argument( '-o', '--observed-configs', nargs='+', type=argparse_utils.arg_is_config, required=True, help=('One or more msBayes config files to be used to either ' 'calculate or simulate observed summary statistics. If ' 'used in combination with `-r` each config will be used to ' 'simulate pseudo-observed data. If analyzing real data, do ' 'not use the `-r` option, and the fasta files specified ' 'within the config must exist and contain the sequence ' 'data.')) parser.add_argument( '-p', '--prior-configs', nargs='+', type=argparse_utils.arg_is_path, required=True, help=('One or more config files to be used to generate prior ' 'samples. If more than one config is specified, they ' 'should be separated by spaces. ' 'This option can also be used to specify the path to a ' 'directory containing the prior samples and summary ' 'statistic means and standard deviations generated by a ' 'previous run using the `generate-samples-only` option. ' 'These files should be found in the directory ' '`pymsbayes-output/prior-stats-summaries`. The' '`pymsbayes-output/model-key.txt` also needs to be present.' ' If specifying this directory, it should be the only ' 'argument (i.e., no other directories or config files can ' 'be provided).')) parser.add_argument( '-r', '--reps', action='store', type=argparse_utils.arg_is_nonnegative_int, default=0, help=('This option has two effects. First, it signifies that ' 'the analysis will be simulation based (i.e., no real ' 'data will be used). Second, it specifies how many ' 'simulation replicates to perform (i.e., how many data ' 'sets to simulate and analyze).')) parser.add_argument( '-n', '--num-prior-samples', action='store', type=argparse_utils.arg_is_positive_int, default=1000000, help=('The number of prior samples to simulate for each prior ' 'config specified with `-p`.')) parser.add_argument( '--prior-batch-size', action='store', type=argparse_utils.arg_is_positive_int, default=10000, help=('The number of prior samples to simulate for each batch.')) parser.add_argument( '--generate-samples-only', action='store_true', help=('Only generate samples from models as requested. I.e., ' 'No analyses are performed to approximate posteriors. ' 'This option can be useful if you want the prior samples ' 'for other purposes.')) parser.add_argument( '--num-posterior-samples', action='store', type=argparse_utils.arg_is_positive_int, default=1000, help=('The number of posterior samples desired for each ' 'analysis. Default: 1000.')) parser.add_argument('--num-standardizing-samples', action='store', type=argparse_utils.arg_is_positive_int, default=10000, help=('The number of prior samples desired to use for ' 'standardizing statistics. Default: 10000.')) parser.add_argument( '--np', action='store', type=argparse_utils.arg_is_positive_int, default=multiprocessing.cpu_count(), help=('The maximum number of processes to run in parallel. The ' 'default is the number of CPUs available on the machine.')) parser.add_argument( '--output-dir', action='store', type=argparse_utils.arg_is_dir, help=('The directory in which all output files will be written. ' 'The default is to use the directory of the first observed ' 'config file.')) parser.add_argument( '--temp-dir', action='store', type=argparse_utils.arg_is_dir, help=('A directory to temporarily stage files. 
The default is to ' 'use the output directory.')) parser.add_argument( '--staging-dir', action='store', type=argparse_utils.arg_is_dir, help=('A directory to temporarily stage prior files. This option ' 'can be useful on clusters to speed up I/O while ' 'generating prior samples. You can designate a local temp ' 'directory on a compute node to avoid constant writing to ' 'a shared drive. The default is to use the `temp-dir`.')) parser.add_argument( '-s', '--stat-prefixes', nargs='*', type=str, help=('Prefixes of summary statistics to use in the analyses. ' 'The prefixes should be separated by spaces. ' 'Default: `-s pi wattTheta pi.net tajD.denom`.')) parser.add_argument( '-b', '--bandwidth', action='store', type=float, help=('Smoothing parameter for the posterior kernal density ' 'estimation. This option is used for the `glm` ' 'regression method. The default is 2 / ' '`num-posterior-samples`.')) parser.add_argument( '-q', '--num-posterior-quantiles', action='store', type=argparse_utils.arg_is_positive_int, default=1000, help=('The number of equally spaced quantiles at which to ' 'evaluate the GLM-estimated posterior density. ' 'Default: 1000.')) parser.add_argument( '--reporting-frequency', action='store', type=argparse_utils.arg_is_nonnegative_int, default=0, help=('Suggested frequency (in number of prior samples) for ' 'running regression and reporting current results. ' 'Default: 0 (only report final results). ' 'If a value is given, it may be adjusted so that the ' 'reporting frequency is a multiple of the multi-processed ' 'batch size.')) parser.add_argument('--sort-index', action='store', type=argparse_utils.arg_is_nonnegative_int, default=0, choices=range(12), help=argparse_utils.get_sort_index_help_message()) parser.add_argument( '--no-global-estimate', action='store_true', help=('If multiple prior models are specified, by default a ' 'global estimate is performed averaging over all models. ' 'This option prevents the global estimation (i.e., only ' 'inferences for each model are made).')) parser.add_argument('--compress', action='store_true', help='Compress large results files.') parser.add_argument('--keep-temps', action='store_true', help='Keep all temporary files.') parser.add_argument('--seed', action='store', type=int, help='Random number seed to use for the analysis.') parser.add_argument( '--output-prefix', action='store', type=str, default='', help=('Prefix to use at beginning of output files. The default ' 'is no prefix.')) parser.add_argument( '--data-key-path', action='store', type=argparse_utils.arg_is_file, help=('The path to a `data-key.txt` file generated by a previous ' 'run. This file should be found in the directory ' '`pymsbayes-output/data-key.txt`. This option ' 'will override the `-o`/`--observed-configs` option, and ' 'is intended to be used in combination with the ' '`--start-from` option to restart an analysis.')) parser.add_argument( '--start-from-simulation-index', action='store', type=argparse_utils.arg_is_nonnegative_int, default=0, help=('The simulation index at which to begin analyses. Must be ' 'used in combination with either the number of simulation ' 'replicates (`-r`/`--reps`) or the `--data-key-path` ' 'option, and must be a positive ' 'integer that is less than the number of simulation ' 'replicates. 
This option can be useful if an analysis ' 'needs to be restarted.')) parser.add_argument( '--start-from-observed-index', action='store', type=argparse_utils.arg_is_nonnegative_int, default=0, help=('The observed config index at which to begin analyses. ' 'Can be used in combination with the `--data-key-path` ' 'option to restart long-running, multi-observed-config ' 'analyses')) parser.add_argument('--dry-run', action='store_true', help='Do not run analyses; only process settings') parser.add_argument('--version', action='version', version='%(prog)s ' + _program_info['version'], help='Report version and exit.') parser.add_argument('--quiet', action='store_true', help='Run without verbose messaging.') parser.add_argument('--debug', action='store_true', help='Run in debugging mode.') if argv == sys.argv: args = parser.parse_args() else: args = parser.parse_args(argv) ########################################################################## ## handle args from pymsbayes.utils.messaging import (LoggingControl, InfoLogger) LoggingControl.set_logging_level("INFO") if args.quiet: LoggingControl.set_logging_level("WARNING") if args.debug: LoggingControl.set_logging_level("DEBUG") log = LoggingControl.get_logger(__name__) from pymsbayes.workers import (MsBayesWorker, merge_prior_files, ObsSumStatsWorker) from pymsbayes.teams import ABCTeam from pymsbayes.utils.functions import (is_file, is_dir, long_division, mk_new_dir) from pymsbayes.utils.parsing import (get_patterns_from_prefixes, DEFAULT_STAT_PATTERNS, DIV_MODEL_PATTERNS, MODEL_PATTERNS, PSI_PATTERNS, MEAN_TAU_PATTERNS, OMEGA_PATTERNS, CV_PATTERNS, line_count) from pymsbayes.utils import sumresults, errors from pymsbayes.manager import Manager from pymsbayes.utils.tempfs import TempFileSystem from pymsbayes.config import MsBayesConfig from pymsbayes.utils import (GLOBAL_RNG, set_memory_trace, MSBAYES_SORT_INDEX, ToolPathManager) MSBAYES_SORT_INDEX.set_index(args.sort_index) if len(args.observed_configs) != len(set(args.observed_configs)): raise ValueError('All paths to observed config files must be unique') if args.num_standardizing_samples > args.num_prior_samples: args.num_standardizing_samples = args.num_prior_samples # get full paths to tools msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl') dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl') eureject_path = ToolPathManager.get_tool_full_path('eureject') abctb_path = ToolPathManager.get_tool_full_path('ABCestimator') # vet prior-configs option using_previous_priors = False previous_prior_dir = None if (len(args.prior_configs) == 1) and (is_dir(args.prior_configs[0])): previous_prior_dir = args.prior_configs.pop(0) previous_priors = glob.glob( os.path.join(previous_prior_dir, '*-prior-sample.txt')) previous_sums = glob.glob( os.path.join(previous_prior_dir, '*-means-and-std-devs.txt')) if (not previous_priors) or (not previous_sums): raise ValueError( 'directory {0!r} specified with `prior-configs` ' 'option does not contain necessary prior and summary ' 'files'.format(args.prior_configs[0])) using_previous_priors = True else: for path in args.prior_configs: if not is_file(path): raise ValueError( 'prior config {0!r} is not a file'.format(path)) if len(args.prior_configs) != len(set(args.prior_configs)): raise ValueError('All paths to prior config files must be unique') if not args.output_dir: args.output_dir = os.path.dirname(args.observed_configs[0]) base_dir = mk_new_dir(os.path.join(args.output_dir, 'pymsbayes-results')) if not args.temp_dir: 
args.temp_dir = base_dir info_path = os.path.join(base_dir, args.output_prefix + \ 'pymsbayes-info.txt') info = InfoLogger(info_path) info.write('[pymsbayes]'.format(base_dir)) info.write('\tversion = {version}'.format(**_program_info)) info.write('\toutput_directory = {0}'.format(base_dir)) temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-') base_temp_dir = temp_fs.base_dir info.write('\ttemp_directory = {0}'.format(base_temp_dir)) info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value())) info.write('\tsimulation_reps = {0}'.format(args.reps)) stat_patterns = DEFAULT_STAT_PATTERNS if args.stat_prefixes: for i in range(len(args.stat_prefixes)): if not args.stat_prefixes[i].endswith('.'): args.stat_prefixes[i] += '.' stat_patterns = get_patterns_from_prefixes(args.stat_prefixes, ignore_case=True) if not args.bandwidth: args.bandwidth = 2 / float(args.num_posterior_samples) if not args.seed: args.seed = random.randint(1, 999999999) GLOBAL_RNG.seed(args.seed) if args.data_key_path: observed_map = sumresults.parse_data_key_file(args.data_key_path) observed_paths = [observed_map[k] for k in sorted(observed_map.keys())] else: observed_dir = mk_new_dir( os.path.join(base_dir, 'observed-summary-stats')) observed_paths = [os.path.join(observed_dir, args.output_prefix + \ 'observed-{0}.txt'.format(i+1)) for i in range(len( args.observed_configs))] info.write('\tseed = {0}'.format(args.seed)) info.write('\tnum_processors = {0}'.format(args.np)) info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples)) info.write('\tnum_standardizing_samples = {0}'.format( args.num_standardizing_samples)) info.write('\tbandwidth = {0}'.format(args.bandwidth)) info.write('\tposterior_quantiles = {0}'.format( args.num_posterior_quantiles)) info.write('\tposterior_sample_size = {0}'.format( args.num_posterior_samples)) info.write('\tstat_patterns = {0}'.format(', '.join( [p.pattern for p in stat_patterns]))) # vet observed configs ref_config_path = args.observed_configs[0] ref_config = MsBayesConfig(ref_config_path) all_config_paths = [] num_taxon_pairs = ref_config.npairs assert num_taxon_pairs > 0 for config in args.observed_configs: all_config_paths.append(config) if not ref_config.equal_sample_table(config): if not args.keep_temps: temp_fs.purge() raise errors.SampleTableError( 'sample tables in config {0!r} and {1!r} differ; ' 'all sample tables must be the same.'.format( ref_config_path, config)) info.write('\tnum_taxon_pairs = {0}'.format(num_taxon_pairs)) info.write('\tdry_run = {0}'.format(args.dry_run)) info.write('\t[[tool_paths]]') info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path)) info.write('\t\tmsbayes = {0}'.format(msbayes_path)) info.write('\t\teureject = {0}'.format(eureject_path)) info.write('\t\tabcestimator = {0}'.format(abctb_path)) info.write('\t[[observed_configs]]') for i, cfg in enumerate(args.observed_configs): info.write('\t\t{0} = {1}'.format( i + 1, os.path.relpath(cfg, os.path.dirname(info_path)))) abc_team = ABCTeam( temp_fs=temp_fs, observed_stats_files=observed_paths, num_taxon_pairs=num_taxon_pairs, config_paths=args.prior_configs, previous_prior_dir=previous_prior_dir, num_prior_samples=args.num_prior_samples, num_processors=args.np, num_standardizing_samples=args.num_standardizing_samples, num_posterior_samples=args.num_posterior_samples, num_posterior_density_quantiles=args.num_posterior_quantiles, batch_size=args.prior_batch_size, output_dir=base_dir, output_prefix=args.output_prefix, prior_temp_dir=args.staging_dir, 
rng=GLOBAL_RNG, report_parameters=True, stat_patterns=stat_patterns, eureject_exe_path=eureject_path, abctoolbox_exe_path=abctb_path, msbayes_exe_path=None, abctoolbox_bandwidth=args.bandwidth, omega_threshold=0.01, cv_threshold=0.01, compress=args.compress, reporting_frequency=args.reporting_frequency, keep_temps=args.keep_temps, global_estimate_only=False, global_estimate=not args.no_global_estimate, generate_prior_samples_only=args.generate_samples_only, start_from_simulation_index=args.start_from_simulation_index, start_from_observed_index=args.start_from_observed_index) models_to_configs = {} configs_to_models = {} for k, v in abc_team.models.iteritems(): models_to_configs[k] = v configs_to_models[v] = k cfg = MsBayesConfig(v) all_config_paths.append(v) # vet prior configs if not ref_config.equal_sample_table(cfg): if not args.keep_temps: temp_fs.purge() raise errors.SampleTableError( 'sample tables in config {0!r} and {1!r} differ; ' 'all sample tables must be the same.'.format( ref_config_path, v)) info.write('\t[[observed_paths]]') for i in sorted(abc_team.observed_stats_paths.iterkeys()): info.write('\t\t{0} = {1}'.format( i, os.path.relpath(abc_team.observed_stats_paths[i], os.path.dirname(info_path)))) info.write('\t[[prior_configs]]') for i in sorted(abc_team.models.iterkeys()): info.write('\t\t{0} = {1}'.format( i, os.path.relpath(abc_team.models[i], os.path.dirname(info_path)))) ########################################################################## ## begin analysis --- get observed summary stats set_memory_trace() # start logging memory profile start_time = datetime.datetime.now() if args.data_key_path: log.info('Using provided summary statitics...') elif not args.dry_run: obs_temp_dir = base_temp_dir if args.staging_dir: obs_temp_dir = args.staging_dir observed_temp_fs = TempFileSystem(parent=obs_temp_dir, prefix='observed-temps-') if args.reps < 1: log.info('Calculating summary statistics from sequence data...') obs_workers = [] for i, cfg in enumerate(args.observed_configs): ss_worker = ObsSumStatsWorker(temp_fs=observed_temp_fs, config_path=cfg, output_path=observed_paths[i], schema='abctoolbox', stat_patterns=stat_patterns) obs_workers.append(ss_worker) obs_workers = Manager.run_workers(workers=obs_workers, num_processors=args.np) # re-vet all configs to see if some were changed by obsSumStats.pl new_ref_config = ref_config ref_modified = False # new ref because if all configs were updated all is good if not ref_config.equal_sample_table(ref_config_path): ref_modified = True new_ref_config = MsBayesConfig(ref_config_path) log.warning(""" The alignment lengths in config {0!r} have been corrected for sites with *any* ambiguous bases and/or gaps by obsSumStats.pl. """.format(ref_config_path)) for config in all_config_paths: if not new_ref_config.equal_sample_table(config): corrected_config = config if ref_modified: corrected_config = ref_config_path if not args.keep_temps: observed_temp_fs.purge() temp_fs.purge() raise errors.SampleTableError(""" The sample tables in configs {0!r} and {1!r} differ because obsSumStats.pl modified alignment lengths in config {2!r} to correct for sites in the alignments with *any* ambiguous bases and/or gaps. Please make sure the sample tables in all configs will be the same after correcting alignment lengths for sites that contain *any* ambiguous bases and/or gaps. You can do this by copying and pasting the sample table in {2!r} that has been corrected by obsSumStats.pl into the other configs that were not corrected. 
""".format(ref_config_path, config, corrected_config)) else: log.info('Simulating summary statistics from observed configs...') num_observed_workers = min([args.reps, args.np]) if args.reps <= args.np: observed_batch_size = 1 remainder = 0 else: observed_batch_size, remainder = long_division( args.reps, args.np) msbayes_workers = [] for idx, cfg in enumerate(args.observed_configs): observed_model_idx = configs_to_models.get(cfg, None) schema = 'abctoolbox' for i in range(num_observed_workers): worker = MsBayesWorker(temp_fs=observed_temp_fs, sample_size=observed_batch_size, config_path=cfg, model_index=observed_model_idx, report_parameters=True, schema=schema, include_header=True, stat_patterns=stat_patterns, write_stats_file=False, staging_dir=None, tag=idx) msbayes_workers.append(worker) if remainder > 0: worker = MsBayesWorker(temp_fs=observed_temp_fs, sample_size=remainder, config_path=cfg, model_index=observed_model_idx, report_parameters=True, schema=schema, include_header=True, stat_patterns=stat_patterns, write_stats_file=False, staging_dir=None, tag=idx) msbayes_workers.append(worker) # run parallel msbayes processes msbayes_workers = Manager.run_workers(workers=msbayes_workers, num_processors=args.np) workers = dict( zip(range(len(args.observed_configs)), [[] for i in range(len(args.observed_configs))])) for w in msbayes_workers: workers[w.tag].append(w) # merge simulated observed data into one file for i in range(len(args.observed_configs)): merge_prior_files([w.prior_path for w in workers[i]], observed_paths[i]) lc = line_count(observed_paths[i], ignore_headers=True) if lc != args.reps: if not args.keep_temps: temp_fs.purge() raise Exception( 'The number of observed simulations ({0}) ' 'generated for observed config {1!r} and output to ' 'file {2!r} does not match the number of reps ' '({3})'.format(lc, args.observed_configs[i], observed_paths[i], args.reps)) if not args.keep_temps: log.debug('purging observed temps...') observed_temp_fs.purge() ########################################################################## ## Begin ABC analyses if not args.dry_run: abc_team.run() stop_time = datetime.datetime.now() log.info('Done!') info.write('\t[[run_stats]]', log.info) info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info) info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info) info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)), log.info) if not args.keep_temps: log.debug('purging temps...') temp_fs.purge()
def main_cli(): description = '{name} {version}'.format(**_program_info) parser = argparse.ArgumentParser( description=description, formatter_class=argparse_utils.SmartHelpFormatter) parser.add_argument( '-c', '--config', type=argparse_utils.arg_is_config, required=True, help=('msBayes config file to be used to generate saturation ' 'plot.')) parser.add_argument( '-n', '--num-prior-samples', action='store', type=int, default=1000, help=('The number of prior samples to simulate for the ' 'saturation plot.')) parser.add_argument( '--np', action='store', type=int, default=multiprocessing.cpu_count(), help=('The maximum number of processes to run in parallel. The ' 'default is the number of CPUs available on the machine.')) parser.add_argument( '-o', '--output-dir', action='store', type=argparse_utils.arg_is_dir, help=('The directory in which all output files will be written. ' 'The default is to use the directory of the first observed ' 'config file.')) parser.add_argument( '--temp-dir', action='store', type=argparse_utils.arg_is_dir, help=('A directory to temporarily stage files. The default is to ' 'use the output directory.')) parser.add_argument( '-s', '--stat-prefixes', nargs='*', type=str, default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'], help=('Prefixes of summary statistics to use in the analyses. ' 'The prefixes should be separated by spaces. ' 'Default: `-s pi pi.net wattTheta tajD.denom`.')) parser.add_argument('--sort-index', action='store', type=int, default=0, choices=range(12), help=argparse_utils.get_sort_index_help_message()) parser.add_argument('--compress', action='store_true', help='Compress plot data file.') parser.add_argument('--keep-temps', action='store_true', help='Keep all temporary files.') parser.add_argument('--seed', action='store', type=int, help='Random number seed to use for the analysis.') parser.add_argument('--version', action='version', version='%(prog)s ' + _program_info['version'], help='Report version and exit.') parser.add_argument('--quiet', action='store_true', help='Run without verbose messaging.') parser.add_argument('--debug', action='store_true', help='Run in debugging mode.') args = parser.parse_args() ########################################################################## ## handle args from pymsbayes.utils.messaging import (LoggingControl, InfoLogger) LoggingControl.set_logging_level("INFO") if args.quiet: LoggingControl.set_logging_level("WARNING") if args.debug: LoggingControl.set_logging_level("DEBUG") log = LoggingControl.get_logger(__name__) from pymsbayes.workers import MsBayesWorker from pymsbayes.utils.parsing import (get_patterns_from_prefixes, DEFAULT_STAT_PATTERNS, get_dict_from_spreadsheets, dict_line_iter) from pymsbayes.manager import Manager from pymsbayes.utils.tempfs import TempFileSystem from pymsbayes.utils import probability, stats from pymsbayes.utils.functions import long_division from pymsbayes.config import MsBayesConfig from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager from pymsbayes.fileio import process_file_arg from pymsbayes import plotting MSBAYES_SORT_INDEX.set_index(args.sort_index) # get full paths to tools msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl') dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl') if not args.output_dir: args.output_dir = os.path.dirname(args.config) info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt')) sample_path = os.path.join(args.output_dir, 'prior-sample.txt') if args.compress: sample_path += 
'.gz' if not args.temp_dir: args.temp_dir = args.output_dir temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-') args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes] stat_patterns = get_patterns_from_prefixes( [s + '.' for s in args.stat_prefixes], ignore_case=True) if not args.seed: args.seed = random.randint(1, 999999999) GLOBAL_RNG.seed(args.seed) compress_level = None if args.compress: compress_level = 9 cfg = MsBayesConfig(args.config) num_taxon_pairs = cfg.npairs info.write('[pymsbayes]', log.info) info.write('\tprogram_name = {name}'.format(**_program_info), log.info) info.write('\tversion = {version}'.format(**_program_info), log.info) info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info) info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info) info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info) info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()), log.info) info.write( '\tstat_patterns = {0!r}'.format(', '.join( [p.pattern for p in stat_patterns])), log.info) info.write('\tseed = {0}'.format(args.seed), log.info) info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples), log.info) info.write('\tsample_path = {0!r}'.format(sample_path), log.info) info.write('\t[[tool_paths]]', log.info) info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info) info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info) info.write('\t[[config]]', log.debug) info.write('{0}'.format(str(cfg)), log.debug) ########################################################################## ## begin analysis --- generate samples start_time = datetime.datetime.now() if args.np > args.num_prior_samples: args.np = args.num_prior_samples batch_size, remainder = long_division(args.num_prior_samples, args.np) schema = 'abctoolbox' workers = [] for i in range(args.np): sample_size = batch_size if i == (args.np - 1): sample_size += remainder w = MsBayesWorker(temp_fs=temp_fs, sample_size=sample_size, config_path=args.config, report_parameters=True, schema=schema, include_header=True, stat_patterns=stat_patterns, write_stats_file=False) workers.append(w) log.info('Generating samples...') workers = Manager.run_workers(workers=workers, num_processors=args.np) log.info('Parsing samples...') sample = get_dict_from_spreadsheets([w.prior_path for w in workers]) log.info('Writing prior samples...') out, close = process_file_arg(sample_path, 'w', compresslevel=compress_level) for row in dict_line_iter(sample, sep='\t'): out.write(row) if close: out.close() log.info('Creating plots...') if not plotting.MATPLOTLIB_AVAILABLE: log.warning( '`matplotlib` could not be imported, so the plot can not be\n' 'produced. 
The data to create the plot can be found in:\n\t' '{0!r}'.format(sample_path)) sys.exit(1) for stat_pattern in stat_patterns: found = False for stat, values in sample.iteritems(): if stat_pattern.match(stat): values = [float(v) for v in values] found = True plot_path = os.path.join(args.output_dir, 'plot-{0}.pdf'.format(stat)) summary = stats.get_summary(values) s = r'mean = {0:.4f} ({1:.4f}-{2:.4f})'.format( summary['mean'], summary['qi_95'][0], summary['qi_95'][1]) hd = plotting.HistData(x=values, normed=True, bins=20, histtype='bar', align='mid', orientation='vertical', zorder=0) hist = plotting.ScatterPlot(hist_data_list=[hd], right_text=s) hist.left_text_size = 12.0 hist.right_text_size = 12.0 xticks = [i for i in hist.ax.get_xticks()] xtick_labels = [i for i in xticks] yticks = [i for i in hist.ax.get_yticks()] ytick_labels = [i for i in yticks] if len(xtick_labels) >= 8: for i in range(1, len(xtick_labels), 2): xtick_labels[i] = '' if len(ytick_labels) >= 8: for i in range(1, len(ytick_labels), 2): ytick_labels[i] = '' xticks_obj = plotting.Ticks(ticks=xticks, labels=xtick_labels, horizontalalignment='center') yticks_obj = plotting.Ticks(ticks=yticks, labels=ytick_labels) hist.xticks_obj = xticks_obj hist.yticks_obj = yticks_obj plot_grid = plotting.PlotGrid(subplots=[hist], num_columns=1, label_schema=None, title=stat, title_size=14.0, title_top=False, y_title='Density', y_title_position=0.001, y_title_size=14.0, height=4.0, width=6.0, auto_height=False) plot_grid.auto_adjust_margins = False plot_grid.margin_left = 0.04 plot_grid.margin_bottom = 0.04 plot_grid.margin_right = 1.0 plot_grid.margin_top = 0.97 plot_grid.reset_figure() plot_grid.savefig(plot_path) if not found: raise Exception('stat pattern {0!r} not found in simulated stats:' '\n\t{1}'.format(stat_pattern, ', '.join(sample.keys()))) stop_time = datetime.datetime.now() log.info('Done!') info.write('\t[[run_stats]]', log.info) info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info) info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info) info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)), log.info) if not args.keep_temps: log.debug('purging temps...') temp_fs.purge()