def assertSameUnsortedFiles(self, files):
    files = list(files)
    all_equal = True
    diffs = StringIO()
    f1, close = process_file_arg(files.pop(0))
    lines1 = sorted(f1.readlines())
    for f in files:
        f2, close2 = process_file_arg(f)
        lines2 = sorted(f2.readlines())
        if len(lines1) != len(lines2):
            all_equal = False
            diffs.write('{0} ({1}) and {2} ({3}) have different '
                    'number of lines\n'.format(f1.name, len(lines1),
                            f2.name, len(lines2)))
        for i in range(min(len(lines1), len(lines2))):
            if lines1[i].strip().split() != lines2[i].strip().split():
                all_equal = False
                diffs.write('{0} and {1} differ at sorted index '
                        '{2}\n'.format(f1.name, f2.name, i))
        if close2:
            f2.close()
    if not all_equal:
        _LOG.error('files are not equal after sorting:\n{0}\n'.format(
                diffs.getvalue()))
    self.assertTrue(all_equal)
    if close:
        f1.close()
def test_string_object(self):
    f, close = process_file_arg(self.test_path, 'w')
    self.assertIsInstance(f, file)
    self.assertTrue(close)
    self.assertFalse(f.closed)
    f.close()
    f, close = process_file_arg(self.cfg_path, 'rU')
    self.assertIsInstance(f, file)
    self.assertTrue(close)
    self.assertFalse(f.closed)
    f.close()
def test_read_compressed_file(self):
    gzfs, close = process_file_arg(self.gz_path, 'rb')
    out, close_out = process_file_arg(self.test_path, 'w')
    for line in gzfs:
        out.write(line)
    if close_out:
        out.close()
    if close:
        gzfs.close()
    self.assertSameFiles([self.ungz_path, self.test_path],
            exclude_line_endings=True)
def test_write_compressed_file(self):
    fs, close = process_file_arg(self.ungz_path, 'rb')
    out, close_out = process_file_arg(self.test_path, 'wb',
            compresslevel=9)
    for line in fs:
        out.write(line)
    if close_out:
        out.close()
    if close:
        fs.close()
    self.assertTrue(is_gzipped(self.test_path))
    self.assertSameFiles([self.gz_path, self.test_path],
            exclude_line_endings=True)
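# The tests above all exercise the same contract for `process_file_arg`.
# Below is a minimal sketch of that assumed contract: given a path or an
# already-open file-like object, return a `(stream, close)` pair where
# `close` is True only when this call opened the stream (so the caller is
# responsible for closing it). Keying gzip handling off a `.gz` suffix or an
# explicit `compresslevel` is an assumption; the real implementation in
# `pymsbayes.fileio` may differ in its details.
import gzip


def _process_file_arg_sketch(file_arg, mode='rU', compresslevel=None):
    if isinstance(file_arg, basestring):
        if (compresslevel is not None) or file_arg.endswith('.gz'):
            # gzip streams do not support universal-newline modes
            gz_mode = mode.replace('U', '')
            return gzip.open(file_arg, gz_mode, compresslevel or 9), True
        return open(file_arg, mode), True
    # the caller opened it, so the caller keeps responsibility for closing it
    return file_arg, False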
def write_result_summaries(self, prior_indices=None, sep='\t',
        include_tau_exclusion_info=False):
    if not prior_indices:
        prior_indices = self.prior_index_to_config.keys()
        if self.combined_prior_index:
            prior_indices.append(self.combined_prior_index)
    for prior_idx in prior_indices:
        for observed_idx in self.observed_index_to_path.iterkeys():
            out_path = self.get_result_summary_path(observed_idx,
                    prior_idx)
            out_path = functions.get_new_path(out_path)
            out, close = process_file_arg(out_path, 'w',
                    compresslevel=self.compresslevel)
            keys = []
            for i, r in enumerate(self.flat_result_iter(observed_idx,
                    prior_idx, include_tau_exclusion_info)):
                if i == 0:
                    keys = r.keys()
                    out.write('{0}\n'.format(sep.join(keys)))
                out.write('{0}\n'.format(
                        sep.join([str(r[k]) for k in keys])))
            out.close()
def parse_header(file_obj, sep='\t', strict=True, seek=True):
    file_stream, close = process_file_arg(file_obj, 'rU')
    try:
        header_line = file_stream.next()
    except StopIteration:
        file_stream.close()
        if strict:
            raise Exception('did not find header in {0}'.format(
                    file_stream.name))
        else:
            return None
    if not HEADER_PATTERN.match(header_line):
        file_stream.close()
        if strict:
            raise Exception('did not find header in {0}'.format(
                    file_stream.name))
        else:
            return None
    header = header_line.strip().split(sep)
    if close:
        file_stream.close()
    else:
        if seek:
            file_stream.seek(0)
    return header
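# Hedged usage sketch for `parse_header`: with the default `seek=True` and an
# already-open stream, the header is read and the stream is rewound rather
# than closed, so the caller can still iterate from the top. The column names
# are invented, and whether they count as a header depends on
# `HEADER_PATTERN` (defined elsewhere), so `strict=False` is used to make the
# sketch safe either way.
from cStringIO import StringIO

demo = StringIO('PRI.t\tpi\tpi.net\n0.5\t0.01\t0.002\n')
demo_header = parse_header(demo, strict=False)
# None is returned (rather than raising) if HEADER_PATTERN rejects the line
if demo_header is not None:
    assert demo_header == ['PRI.t', 'pi', 'pi.net']
    # the stream was passed in open, so it was rewound, not closed
    assert demo.next().startswith('PRI.t')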
def test_abcestimator(self):
    summary_path = self.get_test_path(prefix='test-summary-')
    post_path = self.get_test_path(prefix='test-posterior-')
    with open(post_path, 'w') as out:
        stream, close = fileio.process_file_arg(self.posterior_path)
        for line in stream:
            out.write('{0}'.format(line))
        if close:
            stream.close()
    regress_posterior_path = self.get_test_path(prefix='test-adjusted-')
    regress_worker = workers.ABCToolBoxRegressWorker(
            temp_fs=self.temp_fs,
            observed_path=self.sum_stats_path,
            posterior_path=post_path,
            parameter_indices=None,
            regress_summary_path=summary_path,
            regress_posterior_path=regress_posterior_path,
            exe_path=None,
            stdout_path=None,
            stderr_path=None,
            keep_temps=False,
            bandwidth=None,
            num_posterior_quantiles=100)
    self.assertFalse(regress_worker.finished)
    self.assertEqual(regress_worker.exe_path,
            ToolPathManager.get_tool_path('ABCestimator'))
    _LOG.info('{0}'.format(regress_worker.exe_path))
    regress_worker.start()
    self.assertTrue(regress_worker.finished)
    self.assertTrue(os.path.isfile(regress_worker.regress_summary_path))
    self.assertTrue(os.path.isfile(regress_worker.regress_posterior_path))
    self.assertEqual(self.get_number_of_lines(
            regress_worker.regress_posterior_path), 101)
def parse_summary_file(file_obj):
    f, close = process_file_arg(file_obj)
    lines = []
    for l in f:
        l = l.strip()
        if l:
            lines.append(l)
    if close:
        f.close()
    if len(lines) != 4:
        raise errors.SummaryFileParsingError(
                'summary file {0} has {1} lines'.format(f.name, len(lines)))
    header = lines[0].split()
    means = [float(x) for x in lines[1].split()]
    std_devs = [float(x) for x in lines[2].split()]
    sample_sizes = [int(x) for x in lines[3].split()]
    if not len(header) == len(means) == len(std_devs) == len(sample_sizes):
        raise errors.SummaryFileParsingError(
                'lines of summary file {0} have '
                'unequal numbers of columns'.format(f.name))
    d = {}
    for i in range(len(header)):
        d[header[i]] = {
                'mean': means[i],
                'std_deviation': std_devs[i],
                'n': sample_sizes[i]}
    return d, header
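# Hedged example of the four-line layout `parse_summary_file` expects:
# column names, means, standard deviations, then sample sizes, with
# whitespace-delimited columns. The statistic names and values below are
# invented purely for illustration.
from cStringIO import StringIO

summary_demo = StringIO(
        'pi\twattTheta\n'
        '0.012\t0.015\n'
        '0.003\t0.004\n'
        '1000\t1000\n')
d, header = parse_summary_file(summary_demo)
assert header == ['pi', 'wattTheta']
assert d['pi'] == {'mean': 0.012, 'std_deviation': 0.003, 'n': 1000}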
def parameter_density_iter(parameter_density_file,
        parameter_patterns=DIV_MODEL_PATTERNS + MODEL_PATTERNS +
                PSI_PATTERNS + MEAN_TAU_PATTERNS + OMEGA_PATTERNS +
                CV_PATTERNS):
    dens_file, close = process_file_arg(parameter_density_file)
    try:
        header = parse_header(dens_file, seek=False)
        parameter_indices = functions.get_indices_of_patterns(header,
                parameter_patterns)
        indices_to_heads = dict(zip(parameter_indices,
                [header[i] for i in parameter_indices]))
        heads_to_dens_tups = dict(zip(
                [header[i] for i in parameter_indices],
                [None for i in range(len(parameter_indices))]))
        if not len(parameter_indices) == len(set(
                indices_to_heads.itervalues())):
            raise errors.ParameterParsingError(
                    'some parameters were found in '
                    'multiple columns in density file {0!r}'.format(
                            dens_file.name))
        for i, line in enumerate(dens_file):
            l = line.strip().split()
            if l:
                for idx in parameter_indices:
                    heads_to_dens_tups[indices_to_heads[idx]] = (
                            float(l[idx]), float(l[idx + 1]))
                yield heads_to_dens_tups
    finally:
        if close:
            dens_file.close()
def result_path_iter(self, observed_index, prior_index):
    true_model = self.observed_index_to_prior_index[observed_index]
    out_dir = self.get_result_dir(observed_index, prior_index)
    if not os.path.isdir(out_dir):
        raise Exception('expected result directory {0!r} does not '
                'exist'.format(out_dir))
    observed_stream, close = process_file_arg(
            self.observed_index_to_path[observed_index])
    header = parsing.parse_header(observed_stream, sep='\t', strict=True,
            seek=False)
    parameter_indices = functions.get_indices_of_patterns(header,
            parsing.PARAMETER_PATTERNS)
    for i, line in enumerate(observed_stream):
        l = line.strip().split()
        true_params = dict(zip([header[x] for x in parameter_indices],
                [l[x] for x in parameter_indices]))
        true_params['PRI.model'] = str(true_model)
        result_prefix = '{0}{1}-'.format(self.get_result_path_prefix(
                observed_index, prior_index, i + 1),
                self.final_result_index)
        summary_path = result_prefix + 'posterior-summary.txt'
        psi_path = result_prefix + 'psi-results.txt'
        omega_path = result_prefix + 'omega-results.txt'
        cv_path = result_prefix + 'cv-results.txt'
        div_model_path = result_prefix + 'div-model-results.txt'
        model_path = result_prefix + 'model-results.txt'
        paths = {'summary': summary_path,
                 'psi': psi_path,
                 'omega': omega_path,
                 'cv': cv_path,
                 'div-model': div_model_path,
                 'model': model_path}
        yield true_params, paths
    # only close the stream if process_file_arg opened it
    if close:
        observed_stream.close()
def get_number_of_lines(self, path):
    f, close = process_file_arg(path)
    count = 0
    for l in f:
        count += 1
    if close:
        f.close()
    return count
def get_number_of_header_lines(self, path):
    f, close = process_file_arg(path)
    count = 0
    for l in f:
        if HEADER_PATTERN.match(l.strip()):
            count += 1
    if close:
        f.close()
    return count
def reduce_columns(in_file, out_file, column_indices, sep='\t',
        extra_tab=False):
    if not column_indices:
        raise Exception('no column indices to retain')
    in_stream, close_in = process_file_arg(in_file, 'rU')
    out_stream, close_out = process_file_arg(out_file, 'w')
    line_iter = iter(in_stream)
    for line_num, line in enumerate(line_iter):
        l = line.strip().split(sep)
        new_line = [l[i] for i in column_indices]
        if extra_tab:
            out_stream.write('%s\t\n' % sep.join(new_line))
        else:
            out_stream.write('%s\n' % sep.join(new_line))
    if close_in:
        in_stream.close()
    if close_out:
        out_stream.close()
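# Hedged usage sketch for `reduce_columns`: keep only the first and third
# columns of a tab-delimited stream. In-memory streams keep the example
# self-contained, assuming `process_file_arg` passes open streams through
# unchanged (as the file-object tests above suggest); real callers typically
# pass paths.
from cStringIO import StringIO

src = StringIO('a\tb\tc\n1\t2\t3\n')
dst = StringIO()
reduce_columns(src, dst, [0, 2])
assert dst.getvalue() == 'a\tc\n1\t3\n'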
def files_equal(self, f1, f2, exclude_line_endings=False):
    equal = True
    diffs = []
    f1, c1 = process_file_arg(f1)
    f2, c2 = process_file_arg(f2)
    line = 0
    f1_end = False
    f2_end = False
    while True:
        line += 1
        if f1_end == False:
            try:
                l1 = f1.next()
            except (StopIteration, EOFError):
                f1_end = line
        if f2_end == False:
            try:
                l2 = f2.next()
            except (StopIteration, EOFError):
                f2_end = line
        if f1_end != False and f2_end != False:
            break
        if exclude_line_endings:
            # only strip lines that were actually read this iteration;
            # stripping unconditionally can hit an unbound or stale line
            # when one file has already ended
            if f1_end == False:
                l1 = l1.strip()
            if f2_end == False:
                l2 = l2.strip()
        if f1_end == False and f2_end == False and l1 != l2:
            diffs.append(line)
            equal = False
    if f1_end != f2_end:
        mn = min([f1_end, f2_end])
        mx = max([f1_end, f2_end])
        diffs.extend(range(mn, mx + 1))
        equal = False
    assert len(diffs) == len(set(diffs))
    if c1:
        f1.close()
    if c2:
        f2.close()
    return equal, diffs
def test_file_object(self):
    f = open(self.cfg_path, 'rU')
    f2, close = process_file_arg(f)
    self.assertIsInstance(f2, file)
    self.assertFalse(close)
    self.assertFalse(f2.closed)
    self.assertFalse(f.closed)
    self.assertEqual(f, f2)
    f.close()
    self.assertTrue(f2.closed)
    self.assertTrue(f.closed)
def line_count(file_obj, ignore_headers=False):
    f, close = process_file_arg(file_obj)
    count = 0
    for line in f:
        if ignore_headers:
            if HEADER_PATTERN.match(line):
                continue
        count += 1
    if close:
        f.close()
    return count
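# Hedged usage sketch for `line_count` on an in-memory stream. The
# `ignore_headers` path is skipped here because it depends on the exact
# `HEADER_PATTERN`, which is defined elsewhere.
from cStringIO import StringIO

assert line_count(StringIO('a\nb\nc\n')) == 3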
def _parse_table(self, config_file):
    self.alignments = OrderedDict()
    cfg_stream, close = fileio.process_file_arg(config_file)
    try:
        table_started = False
        table_finished = False
        row_num = 0
        for i, l in enumerate(cfg_stream):
            line = l.strip()
            if self._end_pattern.match(line):
                if not table_started:
                    raise errors.SampleTableError(
                            'hit end of sample table before beginning')
                if len(self.alignments) < 1:
                    raise errors.SampleTableError(
                            'no rows found in sample table')
                table_finished = True
                break
            if self._begin_pattern.match(line):
                table_started = True
                continue
            if not table_started:
                continue
            if (line == '') or (line.startswith('#')):
                continue
            row_num += 1
            try:
                al = AlignmentConfig(line)
            except errors.SampleTableRowError as e:
                _LOG.error('sample table row {0} is invalid'.format(
                        row_num))
                raise e
            if al.taxon_name not in self.alignments:
                self.alignments[al.taxon_name] = OrderedDict()
                self.alignments[al.taxon_name][al.locus_name] = al
                self._ordering.append((al.taxon_name, al.locus_name))
                continue
            if al.locus_name in self.alignments[al.taxon_name]:
                raise errors.SampleTableError('locus {0} found twice '
                        'for taxon {1} at row {2} of sample '
                        'table'.format(al.locus_name, al.taxon_name,
                                row_num))
            self.alignments[al.taxon_name][al.locus_name] = al
            self._ordering.append((al.taxon_name, al.locus_name))
        if not table_started:
            raise errors.SampleTableError('no sample table found')
        if not table_finished:
            raise errors.SampleTableError('no end of table found')
    finally:
        if close:
            cfg_stream.close()
def parse_model_key_file(path):
    wd = os.path.dirname(path)
    f, close = process_file_arg(path)
    model_paths = {}
    for line in f:
        l = line.strip().split('=')
        if len(l) != 2:
            raise Exception('unexpected line {0!r} in model key '
                    'file'.format(line))
        model_index = l[0].strip().strip('m')
        p = os.path.abspath(os.path.join(wd, l[1].strip()))
        model_paths[int(model_index)] = p
    # respect the close flag rather than closing unconditionally
    if close:
        f.close()
    return model_paths
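# Hedged example of the model-key format assumed by `parse_model_key_file`:
# one `m<index> = <path>` pair per line, with paths resolved relative to the
# key file's own directory. The file contents here are invented.
import os
import tempfile

key_dir = tempfile.mkdtemp()
key_path = os.path.join(key_dir, 'model-key.txt')
with open(key_path, 'w') as key_out:
    key_out.write('m1 = model-1.cfg\nm2 = model-2.cfg\n')
model_paths = parse_model_key_file(key_path)
assert sorted(model_paths.keys()) == [1, 2]
assert model_paths[1] == os.path.join(key_dir, 'model-1.cfg')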
def parse_abctoolbox_summary_file(file_obj):
    sum_file, close = process_file_arg(file_obj, 'rU')
    header = sum_file.next().strip().split()
    param_names = header[1:]
    params_to_indices = dict(zip(param_names,
            [i for i in range(len(param_names))]))
    summaries = dict(zip(param_names,
            [{} for i in range(len(param_names))]))
    for line in sum_file:
        l = line.strip().split()
        stat_name = l.pop(0)
        for k, d in summaries.iteritems():
            d[stat_name] = float(l[params_to_indices[k]])
    if close:
        sum_file.close()
    return summaries
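# Hedged sketch of the ABCtoolbox summary layout this parser assumes: the
# first column names the statistic (e.g. a mode or HPD bound) and each
# remaining column is a parameter. Column names and values are invented.
from cStringIO import StringIO

abc_summary = StringIO(
        'what PRI.E.t PRI.omega\n'
        'mode 0.5 0.1\n'
        'HPD_low 0.2 0.0\n')
summaries = parse_abctoolbox_summary_file(abc_summary)
assert summaries['PRI.omega']['mode'] == 0.1
assert summaries['PRI.E.t']['HPD_low'] == 0.2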
def _has_non_sorted_results(self, div_model_path):
    length = None
    f, close = process_file_arg(div_model_path)
    f.next()  # header
    for line in f:
        l = line.strip()
        if l:
            div_model_key = l.split()[0]
            div_model = div_model_key.split(',')
            if not length:
                length = len(div_model)
            if length != len(div_model):
                f.close()
                return False
    f.close()
    return True
def is_config(cls, cfg_file):
    cfg_stream, close = fileio.process_file_arg(cfg_file)
    for i in range(100):
        try:
            line = cfg_stream.next()
        except StopIteration:
            # fewer than 100 lines and no table header found
            if close:
                cfg_stream.close()
            return False
        if cls._table_begin_pattern.match(line.strip()):
            if close:
                cfg_stream.close()
            return True
    if close:
        cfg_stream.close()
    return False
def prior_for_msreject(in_file, out_file,
        stat_patterns=DEFAULT_STAT_PATTERNS,
        parameter_patterns=PARAMETER_PATTERNS,
        dummy_patterns=DUMMY_PATTERNS,
        include_header=False):
    header = parse_header(in_file)
    in_file, close = process_file_arg(in_file)
    indices = get_parameter_indices(header,
            parameter_patterns=parameter_patterns)
    indices.extend(get_stat_indices(header, stat_patterns=stat_patterns))
    # use the `dummy_patterns` argument rather than the module-level default
    indices.extend(get_dummy_indices(header, dummy_patterns=dummy_patterns))
    if not include_header:
        in_file.next()
    reduce_columns(in_file, out_file, sorted(indices), extra_tab=False)
    if close:
        in_file.close()
    return [header[i] for i in sorted(indices)]
def _parse_results_file(self):
    file_stream, close = process_file_arg(self.path)
    ss_iter = parsing.spreadsheet_iter([file_stream])
    for d in ss_iter:
        if self._full():
            if close:
                file_stream.close()
            return
        try:
            dms = UnorderedDivergenceModelSummary(d)
        except:
            if close:
                file_stream.close()
            raise
        self.n += 1
        self.cumulative_prob += dms.prob
        self.models.append(dms)
    if close:
        file_stream.close()
def rescale_posterior(in_path, out_path, scale_factor, model_indices):
    header = None
    out, close = process_file_arg(out_path, 'w', compresslevel=9)
    omegas = []
    psis = []
    for i, d in enumerate(spreadsheet_iter([in_path])):
        if i == 0:
            header = d.keys()
            out.write('{0}\n'.format('\t'.join(header)))
        model_index = int(d['PRI.model'])
        if model_index in model_indices:
            d['PRI.E.t'] = float(d['PRI.E.t']) * scale_factor
            d['PRI.var.t'] = float(d['PRI.var.t']) * (scale_factor * 0.5)
            d['PRI.omega'] = float(d['PRI.omega']) * scale_factor
        omegas.append(d['PRI.omega'])
        psis.append(int(d['PRI.Psi']))
        # write values in header order so columns stay aligned across rows
        out.write('{0}\n'.format('\t'.join([str(d[k]) for k in header])))
    out.close()
    return omegas, psis
def _split_config(self, cfg):
    cfg_stream, close = fileio.process_file_arg(cfg)
    preamble = StringIO()
    table = StringIO()
    preamble.write('[preamble]\n')
    table_started = False
    for i, line in enumerate(cfg_stream):
        if self._table_end_pattern.match(line.strip()):
            table.write(line)
            break
        if self._table_begin_pattern.match(line.strip()):
            table_started = True
            table.write(line)
            continue
        if table_started:
            table.write(line)
        else:
            preamble.write(line)
    if close:
        cfg_stream.close()
    return preamble, table
def spreadsheet_iter(spreadsheets, sep='\t', header=None):
    head_line = False
    if not header:
        head_line = True
        header = parse_header(spreadsheets[0], sep=sep)
    for sheet_idx, ss in enumerate(spreadsheets):
        file_stream, close = process_file_arg(ss, 'rU')
        if head_line:
            h = file_stream.next().strip().split(sep)
            if header != h:
                raise Exception('headers do not match')
        for row_idx, row in enumerate(file_stream):
            if row.strip() == '':
                continue
            r = [el.strip() for el in row.strip().split(sep)]
            if len(r) != len(header):
                raise Exception('row {0} of spreadsheet {1} has {2} '
                        'columns, header has {3}'.format(row_idx + 1,
                                sheet_idx + 1, len(r), len(header)))
            yield dict(zip(header, r))
        if close:
            file_stream.close()
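# Hedged usage sketch for `spreadsheet_iter`: rows from multiple sheets that
# share a header are yielded as dicts, in order. Passing `header` explicitly
# sidesteps `parse_header` (and therefore `HEADER_PATTERN`), so the sheets
# below contain only data rows; column names and values are invented, and
# `process_file_arg` is assumed to pass open streams through unchanged.
from cStringIO import StringIO

sheet1 = StringIO('0.5\t0.01\n')
sheet2 = StringIO('0.7\t0.02\n')
rows = list(spreadsheet_iter([sheet1, sheet2], header=['PRI.t', 'pi']))
assert [r['PRI.t'] for r in rows] == ['0.5', '0.7']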
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description,
            formatter_class=argparse_utils.SmartHelpFormatter)
    parser.add_argument('-c', '--config',
            type=argparse_utils.arg_is_config,
            required=True,
            help=('msBayes config file to be used to generate saturation '
                  'plot.'))
    parser.add_argument('-n', '--num-prior-samples',
            action='store',
            type=int,
            default=1000,
            help=('The number of prior samples to simulate for the '
                  'saturation plot.'))
    parser.add_argument('--np',
            action='store',
            type=int,
            default=multiprocessing.cpu_count(),
            help=('The maximum number of processes to run in parallel. The '
                  'default is the number of CPUs available on the machine.'))
    parser.add_argument('-o', '--output-dir',
            action='store',
            type=argparse_utils.arg_is_dir,
            help=('The directory in which all output files will be written. '
                  'The default is to use the directory of the first observed '
                  'config file.'))
    parser.add_argument('--temp-dir',
            action='store',
            type=argparse_utils.arg_is_dir,
            help=('A directory to temporarily stage files. The default is to '
                  'use the output directory.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs='*',
            type=str,
            default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
            help=('Prefixes of summary statistics to use in the analyses. '
                  'The prefixes should be separated by spaces. '
                  'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--sort-index',
            action='store',
            type=int,
            default=0,
            choices=range(12),
            help=argparse_utils.get_sort_index_help_message())
    parser.add_argument('--compress',
            action='store_true',
            help='Compress plot data file.')
    parser.add_argument('--keep-temps',
            action='store_true',
            help='Keep all temporary files.')
    parser.add_argument('--seed',
            action='store',
            type=int,
            help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action='version',
            version='%(prog)s ' + _program_info['version'],
            help='Report version and exit.')
    parser.add_argument('--quiet',
            action='store_true',
            help='Run without verbose messaging.')
    parser.add_argument('--debug',
            action='store_true',
            help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, get_dict_from_spreadsheets,
            dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability, stats
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes import plotting

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    sample_path = os.path.join(args.output_dir, 'prior-sample.txt')
    if args.compress:
        sample_path += '.gz'
    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
            ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
    info.write('\tsample_path = {0!r}'.format(sample_path), log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(
                temp_fs=temp_fs,
                sample_size=sample_size,
                config_path=args.config,
                report_parameters=True,
                schema=schema,
                include_header=True,
                stat_patterns=stat_patterns,
                write_stats_file=False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(
            workers=workers,
            num_processors=args.np)

    log.info('Parsing samples...')
    sample = get_dict_from_spreadsheets([w.prior_path for w in workers])

    log.info('Writing prior samples...')
    out, close = process_file_arg(sample_path, 'w',
            compresslevel=compress_level)
    for row in dict_line_iter(sample, sep='\t'):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')
    if not plotting.MATPLOTLIB_AVAILABLE:
        log.warning(
                '`matplotlib` could not be imported, so the plot can not be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(sample_path))
        sys.exit(1)

    for stat_pattern in stat_patterns:
        found = False
        for stat, values in sample.iteritems():
            if stat_pattern.match(stat):
                values = [float(v) for v in values]
                found = True
                plot_path = os.path.join(args.output_dir,
                        'plot-{0}.pdf'.format(stat))
                summary = stats.get_summary(values)
                s = r'mean = {0:.4f} ({1:.4f}-{2:.4f})'.format(
                        summary['mean'],
                        summary['qi_95'][0],
                        summary['qi_95'][1])
                hd = plotting.HistData(x=values,
                        normed=True,
                        bins=20,
                        histtype='bar',
                        align='mid',
                        orientation='vertical',
                        zorder=0)
                hist = plotting.ScatterPlot(hist_data_list=[hd],
                        right_text=s)
                hist.left_text_size = 12.0
                hist.right_text_size = 12.0
                xticks = [i for i in hist.ax.get_xticks()]
                xtick_labels = [i for i in xticks]
                yticks = [i for i in hist.ax.get_yticks()]
                ytick_labels = [i for i in yticks]
                if len(xtick_labels) >= 8:
                    for i in range(1, len(xtick_labels), 2):
                        xtick_labels[i] = ''
                if len(ytick_labels) >= 8:
                    for i in range(1, len(ytick_labels), 2):
                        ytick_labels[i] = ''
                xticks_obj = plotting.Ticks(ticks=xticks,
                        labels=xtick_labels,
                        horizontalalignment='center')
                yticks_obj = plotting.Ticks(ticks=yticks,
                        labels=ytick_labels)
                hist.xticks_obj = xticks_obj
                hist.yticks_obj = yticks_obj
                plot_grid = plotting.PlotGrid(subplots=[hist],
                        num_columns=1,
                        label_schema=None,
                        title=stat,
                        title_size=14.0,
                        title_top=False,
                        y_title='Density',
                        y_title_position=0.001,
                        y_title_size=14.0,
                        height=4.0,
                        width=6.0,
                        auto_height=False)
                plot_grid.auto_adjust_margins = False
                plot_grid.margin_left = 0.04
                plot_grid.margin_bottom = 0.04
                plot_grid.margin_right = 1.0
                plot_grid.margin_top = 0.97
                plot_grid.reset_figure()
                plot_grid.savefig(plot_path)
        if not found:
            raise Exception('stat pattern {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(stat_pattern, ', '.join(sample.keys())))

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
def parameter_iter(file_obj, include_line=False, include_thetas=False):
    indices = {}
    post_file, close = process_file_arg(file_obj)
    header = parse_header(post_file, seek=False)
    mean_t_indices = functions.get_indices_of_patterns(header,
            MEAN_TAU_PATTERNS)
    if len(mean_t_indices) > 1:
        post_file.close()
        raise errors.ParameterParsingError('posterior file {0} has {1} mean '
                'tau columns'.format(post_file.name, len(mean_t_indices)))
    if mean_t_indices:
        indices['mean_tau'] = mean_t_indices
    omega_indices = functions.get_indices_of_patterns(header, OMEGA_PATTERNS)
    if len(omega_indices) > 1:
        post_file.close()
        raise errors.ParameterParsingError('posterior file {0} has {1} omega '
                'columns'.format(post_file.name, len(omega_indices)))
    if omega_indices:
        indices['omega'] = omega_indices
    cv_indices = functions.get_indices_of_patterns(header, CV_PATTERNS)
    if len(cv_indices) > 1:
        post_file.close()
        raise errors.ParameterParsingError('posterior file {0} has {1} cv '
                'columns'.format(post_file.name, len(cv_indices)))
    if cv_indices:
        indices['cv'] = cv_indices
    t_indices = functions.get_indices_of_patterns(header, TAU_PATTERNS)
    if t_indices:
        indices['taus'] = t_indices
    if include_thetas:
        a_theta_indices = functions.get_indices_of_patterns(header,
                A_THETA_PATTERNS)
        d1_theta_indices = functions.get_indices_of_patterns(header,
                D1_THETA_PATTERNS)
        d2_theta_indices = functions.get_indices_of_patterns(header,
                D2_THETA_PATTERNS)
        if a_theta_indices:
            indices['a_thetas'] = a_theta_indices
        if d1_theta_indices:
            indices['d1_thetas'] = d1_theta_indices
        if d2_theta_indices:
            indices['d2_thetas'] = d2_theta_indices
    psi_indices = functions.get_indices_of_patterns(header, PSI_PATTERNS)
    if len(psi_indices) > 1:
        post_file.close()
        raise errors.ParameterParsingError('posterior file {0} has {1} psi '
                'columns'.format(post_file.name, len(psi_indices)))
    if psi_indices:
        indices['psi'] = psi_indices
    model_indices = functions.get_indices_of_patterns(header, MODEL_PATTERNS)
    if len(model_indices) > 1:
        post_file.close()
        raise errors.ParameterParsingError('posterior file {0} has {1} model '
                'columns'.format(post_file.name, len(model_indices)))
    if model_indices:
        indices['model'] = model_indices
    div_model_indices = functions.get_indices_of_patterns(header,
            DIV_MODEL_PATTERNS)
    if len(div_model_indices) > 1:
        post_file.close()
        raise errors.ParameterParsingError('posterior file {0} has {1} div '
                'model columns'.format(post_file.name,
                        len(div_model_indices)))
    if div_model_indices:
        indices['div_model'] = div_model_indices
    samples = dict(zip(indices.keys(), [None for i in range(len(indices))]))
    for i, line in enumerate(post_file):
        l = line.strip().split()
        if l:
            if len(l) != len(header):
                post_file.close()
                raise errors.ParameterParsingError('posterior file {0} has '
                        '{1} columns at line {2}; expecting {3}'.format(
                                post_file.name, len(l), i + 2, len(header)))
            # use `idx` in the comprehensions: in Python 2 a list
            # comprehension variable leaks into the enclosing scope, so
            # reusing `i` here would clobber the line counter used in the
            # error messages below
            for k, idx_list in indices.iteritems():
                if k in ['mean_tau', 'omega', 'cv']:
                    samples[k] = [float(l[idx]) for idx in idx_list]
                elif k in ['psi', 'model', 'div_model']:
                    samples[k] = [int(l[idx]) for idx in idx_list]
                elif k in ['taus', 'a_thetas', 'd1_thetas', 'd2_thetas']:
                    samples[k] = [[float(l[idx]) for idx in idx_list]]
                else:
                    post_file.close()
                    raise errors.ParameterParsingError(
                            'unexpected key {0!r}; '
                            'posterior file {1}, line {2}'.format(
                                    k, post_file.name, i + 2))
            if include_line:
                yield samples, l
            else:
                yield samples
    if close:
        post_file.close()
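# Hedged usage note for `parameter_iter`: it streams one posterior sample per
# row, yielding a dict keyed by parameter group ('mean_tau', 'omega', 'taus',
# ...), with keys present only for columns matched in the header. Note the
# same `samples` dict object is reused and mutated between yields, so copy it
# if you need to accumulate rows. The path below is hypothetical, so this is
# left as a commented sketch:
#
#     for sample in parameter_iter('posterior-sample.txt.gz'):
#         mean_tau = sample.get('mean_tau', [None])[0]
#         taus = sample.get('taus', [[]])[0]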
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument('-c', '--config',
            type=arg_is_config,
            required=True,
            help=('msBayes config file to be used to generate saturation '
                  'plot.'))
    parser.add_argument('-n', '--num-prior-samples',
            action='store',
            type=int,
            default=1000,
            help=('The number of prior samples to simulate for the '
                  'saturation plot.'))
    parser.add_argument('--np',
            action='store',
            type=int,
            default=multiprocessing.cpu_count(),
            help=('The maximum number of processes to run in parallel. The '
                  'default is the number of CPUs available on the machine.'))
    parser.add_argument('-o', '--output-dir',
            action='store',
            type=arg_is_dir,
            help=('The directory in which all output files will be written. '
                  'The default is to use the directory of the first observed '
                  'config file.'))
    parser.add_argument('--temp-dir',
            action='store',
            type=arg_is_dir,
            help=('A directory to temporarily stage files. The default is to '
                  'use the output directory.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs='*',
            type=str,
            default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
            help=('Prefixes of summary statistics to use in the analyses. '
                  'The prefixes should be separated by spaces. '
                  'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--vertical-lines',
            nargs='*',
            type=float,
            default=[],
            help=('Positions along x-axis where vertical lines are to be '
                  'drawn. Default is to draw no vertical lines.'))
    parser.add_argument('--compress',
            action='store_true',
            help='Compress plot data file.')
    parser.add_argument('--keep-temps',
            action='store_true',
            help='Keep all temporary files.')
    parser.add_argument('--seed',
            action='store',
            type=int,
            help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action='version',
            version='%(prog)s ' + _program_info['version'],
            help='Report version and exit.')
    parser.add_argument('--quiet',
            action='store_true',
            help='Run without verbose messaging.')
    parser.add_argument('--debug',
            action='store_true',
            help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, get_stats_by_time, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes.plotting import MATPLOTLIB_AVAILABLE, SaturationPlotGrid

    MSBAYES_SORT_INDEX.set_index(0)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    stats_by_time_path = os.path.join(args.output_dir, 'stats-by-time.txt')
    if args.compress:
        stats_by_time_path += '.gz'
    plot_path = os.path.join(args.output_dir, 'saturation-plot.pdf')
    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs
    cfg.div_model_prior = 'constrained'
    cfg.psi = probability.DiscreteUniformDistribution(num_taxon_pairs,
            num_taxon_pairs)
    config_path = temp_fs.get_file_path(prefix='cfg-')
    cfg.write(config_path)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
            ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
    info.write('\tstats_by_time_path = {0!r}'.format(stats_by_time_path),
            log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(
                temp_fs=temp_fs,
                sample_size=sample_size,
                config_path=config_path,
                report_parameters=True,
                schema=schema,
                include_header=True,
                stat_patterns=stat_patterns,
                write_stats_file=False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(
            workers=workers,
            num_processors=args.np)

    log.info('Parsing samples...')
    stats_by_time = get_stats_by_time([w.prior_path for w in workers])
    stat_keys = stats_by_time.keys()
    stat_keys.remove('PRI.t')
    for prefix in args.stat_prefixes:
        if prefix not in stat_keys:
            raise Exception('stat prefix {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(prefix, ', '.join(stat_keys)))
    header = ['PRI.t'] + args.stat_prefixes

    log.info('Writing stats-by-time matrix...')
    out, close = process_file_arg(stats_by_time_path, 'w',
            compresslevel=compress_level)
    for row in dict_line_iter(stats_by_time, sep='\t', header=header):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')
    if not MATPLOTLIB_AVAILABLE:
        log.warning(
                '`matplotlib` could not be imported, so the plot can not be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(stats_by_time_path))
    else:
        y_labels = {'pi': r'$\pi$',
                    'pi.net': r'$\pi_{net}$',
                    'wattTheta': r'$\theta_W$',
                    'tajD.denom': r'$SD(\pi - \theta_W)$'}
        spg = SaturationPlotGrid(stats_by_time,
                x_key='PRI.t',
                y_keys=args.stat_prefixes,
                y_labels=y_labels,
                num_columns=2,
                vertical_line_positions=args.vertical_lines)
        fig = spg.create_grid()
        fig.savefig(plot_path)

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
def summarize_sim_results(info_path):
    info_path = expand_path(info_path)
    sim_results = DMCSimulationResults(info_path)
    out_dir = os.path.dirname(info_path)
    summary_path = os.path.join(out_dir, 'results-summary.txt')
    result_path = sim_results.get_result_summary_path(observed_index=1,
            prior_index=sim_results.combined_prior_index)
    d = get_dict_from_spreadsheets([result_path])
    num_excluded = [int(x) for x in d['num_excluded']]
    num_excluded_glm = [int(x) for x in d['num_excluded_glm']]
    bf_num_excluded = [int(x) for x in d['bf_num_excluded']]
    bf_num_excluded_glm = [int(x) for x in d['bf_num_excluded_glm']]
    prob_of_exclusion = [float(x) for x in d['prob_of_exclusion']]
    prob_of_exclusion_glm = [float(x) for x in d['prob_of_exclusion_glm']]
    prior_prob_of_exclusion = [float(x) for x in d['prior_prob_of_exclusion']]
    bf_of_exclusion = [float(x) for x in d['bf_of_exclusion']]
    bf_of_exclusion_glm = [float(x) for x in d['bf_of_exclusion_glm']]
    num_sims = sim_results.num_sim_reps
    assert len(num_excluded) == num_sims
    assert len(num_excluded_glm) == num_sims
    assert len(prob_of_exclusion) == num_sims
    assert len(prob_of_exclusion_glm) == num_sims
    summary_stream, close = process_file_arg(summary_path, 'w')
    summary_stream.write('Proportion of simulations excluding truth: '
            '{0}\n'.format(
                    len([1 for x in bf_num_excluded if x > 0]) /
                            float(num_sims)))
    summary_stream.write('Proportion of simulations excluding truth with '
            'GLM-adjustment: {0}\n'.format(
                    len([1 for x in bf_num_excluded_glm if x > 0]) /
                            float(num_sims)))
    summary_stream.write('Average number of tau parameters excluded: '
            '{0}\n'.format(sum(bf_num_excluded) / float(num_sims)))
    summary_stream.write('Average number of tau parameters excluded with '
            'GLM: {0}\n'.format(sum(bf_num_excluded_glm) / float(num_sims)))
    summary_stream.write('Mode number of tau parameters excluded: '
            '{0}\n'.format(mode_list(bf_num_excluded)))
    summary_stream.write('Mode number of tau parameters excluded with GLM: '
            '{0}\n'.format(mode_list(bf_num_excluded_glm)))
    summary_stream.write('Max number of tau parameters excluded: '
            '{0}\n'.format(max(bf_num_excluded)))
    summary_stream.write('Max number of tau parameters excluded with GLM: '
            '{0}\n'.format(max(bf_num_excluded_glm)))
    summary_stream.write('Average probability of exclusion: {0}\n'.format(
            sum(prob_of_exclusion) / float(num_sims)))
    summary_stream.write('Average probability of exclusion with GLM: '
            '{0}\n'.format(sum(prob_of_exclusion_glm) / float(num_sims)))
    summary_stream.write('Median probability of exclusion: {0}\n'.format(
            median(prob_of_exclusion)))
    summary_stream.write('Median probability of exclusion with GLM: '
            '{0}\n'.format(median(prob_of_exclusion_glm)))
    summary_stream.write('Average Bayes factor of exclusion: {0}\n'.format(
            sum(bf_of_exclusion) / float(num_sims)))
    summary_stream.write('Average Bayes factor of exclusion with GLM: '
            '{0}\n'.format(sum(bf_of_exclusion_glm) / float(num_sims)))
    summary_stream.write('Median Bayes factor of exclusion: {0}\n'.format(
            median(bf_of_exclusion)))
    summary_stream.write('Median Bayes factor of exclusion with GLM: '
            '{0}\n'.format(median(bf_of_exclusion_glm)))
    summary_stream.write('Max Bayes factor of exclusion: {0}\n'.format(
            max(bf_of_exclusion)))
    summary_stream.write('Max Bayes factor of exclusion with GLM: '
            '{0}\n'.format(max(bf_of_exclusion_glm)))
    prob_of_bf_exclusion = (len([1 for x in bf_of_exclusion if x > 10.0]) /
            float(num_sims))
    prob_of_bf_exclusion_glm = (len(
            [1 for x in bf_of_exclusion_glm if x > 10.0]) / float(num_sims))
    summary_stream.write('Estimated probability Bayes factor of exclusion '
            '> 10: {0}\n'.format(prob_of_bf_exclusion))
    summary_stream.write('Estimated probability Bayes factor of exclusion '
            '> 10 with GLM: {0}\n'.format(prob_of_bf_exclusion_glm))
    summary_stream.close()
    if plotting.MATPLOTLIB_AVAILABLE:
        approx_prior_exclusion = 0.39184
        prior_odds = approx_prior_exclusion / (1.0 - approx_prior_exclusion)
        post_odds = prior_odds * 10
        post = post_odds / (1.0 + post_odds)
        observed_config1 = MsBayesConfig(
                sim_results.observed_index_to_config[1])
        observed_config2 = MsBayesConfig(
                sim_results.observed_index_to_config[1])
        cfg_to_num_ex = {observed_config1: bf_num_excluded,
                         observed_config2: bf_num_excluded_glm}
        cfg_to_prob_exclusion = {observed_config1: prob_of_exclusion,
                                 observed_config2: prob_of_exclusion_glm}
        cfg_to_prob_of_bf_exclusion = {
                observed_config1: prob_of_bf_exclusion,
                observed_config2: prob_of_bf_exclusion_glm}
        ex_prob_plot = plotting.ProbabilityPowerPlotGrid(
                observed_config_to_estimates=cfg_to_prob_exclusion,
                variable='tau_exclusion',
                div_model_prior='psi',
                bayes_factor=10,
                bayes_factor_prob=post,
                cfg_to_prob_of_bf_exclusion=cfg_to_prob_of_bf_exclusion,
                height=3.7,
                margin_left=0.03,
                margin_bottom=0.06,
                margin_right=1,
                margin_top=0.96,
                padding_between_horizontal=0.5,
                padding_between_vertical=1.0,
                num_columns=2)
        fig = ex_prob_plot.create_grid()
        fig.savefig(os.path.join(out_dir, 'prob_of_exclusion.pdf'))
        ex_plot = plotting.PowerPlotGrid(
                observed_config_to_estimates=cfg_to_num_ex,
                variable='tau_exclusion',
                num_columns=2,
                height=3.7,
                margin_left=0.03,
                margin_bottom=0.06,
                margin_right=1,
                margin_top=0.95,
                padding_between_horizontal=0.5,
                padding_between_vertical=1.0)
        fig = ex_plot.create_grid()
        fig.savefig(os.path.join(out_dir, 'num_tau_excluded.pdf'))
def assertSameSamples(self, files, columns_to_ignore=[], header=True,
        places=5, num_mismatches_per_sample=0, num_sample_mismatches=0):
    files = list(files)
    all_equal = True
    diffs = StringIO()
    f1, close = process_file_arg(files.pop(0))
    f1_lines = f1.readlines()
    indices = [i for i in range(len(f1_lines[0].strip().split()))
            if i not in columns_to_ignore]
    h1 = []
    if header:
        head = f1_lines.pop(0).strip().split()
        h1 = [head[i] for i in indices]
    lines1 = sorted(f1_lines)
    for f in files:
        f2, close2 = process_file_arg(f)
        f2_lines = f2.readlines()
        h2 = []
        if header:
            head = f2_lines.pop(0).strip().split()
            h2 = [head[i] for i in indices]
        if h1 != h2:
            all_equal = False
            diffs.write('{0} and {1} have different headers; not '
                    'comparing further\n'.format(f1.name, f2.name))
            continue
        lines2 = sorted(f2_lines)
        if len(lines1) != len(lines2):
            all_equal = False
            diffs.write('{0} ({1}) and {2} ({3}) have different '
                    'number of lines\n'.format(f1.name, len(lines1),
                            f2.name, len(lines2)))
        n_matches = 0
        n_mismatches = 0
        for l1 in lines1:
            found = False
            for l2 in lines2:
                values1 = l1.strip().split()
                values2 = l2.strip().split()
                v1 = [float(values1[x]) for x in indices]
                v2 = [float(values2[x]) for x in indices]
                if self.same_samples(v1, v2, places=places,
                        num_mismatches=num_mismatches_per_sample):
                    found = True
            if found:
                n_matches += 1
            else:
                n_mismatches += 1
        if n_mismatches > 0:
            if n_mismatches > num_sample_mismatches:
                all_equal = False
            diffs.write('{0} and {1}\nhave {2} mismatching samples and '
                    'share {3} samples\n'.format(f1.name, f2.name,
                            n_mismatches, n_matches))
        if close2:
            f2.close()
    if diffs.getvalue() != '':
        _LOG.error('files are not equal after sorting:\n{0}\n'.format(
                diffs.getvalue()))
    self.assertTrue(all_equal)
    if close:
        f1.close()
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description)
    parser.add_argument('-c', '--config',
            type = arg_is_config,
            required = True,
            help = ('msBayes config file to be used to generate saturation '
                    'plot.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = int,
            default = 1000,
            help = ('The number of prior samples to simulate for the '
                    'saturation plot.'))
    parser.add_argument('--np',
            action = 'store',
            type = int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('-o', '--output-dir',
            action = 'store',
            type = arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            default = ['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--vertical-lines',
            nargs = '*',
            type = float,
            default = [],
            help = ('Positions along the x-axis where vertical lines are to '
                    'be drawn. The default is to draw no vertical lines.'))
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress plot data file.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')
    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)
    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, get_stats_by_time, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes.plotting import MATPLOTLIB_AVAILABLE, SaturationPlotGrid

    MSBAYES_SORT_INDEX.set_index(0)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    stats_by_time_path = os.path.join(args.output_dir, 'stats-by-time.txt')
    if args.compress:
        stats_by_time_path += '.gz'
    plot_path = os.path.join(args.output_dir, 'saturation-plot.pdf')

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent = args.temp_dir, prefix = 'temp-files-')
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case = True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs
    cfg.div_model_prior = 'constrained'
    cfg.psi = probability.DiscreteUniformDistribution(num_taxon_pairs,
            num_taxon_pairs)
    config_path = temp_fs.get_file_path(prefix = 'cfg-')
    cfg.write(config_path)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
            ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
    info.write('\tstats_by_time_path = {0!r}'.format(stats_by_time_path),
            log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)
    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(
                temp_fs = temp_fs,
                sample_size = sample_size,
                config_path = config_path,
                report_parameters = True,
                schema = schema,
                include_header = True,
                stat_patterns = stat_patterns,
                write_stats_file = False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(
            workers = workers,
            num_processors = args.np)

    log.info('Parsing samples...')
    stats_by_time = get_stats_by_time([w.prior_path for w in workers])
    stat_keys = stats_by_time.keys()
    stat_keys.remove('PRI.t')
    for prefix in args.stat_prefixes:
        if prefix not in stat_keys:
            raise Exception('stat prefix {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(prefix, ', '.join(stat_keys)))
    header = ['PRI.t'] + args.stat_prefixes

    log.info('Writing stats-by-time matrix...')
    out, close = process_file_arg(stats_by_time_path, 'w',
            compresslevel = compress_level)
    for row in dict_line_iter(stats_by_time, sep = '\t', header = header):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')
    if not MATPLOTLIB_AVAILABLE:
        log.warning(
                '`matplotlib` could not be imported, so the plot cannot be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(stats_by_time_path))
    else:
        y_labels = {'pi': r'$\pi$',
                'pi.net': r'$\pi_{net}$',
                'wattTheta': r'$\theta_W$',
                'tajD.denom': r'$SD(\pi - \theta_W)$'}
        spg = SaturationPlotGrid(stats_by_time,
                x_key = 'PRI.t',
                y_keys = args.stat_prefixes,
                y_labels = y_labels,
                num_columns = 2,
                vertical_line_positions = args.vertical_lines)
        fig = spg.create_grid()
        fig.savefig(plot_path)

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
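# --- Worker batching sketch (illustrative; assumes pymsbayes'
# `long_division` behaves like Python's built-in divmod) ---
# The script above splits `num_prior_samples` across `np` worker processes
# and gives the remainder to the last worker. The same arithmetic in
# isolation, as a stand-alone function:
def example_batch_sizes(num_samples, num_processes):
    batch_size, remainder = divmod(num_samples, num_processes)
    sizes = [batch_size] * num_processes
    sizes[-1] += remainder  # the last worker absorbs the leftover samples
    return sizes

# e.g. example_batch_sizes(1000, 3) -> [333, 333, 334]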
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
            description=description,
            formatter_class=argparse_utils.SmartHelpFormatter)
    parser.add_argument('-c', '--config',
            type=argparse_utils.arg_is_config,
            required=True,
            help=('msBayes config file to be used to simulate the prior '
                  'samples.'))
    parser.add_argument('-n', '--num-prior-samples',
            action='store',
            type=int,
            default=1000,
            help=('The number of prior samples to simulate and plot.'))
    parser.add_argument('--np',
            action='store',
            type=int,
            default=multiprocessing.cpu_count(),
            help=('The maximum number of processes to run in parallel. The '
                  'default is the number of CPUs available on the machine.'))
    parser.add_argument('-o', '--output-dir',
            action='store',
            type=argparse_utils.arg_is_dir,
            help=('The directory in which all output files will be written. '
                  'The default is to use the directory of the first observed '
                  'config file.'))
    parser.add_argument('--temp-dir',
            action='store',
            type=argparse_utils.arg_is_dir,
            help=('A directory to temporarily stage files. The default is to '
                  'use the output directory.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs='*',
            type=str,
            default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
            help=('Prefixes of summary statistics to use in the analyses. '
                  'The prefixes should be separated by spaces. '
                  'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--sort-index',
            action='store',
            type=int,
            default=0,
            choices=range(12),
            help=argparse_utils.get_sort_index_help_message())
    parser.add_argument('--compress',
            action='store_true',
            help='Compress plot data file.')
    parser.add_argument('--keep-temps',
            action='store_true',
            help='Keep all temporary files.')
    parser.add_argument('--seed',
            action='store',
            type=int,
            help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action='version',
            version='%(prog)s ' + _program_info['version'],
            help='Report version and exit.')
    parser.add_argument('--quiet',
            action='store_true',
            help='Run without verbose messaging.')
    parser.add_argument('--debug',
            action='store_true',
            help='Run in debugging mode.')
    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)
    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, get_dict_from_spreadsheets, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability, stats
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes import plotting

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    sample_path = os.path.join(args.output_dir, 'prior-sample.txt')
    if args.compress:
        sample_path += '.gz'

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()),
            log.info)
    info.write('\tstat_patterns = {0!r}'.format(', '.join(
            [p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
    info.write('\tsample_path = {0!r}'.format(sample_path), log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)
    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(temp_fs=temp_fs,
                sample_size=sample_size,
                config_path=args.config,
                report_parameters=True,
                schema=schema,
                include_header=True,
                stat_patterns=stat_patterns,
                write_stats_file=False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(workers=workers, num_processors=args.np)

    log.info('Parsing samples...')
    sample = get_dict_from_spreadsheets([w.prior_path for w in workers])

    log.info('Writing prior samples...')
    out, close = process_file_arg(sample_path, 'w',
            compresslevel=compress_level)
    for row in dict_line_iter(sample, sep='\t'):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')
    if not plotting.MATPLOTLIB_AVAILABLE:
        log.warning(
                '`matplotlib` could not be imported, so the plots cannot be\n'
                'produced. The data to create the plots can be found in:\n\t'
                '{0!r}'.format(sample_path))
        sys.exit(1)

    for stat_pattern in stat_patterns:
        found = False
        for stat, values in sample.iteritems():
            if stat_pattern.match(stat):
                found = True
                values = [float(v) for v in values]
                plot_path = os.path.join(args.output_dir,
                        'plot-{0}.pdf'.format(stat))
                summary = stats.get_summary(values)
                s = r'mean = {0:.4f} ({1:.4f}-{2:.4f})'.format(
                        summary['mean'],
                        summary['qi_95'][0],
                        summary['qi_95'][1])
                hd = plotting.HistData(x=values,
                        normed=True,
                        bins=20,
                        histtype='bar',
                        align='mid',
                        orientation='vertical',
                        zorder=0)
                hist = plotting.ScatterPlot(hist_data_list=[hd],
                        right_text=s)
                hist.left_text_size = 12.0
                hist.right_text_size = 12.0
                xticks = list(hist.ax.get_xticks())
                xtick_labels = list(xticks)
                yticks = list(hist.ax.get_yticks())
                ytick_labels = list(yticks)
                # thin crowded axes by blanking every other tick label
                if len(xtick_labels) >= 8:
                    for i in range(1, len(xtick_labels), 2):
                        xtick_labels[i] = ''
                if len(ytick_labels) >= 8:
                    for i in range(1, len(ytick_labels), 2):
                        ytick_labels[i] = ''
                xticks_obj = plotting.Ticks(ticks=xticks,
                        labels=xtick_labels,
                        horizontalalignment='center')
                yticks_obj = plotting.Ticks(ticks=yticks,
                        labels=ytick_labels)
                hist.xticks_obj = xticks_obj
                hist.yticks_obj = yticks_obj
                plot_grid = plotting.PlotGrid(subplots=[hist],
                        num_columns=1,
                        label_schema=None,
                        title=stat,
                        title_size=14.0,
                        title_top=False,
                        y_title='Density',
                        y_title_position=0.001,
                        y_title_size=14.0,
                        height=4.0,
                        width=6.0,
                        auto_height=False)
                plot_grid.auto_adjust_margins = False
                plot_grid.margin_left = 0.04
                plot_grid.margin_bottom = 0.04
                plot_grid.margin_right = 1.0
                plot_grid.margin_top = 0.97
                plot_grid.reset_figure()
                plot_grid.savefig(plot_path)
        if not found:
            raise Exception('stat pattern {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(stat_pattern.pattern,
                        ', '.join(sample.keys())))

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
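# --- Tick-label thinning sketch (illustrative helper, not in pymsbayes) ---
# The histogram code above blanks every other tick label once eight or more
# ticks are present, so crowded axes stay readable while every tick mark is
# kept. The same idea as a stand-alone, plotting-free function:
def example_thin_tick_labels(ticks, max_labels = 8):
    labels = [str(t) for t in ticks]
    if len(labels) >= max_labels:
        for i in range(1, len(labels), 2):
            labels[i] = ''  # blank the label, keep the tick mark
    return labels

# e.g. example_thin_tick_labels(range(10)) ->
#     ['0', '', '2', '', '4', '', '6', '', '8', '']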