Example #1
    def assertSameUnsortedFiles(self, files):
        files = list(files)
        all_equal = True
        diffs = StringIO()
        f1, close = process_file_arg(files.pop(0))
        lines1 = sorted(f1.readlines())
        for f in files:
            f2, close2 = process_file_arg(f)
            lines2 = sorted(f2.readlines())
            if len(lines1) != len(lines2):
                all_equal = False
                diffs.write('{0} ({1}) and {2} ({3}) have different '
                        'number of lines\n'.format(f1.name, len(lines1),
                                f2.name, len(lines2)))
            for i in range(min(len(lines1), len(lines2))):
                if lines1[i].strip().split() != lines2[i].strip().split():
                    all_equal = False
                    diffs.write('{0} and {1} differ at sorted index '
                            '{2}\n'.format(f1.name, f2.name, i))

            if close2:
                f2.close()
        if not all_equal:
            _LOG.error('files are not equal after sorting:\n{0}\n'.format(
                    diffs.getvalue()))
        self.assertTrue(all_equal)
        if close:
            f1.close()
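
Every example on this page relies on `process_file_arg`, which the page never shows. Judging from the call sites above, it accepts either a path or an already-open file object and returns a `(stream, close)` pair, where `close` is True only when the function opened the stream itself, so the caller knows whether it owns the handle; `test_read_compressed_file` also suggests gzipped input is detected automatically. A minimal sketch of that contract, assuming gzip handling is keyed off the `compresslevel` argument and a magic-number check (an illustration only, not the actual PyMsBayes implementation):

import gzip

def _looks_gzipped(path):
    # gzip files start with the two-byte magic number 1f 8b
    with open(path, 'rb') as stream:
        return stream.read(2) == '\x1f\x8b'

def process_file_arg_sketch(file_arg, mode='rU', compresslevel=None):
    # Already an open file-like object: hand it straight back and leave
    # ownership (and closing) with the caller.
    if hasattr(file_arg, 'read') or hasattr(file_arg, 'write'):
        return file_arg, False
    # Otherwise this function opens the handle, so the close flag is True.
    # A compresslevel, or a gzipped file opened for reading, goes through gzip.
    if compresslevel is not None:
        return gzip.open(file_arg, mode.replace('U', ''), compresslevel), True
    if 'r' in mode and _looks_gzipped(file_arg):
        return gzip.open(file_arg, 'rb'), True
    return open(file_arg, mode), True
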
 def test_string_object(self):
     f, close = process_file_arg(self.test_path, 'w')
     self.assertIsInstance(f, file)
     self.assertTrue(close)
     self.assertFalse(f.closed)
     f.close()
     f, close = process_file_arg(self.cfg_path, 'rU')
     self.assertIsInstance(f, file)
     self.assertTrue(close)
     self.assertFalse(f.closed)
     f.close()
 def test_read_compressed_file(self):
     gzfs, close = process_file_arg(self.gz_path, 'rb')
     out, close_out = process_file_arg(self.test_path, 'w')
     for line in gzfs:
         out.write(line)
     if close_out:
         out.close()
     if close:
         gzfs.close()
     self.assertSameFiles([self.ungz_path, self.test_path], 
             exclude_line_endings=True)
 def test_write_compressed_file(self):
     fs, close = process_file_arg(self.ungz_path, 'rb')
     out, close_out = process_file_arg(self.test_path, 'wb', compresslevel=9)
     for line in fs:
         out.write(line)
     if close_out:
         out.close()
     if close:
         fs.close()
     self.assertTrue(is_gzipped(self.test_path))
     self.assertSameFiles([self.gz_path, self.test_path],
             exclude_line_endings=True)
 def write_result_summaries(self,
                            prior_indices=None,
                            sep='\t',
                            include_tau_exclusion_info=False):
     if not prior_indices:
         prior_indices = self.prior_index_to_config.keys()
         if self.combined_prior_index:
             prior_indices.append(self.combined_prior_index)
     for prior_idx in prior_indices:
         for observed_idx in self.observed_index_to_path.iterkeys():
             out_path = self.get_result_summary_path(
                 observed_idx, prior_idx)
             out_path = functions.get_new_path(out_path)
             out, close = process_file_arg(out_path,
                                           'w',
                                           compresslevel=self.compresslevel)
             keys = []
             for i, r in enumerate(
                     self.flat_result_iter(observed_idx, prior_idx,
                                           include_tau_exclusion_info)):
                 if i == 0:
                     keys = r.keys()
                     out.write('{0}\n'.format(sep.join(keys)))
                 out.write('{0}\n'.format(
                     sep.join([str(r[k]) for k in keys])))
             out.close()
Example #7
def parse_header(file_obj, sep='\t', strict=True, seek=True):
    file_stream, close = process_file_arg(file_obj, 'rU')
    try:
        header_line = file_stream.next()
    except StopIteration:
        file_stream.close()
        if strict:
            raise Exception('did not find header in {0}'.format(
                file_stream.name))
        else:
            return None
    if not HEADER_PATTERN.match(header_line):
        file_stream.close()
        if strict:
            raise Exception('did not find header in {0}'.format(
                file_stream.name))
        else:
            return None
    header = header_line.strip().split(sep)
    if close:
        file_stream.close()
    else:
        if seek:
            file_stream.seek(0)
    return header
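
A hedged usage sketch of `parse_header` on an in-memory stream; the import path is assumed from Example #45's imports, and the header text is assumed to satisfy HEADER_PATTERN. Because a file object rather than a path is passed, `close` is False, and with the default `seek=True` the stream is rewound before returning, so downstream readers still start at the header line.

from StringIO import StringIO
from pymsbayes.utils.parsing import parse_header  # assumed module path

data = StringIO('PRI.Psi\tPRI.E.t\tpi.1\n3\t0.12\t0.0041\n')
header = parse_header(data, sep='\t')
# header -> ['PRI.Psi', 'PRI.E.t', 'pi.1']
# The stream was seek(0)'d rather than closed, so reading resumes at the top:
assert data.next().strip().split('\t') == header
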
Example #8
 def test_abcestimator(self):
     summary_path = self.get_test_path(prefix='test-summary-')
     post_path = self.get_test_path(prefix='test-posterior-')
     with open(post_path, 'w') as out:
         stream, close = fileio.process_file_arg(self.posterior_path)
         for line in stream:
             out.write('{0}'.format(line))
         if close:
             stream.close()
     regress_posterior_path = self.get_test_path(prefix='test-adjusted-')
     regress_worker = workers.ABCToolBoxRegressWorker(
             temp_fs = self.temp_fs,
             observed_path = self.sum_stats_path,
             posterior_path = post_path,
             parameter_indices = None,
             regress_summary_path = summary_path,
             regress_posterior_path = regress_posterior_path,
             exe_path = None,
             stdout_path = None,
             stderr_path = None,
             keep_temps = False,
             bandwidth = None,
             num_posterior_quantiles = 100)
     self.assertFalse(regress_worker.finished)
     self.assertEqual(regress_worker.exe_path,
             ToolPathManager.get_tool_path('ABCestimator'))
     _LOG.info('{0}'.format(regress_worker.exe_path))
     regress_worker.start()
     self.assertTrue(regress_worker.finished)
     self.assertTrue(os.path.isfile(regress_worker.regress_summary_path))
     self.assertTrue(os.path.isfile(regress_worker.regress_posterior_path))
     self.assertEqual(self.get_number_of_lines(
             regress_worker.regress_posterior_path), 101)
Example #9
def parse_summary_file(file_obj):
    f, close = process_file_arg(file_obj)
    lines = []
    for l in f:
        l = l.strip()
        if l:
            lines.append(l)
    if close:
        f.close()
    if len(lines) != 4:
        raise errors.SummaryFileParsingError(
            'summary file {0} has {1} lines'.format(f.name, len(lines)))
    header = lines[0].split()
    means = [float(x) for x in lines[1].split()]
    std_devs = [float(x) for x in lines[2].split()]
    sample_sizes = [int(x) for x in lines[3].split()]
    if not len(header) == len(means) == len(std_devs) == len(sample_sizes):
        raise errors.SummaryFileParsingError(
            'lines of summary file {0} have '
            'unequal numbers of columns'.format(f.name))
    d = {}
    for i in range(len(header)):
        d[header[i]] = {
            'mean': means[i],
            'std_deviation': std_devs[i],
            'n': sample_sizes[i]
        }
    return d, header
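
Judging from the checks above, `parse_summary_file` expects exactly four non-blank lines: column names, means, standard deviations, and sample sizes. A hedged usage sketch (the import path is assumed):

from StringIO import StringIO
from pymsbayes.utils.parsing import parse_summary_file  # assumed module path

summary_text = ('pi.1\twattTheta.1\n'   # column names
                '0.0042\t0.0051\n'      # means
                '0.0010\t0.0012\n'      # standard deviations
                '1000\t1000\n')         # sample sizes
d, header = parse_summary_file(StringIO(summary_text))
# d['pi.1'] -> {'mean': 0.0042, 'std_deviation': 0.001, 'n': 1000}
# header   -> ['pi.1', 'wattTheta.1']
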
Example #10
def parameter_density_iter(parameter_density_file,
        parameter_patterns = DIV_MODEL_PATTERNS + MODEL_PATTERNS + \
                PSI_PATTERNS + MEAN_TAU_PATTERNS + OMEGA_PATTERNS + \
                CV_PATTERNS):
    dens_file, close = process_file_arg(parameter_density_file)
    try:
        header = parse_header(dens_file, seek=False)
        parameter_indices = functions.get_indices_of_patterns(
            header, parameter_patterns)
        indices_to_heads = dict(
            zip(parameter_indices, [header[i] for i in parameter_indices]))
        heads_to_dens_tups = dict(
            zip([header[i] for i in parameter_indices],
                [None for i in range(len(parameter_indices))]))
        if not len(parameter_indices) == len(set(
                indices_to_heads.itervalues())):
            raise errors.ParameterParsingError(
                'some parameters were found in '
                'multiple columns in density file {0!r}'.format(
                    dens_file.name))
        for i, line in enumerate(dens_file):
            l = line.strip().split()
            if l:
                for idx in parameter_indices:
                    heads_to_dens_tups[indices_to_heads[idx]] = (float(
                        l[idx]), float(l[idx + 1]))
                yield heads_to_dens_tups
    except:
        raise
    finally:
        if close:
            dens_file.close()
Example #11
 def test_abcestimator(self):
     summary_path = self.get_test_path(prefix='test-summary-')
     post_path = self.get_test_path(prefix='test-posterior-')
     with open(post_path, 'w') as out:
         stream, close = fileio.process_file_arg(self.posterior_path)
         for line in stream:
             out.write('{0}'.format(line))
         if close:
             stream.close()
     regress_posterior_path = self.get_test_path(prefix='test-adjusted-')
     regress_worker = workers.ABCToolBoxRegressWorker(
         temp_fs=self.temp_fs,
         observed_path=self.sum_stats_path,
         posterior_path=post_path,
         parameter_indices=None,
         regress_summary_path=summary_path,
         regress_posterior_path=regress_posterior_path,
         exe_path=None,
         stdout_path=None,
         stderr_path=None,
         keep_temps=False,
         bandwidth=None,
         num_posterior_quantiles=100)
     self.assertFalse(regress_worker.finished)
     self.assertEqual(regress_worker.exe_path,
                      ToolPathManager.get_tool_path('ABCestimator'))
     _LOG.info('{0}'.format(regress_worker.exe_path))
     regress_worker.start()
     self.assertTrue(regress_worker.finished)
     self.assertTrue(os.path.isfile(regress_worker.regress_summary_path))
     self.assertTrue(os.path.isfile(regress_worker.regress_posterior_path))
     self.assertEqual(
         self.get_number_of_lines(regress_worker.regress_posterior_path),
         101)
Example #12
def parameter_density_iter(parameter_density_file,
        parameter_patterns = DIV_MODEL_PATTERNS + MODEL_PATTERNS + \
                PSI_PATTERNS + MEAN_TAU_PATTERNS + OMEGA_PATTERNS + \
                CV_PATTERNS):
    dens_file, close = process_file_arg(parameter_density_file)
    try:
        header = parse_header(dens_file, seek = False)
        parameter_indices = functions.get_indices_of_patterns(header,
                parameter_patterns)
        indices_to_heads = dict(zip(parameter_indices,
                [header[i] for i in parameter_indices]))
        heads_to_dens_tups = dict(zip([header[i] for i in parameter_indices],
                [None for i in range(len(parameter_indices))]))
        if not len(parameter_indices) == len(set(indices_to_heads.itervalues())):
            raise errors.ParameterParsingError('some parameters were found in '
                    'multiple columns in density file {0!r}'.format(
                            dens_file.name))
        for i, line in enumerate(dens_file):
            l = line.strip().split()
            if l:
                for idx in parameter_indices:
                    heads_to_dens_tups[indices_to_heads[idx]] = (float(l[idx]),
                            float(l[idx + 1]))
                yield heads_to_dens_tups
    except:
        raise
    finally:
        if close:
            dens_file.close()
Example #13
 def result_path_iter(self, observed_index, prior_index):
     true_model = self.observed_index_to_prior_index[observed_index]
     out_dir = self.get_result_dir(observed_index, prior_index)
     if not os.path.isdir(out_dir):
          raise Exception('expected result directory {0!r} does not '
                 'exist'.format(out_dir))
     observed_stream, close = process_file_arg(
             self.observed_index_to_path[observed_index])
     header = parsing.parse_header(observed_stream, sep = '\t', strict = True,
             seek = False)
     parameter_indices = functions.get_indices_of_patterns(header,
             parsing.PARAMETER_PATTERNS)
     for i, line in enumerate(observed_stream):
         l = line.strip().split()
         true_params = dict(zip([header[x] for x in parameter_indices],
                 [l[x] for x in parameter_indices]))
         true_params['PRI.model'] = str(true_model)
         result_prefix = '{0}{1}-'.format(self.get_result_path_prefix(
                 observed_index, prior_index, i + 1), 
                 self.final_result_index)
         summary_path = result_prefix + 'posterior-summary.txt'
         psi_path = result_prefix + 'psi-results.txt'
         omega_path = result_prefix + 'omega-results.txt'
         cv_path = result_prefix + 'cv-results.txt'
         div_model_path = result_prefix + 'div-model-results.txt'
         model_path = result_prefix + 'model-results.txt'
         paths = {'summary': summary_path,
                  'psi': psi_path,
                  'omega': omega_path,
                  'cv': cv_path,
                  'div-model': div_model_path,
                  'model': model_path}
         yield true_params, paths
     observed_stream.close()
Example #14
def parse_summary_file(file_obj):
    f, close = process_file_arg(file_obj)
    lines = []
    for l in f:
        l = l.strip()
        if l:
            lines.append(l)
    if close:
        f.close()
    if len(lines) != 4:
        raise errors.SummaryFileParsingError(
                'summary file {0} has {1} lines'.format(f.name, len(lines)))
    header = lines[0].split()
    means = [float(x) for x in lines[1].split()]
    std_devs = [float(x) for x in lines[2].split()]
    sample_sizes = [int(x) for x in lines[3].split()]
    if not len(header) == len(means) == len(std_devs) == len(sample_sizes):
        raise errors.SummaryFileParsingError('lines of summary file {0} have '
                'unequal numbers of columns'.format(f.name))
    d = {}
    for i in range(len(header)):
        d[header[i]] = {'mean': means[i],
                        'std_deviation': std_devs[i],
                        'n': sample_sizes[i]}
    return d, header
Example #15
 def get_number_of_lines(self, path):
     f, close = process_file_arg(path)
     count = 0
     for l in f:
         count += 1
     if close:
         f.close()
     return count
 def get_number_of_header_lines(self, path):
     f, close = process_file_arg(path)
     count = 0
     for l in f:
         if HEADER_PATTERN.match(l.strip()):
             count += 1
     if close:
         f.close()
     return count
Example #18
def reduce_columns(in_file, out_file, column_indices, sep='\t',
        extra_tab=False):
    if not column_indices:
        raise Exception('no column indices to retain')
    in_stream, close_in = process_file_arg(in_file, 'rU')
    out_stream, close_out = process_file_arg(out_file, 'w')
    line_iter = iter(in_stream)
    for line_num, line in enumerate(line_iter):
        l = line.strip().split(sep)
        new_line = [l[i] for i in column_indices]
        if extra_tab:
            out_stream.write('%s\t\n' % sep.join(new_line))
        else:
            out_stream.write('%s\n' % sep.join(new_line))
    if close_in:
        in_stream.close()
    if close_out:
        out_stream.close()
Example #19
 def get_number_of_header_lines(self, path):
     f, close = process_file_arg(path)
     count = 0
     for l in f:
         if HEADER_PATTERN.match(l.strip()):
             count += 1
     if close:
         f.close()
     return count
 def files_equal(self, f1, f2, exclude_line_endings=False):
     equal = True
     diffs = []
     f1, c1 = process_file_arg(f1)
     f2, c2 = process_file_arg(f2)
     line = 0
     f1_end = False
     f2_end = False
     lines_left = True
     while True:
         line += 1
         if f1_end == False:
             try:
                 l1 = f1.next()
             except (StopIteration, EOFError):
                 f1_end = line
                 pass
         if f2_end == False:
             try:
                 l2 = f2.next()
             except (StopIteration, EOFError):
                 f2_end = line
                 pass
         if f1_end != False and f2_end != False:
             break
         if exclude_line_endings:
             l1 = l1.strip()
             l2 = l2.strip()
         if f1_end == False and f2_end == False and l1 != l2:
             diffs.append(line)
             equal = False
     if f1_end != f2_end:
         mn = min([f1_end, f2_end])
         mx = max([f1_end, f2_end])
         diffs.extend(range(mn, mx + 1))
         equal = False
     assert len(diffs) == len(set(diffs))
     if c1:
         f1.close()
     if c2:
         f2.close()
     return equal, diffs
Example #21
 def files_equal(self, f1, f2, exclude_line_endings=False):
     equal = True
     diffs = []
     f1, c1 = process_file_arg(f1)
     f2, c2 = process_file_arg(f2)
     line = 0
     f1_end = False
     f2_end = False
     lines_left = True
     while True:
         line += 1
         if f1_end == False:
             try:
                 l1 = f1.next()
             except (StopIteration, EOFError):
                 f1_end = line
                 pass
         if f2_end == False:
             try:
                 l2 = f2.next()
             except (StopIteration, EOFError):
                 f2_end = line
                 pass
         if f1_end != False and f2_end != False:
             break
         if exclude_line_endings:
             l1 = l1.strip()
             l2 = l2.strip()
         if f1_end == False and f2_end == False and l1 != l2:
             diffs.append(line)
             equal = False
     if f1_end != f2_end:
         mn = min([f1_end, f2_end])
         mx = max([f1_end, f2_end])
         diffs.extend(range(mn, mx+1))
         equal = False
     assert len(diffs) == len(set(diffs))
     if c1:
         f1.close()
     if c2:
         f2.close()
     return equal, diffs
 def test_file_object(self):
     f = open(self.cfg_path, 'rU')
     f2, close = process_file_arg(f)
     self.assertIsInstance(f2, file)
     self.assertFalse(close)
     self.assertFalse(f2.closed)
     self.assertFalse(f.closed)
     self.assertEqual(f, f2)
     f.close()
     self.assertTrue(f2.closed)
     self.assertTrue(f.closed)
Example #23
def line_count(file_obj, ignore_headers=False):
    f, close = process_file_arg(file_obj)
    count = 0
    for line in f:
        if ignore_headers:
            if HEADER_PATTERN.match(line):
                continue
        count += 1
    if close:
        f.close()
    return count
Example #25
def reduce_columns(in_file,
                   out_file,
                   column_indices,
                   sep='\t',
                   extra_tab=False):
    if not column_indices:
        raise Exception('no column indices to retain')
    in_stream, close_in = process_file_arg(in_file, 'rU')
    out_stream, close_out = process_file_arg(out_file, 'w')
    line_iter = iter(in_stream)
    for line_num, line in enumerate(line_iter):
        l = line.strip().split(sep)
        new_line = [l[i] for i in column_indices]
        if extra_tab:
            out_stream.write('%s\t\n' % sep.join(new_line))
        else:
            out_stream.write('%s\n' % sep.join(new_line))
    if close_in:
        in_stream.close()
    if close_out:
        out_stream.close()
Example #26
 def _parse_table(self, config_file):
     self.alignments = OrderedDict()
     cfg_stream, close = fileio.process_file_arg(config_file)
     try:
         table_started = False
         table_finished = False
         row_num = 0
         for i, l in enumerate(cfg_stream):
             line = l.strip()
             if self._end_pattern.match(line):
                 if not table_started:
                     raise errors.SampleTableError(
                             'hit end of sample table before beginning')
                 if len(self.alignments) < 1:
                     raise errors.SampleTableError(
                             'no rows found in sample table')
                 table_finished = True
                 break
             if self._begin_pattern.match(line):
                 table_started = True
                 continue
             if not table_started:
                 continue
             if (line == '') or (line.startswith('#')):
                 continue
             row_num += 1
             try:
                 al = AlignmentConfig(line)
             except errors.SampleTableRowError as e:
                 _LOG.error('sample table row {0} is invalid'.format(
                         row_num))
                 raise e
             if not al.taxon_name in self.alignments:
                 self.alignments[al.taxon_name] = OrderedDict()
                 self.alignments[al.taxon_name][al.locus_name] = al
                 self._ordering.append((al.taxon_name, al.locus_name))
                 continue
             if al.locus_name in self.alignments[al.taxon_name]:
                 raise errors.SampleTableError('locus {0} found twice '
                         'for taxon {1} at row {2} of sample '
                         'table'.format(al.locus_name, al.taxon_name,
                                 row_num))
             self.alignments[al.taxon_name][al.locus_name] = al
             self._ordering.append((al.taxon_name, al.locus_name))
         if not table_started:
             raise errors.SampleTableError('no sample table found')
         if not table_finished:
             raise errors.SampleTableError('no end of table found')
     finally:
         if close:
             cfg_stream.close()
Example #27
def parse_model_key_file(path):
    wd = os.path.dirname(path)
    f, close = process_file_arg(path)
    model_paths = {}
    for line in f:
        l = line.strip().split('=')
        if len(l) != 2:
            raise Exception('unexpected line {0!r} in model key file'.format(
                line))
        model_index = l[0].strip().strip('m')
        p = os.path.abspath(os.path.join(wd, l[1].strip()))
        model_paths[int(model_index)] = p
    f.close()
    return model_paths
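
The key file format implied by this parser is one `m<index> = <path>` entry per line, with each path resolved relative to the directory containing the key file. A small, hedged illustration (the file names are made up):

import os
import tempfile

key_dir = tempfile.mkdtemp()
key_path = os.path.join(key_dir, 'model-key.txt')
with open(key_path, 'w') as out:
    out.write('m1 = dpp-model.cfg\n')
    out.write('m2 = uniform-model.cfg\n')
# parse_model_key_file(key_path) would return:
#   {1: os.path.join(key_dir, 'dpp-model.cfg'),
#    2: os.path.join(key_dir, 'uniform-model.cfg')}
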
Example #28
def parse_model_key_file(path):
    wd = os.path.dirname(path)
    f, close = process_file_arg(path)
    model_paths = {}
    for line in f:
        l = line.strip().split('=')
        if len(l) != 2:
            raise Exception(
                'unexpected line {0!r} in model key file'.format(line))
        model_index = l[0].strip().strip('m')
        p = os.path.abspath(os.path.join(wd, l[1].strip()))
        model_paths[int(model_index)] = p
    f.close()
    return model_paths
Example #29
def parse_abctoolbox_summary_file(file_obj):
    sum_file, close = process_file_arg(file_obj, 'rU')
    header = sum_file.next().strip().split()
    param_names = header[1:]
    params_to_indices = dict(zip(param_names,
            [i for i in range(len(param_names))]))
    summaries = dict(zip(param_names, [{} for i in range(len(param_names))]))
    for line in sum_file:
        l = line.strip().split()
        stat_name = l.pop(0)
        for k, d in summaries.iteritems():
            d[stat_name] = float(l[params_to_indices[k]])
    if close:
        sum_file.close()
    return summaries
Example #30
def parse_abctoolbox_summary_file(file_obj):
    sum_file, close = process_file_arg(file_obj, 'rU')
    header = sum_file.next().strip().split()
    param_names = header[1:]
    params_to_indices = dict(
        zip(param_names, [i for i in range(len(param_names))]))
    summaries = dict(zip(param_names, [{} for i in range(len(param_names))]))
    for line in sum_file:
        l = line.strip().split()
        stat_name = l.pop(0)
        for k, d in summaries.iteritems():
            d[stat_name] = float(l[params_to_indices[k]])
    if close:
        sum_file.close()
    return summaries
Example #31
 def _has_non_sorted_results(self, div_model_path):
     length = None
     f, close = process_file_arg(div_model_path)
     f.next()  # header
     for line in f:
         l = line.strip()
         if l:
             div_model_key = l.split()[0]
             div_model = div_model_key.split(',')
             if not length:
                 length = len(div_model)
             if length != len(div_model):
                 f.close()
                 return False
     f.close()
     return True
Example #33
 def is_config(cls, cfg_file):
     cfg_stream, close = fileio.process_file_arg(cfg_file)
     for i in range(100):
         try:
             line = cfg_stream.next()
             if cls._table_begin_pattern.match(line.strip()):
                 if close:
                     cfg_stream.close()
                 return True
         except:
             if close:
                 cfg_stream.close()
             return False
     if close:
         cfg_stream.close()
     return False
Example #34
def prior_for_msreject(in_file, out_file,
        stat_patterns=DEFAULT_STAT_PATTERNS,
        parameter_patterns=PARAMETER_PATTERNS,
        dummy_patterns=DUMMY_PATTERNS,
        include_header=False):
    header = parse_header(in_file)
    in_file, close = process_file_arg(in_file)
    indices = get_parameter_indices(header,
            parameter_patterns=parameter_patterns)
    indices.extend(get_stat_indices(header, stat_patterns=stat_patterns))
    indices.extend(get_dummy_indices(header, dummy_patterns=DUMMY_PATTERNS))
    if not include_header:
        in_file.next()
    reduce_columns(in_file, out_file, sorted(indices), extra_tab=False)
    if close:
        in_file.close()
    return [header[i] for i in sorted(indices)]
Example #35
 def _parse_results_file(self):
     file_stream, close = process_file_arg(self.path)
     ss_iter = parsing.spreadsheet_iter([file_stream])
     for d in ss_iter:
         if self._full():
             if close:
                 file_stream.close()
             return
         try:
             dms = UnorderedDivergenceModelSummary(d)
         except:
             file_stream.close()
             raise
         self.n += 1
         self.cumulative_prob += dms.prob
         self.models.append(dms)
     if close:
         file_stream.close()
Example #37
def prior_for_msreject(in_file,
                       out_file,
                       stat_patterns=DEFAULT_STAT_PATTERNS,
                       parameter_patterns=PARAMETER_PATTERNS,
                       dummy_patterns=DUMMY_PATTERNS,
                       include_header=False):
    header = parse_header(in_file)
    in_file, close = process_file_arg(in_file)
    indices = get_parameter_indices(header,
                                    parameter_patterns=parameter_patterns)
    indices.extend(get_stat_indices(header, stat_patterns=stat_patterns))
    indices.extend(get_dummy_indices(header, dummy_patterns=DUMMY_PATTERNS))
    if not include_header:
        in_file.next()
    reduce_columns(in_file, out_file, sorted(indices), extra_tab=False)
    if close:
        in_file.close()
    return [header[i] for i in sorted(indices)]
def rescale_posterior(in_path, out_path, scale_factor, model_indices):
    header = None
    out, close = process_file_arg(out_path, 'w', compresslevel=9)
    omegas = []
    psis = []
    for i, d in enumerate(spreadsheet_iter([in_path])):
        if i == 0:
            header = d.keys()
            out.write('{0}\n'.format('\t'.join(header)))
        model_index = int(d['PRI.model'])
        if model_index in model_indices:
            d['PRI.E.t'] = float(d['PRI.E.t']) * scale_factor
            d['PRI.var.t'] = float(d['PRI.var.t']) * (scale_factor * 0.5)
            d['PRI.omega'] = float(d['PRI.omega']) * scale_factor
            omegas.append(d['PRI.omega'])
            psis.append(int(d['PRI.Psi']))
        out.write('{0}\n'.format('\t'.join([
                str(d[k]) for k in d.iterkeys()])))
    out.close()
    return omegas, psis
Example #39
 def _split_config(self, cfg):
     cfg_stream, close = fileio.process_file_arg(cfg)
     preamble = StringIO()
     table = StringIO()
     preamble.write('[preamble]\n')
     table_started = False
     for i, line in enumerate(cfg_stream):
         if self._table_end_pattern.match(line.strip()):
             table.write(line)
             break
         if self._table_begin_pattern.match(line.strip()):
             table_started = True
             table.write(line)
             continue
         if table_started:
             table.write(line)
         else:
             preamble.write(line)
     if close:
         cfg_stream.close()
     return preamble, table
Example #40
 def write_result_summaries(self, prior_indices = None, sep = '\t',
         include_tau_exclusion_info = False):
     if not prior_indices:
         prior_indices = self.prior_index_to_config.keys()
         if self.combined_prior_index:
             prior_indices.append(self.combined_prior_index)
     for prior_idx in prior_indices:
         for observed_idx in self.observed_index_to_path.iterkeys():
             out_path = self.get_result_summary_path(observed_idx, prior_idx)
             out_path = functions.get_new_path(out_path)
             out, close = process_file_arg(out_path, 'w',
                     compresslevel = self.compresslevel)
             keys = []
             for i, r in enumerate(self.flat_result_iter(observed_idx,
                     prior_idx, include_tau_exclusion_info)):
                 if i == 0:
                     keys = r.keys()
                     out.write('{0}\n'.format(sep.join(keys)))
                  out.write('{0}\n'.format(
                          sep.join([str(r[k]) for k in keys])))
             out.close()
Example #41
def spreadsheet_iter(spreadsheets, sep = '\t', header = None):
    head_line = False
    if not header:
        head_line = True
        header = parse_header(spreadsheets[0], sep = sep)
    for sheet_idx, ss in enumerate(spreadsheets):
        file_stream, close = process_file_arg(ss, 'rU')
        if head_line:
            h = file_stream.next().strip().split(sep)
            if header != h:
                raise Exception('headers do not match')
        for row_idx, row in enumerate(file_stream):
            if row.strip() == '':
                continue
            r = [el.strip() for el in row.strip().split(sep)]
            if len(r) != len(header):
                raise Exception('row {0} of spreadsheet {1} has {2} columns, '
                        'header has {3}'.format(row_idx + 1, sheet_idx + 1,
                                len(r), len(header)))
            yield dict(zip(header, r))
        if close:
            file_stream.close()
Example #42
 def result_path_iter(self, observed_index, prior_index):
     true_model = self.observed_index_to_prior_index[observed_index]
     out_dir = self.get_result_dir(observed_index, prior_index)
     if not os.path.isdir(out_dir):
          raise Exception('expected result directory {0!r} does not '
                         'exist'.format(out_dir))
     observed_stream, close = process_file_arg(
         self.observed_index_to_path[observed_index])
     header = parsing.parse_header(observed_stream,
                                   sep='\t',
                                   strict=True,
                                   seek=False)
     parameter_indices = functions.get_indices_of_patterns(
         header, parsing.PARAMETER_PATTERNS)
     for i, line in enumerate(observed_stream):
         l = line.strip().split()
         true_params = dict(
             zip([header[x] for x in parameter_indices],
                 [l[x] for x in parameter_indices]))
         true_params['PRI.model'] = str(true_model)
         result_prefix = '{0}{1}-'.format(
             self.get_result_path_prefix(observed_index, prior_index,
                                         i + 1), self.final_result_index)
         summary_path = result_prefix + 'posterior-summary.txt'
         psi_path = result_prefix + 'psi-results.txt'
         omega_path = result_prefix + 'omega-results.txt'
         cv_path = result_prefix + 'cv-results.txt'
         div_model_path = result_prefix + 'div-model-results.txt'
         model_path = result_prefix + 'model-results.txt'
         paths = {
             'summary': summary_path,
             'psi': psi_path,
             'omega': omega_path,
             'cv': cv_path,
             'div-model': div_model_path,
             'model': model_path
         }
         yield true_params, paths
     observed_stream.close()
Example #43
def parse_header(file_obj, sep='\t', strict=True, seek=True):
    file_stream, close = process_file_arg(file_obj, 'rU')
    try:
        header_line = file_stream.next()
    except StopIteration:
        file_stream.close()
        if strict:
            raise Exception('did not find header in {0}'.format(file_stream.name))
        else:
            return None
    if not HEADER_PATTERN.match(header_line):
        file_stream.close()
        if strict:
            raise Exception('did not find header in {0}'.format(file_stream.name))
        else:
            return None
    header = header_line.strip().split(sep)
    if close:
        file_stream.close()
    else:
        if seek:
            file_stream.seek(0)
    return header
Example #44
def spreadsheet_iter(spreadsheets, sep='\t', header=None):
    head_line = False
    if not header:
        head_line = True
        header = parse_header(spreadsheets[0], sep=sep)
    for sheet_idx, ss in enumerate(spreadsheets):
        file_stream, close = process_file_arg(ss, 'rU')
        if head_line:
            h = file_stream.next().strip().split(sep)
            if header != h:
                raise Exception('headers do not match')
        for row_idx, row in enumerate(file_stream):
            if row.strip() == '':
                continue
            r = [el.strip() for el in row.strip().split(sep)]
            if len(r) != len(header):
                raise Exception('row {0} of spreadsheet {1} has {2} columns, '
                                'header has {3}'.format(
                                    row_idx + 1, sheet_idx + 1, len(r),
                                    len(header)))
            yield dict(zip(header, r))
        if close:
            file_stream.close()
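
A hedged usage sketch of `spreadsheet_iter`: given several tab-delimited sheets whose headers must match, it yields one dict per data row, keyed by the shared header, with all values left as strings. The import path and the assumption that the header satisfies HEADER_PATTERN are the same as in the `parse_header` sketch above.

from StringIO import StringIO
from pymsbayes.utils.parsing import spreadsheet_iter  # assumed module path

sheet1 = StringIO('PRI.Psi\tpi.1\n3\t0.0040\n')
sheet2 = StringIO('PRI.Psi\tpi.1\n2\t0.0071\n')
rows = list(spreadsheet_iter([sheet1, sheet2]))
# rows[0] -> {'PRI.Psi': '3', 'pi.1': '0.0040'}
# rows[1] -> {'PRI.Psi': '2', 'pi.1': '0.0071'}
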
Example #45
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description,
            formatter_class = argparse_utils.SmartHelpFormatter)
    parser.add_argument('-c', '--config',
            type = argparse_utils.arg_is_config,
            required = True,
            help = ('msBayes config file to be used to generate saturation '
                    'plot.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = int,
            default = 1000,
            help = ('The number of prior samples to simulate for the '
                    'saturation plot.'))
    parser.add_argument('--np',
            action = 'store',
            type = int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('-o', '--output-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            default = ['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--sort-index',
            action = 'store',
            type = int,
            default = 0,
            choices = range(12),
            help = argparse_utils.get_sort_index_help_message())
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress plot data file.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, get_dict_from_spreadsheets, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability, stats
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes import plotting

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    sample_path = os.path.join(args.output_dir, 'prior-sample.txt')
    if args.compress:
        sample_path += '.gz'

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
            ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
    info.write('\tsample_path = {0!r}'.format(sample_path), log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(
                temp_fs = temp_fs,
                sample_size = sample_size,
                config_path = args.config,
                report_parameters = True,
                schema = schema,
                include_header = True,
                stat_patterns = stat_patterns,
                write_stats_file = False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(
            workers = workers,
            num_processors = args.np)
    log.info('Parsing samples...')
    sample = get_dict_from_spreadsheets([w.prior_path for w in workers])

    log.info('Writing prior samples...')
    out, close = process_file_arg(sample_path, 'w',
            compresslevel = compress_level)
    for row in dict_line_iter(sample, sep = '\t'):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')

    if not plotting.MATPLOTLIB_AVAILABLE:
        log.warning(
                '`matplotlib` could not be imported, so the plot cannot be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(sample_path))
        sys.exit(1)

    for stat_pattern in stat_patterns:
        found = False
        for stat, values in sample.iteritems():
            if stat_pattern.match(stat):
                values = [float(v) for v in values]
                found = True
                plot_path = os.path.join(args.output_dir,
                        'plot-{0}.pdf'.format(stat))
                summary = stats.get_summary(values)
                s = r'mean = {0:.4f} ({1:.4f}-{2:.4f})'.format(
                        summary['mean'],
                        summary['qi_95'][0],
                        summary['qi_95'][1])
                hd = plotting.HistData(x = values,
                        normed = True,
                        bins = 20,
                        histtype = 'bar',
                        align = 'mid',
                        orientation = 'vertical',
                        zorder = 0)
                hist = plotting.ScatterPlot(hist_data_list = [hd],
                        right_text = s)
                hist.left_text_size = 12.0
                hist.right_text_size = 12.0
                xticks = [i for i in hist.ax.get_xticks()]
                xtick_labels = [i for i in xticks]
                yticks = [i for i in hist.ax.get_yticks()]
                ytick_labels = [i for i in yticks]
                if len(xtick_labels) >= 8:
                    for i in range(1, len(xtick_labels), 2):
                        xtick_labels[i] = ''
                if len(ytick_labels) >= 8:
                    for i in range(1, len(ytick_labels), 2):
                        ytick_labels[i] = ''
                xticks_obj = plotting.Ticks(ticks = xticks,
                        labels = xtick_labels,
                        horizontalalignment = 'center')
                yticks_obj = plotting.Ticks(ticks = yticks,
                        labels = ytick_labels)
                hist.xticks_obj = xticks_obj
                hist.yticks_obj = yticks_obj

                plot_grid = plotting.PlotGrid(subplots = [hist],
                        num_columns = 1,
                        label_schema = None,
                        title = stat,
                        title_size = 14.0,
                        title_top = False,
                        y_title = 'Density',
                        y_title_position = 0.001,
                        y_title_size = 14.0,
                        height = 4.0,
                        width = 6.0,
                        auto_height = False)
                plot_grid.auto_adjust_margins = False
                plot_grid.margin_left = 0.04
                plot_grid.margin_bottom = 0.04 
                plot_grid.margin_right = 1.0 
                plot_grid.margin_top = 0.97
                plot_grid.reset_figure()
                plot_grid.savefig(plot_path)

        if not found:
            raise Exception('stat pattern {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(stat_pattern, ', '.join(sample.keys())))

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
Example #46
def parameter_iter(file_obj, include_line=False, include_thetas=False):
    indices = {}
    post_file, close = process_file_arg(file_obj)
    header = parse_header(post_file, seek=False)
    mean_t_indices = functions.get_indices_of_patterns(header,
                                                       MEAN_TAU_PATTERNS)
    if len(mean_t_indices) > 1:
        post_file.close()
        raise errors.ParameterParsingError('posterior file {0} has {1} mean '
                                           'tau columns'.format(
                                               post_file.name,
                                               len(mean_t_indices)))
    if mean_t_indices:
        indices['mean_tau'] = mean_t_indices
    omega_indices = functions.get_indices_of_patterns(header, OMEGA_PATTERNS)
    if len(omega_indices) > 1:
        post_file.close()
        raise errors.ParameterParsingError('posterior file {0} has {1} omega '
                                           'columns'.format(
                                               post_file.name,
                                               len(omega_indices)))
    if omega_indices:
        indices['omega'] = omega_indices
    cv_indices = functions.get_indices_of_patterns(header, CV_PATTERNS)
    if len(cv_indices) > 1:
        post_file.close()
        raise errors.ParameterParsingError('posterior file {0} has {1} cv '
                                           'columns'.format(
                                               post_file.name,
                                               len(cv_indices)))
    if cv_indices:
        indices['cv'] = cv_indices
    t_indices = functions.get_indices_of_patterns(header, TAU_PATTERNS)
    if t_indices:
        indices['taus'] = t_indices
    if include_thetas:
        a_theta_indices = functions.get_indices_of_patterns(
            header, A_THETA_PATTERNS)
        d1_theta_indices = functions.get_indices_of_patterns(
            header, D1_THETA_PATTERNS)
        d2_theta_indices = functions.get_indices_of_patterns(
            header, D2_THETA_PATTERNS)
        if a_theta_indices:
            indices['a_thetas'] = a_theta_indices
        if d1_theta_indices:
            indices['d1_thetas'] = d1_theta_indices
        if d2_theta_indices:
            indices['d2_thetas'] = d2_theta_indices
    psi_indices = functions.get_indices_of_patterns(header, PSI_PATTERNS)
    if len(psi_indices) > 1:
        post_file.close()
        raise errors.ParameterParsingError('posterior file {0} has {1} psi '
                                           'columns'.format(
                                               post_file.name,
                                               len(psi_indices)))
    if psi_indices:
        indices['psi'] = psi_indices
    model_indices = functions.get_indices_of_patterns(header, MODEL_PATTERNS)
    if len(model_indices) > 1:
        post_file.close()
        raise errors.ParameterParsingError('posterior file {0} has {1} model '
                                           'columns'.format(
                                               post_file.name,
                                               len(model_indices)))
    if model_indices:
        indices['model'] = model_indices
    div_model_indices = functions.get_indices_of_patterns(
        header, DIV_MODEL_PATTERNS)
    if len(div_model_indices) > 1:
        post_file.close()
        raise errors.ParameterParsingError('posterior file {0} has {1} div '
                                           'model columns'.format(
                                               post_file.name,
                                               len(div_model_indices)))
    if div_model_indices:
        indices['div_model'] = div_model_indices
    samples = dict(zip(indices.keys(), [None for i in range(len(indices))]))
    for i, line in enumerate(post_file):
        l = line.strip().split()
        if l:
            if len(l) != len(header):
                post_file.close()
                raise errors.ParameterParsingError(
                    'posterior file {0} has '
                    '{1} columns at line {2}; expecting {3}'.format(
                        post_file.name, len(l), i + 2, len(header)))
            for k, idx_list in indices.iteritems():
                if k in ['mean_tau', 'omega', 'cv']:
                    samples[k] = [float(l[i]) for i in idx_list]
                elif k in ['psi', 'model', 'div_model']:
                    samples[k] = [int(l[i]) for i in idx_list]
                elif k in ['taus', 'a_thetas', 'd1_thetas', 'd2_thetas']:
                    samples[k] = [[float(l[i]) for i in idx_list]]
                else:
                    post_file.close()
                    raise errors.ParameterParsingError(
                        'unexpected key {0!r}; '
                        'posterior file {1}, line {2}'.format(
                            k, post_file.name, i + 2))
            if include_line:
                yield samples, l
            else:
                yield samples
    if close:
        post_file.close()
Example #47
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        '-c',
        '--config',
        type=arg_is_config,
        required=True,
        help=('msBayes config file to be used to generate saturation '
              'plot.'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=int,
        default=1000,
        help=('The number of prior samples to simulate for the '
              'saturation plot.'))
    parser.add_argument(
        '--np',
        action='store',
        type=int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-o',
        '--output-dir',
        action='store',
        type=arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the first observed '
              'config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '-s',
        '--stat-prefixes',
        nargs='*',
        type=str,
        default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument(
        '--vertical-lines',
        nargs='*',
        type=float,
        default=[],
        help=('Positions along x-axis where vertical lines are to be '
              'drawn. Default is to draw no vertical lines.'))
    parser.add_argument('--compress',
                        action='store_true',
                        help='Compress plot data file.')
    parser.add_argument('--keep-temps',
                        action='store_true',
                        help='Keep all temporary files.')
    parser.add_argument('--seed',
                        action='store',
                        type=int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         get_stats_by_time, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes.plotting import MATPLOTLIB_AVAILABLE, SaturationPlotGrid

    MSBAYES_SORT_INDEX.set_index(0)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    stats_by_time_path = os.path.join(args.output_dir, 'stats-by-time.txt')
    if args.compress:
        stats_by_time_path += '.gz'
    plot_path = os.path.join(args.output_dir, 'saturation-plot.pdf')

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
        [s + '.' for s in args.stat_prefixes], ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs
    cfg.div_model_prior = 'constrained'
    cfg.psi = probability.DiscreteUniformDistribution(num_taxon_pairs,
                                                      num_taxon_pairs)
    config_path = temp_fs.get_file_path(prefix='cfg-')
    cfg.write(config_path)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()),
               log.info)
    info.write(
        '\tstat_patterns = {0!r}'.format(', '.join(
            [p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
               log.info)
    info.write('\tstats_by_time_path = {0!r}'.format(stats_by_time_path),
               log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(temp_fs=temp_fs,
                          sample_size=sample_size,
                          config_path=config_path,
                          report_parameters=True,
                          schema=schema,
                          include_header=True,
                          stat_patterns=stat_patterns,
                          write_stats_file=False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(workers=workers, num_processors=args.np)
    log.info('Parsing samples...')
    stats_by_time = get_stats_by_time([w.prior_path for w in workers])
    stat_keys = stats_by_time.keys()
    stat_keys.remove('PRI.t')
    for prefix in args.stat_prefixes:
        if prefix not in stat_keys:
            raise Exception('stat prefix {0!r} not found in simulated stats:'
                            '\n\t{1}'.format(prefix, ', '.join(stat_keys)))
    header = ['PRI.t'] + args.stat_prefixes
    log.info('Writing stats-by-time matrix...')
    out, close = process_file_arg(stats_by_time_path,
                                  'w',
                                  compresslevel=compress_level)
    for row in dict_line_iter(stats_by_time, sep='\t', header=header):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')

    if not MATPLOTLIB_AVAILABLE:
        log.warning(
            '`matplotlib` could not be imported, so the plot can not be\n'
            'produced. The data to create the plot can be found in:\n\t'
            '{0!r}'.format(stats_by_time_path))
    else:
        y_labels = {
            'pi': r'$\pi$',
            'pi.net': r'$\pi_{net}$',
            'wattTheta': r'$\theta_W$',
            'tajD.denom': r'$SD(\pi - \theta_W)$'
        }
        spg = SaturationPlotGrid(stats_by_time,
                                 x_key='PRI.t',
                                 y_keys=args.stat_prefixes,
                                 y_labels=y_labels,
                                 num_columns=2,
                                 vertical_line_positions=args.vertical_lines)
        fig = spg.create_grid()
        fig.savefig(plot_path)

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
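A minimal sketch of driving the CLI above programmatically, e.g. from a test. The module name `saturation_plot` and the config path are assumptions for illustration only; any valid msBayes config file would do.
# Hypothetical programmatic invocation of main_cli(); argparse reads its
# arguments from sys.argv, so we set it before calling.
import sys

sys.argv = ['saturation_plot', '-c', 'dpp-msbayes.cfg',
            '-n', '500', '--np', '2', '--compress', '--seed', '1234']
main_cli()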
def summarize_sim_results(info_path):
    info_path = expand_path(info_path)
    sim_results = DMCSimulationResults(info_path)
    out_dir = os.path.dirname(info_path)
    summary_path = os.path.join(out_dir, 'results-summary.txt')
    result_path = sim_results.get_result_summary_path(observed_index = 1,
            prior_index = sim_results.combined_prior_index)
    d = get_dict_from_spreadsheets([result_path])
    num_excluded = [int(x) for x in d['num_excluded']]
    num_excluded_glm = [int(x) for x in d['num_excluded_glm']]
    bf_num_excluded = [int(x) for x in d['bf_num_excluded']]
    bf_num_excluded_glm = [int(x) for x in d['bf_num_excluded_glm']]
    prob_of_exclusion = [float(x) for x in d['prob_of_exclusion']]
    prob_of_exclusion_glm = [float(x) for x in d['prob_of_exclusion_glm']]
    prior_prob_of_exclusion = [float(x) for x in d['prior_prob_of_exclusion']]
    bf_of_exclusion = [float(x) for x in d['bf_of_exclusion']]
    bf_of_exclusion_glm = [float(x) for x in d['bf_of_exclusion_glm']]
    num_sims = sim_results.num_sim_reps
    assert len(num_excluded) == num_sims
    assert len(num_excluded_glm) == num_sims
    assert len(prob_of_exclusion) == num_sims
    assert len(prob_of_exclusion_glm) == num_sims
    summary_stream, close = process_file_arg(summary_path, 'w')
    summary_stream.write('Proportion of simulations excluding truth: {0}'
            '\n'.format(
                len([1 for x in bf_num_excluded if x > 0]) / float(num_sims)))
    summary_stream.write('Proportion of simulations excluding truth with GLM-'
            'adjustment: {0}\n'.format(
                len([1 for x in bf_num_excluded_glm if x > 0]) / float(num_sims)))
    summary_stream.write('Average number of tau parameters excluded: {0}'
            '\n'.format(
                sum(bf_num_excluded) / float(num_sims)))
    summary_stream.write('Average number of tau parameters excluded with GLM: '
            '{0}\n'.format(sum(bf_num_excluded_glm) / float(num_sims)))
    summary_stream.write('Mode number of tau parameters excluded: {0}\n'.format(
            mode_list(bf_num_excluded)))
    summary_stream.write('Mode number of tau parameters excluded with GLM: '
            '{0}\n'.format(mode_list(bf_num_excluded_glm)))
    summary_stream.write('Max number of tau parameters excluded: {0}\n'.format(
            max(bf_num_excluded)))
    summary_stream.write('Max number of tau parameters excluded with GLM: '
            '{0}\n'.format(max(bf_num_excluded_glm)))
    summary_stream.write('Average probability of exclusion: {0}\n'.format(
            sum(prob_of_exclusion) / float(num_sims)))
    summary_stream.write('Average probability of exclusion with GLM: {0}\n'.format(
            sum(prob_of_exclusion_glm) / float(num_sims)))
    summary_stream.write('Median probability of exclusion: {0}\n'.format(
            median(prob_of_exclusion)))
    summary_stream.write('Median probability of exclusion with GLM: {0}\n'.format(
            median(prob_of_exclusion_glm)))
    summary_stream.write('Average Bayes factor of exclusion: {0}\n'.format(
            sum(bf_of_exclusion) / float(num_sims)))
    summary_stream.write('Average Bayes factor of exclusion with GLM: {0}\n'.format(
            sum(bf_of_exclusion_glm) / float(num_sims)))
    summary_stream.write('Median Bayes factor of exclusion: {0}\n'.format(
            median(bf_of_exclusion)))
    summary_stream.write('Median Bayes factor of exclusion with GLM: {0}\n'.format(
            median(bf_of_exclusion_glm)))
    summary_stream.write('Max Bayes factor of exclusion: {0}\n'.format(
            max(bf_of_exclusion)))
    summary_stream.write('Max Bayes factor of exclusion with GLM: {0}\n'.format(
            max(bf_of_exclusion_glm)))
    prob_of_bf_exclusion = (len([1 for x in bf_of_exclusion if x > 10.0]) /
            float(num_sims))
    prob_of_bf_exclusion_glm = (len([
            1 for x in bf_of_exclusion_glm if x > 10.0]) /
            float(num_sims))
    summary_stream.write('Estimated probability Bayes factor of exclusion '
            '> 10: {0}\n'.format(prob_of_bf_exclusion))
    summary_stream.write('Estimated probability Bayes factor of exclusion '
            '> 10 with GLM: {0}\n'.format(prob_of_bf_exclusion_glm))
    summary_stream.close()
    if plotting.MATPLOTLIB_AVAILABLE:
        approx_prior_exclusion = 0.39184
        prior_odds = approx_prior_exclusion / (1.0 - approx_prior_exclusion)
        post_odds = prior_odds * 10
        post = post_odds / (1.0 + post_odds)
        observed_config1 = MsBayesConfig(sim_results.observed_index_to_config[1])
        observed_config2 = MsBayesConfig(sim_results.observed_index_to_config[1])
        cfg_to_num_ex = {observed_config1: bf_num_excluded,
                observed_config2: bf_num_excluded_glm}
        cfg_to_prob_exclusion = {observed_config1: prob_of_exclusion,
                observed_config2: prob_of_exclusion_glm}
        cfg_to_prob_of_bf_exclusion = {observed_config1: prob_of_bf_exclusion,
                observed_config2: prob_of_bf_exclusion_glm}
        ex_prob_plot = plotting.ProbabilityPowerPlotGrid(
                observed_config_to_estimates = cfg_to_prob_exclusion,
                variable = 'tau_exclusion',
                div_model_prior = 'psi',
                bayes_factor = 10,
                bayes_factor_prob = post,
                cfg_to_prob_of_bf_exclusion = cfg_to_prob_of_bf_exclusion,
                height = 3.7,
                margin_left = 0.03,
                margin_bottom = 0.06,
                margin_right = 1,
                margin_top = 0.96,
                padding_between_horizontal = 0.5,
                padding_between_vertical = 1.0,
                num_columns = 2)
        fig = ex_prob_plot.create_grid()
        fig.savefig(os.path.join(out_dir, 'prob_of_exclusion.pdf'))
        ex_plot = plotting.PowerPlotGrid(
                observed_config_to_estimates = cfg_to_num_ex,
                variable = 'tau_exclusion',
                num_columns = 2,
                height = 3.7,
                margin_left = 0.03,
                margin_bottom = 0.06,
                margin_right = 1,
                margin_top = 0.95,
                padding_between_horizontal = 0.5,
                padding_between_vertical = 1.0)
        fig = ex_plot.create_grid()
        fig.savefig(os.path.join(out_dir, 'num_tau_excluded.pdf'))
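A short usage sketch for the summarizer above. The info-file path is hypothetical; the function writes results-summary.txt (and, when matplotlib is available, the two PDF plots) into the same directory.
# Hypothetical call; the path points at the pymsbayes info file written by a
# completed simulation run.
summarize_sim_results('sim-results/pymsbayes-info.txt')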
 def assertSameSamples(self,
                       files,
                       columns_to_ignore=[],
                       header=True,
                       places=5,
                       num_mismatches_per_sample=0,
                       num_sample_mismatches=0):
     files = list(files)
     all_equal = True
     diffs = StringIO()
     f1, close = process_file_arg(files.pop(0))
     f1_lines = f1.readlines()
     indices = [
         i for i in range(len(f1_lines[0].strip().split()))
         if i not in columns_to_ignore
     ]
     h1 = []
     if header:
         head = f1_lines.pop(0).strip().split()
         h1 = [head[i] for i in indices]
     lines1 = sorted(f1_lines)
     for f in files:
         f2, close2 = process_file_arg(f)
         f2_lines = f2.readlines()
         h2 = []
         if header:
             head = f2_lines.pop(0).strip().split()
             h2 = [head[i] for i in indices]
             if h1 != h2:
                 all_equal = False
                 diffs.write('{0} and {1} have different headers; not '
                             'comparing further\n'.format(f1.name, f2.name))
                 continue
         lines2 = sorted(f2_lines)
         if len(lines1) != len(lines2):
             all_equal = False
             diffs.write('{0} ({1}) and {2} ({3}) have different '
                         'number of lines\n'.format(f1.name, len(lines1),
                                                    f2.name, len(lines2)))
         n_matches = 0
         n_mismatches = 0
         for l1 in lines1:
             found = False
             for l2 in lines2:
                 values1 = l1.strip().split()
                 values2 = l2.strip().split()
                 v1 = [float(values1[x]) for x in indices]
                 v2 = [float(values2[x]) for x in indices]
                 if self.same_samples(
                         v1,
                         v2,
                         places=places,
                         num_mismatches=num_mismatches_per_sample):
                     found = True
             if found:
                 n_matches += 1
             else:
                 n_mismatches += 1
         if n_mismatches > 0:
             if n_mismatches > num_sample_mismatches:
                 all_equal = False
             diffs.write('{0} and {1}\nhave {2} mismatching samples and '
                         'share {3} samples\n'.format(
                             f1.name, f2.name, n_mismatches, n_matches))
         if close2:
             f2.close()
     if diffs.getvalue() != '':
         _LOG.error('files are not equal after sorting:\n{0}\n'.format(
             diffs.getvalue()))
     self.assertTrue(all_equal)
     if close:
         f1.close()
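A sketch of how the assertion helper above might be used from a test method, assuming it is mixed into a unittest.TestCase subclass and that both result files exist (the file paths are hypothetical).
def test_posterior_samples_match(self):
    # Column 0 (e.g. a sample index) is ignored, values are compared to 4
    # decimal places, and up to two whole-sample mismatches are tolerated.
    self.assertSameSamples(
        files=['expected-posterior.txt', 'observed-posterior.txt'],
        columns_to_ignore=[0],
        header=True,
        places=4,
        num_sample_mismatches=2)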
Example #50
def parameter_iter(file_obj, include_line = False, include_thetas = False):
    indices = {}
    post_file, close = process_file_arg(file_obj)
    header = parse_header(post_file, seek = False)
    mean_t_indices = functions.get_indices_of_patterns(header,
            MEAN_TAU_PATTERNS)
    if len(mean_t_indices) > 1:
        post_file.close()
        raise errors.ParameterParsingError('posterior file {0} has {1} mean '
                'tau columns'.format(post_file.name, len(mean_t_indices)))
    if mean_t_indices:
        indices['mean_tau'] = mean_t_indices
    omega_indices = functions.get_indices_of_patterns(header, OMEGA_PATTERNS)
    if len(omega_indices) > 1:
        post_file.close()
        raise errors.ParameterParsingError('posterior file {0} has {1} omega '
                'columns'.format(post_file.name, len(omega_indices)))
    if omega_indices:
        indices['omega'] = omega_indices
    cv_indices = functions.get_indices_of_patterns(header, CV_PATTERNS)
    if len(cv_indices) > 1:
        post_file.close()
        raise errors.ParameterParsingError('posterior file {0} has {1} cv '
                'columns'.format(post_file.name, len(cv_indices)))
    if cv_indices:
        indices['cv'] = cv_indices
    t_indices = functions.get_indices_of_patterns(header, TAU_PATTERNS)
    if t_indices:
        indices['taus'] = t_indices
    if include_thetas:
        a_theta_indices = functions.get_indices_of_patterns(header,
                A_THETA_PATTERNS)
        d1_theta_indices = functions.get_indices_of_patterns(header,
                D1_THETA_PATTERNS)
        d2_theta_indices = functions.get_indices_of_patterns(header,
                D2_THETA_PATTERNS)
        if a_theta_indices:
            indices['a_thetas'] = a_theta_indices
        if d1_theta_indices:
            indices['d1_thetas'] = d1_theta_indices
        if d2_theta_indices:
            indices['d2_thetas'] = d2_theta_indices
    psi_indices = functions.get_indices_of_patterns(header, PSI_PATTERNS)
    if len(psi_indices) > 1:
        post_file.close()
        raise errors.ParameterParsingError('posterior file {0} has {1} psi '
                'columns'.format(post_file.name, len(psi_indices)))
    if psi_indices:
        indices['psi'] = psi_indices
    model_indices = functions.get_indices_of_patterns(header, MODEL_PATTERNS)
    if len(model_indices) > 1:
        post_file.close()
        raise errors.ParameterParsingError('posterior file {0} has {1} model '
                'columns'.format(post_file.name, len(model_indices)))
    if model_indices:
        indices['model'] = model_indices
    div_model_indices = functions.get_indices_of_patterns(header,
            DIV_MODEL_PATTERNS)
    if len(div_model_indices) > 1:
        post_file.close()
        raise errors.ParameterParsingError('posterior file {0} has {1} div '
                'model columns'.format(post_file.name, len(div_model_indices)))
    if div_model_indices:
        indices['div_model'] = div_model_indices
    samples = dict(zip(indices.keys(), [None for i in range(len(indices))]))
    for i, line in enumerate(post_file):
        l = line.strip().split()
        if l:
            if len(l) != len(header):
                post_file.close()
                raise errors.ParameterParsingError('posterior file {0} has '
                        '{1} columns at line {2}; expecting {3}'.format(
                                post_file.name, len(l), i + 2, len(header)))
            # Use a distinct comprehension variable so the row index `i`
            # (needed for the error message below) is not clobbered; list
            # comprehensions leak their loop variable in Python 2.
            for k, idx_list in indices.iteritems():
                if k in ['mean_tau', 'omega', 'cv']:
                    samples[k] = [float(l[idx]) for idx in idx_list]
                elif k in ['psi', 'model', 'div_model']:
                    samples[k] = [int(l[idx]) for idx in idx_list]
                elif k in ['taus', 'a_thetas', 'd1_thetas', 'd2_thetas']:
                    samples[k] = [[float(l[idx]) for idx in idx_list]]
                else:
                    post_file.close()
                    raise errors.ParameterParsingError('unexpected key {0!r}; '
                            'posterior file {1}, line {2}'.format(
                                k, post_file.name, i + 2))
            if include_line:
                yield samples, l
            else:
                yield samples
    if close:
        post_file.close()
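A minimal sketch of consuming the generator above. The posterior file path is hypothetical; each yielded dict maps parameter groups (e.g. 'mean_tau', 'taus', 'psi') to the values parsed from one row.
# Hypothetical iteration over a posterior sample file; extract the mean-tau
# value from each row and report its average.
mean_taus = []
for sample in parameter_iter('posterior-sample.txt'):
    if 'mean_tau' in sample:
        mean_taus.append(sample['mean_tau'][0])
if mean_taus:
    print('average of the mean-tau column: {0}'.format(
            sum(mean_taus) / len(mean_taus)))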
Example #51
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description)
    parser.add_argument('-c', '--config',
            type = arg_is_config,
            required = True,
            help = ('msBayes config file to be used to generate saturation '
                    'plot.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = int,
            default = 1000,
            help = ('The number of prior samples to simulate for the '
                    'saturation plot.'))
    parser.add_argument('--np',
            action = 'store',
            type = int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('-o', '--output-dir',
            action = 'store',
            type = arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            default = ['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--vertical-lines',
            nargs = '*',
            type = float,
            default = [],
            help = ('Positions along x-axis where vertical lines are to be '
                    'drawn. Default is to draw no vertical lines.'))
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress plot data file.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, get_stats_by_time, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes.plotting import MATPLOTLIB_AVAILABLE, SaturationPlotGrid

    MSBAYES_SORT_INDEX.set_index(0)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    stats_by_time_path = os.path.join(args.output_dir, 'stats-by-time.txt')
    if args.compress:
        stats_by_time_path += '.gz'
    plot_path = os.path.join(args.output_dir, 'saturation-plot.pdf')

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs
    cfg.div_model_prior = 'constrained'
    cfg.psi = probability.DiscreteUniformDistribution(num_taxon_pairs,
            num_taxon_pairs)
    config_path = temp_fs.get_file_path(prefix='cfg-')
    cfg.write(config_path)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
            ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
    info.write('\tstats_by_time_path = {0!r}'.format(stats_by_time_path),
            log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(
                temp_fs = temp_fs,
                sample_size = sample_size,
                config_path = config_path,
                report_parameters = True,
                schema = schema,
                include_header = True,
                stat_patterns = stat_patterns,
                write_stats_file = False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(
            workers = workers,
            num_processors = args.np)
    log.info('Parsing samples...')
    stats_by_time = get_stats_by_time([w.prior_path for w in workers])
    stat_keys = stats_by_time.keys()
    stat_keys.remove('PRI.t')
    for prefix in args.stat_prefixes:
        if prefix not in stat_keys:
            raise Exception('stat prefix {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(prefix, ', '.join(stat_keys)))
    header = ['PRI.t'] + args.stat_prefixes
    log.info('Writing stats-by-time matrix...')
    out, close = process_file_arg(stats_by_time_path, 'w',
            compresslevel = compress_level)
    for row in dict_line_iter(stats_by_time, sep = '\t', header = header):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')

    if not MATPLOTLIB_AVAILABLE:
        log.warning(
                '`matplotlib` could not be imported, so the plot can not be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(stats_by_time_path))
    else:
        y_labels = {'pi': r'$\pi$',
                    'pi.net': r'$\pi_{net}$',
                    'wattTheta': r'$\theta_W$',
                    'tajD.denom': r'$SD(\pi - \theta_W)$'}
        spg = SaturationPlotGrid(stats_by_time,
                x_key = 'PRI.t',
                y_keys = args.stat_prefixes,
                y_labels = y_labels,
                num_columns = 2,
                vertical_line_positions = args.vertical_lines)
        fig = spg.create_grid()
        fig.savefig(plot_path)

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse_utils.SmartHelpFormatter)
    parser.add_argument(
        '-c',
        '--config',
        type=argparse_utils.arg_is_config,
        required=True,
        help=('msBayes config file to be used to generate saturation '
              'plot.'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=int,
        default=1000,
        help=('The number of prior samples to simulate for the '
              'saturation plot.'))
    parser.add_argument(
        '--np',
        action='store',
        type=int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-o',
        '--output-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the first observed '
              'config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '-s',
        '--stat-prefixes',
        nargs='*',
        type=str,
        default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--sort-index',
                        action='store',
                        type=int,
                        default=0,
                        choices=range(12),
                        help=argparse_utils.get_sort_index_help_message())
    parser.add_argument('--compress',
                        action='store_true',
                        help='Compress plot data file.')
    parser.add_argument('--keep-temps',
                        action='store_true',
                        help='Keep all temporary files.')
    parser.add_argument('--seed',
                        action='store',
                        type=int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         get_dict_from_spreadsheets,
                                         dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability, stats
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes import plotting

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    sample_path = os.path.join(args.output_dir, 'prior-sample.txt')
    if args.compress:
        sample_path += '.gz'

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
        [s + '.' for s in args.stat_prefixes], ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()),
               log.info)
    info.write(
        '\tstat_patterns = {0!r}'.format(', '.join(
            [p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
               log.info)
    info.write('\tsample_path = {0!r}'.format(sample_path), log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(temp_fs=temp_fs,
                          sample_size=sample_size,
                          config_path=args.config,
                          report_parameters=True,
                          schema=schema,
                          include_header=True,
                          stat_patterns=stat_patterns,
                          write_stats_file=False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(workers=workers, num_processors=args.np)
    log.info('Parsing samples...')
    sample = get_dict_from_spreadsheets([w.prior_path for w in workers])

    log.info('Writing prior samples...')
    out, close = process_file_arg(sample_path,
                                  'w',
                                  compresslevel=compress_level)
    for row in dict_line_iter(sample, sep='\t'):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')

    if not plotting.MATPLOTLIB_AVAILABLE:
        log.warning(
            '`matplotlib` could not be imported, so the plot can not be\n'
            'produced. The data to create the plot can be found in:\n\t'
            '{0!r}'.format(sample_path))
        sys.exit(1)

    for stat_pattern in stat_patterns:
        found = False
        for stat, values in sample.iteritems():
            if stat_pattern.match(stat):
                values = [float(v) for v in values]
                found = True
                plot_path = os.path.join(args.output_dir,
                                         'plot-{0}.pdf'.format(stat))
                summary = stats.get_summary(values)
                s = r'mean = {0:.4f} ({1:.4f}-{2:.4f})'.format(
                    summary['mean'], summary['qi_95'][0], summary['qi_95'][1])
                hd = plotting.HistData(x=values,
                                       normed=True,
                                       bins=20,
                                       histtype='bar',
                                       align='mid',
                                       orientation='vertical',
                                       zorder=0)
                hist = plotting.ScatterPlot(hist_data_list=[hd], right_text=s)
                hist.left_text_size = 12.0
                hist.right_text_size = 12.0
                xticks = [i for i in hist.ax.get_xticks()]
                xtick_labels = [i for i in xticks]
                yticks = [i for i in hist.ax.get_yticks()]
                ytick_labels = [i for i in yticks]
                if len(xtick_labels) >= 8:
                    for i in range(1, len(xtick_labels), 2):
                        xtick_labels[i] = ''
                if len(ytick_labels) >= 8:
                    for i in range(1, len(ytick_labels), 2):
                        ytick_labels[i] = ''
                xticks_obj = plotting.Ticks(ticks=xticks,
                                            labels=xtick_labels,
                                            horizontalalignment='center')
                yticks_obj = plotting.Ticks(ticks=yticks, labels=ytick_labels)
                hist.xticks_obj = xticks_obj
                hist.yticks_obj = yticks_obj

                plot_grid = plotting.PlotGrid(subplots=[hist],
                                              num_columns=1,
                                              label_schema=None,
                                              title=stat,
                                              title_size=14.0,
                                              title_top=False,
                                              y_title='Density',
                                              y_title_position=0.001,
                                              y_title_size=14.0,
                                              height=4.0,
                                              width=6.0,
                                              auto_height=False)
                plot_grid.auto_adjust_margins = False
                plot_grid.margin_left = 0.04
                plot_grid.margin_bottom = 0.04
                plot_grid.margin_right = 1.0
                plot_grid.margin_top = 0.97
                plot_grid.reset_figure()
                plot_grid.savefig(plot_path)

        if not found:
            raise Exception('stat pattern {0!r} not found in simulated stats:'
                            '\n\t{1}'.format(stat_pattern,
                                             ', '.join(sample.keys())))

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
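All of the snippets above depend on process_file_arg returning a (file_object, should_close) pair and handling gzip transparently. The following is a rough, stdlib-only sketch of that convention for readers who want to mimic it outside the library; it is an assumption-laden stand-in, not pymsbayes.fileio.process_file_arg itself.
import gzip

def open_file_arg(file_arg, mode='r', compresslevel=None):
    # Sketch of the (stream, close) convention: already-open file objects are
    # passed through and left for the caller to close; string paths are opened
    # here (optionally gzipped) and flagged for closing.
    if hasattr(file_arg, 'read') or hasattr(file_arg, 'write'):
        return file_arg, False
    if compresslevel:
        return gzip.open(file_arg, mode, compresslevel), True
    return open(file_arg, mode), True

out, close = open_file_arg('example-output.txt', 'w')
out.write('example line\n')
if close:
    out.close()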