Python TempFileSystem Examples

Programming Language: Python

Namespace/Package Name: pymsbayes.utils.tempfs

Class/Type: TempFileSystem

Examples at hotexamples.com: 10

Python TempFileSystem - 10 examples found. These are the top rated real world Python examples of pymsbayes.utils.tempfs.TempFileSystem extracted from open source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

TempFileSystem(4)

purge(4)

get_file_path(2)

_register_dir(1)

_register_file(1)

create_subdir(1)

Example #1

Show file

File: pymsbayes_test_case.py Project: vishalbelsare/PyMsBayes

 def set_up(self):
     """Prepare per-test scratch space.

     Builds a ``TempFileSystem`` under ``package_paths.test_path()`` (with
     the directory prefix ``'PyMsBayesTestTemp-'``) and stores a test id made
     of ``'pymsbayes-'`` followed by ``random_str()``.
     """
     scratch_parent = package_paths.test_path()
     self.temp_fs = TempFileSystem(
         parent=scratch_parent,
         prefix='PyMsBayesTestTemp-')
     id_suffix = random_str()
     self.test_id = 'pymsbayes-' + id_suffix

Example #2

Show file

def main_cli():
    """Command-line entry point that simulates prior samples and draws a
    saturation plot.

    Steps, as performed below:

    1. Parse command-line options and configure logging.
    2. Write a copy of the msBayes config, constrained so that ``psi`` is
       fixed at the number of taxon pairs, into a temporary file system.
    3. Split the requested number of prior samples across ``--np``
       ``MsBayesWorker`` instances and run them via ``Manager``.
    4. Write the resulting stats-by-time table and, when matplotlib is
       available, the saturation plot PDF to the output directory.
    5. Purge temporary files unless ``--keep-temps`` was given.

    Raises:
        Exception: if a requested stat prefix is absent from the simulated
            statistics.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description)
    parser.add_argument('-c', '--config',
            type = arg_is_config,
            required = True,
            help = ('msBayes config file to be used to generate saturation '
                    'plot.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = int,
            default = 1000,
            help = ('The number of prior samples to simulate for the '
                    'saturation plot.'))
    parser.add_argument('--np',
            action = 'store',
            type = int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('-o', '--output-dir',
            action = 'store',
            type = arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            default = ['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--vertical-lines',
            nargs = '*',
            type = float,
            default = [],
            help = ('Positions along x-axis where vertical lines are to be '
                    'drawn. Default is to draw no vertical lines.'))
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress plot data file.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    # Project imports are deferred until after argument parsing, so that
    # `--help`/`--version` exit before any of them are loaded.
    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    # --debug is checked last, so it wins when combined with --quiet.
    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, get_stats_by_time, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes.plotting import MATPLOTLIB_AVAILABLE, SaturationPlotGrid

    MSBAYES_SORT_INDEX.set_index(0)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    stats_by_time_path = os.path.join(args.output_dir, 'stats-by-time.txt')
    if args.compress:
        stats_by_time_path += '.gz'
    plot_path = os.path.join(args.output_dir, 'saturation-plot.pdf')

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    # Normalise prefixes so each pattern is built from exactly one trailing
    # '.', whether or not the user typed it.
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case=True)
    # Only draw a random seed when none was supplied; an explicit `--seed 0`
    # must be honoured rather than treated as "missing".
    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    # Constrain the model so psi is fixed at the number of taxon pairs, then
    # stage the modified config in the temp file system for the workers.
    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs
    cfg.div_model_prior = 'constrained'
    cfg.psi = probability.DiscreteUniformDistribution(num_taxon_pairs,
            num_taxon_pairs)
    config_path = temp_fs.get_file_path(prefix='cfg-')
    cfg.write(config_path)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
            ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
    info.write('\tstats_by_time_path = {0!r}'.format(stats_by_time_path),
            log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    # Never start more workers than there are samples to simulate.
    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        # The last worker absorbs whatever does not divide evenly.
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(
                temp_fs = temp_fs,
                sample_size = sample_size,
                config_path = config_path,
                report_parameters = True,
                schema = schema,
                include_header = True,
                stat_patterns = stat_patterns,
                write_stats_file = False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(
            workers = workers,
            num_processors = args.np)
    log.info('Parsing samples...')
    stats_by_time = get_stats_by_time([w.prior_path for w in workers])
    # Copy the keys into a list: a dict view (Python 3) has no `remove`.
    stat_keys = list(stats_by_time.keys())
    stat_keys.remove('PRI.t')
    for prefix in args.stat_prefixes:
        if prefix not in stat_keys:
            raise Exception('stat prefix {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(prefix, ', '.join(stat_keys)))
    header = ['PRI.t'] + args.stat_prefixes
    log.info('Writing stats-by-time matrix...')
    out, close = process_file_arg(stats_by_time_path, 'w',
            compresslevel = compress_level)
    # Close the output even if writing a row fails part-way through.
    try:
        for row in dict_line_iter(stats_by_time, sep = '\t', header = header):
            out.write(row)
    finally:
        if close:
            out.close()

    log.info('Creating plots...')

    if not MATPLOTLIB_AVAILABLE:
        log.warning(
                '`matplotlib` could not be imported, so the plot can not be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(stats_by_time_path))
    else:
        y_labels = {'pi': r'$\pi$',
                   'pi.net': r'$\pi_{net}$',
                   'wattTheta': r'$\theta_W$',
                   'tajD.denom': r'$SD(\pi - \theta_W)$'}
        spg = SaturationPlotGrid(stats_by_time,
                x_key = 'PRI.t',
                y_keys = args.stat_prefixes,
                y_labels = y_labels,
                num_columns = 2,
                vertical_line_positions = args.vertical_lines)
        fig = spg.create_grid()
        fig.savefig(plot_path)

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()

Example #3

Show file

File: pymsbayes_test_case.py Project: vishalbelsare/PyMsBayes

class PyMsBayesTestCase(unittest.TestCase):
    def set_up(self):
        """Prepare per-test scratch space.

        Builds a ``TempFileSystem`` under ``package_paths.test_path()`` (with
        the directory prefix ``'PyMsBayesTestTemp-'``) and stores a test id
        made of ``'pymsbayes-'`` followed by ``random_str()``.
        """
        scratch_parent = package_paths.test_path()
        self.temp_fs = TempFileSystem(
            parent=scratch_parent,
            prefix='PyMsBayesTestTemp-')
        id_suffix = random_str()
        self.test_id = 'pymsbayes-' + id_suffix

    def tear_down(self):
        """Clean up after a test.

        Registers files/directories carrying this test's id, purges the
        temporary file system, and asserts that ``FileStream.open_files`` is
        an empty set.
        """
        self.register_file_system()
        self.temp_fs.purge()
        still_open = FileStream.open_files
        self.assertEqual(still_open, set())

    def get_test_path(self, parent=None, prefix='temp'):
        """Return a file path supplied by this test's temp file system.

        ``parent`` and ``prefix`` are forwarded unchanged to
        ``self.temp_fs.get_file_path``.
        """
        temp_path = self.temp_fs.get_file_path(parent=parent, prefix=prefix)
        return temp_path

    def get_test_subdir(self, parent=None, prefix='temp'):
        """Return a sub-directory supplied by this test's temp file system.

        ``parent`` and ``prefix`` are forwarded unchanged to
        ``self.temp_fs.create_subdir``.
        """
        subdir = self.temp_fs.create_subdir(parent=parent, prefix=prefix)
        return subdir

    def register_file(self, path):
        """Register ``path`` as a file with this test's temp file system."""
        file_system = self.temp_fs
        file_system._register_file(path)

    def register_dir(self, path):
        """Register ``path`` as a directory with this test's temp file
        system."""
        file_system = self.temp_fs
        file_system._register_dir(path)

    def register_file_system(self):
        """Register every file and directory under the temp base directory
        whose name starts with this test's id.

        Walks ``self.temp_fs.base_dir`` with ``os.walk``; within each visited
        directory, matching files are registered before matching
        sub-directories.
        """
        _LOG.debug('registering test file system...')
        for dirpath, subdirs, filenames in os.walk(self.temp_fs.base_dir):
            owned_files = [n for n in filenames if n.startswith(self.test_id)]
            for name in owned_files:
                self.register_file(os.path.join(dirpath, name))
            owned_dirs = [n for n in subdirs if n.startswith(self.test_id)]
            for name in owned_dirs:
                self.register_dir(os.path.join(dirpath, name))

    def _exe_script(self,
                    script_name,
                    args,
                    stdout=None,
                    stderr=None,
                    return_code=0):
        """Run a package script in a subprocess and check its results.

        Args:
            script_name: name resolved via ``package_paths.script_path``.
            args: either a whitespace-separated string or an iterable of
                arguments; every argument is converted with ``str``.
            stdout: if not ``None``, the captured standard output must equal
                this value (compared against what ``communicate`` returns).
            stderr: if not ``None``, the captured standard error must equal
                this value.
            return_code: the exit status the script is expected to return.

        Raises:
            AssertionError: if the exit code or a requested stream differs
                from its expected value.
        """
        script_path = package_paths.script_path(script_name)
        if isinstance(args, str):
            arg_list = args.split()
        else:
            arg_list = args
        cmd = [sys.executable, script_path] + [str(x) for x in arg_list]
        _LOG.debug('Invocation:\n\t{0}'.format(' '.join(cmd)))
        p = subprocess.Popen(cmd,
                             shell=False,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.PIPE)
        # communicate() already waits for the process to exit, so the exit
        # status can be read directly afterwards.
        o, e = p.communicate()
        exit_code = p.returncode
        if exit_code != return_code:
            _LOG.error("exit code {0} did not match {1}".format(
                exit_code, return_code))
            _LOG.error("here is the stdout:\n{0}".format(o))
            _LOG.error("here is the stderr:\n{0}".format(e))
        self.assertEqual(exit_code, return_code)
        if stdout is not None:
            if o != stdout:
                _LOG.error("std out did not match expected:\n{0}".format(o))
            self.assertEqual(o, stdout)
        if stderr is not None:
            if e != stderr:
                _LOG.error("std error did not match expected:\n{0}".format(e))
            self.assertEqual(e, stderr)

    def get_expected_indices(self,
                             num_pairs,
                             dummy_column=True,
                             parameters_reported=True):
        """Return the expected parameter and statistic column indices.

        The layout assumed by the arithmetic below is: an optional leading
        dummy column, 4 summary-parameter columns (5 when ``_CV_INCLUDED`` is
        true), then ``4 * num_pairs`` per-pair parameter columns when
        ``parameters_reported`` is true, then ``4 * num_pairs`` statistic
        columns.

        Returns:
            A ``(param_indices, stat_indices)`` tuple of lists of ints.
        """
        num_summary_params = 4
        if _CV_INCLUDED:
            num_summary_params += 1
        num_params = 4 * num_pairs
        num_default_stats = 4 * num_pairs
        start = 1 if dummy_column else 0
        # Build real lists: `range` objects cannot be concatenated in place
        # on Python 3.
        param_indices = list(range(start, start + num_summary_params))
        start += num_summary_params
        if parameters_reported:
            param_indices.extend(range(start, start + num_params))
            start += num_params
        stat_indices = list(range(start, start + num_default_stats))
        return param_indices, stat_indices

    def prior_file_is_valid(self,
                            prior_path,
                            num_of_samples,
                            num_of_columns=None):
        """Check the shape of a whitespace-delimited prior file.

        Args:
            prior_path: path of the file to inspect.
            num_of_samples: expected number of rows, not counting a first
                line that matches ``HEADER_PATTERN``.
            num_of_columns: expected number of columns on every line; when
                falsy, the column count of the first line is used.

        Returns:
            ``True`` if the file can be opened and has the expected shape,
            otherwise ``False`` (the reason is logged as an error).
        """
        try:
            # Plain text mode: the 'U' flag was removed in Python 3.11, and
            # universal newlines are the default for text mode on Python 3.
            prior_file = open(prior_path, 'r')
        except (IOError, OSError, TypeError, ValueError):
            _LOG.error('prior invalid: could not open prior path {0}'.format(
                prior_path))
            return False
        nrows = 0
        # The context manager closes the file on every exit path, including
        # the early return on a column mismatch.
        with prior_file:
            for i, line in enumerate(prior_file):
                # Only a header at the very top is excluded from the count.
                if not (nrows == 0 and HEADER_PATTERN.match(line)):
                    nrows += 1
                ncols = len(line.split())
                if not num_of_columns:
                    num_of_columns = ncols
                if num_of_columns != ncols:
                    _LOG.error(
                        'prior invalid: num of columns at line {0} is {1} '
                        'NOT {2}'.format(i + 1, ncols, num_of_columns))
                    return False
        if num_of_samples != nrows:
            _LOG.error('prior invalid: num of rows is {0} NOT {1}'.format(
                nrows, num_of_samples))
            return False
        return True

    def get_number_of_lines(self, path):
        """Return the number of lines yielded by iterating over ``path``.

        ``path`` is passed through ``process_file_arg``; the handle is closed
        afterwards only if that helper says it should be.
        """
        handle, must_close = process_file_arg(path)
        total = sum(1 for _ in handle)
        if must_close:
            handle.close()
        return total

    def get_number_of_header_lines(self, path):
        """Return how many lines of ``path`` match ``HEADER_PATTERN``.

        Each line is stripped before matching. ``path`` is passed through
        ``process_file_arg``; the handle is closed afterwards only if that
        helper says it should be.
        """
        handle, must_close = process_file_arg(path)
        total = sum(
            1 for text in handle if HEADER_PATTERN.match(text.strip()))
        if must_close:
            handle.close()
        return total

    def parse_python_config(self, path):
        """Return a ``ConfigObj`` constructed from ``path``."""
        parsed = ConfigObj(path)
        return parsed

    def get_config_from_msbayes_workers(self, msbayes_workers):
        """Return the config shared by ``msbayes_workers``.

        An ``MsBayesConfig`` is built from each worker's ``config_path``; the
        configs are checked with ``assertSameConfigs`` and the first one is
        returned.
        """
        loaded = []
        for worker in msbayes_workers:
            loaded.append(MsBayesConfig(worker.config_path))
        self.assertSameConfigs(loaded)
        return loaded[0]

    def assertSameConfigs(self, cfgs):
        configs = list(cfgs)
        c1 = configs.pop(0)
        for c2 in cfgs:
            self.assertEqual(c1.time_in_subs_per_site,
                             c2.time_in_subs_per_site)
            self.assertEqual(c1.npairs, c2.npairs)
            self.assertEqual(c1.implementation, c2.implementation)
            self.assertEqual(c1.div_model_prior, c2.div_model_prior)
            self.assertEqual(c1.bottle_proportion_shared,
                             c2.bottle_proportion_shared)
            self.assertEqual(c1.theta_parameters, c2.theta_parameters)
            self.assertEqual(c1.taxa, c2.taxa)
            self.assertTrue(c1.sample_table.equals(c2.sample_table))
            if c1.psi:
                self.assertSameDistributions(c1.psi, c2.psi)
            else:
                self.assertEqual(c1.psi, c2.psi)
            if c1.tau:
                self.assertSameDistributions(c1.tau, c2.tau)
            else:
                self.assertEqual(c1.tau, c2.tau)
            if c1.theta:
                self.assertSameDistributions(c1.theta, c2.theta)
            else:
                self.assertEqual(c1.theta, c2.theta)
            if c1.a_theta:
                self.assertSameDistributions(c1.a_theta, c2.a_theta)
            else:
                self.assertEqual(c1.a_theta, c2.a_theta)
            if c1.d_theta:
                self.assertSameDistributions(c1.d_theta, c2.d_theta)
            else:
                self.assertEqual(c1.d_theta, c2.d_theta)
            if c1.recombination:
                self.assertSameDistributions(c1.recombination,
                                             c2.recombination)
            else:
                self.assertEqual(c1.recombination, c2.recombination)
            if c1.migration:
                self.assertSameDistributions(c1.migration, c2.migration)
            else:
                self.assertEqual(c1.migration, c2.migration)
            if c1.dpp_concentration:
                self.assertSameDistributions(c1.dpp_concentration,
                                             c2.dpp_concentration)
            else:
                self.assertEqual(c1.dpp_concentration, c2.dpp_concentration)
            if c1.bottle_proportion:
                self.assertSameDistributions(c1.bottle_proportion,
                                             c2.bottle_proportion)
            else:
                self.assertEqual(c1.bottle_proportion, c2.bottle_proportion)

    def get_parameter_summaries_from_msbayes_workers(self,
                                                     msbayes_workers,
                                                     shuffle_taus=True):
        """Summarise the parameter columns of the workers' prior files.

        Args:
            msbayes_workers: non-empty iterable of workers exposing
                ``header``, ``parameter_indices`` and ``prior_path``. All
                workers must share the first worker's header and indices.
            shuffle_taus: when true, the tau values of each row are shuffled
                before being added to their per-column summaries.

        Returns:
            A dict mapping each parameter column index to a
            ``SampleSummarizer`` tagged with that column's header entry.

        Raises:
            AssertionError: if a worker's header or parameter indices differ
                from the first worker's, if a row's column count differs from
                the first line read, or if ``psi`` does not equal the number
                of distinct tau values in a row.
        """
        msbayes_workers = list(msbayes_workers)
        header = msbayes_workers[0].header
        pi = msbayes_workers[0].parameter_indices
        s = dict((i, SampleSummarizer(tag=header[i])) for i in pi)
        ncols = None
        for w in msbayes_workers:
            self.assertEqual(w.header, header)
            self.assertEqual(w.parameter_indices, pi)
            if shuffle_taus:
                # These depend only on the worker's header, so look them up
                # once per worker rather than once per row.
                psi_index = get_indices_of_patterns(
                    w.header, PSI_PATTERNS)[0]
                tau_indices = get_indices_of_patterns(
                    w.header, TAU_PATTERNS)
                non_tau_indices = sorted(
                    set(w.parameter_indices) - set(tau_indices))
            # Plain text mode ('U' was removed in Python 3.11); the context
            # manager closes the file even if an assertion fails mid-file.
            with open(w.prior_path, 'r') as f:
                for row in f:
                    stripped = row.strip()
                    r = stripped.split()
                    if not ncols:
                        ncols = len(r)
                    if HEADER_PATTERN.match(stripped):
                        continue
                    self.assertEqual(len(r), ncols)
                    if shuffle_taus:  # because taus are sorted in prior files
                        psi = int(r[psi_index])
                        taus = [float(r[i]) for i in tau_indices]
                        self.assertEqual(psi, len(set(taus)))
                        random.shuffle(taus)
                        for n, i in enumerate(tau_indices):
                            s[i].add_sample(taus[n])
                        for i in non_tau_indices:
                            s[i].add_sample(float(r[i]))
                    else:
                        for i in w.parameter_indices:
                            s[i].add_sample(float(r[i]))
        return s

    def assertPriorIsPrecise(self, msbayes_workers, places=2):
        """Assert that the workers' prior samples match the configured priors.

        Checks, in order: all workers finished; every parameter summary holds
        as many samples as the workers' combined ``sample_size``; the header
        contains the expected number of psi, model, tau and theta columns;
        and each summarised column is consistent with the corresponding
        distribution of the workers' shared config (see
        ``assertSampleIsFromDistribution``).

        Args:
            msbayes_workers: non-empty iterable of finished workers.
            places: decimal places used for the approximate comparisons.
        """
        msbayes_workers = list(msbayes_workers)
        self.assertWorkersFinished(msbayes_workers)
        param_sums = self.get_parameter_summaries_from_msbayes_workers(
            msbayes_workers)
        sample_size = sum(w.sample_size for w in msbayes_workers)
        # .values() works on both Python 2 and 3 (itervalues does not).
        for s in param_sums.values():
            self.assertEqual(s.n, sample_size)
        cfg = self.get_config_from_msbayes_workers(msbayes_workers)
        first = msbayes_workers[0]
        psi_indices = get_indices_of_patterns(first.header, PSI_PATTERNS)
        self.assertEqual(len(psi_indices), 1)
        model_indices = get_indices_of_patterns(first.header, MODEL_PATTERNS)
        if first.model_index is not None:
            self.assertEqual(len(model_indices), 1)
        else:
            self.assertEqual(len(model_indices), 0)
        tau_indices = get_indices_of_patterns(first.header, TAU_PATTERNS)
        a_theta_indices = get_indices_of_patterns(first.header,
                                                  A_THETA_PATTERNS)
        d_theta_indices = get_indices_of_patterns(first.header,
                                                  D_THETA_PATTERNS)
        if first.report_parameters:
            self.assertEqual(len(tau_indices), cfg.npairs)
            self.assertEqual(len(a_theta_indices), cfg.npairs)
            self.assertEqual(len(d_theta_indices), 2 * cfg.npairs)
        else:
            self.assertEqual(len(tau_indices), 0)
            self.assertEqual(len(a_theta_indices), 0)
            self.assertEqual(len(d_theta_indices), 0)
        _LOG.debug('\n{0}\n'.format('\n'.join(
            [str(param_sums[i]) for i in sorted(param_sums)])))
        for i in psi_indices:
            self.assertSampleIsFromDistribution(param_sums[i],
                                                cfg.psi,
                                                places=places)
        for i in tau_indices:
            self.assertSampleIsFromDistribution(param_sums[i],
                                                cfg.tau,
                                                places=places)
        for i in a_theta_indices:
            self.assertSampleIsFromDistribution(param_sums[i],
                                                cfg.a_theta,
                                                places=places)
        # Descendant thetas are compared after scaling the expected mean and
        # maximum by the theta prior's mean and maximum; variance is skipped.
        for i in d_theta_indices:
            self.assertSampleIsFromDistribution(param_sums[i],
                                                cfg.d_theta,
                                                mean_adj=cfg.theta.mean,
                                                max_adj=cfg.theta.maximum,
                                                compare_variance=False,
                                                places=places)

    def assertPriorIsAccurate(self, msbayes_workers, places=2):
        msbayes_workers = list(msbayes_workers)
        self.assertWorkersFinished(msbayes_workers)
        pass

    def assertPriorIsValid(self, msbayes_workers, places=2):
        msbayes_workers = list(msbayes_workers)
        self.assertWorkersFinished(msbayes_workers)
        self.assertPriorIsPrecise(msbayes_workers, places=places)
        self.assertPriorIsAccurate(msbayes_workers, places=places)

    def assertWorkersFinished(self, msbayes_workers):
        for w in msbayes_workers:
            self.assertTrue(w.finished)

    def assertSampleIsFromDistribution(self,
                                       sample_sum,
                                       dist,
                                       places=2,
                                       mean_adj=1,
                                       max_adj=1,
                                       compare_variance=True):
        """Assert that a sample summary is consistent with a distribution.

        Args:
            sample_sum: summary exposing ``minimum``, ``maximum``, ``mean``
                and ``variance``.
            dist: distribution exposing the same four attributes.
            places: decimal places for the approximate comparisons.
            mean_adj: factor applied to ``dist.mean`` before comparing.
            max_adj: factor applied to ``dist.maximum`` before comparing
                (continuous distributions only).
            compare_variance: whether to compare the variances as well.

        For a ``DiscreteUniformDistribution`` the bounds must match exactly.
        For any other distribution each bound is compared approximately, and
        only when that bound is finite.
        """
        infinities = (float('-inf'), float('inf'))
        if isinstance(dist, probability.DiscreteUniformDistribution):
            self.assertEqual(sample_sum.minimum, dist.minimum)
            self.assertEqual(sample_sum.maximum, dist.maximum)
        else:
            # An infinite bound can never be approached by a finite sample,
            # so only finite bounds are compared.
            if dist.minimum not in infinities:
                self.assertAlmostEqual(sample_sum.minimum, dist.minimum,
                                       places)
            if dist.maximum not in infinities:
                self.assertAlmostEqual(sample_sum.maximum,
                                       dist.maximum * max_adj, places)
        self.assertAlmostEqual(sample_sum.mean, dist.mean * mean_adj, places)
        if compare_variance:
            self.assertAlmostEqual(sample_sum.variance, dist.variance, places)

    def assertApproxEqual(self, x, y, percent_tol=1e-6):
        eq = (((abs(x - y) / ((abs(x) + abs(y)) / 2)) * 100) < percent_tol)
        if not eq:
            _LOG.error('x ({0}) and y ({1}) are not equal'.format(x, y))
        self.assertTrue(eq)

    def files_equal(self, f1, f2, exclude_line_endings=False):
        """Compare two files line by line.

        Args:
            f1, f2: file arguments accepted by ``process_file_arg``.
            exclude_line_endings: when true, lines are stripped of
                surrounding whitespace before being compared.

        Returns:
            ``(equal, diffs)``: ``equal`` is ``False`` if any line differs or
            the files have different lengths; ``diffs`` lists the 1-based
            line numbers that differ. When the lengths differ, ``diffs`` also
            covers the inclusive range between the two positions at which
            end-of-file was detected.
        """
        equal = True
        diffs = []
        f1, c1 = process_file_arg(f1)
        f2, c2 = process_file_arg(f2)
        line = 0
        # Each *_end stays False until that file is exhausted, then holds the
        # (1-based) loop iteration on which end-of-file was hit.
        f1_end = False
        f2_end = False
        try:
            while True:
                line += 1
                if f1_end is False:
                    try:
                        l1 = next(f1)
                    except (StopIteration, EOFError):
                        f1_end = line
                if f2_end is False:
                    try:
                        l2 = next(f2)
                    except (StopIteration, EOFError):
                        f2_end = line
                if f1_end is not False and f2_end is not False:
                    break
                # Only compare (and strip) while both files still yield
                # lines; otherwise one side may never have been read.
                if f1_end is False and f2_end is False:
                    if exclude_line_endings:
                        l1 = l1.strip()
                        l2 = l2.strip()
                    if l1 != l2:
                        diffs.append(line)
                        equal = False
        finally:
            if c1:
                f1.close()
            if c2:
                f2.close()
        if f1_end != f2_end:
            mn = min([f1_end, f2_end])
            mx = max([f1_end, f2_end])
            diffs.extend(range(mn, mx + 1))
            equal = False
        assert len(diffs) == len(set(diffs))
        return equal, diffs

    def assertSameFiles(self, files, exclude_line_endings=False):
        files = list(files)
        all_equal = True
        diffs = StringIO()
        f1 = files.pop(0)
        for f2 in files:
            equal, diff_list = self.files_equal(f1, f2, exclude_line_endings)
            if not equal:
                all_equal = False
                n1 = f1
                if not isinstance(n1, str):
                    n1 = f1.name
                n2 = f2
                if not isinstance(n2, str):
                    n2 = f2.name
                diffs.write('{0} and {1} differ at lines:\n\t{2}\n'.format(
                    n1, n2, ','.join([str(i) for i in diff_list])))
        if not all_equal:
            _LOG.error('files are not equal:\n{0}\n'.format(diffs.getvalue()))
        self.assertTrue(all_equal)

    def assertSameUnsortedFiles(self, files):
        """Assert that all files hold the same lines, ignoring line order.

        The lines of each file are sorted and compared position by position
        with those of the first file; two lines match when their
        whitespace-separated fields are equal. All differences are logged
        together before the assertion is made.

        Args:
            files: non-empty iterable of file arguments accepted by
                ``process_file_arg``.
        """
        files = list(files)
        all_equal = True
        diffs = StringIO()
        f1, close = process_file_arg(files.pop(0))
        # try/finally guarantees the handles are released even when reading
        # fails or the final assertion raises.
        try:
            lines1 = sorted(f1.readlines())
            for f in files:
                f2, close2 = process_file_arg(f)
                try:
                    lines2 = sorted(f2.readlines())
                    if len(lines1) != len(lines2):
                        all_equal = False
                        diffs.write('{0} ({1}) and {2} ({3}) have different '
                                    'number of lines\n'.format(
                                        f1.name, len(lines1),
                                        f2.name, len(lines2)))
                    # zip stops at the shorter file, as the length mismatch
                    # has already been reported above.
                    for i, (a, b) in enumerate(zip(lines1, lines2)):
                        if a.strip().split() != b.strip().split():
                            all_equal = False
                            diffs.write('{0} and {1} differ at sorted index '
                                        '{2}\n'.format(f1.name, f2.name, i))
                finally:
                    if close2:
                        f2.close()
            if not all_equal:
                _LOG.error('files are not equal after sorting:\n{0}\n'.format(
                    diffs.getvalue()))
            self.assertTrue(all_equal)
        finally:
            if close:
                f1.close()

    def same_samples(self, sample1, sample2, places=4, num_mismatches=0):
        """Return whether two samples agree value-by-value.

        Values are converted to ``float`` and treated as equal when their
        difference rounds to zero at ``places`` decimal places. Up to
        ``num_mismatches`` unequal positions are tolerated. Samples of
        different lengths never match.
        """
        if len(sample1) != len(sample2):
            return False
        allowance = num_mismatches
        for left, right in zip(sample1, sample2):
            if round(float(left) - float(right), places) == 0:
                continue
            if allowance < 1:
                return False
            allowance -= 1
        return True

    def assertSameSamples(self,
                          files,
                          columns_to_ignore=(),
                          header=True,
                          places=5,
                          num_mismatches_per_sample=0,
                          num_sample_mismatches=0):
        """Assert that files contain the same numeric samples in any order.

        Every sample (line) of the first file must have a matching sample,
        as judged by ``same_samples``, somewhere in each other file.

        Args:
            files: non-empty iterable of file arguments accepted by
                ``process_file_arg``; the first file must not be empty.
            columns_to_ignore: column indices excluded from the comparison.
            header: whether each file's first line is a header; headers are
                compared on the retained columns, and a file whose header
                differs is reported without comparing its samples.
            places: decimal places used when comparing values.
            num_mismatches_per_sample: differing values tolerated within a
                pair of samples.
            num_sample_mismatches: unmatched samples tolerated per file.

        Raises:
            AssertionError: if headers differ, line counts differ, or more
                than ``num_sample_mismatches`` samples are unmatched.
        """
        files = list(files)
        all_equal = True
        diffs = StringIO()
        f1, close = process_file_arg(files.pop(0))
        # Read everything up front so the handle can be released at once.
        try:
            f1_lines = f1.readlines()
        finally:
            if close:
                f1.close()
        indices = [
            i for i in range(len(f1_lines[0].strip().split()))
            if i not in columns_to_ignore
        ]
        h1 = []
        if header:
            head = f1_lines.pop(0).strip().split()
            h1 = [head[i] for i in indices]
        for f in files:
            f2, close2 = process_file_arg(f)
            try:
                f2_lines = f2.readlines()
            finally:
                if close2:
                    f2.close()
            if header:
                head = f2_lines.pop(0).strip().split()
                h2 = [head[i] for i in indices]
                if h1 != h2:
                    all_equal = False
                    diffs.write('{0} and {1} have different headers; not '
                                'comparing further\n'.format(f1.name, f2.name))
                    continue
            if len(f1_lines) != len(f2_lines):
                all_equal = False
                diffs.write('{0} ({1}) and {2} ({3}) have different '
                            'number of lines\n'.format(f1.name, len(f1_lines),
                                                       f2.name, len(f2_lines)))
            # Parse each line once instead of once per candidate pair.
            samples1 = [[float(fields[x]) for x in indices]
                        for fields in (l.strip().split() for l in f1_lines)]
            samples2 = [[float(fields[x]) for x in indices]
                        for fields in (l.strip().split() for l in f2_lines)]
            # any() stops at the first matching sample in the other file.
            n_matches = sum(
                1 for v1 in samples1
                if any(self.same_samples(
                    v1,
                    v2,
                    places=places,
                    num_mismatches=num_mismatches_per_sample)
                    for v2 in samples2))
            n_mismatches = len(samples1) - n_matches
            if n_mismatches > 0:
                if n_mismatches > num_sample_mismatches:
                    all_equal = False
                diffs.write('{0} and {1}\nhave {2} mismatching samples and '
                            'share {3} samples\n'.format(
                                f1.name, f2.name, n_mismatches, n_matches))
        if diffs.getvalue() != '':
            _LOG.error('files are not equal after sorting:\n{0}\n'.format(
                diffs.getvalue()))
        self.assertTrue(all_equal)

    def assertSameDistributions(self, d1, d2):
        self.assertEqual(d1.name, d2.name)
        self.assertEqual(str(d1), str(d2))
        self.assertEqual(d1.minimum, d2.minimum)
        self.assertEqual(d1.maximum, d2.maximum)
        self.assertEqual(d1.mean, d2.mean)
        self.assertEqual(d1.variance, d2.variance)

    def assertSameIntegerPartitions(self, integer_partitions):
        ips = list(integer_partitions)
        ip1 = ips.pop(0)
        for ip2 in ips:
            self.assertEqual(ip1._initialized, ip2._initialized)
            self.assertEqual(ip1.n, ip2.n)
            self.assertEqual(ip1.key, ip2.key)
            self.assertEqual(ip1.integer_partition, ip2.integer_partition)
            self.assertEqual(ip1._items, ip2._items)

    def assertSamePartitions(self, partitions):
        ps = list(partitions)
        p1 = ps.pop(0)
        for p2 in ps:
            self.assertEqual(p1._initialized, p2._initialized)
            self.assertEqual(p1.n, p2.n)
            self.assertEqual(p1.key, p2.key)
            self.assertEqual(p1.partition, p2.partition)
            self.assertEqual(p1.values, p2.values)

Example #4

Show file

 def set_up(self):
     """Prepare per-test scratch space.

     Builds a ``TempFileSystem`` under ``package_paths.test_path()`` (with
     the directory prefix ``'PyMsBayesTestTemp-'``) and stores a test id made
     of ``'pymsbayes-'`` followed by ``random_str()``.
     """
     scratch_parent = package_paths.test_path()
     self.temp_fs = TempFileSystem(parent=scratch_parent,
                                   prefix='PyMsBayesTestTemp-')
     id_suffix = random_str()
     self.test_id = 'pymsbayes-' + id_suffix

Example #5

Show file

def main_cli():
    """Command-line entry point: simulate prior samples and plot statistics.

    Parses the command line, simulates `--num-prior-samples` prior samples
    from the msBayes config across up to `--np` worker processes, writes the
    combined samples to `prior-sample.txt` (optionally gzipped) in the output
    directory, and saves one histogram PDF per simulated statistic that
    matches the requested prefixes.

    Exits with status 1 after writing the samples if matplotlib is not
    available. Raises `Exception` if a requested statistic pattern matches
    none of the simulated statistics.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description,
            formatter_class = argparse_utils.SmartHelpFormatter)
    parser.add_argument('-c', '--config',
            type = argparse_utils.arg_is_config,
            required = True,
            help = ('msBayes config file to be used to generate saturation '
                    'plot.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = int,
            default = 1000,
            help = ('The number of prior samples to simulate for the '
                    'saturation plot.'))
    parser.add_argument('--np',
            action = 'store',
            type = int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('-o', '--output-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            default = ['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--sort-index',
            action = 'store',
            type = int,
            default = 0,
            choices = range(12),
            help = argparse_utils.get_sort_index_help_message())
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress plot data file.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    # The logging level is set before the remaining pymsbayes modules are
    # imported below.
    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, get_dict_from_spreadsheets, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability, stats
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes import plotting

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    sample_path = os.path.join(args.output_dir, 'prior-sample.txt')
    if args.compress:
        sample_path += '.gz'

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    # Normalize prefixes so each pattern is built from exactly one trailing dot.
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case=True)
    # Only draw a random seed when none was given; an explicit `--seed 0`
    # must be honoured rather than treated as "unset".
    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
            ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
    info.write('\tsample_path = {0!r}'.format(sample_path), log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    # Never start more workers than there are samples to simulate.
    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        # The last worker picks up whatever does not divide evenly.
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(
                temp_fs = temp_fs,
                sample_size = sample_size,
                config_path = args.config,
                report_parameters = True,
                schema = schema,
                include_header = True,
                stat_patterns = stat_patterns,
                write_stats_file = False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(
            workers = workers,
            num_processors = args.np)
    log.info('Parsing samples...')
    sample = get_dict_from_spreadsheets([w.prior_path for w in workers])

    log.info('Writing prior samples...')
    out, close = process_file_arg(sample_path, 'w',
            compresslevel = compress_level)
    try:
        for row in dict_line_iter(sample, sep = '\t'):
            out.write(row)
    finally:
        # Release the handle even if writing a row fails.
        if close:
            out.close()

    log.info('Creating plots...')

    if not plotting.MATPLOTLIB_AVAILABLE:
        log.warning(
                '`matplotlib` could not be imported, so the plot can not be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(sample_path))
        sys.exit(1)

    for stat_pattern in stat_patterns:
        found = False
        # `items()` works on both Python 2 and Python 3 dictionaries.
        for stat, values in sample.items():
            if stat_pattern.match(stat):
                values = [float(v) for v in values]
                found = True
                plot_path = os.path.join(args.output_dir,
                        'plot-{0}.pdf'.format(stat))
                summary = stats.get_summary(values)
                s = r'mean = {0:.4f} ({1:.4f}-{2:.4f})'.format(
                        summary['mean'],
                        summary['qi_95'][0],
                        summary['qi_95'][1])
                hd = plotting.HistData(x = values,
                        normed = True,
                        bins = 20,
                        histtype = 'bar',
                        align = 'mid',
                        orientation = 'vertical',
                        zorder = 0)
                hist = plotting.ScatterPlot(hist_data_list = [hd],
                        right_text = s)
                hist.left_text_size = 12.0
                hist.right_text_size = 12.0
                xticks = list(hist.ax.get_xticks())
                xtick_labels = list(xticks)
                yticks = list(hist.ax.get_yticks())
                ytick_labels = list(yticks)
                # Blank every second label when an axis has many ticks.
                if len(xtick_labels) >= 8:
                    for i in range(1, len(xtick_labels), 2):
                        xtick_labels[i] = ''
                if len(ytick_labels) >= 8:
                    for i in range(1, len(ytick_labels), 2):
                        ytick_labels[i] = ''
                xticks_obj = plotting.Ticks(ticks = xticks,
                        labels = xtick_labels,
                        horizontalalignment = 'center')
                yticks_obj = plotting.Ticks(ticks = yticks,
                        labels = ytick_labels)
                hist.xticks_obj = xticks_obj
                hist.yticks_obj = yticks_obj

                plot_grid = plotting.PlotGrid(subplots = [hist],
                        num_columns = 1,
                        label_schema = None,
                        title = stat,
                        title_size = 14.0,
                        title_top = False,
                        y_title = 'Density',
                        y_title_position = 0.001,
                        y_title_size = 14.0,
                        height = 4.0,
                        width = 6.0,
                        auto_height = False)
                plot_grid.auto_adjust_margins = False
                plot_grid.margin_left = 0.04
                plot_grid.margin_bottom = 0.04
                plot_grid.margin_right = 1.0
                plot_grid.margin_top = 0.97
                plot_grid.reset_figure()
                plot_grid.savefig(plot_path)

        if not found:
            raise Exception('stat pattern {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(stat_pattern, ', '.join(sample.keys())))

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()

Example #6

Show file

class PyMsBayesTestCase(unittest.TestCase):
    
    def set_up(self):
        """Create the test's temporary file system and a random test id."""
        scratch_parent = package_paths.test_path()
        self.temp_fs = TempFileSystem(parent = scratch_parent,
                prefix = 'PyMsBayesTestTemp-')
        random_suffix = random_str()
        self.test_id = 'pymsbayes-' + random_suffix

    def tear_down(self):
        """Register leftover test files, purge the temp file system, and
        assert that `FileStream` reports no open files."""
        self.register_file_system()
        self.temp_fs.purge()
        still_open = FileStream.open_files
        self.assertEqual(still_open, set())

    def get_test_path(self, parent=None, prefix='temp'):
        """Return a file path obtained from the test's temp file system."""
        path = self.temp_fs.get_file_path(parent=parent, prefix=prefix)
        return path

    def get_test_subdir(self, parent=None, prefix='temp'):
        """Return a sub-directory created by the test's temp file system."""
        subdir = self.temp_fs.create_subdir(parent=parent, prefix=prefix)
        return subdir

    def register_file(self, path):
        """Register the file at `path` with the test's temp file system."""
        file_system = self.temp_fs
        file_system._register_file(path)

    def register_dir(self, path):
        """Register the directory at `path` with the test's temp file system."""
        file_system = self.temp_fs
        file_system._register_dir(path)

    def register_file_system(self):
        """Register every file and directory under the temp base directory
        whose name starts with this test's id."""
        _LOG.debug('registering test file system...')
        for root, subdirs, filenames in os.walk(self.temp_fs.base_dir):
            for name in filenames:
                if not name.startswith(self.test_id):
                    continue
                self.register_file(os.path.join(root, name))
            for name in subdirs:
                if not name.startswith(self.test_id):
                    continue
                self.register_dir(os.path.join(root, name))

    def _exe_script(self, script_name, args, stdout = None, stderr = None,
            return_code = 0):
        """Run a package script in a subprocess and check its results.

        `args` may be a whitespace-separated string or a sequence; every
        argument is converted with `str`. The exit code must equal
        `return_code`; when `stdout` and/or `stderr` are given, the captured
        streams must equal them as well.
        """
        script_path = package_paths.script_path(script_name)
        raw_args = args.split() if isinstance(args, str) else args
        cmd = [sys.executable, script_path] + [str(x) for x in raw_args]
        _LOG.debug('Invocation:\n\t{0}'.format(' '.join(cmd)))
        proc = subprocess.Popen(cmd, shell=False, stdout=subprocess.PIPE,
                stderr=subprocess.PIPE)
        captured_out, captured_err = proc.communicate()
        exit_code = proc.wait()
        if exit_code != return_code:
            # Log everything that helps diagnose the failure before asserting.
            _LOG.error("exit code {0} did not match {1}".format(exit_code,
                    return_code))
            _LOG.error("here is the stdout:\n{0}".format(captured_out))
            _LOG.error("here is the stderr:\n{0}".format(captured_err))
        self.assertEqual(exit_code, return_code)
        if stdout != None:
            if captured_out != stdout:
                _LOG.error("std out did not match expected:\n{0}".format(
                        captured_out))
            self.assertEqual(captured_out, stdout)
        if stderr != None:
            if captured_err != stderr:
                _LOG.error("std error did not match expected:\n{0}".format(
                        captured_err))
            self.assertEqual(captured_err, stderr)

    def get_expected_indices(self, num_pairs, dummy_column=True,
            parameters_reported=True):
        """Return the expected parameter and statistic column indices.

        Columns are laid out as: an optional dummy column, the summary
        parameters (4, or 5 when `_CV_INCLUDED` is true), then
        `4*num_pairs` per-pair parameters when `parameters_reported` is
        true, then `4*num_pairs` default statistics.

        Returns a `(param_indices, stat_indices)` tuple of lists.
        """
        num_summary_params = 4
        if _CV_INCLUDED:
            num_summary_params += 1
        num_params = 4*num_pairs
        num_default_stats = 4*num_pairs
        start = 1 if dummy_column else 0
        # Build real lists: `range` objects cannot be concatenated in place
        # on Python 3, while `list(range(...))` behaves the same on both.
        param_indices = list(range(start, start+num_summary_params))
        start += num_summary_params
        if parameters_reported:
            param_indices.extend(range(start, start+num_params))
            start += num_params
        stat_indices = list(range(start, start+num_default_stats))
        return param_indices, stat_indices
    
    def prior_file_is_valid(self, prior_path, num_of_samples,
            num_of_columns=None):
        """Check that a prior file has the expected shape.

        Every line must have the same number of whitespace-separated
        columns (`num_of_columns`, or the count on the first line when that
        is not given), and the number of rows must equal `num_of_samples`.
        A line matching `HEADER_PATTERN` is not counted as a row as long as
        no row has been counted yet.

        Returns True when the file is valid; otherwise logs the reason and
        returns False.
        """
        try:
            prior_file = open(prior_path, 'rU')
        except Exception:
            # Still best-effort, but no longer swallows KeyboardInterrupt or
            # SystemExit the way a bare `except:` does.
            _LOG.error('prior invalid: could not open prior path {0}'.format(
                    prior_path))
            return False
        nrows = 0
        try:
            for i, line in enumerate(prior_file):
                if not (nrows == 0 and HEADER_PATTERN.match(line)):
                    nrows += 1
                ncols = len(line.strip().split())
                if not num_of_columns:
                    num_of_columns = ncols
                if num_of_columns != ncols:
                    _LOG.error('prior invalid: num of columns at line {0} is {1} '
                            'NOT {2}'.format(i+1, ncols, num_of_columns))
                    return False
        finally:
            # Close on every exit path, including the early return above.
            prior_file.close()
        if num_of_samples != nrows:
            _LOG.error('prior invalid: num of rows is {0} NOT {1}'.format(
                    nrows, num_of_samples))
            return False
        return True
    
    def get_number_of_lines(self, path):
        """Return the number of lines in the file given by `path`."""
        stream, should_close = process_file_arg(path)
        total = sum(1 for _ in stream)
        if should_close:
            stream.close()
        return total

    def get_number_of_header_lines(self, path):
        """Return how many stripped lines of `path` match `HEADER_PATTERN`."""
        stream, should_close = process_file_arg(path)
        total = sum(1 for row in stream if HEADER_PATTERN.match(row.strip()))
        if should_close:
            stream.close()
        return total

    def parse_python_config(self, path):
        """Return a `ConfigObj` built from `path`."""
        parsed = ConfigObj(path)
        return parsed

    def get_config_from_msbayes_workers(self, msbayes_workers):
        """Return the config shared by all workers.

        A `MsBayesConfig` is built from each worker's `config_path`; the
        configs are asserted to be the same and the first one is returned.
        """
        configs = []
        for worker in msbayes_workers:
            configs.append(MsBayesConfig(worker.config_path))
        self.assertSameConfigs(configs)
        return configs[0]

    def assertSameConfigs(self, cfgs):
        """Assert that every config in `cfgs` matches the first one.

        `cfgs` may be any non-empty iterable, including a generator. Plain
        attributes are compared with `assertEqual`, sample tables with
        their `equals` method, and each distribution attribute with
        `assertSameDistributions` when the first config's value is truthy
        (otherwise with `assertEqual`).
        """
        configs = list(cfgs)
        c1 = configs.pop(0)
        distribution_attrs = (
                'psi', 'tau', 'theta', 'a_theta', 'd_theta',
                'recombination', 'migration', 'dpp_concentration',
                'bottle_proportion')
        # Iterate the materialised remainder, not `cfgs`: a generator is
        # already exhausted by `list()` above, and a list would compare the
        # first config with itself.
        for c2 in configs:
            self.assertEqual(c1.time_in_subs_per_site,
                    c2.time_in_subs_per_site)
            self.assertEqual(c1.npairs, c2.npairs)
            self.assertEqual(c1.implementation, c2.implementation)
            self.assertEqual(c1.div_model_prior, c2.div_model_prior)
            self.assertEqual(c1.bottle_proportion_shared,
                    c2.bottle_proportion_shared)
            self.assertEqual(c1.theta_parameters, c2.theta_parameters)
            self.assertEqual(c1.taxa, c2.taxa)
            self.assertTrue(c1.sample_table.equals(c2.sample_table))
            for attr in distribution_attrs:
                d1 = getattr(c1, attr)
                d2 = getattr(c2, attr)
                if d1:
                    self.assertSameDistributions(d1, d2)
                else:
                    self.assertEqual(d1, d2)

    def get_parameter_summaries_from_msbayes_workers(self, msbayes_workers,
            shuffle_taus=True):
        """Summarize the parameter columns of the workers' prior files.

        Returns a dict keyed by the parameter indices of the first worker;
        each value is a `SampleSummarizer` tagged with the matching header
        entry and fed every non-header row of every worker's prior file.

        All workers are asserted to share the first worker's header and
        parameter indices. When `shuffle_taus` is true, the tau values of
        each row are randomly shuffled among the tau columns before being
        added, and the row's psi value is asserted to equal the number of
        distinct taus.
        """
        msbayes_workers = list(msbayes_workers)
        # One summarizer per parameter column, keyed by column index.
        s = dict(zip(
            [i for i in msbayes_workers[0].parameter_indices],
            [SampleSummarizer(
                tag=msbayes_workers[0].header[i]) for i in msbayes_workers[
                    0].parameter_indices]))
        ncols = None
        header = msbayes_workers[0].header
        pi = msbayes_workers[0].parameter_indices
        for w in msbayes_workers:
            self.assertEqual(w.header, header)
            self.assertEqual(w.parameter_indices, pi)
            f = open(w.prior_path, 'rU')
            for line_idx, row in enumerate(f):
                # The first line read fixes the expected column count for
                # every later row, across all workers.
                if not ncols:
                    ncols = len(row.strip().split())
                if HEADER_PATTERN.match(row.strip()):
                    continue
                r = row.strip().split()
                assert len(r) == ncols
                if shuffle_taus: # because taus are sorted in prior files
                    psi_index = get_indices_of_patterns(w.header,
                            PSI_PATTERNS)[0]
                    tau_indices = get_indices_of_patterns(w.header,
                            TAU_PATTERNS)
                    psi = int(r[psi_index])
                    taus = [float(r[i]) for i in tau_indices]
                    self.assertEqual(psi, len(set(taus)))
                    random.shuffle(taus)
                    for n, i in enumerate(tau_indices):
                        s[i].add_sample(taus[n])
                    # Every remaining (non-tau) parameter is added unchanged.
                    p_set = set(w.parameter_indices) - set(tau_indices)
                    p = sorted(list(p_set))
                    for i in p:
                        s[i].add_sample(float(r[i]))
                else:
                    for i in w.parameter_indices:
                        s[i].add_sample(float(r[i]))
            f.close()
        return s

    def assertPriorIsPrecise(self, msbayes_workers, places=2):
        """Assert that the workers' prior samples match the configured priors.

        Checks that all workers finished, that every parameter summary
        contains the total number of samples, that the header holds the
        expected number of psi, model, tau and theta columns, and that each
        summarized column agrees with the corresponding distribution of the
        shared config to `places` decimal places.
        """
        workers = list(msbayes_workers)
        self.assertWorkersFinished(workers)
        param_sums = self.get_parameter_summaries_from_msbayes_workers(
                workers)
        expected_n = sum(w.sample_size for w in workers)
        for summary in param_sums.values():
            self.assertEqual(summary.n, expected_n)
        cfg = self.get_config_from_msbayes_workers(workers)
        first = workers[0]

        psi_indices = get_indices_of_patterns(first.header, PSI_PATTERNS)
        self.assertEqual(len(psi_indices), 1)
        model_indices = get_indices_of_patterns(first.header, MODEL_PATTERNS)
        # A model column is expected only when the worker has a model index.
        if first.model_index is None:
            self.assertEqual(len(model_indices), 0)
        else:
            self.assertEqual(len(model_indices), 1)

        tau_indices = get_indices_of_patterns(first.header, TAU_PATTERNS)
        a_theta_indices = get_indices_of_patterns(first.header,
                A_THETA_PATTERNS)
        d_theta_indices = get_indices_of_patterns(first.header,
                D_THETA_PATTERNS)
        if not first.report_parameters:
            self.assertEqual(len(tau_indices), 0)
            self.assertEqual(len(a_theta_indices), 0)
            self.assertEqual(len(d_theta_indices), 0)
        else:
            self.assertEqual(len(tau_indices), cfg.npairs)
            self.assertEqual(len(a_theta_indices), cfg.npairs)
            self.assertEqual(len(d_theta_indices), 2*cfg.npairs)

        summary_lines = [str(param_sums[i]) for i in sorted(param_sums)]
        _LOG.debug('\n{0}\n'.format('\n'.join(summary_lines)))

        for i in psi_indices:
            self.assertSampleIsFromDistribution(param_sums[i], cfg.psi,
                    places=places)
        for i in tau_indices:
            self.assertSampleIsFromDistribution(param_sums[i], cfg.tau,
                    places=places)
        for i in a_theta_indices:
            self.assertSampleIsFromDistribution(param_sums[i], cfg.a_theta,
                    places=places)
        for i in d_theta_indices:
            self.assertSampleIsFromDistribution(param_sums[i], cfg.d_theta,
                    mean_adj=cfg.theta.mean,
                    max_adj=cfg.theta.maximum,
                    compare_variance=False,
                    places=places)

    def assertPriorIsAccurate(self, msbayes_workers, places=2):
        msbayes_workers = list(msbayes_workers)
        self.assertWorkersFinished(msbayes_workers)
        pass

    def assertPriorIsValid(self, msbayes_workers, places=2):
        msbayes_workers = list(msbayes_workers)
        self.assertWorkersFinished(msbayes_workers)
        self.assertPriorIsPrecise(msbayes_workers, places=places)
        self.assertPriorIsAccurate(msbayes_workers, places=places)

    def assertWorkersFinished(self, msbayes_workers):
        """Assert that every worker's `finished` attribute is true."""
        for worker in msbayes_workers:
            is_done = worker.finished
            self.assertTrue(is_done)
                    
    def assertSampleIsFromDistribution(self, sample_sum, dist, places=2,
            mean_adj=1,
            max_adj=1,
            compare_variance=True):
        """Assert that a sample summary is consistent with a distribution.

        For a `DiscreteUniformDistribution` the sample minimum and maximum
        must equal the distribution's exactly. For any other distribution
        each finite bound is compared to `places` decimal places (the
        maximum after scaling by `max_adj`); infinite bounds are skipped.
        The mean (scaled by `mean_adj`) is always compared, and the variance
        is compared when `compare_variance` is true.
        """
        infinities = (float('-inf'), float('inf'))
        if isinstance(dist, probability.DiscreteUniformDistribution):
            self.assertEqual(sample_sum.minimum, dist.minimum)
            self.assertEqual(sample_sum.maximum, dist.maximum)
        else:
            # The original `!= -inf or != inf` test was always true, so
            # infinite bounds were never skipped; a finite sample can never
            # approximate an infinite bound.
            if dist.minimum not in infinities:
                self.assertAlmostEqual(sample_sum.minimum, dist.minimum, places)
            if dist.maximum not in infinities:
                self.assertAlmostEqual(sample_sum.maximum, dist.maximum*max_adj, places)
        self.assertAlmostEqual(sample_sum.mean, dist.mean*mean_adj, places)
        if compare_variance:
            self.assertAlmostEqual(sample_sum.variance, dist.variance, places)

    def assertApproxEqual(self, x, y, percent_tol=1e-6):
        """Assert that `x` and `y` differ by less than `percent_tol` percent.

        The difference is measured relative to the mean of the absolute
        values of `x` and `y`. Exactly equal values (including two zeros)
        always pass.
        """
        if x == y:
            # Also covers x == y == 0, where the relative difference would
            # otherwise divide by zero.
            eq = True
        else:
            # Divide by 2.0 so integer inputs are not truncated on Python 2.
            mean_magnitude = (abs(x) + abs(y)) / 2.0
            eq = ((abs(x - y) / mean_magnitude) * 100) < percent_tol
        if not eq:
            _LOG.error('x ({0}) and y ({1}) are not equal'.format(x, y))
        self.assertTrue(eq)

    def files_equal(self, f1, f2, exclude_line_endings=False):
        """Compare two files line by line.

        `f1` and `f2` are passed through `process_file_arg`. When
        `exclude_line_endings` is true, lines are stripped before being
        compared.

        Returns `(equal, diffs)`, where `diffs` lists the 1-based line
        numbers that differ; if the files differ in length, the range from
        the shorter file's end marker to the longer file's end marker is
        included.
        """
        equal = True
        diffs = []
        f1, c1 = process_file_arg(f1)
        f2, c2 = process_file_arg(f2)
        line = 0
        # Each end marker holds the line number at which the file ran out,
        # or False while the file still has lines.
        f1_end = False
        f2_end = False
        try:
            while True:
                line += 1
                if f1_end is False:
                    try:
                        l1 = next(f1)
                    except (StopIteration, EOFError):
                        f1_end = line
                if f2_end is False:
                    try:
                        l2 = next(f2)
                    except (StopIteration, EOFError):
                        f2_end = line
                if f1_end is not False and f2_end is not False:
                    break
                # Only touch l1/l2 while both files produced a line; this
                # avoids using an unbound name when one file is empty.
                if f1_end is False and f2_end is False:
                    if exclude_line_endings:
                        l1 = l1.strip()
                        l2 = l2.strip()
                    if l1 != l2:
                        diffs.append(line)
                        equal = False
        finally:
            if c1:
                f1.close()
            if c2:
                f2.close()
        if f1_end != f2_end:
            mn = min([f1_end, f2_end])
            mx = max([f1_end, f2_end])
            diffs.extend(range(mn, mx+1))
            equal = False
        assert len(diffs) == len(set(diffs))
        return equal, diffs

    def assertSameFiles(self, files, exclude_line_endings=False):
        """Assert that every file has the same lines as the first one.

        Each later file is compared with the first via `files_equal`; the
        differing line numbers of every unequal pair are logged before the
        assertion is made.
        """
        remaining = list(files)
        reference = remaining.pop(0)
        report = StringIO()
        all_equal = True
        for other in remaining:
            equal, diff_list = self.files_equal(reference, other,
                    exclude_line_endings)
            if equal:
                continue
            all_equal = False
            # Paths are reported as given; file objects by their `name`.
            n1 = reference if isinstance(reference, str) else reference.name
            n2 = other if isinstance(other, str) else other.name
            line_numbers = ','.join([str(i) for i in diff_list])
            report.write('{0} and {1} differ at lines:\n\t{2}\n'.format(
                    n1, n2, line_numbers))
        if not all_equal:
            _LOG.error('files are not equal:\n{0}\n'.format(report.getvalue()))
        self.assertTrue(all_equal)

    def assertSameUnsortedFiles(self, files):
        """Assert that all files hold the same lines, ignoring line order.

        The lines of each file are sorted, then compared position by
        position with the sorted lines of the first file after splitting on
        whitespace. Differences are logged before the assertion is made.
        """
        files = list(files)
        all_equal = True
        diffs = StringIO()
        f1, close = process_file_arg(files.pop(0))
        try:
            lines1 = sorted(f1.readlines())
            for f in files:
                f2, close2 = process_file_arg(f)
                try:
                    lines2 = sorted(f2.readlines())
                    if len(lines1) != len(lines2):
                        all_equal = False
                        diffs.write('{0} ({1}) and {2} ({3}) have different '
                                'number of lines\n'.format(f1.name, len(lines1),
                                        f2.name, len(lines2)))
                    for i, (row1, row2) in enumerate(zip(lines1, lines2)):
                        if row1.strip().split() != row2.strip().split():
                            all_equal = False
                            diffs.write('{0} and {1} differ at sorted index '
                                    '{2}\n'.format(f1.name, f2.name, i))
                finally:
                    if close2:
                        f2.close()
        finally:
            # Close before asserting so a failing assertion does not leave
            # the first file open.
            if close:
                f1.close()
        if not all_equal:
            _LOG.error('files are not equal after sorting:\n{0}\n'.format(
                    diffs.getvalue()))
        self.assertTrue(all_equal)

    def same_samples(self, sample1, sample2, places = 4, num_mismatches = 0):
        """Return True if two samples agree element-wise.

        Elements are converted to float and match when their difference
        rounds to zero at `places` decimal places. Up to `num_mismatches`
        mismatching elements are tolerated. Samples of different length
        never match.
        """
        if len(sample1) != len(sample2):
            return False
        tolerated = num_mismatches
        for left, right in zip(sample1, sample2):
            if round(float(left) - float(right), places) == 0:
                continue
            if tolerated < 1:
                return False
            tolerated -= 1
        return True
            
    def assertSameSamples(self, files, columns_to_ignore = [], header = True,
            places = 5, num_mismatches_per_sample = 0,
            num_sample_mismatches = 0):
        """Assert that all files hold the same samples as the first file.

        Rows are split on whitespace and the columns not listed in
        `columns_to_ignore` are compared as floats with `same_samples`
        (`places`, `num_mismatches_per_sample`). A row of the first file
        counts as matched if any row of the other file matches it. When
        `header` is true the first line of each file is treated as a header;
        a file whose header differs fails without further comparison. A
        different number of rows fails, as do more than
        `num_sample_mismatches` unmatched rows.
        """
        files = list(files)
        all_equal = True
        diffs = StringIO()
        f1, close = process_file_arg(files.pop(0))
        try:
            f1_lines = f1.readlines()
            indices = [i for i in range(len(
                    f1_lines[0].strip().split())) if i not in columns_to_ignore]
            h1 = []
            if header:
                head = f1_lines.pop(0).strip().split()
                h1 = [head[i] for i in indices]
            lines1 = sorted(f1_lines)
            # Parse the reference rows once instead of once per comparison.
            rows1 = []
            for l1 in lines1:
                values1 = l1.strip().split()
                rows1.append([float(values1[x]) for x in indices])
            for f in files:
                f2, close2 = process_file_arg(f)
                try:
                    f2_lines = f2.readlines()
                    if header:
                        head = f2_lines.pop(0).strip().split()
                        h2 = [head[i] for i in indices]
                        if h1 != h2:
                            all_equal = False
                            diffs.write('{0} and {1} have different headers; not '
                                    'comparing further\n'.format(
                                            f1.name, f2.name))
                            continue
                    lines2 = sorted(f2_lines)
                    if len(lines1) != len(lines2):
                        all_equal = False
                        diffs.write('{0} ({1}) and {2} ({3}) have different '
                                'number of lines\n'.format(f1.name, len(lines1),
                                        f2.name, len(lines2)))
                    rows2 = []
                    for l2 in lines2:
                        values2 = l2.strip().split()
                        rows2.append([float(values2[x]) for x in indices])
                    n_matches = 0
                    n_mismatches = 0
                    for v1 in rows1:
                        found = any(self.same_samples(v1, v2, places = places,
                                num_mismatches = num_mismatches_per_sample)
                                for v2 in rows2)
                        if found:
                            n_matches += 1
                        else:
                            n_mismatches += 1
                    if n_mismatches > 0:
                        if n_mismatches > num_sample_mismatches:
                            all_equal = False
                        diffs.write('{0} and {1}\nhave {2} mismatching samples and '
                                'share {3} samples\n'.format(
                                        f1.name, f2.name, n_mismatches, n_matches))
                finally:
                    # Runs on the header-mismatch `continue` as well, which
                    # previously skipped the close.
                    if close2:
                        f2.close()
        finally:
            # Close before asserting so a failing assertion does not leave
            # the first file open.
            if close:
                f1.close()
        if diffs.getvalue() != '':
            _LOG.error('files are not equal after sorting:\n{0}\n'.format(
                    diffs.getvalue()))
        self.assertTrue(all_equal)
    
    def assertSameDistributions(self, d1, d2):
        """Assert that two distribution objects describe the same distribution.

        The name, the string form, and the minimum, maximum, mean and
        variance attributes of `d1` and `d2` are compared in that order.
        """
        self.assertEqual(d1.name, d2.name)
        self.assertEqual(str(d1), str(d2))
        for attr in ('minimum', 'maximum', 'mean', 'variance'):
            self.assertEqual(getattr(d1, attr), getattr(d2, attr))

    def assertSameIntegerPartitions(self, integer_partitions):
        """Assert that every integer partition matches the first one.

        `integer_partitions` is any non-empty iterable; each later element
        is compared with the first on `_initialized`, `n`, `key`,
        `integer_partition` and `_items`.
        """
        candidates = list(integer_partitions)
        reference = candidates.pop(0)
        compared = ('_initialized', 'n', 'key', 'integer_partition', '_items')
        for other in candidates:
            for attr in compared:
                self.assertEqual(getattr(reference, attr),
                        getattr(other, attr))

    def assertSamePartitions(self, partitions):
        """Assert that every partition matches the first one.

        `partitions` is any non-empty iterable; each later element is
        compared with the first on `_initialized`, `n`, `key`, `partition`
        and `values`.
        """
        candidates = list(partitions)
        reference = candidates.pop(0)
        compared = ('_initialized', 'n', 'key', 'partition', 'values')
        for other in candidates:
            for attr in compared:
                self.assertEqual(getattr(reference, attr),
                        getattr(other, attr))

Example #7

Show file

def main_cli():
    """
    Command-line entry point that simulates prior samples and draws a
    saturation plot of summary statistics against divergence time.

    Steps, in order:

    1.  Parse the command-line options.
    2.  Configure logging, then import the pymsbayes modules (the imports
        are deliberately local so they happen after the logging level is
        set).
    3.  Write a copy of the config, forced to a 'constrained' divergence
        model, into a temporary file system.
    4.  Split the requested number of prior samples across `--np` workers
        and run them.
    5.  Collect the simulated statistics by time, write them as a
        tab-separated matrix to the output directory, and, if matplotlib
        is available, save the saturation plot as a PDF.
    6.  Purge the temporary files unless `--keep-temps` was given.

    Raises:
        Exception: if a requested stat prefix is not among the simulated
            statistics.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        '-c',
        '--config',
        type=arg_is_config,
        required=True,
        help=('msBayes config file to be used to generate saturation '
              'plot.'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=int,
        default=1000,
        help=('The number of prior samples to simulate for the '
              'saturation plot.'))
    parser.add_argument(
        '--np',
        action='store',
        type=int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-o',
        '--output-dir',
        action='store',
        type=arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the first observed '
              'config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '-s',
        '--stat-prefixes',
        nargs='*',
        type=str,
        default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument(
        '--vertical-lines',
        nargs='*',
        type=float,
        default=[],
        help=('Positions along x-axis where vertical lines are to be '
              'drawn. Default is to draw no vertical lines.'))
    parser.add_argument('--compress',
                        action='store_true',
                        help='Compress plot data file.')
    parser.add_argument('--keep-temps',
                        action='store_true',
                        help='Keep all temporary files.')
    parser.add_argument('--seed',
                        action='store',
                        type=int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    # --debug is applied last, so it wins if both --quiet and --debug are set.
    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         get_stats_by_time, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes.plotting import MATPLOTLIB_AVAILABLE, SaturationPlotGrid

    MSBAYES_SORT_INDEX.set_index(0)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    # Output defaults to the directory holding the config file.
    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    stats_by_time_path = os.path.join(args.output_dir, 'stats-by-time.txt')
    if args.compress:
        stats_by_time_path += '.gz'
    plot_path = os.path.join(args.output_dir, 'saturation-plot.pdf')

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    # Normalise prefixes to have no trailing '.'; the '.' is re-added only
    # when building the patterns, so both 'pi' and 'pi.' are accepted.
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
        [s + '.' for s in args.stat_prefixes], ignore_case=True)
    # NOTE(review): this truthiness check also treats `--seed 0` as unset and
    # replaces it with a random seed; confirm whether that is intended.
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    # Rewrite the config with a 'constrained' divergence model and a psi
    # prior fixed at the number of taxon pairs, then stage it as a temp file.
    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs
    cfg.div_model_prior = 'constrained'
    cfg.psi = probability.DiscreteUniformDistribution(num_taxon_pairs,
                                                      num_taxon_pairs)
    config_path = temp_fs.get_file_path(prefix='cfg-')
    cfg.write(config_path)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()),
               log.info)
    info.write(
        '\tstat_patterns = {0!r}'.format(', '.join(
            [p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
               log.info)
    info.write('\tstats_by_time_path = {0!r}'.format(stats_by_time_path),
               log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    # Never start more workers than there are samples to simulate.
    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        # The last worker picks up whatever does not divide evenly.
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(temp_fs=temp_fs,
                          sample_size=sample_size,
                          config_path=config_path,
                          report_parameters=True,
                          schema=schema,
                          include_header=True,
                          stat_patterns=stat_patterns,
                          write_stats_file=False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(workers=workers, num_processors=args.np)
    log.info('Parsing samples...')
    stats_by_time = get_stats_by_time([w.prior_path for w in workers])
    # Copy the keys into a real list before mutating: on Python 3 `keys()`
    # is a view without `remove()`; on Python 2 this is an equivalent copy.
    stat_keys = list(stats_by_time.keys())
    stat_keys.remove('PRI.t')
    for prefix in args.stat_prefixes:
        if prefix not in stat_keys:
            raise Exception('stat prefix {0!r} not found in simulated stats:'
                            '\n\t{1}'.format(prefix, ', '.join(stat_keys)))
    header = ['PRI.t'] + args.stat_prefixes
    log.info('Writing stats-by-time matrix...')
    out, close = process_file_arg(stats_by_time_path,
                                  'w',
                                  compresslevel=compress_level)
    # Close the handle even if iterating or writing fails part-way through.
    try:
        for row in dict_line_iter(stats_by_time, sep='\t', header=header):
            out.write(row)
    finally:
        if close:
            out.close()

    log.info('Creating plots...')

    if not MATPLOTLIB_AVAILABLE:
        log.warning(
            '`matplotlib` could not be imported, so the plot can not be\n'
            'produced. The data to create the plot can be found in:\n\t'
            '{0!r}'.format(stats_by_time_path))
    else:
        y_labels = {
            'pi': r'$\pi$',
            'pi.net': r'$\pi_{net}$',
            'wattTheta': r'$\theta_W$',
            'tajD.denom': r'$SD(\pi - \theta_W)$'
        }
        spg = SaturationPlotGrid(stats_by_time,
                                 x_key='PRI.t',
                                 y_keys=args.stat_prefixes,
                                 y_labels=y_labels,
                                 num_columns=2,
                                 vertical_line_positions=args.vertical_lines)
        fig = spg.create_grid()
        fig.savefig(plot_path)

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()

Example #8

Show file

File: main_dmc.py Project: joaks1/PyMsBayes

def main_cli(argv = sys.argv):
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description,
            formatter_class = argparse_utils.SmartHelpFormatter)
    parser.add_argument('-o', '--observed-configs',
            nargs = '+',
            type = argparse_utils.arg_is_config,
            required = True,
            help = ('One or more msBayes config files to be used to either '
                    'calculate or simulate observed summary statistics. If '
                    'used in combination with `-r` each config will be used to '
                    'simulate pseudo-observed data. If analyzing real data, do '
                    'not use the `-r` option, and the fasta files specified '
                    'within the config must exist and contain the sequence '
                    'data.'))
    parser.add_argument('-p', '--prior-configs',
            nargs = '+',
            type = argparse_utils.arg_is_path,
            required = True,
            help = ('One or more config files to be used to generate prior '
                    'samples. If more than one config is specified, they '
                    'should be separated by spaces. '
                    'This option can also be used to specify the path to a '
                    'directory containing the prior samples and summary '
                    'statistic means and standard deviations generated by a '
                    'previous run using the `generate-samples-only` option. '
                    'These files should be found in the directory '
                    '`pymsbayes-output/prior-stats-summaries`. The'
                    '`pymsbayes-output/model-key.txt` also needs to be present.'
                    ' If specifying this directory, it should be the only '
                    'argument (i.e., no other directories or config files can '
                    'be provided).'))
    parser.add_argument('-r', '--reps',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('This option has two effects. First, it signifies that '
                    'the analysis will be simulation based (i.e., no real '
                    'data will be used). Second, it specifies how many '
                    'simulation replicates to perform (i.e., how many data '
                    'sets to simulate and analyze).'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 1000000,
            help = ('The number of prior samples to simulate for each prior '
                    'config specified with `-p`.'))
    parser.add_argument('--prior-batch-size',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 10000,
            help = ('The number of prior samples to simulate for each batch.'))
    parser.add_argument('--generate-samples-only',
            action = 'store_true',
            help = ('Only generate samples from models as requested. I.e., '
                    'No analyses are performed to approximate posteriors. '
                    'This option can be useful if you want the prior samples '
                    'for other purposes.'))
    parser.add_argument('--num-posterior-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 1000,
            help = ('The number of posterior samples desired for each '
                    'analysis. Default: 1000.'))
    parser.add_argument('--num-standardizing-samples',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 10000,
            help = ('The number of prior samples desired to use for '
                    'standardizing statistics. Default: 10000.'))
    parser.add_argument('--np',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('--output-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('--staging-dir',
            action = 'store',
            type = argparse_utils.arg_is_dir,
            help = ('A directory to temporarily stage prior files. This option '
                    'can be useful on clusters to speed up I/O while '
                    'generating prior samples. You can designate a local temp '
                    'directory on a compute node to avoid constant writing to '
                    'a shared drive. The default is to use the `temp-dir`.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi wattTheta pi.net tajD.denom`.'))
    parser.add_argument('-b', '--bandwidth',
            action = 'store',
            type = float,
            help = ('Smoothing parameter for the posterior kernal density '
                    'estimation. This option is used for the `glm` '
                    'regression method. The default is 2 / '
                    '`num-posterior-samples`.'))
    parser.add_argument('-q', '--num-posterior-quantiles',
            action = 'store',
            type = argparse_utils.arg_is_positive_int,
            default = 1000,
            help = ('The number of equally spaced quantiles at which to '
                    'evaluate the GLM-estimated posterior density. '
                    'Default: 1000.'))
    parser.add_argument('--reporting-frequency',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('Suggested frequency (in number of prior samples) for '
                    'running regression and reporting current results. '
                    'Default: 0 (only report final results). '
                    'If a value is given, it may be adjusted so that the '
                    'reporting frequency is a multiple of the multi-processed '
                    'batch size.'))
    parser.add_argument('--sort-index',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            choices = range(12),
            help = argparse_utils.get_sort_index_help_message())
    parser.add_argument('--no-global-estimate',
            action = 'store_true',
            help = ('If multiple prior models are specified, by default a '
                    'global estimate is performed averaging over all models. '
                    'This option prevents the global estimation (i.e., only '
                    'inferences for each model are made).'))
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress large results files.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--output-prefix',
            action = 'store',
            type = str,
            default = '',
            help = ('Prefix to use at beginning of output files. The default '
                    'is no prefix.'))
    parser.add_argument('--data-key-path',
            action = 'store',
            type = argparse_utils.arg_is_file,
            help = ('The path to a `data-key.txt` file generated by a previous '
                    'run. This file should be found in the directory '
                    '`pymsbayes-output/data-key.txt`. This option '
                    'will override the `-o`/`--observed-configs` option, and '
                    'is intended to be used in combination with the '
                    '`--start-from` option to restart an analysis.'))
    parser.add_argument('--start-from-simulation-index',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('The simulation index at which to begin analyses. Must be '
                    'used in combination with either the number of simulation '
                    'replicates (`-r`/`--reps`) or the `--data-key-path` '
                    'option, and must be a positive '
                    'integer that is less than the number of simulation '
                    'replicates. This option can be useful if an analysis '
                    'needs to be restarted.'))
    parser.add_argument('--start-from-observed-index',
            action = 'store',
            type = argparse_utils.arg_is_nonnegative_int,
            default = 0,
            help = ('The observed config index at which to begin analyses. '
                    'Can be used in combination with the `--data-key-path` '
                    'option to restart long-running, multi-observed-config '
                    'analyses'))
    parser.add_argument('--dry-run',
            action = 'store_true',
            help = 'Do not run analyses; only process settings')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    if argv == sys.argv:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import (MsBayesWorker, merge_prior_files,
            ObsSumStatsWorker)
    from pymsbayes.teams import ABCTeam
    from pymsbayes.utils.functions import (is_file, is_dir, long_division,
            mk_new_dir)
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, DIV_MODEL_PATTERNS, MODEL_PATTERNS,
            PSI_PATTERNS, MEAN_TAU_PATTERNS, OMEGA_PATTERNS, CV_PATTERNS,
            line_count)
    from pymsbayes.utils import sumresults, errors
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import (GLOBAL_RNG, set_memory_trace,
            MSBAYES_SORT_INDEX, ToolPathManager)

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    if len(args.observed_configs) != len(set(args.observed_configs)):
        raise ValueError('All paths to observed config files must be unique')

    if args.num_standardizing_samples > args.num_prior_samples:
        args.num_standardizing_samples = args.num_prior_samples
    
    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')
    eureject_path = ToolPathManager.get_tool_full_path('eureject')
    abctb_path = ToolPathManager.get_tool_full_path('ABCestimator')

    # vet prior-configs option
    using_previous_priors = False
    previous_prior_dir = None
    if (len(args.prior_configs) == 1) and (is_dir(args.prior_configs[0])):
        previous_prior_dir = args.prior_configs.pop(0)
        previous_priors = glob.glob(os.path.join(previous_prior_dir,
                '*-prior-sample.txt'))
        previous_sums = glob.glob(os.path.join(previous_prior_dir,
                '*-means-and-std-devs.txt'))
        if (not previous_priors) or (not previous_sums):
            raise ValueError('directory {0!r} specified with `prior-configs` '
                    'option does not contain necessary prior and summary '
                    'files'.format(args.prior_configs[0]))
        using_previous_priors = True
    else:
        for path in args.prior_configs:
            if not is_file(path):
                raise ValueError('prior config {0!r} is not a file'.format(
                        path))
    if len(args.prior_configs) != len(set(args.prior_configs)):
        raise ValueError('All paths to prior config files must be unique') 
    if not args.output_dir:
        args.output_dir = os.path.dirname(args.observed_configs[0])
    base_dir = mk_new_dir(os.path.join(args.output_dir, 'pymsbayes-results'))
    if not args.temp_dir:
        args.temp_dir = base_dir
    info_path = os.path.join(base_dir, args.output_prefix + \
            'pymsbayes-info.txt')
    info = InfoLogger(info_path)
    info.write('[pymsbayes]'.format(base_dir))
    info.write('\tversion = {version}'.format(**_program_info))
    info.write('\toutput_directory = {0}'.format(base_dir))
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    base_temp_dir = temp_fs.base_dir
    info.write('\ttemp_directory = {0}'.format(base_temp_dir))
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()))
    info.write('\tsimulation_reps = {0}'.format(args.reps))
    stat_patterns = DEFAULT_STAT_PATTERNS
    if args.stat_prefixes:
        for i in range(len(args.stat_prefixes)):
            if not args.stat_prefixes[i].endswith('.'):
                args.stat_prefixes[i] += '.'
        stat_patterns = get_patterns_from_prefixes(
                args.stat_prefixes,
                ignore_case=True)
    if not args.bandwidth:
        args.bandwidth = 2 / float(args.num_posterior_samples)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    if args.data_key_path:
        observed_map = sumresults.parse_data_key_file(args.data_key_path)
        observed_paths = [observed_map[k] for k in sorted(observed_map.keys())]
    else:
        observed_dir = mk_new_dir(os.path.join(base_dir,
                'observed-summary-stats'))
        observed_paths = [os.path.join(observed_dir, args.output_prefix + \
            'observed-{0}.txt'.format(i+1)) for i in range(len(
                    args.observed_configs))]
    info.write('\tseed = {0}'.format(args.seed))
    info.write('\tnum_processors = {0}'.format(args.np))
    info.write('\tnum_prior_samples = {0}'.format(
            args.num_prior_samples))
    info.write('\tnum_standardizing_samples = {0}'.format(
            args.num_standardizing_samples))
    info.write('\tbandwidth = {0}'.format(args.bandwidth))
    info.write('\tposterior_quantiles = {0}'.format(
            args.num_posterior_quantiles))
    info.write('\tposterior_sample_size = {0}'.format(
            args.num_posterior_samples))
    info.write('\tstat_patterns = {0}'.format(
            ', '.join([p.pattern for p in stat_patterns])))

    # vet observed configs
    ref_config_path = args.observed_configs[0]
    ref_config = MsBayesConfig(ref_config_path) 
    all_config_paths = []
    num_taxon_pairs = ref_config.npairs
    assert num_taxon_pairs > 0
    for config in args.observed_configs:
        all_config_paths.append(config)
        if not ref_config.equal_sample_table(config):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                    'sample tables in config {0!r} and {1!r} differ; '
                    'all sample tables must be the same.'.format(
                            ref_config_path, config))

    info.write('\tnum_taxon_pairs = {0}'.format(num_taxon_pairs))
    info.write('\tdry_run = {0}'.format(args.dry_run))
    info.write('\t[[tool_paths]]')
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path))
    info.write('\t\tmsbayes = {0}'.format(msbayes_path))
    info.write('\t\teureject = {0}'.format(eureject_path))
    info.write('\t\tabcestimator = {0}'.format(abctb_path))
    info.write('\t[[observed_configs]]')
    for i, cfg in enumerate(args.observed_configs):
        info.write('\t\t{0} = {1}'.format(i + 1, os.path.relpath(cfg,
                os.path.dirname(info_path))))

    abc_team = ABCTeam(
            temp_fs = temp_fs,
            observed_stats_files = observed_paths,
            num_taxon_pairs = num_taxon_pairs,
            config_paths = args.prior_configs,
            previous_prior_dir = previous_prior_dir,
            num_prior_samples = args.num_prior_samples,
            num_processors = args.np,
            num_standardizing_samples = args.num_standardizing_samples,
            num_posterior_samples = args.num_posterior_samples,
            num_posterior_density_quantiles = args.num_posterior_quantiles,
            batch_size = args.prior_batch_size,
            output_dir = base_dir,
            output_prefix = args.output_prefix,
            prior_temp_dir = args.staging_dir,
            rng = GLOBAL_RNG,
            report_parameters = True,
            stat_patterns = stat_patterns,
            eureject_exe_path = eureject_path,
            abctoolbox_exe_path = abctb_path,
            msbayes_exe_path = None,
            abctoolbox_bandwidth = args.bandwidth,
            omega_threshold = 0.01,
            cv_threshold = 0.01,
            compress = args.compress,
            reporting_frequency = args.reporting_frequency,
            keep_temps = args.keep_temps,
            global_estimate_only = False,
            global_estimate = not args.no_global_estimate,
            generate_prior_samples_only = args.generate_samples_only,
            start_from_simulation_index = args.start_from_simulation_index,
            start_from_observed_index = args.start_from_observed_index)

    models_to_configs = {}
    configs_to_models = {}
    for k, v in abc_team.models.iteritems():
        models_to_configs[k] = v
        configs_to_models[v] = k
        cfg = MsBayesConfig(v)
        all_config_paths.append(v)
        # vet prior configs
        if not ref_config.equal_sample_table(cfg):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                    'sample tables in config {0!r} and {1!r} differ; '
                    'all sample tables must be the same.'.format(
                            ref_config_path, v))

    info.write('\t[[observed_paths]]')
    for i in sorted(abc_team.observed_stats_paths.iterkeys()):
        info.write('\t\t{0} = {1}'.format(i, os.path.relpath(
                abc_team.observed_stats_paths[i],
                os.path.dirname(info_path))))
    info.write('\t[[prior_configs]]')
    for i in sorted(abc_team.models.iterkeys()):
        info.write('\t\t{0} = {1}'.format(i, os.path.relpath(
                abc_team.models[i],
                os.path.dirname(info_path))))

    ##########################################################################
    ## begin analysis --- get observed summary stats

    set_memory_trace() # start logging memory profile
    start_time = datetime.datetime.now()

    if args.data_key_path:
        log.info('Using provided summary statitics...')
    elif not args.dry_run:
        obs_temp_dir = base_temp_dir
        if args.staging_dir:
            obs_temp_dir = args.staging_dir
        observed_temp_fs = TempFileSystem(parent = obs_temp_dir,
                prefix = 'observed-temps-')

        if args.reps < 1:
            log.info('Calculating summary statistics from sequence data...')
            obs_workers = []
            for i, cfg in enumerate(args.observed_configs):
                ss_worker = ObsSumStatsWorker(
                        temp_fs = observed_temp_fs,
                        config_path = cfg,
                        output_path = observed_paths[i],
                        schema = 'abctoolbox',
                        stat_patterns = stat_patterns)
                obs_workers.append(ss_worker)

            obs_workers = Manager.run_workers(
                workers = obs_workers,
                num_processors = args.np)

            # re-vet all configs to see if some were changed by obsSumStats.pl
            new_ref_config = ref_config
            ref_modified = False
            # new ref because if all configs were updated all is good
            if not ref_config.equal_sample_table(ref_config_path):
                ref_modified = True
                new_ref_config = MsBayesConfig(ref_config_path)
                log.warning("""
The alignment lengths in config
{0!r}
have been corrected for sites with *any* ambiguous bases and/or gaps by
obsSumStats.pl.
                    """.format(ref_config_path))
            for config in all_config_paths:
                if not new_ref_config.equal_sample_table(config):
                    corrected_config = config
                    if ref_modified:
                        corrected_config = ref_config_path
                    if not args.keep_temps:
                        observed_temp_fs.purge()
                        temp_fs.purge()
                    raise errors.SampleTableError("""
The sample tables in configs
{0!r}
and
{1!r}
differ because obsSumStats.pl modified alignment lengths in config
{2!r}
to correct for sites in the alignments with *any* ambiguous bases and/or gaps.
Please make sure the sample tables in all configs will be the same after
correcting alignment lengths for sites that contain *any* ambiguous bases
and/or gaps. You can do this by copying and pasting the sample table in
{2!r}
that has been corrected by obsSumStats.pl into the other configs that were not
corrected.
                        """.format(ref_config_path, config, corrected_config))

        else:
            log.info('Simulating summary statistics from observed configs...')
            num_observed_workers = min([args.reps, args.np])
            if args.reps <= args.np:
                observed_batch_size = 1
                remainder = 0
            else:
                observed_batch_size, remainder = long_division(args.reps,
                        args.np)
            msbayes_workers = []
            for idx, cfg in enumerate(args.observed_configs):
                observed_model_idx = configs_to_models.get(cfg,
                        None)
                schema = 'abctoolbox'
                for i in range(num_observed_workers):
                    worker = MsBayesWorker(
                            temp_fs = observed_temp_fs,
                            sample_size = observed_batch_size,
                            config_path = cfg,
                            model_index = observed_model_idx,
                            report_parameters = True,
                            schema = schema,
                            include_header = True,
                            stat_patterns = stat_patterns,
                            write_stats_file = False,
                            staging_dir = None,
                            tag = idx)
                    msbayes_workers.append(worker)
                if remainder > 0:
                    worker = MsBayesWorker(
                            temp_fs = observed_temp_fs,
                            sample_size = remainder,
                            config_path = cfg,
                            model_index = observed_model_idx,
                            report_parameters = True,
                            schema = schema,
                            include_header = True,
                            stat_patterns = stat_patterns,
                            write_stats_file = False,
                            staging_dir = None,
                            tag = idx)
                    msbayes_workers.append(worker)

            # run parallel msbayes processes
            msbayes_workers = Manager.run_workers(
                workers = msbayes_workers,
                num_processors = args.np)

            workers = dict(zip(range(len(args.observed_configs)),
                    [[] for i in range(len(args.observed_configs))]))
            for w in msbayes_workers:
                workers[w.tag].append(w)

            # merge simulated observed data into one file
            for i in range(len(args.observed_configs)):
                merge_prior_files([w.prior_path for w in workers[i]],
                        observed_paths[i])
                lc = line_count(observed_paths[i], ignore_headers=True)
                if lc != args.reps:
                    if not args.keep_temps:
                        temp_fs.purge()
                    raise Exception('The number of observed simulations ({0}) '
                            'generated for observed config {1!r} and output to '
                            'file {2!r} does not match the number of reps '
                            '({3})'.format(lc, args.observed_configs[i],
                                observed_paths[i], args.reps))
        if not args.keep_temps:
            log.debug('purging observed temps...')
            observed_temp_fs.purge()

    ##########################################################################
    ## Begin ABC analyses

    if not args.dry_run:
        abc_team.run()

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()

Example #9

Show file

File: main_dmc.py Project: vishalbelsare/PyMsBayes

def main_cli(argv=sys.argv):
    """Command-line entry point: vet settings, get observed stats, run ABC.

    The function proceeds in four stages:

    1. Build the argument parser and parse the command line.
    2. Vet the options, create the output directory and temp file system,
       and record the settings to `pymsbayes-info.txt` via `InfoLogger`.
    3. Unless `--data-key-path` or `--dry-run` is given, obtain the
       observed summary statistics: calculated from sequence data with
       `ObsSumStatsWorker` when `--reps` is less than 1, otherwise
       simulated with `MsBayesWorker` and merged into one file per
       observed config.
    4. Unless `--dry-run` is given, run the analyses with `ABCTeam.run()`,
       then write run statistics and purge temporary files (unless
       `--keep-temps` is given).

    Args:
        argv: Argument list. If it compares equal to `sys.argv`, the
            parser is invoked without arguments (argparse then reads the
            process command line itself); otherwise `argv` is handed to
            `parse_args` unchanged.

    Raises:
        ValueError: If observed or prior config paths are not unique, a
            prior config is not a file, or a prior directory lacks the
            expected prior-sample / means-and-std-devs files.
        errors.SampleTableError: If the sample tables of the configs
            differ (before or after observed stats are calculated).
        Exception: If the number of simulated observed data sets written
            for a config does not match `--reps`.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse_utils.SmartHelpFormatter)
    parser.add_argument(
        '-o',
        '--observed-configs',
        nargs='+',
        type=argparse_utils.arg_is_config,
        required=True,
        help=('One or more msBayes config files to be used to either '
              'calculate or simulate observed summary statistics. If '
              'used in combination with `-r` each config will be used to '
              'simulate pseudo-observed data. If analyzing real data, do '
              'not use the `-r` option, and the fasta files specified '
              'within the config must exist and contain the sequence '
              'data.'))
    parser.add_argument(
        '-p',
        '--prior-configs',
        nargs='+',
        type=argparse_utils.arg_is_path,
        required=True,
        help=('One or more config files to be used to generate prior '
              'samples. If more than one config is specified, they '
              'should be separated by spaces. '
              'This option can also be used to specify the path to a '
              'directory containing the prior samples and summary '
              'statistic means and standard deviations generated by a '
              'previous run using the `generate-samples-only` option. '
              'These files should be found in the directory '
              '`pymsbayes-output/prior-stats-summaries`. The '
              '`pymsbayes-output/model-key.txt` also needs to be present.'
              ' If specifying this directory, it should be the only '
              'argument (i.e., no other directories or config files can '
              'be provided).'))
    parser.add_argument(
        '-r',
        '--reps',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('This option has two effects. First, it signifies that '
              'the analysis will be simulation based (i.e., no real '
              'data will be used). Second, it specifies how many '
              'simulation replicates to perform (i.e., how many data '
              'sets to simulate and analyze).'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=1000000,
        help=('The number of prior samples to simulate for each prior '
              'config specified with `-p`.'))
    parser.add_argument(
        '--prior-batch-size',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=10000,
        help=('The number of prior samples to simulate for each batch.'))
    parser.add_argument(
        '--generate-samples-only',
        action='store_true',
        help=('Only generate samples from models as requested. I.e., '
              'No analyses are performed to approximate posteriors. '
              'This option can be useful if you want the prior samples '
              'for other purposes.'))
    parser.add_argument(
        '--num-posterior-samples',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=1000,
        help=('The number of posterior samples desired for each '
              'analysis. Default: 1000.'))
    parser.add_argument('--num-standardizing-samples',
                        action='store',
                        type=argparse_utils.arg_is_positive_int,
                        default=10000,
                        help=('The number of prior samples desired to use for '
                              'standardizing statistics. Default: 10000.'))
    parser.add_argument(
        '--np',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '--output-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the first observed '
              'config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '--staging-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage prior files. This option '
              'can be useful on clusters to speed up I/O while '
              'generating prior samples. You can designate a local temp '
              'directory on a compute node to avoid constant writing to '
              'a shared drive. The default is to use the `temp-dir`.'))
    parser.add_argument(
        '-s',
        '--stat-prefixes',
        nargs='*',
        type=str,
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi wattTheta pi.net tajD.denom`.'))
    parser.add_argument(
        '-b',
        '--bandwidth',
        action='store',
        type=float,
        help=('Smoothing parameter for the posterior kernal density '
              'estimation. This option is used for the `glm` '
              'regression method. The default is 2 / '
              '`num-posterior-samples`.'))
    parser.add_argument(
        '-q',
        '--num-posterior-quantiles',
        action='store',
        type=argparse_utils.arg_is_positive_int,
        default=1000,
        help=('The number of equally spaced quantiles at which to '
              'evaluate the GLM-estimated posterior density. '
              'Default: 1000.'))
    parser.add_argument(
        '--reporting-frequency',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('Suggested frequency (in number of prior samples) for '
              'running regression and reporting current results. '
              'Default: 0 (only report final results). '
              'If a value is given, it may be adjusted so that the '
              'reporting frequency is a multiple of the multi-processed '
              'batch size.'))
    parser.add_argument('--sort-index',
                        action='store',
                        type=argparse_utils.arg_is_nonnegative_int,
                        default=0,
                        choices=range(12),
                        help=argparse_utils.get_sort_index_help_message())
    parser.add_argument(
        '--no-global-estimate',
        action='store_true',
        help=('If multiple prior models are specified, by default a '
              'global estimate is performed averaging over all models. '
              'This option prevents the global estimation (i.e., only '
              'inferences for each model are made).'))
    parser.add_argument('--compress',
                        action='store_true',
                        help='Compress large results files.')
    parser.add_argument('--keep-temps',
                        action='store_true',
                        help='Keep all temporary files.')
    parser.add_argument('--seed',
                        action='store',
                        type=int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument(
        '--output-prefix',
        action='store',
        type=str,
        default='',
        help=('Prefix to use at beginning of output files. The default '
              'is no prefix.'))
    parser.add_argument(
        '--data-key-path',
        action='store',
        type=argparse_utils.arg_is_file,
        help=('The path to a `data-key.txt` file generated by a previous '
              'run. This file should be found in the directory '
              '`pymsbayes-output/data-key.txt`. This option '
              'will override the `-o`/`--observed-configs` option, and '
              'is intended to be used in combination with the '
              '`--start-from` option to restart an analysis.'))
    parser.add_argument(
        '--start-from-simulation-index',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('The simulation index at which to begin analyses. Must be '
              'used in combination with either the number of simulation '
              'replicates (`-r`/`--reps`) or the `--data-key-path` '
              'option, and must be a positive '
              'integer that is less than the number of simulation '
              'replicates. This option can be useful if an analysis '
              'needs to be restarted.'))
    parser.add_argument(
        '--start-from-observed-index',
        action='store',
        type=argparse_utils.arg_is_nonnegative_int,
        default=0,
        help=('The observed config index at which to begin analyses. '
              'Can be used in combination with the `--data-key-path` '
              'option to restart long-running, multi-observed-config '
              'analyses'))
    parser.add_argument('--dry-run',
                        action='store_true',
                        help='Do not run analyses; only process settings')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    # With the default argv, let argparse read the process command line
    # itself; an explicitly supplied list is parsed as given.
    if argv == sys.argv:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    ##########################################################################
    ## handle args

    # The logging level is set before the remaining pymsbayes modules are
    # imported below.
    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import (MsBayesWorker, merge_prior_files,
                                   ObsSumStatsWorker)
    from pymsbayes.teams import ABCTeam
    from pymsbayes.utils.functions import (is_file, is_dir, long_division,
                                           mk_new_dir)
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         DIV_MODEL_PATTERNS, MODEL_PATTERNS,
                                         PSI_PATTERNS, MEAN_TAU_PATTERNS,
                                         OMEGA_PATTERNS, CV_PATTERNS,
                                         line_count)
    from pymsbayes.utils import sumresults, errors
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import (GLOBAL_RNG, set_memory_trace,
                                 MSBAYES_SORT_INDEX, ToolPathManager)

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    if len(args.observed_configs) != len(set(args.observed_configs)):
        raise ValueError('All paths to observed config files must be unique')

    # Cannot standardize with more samples than will be drawn.
    if args.num_standardizing_samples > args.num_prior_samples:
        args.num_standardizing_samples = args.num_prior_samples

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')
    eureject_path = ToolPathManager.get_tool_full_path('eureject')
    abctb_path = ToolPathManager.get_tool_full_path('ABCestimator')

    # vet prior-configs option
    using_previous_priors = False
    previous_prior_dir = None
    if (len(args.prior_configs) == 1) and (is_dir(args.prior_configs[0])):
        # A single directory argument means "reuse priors from a previous
        # run"; it is removed from the config list, which is then empty.
        previous_prior_dir = args.prior_configs.pop(0)
        previous_priors = glob.glob(
            os.path.join(previous_prior_dir, '*-prior-sample.txt'))
        previous_sums = glob.glob(
            os.path.join(previous_prior_dir, '*-means-and-std-devs.txt'))
        if (not previous_priors) or (not previous_sums):
            # Report the popped directory; indexing the now-empty
            # `args.prior_configs` here would raise IndexError instead.
            raise ValueError(
                'directory {0!r} specified with `prior-configs` '
                'option does not contain necessary prior and summary '
                'files'.format(previous_prior_dir))
        using_previous_priors = True
    else:
        for path in args.prior_configs:
            if not is_file(path):
                raise ValueError(
                    'prior config {0!r} is not a file'.format(path))
    if len(args.prior_configs) != len(set(args.prior_configs)):
        raise ValueError('All paths to prior config files must be unique')
    if not args.output_dir:
        args.output_dir = os.path.dirname(args.observed_configs[0])
    base_dir = mk_new_dir(os.path.join(args.output_dir, 'pymsbayes-results'))
    if not args.temp_dir:
        args.temp_dir = base_dir
    info_path = os.path.join(base_dir,
                             args.output_prefix + 'pymsbayes-info.txt')
    info = InfoLogger(info_path)
    info.write('[pymsbayes]')
    info.write('\tversion = {version}'.format(**_program_info))
    info.write('\toutput_directory = {0}'.format(base_dir))
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    base_temp_dir = temp_fs.base_dir
    info.write('\ttemp_directory = {0}'.format(base_temp_dir))
    info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()))
    info.write('\tsimulation_reps = {0}'.format(args.reps))
    stat_patterns = DEFAULT_STAT_PATTERNS
    if args.stat_prefixes:
        # Make sure every prefix ends with a '.' before building patterns.
        for i, prefix in enumerate(args.stat_prefixes):
            if not prefix.endswith('.'):
                args.stat_prefixes[i] = prefix + '.'
        stat_patterns = get_patterns_from_prefixes(args.stat_prefixes,
                                                   ignore_case=True)
    if not args.bandwidth:
        args.bandwidth = 2 / float(args.num_posterior_samples)
    # Only draw a random seed when none was supplied, so that an explicit
    # `--seed 0` is honored rather than silently replaced.
    if args.seed is None:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    if args.data_key_path:
        observed_map = sumresults.parse_data_key_file(args.data_key_path)
        observed_paths = [observed_map[k] for k in sorted(observed_map.keys())]
    else:
        observed_dir = mk_new_dir(
            os.path.join(base_dir, 'observed-summary-stats'))
        observed_paths = [
            os.path.join(
                observed_dir,
                args.output_prefix + 'observed-{0}.txt'.format(i + 1))
            for i in range(len(args.observed_configs))]
    info.write('\tseed = {0}'.format(args.seed))
    info.write('\tnum_processors = {0}'.format(args.np))
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples))
    info.write('\tnum_standardizing_samples = {0}'.format(
        args.num_standardizing_samples))
    info.write('\tbandwidth = {0}'.format(args.bandwidth))
    info.write('\tposterior_quantiles = {0}'.format(
        args.num_posterior_quantiles))
    info.write('\tposterior_sample_size = {0}'.format(
        args.num_posterior_samples))
    info.write('\tstat_patterns = {0}'.format(', '.join(
        [p.pattern for p in stat_patterns])))

    # vet observed configs
    # The first observed config is the reference against which every other
    # config's sample table is compared.
    ref_config_path = args.observed_configs[0]
    ref_config = MsBayesConfig(ref_config_path)
    all_config_paths = []
    num_taxon_pairs = ref_config.npairs
    assert num_taxon_pairs > 0
    for config in args.observed_configs:
        all_config_paths.append(config)
        if not ref_config.equal_sample_table(config):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                'sample tables in config {0!r} and {1!r} differ; '
                'all sample tables must be the same.'.format(
                    ref_config_path, config))

    info.write('\tnum_taxon_pairs = {0}'.format(num_taxon_pairs))
    info.write('\tdry_run = {0}'.format(args.dry_run))
    info.write('\t[[tool_paths]]')
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path))
    info.write('\t\tmsbayes = {0}'.format(msbayes_path))
    info.write('\t\teureject = {0}'.format(eureject_path))
    info.write('\t\tabcestimator = {0}'.format(abctb_path))
    info.write('\t[[observed_configs]]')
    for i, cfg in enumerate(args.observed_configs):
        info.write('\t\t{0} = {1}'.format(
            i + 1, os.path.relpath(cfg, os.path.dirname(info_path))))

    abc_team = ABCTeam(
        temp_fs=temp_fs,
        observed_stats_files=observed_paths,
        num_taxon_pairs=num_taxon_pairs,
        config_paths=args.prior_configs,
        previous_prior_dir=previous_prior_dir,
        num_prior_samples=args.num_prior_samples,
        num_processors=args.np,
        num_standardizing_samples=args.num_standardizing_samples,
        num_posterior_samples=args.num_posterior_samples,
        num_posterior_density_quantiles=args.num_posterior_quantiles,
        batch_size=args.prior_batch_size,
        output_dir=base_dir,
        output_prefix=args.output_prefix,
        prior_temp_dir=args.staging_dir,
        rng=GLOBAL_RNG,
        report_parameters=True,
        stat_patterns=stat_patterns,
        eureject_exe_path=eureject_path,
        abctoolbox_exe_path=abctb_path,
        msbayes_exe_path=None,
        abctoolbox_bandwidth=args.bandwidth,
        omega_threshold=0.01,
        cv_threshold=0.01,
        compress=args.compress,
        reporting_frequency=args.reporting_frequency,
        keep_temps=args.keep_temps,
        global_estimate_only=False,
        global_estimate=not args.no_global_estimate,
        generate_prior_samples_only=args.generate_samples_only,
        start_from_simulation_index=args.start_from_simulation_index,
        start_from_observed_index=args.start_from_observed_index)

    # Build two-way lookups between model indices and prior config paths
    # while vetting each prior config against the reference sample table.
    models_to_configs = {}
    configs_to_models = {}
    for k, v in abc_team.models.iteritems():
        models_to_configs[k] = v
        configs_to_models[v] = k
        cfg = MsBayesConfig(v)
        all_config_paths.append(v)
        # vet prior configs
        if not ref_config.equal_sample_table(cfg):
            if not args.keep_temps:
                temp_fs.purge()
            raise errors.SampleTableError(
                'sample tables in config {0!r} and {1!r} differ; '
                'all sample tables must be the same.'.format(
                    ref_config_path, v))

    info.write('\t[[observed_paths]]')
    for i in sorted(abc_team.observed_stats_paths.iterkeys()):
        info.write('\t\t{0} = {1}'.format(
            i,
            os.path.relpath(abc_team.observed_stats_paths[i],
                            os.path.dirname(info_path))))
    info.write('\t[[prior_configs]]')
    for i in sorted(abc_team.models.iterkeys()):
        info.write('\t\t{0} = {1}'.format(
            i, os.path.relpath(abc_team.models[i],
                               os.path.dirname(info_path))))

    ##########################################################################
    ## begin analysis --- get observed summary stats

    set_memory_trace()  # start logging memory profile
    start_time = datetime.datetime.now()

    if args.data_key_path:
        log.info('Using provided summary statitics...')
    elif not args.dry_run:
        # Observed temp files go in the staging directory when one is given,
        # otherwise inside the main temp directory.
        obs_temp_dir = base_temp_dir
        if args.staging_dir:
            obs_temp_dir = args.staging_dir
        observed_temp_fs = TempFileSystem(parent=obs_temp_dir,
                                          prefix='observed-temps-')

        if args.reps < 1:
            log.info('Calculating summary statistics from sequence data...')
            obs_workers = []
            for i, cfg in enumerate(args.observed_configs):
                ss_worker = ObsSumStatsWorker(temp_fs=observed_temp_fs,
                                              config_path=cfg,
                                              output_path=observed_paths[i],
                                              schema='abctoolbox',
                                              stat_patterns=stat_patterns)
                obs_workers.append(ss_worker)

            obs_workers = Manager.run_workers(workers=obs_workers,
                                              num_processors=args.np)

            # re-vet all configs to see if some were changed by obsSumStats.pl
            new_ref_config = ref_config
            ref_modified = False
            # new ref because if all configs were updated all is good
            if not ref_config.equal_sample_table(ref_config_path):
                ref_modified = True
                new_ref_config = MsBayesConfig(ref_config_path)
                log.warning("""
The alignment lengths in config
{0!r}
have been corrected for sites with *any* ambiguous bases and/or gaps by
obsSumStats.pl.
                    """.format(ref_config_path))
            for config in all_config_paths:
                if not new_ref_config.equal_sample_table(config):
                    corrected_config = config
                    if ref_modified:
                        corrected_config = ref_config_path
                    if not args.keep_temps:
                        observed_temp_fs.purge()
                        temp_fs.purge()
                    raise errors.SampleTableError("""
The sample tables in configs
{0!r}
and
{1!r}
differ because obsSumStats.pl modified alignment lengths in config
{2!r}
to correct for sites in the alignments with *any* ambiguous bases and/or gaps.
Please make sure the sample tables in all configs will be the same after
correcting alignment lengths for sites that contain *any* ambiguous bases
and/or gaps. You can do this by copying and pasting the sample table in
{2!r}
that has been corrected by obsSumStats.pl into the other configs that were not
corrected.
                        """.format(ref_config_path, config, corrected_config))

        else:
            log.info('Simulating summary statistics from observed configs...')
            # Split the requested reps across at most `np` workers per
            # config; any remainder gets one extra worker.
            num_observed_workers = min([args.reps, args.np])
            if args.reps <= args.np:
                observed_batch_size = 1
                remainder = 0
            else:
                observed_batch_size, remainder = long_division(
                    args.reps, args.np)
            msbayes_workers = []
            for idx, cfg in enumerate(args.observed_configs):
                observed_model_idx = configs_to_models.get(cfg, None)
                schema = 'abctoolbox'
                for i in range(num_observed_workers):
                    worker = MsBayesWorker(temp_fs=observed_temp_fs,
                                           sample_size=observed_batch_size,
                                           config_path=cfg,
                                           model_index=observed_model_idx,
                                           report_parameters=True,
                                           schema=schema,
                                           include_header=True,
                                           stat_patterns=stat_patterns,
                                           write_stats_file=False,
                                           staging_dir=None,
                                           tag=idx)
                    msbayes_workers.append(worker)
                if remainder > 0:
                    worker = MsBayesWorker(temp_fs=observed_temp_fs,
                                           sample_size=remainder,
                                           config_path=cfg,
                                           model_index=observed_model_idx,
                                           report_parameters=True,
                                           schema=schema,
                                           include_header=True,
                                           stat_patterns=stat_patterns,
                                           write_stats_file=False,
                                           staging_dir=None,
                                           tag=idx)
                    msbayes_workers.append(worker)

            # run parallel msbayes processes
            msbayes_workers = Manager.run_workers(workers=msbayes_workers,
                                                  num_processors=args.np)

            # Group finished workers by the index of their observed config.
            workers = dict(
                (i, []) for i in range(len(args.observed_configs)))
            for w in msbayes_workers:
                workers[w.tag].append(w)

            # merge simulated observed data into one file
            for i in range(len(args.observed_configs)):
                merge_prior_files([w.prior_path for w in workers[i]],
                                  observed_paths[i])
                lc = line_count(observed_paths[i], ignore_headers=True)
                if lc != args.reps:
                    if not args.keep_temps:
                        # Purge both temp trees, as the sample-table error
                        # path does; the observed one may live in a
                        # separate staging directory.
                        observed_temp_fs.purge()
                        temp_fs.purge()
                    raise Exception(
                        'The number of observed simulations ({0}) '
                        'generated for observed config {1!r} and output to '
                        'file {2!r} does not match the number of reps '
                        '({3})'.format(lc, args.observed_configs[i],
                                       observed_paths[i], args.reps))
        if not args.keep_temps:
            log.debug('purging observed temps...')
            observed_temp_fs.purge()

    ##########################################################################
    ## Begin ABC analyses

    if not args.dry_run:
        abc_team.run()

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()

Example #10

Show file

File: dmc_plot_stats.py Project: vishalbelsare/PyMsBayes

def main_cli():
    """Simulate prior samples from an msBayes config and plot their stats.

    Command-line entry point. It parses the arguments, creates ``--np``
    ``MsBayesWorker`` objects that together simulate ``--num-prior-samples``
    samples, writes the combined samples to ``prior-sample.txt`` (with a
    ``.gz`` suffix when ``--compress`` is given) in the output directory, and
    saves one ``plot-<stat>.pdf`` histogram for every simulated statistic
    that matches one of the ``--stat-prefixes`` patterns. Run metadata is
    written through an ``InfoLogger`` to ``pymsbayes-info.txt``.

    Temporary files are purged on every exit path, including failures,
    unless ``--keep-temps`` is given.

    Raises:
        SystemExit: with status 1 when matplotlib is unavailable (the sample
            file has already been written at that point).
        Exception: when a stat pattern matches none of the simulated stats.
    """
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(
        description=description,
        formatter_class=argparse_utils.SmartHelpFormatter)
    parser.add_argument(
        '-c',
        '--config',
        type=argparse_utils.arg_is_config,
        required=True,
        help=('msBayes config file to be used to generate saturation '
              'plot.'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=int,
        default=1000,
        help=('The number of prior samples to simulate for the '
              'saturation plot.'))
    parser.add_argument(
        '--np',
        action='store',
        type=int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-o',
        '--output-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the first observed '
              'config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=argparse_utils.arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '-s',
        '--stat-prefixes',
        nargs='*',
        type=str,
        default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--sort-index',
                        action='store',
                        type=int,
                        default=0,
                        choices=range(12),
                        help=argparse_utils.get_sort_index_help_message())
    parser.add_argument('--compress',
                        action='store_true',
                        help='Compress plot data file.')
    parser.add_argument('--keep-temps',
                        action='store_true',
                        help='Keep all temporary files.')
    parser.add_argument('--seed',
                        action='store',
                        type=int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    # The pymsbayes imports below are deliberately kept after the logging
    # level has been set, matching the original statement order.
    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         get_dict_from_spreadsheets,
                                         dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability, stats
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes import plotting

    MSBAYES_SORT_INDEX.set_index(args.sort_index)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    sample_path = os.path.join(args.output_dir, 'prior-sample.txt')
    if args.compress:
        sample_path += '.gz'

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')

    # Everything after the temp file system exists runs under try/finally so
    # the temporary files are cleaned up on error and early-exit paths too.
    try:
        args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
        stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes], ignore_case=True)
        # `is None` (not truthiness) so an explicit `--seed 0` is honoured.
        if args.seed is None:
            args.seed = random.randint(1, 999999999)
        GLOBAL_RNG.seed(args.seed)
        compress_level = None
        if args.compress:
            compress_level = 9

        cfg = MsBayesConfig(args.config)

        info.write('[pymsbayes]', log.info)
        info.write('\tprogram_name = {name}'.format(**_program_info),
                   log.info)
        info.write('\tversion = {version}'.format(**_program_info), log.info)
        info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)),
                   log.info)
        info.write('\toutput_directory = {0!r}'.format(args.output_dir),
                   log.info)
        info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir),
                   log.info)
        info.write(
            '\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()),
            log.info)
        info.write(
            '\tstat_patterns = {0!r}'.format(', '.join(
                [p.pattern for p in stat_patterns])), log.info)
        info.write('\tseed = {0}'.format(args.seed), log.info)
        info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
                   log.info)
        info.write('\tsample_path = {0!r}'.format(sample_path), log.info)
        info.write('\t[[tool_paths]]', log.info)
        info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
        info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

        info.write('\t[[config]]', log.debug)
        info.write('{0}'.format(str(cfg)), log.debug)

        ######################################################################
        ## begin analysis --- generate samples

        start_time = datetime.datetime.now()

        # Never start more workers than there are samples to simulate.
        if args.np > args.num_prior_samples:
            args.np = args.num_prior_samples
        batch_size, remainder = long_division(args.num_prior_samples, args.np)
        schema = 'abctoolbox'
        workers = []
        for i in range(args.np):
            sample_size = batch_size
            # The last worker also simulates the leftover samples.
            if i == (args.np - 1):
                sample_size += remainder
            w = MsBayesWorker(temp_fs=temp_fs,
                              sample_size=sample_size,
                              config_path=args.config,
                              report_parameters=True,
                              schema=schema,
                              include_header=True,
                              stat_patterns=stat_patterns,
                              write_stats_file=False)
            workers.append(w)

        log.info('Generating samples...')
        workers = Manager.run_workers(workers=workers, num_processors=args.np)
        log.info('Parsing samples...')
        sample = get_dict_from_spreadsheets([w.prior_path for w in workers])

        log.info('Writing prior samples...')
        out, close = process_file_arg(sample_path,
                                      'w',
                                      compresslevel=compress_level)
        try:
            for row in dict_line_iter(sample, sep='\t'):
                out.write(row)
        finally:
            # Release the handle even if writing a row fails.
            if close:
                out.close()

        log.info('Creating plots...')

        if not plotting.MATPLOTLIB_AVAILABLE:
            log.warning(
                '`matplotlib` could not be imported, so the plot can not be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(sample_path))
            sys.exit(1)

        for stat_pattern in stat_patterns:
            found = False
            # `items()` works on Python 2 and 3; `iteritems()` is 2-only.
            for stat, values in sample.items():
                if not stat_pattern.match(stat):
                    continue
                values = [float(v) for v in values]
                found = True
                plot_path = os.path.join(args.output_dir,
                                         'plot-{0}.pdf'.format(stat))
                summary = stats.get_summary(values)
                s = r'mean = {0:.4f} ({1:.4f}-{2:.4f})'.format(
                    summary['mean'], summary['qi_95'][0],
                    summary['qi_95'][1])
                hd = plotting.HistData(x=values,
                                       normed=True,
                                       bins=20,
                                       histtype='bar',
                                       align='mid',
                                       orientation='vertical',
                                       zorder=0)
                hist = plotting.ScatterPlot(hist_data_list=[hd], right_text=s)
                hist.left_text_size = 12.0
                hist.right_text_size = 12.0
                xticks = list(hist.ax.get_xticks())
                xtick_labels = list(xticks)
                yticks = list(hist.ax.get_yticks())
                ytick_labels = list(yticks)
                # With 8 or more ticks, blank every other label.
                if len(xtick_labels) >= 8:
                    for i in range(1, len(xtick_labels), 2):
                        xtick_labels[i] = ''
                if len(ytick_labels) >= 8:
                    for i in range(1, len(ytick_labels), 2):
                        ytick_labels[i] = ''
                xticks_obj = plotting.Ticks(ticks=xticks,
                                            labels=xtick_labels,
                                            horizontalalignment='center')
                yticks_obj = plotting.Ticks(ticks=yticks, labels=ytick_labels)
                hist.xticks_obj = xticks_obj
                hist.yticks_obj = yticks_obj

                plot_grid = plotting.PlotGrid(subplots=[hist],
                                              num_columns=1,
                                              label_schema=None,
                                              title=stat,
                                              title_size=14.0,
                                              title_top=False,
                                              y_title='Density',
                                              y_title_position=0.001,
                                              y_title_size=14.0,
                                              height=4.0,
                                              width=6.0,
                                              auto_height=False)
                plot_grid.auto_adjust_margins = False
                plot_grid.margin_left = 0.04
                plot_grid.margin_bottom = 0.04
                plot_grid.margin_right = 1.0
                plot_grid.margin_top = 0.97
                plot_grid.reset_figure()
                plot_grid.savefig(plot_path)

            if not found:
                raise Exception(
                    'stat pattern {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(stat_pattern, ', '.join(sample.keys())))

        stop_time = datetime.datetime.now()
        log.info('Done!')
        info.write('\t[[run_stats]]', log.info)
        info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
        info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
        info.write(
            '\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)
    finally:
        # Runs on success, on exceptions and on sys.exit().
        if not args.keep_temps:
            log.debug('purging temps...')
            temp_fs.purge()