Example 1
 def test_new_implementation_round_trip(self):
     p = {
         "c_shape": 2,
         "c_scale": 5,
         "theta_shape": 2.0,
         "theta_scale": 0.002,
         "tau_shape": 1.0,
         "tau_scale": 5.0,
         "a_theta_shape": 1,
         "a_theta_scale": 0.01,
         "theta_params": "001",
         "migration_shape": 1.5,
         "migration_scale": 0.1,
         "recombination_shape": 0.0,
         "recombination_scale": 0.0,
         "psi": 0,
     }
     self._update_config(self.cfg, p, new_impl=True)
     _LOG.debug("testing config:\n\n{0}\n".format(self.cfg.getvalue()))
     c = MsBayesConfig(self.cfg)
     s = StringIO()
     c.write(s)
     _LOG.debug("written config:\n\n{0}\n".format(s.getvalue()))
     s.seek(0)
     c2 = MsBayesConfig(s)
     self.assertSameConfigs([c, c2])
Example 2
 def test_new_implementation_round_trip(self):
     p = {
         'c_shape': 2,
         'c_scale': 5,
         'theta_shape': 2.0,
         'theta_scale': 0.002,
         'tau_shape': 1.0,
         'tau_scale': 5.0,
         'a_theta_shape': 1,
         'a_theta_scale': 0.01,
         'theta_params': '001',
         'migration_shape': 1.5,
         'migration_scale': 0.1,
         'recombination_shape': 0.0,
         'recombination_scale': 0.0,
         'psi': 0,
     }
     self._update_config(self.cfg, p, new_impl=True)
     _LOG.debug('testing config:\n\n{0}\n'.format(self.cfg.getvalue()))
     c = MsBayesConfig(self.cfg)
     s = StringIO()
     c.write(s)
     _LOG.debug('written config:\n\n{0}\n'.format(s.getvalue()))
     s.seek(0)
     c2 = MsBayesConfig(s)
     self.assertSameConfigs([c, c2])
Example 3
 def test_multi_locus_round_trip(self):
     p = {"ltheta": 0.0001, "utheta": 0.1, "utau": 10.0, "psi": 0, "umig": 0.0, "urec": 0.0, "atheta": 1.0}
     self._update_config(self.cfg, p, multi_locus=True)
     _LOG.debug("testing config:\n\n{0}\n".format(self.cfg.getvalue()))
     c = MsBayesConfig(self.cfg)
     s = StringIO()
     c.write(s)
     _LOG.debug("written config:\n\n{0}\n".format(s.getvalue()))
     s.seek(0)
     c2 = MsBayesConfig(s)
     self.assertSameConfigs([c, c2])
Example 4
 def test_multi_locus_round_trip(self):
     p = {
         'ltheta': 0.0001,
         'utheta': 0.1,
         'utau': 10.0,
         'psi': 0,
         'umig': 0.0,
         'urec': 0.0,
         'atheta': 1.0,
     }
     self._update_config(self.cfg, p, multi_locus=True)
     _LOG.debug('testing config:\n\n{0}\n'.format(self.cfg.getvalue()))
     c = MsBayesConfig(self.cfg)
     s = StringIO()
     c.write(s)
     _LOG.debug('written config:\n\n{0}\n'.format(s.getvalue()))
     s.seek(0)
     c2 = MsBayesConfig(s)
     self.assertSameConfigs([c, c2])
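Examples 1 through 4 repeat the same parse, serialize, rewind, re-parse, and compare sequence. Below is a minimal sketch of a shared helper the four tests could delegate to; it is a refactoring sketch only, and assumes the _update_config and assertSameConfigs helpers and the MsBayesConfig, StringIO, and _LOG objects already used above.

 def _assert_round_trip(self, params, **update_kwargs):
     # Apply the parameter settings to the in-memory config template.
     self._update_config(self.cfg, params, **update_kwargs)
     _LOG.debug("testing config:\n\n{0}\n".format(self.cfg.getvalue()))
     # Parse the config, write it back out, rewind, and re-parse it.
     c = MsBayesConfig(self.cfg)
     s = StringIO()
     c.write(s)
     _LOG.debug("written config:\n\n{0}\n".format(s.getvalue()))
     s.seek(0)
     c2 = MsBayesConfig(s)
     # The original and re-parsed configs should describe the same model.
     self.assertSameConfigs([c, c2])

With this helper, test_new_implementation_round_trip would reduce to self._assert_round_trip(p, new_impl=True) and test_multi_locus_round_trip to self._assert_round_trip(p, multi_locus=True).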
Example 5
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description = description)
    parser.add_argument('-c', '--config',
            type = arg_is_config,
            required = True,
            help = ('msBayes config file to be used to generate saturation '
                    'plot.'))
    parser.add_argument('-n', '--num-prior-samples',
            action = 'store',
            type = int,
            default = 1000,
            help = ('The number of prior samples to simulate for the '
                    'saturation plot.'))
    parser.add_argument('--np',
            action = 'store',
            type = int,
            default = multiprocessing.cpu_count(),
            help = ('The maximum number of processes to run in parallel. The '
                    'default is the number of CPUs available on the machine.'))
    parser.add_argument('-o', '--output-dir',
            action = 'store',
            type = arg_is_dir,
            help = ('The directory in which all output files will be written. '
                    'The default is to use the directory of the first observed '
                    'config file.'))
    parser.add_argument('--temp-dir',
            action = 'store',
            type = arg_is_dir,
            help = ('A directory to temporarily stage files. The default is to '
                    'use the output directory.'))
    parser.add_argument('-s', '--stat-prefixes',
            nargs = '*',
            type = str,
            default = ['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
            help = ('Prefixes of summary statistics to use in the analyses. '
                    'The prefixes should be separated by spaces. '
                    'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument('--vertical-lines',
            nargs = '*',
            type = float,
            default = [],
            help = ('Positions along x-axis where vertical lines are to be '
                    'drawn. Default is to draw no vertical lines.'))
    parser.add_argument('--compress',
            action = 'store_true',
            help = 'Compress plot data file.')
    parser.add_argument('--keep-temps',
            action = 'store_true',
            help = 'Keep all temporary files.')
    parser.add_argument('--seed',
            action = 'store',
            type = int,
            help = 'Random number seed to use for the analysis.')
    parser.add_argument('--version',
            action = 'version',
            version = '%(prog)s ' + _program_info['version'],
            help = 'Report version and exit.')
    parser.add_argument('--quiet',
            action = 'store_true',
            help = 'Run without verbose messaging.')
    parser.add_argument('--debug',
            action = 'store_true',
            help = 'Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl,
            InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
            DEFAULT_STAT_PATTERNS, get_stats_by_time, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes.plotting import MATPLOTLIB_AVAILABLE, SaturationPlotGrid

    MSBAYES_SORT_INDEX.set_index(0)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    stats_by_time_path = os.path.join(args.output_dir, 'stats-by-time.txt')
    if args.compress:
        stats_by_time_path += '.gz'
    plot_path = os.path.join(args.output_dir, 'saturation-plot.pdf')

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
            [s + '.' for s in args.stat_prefixes],
            ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs
    cfg.div_model_prior = 'constrained'
    cfg.psi = probability.DiscreteUniformDistribution(num_taxon_pairs,
            num_taxon_pairs)
    config_path = temp_fs.get_file_path(prefix='cfg-')
    cfg.write(config_path)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(
            MSBAYES_SORT_INDEX.current_value()), log.info)
    info.write('\tstat_patterns = {0!r}'.format(
            ', '.join([p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
            log.info)
    info.write('\tstats_by_time_path = {0!r}'.format(stats_by_time_path),
            log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(
                temp_fs = temp_fs,
                sample_size = sample_size,
                config_path = config_path,
                report_parameters = True,
                schema = schema,
                include_header = True,
                stat_patterns = stat_patterns,
                write_stats_file = False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(
            workers = workers,
            num_processors = args.np)
    log.info('Parsing samples...')
    stats_by_time = get_stats_by_time([w.prior_path for w in workers])
    stat_keys = list(stats_by_time.keys())
    stat_keys.remove('PRI.t')
    for prefix in args.stat_prefixes:
        if prefix not in stat_keys:
            raise Exception('stat prefix {0!r} not found in simulated stats:'
                    '\n\t{1}'.format(prefix, ', '.join(stat_keys)))
    header = ['PRI.t'] + args.stat_prefixes
    log.info('Writing stats-by-time matrix...')
    out, close = process_file_arg(stats_by_time_path, 'w',
            compresslevel = compress_level)
    for row in dict_line_iter(stats_by_time, sep = '\t', header = header):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')

    if not MATPLOTLIB_AVAILABLE:
        log.warning(
                '`matplotlib` could not be imported, so the plot can not be\n'
                'produced. The data to create the plot can be found in:\n\t'
                '{0!r}'.format(stats_by_time_path))
    else:
        y_labels = {'pi': r'$\pi$',
                    'pi.net': r'$\pi_{net}$',
                    'wattTheta': r'$\theta_W$',
                    'tajD.denom': r'$SD(\pi - \theta_W)$'}
        spg = SaturationPlotGrid(stats_by_time,
                x_key = 'PRI.t',
                y_keys = args.stat_prefixes,
                y_labels = y_labels,
                num_columns = 2,
                vertical_line_positions = args.vertical_lines)
        fig = spg.create_grid()
        fig.savefig(plot_path)

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
            log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
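The worker loop above spreads args.num_prior_samples across args.np processes: every worker draws batch_size samples, and the last worker also takes the remainder. A minimal sketch of that split, assuming long_division behaves like Python's built-in divmod:

def split_samples(num_prior_samples, num_processes):
    # Never start more processes than there are samples to draw.
    num_processes = min(num_processes, num_prior_samples)
    batch_size, remainder = divmod(num_prior_samples, num_processes)
    sizes = [batch_size] * num_processes
    # The last worker picks up whatever does not divide evenly.
    sizes[-1] += remainder
    return sizes

assert split_samples(1000, 8) == [125] * 8
assert split_samples(1003, 8) == [125] * 7 + [128]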
Example 6
def main_cli():
    description = '{name} {version}'.format(**_program_info)
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        '-c',
        '--config',
        type=arg_is_config,
        required=True,
        help=('msBayes config file to be used to generate saturation '
              'plot.'))
    parser.add_argument(
        '-n',
        '--num-prior-samples',
        action='store',
        type=int,
        default=1000,
        help=('The number of prior samples to simulate for the '
              'saturation plot.'))
    parser.add_argument(
        '--np',
        action='store',
        type=int,
        default=multiprocessing.cpu_count(),
        help=('The maximum number of processes to run in parallel. The '
              'default is the number of CPUs available on the machine.'))
    parser.add_argument(
        '-o',
        '--output-dir',
        action='store',
        type=arg_is_dir,
        help=('The directory in which all output files will be written. '
              'The default is to use the directory of the first observed '
              'config file.'))
    parser.add_argument(
        '--temp-dir',
        action='store',
        type=arg_is_dir,
        help=('A directory to temporarily stage files. The default is to '
              'use the output directory.'))
    parser.add_argument(
        '-s',
        '--stat-prefixes',
        nargs='*',
        type=str,
        default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'],
        help=('Prefixes of summary statistics to use in the analyses. '
              'The prefixes should be separated by spaces. '
              'Default: `-s pi pi.net wattTheta tajD.denom`.'))
    parser.add_argument(
        '--vertical-lines',
        nargs='*',
        type=float,
        default=[],
        help=('Positions along x-axis where vertical lines are to be '
              'drawn. Default is to draw no vertical lines.'))
    parser.add_argument('--compress',
                        action='store_true',
                        help='Compress plot data file.')
    parser.add_argument('--keep-temps',
                        action='store_true',
                        help='Keep all temporary files.')
    parser.add_argument('--seed',
                        action='store',
                        type=int,
                        help='Random number seed to use for the analysis.')
    parser.add_argument('--version',
                        action='version',
                        version='%(prog)s ' + _program_info['version'],
                        help='Report version and exit.')
    parser.add_argument('--quiet',
                        action='store_true',
                        help='Run without verbose messaging.')
    parser.add_argument('--debug',
                        action='store_true',
                        help='Run in debugging mode.')

    args = parser.parse_args()

    ##########################################################################
    ## handle args

    from pymsbayes.utils.messaging import (LoggingControl, InfoLogger)

    LoggingControl.set_logging_level("INFO")
    if args.quiet:
        LoggingControl.set_logging_level("WARNING")
    if args.debug:
        LoggingControl.set_logging_level("DEBUG")
    log = LoggingControl.get_logger(__name__)

    from pymsbayes.workers import MsBayesWorker
    from pymsbayes.utils.parsing import (get_patterns_from_prefixes,
                                         DEFAULT_STAT_PATTERNS,
                                         get_stats_by_time, dict_line_iter)
    from pymsbayes.manager import Manager
    from pymsbayes.utils.tempfs import TempFileSystem
    from pymsbayes.utils import probability
    from pymsbayes.utils.functions import long_division
    from pymsbayes.config import MsBayesConfig
    from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager
    from pymsbayes.fileio import process_file_arg
    from pymsbayes.plotting import MATPLOTLIB_AVAILABLE, SaturationPlotGrid

    MSBAYES_SORT_INDEX.set_index(0)

    # get full paths to tools
    msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl')
    dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl')

    if not args.output_dir:
        args.output_dir = os.path.dirname(args.config)
    info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt'))

    stats_by_time_path = os.path.join(args.output_dir, 'stats-by-time.txt')
    if args.compress:
        stats_by_time_path += '.gz'
    plot_path = os.path.join(args.output_dir, 'saturation-plot.pdf')

    if not args.temp_dir:
        args.temp_dir = args.output_dir
    temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-')
    args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes]
    stat_patterns = get_patterns_from_prefixes(
        [s + '.' for s in args.stat_prefixes], ignore_case=True)
    if not args.seed:
        args.seed = random.randint(1, 999999999)
    GLOBAL_RNG.seed(args.seed)
    compress_level = None
    if args.compress:
        compress_level = 9

    cfg = MsBayesConfig(args.config)
    num_taxon_pairs = cfg.npairs
    cfg.div_model_prior = 'constrained'
    cfg.psi = probability.DiscreteUniformDistribution(num_taxon_pairs,
                                                      num_taxon_pairs)
    config_path = temp_fs.get_file_path(prefix='cfg-')
    cfg.write(config_path)

    info.write('[pymsbayes]', log.info)
    info.write('\tprogram_name = {name}'.format(**_program_info), log.info)
    info.write('\tversion = {version}'.format(**_program_info), log.info)
    info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info)
    info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info)
    info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info)
    info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()),
               log.info)
    info.write(
        '\tstat_patterns = {0!r}'.format(', '.join(
            [p.pattern for p in stat_patterns])), log.info)
    info.write('\tseed = {0}'.format(args.seed), log.info)
    info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples),
               log.info)
    info.write('\tstats_by_time_path = {0!r}'.format(stats_by_time_path),
               log.info)
    info.write('\t[[tool_paths]]', log.info)
    info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info)
    info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info)

    info.write('\t[[config]]', log.debug)
    info.write('{0}'.format(str(cfg)), log.debug)

    ##########################################################################
    ## begin analysis --- generate samples

    start_time = datetime.datetime.now()

    if args.np > args.num_prior_samples:
        args.np = args.num_prior_samples
    batch_size, remainder = long_division(args.num_prior_samples, args.np)
    schema = 'abctoolbox'
    workers = []
    for i in range(args.np):
        sample_size = batch_size
        if i == (args.np - 1):
            sample_size += remainder
        w = MsBayesWorker(temp_fs=temp_fs,
                          sample_size=sample_size,
                          config_path=config_path,
                          report_parameters=True,
                          schema=schema,
                          include_header=True,
                          stat_patterns=stat_patterns,
                          write_stats_file=False)
        workers.append(w)

    log.info('Generating samples...')
    workers = Manager.run_workers(workers=workers, num_processors=args.np)
    log.info('Parsing samples...')
    stats_by_time = get_stats_by_time([w.prior_path for w in workers])
    stat_keys = list(stats_by_time.keys())
    stat_keys.remove('PRI.t')
    for prefix in args.stat_prefixes:
        if prefix not in stat_keys:
            raise Exception('stat prefix {0!r} not found in simulated stats:'
                            '\n\t{1}'.format(prefix, ', '.join(stat_keys)))
    header = ['PRI.t'] + args.stat_prefixes
    log.info('Writing stats-by-time matrix...')
    out, close = process_file_arg(stats_by_time_path,
                                  'w',
                                  compresslevel=compress_level)
    for row in dict_line_iter(stats_by_time, sep='\t', header=header):
        out.write(row)
    if close:
        out.close()

    log.info('Creating plots...')

    if not MATPLOTLIB_AVAILABLE:
        log.warning(
            '`matplotlib` could not be imported, so the plot can not be\n'
            'produced. The data to create the plot can be found in:\n\t'
            '{0!r}'.format(stats_by_time_path))
    else:
        y_labels = {
            'pi': r'$\pi$',
            'pi.net': r'$\pi_{net}$',
            'wattTheta': r'$\theta_W$',
            'tajD.denom': r'$SD(\pi - \theta_W)$'
        }
        spg = SaturationPlotGrid(stats_by_time,
                                 x_key='PRI.t',
                                 y_keys=args.stat_prefixes,
                                 y_labels=y_labels,
                                 num_columns=2,
                                 vertical_line_positions=args.vertical_lines)
        fig = spg.create_grid()
        fig.savefig(plot_path)

    stop_time = datetime.datetime.now()
    log.info('Done!')
    info.write('\t[[run_stats]]', log.info)
    info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info)
    info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info)
    info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)),
               log.info)

    if not args.keep_temps:
        log.debug('purging temps...')
        temp_fs.purge()
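The stats-by-time matrix is written with dict_line_iter from pymsbayes.utils.parsing. The snippet below is only a plausible illustration of the tab-separated output that loop produces, assuming stats_by_time maps each column name ('PRI.t' plus the stat prefixes) to an equal-length sequence of values; it is not the library's implementation.

import sys

def tsv_lines(table, header, sep='\t'):
    # Header row first, then one row per index across the named columns.
    yield sep.join(header) + '\n'
    for row in zip(*(table[key] for key in header)):
        yield sep.join(str(value) for value in row) + '\n'

stats = {'PRI.t': [0.1, 0.5], 'pi': [0.002, 0.004]}
for line in tsv_lines(stats, header=['PRI.t', 'pi']):
    sys.stdout.write(line)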