def summarize_sim_results(info_path): info_path = expand_path(info_path) sim_results = DMCSimulationResults(info_path) out_dir = os.path.dirname(info_path) summary_path = os.path.join(out_dir, 'results-summary.txt') result_path = sim_results.get_result_summary_path(observed_index = 1, prior_index = sim_results.combined_prior_index) d = get_dict_from_spreadsheets([result_path]) num_excluded = [int(x) for x in d['num_excluded']] num_excluded_glm = [int(x) for x in d['num_excluded_glm']] bf_num_excluded = [int(x) for x in d['bf_num_excluded']] bf_num_excluded_glm = [int(x) for x in d['bf_num_excluded_glm']] prob_of_exclusion = [float(x) for x in d['prob_of_exclusion']] prob_of_exclusion_glm = [float(x) for x in d['prob_of_exclusion_glm']] prior_prob_of_exclusion = [float(x) for x in d['prior_prob_of_exclusion']] bf_of_exclusion = [float(x) for x in d['bf_of_exclusion']] bf_of_exclusion_glm = [float(x) for x in d['bf_of_exclusion_glm']] num_sims = sim_results.num_sim_reps assert len(num_excluded) == num_sims assert len(num_excluded_glm) == num_sims assert len(prob_of_exclusion) == num_sims assert len(prob_of_exclusion_glm) == num_sims summary_stream, close = process_file_arg(summary_path, 'w') summary_stream.write('Proportion of simulations excluding truth: {0}' '\n'.format( len([1 for x in bf_num_excluded if x > 0]) / float(num_sims))) summary_stream.write('Proportion of simulations excluding truth with GLM-' 'adjustment: {0}\n'.format( len([1 for x in bf_num_excluded_glm if x > 0]) / float(num_sims))) summary_stream.write('Average number of tau parameters excluded: {0}' '\n'.format( sum(bf_num_excluded) / float(num_sims))) summary_stream.write('Average number of tau parameters excluded with GLM: ' '{0}\n'.format(sum(bf_num_excluded_glm) / float(num_sims))) summary_stream.write('Mode number of tau parameters excluded: {0}\n'.format( mode_list(bf_num_excluded))) summary_stream.write('Mode number of tau parameters excluded with GLM: ' '{0}\n'.format(mode_list(bf_num_excluded_glm))) summary_stream.write('Max number of tau parameters excluded: {0}\n'.format( max(bf_num_excluded))) summary_stream.write('Max number of tau parameters excluded with GLM: ' '{0}\n'.format(max(bf_num_excluded_glm))) summary_stream.write('Average probability of exclusion: {0}\n'.format( sum(prob_of_exclusion) / float(num_sims))) summary_stream.write('Average probability of exclusion with GLM: {0}\n'.format( sum(prob_of_exclusion_glm) / float(num_sims))) summary_stream.write('Median probability of exclusion: {0}\n'.format( median(prob_of_exclusion))) summary_stream.write('Median probability of exclusion with GLM: {0}\n'.format( median(prob_of_exclusion_glm))) summary_stream.write('Average Bayes factor of exclusion: {0}\n'.format( sum(bf_of_exclusion) / float(num_sims))) summary_stream.write('Average Bayes factor of exclusion with GLM: {0}\n'.format( sum(bf_of_exclusion_glm) / float(num_sims))) summary_stream.write('Median Bayes factor of exclusion: {0}\n'.format( median(bf_of_exclusion))) summary_stream.write('Median Bayes factor of exclusion with GLM: {0}\n'.format( median(bf_of_exclusion_glm))) summary_stream.write('Max Bayes factor of exclusion: {0}\n'.format( max(bf_of_exclusion))) summary_stream.write('Max Bayes factor of exclusion with GLM: {0}\n'.format( max(bf_of_exclusion_glm))) prob_of_bf_exclusion = (len([1 for x in bf_of_exclusion if x > 10.0]) / float(num_sims)) prob_of_bf_exclusion_glm = (len([ 1 for x in bf_of_exclusion_glm if x > 10.0]) / float(num_sims)) summary_stream.write('Estimated probability Bayes factor of exclusion ' '> 10: {0}\n'.format(prob_of_bf_exclusion)) summary_stream.write('Estimated probability Bayes factor of exclusion ' '> 10 with GLM: {0}\n'.format(prob_of_bf_exclusion_glm)) summary_stream.close() if plotting.MATPLOTLIB_AVAILABLE: approx_prior_exclusion = 0.39184 prior_odds = approx_prior_exclusion / (1.0 - approx_prior_exclusion) post_odds = prior_odds * 10 post = post_odds / (1.0 + post_odds) observed_config1 = MsBayesConfig(sim_results.observed_index_to_config[1]) observed_config2 = MsBayesConfig(sim_results.observed_index_to_config[1]) cfg_to_num_ex = {observed_config1: bf_num_excluded, observed_config2: bf_num_excluded_glm} cfg_to_prob_exclusion = {observed_config1: prob_of_exclusion, observed_config2: prob_of_exclusion_glm} cfg_to_prob_of_bf_exclusion = {observed_config1: prob_of_bf_exclusion, observed_config2: prob_of_bf_exclusion_glm} ex_prob_plot = plotting.ProbabilityPowerPlotGrid( observed_config_to_estimates = cfg_to_prob_exclusion, variable = 'tau_exclusion', div_model_prior = 'psi', bayes_factor = 10, bayes_factor_prob = post, cfg_to_prob_of_bf_exclusion = cfg_to_prob_of_bf_exclusion, height = 3.7, margin_left = 0.03, margin_bottom = 0.06, margin_right = 1, margin_top = 0.96, padding_between_horizontal = 0.5, padding_between_vertical = 1.0, num_columns = 2) fig = ex_prob_plot.create_grid() fig.savefig(os.path.join(out_dir, 'prob_of_exclusion.pdf')) ex_plot = plotting.PowerPlotGrid( observed_config_to_estimates = cfg_to_num_ex, variable = 'tau_exclusion', num_columns = 2, height = 3.7, margin_left = 0.03, margin_bottom = 0.06, margin_right = 1, margin_top = 0.95, padding_between_horizontal = 0.5, padding_between_vertical = 1.0) fig = ex_plot.create_grid() fig.savefig(os.path.join(out_dir, 'num_tau_excluded.pdf'))
def main_cli(): description = '{name} {version}'.format(**_program_info) parser = argparse.ArgumentParser(description = description, formatter_class = argparse_utils.SmartHelpFormatter) parser.add_argument('-c', '--config', type = argparse_utils.arg_is_config, required = True, help = ('msBayes config file to be used to generate saturation ' 'plot.')) parser.add_argument('-n', '--num-prior-samples', action = 'store', type = int, default = 1000, help = ('The number of prior samples to simulate for the ' 'saturation plot.')) parser.add_argument('--np', action = 'store', type = int, default = multiprocessing.cpu_count(), help = ('The maximum number of processes to run in parallel. The ' 'default is the number of CPUs available on the machine.')) parser.add_argument('-o', '--output-dir', action = 'store', type = argparse_utils.arg_is_dir, help = ('The directory in which all output files will be written. ' 'The default is to use the directory of the first observed ' 'config file.')) parser.add_argument('--temp-dir', action = 'store', type = argparse_utils.arg_is_dir, help = ('A directory to temporarily stage files. The default is to ' 'use the output directory.')) parser.add_argument('-s', '--stat-prefixes', nargs = '*', type = str, default = ['pi', 'pi.net', 'wattTheta', 'tajD.denom'], help = ('Prefixes of summary statistics to use in the analyses. ' 'The prefixes should be separated by spaces. ' 'Default: `-s pi pi.net wattTheta tajD.denom`.')) parser.add_argument('--sort-index', action = 'store', type = int, default = 0, choices = range(12), help = argparse_utils.get_sort_index_help_message()) parser.add_argument('--compress', action = 'store_true', help = 'Compress plot data file.') parser.add_argument('--keep-temps', action = 'store_true', help = 'Keep all temporary files.') parser.add_argument('--seed', action = 'store', type = int, help = 'Random number seed to use for the analysis.') parser.add_argument('--version', action = 'version', version = '%(prog)s ' + _program_info['version'], help = 'Report version and exit.') parser.add_argument('--quiet', action = 'store_true', help = 'Run without verbose messaging.') parser.add_argument('--debug', action = 'store_true', help = 'Run in debugging mode.') args = parser.parse_args() ########################################################################## ## handle args from pymsbayes.utils.messaging import (LoggingControl, InfoLogger) LoggingControl.set_logging_level("INFO") if args.quiet: LoggingControl.set_logging_level("WARNING") if args.debug: LoggingControl.set_logging_level("DEBUG") log = LoggingControl.get_logger(__name__) from pymsbayes.workers import MsBayesWorker from pymsbayes.utils.parsing import (get_patterns_from_prefixes, DEFAULT_STAT_PATTERNS, get_dict_from_spreadsheets, dict_line_iter) from pymsbayes.manager import Manager from pymsbayes.utils.tempfs import TempFileSystem from pymsbayes.utils import probability, stats from pymsbayes.utils.functions import long_division from pymsbayes.config import MsBayesConfig from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager from pymsbayes.fileio import process_file_arg from pymsbayes import plotting MSBAYES_SORT_INDEX.set_index(args.sort_index) # get full paths to tools msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl') dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl') if not args.output_dir: args.output_dir = os.path.dirname(args.config) info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt')) sample_path = os.path.join(args.output_dir, 'prior-sample.txt') if args.compress: sample_path += '.gz' if not args.temp_dir: args.temp_dir = args.output_dir temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-') args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes] stat_patterns = get_patterns_from_prefixes( [s + '.' for s in args.stat_prefixes], ignore_case=True) if not args.seed: args.seed = random.randint(1, 999999999) GLOBAL_RNG.seed(args.seed) compress_level = None if args.compress: compress_level = 9 cfg = MsBayesConfig(args.config) num_taxon_pairs = cfg.npairs info.write('[pymsbayes]', log.info) info.write('\tprogram_name = {name}'.format(**_program_info), log.info) info.write('\tversion = {version}'.format(**_program_info), log.info) info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info) info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info) info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info) info.write('\tsort_index = {0}'.format( MSBAYES_SORT_INDEX.current_value()), log.info) info.write('\tstat_patterns = {0!r}'.format( ', '.join([p.pattern for p in stat_patterns])), log.info) info.write('\tseed = {0}'.format(args.seed), log.info) info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples), log.info) info.write('\tsample_path = {0!r}'.format(sample_path), log.info) info.write('\t[[tool_paths]]', log.info) info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info) info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info) info.write('\t[[config]]', log.debug) info.write('{0}'.format(str(cfg)), log.debug) ########################################################################## ## begin analysis --- generate samples start_time = datetime.datetime.now() if args.np > args.num_prior_samples: args.np = args.num_prior_samples batch_size, remainder = long_division(args.num_prior_samples, args.np) schema = 'abctoolbox' workers = [] for i in range(args.np): sample_size = batch_size if i == (args.np - 1): sample_size += remainder w = MsBayesWorker( temp_fs = temp_fs, sample_size = sample_size, config_path = args.config, report_parameters = True, schema = schema, include_header = True, stat_patterns = stat_patterns, write_stats_file = False) workers.append(w) log.info('Generating samples...') workers = Manager.run_workers( workers = workers, num_processors = args.np) log.info('Parsing samples...') sample = get_dict_from_spreadsheets([w.prior_path for w in workers]) log.info('Writing prior samples...') out, close = process_file_arg(sample_path, 'w', compresslevel = compress_level) for row in dict_line_iter(sample, sep = '\t'): out.write(row) if close: out.close() log.info('Creating plots...') if not plotting.MATPLOTLIB_AVAILABLE: log.warning( '`matplotlib` could not be imported, so the plot can not be\n' 'produced. The data to create the plot can be found in:\n\t' '{0!r}'.format(sample_path)) sys.exit(1) for stat_pattern in stat_patterns: found = False for stat, values in sample.iteritems(): if stat_pattern.match(stat): values = [float(v) for v in values] found = True plot_path = os.path.join(args.output_dir, 'plot-{0}.pdf'.format(stat)) summary = stats.get_summary(values) s = r'mean = {0:.4f} ({1:.4f}-{2:.4f})'.format( summary['mean'], summary['qi_95'][0], summary['qi_95'][1]) hd = plotting.HistData(x = values, normed = True, bins = 20, histtype = 'bar', align = 'mid', orientation = 'vertical', zorder = 0) hist = plotting.ScatterPlot(hist_data_list = [hd], right_text = s) hist.left_text_size = 12.0 hist.right_text_size = 12.0 xticks = [i for i in hist.ax.get_xticks()] xtick_labels = [i for i in xticks] yticks = [i for i in hist.ax.get_yticks()] ytick_labels = [i for i in yticks] if len(xtick_labels) >= 8: for i in range(1, len(xtick_labels), 2): xtick_labels[i] = '' if len(ytick_labels) >= 8: for i in range(1, len(ytick_labels), 2): ytick_labels[i] = '' xticks_obj = plotting.Ticks(ticks = xticks, labels = xtick_labels, horizontalalignment = 'center') yticks_obj = plotting.Ticks(ticks = yticks, labels = ytick_labels) hist.xticks_obj = xticks_obj hist.yticks_obj = yticks_obj plot_grid = plotting.PlotGrid(subplots = [hist], num_columns = 1, label_schema = None, title = stat, title_size = 14.0, title_top = False, y_title = 'Density', y_title_position = 0.001, y_title_size = 14.0, height = 4.0, width = 6.0, auto_height = False) plot_grid.auto_adjust_margins = False plot_grid.margin_left = 0.04 plot_grid.margin_bottom = 0.04 plot_grid.margin_right = 1.0 plot_grid.margin_top = 0.97 plot_grid.reset_figure() plot_grid.savefig(plot_path) if not found: raise Exception('stat pattern {0!r} not found in simulated stats:' '\n\t{1}'.format(stat_pattern, ', '.join(sample.keys()))) stop_time = datetime.datetime.now() log.info('Done!') info.write('\t[[run_stats]]', log.info) info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info) info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info) info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)), log.info) if not args.keep_temps: log.debug('purging temps...') temp_fs.purge()
def main_cli(): description = '{name} {version}'.format(**_program_info) parser = argparse.ArgumentParser( description=description, formatter_class=argparse_utils.SmartHelpFormatter) parser.add_argument( '-c', '--config', type=argparse_utils.arg_is_config, required=True, help=('msBayes config file to be used to generate saturation ' 'plot.')) parser.add_argument( '-n', '--num-prior-samples', action='store', type=int, default=1000, help=('The number of prior samples to simulate for the ' 'saturation plot.')) parser.add_argument( '--np', action='store', type=int, default=multiprocessing.cpu_count(), help=('The maximum number of processes to run in parallel. The ' 'default is the number of CPUs available on the machine.')) parser.add_argument( '-o', '--output-dir', action='store', type=argparse_utils.arg_is_dir, help=('The directory in which all output files will be written. ' 'The default is to use the directory of the first observed ' 'config file.')) parser.add_argument( '--temp-dir', action='store', type=argparse_utils.arg_is_dir, help=('A directory to temporarily stage files. The default is to ' 'use the output directory.')) parser.add_argument( '-s', '--stat-prefixes', nargs='*', type=str, default=['pi', 'pi.net', 'wattTheta', 'tajD.denom'], help=('Prefixes of summary statistics to use in the analyses. ' 'The prefixes should be separated by spaces. ' 'Default: `-s pi pi.net wattTheta tajD.denom`.')) parser.add_argument('--sort-index', action='store', type=int, default=0, choices=range(12), help=argparse_utils.get_sort_index_help_message()) parser.add_argument('--compress', action='store_true', help='Compress plot data file.') parser.add_argument('--keep-temps', action='store_true', help='Keep all temporary files.') parser.add_argument('--seed', action='store', type=int, help='Random number seed to use for the analysis.') parser.add_argument('--version', action='version', version='%(prog)s ' + _program_info['version'], help='Report version and exit.') parser.add_argument('--quiet', action='store_true', help='Run without verbose messaging.') parser.add_argument('--debug', action='store_true', help='Run in debugging mode.') args = parser.parse_args() ########################################################################## ## handle args from pymsbayes.utils.messaging import (LoggingControl, InfoLogger) LoggingControl.set_logging_level("INFO") if args.quiet: LoggingControl.set_logging_level("WARNING") if args.debug: LoggingControl.set_logging_level("DEBUG") log = LoggingControl.get_logger(__name__) from pymsbayes.workers import MsBayesWorker from pymsbayes.utils.parsing import (get_patterns_from_prefixes, DEFAULT_STAT_PATTERNS, get_dict_from_spreadsheets, dict_line_iter) from pymsbayes.manager import Manager from pymsbayes.utils.tempfs import TempFileSystem from pymsbayes.utils import probability, stats from pymsbayes.utils.functions import long_division from pymsbayes.config import MsBayesConfig from pymsbayes.utils import GLOBAL_RNG, MSBAYES_SORT_INDEX, ToolPathManager from pymsbayes.fileio import process_file_arg from pymsbayes import plotting MSBAYES_SORT_INDEX.set_index(args.sort_index) # get full paths to tools msbayes_path = ToolPathManager.get_tool_full_path('msbayes.pl') dpp_msbayes_path = ToolPathManager.get_tool_full_path('dpp-msbayes.pl') if not args.output_dir: args.output_dir = os.path.dirname(args.config) info = InfoLogger(os.path.join(args.output_dir, 'pymsbayes-info.txt')) sample_path = os.path.join(args.output_dir, 'prior-sample.txt') if args.compress: sample_path += '.gz' if not args.temp_dir: args.temp_dir = args.output_dir temp_fs = TempFileSystem(parent=args.temp_dir, prefix='temp-files-') args.stat_prefixes = [s.rstrip('.') for s in args.stat_prefixes] stat_patterns = get_patterns_from_prefixes( [s + '.' for s in args.stat_prefixes], ignore_case=True) if not args.seed: args.seed = random.randint(1, 999999999) GLOBAL_RNG.seed(args.seed) compress_level = None if args.compress: compress_level = 9 cfg = MsBayesConfig(args.config) num_taxon_pairs = cfg.npairs info.write('[pymsbayes]', log.info) info.write('\tprogram_name = {name}'.format(**_program_info), log.info) info.write('\tversion = {version}'.format(**_program_info), log.info) info.write('\tinvocation = {0!r}'.format(' '.join(sys.argv)), log.info) info.write('\toutput_directory = {0!r}'.format(args.output_dir), log.info) info.write('\ttemp_directory = {0!r}'.format(temp_fs.base_dir), log.info) info.write('\tsort_index = {0}'.format(MSBAYES_SORT_INDEX.current_value()), log.info) info.write( '\tstat_patterns = {0!r}'.format(', '.join( [p.pattern for p in stat_patterns])), log.info) info.write('\tseed = {0}'.format(args.seed), log.info) info.write('\tnum_prior_samples = {0}'.format(args.num_prior_samples), log.info) info.write('\tsample_path = {0!r}'.format(sample_path), log.info) info.write('\t[[tool_paths]]', log.info) info.write('\t\tdpp_msbayes = {0}'.format(dpp_msbayes_path), log.info) info.write('\t\tmsbayes = {0}'.format(msbayes_path), log.info) info.write('\t[[config]]', log.debug) info.write('{0}'.format(str(cfg)), log.debug) ########################################################################## ## begin analysis --- generate samples start_time = datetime.datetime.now() if args.np > args.num_prior_samples: args.np = args.num_prior_samples batch_size, remainder = long_division(args.num_prior_samples, args.np) schema = 'abctoolbox' workers = [] for i in range(args.np): sample_size = batch_size if i == (args.np - 1): sample_size += remainder w = MsBayesWorker(temp_fs=temp_fs, sample_size=sample_size, config_path=args.config, report_parameters=True, schema=schema, include_header=True, stat_patterns=stat_patterns, write_stats_file=False) workers.append(w) log.info('Generating samples...') workers = Manager.run_workers(workers=workers, num_processors=args.np) log.info('Parsing samples...') sample = get_dict_from_spreadsheets([w.prior_path for w in workers]) log.info('Writing prior samples...') out, close = process_file_arg(sample_path, 'w', compresslevel=compress_level) for row in dict_line_iter(sample, sep='\t'): out.write(row) if close: out.close() log.info('Creating plots...') if not plotting.MATPLOTLIB_AVAILABLE: log.warning( '`matplotlib` could not be imported, so the plot can not be\n' 'produced. The data to create the plot can be found in:\n\t' '{0!r}'.format(sample_path)) sys.exit(1) for stat_pattern in stat_patterns: found = False for stat, values in sample.iteritems(): if stat_pattern.match(stat): values = [float(v) for v in values] found = True plot_path = os.path.join(args.output_dir, 'plot-{0}.pdf'.format(stat)) summary = stats.get_summary(values) s = r'mean = {0:.4f} ({1:.4f}-{2:.4f})'.format( summary['mean'], summary['qi_95'][0], summary['qi_95'][1]) hd = plotting.HistData(x=values, normed=True, bins=20, histtype='bar', align='mid', orientation='vertical', zorder=0) hist = plotting.ScatterPlot(hist_data_list=[hd], right_text=s) hist.left_text_size = 12.0 hist.right_text_size = 12.0 xticks = [i for i in hist.ax.get_xticks()] xtick_labels = [i for i in xticks] yticks = [i for i in hist.ax.get_yticks()] ytick_labels = [i for i in yticks] if len(xtick_labels) >= 8: for i in range(1, len(xtick_labels), 2): xtick_labels[i] = '' if len(ytick_labels) >= 8: for i in range(1, len(ytick_labels), 2): ytick_labels[i] = '' xticks_obj = plotting.Ticks(ticks=xticks, labels=xtick_labels, horizontalalignment='center') yticks_obj = plotting.Ticks(ticks=yticks, labels=ytick_labels) hist.xticks_obj = xticks_obj hist.yticks_obj = yticks_obj plot_grid = plotting.PlotGrid(subplots=[hist], num_columns=1, label_schema=None, title=stat, title_size=14.0, title_top=False, y_title='Density', y_title_position=0.001, y_title_size=14.0, height=4.0, width=6.0, auto_height=False) plot_grid.auto_adjust_margins = False plot_grid.margin_left = 0.04 plot_grid.margin_bottom = 0.04 plot_grid.margin_right = 1.0 plot_grid.margin_top = 0.97 plot_grid.reset_figure() plot_grid.savefig(plot_path) if not found: raise Exception('stat pattern {0!r} not found in simulated stats:' '\n\t{1}'.format(stat_pattern, ', '.join(sample.keys()))) stop_time = datetime.datetime.now() log.info('Done!') info.write('\t[[run_stats]]', log.info) info.write('\t\tstart_time = {0}'.format(str(start_time)), log.info) info.write('\t\tstop_time = {0}'.format(str(stop_time)), log.info) info.write('\t\ttotal_duration = {0}'.format(str(stop_time - start_time)), log.info) if not args.keep_temps: log.debug('purging temps...') temp_fs.purge()