def write_posterior(self, output_prefix, target_data_filepath):
    if output_prefix is None:
        output_prefix = os.path.splitext(
                os.path.basename(target_data_filepath))[0] + ".posterior"
    with utility.universal_open(target_data_filepath) as src:
        reader = csv.DictReader(
                src,
                delimiter=self.field_delimiter,
                quoting=csv.QUOTE_NONE)
        for row_idx, row in enumerate(reader):
            assert len(row) == len(reader.fieldnames)
            target_stat_values = []
            target_other_values = []
            # Keys must be read in the same order as the simulated data.
            for key_idx, key in enumerate(self.all_fieldnames):
                if key not in row:
                    continue
                if not self.is_suppress_checks:
                    if key not in self.stat_fieldnames_check and key not in self.other_fieldname_check:
                        raise ValueError(
                                "File '{}', target {}, column {}: field '{}' not recognized".format(
                                    target_data_filepath, row_idx + 1, key_idx + 1, key))
                if key.startswith(self.stats_field_prefix):
                    target_stat_values.append(float(row[key]))
                else:
                    target_other_values.append(row[key])
            if self.rejection_criteria_type == "distance":
                posterior_indexes = self.filter_by_distance(
                        target_stat_values=target_stat_values,
                        max_distance=self.rejection_criteria_value)
            else:
                if self.rejection_criteria_type == "num":
                    num_to_retain = self.rejection_criteria_value
                elif self.rejection_criteria_type == "proportion":
                    # Retain a proportion of the simulated samples (not of the
                    # number of statistics fields).
                    num_to_retain = int(
                            self.rejection_criteria_value * len(self.stat_values))
                posterior_indexes = self.closest_values_indexes(
                        target_stat_values=target_stat_values,
                        num_to_retain=num_to_retain)
            with utility.universal_open(
                    output_prefix + ".{}.tsv".format(row_idx + 1), "w") as dest:
                dest.write(self.field_delimiter.join(
                        str(v) for v in self.other_fieldnames))
                if self.is_output_summary_stats:
                    dest.write(self.field_delimiter)  # separate the two field groups
                    dest.write(self.field_delimiter.join(
                            str(v) for v in self.stat_fieldnames))
                dest.write("\n")
                for distance, index in posterior_indexes:
                    dest.write(self.field_delimiter.join(
                            str(v) for v in self.other_values[index]))
                    if self.is_output_summary_stats:
                        dest.write(self.field_delimiter)
                        dest.write(self.field_delimiter.join(
                                str(v) for v in self.stat_values[index]))
                    dest.write("\n")
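# A minimal, self-contained sketch of the rejection step consumed above.
# ``closest_values_indexes`` is not shown in this excerpt; what follows is an
# assumption about its behavior: rank simulated samples by the Euclidean
# distance between their summary statistics and the target's, and retain the
# closest ``num_to_retain`` as (distance, index) pairs.
import heapq
import math

def closest_values_indexes_sketch(target_stat_values, simulated_stat_values, num_to_retain):
    """Return (distance, index) pairs for the ``num_to_retain`` simulated
    samples whose statistics lie closest to ``target_stat_values``."""
    distances = []
    for index, stats in enumerate(simulated_stat_values):
        d = math.sqrt(sum((t - s) ** 2 for t, s in zip(target_stat_values, stats)))
        distances.append((d, index))
    return heapq.nsmallest(num_to_retain, distances)

# Example: retain the 2 simulated samples closest to the target.
simulated = [[0.1, 0.2], [0.9, 0.8], [0.15, 0.25], [0.5, 0.5]]
print(closest_values_indexes_sketch([0.1, 0.2], simulated, num_to_retain=2))
# -> [(0.0, 0), (0.0707..., 2)]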
def _generate_parameter_file(self, fsc2_config_d):
    assert self.parameter_filepath
    with utility.universal_open(
            os.path.join(self.working_directory, self.parameter_filepath),
            "w") as dest:
        self._write_parameter_configuration(
                dest=dest,
                fsc2_config_d=fsc2_config_d)
def _parse_deme_site_frequencies(self, filepath, field_name_prefix, results_d):
    with utility.universal_open(filepath) as src:
        lines = src.read().split("\n")
        assert len(lines) == 4 and lines[3] == ""
        header_row = lines[1].split("\t")
        results_d_row = lines[2].split("\t")
        assert len(header_row) == len(results_d_row)
        for key, val in zip(header_row, results_d_row):
            if not val:
                continue
            results_d["{}.{}".format(field_name_prefix, key)] = float(val)
    return results_d
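# A standalone sketch of the per-deme SFS layout the parser above expects:
# exactly four lines (a title line, a tab-delimited header, a tab-delimited
# row of counts, and a trailing newline). The sample content below is an
# assumption modeled on fsc2-style per-population ".obs" output, not a real file.
SAMPLE_DEME_SFS = (
    "1 observations\n"
    "d0_0\td0_1\td0_2\n"
    "120\t30\t5\n"
)

def parse_deme_sfs_sketch(text, field_name_prefix, results_d):
    lines = text.split("\n")
    assert len(lines) == 4 and lines[3] == ""
    header_row = lines[1].split("\t")
    value_row = lines[2].split("\t")
    assert len(header_row) == len(value_row)
    for key, val in zip(header_row, value_row):
        if not val:
            continue
        results_d["{}.{}".format(field_name_prefix, key)] = float(val)
    return results_d

print(parse_deme_sfs_sketch(SAMPLE_DEME_SFS, "stat.sfs", {}))
# -> {'stat.sfs.d0_0': 120.0, 'stat.sfs.d0_1': 30.0, 'stat.sfs.d0_2': 5.0}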
def _parse_joint_site_frequencies(self, filepath, field_name_prefix, results_d):
    with utility.universal_open(filepath) as src:
        lines = src.read().split("\n")
        col_keys = lines[1].split("\t")[1:]
        row_idx = 0
        for line in lines[2:]:
            if not line:
                continue
            cols = line.split("\t")
            assert len(cols) - 1 == len(col_keys)
            row_key = cols[0]
            col_idx = 0
            for col_key, val in zip(col_keys, cols[1:]):
                # Fields are keyed by (row, column) index rather than by the
                # file's own row/column labels, i.e. not:
                # results_d["{}.{}.{}".format(field_name_prefix, row_key, col_key)] = float(val)
                results_d["{}.{}.{}".format(field_name_prefix, row_idx, col_idx)] = float(val)
                col_idx += 1
            row_idx += 1
    return results_d
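# A standalone sketch of the joint SFS matrix layout parsed above: a title
# line, a header line whose first cell is a corner label, then one labeled
# row per matrix row; keys are emitted by (row, column) index. The sample
# content is an assumption modeled on fsc2-style joint ".obs" output.
SAMPLE_JOINT_SFS = (
    "1 observations\n"
    "\td0_0\td0_1\n"
    "d1_0\t100\t10\n"
    "d1_1\t7\t3\n"
)

def parse_joint_sfs_sketch(text, field_name_prefix, results_d):
    lines = text.split("\n")
    col_keys = lines[1].split("\t")[1:]
    for row_idx, line in enumerate(l for l in lines[2:] if l):
        cols = line.split("\t")
        assert len(cols) - 1 == len(col_keys)
        for col_idx, val in enumerate(cols[1:]):
            results_d["{}.{}.{}".format(field_name_prefix, row_idx, col_idx)] = float(val)
    return results_d

print(parse_joint_sfs_sketch(SAMPLE_JOINT_SFS, "stat.jsfs", {}))
# -> {'stat.jsfs.0.0': 100.0, 'stat.jsfs.0.1': 10.0,
#     'stat.jsfs.1.0': 7.0, 'stat.jsfs.1.1': 3.0}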
def read_simulated_data(self, filepaths):
    for filepath in filepaths:
        self.run_logger.info("Reading simulation file: '{}'".format(filepath))
        with utility.universal_open(filepath) as src:
            reader = csv.DictReader(
                    src,
                    delimiter=self.field_delimiter,
                    quoting=csv.QUOTE_NONE)
            for row_idx, row in enumerate(reader):
                if self.logging_frequency and row_idx > 0 and row_idx % self.logging_frequency == 0:
                    self.run_logger.info("- Processing row {}".format(row_idx + 1))
                if self.all_fieldnames is None:
                    self.all_fieldnames = list(reader.fieldnames)
                    self.stat_fieldnames = []
                    self.other_fieldnames = []
                    for field in reader.fieldnames:
                        if field.startswith(self.stats_field_prefix):
                            self.stat_fieldnames.append(field)
                        else:
                            self.other_fieldnames.append(field)
                    self.stat_fieldnames_check = set(self.stat_fieldnames)
                    self.other_fieldname_check = set(self.other_fieldnames)
                row_stat_values = []
                row_other_values = []
                # Keys must be read in the same order on every row.
                for key_idx, key in enumerate(self.all_fieldnames):
                    if not self.is_suppress_checks:
                        if key not in self.stat_fieldnames_check and key not in self.other_fieldname_check:
                            raise ValueError(
                                    "File '{}', row {}, column {}: field '{}' not recognized".format(
                                        filepath, row_idx + 1, key_idx + 1, key))
                    if key.startswith(self.stats_field_prefix):
                        row_stat_values.append(float(row[key]))
                    else:
                        row_other_values.append(row[key])
                # assert len(row) == len(row_stat_values) + len(row_other_values)
                self.stat_values.append(row_stat_values)
                self.other_values.append(row_other_values)
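# A standalone sketch of how the reader above partitions columns: any field
# whose name starts with the stats prefix (default "stat") is parsed as a
# float summary statistic; everything else is carried through verbatim. The
# sample data and column names below are illustrative only.
import csv
import io

SAMPLE_TSV = "param.divTimeModel\tstat.sfs.d0_0\tstat.sfs.d0_1\nM1\t120\t30\n"

reader = csv.DictReader(io.StringIO(SAMPLE_TSV), delimiter="\t", quoting=csv.QUOTE_NONE)
stats_field_prefix = "stat"
for row in reader:
    stat_values = [float(row[k]) for k in reader.fieldnames if k.startswith(stats_field_prefix)]
    other_values = [row[k] for k in reader.fieldnames if not k.startswith(stats_field_prefix)]
    print(stat_values, other_values)
# -> [120.0, 30.0] ['M1']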
def write_results(self):
    for file_idx, file_info in enumerate(self.file_infos):
        output_filepath = self.compose_output_path_f(file_info.filepath, file_idx)
        self.run_logger.info("Writing file {} of {}: '{}'".format(
                file_idx + 1, len(self.file_infos), output_filepath))
        with utility.universal_open(output_filepath, "w") as dest:
            writer = utility.get_csv_writer(
                    dest=dest,
                    fieldnames=file_info.fieldnames,
                    delimiter=self.field_delimiter,
                    restval=self.missing_data_value)
            writer.writeheader()
            for data_row_idx in range(*file_info.data_row_idx_range):
                if self.logging_frequency and data_row_idx > 0 and (data_row_idx % self.logging_frequency) == 0:
                    self.run_logger.info("- Writing row {}".format(data_row_idx + 1))
                row = {}
                for field_name in file_info.fieldnames:
                    row[field_name] = self.fields[field_name][data_row_idx]
                writer.writerow(row)
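# A standalone sketch of the column-store-to-row writing pattern above.
# ``utility.get_csv_writer`` is not shown in this excerpt; it presumably wraps
# ``csv.DictWriter``, which this sketch uses directly. ``restval`` supplies
# the missing-data value for any field absent from a row dict.
import csv
import io

fields = {"param.a": [1, 2], "stat.x": [0.5, 0.7]}  # columns, indexed by row
fieldnames = ["param.a", "stat.x"]
dest = io.StringIO()
writer = csv.DictWriter(dest, fieldnames=fieldnames, delimiter="\t", restval="NA")
writer.writeheader()
for data_row_idx in range(2):
    writer.writerow({name: fields[name][data_row_idx] for name in fieldnames})
print(dest.getvalue())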
def main():
    parser = argparse.ArgumentParser()
    package_id = spectrasophy.package_id()
    parser.add_argument("--version", action="version", version=package_id)
    simulator_options = parser.add_argument_group("Simulation Configuration")
    simulator_options.add_argument(
            "configuration_filepath",
            metavar="CONFIGURATION-FILE",
            help="Path to file defining the simulation model and parameters.")
    output_options = parser.add_argument_group("Output Options")
    output_options.add_argument(
            "-o", "--output-name-prefix",
            action="store",
            dest="output_name_prefix",
            type=str,
            default=None,
            metavar="NAME-PREFIX",
            help="Prefix for output filenames (default: same as configuration filename stem).")
    output_options.add_argument(
            "-O", "--output-directory",
            action="store",
            dest="output_directory",
            type=str,
            default=None,
            metavar="DIRECTORY",
            help="Directory for output files (default: current working directory).")
    output_options.add_argument(
            "-U", "--unfolded-site-frequency-spectrum",
            "--derived-site-frequency-spectrum",
            action="store_true",
            default=False,
            help="Calculate the unfolded or derived site frequency spectrum."
                 " Otherwise, defaults to the folded or minor site frequency"
                 " spectrum.")
    output_options.add_argument(
            "--infinite-sites-model",
            action="store_true",
            default=False,
            help="Use infinite sites model instead of finite sites.")
    output_options.add_argument(
            "--calculate-single-population-site-frequency-spectrum",
            action="store_true",
            default=False,
            help="Calculate the single (within) population site frequency"
                 " spectrum in addition to the joint.")
    output_options.add_argument(
            "-l", "--labels",
            action="append",
            help="Additional field/value pairs to add to the output (in format <FIELD-NAME>:value;).")
    output_options.add_argument(
            "--field-delimiter",
            type=str,
            default="\t",
            help="Delimiter string separating fields in output (default: <TAB>).")
    output_options.add_argument(
            "--summary-stats-label-prefix",
            type=str,
            default="stat",
            metavar="PREFIX",
            help="Prefix for summary statistic field labels (default: '%(default)s').")
    output_options.add_argument(
            "--include-model-id-field",
            action="store_true",
            default=False,
            help="Include a 'model.id' field (with same value as 'param.divTimeModel' field) in output.")
    output_options.add_argument(
            "--append",
            action="store_true",
            default=False,
            help="Append instead of overwriting output file(s).")
    output_options.add_argument(
            "--no-write-header",
            action="store_true",
            default=False,
            help="Do not write header row.")
    run_options = parser.add_argument_group("Run Options")
    run_options.add_argument(
            "-n", "--num-reps",
            type=int,
            default=1,
            help="Number of replicates (default: %(default)s).")
    run_options.add_argument(
            "-m", "--num-processes",
            default=1,
            type=int,
            help="Number of processes/CPUs to run (default: %(default)s).")
    run_options.add_argument(
            "-z", "--random-seed",
            default=None,
            help="Seed for random number generator engine.")
    run_options.add_argument(
            "--log-frequency",
            default=None,
            type=int,
            help="Frequency that background progress messages get written to the log"
                 " (0: do not log informational messages).")
    run_options.add_argument(
            "--file-logging-level",
            default="none",
            choices=["debug", "info", "warning", "error", "critical", "none"],
            help="Message level threshold for file logs (default: %(default)s).")
    run_options.add_argument(
            "--stderr-logging-level",
            default="info",
            choices=["debug", "info", "warning", "error", "critical", "none"],
            help="Message level threshold for screen logs (default: %(default)s).")
    run_options.add_argument(
            "-w", "--working-directory-parent",
            action="store",
            type=str,
            default=None,
            help="Directory within which to create temporary directories and files.")
    run_options.add_argument(
            "--no-cleanup",
            action="store_true",
            default=False,
            help="Do not clean up temporary files.")
    run_options.add_argument(
            "--debug-mode",
            action="store_true",
            default=False,
            help="Run in debugging mode.")
    fsc2_options = parser.add_argument_group("FastSimCoal2 Options")
    fsc2_options.add_argument(
            "--fsc2-path",
            metavar="FSC2-PATH",
            default="fsc25",
            help="Path to FastSimCoal2 application (default: %(default)s).")
    args = parser.parse_args()
    config_d = {}
    utility.parse_legacy_configuration(
            filepath=args.configuration_filepath,
            config_d=config_d)
    config_d["output_prefix"] = utility.output_prefix(
            primary_source_filepath=args.configuration_filepath,
            output_name_prefix=args.output_name_prefix,
            output_directory=args.output_directory)
    if args.log_frequency is None:
        config_d["logging_frequency"] = int(args.num_reps / 10.0)
    elif args.log_frequency == 0:
        config_d["logging_frequency"] = None
    else:
        config_d["logging_frequency"] = args.log_frequency
    config_d["fsc2_path"] = args.fsc2_path
    config_d["file_logging_level"] = args.file_logging_level
    config_d["standard_error_logging_level"] = args.stderr_logging_level
    config_d["is_unfolded_site_frequency_spectrum"] = args.unfolded_site_frequency_spectrum
    config_d["is_calculate_single_population_sfs"] = args.calculate_single_population_site_frequency_spectrum
    config_d["is_calculate_joint_population_sfs"] = True
    config_d["is_infinite_sites_model"] = args.infinite_sites_model
    config_d["stat_label_prefix"] = args.summary_stats_label_prefix
    config_d["supplemental_labels"] = utility.parse_fieldname_and_value(args.labels)
    config_d["field_delimiter"] = args.field_delimiter
    config_d["is_include_model_id_field"] = args.include_model_id_field
    with utility.TemporaryDirectory(
            prefix="spectrasophy-",
            parent_dir=args.working_directory_parent,
            is_suppress_cleanup=args.no_cleanup) as working_directory:
        config_d["working_directory"] = working_directory
        simulator = simulate.SpectrasophySimulator(
                config_d=config_d,
                num_processes=args.num_processes,
                is_verbose_setup=True,
                package_id=package_id)
        filepath = config_d["output_prefix"] + ".sumstats.tsv"
        dest = utility.universal_open(filepath, "a" if args.append else "w")
        if args.append or args.no_write_header:
            is_write_header = False
        else:
            is_write_header = True
        with dest:
            try:
                results = simulator.execute(
                        nreps=args.num_reps,
                        dest=dest,
                        results_store=None,
                        is_write_header=is_write_header)
            except Exception as e:
                sys.stderr.write(
                        "Traceback (most recent call last):\n {}{}\n".format(
                            " ".join(traceback.format_tb(sys.exc_info()[2])),
                            e))
                sys.exit(1)
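# A standalone sketch of the "-l/--labels" value format documented above
# ("<FIELD-NAME>:value"). ``utility.parse_fieldname_and_value`` is not shown
# in this excerpt; this is an assumption about its contract: map each
# "name:value" entry to a field added verbatim to every output row.
def parse_fieldname_and_value_sketch(labels):
    if not labels:
        return {}
    parsed = {}
    for label in labels:
        name, _, value = label.partition(":")
        parsed[name] = value
    return parsed

print(parse_fieldname_and_value_sketch(["locality:Alpha", "run:3"]))
# -> {'locality': 'Alpha', 'run': '3'}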
def read_files(self, filepaths):
    for file_idx, filepath in enumerate(filepaths):
        self.run_logger.info("Reading file {} of {}: '{}'".format(
                file_idx + 1, len(filepaths), filepath))
        with utility.universal_open(filepath) as src:
            self._read_file(src)
def summarize(self, target_data_filepath):
    with utility.universal_open(target_data_filepath) as src:
        reader = csv.DictReader(
                src,
                delimiter=self.field_delimiter,
                quoting=csv.QUOTE_NONE)
        categorical_params = collections.OrderedDict()
        continuous_params = collections.OrderedDict()
        for row_idx, row in enumerate(reader):
            for key_idx, key in enumerate(reader.fieldnames):
                if key in categorical_params:
                    categorical_params[key][row[key]] += 1
                elif key in continuous_params:
                    continuous_params[key].append(float(row[key]))
                else:
                    if key in ("param.DivTimeModel", "param.numDivTimes"):
                        val = row[key]
                        is_categorical = True
                    else:
                        try:
                            val = float(row[key])
                            is_categorical = False
                        except ValueError:
                            val = row[key]
                            is_categorical = True
                    if is_categorical:
                        categorical_params[key] = collections.Counter()
                        categorical_params[key][val] += 1
                    else:
                        continuous_params[key] = [val]
    output_prefix = os.path.splitext(os.path.basename(target_data_filepath))[0]
    with utility.universal_open(output_prefix + ".summary.continuous.tsv", "w") as dest:
        row_results = collections.OrderedDict()
        for param_idx, param_name in enumerate(continuous_params):
            values = continuous_params[param_name]
            row_results["param"] = param_name
            summary = statistics.summarize(values)
            row_results["mean"] = summary["mean"]
            row_results["var"] = summary["var"]
            row_results["sd"] = summary["sd"]
            row_results["min"] = summary["range"][0]
            row_results["max"] = summary["range"][1]
            row_results["hpd5"] = summary["hpd95"][0]
            row_results["hpd95"] = summary["hpd95"][1]
            try:
                row_results["quant5"] = summary["quant_5_95"][0]
                row_results["quant95"] = summary["quant_5_95"][1]
            except TypeError:
                row_results["quant5"] = "NA"
                row_results["quant95"] = "NA"
            if param_idx == 0:
                dest.write(self.field_delimiter.join(row_results.keys()) + "\n")
            dest.write(self.field_delimiter.join(
                    "{}".format(v) for v in row_results.values()) + "\n")
    for param_idx, param_name in enumerate(categorical_params):
        with utility.universal_open(
                output_prefix + ".summary.{:02d}.{}.tsv".format(
                    param_idx + 1, param_name),
                "w") as dest:
            param_counter = categorical_params[param_name]
            total = float(sum(param_counter.values()))
            for category_idx, (category_name, category_count) in enumerate(
                    param_counter.most_common()):
                row_results = collections.OrderedDict()
                row_results["label"] = category_name
                row_results["freq"] = category_count / total
                row_results["count"] = category_count
                if category_idx == 0:
                    dest.write(self.field_delimiter.join(row_results.keys()) + "\n")
                dest.write(self.field_delimiter.join(
                        "{}".format(v) for v in row_results.values()) + "\n")
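# A standalone sketch of the summary dictionary consumed above. The project's
# ``statistics.summarize`` is not shown in this excerpt; this sketch computes
# the same keys ("mean", "var", "sd", "range", "hpd95") under the assumption
# that the 95% HPD is the narrowest interval containing 95% of the samples.
import math

def summarize_sketch(values):
    n = len(values)
    mean = sum(values) / n
    var = sum((v - mean) ** 2 for v in values) / (n - 1)  # sample variance
    sd = math.sqrt(var)
    s = sorted(values)
    k = max(1, int(math.ceil(0.95 * n)))  # number of samples inside the HPD
    i = min(range(n - k + 1), key=lambda j: s[j + k - 1] - s[j])
    return {
        "mean": mean,
        "var": var,
        "sd": sd,
        "range": (s[0], s[-1]),
        "hpd95": (s[i], s[i + k - 1]),
    }

print(summarize_sketch([1.0, 1.1, 1.2, 1.3, 5.0]))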
def main():
    parser = argparse.ArgumentParser()
    package_id = spectrasophy.package_id()
    parser.add_argument("--version", action="version", version=package_id)
    simulator_options = parser.add_argument_group("Configuration")
    simulator_options.add_argument(
            "configuration_filepath",
            metavar="CONFIGURATION-FILE",
            help="Path to the configuration file listing the data.")
    output_options = parser.add_argument_group("Output Options")
    output_options.add_argument(
            "-o", "--output-name-prefix",
            action="store",
            dest="output_name_prefix",
            type=str,
            default=None,
            metavar="NAME-PREFIX",
            help="Prefix for output filenames (default: same as configuration filename stem).")
    output_options.add_argument(
            "-O", "--output-directory",
            action="store",
            dest="output_directory",
            type=str,
            default=None,
            metavar="DIRECTORY",
            help="Directory for output files (default: current working directory).")
    output_options.add_argument(
            "-U", "--unfolded-site-frequency-spectrum",
            "--derived-site-frequency-spectrum",
            action="store_true",
            default=False,
            help="Calculate the unfolded or derived site frequency spectrum."
                 " Otherwise, defaults to the folded or minor site frequency"
                 " spectrum.")
    output_options.add_argument(
            "--calculate-single-population-site-frequency-spectrum",
            action="store_true",
            default=False,
            help="Calculate the single (within) population site frequency"
                 " spectrum in addition to the joint.")
    output_options.add_argument(
            "-l", "--labels",
            action="append",
            help="Additional field/value pairs to add to the output (in format <FIELD-NAME>:value;).")
    output_options.add_argument(
            "--field-delimiter",
            type=str,
            default="\t",
            help="Delimiter string separating fields in output (default: <TAB>).")
    output_options.add_argument(
            "--summary-stats-label-prefix",
            type=str,
            default="stat",
            metavar="PREFIX",
            help="Prefix for summary statistic field labels (default: '%(default)s').")
    output_options.add_argument(
            "--append",
            action="store_true",
            default=False,
            help="Append instead of overwriting output file(s).")
    output_options.add_argument(
            "--no-write-header",
            action="store_true",
            default=False,
            help="Do not write header row.")
    args = parser.parse_args()
    config_d = {}
    utility.parse_legacy_configuration(
            filepath=args.configuration_filepath,
            config_d=config_d)
    config_d["output_prefix"] = utility.output_prefix(
            primary_source_filepath=args.configuration_filepath,
            output_name_prefix=args.output_name_prefix,
            output_directory=args.output_directory)
    config_d["is_unfolded_site_frequency_spectrum"] = args.unfolded_site_frequency_spectrum
    config_d["is_calculate_single_population_sfs"] = args.calculate_single_population_site_frequency_spectrum
    config_d["is_calculate_joint_population_sfs"] = True
    config_d["stat_label_prefix"] = args.summary_stats_label_prefix
    config_d["supplemental_labels"] = utility.parse_fieldname_and_value(args.labels)
    config_d["alignment_directory_head"] = os.path.dirname(
            os.path.abspath(args.configuration_filepath))
    config_d["field_delimiter"] = args.field_delimiter
    sscalc = sumstats.SpectrasophySummaryStatsCalculator(**config_d)
    filepath = config_d["output_prefix"] + ".obs.sumstats.tsv"
    dest = utility.universal_open(filepath, "a" if args.append else "w")
    if args.append or args.no_write_header:
        is_write_header = False
    else:
        is_write_header = True
    with dest:
        try:
            results = sscalc.write_summary_stats(
                    dest=dest,
                    results_store=None,
                    is_write_header=is_write_header)
        except Exception as e:
            sys.stderr.write(
                    "Traceback (most recent call last):\n {}{}\n".format(
                        " ".join(traceback.format_tb(sys.exc_info()[2])),
                        e))
            sys.exit(1)