def profile_ksstatistic(cache_dir, group_name, control_filter, plate_group,
                        filter=None, parallel=Uniprocessing(),
                        normalization=RobustLinearNormalization,
                        preprocess_file=None):
    group, colnames_group = cpa.db.group_map(group_name, reverse=True,
                                             filter=filter)
    control_images_by_plate = images_by_plate(control_filter, plate_group)
    plate_by_image = dict((row[:-2], tuple(row[-2:-1]))
                          for row in cpa.db.GetPlatesAndWellsPerImage())

    def control_images(treated_images):
        if plate_group is None:
            return control_images_by_plate[None]
        else:
            return list(set(r for image in treated_images
                            for r in control_images_by_plate[plate_by_image[image]]))

    keys = group.keys()
    parameters = [(cache_dir, group[k], control_images(group[k]),
                   normalization.__name__, preprocess_file)
                  for k in keys]

    if preprocess_file:
        preprocessor = cpa.util.unpickle1(preprocess_file)
        variables = preprocessor.variables
    else:
        cache = Cache(cache_dir)
        variables = normalization(cache).colnames

    return Profiles.compute(keys, variables, _compute_ksstatistic,
                            parameters, parallel=parallel,
                            group_name=group_name)
def profile_svmnormalvector(cache_dir, group_name, control_filter,
                            filter=None, rfe=False, ipython_profile=None,
                            job=None):
    cache = Cache(cache_dir)
    group, colnames_group = cpa.db.group_map(group_name, reverse=True,
                                             filter=filter)
    variables = RobustLinearNormalization(cache).colnames
    control_images_by_plate = images_by_plate(control_filter)
    plate_by_image = dict((row[:-2], row[-2])
                          for row in cpa.db.GetPlatesAndWellsPerImage())

    def control_images(treated_images):
        return [r for image in treated_images
                for r in control_images_by_plate[plate_by_image[image]]]

    keys = group.keys()
    parameters = [(cache_dir, group[k], control_images(group[k]), rfe)
                  for k in keys]

    if job:
        # Run a single job (1-indexed). Wrap the worker in memoized() before
        # calling it, mirroring the batch path below; the original
        # memoized(_compute_svmnormalvector(parameters[i])) called the worker
        # first and memoized its result, which defeats the cache.
        i = job - 1
        memoized(_compute_svmnormalvector)(parameters[i])
    else:
        return Profiles.compute(keys, variables,
                                memoized(_compute_svmnormalvector),
                                parameters, ipython_profile,
                                group_name=group_name)
def profile_ksstatistic(cache_dir, group_name, control_filter, filter=None,
                        ipython_profile=None):
    cache = Cache(cache_dir)
    group, colnames_group = cpa.db.group_map(group_name, reverse=True,
                                             filter=filter)
    variables = RobustLinearNormalization(cache).colnames
    control_images_by_plate = images_by_plate(control_filter)
    plate_by_image = dict((row[:-2], row[-2])
                          for row in cpa.db.GetPlatesAndWellsPerImage())

    def control_images(treated_images):
        return [r for image in treated_images
                for r in control_images_by_plate[plate_by_image[image]]]

    keys = group.keys()
    parameters = [(cache_dir, group[k], control_images(group[k]))
                  for k in keys]

    return Profiles.compute(keys, variables, _compute_ksstatistic,
                            parameters, ipython_profile,
                            group_name=group_name)
def profile_ksstatistic(cache_dir, group_name, control_filter, plate_group,
                        filter=None, parallel=Uniprocessing(),
                        normalization=RobustLinearNormalization,
                        preprocess_file=None):
    group, colnames_group = cpf.db.group_map(group_name, reverse=True,
                                             filter=filter)
    control_images_by_plate = images_by_plate(control_filter, plate_group)
    plate_by_image = dict((row[:-2], tuple(row[-2:-1]))
                          for row in cpf.db.GetPlatesAndWellsPerImage())

    def control_images(treated_images):
        if plate_group is None:
            return control_images_by_plate[None]
        else:
            return list(set(r for image in treated_images
                            for r in control_images_by_plate[plate_by_image[image]]))

    keys = group.keys()
    parameters = [(cache_dir, group[k], control_images(group[k]),
                   normalization.__name__, preprocess_file)
                  for k in keys]

    if preprocess_file:
        preprocessor = cpf.util.unpickle1(preprocess_file)
        variables = preprocessor.variables
    else:
        cache = Cache(cache_dir)
        variables = normalization(cache).colnames

    return Profiles.compute(keys, variables, _compute_ksstatistic,
                            parameters, parallel=parallel,
                            group_name=group_name)
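# Usage sketch (illustrative only): the cache path, group name, and control
# filter name below are hypothetical, and Profiles is assumed to offer a
# save() method as elsewhere in this package's profiling tools.
#
#   profiles = profile_ksstatistic('/path/to/cache', 'CompoundConcentration',
#                                  'negative_controls', plate_group=None)
#   profiles.save('ks_profiles.txt')
#
# profile_svmnormalvector follows the same call pattern; pass job=N
# (1-indexed) to compute only the Nth group's normal vector, e.g. when
# fanning the work out as a cluster array job.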
class Spats(object):
    """The main SPATS driver.

       :param cotrans: pass `True` for cotrans-style experiments.
    """

    def __init__(self, cotrans = False):
        self.run = Run()
        self.run.cotrans = cotrans
        self.__processor = None
        self._targets = None
        self._masks = None
        self._profiles = None
        self.force_mask = None

    @property
    def _processor(self):
        if not self.__processor:
            self._addMasks()
            self.__processor = self.run._get_processor_class()(self.run, self._targets, self._masks)
        return self.__processor

    @property
    def targets(self):
        return self._targets

    def _addMasks(self):
        if not self._masks:
            self._masks = [ Mask(m) for m in self.run.masks ]

    def reset_processor(self):
        self.__processor = None

    def addTargets(self, *target_paths):
        """Used to add one or more target files for processing. Can be called
           multiple times to add more targets. Inputs are expected to be in
           FASTA format with one or more targets per path. Must be called
           before processing.

           :param target_paths: one or more filesystem paths to target files.
        """
        targets = []
        for path in target_paths:
            for name, seq in fasta_parse(path):
                targets.append((name, seq, 1 + len(targets)))
        self._addTargets(targets)

    def addTarget(self, name, seq, rowid = -1):
        self._addTargets( [ (name, seq, rowid if rowid != -1 else 0 if self._targets is None else len(self._targets.targets)) ] )

    def loadTargets(self, pair_db):
        self._addTargets(pair_db.targets())

    def _addTargets(self, target_list):
        targets = self._targets or Targets()
        for name, seq, rowid in target_list:
            targets.addTarget(name, seq.upper().replace('U', 'T'), rowid)
        if not targets.targets:
            raise Exception("didn't get any targets!")
        targets.minimum_match_length = self.run.minimum_target_match_length
        self._targets = targets

    def process_pair(self, pair):
        """Used to process a single :class:`.pair.Pair`. Typically only used
           for debugging or analysis of specific cases.

           :param pair: a :class:`.pair.Pair` to process.
        """
        if not self.run.pair_length:
            self.run.pair_length = len(pair.r1.original_seq)
        _set_debug(self.run)
        _debug("> processing " + pair.identifier + "\n --> " + pair.r1.original_seq + " , " + pair.r2.original_seq)
        _debug("  rc(R1): {}".format(pair.r1.reverse_complement))
        try:
            self._processor.process_pair(pair)
            if pair.failure:
                _debug(pair.failure)
            else:
                assert(pair.has_site)
                _debug("  ===> KEPT {}-{}".format(pair.site, pair.end))
        except:
            print("**** Error processing pair: {} / {}".format(pair.r1.original_seq, pair.r2.original_seq))
            raise

    def _memory_db_from_pairs(self, data_r1_path, data_r2_path):
        if not self.run.quiet:
            print("Parsing pair data...")
        start = time.time()
        db = PairDB()
        total_pairs = db.parse(data_r1_path, data_r2_path)
        report = "Parsed {} records in {:.1f}s".format(total_pairs, time.time() - start)

        # unclear if this helps, but potentially useful for further analysis
        # later, and doesn't cost *too* much; but if it's holding things up,
        # nuke it
        db.index()
        report += ", indexed in {:.1f}s".format(time.time() - start)

        if self.run.quiet:
            _debug(report)
        else:
            print(report)
        return db

    def process_pair_data(self, data_r1_path, data_r2_path, force_mask = None):
        """Used to read and process a pair of FASTQ data files.

           Note that this parses the pair data into an in-memory SQLite
           database, which on most modern systems will be fine except for the
           largest input sets. If you hit memory issues, create a disk-based
           SQLite DB via :class:`.db.PairDB` and then use
           :meth:`.process_pair_db`.

           Note that this may be called multiple times to process more than
           one set of data files before computing profiles.

           :param data_r1_path: path to R1 fragments

           :param data_r2_path: path to matching R2 fragments.
        """
        self.run._force_mask = force_mask
        self.run.apply_config_restrictions()
        self.force_mask = Mask(force_mask) if force_mask else None
        use_quality = self.run._parse_quality
        if not self.run.skip_database and not use_quality:
            self.process_pair_db(self._memory_db_from_pairs(data_r1_path, data_r2_path))
        else:
            with FastFastqParser(data_r1_path, data_r2_path, use_quality) as parser:
                if not self.run.pair_length:
                    self.run.pair_length = parser.pair_length()
                self._process_pair_iter(parser.iterator(batch_size = 131072))

    def process_pair_db(self, pair_db, batch_size = 65536):
        """Processes pair data provided by a :class:`.db.PairDB`.

           Note that this may be called multiple times to process more than
           one set of inputs before computing profiles.

           :param pair_db: a :class:`.db.PairDB` of pairs to process.
        """
        self.run.apply_config_restrictions()
        if not self.run.pair_length:
            self.run.pair_length = pair_db.pair_length()
        if not self._targets:
            self.loadTargets(pair_db)
        result_set_id = pair_db.add_result_set(self.run.result_set_name or "default", self.run.resume_processing) if self.run.writeback_results else None
        if self._processor.uses_tags:
            self._processor.setup_tags(pair_db)
        if self.run.resume_processing:
            db_iter = pair_db.unique_pairs_with_counts_and_no_results(result_set_id, batch_size = batch_size)
        elif self.run._redo_tag:
            db_iter = pair_db.unique_pairs_with_counts_and_tag(self.run.cmp_set_id, self.run._redo_tag, batch_size = batch_size)
        elif self.run._process_all_pairs:
            if not self.run.quiet:
                print("Using all_pairs...")
            db_iter = pair_db.all_pairs(batch_size = batch_size)
        else:
            db_iter = pair_db.unique_pairs_with_counts(batch_size = batch_size)
        self._process_pair_iter(db_iter, pair_db, result_set_id)

    #@profile
    def _process_pair_iter(self, pair_iter, pair_db = None, result_set_id = None):
        _set_debug(self.run)
        start = time.time()

        # force the processor to load and do whatever indexing/etc is required
        self._processor

        worker = SpatsWorker(self.run, self._processor, pair_db, result_set_id)
        if not self.run.quiet:
            print("Processing pairs{}...".format(" with mask={}".format(self.force_mask.chars) if self.force_mask else ""))
        worker.force_mask = self.force_mask
        worker.run(pair_iter)

        if not self.run.quiet:
            self._report_counts(time.time() - start)

    def _report_counts(self, delta = None):
        counters = self.counters
        total = counters.total_pairs
        print("Successfully processed {} properly paired fragments:".format(counters.registered_pairs))
        warn_keys = [ "multiple_R1_match", ]
        skip_keypat = re.compile("(prefix_)|(mut_count_)|(indel_len)")
        skipped_some = False
        countinfo = counters.counts_dict()
        for key in sorted(countinfo.keys(), key = lambda k : countinfo[k], reverse = True):
            if skip_keypat.search(key):
                skipped_some = True
                continue
            print("  {}{} : {} ({:.1f}%)".format("*** " if key in warn_keys else "", key, countinfo[key], 100.0 * (float(countinfo[key]) / float(total)) if total else 0))
        print("Masks:")
        for m in self._masks:
            kept, total = counters.mask_kept(m), counters.mask_total(m)
            print("  {}: kept {}/{} ({:.1f}%)".format(m.chars, kept, total, (100.0 * float(kept)) / float(total) if total else 0))
        if 1 < len(self._targets.targets):
            print("Targets:")
            tmap = { t.name : counters.target_total(t) for t in self._targets.targets }
            total = counters.registered_pairs
            for tgt in sorted(self._targets.targets, key = lambda t : tmap[t.name], reverse = True):
                if tmap[tgt.name] > 0:
                    print("  {}: {} ({:.1f}%)".format(tgt.name, tmap[tgt.name], (100.0 * float(tmap[tgt.name])) / float(total) if total else 0))
        if skipped_some:
            print("Some counters not printed above; use 'spats_tool dump ...' commands to obtain.")
        if delta:
            print("Total time: ({:.1f}s)".format(delta))

    @property
    def counters(self):
        """Returns the underlying :class:`.counters.Counters` object, which
           contains information about site and tag counts.
        """
        return self._processor.counters

    def compute_profiles(self):
        """Computes beta/theta/c reactivity values after pair data have been
           processed.

           :return: a :class:`.profiles.Profiles` object, which contains the
               reactivities for all targets.
        """
        self._profiles = Profiles(self._targets, self.run, self._processor.counters)
        self._profiles.compute()
        return self._profiles

    def write_reactivities(self, output_path):
        """Convenience function used to write the reactivities to an output
           file. Must be called after :meth:`.compute_profiles`.

           :param output_path: the path for writing the output.
        """
        self._profiles.write(output_path)

    def store(self, output_path):
        """Saves the state of the SPATS run for later processing.

           :param output_path: the path for writing the output. Recommended
               file extension is `.spats`
        """
        if os.path.exists(output_path):
            os.remove(output_path)
        pair_db = PairDB(output_path)
        pair_db.store_run(self.run)
        pair_db.add_targets(self.targets)
        pair_db.store_counters("spats", self.counters)

    def load(self, input_path):
        """Loads SPATS state from a file.

           :param input_path: the path of a previously saved SPATS session.
        """
        pair_db = PairDB(input_path)
        pair_db.load_run(self.run)
        self.loadTargets(pair_db)
        pair_db.load_counters("spats", self.counters)

    def validate_results(self, data_r1_path, data_r2_path, algorithm = "find_partial", verbose = False):
        """Used to validate the results of the current run against a different
           algorithm. Must be run after :meth:`.process_pair_data`, or after
           loading the data (:meth:`.load`) from a previously-run session.

           :param data_r1_path: path to R1 fragments

           :param data_r2_path: path to matching R2 fragments.

           :param algorithm: Generally the default is correct, but you can
               select a particular algorithm for data validation (see
               :attr:`.run.Run.algorithm`).

           :param verbose: set to `True` for detailed output of mismatched
               sites.

           :return: `True` if results validate, `False` otherwise.
        """
        original_algorithm = self.run.algorithm
        if original_algorithm == algorithm:
            raise Exception("Validation cannot be run using the same algorithm.")
        if not self.counters.registered_dict():
            raise Exception("Normal SPATS run required first in order to validate the results.")
        other = Spats()
        other.run.load_from_config(self.run.config_dict())
        other.run.algorithm = algorithm
        other._targets = self._targets
        other.process_pair_data(data_r1_path, data_r2_path)
        match_count, total = self.compare_results(other, verbose = verbose)
        if match_count == total:
            print("Original results ({} algorithm) validated using {} algorithm, {} registered sites match.".format(original_algorithm, algorithm, match_count))
            return True
        else:
            print("Validation FAILURE: results ({} algorithm) only match {}/{} registered sites (when validated using {} algorithm).".format(original_algorithm, match_count, total, algorithm))
            return False

    def compare_results(self, other_spats, verbose = False):
        """Used to compare the results of the current run against another
           SPATS instance. Must be run after :meth:`.process_pair_data`, or
           after loading the data (:meth:`.load`) from a previously-run
           session.

           :param other_spats: :class:`.Spats` instance to compare.

           :param verbose: set to `True` for detailed output of mismatched
               sites.

           :return: `(match_count, total)` : `match_count` indicates the
               number of sites matched, `total` indicates total number of
               sites.
        """
        our_counts = self.counters.registered_dict()
        their_counts = other_spats.counters.registered_dict()
        match_count = 0
        total = 0
        for key, value in our_counts.iteritems():
            total += 1
            if their_counts.get(key, 0) == value:
                match_count += 1
            elif verbose:
                print("Mismatch {}: {} != {}".format(key, value, their_counts.get(key, 0)))
        return match_count, total
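# Usage sketch (illustrative; the FASTA/FASTQ paths are hypothetical). This
# is the typical driver flow described in the docstrings above:
#
#   spats = Spats()
#   spats.addTargets('targets.fa')
#   spats.process_pair_data('data_R1.fastq', 'data_R2.fastq')
#   profiles = spats.compute_profiles()
#   spats.write_reactivities('reactivities.out')
#
# For inputs too large for an in-memory SQLite DB, build a disk-based PairDB
# and call process_pair_db() instead, per process_pair_data's docstring.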
class Spats(object):
    """The main SPATS driver.

       :param cotrans: pass `True` for cotrans-style experiments.
    """

    def __init__(self, cotrans = False):
        self.run = Run()
        self.run.cotrans = cotrans
        self.__processor = None
        self._targets = None
        self._masks = None
        self._profiles = None
        self.force_mask = None

    @property
    def _processor(self):
        if not self.__processor:
            self._addMasks()
            self.__processor = self.run._get_processor_class()(self.run, self._targets, self._masks)
        return self.__processor

    @property
    def targets(self):
        return self._targets

    def _addMasks(self):
        if not self._masks:
            # substitute placeholders for any empty mask entries
            pl = iter([ PLUS_PLACEHOLDER, MINUS_PLACEHOLDER ])
            self._masks = [ Mask(m if m else next(pl)) for m in self.run.masks ]

    def reset_processor(self):
        self.__processor = None

    def addTargets(self, *target_paths):
        """Used to add one or more target files for processing. Can be called
           multiple times to add more targets. Inputs are expected to be in
           FASTA format with one or more targets per path. Must be called
           before processing.

           :param target_paths: one or more filesystem paths to target files.
        """
        targets = []
        for path in target_paths:
            for name, seq in fasta_parse(path):
                targets.append((name, seq, 1 + len(targets)))
        self._addTargets(targets)

    def addTarget(self, name, seq, rowid = -1):
        self._addTargets( [ (name, seq, rowid if rowid != -1 else 0 if self._targets is None else len(self._targets.targets)) ] )

    def loadTargets(self, pair_db):
        self._addTargets(pair_db.targets())

    def _addTargets(self, target_list):
        targets = self._targets or Targets()
        for name, seq, rowid in target_list:
            targets.addTarget(name, seq.upper().replace('U', 'T'), rowid)
        if not targets.targets:
            raise Exception("didn't get any targets!")
        targets.minimum_match_length = self.run.minimum_target_match_length
        self._targets = targets

    def merge_targets(self, pair_db):
        assert(self._targets)
        self._targets.minimum_match_length = min(self._targets.minimum_match_length, self.run.minimum_target_match_length)
        for name, seq, rowid in pair_db.targets():
            self._targets.merge_target(name, seq.upper().replace('U', 'T'), rowid)

    def process_pair(self, pair):
        """Used to process a single :class:`.pair.Pair`. Typically only used
           for debugging or analysis of specific cases.

           :param pair: a :class:`.pair.Pair` to process.
        """
        if not self.run.pair_length:
            self.run.pair_length = len(pair.r1.original_seq)
        _set_debug(self.run)
        _debug("> processing " + pair.identifier + "\n --> " + pair.r1.original_seq + " , " + pair.r2.original_seq)
        _debug("  rc(R1): {}".format(pair.r1.reverse_complement))
        try:
            self._processor.process_pair(pair)
            if pair.failure:
                _debug(pair.failure)
            else:
                assert(pair.has_site)
                _debug("  ===> KEPT {}-{}".format(pair.site, pair.end))
        except:
            print("**** Error processing pair: {} / {}".format(pair.r1.original_seq, pair.r2.original_seq))
            raise

    def _memory_db_from_pairs(self, data_r1_path, data_r2_path):
        if not self.run.quiet:
            print("Parsing pair data...")
        start = time.time()
        db = PairDB()
        total_pairs = db.parse(data_r1_path, data_r2_path)
        report = "Parsed {} records in {:.1f}s".format(total_pairs, time.time() - start)

        # unclear if this helps, but potentially useful for further analysis
        # later, and doesn't cost *too* much; but if it's holding things up,
        # nuke it
        db.index()
        report += ", indexed in {:.1f}s".format(time.time() - start)

        if self.run.quiet:
            _debug(report)
        else:
            print(report)
        return db

    def process_pair_data(self, data_r1_path, data_r2_path, force_mask = None):
        """Used to read and process a pair of FASTQ data files.

           Note that this parses the pair data into an in-memory SQLite
           database, which on most modern systems will be fine except for the
           largest input sets. If you hit memory issues, create a disk-based
           SQLite DB via :class:`.db.PairDB` and then use
           :meth:`.process_pair_db`.

           Note that this may be called multiple times to process more than
           one set of data files before computing profiles.

           :param data_r1_path: path to R1 fragments

           :param data_r2_path: path to matching R2 fragments.
        """
        self.run._force_mask = force_mask
        self.run.apply_config_restrictions()
        self.force_mask = Mask(force_mask) if force_mask else None
        use_quality = self.run._parse_quality
        if not self.run.skip_database and not use_quality:
            self.process_pair_db(self._memory_db_from_pairs(data_r1_path, data_r2_path))
        else:
            with FastFastqParser(data_r1_path, data_r2_path, use_quality) as parser:
                if not self.run.pair_length:
                    self.run.pair_length = parser.pair_length()
                self._process_pair_iter(parser.iterator(batch_size = 131072))

    def process_pair_db(self, pair_db, batch_size = 65536):
        """Processes pair data provided by a :class:`.db.PairDB`.

           Note that this may be called multiple times to process more than
           one set of inputs before computing profiles.

           :param pair_db: a :class:`.db.PairDB` of pairs to process.
        """
        self.run.apply_config_restrictions()
        if not self.run.pair_length:
            self.run.pair_length = pair_db.pair_length()
        if not self._targets:
            self.loadTargets(pair_db)
        result_set_id = pair_db.add_result_set(self.run.result_set_name or "default", self.run.resume_processing) if self.run.writeback_results else None
        if self._processor.uses_tags:
            self._processor.setup_tags(pair_db)
        if self.run.resume_processing:
            db_iter = pair_db.unique_pairs_with_counts_and_no_results(result_set_id, batch_size = batch_size)
        elif self.run._redo_tag:
            db_iter = pair_db.unique_pairs_with_counts_and_tag(self.run.cmp_set_id, self.run._redo_tag, batch_size = batch_size)
        elif self.run._process_all_pairs:
            if not self.run.quiet:
                print("Using all_pairs...")
            db_iter = pair_db.all_pairs(batch_size = batch_size)
        else:
            db_iter = pair_db.unique_pairs_with_counts(batch_size = batch_size)
        self._process_pair_iter(db_iter, pair_db, result_set_id)

    #@profile
    def _process_pair_iter(self, pair_iter, pair_db = None, result_set_id = None):
        _set_debug(self.run)
        start = time.time()

        # force the processor to load and do whatever indexing/etc is required
        self._processor

        worker = SpatsWorker(self.run, self._processor, pair_db, result_set_id, self.force_mask)
        if not self.run.quiet:
            print("Processing pairs{}...".format(" with mask='{}'".format(self.force_mask.chars) if self.force_mask else ""))
        worker.run(pair_iter)

        if not self.run.quiet:
            self._report_counts(time.time() - start)

    def _report_counts(self, delta = None):
        counters = self.counters
        total = counters.total_pairs
        print("Successfully processed {} properly paired fragments:".format(counters.registered_pairs))
        warn_keys = [ "multiple_R1_match", ]
        skip_keypat = re.compile("(prefix_)|(mut_count_)|(indel_len)")
        skipped_some = False
        countinfo = counters.counts_dict()
        for key in sorted(countinfo.keys(), key = lambda k : countinfo[k], reverse = True):
            if skip_keypat.search(key):
                skipped_some = True
                continue
            print("  {}{} : {} ({:.1f}%)".format("*** " if key in warn_keys else "", key, countinfo[key], 100.0 * (float(countinfo[key]) / float(total)) if total else 0))
        print("Masks:")
        for m in self._masks:
            kept, total = counters.mask_kept(m), counters.mask_total(m)
            print("  {}: kept {}/{} ({:.1f}%)".format((m.empty_place_holder if m.empty_place_holder else m.chars), kept, total, (100.0 * float(kept)) / float(total) if total else 0))
        if 1 < len(self._targets.targets):
            print("Targets:")
            tmap = { t.name : counters.target_total(t) for t in self._targets.targets }
            total = counters.registered_pairs
            for tgt in sorted(self._targets.targets, key = lambda t : tmap[t.name], reverse = True):
                if tmap[tgt.name] > 0:
                    print("  {}: {} ({:.1f}%)".format(tgt.name, tmap[tgt.name], (100.0 * float(tmap[tgt.name])) / float(total) if total else 0))
        if skipped_some:
            print("Some counters not printed above; use 'spats_tool dump ...' commands to obtain.")
        if delta:
            print("Total time: ({:.1f}s)".format(delta))

    @property
    def counters(self):
        """Returns the underlying :class:`.counters.Counters` object, which
           contains information about site and tag counts.
        """
        return self._processor.counters

    def compute_profiles(self):
        """Computes beta/theta/c reactivity values after pair data have been
           processed.

           :return: a :class:`.profiles.Profiles` object, which contains the
               reactivities for all targets.
        """
        self._profiles = Profiles(self._targets, self.run, self._processor.counters)
        self._profiles.compute()
        return self._profiles

    def write_reactivities(self, output_path):
        """Convenience function used to write the reactivities to an output
           file. Must be called after :meth:`.compute_profiles`.

           :param output_path: the path for writing the output.
        """
        self._profiles.write(output_path)

    def store(self, output_path):
        """Saves the state of the SPATS run for later processing.

           :param output_path: the path for writing the output. Recommended
               file extension is `.spats`
        """
        if os.path.exists(output_path):
            os.remove(output_path)
        pair_db = PairDB(output_path)
        pair_db.store_run(self.run)
        pair_db.add_targets(self.targets)
        pair_db.store_counters("spats", self.counters)

    def load(self, input_path):
        """Loads SPATS state from a file.

           :param input_path: the path of a previously saved SPATS session.
        """
        pair_db = PairDB(input_path)
        pair_db.load_run(self.run)
        self.loadTargets(pair_db)
        pair_db.load_counters("spats", self.counters)

    def merge(self, input_path):
        """Merges SPATS state from a file with existing state.

           :param input_path: the path of a previously saved SPATS session.
        """
        pair_db = PairDB(input_path)
        pair_db.load_run(self.run)
        self.merge_targets(pair_db)
        pair_db.load_counters("spats", self.counters, False)

    def validate_results(self, data_r1_path, data_r2_path, algorithm = "find_partial", verbose = False):
        """Used to validate the results of the current run against a different
           algorithm. Must be run after :meth:`.process_pair_data`, or after
           loading the data (:meth:`.load`) from a previously-run session.

           :param data_r1_path: path to R1 fragments

           :param data_r2_path: path to matching R2 fragments.

           :param algorithm: Generally the default is correct, but you can
               select a particular algorithm for data validation (see
               :attr:`.run.Run.algorithm`).

           :param verbose: set to `True` for detailed output of mismatched
               sites.

           :return: `True` if results validate, `False` otherwise.
        """
        original_algorithm = self.run.algorithm
        if original_algorithm == algorithm:
            raise Exception("Validation cannot be run using the same algorithm.")
        if not self.counters.registered_dict():
            raise Exception("Normal SPATS run required first in order to validate the results.")
        other = Spats()
        other.run.load_from_config(self.run.config_dict())
        other.run.algorithm = algorithm
        other._targets = self._targets
        other.process_pair_data(data_r1_path, data_r2_path)
        match_count, total = self.compare_results(other, verbose = verbose)
        if match_count == total:
            print("Original results ({} algorithm) validated using {} algorithm, {} registered sites match.".format(original_algorithm, algorithm, match_count))
            return True
        else:
            print("Validation FAILURE: results ({} algorithm) only match {}/{} registered sites (when validated using {} algorithm).".format(original_algorithm, match_count, total, algorithm))
            return False

    def compare_results(self, other_spats, verbose = False):
        """Used to compare the results of the current run against another
           SPATS instance. Must be run after :meth:`.process_pair_data`, or
           after loading the data (:meth:`.load`) from a previously-run
           session.

           :param other_spats: :class:`.Spats` instance to compare.

           :param verbose: set to `True` for detailed output of mismatched
               sites.

           :return: `(match_count, total)` : `match_count` indicates the
               number of sites matched, `total` indicates total number of
               sites.
        """
        our_counts = self.counters.registered_dict()
        their_counts = other_spats.counters.registered_dict()
        match_count = 0
        total = 0
        for key, value in our_counts.iteritems():
            total += 1
            if their_counts.get(key, 0) == value:
                match_count += 1
            elif verbose:
                print("Mismatch {}: {} != {}".format(key, value, their_counts.get(key, 0)))
        return match_count, total