def test_multiple_searches_with_same_pool(self):
    """Tests more than one call to find_probe_covers_in_sequence() with
    the same pool.

    For each pool size, one probe finding pool is opened, queried with
    two different sequences, and then closed.
    """
    np.random.seed(1)
    sequence_a = 'ABCAXYZXYZDEFXYZAAYZ'
    sequence_b = 'GHIDAXYZXYZAAABCABCD'
    a = probe.Probe.from_str('AXYZXYZ')
    b = probe.Probe.from_str('AABCABC')
    probes = [a, b]
    # This should default to the random approach, so set k (rather than
    # min_k)
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 0, 6, k=3)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, f, n_workers)
        try:
            found_a = probe.find_probe_covers_in_sequence(sequence_a)
            self.assertEqual(found_a, {a: [(3, 10)]})
            found_b = probe.find_probe_covers_in_sequence(sequence_b)
            self.assertEqual(found_b, {a: [(4, 11)], b: [(12, 19)]})
        finally:
            # Close the pool even when an assertion fails, so a failure
            # here does not leave a pool open for whatever runs next
            probe.close_probe_finding_pool()
def verify_target_genome_coverage(self, selected_probes, target_genomes,
                                  filter, desired_coverage,
                                  cover_extension=0):
    """Assert that selected_probes cover enough of each target genome.

    Builds a k-mer probe map from selected_probes, opens a probe finding
    pool with filter.cover_range_fn, and, for every genome in every
    group of target_genomes, sums the number of bp covered (after
    extending each cover range and merging overlapping ranges within
    each sequence).

    Args:
        selected_probes: probes whose coverage is being verified
        target_genomes: list of groups (lists) of target genomes; each
            genome provides .seqs and .size()
        filter: object providing the attributes mismatches, lcf_thres,
            and cover_range_fn that are used to find probe covers
        desired_coverage: when <= 1.0, the required fraction of each
            genome (by tg.size()) that must be covered; otherwise, the
            required number of bp covered (capped at tg.size())
        cover_extension: number of bp by which to extend each cover
            range on both sides, clipped to the bounds of the sequence
    """
    kmer_probe_map = probe.SharedKmerProbeMap.construct(
        probe.construct_kmer_probe_map_to_find_probe_covers(
            selected_probes, filter.mismatches, filter.lcf_thres,
            min_k=3, k=3)
    )
    probe.open_probe_finding_pool(kmer_probe_map, filter.cover_range_fn)
    try:
        for genomes_from_group in target_genomes:
            for tg in genomes_from_group:
                num_bp_covered = 0
                for seq in tg.seqs:
                    probe_cover_ranges = \
                        probe.find_probe_covers_in_sequence(seq)
                    all_cover_ranges = []
                    for cover_ranges in probe_cover_ranges.values():
                        for cv in cover_ranges:
                            # Extend the cover on both sides, staying
                            # within the sequence
                            start = max(0, cv[0] - cover_extension)
                            end = min(len(seq), cv[1] + cover_extension)
                            all_cover_ranges.append((start, end))
                    # Merge so that bases covered by more than one range
                    # are only counted once
                    all_cover_ranges = interval.merge_overlapping(
                        all_cover_ranges)
                    for cover_range in all_cover_ranges:
                        num_bp_covered += cover_range[1] - cover_range[0]
                if desired_coverage <= 1.0:
                    # check fraction covered
                    desired_bp_covered = desired_coverage * tg.size()
                    self.assertGreaterEqual(num_bp_covered,
                                            desired_bp_covered)
                else:
                    # directly check num bp covered
                    desired_coverage_adjusted = min(desired_coverage,
                                                    tg.size())
                    self.assertGreaterEqual(num_bp_covered,
                                            desired_coverage_adjusted)
    finally:
        # Close the pool even when an assertion fails, so a failure here
        # does not leave a pool open for whatever runs next
        probe.close_probe_finding_pool()
def test_more_than_cover(self):
    """Tests with short sequence and short probes where probes contain
    more than what they cover.
    """
    np.random.seed(1)
    sequence = 'ABCDEFGHIJKLMNOPQR' + ('Z' * 100) + 'STUVWXYZ'
    a = probe.Probe.from_str('XYZCDEFGHIJKABCSTUVWXABC')
    b = probe.Probe.from_str('PQRSGHIJKLMNXYZ')
    c = probe.Probe.from_str('ABCFGHIJKLZAZAZAGHIJKL')
    probes = [a, b, c]
    # This should default to the random approach, so set k (rather than
    # min_k)
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 0, 6, k=6)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, f, n_workers)
        try:
            found = probe.find_probe_covers_in_sequence(sequence)
            self.assertCountEqual(found[a], [(2, 11), (118, 124)])
            self.assertCountEqual(found[b], [(6, 14)])
            self.assertCountEqual(found[c], [(5, 12)])
        finally:
            # Close the pool even when an assertion fails, so a failure
            # here does not leave a pool open for whatever runs next
            probe.close_probe_finding_pool()
def test_island_with_exact_match1(self):
    """Tests the 'island_with_exact_match' argument for
    probe.probe_covers_sequence_by_longest_common_substring(..).

    Each of the probes a-f differs from the probe g ('CDEFGH') at
    exactly one position (marked 'X'); the cover function is created
    with 1 mismatch, an lcf threshold of 6, and a third argument of 4.
    """
    np.random.seed(1)
    sequence = 'ABCDEFGHIJKLMNOPYDEFGHQRSTU'
    a = probe.Probe.from_str('XDEFGH')
    b = probe.Probe.from_str('CXEFGH')
    c = probe.Probe.from_str('CDXFGH')
    d = probe.Probe.from_str('CDEXGH')
    e = probe.Probe.from_str('CDEFXH')
    f = probe.Probe.from_str('CDEFGX')
    g = probe.Probe.from_str('CDEFGH')
    probes = [a, b, c, d, e, f, g]
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 1, 6, k=3)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    fn = probe.probe_covers_sequence_by_longest_common_substring(1, 6, 4)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, fn, n_workers)
        try:
            found = probe.find_probe_covers_in_sequence(sequence)
            self.assertCountEqual(found[a], [(2, 8), (16, 22)])
            self.assertCountEqual(found[b], [(2, 8)])
            self.assertNotIn(c, found)
            self.assertNotIn(d, found)
            self.assertCountEqual(found[e], [(2, 8)])
            self.assertCountEqual(found[f], [(2, 8)])
            self.assertCountEqual(found[g], [(2, 8), (16, 22)])
        finally:
            # Close the pool even when an assertion fails, so a failure
            # here does not leave a pool open for whatever runs next
            probe.close_probe_finding_pool()
def test_pigeonhole_with_mismatch(self):
    """Tests with short sequence and short probes where the call to
    construct_kmer_probe_map_to_find_probe_covers tries the pigeonhole
    approach.

    The same probes and sequence are checked twice: once with min_k=3
    and once with min_k=4 (k=4 in both cases). The probe covers that are
    found should be the same in both cases; only the k of the
    constructed map should differ.
    """
    np.random.seed(1)
    sequence = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    a = probe.Probe.from_str('GHIJXL')
    b = probe.Probe.from_str('BTUVWX')
    c = probe.Probe.from_str('ACEFHJ')
    probes = [a, b, c]

    # Each case is (min_k, expected k of the constructed map)
    cases = [
        # This should try the pigeonhole approach, which should choose
        # k=3
        (3, 3),
        # This should try the pigeonhole approach and fail because it
        # chooses k=3, but min_k=4. So it should then try the random
        # approach with k=4.
        (4, 4),
    ]
    for min_k, expected_k in cases:
        kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
            probes, 1, 6, min_k=min_k, k=4)
        kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
        self.assertEqual(kmer_map.k, expected_k)
        f = probe.probe_covers_sequence_by_longest_common_substring(1, 6)
        for n_workers in [1, 2, 4, 7, 8]:
            probe.open_probe_finding_pool(kmer_map, f, n_workers)
            try:
                found = probe.find_probe_covers_in_sequence(sequence)
                self.assertCountEqual(found[a], [(6, 12)])
                self.assertCountEqual(found[b], [(18, 24)])
                self.assertNotIn(c, found)
            finally:
                # Close the pool even when an assertion fails, so a
                # failure here does not leave a pool open for whatever
                # runs next
                probe.close_probe_finding_pool()
def _compute_tolerant_bp_covered_within_sequence(self, sequence,
                                                 rc_too=True):
    """Compute number of bp captured in sequence by each input probe.

    A probe finding pool must be open prior to calling this function,
    and that pool should have been created using
    self.cover_range_tolerant_fn. That is,
    probe.open_probe_finding_pool() should have been called with the
    cover_range_for_probe_in_subsequence_fn argument equal to
    self.cover_range_tolerant_fn. The input probes are values in the
    kmer_probe_map argument that was passed to
    probe.open_probe_finding_pool().

    Uses self.cover_range_tolerant_fn for determining coverage (i.e.,
    the coverage is determined in a relatively tolerant way so that
    more potential hybridizations are included).

    Args:
        sequence: sequence as a string in which to determine the
            coverage of the probes
        rc_too: when True, the returned values also include bp that
            are captured in the reverse complement of sequence

    Raises:
        RuntimeError if the probe finding pool was not created with
        self.cover_range_tolerant_fn

    Returns:
        dict mapping each probe to the total number of bp in its cover
        ranges (summed over the sequence and, when rc_too is True, its
        reverse complement); probes for which no cover range is found
        are not included as keys in the returned dict
    """
    if probe._pfp_cover_range_for_probe_in_subsequence_fn != \
            self.cover_range_tolerant_fn:
        raise RuntimeError(("_compute_tolerant_bp_covered_within_"
                            "sequence() was called but the probe "
                            "finding pool was not created using "
                            "self.cover_range_tolerant_fn"))

    # Bases that are not in this map (e.g., 'N') are left as-is when
    # taking the reverse complement
    rc_map = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

    # Always look in the provided sequence; optionally look in its
    # reverse complement as well
    strands = [sequence]
    if rc_too:
        strands.append(
            ''.join([rc_map.get(b, b) for b in sequence[::-1]]))

    num_bp_covered = defaultdict(int)
    for strand in strands:
        probe_cover_ranges = probe.find_probe_covers_in_sequence(strand)
        for p, cover_ranges in probe_cover_ranges.items():
            for cover_range in cover_ranges:
                num_bp_covered[p] += cover_range[1] - cover_range[0]

    return dict(num_bp_covered)
def _votes_in_sequence(self, probes, sequence):
    """Compute votes for probes based on their overlap.

    The probes' hybridization (alignment) to sequence (e.g., one target
    genome) is found first, using the currently open probe finding
    pool. The intervals they cover are then passed to
    interval.schedule() (interval scheduling); every probe it selects
    receives an 'A' vote. Every other probe that hybridizes to
    'sequence' receives a 'B' vote.

    Args:
        probes: a list of candidate probes for which to determine votes
        sequence: a string of a sequence (e.g., from a target genome)
            to use when determining overlap among probes

    Returns:
        A list L, in which L[i] corresponds to the probe probes[i].
        L[i] is either (1,0) [vote for 'A'], (0,1) [vote for 'B'],
        or (0,0) [the probe does not hybridize in 'sequence'].
    """
    covers_by_probe = probe.find_probe_covers_in_sequence(sequence)

    # Pair each covered interval with the probe that covers it
    labeled_intervals = [(rng, p)
                         for p, ranges in covers_by_probe.items()
                         for rng in ranges]

    # Probes picked by interval scheduling get the 'A' adapter
    winners = set(interval.schedule(labeled_intervals))

    def vote_for(p):
        if p in winners:
            # vote for 'A'
            return (1, 0)
        if p in covers_by_probe:
            # p hybridizes to sequence but was skipped by the interval
            # scheduling algorithm; vote for 'B'
            return (0, 1)
        # p does not hybridize to sequence
        return (0, 0)

    return [vote_for(p) for p in probes]
def test_repetitive(self):
    """Tests with short sequence and short probes where the sequence and
    probes have repetitive sequences, so that one probe can cover a lot
    of the sequence.
    """
    np.random.seed(1)
    sequence = 'ABCAAAAAAAAAAXYZXYZXYZXYZAAAAAAAAAAAAAXYZ'
    a = probe.Probe.from_str('NAAAAAAN')
    probes = [a]
    # This should default to the random approach, so set k (rather than
    # min_k)
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 0, 6, k=6)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, f, n_workers)
        try:
            found = probe.find_probe_covers_in_sequence(sequence)
            self.assertCountEqual(found[a], [(3, 13), (25, 38)])
        finally:
            # Close the pool even when an assertion fails, so a failure
            # here does not leave a pool open for whatever runs next
            probe.close_probe_finding_pool()
def test_island_with_exact_match2(self):
    """Tests the 'island_with_exact_match' argument for
    probe.probe_covers_sequence_by_longest_common_substring(..).

    Uses longer probes than test_island_with_exact_match1, with up to 3
    mismatches allowed; the cover function is created with a third
    argument of 4.
    """
    np.random.seed(1)
    sequence = 'ABCDEFGHIJKLMNOPCDEFGHQRSTU'
    a = probe.Probe.from_str('HXJKLMNOPCDE')
    b = probe.Probe.from_str('XIJKXMNOXCDE')
    c = probe.Probe.from_str('XIJKXMNOPXDE')
    probes = [a, b, c]
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 3, 6, k=3)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    fn = probe.probe_covers_sequence_by_longest_common_substring(3, 6, 4)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, fn, n_workers)
        try:
            found = probe.find_probe_covers_in_sequence(sequence)
            self.assertCountEqual(found[a], [(7, 19)])
            self.assertNotIn(b, found)
            self.assertCountEqual(found[c], [(7, 19)])
        finally:
            # Close the pool even when an assertion fails, so a failure
            # here does not leave a pool open for whatever runs next
            probe.close_probe_finding_pool()
def test_two_occurrences(self):
    """Tests with short sequence and short probes where one probe
    appears twice.
    """
    np.random.seed(1)
    sequence = 'ABCDEFGHIJKLMNOPCDEFGHQRSTU'
    a = probe.Probe.from_str('CDEFGH')
    b = probe.Probe.from_str('GHIJKL')
    c = probe.Probe.from_str('STUVWX')
    probes = [a, b, c]
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 0, 6, min_k=6)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, f, n_workers)
        try:
            found = probe.find_probe_covers_in_sequence(sequence)
            self.assertCountEqual(found[a], [(2, 8), (16, 22)])
            self.assertCountEqual(found[b], [(6, 12)])
            self.assertNotIn(c, found)
        finally:
            # Close the pool even when an assertion fails, so a failure
            # here does not leave a pool open for whatever runs next
            probe.close_probe_finding_pool()
def test_too_short_sequence_small_k(self):
    """Tests with sequence shorter than the probe length.
    """
    np.random.seed(1)
    sequence = 'ABCDEFGHI'
    a = probe.Probe.from_str('ABCDEFGHIJKL')
    b = probe.Probe.from_str('EFGHIJKLMNOP')
    c = probe.Probe.from_str('DEFGHIJKLMNO')
    d = probe.Probe.from_str('XYZXYZABCDEF')
    probes = [a, b, c, d]
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 0, 6, min_k=6, k=6)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, f, n_workers)
        try:
            found = probe.find_probe_covers_in_sequence(sequence)
            self.assertCountEqual(found[a], [(0, 9)])
            self.assertNotIn(b, found)
            self.assertCountEqual(found[c], [(3, 9)])
            self.assertCountEqual(found[d], [(0, 6)])
        finally:
            # Close the pool even when an assertion fails, so a failure
            # here does not leave a pool open for whatever runs next
            probe.close_probe_finding_pool()
def test_too_short_sequence_large_k(self):
    """Tests with sequence shorter than the probe length and also
    shorter than k.
    """
    np.random.seed(1)
    sequence = 'ABCDEFGHI'
    a = probe.Probe.from_str('ABCDEFGHIJKL')
    b = probe.Probe.from_str('EFGHIJKLMNOP')
    c = probe.Probe.from_str('DEFGHIJKLMNO')
    d = probe.Probe.from_str('XYZXYZABCDEF')
    probes = [a, b, c, d]
    # probe.find_probe_covers_in_sequence() should not attempt
    # to cover the sequence (return {}), but should run gracefully
    for k in [10, 11, 12]:
        kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
            probes, 0, 6, min_k=k, k=k)
        kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
        f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
        for n_workers in [1, 2, 4, 7, 8]:
            probe.open_probe_finding_pool(kmer_map, f, n_workers)
            try:
                found = probe.find_probe_covers_in_sequence(sequence)
                self.assertEqual(found, {})
            finally:
                # Close the pool even when an assertion fails, so a
                # failure here does not leave a pool open for whatever
                # runs next
                probe.close_probe_finding_pool()
def _find_covers_in_target_genomes(self):
    """Find intervals across the target genomes covered by the probe set.

    This considers the given probe set (self.probes) and determines the
    intervals, in each genome of the target genomes (as well as their
    reverse complements), that are covered by the probes. This saves a
    dict, self.target_covers, as follows:
    self.target_covers[i][j][b] is a list of all the intervals covered
    by the probes in the target genome j of grouping i (in the reverse
    complement of the genome if b is True, and provided sequence if b
    is False).

    The endpoints of the intervals are offset so as to give unique
    integer positions in the genome (e.g., endpoints in the second
    chromosome are offset based on the length of the first chromosome).
    There may be duplicate intervals if two probes cover the same
    region of a sequence.

    The probe finding pool opened here is closed before returning, even
    if computing the coverage raises an exception.
    """
    logger.info("Finding probe covers across target genomes")
    logger.info("Building map from k-mers to probes")
    # Note that if adapters are added to the probes before this filter
    # is run (which would be typical), then self.lcf_thres will likely
    # be less than the probe length. So the k-mer to probe map will
    # be constructed using the random approach (yielding many k-mers
    # and thus a slower runtime in finding probe covers) rather than
    # the pigeonhole approach.
    kmer_probe_map = probe.SharedKmerProbeMap.construct(
        probe.construct_kmer_probe_map_to_find_probe_covers(
            self.probes, self.mismatches, self.lcf_thres,
            min_k=self.kmer_probe_map_k, k=self.kmer_probe_map_k)
    )
    probe.open_probe_finding_pool(kmer_probe_map, self.cover_range_fn)

    # Bases that are not in this map (e.g., 'N') are left as-is when
    # taking the reverse complement
    rc_map = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

    self.target_covers = {}
    try:
        for i, j, gnm, rc in self._iter_target_genomes():
            if not rc:
                logger.info(("Computing coverage in grouping %d (of %d), "
                             "with target genome %d (of %d)"),
                            i + 1, len(self.target_genomes),
                            j + 1, len(self.target_genomes[i]))
            if i not in self.target_covers:
                self.target_covers[i] = {}
            if j not in self.target_covers[i]:
                self.target_covers[i][j] = {False: None, True: None}

            gnm_covers = []
            length_so_far = 0
            for sequence in gnm.seqs:
                if rc:
                    # Take the reverse complement of sequence
                    sequence = ''.join(
                        [rc_map.get(b, b) for b in sequence[::-1]])

                # Find cover ranges of the probes, while allowing the
                # ranges to overlap (e.g., if one probe covers two
                # regions that overlap)
                probe_cover_ranges = probe.find_probe_covers_in_sequence(
                    sequence, merge_overlapping=False)
                for p, cover_ranges in probe_cover_ranges.items():
                    for cover_range in cover_ranges:
                        # Extend the range covered by probe p on both
                        # sides by self.cover_extension
                        cover_start = max(
                            0, cover_range[0] - self.cover_extension)
                        cover_end = min(
                            len(sequence),
                            cover_range[1] + self.cover_extension)
                        # The endpoints of the cover give positions in
                        # just this sequence (chromosome), so adjust
                        # them (according to length_so_far) to give a
                        # unique integer position in the genome gnm
                        adjusted_cover = (cover_start + length_so_far,
                                          cover_end + length_so_far)
                        gnm_covers.append(adjusted_cover)
                length_so_far += len(sequence)
            self.target_covers[i][j][rc] = gnm_covers
    finally:
        # Do not leave the pool open if computing coverage fails
        probe.close_probe_finding_pool()
def _make_sets(self, candidate_probes, target_genomes):
    """Return a collection of sets to use in set cover.

    In the returned collection of sets, each set corresponds to a
    candidate probe and contains the bases of the target genomes
    covered by the candidate probe. The target genomes must be in
    grouped lists inside the list target_genomes.

    The output is intended for input to set_cover.approx_multiuniverse
    as the 'sets' input.

    The probe finding pool opened here is closed before returning, even
    if computing the coverage raises an exception.

    Args:
        candidate_probes: list of candidate probes
        target_genomes: list of groups of target genomes

    Returns:
        a dict mapping set_ids (from 0 through len(candidate_probes)-1)
        to dicts, where the dict for a particular set_id maps
        universe_ids to sets. set_id corresponds to a candidate probe
        in candidate_probes and universe_id is a tuple that corresponds
        to a target genome in a grouping from target_genomes. The j'th
        target genome from the i'th grouping in target_genomes is given
        universe_id equal to (i,j). That is, i ranges from 0 through
        len(target_genomes)-1 (i.e., the number of groupings) and j
        ranges from 0 through (n_i)-1 where n_i is the number of target
        genomes in the i'th group. In the returned value (sets),
        sets[set_id][universe_id] is a set of all the bases (as an
        instance of interval.IntervalSet) covered by probe set_id in
        the target genome universe_id. (If sets[set_id][universe_id]
        contains just one interval, then that interval is stored
        directly as a tuple -- not in an instance of
        interval.IntervalSet -- to save space and it should be
        converted to an interval.IntervalSet when needed.)
    """
    logger.info("Building map from k-mers to probes")
    kmer_probe_map = probe.SharedKmerProbeMap.construct(
        probe.construct_kmer_probe_map_to_find_probe_covers(
            candidate_probes, self.mismatches, self.lcf_thres,
            min_k=self.kmer_probe_map_k, k=self.kmer_probe_map_k))
    probe.open_probe_finding_pool(kmer_probe_map, self.cover_range_fn)

    # The set_id of a candidate probe is its index in candidate_probes
    probe_id = {}
    sets = {}
    for idx, p in enumerate(candidate_probes):
        probe_id[p] = idx
        sets[idx] = {}

    try:
        for i, genomes_from_group in enumerate(target_genomes):
            for j, gnm in enumerate(genomes_from_group):
                logger.info(("Computing coverage in grouping %d (of %d), "
                             "with target genome %d (of %d)"),
                            i + 1, len(target_genomes),
                            j + 1, len(genomes_from_group))
                universe_id = (i, j)
                length_so_far = 0
                for sequence in gnm.seqs:
                    probe_cover_ranges = \
                        probe.find_probe_covers_in_sequence(sequence)
                    # Add the bases of sequence that are covered by all
                    # the probes into sets with universe_id equal to
                    # (i,j)
                    for p, cover_ranges in probe_cover_ranges.items():
                        set_id = probe_id[p]
                        for cover_range in cover_ranges:
                            # Extend the range covered by probe p on
                            # both sides by self.cover_extension
                            cover_start = max(
                                0,
                                cover_range[0] - self.cover_extension)
                            cover_end = min(
                                len(sequence),
                                cover_range[1] + self.cover_extension)
                            # The endpoints of the cover give positions
                            # in just this sequence (chromosome), so
                            # adding the lengths of all the sequences
                            # previously iterated (length_so_far) onto
                            # them gives unique integer positions in
                            # the genome gnm
                            adjusted_cover = (
                                cover_start + length_so_far,
                                cover_end + length_so_far)
                            if universe_id not in sets[set_id]:
                                # Since a list has a lot of overhead
                                # and most probes align to just one
                                # interval, simply store that interval
                                # alone (not in a list)
                                sets[set_id][universe_id] = adjusted_cover
                            else:
                                prev_cover = sets[set_id][universe_id]
                                if isinstance(prev_cover, tuple):
                                    # This probe now aligns to two
                                    # intervals in this universe/genome,
                                    # so store them in a list
                                    sets[set_id][universe_id] = \
                                        [prev_cover]
                                sets[set_id][universe_id].append(
                                    adjusted_cover)
                    length_so_far += len(sequence)
    finally:
        # Do not leave the pool open if computing coverage fails
        probe.close_probe_finding_pool()

    # The map is no longer needed once the pool is closed; drop the
    # reference and run a collection before building the IntervalSets
    del kmer_probe_map
    gc.collect()

    # Make an IntervalSet out of the intervals of each set. But if
    # there is just one interval in a set, then save space by leaving
    # that entry as a tuple.
    for covers_by_universe in sets.values():
        for universe_id, intervals in covers_by_universe.items():
            if not isinstance(intervals, tuple):
                # Only the value of an existing key is replaced, so the
                # dict does not change size during iteration
                covers_by_universe[universe_id] = \
                    interval.IntervalSet(intervals)
            # Else, there is just one interval in this set; leave it
            # stored directly as a tuple

    return sets
def run_random(self, n, genome_min, genome_max, num_probes,
               probe_length=100, lcf_thres=None, seed=1, n_workers=2,
               use_native_dict=False):
    """Run tests with a randomly generated sequence.

    Repeatedly runs tests in which a sequence is randomly generated,
    probes are generated from that sequence, and then the probes are
    looked up in the sequence.

    Creates the probes with the intention of determining coverage with
    a longest common substring.

    Args:
        n: number of times to run the test
        genome_min/genome_max: the genome (sequence) size is
            randomly chosen between genome_min and genome_max
        num_probes: the number of probes generated from the random
            sequence
        probe_length: number of bp to make each probe
        lcf_thres: lcf threshold parameter; when None, it is
            randomly chosen among 80 and 100
        seed: random number generator seed
        n_workers: number of workers to have in a probe finding pool
        use_native_dict: have the probe finding pool use a native Python
            dict
    """
    np.random.seed(seed)
    bases = ['A', 'T', 'C', 'G']
    fixed_lcf_thres = lcf_thres

    # NOTE: the order of the calls to np.random below determines the
    # generated data for a given seed, so it should not be changed
    for _ in range(n):
        if fixed_lcf_thres is not None:
            lcf_thres = fixed_lcf_thres
        else:
            # Choose either lcf_thres=80 or lcf_thres=100
            lcf_thres = np.random.choice([80, 100])

        # Make a random sequence
        seq_length = np.random.randint(genome_min, genome_max)
        sequence = "".join(
            np.random.choice(bases, size=seq_length, replace=True))

        desired_probe_cover_ranges = defaultdict(list)

        # Make num_probes random probes
        probes = []
        for _ in range(num_probes):
            subseq_start = np.random.randint(0, seq_length - probe_length)
            subseq_end = subseq_start + probe_length
            cover_length = np.random.randint(lcf_thres, probe_length + 1)
            cover_start = subseq_start + \
                np.random.randint(0, probe_length - cover_length + 1)
            cover_end = min(seq_length, cover_start + cover_length)
            probe_str_cover = sequence[cover_start:cover_end]

            # Add random bases before and after what the probe should
            # cover
            probe_str_start = "".join(
                np.random.choice(bases,
                                 size=cover_start - subseq_start,
                                 replace=True))
            probe_str_end = "".join(
                np.random.choice(bases,
                                 size=subseq_end - cover_end,
                                 replace=True))
            probe_str = probe_str_start + probe_str_cover + probe_str_end

            # Add 0, 1, 2, or 3 random mismatches
            for _ in range(np.random.randint(0, 4)):
                pos = np.random.randint(0, probe_length)
                base_choices = [b for b in bases if b != probe_str[pos]]
                probe_str = probe_str[:pos] + \
                    "".join(np.random.choice(base_choices, size=1)) + \
                    probe_str[(pos + 1):]

            p = probe.Probe.from_str(probe_str)
            desired_probe_cover_ranges[p].append((cover_start, cover_end))
            probes.append(p)

        kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
            probes, 3, lcf_thres)
        kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
        f = probe.probe_covers_sequence_by_longest_common_substring(
            3, lcf_thres)
        probe.open_probe_finding_pool(kmer_map, f, n_workers,
                                      use_native_dict=use_native_dict)
        try:
            found = probe.find_probe_covers_in_sequence(sequence)
        finally:
            # Do not leave the pool open if the lookup fails
            probe.close_probe_finding_pool()

        # Check that this didn't find any extraneous probes and that
        # it found at least 95% of the original (it may miss some
        # due to false negatives in the approach)
        self.assertLessEqual(len(found), len(probes))
        self.assertGreaterEqual(len(found), 0.95 * len(probes))

        # Check that each desired probe was found correctly
        for p, cover_ranges in desired_probe_cover_ranges.items():
            if p not in found:
                continue
            found_cover_ranges = found[p]
            # This probe most likely was found once, but could have
            # been missed (due to false negatives in the approach) and
            # may have been found more than once due to chance (but
            # probably not too much more!)
            self.assertIn(len(found_cover_ranges), [1, 2])
            # The cover ranges should have been captured, and the ones
            # found may extend past what was desired by a small amount
            # due to allowing mismatches and chance
            # Because of mismatches possibly added to the end of the
            # desired cover range, what was recaptured may not always
            # encompass the entire cover range, so allow some small
            # tolerance
            for desired_cv in cover_ranges:
                found_desired_cv = any(
                    -7 <= desired_cv[0] - found_cv[0] < 15 and
                    -7 <= found_cv[1] - desired_cv[1] < 15
                    for found_cv in found_cover_ranges)
                self.assertTrue(found_desired_cv)