def verify_target_genome_coverage(self, selected_probes, target_genomes,
                                  filter, desired_coverage,
                                  cover_extension=0):
    kmer_probe_map = probe.SharedKmerProbeMap.construct(
        probe.construct_kmer_probe_map_to_find_probe_covers(
            selected_probes, filter.mismatches, filter.lcf_thres,
            min_k=3, k=3)
    )
    probe.open_probe_finding_pool(kmer_probe_map, filter.cover_range_fn)
    for tg in [g for genomes_from_group in target_genomes
               for g in genomes_from_group]:
        num_bp_covered = 0
        for seq in tg.seqs:
            probe_cover_ranges = probe.find_probe_covers_in_sequence(seq)
            all_cover_ranges = []
            for cover_ranges in probe_cover_ranges.values():
                for cv in cover_ranges:
                    start = max(0, cv[0] - cover_extension)
                    end = min(len(seq), cv[1] + cover_extension)
                    all_cover_ranges += [(start, end)]
            all_cover_ranges = interval.merge_overlapping(all_cover_ranges)
            for cover_range in all_cover_ranges:
                num_bp_covered += cover_range[1] - cover_range[0]
        if desired_coverage <= 1.0:
            # Check the fraction of the genome covered
            desired_bp_covered = desired_coverage * tg.size()
            self.assertGreaterEqual(num_bp_covered, desired_bp_covered)
        else:
            # Directly check the number of bp covered
            desired_coverage_adjusted = min(desired_coverage, tg.size())
            self.assertGreaterEqual(num_bp_covered,
                                    desired_coverage_adjusted)
    probe.close_probe_finding_pool()
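
# A minimal, illustrative sketch (not part of the original tests) of why the
# cover ranges are merged above before counting covered bp: without merging,
# overlapping ranges would be double-counted. It assumes, as the code above
# does, that interval.merge_overlapping() returns disjoint (start, end) pairs.
def _example_merged_bp_count():
    ranges = [(2, 8), (6, 12), (20, 25)]
    merged = interval.merge_overlapping(ranges)  # e.g., [(2, 12), (20, 25)]
    # 10 + 5 = 15 covered bp, rather than 6 + 6 + 5 = 17 if double-counted
    return sum(end - start for start, end in merged)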
def test_multiple_searches_with_same_pool(self):
    """Tests more than one call to find_probe_covers_in_sequence() with
    the same pool.
    """
    np.random.seed(1)
    sequence_a = 'ABCAXYZXYZDEFXYZAAYZ'
    sequence_b = 'GHIDAXYZXYZAAABCABCD'
    a = probe.Probe.from_str('AXYZXYZ')
    b = probe.Probe.from_str('AABCABC')
    probes = [a, b]
    # This should default to the random approach, so set k (rather than
    # min_k)
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 0, 6, k=3)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, f, n_workers)
        found_a = probe.find_probe_covers_in_sequence(sequence_a)
        self.assertEqual(found_a, {a: [(3, 10)]})
        found_b = probe.find_probe_covers_in_sequence(sequence_b)
        self.assertEqual(found_b, {a: [(4, 11)], b: [(12, 19)]})
        probe.close_probe_finding_pool()
def test_island_with_exact_match1(self):
    """Tests the 'island_with_exact_match' argument for
    probe.probe_covers_sequence_by_longest_common_substring(..).
    """
    np.random.seed(1)
    sequence = 'ABCDEFGHIJKLMNOPYDEFGHQRSTU'
    a = probe.Probe.from_str('XDEFGH')
    b = probe.Probe.from_str('CXEFGH')
    c = probe.Probe.from_str('CDXFGH')
    d = probe.Probe.from_str('CDEXGH')
    e = probe.Probe.from_str('CDEFXH')
    f = probe.Probe.from_str('CDEFGX')
    g = probe.Probe.from_str('CDEFGH')
    probes = [a, b, c, d, e, f, g]
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 1, 6, k=3)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    fn = probe.probe_covers_sequence_by_longest_common_substring(1, 6, 4)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, fn, n_workers)
        found = probe.find_probe_covers_in_sequence(sequence)
        self.assertCountEqual(found[a], [(2, 8), (16, 22)])
        self.assertCountEqual(found[b], [(2, 8)])
        self.assertFalse(c in found)
        self.assertFalse(d in found)
        self.assertCountEqual(found[e], [(2, 8)])
        self.assertCountEqual(found[f], [(2, 8)])
        self.assertCountEqual(found[g], [(2, 8), (16, 22)])
        probe.close_probe_finding_pool()
def test_more_than_cover(self):
    """Tests with short sequence and short probes where probes contain
    more than what they cover.
    """
    np.random.seed(1)
    sequence = 'ABCDEFGHIJKLMNOPQR' + ('Z' * 100) + 'STUVWXYZ'
    a = probe.Probe.from_str('XYZCDEFGHIJKABCSTUVWXABC')
    b = probe.Probe.from_str('PQRSGHIJKLMNXYZ')
    c = probe.Probe.from_str('ABCFGHIJKLZAZAZAGHIJKL')
    probes = [a, b, c]
    # This should default to the random approach, so set k (rather than
    # min_k)
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 0, 6, k=6)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, f, n_workers)
        found = probe.find_probe_covers_in_sequence(sequence)
        self.assertCountEqual(found[a], [(2, 11), (118, 124)])
        self.assertCountEqual(found[b], [(6, 14)])
        self.assertCountEqual(found[c], [(5, 12)])
        probe.close_probe_finding_pool()
def test_pigeonhole_with_mismatch(self):
    """Tests with short sequence and short probes where the call to
    construct_kmer_probe_map_to_find_probe_covers tries the pigeonhole
    approach.
    """
    np.random.seed(1)
    sequence = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    a = probe.Probe.from_str('GHIJXL')
    b = probe.Probe.from_str('BTUVWX')
    c = probe.Probe.from_str('ACEFHJ')
    probes = [a, b, c]
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 1, 6, min_k=3, k=4)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    # This should try the pigeonhole approach, which should choose k=3
    self.assertEqual(kmer_map.k, 3)
    f = probe.probe_covers_sequence_by_longest_common_substring(1, 6)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, f, n_workers)
        found = probe.find_probe_covers_in_sequence(sequence)
        self.assertCountEqual(found[a], [(6, 12)])
        self.assertCountEqual(found[b], [(18, 24)])
        self.assertFalse(c in found)
        probe.close_probe_finding_pool()

    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 1, 6, min_k=4, k=4)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    # This should try the pigeonhole approach and fail because it
    # chooses k=3, but min_k=4. So it should then try the random
    # approach with k=4.
    self.assertEqual(kmer_map.k, 4)
    f = probe.probe_covers_sequence_by_longest_common_substring(1, 6)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, f, n_workers)
        found = probe.find_probe_covers_in_sequence(sequence)
        self.assertCountEqual(found[a], [(6, 12)])
        self.assertCountEqual(found[b], [(18, 24)])
        self.assertFalse(c in found)
        probe.close_probe_finding_pool()
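
# Hedged sketch (an assumption about the implementation, but consistent with
# the k=3 assertion above) of the pigeonhole arithmetic: if a match of length
# lcf_thres contains at most `mismatches` mismatches, then at least one k-mer
# with k = lcf_thres // (mismatches + 1) must match exactly, so the pigeonhole
# approach can pick that k for the k-mer to probe map.
def _example_pigeonhole_k(mismatches=1, lcf_thres=6):
    return lcf_thres // (mismatches + 1)  # 6 // 2 == 3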
def test_open_close_pool_without_work(self):
    """Tests opening a probe finding pool and closing it without doing
    any work in between.

    A bug, stemming from an issue in early versions of Python, could
    cause closing the pool to hang indefinitely when no work was
    submitted.
    """
    probes = [probe.Probe.from_str('ABCDEF')]
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 0, 6, k=3)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
    for n_workers in [1, 2, 4, 7, 8, None]:
        probe.open_probe_finding_pool(kmer_map, f, n_workers)
        time.sleep(1)
        probe.close_probe_finding_pool()
        time.sleep(1)
def test_repetitive(self):
    """Tests with short sequence and short probes where the sequence
    and probes have repetitive sequences, so that one probe can cover
    a lot of the sequence.
    """
    np.random.seed(1)
    sequence = 'ABCAAAAAAAAAAXYZXYZXYZXYZAAAAAAAAAAAAAXYZ'
    a = probe.Probe.from_str('NAAAAAAN')
    probes = [a]
    # This should default to the random approach, so set k (rather than
    # min_k)
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 0, 6, k=6)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, f, n_workers)
        found = probe.find_probe_covers_in_sequence(sequence)
        self.assertCountEqual(found[a], [(3, 13), (25, 38)])
        probe.close_probe_finding_pool()
def test_island_with_exact_match2(self):
    """Tests the 'island_with_exact_match' argument for
    probe.probe_covers_sequence_by_longest_common_substring(..).
    """
    np.random.seed(1)
    sequence = 'ABCDEFGHIJKLMNOPCDEFGHQRSTU'
    a = probe.Probe.from_str('HXJKLMNOPCDE')
    b = probe.Probe.from_str('XIJKXMNOXCDE')
    c = probe.Probe.from_str('XIJKXMNOPXDE')
    probes = [a, b, c]
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 3, 6, k=3)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    fn = probe.probe_covers_sequence_by_longest_common_substring(3, 6, 4)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, fn, n_workers)
        found = probe.find_probe_covers_in_sequence(sequence)
        self.assertCountEqual(found[a], [(7, 19)])
        self.assertFalse(b in found)
        self.assertCountEqual(found[c], [(7, 19)])
        probe.close_probe_finding_pool()
def test_two_occurrences(self):
    """Tests with short sequence and short probes where one probe
    appears twice.
    """
    np.random.seed(1)
    sequence = 'ABCDEFGHIJKLMNOPCDEFGHQRSTU'
    a = probe.Probe.from_str('CDEFGH')
    b = probe.Probe.from_str('GHIJKL')
    c = probe.Probe.from_str('STUVWX')
    probes = [a, b, c]
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 0, 6, min_k=6)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, f, n_workers)
        found = probe.find_probe_covers_in_sequence(sequence)
        self.assertCountEqual(found[a], [(2, 8), (16, 22)])
        self.assertCountEqual(found[b], [(6, 12)])
        self.assertFalse(c in found)
        probe.close_probe_finding_pool()
def test_too_short_sequence_small_k(self):
    """Tests with sequence shorter than the probe length.
    """
    np.random.seed(1)
    sequence = 'ABCDEFGHI'
    a = probe.Probe.from_str('ABCDEFGHIJKL')
    b = probe.Probe.from_str('EFGHIJKLMNOP')
    c = probe.Probe.from_str('DEFGHIJKLMNO')
    d = probe.Probe.from_str('XYZXYZABCDEF')
    probes = [a, b, c, d]
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 0, 6, min_k=6, k=6)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, f, n_workers)
        found = probe.find_probe_covers_in_sequence(sequence)
        self.assertCountEqual(found[a], [(0, 9)])
        self.assertFalse(b in found)
        self.assertCountEqual(found[c], [(3, 9)])
        self.assertCountEqual(found[d], [(0, 6)])
        probe.close_probe_finding_pool()
def test_too_short_sequence_large_k(self):
    """Tests with sequence shorter than the probe length and also
    shorter than k.
    """
    np.random.seed(1)
    sequence = 'ABCDEFGHI'
    a = probe.Probe.from_str('ABCDEFGHIJKL')
    b = probe.Probe.from_str('EFGHIJKLMNOP')
    c = probe.Probe.from_str('DEFGHIJKLMNO')
    d = probe.Probe.from_str('XYZXYZABCDEF')
    probes = [a, b, c, d]
    # probe.find_probe_covers_in_sequence() should not attempt
    # to cover the sequence (return {}), but should run gracefully
    for k in [10, 11, 12]:
        kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
            probes, 0, 6, min_k=k, k=k)
        kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
        f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
        for n_workers in [1, 2, 4, 7, 8]:
            probe.open_probe_finding_pool(kmer_map, f, n_workers)
            found = probe.find_probe_covers_in_sequence(sequence)
            self.assertEqual(found, {})
            probe.close_probe_finding_pool()
def _find_covers_in_target_genomes(self):
    """Find intervals across the target genomes covered by the probe set.

    This considers the given probe set (self.probes) and determines the
    intervals, in each genome of the target genomes (as well as their
    reverse complements), that are covered by the probes.

    This saves a dict, self.target_covers, as follows:
    self.target_covers[i][j][b] is a list of all the intervals covered
    by the probes in target genome j of grouping i (in the reverse
    complement of the genome if b is True, and in the provided sequence
    if b is False).

    The endpoints of the intervals are offset so as to give unique
    integer positions in the genome (e.g., endpoints in the second
    chromosome are offset based on the length of the first chromosome).

    There may be duplicate intervals if two probes cover the same
    region of a sequence.
    """
    logger.info("Finding probe covers across target genomes")
    logger.info("Building map from k-mers to probes")
    # Note that if adapters are added to the probes before this filter
    # is run (which would be typical), then self.lcf_thres will likely
    # be less than the probe length. So the k-mer to probe map will
    # be constructed using the random approach (yielding many k-mers
    # and thus a slower runtime in finding probe covers) rather than
    # the pigeonhole approach.
    kmer_probe_map = probe.SharedKmerProbeMap.construct(
        probe.construct_kmer_probe_map_to_find_probe_covers(
            self.probes, self.mismatches, self.lcf_thres,
            min_k=self.kmer_probe_map_k, k=self.kmer_probe_map_k)
    )
    probe.open_probe_finding_pool(kmer_probe_map, self.cover_range_fn)

    self.target_covers = {}
    for i, j, gnm, rc in self._iter_target_genomes():
        if not rc:
            logger.info(("Computing coverage in grouping %d (of %d), "
                         "with target genome %d (of %d)"),
                        i + 1, len(self.target_genomes),
                        j + 1, len(self.target_genomes[i]))
        if i not in self.target_covers:
            self.target_covers[i] = {}
        if j not in self.target_covers[i]:
            self.target_covers[i][j] = {False: None, True: None}

        gnm_covers = []
        length_so_far = 0
        for sequence in gnm.seqs:
            if rc:
                # Take the reverse complement of sequence
                rc_map = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
                sequence = ''.join([rc_map.get(b, b)
                                    for b in sequence[::-1]])

            # Find cover ranges of the probes, while allowing the
            # ranges to overlap (e.g., if one probe covers two regions
            # that overlap)
            probe_cover_ranges = probe.find_probe_covers_in_sequence(
                sequence, merge_overlapping=False)
            for p, cover_ranges in probe_cover_ranges.items():
                for cover_range in cover_ranges:
                    # Extend the range covered by probe p on both sides
                    # by self.cover_extension
                    cover_start = max(
                        0, cover_range[0] - self.cover_extension)
                    cover_end = min(
                        len(sequence),
                        cover_range[1] + self.cover_extension)
                    # The endpoints of the cover give positions in just
                    # this sequence (chromosome), so adjust them
                    # (according to length_so_far) to give a unique
                    # integer position in the genome gnm
                    adjusted_cover = (cover_start + length_so_far,
                                      cover_end + length_so_far)
                    gnm_covers += [adjusted_cover]
            length_so_far += len(sequence)
        self.target_covers[i][j][rc] = gnm_covers

    probe.close_probe_finding_pool()
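
# Toy illustration (hypothetical lengths, not part of the method above) of
# the per-chromosome offset applied to cover endpoints: a cover at (10, 60)
# in the second sequence of a genome whose first sequence is 1000 bp long is
# recorded as (1010, 1060), giving it a unique position within the genome.
def _example_adjusted_cover(cover=(10, 60), length_so_far=1000):
    return (cover[0] + length_so_far, cover[1] + length_so_far)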
def _make_votes_across_target_genomes(self, probes):
    """Compute, for each probe, votes for adapters to the probe.

    Votes are computed, cumulatively, across all the target genomes in
    self.target_genomes.

    Args:
        probes: list of candidate probes

    Returns:
        a list L such that L[i] is a tuple (A,B) where A gives the
        number of 'A' adapter votes for the probe probes[i] and B gives
        the number of 'B' adapter votes
    """
    logger.info("Building map from k-mers to probes")
    kmer_probe_map = probe.SharedKmerProbeMap.construct(
        probe.construct_kmer_probe_map_to_find_probe_covers(
            probes, self.mismatches, self.lcf_thres,
            min_k=self.kmer_probe_map_k, k=self.kmer_probe_map_k)
    )
    probe.open_probe_finding_pool(kmer_probe_map, self.cover_range_fn)

    def iter_all_seqs():
        for genomes_from_group in self.target_genomes:
            for g in genomes_from_group:
                for seq in g.seqs:
                    yield seq

    # Store adapter votes for each probe in a list where the element
    # at index i is a tuple (A,B) that corresponds to the probe
    # probes[i] where A gives the 'A' votes for the probe and B gives
    # the 'B' votes
    cumulative_votes = [(0, 0) for _ in range(len(probes))]
    for sequence in iter_all_seqs():
        # Compute votes for the adapters for each probe in 'sequence',
        # and also exchange all 'A' votes with 'B' votes and
        # vice-versa. Determine whether or not the exchange matches
        # better with cumulative_votes so far, and update
        # cumulative_votes accordingly.
        votes = self._votes_in_sequence(probes, sequence)
        votes_flipped = self._flip_AB_votes(votes)
        cumulative_votes_with_nonflipped = self._sum_votes_per_probe(
            cumulative_votes, votes)
        sum_nonflipped = self._sum_plurality_vote_across_probes(
            cumulative_votes_with_nonflipped)
        cumulative_votes_with_flipped = self._sum_votes_per_probe(
            cumulative_votes, votes_flipped)
        sum_flipped = self._sum_plurality_vote_across_probes(
            cumulative_votes_with_flipped)
        if sum_flipped > sum_nonflipped:
            # Add onto cumulative votes the votes in 'votes_flipped'
            # because these could be said to yield a more decisive
            # choice of adapter for each probe (i.e., the sum, across
            # all probes, of the most common vote of adapter for the
            # probe is higher) than the (unflipped) votes in 'votes'
            cumulative_votes = cumulative_votes_with_flipped
        else:
            cumulative_votes = cumulative_votes_with_nonflipped

    probe.close_probe_finding_pool()

    return cumulative_votes
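
# Toy illustration (hypothetical vote counts, using plain helpers rather than
# the class's own _sum_votes_per_probe/_sum_plurality_vote_across_probes) of
# the flip decision above: keep whichever orientation of the new votes
# (flipped or not) yields the larger sum of per-probe plurality votes.
def _example_flip_decision():
    cumulative = [(3, 1), (0, 2)]         # (A, B) votes so far, per probe
    votes = [(0, 2), (1, 0)]              # votes from the current sequence
    flipped = [(b, a) for a, b in votes]  # exchange 'A' and 'B' votes

    def add(u, v):
        return [(a0 + a1, b0 + b1) for (a0, b0), (a1, b1) in zip(u, v)]

    def plurality_sum(v):
        return sum(max(a, b) for a, b in v)

    # Here the nonflipped sum is 5 and the flipped sum is 8, so the flipped
    # votes would be added onto the cumulative votes
    return plurality_sum(add(cumulative, flipped)) > \
        plurality_sum(add(cumulative, votes))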
def _make_ranks(self, candidate_probes, target_genomes):
    """Return a rank for each candidate probe to use in set cover.

    The "rank" of a candidate probe is a level of penalty for that
    probe, where higher ranks are more penalized. A set cover is sought
    that uses as many candidate probes from rank i as possible before
    considering probes with rank i+1.

    There are two considerations in computing ranks:
      - When identification is turned on (i.e., self.identify is True),
        the number of species that a probe "hits". Fewer hit species
        yields a smaller rank.
      - The number of bases in blacklisted genomes that the probe
        covers. Fewer covered bases yields a smaller rank.
    A probe that covers any part of a blacklisted genome will always
    receive a higher rank than a probe that does not. (This is achieved
    by first computing ranks using tuples of the form (x,y) where x=0
    for any probe that does not cover a blacklisted genome and x=1 for
    a probe that does; y determines relative rank among those probes
    with the same x value. The tuple ranks are then converted into
    integer ranks by sorting the tuples.) When identification is
    enabled, a probe that hits more than one grouping (e.g., species)
    will always receive a higher rank than a probe that only hits one
    grouping (and does not cover any blacklisted genomes).

    When identification is not turned on, weighted set cover
    effectively does the following:
     (1) Covers as much of the target genomes as possible while
         minimizing the number of probes, without using any probe that
         covers any part of a blacklisted genome.
     (2) Covers whatever portions of the target genomes remain to be
         covered by using probes that cover parts of blacklisted
         genomes, while first seeking probes that cover less of the
         blacklisted genomes (i.e., even if probe B covers much more of
         the target genomes than probe A, A will be chosen before B if
         B covers a tiny bit more of the blacklisted genomes than A).

    When identification is turned on, weighted set cover:
     (1) Covers as much of the target genomes as possible while
         minimizing the number of probes, only using probes that hit
         one grouping.
     (2) Covers whatever portions of the target genomes remain to be
         covered while minimizing the number of probes, only using
         probes that hit two groupings, etc.
     (3) Considers probes that cover parts of blacklisted genomes, if
         there remains more of the target genomes to cover.

    The output is intended for input to set_cover.approx_multiuniverse
    as the 'ranks' input.

    Args:
        candidate_probes: list of candidate probes
        target_genomes: list of groups of target genomes

    Returns:
        dict mapping set_ids (0 through len(candidate_probes)-1, each
        corresponding to a candidate probe) to a rank (integer) for
        that candidate probe
    """
    # Only open a probe finding pool if it will be needed
    need_probe_finding_pool = (self.identify or
                               len(self.blacklisted_genomes) > 0)

    if need_probe_finding_pool:
        logger.info("Building map from k-mers to probes")
        kmer_probe_map = probe.SharedKmerProbeMap.construct(
            probe.construct_kmer_probe_map_to_find_probe_covers(
                candidate_probes, self.mismatches_tolerant,
                self.lcf_thres_tolerant,
                min_k=self.kmer_probe_map_k, k=self.kmer_probe_map_k))
        probe.open_probe_finding_pool(
            kmer_probe_map, self.cover_range_tolerant_fn,
            use_native_dict=self.kmer_probe_map_use_native_dict)

    if self.identify:
        # Find the number of target genome groupings (e.g., species)
        # that each probe "hits". (A probe "hits" a grouping if it
        # covers a part of at least one target genome in that
        # grouping.) A probe that hits just one grouping is good for
        # identification and is therefore ranked relatively low (a
        # rank of 1); probes that hit more than one grouping are poor
        # for identification and their ranks are equal to the number
        # of groupings they hit.
        num_groupings_hit = self._count_num_groupings_hit(
            candidate_probes, target_genomes)
        rank_val = {p: (0, hit) for p, hit in num_groupings_hit.items()}
    else:
        # Start each probe with the same rank
        rank_val = {p: (0, 0) for p in candidate_probes}

    # Find probes that cover part of a blacklisted genome.
    # All of these get a higher rank than any probe that does not
    # cover any part of a blacklisted genome (since the first element
    # of the tuple put into rank_val is 1, but 0 was the first
    # element of the tuple above) and the rank among these is based
    # on the number of bp they cover.
    blacklisted_bp_covered = self._count_blacklisted_bp_covered(
        candidate_probes)
    for p, bp in blacklisted_bp_covered.items():
        if bp > 0:
            rank_val[p] = (1, bp)

    if need_probe_finding_pool:
        probe.close_probe_finding_pool()
        del kmer_probe_map
        gc.collect()

    # Convert the ranks, specified as tuples, into ranks from 0
    # upward. The probe(s) with the smallest tuple rank get(s)
    # rank 0, the probe(s) with the next smallest tuple rank get(s)
    # rank 1, and so on..
    all_rank_tuples = sorted(set(rank_val.values()))
    tuple_rank_idx = {}
    for i in range(len(all_rank_tuples)):
        tuple_rank_idx[all_rank_tuples[i]] = i
    ranks = {}
    for set_id, p in enumerate(candidate_probes):
        ranks[set_id] = tuple_rank_idx[rank_val[p]]
    return ranks
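
# Worked toy example (hypothetical probe names and values) of the
# tuple-to-integer rank conversion at the end of _make_ranks above: probes
# tied on the smallest tuple share rank 0, and any probe covering blacklisted
# bp (first tuple element 1) ranks after every probe that does not.
def _example_tuple_ranks():
    rank_val = {'p1': (0, 1), 'p2': (0, 3), 'p3': (1, 10), 'p4': (0, 1)}
    all_rank_tuples = sorted(set(rank_val.values()))  # [(0,1), (0,3), (1,10)]
    tuple_rank_idx = {t: i for i, t in enumerate(all_rank_tuples)}
    # {'p1': 0, 'p2': 1, 'p3': 2, 'p4': 0}
    return {p: tuple_rank_idx[t] for p, t in rank_val.items()}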
def _make_sets(self, candidate_probes, target_genomes):
    """Return a collection of sets to use in set cover.

    In the returned collection of sets, each set corresponds to a
    candidate probe and contains the bases of the target genomes
    covered by the candidate probe. The target genomes must be in
    grouped lists inside the list target_genomes.

    The output is intended for input to set_cover.approx_multiuniverse
    as the 'sets' input.

    Args:
        candidate_probes: list of candidate probes
        target_genomes: list of groups of target genomes

    Returns:
        a dict mapping set_ids (from 0 through len(candidate_probes)-1)
        to dicts, where the dict for a particular set_id maps
        universe_ids to sets. set_id corresponds to a candidate probe
        in candidate_probes and universe_id is a tuple that corresponds
        to a target genome in a grouping from target_genomes. The j'th
        target genome from the i'th grouping in target_genomes is given
        universe_id equal to (i,j). That is, i ranges from 0 through
        len(target_genomes)-1 (i.e., the number of groupings) and j
        ranges from 0 through (n_i)-1 where n_i is the number of target
        genomes in the i'th group. In the returned value (sets),
        sets[set_id][universe_id] is a set of all the bases (as an
        instance of interval.IntervalSet) covered by probe set_id in
        the target genome universe_id. (If sets[set_id][universe_id]
        contains just one interval, then that interval is stored
        directly as a tuple -- not in an instance of
        interval.IntervalSet -- to save space, and it should be
        converted to an interval.IntervalSet when needed.)
    """
    logger.info("Building map from k-mers to probes")
    kmer_probe_map = probe.SharedKmerProbeMap.construct(
        probe.construct_kmer_probe_map_to_find_probe_covers(
            candidate_probes, self.mismatches, self.lcf_thres,
            min_k=self.kmer_probe_map_k, k=self.kmer_probe_map_k))
    probe.open_probe_finding_pool(kmer_probe_map, self.cover_range_fn)

    probe_id = {}
    sets = {}
    for id, p in enumerate(candidate_probes):
        probe_id[p] = id
        sets[id] = {}

    for i, genomes_from_group in enumerate(target_genomes):
        for j, gnm in enumerate(genomes_from_group):
            logger.info(("Computing coverage in grouping %d (of %d), "
                         "with target genome %d (of %d)"),
                        i + 1, len(target_genomes),
                        j + 1, len(genomes_from_group))
            universe_id = (i, j)
            length_so_far = 0
            for sequence in gnm.seqs:
                probe_cover_ranges = probe.find_probe_covers_in_sequence(
                    sequence)
                # Add the bases of sequence that are covered by all the
                # probes into sets with universe_id equal to (i,j)
                for p, cover_ranges in probe_cover_ranges.items():
                    set_id = probe_id[p]
                    for cover_range in cover_ranges:
                        # Extend the range covered by probe p on both
                        # sides by self.cover_extension
                        cover_start = max(
                            0, cover_range[0] - self.cover_extension)
                        cover_end = min(
                            len(sequence),
                            cover_range[1] + self.cover_extension)
                        # The endpoints of the cover give positions in
                        # just this sequence (chromosome), so adding
                        # the lengths of all the sequences previously
                        # iterated (length_so_far) onto them gives
                        # unique integer positions in the genome gnm
                        adjusted_cover = (cover_start + length_so_far,
                                          cover_end + length_so_far)
                        if universe_id not in sets[set_id]:
                            # Since a list has a lot of overhead and
                            # most probes align to just one interval,
                            # simply store that interval alone (not in
                            # a list)
                            sets[set_id][universe_id] = adjusted_cover
                        else:
                            prev_cover = sets[set_id][universe_id]
                            if isinstance(prev_cover, tuple):
                                # This probe now aligns to two
                                # intervals in this universe/genome,
                                # so store them in a list
                                sets[set_id][universe_id] = [prev_cover]
                            sets[set_id][universe_id].append(
                                adjusted_cover)
                length_so_far += len(sequence)

    probe.close_probe_finding_pool()
    del kmer_probe_map
    gc.collect()

    # Make an IntervalSet out of the intervals of each set. But if
    # there is just one interval in a set, then save space by leaving
    # that entry as a tuple.
    for set_id in sets.keys():
        for universe_id in sets[set_id].keys():
            intervals = sets[set_id][universe_id]
            if not isinstance(intervals, tuple):
                sets[set_id][universe_id] = interval.IntervalSet(intervals)
            # Else, there is just one interval in this set; leave it
            # stored directly as a tuple

    return sets
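
# Hypothetical example (illustrative values only) of the structure returned
# by _make_sets above: set_id 0 covers a single interval in target genome
# (0, 0), so it stays a bare tuple, while set_id 1 covers two intervals there
# and is wrapped in an interval.IntervalSet, as described in the docstring.
def _example_sets_structure():
    return {
        0: {(0, 0): (120, 220)},
        1: {(0, 0): interval.IntervalSet([(40, 140), (900, 1000)]),
            (1, 2): (55, 155)},
    }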
def run_random(self, n, genome_min, genome_max, num_probes,
               probe_length=100, lcf_thres=None, seed=1, n_workers=2,
               use_native_dict=False):
    """Run tests with a randomly generated sequence.

    Repeatedly runs tests in which a sequence is randomly generated,
    probes are generated from that sequence, and then the probes are
    looked up in the sequence.

    Creates the probes with the intention of determining coverage with
    a longest common substring.

    Args:
        n: number of times to run the test
        genome_min/genome_max: the genome (sequence) size is randomly
            chosen between genome_min and genome_max
        num_probes: the number of probes generated from the random
            sequence
        probe_length: number of bp to make each probe
        lcf_thres: lcf threshold parameter; when None, it is randomly
            chosen to be either 80 or 100
        seed: random number generator seed
        n_workers: number of workers to have in a probe finding pool
        use_native_dict: have the probe finding pool use a native
            Python dict
    """
    np.random.seed(seed)
    fixed_lcf_thres = lcf_thres

    for _ in range(n):
        if fixed_lcf_thres is not None:
            lcf_thres = fixed_lcf_thres
        else:
            # Choose either lcf_thres=80 or lcf_thres=100
            lcf_thres = np.random.choice([80, 100])
        # Make a random sequence
        seq_length = np.random.randint(genome_min, genome_max)
        sequence = "".join(
            np.random.choice(['A', 'T', 'C', 'G'],
                             size=seq_length, replace=True))
        desired_probe_cover_ranges = defaultdict(list)
        # Make num_probes random probes
        probes = []
        for m in range(num_probes):
            subseq_start = np.random.randint(0, seq_length - probe_length)
            subseq_end = subseq_start + probe_length
            cover_length = np.random.randint(lcf_thres, probe_length + 1)
            cover_start = subseq_start + \
                np.random.randint(0, probe_length - cover_length + 1)
            cover_end = min(seq_length, cover_start + cover_length)
            probe_str_cover = sequence[cover_start:cover_end]
            # Add random bases before and after what the probe should
            # cover
            probe_str_start = "".join(
                np.random.choice(['A', 'T', 'C', 'G'],
                                 size=cover_start - subseq_start,
                                 replace=True))
            probe_str_end = "".join(
                np.random.choice(['A', 'T', 'C', 'G'],
                                 size=subseq_end - cover_end,
                                 replace=True))
            probe_str = probe_str_start + probe_str_cover + probe_str_end
            # Add 0, 1, 2, or 3 random mismatches
            for k in range(np.random.randint(0, 4)):
                pos = np.random.randint(0, probe_length)
                base_choices = [
                    b for b in ['A', 'T', 'C', 'G'] if b != probe_str[pos]
                ]
                probe_str = probe_str[:pos] + \
                    "".join(np.random.choice(base_choices, size=1)) + \
                    probe_str[(pos + 1):]
            p = probe.Probe.from_str(probe_str)
            desired_probe_cover_ranges[p].append((cover_start, cover_end))
            probes += [p]

        kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
            probes, 3, lcf_thres)
        kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
        f = probe.probe_covers_sequence_by_longest_common_substring(
            3, lcf_thres)
        probe.open_probe_finding_pool(kmer_map, f, n_workers,
                                      use_native_dict=use_native_dict)
        found = probe.find_probe_covers_in_sequence(sequence)
        probe.close_probe_finding_pool()

        # Check that this didn't find any extraneous probes and that
        # it found at least 95% of the original (it may miss some
        # due to false negatives in the approach)
        self.assertLessEqual(len(found), len(probes))
        self.assertGreaterEqual(len(found), 0.95 * len(probes))

        # Check that each desired probe was found correctly
        for p, cover_ranges in desired_probe_cover_ranges.items():
            if p not in found:
                continue
            found_cover_ranges = found[p]
            # This probe most likely was found once, but could have
            # been missed (due to false negatives in the approach) and
            # may have been found more than once due to chance (but
            # probably not too much more!)
            self.assertTrue(len(found_cover_ranges) in [1, 2])
            # The cover ranges should have been captured, and the ones
            # found may extend past what was desired by a small amount
            # due to allowing mismatches and chance.
            # Because of mismatches possibly added to the end of the
            # desired cover range, what was recaptured may not always
            # encompass the entire cover range, so allow some small
            # tolerance.
            for desired_cv in cover_ranges:
                found_desired_cv = False
                for found_cv in found_cover_ranges:
                    left_diff = desired_cv[0] - found_cv[0]
                    right_diff = found_cv[1] - desired_cv[1]
                    if left_diff >= -7 and left_diff < 15:
                        if right_diff >= -7 and right_diff < 15:
                            found_desired_cv = True
                            break
                self.assertTrue(found_desired_cv)