def test_no_match_with_mismatches(self):
    """Tests that no cover range is reported for probes that should not
    match self.seq, even when 1 mismatch is allowed (lcf_thres=6).
    """
    f = probe.probe_covers_sequence_by_longest_common_substring(1, 6)
    match = f('ZZZABCGHIXKXXYZ', self.seq, 6, 9, 15, len(self.seq))
    # assertIsNone shows the unexpected value if the check fails
    self.assertIsNone(match)

    f = probe.probe_covers_sequence_by_longest_common_substring(1, 6)
    match = f('ZZZZZAGHIJYZ', self.seq, 6, 9, 12, len(self.seq))
    self.assertIsNone(match)
def test_more_than_cover(self):
    """Tests with short sequence and short probes where probes contain
    more than what they cover.

    The search is repeated with several pool sizes; each run must report
    the same cover ranges.
    """
    np.random.seed(1)
    sequence = 'ABCDEFGHIJKLMNOPQR' + ('Z' * 100) + 'STUVWXYZ'
    a = probe.Probe.from_str('XYZCDEFGHIJKABCSTUVWXABC')
    b = probe.Probe.from_str('PQRSGHIJKLMNXYZ')
    c = probe.Probe.from_str('ABCFGHIJKLZAZAZAGHIJKL')
    probes = [a, b, c]
    # This should default to the random approach, so set k (rather than
    # min_k)
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 0, 6, k=6)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, f, n_workers)
        try:
            found = probe.find_probe_covers_in_sequence(sequence)
            self.assertCountEqual(found[a], [(2, 11), (118, 124)])
            self.assertCountEqual(found[b], [(6, 14)])
            self.assertCountEqual(found[c], [(5, 12)])
        finally:
            # Close the pool even when an assertion fails so that it is
            # not left open for whatever runs next
            probe.close_probe_finding_pool()
def test_island_with_exact_match1(self):
    """Tests the 'island_of_exact_match' argument for
    probe.probe_covers_sequence_by_longest_common_substring(..).

    Each probe is 'CDEFGH' with at most one position replaced by 'X';
    the cover function allows 1 mismatch, lcf_thres=6, and requires an
    exact-match island of length 4.
    """
    np.random.seed(1)
    sequence = 'ABCDEFGHIJKLMNOPYDEFGHQRSTU'
    a = probe.Probe.from_str('XDEFGH')
    b = probe.Probe.from_str('CXEFGH')
    c = probe.Probe.from_str('CDXFGH')
    d = probe.Probe.from_str('CDEXGH')
    e = probe.Probe.from_str('CDEFXH')
    f = probe.Probe.from_str('CDEFGX')
    g = probe.Probe.from_str('CDEFGH')
    probes = [a, b, c, d, e, f, g]
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 1, 6, k=3)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    fn = probe.probe_covers_sequence_by_longest_common_substring(1, 6, 4)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, fn, n_workers)
        try:
            found = probe.find_probe_covers_in_sequence(sequence)
            self.assertCountEqual(found[a], [(2, 8), (16, 22)])
            self.assertCountEqual(found[b], [(2, 8)])
            # c and d have their mismatch in the middle, leaving no
            # exact stretch of length 4
            self.assertNotIn(c, found)
            self.assertNotIn(d, found)
            self.assertCountEqual(found[e], [(2, 8)])
            self.assertCountEqual(found[f], [(2, 8)])
            self.assertCountEqual(found[g], [(2, 8), (16, 22)])
        finally:
            # Close the pool even when an assertion fails so that it is
            # not left open for whatever runs next
            probe.close_probe_finding_pool()
def test_multiple_searches_with_same_pool(self):
    """Tests more than one call to find_probe_covers_in_sequence() with
    the same pool.

    Two different sequences are searched back-to-back while a single
    pool is open; this is repeated for several pool sizes.
    """
    np.random.seed(1)
    sequence_a = 'ABCAXYZXYZDEFXYZAAYZ'
    sequence_b = 'GHIDAXYZXYZAAABCABCD'
    a = probe.Probe.from_str('AXYZXYZ')
    b = probe.Probe.from_str('AABCABC')
    probes = [a, b]
    # This should default to the random approach, so set k (rather than
    # min_k)
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 0, 6, k=3)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, f, n_workers)
        try:
            found_a = probe.find_probe_covers_in_sequence(sequence_a)
            self.assertEqual(found_a, {a: [(3, 10)]})
            found_b = probe.find_probe_covers_in_sequence(sequence_b)
            self.assertEqual(found_b, {a: [(4, 11)], b: [(12, 19)]})
        finally:
            # Close the pool even when an assertion fails so that it is
            # not left open for whatever runs next
            probe.close_probe_finding_pool()
def test_pigeonhole_with_mismatch(self):
    """Tests with short sequence and short probes where the call to
    construct_kmer_probe_map_to_find_probe_covers tries the pigeonhole
    approach.

    The same expectations are checked twice: once with a k-mer map for
    which k=3 is chosen, and once with a map for which k=4 is chosen.
    """
    np.random.seed(1)
    sequence = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
    a = probe.Probe.from_str('GHIJXL')
    b = probe.Probe.from_str('BTUVWX')
    c = probe.Probe.from_str('ACEFHJ')
    probes = [a, b, c]

    def check_covers(kmer_map):
        # Search the sequence with several pool sizes and verify the
        # expected cover ranges for each probe
        f = probe.probe_covers_sequence_by_longest_common_substring(1, 6)
        for n_workers in [1, 2, 4, 7, 8]:
            probe.open_probe_finding_pool(kmer_map, f, n_workers)
            try:
                found = probe.find_probe_covers_in_sequence(sequence)
                self.assertCountEqual(found[a], [(6, 12)])
                self.assertCountEqual(found[b], [(18, 24)])
                self.assertNotIn(c, found)
            finally:
                # Close the pool even when an assertion fails so that
                # it is not left open for whatever runs next
                probe.close_probe_finding_pool()

    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 1, 6, min_k=3, k=4)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    # This should try the pigeonhole approach, which should choose k=3
    self.assertEqual(kmer_map.k, 3)
    check_covers(kmer_map)

    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 1, 6, min_k=4, k=4)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    # This should try the pigeonhole approach and fail because it
    # chooses k=3, but min_k=4. So it should then try the random
    # approach with k=4.
    self.assertEqual(kmer_map.k, 4)
    check_covers(kmer_map)
def __init__(self, probes, mismatches, lcf_thres, target_genomes,
             target_genomes_names=None, island_of_exact_match=0,
             cover_extension=0, kmer_probe_map_k=10, rc_too=True):
    """Store the probe set, target genomes and hybridization parameters.

    Args:
        probes: collection of probe.Probe instances making up a
            complete probe set
        mismatches/lcf_thres: a probe is considered to hybridize to a
            sequence when a stretch of at least 'lcf_thres' bp aligns
            with at most 'mismatches' mismatched bp; this determines
            whether a probe "covers" part of a sequence
        target_genomes: list [g_1, g_2, ..., g_m] of m genome
            groupings, each g_i being a list of genome.Genomes in
            group i (e.g., one grouping per species)
        target_genomes_names: list [s_1, s_2, ..., s_m] of strings
            naming the groupings in target_genomes. When not given,
            names are generated as "Group i".
        island_of_exact_match: hybridization additionally requires an
            exact match of at least this length
        cover_extension: number of bp added to each side of the
            hybridized portion when computing what a probe "covers"
        kmer_probe_map_k: value used as both min_k and k in calls to
            probe.construct_kmer_probe_map...
        rc_too: when True, the reverse complements of the target
            genomes are analyzed as well; when False they are not

    Raises:
        ValueError: if names are given but their count differs from
            the number of genome groupings
    """
    self.probes = probes
    self.target_genomes = target_genomes

    if not target_genomes_names:
        # No names supplied; label the groupings by index
        target_genomes_names = ["Group %d" % idx
                                for idx in range(len(target_genomes))]
    elif len(target_genomes_names) != len(target_genomes):
        raise ValueError(("Number of target genome names must be same "
                          "as the number of target genomes"))
    self.target_genomes_names = target_genomes_names

    self.mismatches = mismatches
    self.lcf_thres = lcf_thres
    # Default model of hybridization
    cover_fn = probe.probe_covers_sequence_by_longest_common_substring(
        mismatches, lcf_thres, island_of_exact_match)
    self.cover_range_fn = cover_fn

    self.cover_extension = cover_extension
    self.kmer_probe_map_k = kmer_probe_map_k
    self.rc_too = rc_too
def test_match_no_mismatches(self):
    """Tests that exact matches are found, and the expected cover range
    in self.seq is returned, when no mismatches are allowed
    (lcf_thres=6).
    """
    f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)

    match = f('ZZZABCGHIJKLXYZ', self.seq, 6, 9, 15, len(self.seq))
    self.assertIsNotNone(match)
    start, end = match
    # Compare the range as a whole so a failure shows both endpoints
    self.assertEqual((start, end), (6, 12))

    match = f('ZZZZAFGHIJKLMDEF', self.seq, 6, 9, 15, len(self.seq))
    self.assertIsNotNone(match)
    start, end = match
    self.assertEqual((start, end), (5, 13))
def test_match_with_mismatches(self):
    """Tests that matches are found, and the expected cover range in
    self.seq is returned, when 1 mismatch is allowed (lcf_thres=6).
    """
    f = probe.probe_covers_sequence_by_longest_common_substring(1, 6)

    match = f('ZZZGHIGHIXKLDEF', self.seq, 6, 9, 15, len(self.seq))
    self.assertIsNotNone(match)
    start, end = match
    # Compare the range as a whole so a failure shows both endpoints
    self.assertEqual((start, end), (6, 12))

    match = f('ZZZZZZGHIJKXSWZ', self.seq, 6, 9, 15, len(self.seq))
    self.assertIsNotNone(match)
    start, end = match
    self.assertEqual((start, end), (6, 12))

    match = f('ZZAGTFGHIJKXM', self.seq, 6, 9, 13, len(self.seq))
    self.assertIsNotNone(match)
    start, end = match
    self.assertEqual((start, end), (5, 13))
def test_open_close_pool_without_work(self):
    """Opens and then closes a probe finding pool with no work submitted
    in between.

    Guards against a past bug (stemming from a bug in early versions of
    Python) in which closing a pool that received no work could hang
    indefinitely.
    """
    single_probe = probe.Probe.from_str('ABCDEF')
    raw_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        [single_probe], 0, 6, k=3)
    shared_map = probe.SharedKmerProbeMap.construct(raw_map)
    cover_fn = probe.probe_covers_sequence_by_longest_common_substring(
        0, 6)
    # None is included as a worker count in addition to explicit sizes
    for worker_count in (1, 2, 4, 7, 8, None):
        probe.open_probe_finding_pool(shared_map, cover_fn, worker_count)
        time.sleep(1)
        probe.close_probe_finding_pool()
        time.sleep(1)
def test_repetitive(self):
    """Tests with short sequence and short probes where the sequence
    and probes have repetitive sequences, so that one probe can cover
    a lot of the sequence.
    """
    np.random.seed(1)
    sequence = 'ABCAAAAAAAAAAXYZXYZXYZXYZAAAAAAAAAAAAAXYZ'
    a = probe.Probe.from_str('NAAAAAAN')
    probes = [a]
    # This should default to the random approach, so set k (rather than
    # min_k)
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 0, 6, k=6)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, f, n_workers)
        try:
            found = probe.find_probe_covers_in_sequence(sequence)
            self.assertCountEqual(found[a], [(3, 13), (25, 38)])
        finally:
            # Close the pool even when an assertion fails so that it is
            # not left open for whatever runs next
            probe.close_probe_finding_pool()
def test_island_with_exact_match2(self):
    """Tests the 'island_of_exact_match' argument for
    probe.probe_covers_sequence_by_longest_common_substring(..).

    Uses longer probes with up to 3 mismatches (lcf_thres=6) and a
    required exact-match island of length 4.
    """
    np.random.seed(1)
    sequence = 'ABCDEFGHIJKLMNOPCDEFGHQRSTU'
    a = probe.Probe.from_str('HXJKLMNOPCDE')
    b = probe.Probe.from_str('XIJKXMNOXCDE')
    c = probe.Probe.from_str('XIJKXMNOPXDE')
    probes = [a, b, c]
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 3, 6, k=3)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    fn = probe.probe_covers_sequence_by_longest_common_substring(3, 6, 4)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, fn, n_workers)
        try:
            found = probe.find_probe_covers_in_sequence(sequence)
            self.assertCountEqual(found[a], [(7, 19)])
            self.assertNotIn(b, found)
            self.assertCountEqual(found[c], [(7, 19)])
        finally:
            # Close the pool even when an assertion fails so that it is
            # not left open for whatever runs next
            probe.close_probe_finding_pool()
def test_two_occurrences(self):
    """Tests with short sequence and short probes where one probe
    appears twice.
    """
    np.random.seed(1)
    sequence = 'ABCDEFGHIJKLMNOPCDEFGHQRSTU'
    a = probe.Probe.from_str('CDEFGH')
    b = probe.Probe.from_str('GHIJKL')
    c = probe.Probe.from_str('STUVWX')
    probes = [a, b, c]
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 0, 6, min_k=6)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, f, n_workers)
        try:
            found = probe.find_probe_covers_in_sequence(sequence)
            self.assertCountEqual(found[a], [(2, 8), (16, 22)])
            self.assertCountEqual(found[b], [(6, 12)])
            self.assertNotIn(c, found)
        finally:
            # Close the pool even when an assertion fails so that it is
            # not left open for whatever runs next
            probe.close_probe_finding_pool()
def test_too_short_sequence_small_k(self):
    """Tests with sequence shorter than the probe length.
    """
    np.random.seed(1)
    sequence = 'ABCDEFGHI'
    a = probe.Probe.from_str('ABCDEFGHIJKL')
    b = probe.Probe.from_str('EFGHIJKLMNOP')
    c = probe.Probe.from_str('DEFGHIJKLMNO')
    d = probe.Probe.from_str('XYZXYZABCDEF')
    probes = [a, b, c, d]
    kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
        probes, 0, 6, min_k=6, k=6)
    kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
    f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
    for n_workers in [1, 2, 4, 7, 8]:
        probe.open_probe_finding_pool(kmer_map, f, n_workers)
        try:
            found = probe.find_probe_covers_in_sequence(sequence)
            self.assertCountEqual(found[a], [(0, 9)])
            self.assertNotIn(b, found)
            self.assertCountEqual(found[c], [(3, 9)])
            self.assertCountEqual(found[d], [(0, 6)])
        finally:
            # Close the pool even when an assertion fails so that it is
            # not left open for whatever runs next
            probe.close_probe_finding_pool()
def __init__(self, adapter_a, adapter_b, mismatches, lcf_thres,
             island_of_exact_match=0, kmer_probe_map_k=20):
    """Store the adapter sequences and hybridization parameters.

    Args:
        adapter_a: tuple (x, y); x is the A adapter sequence to add
            onto the 5' end of a probe and y is the A adapter sequence
            to add onto the 3' end
        adapter_b: tuple (x, y); x is the B adapter sequence to add
            onto the 5' end of a probe and y is the B adapter sequence
            to add onto the 3' end
        mismatches/lcf_thres: a probe is considered to hybridize to a
            sequence when a stretch of at least 'lcf_thres' bp aligns
            with at most 'mismatches' mismatched bp
        island_of_exact_match: hybridization additionally requires an
            exact match of at least this length
        kmer_probe_map_k: value used as both min_k and k in calls to
            probe.construct_kmer_probe_map...

    Raises:
        ValueError: if adapter_a or adapter_b does not have length 2
    """
    both_are_pairs = len(adapter_a) == 2 and len(adapter_b) == 2
    if not both_are_pairs:
        raise ValueError(("adapter_a/adapter_b arguments must be tuples "
                          "of length 2, giving the sequences to add onto "
                          "the 5' and 3' ends"))

    a_5end, a_3end = adapter_a
    b_5end, b_3end = adapter_b
    self.adapter_a_5end = a_5end
    self.adapter_a_3end = a_3end
    self.adapter_b_5end = b_5end
    self.adapter_b_3end = b_3end

    self.mismatches = mismatches
    self.lcf_thres = lcf_thres
    # Default model of hybridization
    hybridization_params = {
        'mismatches': mismatches,
        'lcf_thres': lcf_thres,
        'island_of_exact_match': island_of_exact_match,
    }
    self.cover_range_fn = \
        probe.probe_covers_sequence_by_longest_common_substring(
            **hybridization_params)

    self.kmer_probe_map_k = kmer_probe_map_k
def test_too_short_sequence_large_k(self):
    """Tests with sequence shorter than the probe length and also
    shorter than k.
    """
    np.random.seed(1)
    sequence = 'ABCDEFGHI'
    a = probe.Probe.from_str('ABCDEFGHIJKL')
    b = probe.Probe.from_str('EFGHIJKLMNOP')
    c = probe.Probe.from_str('DEFGHIJKLMNO')
    d = probe.Probe.from_str('XYZXYZABCDEF')
    probes = [a, b, c, d]
    # probe.find_probe_covers_in_sequence() should not attempt
    # to cover the sequence (return {}), but should run gracefully
    for k in [10, 11, 12]:
        kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
            probes, 0, 6, min_k=k, k=k)
        kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
        f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
        for n_workers in [1, 2, 4, 7, 8]:
            probe.open_probe_finding_pool(kmer_map, f, n_workers)
            try:
                found = probe.find_probe_covers_in_sequence(sequence)
                self.assertEqual(found, {})
            finally:
                # Close the pool even when an assertion fails so that
                # it is not left open for whatever runs next
                probe.close_probe_finding_pool()
def test_no_match_with_probe_smaller_than_lcf_thres(self):
    """Tests that a 4-character probe yields no cover range when
    lcf_thres is 6, searching only the slice self.seq[6:10].
    """
    f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
    match = f('GHIX', self.seq[6:10], 1, 3, 4, len(self.seq[6:10]))
    # assertIsNone shows the unexpected value if the check fails
    self.assertIsNone(match)
def run_random(self, n, genome_min, genome_max, num_probes,
               probe_length=100, lcf_thres=None, seed=1, n_workers=2,
               use_native_dict=False):
    """Run tests with a randomly generated sequence.

    Repeatedly runs tests in which a sequence is randomly generated,
    probes are generated from that sequence, and then the probes are
    looked up in the sequence.

    Creates the probes with the intention of determining coverage with
    a longest common substring.

    Args:
        n: number of times to run the test
        genome_min/genome_max: the genome (sequence) size is
            randomly chosen between genome_min and genome_max
        num_probes: the number of probes generated from the random
            sequence
        probe_length: number of bp to make each probe
        lcf_thres: lcf threshold parameter; when None, it is
            randomly chosen among 80 and 100
        seed: random number generator seed
        n_workers: number of workers to have in a probe finding pool
        use_native_dict: have the probe finding pool use a native Python
            dict
    """
    bases = ['A', 'T', 'C', 'G']

    np.random.seed(seed)
    fixed_lcf_thres = lcf_thres

    # NOTE: the order of np.random calls below determines the generated
    # data for a given seed, so it must not be rearranged
    for _ in range(n):
        if fixed_lcf_thres is not None:
            lcf_thres = fixed_lcf_thres
        else:
            # Choose either lcf_thres=80 or lcf_thres=100
            lcf_thres = np.random.choice([80, 100])

        # Make a random sequence
        seq_length = np.random.randint(genome_min, genome_max)
        sequence = "".join(
            np.random.choice(bases, size=seq_length, replace=True))

        desired_probe_cover_ranges = defaultdict(list)

        # Make num_probes random probes
        probes = []
        for _ in range(num_probes):
            subseq_start = np.random.randint(0, seq_length - probe_length)
            subseq_end = subseq_start + probe_length
            cover_length = np.random.randint(lcf_thres, probe_length + 1)
            cover_start = subseq_start + \
                np.random.randint(0, probe_length - cover_length + 1)
            cover_end = min(seq_length, cover_start + cover_length)
            probe_str_cover = sequence[cover_start:cover_end]

            # Add random bases before and after what the probe should
            # cover
            probe_str_start = "".join(
                np.random.choice(bases,
                                 size=cover_start - subseq_start,
                                 replace=True))
            probe_str_end = "".join(
                np.random.choice(bases,
                                 size=subseq_end - cover_end,
                                 replace=True))
            probe_str = probe_str_start + probe_str_cover + probe_str_end

            # Add 0, 1, 2, or 3 random mismatches
            for _ in range(np.random.randint(0, 4)):
                pos = np.random.randint(0, probe_length)
                base_choices = [b for b in bases if b != probe_str[pos]]
                probe_str = probe_str[:pos] + \
                    "".join(np.random.choice(base_choices, size=1)) + \
                    probe_str[(pos + 1):]

            p = probe.Probe.from_str(probe_str)
            desired_probe_cover_ranges[p].append((cover_start, cover_end))
            probes.append(p)

        kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
            probes, 3, lcf_thres)
        kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
        f = probe.probe_covers_sequence_by_longest_common_substring(
            3, lcf_thres)
        probe.open_probe_finding_pool(kmer_map, f, n_workers,
                                      use_native_dict=use_native_dict)
        try:
            found = probe.find_probe_covers_in_sequence(sequence)
        finally:
            # Close the pool even if the search raises so that it is
            # not left open for the next iteration or test
            probe.close_probe_finding_pool()

        # Check that this didn't find any extraneous probes and that
        # it found at least 95% of the original (it may miss some
        # due to false negatives in the approach)
        self.assertLessEqual(len(found), len(probes))
        self.assertGreaterEqual(len(found), 0.95 * len(probes))

        # Check that each desired probe was found correctly
        for p, cover_ranges in desired_probe_cover_ranges.items():
            if p not in found:
                continue
            found_cover_ranges = found[p]
            # This probe most likely was found once, but could have
            # been missed (due to false negatives in the approach) and
            # may have been found more than once due to chance (but
            # probably not too much more!)
            self.assertIn(len(found_cover_ranges), [1, 2])

            # The cover ranges should have been captured, and the ones
            # found may extend past what was desired by a small amount
            # due to allowing mismatches and chance
            # Because of mismatches possibly added to the end of the
            # desired cover range, what was recaptured may not always
            # encompass the entire cover range, so allow some small
            # tolerance
            for desired_cv in cover_ranges:
                found_desired_cv = any(
                    -7 <= desired_cv[0] - found_cv[0] < 15 and
                    -7 <= found_cv[1] - desired_cv[1] < 15
                    for found_cv in found_cover_ranges)
                self.assertTrue(found_desired_cv)
def test_match_with_island_of_exact_match(self):
    """Checks that the cover range (6, 12) is returned for a probe when
    1 mismatch is allowed, lcf_thres is 6, and an exact-match island of
    length 4 is required.
    """
    cover_fn = probe.probe_covers_sequence_by_longest_common_substring(
        1, 6, 4)
    seq_len = len(self.seq)
    observed = cover_fn('ZZZGHIGHIJXLDEF', self.seq, 6, 9, 15, seq_len)
    self.assertEqual(observed, (6, 12))
def test_no_match_with_island_of_exact_match(self):
    """Tests that no cover range is returned for a probe when 1 mismatch
    is allowed, lcf_thres is 6, and an exact-match island of length 4
    is required.
    """
    f = probe.probe_covers_sequence_by_longest_common_substring(1, 6, 4)
    match = f('ZZZGHIGHIXKLDEF', self.seq, 6, 9, 15, len(self.seq))
    # assertIsNone shows the unexpected value if the check fails
    self.assertIsNone(match)
def test_no_match_with_probe_longer_than_sequence(self):
    """Tests that no cover range is returned for 'DEFX' against the
    4-character sequence 'DEFG' with no mismatches allowed.
    """
    f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
    match = f('DEFX', 'DEFG', 1, 3, 7, 4)
    # assertIsNone shows the unexpected value if the check fails
    self.assertIsNone(match)
def test_match_with_probe_longer_than_sequence(self):
    """Checks that 'DEFG' is reported as covering all of the
    4-character sequence 'DEFG' with no mismatches allowed.
    """
    cover_fn = probe.probe_covers_sequence_by_longest_common_substring(
        0, 6)
    short_seq = 'DEFG'
    observed = cover_fn('DEFG', short_seq, 1, 3, 7, 4)
    self.assertEqual(observed, (0, 4))
def __init__(self, mismatches, lcf_thres, island_of_exact_match=0,
             mismatches_tolerant=None, lcf_thres_tolerant=None,
             island_of_exact_match_tolerant=None,
             custom_cover_range_fn=None,
             custom_cover_range_tolerant_fn=None,
             identify=False, blacklisted_genomes=None, coverage=1.0,
             cover_extension=0, cover_groupings_separately=False,
             kmer_probe_map_k=20, kmer_probe_map_use_native_dict=False):
    """
    Args:
        mismatches/lcf_thres: consider a probe to hybridize to a sequence
            if a stretch of 'lcf_thres' or more bp aligns with
            'mismatches' or fewer mismatched bp; used to compute whether
            a probe "covers" a portion of a sequence
        island_of_exact_match: for a probe to hybridize to a sequence,
            require that there be an exact match of length at least
            'island_of_exact_match'
        mismatches_tolerant/lcf_thres_tolerant: more tolerant values
            corresponding to 'mismatches' and 'lcf_thres'. It should
            generally be true that 'mismatches_tolerant' > 'mismatches'
            and 'lcf_thres_tolerant' < 'lcf_thres'. These values are
            used in determining the overlap that a candidate probe has
            with different groupings when the identification option is
            enabled. They are also used when determining the coverage of
            each candidate probe with the blacklisted genomes. They are
            meant to capture more potential hybridizations (i.e., be
            more sensitive). When not set, they are by default equal to
            'mismatches' and 'lcf_thres'.
        island_of_exact_match_tolerant: more tolerant value
            corresponding to 'island_of_exact_match'. It should
            generally be true that this value is less than
            'island_of_exact_match'. Used when
            mismatches_tolerant/lcf_thres_tolerant are used.
        custom_cover_range_fn: if set, tuple (path, fn) where path gives
            a path to a Python module and fn gives the name of a
            function in that module. This function is dynamically
            loaded and used to determine whether a probe will hybridize
            to a region of target sequence (and what portion will
            hybridize). The function must accept the same arguments as
            the function returned by
            probe.probe_covers_sequence_by_longest_common_substring()
            and return the same value. When set, the parameters
            'mismatches', 'lcf_thres', and 'island_of_exact_match' are
            ignored (even if their values are default values) because
            they are only used in the default cover_range_fn.
        custom_cover_range_tolerant_fn: same as custom_cover_range_fn,
            but with more tolerance for hybridization; likewise, the
            _tolerant parameters are ignored when this is set.
        identify: when True, indicates that probes should be designed
            with the identification option enabled (default is False)
        blacklisted_genomes: list of paths to FASTA files of genomes
            that should be blacklisted (i.e., probes are penalized by
            the amount they cover these genomes). When None (the
            default), a new empty list is used.
        coverage: either a float in [0,1] or an int > 1. When it is a
            float in [0,1], it determines the fraction of each of the
            target genomes that must be covered by the selected probes.
            When it is an int > 1, it determines the number of bp of
            each of the target genomes that must be covered by the
            selected probes.
        cover_extension: number of bp by which to extend the coverage
            of a probe on both sides. When this is 0, a probe "covers"
            exactly the portion of the sequence that it hybridizes to,
            as determined with the 'mismatches' and 'lcf_thres'
            parameters. This parameter allows a probe to cover a region
            surrounding and including the portion of the sequence that
            it hybridizes to. The probe covers the portion of the
            sequence that it hybridizes to, as well as
            'cover_extension' bp on each side of that portion. (So the
            length of the region a probe covers is the length of the
            probe plus 2*'cover_extension' bp.) This may more
            realistically model hybrid selection because a probe
            hybridizes to a fragment of DNA, which includes the region
            targeted by the probe as well as the surrounding region,
            and this entire fragment is sequenced. Increasing the value
            of this parameter should reduce the number of required
            probes.
        cover_groupings_separately: when True, runs a separate instance
            of set cover with the target genomes from each grouping and
            yields the probes selected across (the union of) all the
            runs. (When False, just one instance of set cover is run.)
            This improves runtime by reducing the number of universes
            (and thus overall universe size) given to each instance of
            set cover, but it may yield more probes than just one
            instance would yield, particularly when the genomes across
            groupings are similar at a nucleotide level.
        kmer_probe_map_k: in calls to
            probe.construct_kmer_probe_map..., uses this value as
            min_k and k
        kmer_probe_map_use_native_dict: when finding probe covers for
            identification or blacklisting, use the native Python dict
            of SharedKmerProbeMap rather than its primitive types that
            are more suited for sharing across processes; depending on
            the input this can result in considerably more memory use
            but may give an improvement in runtime
    """
    if custom_cover_range_fn is not None:
        # Use a custom function to determine whether a probe hybridizes
        # to a region of target sequence (and what part hybridizes),
        # rather than the default model. Ignore the given values for
        # mismatches and lcf_thres (which may be default values) because
        # these are only relevant for the default model
        self.mismatches, self.lcf_thres = None, None

        # Dynamically load the function
        fn_path, fn_name = custom_cover_range_fn
        self.cover_range_fn = dynamic_load.load_function_from_path(
            fn_path, fn_name)
    else:
        self.mismatches = mismatches
        self.lcf_thres = lcf_thres

        # Construct a function using the default model of hybridization
        self.cover_range_fn = \
            probe.probe_covers_sequence_by_longest_common_substring(
                mismatches, lcf_thres, island_of_exact_match)

    # Fall back to the non-tolerant values when tolerant ones are unset.
    # NOTE(review): these checks are falsy tests, so an explicit 0 is
    # also treated as "unset" and replaced; confirm that is intended
    # before switching them to `is None`.
    if not mismatches_tolerant:
        mismatches_tolerant = mismatches
    if not lcf_thres_tolerant:
        lcf_thres_tolerant = lcf_thres
    if not island_of_exact_match_tolerant:
        island_of_exact_match_tolerant = island_of_exact_match

    if custom_cover_range_tolerant_fn is not None:
        # Use a custom function to determine, with more tolerance,
        # whether a probe hybridizes to a region of target sequence (and
        # what part hybridizes), rather than the default model. Ignore
        # the given values of mismatches_tolerant and lcf_thres_tolerant
        # (which may be default values) because these are only relevant
        # for the default model
        self.mismatches_tolerant, self.lcf_thres_tolerant = None, None

        # Dynamically load the function
        fn_path, fn_name = custom_cover_range_tolerant_fn
        self.cover_range_tolerant_fn = \
            dynamic_load.load_function_from_path(fn_path, fn_name)
    else:
        self.mismatches_tolerant = mismatches_tolerant
        self.lcf_thres_tolerant = lcf_thres_tolerant

        # Construct a function using the default model of hybridization
        self.cover_range_tolerant_fn = \
            probe.probe_covers_sequence_by_longest_common_substring(
                mismatches_tolerant, lcf_thres_tolerant,
                island_of_exact_match_tolerant)

    # Warn if identification is enabled but the coverage is high; the
    # threshold is 0.25 for fractional coverage and 5000 for bp coverage
    if identify:
        if 0.25 <= coverage <= 1.0 or coverage >= 5000:
            logger.warning(("Identification is enabled but the required "
                            "coverage is high; generally coverage should "
                            "be small when performing identification"))

    self.identify = identify
    # A fresh list per instance avoids sharing one mutable default
    # object between all instances
    if blacklisted_genomes is None:
        blacklisted_genomes = []
    self.blacklisted_genomes = blacklisted_genomes
    self.coverage = coverage
    self.cover_extension = cover_extension
    self.cover_groupings_separately = cover_groupings_separately
    self.kmer_probe_map_k = kmer_probe_map_k
    self.kmer_probe_map_use_native_dict = kmer_probe_map_use_native_dict
def __init__(self, adapter_a, adapter_b, mismatches, lcf_thres,
             island_of_exact_match=0, custom_cover_range_fn=None,
             kmer_probe_map_k=20):
    """Store the adapter sequences and set up the cover range function.

    Args:
        adapter_a: tuple (x, y); x is the A adapter sequence to add
            onto the 5' end of a probe and y is the A adapter sequence
            to add onto the 3' end
        adapter_b: tuple (x, y); x is the B adapter sequence to add
            onto the 5' end of a probe and y is the B adapter sequence
            to add onto the 3' end
        mismatches/lcf_thres: a probe is considered to hybridize to a
            sequence when a stretch of at least 'lcf_thres' bp aligns
            with at most 'mismatches' mismatched bp
        island_of_exact_match: hybridization additionally requires an
            exact match of at least this length
        custom_cover_range_fn: if set, a tuple (path, fn) giving the
            path to a Python module and the name of a function in it.
            The function is loaded dynamically and decides whether (and
            over what portion) a probe hybridizes to a region of target
            sequence. It must take the same arguments and return the
            same value as the function returned by
            probe.probe_covers_sequence_by_longest_common_substring().
            When set, 'mismatches', 'lcf_thres', and
            'island_of_exact_match' are ignored (even when left at
            their defaults) since only the default cover_range_fn uses
            them.
        kmer_probe_map_k: value used as both min_k and k in calls to
            probe.construct_kmer_probe_map...

    Raises:
        ValueError: if adapter_a or adapter_b does not have length 2
    """
    both_are_pairs = len(adapter_a) == 2 and len(adapter_b) == 2
    if not both_are_pairs:
        raise ValueError(("adapter_a/adapter_b arguments must be tuples "
                          "of length 2, giving the sequences to add onto "
                          "the 5' and 3' ends"))

    a_5end, a_3end = adapter_a
    b_5end, b_3end = adapter_b
    self.adapter_a_5end = a_5end
    self.adapter_a_3end = a_3end
    self.adapter_b_5end = b_5end
    self.adapter_b_3end = b_3end

    if custom_cover_range_fn is None:
        # Default model of hybridization, driven by the given parameters
        self.mismatches = mismatches
        self.lcf_thres = lcf_thres
        self.cover_range_fn = \
            probe.probe_covers_sequence_by_longest_common_substring(
                mismatches, lcf_thres, island_of_exact_match)
    else:
        # A custom function decides hybridization, so the values given
        # for mismatches and lcf_thres (possibly just defaults) do not
        # apply and are recorded as None
        self.mismatches = None
        self.lcf_thres = None

        # Dynamically load the function
        module_path, function_name = custom_cover_range_fn
        self.cover_range_fn = dynamic_load.load_function_from_path(
            module_path, function_name)

    self.kmer_probe_map_k = kmer_probe_map_k
def test_match_from_probe_on_end(self):
    """Checks that the probe 'ABCDEF' is reported as covering (0, 6) of
    self.seq when lcf_thres is 10 and no mismatches are allowed.
    """
    cover_fn = probe.probe_covers_sequence_by_longest_common_substring(
        0, 10)
    seq_len = len(self.seq)
    observed = cover_fn('ABCDEF', self.seq, 1, 3, 6, seq_len)
    self.assertEqual(observed, (0, 6))
def __init__(self, probes, mismatches, lcf_thres, target_genomes,
             target_genomes_names=None, island_of_exact_match=0,
             custom_cover_range_fn=None, cover_extension=0,
             kmer_probe_map_k=10, rc_too=True):
    """Store the probe set, target genomes and cover range function.

    Args:
        probes: collection of probe.Probe instances making up a
            complete probe set
        mismatches/lcf_thres: a probe is considered to hybridize to a
            sequence when a stretch of at least 'lcf_thres' bp aligns
            with at most 'mismatches' mismatched bp; this determines
            whether a probe "covers" part of a sequence
        target_genomes: list [g_1, g_2, ..., g_m] of m genome
            groupings, each g_i being a list of genome.Genomes in
            group i (e.g., one grouping per species)
        target_genomes_names: list [s_1, s_2, ..., s_m] of strings
            naming the groupings in target_genomes. When not given,
            names are generated as "Group i".
        island_of_exact_match: hybridization additionally requires an
            exact match of at least this length
        custom_cover_range_fn: if set, a tuple (path, fn) giving the
            path to a Python module and the name of a function in it.
            The function is loaded dynamically and decides whether (and
            over what portion) a probe hybridizes to a region of target
            sequence. It must take the same arguments and return the
            same value as the function returned by
            probe.probe_covers_sequence_by_longest_common_substring().
            When set, 'mismatches', 'lcf_thres', and
            'island_of_exact_match' are ignored (even when left at
            their defaults) since only the default cover_range_fn uses
            them.
        cover_extension: number of bp added to each side of the
            hybridized portion when computing what a probe "covers"
        kmer_probe_map_k: value used as both min_k and k in calls to
            probe.construct_kmer_probe_map...
        rc_too: when True, the reverse complements of the target
            genomes are analyzed as well; when False they are not

    Raises:
        ValueError: if names are given but their count differs from
            the number of genome groupings
    """
    self.probes = probes
    self.target_genomes = target_genomes

    if not target_genomes_names:
        # No names supplied; label the groupings by index
        target_genomes_names = ["Group %d" % idx
                                for idx in range(len(target_genomes))]
    elif len(target_genomes_names) != len(target_genomes):
        raise ValueError(("Number of target genome names must be same "
                          "as the number of target genomes"))
    self.target_genomes_names = target_genomes_names

    if custom_cover_range_fn is None:
        # Default model of hybridization, driven by the given parameters
        self.mismatches = mismatches
        self.lcf_thres = lcf_thres
        self.cover_range_fn = \
            probe.probe_covers_sequence_by_longest_common_substring(
                mismatches, lcf_thres, island_of_exact_match)
    else:
        # A custom function decides hybridization, so the values given
        # for mismatches and lcf_thres (possibly just defaults) do not
        # apply and are recorded as None
        self.mismatches = None
        self.lcf_thres = None

        # Dynamically load the function
        module_path, function_name = custom_cover_range_fn
        self.cover_range_fn = dynamic_load.load_function_from_path(
            module_path, function_name)

    self.cover_extension = cover_extension
    self.kmer_probe_map_k = kmer_probe_map_k
    self.rc_too = rc_too
def test_no_match_from_probe_on_end(self):
    """Tests that no cover range is returned for 'ABCDEF' against
    self.seq when lcf_thres is 10 and 10 is passed as the fifth
    argument to the cover function.
    """
    f = probe.probe_covers_sequence_by_longest_common_substring(0, 10)
    match = f('ABCDEF', self.seq, 1, 3, 10, len(self.seq))
    # assertIsNone shows the unexpected value if the check fails
    self.assertIsNone(match)