Example #1
0
 def test_no_match_with_mismatches(self):
     """Checks that no cover range is reported for probe strings that
     should not match self.seq when 1 mismatch is allowed and lcf_thres
     is 6.
     """
     # Each case is (probe string, length passed as the full probe length)
     cases = [('ZZZABCGHIXKXXYZ', 15), ('ZZZZZAGHIJYZ', 12)]
     for probe_str, probe_len in cases:
         f = probe.probe_covers_sequence_by_longest_common_substring(1, 6)
         match = f(probe_str, self.seq, 6, 9, probe_len, len(self.seq))
         # assertIsNone reports the unexpected value on failure
         self.assertIsNone(match)
Example #2
0
 def test_more_than_cover(self):
     """Tests with short sequence and short probes
     where probes contain more than what they cover.
     """
     np.random.seed(1)
     sequence = 'ABCDEFGHIJKLMNOPQR' + ('Z' * 100) + 'STUVWXYZ'
     a = probe.Probe.from_str('XYZCDEFGHIJKABCSTUVWXABC')
     b = probe.Probe.from_str('PQRSGHIJKLMNXYZ')
     c = probe.Probe.from_str('ABCFGHIJKLZAZAZAGHIJKL')
     probes = [a, b, c]
     # This should default to the random approach, so set k (rather than
     # min_k)
     kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(probes,
                                                                    0,
                                                                    6,
                                                                    k=6)
     kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
     f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
     for n_workers in [1, 2, 4, 7, 8]:
         probe.open_probe_finding_pool(kmer_map, f, n_workers)
         try:
             found = probe.find_probe_covers_in_sequence(sequence)
             self.assertCountEqual(found[a], [(2, 11), (118, 124)])
             self.assertCountEqual(found[b], [(6, 14)])
             self.assertCountEqual(found[c], [(5, 12)])
         finally:
             # Close the pool even when an assertion fails, so that it is
             # not left open for the next iteration or test
             probe.close_probe_finding_pool()
Example #3
0
 def test_island_with_exact_match1(self):
     """Tests the 'island_of_exact_match' argument for
     probe.probe_covers_sequence_by_longest_common_substring(..).

     Each probe is 'CDEFGH' with at most one position replaced by 'X';
     with an island of 4 required, the probes whose mismatch falls in
     the middle (c and d) should not be found.
     """
     np.random.seed(1)
     sequence = 'ABCDEFGHIJKLMNOPYDEFGHQRSTU'
     a = probe.Probe.from_str('XDEFGH')
     b = probe.Probe.from_str('CXEFGH')
     c = probe.Probe.from_str('CDXFGH')
     d = probe.Probe.from_str('CDEXGH')
     e = probe.Probe.from_str('CDEFXH')
     f = probe.Probe.from_str('CDEFGX')
     g = probe.Probe.from_str('CDEFGH')
     probes = [a, b, c, d, e, f, g]
     kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(probes,
                                                                    1,
                                                                    6,
                                                                    k=3)
     kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
     fn = probe.probe_covers_sequence_by_longest_common_substring(1, 6, 4)
     for n_workers in [1, 2, 4, 7, 8]:
         probe.open_probe_finding_pool(kmer_map, fn, n_workers)
         try:
             found = probe.find_probe_covers_in_sequence(sequence)
             self.assertCountEqual(found[a], [(2, 8), (16, 22)])
             self.assertCountEqual(found[b], [(2, 8)])
             self.assertNotIn(c, found)
             self.assertNotIn(d, found)
             self.assertCountEqual(found[e], [(2, 8)])
             self.assertCountEqual(found[f], [(2, 8)])
             self.assertCountEqual(found[g], [(2, 8), (16, 22)])
         finally:
             # Close the pool even when an assertion fails, so that it is
             # not left open for the next iteration or test
             probe.close_probe_finding_pool()
Example #4
0
 def test_multiple_searches_with_same_pool(self):
     """Tests more than one call to find_probe_covers_in_sequence()
     with the same pool.
     """
     np.random.seed(1)
     sequence_a = 'ABCAXYZXYZDEFXYZAAYZ'
     sequence_b = 'GHIDAXYZXYZAAABCABCD'
     a = probe.Probe.from_str('AXYZXYZ')
     b = probe.Probe.from_str('AABCABC')
     probes = [a, b]
     # This should default to the random approach, so set k (rather than
     # min_k)
     kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(probes,
                                                                    0,
                                                                    6,
                                                                    k=3)
     kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
     f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
     for n_workers in [1, 2, 4, 7, 8]:
         probe.open_probe_finding_pool(kmer_map, f, n_workers)
         try:
             # Both searches run against the same open pool
             found_a = probe.find_probe_covers_in_sequence(sequence_a)
             self.assertEqual(found_a, {a: [(3, 10)]})
             found_b = probe.find_probe_covers_in_sequence(sequence_b)
             self.assertEqual(found_b, {a: [(4, 11)], b: [(12, 19)]})
         finally:
             # Close the pool even when an assertion fails, so that it is
             # not left open for the next iteration or test
             probe.close_probe_finding_pool()
Example #5
0
    def test_pigeonhole_with_mismatch(self):
        """Tests with short sequence and short probes
        where the call to construct_kmer_probe_map_to_find_probe_covers tries
        the pigeonhole approach.

        The same search is run for two values of min_k; for each, both the
        k chosen for the kmer map and the cover ranges found are checked.
        """
        np.random.seed(1)
        sequence = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        a = probe.Probe.from_str('GHIJXL')
        b = probe.Probe.from_str('BTUVWX')
        c = probe.Probe.from_str('ACEFHJ')
        probes = [a, b, c]

        # Each case is (min_k, k expected on the resulting kmer map):
        #   min_k=3: this should try the pigeonhole approach, which should
        #            choose k=3
        #   min_k=4: this should try the pigeonhole approach and fail
        #            because it chooses k=3, but min_k=4. So it should then
        #            try the random approach with k=4.
        for min_k, expected_k in [(3, 3), (4, 4)]:
            kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
                probes, 1, 6, min_k=min_k, k=4)
            kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
            self.assertEqual(kmer_map.k, expected_k)
            f = probe.probe_covers_sequence_by_longest_common_substring(1, 6)
            for n_workers in [1, 2, 4, 7, 8]:
                probe.open_probe_finding_pool(kmer_map, f, n_workers)
                try:
                    found = probe.find_probe_covers_in_sequence(sequence)
                    self.assertCountEqual(found[a], [(6, 12)])
                    self.assertCountEqual(found[b], [(18, 24)])
                    self.assertNotIn(c, found)
                finally:
                    # Close the pool even when an assertion fails, so that
                    # it is not left open for the next iteration or test
                    probe.close_probe_finding_pool()
Example #6
0
    def __init__(self,
                 probes,
                 mismatches,
                 lcf_thres,
                 target_genomes,
                 target_genomes_names=None,
                 island_of_exact_match=0,
                 cover_extension=0,
                 kmer_probe_map_k=10,
                 rc_too=True):
        """Store the probe set, the target genomes, and the parameters
        that define when a probe covers a portion of a sequence.

        Args:
            probes: collection of probe.Probe instances making up a
                complete probe set
            mismatches/lcf_thres: a probe is considered to hybridize to a
                sequence when a stretch of at least 'lcf_thres' bp aligns
                with at most 'mismatches' mismatched bp; this decides
                whether a probe "covers" a portion of a sequence
            target_genomes: list [g_1, g_2, ..., g_m] of m groupings of
                genomes; each g_i is a list of genome.Genomes in group i
                (e.g., a group may be a species, with g_i listing the
                target genomes of species i)
            target_genomes_names: list [s_1, s_2, ..., s_m] of strings
                in which s_i names the i'th grouping of target_genomes.
                When None, the i'th grouping is named "Group i".
            island_of_exact_match: for a probe to hybridize to a sequence,
                there must be an exact match at least this long
            cover_extension: number of bp added to the coverage on each
                side of a probe; a probe "covers" the portion of the
                sequence it hybridizes to plus 'cover_extension' bp on
                each side of that portion
            kmer_probe_map_k: value used as both min_k and k in calls to
                probe.construct_kmer_probe_map...
            rc_too: when True, the reverse complements of the genomes in
                target_genomes are analyzed as well; when False, they are
                not

        Raises:
            ValueError: if target_genomes_names is given and its length
                differs from that of target_genomes
        """
        self.probes = probes
        self.target_genomes = target_genomes

        if not target_genomes_names:
            # No names supplied; label each grouping by its index
            self.target_genomes_names = [
                "Group %d" % idx for idx in range(len(target_genomes))
            ]
        elif len(target_genomes_names) != len(target_genomes):
            raise ValueError(("Number of target genome names must be same "
                              "as the number of target genomes"))
        else:
            self.target_genomes_names = target_genomes_names

        self.mismatches = mismatches
        self.lcf_thres = lcf_thres
        # Function deciding whether (and where) a probe covers a sequence
        self.cover_range_fn = \
            probe.probe_covers_sequence_by_longest_common_substring(
                mismatches=mismatches,
                lcf_thres=lcf_thres,
                island_of_exact_match=island_of_exact_match)
        self.cover_extension = cover_extension
        self.kmer_probe_map_k = kmer_probe_map_k
        self.rc_too = rc_too
Example #7
0
 def test_match_no_mismatches(self):
     """Checks the cover ranges reported against self.seq when no
     mismatches are allowed and lcf_thres is 6.
     """
     f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)

     match = f('ZZZABCGHIJKLXYZ', self.seq, 6, 9, 15, len(self.seq))
     # assertIsNotNone gives a clearer failure than assertTrue(.. is not ..)
     self.assertIsNotNone(match)
     start, end = match
     self.assertEqual(start, 6)
     self.assertEqual(end, 12)

     match = f('ZZZZAFGHIJKLMDEF', self.seq, 6, 9, 15, len(self.seq))
     self.assertIsNotNone(match)
     start, end = match
     self.assertEqual(start, 5)
     self.assertEqual(end, 13)
Example #8
0
 def test_match_with_mismatches(self):
     """Checks the cover ranges reported against self.seq when 1 mismatch
     is allowed and lcf_thres is 6.
     """
     f = probe.probe_covers_sequence_by_longest_common_substring(1, 6)
     # Each case is (probe string, length passed as the full probe
     # length, expected (start, end) of the cover range)
     cases = [
         ('ZZZGHIGHIXKLDEF', 15, (6, 12)),
         ('ZZZZZZGHIJKXSWZ', 15, (6, 12)),
         ('ZZAGTFGHIJKXM', 13, (5, 13)),
     ]
     for probe_str, probe_len, (want_start, want_end) in cases:
         match = f(probe_str, self.seq, 6, 9, probe_len, len(self.seq))
         # assertIsNotNone gives a clearer failure than
         # assertTrue(.. is not None)
         self.assertIsNotNone(match)
         start, end = match
         self.assertEqual(start, want_start)
         self.assertEqual(end, want_end)
Example #9
0
    def test_open_close_pool_without_work(self):
        """Opens a probe finding pool and then closes it, submitting no
        work in between.

        This guards against a past bug (stemming from a bug in early
        versions of Python) in which closing a pool that was given no work
        could hang indefinitely.
        """
        lone_probe = probe.Probe.from_str('ABCDEF')
        raw_map = probe.construct_kmer_probe_map_to_find_probe_covers(
            [lone_probe], 0, 6, k=3)
        shared_map = probe.SharedKmerProbeMap.construct(raw_map)
        cover_fn = probe.probe_covers_sequence_by_longest_common_substring(
            0, 6)
        pause_secs = 1
        for worker_count in (1, 2, 4, 7, 8, None):
            probe.open_probe_finding_pool(shared_map, cover_fn, worker_count)
            # Leave the pool idle before closing it, and again afterwards
            time.sleep(pause_secs)
            probe.close_probe_finding_pool()
            time.sleep(pause_secs)
Example #10
0
 def test_repetitive(self):
     """Tests with short sequence and short probes
     where the sequence and probes have repetitive sequences, so that
     one probe can cover a lot of the sequence.
     """
     np.random.seed(1)
     sequence = 'ABCAAAAAAAAAAXYZXYZXYZXYZAAAAAAAAAAAAAXYZ'
     a = probe.Probe.from_str('NAAAAAAN')
     probes = [a]
     # This should default to the random approach, so set k (rather than
     # min_k)
     kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
         probes, 0, 6, k=6)
     kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
     f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
     for n_workers in [1, 2, 4, 7, 8]:
         probe.open_probe_finding_pool(kmer_map, f, n_workers)
         try:
             found = probe.find_probe_covers_in_sequence(sequence)
             self.assertCountEqual(found[a], [(3, 13), (25, 38)])
         finally:
             # Close the pool even when an assertion fails, so that it is
             # not left open for the next iteration or test
             probe.close_probe_finding_pool()
Example #11
0
 def test_island_with_exact_match2(self):
     """Tests the 'island_of_exact_match' argument for
     probe.probe_covers_sequence_by_longest_common_substring(..).

     Uses 12-character probes with up to 3 mismatches allowed and an
     island of 4 required; probe b is expected not to be found.
     """
     np.random.seed(1)
     sequence = 'ABCDEFGHIJKLMNOPCDEFGHQRSTU'
     a = probe.Probe.from_str('HXJKLMNOPCDE')
     b = probe.Probe.from_str('XIJKXMNOXCDE')
     c = probe.Probe.from_str('XIJKXMNOPXDE')
     probes = [a, b, c]
     kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
         probes, 3, 6, k=3)
     kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
     fn = probe.probe_covers_sequence_by_longest_common_substring(3, 6, 4)
     for n_workers in [1, 2, 4, 7, 8]:
         probe.open_probe_finding_pool(kmer_map, fn, n_workers)
         try:
             found = probe.find_probe_covers_in_sequence(sequence)
             self.assertCountEqual(found[a], [(7, 19)])
             self.assertNotIn(b, found)
             self.assertCountEqual(found[c], [(7, 19)])
         finally:
             # Close the pool even when an assertion fails, so that it is
             # not left open for the next iteration or test
             probe.close_probe_finding_pool()
Example #12
0
 def test_two_occurrences(self):
     """Tests with short sequence and short probes
     where one probe appears twice.
     """
     np.random.seed(1)
     sequence = 'ABCDEFGHIJKLMNOPCDEFGHQRSTU'
     a = probe.Probe.from_str('CDEFGH')
     b = probe.Probe.from_str('GHIJKL')
     c = probe.Probe.from_str('STUVWX')
     probes = [a, b, c]
     kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
         probes, 0, 6, min_k=6)
     kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
     f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
     for n_workers in [1, 2, 4, 7, 8]:
         probe.open_probe_finding_pool(kmer_map, f, n_workers)
         try:
             found = probe.find_probe_covers_in_sequence(sequence)
             self.assertCountEqual(found[a], [(2, 8), (16, 22)])
             self.assertCountEqual(found[b], [(6, 12)])
             self.assertNotIn(c, found)
         finally:
             # Close the pool even when an assertion fails, so that it is
             # not left open for the next iteration or test
             probe.close_probe_finding_pool()
Example #13
0
 def test_too_short_sequence_small_k(self):
     """Tests with sequence shorter than the probe length.
     """
     np.random.seed(1)
     sequence = 'ABCDEFGHI'
     a = probe.Probe.from_str('ABCDEFGHIJKL')
     b = probe.Probe.from_str('EFGHIJKLMNOP')
     c = probe.Probe.from_str('DEFGHIJKLMNO')
     d = probe.Probe.from_str('XYZXYZABCDEF')
     probes = [a, b, c, d]
     kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
         probes, 0, 6, min_k=6, k=6)
     kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
     f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
     for n_workers in [1, 2, 4, 7, 8]:
         probe.open_probe_finding_pool(kmer_map, f, n_workers)
         try:
             found = probe.find_probe_covers_in_sequence(sequence)
             self.assertCountEqual(found[a], [(0, 9)])
             self.assertNotIn(b, found)
             self.assertCountEqual(found[c], [(3, 9)])
             self.assertCountEqual(found[d], [(0, 6)])
         finally:
             # Close the pool even when an assertion fails, so that it is
             # not left open for the next iteration or test
             probe.close_probe_finding_pool()
Example #14
0
    def __init__(self,
                 adapter_a,
                 adapter_b,
                 mismatches,
                 lcf_thres,
                 island_of_exact_match=0,
                 kmer_probe_map_k=20):
        """Store the A/B adapter sequences and the hybridization
        parameters.

        Args:
            adapter_a: tuple (x, y); x is the A adapter sequence added onto
                the 5' end of a probe and y is the A adapter sequence added
                onto the 3' end of a probe
            adapter_b: tuple (x, y); x is the B adapter sequence added onto
                the 5' end of a probe and y is the B adapter sequence added
                onto the 3' end of a probe
            mismatches/lcf_thres: a probe is considered to hybridize to a
                sequence when a stretch of at least 'lcf_thres' bp aligns
                with at most 'mismatches' mismatched bp
            island_of_exact_match: for a probe to hybridize to a sequence,
                there must be an exact match at least this long
            kmer_probe_map_k: value used as both min_k and k in calls to
                probe.construct_kmer_probe_map...

        Raises:
            ValueError: if adapter_a or adapter_b does not have length 2
        """
        both_are_pairs = len(adapter_a) == 2 and len(adapter_b) == 2
        if not both_are_pairs:
            message = ("adapter_a/adapter_b arguments must be tuples "
                       "of length 2, giving the sequences to add onto "
                       "the 5' and 3' ends")
            raise ValueError(message)

        # Split each adapter pair into its 5' and 3' sequences
        a_5end, a_3end = adapter_a
        b_5end, b_3end = adapter_b
        self.adapter_a_5end = a_5end
        self.adapter_a_3end = a_3end
        self.adapter_b_5end = b_5end
        self.adapter_b_3end = b_3end

        self.mismatches = mismatches
        self.lcf_thres = lcf_thres
        cover_fn_kwargs = {
            'mismatches': mismatches,
            'lcf_thres': lcf_thres,
            'island_of_exact_match': island_of_exact_match,
        }
        self.cover_range_fn = \
            probe.probe_covers_sequence_by_longest_common_substring(
                **cover_fn_kwargs)
        self.kmer_probe_map_k = kmer_probe_map_k
Example #15
0
 def test_too_short_sequence_large_k(self):
     """Tests with sequence shorter than the probe length and also
     shorter than k.
     """
     np.random.seed(1)
     sequence = 'ABCDEFGHI'
     a = probe.Probe.from_str('ABCDEFGHIJKL')
     b = probe.Probe.from_str('EFGHIJKLMNOP')
     c = probe.Probe.from_str('DEFGHIJKLMNO')
     d = probe.Probe.from_str('XYZXYZABCDEF')
     probes = [a, b, c, d]
     # probe.find_probe_covers_in_sequence() should not attempt
     # to cover the sequence (return {}), but should run gracefully
     for k in [10, 11, 12]:
         kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
             probes, 0, 6, min_k=k, k=k)
         kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
         f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
         for n_workers in [1, 2, 4, 7, 8]:
             probe.open_probe_finding_pool(kmer_map, f, n_workers)
             try:
                 found = probe.find_probe_covers_in_sequence(sequence)
                 self.assertEqual(found, {})
             finally:
                 # Close the pool even when an assertion fails, so that
                 # it is not left open for the next iteration or test
                 probe.close_probe_finding_pool()
Example #16
0
 def test_no_match_with_probe_smaller_than_lcf_thres(self):
     """Checks that no cover range is reported for a 4-character probe
     string when lcf_thres is 6.
     """
     f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
     # Search only a 4-character slice of self.seq
     subseq = self.seq[6:10]
     match = f('GHIX', subseq, 1, 3, 4, len(subseq))
     # assertIsNone reports the unexpected value on failure
     self.assertIsNone(match)
Example #17
0
    def run_random(self,
                   n,
                   genome_min,
                   genome_max,
                   num_probes,
                   probe_length=100,
                   lcf_thres=None,
                   seed=1,
                   n_workers=2,
                   use_native_dict=False):
        """Run tests on randomly generated sequences.

        For each of n rounds: generates a random sequence, derives probes
        from it, and looks those probes up in the sequence.

        The probes are built so that coverage is meant to be determined
        with a longest common substring.

        Args:
            n: number of rounds to run
            genome_min/genome_max: the genome (sequence) size is drawn
                at random between genome_min and genome_max
            num_probes: number of probes to generate from the random
                sequence
            probe_length: length, in bp, of each probe
            lcf_thres: lcf threshold parameter; when None, it is drawn
                at random from 80 and 100
            seed: seed for the random number generator
            n_workers: number of workers in the probe finding pool
            use_native_dict: whether the probe finding pool should use a
                native Python dict
        """
        np.random.seed(seed)
        requested_lcf_thres = lcf_thres
        bases = ['A', 'T', 'C', 'G']

        def random_bases(count):
            # String of 'count' bases drawn uniformly with replacement
            return "".join(np.random.choice(bases, size=count, replace=True))

        for _ in range(n):
            if requested_lcf_thres is None:
                # Choose either lcf_thres=80 or lcf_thres=100
                lcf_thres = np.random.choice([80, 100])
            else:
                lcf_thres = requested_lcf_thres

            # Make a random sequence
            seq_length = np.random.randint(genome_min, genome_max)
            sequence = random_bases(seq_length)

            expected_ranges = defaultdict(list)
            # Make num_probes random probes
            probes = []
            for _ in range(num_probes):
                window_start = np.random.randint(0, seq_length - probe_length)
                window_end = window_start + probe_length
                cover_length = np.random.randint(lcf_thres, probe_length + 1)
                offset = np.random.randint(0, probe_length - cover_length + 1)
                cover_start = window_start + offset
                cover_end = min(seq_length, cover_start + cover_length)
                covered_part = sequence[cover_start:cover_end]
                # Pad with random bases before and after the part that the
                # probe should cover
                left_pad = random_bases(cover_start - window_start)
                right_pad = random_bases(window_end - cover_end)
                probe_str = left_pad + covered_part + right_pad
                # Add 0, 1, 2, or 3 random mismatches
                num_mismatches = np.random.randint(0, 4)
                for _ in range(num_mismatches):
                    pos = np.random.randint(0, probe_length)
                    alternatives = [
                        base for base in bases if base != probe_str[pos]
                    ]
                    replacement = "".join(
                        np.random.choice(alternatives, size=1))
                    probe_str = (probe_str[:pos] + replacement +
                                 probe_str[(pos + 1):])
                p = probe.Probe.from_str(probe_str)
                expected_ranges[p].append((cover_start, cover_end))
                probes.append(p)

            kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
                probes, 3, lcf_thres)
            kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
            cover_fn = \
                probe.probe_covers_sequence_by_longest_common_substring(
                    3, lcf_thres)
            probe.open_probe_finding_pool(kmer_map,
                                          cover_fn,
                                          n_workers,
                                          use_native_dict=use_native_dict)
            found = probe.find_probe_covers_in_sequence(sequence)
            probe.close_probe_finding_pool()

            # No extraneous probes should be found, and at least 95% of
            # the originals should be (some may be missed due to false
            # negatives in the approach)
            self.assertLessEqual(len(found), len(probes))
            self.assertGreaterEqual(len(found), 0.95 * len(probes))

            # Check that each desired probe was found correctly
            for p, wanted_ranges in expected_ranges.items():
                if p in found:
                    found_ranges = found[p]
                    # A found probe most likely was found once, but may
                    # have been found more than once due to chance (but
                    # probably not too much more!)
                    self.assertTrue(len(found_ranges) in [1, 2])
                    # The found ranges may extend a little past what was
                    # desired (mismatches are allowed, plus chance), and
                    # may fall a little short of it (mismatches may have
                    # been added near the end of the desired range), so
                    # allow a small tolerance on both sides
                    for want_start, want_end in wanted_ranges:
                        found_desired_cv = False
                        for got in found_ranges:
                            left_diff = want_start - got[0]
                            right_diff = got[1] - want_end
                            if (-7 <= left_diff < 15 and
                                    -7 <= right_diff < 15):
                                found_desired_cv = True
                                break
                        self.assertTrue(found_desired_cv)
Example #18
0
 def test_match_with_island_of_exact_match(self):
     """Checks that a cover range of (6, 12) is reported against self.seq
     with 1 mismatch allowed, lcf_thres of 6, and an island of 4.
     """
     cover_fn = probe.probe_covers_sequence_by_longest_common_substring(
         1, 6, 4)
     seq_len = len(self.seq)
     observed = cover_fn('ZZZGHIGHIJXLDEF', self.seq, 6, 9, 15, seq_len)
     expected_range = (6, 12)
     self.assertEqual(observed, expected_range)
Example #19
0
 def test_no_match_with_island_of_exact_match(self):
     """Checks that no cover range is reported against self.seq with
     1 mismatch allowed, lcf_thres of 6, and an island of 4.
     """
     f = probe.probe_covers_sequence_by_longest_common_substring(1, 6, 4)
     match = f('ZZZGHIGHIXKLDEF', self.seq, 6, 9, 15, len(self.seq))
     # assertIsNone reports the unexpected value on failure
     self.assertIsNone(match)
Example #20
0
 def test_no_match_with_probe_longer_than_sequence(self):
     """Checks that no cover range is reported for 'DEFX' against 'DEFG'
     when the full probe length passed (7) exceeds the full sequence
     length passed (4).
     """
     f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
     match = f('DEFX', 'DEFG', 1, 3, 7, 4)
     # assertIsNone reports the unexpected value on failure
     self.assertIsNone(match)
Example #21
0
 def test_match_with_probe_longer_than_sequence(self):
     """Checks that a cover range of (0, 4) is reported for 'DEFG' against
     'DEFG' when the full probe length passed (7) exceeds the full
     sequence length passed (4).
     """
     cover_fn = probe.probe_covers_sequence_by_longest_common_substring(
         0, 6)
     observed = cover_fn('DEFG', 'DEFG', 1, 3, 7, 4)
     expected_range = (0, 4)
     self.assertEqual(observed, expected_range)
Example #22
0
    def __init__(self,
                 mismatches,
                 lcf_thres,
                 island_of_exact_match=0,
                 mismatches_tolerant=None,
                 lcf_thres_tolerant=None,
                 island_of_exact_match_tolerant=None,
                 custom_cover_range_fn=None,
                 custom_cover_range_tolerant_fn=None,
                 identify=False,
                 blacklisted_genomes=[],
                 coverage=1.0,
                 cover_extension=0,
                 cover_groupings_separately=False,
                 kmer_probe_map_k=20,
                 kmer_probe_map_use_native_dict=False):
        """
        Args:
            mismatches/lcf_thres: consider a probe to hybridize to a sequence
                if a stretch of 'lcf_thres' or more bp aligns with
                'mismatches' or fewer mismatched bp; used to compute whether
                a probe "covers" a portion of a sequence
            island_of_exact_match: for a probe to hybridize to a sequence,
                require that there be an exact match of length at least
                'island_of_exact_match'
            mismatches_tolerant/lcf_thres_tolerant: more tolerant
                values corresponding to 'mismatches' and 'lcf_thres'. It should
                generally be true that 'mismatches_tolerant' > 'mismatches' and
                'lcf_thres_tolerant' < 'lcf_thres'. These values are used in
                determining the overlap that a candidate probe has with
                different groupings when the identification option is enabled.
                They are also used when determining the coverage of each
                candidate probe with the blacklisted genomes. They are meant
                to capture more potential hybridizations (i.e., be more
                sensitive). When not set, they are by default equal to
                'mismatches' and 'lcf_thres'.
            island_of_exact_match_tolerant: more tolerant value corresponding
                to 'island_of_exact_match'. It should generally be true
                that this value is less than 'island_of_exact_match'. Used
                when mismatches_tolerant/lcf_thres_tolerant are used.
            custom_cover_range_fn: if set, tuple (path, fn) where path gives
                a path to a Python module and fn gives the name of a function
                in that module. This function is dynamically loaded and used
                to determine whether a probe will hybridize to a region of
                target sequence (and what portion will hybridize). The
                function must accept the same arguments as the function
                returned by
                probe.probe_covers_sequence_by_longest_common_substring()
                and return the same value. When set, the parameters
                'mismatches', 'lcf_thres', and 'island_of_exact_match'
                are ignored (even if their values are default values)
                because they are only used in the default cover_range_fn.
            custom_cover_range_tolerant_fn: same as custom_cover_range_fn,
                but with more tolerance for hybridization; likewise, the
                _tolerant parameters are ignored when this is set.
            identify: when True, indicates that probes should be designed
                with the identification option enabled (default is False)
            blacklisted_genomes: list of paths to FASTA files of genomes
                that should be blacklisted (i.e., probes are penalized by the
                amount they cover these genomes).
            coverage: either a float in [0,1] or an int > 1. When it is a
                float in [0,1], it determines the fraction of each of the
                target genomes that must be covered by the selected probes.
                When it is an int > 1, it determines the number of bp of each
                of the target genomes that must be covered by the selected
                probes.
            cover_extension: number of bp by which to extend the coverage of
                a probe on both sides. When this is 0, a probe "covers" exactly
                the portion of the sequence that it hybridizes to, as
                determined with the 'mismatches' and 'lcf_thres' parameters.
                This parameter allows a probe to cover a region surrounding
                and including the portion of the sequence that it hybridizes
                to. The probe covers the portion of the sequence that it
                hybridizes to, as well as 'cover_extension' bp on each side
                of that portion. (So the length of the region a probe covers
                is the length of the probe plus 2*'cover_extension' bp.)
                This may more realistically model hybrid selection because
                a probe hybridizes to a fragment of DNA, which includes the
                region targeted by the probe as well as the surrounding region,
                and this entire fragment is sequenced. Increasing the value
                of this parameter should reduce the number of required probes.
            cover_groupings_separately: when True, runs a separate instance
                of set cover with the target genomes from each grouping and
                yields the probes selected across (the union of) all the runs.
                (When False, just one instance of set cover is run.) This
                improves runtime by reducing the number of universes (and thus
                overall universe size) given to each instance of set cover, but
                it may yield more probes than just one instance would yield,
                particularly when the genomes across groupings are similar at
                a nucleotide level.
            kmer_probe_map_k: in calls to probe.construct_kmer_probe_map...,
                uses this value as min_k and k
            kmer_probe_map_use_native_dict: when finding probe covers
                for identification or blacklisting, use the native
                Python dict of SharedKmerProbeMap rather than its primitive
                types that are more suited for sharing across processes;
                depending on the input this can result in considerably
                more memory use but may give an improvement in runtime
        """
        if custom_cover_range_fn is not None:
            # Use a custom function to determine whether a probe hybridizes
            # to a region of target sequence (and what part hybridizes),
            # rather than the default model. Ignore the given values for
            # mismatches and lcf_thres (which may be default values) because
            # these are only relevant for the default model
            self.mismatches, self.lcf_thres = None, None

            # Dynamically load the function
            fn_path, fn_name = custom_cover_range_fn
            self.cover_range_fn = dynamic_load.load_function_from_path(
                fn_path, fn_name)
        else:
            self.mismatches = mismatches
            self.lcf_thres = lcf_thres
            # Construct a function using the default model of hybridization
            self.cover_range_fn = \
                probe.probe_covers_sequence_by_longest_common_substring(
                    mismatches, lcf_thres, island_of_exact_match)

        if not mismatches_tolerant:
            mismatches_tolerant = mismatches
        if not lcf_thres_tolerant:
            lcf_thres_tolerant = lcf_thres
        if not island_of_exact_match_tolerant:
            island_of_exact_match_tolerant = island_of_exact_match
        if custom_cover_range_tolerant_fn is not None:
            # Use a custom function to determine, with more tolerance,
            # whether a probe hybridizes to a region of target sequence (and
            # what part hybridizes), rather than the default model. Ignore
            # the given values of mismatches_tolerant and lcf_thres_tolerant
            # (which may be default values) because these are only relevant for
            # the default model
            self.mismatches_tolerant, self.lcf_thres_tolerant = None, None

            # Dynamically load the function
            fn_path, fn_name = custom_cover_range_tolerant_fn
            self.cover_range_tolerant_fn = dynamic_load.load_function_from_path(
                fn_path, fn_name)
        else:
            self.mismatches_tolerant = mismatches_tolerant
            self.lcf_thres_tolerant = lcf_thres_tolerant
            # Construct a function using the default model of hybridization
            self.cover_range_tolerant_fn = \
                probe.probe_covers_sequence_by_longest_common_substring(
                    mismatches_tolerant, lcf_thres_tolerant,
                    island_of_exact_match_tolerant)

        # Warn if identification is enabled but the coverage is high
        if identify:
            if (coverage <= 1.0 and coverage >= 0.25) or \
               (coverage > 1 and coverage >= 5000):
                logger.warning(("Identification is enabled but the required "
                                "coverage is high; generally coverage should "
                                "be small when performing identification"))

        self.identify = identify
        self.blacklisted_genomes = blacklisted_genomes
        self.coverage = coverage
        self.cover_extension = cover_extension
        self.cover_groupings_separately = cover_groupings_separately
        self.kmer_probe_map_k = kmer_probe_map_k
        self.kmer_probe_map_use_native_dict = kmer_probe_map_use_native_dict
Example #23
0
    def __init__(self,
                 adapter_a,
                 adapter_b,
                 mismatches,
                 lcf_thres,
                 island_of_exact_match=0,
                 custom_cover_range_fn=None,
                 kmer_probe_map_k=20):
        """Store the adapter sequences and pick the hybridization model.

        Args:
            adapter_a: tuple (x, y); x is the A adapter sequence that is
                added onto the 5' end of a probe and y is the A adapter
                sequence that is added onto the 3' end of a probe
            adapter_b: tuple (x, y); x is the B adapter sequence that is
                added onto the 5' end of a probe and y is the B adapter
                sequence that is added onto the 3' end of a probe
            mismatches/lcf_thres: a probe is considered to hybridize to a
                sequence when a stretch of 'lcf_thres' or more bp aligns
                with 'mismatches' or fewer mismatched bp
            island_of_exact_match: length of the exact match that must (at
                least) be present for a probe to hybridize to a sequence
            custom_cover_range_fn: when not None, a tuple (path, fn) in
                which path points to a Python module and fn names a
                function inside that module. The function is loaded
                dynamically and decides whether (and over what portion) a
                probe hybridizes to a region of target sequence. It must
                take the same arguments, and return the same value, as the
                function returned by
                probe.probe_covers_sequence_by_longest_common_substring().
                When given, 'mismatches', 'lcf_thres', and
                'island_of_exact_match' are ignored (even when they hold
                default values) since only the default cover_range_fn
                uses them
            kmer_probe_map_k: value passed as both min_k and k in calls to
                probe.construct_kmer_probe_map...

        Raises:
            ValueError: if adapter_a or adapter_b does not have length 2
        """
        # adapter_a is checked before adapter_b; the check stops at the
        # first adapter whose length is wrong
        if any(len(adapter) != 2 for adapter in (adapter_a, adapter_b)):
            raise ValueError(("adapter_a/adapter_b arguments must be tuples "
                              "of length 2, giving the sequences to add onto "
                              "the 5' and 3' ends"))

        self.adapter_a_5end, self.adapter_a_3end = adapter_a
        self.adapter_b_5end, self.adapter_b_3end = adapter_b

        if custom_cover_range_fn is None:
            # No custom function was supplied: build the default model of
            # hybridization from the given thresholds
            self.mismatches = mismatches
            self.lcf_thres = lcf_thres
            self.cover_range_fn = \
                probe.probe_covers_sequence_by_longest_common_substring(
                    mismatches, lcf_thres, island_of_exact_match)
        else:
            # A custom function decides whether a probe hybridizes to a
            # region of target sequence. The thresholds only matter to the
            # default model, so record them as unset regardless of what
            # was passed in
            self.mismatches = None
            self.lcf_thres = None

            # Load the function by module path and function name
            module_path, function_name = custom_cover_range_fn
            self.cover_range_fn = dynamic_load.load_function_from_path(
                module_path, function_name)

        self.kmer_probe_map_k = kmer_probe_map_k
Example #24
0
 def test_match_from_probe_on_end(self):
     """Expects the range (0, 6) when 'ABCDEF' is checked against
     self.seq by a cover function constructed with arguments (0, 10).
     """
     # NOTE(review): the numeric arguments below are passed positionally;
     # their meaning is defined by the probe module -- confirm there
     cover_fn = probe.probe_covers_sequence_by_longest_common_substring(
         0, 10)
     seq_len = len(self.seq)
     covered = cover_fn('ABCDEF', self.seq, 1, 3, 6, seq_len)
     self.assertEqual(covered, (0, 6))
Example #25
0
    def __init__(self,
                 probes,
                 mismatches,
                 lcf_thres,
                 target_genomes,
                 target_genomes_names=None,
                 island_of_exact_match=0,
                 custom_cover_range_fn=None,
                 cover_extension=0,
                 kmer_probe_map_k=10,
                 rc_too=True):
        """Record the probe set, target genomes, and hybridization model.

        Args:
            probes: collection of probe.Probe instances that together make
                up a complete probe set
            mismatches/lcf_thres: a probe is considered to hybridize to a
                sequence when a stretch of 'lcf_thres' or more bp aligns
                with 'mismatches' or fewer mismatched bp; this determines
                whether a probe "covers" a portion of a sequence
            target_genomes: list [g_1, g_2, ..., g_m] of m groupings of
                genomes; each g_i is a list of the genome.Genomes in
                group i (e.g., a group may be a species, with g_i holding
                the target genomes of species i)
            target_genomes_names: list [s_1, s_2, ..., s_m] of strings in
                which s_i names the i'th genome grouping of
                target_genomes. When None (or otherwise empty), the i'th
                grouping is named "Group i"
            island_of_exact_match: length of the exact match that must (at
                least) be present for a probe to hybridize to a sequence
            custom_cover_range_fn: when not None, a tuple (path, fn) in
                which path points to a Python module and fn names a
                function inside that module. The function is loaded
                dynamically and decides whether (and over what portion) a
                probe hybridizes to a region of target sequence. It must
                take the same arguments, and return the same value, as the
                function returned by
                probe.probe_covers_sequence_by_longest_common_substring().
                When given, 'mismatches', 'lcf_thres', and
                'island_of_exact_match' are ignored (even when they hold
                default values) since only the default cover_range_fn
                uses them
            cover_extension: number of bp by which coverage is extended on
                each side of a probe; a probe "covers" the portion of the
                sequence it hybridizes to plus 'cover_extension' bp on
                each side of that portion
            kmer_probe_map_k: value passed as both min_k and k in calls to
                probe.construct_kmer_probe_map...
            rc_too: when True, the reverse complements of the genomes in
                target_genomes are analyzed as well as the genomes
                themselves (when False, reverse complements are skipped)

        Raises:
            ValueError: if target_genomes_names is given and its length
                differs from that of target_genomes
        """
        self.probes = probes
        self.target_genomes = target_genomes

        if not target_genomes_names:
            # Nothing usable was supplied; label each grouping by its index
            target_genomes_names = ["Group %d" % group_idx
                                    for group_idx in
                                    range(len(target_genomes))]
        elif len(target_genomes_names) != len(target_genomes):
            raise ValueError(("Number of target genome names must be same "
                              "as the number of target genomes"))
        self.target_genomes_names = target_genomes_names

        if custom_cover_range_fn is None:
            # No custom function was supplied: build the default model of
            # hybridization from the given thresholds
            self.mismatches = mismatches
            self.lcf_thres = lcf_thres
            self.cover_range_fn = \
                probe.probe_covers_sequence_by_longest_common_substring(
                    mismatches, lcf_thres, island_of_exact_match)
        else:
            # A custom function decides whether a probe hybridizes to a
            # region of target sequence. The thresholds only matter to the
            # default model, so record them as unset regardless of what
            # was passed in
            self.mismatches = None
            self.lcf_thres = None

            # Load the function by module path and function name
            module_path, function_name = custom_cover_range_fn
            self.cover_range_fn = dynamic_load.load_function_from_path(
                module_path, function_name)

        self.cover_extension = cover_extension
        self.kmer_probe_map_k = kmer_probe_map_k
        self.rc_too = rc_too
Example #26
0
 def test_no_match_from_probe_on_end(self):
     """Expects no match (None) when 'ABCDEF' is checked against self.seq
     by a cover function constructed with arguments (0, 10), with 10
     passed as the fifth argument.
     """
     # NOTE(review): the numeric arguments below are passed positionally;
     # their meaning is defined by the probe module -- confirm there
     f = probe.probe_covers_sequence_by_longest_common_substring(0, 10)
     match = f('ABCDEF', self.seq, 1, 3, 10, len(self.seq))
     # assertIsNone reports the unexpected value on failure, unlike
     # assertTrue(match is None), which only says "False is not true"
     self.assertIsNone(match)