コード例 #1
0
 def test_multiple_searches_with_same_pool(self):
     """Tests more than one call to find_probe_covers_in_sequence()
     with the same pool.

     For each worker count, one pool is opened, two different sequences
     are searched with it, and the pool is closed again.
     """
     np.random.seed(1)
     sequence_a = 'ABCAXYZXYZDEFXYZAAYZ'
     sequence_b = 'GHIDAXYZXYZAAABCABCD'
     a = probe.Probe.from_str('AXYZXYZ')
     b = probe.Probe.from_str('AABCABC')
     probes = [a, b]
     # This should default to the random approach, so set k (rather than
     # min_k)
     kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
         probes, 0, 6, k=3)
     kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
     f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
     for n_workers in [1, 2, 4, 7, 8]:
         probe.open_probe_finding_pool(kmer_map, f, n_workers)
         try:
             found_a = probe.find_probe_covers_in_sequence(sequence_a)
             self.assertEqual(found_a, {a: [(3, 10)]})
             found_b = probe.find_probe_covers_in_sequence(sequence_b)
             self.assertEqual(found_b, {a: [(4, 11)], b: [(12, 19)]})
         finally:
             # Close the pool even when an assertion fails, so an open
             # pool is never left behind for the next iteration or test
             probe.close_probe_finding_pool()
コード例 #2
0
 def verify_target_genome_coverage(self, selected_probes, target_genomes,
                                   filter, desired_coverage,
                                   cover_extension=0):
     """Assert that selected_probes cover enough of every target genome.

     Builds a k-mer probe map from selected_probes (with min_k=3 and
     k=3), opens a probe finding pool that uses filter.cover_range_fn,
     and, for each genome, sums the number of bp in the merged cover
     ranges found across its sequences.

     Args:
         selected_probes: probes whose coverage is verified
         target_genomes: iterable of groups, where each group is an
             iterable of genomes; each genome must provide 'seqs' and
             'size()'
         filter: object providing 'mismatches', 'lcf_thres', and
             'cover_range_fn' (the name shadows the builtin, but is kept
             so that keyword callers continue to work)
         desired_coverage: when <= 1.0, the fraction of tg.size() that
             must be covered; otherwise, the number of bp that must be
             covered (capped at tg.size())
         cover_extension: number of bp by which each cover range is
             extended on both sides (clipped to the sequence bounds)
     """
     kmer_probe_map = probe.SharedKmerProbeMap.construct(
         probe.construct_kmer_probe_map_to_find_probe_covers(
             selected_probes, filter.mismatches, filter.lcf_thres,
             min_k=3, k=3)
     )
     probe.open_probe_finding_pool(kmer_probe_map,
                                   filter.cover_range_fn)
     try:
         for genomes_from_group in target_genomes:
             for tg in genomes_from_group:
                 num_bp_covered = 0
                 for seq in tg.seqs:
                     probe_cover_ranges = \
                         probe.find_probe_covers_in_sequence(seq)
                     all_cover_ranges = []
                     for cover_ranges in probe_cover_ranges.values():
                         for cv in cover_ranges:
                             # Extend on both sides, staying within seq
                             start = max(0, cv[0] - cover_extension)
                             end = min(len(seq), cv[1] + cover_extension)
                             all_cover_ranges.append((start, end))
                     # Merge so that bases covered by more than one
                     # range are counted only once
                     all_cover_ranges = interval.merge_overlapping(
                         all_cover_ranges)
                     for cover_range in all_cover_ranges:
                         num_bp_covered += cover_range[1] - cover_range[0]
                 if desired_coverage <= 1.0:
                     # check fraction covered
                     desired_bp_covered = desired_coverage * tg.size()
                     self.assertGreaterEqual(num_bp_covered,
                                             desired_bp_covered)
                 else:
                     # directly check num bp covered
                     desired_coverage_adjusted = min(desired_coverage,
                                                     tg.size())
                     self.assertGreaterEqual(num_bp_covered,
                                             desired_coverage_adjusted)
     finally:
         # Close the pool even when an assertion fails, so an open pool
         # is never left behind for later tests
         probe.close_probe_finding_pool()
コード例 #3
0
 def test_more_than_cover(self):
     """Tests with short sequence and short probes
     where probes contain more than what they cover.
     """
     np.random.seed(1)
     sequence = 'ABCDEFGHIJKLMNOPQR' + ('Z' * 100) + 'STUVWXYZ'
     a = probe.Probe.from_str('XYZCDEFGHIJKABCSTUVWXABC')
     b = probe.Probe.from_str('PQRSGHIJKLMNXYZ')
     c = probe.Probe.from_str('ABCFGHIJKLZAZAZAGHIJKL')
     probes = [a, b, c]
     # This should default to the random approach, so set k (rather than
     # min_k)
     kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
         probes, 0, 6, k=6)
     kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
     f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
     for n_workers in [1, 2, 4, 7, 8]:
         probe.open_probe_finding_pool(kmer_map, f, n_workers)
         try:
             found = probe.find_probe_covers_in_sequence(sequence)
             self.assertCountEqual(found[a], [(2, 11), (118, 124)])
             self.assertCountEqual(found[b], [(6, 14)])
             self.assertCountEqual(found[c], [(5, 12)])
         finally:
             # Close the pool even when an assertion fails, so an open
             # pool is never left behind for the next iteration or test
             probe.close_probe_finding_pool()
コード例 #4
0
 def test_island_with_exact_match1(self):
     """Tests the 'island_with_exact_match' argument for
     probe.probe_covers_sequence_by_longest_common_substring(..).

     Each of the probes a-f differs from 'CDEFGH' at exactly one
     position (marked 'X'); probe g is 'CDEFGH' itself.
     """
     np.random.seed(1)
     sequence = 'ABCDEFGHIJKLMNOPYDEFGHQRSTU'
     a = probe.Probe.from_str('XDEFGH')
     b = probe.Probe.from_str('CXEFGH')
     c = probe.Probe.from_str('CDXFGH')
     d = probe.Probe.from_str('CDEXGH')
     e = probe.Probe.from_str('CDEFXH')
     f = probe.Probe.from_str('CDEFGX')
     g = probe.Probe.from_str('CDEFGH')
     probes = [a, b, c, d, e, f, g]
     kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
         probes, 1, 6, k=3)
     kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
     fn = probe.probe_covers_sequence_by_longest_common_substring(1, 6, 4)
     for n_workers in [1, 2, 4, 7, 8]:
         probe.open_probe_finding_pool(kmer_map, fn, n_workers)
         try:
             found = probe.find_probe_covers_in_sequence(sequence)
             self.assertCountEqual(found[a], [(2, 8), (16, 22)])
             self.assertCountEqual(found[b], [(2, 8)])
             self.assertNotIn(c, found)
             self.assertNotIn(d, found)
             self.assertCountEqual(found[e], [(2, 8)])
             self.assertCountEqual(found[f], [(2, 8)])
             self.assertCountEqual(found[g], [(2, 8), (16, 22)])
         finally:
             # Close the pool even when an assertion fails, so an open
             # pool is never left behind for the next iteration or test
             probe.close_probe_finding_pool()
コード例 #5
0
    def test_pigeonhole_with_mismatch(self):
        """Tests with short sequence and short probes
        where the call to construct_kmer_probe_map_to_find_probe_covers tries
        the pigeonhole approach.

        The same probes and sequence are checked twice, with min_k=3 and
        with min_k=4; the expected cover ranges are the same in both cases
        and only the k of the constructed map differs.
        """
        np.random.seed(1)
        sequence = 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        a = probe.Probe.from_str('GHIJXL')
        b = probe.Probe.from_str('BTUVWX')
        c = probe.Probe.from_str('ACEFHJ')
        probes = [a, b, c]

        # Each entry is (min_k, expected k of the constructed map):
        #  - min_k=3: this should try the pigeonhole approach, which
        #    should choose k=3
        #  - min_k=4: this should try the pigeonhole approach and fail
        #    because it chooses k=3, but min_k=4; so it should then try
        #    the random approach with k=4
        for min_k, expected_k in [(3, 3), (4, 4)]:
            kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
                probes, 1, 6, min_k=min_k, k=4)
            kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
            self.assertEqual(kmer_map.k, expected_k)
            f = probe.probe_covers_sequence_by_longest_common_substring(1, 6)
            for n_workers in [1, 2, 4, 7, 8]:
                probe.open_probe_finding_pool(kmer_map, f, n_workers)
                try:
                    found = probe.find_probe_covers_in_sequence(sequence)
                    self.assertCountEqual(found[a], [(6, 12)])
                    self.assertCountEqual(found[b], [(18, 24)])
                    self.assertNotIn(c, found)
                finally:
                    # Close the pool even when an assertion fails, so an
                    # open pool is never left behind for the next
                    # iteration or test
                    probe.close_probe_finding_pool()
コード例 #6
0
ファイル: set_cover_filter.py プロジェクト: pythseq/catch
    def _compute_tolerant_bp_covered_within_sequence(self,
                                                     sequence,
                                                     rc_too=True):
        """Compute number of bp captured in sequence by each input probe.

        A probe finding pool must be open prior to calling this function,
        and that pool should have been created using
        self.cover_range_tolerant_fn. That is, probe.open_probe_finding_pool()
        should have been called with the cover_range_for_probe_in_subsequence_fn
        argument equal to self.cover_range_tolerant_fn. The input probes
        are values in the kmer_probe_map argument that was passed to
        probe.open_probe_finding_pool().

        Uses self.cover_range_tolerant_fn for determining coverage (i.e.,
        the coverage is determined in a relatively tolerant way so that
        more potential hybridizations are included).

        Args:
            sequence: sequence as a string in which to determine the
                coverage of the probes
            rc_too: when True, the returned values also include bp that
                are captured in the reverse complement of sequence

        Raises:
            RuntimeError if the probe finding pool was not created with
            self.cover_range_tolerant_fn

        Returns:
            dict mapping each candidate probe to the number of bp it
            covers, for only the candidate probes that cover at least
            one bp; candidate probes that do not cover any bp are not
            included as keys in the returned dict
        """
        if probe._pfp_cover_range_for_probe_in_subsequence_fn != \
                self.cover_range_tolerant_fn:
            raise RuntimeError(("_compute_tolerant_bp_covered_within_"
                                "sequence() was called but the probe "
                                "finding pool was not created using "
                                "self.cover_range_tolerant_fn"))

        # Always search the given strand; optionally search its reverse
        # complement too. Characters other than A/T/C/G are left unchanged
        # when complementing.
        sequences_to_search = [sequence]
        if rc_too:
            rc_map = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
            sequences_to_search.append(
                ''.join([rc_map.get(b, b) for b in sequence[::-1]]))

        num_bp_covered = defaultdict(int)
        for search_sequence in sequences_to_search:
            probe_cover_ranges = probe.find_probe_covers_in_sequence(
                search_sequence)
            for p, cover_ranges in probe_cover_ranges.items():
                for cover_range in cover_ranges:
                    # Sum the lengths of the ranges covered by p across
                    # both strands
                    num_bp_covered[p] += cover_range[1] - cover_range[0]

        # Return a plain dict so that looking up a probe that covers
        # nothing does not silently create a key
        return dict(num_bp_covered)
コード例 #7
0
    def _votes_in_sequence(self, probes, sequence):
        """Determine an adapter vote for each probe from its overlaps.

        The probes' hybridization (alignment) to sequence (e.g., one
        target genome) is found first, and votes are then derived from
        how the aligned intervals overlap: the greedy interval scheduling
        algorithm picks a set of probes, each of which receives an 'A'
        vote, and every other probe that hybridizes to 'sequence'
        receives a 'B' vote.

        Args:
            probes: a list of candidate probes for which to determine votes
            sequence: a string of a sequence (e.g., from a target genome)
                to use when determining overlap among probes

        Returns:
            A list L, in which L[i] corresponds to the probe probes[i].
            L[i] is either (1,0) [vote for 'A'], (0,1) [vote for 'B'], or
            (0,0) [the probe does not hybridize in 'sequence'].
        """
        covers_by_probe = probe.find_probe_covers_in_sequence(sequence)
        hybridized = set(covers_by_probe.keys())

        # Pair every covered interval with the probe that covers it
        labeled_intervals = [(rng, p)
                             for p, ranges in covers_by_probe.items()
                             for rng in ranges]

        # The probes picked by interval scheduling get the 'A' adapter
        scheduled = set(interval.schedule(labeled_intervals))

        vote_a, vote_b, no_vote = (1, 0), (0, 1), (0, 0)
        votes = []
        for p in probes:
            if p in scheduled:
                votes.append(vote_a)
            elif p in hybridized:
                # p hybridizes but was skipped by interval scheduling
                votes.append(vote_b)
            else:
                # p does not hybridize to sequence
                votes.append(no_vote)
        return votes
コード例 #8
0
ファイル: test_probe.py プロジェクト: pythseq/catch
 def test_repetitive(self):
     """Tests with short sequence and short probes
     where the sequence and probes have repetitive sequences, so that
     one probe can cover a lot of the sequence.
     """
     np.random.seed(1)
     sequence = 'ABCAAAAAAAAAAXYZXYZXYZXYZAAAAAAAAAAAAAXYZ'
     a = probe.Probe.from_str('NAAAAAAN')
     probes = [a]
     # This should default to the random approach, so set k (rather than
     # min_k)
     kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
         probes, 0, 6, k=6)
     kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
     f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
     for n_workers in [1, 2, 4, 7, 8]:
         probe.open_probe_finding_pool(kmer_map, f, n_workers)
         try:
             found = probe.find_probe_covers_in_sequence(sequence)
             self.assertCountEqual(found[a], [(3, 13), (25, 38)])
         finally:
             # Close the pool even when an assertion fails, so an open
             # pool is never left behind for the next iteration or test
             probe.close_probe_finding_pool()
コード例 #9
0
ファイル: test_probe.py プロジェクト: pythseq/catch
 def test_island_with_exact_match2(self):
     """Tests the 'island_with_exact_match' argument for
     probe.probe_covers_sequence_by_longest_common_substring(..).
     """
     np.random.seed(1)
     sequence = 'ABCDEFGHIJKLMNOPCDEFGHQRSTU'
     a = probe.Probe.from_str('HXJKLMNOPCDE')
     b = probe.Probe.from_str('XIJKXMNOXCDE')
     c = probe.Probe.from_str('XIJKXMNOPXDE')
     probes = [a, b, c]
     kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
         probes, 3, 6, k=3)
     kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
     fn = probe.probe_covers_sequence_by_longest_common_substring(3, 6, 4)
     for n_workers in [1, 2, 4, 7, 8]:
         probe.open_probe_finding_pool(kmer_map, fn, n_workers)
         try:
             found = probe.find_probe_covers_in_sequence(sequence)
             self.assertCountEqual(found[a], [(7, 19)])
             self.assertNotIn(b, found)
             self.assertCountEqual(found[c], [(7, 19)])
         finally:
             # Close the pool even when an assertion fails, so an open
             # pool is never left behind for the next iteration or test
             probe.close_probe_finding_pool()
コード例 #10
0
ファイル: test_probe.py プロジェクト: pythseq/catch
 def test_two_occurrences(self):
     """Tests with short sequence and short probes
     where one probe appears twice.
     """
     np.random.seed(1)
     sequence = 'ABCDEFGHIJKLMNOPCDEFGHQRSTU'
     a = probe.Probe.from_str('CDEFGH')
     b = probe.Probe.from_str('GHIJKL')
     c = probe.Probe.from_str('STUVWX')
     probes = [a, b, c]
     kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
         probes, 0, 6, min_k=6)
     kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
     f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
     for n_workers in [1, 2, 4, 7, 8]:
         probe.open_probe_finding_pool(kmer_map, f, n_workers)
         try:
             found = probe.find_probe_covers_in_sequence(sequence)
             self.assertCountEqual(found[a], [(2, 8), (16, 22)])
             self.assertCountEqual(found[b], [(6, 12)])
             self.assertNotIn(c, found)
         finally:
             # Close the pool even when an assertion fails, so an open
             # pool is never left behind for the next iteration or test
             probe.close_probe_finding_pool()
コード例 #11
0
ファイル: test_probe.py プロジェクト: pythseq/catch
 def test_too_short_sequence_small_k(self):
     """Tests with sequence shorter than the probe length.
     """
     np.random.seed(1)
     sequence = 'ABCDEFGHI'
     a = probe.Probe.from_str('ABCDEFGHIJKL')
     b = probe.Probe.from_str('EFGHIJKLMNOP')
     c = probe.Probe.from_str('DEFGHIJKLMNO')
     d = probe.Probe.from_str('XYZXYZABCDEF')
     probes = [a, b, c, d]
     kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
         probes, 0, 6, min_k=6, k=6)
     kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
     f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
     for n_workers in [1, 2, 4, 7, 8]:
         probe.open_probe_finding_pool(kmer_map, f, n_workers)
         try:
             found = probe.find_probe_covers_in_sequence(sequence)
             self.assertCountEqual(found[a], [(0, 9)])
             self.assertNotIn(b, found)
             self.assertCountEqual(found[c], [(3, 9)])
             self.assertCountEqual(found[d], [(0, 6)])
         finally:
             # Close the pool even when an assertion fails, so an open
             # pool is never left behind for the next iteration or test
             probe.close_probe_finding_pool()
コード例 #12
0
ファイル: test_probe.py プロジェクト: pythseq/catch
 def test_too_short_sequence_large_k(self):
     """Tests with sequence shorter than the probe length and also
     shorter than k.
     """
     np.random.seed(1)
     sequence = 'ABCDEFGHI'
     a = probe.Probe.from_str('ABCDEFGHIJKL')
     b = probe.Probe.from_str('EFGHIJKLMNOP')
     c = probe.Probe.from_str('DEFGHIJKLMNO')
     d = probe.Probe.from_str('XYZXYZABCDEF')
     probes = [a, b, c, d]
     # probe.find_probe_covers_in_sequence() should not attempt
     # to cover the sequence (return {}), but should run gracefully
     for k in [10, 11, 12]:
         kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
             probes, 0, 6, min_k=k, k=k)
         kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
         f = probe.probe_covers_sequence_by_longest_common_substring(0, 6)
         for n_workers in [1, 2, 4, 7, 8]:
             probe.open_probe_finding_pool(kmer_map, f, n_workers)
             try:
                 found = probe.find_probe_covers_in_sequence(sequence)
                 self.assertEqual(found, {})
             finally:
                 # Close the pool even when an assertion fails, so an
                 # open pool is never left behind for the next
                 # iteration or test
                 probe.close_probe_finding_pool()
コード例 #13
0
    def _find_covers_in_target_genomes(self):
        """Find intervals across the target genomes covered by the probe set.

        This considers the given probe set (self.probes) and determines the
        intervals, in each genome of the target genomes (as well as their
        reverse complements), that are covered by the probes. This saves a
        dict, self.target_covers, as follows: self.target_covers[i][j][b]
        is a list of all the intervals covered by the probes in the target
        genome j of grouping i (in the reverse complement of the genome if
        b is True, and provided sequence if b is False).

        The endpoints of the intervals are offset so as to give unique integer
        positions in the genome (e.g., endpoints in the second chromosome
        are offset based on the length of the first chromosome). There may
        be duplicate intervals if two probes cover the same region of a
        sequence.

        The probe finding pool opened here is closed before returning, even
        if finding the covers raises an exception.
        """
        logger.info("Finding probe covers across target genomes")
        logger.info("Building map from k-mers to probes")
        # Note that if adapters are added to the probes before this filter
        # is run (which would be typical), then self.lcf_thres will likely
        # be less than the probe length. So the k-mer to probe map will
        # be constructed using the random approach (yielding many k-mers
        # and thus a slower runtime in finding probe covers) rather than
        # the pigeonhole approach.
        kmer_probe_map = probe.SharedKmerProbeMap.construct(
            probe.construct_kmer_probe_map_to_find_probe_covers(
                self.probes, self.mismatches, self.lcf_thres,
                min_k=self.kmer_probe_map_k, k=self.kmer_probe_map_k)
        )
        probe.open_probe_finding_pool(kmer_probe_map,
                                      self.cover_range_fn)

        # Complement lookup used for reverse complements; characters that
        # are not A/T/C/G are left unchanged. Built once rather than for
        # every sequence.
        rc_map = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}

        try:
            self.target_covers = {}
            for i, j, gnm, rc in self._iter_target_genomes():
                if not rc:
                    logger.info(("Computing coverage in grouping %d (of %d), "
                                 "with target genome %d (of %d)"), i + 1,
                                len(self.target_genomes), j + 1,
                                len(self.target_genomes[i]))
                covers_in_group = self.target_covers.setdefault(i, {})
                covers_in_genome = covers_in_group.setdefault(
                    j, {False: None, True: None})

                gnm_covers = []
                length_so_far = 0
                for sequence in gnm.seqs:
                    if rc:
                        # Take the reverse complement of sequence
                        sequence = ''.join([rc_map.get(b, b)
                                            for b in sequence[::-1]])

                    # Find cover ranges of the probes, while allowing the
                    # ranges to overlap (e.g., if one probe covers two
                    # regions that overlap)
                    probe_cover_ranges = probe.find_probe_covers_in_sequence(
                        sequence,
                        merge_overlapping=False)
                    for cover_ranges in probe_cover_ranges.values():
                        for cover_range in cover_ranges:
                            # Extend the range covered by the probe on
                            # both sides by self.cover_extension, clipped
                            # to the bounds of the sequence
                            cover_start = max(
                                0, cover_range[0] - self.cover_extension)
                            cover_end = min(
                                len(sequence),
                                cover_range[1] + self.cover_extension)
                            # The endpoints of the cover give positions in
                            # just this sequence (chromosome), so adjust
                            # them (according to length_so_far) to give a
                            # unique integer position in the genome gnm
                            gnm_covers.append(
                                (cover_start + length_so_far,
                                 cover_end + length_so_far))
                    length_so_far += len(sequence)
                covers_in_genome[rc] = gnm_covers
        finally:
            # Always release the pool, including when an error occurs
            # while computing covers
            probe.close_probe_finding_pool()
コード例 #14
0
ファイル: set_cover_filter.py プロジェクト: pythseq/catch
    def _make_sets(self, candidate_probes, target_genomes):
        """Return a collection of sets to use in set cover.

        In the returned collection of sets, each set corresponds to a
        candidate probe and contains the bases of the target genomes
        covered by the candidate probe. The target genomes must be in
        grouped lists inside the list target_genomes.

        The output is intended for input to set_cover.approx_multiuniverse
        as the 'sets' input.

        The probe finding pool opened here is closed before returning, even
        if computing the coverage raises an exception.

        Args:
            candidate_probes: list of candidate probes
            target_genomes: list of groups of target genomes

        Returns:
            a dict mapping set_ids (from 0 through
            len(candidate_probes)-1) to dicts, where the dict for a
            particular set_id maps universe_ids to sets. set_id
            corresponds to a candidate probe in candidate_probes and
            universe_id is a tuple that corresponds to a target genome in
            a grouping from target_genomes. The j'th target genome
            from the i'th grouping in target_genomes is given
            universe_id equal to (i,j). That is, i ranges from 0 through
            len(target_genomes)-1 (i.e., the number of groupings) and
            j ranges from 0 through (n_i)-1 where n_i is the number of
            target genomes in the i'th group. In the returned value
            (sets), sets[set_id][universe_id] is a set of all the bases
            (as an instance of interval.IntervalSet) covered by probe
            set_id in the target genome universe_id. (If
            sets[set_id][universe_id] contains just one interval, then that
            interval is stored directly as a tuple -- not in an instance
            of interval.IntervalSet -- to save space and it should be
            converted to an interval.IntervalSet when needed.)
        """
        logger.info("Building map from k-mers to probes")
        kmer_probe_map = probe.SharedKmerProbeMap.construct(
            probe.construct_kmer_probe_map_to_find_probe_covers(
                candidate_probes,
                self.mismatches,
                self.lcf_thres,
                min_k=self.kmer_probe_map_k,
                k=self.kmer_probe_map_k))
        probe.open_probe_finding_pool(kmer_probe_map, self.cover_range_fn)

        # Map each probe to its index in candidate_probes (if a probe
        # appears more than once, its last index is used), and start every
        # index with an empty dict of universes
        probe_id = {p: idx for idx, p in enumerate(candidate_probes)}
        sets = {idx: {} for idx in range(len(candidate_probes))}

        try:
            for i, genomes_from_group in enumerate(target_genomes):
                for j, gnm in enumerate(genomes_from_group):
                    logger.info(("Computing coverage in grouping %d (of %d), "
                                 "with target genome %d (of %d)"), i + 1,
                                len(target_genomes), j + 1,
                                len(genomes_from_group))
                    universe_id = (i, j)
                    length_so_far = 0
                    for sequence in gnm.seqs:
                        probe_cover_ranges = \
                            probe.find_probe_covers_in_sequence(sequence)
                        # Add the bases of sequence that are covered by all
                        # the probes into sets with universe_id equal to
                        # (i,j)
                        for p, cover_ranges in probe_cover_ranges.items():
                            universes = sets[probe_id[p]]
                            for cover_range in cover_ranges:
                                # Extend the range covered by probe p on
                                # both sides by self.cover_extension,
                                # clipped to the bounds of the sequence
                                cover_start = max(
                                    0, cover_range[0] - self.cover_extension)
                                cover_end = min(
                                    len(sequence),
                                    cover_range[1] + self.cover_extension)
                                # The endpoints of the cover give positions
                                # in just this sequence (chromosome), so
                                # adding the lengths of all the sequences
                                # previously iterated (length_so_far) onto
                                # them gives unique integer positions in
                                # the genome gnm
                                adjusted_cover = (cover_start + length_so_far,
                                                  cover_end + length_so_far)
                                if universe_id not in universes:
                                    # Since a list has a lot of overhead
                                    # and most probes align to just one
                                    # interval, simply store that interval
                                    # alone (not in a list)
                                    universes[universe_id] = adjusted_cover
                                else:
                                    prev_cover = universes[universe_id]
                                    if isinstance(prev_cover, tuple):
                                        # This probe now aligns to two
                                        # intervals in this universe/genome,
                                        # so store them in a list
                                        universes[universe_id] = [prev_cover]
                                    universes[universe_id].append(
                                        adjusted_cover)
                        length_so_far += len(sequence)
        finally:
            # Always release the pool, including when an error occurs
            # while computing coverage
            probe.close_probe_finding_pool()

        # Drop the reference to the map and collect garbage before
        # building the IntervalSets below
        del kmer_probe_map
        gc.collect()

        # Make an IntervalSet out of the intervals of each set. But if
        # there is just one interval in a set, then save space by leaving
        # that entry as a tuple. (Only values of existing keys are
        # replaced, so the dicts do not change size while being iterated.)
        for universes in sets.values():
            for universe_id, intervals in universes.items():
                if not isinstance(intervals, tuple):
                    universes[universe_id] = interval.IntervalSet(intervals)
                # Else, there is just one interval in this set; leave it
                # stored directly as a tuple

        return sets
コード例 #15
0
    def run_random(self,
                   n,
                   genome_min,
                   genome_max,
                   num_probes,
                   probe_length=100,
                   lcf_thres=None,
                   seed=1,
                   n_workers=2,
                   use_native_dict=False):
        """Run tests with a randomly generated sequence.

        Repeatedly runs tests in which a sequence is randomly generated,
        probes are generated from that sequence, and then the probes are
        looked up in the sequence.

        Creates the probes with the intention of determining coverage with
        a longest common substring.

        The order of the calls to np.random matters: for a given seed it
        determines the generated sequences and probes.

        Args:
            n: number of times to run the test
            genome_min/genome_max: the genome (sequence) size is
                randomly chosen between genome_min and genome_max; the
                chosen size must be greater than probe_length, because
                probe start positions are drawn with
                np.random.randint(0, seq_length - probe_length)
            num_probes: the number of probes generated from the random
                sequence
            probe_length: number of bp to make each probe
            lcf_thres: lcf threshold parameter; when None, it is
                randomly chosen among 80 and 100
            seed: random number generator seed
            n_workers: number of workers to have in a probe finding pool
            use_native_dict: have the probe finding pool use a native Python
                dict
        """
        np.random.seed(seed)
        fixed_lcf_thres = lcf_thres
        bases = ['A', 'T', 'C', 'G']

        for _ in range(n):
            if fixed_lcf_thres is not None:
                lcf_thres = fixed_lcf_thres
            else:
                # Choose either lcf_thres=80 or lcf_thres=100
                lcf_thres = np.random.choice([80, 100])
            # Make a random sequence
            seq_length = np.random.randint(genome_min, genome_max)
            sequence = "".join(
                np.random.choice(bases, size=seq_length, replace=True))
            desired_probe_cover_ranges = defaultdict(list)
            # Make num_probes random probes
            probes = []
            for _ in range(num_probes):
                subseq_start = np.random.randint(0, seq_length - probe_length)
                subseq_end = subseq_start + probe_length
                cover_length = np.random.randint(lcf_thres, probe_length + 1)
                cover_start = subseq_start + \
                    np.random.randint(0, probe_length - cover_length + 1)
                cover_end = min(seq_length, cover_start + cover_length)
                probe_str_cover = sequence[cover_start:cover_end]
                # Add random bases before and after what the probe should
                # cover
                probe_str_start = "".join(
                    np.random.choice(bases,
                                     size=cover_start - subseq_start,
                                     replace=True))
                probe_str_end = "".join(
                    np.random.choice(bases,
                                     size=subseq_end - cover_end,
                                     replace=True))
                probe_str = probe_str_start + probe_str_cover + probe_str_end
                # Add 0, 1, 2, or 3 random mismatches
                for _ in range(np.random.randint(0, 4)):
                    pos = np.random.randint(0, probe_length)
                    base_choices = [b for b in bases if b != probe_str[pos]]
                    probe_str = probe_str[:pos] + \
                        "".join(np.random.choice(base_choices, size=1)) + \
                        probe_str[(pos + 1):]
                p = probe.Probe.from_str(probe_str)
                desired_probe_cover_ranges[p].append((cover_start, cover_end))
                probes.append(p)
            kmer_map = probe.construct_kmer_probe_map_to_find_probe_covers(
                probes, 3, lcf_thres)
            kmer_map = probe.SharedKmerProbeMap.construct(kmer_map)
            f = probe.probe_covers_sequence_by_longest_common_substring(
                3, lcf_thres)
            probe.open_probe_finding_pool(kmer_map,
                                          f,
                                          n_workers,
                                          use_native_dict=use_native_dict)
            try:
                found = probe.find_probe_covers_in_sequence(sequence)
            finally:
                # Close the pool even when the search raises, so an open
                # pool is never left behind for the next iteration or test
                probe.close_probe_finding_pool()
            # Check that this didn't find any extraneous probes and that
            # it found at least 95% of the original (it may miss some
            # due to false negatives in the approach)
            self.assertLessEqual(len(found), len(probes))
            self.assertGreaterEqual(len(found), 0.95 * len(probes))
            # Check that each desired probe was found correctly
            for p, cover_ranges in desired_probe_cover_ranges.items():
                if p not in found:
                    continue
                found_cover_ranges = found[p]
                # This probe most likely was found once, but could have
                # been missed (due to false negatives in the approach) and
                # may have been found more than once due to chance (but
                # probably not too much more!)
                self.assertIn(len(found_cover_ranges), [1, 2])
                # The cover ranges should have been captured, and the ones
                # found may extend past what was desired by a small amount due
                # to allowing mismatches and chance
                # Because of mismatches possibly added to the end of the
                # desired cover range, what was recaptured may not always
                # encompass the entire cover range, so allow some small
                # tolerance
                for desired_cv in cover_ranges:
                    found_desired_cv = False
                    for found_cv in found_cover_ranges:
                        left_diff = desired_cv[0] - found_cv[0]
                        right_diff = found_cv[1] - desired_cv[1]
                        if -7 <= left_diff < 15 and -7 <= right_diff < 15:
                            found_desired_cv = True
                            break
                    self.assertTrue(found_desired_cv)