Esempio n. 1
0
 def test_trim_ends(self):
     # trim_ends() on the region (2, 5) should hand back a new pair
     # holding only the CDE / MNO columns.
     pair = AlignedPair(("q", "ABCDEFGHIJ"), ("s", "KLMNOPQRST"))
     region = AlignedRegion(pair, 2, 5)
     trimmed = region.trim_ends()
     expected = AlignedPair(("q", "CDE"), ("s", "MNO"))
     self.assertEqual(trimmed, expected)
Esempio n. 2
0
    def test_region_subject_to_query_00(self):
        # The subject opens with three gap columns, so subject position 0
        # is expected to land on alignment column 3.
        pair = AlignedPair(("a", "ABCDEFG"), ("b", "---KLMN"))

        # Each case: subject start, subject end, expected alignment span,
        # expected query span.
        cases = [
            (0, 2, (3, 5), (3, 5)),  # subject KL lines up with query DE
            (0, 0, (3, 3), (3, 3)),  # empty region: --- | KLMN, ABC | DEFG
        ]
        for start, end, alignment_span, query_span in cases:
            region = AlignedRegion.from_subject(pair, start, end)
            self.assertEqual(region.in_alignment(), alignment_span)
            self.assertEqual(region.in_subject(), (start, end))
            self.assertEqual(region.in_query(), query_span)
Esempio n. 3
0
 def test_without_endgaps_basic(self):
     # The query has two leading and three trailing gap columns; the
     # region without end gaps is expected to span columns 2 to 7.
     query = ("q", "--ABCDE---")
     subject = ("s", "FGHIJKLMNO")
     region = AlignedRegion.without_endgaps(AlignedPair(query, subject))
     self.assertEqual(region.start_idx, 2)
     self.assertEqual(region.end_idx, 7)
Esempio n. 4
0
 def test_from_subject_region_crazy(self):
     # Gaps are scattered through both sequences.  Subject positions
     # (0, 3) are expected to map onto alignment columns (2, 6).
     query = ("a", "-A-BC-EF---")
     subject = ("b", "--HI-JK-LMN")
     region = AlignedRegion.from_subject(AlignedPair(query, subject), 0, 3)
     self.assertEqual(region.start_idx, 2)
     self.assertEqual(region.end_idx, 6)
Esempio n. 5
0
    def find_in_seqs(self, seqs):
        """Yield primer matches for unmatched sequences, found by alignment.

        The already-matched sequences are written to a FASTA file and used
        as the search subject; the unmatched records are searched against
        it with VsearchAligner.  Each hit is extended with HitExtender, the
        subject's known match span is projected onto the query, and the
        result is yielded.

        Nothing is yielded when ``seqs.all_matched()`` is true.

        Yields:
            ``(query_id, PrimerMatch)`` tuples, one per search hit.
        """
        if seqs.all_matched():
            return

        # Working files: subject FASTA, query FASTA, and search results.
        subject_fp, query_fp, result_fp = (
            self._make_fp(template.format(self.suffix))
            for template in ("subject_{0}.fa", "query_{0}.fa", "query_{0}.txt")
        )

        # The subject file has to exist before the aligner is built on it.
        with open(subject_fp, "w") as subject_file:
            write_fasta(subject_file, seqs.get_matched_offset0())
        aligner = VsearchAligner(subject_fp)

        search_args = {
            "min_id": round(self.min_pct_id / 100, 2),
            "top_hits_only": None,
        }
        # Only pass a thread count when one was explicitly configured.
        if self.cores > 0:
            search_args.update(threads=self.cores)
        hits = aligner.search(
            seqs.get_unmatched_recs(),
            input_fp=query_fp,
            output_fp=result_fp,
            **search_args)

        # Refine each hit and translate the subject's match span into
        # query coordinates.
        extender = HitExtender(
            seqs.get_unmatched_recs(), seqs.get_matched_offset0())
        for hit in hits:
            alignment = extender.extend_hit(hit)
            known_match = seqs.matches[alignment.subject_id]
            region = AlignedRegion.from_subject(
                alignment, known_match.start, known_match.end)
            start_idx, end_idx = region.in_query()
            offset = region.query_offset()
            yield alignment.query_id, PrimerMatch(
                start_idx, end_idx, offset, "Alignment")
Esempio n. 6
0
    def _get_indiv_probability(self, alignment):
        """Compute the incompatibility probability for one alignment.

        Mismatches are counted inside the aligned region (end gaps removed)
        and combined with ``self.prior_alpha`` / ``self.prior_beta`` to
        parameterize ``beta_binomial_cdf`` over the subject positions that
        lie outside the region.  ``self.species_threshold`` sets how many
        mismatches are tolerated in total.

        Returns:
            dict with the subject id, the probability of incompatibility,
            and the intermediate counts used to compute it.
        """
        trimmed = AlignedRegion.without_endgaps(alignment).trim_ends()
        n_positions = trimmed.alignment_len
        n_matches = trimmed.count_matches()
        n_mismatches = n_positions - n_matches

        # Observed counts are added to the prior parameters.
        alpha = n_mismatches + self.prior_alpha
        beta = n_matches + self.prior_beta

        # Subject positions not covered by the aligned region.
        n_outside = alignment.subject_len - trimmed.subject_len
        n_total = n_positions + n_outside

        # Mismatch budget over the whole subject, less what the region
        # has already used up.
        mismatch_fraction = 1 - self.species_threshold
        mismatch_budget = int(math.floor(mismatch_fraction * n_total))
        budget_outside = mismatch_budget - n_mismatches

        prob_compatible = beta_binomial_cdf(
            budget_outside, n_outside, alpha, beta)

        return {
            "typestrain_id": alignment.subject_id,
            "probability_incompatible": 1 - prob_compatible,
            "region_mismatches": n_mismatches,
            "region_positions": n_positions,
            "region_matches": n_matches,
            "nonregion_positions_in_subject": n_outside,
            "max_nonregion_mismatches": budget_outside,
        }
Esempio n. 7
0
 def test_from_subject_with_endgaps(self):
     # The subject contains no gaps, so its coordinates (1, 7) are
     # expected to carry over unchanged to alignment columns.
     query = ("a", "--ABC-EF---")
     subject = ("b", "HIJKLMNOPQR")
     region = AlignedRegion.from_subject(AlignedPair(query, subject), 1, 7)
     self.assertEqual(region.start_idx, 1)
     self.assertEqual(region.end_idx, 7)
Esempio n. 8
0
 def test_from_subject_no_endgaps(self):
     # Neither sequence has gaps: subject coordinates (2, 5) should be
     # the alignment coordinates as well.
     query = ("a", "ABCDEF")
     subject = ("b", "HIJKLM")
     region = AlignedRegion.from_subject(AlignedPair(query, subject), 2, 5)
     self.assertEqual(region.start_idx, 2)
     self.assertEqual(region.end_idx, 5)
Esempio n. 9
0
 def test_without_endgaps_hard(self):
     # End gaps appear on the left of the query and on the right of the
     # subject, with internal gaps in both; columns 2 to 7 are expected
     # to remain.
     query = ("q", "--A-CDEFGH")
     #                |||||
     subject = ("s", "FG-IJ-L---")
     region = AlignedRegion.without_endgaps(AlignedPair(query, subject))
     self.assertEqual(region.start_idx, 2)
     self.assertEqual(region.end_idx, 7)
Esempio n. 10
0
    def test_region_subject_to_query_crazy_alignment(self):
        # Gaps are scattered through both sequences.
        pair = AlignedPair(("a", "-A-BC-EF---"), ("b", "--HI-JK-LMN"))

        # Each case: positional bounds handed to from_subject(), then the
        # expected subject, alignment, and query spans.  Empty bounds
        # exercise the default (no start/end given).
        cases = [
            ((0, 3), (0, 3), (2, 6), (1, 3)),   # HIJ / HI-J / BC in HIJ
            ((1, 4), (1, 4), (3, 7), (1, 4)),   # IJK / I-JK / BCE in IJK
            ((), (0, 7), (2, 11), (1, 5)),      # HIJKLMN / HI-JK-LMN / BCEF
        ]
        for bounds, subject_span, alignment_span, query_span in cases:
            region = AlignedRegion.from_subject(pair, *bounds)
            self.assertEqual(region.in_subject(), subject_span)
            self.assertEqual(region.in_alignment(), alignment_span)
            self.assertEqual(region.in_query(), query_span)
Esempio n. 11
0
    def test_region_subject_to_query_with_endgaps(self):
        # The subject has no gaps; the query has end gaps on both sides
        # and one internal gap.
        pair = AlignedPair(("a", "--ABC-EF---"), ("b", "HIJKLMNOPQR"))

        # Each case: positional bounds handed to from_subject(), then the
        # expected subject, alignment, and query spans.  Empty bounds
        # exercise the default (no start/end given).
        cases = [
            ((0, 3), (0, 3), (0, 3), (0, 1)),     # HIJ covers query A
            ((1, 6), (1, 6), (1, 6), (0, 3)),     # IJKLM covers query ABC
            ((), (0, 11), (0, 11), (0, 5)),       # whole subject, ABCEF
        ]
        for bounds, subject_span, alignment_span, query_span in cases:
            region = AlignedRegion.from_subject(pair, *bounds)
            self.assertEqual(region.in_subject(), subject_span)
            self.assertEqual(region.in_alignment(), alignment_span)
            self.assertEqual(region.in_query(), query_span)
Esempio n. 12
0
 def test_from_query_region_crazy(self):
     # Query positions (0, 3) are expected to cover alignment columns
     # (1, 5), marked below.
     query = ("a", "-A-BC-EF---")
     #                ||||
     subject = ("b", "--HI-JK-LMN")
     region = AlignedRegion.from_query(AlignedPair(query, subject), 0, 3)
     self.assertEqual(region.in_alignment(), (1, 5))
     self.assertEqual(region.in_query(), (0, 3))
     self.assertEqual(region.in_subject(), (0, 2))
     self.assertEqual(region.query_offset(), 0)
     self.assertEqual(region.subject_offset(), 0)
Esempio n. 13
0
 def test_from_query_with_endgaps(self):
     # The query carries end gaps and one internal gap; the subject has
     # none, so subject coordinates equal alignment columns here.
     query = ("a", "--ABC-EF---")
     subject = ("b", "HIJKLMNOPQR")
     region = AlignedRegion.from_query(AlignedPair(query, subject), 1, 4)
     self.assertEqual(region.start_idx, 3)
     self.assertEqual(region.end_idx, 7)
     self.assertEqual(region.in_query(), (1, 4))
     self.assertEqual(region.in_subject(), (3, 7))
     self.assertEqual(region.query_offset(), 0)
     self.assertEqual(region.subject_offset(), 0)
Esempio n. 14
0
 def test_from_query_no_endgaps(self):
     # With no gaps anywhere, query, subject, and alignment coordinates
     # are all expected to agree.
     query = ("a", "ABCDEF")
     subject = ("b", "HIJKLM")
     region = AlignedRegion.from_query(AlignedPair(query, subject), 2, 5)
     self.assertEqual(region.start_idx, 2)
     self.assertEqual(region.end_idx, 5)
     self.assertEqual(region.in_query(), (2, 5))
     self.assertEqual(region.in_subject(), (2, 5))
     self.assertEqual(region.query_offset(), 0)
     self.assertEqual(region.subject_offset(), 0)
Esempio n. 15
0
 def test_query_offset_left(self):
     # The region (7, 10) covers only gap columns of the query, one
     # column past the query's last residue; a query offset of -1 is
     # expected.
     query = ("a", "ABCDEF------")
     #                     |||
     subject = ("b", "--JKLMNOPQRS")
     region = AlignedRegion(AlignedPair(query, subject), 7, 10)
     self.assertEqual(region.in_query(), (6, 6))
     self.assertEqual(region.in_subject(), (5, 8))
     self.assertEqual(region.query_offset(), -1)
     self.assertEqual(region.subject_offset(), 0)
Esempio n. 16
0
 def test_query_offset_right(self):
     # The region (1, 4) covers only gap columns of the query, ending two
     # columns before the query's first residue; a query offset of 2 is
     # expected.
     query = ("a", "------ABCDEF")
     #               |||
     subject = ("b", "GHIJKLMNOP--")
     region = AlignedRegion(AlignedPair(query, subject), 1, 4)
     self.assertEqual(region.in_query(), (0, 0))
     self.assertEqual(region.in_subject(), (1, 4))
     self.assertEqual(region.query_offset(), 2)
     self.assertEqual(region.subject_offset(), 0)
Esempio n. 17
0
    def test_region_subject_to_query_no_endgaps(self):
        a = AlignedPair(
            ("a", "ABCDEF"),
            ("b", "HIJKLM"))
        # In an alignment with no gaps, the query sequence coordinates should
        # always match the subject sequence coordinates, so a region built
        # from subject coordinates and one built from the same query
        # coordinates must cover the same alignment columns.
        #
        # Each from_query() region is asserted on directly; checking the
        # from_subject() region a second time would leave from_query()
        # untested.
        r = AlignedRegion.from_subject(a, 0, 3)
        self.assertEqual(r.in_alignment(), (0, 3))
        rq = AlignedRegion.from_query(a, 0, 3)
        self.assertEqual(rq.in_alignment(), (0, 3))

        r = AlignedRegion.from_subject(a, 1, 5)
        self.assertEqual(r.in_alignment(), (1, 5))
        rq = AlignedRegion.from_query(a, 1, 5)
        self.assertEqual(rq.in_alignment(), (1, 5))

        # No bounds given: both constructors should span the whole alignment.
        r = AlignedRegion.from_subject(a)
        self.assertEqual(r.in_alignment(), (0, 6))
        rq = AlignedRegion.from_query(a)
        self.assertEqual(rq.in_alignment(), (0, 6))
Esempio n. 18
0
    def unassign_threshold(self, min_id=0.975, soft_threshold=False):
        """Estimate the probability that the alignment misses ``min_id``.

        The beta distribution for the mismatch rate is reparameterized as
        ``mu = alpha / (alpha + beta)`` and ``v = alpha + beta``.  ``v`` is
        held constant between the aligned region (1) and the rest of the
        subject (2); ``mu2`` is obtained from ``mu1`` by applying the mean
        of ``log(mu2 / mu1)`` observed in the reference sequences returned
        by ``self._get_mismatches``.  Five zeros are always included in
        that mean to stabilize it, so with no references ``mu2 == mu1``.

        Args:
            min_id: identity threshold as a fraction; ``1 - min_id`` is the
                tolerated mismatch fraction.
            soft_threshold: if true, ``soft_species_probability`` is passed
                to ``threshold_assignment_probability``; otherwise
                ``hard_species_probability`` is.

        Returns:
            dict with the subject id, the probability of incompatibility,
            and the intermediate quantities used to compute it.
        """
        # Clip the alignment down to the region free of end gaps.
        region = AlignedRegion.without_endgaps(self.alignment)
        clipped = region.trim_ends()
        n_region = clipped.alignment_len
        n_matches = clipped.count_matches()
        n_mismatches = n_region - n_matches
        n_region_subject = clipped.subject_len

        # Beta parameters inside the aligned region, plus their (mu, v)
        # reparameterization.
        alpha1 = n_mismatches + 0.5
        beta1 = n_matches + 0.5
        v1 = alpha1 + beta1
        mu1 = alpha1 / v1

        # Subject positions not covered by the aligned region.
        n_outside = self.alignment.subject_len - n_region_subject
        n_total = n_region + n_outside

        # Mismatch counts of the reference sequences over the same
        # subject span.
        typestrain_id = self.alignment.subject_id
        subject_start, subject_end = region.in_subject()
        reference_counts = self._get_mismatches(
            typestrain_id, subject_start, subject_end)

        # gamma = mean of log(mu2 / mu1) over the references, where
        #   mu = (mismatches + 0.5) / (len + 1)
        # follows from alpha = mismatches + 0.5 and beta = matches + 0.5.
        # The five leading zeros are the stabilizing pseudo-observations.
        reference_logvals = [0, 0, 0, 0, 0]
        for inside_mms, outside_mms in reference_counts:
            mu_inside = (inside_mms + 0.5) / (n_region_subject + 1)
            mu_outside = (outside_mms + 0.5) / (n_outside + 1)
            reference_logvals.append(math.log(mu_outside / mu_inside))
        # TODO: add weighting
        gamma = numpy.mean(reference_logvals)

        # log(mu2) = log(mu1) + gamma; v is unchanged, so
        # alpha2 = mu2 * v and beta2 = v - alpha2.
        mu2 = math.exp(math.log(mu1) + gamma)
        v2 = v1
        alpha2 = mu2 * v2
        beta2 = v2 - alpha2

        # Mismatch budget outside the observed region.
        mismatch_fraction = 1 - min_id
        mismatch_budget = int(math.floor(mismatch_fraction * n_total))
        budget_outside = mismatch_budget - n_mismatches

        threshold_fcn = (
            soft_species_probability if soft_threshold
            else hard_species_probability)
        prob_compatible = threshold_assignment_probability(
            n_mismatches,
            n_region,
            n_outside,
            alpha2,
            beta2,
            100 * mismatch_fraction,
            threshold_fcn,
        )

        return {
            "typestrain_id": typestrain_id,
            "region_mismatches": n_mismatches,
            "region_positions": n_region,
            "probability_incompatible": 1 - prob_compatible,
            "mu1": mu1,
            "num_references": len(reference_logvals),
            "mu2": mu2,
            "nonregion_positions_in_subject": n_outside,
            "max_nonregion_mismatches": budget_outside,
        }