def test_trim_ends(self):
    # Clipping alignment columns 2..5 should keep "CDE" from the query
    # and "MNO" from the subject.
    query = ("q", "ABCDEFGHIJ")
    subject = ("s", "KLMNOPQRST")
    region = AlignedRegion(AlignedPair(query, subject), 2, 5)
    trimmed = region.trim_ends()
    self.assertEqual(trimmed, AlignedPair(("q", "CDE"), ("s", "MNO")))
def test_region_subject_to_query_00(self):
    # The subject row starts with three gap columns, so subject
    # coordinates are shifted by 3 relative to alignment/query coordinates.
    pair = AlignedPair(
        ("a", "ABCDEFG"),
        ("b", "---KLMN"),
    )
    cases = [
        # (subject bounds, in_alignment, in_subject, in_query)
        ((0, 2), (3, 5), (0, 2), (3, 5)),  # KL in subject, DE in query
        ((0, 0), (3, 3), (0, 0), (3, 3)),  # empty region: --- | KLMN
    ]
    for (start, end), alignment_span, subject_span, query_span in cases:
        region = AlignedRegion.from_subject(pair, start, end)
        self.assertEqual(region.in_alignment(), alignment_span)
        self.assertEqual(region.in_subject(), subject_span)
        self.assertEqual(region.in_query(), query_span)
def test_without_endgaps_basic(self):
    # Query has 2 leading and 3 trailing gap columns; the region without
    # end gaps should therefore span alignment columns 2..7.
    query = ("q", "--ABCDE---")
    subject = ("s", "FGHIJKLMNO")
    region = AlignedRegion.without_endgaps(AlignedPair(query, subject))
    self.assertEqual(region.start_idx, 2)
    self.assertEqual(region.end_idx, 7)
def test_from_subject_region_crazy(self):
    # Gaps are scattered through both rows; subject positions 0..3 are
    # expected to map to alignment columns 2..6.
    query = ("a", "-A-BC-EF---")
    subject = ("b", "--HI-JK-LMN")
    region = AlignedRegion.from_subject(AlignedPair(query, subject), 0, 3)
    self.assertEqual(region.start_idx, 2)
    self.assertEqual(region.end_idx, 6)
def find_in_seqs(self, seqs):
    """Yield ``(query_id, PrimerMatch)`` pairs found by alignment search.

    The already-matched sequences (``seqs.get_matched_offset0()``) are
    written to a FASTA file and used as the search subject; the unmatched
    records (``seqs.get_unmatched_recs()``) are the queries.  Every hit is
    extended with ``HitExtender``, and the stored match of the hit's subject
    (``seqs.matches[subject_id]``) is mapped onto the query through the
    alignment.

    Nothing is yielded when ``seqs.all_matched()`` is true.
    """
    if seqs.all_matched():
        return

    # Create the file paths (subject, query, result -- in that order)
    subject_fp, query_fp, result_fp = [
        self._make_fp(template.format(self.suffix))
        for template in ("subject_{0}.fa", "query_{0}.fa", "query_{0}.txt")
    ]

    # Search
    with open(subject_fp, "w") as subject_file:
        write_fasta(subject_file, seqs.get_matched_offset0())
    aligner = VsearchAligner(subject_fp)

    # min_pct_id is divided by 100 before being passed as "min_id".
    search_args = {
        "min_id": round(self.min_pct_id / 100, 2),
        "top_hits_only": None,
    }
    if self.cores > 0:
        search_args["threads"] = self.cores

    hits = aligner.search(
        seqs.get_unmatched_recs(),
        input_fp=query_fp,
        output_fp=result_fp,
        **search_args)

    # Refine
    extender = HitExtender(
        seqs.get_unmatched_recs(), seqs.get_matched_offset0())
    for hit in hits:
        alignment = extender.extend_hit(hit)
        known_match = seqs.matches[alignment.subject_id]
        region = AlignedRegion.from_subject(
            alignment, known_match.start, known_match.end)
        start_in_query, end_in_query = region.in_query()
        new_match = PrimerMatch(
            start_in_query, end_in_query, region.query_offset(),
            "Alignment")
        yield alignment.query_id, new_match
def _get_indiv_probability(self, alignment):
    """Summarise mismatch statistics for one alignment.

    The aligned region is the alignment without end gaps, trimmed to that
    region.  Mismatches inside the region, combined with ``self.prior_alpha``
    and ``self.prior_beta``, parameterise ``beta_binomial_cdf`` (presumably a
    beta-binomial CDF -- its definition is outside this block).  The CDF is
    evaluated at the largest number of mismatches outside the region that
    still keeps the total within ``1 - self.species_threshold``.

    Returns a dict with the subject id, the probability of incompatibility
    (``1 - cdf``) and the counts used to compute it.
    """
    region = AlignedRegion.without_endgaps(alignment).trim_ends()

    # Counts inside the aligned region.
    n_region = region.alignment_len
    n_match = region.count_matches()
    n_mismatch = n_region - n_match
    alpha = n_mismatch + self.prior_alpha
    beta = n_match + self.prior_beta

    # Subject positions not covered by the aligned region.
    n_outside = alignment.subject_len - region.subject_len
    n_total = n_region + n_outside

    # Mismatch budget over all positions, minus what is already observed.
    mismatch_fraction = 1 - self.species_threshold
    mismatch_budget = int(math.floor(mismatch_fraction * n_total))
    max_outside_mismatches = mismatch_budget - n_mismatch

    prob_compatible = beta_binomial_cdf(
        max_outside_mismatches, n_outside, alpha, beta)

    return {
        "typestrain_id": alignment.subject_id,
        "probability_incompatible": 1 - prob_compatible,
        "region_mismatches": n_mismatch,
        "region_positions": n_region,
        "region_matches": n_match,
        "nonregion_positions_in_subject": n_outside,
        "max_nonregion_mismatches": max_outside_mismatches,
    }
def test_from_subject_with_endgaps(self):
    # The subject row has no gaps, so subject coordinates 1..7 are
    # expected to be identical to alignment columns 1..7.
    query = ("a", "--ABC-EF---")
    subject = ("b", "HIJKLMNOPQR")
    region = AlignedRegion.from_subject(AlignedPair(query, subject), 1, 7)
    self.assertEqual(region.start_idx, 1)
    self.assertEqual(region.end_idx, 7)
def test_from_subject_no_endgaps(self):
    # Gap-free alignment: subject coordinates 2..5 map straight onto
    # alignment columns 2..5.
    query = ("a", "ABCDEF")
    subject = ("b", "HIJKLM")
    region = AlignedRegion.from_subject(AlignedPair(query, subject), 2, 5)
    self.assertEqual(region.start_idx, 2)
    self.assertEqual(region.end_idx, 5)
def test_without_endgaps_hard(self):
    # End gaps occur in both rows (leading in the query, trailing in the
    # subject) and there are internal gaps as well.
    pair = AlignedPair(
        ("q", "--A-CDEFGH"),
        #        |||||
        ("s", "FG-IJ-L---"),
    )
    region = AlignedRegion.without_endgaps(pair)
    self.assertEqual(region.start_idx, 2)
    self.assertEqual(region.end_idx, 7)
def test_region_subject_to_query_crazy_alignment(self):
    # Both rows carry leading, internal and trailing gaps.
    pair = AlignedPair(
        ("a", "-A-BC-EF---"),
        ("b", "--HI-JK-LMN"),
    )
    cases = [
        # (from_subject bounds, in_subject, in_alignment, in_query)
        ((0, 3), (0, 3), (2, 6), (1, 3)),  # HIJ / HI-J / BC in HIJ
        ((1, 4), (1, 4), (3, 7), (1, 4)),  # IJK / I-JK / BCE in IJK
        # No bounds: whole sequence HIJKLMN / HI-JK-LMN / BCEF in subject
        ((), (0, 7), (2, 11), (1, 5)),
    ]
    for bounds, subject_span, alignment_span, query_span in cases:
        region = AlignedRegion.from_subject(pair, *bounds)
        self.assertEqual(region.in_subject(), subject_span)
        self.assertEqual(region.in_alignment(), alignment_span)
        self.assertEqual(region.in_query(), query_span)
def test_region_subject_to_query_with_endgaps(self):
    # The subject row is gap-free, so subject and alignment coordinates
    # coincide; only the query coordinates are shifted by its gaps.
    pair = AlignedPair(
        ("a", "--ABC-EF---"),
        ("b", "HIJKLMNOPQR"),
    )
    cases = [
        # (from_subject bounds, in_subject, in_alignment, in_query)
        ((0, 3), (0, 3), (0, 3), (0, 1)),  # HIJ; A in HIJ
        ((1, 6), (1, 6), (1, 6), (0, 3)),  # IJKLM; ABC in IJKLM
        # No bounds: whole sequence; ABCEF in subject
        ((), (0, 11), (0, 11), (0, 5)),
    ]
    for bounds, subject_span, alignment_span, query_span in cases:
        region = AlignedRegion.from_subject(pair, *bounds)
        self.assertEqual(region.in_subject(), subject_span)
        self.assertEqual(region.in_alignment(), alignment_span)
        self.assertEqual(region.in_query(), query_span)
def test_from_query_region_crazy(self):
    # Query positions 0..3 (A, B, C) lie in alignment columns 1..5.
    pair = AlignedPair(
        ("a", "-A-BC-EF---"),
        #       ||||       (1, 5)
        ("b", "--HI-JK-LMN"),
    )
    region = AlignedRegion.from_query(pair, 0, 3)
    self.assertEqual(region.in_alignment(), (1, 5))
    self.assertEqual(region.in_query(), (0, 3))
    self.assertEqual(region.in_subject(), (0, 2))
    self.assertEqual(region.query_offset(), 0)
    self.assertEqual(region.subject_offset(), 0)
def test_from_query_with_endgaps(self):
    # Two leading gap columns in the query push query positions 1..4 to
    # alignment columns 3..7; the subject row is gap-free.
    query = ("a", "--ABC-EF---")
    subject = ("b", "HIJKLMNOPQR")
    region = AlignedRegion.from_query(AlignedPair(query, subject), 1, 4)
    self.assertEqual(region.start_idx, 3)
    self.assertEqual(region.end_idx, 7)
    self.assertEqual(region.in_query(), (1, 4))
    self.assertEqual(region.in_subject(), (3, 7))
    self.assertEqual(region.query_offset(), 0)
    self.assertEqual(region.subject_offset(), 0)
def test_from_query_no_endgaps(self):
    # Gap-free alignment: query, subject and alignment coordinates are
    # all expected to be 2..5, with zero offsets.
    query = ("a", "ABCDEF")
    subject = ("b", "HIJKLM")
    region = AlignedRegion.from_query(AlignedPair(query, subject), 2, 5)
    self.assertEqual(region.start_idx, 2)
    self.assertEqual(region.end_idx, 5)
    self.assertEqual(region.in_query(), (2, 5))
    self.assertEqual(region.in_subject(), (2, 5))
    self.assertEqual(region.query_offset(), 0)
    self.assertEqual(region.subject_offset(), 0)
def test_query_offset_left(self):
    # Alignment columns 7..10 fall entirely inside the query's trailing
    # gaps, so the query span is empty and its offset is non-zero.
    pair = AlignedPair(
        ("a", "ABCDEF------"),
        #             |||
        ("b", "--JKLMNOPQRS"),
    )
    region = AlignedRegion(pair, 7, 10)
    self.assertEqual(region.in_query(), (6, 6))
    self.assertEqual(region.in_subject(), (5, 8))
    self.assertEqual(region.query_offset(), -1)
    self.assertEqual(region.subject_offset(), 0)
def test_query_offset_right(self):
    # Alignment columns 1..4 fall entirely inside the query's leading
    # gaps, so the query span is empty and its offset is non-zero.
    pair = AlignedPair(
        ("a", "------ABCDEF"),
        #       |||
        ("b", "GHIJKLMNOP--"),
    )
    region = AlignedRegion(pair, 1, 4)
    self.assertEqual(region.in_query(), (0, 0))
    self.assertEqual(region.in_subject(), (1, 4))
    self.assertEqual(region.query_offset(), 2)
    self.assertEqual(region.subject_offset(), 0)
def test_region_subject_to_query_no_endgaps(self):
    # In an alignment with no gaps, the query sequence coordinates should
    # always match the subject sequence coordinates, so a region built
    # with from_subject() and one built with from_query() from the same
    # bounds must both report those bounds as their alignment span.
    #
    # Each from_query() region is asserted on directly.  Previously the
    # from_query() result was assigned but the assertion re-checked the
    # from_subject() region, leaving from_query() unverified.
    pair = AlignedPair(
        ("a", "ABCDEF"),
        ("b", "HIJKLM"),
    )
    cases = [
        # (constructor bounds, expected in_alignment)
        ((0, 3), (0, 3)),
        ((1, 5), (1, 5)),
        ((), (0, 6)),  # no bounds given: the region spans all 6 columns
    ]
    for bounds, alignment_span in cases:
        from_subject = AlignedRegion.from_subject(pair, *bounds)
        self.assertEqual(from_subject.in_alignment(), alignment_span)
        from_query = AlignedRegion.from_query(pair, *bounds)
        self.assertEqual(from_query.in_alignment(), alignment_span)
def unassign_threshold(self, min_id=0.975, soft_threshold=False):
    """Estimate the probability that the query is incompatible with the subject.

    Uses the beta-binomial logic from ConstantMismatchRate, but adjusts
    alpha and beta based on reference sequences.  Beta is reparameterized
    as mu and v (following Wikipedia) and v is held constant while mu is
    updated.  In the constant rate algorithm, mu2 = mu1.  In this variable
    rate algorithm, log(mu2 / mu1) is determined by averaging the values
    observed for the reference sequences.  To stabilize things, the list
    of values starts as [0, 0, 0, 0, 0]; one log(mu2 / mu1) value is then
    added per reference sequence and the list is averaged.

    Args:
        min_id: identity threshold; ``1 - min_id`` is the mismatch
            fraction used for the mismatch budget.
        soft_threshold: when true, ``soft_species_probability`` is passed
            as the threshold function, otherwise
            ``hard_species_probability``.

    Returns:
        dict with the subject id, region counts, the probability of
        incompatibility, mu1, mu2, the length of the log-ratio list
        (which includes the five stabilizing zeros) and the non-region
        position and mismatch limits.
    """
    # Clip out the aligned region
    region = AlignedRegion.without_endgaps(self.alignment)
    clipped = region.trim_ends()
    region_positions = clipped.alignment_len
    region_matches = clipped.count_matches()
    region_mismatches = region_positions - region_matches
    region_subject_positions = clipped.subject_len

    # Calculate alpha, beta, mu, and v in the aligned region
    alpha1 = region_mismatches + 0.5
    beta1 = region_matches + 0.5
    v1 = alpha1 + beta1
    mu1 = alpha1 / v1

    # Compute number of positions outside the aligned region
    nonregion_subject_positions = (
        self.alignment.subject_len - region_subject_positions)
    total_positions = region_positions + nonregion_subject_positions

    # Get mismatches from database
    typestrain_id = self.alignment.subject_id
    subject_start_idx, subject_end_idx = region.in_subject()
    reference_mismatches = self._get_mismatches(
        typestrain_id, subject_start_idx, subject_end_idx)

    # Estimate gamma = log(mu2 / mu1) from the reference alignments.
    #   mu = alpha / (alpha + beta)
    #   alpha = mismatches + 0.5
    #   beta = matches + 0.5 = len - mismatches + 0.5
    #   => mu = (mismatches + 0.5) / (len + 1)
    reference_logvals = [0, 0, 0, 0, 0] + [
        math.log(
            ((nonregion_mms + 0.5) / (nonregion_subject_positions + 1))
            / ((region_mms + 0.5) / (region_subject_positions + 1)))
        for region_mms, nonregion_mms in reference_mismatches
    ]
    # TODO: add weighting
    gamma = numpy.mean(reference_logvals)

    # Calculate mu2, then alpha2 and beta2:
    #   log(mu2 / mu1) = gamma  =>  mu2 = exp(log(mu1) + gamma)
    #   v2 = v1 (held constant)
    #   mu2 = alpha2 / v2       =>  alpha2 = mu2 * v2
    #   v2 = alpha2 + beta2     =>  beta2 = v2 - alpha2
    mu2 = math.exp(math.log(mu1) + gamma)
    v2 = v1
    alpha2 = mu2 * v2
    beta2 = v2 - alpha2

    # Maximum number of mismatches outside observed region
    species_mismatch_threshold = 1 - min_id
    max_total_mismatches = int(
        math.floor(species_mismatch_threshold * total_positions))
    max_nonregion_mismatches = max_total_mismatches - region_mismatches

    # Compute probability
    threshold_fcn = (
        soft_species_probability if soft_threshold
        else hard_species_probability)
    prob_compatible = threshold_assignment_probability(
        region_mismatches,
        region_positions,
        nonregion_subject_positions,
        alpha2,
        beta2,
        100 * species_mismatch_threshold,
        threshold_fcn,
    )

    return {
        "typestrain_id": self.alignment.subject_id,
        "region_mismatches": region_mismatches,
        "region_positions": region_positions,
        "probability_incompatible": 1 - prob_compatible,
        "mu1": mu1,
        "num_references": len(reference_logvals),
        "mu2": mu2,
        "nonregion_positions_in_subject": nonregion_subject_positions,
        "max_nonregion_mismatches": max_nonregion_mismatches,
    }