def get_haplotype_copy_numbers_from_spanning_reads(self, spanning_reads):
    """Return one repeat copy number per error-corrected haplotype.

    The spanning reads are handed to ``PacBioHaplotyper`` to obtain
    haplotype sequences. Each haplotype and its reverse complement are
    decoded with the Viterbi algorithm of the VNTR matcher HMM; the path
    with the higher log probability is used to count repeats.

    :param spanning_reads: sized collection of read sequences.
    :return: list of copy numbers (one entry per haplotype), or ``None``
        when ``spanning_reads`` is empty.
    """
    if len(spanning_reads) == 0:
        logging.info('There is no spanning read')
        return None

    # Longest read after discounting 100 bases, never below zero.
    # NOTE(review): the 100 presumably accounts for flanking sequence
    # around the repeat region -- confirm against the read extraction code.
    longest_trimmed = max([0] + [len(read) - 100 for read in spanning_reads])

    # Upper bound on the copies the HMM must model: the estimate from the
    # longest read, capped at twice the number of reference repeat segments.
    pattern_length = float(len(self.reference_vntr.pattern))
    estimated_copies = int(round(longest_trimmed / pattern_length))
    segment_cap = 2 * len(self.reference_vntr.get_repeat_segments())
    vntr_matcher = self.build_vntr_matcher_hmm(min(estimated_copies, segment_cap))

    def count_repeats(sequence):
        # Decode both strands; the forward path wins ties.
        forward_logp, forward_path = vntr_matcher.viterbi(sequence)
        reverse_sequence = str(Seq(sequence).reverse_complement())
        reverse_logp, reverse_path = vntr_matcher.viterbi(reverse_sequence)
        chosen_path = reverse_path if forward_logp < reverse_logp else forward_path
        return get_number_of_repeats_in_vpath(chosen_path)

    haplotypes = PacBioHaplotyper(spanning_reads).get_error_corrected_haplotypes()
    return [count_repeats(haplotype) for haplotype in haplotypes]
def find_ru_counts_with_naive_approach(self, length_dist, spanning_reads):
    """Estimate the repeat-unit count from a single consensus haplotype.

    One error-corrected haplotype is requested from ``PacBioHaplotyper``.
    The haplotype and its reverse complement (both upper-cased) are passed
    to ``check_if_flanking_regions_align_to_str``. The first length
    recorded by that check is divided by the reference pattern length and
    rounded to give the repeat count.

    :param length_dist: unused; kept so existing callers keep working.
    :param spanning_reads: reads handed to ``PacBioHaplotyper``.
    :return: a two-element list holding the same rounded repeat count
        twice, or ``None`` when no haplotype is produced or no length is
        recorded.
    """
    # NOTE(review): the argument 1 presumably requests a single haplotype
    # -- confirm against PacBioHaplotyper.
    haplotypes = PacBioHaplotyper(spanning_reads).get_error_corrected_haplotypes(1)
    if len(haplotypes) == 0:
        return None

    # Scratch lists passed to the alignment check, which is presumably
    # expected to append to them when the flanking regions align (confirm
    # in check_if_flanking_regions_align_to_str).
    flanking_region_lengths = []
    new_spanning_reads = []

    consensus = haplotypes[0]
    self.check_if_flanking_regions_align_to_str(
        consensus.upper(), flanking_region_lengths, new_spanning_reads)
    reverse_complement_str = str(Seq(consensus).reverse_complement())
    self.check_if_flanking_regions_align_to_str(
        reverse_complement_str.upper(), flanking_region_lengths,
        new_spanning_reads)

    if not flanking_region_lengths:
        return None

    # Divide by a float so that integer operands are not floor-divided
    # before rounding (Python 2 semantics).
    pattern_length = float(len(self.reference_vntr.pattern))
    repeat_count = round(flanking_region_lengths[0] / pattern_length)
    return [repeat_count] * 2