def confirm_clipped_reads(self, region, breakpoint, novel_seq, min_clipped=5):
    """Count reads clipped at the breakpoint that overlap the novel sequence.

    Every read fetched from ``region`` is passed to ``self.get_clipped_seq``.
    A read is considered only when that call yields a truthy result whose
    second element is strictly longer than ``min_clipped``.

    Args:
        region: Indexable whose first three items are passed to
            ``self.bam.fetch`` (presumably chromosome, start, end).
        breakpoint: Breakpoint coordinate forwarded to
            ``self.get_clipped_seq``.
        novel_seq: Novel sequence the read sequence is overlapped with.
        min_clipped: A clipped sequence must be longer than this to count.

    Returns:
        The number of reads that pass both the clipped-length and the
        overlap checks.
    """
    chrom, start, end = region[0], region[1], region[2]
    supporting = 0

    for aln in self.bam.fetch(chrom, start, end):
        clip = self.get_clipped_seq(aln, breakpoint)
        if not clip or len(clip[1]) <= min_clipped:
            continue

        # The side on which the read is clipped decides which sequence is
        # passed first to seq_overlap.
        if clip[0] == 'start':
            overlap = seq_overlap(novel_seq, aln.seq)
        else:
            overlap = seq_overlap(aln.seq, novel_seq)

        # NOTE(review): this compares against len(clip), i.e. the length of
        # the value returned by get_clipped_seq (apparently a
        # (position, sequence) pair), not the number of clipped bases.
        # Kept as-is; confirm whether len(clip[1]) was intended.
        if overlap > len(clip):
            supporting += 1

    return supporting
def confirm_novel(self, region, novel_seq, breakpoint, min_olap=10, max_unmapped=1000):
    """Count read pairs whose unmapped mate supports the novel sequence.

    Reads in ``region`` that are paired and have an unmapped mate are
    collected (keyed by read name, stopping once ``max_unmapped`` such reads
    have been seen). For each collected read that is oriented towards the
    breakpoint, its mate's sequence is tested against ``novel_seq``:

    1. mate sequence contained in ``novel_seq`` (case-insensitive),
    2. reverse complement of the mate contained in ``novel_seq``,
    3. ``seq_overlap(mate, novel_seq)`` greater than ``min_olap``,
    4. ``seq_overlap(reverse_complement(mate), novel_seq)`` greater than
       ``min_olap``.

    The first test that succeeds counts the pair once.

    Args:
        region: Indexable whose first three items are passed to
            ``self.bam.fetch``; items 1 and 2 are compared with
            ``breakpoint`` to decide the required read orientation.
        novel_seq: Novel sequence to test the mates against.
        breakpoint: Breakpoint coordinate; expected to equal ``region[1]``
            or ``region[2]`` for the orientation filter to apply.
        min_olap: Edge overlap must be strictly greater than this value.
        max_unmapped: Stop collecting after this many reads with unmapped
            mates have been seen.

    Returns:
        The number of pairs whose unmapped mate is embedded in, or overlaps,
        the novel sequence.
    """
    chrom, start, end = region[0], region[1], region[2]

    # Capture reads with unmapped mates, without going over the maximum.
    unmapped = {}
    count = 0
    for read in self.bam.fetch(chrom, start, end):
        if read.is_paired and read.mate_is_unmapped:
            count += 1
            unmapped[read.qname] = read
            if count == max_unmapped:
                break

    found_mates = 0
    if not unmapped:
        return found_mates

    # This assumes unmapped mates are stored at the same location as their
    # mapped mates, so they can be looked up on the same chromosome.
    mate_seq = self.get_unmapped_mate_seq(chrom, unmapped.values())

    for read in unmapped.values():
        # Keep only anchored reads that point towards the breakpoint.
        if (breakpoint == start and not read.is_reverse) or \
           (breakpoint == end and read.is_reverse):
            continue

        # `in` works on Python 2 and 3; dict.has_key() is Python 2 only.
        if read.qname not in mate_seq:
            continue
        seq = mate_seq[read.qname]

        # The mate sequence is data, not a pattern: escape it so that any
        # regex metacharacter is matched literally.
        if re.search(re.escape(seq), novel_seq, re.IGNORECASE):
            found_mates += 1
            continue

        rev_comp = reverse_complement(seq)
        if re.search(re.escape(rev_comp), novel_seq, re.IGNORECASE):
            found_mates += 1
        # If the mate (in either orientation) is not entirely embedded in
        # the novel sequence, accept it when it overlaps the novel sequence
        # by more than min_olap bases.
        elif seq_overlap(seq, novel_seq) > min_olap:
            found_mates += 1
        elif seq_overlap(rev_comp, novel_seq) > min_olap:
            found_mates += 1

    return found_mates