def find_single_unique(alns, bam, debug=False):
    """Extract the single unique alignment of a contig for indel detection.

    Applies when only one alignment is reported by BWA-mem even when '-a' is
    turned on, i.e. exactly one record in ``alns`` is mapped and not
    secondary.  That record is rejected when any of these hold:

    * its mapping quality is not greater than 0;
    * CIGAR operations with code <= 1 (match and insertion) cover less than
      95% of its ``rlen``;
    * ``effective_edit_distance`` returns a value that exceeds 10% of its
      ``inferred_length``.

    Args:
        alns: (list) Pysam AlignedRead objects of the same contig
        bam: Pysam bam handle
        debug: (Boolean) when True, rejections by the filters above are
            reported on stdout

    Returns:
        Alignment object or None
    """
    primary_alns = [
        aln for aln in alns
        if not aln.is_unmapped and not aln.is_secondary
    ]
    if len(primary_alns) != 1:
        return None

    # Every check below must look at this record.  ``alns[0]`` is not
    # guaranteed to be it: the first record may be secondary or unmapped.
    primary = primary_alns[0]

    if not primary.mapq > 0:
        if debug:
            sys.stdout.write(
                'filter out single uniq alignment %s: mapq = 0\n' %
                primary.qname)
        return None

    # CIGAR op codes 0 and 1 are the matched and inserted query bases.
    matched_and_insertion_len = sum(
        length for op, length in primary.cigar if op <= 1)
    if float(matched_and_insertion_len) / float(primary.rlen) < 0.95:
        if debug:
            sys.stdout.write(
                'best alignment less than 0.95 mapped:%s %s\n' %
                (primary.qname, primary.cigarstring))
        return None

    edit_distance = effective_edit_distance(primary)
    if edit_distance is not None:
        edit_fraction = float(edit_distance) / float(primary.inferred_length)
        if edit_fraction > 0.1:
            if debug:
                sys.stdout.write(
                    'filter out single uniq alignment %s: edit distance %s - > 0.1 of contig len %d (%.01f)\n'
                    % (primary.qname, edit_distance,
                       primary.inferred_length, edit_fraction))
            return None

    # Disabled check, kept for reference: reject the contig when a secondary
    # alignment is nearly as good as the primary one.
    #ambiguous_NM = 5
    #for aln in alns:
        #if aln.is_secondary and \
           #not re.search('[HS]', aln.cigarstring) and\
           #re.match('\d+M', aln.cigarstring) and re.search('\d+M$', aln.cigarstring) and\
           #int(aln.opt('NM')) - int(primary_alns[0].opt('NM')) <= ambiguous_NM:
            #if debug:
                #sys.stdout.write('secondary alignments too similar %s\n' % primary_alns[0].qname)
            #return None

    return Alignment.from_alignedRead(primary, bam)
def find_chimera(alns, bam, debug=False, check_haplotype=True):
    """Determine if given alignments are chimeric

    Records that are not secondary and whose CIGAR string contains a clip
    ('H' or 'S') are treated as primary candidates; every other record with a
    CIGAR string goes to the secondary list.  At least two primary candidates
    are required, and all of them must convert to valid Alignment objects.

    Args:
        alns: (List) List of Pysam AlignedRead objects
        bam: (AlignmentFile) Pysam handle to BAM file - for getting reference info
        debug: (Boolean) debug mode - will output debugging statements
        check_haplotype: (Boolean) whether to screen out alignments to references containing '_'

    Returns:
        Tuple (aligns, valid_secondary_aligns): the Alignment objects of the
        primary candidates and the valid Alignment objects built from the
        remaining records.  (None, None) when fewer than two primary
        candidates remain or any of them is invalid.
    """
    primary_alns = []
    secondary_alns = []
    for aln in alns:
        cigar = aln.cigarstring
        if not cigar:
            # No CIGAR (e.g. an unmapped record): nothing to classify, and
            # searching it for clip operations would raise TypeError.
            continue
        if not aln.is_secondary and ('H' in cigar or 'S' in cigar):
            primary_alns.append(aln)
        else:
            secondary_alns.append(aln)

    if check_haplotype and len(primary_alns) > 1:
        # NOTE(review): return value is unused and the length is re-checked
        # below, so this presumably edits both lists in place - confirm.
        replace_haplotype(primary_alns, secondary_alns, bam)

    if len(primary_alns) < 2:
        return None, None

    aligns = [Alignment.from_alignedRead(aln, bam) for aln in primary_alns]
    bad_aligns = [align for align in aligns if not align.is_valid()]
    if bad_aligns:
        if debug:
            for align in bad_aligns:
                # Newline-terminated so consecutive reports do not run together.
                sys.stdout.write('bad alignment %s %s %s %s %s %s\n' %
                                 (align.query, align.qstart, align.qend,
                                  align.target, align.tstart, align.tend))
        return None, None

    secondary_aligns = [
        Alignment.from_alignedRead(aln, bam) for aln in secondary_alns
    ]
    valid_secondary_aligns = [
        align for align in secondary_aligns if align.is_valid()
    ]
    return aligns, valid_secondary_aligns
def find_chimera(alns, bam, debug=False, check_haplotype=True):
    """Determine if given alignments are chimeric

    Records that are not secondary and whose CIGAR string contains a clip
    ('H' or 'S') are treated as primary candidates; every other record with a
    CIGAR string goes to the secondary list.  At least two primary candidates
    are required, and all of them must convert to valid Alignment objects.

    Args:
        alns: (List) List of Pysam AlignedRead objects
        bam: (AlignmentFile) Pysam handle to BAM file - for getting reference info
        debug: (Boolean) debug mode - will output debugging statements
        check_haplotype: (Boolean) whether to screen out alignments to references containing '_'

    Returns:
        Tuple (aligns, valid_secondary_aligns): the Alignment objects of the
        primary candidates and the valid Alignment objects built from the
        remaining records.  (None, None) when fewer than two primary
        candidates remain or any of them is invalid.
    """
    primary_alns = []
    secondary_alns = []
    for aln in alns:
        cigar = aln.cigarstring
        if not cigar:
            # No CIGAR (e.g. an unmapped record): nothing to classify, and
            # searching it for clip operations would raise TypeError.
            continue
        if not aln.is_secondary and ('H' in cigar or 'S' in cigar):
            primary_alns.append(aln)
        else:
            secondary_alns.append(aln)

    if check_haplotype and len(primary_alns) > 1:
        # NOTE(review): return value is unused and the length is re-checked
        # below, so this presumably edits both lists in place - confirm.
        replace_haplotype(primary_alns, secondary_alns, bam)

    if len(primary_alns) < 2:
        return None, None

    aligns = [Alignment.from_alignedRead(aln, bam) for aln in primary_alns]
    bad_aligns = [align for align in aligns if not align.is_valid()]
    if bad_aligns:
        if debug:
            for align in bad_aligns:
                # Newline-terminated so consecutive reports do not run together.
                sys.stdout.write('bad alignment %s %s %s %s %s %s\n' %
                                 (align.query, align.qstart, align.qend,
                                  align.target, align.tstart, align.tend))
        return None, None

    secondary_aligns = [
        Alignment.from_alignedRead(aln, bam) for aln in secondary_alns
    ]
    valid_secondary_aligns = [
        align for align in secondary_aligns if align.is_valid()
    ]
    return aligns, valid_secondary_aligns
def map_aligns(self, bam, query_fasta, genome_fasta, accessory_known_features=None, find_events=True, max_diff=1):
    """Map every aligned query in a BAM to a transcript and collect results.

    Records are grouped by consecutive ``query_name`` as returned by
    ``bam.fetch(until_eof=True)``.  Each mapped record is converted to an
    Alignment, and alignments without a canonical target or without blocks
    are skipped.  For the rest, the transcript chosen by
    ``self.pick_best_mapping`` contributes a mapping tuple, junction
    adjacencies and (optionally) novel-junction events.

    Args:
        bam: Pysam bam handle to iterate over
        query_fasta: handle whose ``fetch(query)`` returns the query sequence
        genome_fasta: accepted but not read by this method - see note below
        accessory_known_features: forwarded unchanged to find_novel_junctions
        find_events: (Boolean) when False, no events are collected
        max_diff: forwarded unchanged to find_novel_junctions

    Returns:
        Tuple (mappings, junc_adjs, events) where mappings is a
        defaultdict(list) of query -> [(gene, transcript id, overlap)],
        junc_adjs is the list accumulated from self.collect_junctions and
        events is the list accumulated from find_novel_junctions.
    """
    # NOTE(review): the `genome_fasta` argument is never used; the call to
    # find_novel_junctions below passes self.genome_fasta.  Confirm which
    # one callers expect before changing it.
    mappings = defaultdict(list)
    junc_adjs = []
    events = []

    for query, group in groupby(bam.fetch(until_eof=True), lambda aln: aln.query_name):
        # Single-argument form prints the same text on Python 2 and Python 3.
        print('processing %s' % query)

        aligns = [
            Alignment.from_alignedRead(aln, bam)
            for aln in group
            if not aln.is_unmapped
        ]
        if not aligns:
            continue

        query_seq = query_fasta.fetch(query)
        for align in aligns:
            if not align.has_canonical_target() or align.blocks is None:
                continue

            block_matches = self.map_align(align)
            if not block_matches:
                continue

            tid = self.pick_best_mapping(block_matches, align)
            if tid is None:
                continue

            transcript = self.transcripts_dict[tid]
            olap = self.overlap(align, transcript)
            mappings[query].append((transcript.gene, transcript.id, olap))
            junc_adjs.extend(
                self.collect_junctions(align, transcript, block_matches[tid]))

            if find_events:
                events.extend(
                    find_novel_junctions(
                        block_matches[tid],
                        align,
                        transcript,
                        query_seq,
                        self.genome_fasta,
                        accessory_known_features=accessory_known_features,
                        max_diff=max_diff))

    return mappings, junc_adjs, events
def find_single_unique(alns, bam, debug=False):
    """Extract the single unique alignment of a contig for indel detection.

    Applies when only one alignment is reported by BWA-mem even when '-a' is
    turned on, i.e. exactly one record in ``alns`` is mapped and not
    secondary.  That record is rejected when any of these hold:

    * its mapping quality is not greater than 0;
    * CIGAR operations with code <= 1 (match and insertion) cover less than
      95% of its ``rlen``;
    * ``effective_edit_distance`` returns a value that exceeds 10% of its
      ``inferred_length``.

    Args:
        alns: (list) Pysam AlignedRead objects of the same contig
        bam: Pysam bam handle
        debug: (Boolean) when True, rejections by the filters above are
            reported on stdout

    Returns:
        Alignment object or None
    """
    primary_alns = [
        aln for aln in alns
        if not aln.is_unmapped and not aln.is_secondary
    ]
    if len(primary_alns) != 1:
        return None

    # Every check below must look at this record.  ``alns[0]`` is not
    # guaranteed to be it: the first record may be secondary or unmapped.
    primary = primary_alns[0]

    if not primary.mapq > 0:
        if debug:
            sys.stdout.write(
                'filter out single uniq alignment %s: mapq = 0\n' %
                primary.qname)
        return None

    # CIGAR op codes 0 and 1 are the matched and inserted query bases.
    matched_and_insertion_len = sum(
        length for op, length in primary.cigar if op <= 1)
    if float(matched_and_insertion_len) / float(primary.rlen) < 0.95:
        if debug:
            sys.stdout.write(
                'best alignment less than 0.95 mapped:%s %s\n' %
                (primary.qname, primary.cigarstring))
        return None

    edit_distance = effective_edit_distance(primary)
    if edit_distance is not None:
        edit_fraction = float(edit_distance) / float(primary.inferred_length)
        if edit_fraction > 0.1:
            if debug:
                sys.stdout.write(
                    'filter out single uniq alignment %s: edit distance %s - > 0.1 of contig len %d (%.01f)\n'
                    % (primary.qname, edit_distance,
                       primary.inferred_length, edit_fraction))
            return None

    # Disabled check, kept for reference: reject the contig when a secondary
    # alignment is nearly as good as the primary one.
    #ambiguous_NM = 5
    #for aln in alns:
        #if aln.is_secondary and \
           #not re.search('[HS]', aln.cigarstring) and\
           #re.match('\d+M', aln.cigarstring) and re.search('\d+M$', aln.cigarstring) and\
           #int(aln.opt('NM')) - int(primary_alns[0].opt('NM')) <= ambiguous_NM:
            #if debug:
                #sys.stdout.write('secondary alignments too similar %s\n' % primary_alns[0].qname)
            #return None

    return Alignment.from_alignedRead(primary, bam)
def map_aligns(self, bam, query_fasta, genome_fasta, accessory_known_features=None, find_events=True, max_diff=1):
    """Map every aligned query in a BAM to a transcript and collect results.

    Records are grouped by consecutive ``query_name`` as returned by
    ``bam.fetch(until_eof=True)``.  Each mapped record is converted to an
    Alignment, and alignments without a canonical target or without blocks
    are skipped.  For the rest, the transcript chosen by
    ``self.pick_best_mapping`` contributes a mapping tuple, junction
    adjacencies and (optionally) novel-junction events.

    Args:
        bam: Pysam bam handle to iterate over
        query_fasta: handle whose ``fetch(query)`` returns the query sequence
        genome_fasta: accepted but not read by this method - see note below
        accessory_known_features: forwarded unchanged to find_novel_junctions
        find_events: (Boolean) when False, no events are collected
        max_diff: forwarded unchanged to find_novel_junctions

    Returns:
        Tuple (mappings, junc_adjs, events) where mappings is a
        defaultdict(list) of query -> [(gene, transcript id, overlap)],
        junc_adjs is the list accumulated from self.collect_junctions and
        events is the list accumulated from find_novel_junctions.
    """
    # NOTE(review): the `genome_fasta` argument is never used; the call to
    # find_novel_junctions below passes self.genome_fasta.  Confirm which
    # one callers expect before changing it.
    mappings = defaultdict(list)
    junc_adjs = []
    events = []

    for query, group in groupby(bam.fetch(until_eof=True), lambda aln: aln.query_name):
        # Single-argument form prints the same text on Python 2 and Python 3.
        print('processing %s' % query)

        aligns = [
            Alignment.from_alignedRead(aln, bam)
            for aln in group
            if not aln.is_unmapped
        ]
        if not aligns:
            continue

        query_seq = query_fasta.fetch(query)
        for align in aligns:
            if not align.has_canonical_target() or align.blocks is None:
                continue

            block_matches = self.map_align(align)
            if not block_matches:
                continue

            tid = self.pick_best_mapping(block_matches, align)
            if tid is None:
                continue

            transcript = self.transcripts_dict[tid]
            olap = self.overlap(align, transcript)
            mappings[query].append((transcript.gene, transcript.id, olap))
            junc_adjs.extend(
                self.collect_junctions(align, transcript, block_matches[tid]))

            if find_events:
                events.extend(
                    find_novel_junctions(
                        block_matches[tid],
                        align,
                        transcript,
                        query_seq,
                        self.genome_fasta,
                        accessory_known_features=accessory_known_features,
                        max_diff=max_diff))

    return mappings, junc_adjs, events