def filter_read_pair(reads, ref,
                     hist_distance_from_consensus=None,
                     hist_dist_along=None,
                     binsize=None,
                     max_mismatches=100,
                     match_len_min=30,
                     trim_bad_cigars=3,
                     VERBOSE=0):
    '''Filter read pair.

    Classify one pair of reads and, for pairs that survive the early checks,
    trim short CIGAR blocks from their ends via trim_short_cigars_pair.

    NOTE(review): this file contains a second, later definition of
    filter_read_pair; the later one is the one bound at import time. Keep the
    two in sync or drop one of them.

    Parameters:
       reads: the two reads of the pair, (read1, read2). They need the
          attributes used below: qname, is_unmapped, is_proper_pair,
          is_reverse, pos, isize and seq.
       ref: reference passed on to get_distance_from_consensus.
       hist_distance_from_consensus: optional histogram, indexed by the total
          number of mismatches of the pair and incremented in place.
       hist_dist_along: optional 2D histogram indexed by (position bin, total
          mismatches) and incremented in place. binsize must be set when this
          is given.
       binsize: width of the position bins of hist_dist_along.
       max_mismatches: pairs with more total mismatches are rejected.
       match_len_min, trim_bad_cigars: forwarded to trim_short_cigars_pair as
          match_len_min and trim_pad.
       VERBOSE: verbosity level; rejections are printed from level 2.

    Returns:
       One of 'unmapped', 'unpaired', 'mutator', 'bad_cigar', 'tiny', 'good'.

    Raises:
       ValueError: if the two reads do not share the same name.
    '''
    (read1, read2) = reads

    # Mates must share their name: if they do not, the input is not a stream
    # of pairs and everything downstream would be wrong, so fail loudly.
    if read1.qname != read2.qname:
        raise ValueError('Read pair ' + str(read1.qname) + ' / ' +
                         str(read2.qname) + ': reads have different names!')

    # Ignore unmapped reads
    if read1.is_unmapped or read2.is_unmapped:
        if VERBOSE >= 2:
            print('Read pair ' + read1.qname + ': unmapped')
        return 'unmapped'

    # Ignore not properly paired reads (this includes mates sitting on
    # different fragments)
    if (not read1.is_proper_pair) or (not read2.is_proper_pair):
        if VERBOSE >= 2:
            print('Read pair ' + read1.qname + ': not properly paired')
        return 'unpaired'

    # The forward read is the first one unless that one is reversed
    readf = read2 if read1.is_reverse else read1

    # Mismappings are often characterized by many mutations:
    # check the number of mismatches and skip reads with too many
    dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
    n_mismatches = dc.sum()

    if hist_distance_from_consensus is not None:
        hist_distance_from_consensus[n_mismatches] += 1

    if hist_dist_along is not None:
        # Bin by the midpoint of the insert; floor division keeps the bin
        # index an integer whatever the division semantics in force
        hbin = (readf.pos + readf.isize // 2) // binsize
        hist_dist_along[hbin, n_mismatches] += 1

    if n_mismatches > max_mismatches:
        if VERBOSE >= 2:
            print('Read pair ' + read1.qname + ': too many mismatches ' +
                  '(' + str(dc[0]) + ' + ' + str(dc[1]) + ')')
        return 'mutator'

    # Trim the bad CIGARs from the sides, if there are any good ones.
    # The helper is imported only here, so the cheap rejections above do not
    # depend on it.
    # NOTE(review): the length checks below assume the helper trims the reads
    # in place -- confirm against trim_short_cigars_pair.
    from hivwholeseq.utils.mapping import trim_short_cigars_pair
    skip = trim_short_cigars_pair(reads,
                                  match_len_min=match_len_min,
                                  trim_pad=trim_bad_cigars,
                                  throw=False)
    if skip:
        return 'bad_cigar'

    # Check the reads are still long enough after trimming
    for read in (read1, read2):
        if len(read.seq) < 100:
            if VERBOSE >= 2:
                print(' '.join(('Read too short:',
                                str(read.qname),
                                str(len(read.seq)))))
            return 'tiny'

    # NOTE: cross-overhang and similar stuff should never happen, because we
    # filter only insert sizes > 400 after premapping. Nonetheless...
    if readf.isize < 300:
        if VERBOSE >= 2:
            print(' '.join(('Insert too small:', str(readf.isize))))
        return 'tiny'

    return 'good'
def fish_distant_reads(bamfilename, ref,
                       min_mismatches=20, max_mismatches=30,
                       VERBOSE=0, maxseqs=-1):
    '''Fish distant reads from the trash.

    Scan the read pairs of a BAM file and collect those whose total number of
    mismatches from the reference lies in [min_mismatches, max_mismatches].
    Unmapped pairs, not properly paired reads and pairs flagged by
    check_overhanging_reads are skipped.

    Parameters:
       bamfilename: path of the BAM file, opened with pysam.Samfile.
       ref: reference; its length is passed to check_overhanging_reads and it
          is passed whole to get_distance_from_consensus.
       min_mismatches, max_mismatches: inclusive bounds on the total number of
          mismatches of a pair for it to be collected.
       VERBOSE: verbosity level (progress from 2, every hit from 3).
       maxseqs: stop after this many pairs have been collected. The limit is
          checked right after a pair is stored, so non-positive values (the
          default is -1) never trigger it.

    Returns:
       Tuple (distances, edges, seqs):
       - distances: integer numpy array built from the per-pair results of
         get_distance_from_consensus;
       - edges: for each collected pair, a list with one (start, end) tuple
         per read, where end is start plus the summed lengths of the CIGAR
         blocks of type 0 and 2;
       - seqs: the collected reads after reads_to_seqrecord, regrouped with
         pair_generator (presumably pairs of sequence records -- confirm
         against those helpers).

    Raises:
       ValueError: if the two reads of a pair have different names.
    '''
    import numpy as np
    from hivwholeseq.utils.mapping import pair_generator, reads_to_seqrecord
    from hivwholeseq.sequencing.filter_mapped_reads import (
        check_overhanging_reads,
        get_distance_from_consensus)

    distances = []
    seqs = []
    edges = []
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for irp, reads in enumerate(pair_generator(bamfile)):
            # Progress report every 10000 pairs
            if (VERBOSE >= 2) and not ((irp + 1) % 10000):
                print(irp + 1)

            (read1, read2) = reads

            # Check a few things to make sure we are looking at paired reads
            if read1.qname != read2.qname:
                raise ValueError('Read pair '+str(irp)+': reads have different names!')

            # Ignore unmapped reads
            if read1.is_unmapped or read2.is_unmapped:
                continue

            # Ignore not properly paired reads (this includes mates sitting on
            # different fragments)
            if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                continue

            # Check for overhangs beyond the edge
            if check_overhanging_reads(reads, len(ref)):
                continue

            # Fish out our reads
            dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
            if not (min_mismatches <= dc.sum() <= max_mismatches):
                continue

            if VERBOSE >= 3:
                print(' '.join(('Gotcha!', str(read1.qname))))

            seqs.append(read1)
            seqs.append(read2)
            distances.append(dc)

            # Span covered on the reference: only CIGAR blocks of type 0 and 2
            # are counted towards the end coordinate
            edge = []
            for read in reads:
                len_ref = sum(bl for (bt, bl) in read.cigar if bt in (0, 2))
                edge.append((read.pos, read.pos + len_ref))
            edges.append(edge)

            # The number of stored pairs only changes here, so this is the
            # only place where the limit can be hit
            if len(seqs) // 2 == maxseqs:
                if VERBOSE >= 2:
                    print(' '.join(('Max seqs reached:', str(maxseqs))))
                break

        # Convert while the BAM file is still open, to stay on the safe side
        seqs = list(pair_generator(reads_to_seqrecord(seqs)))

    distances = np.array(distances, int)
    return (distances, edges, seqs)
def filter_read_pair(reads, ref,
                     hist_distance_from_consensus=None,
                     hist_dist_along=None,
                     binsize=None,
                     max_mismatches=100,
                     match_len_min=30,
                     trim_bad_cigars=3,
                     VERBOSE=0):
    '''Filter read pair.

    Classify one pair of reads and, for pairs that survive the early checks,
    trim short CIGAR blocks from their ends via trim_short_cigars_pair.

    NOTE(review): an earlier definition of filter_read_pair exists in this
    file; this later one is the one bound at import time. Keep the two in sync
    or drop one of them.

    Parameters:
       reads: the two reads of the pair, (read1, read2). They need the
          attributes used below: qname, is_unmapped, is_proper_pair,
          is_reverse, pos, isize and seq.
       ref: reference passed on to get_distance_from_consensus.
       hist_distance_from_consensus: optional histogram, indexed by the total
          number of mismatches of the pair and incremented in place.
       hist_dist_along: optional 2D histogram indexed by (position bin, total
          mismatches) and incremented in place. binsize must be set when this
          is given.
       binsize: width of the position bins of hist_dist_along.
       max_mismatches: pairs with more total mismatches are rejected.
       match_len_min, trim_bad_cigars: forwarded to trim_short_cigars_pair as
          match_len_min and trim_pad.
       VERBOSE: verbosity level; rejections are printed from level 2.

    Returns:
       One of 'unmapped', 'unpaired', 'mutator', 'bad_cigar', 'tiny', 'good'.

    Raises:
       ValueError: if the two reads do not share the same name.
    '''
    (read1, read2) = reads

    # Mates must share their name: if they do not, the input is not a stream
    # of pairs and everything downstream would be wrong, so fail loudly.
    if read1.qname != read2.qname:
        raise ValueError('Read pair ' + str(read1.qname) + ' / ' +
                         str(read2.qname) + ': reads have different names!')

    # Ignore unmapped reads
    if read1.is_unmapped or read2.is_unmapped:
        if VERBOSE >= 2:
            print('Read pair ' + read1.qname + ': unmapped')
        return 'unmapped'

    # Ignore not properly paired reads (this includes mates sitting on
    # different fragments)
    if (not read1.is_proper_pair) or (not read2.is_proper_pair):
        if VERBOSE >= 2:
            print('Read pair ' + read1.qname + ': not properly paired')
        return 'unpaired'

    # The forward read is the first one unless that one is reversed
    readf = read2 if read1.is_reverse else read1

    # Mismappings are often characterized by many mutations:
    # check the number of mismatches and skip reads with too many
    dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
    n_mismatches = dc.sum()

    if hist_distance_from_consensus is not None:
        hist_distance_from_consensus[n_mismatches] += 1

    if hist_dist_along is not None:
        # Bin by the midpoint of the insert; floor division keeps the bin
        # index an integer whatever the division semantics in force
        hbin = (readf.pos + readf.isize // 2) // binsize
        hist_dist_along[hbin, n_mismatches] += 1

    if n_mismatches > max_mismatches:
        if VERBOSE >= 2:
            print('Read pair ' + read1.qname + ': too many mismatches ' +
                  '(' + str(dc[0]) + ' + ' + str(dc[1]) + ')')
        return 'mutator'

    # Trim the bad CIGARs from the sides, if there are any good ones.
    # The helper is imported only here, so the cheap rejections above do not
    # depend on it.
    # NOTE(review): the length checks below assume the helper trims the reads
    # in place -- confirm against trim_short_cigars_pair.
    from hivwholeseq.utils.mapping import trim_short_cigars_pair
    skip = trim_short_cigars_pair(reads,
                                  match_len_min=match_len_min,
                                  trim_pad=trim_bad_cigars,
                                  throw=False)
    if skip:
        return 'bad_cigar'

    # Check the reads are still long enough after trimming
    for read in (read1, read2):
        if len(read.seq) < 100:
            if VERBOSE >= 2:
                print(' '.join(('Read too short:',
                                str(read.qname),
                                str(len(read.seq)))))
            return 'tiny'

    # NOTE: cross-overhang and similar stuff should never happen, because we
    # filter only insert sizes > 400 after premapping. Nonetheless...
    if readf.isize < 300:
        if VERBOSE >= 2:
            print(' '.join(('Insert too small:', str(readf.isize))))
        return 'tiny'

    return 'good'