def get_distance_histogram(data_folder, adaID, fragment, maxreads=1000,
                           VERBOSE=0, filtered=False):
    '''Get the distance of reads from their consensus'''
    reffilename = get_consensus_filename(data_folder, adaID, fragment)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    bamfilename = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                      filtered=filtered)
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        n_pairs = 0
        read_pairs = []
        for (i, rp) in enumerate(pair_generator(bamfile)):
            if n_pairs >= maxreads:
                break

            r1 = rp[0]
            if not r1.is_proper_pair:
                continue

            read_pairs.append(rp)
            n_pairs += 1

        ds = get_distance_from_reference(ref, read_pairs, threshold=30)

    h = np.bincount(ds)
    return h
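# Usage sketch (illustration only): the run folder, adapter ID and fragment
# below are hypothetical placeholders for real dataset values.
h = get_distance_histogram('/path/to/run/', 'TS2', 'F3', maxreads=500,
                           VERBOSE=1, filtered=True)
print 'Read pairs within 5 mismatches of the consensus:', h[:6].sum()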
def get_insert_size_distribution(data_folder, adaID, fragment, bins=None,
                                 maxreads=-1, VERBOSE=0, density=True):
    '''Get the distribution of insert sizes'''
    if maxreads <= 0:
        maxreads = int(1e6)

    insert_sizes = np.zeros(maxreads, np.int16)

    # Open BAM file
    if fragment == 'premapped':
        bamfilename = get_premapped_filename(data_folder, adaID, type='bam')
    else:
        bamfilename = get_mapped_filename(data_folder, adaID, fragment,
                                          type='bam', filtered=True)

    # Convert from SAM if necessary
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)

    # Open file
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        # Iterate over read pairs (no linkage info needed beyond the pair)
        n_written = 0
        for i, reads in enumerate(pair_generator(bamfile)):
            if i == maxreads:
                if VERBOSE >= 2:
                    print 'Max reads reached:', maxreads
                break

            # Print output
            if (VERBOSE >= 3) and (not ((i + 1) % 10000)):
                print (i + 1)

            # If unmapped or not properly paired, discard
            if reads[0].is_unmapped or (not reads[0].is_proper_pair) or \
               reads[1].is_unmapped or (not reads[1].is_proper_pair):
                continue

            # Store insert size (the fwd read carries the positive isize).
            # NOTE: index with n_written, not i, so that skipped pairs do not
            # leave zeros inside the array before it is truncated below.
            i_fwd = reads[0].is_reverse
            insert_sizes[n_written] = reads[i_fwd].isize
            n_written += 1

    insert_sizes = insert_sizes[:n_written]
    insert_sizes.sort()

    # Bin it
    if bins is None:
        h = np.histogram(insert_sizes, density=density)
    else:
        h = np.histogram(insert_sizes, bins=bins, density=density)

    return insert_sizes, h
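# Usage sketch (illustration only): plot the insert size distribution for one
# adapter/fragment. Paths and IDs are placeholders; matplotlib is assumed.
import matplotlib.pyplot as plt

isizes, (counts, bin_edges) = get_insert_size_distribution(
    '/path/to/run/', 'TS2', 'F3', bins=np.arange(0, 1000, 20), maxreads=10000)
plt.plot(0.5 * (bin_edges[1:] + bin_edges[:-1]), counts, lw=2)
plt.xlabel('Insert size [bp]')
plt.ylabel('Density')
plt.show()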
def get_coverage_tuples(data_folder, adaID, fragment, mtuples,
                        maxreads=-1, VERBOSE=0):
    '''Get the joint coverage of a list of positions'''
    # Prepare data structures
    mtuples = [np.asarray(tup, int) for tup in mtuples]
    coverage = np.zeros(len(mtuples), int)

    # TODO: what to do if it is covered multiple times? or only some sites?
    covs_pair = [np.zeros(len(tup), bool) for tup in mtuples]

    # Open BAM
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                      filtered=True)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)
    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        # Iterate over all pairs
        for irp, reads in enumerate(pair_generator(bamfile)):

            # Limit to the first reads
            if irp == maxreads:
                if VERBOSE:
                    print 'Max reads reached:', maxreads
                break

            if VERBOSE >= 3:
                if not ((irp + 1) % 10000):
                    print irp + 1

            # Reinitialize temporary structure
            for cov_pair in covs_pair:
                cov_pair[:] = False

            # Look in both reads
            for read in reads:
                # NOTE: deletions count as covered, because in principle
                # we see that part of the reference
                cigar = read.cigar
                ref_start = read.pos
                ref_end = ref_start + sum(bl for (bt, bl) in cigar
                                          if bt in (0, 2))

                # Use numba to accelerate? better not
                if False:
                    add_read(ref_start, ref_end, mtuples, covs_pair)
                else:
                    for cov_pair, mtuple in izip(covs_pair, mtuples):
                        cov_pair[(mtuple >= ref_start) & (mtuple < ref_end)] = True

            # Check which tuples are fully covered
            for i, cov_pair in enumerate(covs_pair):
                if cov_pair.all():
                    coverage[i] += 1

    return coverage
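# Usage sketch (illustration only): count read pairs that jointly cover a
# triple and a pair of positions. Folder/ID/fragment are placeholders.
mtuples = [(100, 200, 300), (150, 250)]
cov = get_coverage_tuples('/path/to/run/', 'TS2', 'F3', mtuples,
                          maxreads=100000, VERBOSE=1)
for tup, n in zip(mtuples, cov):
    print tup, '->', n, 'pairs'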
def split_reads(data_folder, adaID, fragment, chunk_size=10000,
                maxreads=-1, VERBOSE=0):
    '''Split reads into chunks for mapping'''
    input_filename = get_divided_filename(data_folder, adaID, fragment, type='bam')

    with pysam.Samfile(input_filename, 'rb') as bamfile:
        if VERBOSE:
            if maxreads == -1:
                n_reads = get_number_reads_open(bamfile) // 2
            else:
                n_reads = maxreads
            print 'Expected number of chunks:', 1 + (n_reads // chunk_size)

        chunk_number = 0
        chunkfile = None
        for irp, read_pair in enumerate(pair_generator(bamfile)):
            if irp == maxreads:
                break

            if VERBOSE >= 2:
                if not ((irp + 1) % 10000):
                    print irp + 1

            if not (irp % chunk_size):
                if chunkfile is not None:
                    chunkfile.close()
                chunk_number += 1
                chunk_filename = get_divided_filename(data_folder, adaID, fragment,
                                                      type='bam', chunk=chunk_number)
                chunkfile = pysam.Samfile(chunk_filename, 'wb', template=bamfile)
                if VERBOSE >= 2:
                    print 'Chunk n', chunk_number, 'started'

            chunkfile.write(read_pair[0])
            chunkfile.write(read_pair[1])

        if chunkfile is not None:
            chunkfile.close()

    if VERBOSE:
        print 'Chunking finished'
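# Usage sketch (illustration only): split the divided BAM of one fragment into
# 10000-pair chunks before distributing the mapping. Placeholders as above.
split_reads('/path/to/run/', 'TS2', 'F3', chunk_size=10000, VERBOSE=1)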
seq_run = args.run
adaID = args.adaID
VERBOSE = args.verbose

# Specify the dataset
dataset = load_sequencing_run(seq_run)
data_folder = dataset.folder

# Get the BAM filename
input_filename = get_premapped_filename(data_folder, adaID, type='bam')

# Get unmapped reads and BLAST them
reads_unmapped = []
n_unmapped = 0
with pysam.Samfile(input_filename, 'rb') as input_file:
    for reads in pair_generator(input_file):
        if not reads[0].is_unmapped:
            continue

        n_unmapped += 1

        # Take only the first part of read1, to make sure quality is high
        seq = reads[reads[1].is_read1].seq[:200]
        seqb = Seq(seq, IUPAC.ambiguous_dna)

        # Save to file, to test local blast
        reads_unmapped.append(reads[reads[1].is_read1])
        if len(reads_unmapped) >= 100:
            break
def filter_reads(data_folder, adaID, fragment, VERBOSE=0, maxreads=-1,
                 contaminants=None, n_cycles=600,
                 max_mismatches=30, susp_mismatches=20,
                 match_len_min=30, trim_bad_cigars=3,
                 summary=True, plot=False):
    '''Filter the reads to good chunks'''
    # NOTE: match_len_min and trim_bad_cigars were referenced but missing from
    # the signature; defaults follow filter_mapped_reads below.
    frag_gen = fragment[:2]

    reffilename = get_consensus_filename(data_folder, adaID, frag_gen)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    bamfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                      filtered=False)
    if not os.path.isfile(bamfilename):
        samfilename = get_mapped_filename(data_folder, adaID, frag_gen,
                                          type='sam', filtered=False)
        if os.path.isfile(samfilename):
            convert_sam_to_bam(bamfilename)
        else:
            if VERBOSE >= 1:
                print 'ERROR: ' + adaID + ', mapped file not found.'
            return

    outfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                      filtered=True)
    suspiciousfilename = get_mapped_suspicious_filename(data_folder, adaID,
                                                        frag_gen)
    trashfilename = outfilename[:-4] + '_trashed.bam'

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(suspiciousfilename, 'wb', template=bamfile) as suspfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            # Iterate over all pairs
            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_suspect = 0
            n_mismapped_edge = 0
            n_badcigar = 0
            histogram_distance_from_consensus = np.zeros(n_cycles + 1, int)
            binsize = 200
            histogram_dist_along = np.zeros((len(ref) // binsize + 1,
                                             n_cycles + 1), int)
            for irp, reads in enumerate(pair_generator(bamfile)):

                # Limit to the first reads
                if irp == maxreads:
                    break

                # Assign names
                (read1, read2) = reads
                i_fwd = reads[0].is_reverse

                # Check a few things to make sure we are looking at paired reads
                if read1.qname != read2.qname:
                    n_wrongname += 1
                    raise ValueError('Read pair ' + str(irp) +
                                     ': reads have different names!')

                # Ignore unmapped reads
                if read1.is_unmapped or read2.is_unmapped:
                    if VERBOSE >= 2:
                        print 'Read pair ' + read1.qname + ': unmapped'
                    n_unmapped += 1
                    map(trashfile.write, reads)
                    continue

                # Ignore not properly paired reads (this includes mates sitting
                # on different fragments)
                if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                    if VERBOSE >= 2:
                        print 'Read pair ' + read1.qname + ': not properly paired'
                    n_unpaired += 1
                    map(trashfile.write, reads)
                    continue

                # Mismappings are sometimes at fragment edges:
                # Check for overhangs beyond the edge
                skip = check_overhanging_reads(reads, len(ref))
                if skip:
                    n_mismapped_edge += 1
                    map(trashfile.write, reads)
                    continue

                # Mismappings are often characterized by many mutations:
                # check the number of mismatches of the whole pair and skip
                # reads with too many
                dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
                histogram_distance_from_consensus[dc.sum()] += 1
                hbin = (reads[i_fwd].pos + reads[i_fwd].isize / 2) // binsize
                histogram_dist_along[hbin, dc.sum()] += 1
                if dc.sum() > max_mismatches:
                    if VERBOSE >= 2:
                        print n_mutator + 1, irp,\
                            '{:2.1f}'.format(100.0 * (n_mutator + 1) / (irp + 1)) + '%',\
                            'Read pair ' + read1.qname + ': too many mismatches ' +\
                            '(' + str(dc[0]) + ' + ' + str(dc[1]) + ')'
                    n_mutator += 1
                    map(trashfile.write, reads)
                    continue

                # Check for contamination from other PCR plates. Typically,
                # contamination happens for only one fragment, whereas
                # superinfection happens for all. At this stage, we can only
                # give clues about cross-contamination, the rest will be done
                # in a script downstream (here we could TAG suspicious reads
                # for contamination)
                elif dc.sum() > susp_mismatches:
                    if contaminants is not None:
                        skip = check_suspect(reads, contaminants, VERBOSE=VERBOSE)
                    else:
                        skip = True
                    if skip:
                        n_suspect += 1
                        map(suspfile.write, reads)
                        continue

                # Trim the bad CIGARs from the sides, if there are any good ones
                skip = trim_bad_cigar(reads, match_len_min=match_len_min,
                                      trim_left=trim_bad_cigars,
                                      trim_right=trim_bad_cigars)
                if skip:
                    n_badcigar += 1
                    map(trashfile.write, reads)
                    continue

                # TODO: we might want to incorporate some more stringent
                # criterion here, to avoid short reads, cross-overhang, etc.

                # Write the output
                n_good += 1
                map(outfile.write, reads)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Mismapped at edge:', n_mismapped_edge
        print 'Many-mutations:', n_mutator
        print 'Suspect contaminations:', n_suspect
        print 'Bad CIGARs:', n_badcigar

    if summary:
        summary_filename = get_filter_mapped_summary_filename(data_folder, adaID,
                                                              fragment)
        with open(summary_filename, 'a') as f:
            f.write('Filter results: adaID ' + adaID + fragment + '\n')
            f.write('Total:\t\t\t' + str(irp + 1) + '\n')
            f.write('Good:\t\t\t' + str(n_good) + '\n')
            f.write('Unmapped:\t\t' + str(n_unmapped) + '\n')
            f.write('Unpaired:\t\t' + str(n_unpaired) + '\n')
            f.write('Mismapped at edge:\t' + str(n_mismapped_edge) + '\n')
            f.write('Many-mutations:\t\t' + str(n_mutator) + '\n')
            f.write('Suspect contaminations:\t' + str(n_suspect) + '\n')
            f.write('Bad CIGARs:\t\t' + str(n_badcigar) + '\n')

    if plot:
        plot_distance_histogram(data_folder, adaID, frag_gen,
                                histogram_distance_from_consensus,
                                savefig=True)
        plot_distance_histogram_sliding_window(data_folder, adaID, frag_gen,
                                               len(ref),
                                               histogram_dist_along,
                                               binsize=binsize,
                                               savefig=True)
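# Usage sketch (illustration only): filter the mapped reads of one fragment,
# writing good, suspect and trashed pairs to separate BAM files. With
# contaminants=None every high-distance pair is merely flagged as suspect.
filter_reads('/path/to/run/', 'TS2', 'F3', VERBOSE=1,
             max_mismatches=30, susp_mismatches=20,
             summary=True, plot=False)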
def filter_mapped_reads(sample, fragment, PCR=1, maxreads=-1, VERBOSE=0,
                        n_cycles=600, max_mismatches=100, match_len_min=30,
                        trim_bad_cigars=3, summary=True):
    '''Filter the reads to good chunks'''
    pname = sample.patient
    samplename_pat = sample.name
    samplenames_seq = sample.samples_seq.index.tolist()

    if VERBOSE >= 1:
        print 'Filtering reads:', pname, samplename_pat, fragment, PCR

    reffilename = get_initial_reference_filename(pname, fragment)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    outfilename = get_mapped_filtered_filename(pname, samplename_pat, fragment,
                                               type='bam', PCR=PCR,
                                               decontaminated=False)
    trashfilename = outfilename[:-4] + '_trashed.bam'

    infilenames = [get_mapped_to_initial_filename(pname, samplename_pat,
                                                  samplename, fragment,
                                                  type='bam', PCR=PCR)
                   for samplename in samplenames_seq]
    infilenames = filter(os.path.isfile, infilenames)
    if not len(infilenames):
        print ('WARNING: No mapped files found: ' +
               ', '.join([pname, samplename_pat, fragment, str(PCR)]))
        return

    # Take reads evenly distributed across sequencing repetitions
    maxreads /= len(infilenames)

    if VERBOSE >= 2:
        print 'Input mapped filenames:',
        if len(infilenames) >= 2:
            print ''
        print '\n'.join(infilenames)

    # Use first file as template for the new bamfile
    infilename = infilenames[0]
    if not os.path.isfile(infilename):
        convert_sam_to_bam(infilename)

    with pysam.Samfile(infilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_badcigar = 0
            n_tiny = 0
            binsize = 200
            hist_distance_from_consensus = np.zeros(n_cycles + 1, int)
            hist_dist_along = np.zeros((len(ref) // binsize + 1,
                                        n_cycles + 1), int)

            # Iterate over input files, the first is already open
            for infilename in infilenames:

                # NOTE: this used to compare infilename against its own first
                # character; the intention is to reuse the already-open file
                # for the first input and open the others on demand.
                if infilename != infilenames[0]:
                    file_open = lambda: pysam.Samfile(infilename, 'rb')
                    file_close = lambda f: f.close()

                    if not os.path.isfile(infilename):
                        convert_sam_to_bam(infilename)

                else:
                    file_open = lambda: bamfile
                    file_close = lambda f: None

                try:
                    bamfile = file_open()

                    for irp, reads in enumerate(pair_generator(bamfile)):
                        if irp == maxreads:
                            break

                        pair_type = filter_read_pair(
                            reads, ref,
                            hist_distance_from_consensus,
                            hist_dist_along, binsize,
                            max_mismatches=max_mismatches,
                            match_len_min=match_len_min,
                            trim_bad_cigars=trim_bad_cigars,
                            VERBOSE=VERBOSE)

                        if pair_type == 'unmapped':
                            n_unmapped += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'unpaired':
                            n_unpaired += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'mutator':
                            n_mutator += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'bad_cigar':
                            n_badcigar += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'tiny':
                            n_tiny += 1
                            map(trashfile.write, reads)

                        else:
                            n_good += 1
                            map(outfile.write, reads)

                finally:
                    file_close(bamfile)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Many-mutations:', n_mutator
        print 'Bad CIGARs:', n_badcigar
        print 'Tiny:', n_tiny
        print

    if summary:
        sfn = get_filter_mapped_init_summary_filename(pname, samplename_pat,
                                                      fragment, PCR=PCR)
        with open(sfn, 'a') as f:
            f.write('Filter results: pname ' + pname + ', ' +
                    samplename_pat + ', ' + fragment + '\n')
            f.write('Total:\t\t\t' + str(irp + 1) + '\n')
            f.write('Good:\t\t\t' + str(n_good) + '\n')
            f.write('Unmapped:\t\t' + str(n_unmapped) + '\n')
            f.write('Unpaired:\t\t' + str(n_unpaired) + '\n')
            f.write('Many-mutations:\t\t' + str(n_mutator) + '\n')
            f.write('Bad CIGARs:\t\t' + str(n_badcigar) + '\n')
            f.write('Tiny:\t\t\t' + str(n_tiny) + '\n')
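# Usage sketch (kept fully commented because the sample loader is not shown
# in this file): `sample` is a patient sample object with .patient, .name and
# .samples_seq attributes, as assumed by filter_mapped_reads above.
#
#   sample = load_sample('p1', '2013-01-01')   # hypothetical loader
#   filter_mapped_reads(sample, 'F3', PCR=1, maxreads=-1, VERBOSE=1)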
def get_minimal_distance_hist(bamfilename, consensi, maxreads=1000, VERBOSE=0):
    '''Get histogram of minimal distance of reads from consensi'''
    conssi = map(''.join, consensi)
    m = np.zeros(len(consensi), int)
    n_good = 0
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for irp, reads in enumerate(pair_generator(bamfile)):
            if n_good == maxreads:
                break

            if VERBOSE >= 3:
                print n_good + 1, 'Checking mindist for:', reads[0].qname,

            # Assign names
            (read1, read2) = reads
            i_fwd = reads[0].is_reverse

            # Check a few things to make sure we are looking at paired reads
            if read1.qname != read2.qname:
                raise ValueError('Read pair ' + str(irp) +
                                 ': reads have different names!')

            # Ignore unmapped reads
            if read1.is_unmapped or read2.is_unmapped:
                if VERBOSE >= 2:
                    print 'Read pair ' + read1.qname + ': unmapped'
                continue

            # Ignore not properly paired reads (this includes mates sitting on
            # different fragments)
            if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                if VERBOSE >= 2:
                    print 'Read pair ' + read1.qname + ': not properly paired'
                continue

            n_good += 1

            # Get all distances
            ds_pair = np.zeros_like(m)
            for ic, consensus in enumerate(consensi):
                conss = conssi[ic]
                dpair = 0
                for read in reads:
                    seq = read.seq
                    ali = align_overlap(conss, seq)

                    # NOTE: it is possible that we start before conss' start or
                    # end after its end, but that IS evidence that it's not
                    # contamination from there.
                    pos = conss.find(ali[1].replace('-', ''))
                    alim0 = np.fromstring(ali[1], 'S1')
                    alim1 = np.fromstring(ali[2], 'S1')

                    # Score substitutions
                    d = ((alim0 != alim1) & (alim0 != '-') & (alim1 != '-')).sum()

                    # Score insertions
                    gaps = alim0 == '-'
                    if gaps.sum():
                        n_gaps_borders = np.diff(gaps).sum()
                        n_gaps_borders += alim0[0] == '-'
                        n_gaps_borders += alim0[-1] == '-'
                        n_insertions = n_gaps_borders // 2
                        d += n_insertions

                    # Score deletions
                    gaps = alim1 == '-'
                    if gaps.sum():
                        n_gaps_borders = np.diff(gaps).sum()
                        n_gaps_borders -= alim1[0] == '-'
                        n_gaps_borders -= alim1[-1] == '-'
                        n_deletions = n_gaps_borders // 2
                        d += n_deletions

                    dpair += d
                ds_pair[ic] = dpair

            if VERBOSE >= 3:
                print 'OK',

            m[ds_pair.argmin()] += 1

            if VERBOSE >= 3:
                print ''

    return m
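# Usage sketch (illustration only): consensi can be any sequences that
# ''.join flattens to strings; filenames are placeholders and SeqIO is
# assumed to be imported as elsewhere in this codebase.
consensi = [str(rec.seq) for rec in SeqIO.parse('consensi.fasta', 'fasta')]
m = get_minimal_distance_hist('mapped.bam', consensi, maxreads=1000, VERBOSE=1)
print 'Read pairs closest to each consensus:', m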
def filter_contamination(bamfilename, bamfilename_out, contseqs, samplename,
                         VERBOSE=0, deltascore_max_self=60,
                         deltascore_max_other=24, maxreads=-1, **kwargs):
    '''Fish contaminated reads from mapped reads

    The function checks for a maximal distance to the expected consensus, and
    only if it's more than that it checks all other samples.

    Args:
      deltascore_max_self (int): the maximal delta in alignment score to the
      consensus to be considered pure
      deltascore_max_other (int): the maximal delta in alignment score to any
      other sample to be considered a contamination
      **kwargs: passed down to the pairwise alignment function
    '''
    import sys
    import pysam
    from collections import defaultdict
    from operator import itemgetter
    from seqanpy import align_overlap

    from hivwholeseq.utils.mapping import pair_generator, get_number_reads

    if 'score_match' in kwargs:
        score_match = kwargs['score_match']
    else:
        score_match = 3

    bamfilename_trash = bamfilename_out[:-4] + '_trashed.bam'

    contseqs = contseqs.copy()
    consseq = contseqs.pop(samplename)

    if VERBOSE >= 2:
        print 'Scanning reads (' + str(get_number_reads(bamfilename) // 2) + ')'

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(bamfilename_out, 'wb', template=bamfile) as bamfileout, \
             pysam.Samfile(bamfilename_trash, 'wb', template=bamfile) as bamfiletrash:
            n_good = 0
            n_cont = defaultdict(int)

            for irp, reads in enumerate(pair_generator(bamfile)):
                if irp == maxreads:
                    break

                if VERBOSE >= 2:
                    if not ((irp + 1) % 100):
                        if not ((irp + 1) == 100):
                            sys.stdout.write('\x1b[1A')
                        print irp + 1

                for read in reads:

                    # Look for distance to the own consensus; if that's small,
                    # move on
                    alignments_read = {}
                    deltas_read = {}
                    (score, alis1, alis2) = align_overlap(consseq, read.seq,
                                                          **kwargs)
                    (alis1, alis2) = trim_align_overlap((alis1, alis2))
                    scoremax = len(alis1) * score_match
                    delta_read = scoremax - score
                    deltas_read[samplename] = delta_read
                    alignments_read[samplename] = (alis1, alis2)
                    if delta_read <= deltascore_max_self:
                        if VERBOSE >= 4:
                            print 'Read is very close to its own consensus',\
                                scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2], width=90,
                                                      name1='ref', name2='read')
                        continue

                    # Otherwise, move on to all other sequences and find the
                    # nearest neighbour
                    for contname, contseq in contseqs.iteritems():
                        (score, ali1, ali2) = align_overlap(contseq, read.seq,
                                                            **kwargs)
                        (ali1, ali2) = trim_align_overlap((ali1, ali2))
                        scoremax = len(ali1) * score_match
                        delta_read = scoremax - score
                        deltas_read[contname] = delta_read
                        alignments_read[contname] = (ali1, ali2)

                    if VERBOSE >= 5:
                        print samplename
                        for key, d in deltas_read.iteritems():
                            print key, d

                    (contname, delta_read) = min(deltas_read.iteritems(),
                                                 key=itemgetter(1))

                    # Again, the correct consensus has precedence
                    if deltas_read[samplename] == delta_read:
                        contname = samplename

                    (ali1, ali2) = alignments_read[contname]

                    # The read may be closest to its own consensus, if not
                    # very close
                    if contname == samplename:
                        if VERBOSE >= 4:
                            print 'Read is closest to its consensus',\
                                scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2], width=90,
                                                      name1='ref', name2='read')

                    # The read may come from another consensus (contamination)
                    elif delta_read <= deltascore_max_other:
                        n_cont[contname] += 1
                        bamfiletrash.write(reads[0])
                        bamfiletrash.write(reads[1])

                        if VERBOSE >= 2:
                            print 'Contaminated read found! Good:', n_good,\
                                'cont:', sum(n_cont.itervalues()),\
                                'sources:', n_cont

                        if VERBOSE >= 3:
                            print 'Read is contaminated by', contname,\
                                scoremax, score, delta_read
                            pretty_print_pairwise_ali([alis1, alis2], width=90,
                                                      name1='self', name2='read')
                            print ''
                            pretty_print_pairwise_ali([ali1, ali2], width=90,
                                                      name1='ref', name2='read')

                        if VERBOSE >= 2:
                            print ''

                        break

                    # Finally, the read is not really close to anything: accept
                    else:
                        if VERBOSE >= 4:
                            print 'Read is close to nothing really',\
                                scoremax, score, delta_read
                            pretty_print_pairwise_ali([ali1, ali2], width=90,
                                                      name1='ref', name2='read')

                else:
                    n_good += 1
                    bamfileout.write(reads[0])
                    bamfileout.write(reads[1])

    n_cont = dict(n_cont)

    return (n_good, n_cont)
# Read file
bamfilename = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                  filtered=True)
if not os.path.isfile(bamfilename):
    convert_sam_to_bam(bamfilename)
bamfile = pysam.Samfile(bamfilename, 'rb')

# Get the coverage for reads which have long insert sizes
# (to be sure about their identity)
cov_new = 0
cov_old = 0
for i_pairs, reads in enumerate(pair_generator(bamfile)):
    if i_pairs > 5000000:
        break
    if reads[0].isize < 300:
        continue
    for read in reads:
        if read.seq.find(primer_new) != -1:
            cov_new += 1
        if read.seq.find(primer_old) != -1:
            cov_old += 1

print 'old:', cov_old, 'new:', cov_new
bamfile.close()
def get_local_haplotypes(bamfilename, start, end, VERBOSE=0, maxreads=-1,
                         label=''):
    '''Extract reads fully covering the region, discarding insertions'''
    import sys
    import pysam
    from hivwholeseq.utils.mapping import pair_generator
    from hivwholeseq.utils.mapping import extract_mapped_reads_subsample_open
    from collections import Counter

    haplotypes = Counter()

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        if maxreads == -1:
            reads_iter = pair_generator(bamfile)
        else:
            reads_iter = extract_mapped_reads_subsample_open(bamfile, maxreads,
                                                             VERBOSE=VERBOSE,
                                                             pairs=True)

        for irp, reads in enumerate(reads_iter):
            if VERBOSE >= 2:
                if not ((irp + 1) % 10000):
                    if irp + 1 != 10000:
                        sys.stdout.write("\x1b[1A\n")
                    if label:
                        sys.stdout.write(label + '\t')
                    sys.stdout.write(str(irp + 1))
                    sys.stdout.flush()

            # Sort fwd read first: this is important because with our insert
            # size we know the fwd read starts <= the rev read
            is_fwd = reads[0].is_reverse
            reads = [reads[is_fwd], reads[not is_fwd]]

            # Check for coverage of the region
            start_fwd = reads[0].pos
            end_fwd = start_fwd + sum(bl for (bt, bl) in reads[0].cigar
                                      if bt in (0, 2))
            start_rev = reads[1].pos
            end_rev = start_rev + sum(bl for (bt, bl) in reads[1].cigar
                                      if bt in (0, 2))
            overlap_len = max(0, end_fwd - start_rev)

            # Various scenarios possible
            if start_fwd > start:
                continue
            if end_rev < end:
                continue
            # No single read covers the whole region AND (the insert has a hole
            # OR a very short overlap)
            if (end_fwd < end) and (start_rev > start) and (overlap_len < 20):
                continue

            # Now the good cases
            if (start_fwd <= start) and (end_fwd >= end):
                seq = trim_read_roi(reads[0], start, end)
            elif (start_rev <= start) and (end_rev >= end):
                seq = trim_read_roi(reads[1], start, end)
            else:
                seqs = [trim_read_roi(read, start, end) for read in reads]
                seq = merge_read_pair(*seqs)

            haplotypes[seq] += 1

            if VERBOSE >= 4:
                import ipdb
                ipdb.set_trace()

    if VERBOSE >= 2:
        if irp >= 10000:
            sys.stdout.write('\n')
            sys.stdout.flush()

    return haplotypes
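# Usage sketch (illustration only): extract local haplotypes over a 300 bp
# window and print the three most abundant ones. Path and coordinates are
# placeholders.
haplos = get_local_haplotypes('mapped_filtered.bam', 1000, 1300, VERBOSE=1)
for seq, abundance in haplos.most_common(3):
    print abundance, seq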
def get_coallele_counts(data_folder, adaID, fragment, VERBOSE=0,
                        maxreads=int(1e10)):
    '''Extract allele and insert counts from a bamfile'''
    # NOTE: maxreads used to be read from the enclosing script scope; it is
    # now an explicit parameter so the function is self-contained.

    # Read reference
    reffilename = get_consensus_filename(data_folder, adaID, fragment,
                                         trim_primers=True)
    refseq = SeqIO.read(reffilename, 'fasta')

    # Allele counts and inserts (TODO: compress this data?)
    # Note: the pair is of 2 types only, while the single reads usually are of 4
    counts = np.zeros((len(read_pair_types),
                       len(alpha), len(alpha),
                       len(refseq), len(refseq)), int)
    positions = np.zeros(501, int)
    ais = np.zeros_like(positions)
    # TODO: no inserts for now

    # Open BAM file
    # Note: the reads should already be filtered of unmapped stuff at this point
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, type='bam',
                                      filtered=True)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)
    with pysam.Samfile(bamfilename, 'rb') as bamfile:

        # Iterate over read pairs
        for i, reads in enumerate(pair_generator(bamfile)):

            # Limit to some reads for testing
            if i > maxreads:
                if VERBOSE:
                    print 'Max read number reached:', maxreads
                break

            # Print output
            if (VERBOSE >= 3) and (not ((i + 1) % 10)):
                print (i + 1)

            # Divide by read 1/2 and forward/reverse
            js = reads[0].is_reverse
            count = counts[js]

            # List of mutations
            positions[:] = -1
            ais[:] = -1
            imut = 0

            # Collect from the pair of reads
            for read in reads:

                # Sequence and position
                # Note: stampy takes the reverse complement already
                seq = read.seq
                pos = read.pos

                # Iterate over CIGARs
                len_cig = len(read.cigar)
                for ic, (block_type, block_len) in enumerate(read.cigar):

                    # Check for pos: it should never exceed the length of
                    # the fragment
                    if (block_type in [0, 1, 2]) and (pos > len(refseq)):
                        raise ValueError('Pos exceeded the length of the fragment')

                    # Inline block
                    if block_type == 0:
                        # Get the alleles and add them
                        # NOTE: restrict to this block's bases; only they
                        # correspond to the reference positions pos..pos+bl
                        indb = map(alphal.index, seq[:block_len])
                        positions[imut: imut + len(indb)] = \
                            pos + np.arange(len(indb))
                        ais[imut: imut + len(indb)] = indb
                        imut += len(indb)

                        # Chop off this block
                        if ic != len_cig - 1:
                            seq = seq[block_len:]
                            pos += block_len

                    # Deletion
                    elif block_type == 2:
                        # Chop off pos, but not sequence
                        pos += block_len

                    # Insertion
                    # an insert @ pos 391 means that seq[:391] is BEFORE the
                    # insert, THEN the insert, FINALLY comes seq[391:]
                    elif block_type == 1:
                        # Chop off seq, but not pos
                        if ic != len_cig - 1:
                            seq = seq[block_len:]

                    # Other types of cigar?
                    else:
                        raise ValueError('CIGAR type ' + str(block_type) +
                                         ' not recognized')

            if VERBOSE >= 4:
                for pos, ai in izip(positions, ais):
                    if pos == -1:
                        break
                    print pos, ai

            # Put the mutations into the matrix
            for ai1 in xrange(len(alpha)):
                for ai2 in xrange(len(alpha)):
                    coun = count[ai1, ai2]
                    pos1 = positions[ais == ai1]
                    if ai1 == ai2:
                        pos2 = pos1
                    else:
                        pos2 = positions[ais == ai2]
                    coords = np.meshgrid(pos1, pos2)
                    ind = coords[0].ravel() * coun.shape[0] + coords[1].ravel()
                    coun.ravel()[ind] += 1

    return counts
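# Usage sketch (illustration only): cocounts[js, ai1, ai2, pos1, pos2] counts
# read pairs of type js carrying allele ai1 at pos1 and ai2 at pos2.
# Folder/ID/fragment are placeholders.
cocounts = get_coallele_counts('/path/to/run/', 'TS2', 'F3', VERBOSE=1,
                               maxreads=10000)
print 'Total co-observations:', cocounts.sum()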
for sample in patient.itersamples():
    if VERBOSE >= 1:
        print sample.name,

    if sample[fragment] not in ['ok', 'low']:
        if VERBOSE >= 1:
            print 'not "ok". skipping'
        continue

    if VERBOSE >= 1:
        print 'ok'

    bamfilename = sample.get_mapped_filtered_filename(fragment,
                                                      decontaminated=True)
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        if maxreads == -1:
            reads = pair_generator(bamfile)
        else:
            reads = extract_mapped_reads_subsample_open(bamfile, maxreads,
                                                        VERBOSE=VERBOSE)

        dists[sample.name] = get_distance_reads_sequence(refseq, reads,
                                                         VERBOSE=VERBOSE,
                                                         score_match=3,
                                                         score_mismatch=-3)

hs = {}
binmax = max(map(max, dists.itervalues()))
bins = np.arange(0, binmax, 6)
bincenters = 0.5 * (bins[1:] + bins[:-1])
for samplename, dist in dists.iteritems():
def fish_distant_reads(bamfilename, ref, min_mismatches=20, max_mismatches=30,
                       VERBOSE=0, maxseqs=-1):
    '''Fish distant reads from the trash'''
    import numpy as np
    from hivwholeseq.utils.mapping import pair_generator, reads_to_seqrecord
    from hivwholeseq.sequencing.filter_mapped_reads import \
        check_overhanging_reads, get_distance_from_consensus

    distances = []
    seqs = []
    edges = []
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        for irp, reads in enumerate(pair_generator(bamfile)):
            if VERBOSE >= 2:
                if not ((irp + 1) % 10000):
                    print irp + 1

            (read1, read2) = reads
            i_fwd = reads[0].is_reverse

            # Check a few things to make sure we are looking at paired reads
            if read1.qname != read2.qname:
                raise ValueError('Read pair ' + str(irp) +
                                 ': reads have different names!')

            # Ignore unmapped reads
            if read1.is_unmapped or read2.is_unmapped:
                continue

            # Ignore not properly paired reads (this includes mates sitting on
            # different fragments)
            if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                continue

            # Check for overhangs beyond the edge
            skip = check_overhanging_reads(reads, len(ref))
            if skip:
                continue

            # Fish out our reads
            dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
            if min_mismatches <= dc.sum() <= max_mismatches:
                if VERBOSE >= 3:
                    print 'Gotcha!', reads[0].qname
                seqs.append(reads[0])
                seqs.append(reads[1])
                distances.append(dc)
                edge = [(read.pos,
                         read.pos + sum(bl for bt, bl in read.cigar
                                        if bt in (0, 2)))
                        for read in reads]
                edges.append(edge)

            if len(seqs) // 2 == maxseqs:
                if VERBOSE >= 2:
                    print 'Max seqs reached:', maxseqs
                break

    seqs = list(pair_generator(reads_to_seqrecord(seqs)))
    distances = np.array(distances, int)
    return (distances, edges, seqs)
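# Usage sketch (illustration only): fish read pairs 20-30 mismatches away from
# the consensus (candidate contaminants) out of a BAM file. Paths are
# placeholders.
ref = np.array(SeqIO.read('consensus.fasta', 'fasta'))
(distances, edges, seqs) = fish_distant_reads('mapped.bam', ref,
                                              min_mismatches=20,
                                              max_mismatches=30,
                                              VERBOSE=1, maxseqs=100)
print len(seqs), 'pairs fished'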
def trim_and_divide_reads(data_folder, adaID, n_cycles, fragments,
                          maxreads=-1, VERBOSE=0, minisize=100,
                          include_tests=False, summary=True):
    '''Trim reads and divide them into fragments'''
    if VERBOSE:
        print 'Trim and divide into fragments: adaID ' + adaID +\
            ', fragments: ' + ' '.join(fragments)

    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Fragments used: ' + ' '.join(fragments) + '\n')

    ref_filename = get_reference_premap_filename(data_folder, adaID)
    refseq = SeqIO.read(ref_filename, 'fasta')
    smat = np.array(refseq, 'S1')
    len_reference = len(refseq)

    # Get the positions of fragment start/end, w/ and w/o primers
    frags_pos = get_fragment_positions(smat, fragments)
    store_reference_fragmented(data_folder, adaID, refseq,
                               dict(zip(fragments, frags_pos['trim'])))
    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Primer positions (for fragments):\n')
            for (fragment, poss_full, poss_trim) in izip(fragments,
                                                         frags_pos['full'],
                                                         frags_pos['trim']):
                f.write(fragment + ': fwd ' + str(poss_full[0]) + ' ' +
                        str(poss_trim[0]) + ', rev ' + str(poss_trim[1]) +
                        ' ' + str(poss_full[1]) + '\n')
    write_fragment_positions(data_folder, adaID, fragments, frags_pos)

    # Get the positions of the unwanted outer primers (in case we DO nested PCR
    # for that fragment)
    # NOTE: the LTRs make no problem, because the rev outer primer of F6
    # is not in the reference anymore if F6 has undergone nested PCR
    # FIXME: this might not work if we have mixed fragments (e.g. F5a+b)
    # AND nesting
    from re import findall
    primers_out = {'fwd': [], 'rev': []}
    for i, fr in enumerate(fragments):
        if (i != 0) and findall(r'F[2-6][a-z]?i', fr):
            primers_out['fwd'].append(fr[:-1] + 'o')
        if (i != len(fragments) - 1) and findall(r'F[1-5][a-z]?i', fr):
            primers_out['rev'].append(fr[:-1] + 'o')

    # Get all possible unambiguous primers for the unwanted outer primers
    from hivwholeseq.data.primers import primers_PCR
    from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas
    primers_out_seq = {'fwd': [np.array(map(list, eas(primers_PCR[fr][0])),
                                        'S1', ndmin=2)
                               for fr in primers_out['fwd']],
                       'rev': [np.array(map(list, eas(primers_PCR[fr][1])),
                                        'S1', ndmin=2)
                               for fr in primers_out['rev']],
                      }
    primers_out_pos = {'fwd': [], 'rev': []}
    if primers_out['fwd']:
        primers_out_pos['fwd'] = map(itemgetter(0),
                                     get_primer_positions(smat,
                                                          primers_out['fwd'],
                                                          'fwd'))
    if primers_out['rev']:
        primers_out_pos['rev'] = map(itemgetter(1),
                                     get_primer_positions(smat,
                                                          primers_out['rev'],
                                                          'rev'))

    # Input and output files
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')
    if not os.path.isfile(input_filename):
        convert_sam_to_bam(input_filename)
    output_filenames = get_divided_filenames(data_folder, adaID, fragments,
                                             type='bam')
    with pysam.Samfile(input_filename, 'rb') as bamfile:
        try:
            file_handles = [pysam.Samfile(ofn, 'wb', template=bamfile)
                            for ofn in output_filenames[:len(fragments)]]
            fo_am = pysam.Samfile(output_filenames[-4], 'wb', template=bamfile)
            fo_cm = pysam.Samfile(output_filenames[-3], 'wb', template=bamfile)
            fo_um = pysam.Samfile(output_filenames[-2], 'wb', template=bamfile)
            fo_lq = pysam.Samfile(output_filenames[-1], 'wb', template=bamfile)

            # Iterate over the mapped reads and assign fragments
            n_mapped = [0 for fragment in fragments]
            n_unmapped = 0
            n_crossfrag = 0
            n_ambiguous = 0
            n_outer = 0
            n_lowq = 0
            for irp, reads in enumerate(pair_generator(bamfile)):
                if irp == maxreads:
                    if VERBOSE:
                        print 'Maximal number of read pairs reached:', maxreads
                    break

                if VERBOSE >= 2:
                    if not ((irp + 1) % 10000):
                        print irp + 1

                i_fwd = reads[0].is_reverse

                # If unmapped or unpaired, mini, or insert size mini, or
                # divergent read pair (fully cross-overlapping), discard
                if reads[0].is_unmapped or (not reads[0].is_proper_pair) or \
                   reads[1].is_unmapped or (not reads[1].is_proper_pair) or \
                   (reads[0].rlen < 50) or (reads[1].rlen < 50) or \
                   (reads[i_fwd].isize < minisize):
                    if VERBOSE >= 3:
                        print 'Read pair unmapped/unpaired/tiny/divergent:',\
                            reads[0].qname
                    n_unmapped += 1
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # If the insert is a misamplification from the outer primers
                # in fragments that underwent nested PCR,
                # trash it (it will have skewed amplification anyway). We cannot
                # find all of those, rather only the ones still carrying the
                # primer itself (some others have lost it while shearing). For
                # those, no matter what happens at the end (reading into
                # adapters, etc.), ONE of the reads in the pair will start
                # exactly with one outer primer: if the rev read with a rev
                # primer, if the fwd with a fwd one. Test all six.
                if (len(primers_out_pos['fwd']) or len(primers_out_pos['rev'])) and \
                   test_outer_primer(reads, primers_out_pos, primers_out_seq,
                                     len_reference):
                    if VERBOSE >= 3:
                        print 'Read pair from outer primer:', reads[0].qname
                    n_outer += 1
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # FIXME: the following becomes a bit harder when we mix
                # parallel PCRs, e.g. F5a+b, to get more product

                # Assign to a fragment now, so that primer trimming is faster
                pair_identity = assign_to_fragment(reads, frags_pos['full'],
                                                   VERBOSE=VERBOSE)

                # 1. If no fragments are possible (e.g. one read crosses the
                # fragment boundary, they map to different fragments), dump it
                # into a special bucket
                if pair_identity == 'cross':
                    n_crossfrag += 1
                    fo_cm.write(reads[0])
                    fo_cm.write(reads[1])
                    continue

                # 2. If 2+ fragments are possible (tie), put into a special
                # bucket (essentially excluded, because we want two independent
                # measurements in the overlapping region, but we might want to
                # recover them)
                elif pair_identity == 'ambiguous':
                    n_ambiguous += 1
                    fo_am.write(reads[0])
                    fo_am.write(reads[1])
                    continue

                # 3. If the intersection is a single fragment, good: trim the
                # primers
                # NB: n_frag is the index IN THE POOL. If we sequence only
                # F2-F5, F2 is n_frag = 0
                n_frag = int(pair_identity)
                frag_pos = frags_pos['trim'][n_frag]
                if not np.isscalar(frag_pos[0]):
                    frag_pos = [frag_pos[0]['inner'], frag_pos[1]['inner']]
                trashed_primers = trim_primers(reads, frag_pos,
                                               include_tests=include_tests)
                if trashed_primers or (reads[i_fwd].isize < 100):
                    n_unmapped += 1
                    if VERBOSE >= 3:
                        print 'Read pair is mismapped:', reads[0].qname
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # Quality trimming: if no decently long pair survives, trash
                #trashed_quality = main_block_low_quality(reads, phred_min=20,
                #                                         include_tests=include_tests)
                trashed_quality = trim_low_quality(reads, phred_min=20,
                                                   include_tests=include_tests)
                if trashed_quality or (reads[i_fwd].isize < 100):
                    n_lowq += 1
                    if VERBOSE >= 3:
                        print 'Read pair has low phred quality:', reads[0].qname
                    fo_lq.write(reads[0])
                    fo_lq.write(reads[1])
                    continue

                # Check for cross-overhangs or COH (reading into the adapters)
                #        --------------->
                #    <-----------
                # In that case, trim to perfect overlap.
                if test_coh(reads, VERBOSE=False):
                    trim_coh(reads, trim=0, include_tests=include_tests)

                # Change coordinates into the fragmented reference
                # (primer-trimmed)
                for read in reads:
                    read.pos -= frag_pos[0]
                    read.mpos -= frag_pos[0]

                # Here the tests
                if include_tests:
                    lfr = frags_pos['trim'][n_frag][1] - frags_pos['trim'][n_frag][0]
                    if test_sanity(reads, n_frag, lfr):
                        print 'Tests failed:', reads[0].qname
                        import ipdb; ipdb.set_trace()

                # There we go!
                n_mapped[n_frag] += 1
                file_handles[n_frag].write(reads[0])
                file_handles[n_frag].write(reads[1])

        finally:
            for f in file_handles:
                f.close()
            fo_am.close()
            fo_cm.close()
            fo_um.close()
            fo_lq.close()

    if VERBOSE:
        print 'Trim and divide results: adaID ' + adaID
        print 'Total:\t\t', irp
        print 'Mapped:\t\t', sum(n_mapped), n_mapped
        print 'Unmapped/unpaired/tiny:\t', n_unmapped
        print 'Outer primer\t', n_outer
        print 'Crossfrag:\t', n_crossfrag
        print 'Ambiguous:\t', n_ambiguous
        print 'Low-quality:\t', n_lowq

    # Write summary to file
    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\n')
            f.write('Trim and divide results: adaID ' + adaID + '\n')
            f.write('Total:\t\t' + str(irp + 1) + '\n')
            f.write('Mapped:\t\t' + str(sum(n_mapped)) + ' ' + str(n_mapped) + '\n')
            f.write('Unmapped/unpaired/tiny insert:\t' + str(n_unmapped) + '\n')
            f.write('Outer primer\t' + str(n_outer) + '\n')
            f.write('Crossfrag:\t' + str(n_crossfrag) + '\n')
            f.write('Ambiguous:\t' + str(n_ambiguous) + '\n')
            f.write('Low-quality:\t' + str(n_lowq) + '\n')
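# Usage sketch (illustration only): divide the premapped reads of one adapter
# into fragments. The folder, adapter ID and fragment names (F<n>i, i.e. the
# inner-primer variants matched by the regexes above) are placeholders.
trim_and_divide_reads('/path/to/run/', 'TS2', 600,
                      ['F1i', 'F2i', 'F3i', 'F4i', 'F5i', 'F6i'],
                      VERBOSE=1, summary=True)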
def get_local_block(bamfilename, start, end, VERBOSE=0, maxreads=-1,
                    refroi=None):
    '''Extract reads fully covering the region, discarding insertions'''
    import sys
    import pysam
    from hivwholeseq.utils.mapping import pair_generator
    from hivwholeseq.utils.mapping import extract_mapped_reads_subsample_open

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        block = []

        if maxreads == -1:
            reads_iter = pair_generator(bamfile)
        else:
            reads_iter = extract_mapped_reads_subsample_open(bamfile, maxreads,
                                                             VERBOSE=VERBOSE,
                                                             pairs=True)

        for irp, reads in enumerate(reads_iter):
            if VERBOSE >= 2:
                if not ((irp + 1) % 10000):
                    if irp + 1 != 10000:
                        sys.stdout.write("\x1b[1A\n")
                    sys.stdout.write(str(irp + 1))
                    sys.stdout.flush()

            # Sort fwd read first
            is_fwd = reads[0].is_reverse
            reads = [reads[is_fwd], reads[not is_fwd]]

            # Check for coverage of the region
            start_fwd = reads[0].pos
            end_fwd = start_fwd + sum(bl for (bt, bl) in reads[0].cigar
                                      if bt in (0, 2))
            start_rev = reads[1].pos
            end_rev = start_rev + sum(bl for (bt, bl) in reads[1].cigar
                                      if bt in (0, 2))

            if start_fwd > start:
                continue
            if end_rev < end:
                continue
            if (end_fwd < end) and (start_rev > start) and (start_rev > end_fwd):
                continue

            if VERBOSE >= 3:
                print ' '.join(map('{:>4d}'.format,
                                   [start_fwd, end_fwd, start_rev, end_rev]))

            # Gather info from both reads, merge by putting ambiguous nucleotides
            seqs = []
            st_ens = [[start_fwd, end_fwd], [start_rev, end_rev]]
            for ir, read in enumerate(reads):
                (start_read, end_read) = st_ens[ir]
                if (end_read <= start) or (start_read >= end):
                    seqs.append(None)
                    continue

                seq = []
                pos_ref = start_read
                pos_read = 0
                start_block = max(start, start_read) - start
                end_block = min(end, end_read) - start
                for (bt, bl) in read.cigar:
                    if bt == 1:
                        pos_read += bl
                    elif bt == 2:
                        if pos_ref + bl > start:
                            st = max(0, start - pos_ref)
                            en = min(bl, end - pos_ref)
                            seq.append('-' * (en - st))
                            if pos_ref + bl >= end:
                                break
                        pos_ref += bl
                    elif bt == 0:
                        if pos_ref + bl > start:
                            st = max(0, start - pos_ref)
                            en = min(bl, end - pos_ref)
                            seq.append(read.seq[pos_read + st: pos_read + en])
                            if pos_ref + bl >= end:
                                break
                        pos_ref += bl
                        pos_read += bl

                seq = ''.join(seq)
                seqs.append((start_block, end_block, seq))

            # Merge sequences if both fwd and rev cover part of it
            if seqs[0] is None:
                seq = seqs[1][2]
            elif seqs[1] is None:
                seq = seqs[0][2]
            else:
                # The fwd read starts before the rev, because of our insert sizes
                end_block_fwd = seqs[0][1]
                start_block_rev = seqs[1][0]
                overlap = [seqs[0][2][start_block_rev:],
                           seqs[1][2][:end_block_fwd - start_block_rev]]

                # The two reads in a pair should have the same length in the
                # overlap
                if len(overlap[0]) != len(overlap[1]):
                    if VERBOSE >= 3:
                        print 'WARNING:', reads[0].qname,\
                            'not same length in overlap!'
                    continue

                ol_fwd = np.fromstring(overlap[0], 'S1')
                ol_rev = np.fromstring(overlap[1], 'S1')

                ol_fwd[ol_fwd != ol_rev] = 'N'
                seq = seqs[0][2][:start_block_rev] + \
                    ol_fwd.tostring() + \
                    seqs[1][2][end_block_fwd - start_block_rev:]

            block.append(seq)

    if VERBOSE >= 2:
        print ''

    return block
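# Usage sketch (illustration only): collect merged read sequences that fully
# cover a region; within-pair disagreements in the overlap come out as 'N'.
block = get_local_block('mapped_filtered.bam', 2000, 2200, maxreads=5000)
print len(block), 'read pairs cover the region completely'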
# Read reference (fragmented)
refseqs_raw = list(SeqIO.parse(get_last_reference(data_folder, adaID, ext=True),
                               'fasta'))

# Sort according to the chromosomal ordering
refseqs = []
for chromosome in chromosomes:
    for seq in refseqs_raw:
        if chromosome == seq.id:
            refseqs.append(seq)
            break
refs = [np.array(refseq) for refseq in refseqs]

# Prepare data structures
muts_all = []

# Iterate over all pairs
for i_pairs, reads in enumerate(pair_generator(bamfile)):

    # Limit to the first reads
    if 2 * i_pairs >= maxreads:
        break

    # Log
    if VERBOSE and (not (i_pairs % 10000)):
        print i_pairs,

    # Assign names
    read1 = reads[0]
    read2 = reads[1]

    # Check a few things to make sure we are looking at paired reads
    if read1.qname != read2.qname:
        raise ValueError('Read pair ' + str(i_pairs) +
                         ': reads have different names!')
def quality_score_along_reads_mapped(read_len, bamfilename,
                                     insertsize_range=[400, 1000],
                                     skipreads=0, maxreads=-1,
                                     randomreads=True, VERBOSE=0):
    '''Calculate the quality score along the reads'''
    from hivwholeseq.utils.mapping import trim_read_pair_crossoverhangs as trim_coh
    from hivwholeseq.utils.mapping import pair_generator

    quality = [[[] for j in xrange(read_len)] for i in xrange(2)]

    # Precompute conversion table
    SANGER_SCORE_OFFSET = ord("!")
    q_mapping = dict()
    for letter in range(0, 255):
        q_mapping[chr(letter)] = letter - SANGER_SCORE_OFFSET

    # Iterate over all reads (using fast iterators)
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        if not randomreads:
            reads_all = []
            for i, read_pair in enumerate(pair_generator(bamfile)):
                if i < skipreads:
                    continue

                if i == skipreads + maxreads:
                    if VERBOSE:
                        print 'Maximal number of read pairs reached:', maxreads
                    break

                if VERBOSE and (not ((i + 1) % 10000)):
                    print i + 1

                reads_all.append(read_pair)
        else:
            reads_all = extract_mapped_reads_subsample_open(bamfile, maxreads,
                                                            VERBOSE=VERBOSE)

        print len(reads_all)

        for reads in reads_all:

            # Check insert size
            read = reads[reads[0].is_reverse]
            if (read.is_unmapped or (not read.is_proper_pair) or
                (read.isize < insertsize_range[0]) or
                (read.isize >= insertsize_range[1])):
                continue

            trim_coh(reads, trim=5, include_tests=False)

            for read in reads:
                # Walk the CIGAR, tracking the position within the read.
                # NOTE: reset per read, otherwise the second read of the pair
                # would inherit the offset of the first.
                pos_read = 0
                ip = read.is_read2
                for (bt, bl) in read.cigar:
                    if bt == 1:
                        pos_read += bl
                    elif bt == 2:
                        pass
                    elif bt == 0:
                        qualb = read.qual[pos_read: pos_read + bl]
                        poss_read = np.arange(pos_read, pos_read + bl)
                        if read.is_reverse:
                            poss_read = len(read.seq) - 1 - poss_read

                        for j, qletter in izip(poss_read, qualb):
                            quality[ip][j].append(q_mapping[qletter])

                        # NOTE: advance past the match block too; this step
                        # was missing and left pos_read stuck at 0.
                        pos_read += bl

    for qual in quality:
        for qpos in qual:
            qpos.sort()

    return quality
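# Usage sketch (illustration only): median phred quality along read 1 and
# read 2 for pairs with insert sizes in [400, 1000). The read length and BAM
# path are placeholders.
quality = quality_score_along_reads_mapped(250, 'mapped.bam',
                                           maxreads=1000, randomreads=False)
for ip, qual in enumerate(quality):
    medians = [qpos[len(qpos) // 2] if len(qpos) else 0 for qpos in qual]
    print 'read', ip + 1, 'median quality, first 10 cycles:', medians[:10]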