def make_consensus(data_folder, adaID, fragment, n_iter, qual_min=20,
                   VERBOSE=0, coverage_min=10, summary=True):
    '''Make consensus sequence from the mapped reads'''
    if VERBOSE:
        print 'Build consensus: '+adaID+' '+fragment+' iteration '+str(n_iter)

    # Read reference
    reffilename = get_reference_filename(data_folder, adaID, fragment, n_iter)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    # Open BAM file
    bamfilename = get_mapped_filename(data_folder, adaID, fragment, n_iter)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)

    # NOTE: match_len_min is not a parameter of this function; it is assumed
    # to be a module-level constant.
    (counts, inserts) = \
        get_allele_counts_insertions_from_file_unfiltered(bamfilename,
                                                          len(refseq),
                                                          qual_min=qual_min,
                                                          match_len_min=match_len_min)

    consensus_final = build_consensus(counts, inserts,
                                      coverage_min=coverage_min,
                                      VERBOSE=VERBOSE)

    if summary:
        with open(get_summary_fn(data_folder, adaID, fragment), 'a') as f:
            f.write('Consensus built for iteration '+str(n_iter))
            f.write('\n')

    return refseq, consensus_final
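
# `build_consensus` is called above but defined elsewhere. Below is a minimal
# sketch of the core idea under stated assumptions: `counts` has shape
# (len(alphabet), len(ref)) with rows ordered like `alphabet`, positions with
# coverage below `coverage_min` become 'N', and insertions are ignored (the
# real function also splices in `inserts`). The `_sketch` name is hypothetical.
def build_consensus_sketch(counts, coverage_min=10, alphabet='ACGT-N'):
    '''Sketch: majority-rule consensus from an allele count matrix'''
    import numpy as np
    counts = np.asarray(counts)
    coverage = counts.sum(axis=0)
    # Majority allele per position
    consensus = np.array(list(alphabet))[counts.argmax(axis=0)]
    # Mask positions with insufficient coverage
    consensus[coverage < coverage_min] = 'N'
    # Drop gaps to obtain a plain sequence string
    return ''.join(consensus[consensus != '-'])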
def get_allele_counts(data_folder, adaID, fragment, VERBOSE=0, maxreads=1e10):
    '''Extract allele and insert counts from a bamfile'''
    # Read reference
    reffilename = get_consensus_filename(data_folder, adaID, fragment,
                                         trim_primers=True)
    refseq = SeqIO.read(reffilename, 'fasta')

    # Open BAM file
    # Note: the reads should already be filtered of unmapped stuff at this point
    bamfilename = get_mapped_filename(data_folder, adaID, fragment,
                                      type='bam', filtered=True)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)

    # Call lower-level function
    # NOTE: qual_min is not a parameter of this function; it is assumed to be
    # a module-level constant.
    return get_allele_counts_insertions_from_file(bamfilename, len(refseq),
                                                  qual_min=qual_min,
                                                  maxreads=maxreads,
                                                  VERBOSE=VERBOSE)
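
# `convert_sam_to_bam` is used throughout this module but not defined here. A
# minimal sketch, assuming the convention that the SAM file sits next to the
# BAM file with the extension swapped; the `_sketch` name is hypothetical:
def convert_sam_to_bam_sketch(bamfilename, samfilename=None):
    '''Sketch: convert a SAM file to BAM, deriving the SAM name if needed'''
    import pysam
    if samfilename is None:
        samfilename = bamfilename[:-4] + '.sam'
    with pysam.Samfile(samfilename, 'r') as samfile:
        with pysam.Samfile(bamfilename, 'wb', template=samfile) as bamfile:
            for read in samfile:
                bamfile.write(read)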
def get_insert_size_distribution(data_folder, adaID, fragment, bins=None,
                                 maxreads=-1, VERBOSE=0, density=True):
    '''Get the distribution of insert sizes'''
    if maxreads <= 0:
        maxreads = int(1e6)

    insert_sizes = np.zeros(maxreads, np.int16)

    # Open BAM file
    if fragment == 'premapped':
        bamfilename = get_premapped_filename(data_folder, adaID, type='bam')
    else:
        bamfilename = get_mapped_filename(data_folder, adaID, fragment,
                                          type='bam', filtered=True)

    # Convert from SAM if necessary
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)

    # Open file
    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        # Iterate over read pairs
        n_written = 0
        for i, reads in enumerate(pair_generator(bamfile)):
            if i == maxreads:
                if VERBOSE >= 2:
                    print 'Max reads reached:', maxreads
                break

            # Print output
            if (VERBOSE >= 3) and (not ((i + 1) % 10000)):
                print (i + 1)

            # If unmapped or not properly paired, discard
            if reads[0].is_unmapped or (not reads[0].is_proper_pair) or \
               reads[1].is_unmapped or (not reads[1].is_proper_pair):
                continue

            # Store insert size of the forward read
            # (indexed by n_written, not i, so the trim below keeps exactly
            # the stored values when some pairs are skipped)
            i_fwd = reads[0].is_reverse
            insert_sizes[n_written] = reads[i_fwd].isize
            n_written += 1

    insert_sizes = insert_sizes[:n_written]
    insert_sizes.sort()

    # Bin it
    if bins is None:
        h = np.histogram(insert_sizes, density=density)
    else:
        h = np.histogram(insert_sizes, bins=bins, density=density)

    return insert_sizes, h
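
# `pair_generator` is used here and below but not defined in this module. A
# minimal sketch, assuming the BAM file is sorted by read name so that mates
# are adjacent; the `_sketch` name is hypothetical:
def pair_generator_sketch(bamfile):
    '''Sketch: yield (read1, read2) pairs from a name-sorted BAM iterator'''
    it = iter(bamfile)
    while True:
        try:
            read1 = next(it)
            read2 = next(it)
        except StopIteration:
            break
        yield (read1, read2)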
def get_read_lengths(data_folder, adaID, fragment, VERBOSE=0, maxreads=-1):
    '''Get the read lengths'''
    # Lengths from 1 to 250
    # NOTE: read_types is assumed to be a module-level sequence of the four
    # read types (read1/read2 x forward/reverse).
    lengths = np.zeros((len(read_types), 250), int)

    # Open BAM file
    # Note: the reads should already be filtered of unmapped stuff at this point
    bamfilename = get_mapped_filename(data_folder, adaID, fragment,
                                      type='bam', filtered=True)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        # Iterate over single reads (no linkage info needed)
        for i, read in enumerate(bamfile):

            # Max number of reads
            if i == maxreads:
                if VERBOSE >= 2:
                    print 'Max reads reached:', maxreads
                break

            # Print output
            if (VERBOSE >= 3) and (not ((i + 1) % 10000)):
                print (i + 1)

            # Divide by read 1/2 and forward/reverse
            js = 2 * read.is_read2 + read.is_reverse

            # Increment counter
            lengths[js, read.rlen - 1] += 1

    # Note: we do not delve into CIGARs because the reads are trimmed
    return lengths
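
# Hypothetical usage of get_read_lengths (the run folder, adapter ID and
# fragment below are placeholders, not values from this pipeline):
# lengths = get_read_lengths('/path/to/run/', 'TS2', 'F1', maxreads=100000)
# for js in xrange(lengths.shape[0]):
#     print js, lengths[js].sum(), 'reads, modal length:', lengths[js].argmax() + 1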
def filter_reads(data_folder, adaID, fragment, VERBOSE=0, maxreads=-1,
                 contaminants=None, n_cycles=600,
                 max_mismatches=30, susp_mismatches=20,
                 summary=True, plot=False):
    '''Filter the reads to good chunks'''
    frag_gen = fragment[:2]

    reffilename = get_consensus_filename(data_folder, adaID, frag_gen)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    bamfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                      filtered=False)
    if not os.path.isfile(bamfilename):
        samfilename = get_mapped_filename(data_folder, adaID, frag_gen,
                                          type='sam', filtered=False)
        if os.path.isfile(samfilename):
            convert_sam_to_bam(bamfilename)
        else:
            if VERBOSE >= 1:
                print 'ERROR: '+adaID+', mapped file not found.'
            return

    outfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                      filtered=True)
    suspiciousfilename = get_mapped_suspicious_filename(data_folder, adaID, frag_gen)
    trashfilename = outfilename[:-4]+'_trashed.bam'

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(suspiciousfilename, 'wb', template=bamfile) as suspfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            # Iterate over all pairs
            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_suspect = 0
            n_mismapped_edge = 0
            n_badcigar = 0
            histogram_distance_from_consensus = np.zeros(n_cycles + 1, int)
            binsize = 200
            histogram_dist_along = np.zeros((len(ref) // binsize + 1,
                                             n_cycles + 1), int)
            for irp, reads in enumerate(pair_generator(bamfile)):

                # Limit to the first reads
                if irp == maxreads:
                    break

                # Assign names
                (read1, read2) = reads
                i_fwd = reads[0].is_reverse

                # Check a few things to make sure we are looking at paired reads
                if read1.qname != read2.qname:
                    n_wrongname += 1
                    raise ValueError('Read pair '+str(irp)+': reads have different names!')

                # Ignore unmapped reads
                if read1.is_unmapped or read2.is_unmapped:
                    if VERBOSE >= 2:
                        print 'Read pair '+read1.qname+': unmapped'
                    n_unmapped += 1
                    map(trashfile.write, reads)
                    continue

                # Ignore not properly paired reads (this includes mates sitting on
                # different fragments)
                if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                    if VERBOSE >= 2:
                        print 'Read pair '+read1.qname+': not properly paired'
                    n_unpaired += 1
                    map(trashfile.write, reads)
                    continue

                # Mismappings are sometimes at fragment edges:
                # Check for overhangs beyond the edge
                skip = check_overhanging_reads(reads, len(ref))
                if skip:
                    n_mismapped_edge += 1
                    map(trashfile.write, reads)
                    continue

                # Mismappings are often characterized by many mutations:
                # check the number of mismatches of the whole pair and skip
                # reads with too many
                dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
                histogram_distance_from_consensus[dc.sum()] += 1
                hbin = (reads[i_fwd].pos + reads[i_fwd].isize / 2) // binsize
                histogram_dist_along[hbin, dc.sum()] += 1
                if (dc.sum() > max_mismatches):
                    if VERBOSE >= 2:
                        print n_mutator+1, irp, '{:2.1f}'.format(100.0 * (n_mutator + 1) / (irp + 1))+'%',\
                              'Read pair '+read1.qname+': too many mismatches '+\
                              '('+str(dc[0])+' + '+str(dc[1])+')'
                    n_mutator += 1
                    map(trashfile.write, reads)
                    continue

                # Check for contamination from other PCR plates. Typically,
                # contamination happens for only one fragment, whereas
                # superinfection happens for all. At this stage, we can only
                # give clues about cross-contamination; the rest is done in a
                # script downstream (here we could TAG suspicious reads for
                # contamination)
                elif (dc.sum() > susp_mismatches):
                    if contaminants is not None:
                        skip = check_suspect(reads, contaminants, VERBOSE=VERBOSE)
                    else:
                        skip = True
                    if skip:
                        n_suspect += 1
                        map(suspfile.write, reads)
                        continue

                # Trim the bad CIGARs from the sides, if there are any good ones
                # NOTE: match_len_min and trim_bad_cigars are assumed to be
                # module-level constants.
                skip = trim_bad_cigar(reads, match_len_min=match_len_min,
                                      trim_left=trim_bad_cigars,
                                      trim_right=trim_bad_cigars)
                if skip:
                    n_badcigar += 1
                    map(trashfile.write, reads)
                    continue

                # TODO: we might want to incorporate some more stringent
                # criterion here, to avoid short reads, cross-overhang, etc.

                # Write the output
                n_good += 1
                map(outfile.write, reads)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Mismapped at edge:', n_mismapped_edge
        print 'Many-mutations:', n_mutator
        print 'Suspect contaminations:', n_suspect
        print 'Bad CIGARs:', n_badcigar

    if summary:
        summary_filename = get_filter_mapped_summary_filename(data_folder, adaID, fragment)
        with open(summary_filename, 'a') as f:
            f.write('Filter results: adaID '+adaID+fragment+'\n')
            f.write('Total:\t\t\t'+str(irp + 1)+'\n')
            f.write('Good:\t\t\t'+str(n_good)+'\n')
            f.write('Unmapped:\t\t'+str(n_unmapped)+'\n')
            f.write('Unpaired:\t\t'+str(n_unpaired)+'\n')
            f.write('Mismapped at edge:\t'+str(n_mismapped_edge)+'\n')
            f.write('Many-mutations:\t\t'+str(n_mutator)+'\n')
            f.write('Suspect contaminations:\t'+str(n_suspect)+'\n')
            f.write('Bad CIGARs:\t\t'+str(n_badcigar)+'\n')

    if plot:
        plot_distance_histogram(data_folder, adaID, frag_gen,
                                histogram_distance_from_consensus,
                                savefig=True)
        plot_distance_histogram_sliding_window(data_folder, adaID, frag_gen,
                                               len(ref),
                                               histogram_dist_along,
                                               binsize=binsize,
                                               savefig=True)
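
# `get_distance_from_consensus` is used above; a minimal sketch under the
# assumption that it counts mismatches of each read against the consensus
# array. The sketch assumes a gapless alignment, whereas the real function is
# presumably CIGAR-aware; the `_sketch` name is hypothetical:
def get_distance_from_consensus_sketch(ref, reads):
    '''Sketch: number of mismatches of each read vs the reference array'''
    import numpy as np
    distances = []
    for read in reads:
        seq = np.fromstring(read.seq, 'S1')
        # Compare only the aligned span, ignoring indels
        ref_chunk = ref[read.pos:read.pos + len(seq)]
        distances.append((seq[:len(ref_chunk)] != ref_chunk).sum())
    return np.array(distances, int)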
def map_stampy(data_folder, adaID, fragment, VERBOSE=0, threads=1,
               cluster_time='23:59:59', maxreads=-1, summary=True,
               rescue=False, dry=False):
    '''Map using stampy'''
    frag_gen = fragment[:2]

    if summary:
        summary_filename = get_map_summary_filename(data_folder, adaID, frag_gen,
                                                    rescue=rescue)

    # Set mapping penalty scores: softer for rescues and F3 and F5
    # NOTE: subsrate, stampy_bin, stampy_sensitive, vmem, JOBLOGOUT and
    # JOBLOGERR are assumed to be module-level globals.
    global subsrate
    if rescue:
        subsrate = '0.2'
        stampy_gapopen = 5          # Default: 40
        stampy_gapextend = 1        # Default: 3
    elif frag_gen not in ('F3', 'F5'):
        stampy_gapopen = 60         # Default: 40
        stampy_gapextend = 5        # Default: 3
    else:
        stampy_gapopen = 30         # Default: 40
        stampy_gapextend = 2        # Default: 3

    if VERBOSE:
        print 'Map via stampy: '+adaID+' '+frag_gen

    if not rescue:
        input_filename = get_divided_filename(data_folder, adaID, fragment, type='bam')
        # NOTE: we introduced fragment nomenclature late, e.g. F3a. Check for that
        if not os.path.isfile(input_filename):
            if frag_gen == 'F3':
                input_filename = input_filename.replace('F3a', 'F3')
    else:
        input_filename = get_divided_filename(data_folder, adaID, 'unmapped', type='bam')

    # Check existence of input file, because stampy creates output anyway
    if not os.path.isfile(input_filename):
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (input file for mapping not found).\n')
        raise ValueError(adaID+', fragment '+fragment+': input file not found.')

    # Parallelize if requested
    if threads == 1:
        output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                              type='sam', rescue=rescue)

        # Map
        call_list = [stampy_bin,
                     '-g', get_index_file(data_folder, adaID, frag_gen, ext=False),
                     '-h', get_hash_file(data_folder, adaID, frag_gen, ext=False),
                     '-o', output_filename,
                     '--overwrite',
                     '--substitutionrate='+subsrate,
                     '--gapopen', stampy_gapopen,
                     '--gapextend', stampy_gapextend]
        if stampy_sensitive:
            call_list.append('--sensitive')

        # Take only a (random) subsample: stampy uses the fraction of reads
        # instead of the number
        if maxreads > 0:
            # FIXME: figure out the -s option and the --numrecords option
            call_list.extend(['--numrecords', maxreads])
            #n_pairs_tot = get_number_reads(input_filename, 'bam') / 2
            #frac_pairs = 1.0 * maxreads / n_pairs_tot
            #random_seed = np.random.randint(1e5)
            #call_list.extend(['-s', frac_pairs + random_seed])

        call_list = call_list + ['-M', input_filename]
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)

        if not dry:
            sp.call(call_list)

            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Stampy mapped (single thread).\n')

            output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                                  type='bam', rescue=rescue)
            convert_sam_to_bam(output_filename)
        else:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (single thread).\n')
            if VERBOSE >= 1:
                print 'Dry run works (single thread)'
            return

    else:
        # Submit map scripts to the cluster
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')

        for j in xrange(threads):

            # Get output filename
            output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                                  type='sam', part=(j+1),
                                                  rescue=rescue)
            # Map
            call_list = ['qsub', '-cwd',
                         '-b', 'y',
                         '-S', '/bin/bash',
                         '-o', JOBLOGOUT,
                         '-e', JOBLOGERR,
                         '-N', 'm'+adaID.replace('-', '')+frag_gen+str(j+1),
                         '-l', 'h_rt='+cluster_time,
                         '-l', 'h_vmem='+vmem,
                         stampy_bin,
                         '-g', get_index_file(data_folder, adaID, frag_gen, ext=False),
                         '-h', get_hash_file(data_folder, adaID, frag_gen, ext=False),
                         '-o', output_filename,
                         '--overwrite',
                         '--processpart='+str(j+1)+'/'+str(threads),
                         '--substitutionrate='+subsrate,
                         '--gapopen', stampy_gapopen,
                         '--gapextend', stampy_gapextend]
            if stampy_sensitive:
                call_list.append('--sensitive')
            call_list = call_list + ['-M', input_filename]
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)
            if not dry:
                job_ID = sp.check_output(call_list)
                job_ID = job_ID.split()[2]
                job_IDs[j] = job_ID

        if dry:
            if summary:
                with open(summary_filename, 'a') as f:
                    f.write('Dry run works (multi thread).\n')
            if VERBOSE >= 1:
                print 'Dry run works (multi thread)'
            return

        # Monitor output
        output_file_parts = [get_mapped_filename(data_folder, adaID, frag_gen,
                                                 type='bam', part=(j+1),
                                                 rescue=rescue)
                             for j in xrange(threads)]
        time_wait = 10  # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split('\n')[:-1]  # The last is an empty line
            if len(qstat_output) < 3:
                jobs_done[:] = True
                break
            else:
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10  # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert mapped reads to BAM for merging: adaID '+\
                              adaID+', fragment '+frag_gen+', part '+str(j+1)+\
                              ' of '+str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # We do not need to wait if we did the conversion (it takes
                    # longer than some secs)
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy mapped ('+str(threads)+' threads).\n')

        # Concatenate output files
        output_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                              type='bam', unsorted=True,
                                              rescue=rescue)
        if VERBOSE >= 1:
            print 'Concatenate mapped reads: adaID '+adaID+', fragment '+frag_gen
        pysam.cat('-o', output_filename, *output_file_parts)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names (to ensure the pair_generator)
        output_filename_sorted = get_mapped_filename(data_folder, adaID, frag_gen,
                                                     type='bam', unsorted=False,
                                                     rescue=rescue)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort mapped reads: adaID '+adaID+', fragment '+frag_gen
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file without BAM -> SAM -> BAM
        if VERBOSE >= 1:
            print 'Reheader mapped reads: adaID '+adaID+', fragment '+frag_gen
        header_filename = get_mapped_filename(data_folder, adaID, frag_gen,
                                              type='sam', part=1, rescue=rescue)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

    # FIXME: check whether temp files are all deleted
    if VERBOSE >= 1:
        print 'Remove temporary files: adaID '+adaID+', fragment '+frag_gen
    remove_mapped_tempfiles(data_folder, adaID, frag_gen, VERBOSE=VERBOSE,
                            rescue=rescue)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')
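
# The qstat polling loop above reappears in several functions below. A minimal
# sketch of the same pattern factored into a helper; the name and signature
# are hypothetical, not part of this module. Like the loop above, it treats a
# job ID absent from qstat as finished (so jobs still queueing would need to
# have appeared at least once):
def wait_for_sge_jobs_sketch(job_IDs, poll_interval=10):
    '''Sketch: block until none of the given SGE job IDs shows up in qstat'''
    import time
    import subprocess as sp
    pending = set(job_IDs)
    while pending:
        time.sleep(poll_interval)
        lines = sp.check_output(['qstat']).split('\n')[:-1]
        # The first two lines of qstat output are headers
        if len(lines) < 3:
            break
        running = set(line.split()[0] for line in lines[2:])
        pending &= running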
print 'fragments', fragments

# Iterate over all requested samples
for adaID in adaIDs:
    for fragment in fragments:

        # Read reference
        reffilename = get_consensus_filename(data_folder, adaID, fragment)
        refseq = SeqIO.read(reffilename, 'fasta')
        ref = np.array(refseq)

        # Open BAM
        bamfilename = get_mapped_filename(data_folder, adaID, fragment,
                                          filtered=False)
        if not os.path.isfile(bamfilename):
            convert_sam_to_bam(bamfilename)
        with pysam.Samfile(bamfilename, 'rb') as bamfile:

            # Iterate through reads
            for i, read in enumerate(bamfile):

                # Limit to the first reads
                if i >= maxreads:
                    break

                # Print output
                if VERBOSE and not ((i + 1) % 10000):
                    print (i + 1)

                # Ignore unmapped reads
                if read.is_unmapped or not read.is_proper_pair:
                    continue
for adaID in adaIDs:

    # Read reference (fragmented)
    reffilename = get_consensus_filename(data_folder, adaID, fragment)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    # Read file
    bamfilename = get_mapped_filename(data_folder, adaID, fragment,
                                      type='bam', filtered=True)
    if not os.path.isfile(bamfilename):
        convert_sam_to_bam(bamfilename)
    bamfile = pysam.Samfile(bamfilename, 'rb')

    # Get the coverage for reads which have long insert sizes
    # (to be sure about their identity)
    cov_new = 0
    cov_old = 0
    for i_pairs, reads in enumerate(pair_generator(bamfile)):
        if i_pairs > 5000000:
            break
        if reads[0].isize < 300:
            continue
        for read in reads:
            if read.seq.find(primer_new) != -1:
                cov_new += 1
            # (presumably followed by an analogous check incrementing cov_old)
def premap_stampy(data_folder, adaID, VERBOSE=0, threads=1, summary=True,
                  maxreads=-1, subsrate=0.05, gapopen=40, gapextend=3):
    '''Call stampy for actual mapping'''
    if VERBOSE:
        print 'Premapping: adaID ', adaID

    if summary:
        summary_filename = get_premap_summary_filename(data_folder, adaID)

    # Stampy can handle both gzipped and uncompressed fastq inputs
    input_filenames = get_read_filenames(data_folder, adaID, gzip=True)
    if not os.path.isfile(input_filenames[0]):
        input_filenames = get_read_filenames(data_folder, adaID, gzip=False)
    if not all(map(os.path.isfile, input_filenames)):
        raise OSError('Input files for mapping not found: '+input_filenames[0])

    # Parallelize if requested
    if threads == 1:

        call_list = [stampy_bin,
                     '--overwrite',
                     '-g', get_reference_premap_index_filename(data_folder, adaID, ext=False),
                     '-h', get_reference_premap_hash_filename(data_folder, adaID, ext=False),
                     '-o', get_premapped_filename(data_folder, adaID, type='sam'),
                     '--insertsize=450',
                     '--insertsd=100',
                     '--substitutionrate='+str(subsrate),
                     '--gapopen='+str(gapopen),
                     '--gapextend='+str(gapextend)]
        if maxreads > 0:
            call_list.append('--numrecords='+str(maxreads))
        call_list.extend(['-M'] + input_filenames)
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)
        sp.call(call_list)

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
                f.write('\nStampy premapped (single thread).\n')

        # Convert to compressed BAM
        convert_sam_to_bam(get_premapped_filename(data_folder, adaID, type='bam'))

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('\nSAM file converted to compressed BAM: '+\
                        get_premapped_filename(data_folder, adaID, type='bam')+'\n')

    else:
        # Multithreading works as follows: call qsub + stampy, monitor the process
        # IDs with qstat at regular intervals, and finally merge results with pysam
        output_file_parts = [get_premapped_filename(data_folder, adaID,
                                                    type='bam', part=(j+1))
                             for j in xrange(threads)]

        # Submit map scripts
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')

        # Submit map call
        import hivwholeseq
        JOBDIR = hivwholeseq.__path__[0].rstrip('/')+'/'
        JOBLOGOUT = JOBDIR+'logout'
        JOBLOGERR = JOBDIR+'logerr'
        cluster_time = ['23:59:59', '1:59:59']
        vmem = '8G'
        for j in xrange(threads):
            call_list = ['qsub', '-cwd',
                         '-b', 'y',
                         '-S', '/bin/bash',
                         '-o', JOBLOGOUT,
                         '-e', JOBLOGERR,
                         '-N', adaID+' p'+str(j+1),
                         '-l', 'h_rt='+cluster_time[threads >= 30],
                         '-l', 'h_vmem='+vmem,
                         stampy_bin,
                         '--overwrite',
                         '-g', get_reference_premap_index_filename(data_folder, adaID, ext=False),
                         '-h', get_reference_premap_hash_filename(data_folder, adaID, ext=False),
                         '-o', get_premapped_filename(data_folder, adaID, type='sam', part=(j+1)),
                         '--processpart='+str(j+1)+'/'+str(threads),
                         '--insertsize=450',
                         '--insertsd=100',
                         '--substitutionrate='+str(subsrate),
                         '--gapopen='+str(gapopen),
                         '--gapextend='+str(gapextend),
                         '-M'] + input_filenames
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)
            job_ID = sp.check_output(call_list)
            job_ID = job_ID.split()[2]
            job_IDs[j] = job_ID

        # Monitor output
        time_wait = 10  # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split('\n')[:-1]  # The last is an empty line
            if VERBOSE >= 3:
                print qstat_output
            if len(qstat_output) < 3:
                jobs_done[:] = True
                break
            else:
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10  # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert premapped reads to BAM for merging: adaID '+\
                              adaID+', part '+str(j+1)+' of '+str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # We do not need to wait if we did the conversion (it takes
                    # longer than some secs)
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy premapped ('+str(threads)+' threads).\n')

        # Concatenate output files
        if VERBOSE >= 1:
            print 'Concatenate premapped reads: adaID '+adaID+'...',
        output_filename = get_premapped_filename(data_folder, adaID, type='bam',
                                                 unsorted=True)
        pysam.cat('-o', output_filename, *output_file_parts)
        if VERBOSE >= 1:
            print 'done.'
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names (to ensure the pair_generator)
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort premapped reads: adaID '+adaID
        output_filename_sorted = get_premapped_filename(data_folder, adaID,
                                                        type='bam', unsorted=False)
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file without BAM -> SAM -> BAM
        if VERBOSE >= 1:
            print 'Reheader premapped reads: adaID '+adaID
        header_filename = get_premapped_filename(data_folder, adaID, type='sam',
                                                 part=1)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

    if VERBOSE >= 1:
        print 'Remove temporary files: adaID '+adaID
    remove_premapped_tempfiles(data_folder, adaID, VERBOSE=VERBOSE)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp premapping files removed.\n')
            f.write('\n')
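
# Note on the idiom above: `cluster_time[threads >= 30]` indexes a two-element
# list with a boolean, picking the short queue for highly parallel jobs. An
# equivalent, more explicit form would be:
# cluster_time = '1:59:59' if threads >= 30 else '23:59:59'
#
# Hypothetical invocation (the run folder and adapter ID are placeholders;
# stampy_bin and friends must be configured at module level):
# premap_stampy('/path/to/miseq_run/', 'TS2', VERBOSE=1, threads=1, maxreads=100000)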
def filter_mapped_reads(sample, fragment, PCR=1, maxreads=-1, VERBOSE=0,
                        n_cycles=600, max_mismatches=100, match_len_min=30,
                        trim_bad_cigars=3, summary=True):
    '''Filter the reads to good chunks'''
    pname = sample.patient
    samplename_pat = sample.name
    samplenames_seq = sample.samples_seq.index.tolist()

    if VERBOSE >= 1:
        print 'Filtering reads:', pname, samplename_pat, fragment, PCR

    reffilename = get_initial_reference_filename(pname, fragment)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    outfilename = get_mapped_filtered_filename(pname, samplename_pat, fragment,
                                               type='bam', PCR=PCR,
                                               decontaminated=False)
    trashfilename = outfilename[:-4]+'_trashed.bam'

    infilenames = [get_mapped_to_initial_filename(pname, samplename_pat,
                                                  samplename, fragment,
                                                  type='bam', PCR=PCR)
                   for samplename in samplenames_seq]
    infilenames = filter(os.path.isfile, infilenames)
    if not len(infilenames):
        print ('WARNING: No mapped files found: '+
               ', '.join([pname, samplename_pat, fragment, str(PCR)]))
        return

    # Take reads evenly distributed across sequencing repetitions
    maxreads /= len(infilenames)

    if VERBOSE >= 2:
        print 'Input mapped filenames:',
        if len(infilenames) >= 2:
            print ''
        print '\n'.join(infilenames)

    # Use first file as template for the new bamfile
    infilename = infilenames[0]
    if not os.path.isfile(infilename):
        convert_sam_to_bam(infilename)

    with pysam.Samfile(infilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_badcigar = 0
            n_tiny = 0
            binsize = 200
            hist_distance_from_consensus = np.zeros(n_cycles + 1, int)
            hist_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1), int)

            # Iterate over input files; the first is already open as `bamfile`,
            # every other file is opened (and closed) here
            for infilename in infilenames:
                if infilename != infilenames[0]:
                    file_open = lambda: pysam.Samfile(infilename, 'rb')
                    file_close = lambda f: f.close()
                    if not os.path.isfile(infilename):
                        convert_sam_to_bam(infilename)
                else:
                    file_open = lambda: bamfile
                    file_close = lambda f: None

                try:
                    bamfile = file_open()

                    for irp, reads in enumerate(pair_generator(bamfile)):
                        if irp == maxreads:
                            break

                        pair_type = filter_read_pair(reads, ref,
                                                     hist_distance_from_consensus,
                                                     hist_dist_along,
                                                     binsize,
                                                     max_mismatches=max_mismatches,
                                                     match_len_min=match_len_min,
                                                     trim_bad_cigars=trim_bad_cigars,
                                                     VERBOSE=VERBOSE)

                        if pair_type == 'unmapped':
                            n_unmapped += 1
                            map(trashfile.write, reads)
                        elif pair_type == 'unpaired':
                            n_unpaired += 1
                            map(trashfile.write, reads)
                        elif pair_type == 'mutator':
                            n_mutator += 1
                            map(trashfile.write, reads)
                        elif pair_type == 'bad_cigar':
                            n_badcigar += 1
                            map(trashfile.write, reads)
                        elif pair_type == 'tiny':
                            n_tiny += 1
                            map(trashfile.write, reads)
                        else:
                            n_good += 1
                            map(outfile.write, reads)

                finally:
                    file_close(bamfile)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Many-mutations:', n_mutator
        print 'Bad CIGARs:', n_badcigar
        print 'Tiny:', n_tiny
        print

    if summary:
        sfn = get_filter_mapped_init_summary_filename(pname, samplename_pat,
                                                      fragment, PCR=PCR)
        with open(sfn, 'a') as f:
            f.write('Filter results: pname '+pname+', '+samplename_pat+', '+fragment+'\n')
            f.write('Total:\t\t\t'+str(irp + 1)+'\n')
            f.write('Good:\t\t\t'+str(n_good)+'\n')
            f.write('Unmapped:\t\t'+str(n_unmapped)+'\n')
            f.write('Unpaired:\t\t'+str(n_unpaired)+'\n')
            f.write('Many-mutations:\t\t'+str(n_mutator)+'\n')
            f.write('Bad CIGARs:\t\t'+str(n_badcigar)+'\n')
            f.write('Tiny:\t\t\t'+str(n_tiny)+'\n')
def trim_and_divide_reads(data_folder, adaID, n_cycles, fragments,
                          maxreads=-1, VERBOSE=0, minisize=100,
                          include_tests=False, summary=True):
    '''Trim reads and divide them into fragments'''
    if VERBOSE:
        print 'Trim and divide into fragments: adaID '+adaID+', fragments: '+\
              ' '.join(fragments)

    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Fragments used: '+' '.join(fragments)+'\n')

    ref_filename = get_reference_premap_filename(data_folder, adaID)
    refseq = SeqIO.read(ref_filename, 'fasta')
    smat = np.array(refseq, 'S1')
    len_reference = len(refseq)

    # Get the positions of fragment start/end, w/ and w/o primers
    frags_pos = get_fragment_positions(smat, fragments)
    store_reference_fragmented(data_folder, adaID, refseq,
                               dict(zip(fragments, frags_pos['trim'])))
    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('Primer positions (for fragments):\n')
            for (fragment, poss_full, poss_trim) in izip(fragments,
                                                         frags_pos['full'],
                                                         frags_pos['trim']):
                f.write(fragment+': fwd '+str(poss_full[0])+' '+str(poss_trim[0])+\
                        ', rev '+str(poss_trim[1])+' '+str(poss_full[1])+'\n')
    write_fragment_positions(data_folder, adaID, fragments, frags_pos)

    # Get the positions of the unwanted outer primers (in case we DO nested PCR
    # for that fragment)
    # NOTE: the LTRs are not a problem, because the rev outer primer of F6
    # is not in the reference anymore if F6 has undergone nested PCR
    # FIXME: this might not work if we have mixed fragments (e.g. F5a+b) AND nesting
    from re import findall
    primers_out = {'fwd': [], 'rev': []}
    for i, fr in enumerate(fragments):
        if (i != 0) and findall(r'F[2-6][a-z]?i', fr):
            primers_out['fwd'].append(fr[:-1]+'o')
        if (i != len(fragments) - 1) and findall(r'F[1-5][a-z]?i', fr):
            primers_out['rev'].append(fr[:-1]+'o')

    # Get all possible unambiguous primers for the unwanted outer primers
    from hivwholeseq.data.primers import primers_PCR
    from hivwholeseq.utils.sequence import expand_ambiguous_seq as eas
    primers_out_seq = {'fwd': [np.array(map(list, eas(primers_PCR[fr][0])),
                                        'S1', ndmin=2)
                               for fr in primers_out['fwd']],
                       'rev': [np.array(map(list, eas(primers_PCR[fr][1])),
                                        'S1', ndmin=2)
                               for fr in primers_out['rev']],
                      }
    primers_out_pos = {'fwd': [], 'rev': []}
    if primers_out['fwd']:
        primers_out_pos['fwd'] = map(itemgetter(0),
                                     get_primer_positions(smat,
                                                          primers_out['fwd'],
                                                          'fwd'))
    if primers_out['rev']:
        primers_out_pos['rev'] = map(itemgetter(1),
                                     get_primer_positions(smat,
                                                          primers_out['rev'],
                                                          'rev'))

    # Input and output files
    input_filename = get_premapped_filename(data_folder, adaID, type='bam')
    if not os.path.isfile(input_filename):
        convert_sam_to_bam(input_filename)
    output_filenames = get_divided_filenames(data_folder, adaID, fragments,
                                             type='bam')
    with pysam.Samfile(input_filename, 'rb') as bamfile:
        try:
            file_handles = [pysam.Samfile(ofn, 'wb', template=bamfile)
                            for ofn in output_filenames[:len(fragments)]]
            fo_am = pysam.Samfile(output_filenames[-4], 'wb', template=bamfile)
            fo_cm = pysam.Samfile(output_filenames[-3], 'wb', template=bamfile)
            fo_um = pysam.Samfile(output_filenames[-2], 'wb', template=bamfile)
            fo_lq = pysam.Samfile(output_filenames[-1], 'wb', template=bamfile)

            # Iterate over the mapped reads and assign fragments
            n_mapped = [0 for fragment in fragments]
            n_unmapped = 0
            n_crossfrag = 0
            n_ambiguous = 0
            n_outer = 0
            n_lowq = 0
            for irp, reads in enumerate(pair_generator(bamfile)):

                if irp == maxreads:
                    if VERBOSE:
                        print 'Maximal number of read pairs reached:', maxreads
                    break

                if VERBOSE >= 2:
                    if not ((irp+1) % 10000):
                        print irp+1

                i_fwd = reads[0].is_reverse

                # If unmapped or unpaired, too short, or with a tiny insert
                # size, discard
                if reads[0].is_unmapped or (not reads[0].is_proper_pair) or \
                   reads[1].is_unmapped or (not reads[1].is_proper_pair) or \
                   (reads[0].rlen < 50) or (reads[1].rlen < 50) or \
                   (reads[i_fwd].isize < minisize):
                    if VERBOSE >= 3:
                        print 'Read pair unmapped/unpaired/tiny/divergent:', reads[0].qname
                    n_unmapped += 1
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # If the insert is a misamplification from the outer primers
                # in fragments that underwent nested PCR,
                # trash it (it will have skewed amplification anyway). We cannot
                # find all of those, rather only the ones still carrying the
                # primer itself (some others have lost it while shearing). For
                # those, no matter what happens at the end (reading into adapters,
                # etc.), ONE of the reads in the pair will start exactly with one
                # outer primer: if the rev read with a rev primer, if the fwd
                # with a fwd one. Test all six.
                if (len(primers_out_pos['fwd']) or len(primers_out_pos['rev'])) and \
                   test_outer_primer(reads, primers_out_pos, primers_out_seq,
                                     len_reference):
                    if VERBOSE >= 3:
                        print 'Read pair from outer primer:', reads[0].qname
                    n_outer += 1
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # FIXME: the following becomes a bit harder when we mix parallel
                # PCRs, e.g. F5a+b, to get more product

                # Assign to a fragment now, so that primer trimming is faster
                pair_identity = assign_to_fragment(reads, frags_pos['full'],
                                                   VERBOSE=VERBOSE)

                # 1. If no fragments are possible (e.g. one read crosses the
                # fragment boundary, they map to different fragments), dump it
                # into a special bucket
                if pair_identity == 'cross':
                    n_crossfrag += 1
                    fo_cm.write(reads[0])
                    fo_cm.write(reads[1])
                    continue

                # 2. If 2+ fragments are possible (tie), put into a special bucket
                # (essentially excluded, because we want two independent measurements
                # in the overlapping region, but we might want to recover them)
                elif pair_identity == 'ambiguous':
                    n_ambiguous += 1
                    fo_am.write(reads[0])
                    fo_am.write(reads[1])
                    continue

                # 3. If the intersection is a single fragment, good: trim the primers
                # NB: n_frag is the index IN THE POOL. If we sequence only F2-F5,
                # F2 is n_frag = 0
                n_frag = int(pair_identity)
                frag_pos = frags_pos['trim'][n_frag]
                if not np.isscalar(frag_pos[0]):
                    frag_pos = [frag_pos[0]['inner'], frag_pos[1]['inner']]
                trashed_primers = trim_primers(reads, frag_pos,
                                               include_tests=include_tests)
                if trashed_primers or (reads[i_fwd].isize < 100):
                    n_unmapped += 1
                    if VERBOSE >= 3:
                        print 'Read pair is mismapped:', reads[0].qname
                    fo_um.write(reads[0])
                    fo_um.write(reads[1])
                    continue

                # Quality trimming: if no decently long pair survives, trash
                #trashed_quality = main_block_low_quality(reads, phred_min=20,
                #                                         include_tests=include_tests)
                trashed_quality = trim_low_quality(reads, phred_min=20,
                                                   include_tests=include_tests)
                if trashed_quality or (reads[i_fwd].isize < 100):
                    n_lowq += 1
                    if VERBOSE >= 3:
                        print 'Read pair has low phred quality:', reads[0].qname
                    fo_lq.write(reads[0])
                    fo_lq.write(reads[1])
                    continue

                # Check for cross-overhangs or COH (reading into the adapters)
                #        --------------->
                #    <-----------
                # In that case, trim to perfect overlap.
                if test_coh(reads, VERBOSE=False):
                    trim_coh(reads, trim=0, include_tests=include_tests)

                # Change coordinates into the fragmented reference (primer-trimmed)
                for read in reads:
                    read.pos -= frag_pos[0]
                    read.mpos -= frag_pos[0]

                # Here the tests
                if include_tests:
                    lfr = frags_pos['trim'][n_frag][1] - frags_pos['trim'][n_frag][0]
                    if test_sanity(reads, n_frag, lfr):
                        print 'Tests failed:', reads[0].qname
                        import ipdb; ipdb.set_trace()

                # There we go!
                n_mapped[n_frag] += 1
                file_handles[n_frag].write(reads[0])
                file_handles[n_frag].write(reads[1])

        finally:
            for f in file_handles:
                f.close()
            fo_am.close()
            fo_cm.close()
            fo_um.close()
            fo_lq.close()

    if VERBOSE:
        print 'Trim and divide results: adaID '+adaID
        print 'Total:\t\t', irp
        print 'Mapped:\t\t', sum(n_mapped), n_mapped
        print 'Unmapped/unpaired/tiny:\t', n_unmapped
        print 'Outer primer\t', n_outer
        print 'Crossfrag:\t', n_crossfrag
        print 'Ambiguous:\t', n_ambiguous
        print 'Low-quality:\t', n_lowq

    # Write summary to file
    if summary:
        with open(get_divide_summary_filename(data_folder, adaID), 'a') as f:
            f.write('\n')
            f.write('Trim and divide results: adaID '+adaID+'\n')
            f.write('Total:\t\t'+str(irp + 1)+'\n')
            f.write('Mapped:\t\t'+str(sum(n_mapped))+' '+str(n_mapped)+'\n')
            f.write('Unmapped/unpaired/tiny insert:\t'+str(n_unmapped)+'\n')
            f.write('Outer primer\t'+str(n_outer)+'\n')
            f.write('Crossfrag:\t'+str(n_crossfrag)+'\n')
            f.write('Ambiguous:\t'+str(n_ambiguous)+'\n')
            f.write('Low-quality:\t'+str(n_lowq)+'\n')
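
# `assign_to_fragment` is called above but defined elsewhere. A minimal sketch
# of the assumed semantics (the `_sketch` name is hypothetical; the real
# function is presumably CIGAR-aware, whereas this approximates the aligned
# span by pos + rlen, ignoring indels):
def assign_to_fragment_sketch(reads, fragpos_full):
    '''Sketch: assign a read pair to a fragment by coordinate containment'''
    candidates = []
    for n_frag, (start, end) in enumerate(fragpos_full):
        if all((read.pos >= start) and (read.pos + read.rlen <= end)
               for read in reads):
            candidates.append(n_frag)
    if len(candidates) == 0:
        return 'cross'        # pair straddles a fragment boundary
    elif len(candidates) > 1:
        return 'ambiguous'    # pair fits two or more overlapping fragments
    # Return the index as a string, matching int(pair_identity) above
    return str(candidates[0])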
def map_stampy_multithread(sample, fragment, VERBOSE=0, threads=2, summary=True,
                           filtered=True):
    '''Map using stampy, multithread (via cluster requests, queueing race conditions possible)'''
    import hivwholeseq
    JOBDIR = hivwholeseq.__path__[0].rstrip('/')+'/'
    JOBLOGOUT = JOBDIR+'logout/'
    JOBLOGERR = JOBDIR+'logerr/'
    cluster_time = ['23:59:59', '0:59:59']
    vmem = '8G'

    # NOTE: the body below uses `patient` and `samplename`, which are not among
    # the parameters; they are assumed to come from the enclosing scope (the
    # `sample` parameter is rebound here).
    pname = patient.id
    sample = patient.sample_table.loc[samplename]
    seq_run = sample['run']
    data_folder = MiSeq_runs[seq_run]['folder']
    adaID = sample['adaID']

    if VERBOSE:
        print 'Map via stampy: '+pname+' '+samplename+' '+fragment

    if summary:
        summary_filename = get_map_initial_summary_filename(pname, samplename,
                                                            fragment)

    # Specific fragment (e.g. F5 --> F5bi)
    frag_spec = filter(lambda x: fragment in x, sample['fragments'])
    if not len(frag_spec):
        raise ValueError(str(patient)+', '+samplename+': fragment '+fragment+' not found.')
    frag_spec = frag_spec[0]

    input_filename = get_input_filename(data_folder, adaID, frag_spec, type='bam')

    # Submit map scripts in parallel to the cluster
    jobs_done = np.zeros(threads, bool)
    job_IDs = np.zeros(threads, 'S30')
    for j in xrange(threads):

        output_filename = get_mapped_to_initial_filename(pname, samplename,
                                                         fragment,
                                                         type='sam', part=(j+1))

        # Map
        call_list = ['qsub', '-cwd',
                     '-b', 'y',
                     '-S', '/bin/bash',
                     '-o', JOBLOGOUT,
                     '-e', JOBLOGERR,
                     '-N', 'm '+samplename+fragment+' p'+str(j+1),
                     '-l', 'h_rt='+cluster_time[threads >= 10],
                     '-l', 'h_vmem='+vmem,
                     stampy_bin,
                     '--overwrite',
                     '-g', get_initial_index_filename(pname, fragment, ext=False),
                     '-h', get_initial_hash_filename(pname, fragment, ext=False),
                     '-o', output_filename,
                     '--processpart='+str(j+1)+'/'+str(threads),
                     '--substitutionrate='+subsrate,
                     '--gapopen', stampy_gapopen,
                     '--gapextend', stampy_gapextend]
        if stampy_sensitive:
            call_list.append('--sensitive')
        call_list = call_list + ['-M', input_filename]

        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)
        job_ID = sp.check_output(call_list)
        job_ID = job_ID.split()[2]
        job_IDs[j] = job_ID

    # Monitor output
    output_file_parts = [get_mapped_to_initial_filename(pname, samplename,
                                                        fragment,
                                                        type='bam', part=(j+1))
                         for j in xrange(threads)]
    time_wait = 10  # secs
    while not jobs_done.all():

        # Sleep some time
        time.sleep(time_wait)

        # Get the output of qstat to check the status of jobs
        qstat_output = sp.check_output(['qstat'])
        qstat_output = qstat_output.split('\n')[:-1]  # The last is an empty line
        if len(qstat_output) < 3:
            jobs_done[:] = True
            break
        else:
            qstat_output = [line.split()[0] for line in qstat_output[2:]]

        time_wait = 10  # secs
        for j in xrange(threads):
            if jobs_done[j]:
                continue

            if job_IDs[j] not in qstat_output:
                # Convert to BAM for merging
                if VERBOSE >= 1:
                    print 'Convert mapped reads to BAM for merging: sample '+\
                          samplename+', part '+str(j+1)+' of '+str(threads)
                convert_sam_to_bam(output_file_parts[j])
                # We do not need to wait if we did the conversion (it takes
                # longer than some secs)
                time_wait = 0
                jobs_done[j] = True

    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Stampy mapped ('+str(threads)+' threads).\n')

    # Concatenate output files
    output_filename = get_mapped_to_initial_filename(pname, samplename, fragment,
                                                     type='bam', unsorted=True)
    if VERBOSE >= 1:
        print 'Concatenate mapped reads: sample '+samplename
    pysam.cat('-o', output_filename, *output_file_parts)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('BAM files concatenated (unsorted).\n')

    # Sort the file by read names (to ensure the pair_generator)
    output_filename_sorted = get_mapped_to_initial_filename(pname, samplename,
                                                            fragment, type='bam')
    # NOTE: we exclude the extension and the option -f because of a bug in samtools
    if VERBOSE >= 1:
        print 'Sort mapped reads: sample '+samplename
    pysam.sort('-n', output_filename, output_filename_sorted[:-4])
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Joint BAM file sorted.\n')

    # Reheader the file without BAM -> SAM -> BAM
    if VERBOSE >= 1:
        print 'Reheader mapped reads: sample '+samplename
    header_filename = get_mapped_to_initial_filename(pname, samplename, fragment,
                                                     type='sam', part=1)
    pysam.reheader(header_filename, output_filename_sorted)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Joint BAM file reheaded.\n')

    if VERBOSE >= 1:
        print 'Remove temporary files: sample '+samplename
    remove_mapped_init_tempfiles(pname, samplename, fragment, VERBOSE=VERBOSE)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')
def map_stampy_singlethread(sample, fragment, VERBOSE=0, n_pairs=-1,
                            summary=True, only_chunk=None, filtered=True):
    '''Map using stampy, single thread (no cluster queueing race conditions)'''
    pname = sample.patient
    samplename_pat = sample['patient sample']
    seq_run = sample['seq run']
    data_folder = sample.sequencing_run['folder']
    adaID = sample['adapter']
    PCR = int(sample.PCR)

    # NOTE: `samplename` is used below but is not a parameter; it is assumed
    # to be available from the enclosing scope (e.g. the sample's name).
    if VERBOSE:
        print 'Map via stampy (single thread): '+samplename+' '+fragment

    if summary:
        summary_filename = get_map_initial_summary_filename(pname, samplename_pat,
                                                            samplename, fragment,
                                                            PCR=PCR)

    # Specific fragment (e.g. F5 --> F5bi)
    frag_spec = filter(lambda x: fragment in x, sample.regions_complete)
    if not len(frag_spec):
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (specific fragment for '+fragment+' not found).\n')
        raise ValueError(samplename+': fragment '+fragment+' not found.')
    else:
        frag_spec = frag_spec[0]

    input_filename = get_input_filename(data_folder, adaID, frag_spec, type='bam',
                                        only_chunk=only_chunk, filtered=filtered)

    # NOTE: we introduced fragment nomenclature late, e.g. F3a. Check for that
    if not os.path.isfile(input_filename):
        if fragment == 'F3':
            input_filename = input_filename.replace('F3a', 'F3')

    # Check existence of input file, because stampy creates output anyway
    if not os.path.isfile(input_filename):
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (input file for mapping not found).\n')
        raise ValueError(samplename+', fragment '+fragment+': input file not found.')

    # Extract subsample of reads if requested
    if n_pairs > 0:
        from hivwholeseq.utils.mapping import extract_mapped_reads_subsample
        input_filename_sub = get_mapped_to_initial_filename(pname, samplename_pat,
                                                            samplename, fragment,
                                                            PCR=PCR,
                                                            type='bam')[:-4]+\
                             '_unmapped.bam'
        n_written = extract_mapped_reads_subsample(input_filename,
                                                   input_filename_sub,
                                                   n_pairs, VERBOSE=VERBOSE)

    # Get output filename
    output_filename = get_mapped_to_initial_filename(pname, samplename_pat,
                                                     samplename, fragment,
                                                     PCR=PCR, type='sam',
                                                     only_chunk=only_chunk)

    # Map
    # NOTE: subsrate, stampy_gapopen, stampy_gapextend, stampy_bin and
    # stampy_sensitive are assumed to be module-level globals.
    call_list = [stampy_bin,
                 '-g', get_initial_index_filename(pname, fragment, ext=False),
                 '-h', get_initial_hash_filename(pname, fragment, ext=False),
                 '-o', output_filename,
                 '--overwrite',
                 '--substitutionrate='+subsrate,
                 '--gapopen', stampy_gapopen,
                 '--gapextend', stampy_gapextend]
    if stampy_sensitive:
        call_list.append('--sensitive')

    if n_pairs > 0:
        call_list = call_list + ['-M', input_filename_sub]
    else:
        call_list = call_list + ['-M', input_filename]
    call_list = map(str, call_list)
    if VERBOSE >= 2:
        print ' '.join(call_list)
    sp.call(call_list)

    output_filename_bam = get_mapped_to_initial_filename(pname, samplename_pat,
                                                         samplename, fragment,
                                                         type='bam', PCR=PCR,
                                                         only_chunk=only_chunk)
    convert_sam_to_bam(output_filename_bam)

    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Stampy mapped (single thread).\n')

    if only_chunk is None:
        if VERBOSE >= 1:
            print 'Remove temporary files: sample '+samplename
        remove_mapped_init_tempfiles(pname, samplename_pat,
                                     samplename, fragment,
                                     PCR=PCR, VERBOSE=VERBOSE,
                                     only_chunk=only_chunk)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Temp mapping files removed.\n')
                f.write('\n')

    if n_pairs > 0:
        os.remove(input_filename_sub)
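
# `extract_mapped_reads_subsample` (imported above) is not shown in this
# module. A minimal sketch of the interface under stated assumptions: it
# copies the first n_pairs read pairs rather than the random subsample the
# real helper presumably draws, and it relies on the pair_generator sketch
# from earlier; all `_sketch` names are hypothetical.
def extract_mapped_reads_subsample_sketch(input_filename, output_filename,
                                          n_pairs, VERBOSE=0):
    '''Sketch: copy the first n_pairs read pairs into a new BAM'''
    import pysam
    n_written = 0
    with pysam.Samfile(input_filename, 'rb') as bamfile:
        with pysam.Samfile(output_filename, 'wb', template=bamfile) as outfile:
            for reads in pair_generator_sketch(bamfile):
                if n_written == n_pairs:
                    break
                outfile.write(reads[0])
                outfile.write(reads[1])
                n_written += 1
    if VERBOSE >= 1:
        print 'Read pairs written:', n_written
    return n_written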
def premap_stampy(data_folder, adaID, VERBOSE=0, threads=1, summary=True,
                  maxreads=-1, subsrate=0.05, gapopen=40, gapextend=3):
    '''Call stampy for actual mapping'''
    if VERBOSE:
        print 'Premapping: adaID', adaID

    if summary:
        summary_filename = get_premap_summary_filename(data_folder, adaID)

    # Stampy can handle both gzipped and uncompressed fastq inputs
    input_filenames = get_read_filenames(data_folder, adaID, gzip=True)
    if not os.path.isfile(input_filenames[0]):
        input_filenames = get_read_filenames(data_folder, adaID, gzip=False)
    if not all(map(os.path.isfile, input_filenames)):
        raise OSError('Input files for mapping not found: ' + input_filenames[0])

    # Parallelize if requested
    if threads == 1:
        call_list = [stampy_bin,
                     '--overwrite',
                     '-g', get_reference_premap_index_filename(data_folder, adaID, ext=False),
                     '-h', get_reference_premap_hash_filename(data_folder, adaID, ext=False),
                     '-o', get_premapped_filename(data_folder, adaID, type='sam'),
                     '--insertsize=450',
                     '--insertsd=100',
                     '--substitutionrate=' + str(subsrate),
                     '--gapopen=' + str(gapopen),
                     '--gapextend=' + str(gapextend)]
        if maxreads > 0:
            call_list.append('--numrecords=' + str(maxreads))
        call_list.extend(['-M'] + input_filenames)
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)
        sp.call(call_list)

        if summary:
            with open(get_premap_summary_filename(data_folder, adaID), 'a') as f:
                f.write('\nStampy premapped (single thread).\n')

        # Convert to compressed BAM
        convert_sam_to_bam(get_premapped_filename(data_folder, adaID, type='bam'))

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('\nSAM file converted to compressed BAM: ' +
                        get_premapped_filename(data_folder, adaID, type='bam') + '\n')

    else:
        # Multithreading works as follows: call qsub + stampy, monitor the
        # job IDs with qstat at regular intervals, and finally merge the
        # results with pysam
        output_file_parts = [get_premapped_filename(data_folder, adaID,
                                                    type='bam', part=(j + 1))
                             for j in xrange(threads)]

        # Keep track of submitted jobs
        jobs_done = np.zeros(threads, bool)
        job_IDs = np.zeros(threads, 'S30')

        # Submit one map call per thread
        import hivwholeseq
        JOBDIR = hivwholeseq.__path__[0].rstrip('/') + '/'
        JOBLOGOUT = JOBDIR + 'logout'
        JOBLOGERR = JOBDIR + 'logerr'
        cluster_time = ['23:59:59', '1:59:59']
        vmem = '8G'
        for j in xrange(threads):
            call_list = ['qsub', '-cwd',
                         '-b', 'y',
                         '-S', '/bin/bash',
                         '-o', JOBLOGOUT,
                         '-e', JOBLOGERR,
                         # SGE job names must not contain whitespace
                         '-N', adaID + '_p' + str(j + 1),
                         '-l', 'h_rt=' + cluster_time[threads >= 30],
                         '-l', 'h_vmem=' + vmem,
                         stampy_bin,
                         '--overwrite',
                         '-g', get_reference_premap_index_filename(data_folder, adaID, ext=False),
                         '-h', get_reference_premap_hash_filename(data_folder, adaID, ext=False),
                         '-o', get_premapped_filename(data_folder, adaID, type='sam', part=(j + 1)),
                         '--processpart=' + str(j + 1) + '/' + str(threads),
                         '--insertsize=450',
                         '--insertsd=100',
                         '--substitutionrate=' + str(subsrate),
                         '--gapopen=' + str(gapopen),
                         '--gapextend=' + str(gapextend),
                         '-M'] + input_filenames
            call_list = map(str, call_list)
            if VERBOSE >= 2:
                print ' '.join(call_list)
            job_ID = sp.check_output(call_list)
            job_ID = job_ID.split()[2]
            job_IDs[j] = job_ID

        # Monitor the jobs until they all disappear from the qstat listing
        time_wait = 10  # secs
        while not jobs_done.all():

            # Sleep some time
            time.sleep(time_wait)

            # Get the output of qstat to check the status of jobs
            qstat_output = sp.check_output(['qstat'])
            qstat_output = qstat_output.split('\n')[:-1]  # The last is an empty line
            if VERBOSE >= 3:
                print qstat_output
            if len(qstat_output) < 3:
                # No jobs left in the queue at all
                jobs_done[:] = True
                break
            else:
                # Keep only the job ID column (the first two lines are a header)
                qstat_output = [line.split()[0] for line in qstat_output[2:]]

            time_wait = 10  # secs
            for j in xrange(threads):
                if jobs_done[j]:
                    continue

                if job_IDs[j] not in qstat_output:
                    # Convert to BAM for merging
                    if VERBOSE >= 1:
                        print 'Convert premapped reads to BAM for merging: adaID ' + \
                              adaID + ', part ' + str(j + 1) + ' of ' + str(threads)
                    convert_sam_to_bam(output_file_parts[j])
                    # We do not need to wait again if we did a conversion
                    # (it takes longer than a few seconds anyway)
                    time_wait = 0
                    jobs_done[j] = True

        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Stampy premapped (' + str(threads) + ' threads).\n')

        # Concatenate output files
        if VERBOSE >= 1:
            print 'Concatenate premapped reads: adaID ' + adaID + '...',
        output_filename = get_premapped_filename(data_folder, adaID, type='bam',
                                                 unsorted=True)
        pysam.cat('-o', output_filename, *output_file_parts)
        if VERBOSE >= 1:
            print 'done.'
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('BAM files concatenated (unsorted).\n')

        # Sort the file by read names, so that pair_generator finds both
        # mates of a pair on adjacent lines
        # NOTE: we exclude the extension and the option -f because of a bug in samtools
        if VERBOSE >= 1:
            print 'Sort premapped reads: adaID ' + adaID
        output_filename_sorted = get_premapped_filename(data_folder, adaID,
                                                        type='bam', unsorted=False)
        pysam.sort('-n', output_filename, output_filename_sorted[:-4])
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file sorted.\n')

        # Reheader the file without going BAM -> SAM -> BAM
        if VERBOSE >= 1:
            print 'Reheader premapped reads: adaID ' + adaID
        header_filename = get_premapped_filename(data_folder, adaID, type='sam', part=1)
        pysam.reheader(header_filename, output_filename_sorted)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Joint BAM file reheaded.\n')

        if VERBOSE >= 1:
            print 'Remove temporary files: adaID ' + adaID
        remove_premapped_tempfiles(data_folder, adaID, VERBOSE=VERBOSE)
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Temp premapping files removed.\n')
                f.write('\n')
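# The name-sort above exists so that pair_generator, used throughout this
# module, can walk the BAM file two reads at a time. That helper is defined
# elsewhere in the package; the following is only a minimal sketch of its
# assumed behavior (a name-sorted file of primary alignments, mates sharing
# a query name), not the verbatim implementation.
def pair_generator(bamfile):
    '''Yield (read1, read2) mate pairs from a name-sorted BAM file (sketch)'''
    read1 = None
    for read in bamfile:
        if read1 is None:
            read1 = read
            continue
        if read.qname == read1.qname:
            # Mates are adjacent in a name-sorted file
            yield (read1, read)
            read1 = None
        else:
            # Orphan read (mate missing): drop it and start a new pair
            read1 = read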
def filter_reads(data_folder, adaID, fragment, VERBOSE=0, maxreads=-1,
                 contaminants=None, n_cycles=600, max_mismatches=30,
                 susp_mismatches=20, summary=True, plot=False):
    '''Filter the mapped reads, sorting good pairs from suspect and trashed ones'''
    frag_gen = fragment[:2]

    reffilename = get_consensus_filename(data_folder, adaID, frag_gen)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    bamfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                      filtered=False)
    if not os.path.isfile(bamfilename):
        samfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='sam',
                                          filtered=False)
        if os.path.isfile(samfilename):
            convert_sam_to_bam(bamfilename)
        else:
            if VERBOSE >= 1:
                print 'ERROR: ' + adaID + ', mapped file not found.'
            return

    outfilename = get_mapped_filename(data_folder, adaID, frag_gen, type='bam',
                                      filtered=True)
    suspiciousfilename = get_mapped_suspicious_filename(data_folder, adaID, frag_gen)
    trashfilename = outfilename[:-4] + '_trashed.bam'

    with pysam.Samfile(bamfilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(suspiciousfilename, 'wb', template=bamfile) as suspfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            # Iterate over all pairs
            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_suspect = 0
            n_mismapped_edge = 0
            n_badcigar = 0
            histogram_distance_from_consensus = np.zeros(n_cycles + 1, int)
            binsize = 200
            histogram_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1), int)
            for irp, reads in enumerate(pair_generator(bamfile)):

                # Limit to the first reads if requested
                if irp == maxreads:
                    break

                # Assign names
                (read1, read2) = reads
                i_fwd = reads[0].is_reverse

                # Check a few things to make sure we are looking at paired reads
                if read1.qname != read2.qname:
                    n_wrongname += 1
                    raise ValueError('Read pair ' + str(irp) + ': reads have different names!')

                # Ignore unmapped reads
                if read1.is_unmapped or read2.is_unmapped:
                    if VERBOSE >= 2:
                        print 'Read pair ' + read1.qname + ': unmapped'
                    n_unmapped += 1
                    map(trashfile.write, reads)
                    continue

                # Ignore not properly paired reads (this includes mates sitting on
                # different fragments)
                if (not read1.is_proper_pair) or (not read2.is_proper_pair):
                    if VERBOSE >= 2:
                        print 'Read pair ' + read1.qname + ': not properly paired'
                    n_unpaired += 1
                    map(trashfile.write, reads)
                    continue

                # Mismappings are sometimes at fragment edges:
                # check for overhangs beyond the edge
                skip = check_overhanging_reads(reads, len(ref))
                if skip:
                    n_mismapped_edge += 1
                    map(trashfile.write, reads)
                    continue

                # Mismappings are often characterized by many mutations:
                # check the number of mismatches of the whole pair and
                # skip reads with too many
                dc = get_distance_from_consensus(ref, reads, VERBOSE=VERBOSE)
                histogram_distance_from_consensus[dc.sum()] += 1
                hbin = (reads[i_fwd].pos + reads[i_fwd].isize / 2) // binsize
                histogram_dist_along[hbin, dc.sum()] += 1
                if dc.sum() > max_mismatches:
                    if VERBOSE >= 2:
                        print n_mutator + 1, irp, \
                              '{:2.1f}'.format(100.0 * (n_mutator + 1) / (irp + 1)) + '%', \
                              'Read pair ' + read1.qname + ': too many mismatches ' + \
                              '(' + str(dc[0]) + ' + ' + str(dc[1]) + ')'
                    n_mutator += 1
                    map(trashfile.write, reads)
                    continue

                # Check for contamination from other PCR plates. Typically,
                # contamination happens for only one fragment, whereas
                # superinfection happens for all. At this stage we can only
                # flag suspect reads; the rest is done in a downstream script
                # (here we could TAG suspicious reads for contamination)
                elif dc.sum() > susp_mismatches:
                    if contaminants is not None:
                        skip = check_suspect(reads, contaminants, VERBOSE=VERBOSE)
                    else:
                        skip = True
                    if skip:
                        n_suspect += 1
                        map(suspfile.write, reads)
                        continue

                # Trim the bad CIGARs from the sides, if there are any good ones
                skip = trim_bad_cigar(reads, match_len_min=match_len_min,
                                      trim_left=trim_bad_cigars,
                                      trim_right=trim_bad_cigars)
                if skip:
                    n_badcigar += 1
                    map(trashfile.write, reads)
                    continue

                # TODO: we might want to incorporate some more stringent
                # criterion here, to avoid short reads, cross-overhang, etc.

                # Write the output
                n_good += 1
                map(outfile.write, reads)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Mismapped at edge:', n_mismapped_edge
        print 'Many-mutations:', n_mutator
        print 'Suspect contaminations:', n_suspect
        print 'Bad CIGARs:', n_badcigar

    if summary:
        summary_filename = get_filter_mapped_summary_filename(data_folder, adaID, fragment)
        with open(summary_filename, 'a') as f:
            f.write('Filter results: adaID ' + adaID + ', fragment ' + fragment + '\n')
            f.write('Total:\t\t\t' + str(irp + 1) + '\n')
            f.write('Good:\t\t\t' + str(n_good) + '\n')
            f.write('Unmapped:\t\t' + str(n_unmapped) + '\n')
            f.write('Unpaired:\t\t' + str(n_unpaired) + '\n')
            f.write('Mismapped at edge:\t' + str(n_mismapped_edge) + '\n')
            f.write('Many-mutations:\t\t' + str(n_mutator) + '\n')
            f.write('Suspect contaminations:\t' + str(n_suspect) + '\n')
            f.write('Bad CIGARs:\t\t' + str(n_badcigar) + '\n')

    if plot:
        plot_distance_histogram(data_folder, adaID, frag_gen,
                                histogram_distance_from_consensus,
                                savefig=True)
        plot_distance_histogram_sliding_window(data_folder, adaID, frag_gen,
                                               len(ref),
                                               histogram_dist_along,
                                               binsize=binsize,
                                               savefig=True)
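# A minimal usage sketch of the two steps above for one sample. The folder
# path, adapter ID, and fragment names are hypothetical placeholders, and
# the script assumes fragment-level mapping has already produced the
# unfiltered BAM files that filter_reads expects.
if __name__ == '__main__':

    data_folder = '/path/to/sequencing/run/'  # hypothetical location
    adaID = 'TS2'                             # hypothetical adapter ID

    # Step 1: premap all reads against the rough reference (single thread)
    premap_stampy(data_folder, adaID, VERBOSE=1, threads=1, summary=True)

    # Step 2: after per-fragment mapping, filter each fragment's read pairs
    for fragment in ['F1', 'F2', 'F3', 'F4', 'F5', 'F6']:
        filter_reads(data_folder, adaID, fragment, VERBOSE=1,
                     max_mismatches=30, susp_mismatches=20,
                     summary=True, plot=False)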