def get_mapped_filenames(self, fragment, PCR=1):
    '''Get filename(s) of mapped and filtered reads'''
    samples_seq = self.samples_seq
    # NOTE: pass the fragment through (it was accepted but unused before),
    # matching the get_mapped_to_initial_filename signature used elsewhere
    fns = [get_mapped_to_initial_filename(self.patient, self.name,
                                          samplename, fragment,
                                          PCR=PCR)
           for samplename, sample in samples_seq.iterrows()]
    return fns
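# Usage sketch (hypothetical: a sample object with the method above; the
# fragment name 'F1' is illustrative). Collect the per-sequencing-run mapped
# BAM paths and keep only those that exist on disk, the same pattern
# filter_mapped_reads uses below:
#
#   fns = sample.get_mapped_filenames('F1', PCR=1)
#   fns = filter(os.path.isfile, fns)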
def get_allele_frequency_trajectories(pname, samples, fragment, qual_min=30,
                                      VERBOSE=0):
    '''Scan the reads of all samples and write to a single file'''
    if VERBOSE >= 1:
        print 'Getting allele frequency trajectories:', pname, fragment

    from hivwholeseq.patients.filenames import get_initial_reference_filename, \
            get_mapped_to_initial_filename, \
            get_allele_frequency_trajectories_filename, \
            get_allele_count_trajectories_filename
    from hivwholeseq.utils.one_site_statistics import \
            get_allele_counts_insertions_from_file, \
            get_allele_counts_insertions_from_file_unfiltered, \
            filter_nus

    refseq = SeqIO.read(get_initial_reference_filename(pname, fragment), 'fasta')

    # Prepare output data structures
    cos_traj = np.zeros((len(samples), len(alpha), len(refseq)), int)
    nus_traj = np.zeros((len(samples), len(alpha), len(refseq)))

    for it, sample in enumerate(samples):
        if VERBOSE >= 2:
            print pname, it, sample

        input_filename = get_mapped_to_initial_filename(pname, sample, fragment,
                                                        type='bam')
        (counts, inserts) = get_allele_counts_insertions_from_file_unfiltered(
            input_filename, len(refseq), qual_min=qual_min, VERBOSE=VERBOSE)

        # Take the total counts, blending in the read types
        cou = counts.sum(axis=0)
        cos_traj[it] = cou

        # Take the filtered frequencies, blending in the read types
        nu = filter_nus(counts)
        nus_traj[it] = nu

    #FIXME: test, etc.

    return (cos_traj, nus_traj)
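# Usage sketch (hypothetical names: patient 'p1' and `samples`, a time-ordered
# list of its sample names). The first returned array holds summed allele
# counts, the second the filtered frequencies; both are indexed as
# (time point, allele in `alpha`, position along the fragment reference):
#
#   cos, nus = get_allele_frequency_trajectories('p1', samples, 'F1',
#                                                qual_min=30, VERBOSE=1)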
def filter_mapped_reads(sample, fragment,
                        PCR=1,
                        maxreads=-1,
                        VERBOSE=0,
                        n_cycles=600,
                        max_mismatches=100,
                        match_len_min=30,
                        trim_bad_cigars=3,
                        summary=True):
    '''Filter the reads to good chunks'''
    pname = sample.patient
    samplename_pat = sample.name
    samplenames_seq = sample.samples_seq.index.tolist()

    if VERBOSE >= 1:
        print 'Filtering reads:', pname, samplename_pat, fragment, PCR

    reffilename = get_initial_reference_filename(pname, fragment)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    outfilename = get_mapped_filtered_filename(pname, samplename_pat, fragment,
                                               type='bam', PCR=PCR,
                                               decontaminated=False)
    trashfilename = outfilename[:-4]+'_trashed.bam'

    infilenames = [get_mapped_to_initial_filename(pname, samplename_pat,
                                                  samplename, fragment,
                                                  type='bam', PCR=PCR)
                   for samplename in samplenames_seq]
    infilenames = filter(os.path.isfile, infilenames)
    if not len(infilenames):
        print ('WARNING: No mapped files found: '+
               ', '.join([pname, samplename_pat, fragment, str(PCR)]))
        return

    # Take reads evenly distributed across sequencing repetitions
    maxreads /= len(infilenames)

    if VERBOSE >= 2:
        print 'Input mapped filenames:',
        if len(infilenames) >= 2:
            print ''
        print '\n'.join(infilenames)

    # Use first file as template for the new bamfile
    infilename = infilenames[0]
    if not os.path.isfile(infilename):
        convert_sam_to_bam(infilename)

    with pysam.Samfile(infilename, 'rb') as bamfile:
        with pysam.Samfile(outfilename, 'wb', template=bamfile) as outfile,\
             pysam.Samfile(trashfilename, 'wb', template=bamfile) as trashfile:

            n_good = 0
            n_wrongname = 0
            n_unmapped = 0
            n_unpaired = 0
            n_mutator = 0
            n_badcigar = 0
            n_tiny = 0
            binsize = 200
            hist_distance_from_consensus = np.zeros(n_cycles + 1, int)
            hist_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1), int)

            # Iterate over input files, the first is already open
            for infilename in infilenames:
                # NOTE: compare against the first filename (this used to read
                # infilename[0], i.e. the first character, a bug)
                if infilename != infilenames[0]:
                    file_open = lambda: pysam.Samfile(infilename, 'rb')
                    file_close = lambda f: f.close()

                    if not os.path.isfile(infilename):
                        convert_sam_to_bam(infilename)

                else:
                    file_open = lambda: bamfile
                    file_close = lambda f: None

                try:
                    bamfile = file_open()

                    for irp, reads in enumerate(pair_generator(bamfile)):
                        if irp == maxreads:
                            break

                        pair_type = filter_read_pair(reads, ref,
                                                     hist_distance_from_consensus,
                                                     hist_dist_along,
                                                     binsize,
                                                     max_mismatches=max_mismatches,
                                                     match_len_min=match_len_min,
                                                     trim_bad_cigars=trim_bad_cigars,
                                                     VERBOSE=VERBOSE)

                        if pair_type == 'unmapped':
                            n_unmapped += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'unpaired':
                            n_unpaired += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'mutator':
                            n_mutator += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'bad_cigar':
                            n_badcigar += 1
                            map(trashfile.write, reads)

                        elif pair_type == 'tiny':
                            n_tiny += 1
                            map(trashfile.write, reads)

                        else:
                            n_good += 1
                            map(outfile.write, reads)

                finally:
                    file_close(bamfile)

    if VERBOSE >= 1:
        print 'Read pairs: '
        print 'Good:', n_good
        print 'Unmapped:', n_unmapped
        print 'Unpaired:', n_unpaired
        print 'Many-mutations:', n_mutator
        print 'Bad CIGARs:', n_badcigar
        print 'Tiny:', n_tiny
        print

    if summary:
        sfn = get_filter_mapped_init_summary_filename(pname, samplename_pat,
                                                      fragment, PCR=PCR)
        with open(sfn, 'a') as f:
            f.write('Filter results: pname '+pname+', '+samplename_pat+', '+fragment+'\n')
            f.write('Total:\t\t\t'+str(irp + 1)+'\n')
            f.write('Good:\t\t\t'+str(n_good)+'\n')
            f.write('Unmapped:\t\t'+str(n_unmapped)+'\n')
            f.write('Unpaired:\t\t'+str(n_unpaired)+'\n')
            f.write('Many-mutations:\t\t'+str(n_mutator)+'\n')
            f.write('Bad CIGARs:\t\t'+str(n_badcigar)+'\n')
            f.write('Tiny:\t\t\t'+str(n_tiny)+'\n')
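# Sketch of downstream access (an inference from the code above: good pairs
# are written together via map(outfile.write, reads), so mates stay adjacent
# in the output BAM and can be re-iterated pairwise):
#
#   with pysam.Samfile(outfilename, 'rb') as bf:
#       for read1, read2 in pair_generator(bf):
#           pass  # e.g. recompute insert sizes or allele counts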
def map_stampy_multithread(sample, fragment, VERBOSE=0, threads=2, summary=True,
                           filtered=True):
    '''Map using stampy, multithread (via cluster requests, queueing race conditions possible)'''
    import hivwholeseq
    JOBDIR = hivwholeseq.__path__[0].rstrip('/')+'/'
    JOBLOGOUT = JOBDIR+'logout/'
    JOBLOGERR = JOBDIR+'logerr/'
    cluster_time = ['23:59:59', '0:59:59']
    vmem = '8G'

    # NOTE: derive everything from the sample argument (the body used to
    # reference undefined `patient`/`samplename` variables); this mirrors the
    # sample interface used by map_stampy_singlethread below
    pname = sample.patient
    samplename = sample.name
    seq_run = sample['seq run']
    data_folder = sample.sequencing_run['folder']
    adaID = sample['adapter']

    if VERBOSE:
        print 'Map via stampy: '+pname+' '+samplename+' '+fragment

    if summary:
        summary_filename = get_map_initial_summary_filename(pname, samplename,
                                                            fragment)

    # Specific fragment (e.g. F5 --> F5bi)
    frag_spec = filter(lambda x: fragment in x, sample.regions_complete)
    if not len(frag_spec):
        raise ValueError(pname+', '+samplename+': fragment '+fragment+' not found.')
    frag_spec = frag_spec[0]

    input_filename = get_input_filename(data_folder, adaID, frag_spec, type='bam')

    # Submit map scripts in parallel to the cluster
    jobs_done = np.zeros(threads, bool)
    job_IDs = np.zeros(threads, 'S30')
    for j in xrange(threads):
        output_filename = get_mapped_to_initial_filename(pname, samplename,
                                                         fragment,
                                                         type='sam', part=(j+1))
        # Map
        call_list = ['qsub', '-cwd',
                     '-b', 'y',
                     '-S', '/bin/bash',
                     '-o', JOBLOGOUT,
                     '-e', JOBLOGERR,
                     '-N', 'm '+samplename+fragment+' p'+str(j+1),
                     '-l', 'h_rt='+cluster_time[threads >= 10],
                     '-l', 'h_vmem='+vmem,
                     stampy_bin,
                     '--overwrite',
                     '-g', get_initial_index_filename(pname, fragment, ext=False),
                     '-h', get_initial_hash_filename(pname, fragment, ext=False),
                     '-o', output_filename,
                     '--processpart='+str(j+1)+'/'+str(threads),
                     '--substitutionrate='+subsrate,
                     '--gapopen', stampy_gapopen,
                     '--gapextend', stampy_gapextend]
        if stampy_sensitive:
            call_list.append('--sensitive')
        call_list = call_list + ['-M', input_filename]
        call_list = map(str, call_list)
        if VERBOSE >= 2:
            print ' '.join(call_list)
        job_ID = sp.check_output(call_list)
        job_ID = job_ID.split()[2]
        job_IDs[j] = job_ID

    # Monitor output
    output_file_parts = [get_mapped_to_initial_filename(pname, samplename,
                                                        fragment,
                                                        type='bam', part=(j+1))
                         for j in xrange(threads)]
    time_wait = 10 # secs
    while not jobs_done.all():

        # Sleep some time
        time.sleep(time_wait)

        # Get the output of qstat to check the status of jobs
        qstat_output = sp.check_output(['qstat'])
        qstat_output = qstat_output.split('\n')[:-1] # The last is an empty line
        if len(qstat_output) < 3:
            jobs_done[:] = True
            break
        else:
            qstat_output = [line.split()[0] for line in qstat_output[2:]]

        time_wait = 10 # secs
        for j in xrange(threads):
            if jobs_done[j]:
                continue

            if job_IDs[j] not in qstat_output:
                # Convert to BAM for merging
                if VERBOSE >= 1:
                    print 'Convert mapped reads to BAM for merging: sample '+\
                            samplename+', part '+str(j+1)+' of '+str(threads)
                convert_sam_to_bam(output_file_parts[j])
                # We do not need to wait if we did the conversion (it takes
                # longer than some secs)
                time_wait = 0
                jobs_done[j] = True

    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Stampy mapped ('+str(threads)+' threads).\n')

    # Concatenate output files
    output_filename = get_mapped_to_initial_filename(pname, samplename, fragment,
                                                     type='bam', unsorted=True)
    if VERBOSE >= 1:
        print 'Concatenate premapped reads: sample '+samplename
    pysam.cat('-o', output_filename, *output_file_parts)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('BAM files concatenated (unsorted).\n')

    # Sort the file by read names (to ensure the pair_generator)
    output_filename_sorted = get_mapped_to_initial_filename(pname, samplename,
                                                            fragment,
                                                            type='bam')
    # NOTE: we exclude the extension and the option -f because of a bug in samtools
    if VERBOSE >= 1:
        print 'Sort mapped reads: sample '+samplename
    pysam.sort('-n', output_filename, output_filename_sorted[:-4])
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Joint BAM file sorted.\n')

    # Reheader the file without BAM -> SAM -> BAM
    if VERBOSE >= 1:
        print 'Reheader mapped reads: sample '+samplename
    header_filename = get_mapped_to_initial_filename(pname, samplename, fragment,
                                                     type='sam', part=1)
    pysam.reheader(header_filename, output_filename_sorted)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Joint BAM file reheaded.\n')

    if VERBOSE >= 1:
        print 'Remove temporary files: sample '+samplename
    remove_mapped_init_tempfiles(pname, samplename, fragment, VERBOSE=VERBOSE)
    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')
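# Usage sketch (hypothetical `sample` object, illustrative parameters): submit
# four stampy part-jobs for fragment F2 via qsub, then merge, name-sort and
# reheader the parts once qstat no longer lists the job IDs:
#
#   map_stampy_multithread(sample, 'F2', VERBOSE=1, threads=4, summary=True)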
def map_stampy_singlethread(sample, fragment, VERBOSE=0, n_pairs=-1,
                            summary=True, only_chunk=None, filtered=True):
    '''Map using stampy, single thread (no cluster queueing race conditions)'''
    pname = sample.patient
    samplename_pat = sample['patient sample']
    # NOTE: define the sequencing-sample name (it was used below but never set)
    samplename = sample.name
    seq_run = sample['seq run']
    data_folder = sample.sequencing_run['folder']
    adaID = sample['adapter']
    PCR = int(sample.PCR)

    if VERBOSE:
        print 'Map via stampy (single thread): '+samplename+' '+fragment

    if summary:
        summary_filename = get_map_initial_summary_filename(pname, samplename_pat,
                                                            samplename, fragment,
                                                            PCR=PCR)

    # Specific fragment (e.g. F5 --> F5bi)
    frag_spec = filter(lambda x: fragment in x, sample.regions_complete)
    if not len(frag_spec):
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (specific fragment for '+fragment+' not found).\n')
        raise ValueError(samplename+': fragment '+fragment+' not found.')
    else:
        frag_spec = frag_spec[0]

    input_filename = get_input_filename(data_folder, adaID, frag_spec, type='bam',
                                        only_chunk=only_chunk, filtered=filtered)

    # NOTE: we introduced fragment nomenclature late, e.g. F3a. Check for that
    if not os.path.isfile(input_filename):
        if fragment == 'F3':
            input_filename = input_filename.replace('F3a', 'F3')

    # Check existence of input file, because stampy creates output anyway
    if not os.path.isfile(input_filename):
        if summary:
            with open(summary_filename, 'a') as f:
                f.write('Failed (input file for mapping not found).\n')
        raise ValueError(samplename+', fragment '+fragment+': input file not found.')

    # Extract subsample of reads if requested
    if n_pairs > 0:
        from hivwholeseq.utils.mapping import extract_mapped_reads_subsample
        input_filename_sub = get_mapped_to_initial_filename(pname, samplename_pat,
                                                            samplename, fragment,
                                                            PCR=PCR,
                                                            type='bam')[:-4]+\
                '_unmapped.bam'
        n_written = extract_mapped_reads_subsample(input_filename,
                                                   input_filename_sub,
                                                   n_pairs, VERBOSE=VERBOSE)

    # Get output filename
    output_filename = get_mapped_to_initial_filename(pname, samplename_pat,
                                                     samplename, fragment,
                                                     PCR=PCR, type='sam',
                                                     only_chunk=only_chunk)

    # Map
    call_list = [stampy_bin,
                 '-g', get_initial_index_filename(pname, fragment, ext=False),
                 '-h', get_initial_hash_filename(pname, fragment, ext=False),
                 '-o', output_filename,
                 '--overwrite',
                 '--substitutionrate='+subsrate,
                 '--gapopen', stampy_gapopen,
                 '--gapextend', stampy_gapextend]
    if stampy_sensitive:
        call_list.append('--sensitive')
    if n_pairs > 0:
        call_list = call_list + ['-M', input_filename_sub]
    else:
        call_list = call_list + ['-M', input_filename]
    call_list = map(str, call_list)
    if VERBOSE >= 2:
        print ' '.join(call_list)
    sp.call(call_list)

    output_filename_bam = get_mapped_to_initial_filename(pname, samplename_pat,
                                                         samplename, fragment,
                                                         type='bam',
                                                         PCR=PCR,
                                                         only_chunk=only_chunk)
    convert_sam_to_bam(output_filename_bam)

    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Stampy mapped (single thread).\n')

    if only_chunk is None:
        if VERBOSE >= 1:
            print 'Remove temporary files: sample '+samplename
        remove_mapped_init_tempfiles(pname, samplename_pat, samplename,
                                     fragment, PCR=PCR, VERBOSE=VERBOSE,
                                     only_chunk=only_chunk)

    if summary:
        with open(summary_filename, 'a') as f:
            f.write('Temp mapping files removed.\n')
            f.write('\n')

    if n_pairs > 0:
        os.remove(input_filename_sub)
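# Usage sketch (hypothetical `sample` object): map a subsample of 1000 read
# pairs in-process, e.g. for a quick quality check; the default n_pairs=-1
# maps all pairs and skips the subsample extraction:
#
#   map_stampy_singlethread(sample, 'F1', VERBOSE=1, n_pairs=1000,
#                           summary=False)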