def filter_mapped_reads(sample, fragment,
                        PCR=1,
                        maxreads=-1,
                        VERBOSE=0,
                        n_cycles=600,
                        max_mismatches=100,
                        match_len_min=30,
                        trim_bad_cigars=3,
                        summary=True):
    '''Filter the reads to good chunks

    Every existing BAM file mapped to the initial reference for the
    sequencing runs listed in sample.samples_seq is read pair by pair. Each
    pair is classified by filter_read_pair: pairs classified as 'unmapped',
    'unpaired', 'mutator', 'bad_cigar' or 'tiny' are written to a sibling
    '*_trashed.bam' file, all other pairs to the filtered BAM file.

    Parameters:
       sample: object providing .patient, .name and .samples_seq (a table
               whose index holds the sequencing sample names)
       fragment: fragment name, used to locate the reference and BAM files
       PCR: PCR number, forwarded to the filename helpers
       maxreads: maximal number of read pairs overall; it is split evenly
                 (integer division) across the input files. -1 = no limit
       VERBOSE: verbosity level (>= 1 prints counts, >= 2 also input files)
       n_cycles: sets the size of the histograms handed to filter_read_pair
       max_mismatches, match_len_min, trim_bad_cigars: forwarded unchanged
                 to filter_read_pair
       summary: if True, append the pair counts to the summary file

    Returns:
       None. If no input file exists, a warning is printed and no output
       file is written.
    '''
    pname = sample.patient
    samplename_pat = sample.name
    samplenames_seq = sample.samples_seq.index.tolist()

    if VERBOSE >= 1:
        print(' '.join(map(str, ['Filtering reads:', pname, samplename_pat,
                                 fragment, PCR])))

    reffilename = get_initial_reference_filename(pname, fragment)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    outfilename = get_mapped_filtered_filename(pname, samplename_pat, fragment,
                                               type='bam', PCR=PCR,
                                               decontaminated=False)
    trashfilename = outfilename[:-4] + '_trashed.bam'

    candidates = [get_mapped_to_initial_filename(pname, samplename_pat,
                                                 samplename, fragment,
                                                 type='bam', PCR=PCR)
                  for samplename in samplenames_seq]
    # Keep only files that exist (a real list, so it can be measured/indexed)
    infilenames = [fn for fn in candidates if os.path.isfile(fn)]
    if not infilenames:
        print('WARNING: No mapped files found: ' +
              ', '.join([pname, samplename_pat, fragment, str(PCR)]))
        return

    # Take reads evenly distributed across sequencing repetitions.
    # Floor division keeps the "-1 = unlimited" sentinel at -1.
    maxreads //= len(infilenames)

    if VERBOSE >= 2:
        # One file: name on the header line; several: one per line below it
        separator = '\n' if len(infilenames) >= 2 else ' '
        print('Input mapped filenames:' + separator + '\n'.join(infilenames))

    trash_types = ('unmapped', 'unpaired', 'mutator', 'bad_cigar', 'tiny')
    counts = dict.fromkeys(trash_types + ('good',), 0)

    binsize = 200
    hist_distance_from_consensus = np.zeros(n_cycles + 1, int)
    hist_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1), int)

    # Use first file as template for the new bamfiles
    with pysam.Samfile(infilenames[0], 'rb') as template_bam:
        with pysam.Samfile(outfilename, 'wb', template=template_bam) as outfile, \
             pysam.Samfile(trashfilename, 'wb', template=template_bam) as trashfile:

            def filter_pairs(bamfile):
                '''Classify up to maxreads pairs of one file and write them out'''
                for irp, reads in enumerate(pair_generator(bamfile)):
                    if irp == maxreads:
                        break

                    pair_type = filter_read_pair(
                        reads, ref,
                        hist_distance_from_consensus,
                        hist_dist_along,
                        binsize,
                        max_mismatches=max_mismatches,
                        match_len_min=match_len_min,
                        trim_bad_cigars=trim_bad_cigars,
                        VERBOSE=VERBOSE)

                    if pair_type in trash_types:
                        counts[pair_type] += 1
                        sink = trashfile
                    else:
                        counts['good'] += 1
                        sink = outfile

                    # Explicit loop: map() would be lazy on Python 3
                    for read in reads:
                        sink.write(read)

            # Iterate over input files, the first is already open
            for ifile, infilename in enumerate(infilenames):
                if ifile == 0:
                    filter_pairs(template_bam)
                else:
                    with pysam.Samfile(infilename, 'rb') as bamfile:
                        filter_pairs(bamfile)

    # Every processed pair lands in exactly one category, over all files
    n_total = sum(counts.values())

    if VERBOSE >= 1:
        print('Read pairs: ')
        print('Good: ' + str(counts['good']))
        print('Unmapped: ' + str(counts['unmapped']))
        print('Unpaired: ' + str(counts['unpaired']))
        print('Many-mutations: ' + str(counts['mutator']))
        print('Bad CIGARs: ' + str(counts['bad_cigar']))
        print('Tiny: ' + str(counts['tiny']))
        print('')

    if summary:
        sfn = get_filter_mapped_init_summary_filename(pname, samplename_pat,
                                                      fragment, PCR=PCR)
        with open(sfn, 'a') as f:
            f.write('Filter results: pname ' + pname + ', ' +
                    samplename_pat + ', ' + fragment + '\n')
            f.write('Total:\t\t\t' + str(n_total) + '\n')
            f.write('Good:\t\t\t' + str(counts['good']) + '\n')
            f.write('Unmapped:\t\t' + str(counts['unmapped']) + '\n')
            f.write('Unpaired:\t\t' + str(counts['unpaired']) + '\n')
            f.write('Many-mutations:\t\t' + str(counts['mutator']) + '\n')
            f.write('Bad CIGARs:\t\t' + str(counts['bad_cigar']) + '\n')
            f.write('Tiny:\t\t\t' + str(counts['tiny']) + '\n')
def filter_mapped_reads(sample, fragment,
                        PCR=1,
                        maxreads=-1,
                        VERBOSE=0,
                        n_cycles=600,
                        max_mismatches=100,
                        match_len_min=30,
                        trim_bad_cigars=3,
                        summary=True):
    '''Filter the reads to good chunks

    Every existing BAM file mapped to the initial reference for the
    sequencing runs listed in sample.samples_seq is read pair by pair. Each
    pair is classified by filter_read_pair: pairs classified as 'unmapped',
    'unpaired', 'mutator', 'bad_cigar' or 'tiny' are written to a sibling
    '*_trashed.bam' file, all other pairs to the filtered BAM file.

    Parameters:
       sample: object providing .patient, .name and .samples_seq (a table
               whose index holds the sequencing sample names)
       fragment: fragment name, used to locate the reference and BAM files
       PCR: PCR number, forwarded to the filename helpers
       maxreads: maximal number of read pairs overall; it is split evenly
                 (integer division) across the input files. -1 = no limit
       VERBOSE: verbosity level (>= 1 prints counts, >= 2 also input files)
       n_cycles: sets the size of the histograms handed to filter_read_pair
       max_mismatches, match_len_min, trim_bad_cigars: forwarded unchanged
                 to filter_read_pair
       summary: if True, append the pair counts to the summary file

    Returns:
       None. If no input file exists, a warning is printed and no output
       file is written.
    '''
    pname = sample.patient
    samplename_pat = sample.name
    samplenames_seq = sample.samples_seq.index.tolist()

    if VERBOSE >= 1:
        print(' '.join(map(str, ['Filtering reads:', pname, samplename_pat,
                                 fragment, PCR])))

    reffilename = get_initial_reference_filename(pname, fragment)
    refseq = SeqIO.read(reffilename, 'fasta')
    ref = np.array(refseq)

    outfilename = get_mapped_filtered_filename(pname, samplename_pat, fragment,
                                               type='bam', PCR=PCR,
                                               decontaminated=False)
    trashfilename = outfilename[:-4] + '_trashed.bam'

    candidates = [get_mapped_to_initial_filename(pname, samplename_pat,
                                                 samplename, fragment,
                                                 type='bam', PCR=PCR)
                  for samplename in samplenames_seq]
    # Keep only files that exist (a real list, so it can be measured/indexed)
    infilenames = [fn for fn in candidates if os.path.isfile(fn)]
    if not infilenames:
        print('WARNING: No mapped files found: ' +
              ', '.join([pname, samplename_pat, fragment, str(PCR)]))
        return

    # Take reads evenly distributed across sequencing repetitions.
    # Floor division keeps the "-1 = unlimited" sentinel at -1.
    maxreads //= len(infilenames)

    if VERBOSE >= 2:
        # One file: name on the header line; several: one per line below it
        separator = '\n' if len(infilenames) >= 2 else ' '
        print('Input mapped filenames:' + separator + '\n'.join(infilenames))

    trash_types = ('unmapped', 'unpaired', 'mutator', 'bad_cigar', 'tiny')
    counts = dict.fromkeys(trash_types + ('good',), 0)

    binsize = 200
    hist_distance_from_consensus = np.zeros(n_cycles + 1, int)
    hist_dist_along = np.zeros((len(ref) // binsize + 1, n_cycles + 1), int)

    # Use first file as template for the new bamfiles
    with pysam.Samfile(infilenames[0], 'rb') as template_bam:
        with pysam.Samfile(outfilename, 'wb', template=template_bam) as outfile, \
             pysam.Samfile(trashfilename, 'wb', template=template_bam) as trashfile:

            def filter_pairs(bamfile):
                '''Classify up to maxreads pairs of one file and write them out'''
                for irp, reads in enumerate(pair_generator(bamfile)):
                    if irp == maxreads:
                        break

                    pair_type = filter_read_pair(
                        reads, ref,
                        hist_distance_from_consensus,
                        hist_dist_along,
                        binsize,
                        max_mismatches=max_mismatches,
                        match_len_min=match_len_min,
                        trim_bad_cigars=trim_bad_cigars,
                        VERBOSE=VERBOSE)

                    if pair_type in trash_types:
                        counts[pair_type] += 1
                        sink = trashfile
                    else:
                        counts['good'] += 1
                        sink = outfile

                    # Explicit loop: map() would be lazy on Python 3
                    for read in reads:
                        sink.write(read)

            # Iterate over input files, the first is already open
            for ifile, infilename in enumerate(infilenames):
                if ifile == 0:
                    filter_pairs(template_bam)
                else:
                    with pysam.Samfile(infilename, 'rb') as bamfile:
                        filter_pairs(bamfile)

    # Every processed pair lands in exactly one category, over all files
    n_total = sum(counts.values())

    if VERBOSE >= 1:
        print('Read pairs: ')
        print('Good: ' + str(counts['good']))
        print('Unmapped: ' + str(counts['unmapped']))
        print('Unpaired: ' + str(counts['unpaired']))
        print('Many-mutations: ' + str(counts['mutator']))
        print('Bad CIGARs: ' + str(counts['bad_cigar']))
        print('Tiny: ' + str(counts['tiny']))
        print('')

    if summary:
        sfn = get_filter_mapped_init_summary_filename(pname, samplename_pat,
                                                      fragment, PCR=PCR)
        with open(sfn, 'a') as f:
            f.write('Filter results: pname ' + pname + ', ' +
                    samplename_pat + ', ' + fragment + '\n')
            f.write('Total:\t\t\t' + str(n_total) + '\n')
            f.write('Good:\t\t\t' + str(counts['good']) + '\n')
            f.write('Unmapped:\t\t' + str(counts['unmapped']) + '\n')
            f.write('Unpaired:\t\t' + str(counts['unpaired']) + '\n')
            f.write('Many-mutations:\t\t' + str(counts['mutator']) + '\n')
            f.write('Bad CIGARs:\t\t' + str(counts['bad_cigar']) + '\n')
            f.write('Tiny:\t\t\t' + str(counts['tiny']) + '\n')
pname = sample_pat.patient PCR = int(PCR) for fragment in fragments: if submit: fork_self(samplename_pat, fragment, VERBOSE=VERBOSE, n_pairs=n_pairs, PCR=PCR, summary=summary) continue if summary: sfn = get_filter_mapped_init_summary_filename(pname, samplename_pat, fragment, PCR=PCR) with open(sfn, 'w') as f: f.write('Call: python filter_mapped_reads.py'+\ ' --samples '+samplename_pat+\ ' --fragments '+fragment+\ ' --verbose '+str(VERBOSE)) if n_pairs != -1: f.write(' --maxreads ' + str(n_pairs)) f.write('\n') filter_mapped_reads(sample_pat, fragment, PCR=PCR, VERBOSE=VERBOSE, maxreads=n_pairs,
# Restrict the sequencing table to the runs named in samplenames_seq and
# attach that subset to the patient sample before filtering.
is_in_group = samples_seq.index.isin(samplenames_seq)
samples_seq_group = samples_seq.loc[is_in_group]
sample_pat.samples_seq = samples_seq_group

PCR = int(PCR)
pname = sample_pat.patient

for fragment in fragments:

    # When submit is set, hand the fragment to fork_self and do not filter here
    if submit:
        fork_self(samplename_pat, fragment,
                  VERBOSE=VERBOSE,
                  n_pairs=n_pairs,
                  PCR=PCR,
                  summary=summary)
        continue

    if summary:
        # Start a fresh summary file whose first line records the invocation
        sfn = get_filter_mapped_init_summary_filename(pname, samplename_pat,
                                                      fragment, PCR=PCR)
        with open(sfn, 'w') as f:
            call_parts = ['Call: python filter_mapped_reads.py',
                          ' --samples ', samplename_pat,
                          ' --fragments ', fragment,
                          ' --verbose ', str(VERBOSE)]
            # -1 is the "no limit" default, so the option is only recorded
            # when a limit was actually requested
            if n_pairs != -1:
                call_parts.extend([' --maxreads ', str(n_pairs)])
            call_parts.append('\n')
            f.write(''.join(call_parts))

    filter_mapped_reads(sample_pat, fragment,
                        PCR=PCR,
                        VERBOSE=VERBOSE,
                        maxreads=n_pairs,
                        summary=summary)