def nominate_spanning_reads(chimera_file, unmapped_bam_file,
                            output_fastq_file):
    """
    writes a FASTQ file of all reads that need to be remapped to see if
    they span a breakpoint junction

    two groups of reads are written:
    - encompassing reads of each chimera whose clipped interval strictly
      contains the 5' partner end (5' read) or the 3' partner start
      (3' read).  each (qname, readnum) is written only once, with a
      placeholder quality string of "I" characters
    - the first record of both mates of every fragment found in
      'unmapped_bam_file', with the qualities stored in the BAM file

    chimera_file: path of the file parsed with Chimera.parse
    unmapped_bam_file: path of the BAM file iterated with parse_pe_reads
    output_fastq_file: path of the FASTQ file to create

    returns config.JOB_SUCCESS
    """
    remap_qnames = set()
    # breakpoint positions per transcript name.  nothing reads these
    # tables yet; they are gathered for the TODO at the end of the function
    breaks5p = collections.defaultdict(list)
    breaks3p = collections.defaultdict(list)
    fqfh = open(output_fastq_file, "w")
    try:
        chimerafh = open(chimera_file)
        try:
            for c in Chimera.parse(chimerafh):
                end5p = c.partner5p.end
                start3p = c.partner3p.start
                # keep track of all breakpoints, each one filed under the
                # transcript of its own partner
                breaks5p[c.partner5p.tx_name].append(end5p)
                breaks3p[c.partner3p.tx_name].append(start3p)
                for r5p, r3p in c.encomp_read_pairs:
                    # 5' read is tested against the 5' end, 3' read
                    # against the 3' start
                    for r, breakpos in ((r5p, end5p), (r3p, start3p)):
                        # only reads overlapping the breakpoint are remapped
                        if not (r.clipstart < breakpos < r.clipend):
                            continue
                        key = (r.qname, r.readnum)
                        if key in remap_qnames:
                            continue
                        remap_qnames.add(key)
                        print >>fqfh, to_fastq(r.qname, r.readnum, r.seq,
                                               "I" * len(r.seq))
        finally:
            chimerafh.close()
        # sort breakpoint positions within each gene
        for positions in breaks5p.values():
            positions.sort()
        for positions in breaks3p.values():
            positions.sort()
        # check read pairs with one or both unmapped, and remap those
        # as well
        bamfh = pysam.Samfile(unmapped_bam_file, "rb")
        try:
            for pe_reads in parse_pe_reads(bamfh):
                for readnum in xrange(2):
                    r = pe_reads[readnum][0]
                    print >>fqfh, to_fastq(r.qname, readnum, r.seq, r.qual)
                # TODO: write only the unmapped mates, and remap mapped
                # reads only when they overlap a known breakpoint
                # (bisect over the sorted breaks5p / breaks3p positions)
        finally:
            bamfh.close()
    finally:
        # make sure the FASTQ output is flushed and closed even on error
        fqfh.close()
    return config.JOB_SUCCESS
def transcriptome_to_genome(genome_index, transcripts, input_file,
                            output_file, library_type, input_sam,
                            output_sam):
    """
    reads paired-end fragments from the input alignment file, converts
    them with convert_read_pairs / convert_unpaired_reads and writes the
    converted reads to the output file

    all arguments are forwarded unchanged to _setup_and_open_files, which
    supplies the input handle, the output handle and the transcript tid
    map used for the conversion

    returns config.JOB_SUCCESS
    """
    # setup and open files
    handles = _setup_and_open_files(genome_index, transcripts, input_file,
                                    output_file, library_type, input_sam,
                                    output_sam)
    infh, outfh, transcript_tid_map = handles
    # now convert BAM reads
    logging.debug("Converting transcriptome to genome BAM")
    n_paired = 0
    n_unpaired = 0
    for fragment in parse_pe_reads(infh):
        pairs, unpaired_reads = group_read_pairs(fragment)
        if len(pairs) > 0:
            # at least one pairing exists: convert and write the pairs
            n_paired += 1
            converted_pairs = convert_read_pairs(pairs, transcript_tid_map,
                                                 library_type)
            for mate1, mate2 in converted_pairs:
                outfh.write(mate1)
                outfh.write(mate2)
            continue
        # no pairing: convert the reads individually
        n_unpaired += 1
        converted_reads = convert_unpaired_reads(unpaired_reads,
                                                 transcript_tid_map,
                                                 library_type)
        for read in converted_reads:
            outfh.write(read)
    logging.debug("Paired fragments: %d" % (n_paired))
    logging.debug("Unpaired fragments: %d" % (n_unpaired))
    outfh.close()
    infh.close()
    return config.JOB_SUCCESS
def from_bam(bamfh, min_isize, max_isize, max_samples=None):
    """
    iterates through a BAM file looking for uniquely mapping concordant
    reads.  keeps a histogram of all observed insert sizes in the
    reads.  stops once 'max_samples' valid reads are encountered, or
    the end of the file is reached

    bamfh: open BAM file handle, iterated with parse_pe_reads
    min_isize, max_isize: inclusive range of insert sizes to tabulate
    max_samples: number of in-range fragments after which sampling
    stops, or None to read the whole file

    returns an InsertSizeDistribution whose 'arr' holds one counter per
    insert size, indexed by (insert size - min_isize)
    """
    res = InsertSizeDistribution()
    res.min_isize = min_isize
    res.max_isize = max_isize
    res.arr = array.array("L", (0 for x in xrange(min_isize, max_isize + 1)))
    count = 0
    outside_range = 0
    unmapped = 0
    multimapping = 0
    discordant = 0
    # setup debugging logging messages
    debug_count = 0
    debug_every = 100000
    debug_next = debug_every
    for pe_reads in parse_pe_reads(bamfh):
        # progress log
        debug_count += 1
        if debug_count == debug_next:
            debug_next += debug_every
            logging.debug("Processed reads: %d" % (debug_count))
            logging.debug("Unique paired reads: %d" % (count))
            logging.debug("Unmapped: %d" % (unmapped))
            logging.debug("Ambiguous (multimapping): %d" % (multimapping))
            logging.debug("Outside range: %d" % (outside_range))
        # stop as soon as the requested number of samples is reached
        # ('>=' so that exactly max_samples fragments are counted)
        if (max_samples is not None) and count >= max_samples:
            break
        # only use uniquely mapping reads on the same chromosome
        num_read1_mappings = len(pe_reads[0])
        num_read2_mappings = len(pe_reads[1])
        if (num_read1_mappings == 0) or (num_read2_mappings == 0):
            unmapped += 1
            continue
        if (num_read1_mappings > 1) or (num_read2_mappings > 1):
            multimapping += 1
            continue
        # each read has exactly one alignment
        r1 = pe_reads[0][0]
        r2 = pe_reads[1][0]
        # a single record can still be flagged unmapped; such a fragment
        # has no meaningful insert size
        if r1.is_unmapped or r2.is_unmapped:
            unmapped += 1
            continue
        if r1.rname != r2.rname:
            discordant += 1
            continue
        # compute insert size
        isize = get_insert_size(r1, r2)
        if res.min_isize <= isize <= res.max_isize:
            # store in array
            res.arr[isize - res.min_isize] += 1
            count += 1
        else:
            outside_range += 1
    return res
def from_bam(bamfh, min_isize, max_isize, max_samples=None):
    """
    iterates through a BAM file looking for uniquely mapping concordant
    reads.  keeps a histogram of all observed insert sizes in the
    reads.  stops once 'max_samples' valid reads are encountered, or
    the end of the file is reached

    bamfh: open BAM file handle, iterated with parse_pe_reads
    min_isize, max_isize: inclusive range of insert sizes to tabulate
    max_samples: number of in-range fragments after which sampling
    stops, or None to read the whole file

    returns a FragmentSizeDistribution whose 'arr' holds one counter per
    insert size, indexed by (insert size - min_isize)
    """
    res = FragmentSizeDistribution()
    res.min_isize = min_isize
    res.max_isize = max_isize
    res.arr = array.array('L', (0 for x in xrange(min_isize, max_isize + 1)))
    count = 0
    outside_range = 0
    unmapped = 0
    multimapping = 0
    discordant = 0
    # setup debugging logging messages
    debug_count = 0
    debug_every = 100000
    debug_next = debug_every
    for pe_reads in parse_pe_reads(bamfh):
        # progress log
        debug_count += 1
        if debug_count == debug_next:
            debug_next += debug_every
            logging.debug("Processed reads: %d" % (debug_count))
            logging.debug("Unique paired reads: %d" % (count))
            logging.debug("Unmapped: %d" % (unmapped))
            logging.debug("Ambiguous (multimapping): %d" % (multimapping))
            logging.debug("Outside range: %d" % (outside_range))
        # stop as soon as the requested number of samples is reached
        # ('>=' so that exactly max_samples fragments are counted)
        if (max_samples is not None) and count >= max_samples:
            break
        # only use uniquely mapping reads on the same chromosome
        num_read1_mappings = len(pe_reads[0])
        num_read2_mappings = len(pe_reads[1])
        if (num_read1_mappings == 0) or (num_read2_mappings == 0):
            unmapped += 1
            continue
        if (num_read1_mappings > 1) or (num_read2_mappings > 1):
            multimapping += 1
            continue
        # each read has exactly one alignment
        r1 = pe_reads[0][0]
        r2 = pe_reads[1][0]
        # a single record can still be flagged unmapped
        if r1.is_unmapped or r2.is_unmapped:
            unmapped += 1
            continue
        if r1.rname != r2.rname:
            discordant += 1
            continue
        # compute insert size
        isize = get_insert_size(r1, r2)
        if res.min_isize <= isize <= res.max_isize:
            # store in array
            res.arr[isize - res.min_isize] += 1
            count += 1
        else:
            outside_range += 1
    return res
def filter_multihits(transcript_file, input_bam_file, output_bam_file,
                     max_multihits=1):
    """
    copies 'input_bam_file' to 'output_bam_file', replacing the
    alignments of every mate that has more than 'max_multihits' hits
    with a single unmapped record

    the number of hits of each mate is the value returned by
    annotate_multihits.  the unmapped record is a copy of the mate's
    first alignment with its mapping fields cleared; when the other mate
    is over the limit as well, the mate fields are cleared too

    transcript_file: path parsed with TranscriptFeature.parse
    input_bam_file: path of the BAM file iterated with parse_pe_reads
    output_bam_file: path of the BAM file to create
    max_multihits: largest number of hits a mate may have and still
    keep its alignments

    returns config.JOB_SUCCESS
    """
    logging.debug("Reading transcript features")
    txfh = open(transcript_file)
    try:
        transcripts = list(TranscriptFeature.parse(txfh))
    finally:
        txfh.close()
    # parse and convert sam -> bam
    inbamfh = pysam.Samfile(input_bam_file, "rb")
    outbamfh = pysam.Samfile(output_bam_file, "wb", template=inbamfh)
    try:
        # build a transcript to genome coordinate map
        tid_tx_genome_map = build_tid_transcript_genome_map(outbamfh,
                                                            transcripts)
        num_frags = 0
        logging.debug("Annotating and filtering multihits")
        for pe_reads in parse_pe_reads(inbamfh):
            mate_num_hits = []
            for reads in pe_reads:
                num_hits = annotate_multihits(reads, tid_tx_genome_map)
                mate_num_hits.append(num_hits)
            new_pe_reads = [[], []]
            for readnum in (0, 1):
                if mate_num_hits[readnum] > max_multihits:
                    # too many hits: keep one record, marked as unmapped
                    r = copy_read(pe_reads[readnum][0])
                    r.is_unmapped = True
                    r.is_proper_pair = False
                    r.is_secondary = False
                    r.rname = -1
                    r.pos = 0
                    if mate_num_hits[1 - readnum] > max_multihits:
                        # the other mate is discarded as well
                        r.mate_is_unmapped = True
                        r.mrnm = -1
                        r.mpos = 0
                    new_pe_reads[readnum] = [r]
                else:
                    new_pe_reads[readnum] = pe_reads[readnum]
            # write the filtered reads (not the original ones, otherwise
            # the filtering above would have no effect)
            for reads in new_pe_reads:
                for r in reads:
                    outbamfh.write(r)
            num_frags += 1
        logging.debug("Found %d fragments" % (num_frags))
    finally:
        inbamfh.close()
        outbamfh.close()
    return config.JOB_SUCCESS
def nominate_spanning_reads(chimera_file, unmapped_bam_file,
                            output_fastq_file):
    """
    writes a FASTQ file of all reads that need to be remapped to see if
    they span a breakpoint junction

    two groups of reads are written:
    - encompassing reads of each chimera whose clipped interval strictly
      contains the 5' partner end (5' read) or the 3' partner start
      (3' read).  each (qname, readnum) is written only once, with a
      placeholder quality string of "I" characters
    - the first record of both mates of every fragment found in
      'unmapped_bam_file', with the qualities stored in the BAM file

    chimera_file: path of the file parsed with Chimera.parse
    unmapped_bam_file: path of the BAM file iterated with parse_pe_reads
    output_fastq_file: path of the FASTQ file to create

    returns config.JOB_SUCCESS
    """
    remap_qnames = set()
    # breakpoint positions per transcript name.  nothing reads these
    # tables yet; they are gathered for the TODO at the end of the function
    breaks5p = collections.defaultdict(list)
    breaks3p = collections.defaultdict(list)
    fqfh = open(output_fastq_file, "w")
    try:
        chimerafh = open(chimera_file)
        try:
            for c in Chimera.parse(chimerafh):
                end5p = c.partner5p.end
                start3p = c.partner3p.start
                # keep track of all breakpoints, each one filed under the
                # transcript of its own partner
                breaks5p[c.partner5p.tx_name].append(end5p)
                breaks3p[c.partner3p.tx_name].append(start3p)
                for r5p, r3p in c.encomp_read_pairs:
                    # 5' read is tested against the 5' end, 3' read
                    # against the 3' start
                    for r, breakpos in ((r5p, end5p), (r3p, start3p)):
                        # only reads overlapping the breakpoint are remapped
                        if not (r.clipstart < breakpos < r.clipend):
                            continue
                        key = (r.qname, r.readnum)
                        if key in remap_qnames:
                            continue
                        remap_qnames.add(key)
                        print >>fqfh, to_fastq(r.qname, r.readnum, r.seq,
                                               "I" * len(r.seq))
        finally:
            chimerafh.close()
        # sort breakpoint positions within each gene
        for positions in breaks5p.values():
            positions.sort()
        for positions in breaks3p.values():
            positions.sort()
        # check read pairs with one or both unmapped, and remap those
        # as well
        bamfh = pysam.Samfile(unmapped_bam_file, "rb")
        try:
            for pe_reads in parse_pe_reads(bamfh):
                for readnum in xrange(2):
                    r = pe_reads[readnum][0]
                    print >>fqfh, to_fastq(r.qname, readnum, r.seq, r.qual)
                # TODO: write only the unmapped mates, and remap mapped
                # reads only when they overlap a known breakpoint
                # (bisect over the sorted breaks5p / breaks3p positions)
        finally:
            bamfh.close()
    finally:
        # make sure the FASTQ output is flushed and closed even on error
        fqfh.close()
    return config.JOB_SUCCESS
def filter_multihits(transcript_file, input_bam_file, output_bam_file,
                     max_multihits=1):
    """
    copies 'input_bam_file' to 'output_bam_file', replacing the
    alignments of every mate that has more than 'max_multihits' hits
    with a single unmapped record

    the number of hits of each mate is the value returned by
    annotate_multihits.  the unmapped record is a copy of the mate's
    first alignment with its mapping fields cleared; when the other mate
    is over the limit as well, the mate fields are cleared too

    transcript_file: path parsed with TranscriptFeature.parse
    input_bam_file: path of the BAM file iterated with parse_pe_reads
    output_bam_file: path of the BAM file to create
    max_multihits: largest number of hits a mate may have and still
    keep its alignments

    returns config.JOB_SUCCESS
    """
    logging.debug("Reading transcript features")
    txfh = open(transcript_file)
    try:
        transcripts = list(TranscriptFeature.parse(txfh))
    finally:
        txfh.close()
    # parse and convert sam -> bam
    inbamfh = pysam.Samfile(input_bam_file, "rb")
    outbamfh = pysam.Samfile(output_bam_file, "wb", template=inbamfh)
    try:
        # build a transcript to genome coordinate map
        tid_tx_genome_map = build_tid_transcript_genome_map(outbamfh,
                                                            transcripts)
        num_frags = 0
        logging.debug("Annotating and filtering multihits")
        for pe_reads in parse_pe_reads(inbamfh):
            mate_num_hits = []
            for reads in pe_reads:
                num_hits = annotate_multihits(reads, tid_tx_genome_map)
                mate_num_hits.append(num_hits)
            new_pe_reads = [[], []]
            for readnum in (0, 1):
                if mate_num_hits[readnum] > max_multihits:
                    # too many hits: keep one record, marked as unmapped
                    r = copy_read(pe_reads[readnum][0])
                    r.is_unmapped = True
                    r.is_proper_pair = False
                    r.is_secondary = False
                    r.rname = -1
                    r.pos = 0
                    if mate_num_hits[1 - readnum] > max_multihits:
                        # the other mate is discarded as well
                        r.mate_is_unmapped = True
                        r.mrnm = -1
                        r.mpos = 0
                    new_pe_reads[readnum] = [r]
                else:
                    new_pe_reads[readnum] = pe_reads[readnum]
            # write the filtered reads (not the original ones, otherwise
            # the filtering above would have no effect)
            for reads in new_pe_reads:
                for r in reads:
                    outbamfh.write(r)
            num_frags += 1
        logging.debug("Found %d fragments" % (num_frags))
    finally:
        inbamfh.close()
        outbamfh.close()
    return config.JOB_SUCCESS
def find_discordant_fragments(input_bam_file, paired_bam_file,
                              unmapped_bam_file, index_dir, max_isize,
                              library_type):
    """
    parses BAM file and categorizes reads into several groups:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)

    fragments for which annotate_multihits returns a true value for
    either mate are written to 'unmapped_bam_file'.  for every other
    fragment the gene pairs returned by classify_read_pairs, if any, are
    written to 'paired_bam_file'

    input_bam_file: path of the BAM file iterated with parse_pe_reads
    paired_bam_file: path of the BAM file created for gene pairs
    unmapped_bam_file: path of the BAM file created for fragments with
    an unmapped mate
    index_dir: directory containing config.TRANSCRIPT_FEATURE_FILE
    max_isize, library_type: forwarded to classify_read_pairs

    returns config.JOB_SUCCESS
    """
    logging.info("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s" % (input_bam_file))
    logging.debug("\tMax insert size: '%d'" % (max_isize))
    logging.debug("\tLibrary type: '%s'" % (library_type))
    logging.debug("\tGene paired file: %s" % (paired_bam_file))
    logging.debug("\tUnmapped file: %s" % (unmapped_bam_file))
    # setup input and output files
    bamfh = pysam.Samfile(input_bam_file, "rb")
    genefh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
    unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
    try:
        # read transcript features
        logging.debug("Reading transcript features")
        transcript_file = os.path.join(index_dir,
                                       config.TRANSCRIPT_FEATURE_FILE)
        txfh = open(transcript_file)
        try:
            transcripts = list(TranscriptFeature.parse(txfh))
        finally:
            txfh.close()
        logging.debug("Building transcript lookup tables")
        # build a lookup table from bam tid index to transcript object
        tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
        # build a transcript to genome coordinate map
        tid_tx_genome_map = build_tid_transcript_genome_map(bamfh,
                                                            transcripts)
        logging.info("Parsing reads")
        for pe_reads in parse_pe_reads(bamfh):
            # add hit index and multimap information to read tags
            # this function also checks for unmapped reads
            any_unmapped = False
            for reads in pe_reads:
                # call the function before combining the flags so that
                # both mates are processed even when the first one is
                # already known to be unmapped
                mate_unmapped = annotate_multihits(bamfh, reads,
                                                   tid_tx_genome_map)
                any_unmapped = any_unmapped or mate_unmapped
            if any_unmapped:
                # write to output as discordant reads and continue to
                # next fragment
                write_pe_reads(unmappedfh, pe_reads)
                continue
            # examine all read pairing combinations and rule out invalid
            # pairings
            gene_pairs, unpaired_reads = classify_read_pairs(
                pe_reads, max_isize, library_type, tid_tx_map)
            if len(gene_pairs) > 0:
                write_pairs(genefh, gene_pairs)
            # TODO: do something with unpaired discordant reads?
    finally:
        genefh.close()
        unmappedfh.close()
        bamfh.close()
    logging.info("Finished pairing reads")
    return config.JOB_SUCCESS
def process_tophat_alignments(fastq_files, bam_file, gene_file,
                              max_fragment_length, output_fastq_files,
                              output_bam_file, unpaired=False, suffix="/"):
    """
    walks the alignments in 'bam_file' together with the original FASTQ
    files and splits the fragments into two outputs

    - synchronize_bam_fastq is called for every fragment with the FASTQ
      iterators and the output FASTQ handles; fragments it reports as
      unaligned are skipped here
    - aligned fragments that is_concordant does not report as gene
      concordant have all alignments of both mates written to
      'output_bam_file'
    - FASTQ records left over once the BAM file is exhausted are copied
      to the output FASTQ files

    fastq_files: paths of the input FASTQ files
    bam_file: path of the BAM file to read
    gene_file: path passed to build_exon_interval_trees
    max_fragment_length: forwarded to is_concordant
    output_fastq_files: paths of the FASTQ files to create (the first
    two are written by the leftover loop)
    output_bam_file: path of the BAM file to create
    unpaired: when true the BAM file is read with parse_unpaired_pe_reads
    instead of parse_pe_reads
    suffix: forwarded to synchronize_bam_fastq

    returns config.JOB_SUCCESS
    """
    # index genes
    exon_intervals, exon_trees = build_exon_interval_trees(gene_file)
    # open input files
    bamfh = pysam.Samfile(bam_file, "rb")
    if unpaired:
        bam_iter = parse_unpaired_pe_reads(bamfh)
    else:
        bam_iter = parse_pe_reads(bamfh)
    fastq_fhs = [open(fq) for fq in fastq_files]
    fastq_iters = [parse_fastq(fh) for fh in fastq_fhs]
    # open output files
    outfq = [open(fq, "w") for fq in output_fastq_files]
    outbamfh = pysam.Samfile(output_bam_file, "wb", template=bamfh)
    try:
        # iterate through fastq files and bam file.  the explicit
        # next()/StopIteration form is kept on purpose: a StopIteration
        # raised anywhere in the loop body also ends the loop
        try:
            while True:
                bam_pe_reads = bam_iter.next()
                # synchronize fastq and bam and write unmapped reads to
                # a file
                is_unaligned = synchronize_bam_fastq(bam_pe_reads,
                                                     fastq_iters, outfq,
                                                     suffix)
                if is_unaligned:
                    continue
                # if loop reaches this point then we have a paired-end
                # read where both pairs align.  now need to check if
                # the alignment is discordant (only the gene-level flag
                # is used)
                gene_concordant = is_concordant(bamfh, bam_pe_reads,
                                                exon_intervals, exon_trees,
                                                max_fragment_length)[1]
                if not gene_concordant:
                    for r in bam_pe_reads[0]:
                        outbamfh.write(r)
                    for r in bam_pe_reads[1]:
                        outbamfh.write(r)
        except StopIteration:
            pass
        # finish remaining fastq lines
        try:
            while True:
                fqreads = [it.next() for it in fastq_iters]
                print >>outfq[0], fastq_to_string(fqreads[0])
                print >>outfq[1], fastq_to_string(fqreads[1])
        except StopIteration:
            pass
    finally:
        # close everything so that the outputs are flushed to disk
        outbamfh.close()
        for fh in outfq:
            fh.close()
        for fh in fastq_fhs:
            fh.close()
        bamfh.close()
    return config.JOB_SUCCESS
def process_tophat_alignments(fastq_files, bam_file, gene_file,
                              max_fragment_length, output_fastq_files,
                              output_bam_file, unpaired=False, suffix="/"):
    """
    walks the alignments in 'bam_file' together with the original FASTQ
    files and splits the fragments into two outputs

    - synchronize_bam_fastq is called for every fragment with the FASTQ
      iterators and the output FASTQ handles; fragments it reports as
      unaligned are skipped here
    - aligned fragments that is_concordant does not report as gene
      concordant have all alignments of both mates written to
      'output_bam_file'
    - FASTQ records left over once the BAM file is exhausted are copied
      to the output FASTQ files

    fastq_files: paths of the input FASTQ files
    bam_file: path of the BAM file to read
    gene_file: path passed to build_exon_interval_trees
    max_fragment_length: forwarded to is_concordant
    output_fastq_files: paths of the FASTQ files to create (the first
    two are written by the leftover loop)
    output_bam_file: path of the BAM file to create
    unpaired: when true the BAM file is read with parse_unpaired_pe_reads
    instead of parse_pe_reads
    suffix: forwarded to synchronize_bam_fastq

    returns config.JOB_SUCCESS
    """
    # index genes
    exon_intervals, exon_trees = build_exon_interval_trees(gene_file)
    # open input files
    bamfh = pysam.Samfile(bam_file, "rb")
    if unpaired:
        bam_iter = parse_unpaired_pe_reads(bamfh)
    else:
        bam_iter = parse_pe_reads(bamfh)
    fastq_fhs = [open(fq) for fq in fastq_files]
    fastq_iters = [parse_fastq(fh) for fh in fastq_fhs]
    # open output files
    outfq = [open(fq, "w") for fq in output_fastq_files]
    outbamfh = pysam.Samfile(output_bam_file, "wb", template=bamfh)
    try:
        # iterate through fastq files and bam file.  the explicit
        # next()/StopIteration form is kept on purpose: a StopIteration
        # raised anywhere in the loop body also ends the loop
        try:
            while True:
                bam_pe_reads = bam_iter.next()
                # synchronize fastq and bam and write unmapped reads to
                # a file
                is_unaligned = synchronize_bam_fastq(bam_pe_reads,
                                                     fastq_iters, outfq,
                                                     suffix)
                if is_unaligned:
                    continue
                # if loop reaches this point then we have a paired-end
                # read where both pairs align.  now need to check if
                # the alignment is discordant (only the gene-level flag
                # is used)
                gene_concordant = is_concordant(bamfh, bam_pe_reads,
                                                exon_intervals, exon_trees,
                                                max_fragment_length)[1]
                if not gene_concordant:
                    for r in bam_pe_reads[0]:
                        outbamfh.write(r)
                    for r in bam_pe_reads[1]:
                        outbamfh.write(r)
        except StopIteration:
            pass
        # finish remaining fastq lines
        try:
            while True:
                fqreads = [it.next() for it in fastq_iters]
                print >>outfq[0], fastq_to_string(fqreads[0])
                print >>outfq[1], fastq_to_string(fqreads[1])
        except StopIteration:
            pass
    finally:
        # close everything so that the outputs are flushed to disk
        outbamfh.close()
        for fh in outfq:
            fh.close()
        for fh in fastq_fhs:
            fh.close()
        bamfh.close()
    return config.JOB_SUCCESS
def find_discordant_fragments(input_bam_file, paired_bam_file,
                              unmapped_bam_file, index_dir, max_isize,
                              library_type):
    """
    parses BAM file and categorizes reads into several groups:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)

    fragments for which annotate_multihits returns a true value for
    either mate are written to 'unmapped_bam_file'.  for every other
    fragment the gene pairs returned by classify_read_pairs, if any, are
    written to 'paired_bam_file'

    input_bam_file: path of the BAM file iterated with parse_pe_reads
    paired_bam_file: path of the BAM file created for gene pairs
    unmapped_bam_file: path of the BAM file created for fragments with
    an unmapped mate
    index_dir: directory containing config.TRANSCRIPT_FEATURE_FILE
    max_isize, library_type: forwarded to classify_read_pairs

    returns config.JOB_SUCCESS
    """
    logging.info("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s" % (input_bam_file))
    logging.debug("\tMax insert size: '%d'" % (max_isize))
    logging.debug("\tLibrary type: '%s'" % (library_type))
    logging.debug("\tGene paired file: %s" % (paired_bam_file))
    logging.debug("\tUnmapped file: %s" % (unmapped_bam_file))
    # setup input and output files
    bamfh = pysam.Samfile(input_bam_file, "rb")
    genefh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
    unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
    try:
        # read transcript features
        logging.debug("Reading transcript features")
        transcript_file = os.path.join(index_dir,
                                       config.TRANSCRIPT_FEATURE_FILE)
        txfh = open(transcript_file)
        try:
            transcripts = list(TranscriptFeature.parse(txfh))
        finally:
            txfh.close()
        logging.debug("Building transcript lookup tables")
        # build a lookup table from bam tid index to transcript object
        tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
        # build a transcript to genome coordinate map
        tid_tx_genome_map = build_tid_transcript_genome_map(bamfh,
                                                            transcripts)
        logging.info("Parsing reads")
        for pe_reads in parse_pe_reads(bamfh):
            # add hit index and multimap information to read tags
            # this function also checks for unmapped reads
            any_unmapped = False
            for reads in pe_reads:
                # call the function before combining the flags so that
                # both mates are processed even when the first one is
                # already known to be unmapped
                mate_unmapped = annotate_multihits(bamfh, reads,
                                                   tid_tx_genome_map)
                any_unmapped = any_unmapped or mate_unmapped
            if any_unmapped:
                # write to output as discordant reads and continue to
                # next fragment
                write_pe_reads(unmappedfh, pe_reads)
                continue
            # examine all read pairing combinations and rule out invalid
            # pairings
            gene_pairs, unpaired_reads = classify_read_pairs(
                pe_reads, max_isize, library_type, tid_tx_map)
            if len(gene_pairs) > 0:
                write_pairs(genefh, gene_pairs)
            # TODO: do something with unpaired discordant reads?
    finally:
        genefh.close()
        unmappedfh.close()
        bamfh.close()
    logging.info("Finished pairing reads")
    return config.JOB_SUCCESS
def extract_single_mapped_reads(chimera_file, unmapped_bam_file,
                                single_mapped_bam_file, unmapped_fastq_file,
                                library_type, tmp_dir):
    """
    splits the fragments of 'unmapped_bam_file' into fragments where
    both mates are unmapped and fragments that have a mapped mate

    - when both mates are unmapped, the first record of each mate is
      written to 'unmapped_fastq_file'
    - otherwise every alignment of the mapped mate to a reference whose
      name starts with config.GENE_REF_PREFIX gets three extra tags: the
      sequence ("R2") and quality ("Q2") of the unmapped mate and the
      orientation returned by get_gene_orientation.  these alignments
      are written to a temporary BAM file in 'tmp_dir', which is then
      sorted and indexed as 'single_mapped_bam_file'

    chimera_file: accepted but not used by this function
    unmapped_bam_file: path of the BAM file iterated with parse_pe_reads
    single_mapped_bam_file: path of the sorted BAM file to create
    unmapped_fastq_file: path of the FASTQ file to create
    library_type: forwarded to get_gene_orientation
    tmp_dir: directory for the temporary unsorted BAM file

    returns config.JOB_SUCCESS
    """
    unsorted_single_mapped_bam_file = os.path.join(
        tmp_dir, "unsorted_single_mapped_reads.bam")
    # find all reads that need to be remapped to see if they span the
    # breakpoint junction
    fqfh = open(unmapped_fastq_file, "w")
    # annotate mapped reads with sequence/quality of unmapped mate
    bamfh = pysam.Samfile(unmapped_bam_file, "rb")
    singlemap_bamfh = pysam.Samfile(unsorted_single_mapped_bam_file, "wb",
                                    template=bamfh)
    try:
        # get list of 'gene' references in bam file to compare with
        gene_tids = set(tid for tid, refname in enumerate(bamfh.references)
                        if refname.startswith(config.GENE_REF_PREFIX))
        for pe_reads in parse_pe_reads(bamfh):
            # find which of the original reads was unmapped
            r1_unmapped = any(r.is_unmapped for r in pe_reads[0])
            r2_unmapped = any(r.is_unmapped for r in pe_reads[1])
            # if both reads unmapped, then remap to breakpoints
            if r1_unmapped and r2_unmapped:
                for readnum in (0, 1):
                    r = pe_reads[readnum][0]
                    print >>fqfh, to_fastq(r.qname, readnum, r.seq, r.qual)
                continue
            # annotate the mapped reads with the seq/qual of the
            # unmapped reads
            mapped_readnum = 0 if r2_unmapped else 1
            unmapped_readnum = 1 if r2_unmapped else 0
            unmapped_seq = pe_reads[unmapped_readnum][0].seq
            unmapped_qual = pe_reads[unmapped_readnum][0].qual
            for r in pe_reads[mapped_readnum]:
                # only consider gene mappings
                if r.rname not in gene_tids:
                    continue
                orientation = get_gene_orientation(r, library_type)
                # TODO: may need to REVERSE read here to get original
                r.tags = r.tags + [("R2", unmapped_seq),
                                   ("Q2", unmapped_qual),
                                   (ORIENTATION_TAG_NAME, orientation)]
                singlemap_bamfh.write(r)
    finally:
        # the unsorted BAM must be closed before it can be sorted; the
        # input BAM and the FASTQ output are released here as well
        singlemap_bamfh.close()
        bamfh.close()
        fqfh.close()
    # sort/index the annotated single-mapper unmapped reads by
    # reference/position
    logging.debug("Sorting single-mapped mates by reference")
    single_mapped_bam_prefix = os.path.splitext(single_mapped_bam_file)[0]
    pysam.sort("-m", str(int(1e9)), unsorted_single_mapped_bam_file,
               single_mapped_bam_prefix)
    pysam.index(single_mapped_bam_file)
    # remove unsorted file
    if os.path.exists(unsorted_single_mapped_bam_file):
        os.remove(unsorted_single_mapped_bam_file)
    return config.JOB_SUCCESS
def extract_single_mapped_reads(chimera_file, unmapped_bam_file,
                                single_mapped_bam_file, unmapped_fastq_file,
                                library_type, tmp_dir):
    """
    splits the fragments of 'unmapped_bam_file' into fragments where
    both mates are unmapped and fragments that have a mapped mate

    - when both mates are unmapped, the first record of each mate is
      written to 'unmapped_fastq_file'
    - otherwise every alignment of the mapped mate gets three extra
      tags: the sequence ("R2") and quality ("Q2") of the unmapped mate
      and the orientation returned by get_orientation.  these alignments
      are written to a temporary BAM file in 'tmp_dir', which is then
      sorted and indexed as 'single_mapped_bam_file'

    chimera_file: accepted but not used by this function
    unmapped_bam_file: path of the BAM file iterated with parse_pe_reads
    single_mapped_bam_file: path of the sorted BAM file to create
    unmapped_fastq_file: path of the FASTQ file to create
    library_type: forwarded to get_orientation
    tmp_dir: directory for the temporary unsorted BAM file

    returns config.JOB_SUCCESS
    """
    unsorted_single_mapped_bam_file = os.path.join(
        tmp_dir, "unsorted_single_mapped_reads.bam")
    # find all reads that need to be remapped to see if they span the
    # breakpoint junction
    fqfh = open(unmapped_fastq_file, "w")
    # annotate mapped reads with sequence/quality of unmapped mate
    bamfh = pysam.Samfile(unmapped_bam_file, "rb")
    singlemap_bamfh = pysam.Samfile(unsorted_single_mapped_bam_file, "wb",
                                    template=bamfh)
    try:
        for pe_reads in parse_pe_reads(bamfh):
            # find which of the original reads was unmapped
            r1_unmapped = any(r.is_unmapped for r in pe_reads[0])
            r2_unmapped = any(r.is_unmapped for r in pe_reads[1])
            # if both reads unmapped, then remap to breakpoints
            if r1_unmapped and r2_unmapped:
                for readnum in (0, 1):
                    r = pe_reads[readnum][0]
                    print >>fqfh, to_fastq(r.qname, readnum, r.seq, r.qual)
                continue
            # annotate the mapped reads with the seq/qual of the
            # unmapped reads
            mapped_readnum = 0 if r2_unmapped else 1
            unmapped_readnum = 1 if r2_unmapped else 0
            unmapped_seq = pe_reads[unmapped_readnum][0].seq
            unmapped_qual = pe_reads[unmapped_readnum][0].qual
            for r in pe_reads[mapped_readnum]:
                orientation = get_orientation(r, library_type)
                # TODO: may need to REVERSE read here to get original
                r.tags = r.tags + [("R2", unmapped_seq),
                                   ("Q2", unmapped_qual),
                                   (ORIENTATION_TAG_NAME, orientation)]
                singlemap_bamfh.write(r)
    finally:
        # the unsorted BAM must be closed before it can be sorted; the
        # input BAM and the FASTQ output are released here as well
        singlemap_bamfh.close()
        bamfh.close()
        fqfh.close()
    # sort/index the annotated single-mapper unmapped reads by
    # reference/position
    logging.debug("Sorting single-mapped mates by reference")
    single_mapped_bam_prefix = os.path.splitext(single_mapped_bam_file)[0]
    pysam.sort("-m", str(int(1e9)), unsorted_single_mapped_bam_file,
               single_mapped_bam_prefix)
    pysam.index(single_mapped_bam_file)
    # remove unsorted file
    if os.path.exists(unsorted_single_mapped_bam_file):
        os.remove(unsorted_single_mapped_bam_file)
    return config.JOB_SUCCESS
def nominate_unmapped_spanning_reads(unmapped_bam_file, output_fastq_file):
    """
    writes to 'output_fastq_file' one FASTQ record for every mate in
    'unmapped_bam_file' that has at least one unmapped alignment record

    the record is built by to_fastq from the mate's first alignment
    (name, sequence, quality) and the mate's index within the fragment

    returns config.JOB_SUCCESS
    """
    # reads that need to be remapped to see if they span the
    # breakpoint junction end up in this file
    fastq_out = open(output_fastq_file, "w")
    bam_in = pysam.Samfile(unmapped_bam_file, "rb")
    for fragment in parse_pe_reads(bam_in):
        mate_index = 0
        for alignments in fragment:
            # remap all unmapped reads
            unmapped_flags = (aln.is_unmapped for aln in alignments)
            if any(unmapped_flags):
                first = fragment[mate_index][0]
                record = to_fastq(first.qname, mate_index, first.seq,
                                  first.qual)
                print >>fastq_out, record
            mate_index += 1
    bam_in.close()
    fastq_out.close()
    return config.JOB_SUCCESS
def nominate_unmapped_spanning_reads(unmapped_bam_file, output_fastq_file):
    """
    writes to 'output_fastq_file' one FASTQ record for every mate in
    'unmapped_bam_file' that has at least one unmapped alignment record

    the record is built by to_fastq from the mate's first alignment
    (name, sequence, quality) and the mate's index within the fragment

    returns config.JOB_SUCCESS
    """
    # reads that need to be remapped to see if they span the
    # breakpoint junction end up in this file
    fastq_out = open(output_fastq_file, "w")
    bam_in = pysam.Samfile(unmapped_bam_file, "rb")
    for fragment in parse_pe_reads(bam_in):
        mate_index = 0
        for alignments in fragment:
            # remap all unmapped reads
            unmapped_flags = (aln.is_unmapped for aln in alignments)
            if any(unmapped_flags):
                first = fragment[mate_index][0]
                record = to_fastq(first.qname, mate_index, first.seq,
                                  first.qual)
                print >>fastq_out, record
            mate_index += 1
    bam_in.close()
    fastq_out.close()
    return config.JOB_SUCCESS
def transcriptome_to_genome(genome_index, transcripts, input_file,
                            output_file, library_type, input_sam,
                            output_sam):
    """
    reads paired-end fragments from the input alignment file, converts
    them with convert_read_pairs / convert_unpaired_reads and writes the
    converted reads to the output file

    all arguments are forwarded unchanged to _setup_and_open_files, which
    supplies the input handle, the output handle and the transcript tid
    map used for the conversion

    returns config.JOB_SUCCESS
    """
    # setup and open files
    handles = _setup_and_open_files(genome_index, transcripts, input_file,
                                    output_file, library_type, input_sam,
                                    output_sam)
    infh, outfh, transcript_tid_map = handles
    # now convert BAM reads
    logging.debug("Converting transcriptome to genome BAM")
    n_paired = 0
    n_unpaired = 0
    for fragment in parse_pe_reads(infh):
        pairs, unpaired_reads = group_read_pairs(fragment)
        if len(pairs) > 0:
            # at least one pairing exists: convert and write the pairs
            n_paired += 1
            converted_pairs = convert_read_pairs(pairs, transcript_tid_map,
                                                 library_type)
            for mate1, mate2 in converted_pairs:
                outfh.write(mate1)
                outfh.write(mate2)
            continue
        # no pairing: convert the reads individually
        n_unpaired += 1
        converted_reads = convert_unpaired_reads(unpaired_reads,
                                                 transcript_tid_map,
                                                 library_type)
        for read in converted_reads:
            outfh.write(read)
    logging.debug("Paired fragments: %d" % (n_paired))
    logging.debug("Unpaired fragments: %d" % (n_unpaired))
    outfh.close()
    infh.close()
    return config.JOB_SUCCESS
def find_discordant_fragments(input_bam_file, gene_paired_bam_file,
                              genome_paired_bam_file, unmapped_bam_file,
                              complex_bam_file, index_dir, max_isize,
                              library_type):
    """
    parses BAM file and categorizes reads into several groups:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)
    - discordant genome alignments (unannotated)

    fragments for which annotate_multihits returns a true value for
    either mate are written to 'unmapped_bam_file'.  for the other
    fragments, gene pairs and genome pairs returned by
    classify_read_pairs are written to 'gene_paired_bam_file' and
    'genome_paired_bam_file'; when there are neither, the unpaired reads
    are written to 'complex_bam_file'

    input_bam_file: path of the BAM file iterated with parse_pe_reads
    index_dir: directory containing config.GENE_FEATURE_FILE
    max_isize, library_type: forwarded to classify_read_pairs

    this function has no explicit return value
    """
    logging.info("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s" % (input_bam_file))
    logging.debug("\tMax insert size: '%d'" % (max_isize))
    logging.debug("\tLibrary type: '%s'" % (library_type))
    logging.debug("\tGene paired file: %s" % (gene_paired_bam_file))
    logging.debug("\tGenome paired file: %s" % (genome_paired_bam_file))
    logging.debug("\tUnmapped file: %s" % (unmapped_bam_file))
    logging.debug("\tComplex file: %s" % (complex_bam_file))
    # setup input and output files
    bamfh = pysam.Samfile(input_bam_file, "rb")
    genefh = pysam.Samfile(gene_paired_bam_file, "wb", template=bamfh)
    genomefh = pysam.Samfile(genome_paired_bam_file, "wb", template=bamfh)
    unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
    complexfh = pysam.Samfile(complex_bam_file, "wb", template=bamfh)
    try:
        gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
        # build a lookup table to get all the overlapping transcripts
        # given a transcript 'tid'
        genefile_fh = open(gene_file)
        try:
            tid_tx_cluster_map = build_tid_tx_cluster_map(
                bamfh, genefile_fh, rname_prefix=config.GENE_REF_PREFIX)
        finally:
            genefile_fh.close()
        # build a lookup table to get genome coordinates from transcript
        # coordinates (the gene file is read a second time from the start)
        genefile_fh = open(gene_file)
        try:
            tid_genome_map = build_tid_to_genome_map(
                bamfh, genefile_fh, rname_prefix=config.GENE_REF_PREFIX)
        finally:
            genefile_fh.close()
        for pe_reads in parse_pe_reads(bamfh):
            # add hit index and number of multimaps information to read
            # tags.  this function also checks for unmapped reads
            any_unmapped = False
            for reads in pe_reads:
                # call the function before combining the flags so that
                # both mates are processed even when the first one is
                # already known to be unmapped
                mate_unmapped = annotate_multihits(bamfh, reads,
                                                   tid_genome_map)
                any_unmapped = any_unmapped or mate_unmapped
            if any_unmapped:
                # write to output as discordant reads and continue to
                # next fragment
                write_pe_reads(unmappedfh, pe_reads)
                continue
            # examine all read pairing combinations and rule out invalid
            # pairings.  this returns gene pairs and genome pairs
            gene_pairs, genome_pairs, unpaired_reads = \
                classify_read_pairs(pe_reads, max_isize, library_type,
                                    tid_genome_map, tid_tx_cluster_map)
            if len(gene_pairs) > 0 or len(genome_pairs) > 0:
                write_pairs(genefh, gene_pairs)
                write_pairs(genomefh, genome_pairs)
            else:
                write_pe_reads(complexfh, unpaired_reads)
    finally:
        genefh.close()
        genomefh.close()
        unmappedfh.close()
        complexfh.close()
        bamfh.close()
    logging.info("Finished pairing reads")
def find_discordant_fragments(transcripts, input_bam_file, paired_bam_file,
                              discordant_bam_file, unpaired_bam_file,
                              unmapped_bam_file, multimap_bam_file,
                              unresolved_bam_file, max_isize,
                              max_multihits, library_type):
    """
    parses BAM file and categorizes reads into several groups:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)

    every fragment of 'input_bam_file' goes to exactly one output, chosen
    from the per-mate hit counts returned by
    count_transcriptome_multimaps:
    - either mate above 'max_multihits': 'multimap_bam_file'
    - both mates with zero hits: 'unmapped_bam_file'
    - one mate with zero hits: 'unpaired_bam_file'
    - otherwise classify_read_pairs decides between concordant pairs
      ('paired_bam_file'), discordant pairs ('discordant_bam_file') and
      no valid pairing ('unresolved_bam_file')

    transcripts: transcript features used to build the lookup tables
    max_isize, library_type: forwarded to classify_read_pairs

    returns config.JOB_SUCCESS
    """
    logging.debug("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s" % (input_bam_file))
    logging.debug("\tMax insert size: '%d'" % (max_isize))
    logging.debug("\tLibrary type: '%s'" % (library_type))
    logging.debug("\tPaired BAM file: %s" % (paired_bam_file))
    logging.debug("\tUnpaired BAM file: %s" % (unpaired_bam_file))
    logging.debug("\tUnmapped BAM file: %s" % (unmapped_bam_file))
    logging.debug("\tMultimap BAM file: %s" % (multimap_bam_file))
    logging.debug("\tUnresolved BAM file: %s" % (unresolved_bam_file))
    # setup input and output files
    bamfh = pysam.Samfile(input_bam_file, "rb")
    pairedfh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
    discordantfh = pysam.Samfile(discordant_bam_file, "wb", template=bamfh)
    unpairedfh = pysam.Samfile(unpaired_bam_file, "wb", template=bamfh)
    unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
    multimapfh = pysam.Samfile(multimap_bam_file, "wb", template=bamfh)
    unresolvedfh = pysam.Samfile(unresolved_bam_file, "wb", template=bamfh)
    num_unmapped = 0
    num_unpaired = 0
    num_multimap = 0
    num_paired = 0
    num_discordant = 0
    num_unresolved = 0
    try:
        logging.debug("Building transcript lookup tables")
        # build a lookup table from bam tid index to transcript object
        tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
        # build a transcript to genome coordinate map
        tid_tx_genome_map = build_tid_transcript_genome_map(bamfh,
                                                            transcripts)
        logging.debug("Parsing and classifying reads")
        for pe_reads in parse_pe_reads(bamfh):
            # count multimapping
            mate_num_hits = [0, 0]
            for rnum, reads in enumerate(pe_reads):
                mate_num_hits[rnum] = count_transcriptome_multimaps(
                    bamfh, reads, tid_tx_genome_map)
            most_hits = max(mate_num_hits)
            if most_hits > max_multihits:
                # if either mate has many genome mappings then write
                # the reads to the multimapping bam file
                write_pe_reads(multimapfh, pe_reads)
                num_multimap += 1
            elif most_hits == 0:
                # if both mates unmapped write to unmapped bam file
                write_pe_reads(unmappedfh, pe_reads)
                num_unmapped += 1
            elif min(mate_num_hits) == 0:
                # if one or other mate unmapped then write to the
                # unpaired bam file
                write_unpaired_reads(pe_reads, mate_num_hits, library_type,
                                     unpairedfh)
                num_unpaired += 1
            else:
                # examine all read pairing combinations and rule out
                # invalid pairings
                concordant_pairs, discordant_pairs, unpaired_reads = \
                    classify_read_pairs(pe_reads, max_isize, library_type,
                                        tid_tx_map)
                # NOTE(review): write_pairs is called with the file
                # handle last here - confirm against its definition
                if len(concordant_pairs) > 0:
                    write_pairs(concordant_pairs, pairedfh)
                    num_paired += 1
                elif len(discordant_pairs) > 0:
                    write_pairs(discordant_pairs, discordantfh)
                    num_discordant += 1
                else:
                    # both reads in the pair mapped, but no pairings
                    # could be resolved.  file handle goes first, as in
                    # the other write_pe_reads calls of this function
                    write_pe_reads(unresolvedfh, unpaired_reads)
                    num_unresolved += 1
    finally:
        pairedfh.close()
        discordantfh.close()
        unpairedfh.close()
        unmappedfh.close()
        multimapfh.close()
        unresolvedfh.close()
        bamfh.close()
    logging.debug("Finished pairing reads")
    logging.debug("\tUnmapped fragments: %d" % (num_unmapped))
    logging.debug("\tMultimapping fragments: %d" % (num_multimap))
    logging.debug("\tUnpaired fragments: %d" % (num_unpaired))
    logging.debug("\tUnresolvable mapped fragments: %d" % (num_unresolved))
    logging.debug("\tDiscordant fragments: %d" % (num_discordant))
    logging.debug("\tPaired fragments: %d" % (num_paired))
    return config.JOB_SUCCESS
def pair_discordant_clusters(discordant_bam_file, cluster_pair_file, tmp_dir):
    """
    Enumerate the 5'/3' cluster pairs supported by discordant reads.

    The cluster-annotated BAM file is sorted by read name, then for each
    fragment every read tagged ORIENTATION_5P is combined with every other
    read of that fragment.  The DISCORDANT_CLUSTER_TAG values of each
    combination are written to a temporary text file, which is sorted on
    the two cluster ids and grouped by parse_and_group_cluster_pairs().

    Arguments:
    discordant_bam_file -- path of the cluster-annotated BAM file
    cluster_pair_file   -- path of the tab-delimited output file; each line
                           holds a running pair id (starting at 0), the 5'
                           cluster id, the 3' cluster id and the
                           comma-joined read names
    tmp_dir             -- directory that receives the intermediate files

    Returns config.JOB_SUCCESS.  The intermediate files are removed before
    returning, also when an error is raised.

    NOTE(review): this file contains a second definition of
    pair_discordant_clusters further down; the later definition is the
    one bound to the name once the module has been imported.
    """
    # Only the base name of the input goes into the temporary prefix:
    # joining tmp_dir with a path that still carries directories either
    # discards tmp_dir (absolute input) or points into a subdirectory of
    # tmp_dir that was never created (relative input).
    bam_stem = os.path.splitext(os.path.basename(discordant_bam_file))[0]
    qname_sorted_bam_prefix = os.path.join(tmp_dir, bam_stem + ".byname")
    qname_sorted_bam_file = qname_sorted_bam_prefix + ".bam"
    tmp_cluster_file = os.path.join(tmp_dir, "tmp_clusters.txt")
    tmp_sorted_cluster_file = os.path.join(tmp_dir, "tmp_clusters.srt.txt")

    def sortfunc(line):
        # sort key: (5' cluster id, 3' cluster id), compared as strings
        fields = line.strip().split('\t')
        return (fields[0], fields[1])

    try:
        #
        # sort the BAM file that has cluster annotations by read name
        #
        logging.debug("Sorting newly annotated discordant BAM file by "
                      "read name")
        pysam.sort("-n", "-m", str(int(1e9)), discordant_bam_file,
                   qname_sorted_bam_prefix)
        #
        # iterate through named-sorted bam file write cluster pairs
        #
        logging.debug("Enumerating cluster pairs")
        tmp_cluster_fh = open(tmp_cluster_file, 'w')
        try:
            bamfh = pysam.Samfile(qname_sorted_bam_file, "rb")
            try:
                for pe_reads in parse_pe_reads(bamfh):
                    # group into 5' and 3' reads; anything not tagged as
                    # 5' is treated as 3'
                    reads5p = []
                    reads3p = []
                    for reads in pe_reads:
                        for r in reads:
                            if r.opt(ORIENTATION_TAG) == ORIENTATION_5P:
                                reads5p.append(r)
                            else:
                                reads3p.append(r)
                    # iterate through possible pairs
                    for r5p in reads5p:
                        for r3p in reads3p:
                            id5p = r5p.opt(DISCORDANT_CLUSTER_TAG)
                            id3p = r3p.opt(DISCORDANT_CLUSTER_TAG)
                            fields = (id5p, id3p, r5p.qname)
                            tmp_cluster_fh.write(
                                '\t'.join(map(str, fields)) + '\n')
            finally:
                bamfh.close()
        finally:
            tmp_cluster_fh.close()
        #
        # sort cluster pairs
        #
        logging.debug("Sorting cluster pairs")
        batch_sort(input=tmp_cluster_file,
                   output=tmp_sorted_cluster_file,
                   key=sortfunc,
                   buffer_size=32000,
                   tempdirs=[tmp_dir])
        #
        # write cluster pairs
        #
        logging.debug("Grouping cluster pairs")
        outfh = open(cluster_pair_file, "w")
        try:
            # keep a reference to the sorted file so that it is closed
            # before the file is deleted below
            sorted_fh = open(tmp_sorted_cluster_file)
            try:
                grouped = parse_and_group_cluster_pairs(sorted_fh)
                for pair_id, (id5p, id3p, qnames) in enumerate(grouped):
                    fields = [pair_id, id5p, id3p, ','.join(qnames)]
                    outfh.write('\t'.join(map(str, fields)) + '\n')
            finally:
                sorted_fh.close()
        finally:
            outfh.close()
    finally:
        # remove temporary files
        for path in (qname_sorted_bam_file, tmp_cluster_file,
                     tmp_sorted_cluster_file):
            if os.path.exists(path):
                os.remove(path)
    return config.JOB_SUCCESS
def find_discordant_fragments(transcripts, input_bam_file, paired_bam_file,
                              discordant_bam_file, unpaired_bam_file,
                              unmapped_bam_file, multimap_bam_file,
                              unresolved_bam_file, max_isize, max_multihits,
                              library_type):
    """
    Parse a BAM file of paired-end reads and split the fragments across
    several output BAM files.

    Every fragment yielded by parse_pe_reads() is written to one output,
    tested in this order:

    - multimap:   either mate has more than `max_multihits` hits, as
                  counted by count_transcriptome_multimaps()
    - unmapped:   both mates have zero hits
    - unpaired:   one mate has zero hits
    - paired:     classify_read_pairs() returned a concordant pairing
    - discordant: no concordant pairing, but a discordant one
    - unresolved: both mates have hits but no pairing was returned

    Arguments:
    transcripts         -- passed to build_tid_transcript_map() and
                           build_tid_transcript_genome_map()
    input_bam_file      -- path of the BAM file to read
    *_bam_file          -- paths of the output BAM files; each is created
                           using the input BAM as header template
    max_isize           -- forwarded to classify_read_pairs()
    max_multihits       -- hit count above which a fragment is treated as
                           multimapping
    library_type        -- forwarded to write_unpaired_reads() and
                           classify_read_pairs()

    Returns config.JOB_SUCCESS.

    NOTE(review): an earlier definition of find_discordant_fragments
    appears in this file; this later definition is the one bound to the
    name once the module has been imported.
    """
    logging.debug("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s" % (input_bam_file))
    logging.debug("\tMax insert size: '%d'" % (max_isize))
    logging.debug("\tLibrary type: '%s'" % (library_type))
    logging.debug("\tPaired BAM file: %s" % (paired_bam_file))
    logging.debug("\tUnpaired BAM file: %s" % (unpaired_bam_file))
    logging.debug("\tUnmapped BAM file: %s" % (unmapped_bam_file))
    logging.debug("\tMultimap BAM file: %s" % (multimap_bam_file))
    logging.debug("\tUnresolved BAM file: %s" % (unresolved_bam_file))
    # per-category fragment counters, reported once parsing has finished
    num_unmapped = 0
    num_unpaired = 0
    num_multimap = 0
    num_paired = 0
    num_discordant = 0
    num_unresolved = 0
    # setup input and output files; every handle that was opened is closed
    # in the 'finally' clause even when classification fails part way
    bamfh = pysam.Samfile(input_bam_file, "rb")
    output_handles = []

    def open_output(path):
        # output BAM files reuse the header of the input BAM file
        fh = pysam.Samfile(path, "wb", template=bamfh)
        output_handles.append(fh)
        return fh

    try:
        pairedfh = open_output(paired_bam_file)
        discordantfh = open_output(discordant_bam_file)
        unpairedfh = open_output(unpaired_bam_file)
        unmappedfh = open_output(unmapped_bam_file)
        multimapfh = open_output(multimap_bam_file)
        unresolvedfh = open_output(unresolved_bam_file)
        # build lookup tables keyed on the bam tid index
        logging.debug("Building transcript lookup tables")
        tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
        tid_tx_genome_map = build_tid_transcript_genome_map(bamfh,
                                                            transcripts)
        logging.debug("Parsing and classifying reads")
        for pe_reads in parse_pe_reads(bamfh):
            # count multimapping
            mate_num_hits = [0, 0]
            for rnum, reads in enumerate(pe_reads):
                mate_num_hits[rnum] = count_transcriptome_multimaps(
                    bamfh, reads, tid_tx_genome_map)
            if max(mate_num_hits) > max_multihits:
                # if either mate has many genome mappings then write
                # the reads to the multimapping bam file
                write_pe_reads(multimapfh, pe_reads)
                num_multimap += 1
            elif max(mate_num_hits) == 0:
                # if both mates unmapped write to unmapped bam file
                write_pe_reads(unmappedfh, pe_reads)
                num_unmapped += 1
            elif min(mate_num_hits) == 0:
                # if one or other mate unmapped then write to the
                # unpaired bam file
                write_unpaired_reads(pe_reads, mate_num_hits, library_type,
                                     unpairedfh)
                num_unpaired += 1
            else:
                # examine all read pairing combinations and rule out
                # invalid pairings
                concordant_pairs, discordant_pairs, unpaired_reads = \
                    classify_read_pairs(pe_reads, max_isize, library_type,
                                        tid_tx_map)
                if len(concordant_pairs) > 0:
                    write_pairs(concordant_pairs, pairedfh)
                    num_paired += 1
                elif len(discordant_pairs) > 0:
                    write_pairs(discordant_pairs, discordantfh)
                    num_discordant += 1
                else:
                    # both reads in the pair mapped, but no pairings could
                    # be resolved.  The file handle goes first, matching
                    # the other write_pe_reads() calls above (it used to
                    # be passed second here).
                    write_pe_reads(unresolvedfh, unpaired_reads)
                    num_unresolved += 1
    finally:
        for fh in output_handles:
            fh.close()
        bamfh.close()
    logging.debug("Finished pairing reads")
    logging.debug("\tUnmapped fragments: %d" % (num_unmapped))
    logging.debug("\tMultimapping fragments: %d" % (num_multimap))
    logging.debug("\tUnpaired fragments: %d" % (num_unpaired))
    logging.debug("\tUnresolvable mapped fragments: %d" % (num_unresolved))
    logging.debug("\tDiscordant fragments: %d" % (num_discordant))
    logging.debug("\tPaired fragments: %d" % (num_paired))
    return config.JOB_SUCCESS
def pair_discordant_clusters(discordant_bam_file, cluster_pair_file, tmp_dir):
    """
    Enumerate the 5'/3' cluster pairs supported by discordant reads.

    The cluster-annotated BAM file is sorted by read name, then for each
    fragment every read tagged ORIENTATION_5P is combined with every other
    read of that fragment.  The DISCORDANT_CLUSTER_TAG values of each
    combination are written to a temporary text file, which is sorted on
    the two cluster ids and grouped by parse_and_group_cluster_pairs().

    Arguments:
    discordant_bam_file -- path of the cluster-annotated BAM file
    cluster_pair_file   -- path of the tab-delimited output file; each line
                           holds a running pair id (starting at 0), the 5'
                           cluster id, the 3' cluster id and the
                           comma-joined read names
    tmp_dir             -- directory that receives the intermediate files

    Returns config.JOB_SUCCESS.  The intermediate files are removed before
    returning, also when an error is raised.

    NOTE(review): an earlier definition of pair_discordant_clusters
    appears in this file; this later definition is the one bound to the
    name once the module has been imported.
    """
    # Only the base name of the input goes into the temporary prefix:
    # joining tmp_dir with a path that still carries directories either
    # discards tmp_dir (absolute input) or points into a subdirectory of
    # tmp_dir that was never created (relative input).
    bam_stem = os.path.splitext(os.path.basename(discordant_bam_file))[0]
    qname_sorted_bam_prefix = os.path.join(tmp_dir, bam_stem + ".byname")
    qname_sorted_bam_file = qname_sorted_bam_prefix + ".bam"
    tmp_cluster_file = os.path.join(tmp_dir, "tmp_clusters.txt")
    tmp_sorted_cluster_file = os.path.join(tmp_dir, "tmp_clusters.srt.txt")

    def sortfunc(line):
        # sort key: (5' cluster id, 3' cluster id), compared as strings
        fields = line.strip().split('\t')
        return (fields[0], fields[1])

    try:
        #
        # sort the BAM file that has cluster annotations by read name
        #
        logging.debug("Sorting newly annotated discordant BAM file by "
                      "read name")
        pysam.sort("-n", "-m", str(int(1e9)), discordant_bam_file,
                   qname_sorted_bam_prefix)
        #
        # iterate through named-sorted bam file write cluster pairs
        #
        logging.debug("Enumerating cluster pairs")
        tmp_cluster_fh = open(tmp_cluster_file, 'w')
        try:
            bamfh = pysam.Samfile(qname_sorted_bam_file, "rb")
            try:
                for pe_reads in parse_pe_reads(bamfh):
                    # group into 5' and 3' reads; anything not tagged as
                    # 5' is treated as 3'
                    reads5p = []
                    reads3p = []
                    for reads in pe_reads:
                        for r in reads:
                            if r.opt(ORIENTATION_TAG) == ORIENTATION_5P:
                                reads5p.append(r)
                            else:
                                reads3p.append(r)
                    # iterate through possible pairs
                    for r5p in reads5p:
                        for r3p in reads3p:
                            id5p = r5p.opt(DISCORDANT_CLUSTER_TAG)
                            id3p = r3p.opt(DISCORDANT_CLUSTER_TAG)
                            fields = (id5p, id3p, r5p.qname)
                            tmp_cluster_fh.write(
                                '\t'.join(map(str, fields)) + '\n')
            finally:
                bamfh.close()
        finally:
            tmp_cluster_fh.close()
        #
        # sort cluster pairs
        #
        logging.debug("Sorting cluster pairs")
        batch_sort(input=tmp_cluster_file,
                   output=tmp_sorted_cluster_file,
                   key=sortfunc,
                   buffer_size=32000,
                   tempdirs=[tmp_dir])
        #
        # write cluster pairs
        #
        logging.debug("Grouping cluster pairs")
        outfh = open(cluster_pair_file, "w")
        try:
            # keep a reference to the sorted file so that it is closed
            # before the file is deleted below
            sorted_fh = open(tmp_sorted_cluster_file)
            try:
                grouped = parse_and_group_cluster_pairs(sorted_fh)
                for pair_id, (id5p, id3p, qnames) in enumerate(grouped):
                    fields = [pair_id, id5p, id3p, ','.join(qnames)]
                    outfh.write('\t'.join(map(str, fields)) + '\n')
            finally:
                sorted_fh.close()
        finally:
            outfh.close()
    finally:
        # remove temporary files
        for path in (qname_sorted_bam_file, tmp_cluster_file,
                     tmp_sorted_cluster_file):
            if os.path.exists(path):
                os.remove(path)
    return config.JOB_SUCCESS