def discordant_reads_to_bedpe(index_dir, input_bam_file, output_file):
    """Write gene-discordant read pairs from a BAM file as BEDPE-style rows.

    For every (5', 3') read pair yielded by ``parse_gene_discordant_reads``
    one tab-delimited line is written to ``output_file`` containing, in
    order: the 5' transcript id, 5' ``pos`` and ``aend``, the 3' transcript
    id, 3' ``pos`` and ``aend``, the 5' read name, a constant score of 0,
    the two transcript strands, and finally the ``DiscordantRead`` fields
    of each mate joined with ``'|'``.

    Parameters:
        index_dir: directory holding ``config.TRANSCRIPT_FEATURE_FILE``.
        input_bam_file: path of the BAM file to read.
        output_file: path of the text file to (over)write.

    All file handles are released even if parsing or writing fails.
    """
    # open BAM alignment file
    bamfh = pysam.Samfile(input_bam_file, "rb")
    try:
        # build a lookup table to get genomic intervals from transcripts
        logging.debug("Reading transcript features")
        transcript_file = os.path.join(index_dir,
                                       config.TRANSCRIPT_FEATURE_FILE)
        # materialize the features inside the 'with' so the parser has
        # finished consuming the file before it is closed
        with open(transcript_file) as txfh:
            transcripts = list(TranscriptFeature.parse(txfh))
        tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
        with open(output_file, "w") as outfh:
            logging.debug("Converting BAM to BEDPE format")
            for r5p, r3p in parse_gene_discordant_reads(bamfh):
                # store pertinent read information in lightweight structure
                # called DiscordantRead object. this departs from SAM
                # format into a custom read format
                dr5p = DiscordantRead.from_read(r5p)
                dr3p = DiscordantRead.from_read(r3p)
                # get gene information
                tx5p = tid_tx_map[r5p.rname]
                tx3p = tid_tx_map[r3p.rname]
                # write bedpe format
                fields = [
                    tx5p.tx_id, r5p.pos, r5p.aend,
                    tx3p.tx_id, r3p.pos, r3p.aend,
                    r5p.qname,                 # read name
                    0,                         # score
                    tx5p.strand, tx3p.strand,  # strand 1, strand 2
                ]
                fields.append('|'.join(map(str, dr5p.to_list())))
                fields.append('|'.join(map(str, dr3p.to_list())))
                # explicit write() instead of the 'print >>fh' statement so
                # the same line is produced under Python 2 and Python 3
                outfh.write('\t'.join(map(str, fields)) + '\n')
    finally:
        bamfh.close()
def find_discordant_fragments(input_bam_file, paired_bam_file,
                              unmapped_bam_file, index_dir,
                              max_isize, library_type):
    """
    parses BAM file and categorizes reads into several groups:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)

    Fragments for which ``annotate_multihits`` reports an unmapped read are
    written to ``unmapped_bam_file``. For the remaining fragments the gene
    pairs returned by ``classify_read_pairs`` are written to
    ``paired_bam_file``. Both outputs use the input BAM as header template.

    Parameters:
        input_bam_file: path of the BAM file to read.
        paired_bam_file: path of the BAM file receiving gene pairs.
        unmapped_bam_file: path of the BAM file receiving fragments with
            unmapped reads.
        index_dir: directory holding ``config.TRANSCRIPT_FEATURE_FILE``.
        max_isize: maximum insert size, forwarded to
            ``classify_read_pairs``.
        library_type: library type, forwarded to ``classify_read_pairs``.

    Returns:
        ``config.JOB_SUCCESS`` once all fragments were processed.

    All file handles are closed even when processing raises.
    """
    logging.info("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s", input_bam_file)
    logging.debug("\tMax insert size: '%d'", max_isize)
    logging.debug("\tLibrary type: '%s'", library_type)
    logging.debug("\tGene paired file: %s", paired_bam_file)
    logging.debug("\tUnmapped file: %s", unmapped_bam_file)
    # setup input and output files
    bamfh = pysam.Samfile(input_bam_file, "rb")
    genefh = None
    unmappedfh = None
    try:
        genefh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
        unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
        # read transcript features
        logging.debug("Reading transcript features")
        transcript_file = os.path.join(index_dir,
                                       config.TRANSCRIPT_FEATURE_FILE)
        # materialize the features inside the 'with' so the parser has
        # finished consuming the file before it is closed
        with open(transcript_file) as txfh:
            transcripts = list(TranscriptFeature.parse(txfh))
        logging.debug("Building transcript lookup tables")
        # build a lookup table from bam tid index to transcript object
        tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
        # build a transcript to genome coordinate map
        tid_tx_genome_map = build_tid_transcript_genome_map(bamfh,
                                                            transcripts)
        logging.info("Parsing reads")
        for pe_reads in parse_pe_reads(bamfh):
            # add hit index and multimap information to read tags
            # this function also checks for unmapped reads
            # NOTE(review): 'or' short-circuits, so once one mate is
            # reported unmapped the remaining mates are not passed to
            # annotate_multihits - confirm this is intended
            any_unmapped = False
            for reads in pe_reads:
                any_unmapped = (any_unmapped or
                                annotate_multihits(bamfh, reads,
                                                   tid_tx_genome_map))
            if any_unmapped:
                # write to output as discordant reads and continue to
                # next fragment
                write_pe_reads(unmappedfh, pe_reads)
                continue
            # examine all read pairing combinations and rule out invalid
            # pairings
            gene_pairs, unpaired_reads = classify_read_pairs(
                pe_reads, max_isize, library_type, tid_tx_map)
            if len(gene_pairs) > 0:
                write_pairs(genefh, gene_pairs)
            # TODO: do something with unpaired discordant reads?
    finally:
        # close in the same order as before; skip handles that were never
        # opened because an earlier open failed
        for fh in (genefh, unmappedfh, bamfh):
            if fh is not None:
                fh.close()
    logging.info("Finished pairing reads")
    return config.JOB_SUCCESS
def find_discordant_fragments(input_bam_file, paired_bam_file,
                              unmapped_bam_file, index_dir,
                              max_isize, library_type):
    """
    parses BAM file and categorizes reads into several groups:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)

    Fragments for which ``annotate_multihits`` reports an unmapped read are
    written to ``unmapped_bam_file``. For the remaining fragments the gene
    pairs returned by ``classify_read_pairs`` are written to
    ``paired_bam_file``. Both outputs use the input BAM as header template.

    Parameters:
        input_bam_file: path of the BAM file to read.
        paired_bam_file: path of the BAM file receiving gene pairs.
        unmapped_bam_file: path of the BAM file receiving fragments with
            unmapped reads.
        index_dir: directory holding ``config.TRANSCRIPT_FEATURE_FILE``.
        max_isize: maximum insert size, forwarded to
            ``classify_read_pairs``.
        library_type: library type, forwarded to ``classify_read_pairs``.

    Returns:
        ``config.JOB_SUCCESS`` once all fragments were processed.

    All file handles are closed even when processing raises.
    """
    logging.info("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s", input_bam_file)
    logging.debug("\tMax insert size: '%d'", max_isize)
    logging.debug("\tLibrary type: '%s'", library_type)
    logging.debug("\tGene paired file: %s", paired_bam_file)
    logging.debug("\tUnmapped file: %s", unmapped_bam_file)
    # setup input and output files
    bamfh = pysam.Samfile(input_bam_file, "rb")
    genefh = None
    unmappedfh = None
    try:
        genefh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
        unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
        # read transcript features
        logging.debug("Reading transcript features")
        transcript_file = os.path.join(index_dir,
                                       config.TRANSCRIPT_FEATURE_FILE)
        # materialize the features inside the 'with' so the parser has
        # finished consuming the file before it is closed
        with open(transcript_file) as txfh:
            transcripts = list(TranscriptFeature.parse(txfh))
        logging.debug("Building transcript lookup tables")
        # build a lookup table from bam tid index to transcript object
        tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
        # build a transcript to genome coordinate map
        tid_tx_genome_map = build_tid_transcript_genome_map(bamfh,
                                                            transcripts)
        logging.info("Parsing reads")
        for pe_reads in parse_pe_reads(bamfh):
            # add hit index and multimap information to read tags
            # this function also checks for unmapped reads
            # NOTE(review): 'or' short-circuits, so once one mate is
            # reported unmapped the remaining mates are not passed to
            # annotate_multihits - confirm this is intended
            any_unmapped = False
            for reads in pe_reads:
                any_unmapped = (any_unmapped or
                                annotate_multihits(bamfh, reads,
                                                   tid_tx_genome_map))
            if any_unmapped:
                # write to output as discordant reads and continue to
                # next fragment
                write_pe_reads(unmappedfh, pe_reads)
                continue
            # examine all read pairing combinations and rule out invalid
            # pairings
            gene_pairs, unpaired_reads = classify_read_pairs(
                pe_reads, max_isize, library_type, tid_tx_map)
            if len(gene_pairs) > 0:
                write_pairs(genefh, gene_pairs)
            # TODO: do something with unpaired discordant reads?
    finally:
        # close in the same order as before; skip handles that were never
        # opened because an earlier open failed
        for fh in (genefh, unmappedfh, bamfh):
            if fh is not None:
                fh.close()
    logging.info("Finished pairing reads")
    return config.JOB_SUCCESS