def filter_multihits(transcript_file, input_bam_file, output_bam_file,
                     max_multihits=1):
    """
    Copy a paired-end BAM file, collapsing mates that have too many hits.

    Every fragment yielded by parse_pe_reads() is annotated with
    annotate_multihits(). A mate whose returned value exceeds
    `max_multihits` is replaced in the output by a single copy of its first
    alignment, flagged as unmapped (no reference, position 0). When both
    mates exceed the limit, the mate fields of each replacement record are
    cleared as well. Mates within the limit are written unchanged.

    transcript_file: path to the transcript feature file read by
        TranscriptFeature.parse()
    input_bam_file: path of the BAM file to read
    output_bam_file: path of the BAM file to write (header is taken from
        the input file)
    max_multihits: largest per-mate value from annotate_multihits() that
        is still kept as-is

    Returns config.JOB_SUCCESS.
    """
    logging.debug("Reading transcript features")
    # close the feature file as soon as it has been parsed
    with open(transcript_file) as transcript_fh:
        transcripts = list(TranscriptFeature.parse(transcript_fh))
    # input is read as BAM; output reuses the input header
    inbamfh = pysam.Samfile(input_bam_file, "rb")
    try:
        outbamfh = pysam.Samfile(output_bam_file, "wb", template=inbamfh)
        try:
            # build a transcript to genome coordinate map
            tid_tx_genome_map = build_tid_transcript_genome_map(outbamfh,
                                                                transcripts)
            num_frags = 0
            logging.debug("Annotating and filtering multihits")
            for pe_reads in parse_pe_reads(inbamfh):
                mate_num_hits = [annotate_multihits(reads, tid_tx_genome_map)
                                 for reads in pe_reads]
                over_limit = [mate_num_hits[0] > max_multihits,
                              mate_num_hits[1] > max_multihits]
                new_pe_reads = [[], []]
                for mate, other in ((0, 1), (1, 0)):
                    if not over_limit[mate]:
                        # NOTE(review): the kept reads still carry their
                        # original mate fields even if the other mate is
                        # replaced by an unmapped record -- confirm this is
                        # what downstream tools expect
                        new_pe_reads[mate] = pe_reads[mate]
                        continue
                    # too many hits: keep one record and mark it unmapped
                    r = copy_read(pe_reads[mate][0])
                    r.is_unmapped = True
                    r.is_proper_pair = False
                    r.is_secondary = False
                    r.rname = -1
                    r.pos = 0
                    if over_limit[other]:
                        # the other mate is being unmapped too
                        r.mate_is_unmapped = True
                        r.mrnm = -1
                        r.mpos = 0
                    new_pe_reads[mate] = [r]
                # write the filtered reads; writing `pe_reads` here would
                # silently discard all of the filtering done above
                for reads in new_pe_reads:
                    for r in reads:
                        outbamfh.write(r)
                num_frags += 1
            logging.debug("Found %d fragments" % (num_frags))
        finally:
            outbamfh.close()
    finally:
        inbamfh.close()
    return config.JOB_SUCCESS
def filter_multihits(transcript_file, input_bam_file, output_bam_file,
                     max_multihits=1):
    """
    Copy a paired-end BAM file, collapsing mates that have too many hits.

    Every fragment yielded by parse_pe_reads() is annotated with
    annotate_multihits(). A mate whose returned value exceeds
    `max_multihits` is replaced in the output by a single copy of its first
    alignment, flagged as unmapped (no reference, position 0). When both
    mates exceed the limit, the mate fields of each replacement record are
    cleared as well. Mates within the limit are written unchanged.

    transcript_file: path to the transcript feature file read by
        TranscriptFeature.parse()
    input_bam_file: path of the BAM file to read
    output_bam_file: path of the BAM file to write (header is taken from
        the input file)
    max_multihits: largest per-mate value from annotate_multihits() that
        is still kept as-is

    Returns config.JOB_SUCCESS.
    """
    logging.debug("Reading transcript features")
    # close the feature file as soon as it has been parsed
    with open(transcript_file) as transcript_fh:
        transcripts = list(TranscriptFeature.parse(transcript_fh))
    # input is read as BAM; output reuses the input header
    inbamfh = pysam.Samfile(input_bam_file, "rb")
    try:
        outbamfh = pysam.Samfile(output_bam_file, "wb", template=inbamfh)
        try:
            # build a transcript to genome coordinate map
            tid_tx_genome_map = build_tid_transcript_genome_map(outbamfh,
                                                                transcripts)
            num_frags = 0
            logging.debug("Annotating and filtering multihits")
            for pe_reads in parse_pe_reads(inbamfh):
                mate_num_hits = [annotate_multihits(reads, tid_tx_genome_map)
                                 for reads in pe_reads]
                over_limit = [mate_num_hits[0] > max_multihits,
                              mate_num_hits[1] > max_multihits]
                new_pe_reads = [[], []]
                for mate, other in ((0, 1), (1, 0)):
                    if not over_limit[mate]:
                        # NOTE(review): the kept reads still carry their
                        # original mate fields even if the other mate is
                        # replaced by an unmapped record -- confirm this is
                        # what downstream tools expect
                        new_pe_reads[mate] = pe_reads[mate]
                        continue
                    # too many hits: keep one record and mark it unmapped
                    r = copy_read(pe_reads[mate][0])
                    r.is_unmapped = True
                    r.is_proper_pair = False
                    r.is_secondary = False
                    r.rname = -1
                    r.pos = 0
                    if over_limit[other]:
                        # the other mate is being unmapped too
                        r.mate_is_unmapped = True
                        r.mrnm = -1
                        r.mpos = 0
                    new_pe_reads[mate] = [r]
                # write the filtered reads; writing `pe_reads` here would
                # silently discard all of the filtering done above
                for reads in new_pe_reads:
                    for r in reads:
                        outbamfh.write(r)
                num_frags += 1
            logging.debug("Found %d fragments" % (num_frags))
        finally:
            outbamfh.close()
    finally:
        inbamfh.close()
    return config.JOB_SUCCESS
def find_discordant_fragments(input_bam_file, paired_bam_file,
                              unmapped_bam_file, index_dir,
                              max_isize, library_type):
    """
    parses BAM file and categorizes reads into several groups:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)

    input_bam_file: path of the BAM file to read
    paired_bam_file: path of the BAM file that receives the pairs returned
        by classify_read_pairs()
    unmapped_bam_file: path of the BAM file that receives every fragment
        for which annotate_multihits() reported an unmapped mate
    index_dir: directory containing config.TRANSCRIPT_FEATURE_FILE
    max_isize: maximum insert size, passed to classify_read_pairs()
    library_type: library type, passed to classify_read_pairs()

    Returns config.JOB_SUCCESS.
    """
    logging.info("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s" % (input_bam_file))
    logging.debug("\tMax insert size: '%d'" % (max_isize))
    logging.debug("\tLibrary type: '%s'" % (library_type))
    logging.debug("\tGene paired file: %s" % (paired_bam_file))
    logging.debug("\tUnmapped file: %s" % (unmapped_bam_file))
    # read transcript features; the file is closed once parsing is done
    logging.debug("Reading transcript features")
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    with open(transcript_file) as transcript_fh:
        transcripts = list(TranscriptFeature.parse(transcript_fh))
    # setup input and output files; track what was opened so that every
    # handle is closed even if a later step raises
    opened = []
    try:
        bamfh = pysam.Samfile(input_bam_file, "rb")
        opened.append(bamfh)
        genefh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
        opened.append(genefh)
        unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
        opened.append(unmappedfh)
        logging.debug("Building transcript lookup tables")
        # build a lookup table from bam tid index to transcript object
        tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
        # build a transcript to genome coordinate map
        tid_tx_genome_map = build_tid_transcript_genome_map(bamfh,
                                                            transcripts)
        logging.info("Parsing reads")
        for pe_reads in parse_pe_reads(bamfh):
            # add hit index and multimap information to read tags
            # this function also checks for unmapped reads
            any_unmapped = False
            for reads in pe_reads:
                # call the annotator before combining the flags: writing
                # `any_unmapped or annotate_multihits(...)` short-circuits
                # and leaves the second mate unannotated whenever the
                # first mate is reported as unmapped
                unmapped = annotate_multihits(bamfh, reads,
                                              tid_tx_genome_map)
                any_unmapped = any_unmapped or unmapped
            if any_unmapped:
                # write to output as discordant reads and continue to
                # next fragment
                write_pe_reads(unmappedfh, pe_reads)
                continue
            # examine all read pairing combinations and rule out invalid
            # pairings
            gene_pairs, unpaired_reads = classify_read_pairs(
                pe_reads, max_isize, library_type, tid_tx_map)
            if len(gene_pairs) > 0:
                write_pairs(genefh, gene_pairs)
            # TODO: do something with unpaired discordant reads?
    finally:
        # close outputs first, then the input
        for fh in reversed(opened):
            fh.close()
    logging.info("Finished pairing reads")
    return config.JOB_SUCCESS
def find_discordant_fragments(input_bam_file, paired_bam_file,
                              unmapped_bam_file, index_dir,
                              max_isize, library_type):
    """
    parses BAM file and categorizes reads into several groups:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)

    input_bam_file: path of the BAM file to read
    paired_bam_file: path of the BAM file that receives the pairs returned
        by classify_read_pairs()
    unmapped_bam_file: path of the BAM file that receives every fragment
        for which annotate_multihits() reported an unmapped mate
    index_dir: directory containing config.TRANSCRIPT_FEATURE_FILE
    max_isize: maximum insert size, passed to classify_read_pairs()
    library_type: library type, passed to classify_read_pairs()

    Returns config.JOB_SUCCESS.
    """
    logging.info("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s" % (input_bam_file))
    logging.debug("\tMax insert size: '%d'" % (max_isize))
    logging.debug("\tLibrary type: '%s'" % (library_type))
    logging.debug("\tGene paired file: %s" % (paired_bam_file))
    logging.debug("\tUnmapped file: %s" % (unmapped_bam_file))
    # read transcript features; the file is closed once parsing is done
    logging.debug("Reading transcript features")
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    with open(transcript_file) as transcript_fh:
        transcripts = list(TranscriptFeature.parse(transcript_fh))
    # setup input and output files; track what was opened so that every
    # handle is closed even if a later step raises
    opened = []
    try:
        bamfh = pysam.Samfile(input_bam_file, "rb")
        opened.append(bamfh)
        genefh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
        opened.append(genefh)
        unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
        opened.append(unmappedfh)
        logging.debug("Building transcript lookup tables")
        # build a lookup table from bam tid index to transcript object
        tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
        # build a transcript to genome coordinate map
        tid_tx_genome_map = build_tid_transcript_genome_map(bamfh,
                                                            transcripts)
        logging.info("Parsing reads")
        for pe_reads in parse_pe_reads(bamfh):
            # add hit index and multimap information to read tags
            # this function also checks for unmapped reads
            any_unmapped = False
            for reads in pe_reads:
                # call the annotator before combining the flags: writing
                # `any_unmapped or annotate_multihits(...)` short-circuits
                # and leaves the second mate unannotated whenever the
                # first mate is reported as unmapped
                unmapped = annotate_multihits(bamfh, reads,
                                              tid_tx_genome_map)
                any_unmapped = any_unmapped or unmapped
            if any_unmapped:
                # write to output as discordant reads and continue to
                # next fragment
                write_pe_reads(unmappedfh, pe_reads)
                continue
            # examine all read pairing combinations and rule out invalid
            # pairings
            gene_pairs, unpaired_reads = classify_read_pairs(
                pe_reads, max_isize, library_type, tid_tx_map)
            if len(gene_pairs) > 0:
                write_pairs(genefh, gene_pairs)
            # TODO: do something with unpaired discordant reads?
    finally:
        # close outputs first, then the input
        for fh in reversed(opened):
            fh.close()
    logging.info("Finished pairing reads")
    return config.JOB_SUCCESS