def filter_multihits(transcript_file,
                     input_bam_file,
                     output_bam_file,
                     max_multihits=1):
    """
    Annotate multihits in a BAM file and mark over-mapped mates as unmapped.

    For every fragment yielded by parse_pe_reads(), the reads of each mate
    are passed to annotate_multihits().  When the value returned for a mate
    exceeds 'max_multihits', that mate's reads are replaced by a single copy
    of its first read flagged as unmapped; otherwise the mate's reads are
    written unchanged.

    transcript_file: path of the transcript feature file, read with
        TranscriptFeature.parse()
    input_bam_file: path of the BAM file to read
    output_bam_file: path of the BAM file to write; its header is taken
        from the input file
    max_multihits: largest hit count a mate may have and still be kept

    returns config.JOB_SUCCESS
    """
    # NOTE(review): a function with this same name is defined a second time
    # later in this file; the later definition wins at import time.
    logging.debug("Reading transcript features")
    # close the transcript file as soon as it has been fully parsed
    with open(transcript_file) as transcript_fh:
        transcripts = list(TranscriptFeature.parse(transcript_fh))
    inbamfh = None
    outbamfh = None
    try:
        # open the input BAM and an output BAM sharing the same header
        inbamfh = pysam.Samfile(input_bam_file, "rb")
        outbamfh = pysam.Samfile(output_bam_file, "wb", template=inbamfh)
        # build a transcript to genome coordinate map
        tid_tx_genome_map = build_tid_transcript_genome_map(outbamfh,
                                                            transcripts)
        num_frags = 0
        logging.debug("Annotating and filtering multihits")
        for pe_reads in parse_pe_reads(inbamfh):
            # NOTE(review): find_discordant_fragments() in this file calls
            # annotate_multihits() with a leading BAM handle argument and
            # treats the result as a flag -- confirm which signature is
            # current.
            mate_num_hits = [annotate_multihits(reads, tid_tx_genome_map)
                             for reads in pe_reads]
            new_pe_reads = [[], []]
            # both mates get the same treatment, each looking at the other
            # mate's hit count to decide whether the mate fields are reset
            for mate, other in ((0, 1), (1, 0)):
                if mate_num_hits[mate] <= max_multihits:
                    new_pe_reads[mate] = pe_reads[mate]
                    continue
                # too many hits: keep one copy of the first read, unmapped
                r = copy_read(pe_reads[mate][0])
                r.is_unmapped = True
                r.is_proper_pair = False
                r.is_secondary = False
                r.rname = -1
                r.pos = 0
                if mate_num_hits[other] > max_multihits:
                    # the other mate is being unmapped as well
                    r.mate_is_unmapped = True
                    r.mrnm = -1
                    r.mpos = 0
                new_pe_reads[mate] = [r]
            # write the filtered reads; writing 'pe_reads' here would
            # discard the filtering done above
            for reads in new_pe_reads:
                for r in reads:
                    outbamfh.write(r)
            num_frags += 1
        logging.debug("Found %d fragments" % (num_frags))
    finally:
        # release the BAM handles even when an error interrupts the loop
        for fh in (inbamfh, outbamfh):
            if fh is not None:
                fh.close()
    return config.JOB_SUCCESS
def filter_multihits(transcript_file, input_bam_file, output_bam_file,
                     max_multihits=1):
    """
    Annotate multihits in a BAM file and mark over-mapped mates as unmapped.

    For every fragment yielded by parse_pe_reads(), the reads of each mate
    are passed to annotate_multihits().  When the value returned for a mate
    exceeds 'max_multihits', that mate's reads are replaced by a single copy
    of its first read flagged as unmapped; otherwise the mate's reads are
    written unchanged.

    transcript_file: path of the transcript feature file, read with
        TranscriptFeature.parse()
    input_bam_file: path of the BAM file to read
    output_bam_file: path of the BAM file to write; its header is taken
        from the input file
    max_multihits: largest hit count a mate may have and still be kept

    returns config.JOB_SUCCESS
    """
    # NOTE(review): a function with this same name is also defined earlier
    # in this file; this later definition is the one in effect after import.
    logging.debug("Reading transcript features")
    # close the transcript file as soon as it has been fully parsed
    with open(transcript_file) as transcript_fh:
        transcripts = list(TranscriptFeature.parse(transcript_fh))
    inbamfh = None
    outbamfh = None
    try:
        # open the input BAM and an output BAM sharing the same header
        inbamfh = pysam.Samfile(input_bam_file, "rb")
        outbamfh = pysam.Samfile(output_bam_file, "wb", template=inbamfh)
        # build a transcript to genome coordinate map
        tid_tx_genome_map = build_tid_transcript_genome_map(outbamfh,
                                                            transcripts)
        num_frags = 0
        logging.debug("Annotating and filtering multihits")
        for pe_reads in parse_pe_reads(inbamfh):
            # NOTE(review): find_discordant_fragments() in this file calls
            # annotate_multihits() with a leading BAM handle argument and
            # treats the result as a flag -- confirm which signature is
            # current.
            mate_num_hits = [annotate_multihits(reads, tid_tx_genome_map)
                             for reads in pe_reads]
            new_pe_reads = [[], []]
            # both mates get the same treatment, each looking at the other
            # mate's hit count to decide whether the mate fields are reset
            for mate, other in ((0, 1), (1, 0)):
                if mate_num_hits[mate] <= max_multihits:
                    new_pe_reads[mate] = pe_reads[mate]
                    continue
                # too many hits: keep one copy of the first read, unmapped
                r = copy_read(pe_reads[mate][0])
                r.is_unmapped = True
                r.is_proper_pair = False
                r.is_secondary = False
                r.rname = -1
                r.pos = 0
                if mate_num_hits[other] > max_multihits:
                    # the other mate is being unmapped as well
                    r.mate_is_unmapped = True
                    r.mrnm = -1
                    r.mpos = 0
                new_pe_reads[mate] = [r]
            # write the filtered reads; writing 'pe_reads' here would
            # discard the filtering done above
            for reads in new_pe_reads:
                for r in reads:
                    outbamfh.write(r)
            num_frags += 1
        logging.debug("Found %d fragments" % (num_frags))
    finally:
        # release the BAM handles even when an error interrupts the loop
        for fh in (inbamfh, outbamfh):
            if fh is not None:
                fh.close()
    return config.JOB_SUCCESS
def find_discordant_fragments(input_bam_file, paired_bam_file,
                              unmapped_bam_file, index_dir, max_isize,
                              library_type):
    """
    Parse a BAM file fragment by fragment and split it into two outputs.

    Each fragment yielded by parse_pe_reads() is first annotated with
    annotate_multihits().  If that call reports a true value for any mate,
    the whole fragment is written to 'unmapped_bam_file'.  Otherwise the
    fragment is handed to classify_read_pairs() and any gene pairs it
    returns are written to 'paired_bam_file'.

    input_bam_file: path of the BAM file to read
    paired_bam_file: path of the BAM file receiving gene pairs
    unmapped_bam_file: path of the BAM file receiving fragments flagged by
        annotate_multihits()
    index_dir: directory containing config.TRANSCRIPT_FEATURE_FILE
    max_isize: maximum insert size, passed to classify_read_pairs()
    library_type: library type, passed to classify_read_pairs()

    returns config.JOB_SUCCESS
    """
    # NOTE(review): a function with this same name is defined a second time
    # later in this file; the later definition wins at import time.
    logging.info("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s" % (input_bam_file))
    logging.debug("\tMax insert size: '%d'" % (max_isize))
    logging.debug("\tLibrary type: '%s'" % (library_type))
    logging.debug("\tGene paired file: %s" % (paired_bam_file))
    logging.debug("\tUnmapped file: %s" % (unmapped_bam_file))
    bamfh = None
    genefh = None
    unmappedfh = None
    try:
        # setup input and output files; outputs reuse the input header
        bamfh = pysam.Samfile(input_bam_file, "rb")
        genefh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
        unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
        # read transcript features, closing the file once fully parsed
        logging.debug("Reading transcript features")
        transcript_file = os.path.join(index_dir,
                                       config.TRANSCRIPT_FEATURE_FILE)
        with open(transcript_file) as transcript_fh:
            transcripts = list(TranscriptFeature.parse(transcript_fh))
        logging.debug("Building transcript lookup tables")
        # build a lookup table from bam tid index to transcript object
        tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
        # build a transcript to genome coordinate map
        tid_tx_genome_map = build_tid_transcript_genome_map(bamfh,
                                                            transcripts)
        logging.info("Parsing reads")
        for pe_reads in parse_pe_reads(bamfh):
            # add hit index and multimap information to read tags
            # this function also checks for unmapped reads
            any_unmapped = False
            for reads in pe_reads:
                if annotate_multihits(bamfh, reads, tid_tx_genome_map):
                    any_unmapped = True
                    # NOTE(review): as in the original short-circuit 'or',
                    # mates after the first flagged one are not annotated
                    # before being written -- confirm this is intended.
                    break
            if any_unmapped:
                # write to output as discordant reads and continue to
                # next fragment
                write_pe_reads(unmappedfh, pe_reads)
                continue
            # examine all read pairing combinations and rule out invalid
            # pairings
            gene_pairs, _unpaired_reads = classify_read_pairs(
                pe_reads, max_isize, library_type, tid_tx_map)
            if len(gene_pairs) > 0:
                write_pairs(genefh, gene_pairs)
            # TODO: do something with unpaired discordant reads?
    finally:
        # release every handle that was opened, even after an error
        for fh in (genefh, unmappedfh, bamfh):
            if fh is not None:
                fh.close()
    logging.info("Finished pairing reads")
    return config.JOB_SUCCESS
def find_discordant_fragments(input_bam_file, paired_bam_file,
                              unmapped_bam_file, index_dir, max_isize,
                              library_type):
    """
    Parse a BAM file fragment by fragment and split it into two outputs.

    Each fragment yielded by parse_pe_reads() is first annotated with
    annotate_multihits().  If that call reports a true value for any mate,
    the whole fragment is written to 'unmapped_bam_file'.  Otherwise the
    fragment is handed to classify_read_pairs() and any gene pairs it
    returns are written to 'paired_bam_file'.

    input_bam_file: path of the BAM file to read
    paired_bam_file: path of the BAM file receiving gene pairs
    unmapped_bam_file: path of the BAM file receiving fragments flagged by
        annotate_multihits()
    index_dir: directory containing config.TRANSCRIPT_FEATURE_FILE
    max_isize: maximum insert size, passed to classify_read_pairs()
    library_type: library type, passed to classify_read_pairs()

    returns config.JOB_SUCCESS
    """
    # NOTE(review): a function with this same name is also defined earlier
    # in this file; this later definition is the one in effect after import.
    logging.info("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s" % (input_bam_file))
    logging.debug("\tMax insert size: '%d'" % (max_isize))
    logging.debug("\tLibrary type: '%s'" % (library_type))
    logging.debug("\tGene paired file: %s" % (paired_bam_file))
    logging.debug("\tUnmapped file: %s" % (unmapped_bam_file))
    bamfh = None
    genefh = None
    unmappedfh = None
    try:
        # setup input and output files; outputs reuse the input header
        bamfh = pysam.Samfile(input_bam_file, "rb")
        genefh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
        unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
        # read transcript features, closing the file once fully parsed
        logging.debug("Reading transcript features")
        transcript_file = os.path.join(index_dir,
                                       config.TRANSCRIPT_FEATURE_FILE)
        with open(transcript_file) as transcript_fh:
            transcripts = list(TranscriptFeature.parse(transcript_fh))
        logging.debug("Building transcript lookup tables")
        # build a lookup table from bam tid index to transcript object
        tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
        # build a transcript to genome coordinate map
        tid_tx_genome_map = build_tid_transcript_genome_map(bamfh,
                                                            transcripts)
        logging.info("Parsing reads")
        for pe_reads in parse_pe_reads(bamfh):
            # add hit index and multimap information to read tags
            # this function also checks for unmapped reads
            any_unmapped = False
            for reads in pe_reads:
                if annotate_multihits(bamfh, reads, tid_tx_genome_map):
                    any_unmapped = True
                    # NOTE(review): as in the original short-circuit 'or',
                    # mates after the first flagged one are not annotated
                    # before being written -- confirm this is intended.
                    break
            if any_unmapped:
                # write to output as discordant reads and continue to
                # next fragment
                write_pe_reads(unmappedfh, pe_reads)
                continue
            # examine all read pairing combinations and rule out invalid
            # pairings
            gene_pairs, _unpaired_reads = classify_read_pairs(
                pe_reads, max_isize, library_type, tid_tx_map)
            if len(gene_pairs) > 0:
                write_pairs(genefh, gene_pairs)
            # TODO: do something with unpaired discordant reads?
    finally:
        # release every handle that was opened, even after an error
        for fh in (genefh, unmappedfh, bamfh):
            if fh is not None:
                fh.close()
    logging.info("Finished pairing reads")
    return config.JOB_SUCCESS