def main():
    from optparse import OptionParser
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <in.bam> <out.bam>")
    parser.add_option('--min-fragment-length',
                      dest="min_fragment_length",
                      type="int",
                      default=50)
    parser.add_option('--max-fragment-length',
                      dest="max_fragment_length",
                      type="int",
                      default=1000)
    parser.add_option('--library', dest="library_type", default="fr")
    #parser.add_option('--unpaired-bam', dest="unpaired_bam_file", default=None)
    options, args = parser.parse_args()
    input_bam_file = args[0]
    output_bam_file = args[1]
    logging.info("Merging read pairs")
    logging.debug("Input file: %s" % (input_bam_file))
    logging.debug("Output file: %s" % (output_bam_file))
    logging.debug("Library type: '%s'" % (options.library_type))
    library_type = parse_library_type(options.library_type)
    bamfh = pysam.Samfile(input_bam_file, "rb")
    outfh = pysam.Samfile(output_bam_file, "wb", template=bamfh)
    #outfh = pysam.Samfile("-", "w", template=bamfh)
    merge_read_pairs(bamfh, outfh, options.min_fragment_length,
                     options.max_fragment_length, library_type)
    logging.info("Paired-end merging completed")
Example #2
def sam_to_bam(input_fastq_file,
               input_sam_file,
               output_bam_file,
               multihits,
               mode,
               keep_unmapped=True):
    samfh = pysam.Samfile(input_sam_file, "r")
    if mode == "pe":
        fix_iter = fix_pe_alignment_ordering(samfh,
                                             open(input_fastq_file),
                                             is_paired=True)
    elif mode == "pesr":
        fix_iter = fix_pe_sr_alignment_ordering(samfh, open(input_fastq_file))
    elif mode == "sr":
        fix_iter = fix_pe_alignment_ordering(samfh,
                                             open(input_fastq_file),
                                             is_paired=False)
    else:
        raise ValueError("unknown mode '%s'" % (mode))
    num_unmapped = 0
    num_multihits = 0
    num_frags = 0
    bamfh = pysam.Samfile(output_bam_file, "wb", template=samfh)
    for frags in fix_iter:
        num_frags += 1
        for reads in frags:
            un, mh = write_reads_to_bam(reads, bamfh, multihits, keep_unmapped)
            num_unmapped += un
            num_multihits += mh
    bamfh.close()
    samfh.close()
    logging.debug("Found %d fragments" % (num_frags))
    logging.debug("\t%d unmapped reads" % (num_unmapped))
    logging.debug("\t%d multimapping (>%dX) reads" %
                  (num_multihits, multihits))
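The unmapped/multihit tallies above key off the integer XM tag carried by unmapped records, comparing it to the multihit ceiling. A minimal standalone sketch of the same classification (the function name is illustrative, not part of the original):

def classify_read(r, multihits):
    # mirrors the bookkeeping above: unmapped records whose XM count is
    # below the ceiling are genuinely unmapped; the rest were suppressed
    # for mapping to too many locations
    if not r.is_unmapped:
        return "mapped"
    if r.opt('XM') < multihits:
        return "unmapped"
    return "multimapped"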
Example #3
def join_segmented_alignments(input_sam_file, input_fastq_file,
                              output_bam_file, is_paired):
    # setup debugging logging messages
    debug_count = 0
    debug_every = 1e6
    debug_next = debug_every
    # open sam file
    infh = pysam.Samfile(input_sam_file, "r")
    #header = infh.header
    outfh = pysam.Samfile(output_bam_file, "wb", template=infh)
    tid_type_map = get_tid_ref_types(outfh)
    #outfh = pysam.Samfile("-", "w", template=infh)
    # iterate through paired-end alignments
    logging.info("Processing paired alignments")
    align_iter = fix_segmented_alignment_ordering(infh, open(input_fastq_file),
                                                  is_paired)
    for segmented_pe_reads in align_iter:
        debug_count += 1
        if debug_count == debug_next:
            debug_next += debug_every
            logging.debug("Processed %d reads" % debug_count)
        # get alignments
        for mate, mate_segs in enumerate(segmented_pe_reads):
            # search for segment matches
            joined_hits = find_valid_segment_alignments(mate_segs)
            num_hits = len(joined_hits)
            #print 'HITS', num_hits
            for hit_index, split_hits in enumerate(joined_hits):
                # total number of splits
                num_splits = len(split_hits)
                #print 'HIT', hit_index, 'SPLITS', len(split_hits)
                for split_index, seg_hits in enumerate(split_hits):
                    num_seg_hits = len(seg_hits)
                    split_reads = []
                    multimaps = 0
                    #print 'SPLIT', split_index, 'HITS', num_seg_hits
                    for seg_index, seg_reads in enumerate(seg_hits):
                        # make SAM record for each segment
                        tags = [(SamTags.RTAG_NUM_PARTITIONS, num_hits),
                                (SamTags.RTAG_PARTITION_IND, hit_index),
                                (SamTags.RTAG_NUM_SPLITS, num_splits),
                                (SamTags.RTAG_SPLIT_IND, split_index),
                                (SamTags.RTAG_NUM_MAPPINGS, num_seg_hits),
                                (SamTags.RTAG_MAPPING_IND, seg_index)]
                        r = make_joined_read(mate, seg_reads, tags=tags)
                        split_reads.append(r)
                        # TODO: keep track of multimaps using the number of
                        # genome hits as a proxy (this is not perfect, since
                        # splice junction reads could be multimapping)
                        if tid_type_map[r.rname] == REF_GENOME:
                            multimaps += 1
                    # output reads now that multimappings have been computed
                    for r in split_reads:
                        if not r.is_unmapped:
                            r.tags = r.tags + [("NH", multimaps)]
                        outfh.write(r)
def transcriptome_to_genome(input_sam_file, output_sam_file,
                            gene_to_genome_map):
    insamfh = pysam.Samfile(input_sam_file, "r")
    new_header, gene_table = build_translation_table(insamfh,
                                                     gene_to_genome_map)
    outsamfh = pysam.Samfile(output_sam_file, "wh", header=new_header)
    for read in translate_multihit_reads(insamfh, gene_table):
        outsamfh.write(read)
    outsamfh.close()
    insamfh.close()
def filter_multihits(transcript_file,
                     input_bam_file,
                     output_bam_file,
                     max_multihits=1):
    logging.debug("Reading transcript features")
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    # parse and convert sam -> bam
    inbamfh = pysam.Samfile(input_bam_file, "rb")
    outbamfh = pysam.Samfile(output_bam_file, "wb", template=inbamfh)
    # build a transcript to genome coordinate map
    tid_tx_genome_map = build_tid_transcript_genome_map(outbamfh, transcripts)
    num_frags = 0
    logging.debug("Annotating and filtering multihits")
    for pe_reads in parse_pe_reads(inbamfh):
        mate_num_hits = []
        for reads in pe_reads:
            num_hits = annotate_multihits(reads, tid_tx_genome_map)
            mate_num_hits.append(num_hits)
        # replace a mate's alignments with one unmapped placeholder when it
        # exceeds the multihit limit; if the other mate also exceeds the
        # limit, flag the mate side as unmapped too
        new_pe_reads = [[], []]
        for mate in (0, 1):
            other = 1 - mate
            if mate_num_hits[mate] > max_multihits:
                r = copy_read(pe_reads[mate][0])
                r.is_unmapped = True
                r.is_proper_pair = False
                r.is_secondary = False
                r.rname = -1
                r.pos = 0
                if mate_num_hits[other] > max_multihits:
                    r.mate_is_unmapped = True
                    r.mrnm = -1
                    r.mpos = 0
                new_pe_reads[mate] = [r]
            else:
                new_pe_reads[mate] = pe_reads[mate]
        for reads in new_pe_reads:
            for r in reads:
                outbamfh.write(r)
        num_frags += 1
    logging.debug("Found %d fragments" % (num_frags))
    inbamfh.close()
    outbamfh.close()
    return config.JOB_SUCCESS
Example #6
def sam_to_bam(input_fastq_files,
               input_sam_file,
               output_bam_file,
               quals,
               multihits,
               pe_sr_mode=False,
               softclip=True,
               keep_unmapped=True):
    samfh = pysam.Samfile(input_sam_file, "r")
    num_unmapped = 0
    num_multihits = 0
    num_frags = 0
    bamfh = pysam.Samfile(output_bam_file, "wb", template=samfh)
    # setup fastq parsing
    if softclip and (quals != SANGER_FORMAT):
        kwargs = {"convert_quals": True, "qual_format": quals}
    else:
        kwargs = {"convert_quals": False}
    fqiters = [
        parse_fastq_record(open(fq), **kwargs) for fq in input_fastq_files
    ]

    # handle single-read and paired-end
    if len(fqiters) == 1:
        reorder_func = fix_sr_alignment_ordering(samfh, fqiters[0])
    else:
        reorder_func = fix_alignment_ordering(samfh, fqiters, pe_sr_mode)
    # iterate through buffer
    for bufitems in reorder_func:
        num_frags += 1
        for bufitem in bufitems:
            for r in bufitem.reads:
                # softclip uses the fastq record to replace the sequence
                # and quality scores of the read
                if softclip:
                    soft_pad_read(bufitem.fqrec, r)
                # keep statistics of unmapped/multimapped reads and
                # suppress output if 'keep_unmapped' is False
                if r.is_unmapped:
                    xm_tag = r.opt('XM')
                    if xm_tag < multihits:
                        num_unmapped += 1
                        if not keep_unmapped:
                            continue
                    else:
                        num_multihits += 1
                bamfh.write(r)
    for fqfh in fqiters:
        fqfh.close()
    bamfh.close()
    samfh.close()
    logging.debug("Found %d fragments" % (num_frags))
    logging.debug("\t%d unmapped reads" % (num_unmapped))
    logging.debug("\t%d multimapping (>%dX) reads" %
                  (num_multihits, multihits))
Example #7
def extend_and_pad_sam(input_fastq_files, input_sam_file, output_sam_file):
    infh = pysam.Samfile(input_sam_file, "r")
    outfh = pysam.Samfile(output_sam_file, "w", template=infh)

# the lines below are detached from their enclosing function in the
# original listing; shown here as a tag-update helper (name assumed)
def update_read_tags(r, tags):
    tagdict = dict(r.tags)
    # TODO: bug in pysam handling CP tag, fix by forcing to integer
    if "CP" in tagdict:
        tagdict["CP"] = int(tagdict["CP"])
    # add additional tags
    tagdict.update(tags)
    r.tags = tagdict.items()
def find_discordant_fragments(input_bam_file, paired_bam_file,
                              unmapped_bam_file, index_dir, max_isize,
                              library_type):
    """
    parses BAM file and categorizes reads into several groups:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)
    """
    logging.info("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s" % (input_bam_file))
    logging.debug("\tMax insert size: '%d'" % (max_isize))
    logging.debug("\tLibrary type: '%s'" % (library_type))
    logging.debug("\tGene paired file: %s" % (paired_bam_file))
    logging.debug("\tUnmapped file: %s" % (unmapped_bam_file))
    # setup input and output files
    bamfh = pysam.Samfile(input_bam_file, "rb")
    genefh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
    unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
    # read transcript features
    logging.debug("Reading transcript features")
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    logging.debug("Building transcript lookup tables")
    # build a lookup table from bam tid index to transcript object
    tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
    # build a transcript to genome coordinate map
    tid_tx_genome_map = build_tid_transcript_genome_map(bamfh, transcripts)
    logging.info("Parsing reads")
    for pe_reads in parse_pe_reads(bamfh):
        # add hit index and multimap information to read tags
        # this function also checks for unmapped reads
        any_unmapped = False
        for reads in pe_reads:
            any_unmapped = (any_unmapped or annotate_multihits(
                bamfh, reads, tid_tx_genome_map))
        if any_unmapped:
            # write to output as discordant reads and continue to
            # next fragment
            write_pe_reads(unmappedfh, pe_reads)
            continue
        # examine all read pairing combinations and rule out invalid pairings
        gene_pairs, unpaired_reads = classify_read_pairs(
            pe_reads, max_isize, library_type, tid_tx_map)
        if len(gene_pairs) > 0:
            write_pairs(genefh, gene_pairs)
        # TODO: do something with unpaired discordant reads?
    genefh.close()
    unmappedfh.close()
    bamfh.close()
    logging.info("Finished pairing reads")
    return config.JOB_SUCCESS
Example #9
def process_tophat_alignments(fastq_files,
                              bam_file,
                              gene_file,
                              max_fragment_length,
                              output_fastq_files,
                              output_bam_file,
                              unpaired=False,
                              suffix="/"):
    # index genes
    exon_intervals, exon_trees = build_exon_interval_trees(gene_file)
    # open input files
    bamfh = pysam.Samfile(bam_file, "rb")
    if unpaired:
        bam_iter = parse_unpaired_pe_reads(bamfh)
    else:
        bam_iter = parse_pe_reads(bamfh)
    fastq_iters = [parse_fastq(open(fq)) for fq in fastq_files]
    # open output files
    outfq = [open(fq, "w") for fq in output_fastq_files]
    outbamfh = pysam.Samfile(output_bam_file, "wb", template=bamfh)
    # iterate through fastq files and bam file
    try:
        while True:
            bam_pe_reads = bam_iter.next()
            # synchronize fastq and bam and write unmapped reads to a file
            is_unaligned = synchronize_bam_fastq(bam_pe_reads, fastq_iters,
                                                 outfq, suffix)
            if is_unaligned:
                continue
            # if loop reaches this point then we have a paired-end
            # read where both pairs align.  now need to check if
            # the alignment is discordant
            tx_concordant, gene_concordant = \
                is_concordant(bamfh, bam_pe_reads, exon_intervals,
                              exon_trees, max_fragment_length)
            if not gene_concordant:
                for r in bam_pe_reads[0]:
                    outbamfh.write(r)
                for r in bam_pe_reads[1]:
                    outbamfh.write(r)
    except StopIteration:
        pass
    # finish remaining fastq lines
    try:
        while True:
            fqreads = [it.next() for it in fastq_iters]
            print >> outfq[0], fastq_to_string(fqreads[0])
            print >> outfq[1], fastq_to_string(fqreads[1])
    except StopIteration:
        pass
    return config.JOB_SUCCESS
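The try/except StopIteration blocks above are the Python 2 idiom for driving iterators by hand until they are exhausted. A standalone sketch of the same pattern (names illustrative):

def drain(it):
    # collect items until the iterator raises StopIteration, the same way
    # the alignment and fastq loops above terminate
    items = []
    try:
        while True:
            items.append(it.next())
    except StopIteration:
        pass
    return items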
def realign_genome_reads(input_bam_file, output_bam_file, gene_file):
    # build a map of gene name to genome coords
    logging.info("Reading gene index")
    infh = pysam.Samfile(input_bam_file, "rb")
    gene_tid_list = get_gene_tids(infh)
    exon_trees = build_exon_trees(infh, gene_file)
    outfh = pysam.Samfile("-", "w", template=infh)
    #outfh = pysam.Samfile(output_bam_file, "wb", template=infh)
    for pe_reads in parse_pe_sam_file(infh):
        for mate_partitions in pe_reads:
            for splits in mate_partitions:
                for reads in splits:
                    for r in realign_split_reads(reads, gene_tid_list,
                                                 exon_trees):
                        outfh.write(r)
def discordant_reads_to_bedpe(index_dir, input_bam_file, output_file):
    # open BAM alignment file
    bamfh = pysam.Samfile(input_bam_file, "rb")
    # build a lookup table to get genomic intervals from transcripts
    logging.debug("Reading transcript features")
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
    outfh = open(output_file, "w")    
    logging.debug("Converting BAM to BEDPE format")
    for r5p,r3p in parse_gene_discordant_reads(bamfh):
        # store pertinent read information in lightweight structure called
        # DiscordantRead object. this departs from SAM format into a 
        # custom read format
        dr5p = DiscordantRead.from_read(r5p)
        dr3p = DiscordantRead.from_read(r3p)
        # get gene information
        tx5p = tid_tx_map[r5p.rname]
        tx3p = tid_tx_map[r3p.rname]
        # write bedpe format
        fields = [tx5p.tx_id, r5p.pos, r5p.aend,
                  tx3p.tx_id, r3p.pos, r3p.aend,
                  r5p.qname,  # read name
                  0, # score
                  tx5p.strand, tx3p.strand, # strand 1, strand 2
                  ]
        fields.append('|'.join(map(str, dr5p.to_list())))
        fields.append('|'.join(map(str, dr3p.to_list())))  
        print >>outfh, '\t'.join(map(str, fields)) 
    outfh.close()
    bamfh.close()
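Each line written above is BEDPE: ten standard columns followed by two custom columns carrying the '|'-joined DiscordantRead fields. A minimal reader sketch with column names inferred from the writer (not part of the original):

def parse_bedpe_line(line):
    f = line.rstrip('\n').split('\t')
    return {'tx_id_5p': f[0], 'start_5p': int(f[1]), 'end_5p': int(f[2]),
            'tx_id_3p': f[3], 'start_3p': int(f[4]), 'end_3p': int(f[5]),
            'qname': f[6], 'score': int(f[7]),
            'strand_5p': f[8], 'strand_3p': f[9],
            # custom columns: '|'-joined DiscordantRead fields
            'read_5p': f[10].split('|'), 'read_3p': f[11].split('|')}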
Example #12
def filter_chimeras(input_file, output_file, index_dir, bam_file,
                    weighted_unique_frags, median_isize, max_isize,
                    isoform_fraction, false_pos_file):
    logging.debug("Filtering Parameters")
    logging.debug("\tweighted unique fragments: %f" % (weighted_unique_frags))
    logging.debug("\tmedian insert size: %d" % (median_isize))
    logging.debug("\tmax insert size allowed: %d" % (max_isize))
    logging.debug("\tfraction of wild-type isoform: %f" % (isoform_fraction))
    logging.debug("\tfalse positive chimeras file: %s" % (false_pos_file))
    # get false positive chimera list
    if (false_pos_file is not None) and (false_pos_file != ""):
        logging.debug("Parsing false positive chimeras")
        false_pos_pairs = read_false_pos_file(false_pos_file)
    else:
        false_pos_pairs = set()
    # open BAM file for checking wild-type isoform
    bamfh = pysam.Samfile(bam_file, "rb")
    # filter chimeras
    logging.debug("Checking chimeras")
    num_chimeras = 0
    num_filtered_chimeras = 0
    tmp_file = make_temp(os.path.dirname(output_file), suffix=".txt")
    f = open(tmp_file, "w")
    for c in Chimera.parse(open(input_file)):
        num_chimeras += 1
        if not filter_weighted_frags(c, weighted_unique_frags):
            continue
        if not filter_inner_dist(c, max_isize):
            continue
        false_pos_key = (c.partner5p.tx_name, c.partner5p.end,
                         c.partner3p.tx_name, c.partner3p.start)
        if false_pos_key in false_pos_pairs:
            continue
        if filter_chimeric_isoform_fraction(c, isoform_fraction,
                                            median_isize, bamfh):
            print >> f, '\t'.join(map(str, c.to_list()))
            num_filtered_chimeras += 1
    f.close()
    logging.debug("Total chimeras: %d" % num_chimeras)
    logging.debug("Filtered chimeras: %d" % num_filtered_chimeras)
    # cleanup memory for false positive chimeras
    del false_pos_pairs
    bamfh.close()
    # find highest coverage chimeras among isoforms
    gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
    kept_chimeras = get_highest_coverage_isoforms(tmp_file, gene_file)
    num_filtered_chimeras = 0
    f = open(output_file, "w")
    for c in Chimera.parse(open(tmp_file)):
        if c.name in kept_chimeras:
            num_filtered_chimeras += 1
            print >> f, '\t'.join(map(str, c.to_list()))
    f.close()
    logging.debug("\tAfter choosing best isoform: %d" % num_filtered_chimeras)
    os.remove(tmp_file)
    return config.JOB_SUCCESS
def nominate_spanning_reads(chimera_file, unmapped_bam_file,
                            output_fastq_file):
    # find all reads that need to be remapped to see if they span the
    # breakpoint junction
    fqfh = open(output_fastq_file, "w")
    remap_qnames = set()
    breaks5p = collections.defaultdict(lambda: [])
    breaks3p = collections.defaultdict(lambda: [])
    for c in Chimera.parse(open(chimera_file)):
        end5p = c.partner5p.end
        start3p = c.partner3p.start
        # keep track of all breakpoints
        breaks5p[c.partner5p.tx_name].append(end5p)
        breaks3p[c.partner3p.tx_name].append(start3p)
        for r5p, r3p in c.encomp_read_pairs:
            # if 5' read overlaps breakpoint then it should be remapped
            if r5p.clipstart < end5p < r5p.clipend:
                key5p = (r5p.qname, r5p.readnum)
                if key5p not in remap_qnames:
                    remap_qnames.add((r5p.qname, r5p.readnum))
                    print >> fqfh, to_fastq(r5p.qname, r5p.readnum, r5p.seq,
                                            "I" * len(r5p.seq))
            # if 3' read overlaps breakpoint then it should be remapped
            if r3p.clipstart < start3p < r3p.clipend:
                key3p = (r3p.qname, r3p.readnum)
                if key3p not in remap_qnames:
                    remap_qnames.add((r3p.qname, r3p.readnum))
                    print >> fqfh, to_fastq(r3p.qname, r3p.readnum, r3p.seq,
                                            "I" * len(r3p.seq))
    # sort breakpoint positions within each gene
    for tx_name in breaks5p.keys():
        breaks5p[tx_name] = sorted(breaks5p[tx_name])
    for tx_name in breaks3p.keys():
        breaks3p[tx_name] = sorted(breaks3p[tx_name])
    # check read pairs with one or both unmapped, and remap those
    # as well
    bamfh = pysam.Samfile(unmapped_bam_file, "rb")
    for pe_reads in parse_pe_reads(bamfh):
        for readnum in xrange(0, 2):
            print >> fqfh, to_fastq(pe_reads[readnum][0].qname, readnum,
                                    pe_reads[readnum][0].seq,
                                    pe_reads[readnum][0].qual)


#            # add unmapped reads
#            if reads[0].is_unmapped:
#                readnum = 2 if reads[0].is_read2 else 1
#                print >>fqfh, to_fastq(reads[0].qname, readnum, reads[0].seq,
#                                       "I" * len(reads[0].seq))
#                # TODO: remove this
#                assert len(reads) == 1
#            else:
#                remap = False
#                for r in reads:
#                    tx_name = config.GENE_REF_PREFIX + bamfh.getrname(r.rname)
#                    # check if this read overlaps a breakpoint
#
#                    bisect()
    bamfh.close()
    fqfh.close()
    return config.JOB_SUCCESS
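The chained comparisons above encode the spanning condition: a read is queued for remapping only when its clipped interval strictly straddles the breakpoint. The predicate in isolation (name illustrative):

def straddles(clipstart, clipend, breakpoint_pos):
    # strict inequalities, as above: a read that merely touches the
    # breakpoint at either end does not span it
    return clipstart < breakpoint_pos < clipend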
Example #14
def bam_to_fastq(bam_file, fastq_files):
    fqfhs = [open(f, "w") for f in fastq_files]
    bamfh = pysam.Samfile(bam_file, "rb")
    for r in bamfh:
        # read1 (or unpaired) records go to the first file, read2 records
        # to the second
        if r.is_read2:
            i = 1
        else:
            i = 0
        record = "@%s\n%s\n+\n%s" % (r.qname, r.seq, r.qual)
        print >> fqfhs[i], record
    bamfh.close()
    for f in fqfhs:
        f.close()
def extract_tophat_encompassing_reads(index_dir, tophat_bam_file,
                                      encompassing_bam_file, max_isize,
                                      library_type):
    gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
    bamfh = pysam.Samfile(tophat_bam_file, "rb")
    for r in bamfh:
        if (r.is_unmapped) or (r.mate_is_unmapped):
            continue
        if r.rname != r.mrnm:
            print r.qname, r.rname, r.pos, r.is_reverse, r.mrnm, r.mpos, r.mate_is_reverse
    bamfh.close()
Example #16
def sam_stdin_to_bam(output_bam_file,
                     input_fastq_file,
                     multihits,
                     is_paired=True,
                     keep_unmapped=True):
    samfh = pysam.Samfile("-", "r")
    bamfh = pysam.Samfile(output_bam_file, "wb", template=samfh)
    num_unmapped = 0
    num_multihits = 0
    if is_paired:
        for pe_reads in fix_pe_alignment_ordering(samfh,
                                                  open(input_fastq_file),
                                                  is_paired=is_paired):
            for reads in pe_reads:
                for r in reads:
                    if r.is_unmapped:
                        xm_tag = r.opt('XM')
                        if xm_tag < multihits:
                            num_unmapped += 1
                            if not keep_unmapped:
                                continue
                        else:
                            num_multihits += 1
                    bamfh.write(r)
    else:
        for reads in fix_sr_alignment_ordering(samfh, open(input_fastq_file)):
            for r in reads:
                if r.is_unmapped:
                    xm_tag = r.opt('XM')
                    if xm_tag < multihits:
                        num_unmapped += 1
                        if not keep_unmapped:
                            continue
                    else:
                        num_multihits += 1
                bamfh.write(r)
    bamfh.close()
    samfh.close()
    logging.debug("[SAMTOBAM] Filtered %d unmapped reads" % (num_unmapped))
    logging.debug("[SAMTOBAM] Found %d multimapping (>%d) reads" %
                  (num_multihits, multihits))
    logging.info("[SAMTOBAM] Finished converting SAM -> BAM")
def nominate_unmapped_spanning_reads(unmapped_bam_file, output_fastq_file):
    # find all reads that need to be remapped to see if they span the
    # breakpoint junction
    fqfh = open(output_fastq_file, "w")
    # check read pairs with one or both unmapped, and remap those
    # as well
    bamfh = pysam.Samfile(unmapped_bam_file, "rb")
    for pe_reads in parse_pe_reads(bamfh):
        # remap all unmapped reads
        for readnum, reads in enumerate(pe_reads):
            if any(r.is_unmapped for r in reads):
                print >> fqfh, to_fastq(pe_reads[readnum][0].qname, readnum,
                                        pe_reads[readnum][0].seq,
                                        pe_reads[readnum][0].qual)

    bamfh.close()
    fqfh.close()
    return config.JOB_SUCCESS
def calc_chimera_pvalues(input_file,
                         bam_file, 
                         num_mapped_reads, 
                         num_discordant_reads_within_isize_range):
    # fraction of mapped reads that are discordant within the insert
    # size range
    percent_discordant = num_discordant_reads_within_isize_range / float(num_mapped_reads)
    # open BAM file for checking wild-type isoforms
    bamfh = pysam.Samfile(bam_file, "rb")
    for c in Chimera.parse(open(input_file)):        
        # count 5' and 3' reads
        rname5p = config.GENE_REF_PREFIX + c.tx_name_5p
        rname3p = config.GENE_REF_PREFIX + c.tx_name_3p        
        num_reads_5p = len(set(r.qname for r in bamfh.fetch(rname5p, c.tx_start_5p, c.tx_end_5p)))
        num_reads_3p = len(set(r.qname for r in bamfh.fetch(rname3p, c.tx_start_3p, c.tx_end_3p)))
        # expected number of discordant reads
        exp_discordant_5p = num_reads_5p * percent_discordant
        exp_discordant_3p = num_reads_3p * percent_discordant
        print c.gene_name_5p, c.gene_name_3p, num_reads_5p, num_reads_3p, exp_discordant_5p, exp_discordant_3p
    bamfh.close()    
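As scraped, calc_chimera_pvalues stops at printing observed and expected discordant counts. One way to turn an expectation into a p-value, under an assumed Poisson model that the original does not specify:

import math

def poisson_sf(k, mu):
    # P(X >= k) for X ~ Poisson(mu): how surprising k observed discordant
    # reads are given an expectation of mu
    term = math.exp(-mu)
    cdf = 0.0
    for i in xrange(k):
        cdf += term
        term *= mu / (i + 1)
    return max(0.0, 1.0 - cdf)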
Example #19
def nominate_chimeras(index_dir, input_bam_file, output_file, trim_bp):
    logging.debug("Reading gene information")
    gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
    bamfh = pysam.Samfile(input_bam_file, "rb")
    # build a lookup table to get genomic intervals from transcripts
    tid_tx_map, genome_tx_trees = build_tid_tx_maps(
        bamfh, gene_file, rname_prefix=config.GENE_REF_PREFIX)
    # group discordant read pairs by gene
    chimera_num = 0
    outfh = open(output_file, "w")
    logging.debug("Parsing discordant reads")
    for tid5p, tid3p, readpairs in parse_gene_chimeric_reads(bamfh):
        c = read_pairs_to_chimera("C%07d" % (chimera_num), tid5p, tid3p,
                                  readpairs, tid_tx_map, genome_tx_trees,
                                  trim_bp)
        fields = c.to_list()
        chimera_num += 1
        print >> outfh, '\t'.join(map(str, fields))
    outfh.close()
    bamfh.close()
def main():
    from optparse import OptionParser
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser("usage: %prog [options] <in.bam>")
    parser.add_option('-i',
                      '--min-fragment-length',
                      dest="min_fragment_length",
                      type="int",
                      default=0)
    parser.add_option('-I',
                      '--max-fragment-length',
                      dest="max_fragment_length",
                      type="int",
                      default=1000)
    parser.add_option('-n',
                      '--max-samples',
                      dest="max_samples",
                      type="int",
                      default=None)
    parser.add_option('-o', dest="output_file", default=None)
    options, args = parser.parse_args()
    input_bam_file = args[0]
    bamfh = pysam.Samfile(input_bam_file, "rb")
    isizedist = InsertSizeDistribution.from_bam(bamfh,
                                                options.min_fragment_length,
                                                options.max_fragment_length,
                                                options.max_samples)
    bamfh.close()
    if options.output_file is not None:
        f = open(options.output_file, "w")
    else:
        f = sys.stdout
    isizedist.to_file(f)
    if options.output_file is not None:
        f.close()
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" %
                 (isizedist.n, isizedist.mean(), isizedist.std(),
                  isizedist.percentile(50.0), isizedist.mode()))
def main():
    from optparse import OptionParser
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = OptionParser(
        "usage: %prog [options] <in.bam> <gene.bedpe> <genome.bedpe>")
    parser.add_option("--index",
                      dest="index_dir",
                      help="Path to chimerascan index directory")
    parser.add_option('--max-fragment-length',
                      dest="max_fragment_length",
                      type="int",
                      default=1000)
    parser.add_option('--max-indel-size',
                      dest="max_indel_size",
                      type="int",
                      default=100)
    parser.add_option('--library-type', dest="library_type", default="fr")
    parser.add_option('--multihits', type="int", default=1)
    parser.add_option('--padding', type="int", default=0)
    options, args = parser.parse_args()
    input_bam_file = args[0]
    gene_output_file = args[1]
    genome_output_file = args[2]
    gene_feature_file = os.path.join(options.index_dir,
                                     config.GENE_FEATURE_FILE)
    library_type = parse_library_type(options.library_type)
    # open bam file
    bamfh = pysam.Samfile(input_bam_file, "rb")
    find_discordant_reads(bamfh,
                          gene_output_file,
                          genome_output_file,
                          gene_feature_file,
                          max_indel_size=options.max_indel_size,
                          max_isize=options.max_fragment_length,
                          max_multihits=options.multihits,
                          library_type=library_type,
                          padding=options.padding)
    bamfh.close()
Example #22
def fastq_to_bam(fastq_files, qual_format, bam_file):
    fqfhs = [parse_fastq(open(f)) for f in fastq_files]
    qual_func = get_qual_conversion_func(qual_format)
    header = {'HD': {'VN': '1.0', 'SO': 'unknown'}}
    #              'SQ': [{'LN': 1, 'SN': 'dummy'}]}
    bamfh = pysam.Samfile(bam_file, "wb", header=header)
    try:
        while True:
            for i, fqiter in enumerate(fqfhs):
                id, seq, qual = fqiter.next()
                a = pysam.AlignedRead()
                # mark the record and its mate unmapped so the flag field
                # agrees with the rname/mrnm values of -1
                a.is_unmapped = True
                a.mate_is_unmapped = True
                a.rname = -1
                a.mrnm = -1
                #a.pos = 0
                #a.mpos = 0
                a.qname = id
                a.seq = seq
                a.qual = qual_func(qual)
                a.is_read1 = (i == 0)
                a.is_read2 = (i == 1)
                bamfh.write(a)
    except StopIteration:
        pass
    bamfh.close()
Example #23
def run_chimerascan(runconfig):
    # normal run
    config_passed = runconfig.check_config()
    if not config_passed:
        logging.error("Invalid run configuration, aborting.")
        sys.exit(JOB_ERROR)
    # create output dir if it does not exist
    if not os.path.exists(runconfig.output_dir):
        os.makedirs(runconfig.output_dir)
        logging.info("Created output directory: %s" % (runconfig.output_dir))
    # create log dir if it does not exist
    log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
        logging.debug("Created directory for log files: %s" % (log_dir))
    # create tmp dir if it does not exist
    tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
        logging.debug("Created directory for tmp files: %s" % (tmp_dir))
    # write the run config to a file
    xmlstring = runconfig.to_xml()
    runconfig_xml_file = os.path.join(runconfig.output_dir,
                                      config.RUNCONFIG_XML_FILE)
    fh = open(runconfig_xml_file, "w")
    print >> fh, xmlstring
    fh.close()
    # gather and parse run parameters
    library_type = parse_library_type(runconfig.library_type)
    gene_feature_file = os.path.join(runconfig.index_dir,
                                     config.GENE_FEATURE_FILE)
    bowtie_mode = "-v" if runconfig.bowtie_mode_v else "-n"
    bowtie_index = os.path.join(runconfig.index_dir, config.ALIGN_INDEX)
    original_read_length = get_read_length(runconfig.fastq_files[0])
    # minimum fragment length cannot be smaller than the trimmed read length
    trimmed_read_length = original_read_length - runconfig.trim5 - runconfig.trim3
    min_fragment_length = max(runconfig.min_fragment_length,
                              trimmed_read_length)
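    # e.g. 2x100bp reads trimmed by trim5=5 and trim3=5 leave 90bp reads,
    # so no valid fragment can be shorter than 90bp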
    #
    # Initial Bowtie alignment step
    #
    # align in paired-end mode, trying to resolve as many reads as possible
    # this effectively rules out the vast majority of reads as candidate
    # fusions
    unaligned_fastq_param = os.path.join(tmp_dir, config.UNALIGNED_FASTQ_PARAM)
    maxmultimap_fastq_param = os.path.join(tmp_dir,
                                           config.MAXMULTIMAP_FASTQ_PARAM)
    aligned_bam_file = os.path.join(runconfig.output_dir,
                                    config.ALIGNED_READS_BAM_FILE)
    aligned_log_file = os.path.join(log_dir, "bowtie_alignment.log")
    if all(up_to_date(aligned_bam_file, fq) for fq in runconfig.fastq_files):
        logging.info("[SKIPPED] Alignment results exist")
    else:
        logging.info("Aligning full-length reads in paired-end mode")
        retcode = align_pe_full(
            runconfig.fastq_files,
            bowtie_index,
            aligned_bam_file,
            unaligned_fastq_param,
            maxmultimap_fastq_param,
            min_fragment_length=min_fragment_length,
            max_fragment_length=runconfig.max_fragment_length,
            trim5=runconfig.trim5,
            trim3=runconfig.trim3,
            library_type=runconfig.library_type,
            num_processors=runconfig.num_processors,
            fastq_format=runconfig.fastq_format,
            multihits=runconfig.multihits,
            mismatches=runconfig.mismatches,
            bowtie_bin=runconfig.bowtie_bin,
            bowtie_mode=bowtie_mode,
            log_file=aligned_log_file)
        if retcode != 0:
            logging.error("Bowtie failed with error code %d" % (retcode))
            sys.exit(retcode)
    #
    # Get insert size distribution
    #
    isize_dist_file = os.path.join(runconfig.output_dir,
                                   config.ISIZE_DIST_FILE)
    isize_dist = InsertSizeDistribution()
    if up_to_date(isize_dist_file, aligned_bam_file):
        logging.info("[SKIPPED] Profiling insert size distribution")
        isize_dist.from_file(open(isize_dist_file, "r"))
    else:
        logging.info("Profiling insert size distribution")
        max_isize_samples = config.ISIZE_MAX_SAMPLES
        bamfh = pysam.Samfile(aligned_bam_file, "rb")
        isize_dist.from_bam(bamfh,
                            min_isize=min_fragment_length,
                            max_isize=runconfig.max_fragment_length,
                            max_samples=max_isize_samples)
        isize_dist.to_file(open(isize_dist_file, "w"))
        bamfh.close()
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" %
                 (isize_dist.n, isize_dist.mean(), isize_dist.std(),
                  isize_dist.percentile(50.0), isize_dist.mode()))
    #
    # Discordant reads alignment step
    #
    discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE)
    discordant_log_file = os.path.join(log_dir,
                                       "bowtie_segmented_alignment.log")
    unaligned_fastq_files = [
        os.path.join(tmp_dir, fq) for fq in config.UNALIGNED_FASTQ_FILES
    ]
    # get the segments used in discordant alignment to know the effective
    # read length used to align.  we used this to set the 'padding' during
    # spanning read discovery
    segments = determine_read_segments(original_read_length,
                                       segment_length=runconfig.segment_length,
                                       segment_trim=True,
                                       trim5=runconfig.trim5,
                                       trim3=runconfig.trim3)
    segmented_read_length = segments[-1][1]
    logging.debug("Segmented alignment will use effective read length of %d" %
                  (segmented_read_length))
    if all(
            up_to_date(discordant_bam_file, fq)
            for fq in runconfig.fastq_files):
        logging.info("[SKIPPED] Discordant alignment results exist")
    else:
        logging.info("Aligning initially unmapped reads in single read mode")
        align(unaligned_fastq_files,
              runconfig.fastq_format,
              bowtie_index,
              discordant_bam_file,
              bowtie_bin=runconfig.bowtie_bin,
              num_processors=runconfig.num_processors,
              segment_length=runconfig.segment_length,
              segment_trim=True,
              trim5=runconfig.trim5,
              trim3=runconfig.trim3,
              multihits=runconfig.multihits,
              mismatches=runconfig.mismatches,
              bowtie_mode=bowtie_mode,
              best_strata=runconfig.best_strata,
              log_file=discordant_log_file)
    #
    # Merge paired-end reads step
    #
    paired_bam_file = os.path.join(tmp_dir, config.DISCORDANT_PAIRED_BAM_FILE)
    if up_to_date(paired_bam_file, discordant_bam_file):
        logging.info("[SKIPPED] Read pairing results exist")
    else:
        logging.info("Pairing aligned reads")
        bamfh = pysam.Samfile(discordant_bam_file, "rb")
        paired_bamfh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
        merge_read_pairs(bamfh, paired_bamfh, runconfig.min_fragment_length,
                         runconfig.max_fragment_length, library_type)
        paired_bamfh.close()
        bamfh.close()
    #
    # Find discordant reads step
    #
    discordant_gene_bedpe_file = \
        os.path.join(tmp_dir, config.DISCORDANT_GENE_BEDPE_FILE)
    discordant_genome_bedpe_file = \
        os.path.join(tmp_dir, config.DISCORDANT_GENOME_BEDPE_FILE)
    padding = original_read_length - segmented_read_length
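    # the bases of each read not covered by its aligned segments; intervals
    # are widened by this amount during spanning read discovery (see the
    # comment above where the segments are computed)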
    if (up_to_date(discordant_gene_bedpe_file, paired_bam_file)
            and up_to_date(discordant_genome_bedpe_file, paired_bam_file)):
        logging.info("[SKIPPED] Finding discordant reads")
    else:
        logging.info("Finding discordant reads")
        bamfh = pysam.Samfile(paired_bam_file, "rb")
        find_discordant_reads(bamfh,
                              discordant_gene_bedpe_file,
                              discordant_genome_bedpe_file,
                              gene_feature_file,
                              max_indel_size=runconfig.max_indel_size,
                              max_isize=runconfig.max_fragment_length,
                              max_multihits=runconfig.multihits,
                              library_type=library_type,
                              padding=padding)
        bamfh.close()
    #
    # Extract full sequences of the discordant reads
    #
    extended_discordant_gene_bedpe_file = \
        os.path.join(tmp_dir,
                     config.EXTENDED_DISCORDANT_GENE_BEDPE_FILE)
    if up_to_date(extended_discordant_gene_bedpe_file,
                  discordant_gene_bedpe_file):
        logging.info(
            "[SKIPPED] Retrieving full length sequences for realignment")
    else:
        logging.info("Retrieving full length sequences for realignment")
        extend_sequences(unaligned_fastq_files, discordant_gene_bedpe_file,
                         extended_discordant_gene_bedpe_file)
    #
    # Sort discordant reads
    #
    sorted_discordant_gene_bedpe_file = os.path.join(
        tmp_dir, config.SORTED_DISCORDANT_GENE_BEDPE_FILE)
    if (up_to_date(sorted_discordant_gene_bedpe_file,
                   extended_discordant_gene_bedpe_file)):
        logging.info("[SKIPPED] Sorting discordant BEDPE file")
    else:
        logging.info("Sorting discordant BEDPE file")
        sort_discordant_reads(extended_discordant_gene_bedpe_file,
                              sorted_discordant_gene_bedpe_file)
    #
    # Nominate chimeras step
    #
    encompassing_bedpe_file = os.path.join(
        tmp_dir, config.ENCOMPASSING_CHIMERA_BEDPE_FILE)
    if (up_to_date(encompassing_bedpe_file,
                   sorted_discordant_gene_bedpe_file)):
        logging.info("[SKIPPED] Nominating chimeras from discordant reads")
    else:
        logging.info("Nominating chimeras from discordant reads")
        nominate_chimeras(open(sorted_discordant_gene_bedpe_file, "r"),
                          open(encompassing_bedpe_file, "w"),
                          gene_feature_file,
                          trim=config.EXON_JUNCTION_TRIM_BP)
    #
    # Filter encompassing chimeras step
    #
    filtered_encomp_bedpe_file = \
        os.path.join(tmp_dir,
                     config.FILTERED_ENCOMPASSING_CHIMERA_BEDPE_FILE)
    if (up_to_date(filtered_encomp_bedpe_file, encompassing_bedpe_file)):
        logging.info("[SKIPPED] Filtering encompassing chimeras")
    else:
        logging.info("Filtering encompassing chimeras")
        # max_isize = isize_mean + runconfig.filter_isize_stdevs*isize_std
        filter_encompassing_chimeras(
            encompassing_bedpe_file,
            filtered_encomp_bedpe_file,
            gene_feature_file,
            max_multimap=runconfig.filter_max_multimaps,
            multimap_cov_ratio=runconfig.filter_multimap_ratio,
            max_isize=-1,
            strand_pval=runconfig.filter_strand_pval)
    #
    # Nominate spanning reads step
    #
    spanning_fastq_file = os.path.join(runconfig.output_dir,
                                       config.SPANNING_FASTQ_FILE)
    if all(up_to_date(spanning_fastq_file, f) for f in unaligned_fastq_files):
        logging.info("[SKIPPED] Preparing junction spanning reads")
    else:
        logging.info("Preparing junction spanning reads")
        outfh = open(spanning_fastq_file, "w")
        for f in unaligned_fastq_files:
            shutil.copyfileobj(open(f), outfh)
        outfh.close()
    # TODO: skip this step for now, and simply realign all the reads


#    spanning_fastq_file = os.path.join(runconfig.output_dir, config.SPANNING_FASTQ_FILE)
#    if (up_to_date(spanning_fastq_file, extended_discordant_bedpe_file) and
#        up_to_date(spanning_fastq_file, filtered_encomp_bedpe_file)):
#        logging.info("[SKIPPED] Nominating junction spanning reads")
#    else:
#        logging.info("Nominating junction spanning reads")
#        nominate_spanning_reads(open(extended_discordant_bedpe_file, 'r'),
#                                open(filtered_encomp_bedpe_file, 'r'),
#                                open(spanning_fastq_file, 'w'))
    #
    # Extract junction sequences from chimeras file
    #
    ref_fasta_file = os.path.join(runconfig.index_dir,
                                  config.ALIGN_INDEX + ".fa")
    junc_fasta_file = os.path.join(tmp_dir, config.JUNC_REF_FASTA_FILE)
    junc_map_file = os.path.join(tmp_dir, config.JUNC_REF_MAP_FILE)
    spanning_read_length = get_read_length(spanning_fastq_file)
    if (up_to_date(junc_fasta_file, filtered_encomp_bedpe_file)
            and up_to_date(junc_map_file, filtered_encomp_bedpe_file)):
        logging.info("[SKIPPED] Extracting junction read sequences")
    else:
        logging.info("Extracting junction read sequences")
        bedpe_to_junction_fasta(filtered_encomp_bedpe_file,
                                ref_fasta_file, spanning_read_length,
                                open(junc_fasta_file, "w"),
                                open(junc_map_file, "w"))
    #
    # Build a bowtie index to align and detect spanning reads
    #
    bowtie_spanning_index = os.path.join(tmp_dir, config.JUNC_BOWTIE_INDEX)
    bowtie_spanning_index_file = os.path.join(tmp_dir,
                                              config.JUNC_BOWTIE_INDEX_FILE)
    if (up_to_date(bowtie_spanning_index_file, junc_fasta_file)):
        logging.info(
            "[SKIPPED] Building bowtie index for junction-spanning reads")
    else:
        logging.info("Building bowtie index for junction-spanning reads")
        args = [
            runconfig.bowtie_build_bin, junc_fasta_file, bowtie_spanning_index
        ]
        f = open(os.path.join(log_dir, "bowtie_build.log"), "w")
        subprocess.call(args, stdout=f, stderr=f)
        f.close()
    #
    # Align unmapped reads across putative junctions
    #
    junc_bam_file = os.path.join(tmp_dir, config.JUNC_READS_BAM_FILE)
    junc_log_file = os.path.join(log_dir, "bowtie_spanning_alignment.log")
    if (up_to_date(junc_bam_file, bowtie_spanning_index_file)
            and up_to_date(junc_bam_file, spanning_fastq_file)):
        logging.info("[SKIPPED] Aligning junction spanning reads")
    else:
        logging.info("Aligning junction spanning reads")
        retcode = align_sr_full(spanning_fastq_file,
                                bowtie_spanning_index,
                                junc_bam_file,
                                trim5=runconfig.trim5,
                                trim3=runconfig.trim3,
                                num_processors=runconfig.num_processors,
                                fastq_format=runconfig.fastq_format,
                                multihits=runconfig.multihits,
                                mismatches=runconfig.mismatches,
                                bowtie_bin=runconfig.bowtie_bin,
                                bowtie_mode=bowtie_mode,
                                log_file=junc_log_file)
        if retcode != 0:
            logging.error("Bowtie failed with error code %d" % (retcode))
            sys.exit(retcode)
    #
    # Merge spanning and encompassing read information
    #
    raw_chimera_bedpe_file = os.path.join(tmp_dir,
                                          config.RAW_CHIMERA_BEDPE_FILE)
    if (up_to_date(raw_chimera_bedpe_file, junc_bam_file)
            and up_to_date(raw_chimera_bedpe_file, junc_map_file)):
        logging.info(
            "[SKIPPED] Merging spanning and encompassing read alignments")
    else:
        logging.info("Merging spanning and encompassing read alignments")
        merge_spanning_alignments(junc_bam_file,
                                  junc_map_file,
                                  raw_chimera_bedpe_file,
                                  anchor_min=0,
                                  anchor_max=0,
                                  anchor_mismatches=0)
    #
    # Choose best isoform for each junction
    #
    chimera_bedpe_file = os.path.join(tmp_dir, config.CHIMERA_BEDPE_FILE)
    if (up_to_date(chimera_bedpe_file, raw_chimera_bedpe_file)):
        logging.info("[SKIPPED] Filtering chimeras")
    else:
        logging.info("Filtering chimeras")
        # get insert size at prob
        max_isize = isize_dist.percentile(runconfig.filter_isize_percentile)
        filter_spanning_chimeras(raw_chimera_bedpe_file,
                                 chimera_bedpe_file,
                                 gene_feature_file,
                                 mate_pval=runconfig.filter_strand_pval,
                                 max_isize=max_isize)
    #
    # Rank chimeras
    #
    ranked_chimera_bedpe_file = os.path.join(runconfig.output_dir,
                                             config.RANKED_CHIMERA_BEDPE_FILE)
    if (up_to_date(ranked_chimera_bedpe_file, chimera_bedpe_file)):
        logging.info("[SKIPPED] Ranking chimeras")
    else:
        logging.info("Ranking chimeras")
        rank_chimeras(chimera_bedpe_file,
                      ranked_chimera_bedpe_file,
                      empirical_prob=runconfig.empirical_prob)
    #
    # Cleanup
    #
    #shutil.rmtree(tmp_dir)
    #
    # Done
    #
    logging.info("Finished run. Chimeras written to file %s" %
                 (ranked_chimera_bedpe_file))
    return JOB_SUCCESS
def find_discordant_fragments(input_bam_file, gene_paired_bam_file,
                              genome_paired_bam_file, unmapped_bam_file, 
                              complex_bam_file, index_dir, max_isize, 
                              library_type):
    """
    parses BAM file and categorizes reads into several groups:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)
    - discordant genome alignments (unannotated)
    """
    logging.info("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s" % (input_bam_file))
    logging.debug("\tMax insert size: '%d'" % (max_isize))
    logging.debug("\tLibrary type: '%s'" % (library_type))
    logging.debug("\tGene paired file: %s" % (gene_paired_bam_file))
    logging.debug("\tGenome paired file: %s" % (genome_paired_bam_file))
    logging.debug("\tUnmapped file: %s" % (unmapped_bam_file))
    logging.debug("\tComplex file: %s" % (complex_bam_file))
    # setup input and output files
    bamfh = pysam.Samfile(input_bam_file, "rb")
    genefh = pysam.Samfile(gene_paired_bam_file, "wb", template=bamfh)
    genomefh = pysam.Samfile(genome_paired_bam_file, "wb", template=bamfh)
    unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
    complexfh = pysam.Samfile(complex_bam_file, "wb", template=bamfh)
    gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
    # build a lookup table to get all the overlapping transcripts given a
    # transcript 'tid'
    tid_tx_cluster_map = build_tid_tx_cluster_map(bamfh, 
                                                  open(gene_file), 
                                                  rname_prefix=config.GENE_REF_PREFIX)
    # build a lookup table to get genome coordinates from transcript 
    # coordinates
    tid_genome_map = build_tid_to_genome_map(bamfh, 
                                             open(gene_file), 
                                             rname_prefix=config.GENE_REF_PREFIX)
    for pe_reads in parse_pe_reads(bamfh):
        # add hit index and number of multimaps information to read tags
        # this function also checks for unmapped reads
        any_unmapped = False
        for reads in pe_reads:
            any_unmapped = (any_unmapped or 
                            annotate_multihits(bamfh, reads, tid_genome_map))
        if any_unmapped:
            # write to output as discordant reads and continue to 
            # next fragment
            write_pe_reads(unmappedfh, pe_reads)
            continue
        # examine all read pairing combinations and rule out invalid 
        # pairings.  this returns gene pairs and genome pairs
        gene_pairs, genome_pairs, unpaired_reads = \
            classify_read_pairs(pe_reads, max_isize,
                                library_type, tid_genome_map,
                                tid_tx_cluster_map)
        if len(gene_pairs) > 0 or len(genome_pairs) > 0:
            write_pairs(genefh, gene_pairs)
            write_pairs(genomefh, genome_pairs)
        else:
            write_pe_reads(complexfh, unpaired_reads)
    genefh.close()
    genomefh.close()
    unmappedfh.close()
    complexfh.close()
    bamfh.close()  
    logging.info("Finished pairing reads")
def discordant_reads_to_breakpoints(index_dir, isize_dist_file, input_bam_file,
                                    output_file, trim_bp, max_read_length,
                                    homology_mismatches):
    """
    homology_mismatches: number of mismatches to tolerate while computing
    homology between chimeric breakpoint sequence and "wildtype" sequence
    
    trim_bp: when selecting the best matching exon for each read, we
    account for spurious overlap into adjacent exons by trimming the
    read by 'trim_bp'
    """
    # read insert size distribution
    isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file))
    # open BAM alignment file
    bamfh = pysam.Samfile(input_bam_file, "rb")
    # build a lookup table to get genomic intervals from transcripts
    logging.debug("Reading gene information")
    gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
    tid_tx_map = build_tid_tx_map(bamfh,
                                  gene_file,
                                  rname_prefix=config.GENE_REF_PREFIX)
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.ALIGN_INDEX + ".fa")
    ref_fa = pysam.Fastafile(ref_fasta_file)
    # iterate through read pairs
    outfh = open(output_file, "w")
    logging.debug("Parsing discordant reads")
    for r5p, r3p in parse_gene_discordant_reads(bamfh):
        # store pertinent read information in lightweight structure called
        # DiscordantRead object. this departs from SAM format into a
        # custom read format
        dr5p = DiscordantRead.from_read(r5p)
        dr3p = DiscordantRead.from_read(r3p)
        # get gene information
        tx5p = tid_tx_map[r5p.rname]
        tx3p = tid_tx_map[r3p.rname]
        # given the insert size find the highest probability
        # exon junction breakpoint between the two transcripts
        isize_prob, breakpoints = \
            choose_best_breakpoints(r5p, r3p, tx5p, tx3p,
                                    trim_bp, isize_dist)
        # extract the sequence of the breakpoint along with the
        # number of homologous bases at the breakpoint between
        # chimera and wildtype genes
        for breakpoint in breakpoints:
            exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = breakpoint
            breakpoint_seq_5p, breakpoint_seq_3p, homology_left, homology_right = \
                extract_breakpoint_sequence(config.GENE_REF_PREFIX + tx5p.tx_name, tx_end_5p,
                                            config.GENE_REF_PREFIX + tx3p.tx_name, tx_start_3p,
                                            ref_fa, max_read_length,
                                            homology_mismatches)
            # write breakpoint information for each read to a file
            fields = [
                tx5p.tx_name,
                0,
                tx_end_5p,
                tx3p.tx_name,
                tx_start_3p,
                tx3p.tx_end,
                r5p.rname,  # name
                isize_prob,  # score
                tx5p.strand,
                tx3p.strand,  # strand 1, strand 2
                # user defined fields
                exon_num_5p,
                exon_num_3p,
                breakpoint_seq_5p,
                breakpoint_seq_3p,
                homology_left,
                homology_right
            ]
            fields.append('|'.join(map(str, dr5p.to_list())))
            fields.append('|'.join(map(str, dr3p.to_list())))
            print >> outfh, '\t'.join(map(str, fields))
    # cleanup
    ref_fa.close()
    outfh.close()
    bamfh.close()
    return config.JOB_SUCCESS
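The rows written above follow a BEDPE-like layout extended with breakpoint-specific columns. A minimal reader sketch, with column names inferred from the writer (not part of the original):

BREAKPOINT_COLUMNS = ["tx_name_5p", "tx_start_5p", "tx_end_5p",
                      "tx_name_3p", "tx_start_3p", "tx_end_3p",
                      "name", "isize_prob", "strand_5p", "strand_3p",
                      "exon_num_5p", "exon_num_3p",
                      "breakpoint_seq_5p", "breakpoint_seq_3p",
                      "homology_left", "homology_right",
                      "read_5p", "read_3p"]

def parse_breakpoint_line(line):
    # pair each tab-separated field with its inferred column name
    return dict(zip(BREAKPOINT_COLUMNS, line.rstrip('\n').split('\t')))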