コード例 #1
0
def discordant_reads_to_bedpe(index_dir, input_bam_file, output_file):
    """Write gene-discordant read pairs from a BAM file as BEDPE-like text.

    Every (r5p, r3p) pair yielded by ``parse_gene_discordant_reads`` is
    written to ``output_file`` as one tab-delimited line containing, in
    order: transcript id, ``pos`` and ``aend`` of the first read; the same
    three values for the second read; the first read's name; a constant
    score of 0; both transcript strands; and the '|'-joined
    ``DiscordantRead.to_list()`` values of each read.

    Args:
        index_dir: directory holding ``config.TRANSCRIPT_FEATURE_FILE``.
        input_bam_file: path of the BAM file to read.
        output_file: path of the text file to create (opened with "w").
    """
    # open BAM alignment file
    bamfh = pysam.Samfile(input_bam_file, "rb")
    try:
        # build a lookup table from BAM reference id to transcript
        logging.debug("Reading transcript features")
        transcript_file = os.path.join(index_dir,
                                       config.TRANSCRIPT_FEATURE_FILE)
        # consume the parser inside the 'with' so the feature file handle
        # is released as soon as parsing is finished
        with open(transcript_file) as txfh:
            transcripts = list(TranscriptFeature.parse(txfh))
        tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
        with open(output_file, "w") as outfh:
            logging.debug("Converting BAM to BEDPE format")
            for r5p, r3p in parse_gene_discordant_reads(bamfh):
                # store pertinent read information in lightweight
                # structure called DiscordantRead object. this departs
                # from SAM format into a custom read format
                dr5p = DiscordantRead.from_read(r5p)
                dr3p = DiscordantRead.from_read(r3p)
                # get gene information
                tx5p = tid_tx_map[r5p.rname]
                tx3p = tid_tx_map[r3p.rname]
                # write bedpe format
                fields = [tx5p.tx_id, r5p.pos, r5p.aend,
                          tx3p.tx_id, r3p.pos, r3p.aend,
                          r5p.qname,  # read name
                          0,  # score
                          tx5p.strand, tx3p.strand,  # strand 1, strand 2
                          ]
                fields.append('|'.join(map(str, dr5p.to_list())))
                fields.append('|'.join(map(str, dr3p.to_list())))
                # explicit write() instead of the "print >>" statement so
                # the line is emitted identically on Python 2 and 3
                outfh.write('\t'.join(map(str, fields)) + '\n')
    finally:
        # always release the BAM handle, even if conversion fails midway
        bamfh.close()
コード例 #2
0
def find_discordant_fragments(input_bam_file, paired_bam_file,
                              unmapped_bam_file, index_dir, max_isize,
                              library_type):
    """
    Parse a paired-end BAM file and route its fragments to output files.

    Original description of the read categories handled here:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)

    Fragments for which ``annotate_multihits`` reports an unmapped read
    are written to ``unmapped_bam_file``. For all other fragments, the
    gene pairs returned by ``classify_read_pairs`` (if any) are written to
    ``paired_bam_file``. The unpaired reads returned by
    ``classify_read_pairs`` are currently not written anywhere.

    Args:
        input_bam_file: path of the BAM file to read.
        paired_bam_file: path of the BAM file receiving gene pairs.
        unmapped_bam_file: path of the BAM file receiving fragments that
            contain unmapped reads.
        index_dir: directory holding ``config.TRANSCRIPT_FEATURE_FILE``.
        max_isize: maximum insert size, forwarded to
            ``classify_read_pairs`` (logged with ``%d``).
        library_type: library type, forwarded to ``classify_read_pairs``.

    Returns:
        ``config.JOB_SUCCESS``.
    """
    logging.info("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s", input_bam_file)
    logging.debug("\tMax insert size: '%d'", max_isize)
    logging.debug("\tLibrary type: '%s'", library_type)
    logging.debug("\tGene paired file: %s", paired_bam_file)
    logging.debug("\tUnmapped file: %s", unmapped_bam_file)
    # setup input and output files; outputs start as None so the cleanup
    # below can tell which handles were actually opened
    bamfh = pysam.Samfile(input_bam_file, "rb")
    genefh = None
    unmappedfh = None
    try:
        genefh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
        unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
        # read transcript features
        logging.debug("Reading transcript features")
        transcript_file = os.path.join(index_dir,
                                       config.TRANSCRIPT_FEATURE_FILE)
        # consume the parser inside the 'with' so the feature file handle
        # is released as soon as parsing is finished
        with open(transcript_file) as txfh:
            transcripts = list(TranscriptFeature.parse(txfh))
        logging.debug("Building transcript lookup tables")
        # build a lookup table from bam tid index to transcript object
        tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
        # build a transcript to genome coordinate map
        tid_tx_genome_map = build_tid_transcript_genome_map(bamfh,
                                                            transcripts)
        logging.info("Parsing reads")
        for pe_reads in parse_pe_reads(bamfh):
            # add hit index and multimap information to read tags
            # this function also checks for unmapped reads
            # NOTE(review): because of the short-circuit 'or',
            # annotate_multihits is not called for the remaining reads of
            # a fragment once one read is reported unmapped -- confirm
            # this is intended before changing it.
            any_unmapped = False
            for reads in pe_reads:
                any_unmapped = (any_unmapped or annotate_multihits(
                    bamfh, reads, tid_tx_genome_map))
            if any_unmapped:
                # write to output as discordant reads and continue to
                # next fragment
                write_pe_reads(unmappedfh, pe_reads)
                continue
            # examine all read pairing combinations and rule out invalid
            # pairings
            gene_pairs, unpaired_reads = classify_read_pairs(
                pe_reads, max_isize, library_type, tid_tx_map)
            if len(gene_pairs) > 0:
                write_pairs(genefh, gene_pairs)
            # TODO: do something with unpaired discordant reads?
    finally:
        # close whatever was opened, outputs first and then the input,
        # even when parsing fails partway through
        for fh in (genefh, unmappedfh, bamfh):
            if fh is not None:
                fh.close()
    logging.info("Finished pairing reads")
    return config.JOB_SUCCESS
コード例 #3
0
def find_discordant_fragments(input_bam_file, paired_bam_file, unmapped_bam_file, index_dir, max_isize, library_type):
    """
    Parse a paired-end BAM file and route its fragments to output files.

    Original description of the read categories handled here:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)

    Fragments for which ``annotate_multihits`` reports an unmapped read
    are written to ``unmapped_bam_file``. For all other fragments, the
    gene pairs returned by ``classify_read_pairs`` (if any) are written to
    ``paired_bam_file``. The unpaired reads returned by
    ``classify_read_pairs`` are currently not written anywhere.

    Args:
        input_bam_file: path of the BAM file to read.
        paired_bam_file: path of the BAM file receiving gene pairs.
        unmapped_bam_file: path of the BAM file receiving fragments that
            contain unmapped reads.
        index_dir: directory holding ``config.TRANSCRIPT_FEATURE_FILE``.
        max_isize: maximum insert size, forwarded to
            ``classify_read_pairs`` (logged with ``%d``).
        library_type: library type, forwarded to ``classify_read_pairs``.

    Returns:
        ``config.JOB_SUCCESS``.
    """
    logging.info("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s", input_bam_file)
    logging.debug("\tMax insert size: '%d'", max_isize)
    logging.debug("\tLibrary type: '%s'", library_type)
    logging.debug("\tGene paired file: %s", paired_bam_file)
    logging.debug("\tUnmapped file: %s", unmapped_bam_file)
    # setup input and output files; outputs start as None so the cleanup
    # below can tell which handles were actually opened
    bamfh = pysam.Samfile(input_bam_file, "rb")
    genefh = None
    unmappedfh = None
    try:
        genefh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
        unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
        # read transcript features
        logging.debug("Reading transcript features")
        transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
        # consume the parser inside the 'with' so the feature file handle
        # is released as soon as parsing is finished
        with open(transcript_file) as txfh:
            transcripts = list(TranscriptFeature.parse(txfh))
        logging.debug("Building transcript lookup tables")
        # build a lookup table from bam tid index to transcript object
        tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
        # build a transcript to genome coordinate map
        tid_tx_genome_map = build_tid_transcript_genome_map(bamfh, transcripts)
        logging.info("Parsing reads")
        for pe_reads in parse_pe_reads(bamfh):
            # add hit index and multimap information to read tags
            # this function also checks for unmapped reads
            # NOTE(review): because of the short-circuit 'or',
            # annotate_multihits is not called for the remaining reads of
            # a fragment once one read is reported unmapped -- confirm
            # this is intended before changing it.
            any_unmapped = False
            for reads in pe_reads:
                any_unmapped = any_unmapped or annotate_multihits(bamfh, reads, tid_tx_genome_map)
            if any_unmapped:
                # write to output as discordant reads and continue to
                # next fragment
                write_pe_reads(unmappedfh, pe_reads)
                continue
            # examine all read pairing combinations and rule out invalid pairings
            gene_pairs, unpaired_reads = classify_read_pairs(pe_reads, max_isize, library_type, tid_tx_map)
            if len(gene_pairs) > 0:
                write_pairs(genefh, gene_pairs)
            # TODO: do something with unpaired discordant reads?
    finally:
        # close whatever was opened, outputs first and then the input,
        # even when parsing fails partway through
        for fh in (genefh, unmappedfh, bamfh):
            if fh is not None:
                fh.close()
    logging.info("Finished pairing reads")
    return config.JOB_SUCCESS