Beispiel #1
0
def parse_gene_chimeric_reads(bamfh):
    """
    Group gene-gene discordant read pairs by transcript pair.

    Read pairs come from parse_pairs(bamfh); pairs where either mate is
    not tagged DiscordantTags.DISCORDANT_GENE are skipped. Every pair is
    collected before anything is yielded.

    Yields (tid5p, tid3p, pairs) where 'pairs' is the list of
    (5' DiscordantRead, 3' DiscordantRead) tuples sharing those tids.
    """
    # maps (5' tid, 3' tid) -> list of (5' read, 3' read) tuples
    pairs_by_tids = collections.defaultdict(list)
    for mate1, mate2 in parse_pairs(bamfh):
        # TODO: for now we are only going to deal with gene-gene
        # chimeras and leave other chimeras for study at a later time
        discordant_tags = (mate1.opt(DISCORDANT_TAG_NAME),
                           mate2.opt(DISCORDANT_TAG_NAME))
        if any(tag != DiscordantTags.DISCORDANT_GENE
               for tag in discordant_tags):
            continue
        # the two mates are expected to carry different orientation tags
        orient1 = mate1.opt(ORIENTATION_TAG_NAME)
        orient2 = mate2.opt(ORIENTATION_TAG_NAME)
        assert orient1 != orient2
        # arrange the mates in 5' to 3' order
        if orient1 == OrientationTags.FIVEPRIME:
            first, second = mate1, mate2
        else:
            first, second = mate2, mate1
        # keep only the lightweight DiscordantRead representation
        read5p = DiscordantRead.from_read(first)
        read3p = DiscordantRead.from_read(second)
        pairs_by_tids[(read5p.tid, read3p.tid)].append((read5p, read3p))
    for (tid5p, tid3p), pairs in pairs_by_tids.iteritems():
        yield tid5p, tid3p, pairs
def parse_gene_chimeric_reads(bamfh):
    """
    Collect gene-gene chimeric read pairs and group them by the
    (5' tid, 3' tid) of their DiscordantRead objects.

    Pairs produced by parse_pairs(bamfh) are kept only when both mates
    carry the DiscordantTags.DISCORDANT_GENE tag. Nothing is yielded
    until the whole input has been consumed.

    Yields (tid5p, tid3p, pairs); 'pairs' is a list of
    (5' DiscordantRead, 3' DiscordantRead) tuples.
    """
    grouped = collections.defaultdict(list)
    for read_a, read_b in parse_pairs(bamfh):
        # TODO: only gene-gene chimeras are handled for now; other
        # chimera types are left for study at a later time
        tag_a = read_a.opt(DISCORDANT_TAG_NAME)
        tag_b = read_b.opt(DISCORDANT_TAG_NAME)
        both_gene = not (tag_a != DiscordantTags.DISCORDANT_GENE or
                         tag_b != DiscordantTags.DISCORDANT_GENE)
        if not both_gene:
            continue
        # the orientation tags decide which mate is the 5' partner
        orient_a = read_a.opt(ORIENTATION_TAG_NAME)
        orient_b = read_b.opt(ORIENTATION_TAG_NAME)
        assert orient_a != orient_b
        a_is_5p = (orient_a == OrientationTags.FIVEPRIME)
        ordered = (read_a, read_b) if a_is_5p else (read_b, read_a)
        # convert to lightweight DiscordantRead objects
        dr5p = DiscordantRead.from_read(ordered[0])
        dr3p = DiscordantRead.from_read(ordered[1])
        # accumulate under the transcript-pair key
        grouped[(dr5p.tid, dr3p.tid)].append((dr5p, dr3p))
    for tid_pair, pair_list in grouped.iteritems():
        yield tid_pair[0], tid_pair[1], pair_list
def discordant_reads_to_bedpe(index_dir, input_bam_file, output_file):
    """
    Convert gene-discordant read pairs from a BAM file into a
    tab-delimited BEDPE-style text file, one line per read pair.

    index_dir: directory containing the gene feature file
    (config.GENE_FEATURE_FILE)

    input_bam_file: path of the BAM alignment file to read

    output_file: path of the text file to write

    Each line holds the 5' transcript name, start and end, the 3'
    transcript name, start and end, the read name, a score of 0, both
    strands, and the two reads serialized as '|'-joined DiscordantRead
    fields.

    Both the BAM file and the output file are closed even when an error
    interrupts the conversion.
    """
    # open BAM alignment file
    bamfh = pysam.Samfile(input_bam_file, "rb")
    try:
        # build a lookup table to get genomic intervals from transcripts
        logging.debug("Reading gene information")
        gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
        tid_gene_map = build_tid_gene_map(bamfh, gene_file,
                                          rname_prefix=config.GENE_REF_PREFIX)
        outfh = open(output_file, "w")
        try:
            logging.debug("Converting BAM to BEDPE format")
            for r5p, r3p in parse_gene_discordant_reads(bamfh):
                # store pertinent read information in lightweight structure
                # called DiscordantRead object. this departs from SAM format
                # into a custom read format
                dr5p = DiscordantRead.from_read(r5p)
                dr3p = DiscordantRead.from_read(r3p)
                # get gene information
                tx5p = tid_gene_map[r5p.rname]
                tx3p = tid_gene_map[r3p.rname]
                # write bedpe format
                fields = [tx5p.tx_name, r5p.pos, r5p.aend,
                          tx3p.tx_name, r3p.pos, r3p.aend,
                          r5p.qname,  # read name
                          0,  # score
                          tx5p.strand, tx3p.strand,  # strand 1, strand 2
                          ]
                fields.append('|'.join(map(str, dr5p.to_list())))
                fields.append('|'.join(map(str, dr3p.to_list())))
                outfh.write('\t'.join(map(str, fields)) + '\n')
        finally:
            outfh.close()
    finally:
        # the original never closed the BAM handle
        bamfh.close()
def discordant_reads_to_bedpe(index_dir, input_bam_file, output_file):
    """
    Convert gene-discordant read pairs from a BAM file into a
    tab-delimited BEDPE-style text file, one line per read pair.

    index_dir: directory containing the transcript feature file
    (config.TRANSCRIPT_FEATURE_FILE)

    input_bam_file: path of the BAM alignment file to read

    output_file: path of the text file to write

    Each line holds the 5' transcript id, start and end, the 3'
    transcript id, start and end, the read name, a score of 0, both
    strands, and the two reads serialized as '|'-joined DiscordantRead
    fields.

    The transcript feature file, the BAM file and the output file are
    all closed even when an error interrupts the conversion.
    """
    # open BAM alignment file
    bamfh = pysam.Samfile(input_bam_file, "rb")
    try:
        # build a lookup table to get genomic intervals from transcripts
        logging.debug("Reading transcript features")
        transcript_file = os.path.join(index_dir,
                                       config.TRANSCRIPT_FEATURE_FILE)
        # list() consumes the parser completely, so the handle can be
        # closed as soon as the list has been built
        txfh = open(transcript_file)
        try:
            transcripts = list(TranscriptFeature.parse(txfh))
        finally:
            txfh.close()
        tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
        outfh = open(output_file, "w")
        try:
            logging.debug("Converting BAM to BEDPE format")
            for r5p, r3p in parse_gene_discordant_reads(bamfh):
                # store pertinent read information in lightweight structure
                # called DiscordantRead object. this departs from SAM format
                # into a custom read format
                dr5p = DiscordantRead.from_read(r5p)
                dr3p = DiscordantRead.from_read(r3p)
                # get gene information
                tx5p = tid_tx_map[r5p.rname]
                tx3p = tid_tx_map[r3p.rname]
                # write bedpe format
                fields = [tx5p.tx_id, r5p.pos, r5p.aend,
                          tx3p.tx_id, r3p.pos, r3p.aend,
                          r5p.qname,  # read name
                          0,  # score
                          tx5p.strand, tx3p.strand,  # strand 1, strand 2
                          ]
                fields.append('|'.join(map(str, dr5p.to_list())))
                fields.append('|'.join(map(str, dr3p.to_list())))
                outfh.write('\t'.join(map(str, fields)) + '\n')
        finally:
            outfh.close()
    finally:
        bamfh.close()
def filter_spanning_reads(chimeras, reads,
                          anchor_min,
                          anchor_length,
                          anchor_mismatches,
                          library_type):
    """
    Yield (chimera, DiscordantRead) for every mapped read that passes
    check_breakpoint_alignment() against a chimera.

    Unmapped reads are skipped. Each mapped read has HI/IH/NH, discordant
    and orientation tags appended to its 'tags' before it is converted to
    a DiscordantRead flagged with is_spanning = True. The same
    DiscordantRead object is yielded for every chimera the read matches.

    anchor_min, anchor_length, anchor_mismatches: passed through to
    check_breakpoint_alignment()

    library_type: passed to get_gene_orientation() to compute the
    orientation tag
    """
    for read in reads:
        if read.is_unmapped:
            continue
        # TODO: need to annotate reads elsewhere since they have already been sorted here
        existing_tags = read.tags
        annotation = [
            ("HI", 0),
            ("IH", 1),
            ("NH", 1),
            (DISCORDANT_TAG_NAME, DiscordantTags.DISCORDANT_GENE),
            (ORIENTATION_TAG_NAME, get_gene_orientation(read, library_type)),
        ]
        read.tags = existing_tags + annotation
        # make a discordant read object
        spanning = DiscordantRead.from_read(read)
        spanning.is_spanning = True
        # check read alignment against chimeras
        for chimera in chimeras:
            is_valid = check_breakpoint_alignment(chimera, read,
                                                  anchor_min,
                                                  anchor_length,
                                                  anchor_mismatches)
            if not is_valid:
                continue
            # valid spanning read
            yield chimera, spanning
def parse_discordant_bedpe_by_transcript_pair(fh):
    """
    Group consecutive lines of a discordant BEDPE file by transcript pair.

    fh: iterable of tab-delimited lines; column 0 is the 5' transcript,
    column 3 the 3' transcript, and columns 10 and 11 hold the
    '|'-joined DiscordantRead fields of the 5' and 3' reads

    Yields (tx5p, tx3p, frags) for each run of adjacent lines that share
    the same transcript pair, where 'frags' is a list of
    (5' DiscordantRead, 3' DiscordantRead) tuples. Only adjacent lines
    are merged, so a pair that reappears later starts a new group.
    """
    group_key = (None, None)
    group = []
    for line in fh:
        cols = line.strip().split('\t')
        line_key = (cols[0], cols[3])
        frag = (DiscordantRead.from_list(cols[10].split("|")),
                DiscordantRead.from_list(cols[11].split("|")))
        if line_key != group_key:
            # transcript pair changed: flush the finished group
            if group:
                yield group_key[0], group_key[1], group
                group = []
            group_key = line_key
        group.append(frag)
    # flush the trailing group
    if group:
        yield group_key[0], group_key[1], group
Beispiel #7
0
def parse_discordant_bedpe_by_transcript_pair(fh):
    """
    Yield runs of adjacent discordant BEDPE lines that share the same
    (5' transcript, 3' transcript) pair.

    fh: iterable of tab-delimited lines. Field 0 names the 5' transcript
    and field 3 the 3' transcript; fields 10 and 11 are '|'-joined
    DiscordantRead fields for the 5' and 3' reads.

    Yields (tx5p, tx3p, frags), 'frags' being the list of
    (5' DiscordantRead, 3' DiscordantRead) tuples in the run. Grouping is
    by adjacency only; nothing is yielded for empty input.
    """
    current_pair = (None, None)
    pending = []
    for record in fh:
        columns = record.strip().split('\t')
        name5p = columns[0]
        name3p = columns[3]
        read5p = DiscordantRead.from_list(columns[10].split("|"))
        read3p = DiscordantRead.from_list(columns[11].split("|"))
        same_pair = not ((name5p, name3p) != current_pair)
        if not same_pair:
            # a new transcript pair begins; emit what was gathered so far
            if pending:
                yield current_pair[0], current_pair[1], pending
                pending = []
            current_pair = (name5p, name3p)
        pending.append((read5p, read3p))
    # emit the final run, if any
    if pending:
        yield current_pair[0], current_pair[1], pending
def filter_spanning_reads(chimeras, reads, anchor_min, anchor_length,
                          anchor_mismatches, library_type):
    """
    Yield (chimera, DiscordantRead) for each mapped read whose alignment
    passes check_breakpoint_alignment() for a chimera.

    Unmapped reads are ignored. Every mapped read gets HI/IH/NH,
    discordant and orientation tags appended to its 'tags', and is then
    converted to a DiscordantRead marked is_spanning = True. One
    DiscordantRead object per read is shared across all of its matches.

    anchor_min, anchor_length, anchor_mismatches: forwarded to
    check_breakpoint_alignment()

    library_type: forwarded to get_orientation() for the orientation tag
    """
    for aligned in reads:
        if aligned.is_unmapped:
            continue
        # TODO: need to annotate reads elsewhere since they have already been sorted here
        tags_before = aligned.tags
        added_tags = [
            ("HI", 0),
            ("IH", 1),
            ("NH", 1),
            (DISCORDANT_TAG_NAME, DiscordantTags.DISCORDANT_GENE),
            (ORIENTATION_TAG_NAME, get_orientation(aligned, library_type)),
        ]
        aligned.tags = tags_before + added_tags
        # make a discordant read object
        spanning_read = DiscordantRead.from_read(aligned)
        spanning_read.is_spanning = True
        # check read alignment against chimeras
        for candidate in chimeras:
            if not check_breakpoint_alignment(candidate, aligned,
                                              anchor_min, anchor_length,
                                              anchor_mismatches):
                continue
            # valid spanning read
            yield candidate, spanning_read
def discordant_reads_to_breakpoints(index_dir, isize_dist_file, input_bam_file,
                                    output_file, trim_bp, max_read_length,
                                    homology_mismatches):
    """
    Write the candidate breakpoints of every gene-discordant read pair in
    a BAM file to a tab-delimited text file, one line per breakpoint.

    index_dir: directory holding the gene feature file
    (config.GENE_FEATURE_FILE) and the reference fasta file
    (config.ALIGN_INDEX + ".fa")

    isize_dist_file: file from which the insert size distribution is read

    input_bam_file: path of the BAM alignment file to read

    output_file: path of the text file to write

    homology_mismatches: number of mismatches to tolerate while computing
    homology between chimeric breakpoint sequence and "wildtype" sequence

    trim_bp: when selecting the best matching exon for each read, we
    account for spurious overlap into adjacent exons by trimming the
    read by 'trim_bp'

    max_read_length: passed through to extract_breakpoint_sequence()

    Returns config.JOB_SUCCESS. Every file opened here is closed even
    when an error interrupts processing.
    """
    # read insert size distribution and release the handle right away
    # (assumes from_file() reads everything it needs before returning)
    isize_fh = open(isize_dist_file)
    try:
        isize_dist = InsertSizeDistribution.from_file(isize_fh)
    finally:
        isize_fh.close()
    # open BAM alignment file
    bamfh = pysam.Samfile(input_bam_file, "rb")
    ref_fa = None
    outfh = None
    try:
        # build a lookup table to get genomic intervals from transcripts
        logging.debug("Reading gene information")
        gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
        tid_tx_map = build_tid_tx_map(bamfh,
                                      gene_file,
                                      rname_prefix=config.GENE_REF_PREFIX)
        # open the reference sequence fasta file
        ref_fasta_file = os.path.join(index_dir, config.ALIGN_INDEX + ".fa")
        ref_fa = pysam.Fastafile(ref_fasta_file)
        # iterate through read pairs
        outfh = open(output_file, "w")
        logging.debug("Parsing discordant reads")
        for r5p, r3p in parse_gene_discordant_reads(bamfh):
            # store pertinent read information in lightweight structure
            # called DiscordantRead object. this departs from SAM format
            # into a custom read format
            dr5p = DiscordantRead.from_read(r5p)
            dr3p = DiscordantRead.from_read(r3p)
            # get gene information
            tx5p = tid_tx_map[r5p.rname]
            tx3p = tid_tx_map[r3p.rname]
            # given the insert size find the highest probability
            # exon junction breakpoint between the two transcripts
            isize_prob, breakpoints = \
                choose_best_breakpoints(r5p, r3p, tx5p, tx3p,
                                        trim_bp, isize_dist)
            # extract the sequence of the breakpoint along with the
            # number of homologous bases at the breakpoint between
            # chimera and wildtype genes
            for breakpoint in breakpoints:
                exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = breakpoint
                breakpoint_seq_5p, breakpoint_seq_3p, homology_left, homology_right = \
                    extract_breakpoint_sequence(config.GENE_REF_PREFIX + tx5p.tx_name, tx_end_5p,
                                                config.GENE_REF_PREFIX + tx3p.tx_name, tx_start_3p,
                                                ref_fa, max_read_length,
                                                homology_mismatches)
                # write breakpoint information for each read to a file
                fields = [
                    tx5p.tx_name,
                    0,
                    tx_end_5p,
                    tx3p.tx_name,
                    tx_start_3p,
                    tx3p.tx_end,
                    # NOTE(review): rname is the value used to look up the
                    # transcript above, not the read's qname -- confirm it
                    # is the intended "name" column
                    r5p.rname,  # name
                    isize_prob,  # score
                    tx5p.strand,
                    tx3p.strand,  # strand 1, strand 2
                    # user defined fields
                    exon_num_5p,
                    exon_num_3p,
                    breakpoint_seq_5p,
                    breakpoint_seq_3p,
                    homology_left,
                    homology_right
                ]
                fields.append('|'.join(map(str, dr5p.to_list())))
                fields.append('|'.join(map(str, dr3p.to_list())))
                outfh.write('\t'.join(map(str, fields)) + '\n')
    finally:
        # cleanup: only close what was actually opened
        if ref_fa is not None:
            ref_fa.close()
        if outfh is not None:
            outfh.close()
        bamfh.close()
    return config.JOB_SUCCESS
def discordant_reads_to_breakpoints(index_dir, isize_dist_file,
                                    input_bam_file, output_file,
                                    trim_bp, max_read_length,
                                    homology_mismatches):
    """
    Write the candidate breakpoints of every gene-discordant read pair in
    a BAM file to a tab-delimited text file, one line per breakpoint.

    index_dir: directory holding the gene feature file
    (config.GENE_FEATURE_FILE) and the reference fasta file
    (config.ALIGN_INDEX + ".fa")

    isize_dist_file: file from which the insert size distribution is read

    input_bam_file: path of the BAM alignment file to read

    output_file: path of the text file to write

    homology_mismatches: number of mismatches to tolerate while computing
    homology between chimeric breakpoint sequence and "wildtype" sequence

    trim_bp: when selecting the best matching exon for each read, we
    account for spurious overlap into adjacent exons by trimming the
    read by 'trim_bp'

    max_read_length: passed through to extract_breakpoint_sequence()

    Returns config.JOB_SUCCESS. Every file opened here is closed even
    when an error interrupts processing.
    """
    # read insert size distribution and release the handle right away
    # (assumes from_file() reads everything it needs before returning)
    isize_fh = open(isize_dist_file)
    try:
        isize_dist = InsertSizeDistribution.from_file(isize_fh)
    finally:
        isize_fh.close()
    # open BAM alignment file
    bamfh = pysam.Samfile(input_bam_file, "rb")
    ref_fa = None
    outfh = None
    try:
        # build a lookup table to get genomic intervals from transcripts
        logging.debug("Reading gene information")
        gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
        tid_tx_map = build_tid_tx_map(bamfh, gene_file,
                                      rname_prefix=config.GENE_REF_PREFIX)
        # open the reference sequence fasta file
        ref_fasta_file = os.path.join(index_dir, config.ALIGN_INDEX + ".fa")
        ref_fa = pysam.Fastafile(ref_fasta_file)
        # iterate through read pairs
        outfh = open(output_file, "w")
        logging.debug("Parsing discordant reads")
        for r5p, r3p in parse_gene_discordant_reads(bamfh):
            # store pertinent read information in lightweight structure
            # called DiscordantRead object. this departs from SAM format
            # into a custom read format
            dr5p = DiscordantRead.from_read(r5p)
            dr3p = DiscordantRead.from_read(r3p)
            # get gene information
            tx5p = tid_tx_map[r5p.rname]
            tx3p = tid_tx_map[r3p.rname]
            # given the insert size find the highest probability
            # exon junction breakpoint between the two transcripts
            isize_prob, breakpoints = \
                choose_best_breakpoints(r5p, r3p, tx5p, tx3p,
                                        trim_bp, isize_dist)
            # extract the sequence of the breakpoint along with the
            # number of homologous bases at the breakpoint between
            # chimera and wildtype genes
            for breakpoint in breakpoints:
                exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = breakpoint
                breakpoint_seq_5p, breakpoint_seq_3p, homology_left, homology_right = \
                    extract_breakpoint_sequence(config.GENE_REF_PREFIX + tx5p.tx_name, tx_end_5p,
                                                config.GENE_REF_PREFIX + tx3p.tx_name, tx_start_3p,
                                                ref_fa, max_read_length,
                                                homology_mismatches)
                # write breakpoint information for each read to a file
                fields = [tx5p.tx_name, 0, tx_end_5p,
                          tx3p.tx_name, tx_start_3p, tx3p.tx_end,
                          r5p.rname,  # name
                          isize_prob,  # score
                          tx5p.strand, tx3p.strand,  # strand 1, strand 2
                          # user defined fields
                          exon_num_5p, exon_num_3p,
                          breakpoint_seq_5p, breakpoint_seq_3p,
                          homology_left, homology_right]
                fields.append('|'.join(map(str, dr5p.to_list())))
                fields.append('|'.join(map(str, dr3p.to_list())))
                outfh.write('\t'.join(map(str, fields)) + '\n')
    finally:
        # cleanup: only close what was actually opened
        if ref_fa is not None:
            ref_fa.close()
        if outfh is not None:
            outfh.close()
        bamfh.close()
    return config.JOB_SUCCESS