def parse_gene_chimeric_reads(bamfh):
    """Group gene-gene discordant read pairs by their reference pair.

    Read pairs come from ``parse_pairs(bamfh)``.  A pair is kept only when
    both mates carry ``DiscordantTags.DISCORDANT_GENE`` in their
    ``DISCORDANT_TAG_NAME`` tag.  Each kept pair is ordered 5' -> 3' using
    the ``ORIENTATION_TAG_NAME`` tag and both mates are converted to
    ``DiscordantRead`` objects.

    All kept pairs are buffered in memory; nothing is yielded until the
    whole input has been consumed.

    Yields:
        ``(tid5p, tid3p, pairs)`` where ``tid5p``/``tid3p`` are the ``tid``
        attributes of the 5' and 3' ``DiscordantRead`` objects and
        ``pairs`` is the list of ``(r5p, r3p)`` tuples sharing that key.
    """
    # dictionary structure holding read pairs keyed by (5' tid, 3' tid)
    chimera_dict = collections.defaultdict(list)
    for r1, r2 in parse_pairs(bamfh):
        # TODO: for now we are only going to deal with gene-gene chimeras
        # and leave other chimeras for study at a later time
        dr1 = r1.opt(DISCORDANT_TAG_NAME)
        dr2 = r2.opt(DISCORDANT_TAG_NAME)
        if (dr1 != DiscordantTags.DISCORDANT_GENE or
                dr2 != DiscordantTags.DISCORDANT_GENE):
            continue
        # organize key in 5' to 3' order
        or1 = r1.opt(ORIENTATION_TAG_NAME)
        or2 = r2.opt(ORIENTATION_TAG_NAME)
        # the two mates of a pair must not share the same orientation tag
        assert or1 != or2
        if or1 == OrientationTags.FIVEPRIME:
            pair = (r1, r2)
        else:
            pair = (r2, r1)
        # store pertinent information in lightweight DiscordantRead objects
        r5p = DiscordantRead.from_read(pair[0])
        r3p = DiscordantRead.from_read(pair[1])
        # keep list of discordant chimeric reads
        chimera_dict[(r5p.tid, r3p.tid)].append((r5p, r3p))
    # .items() (rather than the Python-2-only .iteritems()) runs on both
    # Python 2 and Python 3
    for (rname1, rname2), pairs in chimera_dict.items():
        yield rname1, rname2, pairs
def parse_gene_chimeric_reads(bamfh):
    """Collect gene-gene chimeric read pairs, grouped by (5' tid, 3' tid).

    Iterates over ``parse_pairs(bamfh)`` and skips every pair in which
    either mate's ``DISCORDANT_TAG_NAME`` tag is not
    ``DiscordantTags.DISCORDANT_GENE``.  The remaining pairs are put in
    5' -> 3' order according to their ``ORIENTATION_TAG_NAME`` tags and
    converted to ``DiscordantRead`` objects.

    The full input is read and held in memory before the first group is
    yielded.

    Yields:
        ``(tid5p, tid3p, pairs)`` tuples; ``pairs`` is a list of
        ``(r5p, r3p)`` ``DiscordantRead`` tuples whose ``tid`` attributes
        equal ``tid5p`` and ``tid3p`` respectively.
    """
    # create a dictionary structure to hold read pairs
    chimera_dict = collections.defaultdict(list)
    for r1, r2 in parse_pairs(bamfh):
        # TODO: for now we are only going to deal with gene-gene chimeras
        # and leave other chimeras for study at a later time
        dr1 = r1.opt(DISCORDANT_TAG_NAME)
        dr2 = r2.opt(DISCORDANT_TAG_NAME)
        both_gene = (dr1 == DiscordantTags.DISCORDANT_GENE and
                     dr2 == DiscordantTags.DISCORDANT_GENE)
        if not both_gene:
            continue
        # organize key in 5' to 3' order
        or1 = r1.opt(ORIENTATION_TAG_NAME)
        or2 = r2.opt(ORIENTATION_TAG_NAME)
        # mates are expected to carry different orientation tags
        assert or1 != or2
        if or1 == OrientationTags.FIVEPRIME:
            first, second = r1, r2
        else:
            first, second = r2, r1
        # convert to lightweight DiscordantRead objects
        r5p = DiscordantRead.from_read(first)
        r3p = DiscordantRead.from_read(second)
        # keep list of discordant chimeric reads
        chimera_dict[(r5p.tid, r3p.tid)].append((r5p, r3p))
    # dict.items() is available on Python 2 and 3; iteritems() is Python 2 only
    for key, pairs in chimera_dict.items():
        rname1, rname2 = key
        yield rname1, rname2, pairs
def discordant_reads_to_bedpe(index_dir, input_bam_file, output_file):
    """Convert discordant read pairs in a BAM file to a BEDPE-style text file.

    Args:
        index_dir: directory containing ``config.GENE_FEATURE_FILE``.
        input_bam_file: path of the BAM file to read.
        output_file: path of the tab-delimited file to write (overwritten).

    One line is written per pair produced by
    ``parse_gene_discordant_reads``.  Columns are: 5' transcript name,
    5' pos, 5' aend, 3' transcript name, 3' pos, 3' aend, read name of the
    5' read, a constant score of 0, the two transcript strands, and the
    '|'-joined ``DiscordantRead.to_list()`` of each mate.

    The BAM and output handles are closed even if an error occurs.
    """
    # open BAM alignment file
    bamfh = pysam.Samfile(input_bam_file, "rb")
    try:
        # build a lookup table to get genomic intervals from transcripts
        logging.debug("Reading gene information")
        gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
        tid_gene_map = build_tid_gene_map(bamfh, gene_file,
                                          rname_prefix=config.GENE_REF_PREFIX)
        with open(output_file, "w") as outfh:
            logging.debug("Converting BAM to BEDPE format")
            for r5p, r3p in parse_gene_discordant_reads(bamfh):
                # store pertinent read information in lightweight structure
                # called DiscordantRead object. this departs from SAM format
                # into a custom read format
                dr5p = DiscordantRead.from_read(r5p)
                dr3p = DiscordantRead.from_read(r3p)
                # get gene information
                tx5p = tid_gene_map[r5p.rname]
                tx3p = tid_gene_map[r3p.rname]
                # write bedpe format
                fields = [tx5p.tx_name, r5p.pos, r5p.aend,
                          tx3p.tx_name, r3p.pos, r3p.aend,
                          r5p.qname,  # read name
                          0,  # score
                          tx5p.strand, tx3p.strand,  # strand 1, strand 2
                          ]
                fields.append('|'.join(map(str, dr5p.to_list())))
                fields.append('|'.join(map(str, dr3p.to_list())))
                outfh.write('\t'.join(map(str, fields)) + '\n')
    finally:
        # the original never closed the BAM handle
        bamfh.close()
def discordant_reads_to_bedpe(index_dir, input_bam_file, output_file):
    """Convert discordant read pairs in a BAM file to a BEDPE-style text file.

    Args:
        index_dir: directory containing ``config.TRANSCRIPT_FEATURE_FILE``.
        input_bam_file: path of the BAM file to read.
        output_file: path of the tab-delimited file to write (overwritten).

    One line is written per pair produced by
    ``parse_gene_discordant_reads``.  Columns are: 5' transcript id,
    5' pos, 5' aend, 3' transcript id, 3' pos, 3' aend, read name of the
    5' read, a constant score of 0, the two transcript strands, and the
    '|'-joined ``DiscordantRead.to_list()`` of each mate.

    Every file handle opened here is closed even if an error occurs.
    """
    # open BAM alignment file
    bamfh = pysam.Samfile(input_bam_file, "rb")
    try:
        # build a lookup table to get genomic intervals from transcripts
        logging.debug("Reading transcript features")
        transcript_file = os.path.join(index_dir,
                                       config.TRANSCRIPT_FEATURE_FILE)
        # list() consumes the parser fully, so the handle can be closed
        # as soon as the with-block ends
        with open(transcript_file) as txfh:
            transcripts = list(TranscriptFeature.parse(txfh))
        tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
        with open(output_file, "w") as outfh:
            logging.debug("Converting BAM to BEDPE format")
            for r5p, r3p in parse_gene_discordant_reads(bamfh):
                # store pertinent read information in lightweight structure
                # called DiscordantRead object. this departs from SAM format
                # into a custom read format
                dr5p = DiscordantRead.from_read(r5p)
                dr3p = DiscordantRead.from_read(r3p)
                # get gene information
                tx5p = tid_tx_map[r5p.rname]
                tx3p = tid_tx_map[r3p.rname]
                # write bedpe format
                fields = [tx5p.tx_id, r5p.pos, r5p.aend,
                          tx3p.tx_id, r3p.pos, r3p.aend,
                          r5p.qname,  # read name
                          0,  # score
                          tx5p.strand, tx3p.strand,  # strand 1, strand 2
                          ]
                fields.append('|'.join(map(str, dr5p.to_list())))
                fields.append('|'.join(map(str, dr3p.to_list())))
                outfh.write('\t'.join(map(str, fields)) + '\n')
    finally:
        bamfh.close()
def filter_spanning_reads(chimeras, reads,
                          anchor_min, anchor_length, anchor_mismatches,
                          library_type):
    """Yield ``(chimera, DiscordantRead)`` for reads aligning across a breakpoint.

    Args:
        chimeras: candidate chimeras.  Iterated once per mapped read, so
            this must be re-iterable (e.g. a list); a one-shot generator
            would be exhausted after the first read.
        reads: iterable of reads; unmapped reads (``is_unmapped``) are
            skipped.
        anchor_min, anchor_length, anchor_mismatches: passed through
            unchanged to ``check_breakpoint_alignment``.
        library_type: passed to ``get_gene_orientation``.

    Side effect: every mapped read has HI/IH/NH, discordant and
    orientation tags appended to ``r.tags``.

    Yields:
        ``(c, dr)`` for every chimera ``c`` for which
        ``check_breakpoint_alignment`` is true; ``dr`` is the read's
        ``DiscordantRead`` with ``is_spanning`` set to True.  The same
        ``dr`` object is shared across all chimeras matched by one read.
    """
    # the loop index was never used, so iterate over the reads directly
    for r in reads:
        if r.is_unmapped:
            continue
        # make a discordant read object
        # TODO: need to annotate reads elsewhere since they have already
        # been sorted here
        r.tags = r.tags + [("HI", 0),
                           ("IH", 1),
                           ("NH", 1),
                           (DISCORDANT_TAG_NAME, DiscordantTags.DISCORDANT_GENE),
                           (ORIENTATION_TAG_NAME, get_gene_orientation(r, library_type))]
        dr = DiscordantRead.from_read(r)
        dr.is_spanning = True
        # check read alignment against chimeras
        for c in chimeras:
            if check_breakpoint_alignment(c, r,
                                          anchor_min,
                                          anchor_length,
                                          anchor_mismatches):
                # valid spanning read
                yield c, dr
def parse_discordant_bedpe_by_transcript_pair(fh):
    """Group consecutive BEDPE lines by their (5', 3') transcript pair.

    Args:
        fh: iterable of tab-delimited lines.  Column 0 is the 5'
            transcript, column 3 the 3' transcript, and columns 10 and 11
            hold '|'-joined fields accepted by ``DiscordantRead.from_list``.

    Only *consecutive* lines with the same transcript pair are grouped; a
    pair that reappears later in the input starts a new group.  Blank
    lines are skipped instead of raising ``IndexError``.

    Yields:
        ``(tx5p, tx3p, frags)`` where ``frags`` is the list of
        ``(DiscordantRead, DiscordantRead)`` tuples for that run of lines.
    """
    prev_key = None
    frags = []
    for line in fh:
        line = line.strip()
        if not line:
            # e.g. a trailing empty line at end of file: nothing to parse
            continue
        fields = line.split('\t')
        key = (fields[0], fields[3])
        dr5p = DiscordantRead.from_list(fields[10].split("|"))
        dr3p = DiscordantRead.from_list(fields[11].split("|"))
        if key != prev_key:
            # transcript pair changed: flush the finished group
            if frags:
                yield prev_key[0], prev_key[1], frags
                frags = []
            prev_key = key
        frags.append((dr5p, dr3p))
    # flush the last group
    if frags:
        yield prev_key[0], prev_key[1], frags
def parse_discordant_bedpe_by_transcript_pair(fh):
    """Yield runs of BEDPE lines that share a (5', 3') transcript pair.

    Args:
        fh: iterable of tab-delimited lines.  Field 0 names the 5'
            transcript and field 3 the 3' transcript; fields 10 and 11 are
            '|'-joined lists handed to ``DiscordantRead.from_list``.

    Grouping is by adjacency only: lines for the same pair that are not
    consecutive in the input end up in separate groups.  Blank lines are
    ignored rather than raising ``IndexError``.

    Yields:
        ``(tx5p, tx3p, frags)`` with ``frags`` a list of
        ``(dr5p, dr3p)`` ``DiscordantRead`` tuples.
    """
    prev_tx5p, prev_tx3p = None, None
    frags = []
    for line in fh:
        stripped = line.strip()
        if not stripped:
            # blank line (such as a trailing newline): no fields to read
            continue
        fields = stripped.split('\t')
        tx5p = fields[0]
        tx3p = fields[3]
        dr5p = DiscordantRead.from_list(fields[10].split("|"))
        dr3p = DiscordantRead.from_list(fields[11].split("|"))
        if (tx5p, tx3p) != (prev_tx5p, prev_tx3p):
            # new transcript pair: emit the group collected so far
            if frags:
                yield prev_tx5p, prev_tx3p, frags
                frags = []
            prev_tx5p, prev_tx3p = tx5p, tx3p
        frags.append((dr5p, dr3p))
    # emit the final group
    if frags:
        yield prev_tx5p, prev_tx3p, frags
def filter_spanning_reads(chimeras, reads,
                          anchor_min, anchor_length, anchor_mismatches,
                          library_type):
    """Yield ``(chimera, DiscordantRead)`` for reads aligning across a breakpoint.

    Args:
        chimeras: candidate chimeras.  Iterated once per mapped read, so
            this must be re-iterable (e.g. a list); a one-shot generator
            would be exhausted after the first read.
        reads: iterable of reads; unmapped reads (``is_unmapped``) are
            skipped.
        anchor_min, anchor_length, anchor_mismatches: passed through
            unchanged to ``check_breakpoint_alignment``.
        library_type: passed to ``get_orientation``.

    Side effect: every mapped read has HI/IH/NH, discordant and
    orientation tags appended to ``r.tags``.

    Yields:
        ``(c, dr)`` for every chimera ``c`` for which
        ``check_breakpoint_alignment`` is true; ``dr`` is the read's
        ``DiscordantRead`` with ``is_spanning`` set to True.  The same
        ``dr`` object is shared across all chimeras matched by one read.
    """
    # the loop index was never used, so iterate over the reads directly
    for r in reads:
        if r.is_unmapped:
            continue
        # make a discordant read object
        # TODO: need to annotate reads elsewhere since they have already
        # been sorted here
        r.tags = r.tags + [
            ("HI", 0),
            ("IH", 1),
            ("NH", 1),
            (DISCORDANT_TAG_NAME, DiscordantTags.DISCORDANT_GENE),
            (ORIENTATION_TAG_NAME, get_orientation(r, library_type)),
        ]
        dr = DiscordantRead.from_read(r)
        dr.is_spanning = True
        # check read alignment against chimeras
        for c in chimeras:
            if check_breakpoint_alignment(c, r,
                                          anchor_min,
                                          anchor_length,
                                          anchor_mismatches):
                # valid spanning read
                yield c, dr
def discordant_reads_to_breakpoints(index_dir, isize_dist_file,
                                    input_bam_file, output_file,
                                    trim_bp, max_read_length,
                                    homology_mismatches):
    """Write candidate breakpoints for every discordant read pair.

    For each pair from ``parse_gene_discordant_reads`` the best exon
    junction breakpoints are chosen with ``choose_best_breakpoints``, the
    breakpoint sequence and homology are obtained from
    ``extract_breakpoint_sequence``, and one tab-delimited line per
    breakpoint is written to ``output_file``.

    index_dir: directory holding ``config.GENE_FEATURE_FILE`` and the
    reference fasta ``config.ALIGN_INDEX + ".fa"``

    isize_dist_file: path to the insert size distribution file

    homology_mismatches: number of mismatches to tolerate while computing
    homology between chimeric breakpoint sequence and "wildtype" sequence

    trim_bp: when selecting the best matching exon for each read, we
    account for spurious overlap into adjacent exons by trimming the read
    by 'trim_bp'

    max_read_length: passed through to ``extract_breakpoint_sequence``

    Returns ``config.JOB_SUCCESS``.  All opened handles are closed even if
    an error occurs part-way through.
    """
    isize_fh = bamfh = ref_fa = outfh = None
    try:
        # read insert size distribution; the handle stays open until the
        # cleanup below in case the distribution reads from it lazily
        isize_fh = open(isize_dist_file)
        isize_dist = InsertSizeDistribution.from_file(isize_fh)
        # open BAM alignment file
        bamfh = pysam.Samfile(input_bam_file, "rb")
        # build a lookup table to get genomic intervals from transcripts
        logging.debug("Reading gene information")
        gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
        tid_tx_map = build_tid_tx_map(bamfh, gene_file,
                                      rname_prefix=config.GENE_REF_PREFIX)
        # open the reference sequence fasta file
        ref_fasta_file = os.path.join(index_dir, config.ALIGN_INDEX + ".fa")
        ref_fa = pysam.Fastafile(ref_fasta_file)
        # iterate through read pairs
        outfh = open(output_file, "w")
        logging.debug("Parsing discordant reads")
        for r5p, r3p in parse_gene_discordant_reads(bamfh):
            # store pertinent read information in lightweight structure
            # called DiscordantRead object. this departs from SAM format
            # into a custom read format
            dr5p = DiscordantRead.from_read(r5p)
            dr3p = DiscordantRead.from_read(r3p)
            # get gene information
            tx5p = tid_tx_map[r5p.rname]
            tx3p = tid_tx_map[r3p.rname]
            # given the insert size find the highest probability
            # exon junction breakpoint between the two transcripts
            isize_prob, breakpoints = \
                choose_best_breakpoints(r5p, r3p, tx5p, tx3p,
                                        trim_bp, isize_dist)
            # extract the sequence of the breakpoint along with the
            # number of homologous bases at the breakpoint between
            # chimera and wildtype genes
            for breakpoint in breakpoints:
                exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = breakpoint
                breakpoint_seq_5p, breakpoint_seq_3p, homology_left, homology_right = \
                    extract_breakpoint_sequence(config.GENE_REF_PREFIX + tx5p.tx_name, tx_end_5p,
                                                config.GENE_REF_PREFIX + tx3p.tx_name, tx_start_3p,
                                                ref_fa, max_read_length,
                                                homology_mismatches)
                # write breakpoint information for each read to a file
                # NOTE(review): the "name" column holds r5p.rname (the
                # reference id), not a read name -- confirm this is intended
                fields = [
                    tx5p.tx_name, 0, tx_end_5p,
                    tx3p.tx_name, tx_start_3p, tx3p.tx_end,
                    r5p.rname,  # name
                    isize_prob,  # score
                    tx5p.strand, tx3p.strand,  # strand 1, strand 2
                    # user defined fields
                    exon_num_5p, exon_num_3p,
                    breakpoint_seq_5p, breakpoint_seq_3p,
                    homology_left, homology_right,
                ]
                fields.append('|'.join(map(str, dr5p.to_list())))
                fields.append('|'.join(map(str, dr3p.to_list())))
                outfh.write('\t'.join(map(str, fields)) + '\n')
    finally:
        # cleanup: close whatever was successfully opened
        for handle in (ref_fa, outfh, bamfh, isize_fh):
            if handle is not None:
                handle.close()
    return config.JOB_SUCCESS
def discordant_reads_to_breakpoints(index_dir, isize_dist_file,
                                    input_bam_file, output_file,
                                    trim_bp, max_read_length,
                                    homology_mismatches):
    """Write candidate breakpoints for every discordant read pair.

    For each pair from ``parse_gene_discordant_reads`` the best exon
    junction breakpoints are chosen with ``choose_best_breakpoints``, the
    breakpoint sequence and homology are obtained from
    ``extract_breakpoint_sequence``, and one tab-delimited line per
    breakpoint is written to ``output_file``.

    index_dir: directory holding ``config.GENE_FEATURE_FILE`` and the
    reference fasta ``config.ALIGN_INDEX + ".fa"``

    isize_dist_file: path to the insert size distribution file

    homology_mismatches: number of mismatches to tolerate while computing
    homology between chimeric breakpoint sequence and "wildtype" sequence

    trim_bp: when selecting the best matching exon for each read, we
    account for spurious overlap into adjacent exons by trimming the read
    by 'trim_bp'

    max_read_length: passed through to ``extract_breakpoint_sequence``

    Returns ``config.JOB_SUCCESS``.  All opened handles are closed even if
    an error occurs part-way through.
    """
    isize_fh = bamfh = ref_fa = outfh = None
    try:
        # read insert size distribution; the handle stays open until the
        # cleanup below in case the distribution reads from it lazily
        isize_fh = open(isize_dist_file)
        isize_dist = InsertSizeDistribution.from_file(isize_fh)
        # open BAM alignment file
        bamfh = pysam.Samfile(input_bam_file, "rb")
        # build a lookup table to get genomic intervals from transcripts
        logging.debug("Reading gene information")
        gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
        tid_tx_map = build_tid_tx_map(bamfh, gene_file,
                                      rname_prefix=config.GENE_REF_PREFIX)
        # open the reference sequence fasta file
        ref_fasta_file = os.path.join(index_dir, config.ALIGN_INDEX + ".fa")
        ref_fa = pysam.Fastafile(ref_fasta_file)
        # iterate through read pairs
        outfh = open(output_file, "w")
        logging.debug("Parsing discordant reads")
        for r5p, r3p in parse_gene_discordant_reads(bamfh):
            # store pertinent read information in lightweight structure
            # called DiscordantRead object. this departs from SAM format
            # into a custom read format
            dr5p = DiscordantRead.from_read(r5p)
            dr3p = DiscordantRead.from_read(r3p)
            # get gene information
            tx5p = tid_tx_map[r5p.rname]
            tx3p = tid_tx_map[r3p.rname]
            # given the insert size find the highest probability
            # exon junction breakpoint between the two transcripts
            isize_prob, breakpoints = \
                choose_best_breakpoints(r5p, r3p, tx5p, tx3p,
                                        trim_bp, isize_dist)
            # extract the sequence of the breakpoint along with the
            # number of homologous bases at the breakpoint between
            # chimera and wildtype genes
            for breakpoint in breakpoints:
                exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = breakpoint
                breakpoint_seq_5p, breakpoint_seq_3p, homology_left, homology_right = \
                    extract_breakpoint_sequence(config.GENE_REF_PREFIX + tx5p.tx_name, tx_end_5p,
                                                config.GENE_REF_PREFIX + tx3p.tx_name, tx_start_3p,
                                                ref_fa, max_read_length,
                                                homology_mismatches)
                # write breakpoint information for each read to a file
                # NOTE(review): the "name" column holds r5p.rname (the
                # reference id), not a read name -- confirm this is intended
                fields = [tx5p.tx_name, 0, tx_end_5p,
                          tx3p.tx_name, tx_start_3p, tx3p.tx_end,
                          r5p.rname,  # name
                          isize_prob,  # score
                          tx5p.strand, tx3p.strand,  # strand 1, strand 2
                          # user defined fields
                          exon_num_5p, exon_num_3p,
                          breakpoint_seq_5p, breakpoint_seq_3p,
                          homology_left, homology_right]
                fields.append('|'.join(map(str, dr5p.to_list())))
                fields.append('|'.join(map(str, dr3p.to_list())))
                outfh.write('\t'.join(map(str, fields)) + '\n')
    finally:
        # cleanup: close whatever was successfully opened
        for handle in (ref_fa, outfh, bamfh, isize_fh):
            if handle is not None:
                handle.close()
    return config.JOB_SUCCESS