def get_highest_coverage_isoforms(input_file, gene_file):
    """Return the names of the best-supported chimeras per breakpoint group.

    Chimeras parsed from ``input_file`` are grouped by the tuple
    (5' transcript cluster, 3' transcript cluster, 5' genomic breakpoint,
    3' genomic breakpoint).  Within each group only the chimeras with the
    largest ``get_num_frags()`` value are kept; ties are all kept.

    :param input_file: path to a file readable by ``Chimera.parse``
    :param gene_file: path to the gene feature file used to build the
        transcript cluster and transcript-to-genome lookup tables
    :returns: ``set`` of chimera names (``c.name``)
    :raises KeyError: if a chimera references a transcript name missing
        from the cluster lookup table
    """
    # place overlapping chimeras into clusters
    logging.debug("Building isoform cluster lookup table")
    # NOTE(review): the build_* helpers are assumed to consume the file
    # eagerly, so the handle can be closed as soon as they return.
    with open(gene_file) as gene_fh:
        transcript_cluster_map = build_transcript_cluster_map(gene_fh)
    # build a lookup table to get genome coordinates from transcript
    # coordinates
    with open(gene_file) as gene_fh:
        transcript_genome_map = build_transcript_genome_map(gene_fh)
    cluster_chimera_dict = collections.defaultdict(list)
    with open(input_file) as chimera_fh:
        for c in Chimera.parse(chimera_fh):
            # TODO: adjust this to score chimeras differently!
            key = (c.name, c.get_num_frags())
            # get cluster of overlapping genes
            cluster5p = transcript_cluster_map[c.tx_name_5p]
            cluster3p = transcript_cluster_map[c.tx_name_3p]
            # get genomic positions of breakpoints; the 5' breakpoint is the
            # last base of the 5' segment, hence tx_end_5p - 1
            coord5p = transcript_to_genome_pos(c.tx_name_5p,
                                               c.tx_end_5p - 1,
                                               transcript_genome_map)
            coord3p = transcript_to_genome_pos(c.tx_name_3p,
                                               c.tx_start_3p,
                                               transcript_genome_map)
            # add to dictionary
            group = (cluster5p, cluster3p, coord5p, coord3p)
            cluster_chimera_dict[group].append(key)
    # choose highest coverage chimeras within each pair of clusters
    logging.debug("Finding highest coverage isoforms")
    kept_chimeras = set()
    for stats_list in cluster_chimera_dict.values():
        # index chimera names by their score tuple (everything after name)
        stats_dict = collections.defaultdict(set)
        for stats_info in stats_list:
            stats_dict[stats_info[1:]].add(stats_info[0])
        # every group has at least one entry, so max() never sees an
        # empty mapping
        kept_chimeras.update(stats_dict[max(stats_dict)])
    return kept_chimeras
def annotate_multihits(bamfh, reads, transcript_tid_genome_map):
    """Tag every read with multi-hit counters and report unmapped reads.

    For each mapped read a ``(tid, pos)`` hit is recorded.  When the read's
    ``rname`` is a key of ``transcript_tid_genome_map`` the hit is converted
    through ``transcript_to_genome_pos`` and the smaller of the converted
    start and end positions is used; otherwise ``(rname, pos)`` is used
    unchanged.  Afterwards each read (mapped or not) gets three tags
    appended: ``HI`` (its index within ``reads``), ``IH`` (``len(reads)``)
    and ``NH`` (the number of distinct hits recorded).

    :param bamfh: not used by the active code (kept for interface
        compatibility)
    :param reads: sequence of read objects exposing ``is_unmapped``,
        ``rname``, ``pos``, ``aend`` and ``tags``
    :param transcript_tid_genome_map: lookup consulted with ``rname`` and
        passed through to ``transcript_to_genome_pos``
    :returns: ``True`` if at least one read was unmapped, else ``False``
    """
    saw_unmapped = False
    distinct_hits = set()
    for read in reads:
        if read.is_unmapped:
            saw_unmapped = True
            continue
        if read.rname in transcript_tid_genome_map:
            # use the position that is most 5' relative to genome
            start_tid, _, start_pos = transcript_to_genome_pos(
                read.rname, read.pos, transcript_tid_genome_map)
            _, _, end_pos = transcript_to_genome_pos(
                read.rname, read.aend - 1, transcript_tid_genome_map)
            hit = (start_tid, imin2(start_pos, end_pos))
        else:
            hit = (read.rname, read.pos)
        distinct_hits.add(hit)
    num_hits = len(distinct_hits)
    for index, read in enumerate(reads):
        # annotate reads with 'HI', 'IH' and 'NH' tags
        extra_tags = [("HI", index), ("IH", len(reads)), ("NH", num_hits)]
        read.tags = read.tags + extra_tags
    return saw_unmapped
def annotate_multihits(bamfh, reads, transcript_tid_genome_map):
    """Append ``HI``/``IH``/``NH`` tags to ``reads``; flag unmapped reads.

    Mapped reads contribute one ``(tid, pos)`` entry to a set of hits.  If
    a read's ``rname`` is present in ``transcript_tid_genome_map`` its start
    (``pos``) and last aligned base (``aend - 1``) are both converted with
    ``transcript_to_genome_pos`` and the smaller converted position is
    stored; otherwise the read's own ``rname`` and ``pos`` are stored.

    Every read then receives ``("HI", index)``, ``("IH", len(reads))`` and
    ``("NH", number of distinct hits)`` appended to its ``tags``.

    :param bamfh: unused by the active code
    :param reads: sequence of alignment objects (iterated twice)
    :param transcript_tid_genome_map: lookup keyed by ``rname``
    :returns: ``True`` when any read in ``reads`` is unmapped
    """
    genome_positions = set()
    unmapped_seen = False
    for aln in reads:
        if not aln.is_unmapped:
            if aln.rname not in transcript_tid_genome_map:
                tid, pos = aln.rname, aln.pos
            else:
                # use the position that is most 5' relative to genome
                lo = transcript_to_genome_pos(aln.rname, aln.pos,
                                              transcript_tid_genome_map)
                hi = transcript_to_genome_pos(aln.rname, aln.aend - 1,
                                              transcript_tid_genome_map)
                tid, _lo_strand, lo_pos = lo
                _hi_tid, _hi_strand, hi_pos = hi
                pos = imin2(lo_pos, hi_pos)
            genome_positions.add((tid, pos))
        else:
            unmapped_seen = True
    hit_count = len(genome_positions)
    # annotate reads with 'HI', 'IH' and 'NH' tags
    for hit_index, aln in enumerate(reads):
        aln.tags = aln.tags + [("HI", hit_index),
                               ("IH", len(reads)),
                               ("NH", hit_count)]
    return unmapped_seen
def _oriented_genome_interval(tx_name, tx_start, tx_end, transcript_genome_map):
    """Convert a transcript interval into (chrom, strand, start, end).

    ``tx_start`` and ``tx_end - 1`` are each mapped with
    ``transcript_to_genome_pos``; chrom and strand are taken from the second
    lookup.  When strand equals 1 the two genomic positions are swapped.
    """
    _, _, start = transcript_to_genome_pos(tx_name, tx_start,
                                           transcript_genome_map)
    chrom, strand, end = transcript_to_genome_pos(tx_name, tx_end - 1,
                                                  transcript_genome_map)
    if strand == 1:
        start, end = end, start
    return chrom, strand, start, end


def write_output(input_file, bam_file, output_file, index_dir):
    """Write one tab-delimited, BEDPE-style row per chimera cluster.

    Chimeras from ``input_file`` are grouped with ``get_chimera_groups``.
    For every group the best-coverage chimera (``get_best_coverage_chimera``)
    supplies coordinates, type, distance, fragment counts, isoform fractions
    and de-duplicated breakpoint-spanning reads, while transcript ids, gene
    names and chimera names are collected across the whole group.  Rows are
    sorted in descending order by unique alignment positions, then spanning
    fragments, then total fragments, and written under a ``#``-prefixed
    header line.

    :param input_file: path to the chimera file to cluster
    :param bam_file: path to the BAM file passed to ``get_wildtype_frags``
    :param output_file: path of the text file to create
    :param index_dir: directory containing ``config.GENE_FEATURE_FILE``
    :returns: ``config.JOB_SUCCESS``
    """
    gene_file = os.path.join(index_dir, config.GENE_FEATURE_FILE)
    # build a lookup table to get genome coordinates from transcript
    # coordinates
    # NOTE(review): assumes build_transcript_genome_map reads the handle
    # eagerly, so it is safe to close it right after the call.
    with open(gene_file) as gene_fh:
        transcript_genome_map = build_transcript_genome_map(gene_fh)
    tx_name_gene_map = build_tx_name_gene_map(gene_file)
    genome_tx_trees = build_genome_tx_trees(gene_file)
    # open BAM file for checking wild-type isoform
    bamfh = pysam.Samfile(bam_file, "rb")
    lines = []
    chimera_clusters = 0
    try:
        # group chimera isoforms together
        for _key, chimeras in get_chimera_groups(input_file, gene_file):
            txs5p = set()
            txs3p = set()
            genes5p = set()
            genes3p = set()
            names = set()
            for c in chimeras:
                txs5p.add("%s:%d-%d" % (c.tx_name_5p, c.tx_start_5p,
                                        c.tx_end_5p - 1))
                txs3p.add("%s:%d-%d" % (c.tx_name_3p, c.tx_start_3p,
                                        c.tx_end_3p - 1))
                genes5p.add(c.gene_name_5p)
                genes3p.add(c.gene_name_3p)
                names.add(c.name)
            # the representative chimera supplies all per-cluster statistics
            c = get_best_coverage_chimera(chimeras)
            # get chimera type and distance between genes
            chimera_type, distance = get_chimera_type(
                tx_name_gene_map[c.tx_name_5p],
                tx_name_gene_map[c.tx_name_3p],
                genome_tx_trees)
            # get genomic positions of chimera
            chrom5p, strand5p, start5p, end5p = _oriented_genome_interval(
                c.tx_name_5p, c.tx_start_5p, c.tx_end_5p,
                transcript_genome_map)
            chrom3p, strand3p, start3p, end3p = _oriented_genome_interval(
                c.tx_name_3p, c.tx_start_3p, c.tx_end_3p,
                transcript_genome_map)
            # get breakpoint spanning sequences, skipping duplicate sequences
            spanning_seqs = set()
            spanning_fasta_lines = []
            for dr in c.get_spanning_reads():
                if dr.seq in spanning_seqs:
                    continue
                spanning_seqs.add(dr.seq)
                spanning_fasta_lines.extend(
                    [">%s/%d;pos=%d;strand=%s" %
                     (dr.qname, dr.readnum + 1, dr.pos,
                      "-" if dr.is_reverse else "+"),
                     dr.seq])
            # get isoform fraction: chimeric / (chimeric + wild-type)
            num_wt_frags_5p, num_wt_frags_3p = get_wildtype_frags(c, bamfh)
            num_chimeric_frags = c.get_num_frags()
            frac5p = (float(num_chimeric_frags) /
                      (num_chimeric_frags + num_wt_frags_5p))
            frac3p = (float(num_chimeric_frags) /
                      (num_chimeric_frags + num_wt_frags_3p))
            # setup fields of BEDPE file; the sort below relies on columns
            # 16-18 being total, spanning and unique-position counts
            fields = [chrom5p, start5p, end5p,
                      chrom3p, start3p, end3p,
                      "CLUSTER%d" % (chimera_clusters),
                      num_chimeric_frags,
                      "+" if (strand5p == 0) else "-",
                      "+" if (strand3p == 0) else "-",
                      ','.join(txs5p),
                      ','.join(txs3p),
                      ','.join(genes5p),
                      ','.join(genes3p),
                      chimera_type, distance,
                      num_chimeric_frags,
                      c.get_num_spanning_frags(),
                      c.get_num_unique_positions(),
                      frac5p, frac3p,
                      ','.join(spanning_fasta_lines),
                      ','.join(names)]
            lines.append(fields)
            chimera_clusters += 1
    finally:
        # release the BAM handle even if a cluster fails to process
        bamfh.close()
    logging.debug("Clustered chimeras: %d" % (chimera_clusters))
    # sort by unique positions, spanning frags, total frags (descending)
    lines.sort(key=operator.itemgetter(18, 17, 16), reverse=True)
    header = ['#chrom5p', 'start5p', 'end5p',
              'chrom3p', 'start3p', 'end3p',
              'chimera_cluster_id', 'score',
              'strand5p', 'strand3p',
              'transcript_ids_5p', 'transcript_ids_3p',
              'genes5p', 'genes3p',
              'type', 'distance',
              'total_frags',
              'spanning_frags',
              'unique_alignment_positions',
              'isoform_fraction_5p',
              'isoform_fraction_3p',
              'breakpoint_spanning_reads',
              'chimera_ids']
    with open(output_file, "w") as f:
        f.write('\t'.join(header) + '\n')
        for fields in lines:
            f.write('\t'.join(map(str, fields)) + '\n')
    return config.JOB_SUCCESS