def get_highest_coverage_isoforms(input_file, gene_file):
    """Return the names of the best-supported chimeras at each breakpoint.

    Chimeras are grouped by the pair of transcript clusters their 5' and 3'
    partners belong to, together with the genomic coordinates of the two
    breakpoints.  Within each group the chimeras are ranked by the tuple
    (unique spanning positions, weighted coverage, fragment count), compared
    lexicographically, and every chimera tied for the top rank is kept.

    :param input_file: path of the chimera file handed to ``Chimera.parse``
    :param gene_file: path of the gene file handed to both
        ``build_tx_cluster_map`` and ``build_gene_to_genome_map``
    :returns: set of chimera names
    """
    # place overlapping chimeras into clusters
    logging.debug("Building isoform cluster lookup table")
    with open(gene_file) as gene_fh:
        tx_cluster_map = build_tx_cluster_map(gene_fh)
    # build a lookup table to get genome coordinates from transcript
    # coordinates (the gene file is re-opened so it is read from the start)
    with open(gene_file) as gene_fh:
        tx_genome_map = build_gene_to_genome_map(gene_fh)
    cluster_chimera_dict = collections.defaultdict(list)
    with open(input_file) as chimera_fh:
        for c in Chimera.parse(chimera_fh):
            # element 0 is the name, the remainder is the ranking tuple
            key = (c.name,
                   c.get_num_unique_spanning_positions(),
                   c.get_weighted_cov(),
                   c.get_num_frags())
            # get cluster of overlapping genes
            cluster5p = tx_cluster_map[c.partner5p.tx_name]
            cluster3p = tx_cluster_map[c.partner3p.tx_name]
            # get genomic positions of breakpoints; the 5' partner uses
            # end - 1 and the 3' partner uses start
            coord5p = gene_to_genome_pos(c.partner5p.tx_name,
                                         c.partner5p.end - 1,
                                         tx_genome_map)
            coord3p = gene_to_genome_pos(c.partner3p.tx_name,
                                         c.partner3p.start,
                                         tx_genome_map)
            # add to dictionary
            group = (cluster5p, cluster3p, coord5p, coord3p)
            cluster_chimera_dict[group].append(key)
    # choose highest coverage chimeras within each pair of clusters
    logging.debug("Finding highest coverage isoforms")
    kept_chimeras = set()
    # .values() works on both Python 2 and Python 3 dictionaries
    for stats_list in cluster_chimera_dict.values():
        # every group holds at least one entry, so max() cannot see an
        # empty sequence
        best_stats = max(stats_info[1:] for stats_info in stats_list)
        kept_chimeras.update(stats_info[0] for stats_info in stats_list
                             if stats_info[1:] == best_stats)
    return kept_chimeras
def get_highest_coverage_isoforms(input_file, gene_file):
    """Select, per breakpoint group, the chimera names with the top stats.

    Each chimera parsed from ``input_file`` is filed under the key
    (5' cluster, 3' cluster, 5' genomic breakpoint, 3' genomic breakpoint).
    For every such key, the chimeras whose
    (unique spanning positions, weighted coverage, fragment count) tuple is
    the largest under tuple comparison are retained; ties are all retained.

    :param input_file: path of the chimera file handed to ``Chimera.parse``
    :param gene_file: path of the gene file handed to both
        ``build_tx_cluster_map`` and ``build_gene_to_genome_map``
    :returns: set of chimera names
    """
    # place overlapping chimeras into clusters
    logging.debug("Building isoform cluster lookup table")
    with open(gene_file) as gene_fh:
        tx_cluster_map = build_tx_cluster_map(gene_fh)
    # build a lookup table to get genome coordinates from transcript
    # coordinates; a fresh handle is used so reading starts at the top
    with open(gene_file) as gene_fh:
        tx_genome_map = build_gene_to_genome_map(gene_fh)
    cluster_chimera_dict = collections.defaultdict(list)
    with open(input_file) as chimera_fh:
        for c in Chimera.parse(chimera_fh):
            key = (c.name,
                   c.get_num_unique_spanning_positions(),
                   c.get_weighted_cov(),
                   c.get_num_frags())
            # get cluster of overlapping genes
            cluster5p = tx_cluster_map[c.partner5p.tx_name]
            cluster3p = tx_cluster_map[c.partner3p.tx_name]
            # get genomic positions of breakpoints
            coord5p = gene_to_genome_pos(c.partner5p.tx_name,
                                         c.partner5p.end - 1,
                                         tx_genome_map)
            coord3p = gene_to_genome_pos(c.partner3p.tx_name,
                                         c.partner3p.start,
                                         tx_genome_map)
            # add to dictionary
            cluster_chimera_dict[(cluster5p, cluster3p,
                                  coord5p, coord3p)].append(key)
    # choose highest coverage chimeras within each pair of clusters
    logging.debug("Finding highest coverage isoforms")
    kept_chimeras = set()
    # .values() is available on Python 2 and Python 3 alike
    for stats_list in cluster_chimera_dict.values():
        # index chimera names by their stats tuple
        stats_dict = collections.defaultdict(set)
        for stats_info in stats_list:
            stats_dict[stats_info[1:]].add(stats_info[0])
        # only the highest scoring key is needed, so take the maximum
        # directly instead of sorting every key
        kept_chimeras.update(stats_dict[max(stats_dict)])
    return kept_chimeras
def annotate_multihits(bamfh, reads, tid_genome_map):
    """Tag every read with its hit index and the multi-hit counts.

    Each read in ``reads`` gets three tags appended to ``r.tags``:

    * ``HI`` - the read's index within ``reads``
    * ``IH`` - the total number of reads (unmapped ones included)
    * ``NH`` - the number of distinct ``(tid, pos)`` locations among the
      mapped reads

    A read whose ``rname`` is a key of ``tid_genome_map`` is converted with
    ``gene_to_genome_pos`` before counting, so that alignments landing on
    the same converted location count once.  Other mapped reads are
    counted at their own ``(rname, pos)``.

    :param bamfh: not used by this function; kept so existing callers keep
        working
    :param reads: iterable of alignment objects exposing ``is_unmapped``,
        ``rname``, ``pos``, ``aend`` and ``tags`` (presumably pysam reads -
        confirm against the caller)
    :param tid_genome_map: lookup table passed through to
        ``gene_to_genome_pos``
    :returns: True if at least one read is unmapped, otherwise False
    """
    # materialize once so that iterators/generators are accepted as well;
    # the read objects themselves are shared, so tagging still reaches the
    # caller's reads
    reads = list(reads)
    hits = set()
    any_unmapped = False
    for r in reads:
        if r.is_unmapped:
            any_unmapped = True
            continue
        if r.rname not in tid_genome_map:
            tid = r.rname
            pos = r.pos
        else:
            # use the position that is most 5' relative to genome; only the
            # reference id of the left end and both positions are needed
            left_tid, _, left_pos = gene_to_genome_pos(r.rname, r.pos,
                                                       tid_genome_map)
            _, _, right_pos = gene_to_genome_pos(r.rname, r.aend - 1,
                                                 tid_genome_map)
            tid = left_tid
            pos = imin2(left_pos, right_pos)
        hits.add((tid, pos))
    # both counts are fixed once the scan above is finished
    num_reads = len(reads)
    num_hits = len(hits)
    for i, r in enumerate(reads):
        # annotate reads with 'HI', 'IH' and 'NH' tags
        r.tags = r.tags + [("HI", i), ("IH", num_reads), ("NH", num_hits)]
    return any_unmapped