def get_highest_coverage_isoforms(input_file, gene_file):
    """Return the names of the best-supported chimeras per breakpoint group.

    Chimeras are grouped by the tuple (5' gene cluster, 3' gene cluster,
    5' genomic breakpoint, 3' genomic breakpoint).  Within each group only
    the chimeras whose score -- currently ``(c.get_num_frags(),)`` -- equals
    the group's maximum are kept; ties are all kept.

    Args:
        input_file: path of the chimera file handed to ``Chimera.parse``.
        gene_file: path of the gene file handed to
            ``build_transcript_cluster_map`` and
            ``build_transcript_genome_map``.

    Returns:
        A set of chimera names (``c.name``).

    Raises:
        KeyError: if a chimera references a transcript name missing from
            the transcript-to-cluster lookup table.
    """
    # place overlapping chimeras into clusters
    logging.debug("Building isoform cluster lookup table")
    # NOTE(review): each handle is closed as soon as the build_* helper
    # returns; this assumes the helpers read the file eagerly -- confirm.
    with open(gene_file) as gene_fh:
        transcript_cluster_map = build_transcript_cluster_map(gene_fh)
    # build a lookup table to get genome coordinates from transcript
    # coordinates
    with open(gene_file) as gene_fh:
        transcript_genome_map = build_transcript_genome_map(gene_fh)

    cluster_chimera_dict = collections.defaultdict(list)
    # keep the input open for the whole loop, since Chimera.parse may
    # read it lazily
    with open(input_file) as chimera_fh:
        for c in Chimera.parse(chimera_fh):
            # TODO: adjust this to score chimeras differently!
            # first element is the name, the remaining elements are the score
            key = (c.name, c.get_num_frags())
            # get cluster of overlapping genes
            cluster5p = transcript_cluster_map[c.tx_name_5p]
            cluster3p = transcript_cluster_map[c.tx_name_3p]
            # get genomic positions of breakpoints
            coord5p = transcript_to_genome_pos(c.tx_name_5p,
                                               c.tx_end_5p - 1,
                                               transcript_genome_map)
            coord3p = transcript_to_genome_pos(c.tx_name_3p,
                                               c.tx_start_3p,
                                               transcript_genome_map)
            # add to dictionary
            group = (cluster5p, cluster3p, coord5p, coord3p)
            cluster_chimera_dict[group].append(key)

    # choose highest coverage chimeras within each breakpoint group
    logging.debug("Finding highest coverage isoforms")
    kept_chimeras = set()
    # .values() (rather than .itervalues()) works on Python 2 and 3
    for stats_list in cluster_chimera_dict.values():
        # index chimera names by their score tuple
        names_by_score = collections.defaultdict(set)
        for stats_info in stats_list:
            names_by_score[stats_info[1:]].add(stats_info[0])
        # every group holds at least one entry, so max() cannot see an
        # empty mapping
        kept_chimeras.update(names_by_score[max(names_by_score)])
    return kept_chimeras
def get_chimera_groups(input_file, gene_file):
    """Yield chimeras grouped by their pair of gene clusters.

    Every chimera parsed from ``input_file`` is placed in a group keyed by
    ``(cluster5p, cluster3p)``, the clusters looked up for its 5' and 3'
    transcript names.  This is a generator: no file is opened until the
    first item is requested.

    Args:
        input_file: path of the chimera file handed to ``Chimera.parse``.
        gene_file: path of the gene file handed to
            ``build_transcript_cluster_map``.

    Yields:
        ``(key, chimeras)`` tuples, where ``key`` is
        ``(cluster5p, cluster3p)`` and ``chimeras`` is the list of parsed
        chimera objects in that group.

    Raises:
        KeyError: if a chimera references a transcript name missing from
            the transcript-to-cluster lookup table.
    """
    # build a lookup table to get gene clusters from transcript name
    # NOTE(review): the handle is closed as soon as the helper returns;
    # this assumes the helper reads the file eagerly -- confirm.
    with open(gene_file) as gene_fh:
        transcript_cluster_map = build_transcript_cluster_map(gene_fh)

    # TODO: can either group by exact breakpoint, or just by gene cluster.
    # Grouping by breakpoint would additionally need
    # build_transcript_genome_map() and transcript_to_genome_pos() to add
    # the 5'/3' genomic coordinates to the key.
    cluster_chimera_dict = collections.defaultdict(list)
    # keep the input open for the whole loop, since Chimera.parse may
    # read it lazily
    with open(input_file) as chimera_fh:
        for c in Chimera.parse(chimera_fh):
            # get cluster of overlapping genes
            cluster5p = transcript_cluster_map[c.tx_name_5p]
            cluster3p = transcript_cluster_map[c.tx_name_3p]
            # add to dictionary
            cluster_chimera_dict[(cluster5p, cluster3p)].append(c)

    # all input has been consumed and closed before the first group is
    # yielded; .items() (rather than .iteritems()) works on Python 2 and 3
    for key, chimeras in cluster_chimera_dict.items():
        yield key, chimeras