def get_highest_coverage_isoforms(input_file, gene_file):
    """Return the names of the best-scoring chimeras at each breakpoint.

    Chimeras parsed from ``input_file`` are grouped by the pair of gene
    clusters their 5' and 3' transcripts belong to, together with the
    genomic coordinates of the two breakpoints.  Within each group only the
    chimeras sharing the highest score are kept; the score is currently the
    value returned by ``get_num_frags()``.  Ties are all kept.

    Args:
        input_file: path of the file handed to ``Chimera.parse``.
        gene_file: path of the file handed to both
            ``build_transcript_cluster_map`` and
            ``build_transcript_genome_map``.

    Returns:
        A set of chimera names (``c.name``).
    """
    logging.debug("Building isoform cluster lookup table")
    # NOTE(review): the files are closed as soon as each builder returns;
    # this assumes the builders fully consume the handle before returning.
    with open(gene_file) as gene_fh:
        # lookup table: transcript name -> cluster of overlapping genes
        transcript_cluster_map = build_transcript_cluster_map(gene_fh)
    with open(gene_file) as gene_fh:
        # lookup table used to turn transcript coordinates into genome
        # coordinates
        transcript_genome_map = build_transcript_genome_map(gene_fh)

    # (cluster5p, cluster3p, coord5p, coord3p) -> [(name, score), ...]
    breakpoint_groups = collections.defaultdict(list)
    with open(input_file) as chimera_fh:
        for c in Chimera.parse(chimera_fh):
            # TODO: adjust this to score chimeras differently!
            # The score is a tuple so that more criteria can be appended
            # later and still compare lexicographically.
            score = (c.get_num_frags(),)
            # clusters of overlapping genes on each side of the chimera
            cluster5p = transcript_cluster_map[c.tx_name_5p]
            cluster3p = transcript_cluster_map[c.tx_name_3p]
            # genomic positions of the breakpoints; the 5' side uses the
            # position just before tx_end_5p
            coord5p = transcript_to_genome_pos(
                c.tx_name_5p, c.tx_end_5p - 1, transcript_genome_map)
            coord3p = transcript_to_genome_pos(
                c.tx_name_3p, c.tx_start_3p, transcript_genome_map)
            group_key = (cluster5p, cluster3p, coord5p, coord3p)
            breakpoint_groups[group_key].append((c.name, score))

    # choose the highest coverage chimeras within each group
    logging.debug("Finding highest coverage isoforms")
    kept_chimeras = set()
    # values() (rather than itervalues()) works on both Python 2 and 3
    for scored_names in breakpoint_groups.values():
        names_by_score = collections.defaultdict(set)
        for name, score in scored_names:
            names_by_score[score].add(name)
        # every group has at least one entry, so max() never sees an
        # empty sequence
        kept_chimeras.update(names_by_score[max(names_by_score)])
    return kept_chimeras
def get_chimera_groups(input_file, gene_file):
    """Yield chimeras grouped by their pair of gene clusters.

    Every chimera parsed from ``input_file`` is assigned to the group keyed
    by the clusters of its 5' and 3' transcripts.  All chimeras are read and
    grouped before the first group is yielded.

    Args:
        input_file: path of the file handed to ``Chimera.parse``.
        gene_file: path of the file handed to
            ``build_transcript_cluster_map``.

    Yields:
        ``(key, chimeras)`` tuples where ``key`` is
        ``(cluster5p, cluster3p)`` and ``chimeras`` is the list of chimera
        objects in that group, in the order they were parsed.
    """
    # NOTE(review): the file is closed as soon as the builder returns; this
    # assumes the builder fully consumes the handle before returning.
    with open(gene_file) as gene_fh:
        # lookup table: transcript name -> cluster of overlapping genes
        transcript_cluster_map = build_transcript_cluster_map(gene_fh)

    # TODO: chimeras could instead be grouped by exact breakpoint, i.e. by
    # (cluster5p, cluster3p, coord5p, coord3p) with the coordinates obtained
    # from transcript_to_genome_pos() and build_transcript_genome_map().
    # For now they are grouped by gene cluster pair only.
    groups = collections.defaultdict(list)
    with open(input_file) as chimera_fh:
        for c in Chimera.parse(chimera_fh):
            cluster5p = transcript_cluster_map[c.tx_name_5p]
            cluster3p = transcript_cluster_map[c.tx_name_3p]
            groups[(cluster5p, cluster3p)].append(c)

    # items() (rather than iteritems()) works on both Python 2 and 3
    for key, chimeras in groups.items():
        yield key, chimeras