コード例 #1
0
def get_highest_coverage_isoforms(input_file, gene_file):
    """Pick the best-supported chimera isoform(s) at every breakpoint.

    Chimeras read from ``input_file`` are grouped by the tuple
    ``(5' cluster, 3' cluster, 5' breakpoint coordinate,
    3' breakpoint coordinate)``.  Within each group the chimeras are
    ranked by the tuple ``(unique spanning positions, weighted coverage,
    fragment count)`` and every chimera sharing the top-ranked tuple is
    kept.

    :param input_file: path of the chimera file handed to ``Chimera.parse``
    :param gene_file: path of the gene annotation file; it is read twice,
        once per lookup table
    :returns: ``set`` of the names of the retained chimeras
    """
    # place overlapping chimeras into clusters
    logging.debug("Building isoform cluster lookup table")
    with open(gene_file) as gene_fh:
        tx_cluster_map = build_tx_cluster_map(gene_fh)
    # build a lookup table to get genome coordinates from transcript
    # coordinates
    with open(gene_file) as gene_fh:
        tx_genome_map = build_gene_to_genome_map(gene_fh)
    cluster_chimera_dict = collections.defaultdict(list)
    # keep the input open for the whole loop in case Chimera.parse
    # yields records lazily
    with open(input_file) as chimera_fh:
        for c in Chimera.parse(chimera_fh):
            # name first, then the statistics used for ranking
            key = (c.name,
                   c.get_num_unique_spanning_positions(),
                   c.get_weighted_cov(),
                   c.get_num_frags())
            # get cluster of overlapping genes
            cluster5p = tx_cluster_map[c.partner5p.tx_name]
            cluster3p = tx_cluster_map[c.partner3p.tx_name]
            # get genomic positions of breakpoints: last position of the
            # 5' partner (end - 1) and first position of the 3' partner
            coord5p = gene_to_genome_pos(c.partner5p.tx_name,
                                         c.partner5p.end - 1,
                                         tx_genome_map)
            coord3p = gene_to_genome_pos(c.partner3p.tx_name,
                                         c.partner3p.start,
                                         tx_genome_map)
            # add to dictionary
            group = (cluster5p, cluster3p, coord5p, coord3p)
            cluster_chimera_dict[group].append(key)
    # choose highest coverage chimeras within each pair of clusters
    logging.debug("Finding highest coverage isoforms")
    kept_chimeras = set()
    # .values() behaves the same on Python 2 and Python 3
    for stats_list in cluster_chimera_dict.values():
        # index chimera names by their statistics tuple
        stats_dict = collections.defaultdict(set)
        for stats_info in stats_list:
            stats_dict[stats_info[1:]].add(stats_info[0])
        # every group holds at least one chimera, so max() always has
        # something to choose from; ties on the statistics are all kept
        kept_chimeras.update(stats_dict[max(stats_dict)])
    return kept_chimeras
コード例 #2
0
def get_highest_coverage_isoforms(input_file, gene_file):
    """Return the names of the top-ranked chimeras for each breakpoint.

    Each chimera parsed from ``input_file`` is filed under the key
    ``(5' cluster, 3' cluster, 5' breakpoint coordinate,
    3' breakpoint coordinate)``.  For every key, the chimeras whose
    ``(unique spanning positions, weighted coverage, fragment count)``
    tuple compares greatest are retained; ties are all retained.

    :param input_file: path of the chimera file handed to ``Chimera.parse``
    :param gene_file: path of the gene annotation file; opened once for
        each of the two lookup tables
    :returns: ``set`` of retained chimera names
    """
    # place overlapping chimeras into clusters
    logging.debug("Building isoform cluster lookup table")
    with open(gene_file) as fileh:
        tx_cluster_map = build_tx_cluster_map(fileh)
    # build a lookup table to get genome coordinates from transcript
    # coordinates
    with open(gene_file) as fileh:
        tx_genome_map = build_gene_to_genome_map(fileh)
    cluster_chimera_dict = collections.defaultdict(list)
    # the file stays open until the loop finishes, which is required if
    # Chimera.parse reads records on demand
    with open(input_file) as fileh:
        for c in Chimera.parse(fileh):
            # element 0 is the name, the rest are the ranking statistics
            key = (c.name, c.get_num_unique_spanning_positions(),
                   c.get_weighted_cov(), c.get_num_frags())
            # get cluster of overlapping genes
            cluster5p = tx_cluster_map[c.partner5p.tx_name]
            cluster3p = tx_cluster_map[c.partner3p.tx_name]
            # get genomic positions of breakpoints
            coord5p = gene_to_genome_pos(c.partner5p.tx_name,
                                         c.partner5p.end - 1,
                                         tx_genome_map)
            coord3p = gene_to_genome_pos(c.partner3p.tx_name,
                                         c.partner3p.start,
                                         tx_genome_map)
            # add to dictionary
            cluster_chimera_dict[(cluster5p, cluster3p, coord5p,
                                  coord3p)].append(key)
    # choose highest coverage chimeras within each pair of clusters
    logging.debug("Finding highest coverage isoforms")
    kept_chimeras = set()
    # .values() is available on both Python 2 and Python 3 dictionaries
    for stats_list in cluster_chimera_dict.values():
        stats_dict = collections.defaultdict(set)
        for stats_info in stats_list:
            # index chimera names by statistics tuple
            stats_dict[stats_info[1:]].add(stats_info[0])
        # groups are never empty, so the maximum key always exists
        best_stats = max(stats_dict)
        kept_chimeras.update(stats_dict[best_stats])
    return kept_chimeras
コード例 #3
0
def annotate_multihits(bamfh, reads, tid_genome_map):
    """Append 'HI', 'IH' and 'NH' tags to every read in ``reads``.

    Mapped reads are first reduced to a set of distinct
    ``(reference id, position)`` pairs.  A read whose reference id is a
    key of ``tid_genome_map`` is converted with ``gene_to_genome_pos``
    and represented by the smaller of its two converted end positions;
    any other mapped read is represented by its own ``rname`` and
    ``pos``.  Unmapped reads add nothing to the set.

    Every read, unmapped ones included, then receives:

    * ``HI`` - its index within ``reads``
    * ``IH`` - ``len(reads)``
    * ``NH`` - the number of distinct pairs collected

    :param bamfh: not used by this function
    :param reads: sequence of reads; iterated twice and measured with
        ``len``
    :param tid_genome_map: lookup passed through to
        ``gene_to_genome_pos``
    :returns: ``True`` if at least one read is unmapped, else ``False``
    """
    distinct_hits = set()
    saw_unmapped = False
    for read in reads:
        if read.is_unmapped:
            saw_unmapped = True
            continue
        if read.rname in tid_genome_map:
            # use the position that is most 5' relative to genome;
            # the middle element of each result is not needed here
            ref_id, _, first_pos = gene_to_genome_pos(
                read.rname, read.pos, tid_genome_map)
            _, _, last_pos = gene_to_genome_pos(
                read.rname, read.aend - 1, tid_genome_map)
            location = (ref_id, imin2(first_pos, last_pos))
        else:
            # reference is not in the lookup: keep the coordinates as-is
            location = (read.rname, read.pos)
        distinct_hits.add(location)
    # these two values are identical for every read
    num_reads = len(reads)
    num_hits = len(distinct_hits)
    for index, read in enumerate(reads):
        # build a new list and assign it back instead of mutating the
        # existing tag list in place
        read.tags = read.tags + [("HI", index),
                                 ("IH", num_reads),
                                 ("NH", num_hits)]
    return saw_unmapped