def build_tx_name_gene_map(genefile, rname_prefix=None):
    """Map prefixed transcript names to their parsed gene features.

    Args:
        genefile: path of the annotation file; it is opened and handed to
            ``GeneFeature.parse``.
        rname_prefix: optional string prepended to each ``tx_name`` to
            form the dictionary key; ``None`` means no prefix.

    Returns:
        dict mapping ``rname_prefix + g.tx_name`` to the feature ``g``.
        If several features produce the same key, the last one parsed
        is the one kept.
    """
    prefix = '' if rname_prefix is None else rname_prefix
    tx_map = {}
    # parse inside the with-block so the file is closed when parsing
    # finishes, including when it raises
    with open(genefile) as fileh:
        for g in GeneFeature.parse(fileh):
            tx_map[prefix + g.tx_name] = g
    return tx_map
Example #2
0
def build_tid_tx_cluster_map(bamfh, line_iter, rname_prefix=None):
    """Assign a cluster id to every annotated transcript found in a BAM.

    Features are inserted, by ``tx_start``/``tx_end``, into one
    ``ClusterTree`` per chromosome.  Within each region the tree reports,
    transcripts sharing a strand receive the same cluster id.

    Args:
        bamfh: handle passed to ``get_rname_tid_map`` to obtain the
            reference-name -> tid mapping.
        line_iter: iterable handed to ``GeneFeature.parse``.
        rname_prefix: optional string prepended to each ``tx_name`` to
            form the reference name; ``None`` means no prefix.

    Returns:
        dict mapping transcript tid to an integer cluster id.

    Note:
        The chromosome of a retained feature is looked up with a plain
        subscript, so a chromosome missing from the reference map makes
        that lookup raise.
    """
    rname_tid_map = get_rname_tid_map(bamfh)
    prefix = '' if rname_prefix is None else rname_prefix
    cluster_trees = collections.defaultdict(lambda: ClusterTree(0, 1))
    # (strand, transcript tid) of each retained feature; the position in
    # this list is the id stored in the cluster tree
    members = []
    for g in GeneFeature.parse(line_iter):
        # only use genes that are references in the sam file
        rname = prefix + g.tx_name
        if rname not in rname_tid_map:
            continue
        genome_tid = rname_tid_map[g.chrom]
        cluster_trees[genome_tid].insert(g.tx_start, g.tx_end, len(members))
        # resolve the transcript tid once here instead of rebuilding the
        # reference name again while extracting clusters
        members.append((g.strand, rname_tid_map[rname]))
    # extract gene clusters
    tid_tx_cluster_map = {}
    current_cluster_id = 0
    # values()/items() exist on both Python 2 and Python 3 dictionaries
    for tree in cluster_trees.values():
        for _start, _end, indexes in tree.getregions():
            # group overlapping transcripts on same strand together
            strand_tids = collections.defaultdict(set)
            for index in indexes:
                strand, tid = members[index]
                strand_tids[strand].add(tid)
            # every strand group of the region gets its own cluster id
            for tids in strand_tids.values():
                for tid in tids:
                    tid_tx_cluster_map[tid] = current_cluster_id
                current_cluster_id += 1
    return tid_tx_cluster_map
Example #3
0
def build_tx_name_gene_map(genefile, rname_prefix=None):
    """Build a lookup from (prefixed) transcript name to gene feature.

    Args:
        genefile: path to the annotation file parsed with
            ``GeneFeature.parse``.
        rname_prefix: string put in front of every ``tx_name`` when
            building keys; ``None`` is treated as the empty string.

    Returns:
        dict keyed by ``rname_prefix + g.tx_name`` whose values are the
        parsed features.  A later feature with the same key replaces an
        earlier one.
    """
    rname_prefix = '' if rname_prefix is None else rname_prefix
    tx_map = {}
    # the context manager guarantees the annotation file is closed once
    # all features have been read
    with open(genefile) as genefh:
        for g in GeneFeature.parse(genefh):
            tx_map[rname_prefix + g.tx_name] = g
    return tx_map
def build_gene_maps(samfh, genefile):
    """Index annotated transcripts by SAM reference id and by position.

    Args:
        samfh: object exposing ``references``, a sequence of reference
            names; the position of a name in it is used as its id (tid).
        genefile: path of the annotation file; it is opened and handed
            to ``GeneFeature.parse``.

    Returns:
        A ``(gene_genome_map, gene_trees)`` tuple.  ``gene_genome_map``
        is a list with one slot per reference; the slot of a transcript
        reference (``config.GENE_REF_PREFIX + tx_name``) holds its
        feature, all other slots stay ``None``.  ``gene_trees`` maps a
        chromosome tid to an ``IntervalTree`` of transcript intervals
        whose ``value`` is the transcript name.
    """
    rname_tid_map = dict(
        (rname, tid) for tid, rname in enumerate(samfh.references))
    gene_genome_map = [None] * len(samfh.references)
    gene_trees = collections.defaultdict(IntervalTree)
    # parse inside the with-block so the annotation file is always closed
    with open(genefile) as fileh:
        for g in GeneFeature.parse(fileh):
            name = config.GENE_REF_PREFIX + g.tx_name
            # keep a feature only when both its transcript and its
            # chromosome are references of the sam file
            if name not in rname_tid_map or g.chrom not in rname_tid_map:
                continue
            gene_tid = rname_tid_map[name]
            chrom_tid = rname_tid_map[g.chrom]
            # store gene by reference id in sam file
            gene_genome_map[gene_tid] = g
            # add gene to interval tree
            gene_interval = Interval(g.tx_start,
                                     g.tx_end,
                                     chrom=g.chrom,
                                     strand=g.strand,
                                     value=g.tx_name)
            gene_trees[chrom_tid].insert_interval(gene_interval)
    return gene_genome_map, gene_trees
Example #5
0
def build_tid_tx_cluster_map(bamfh, line_iter, rname_prefix=None):
    """Map each transcript tid to the id of its same-strand cluster.

    Every feature whose prefixed transcript name is a known reference is
    inserted (``tx_start``, ``tx_end``) into the ``ClusterTree`` of its
    chromosome.  For each region a tree reports, the transcripts are
    split by strand and each strand group is given a fresh cluster id.

    Args:
        bamfh: handle from which ``get_rname_tid_map`` derives the
            reference-name -> tid mapping.
        line_iter: iterable of annotation lines for ``GeneFeature.parse``.
        rname_prefix: optional prefix added to ``tx_name`` to obtain the
            reference name; ``None`` means no prefix.

    Returns:
        dict mapping transcript tid to integer cluster id.

    Note:
        ``rname_tid_map[g.chrom]`` is an unguarded lookup: a retained
        feature whose chromosome is not in the map makes it raise.
    """
    rname_tid_map = get_rname_tid_map(bamfh)
    rname_prefix = '' if rname_prefix is None else rname_prefix
    cluster_trees = collections.defaultdict(lambda: ClusterTree(0, 1))
    # per retained feature: (strand, transcript tid); the list index is
    # what gets stored in the cluster tree
    tx_records = []
    for g in GeneFeature.parse(line_iter):
        # only use genes that are references in the sam file
        rname = rname_prefix + g.tx_name
        if rname not in rname_tid_map:
            continue
        genome_tid = rname_tid_map[g.chrom]
        # insert into cluster tree
        cluster_trees[genome_tid].insert(g.tx_start, g.tx_end,
                                         len(tx_records))
        # the tid is looked up once here rather than a second time
        # during cluster extraction
        tx_records.append((g.strand, rname_tid_map[rname]))
    # extract gene clusters
    tid_tx_cluster_map = {}
    current_cluster_id = 0
    # values() is available on Python 2 and Python 3 alike
    for tree in cluster_trees.values():
        for _start, _end, indexes in tree.getregions():
            # group overlapping transcripts on same strand together
            strand_tx_dict = collections.defaultdict(set)
            for index in indexes:
                strand, tid = tx_records[index]
                strand_tx_dict[strand].add(tid)
            # one cluster id per strand group within the region
            for tids in strand_tx_dict.values():
                for tid in tids:
                    tid_tx_cluster_map[tid] = current_cluster_id
                current_cluster_id += 1
    return tid_tx_cluster_map
def build_genome_tx_trees(genefile):
    """Build one interval tree of transcripts per chromosome.

    Args:
        genefile: path of the annotation file; it is opened and handed
            to ``GeneFeature.parse``.

    Returns:
        defaultdict mapping chromosome name (``g.chrom``) to an
        ``IntervalTree``.  Each feature contributes one interval spanning
        ``tx_start``..``tx_end`` that carries the feature's strand and
        has the feature itself as its ``value``.
    """
    genome_tx_trees = collections.defaultdict(IntervalTree)
    # the with-block closes the annotation file once parsing is over
    with open(genefile) as fileh:
        for g in GeneFeature.parse(fileh):
            # add gene to interval tree
            interval = Interval(g.tx_start, g.tx_end,
                                strand=g.strand, value=g)
            genome_tx_trees[g.chrom].insert_interval(interval)
    return genome_tx_trees
Example #7
0
def build_genome_tx_trees(genefile):
    """Index every transcript of an annotation file by genomic interval.

    Args:
        genefile: path to the annotation file parsed with
            ``GeneFeature.parse``.

    Returns:
        defaultdict keyed by ``g.chrom``; each value is an
        ``IntervalTree`` holding one ``Interval`` per feature
        (``tx_start``, ``tx_end``, ``strand=g.strand``, ``value=g``).
    """
    genome_tx_trees = collections.defaultdict(IntervalTree)
    # reading inside a context manager ensures the file handle is
    # released even if parsing fails part-way through
    with open(genefile) as genefh:
        for g in GeneFeature.parse(genefh):
            genome_tx_trees[g.chrom].insert_interval(
                Interval(g.tx_start, g.tx_end, strand=g.strand, value=g))
    return genome_tx_trees
def build_tid_gene_map(bamfh, genefile, rname_prefix=None):
    """Map BAM reference ids to the gene features they correspond to.

    Args:
        bamfh: object exposing ``references``, a sequence of reference
            names; the position of a name in it is its tid.
        genefile: path of the annotation file; it is opened and handed
            to ``GeneFeature.parse``.
        rname_prefix: optional string prepended to each ``tx_name`` to
            form the reference name; ``None`` means no prefix.

    Returns:
        dict mapping tid to feature, restricted to features whose
        prefixed transcript name is one of the references.
    """
    rname_tid_map = dict(
        (rname, tid) for tid, rname in enumerate(bamfh.references))
    prefix = '' if rname_prefix is None else rname_prefix
    tid_tx_map = {}
    # parse inside the with-block so the annotation file is always closed
    with open(genefile) as fileh:
        for g in GeneFeature.parse(fileh):
            # only use genes that are references in the sam file
            rname = prefix + g.tx_name
            if rname not in rname_tid_map:
                continue
            tid_tx_map[rname_tid_map[rname]] = g
    return tid_tx_map
Example #9
0
def build_tid_gene_map(bamfh, genefile, rname_prefix=None):
    """Associate each transcript reference id with its gene feature.

    Args:
        bamfh: handle passed to ``get_rname_tid_map`` to obtain the
            reference-name -> tid mapping.
        genefile: path to the annotation file parsed with
            ``GeneFeature.parse``.
        rname_prefix: prefix added to ``tx_name`` to obtain the
            reference name; ``None`` is treated as the empty string.

    Returns:
        dict mapping tid to feature.  Features whose prefixed name is
        not in the reference map are left out.
    """
    rname_tid_map = get_rname_tid_map(bamfh)
    rname_prefix = '' if rname_prefix is None else rname_prefix
    tid_tx_map = {}
    # the context manager closes the annotation file after parsing
    with open(genefile) as genefh:
        for g in GeneFeature.parse(genefh):
            # only use genes that are references in the sam file
            rname = rname_prefix + g.tx_name
            if rname in rname_tid_map:
                tid_tx_map[rname_tid_map[rname]] = g
    return tid_tx_map
def build_gene_to_genome_map(line_iter, rname_prefix=None):
    """Describe each transcript's genomic layout, keyed by reference name.

    Args:
        line_iter: iterable handed to ``GeneFeature.parse``.
        rname_prefix: optional string prepended to each ``tx_name`` to
            form the key; ``None`` means no prefix.

    Returns:
        dict mapping ``rname_prefix + tx_name`` to a
        ``(chrom, strand, exons)`` tuple, where ``strand`` is 1 for
        ``'-'`` and 0 otherwise and ``exons`` is the list of
        ``(start, end)`` pairs, reversed for strand 1.  A repeated key
        is logged as an error and the later feature overwrites the
        earlier one.
    """
    if rname_prefix is None:
        rname_prefix = ''
    layout_by_rname = {}
    for feature in GeneFeature.parse(line_iter):
        key = rname_prefix + feature.tx_name
        if feature.strand == '-':
            minus = 1
        else:
            minus = 0
        # copy the exons as plain (start, end) tuples
        exons = []
        for exon_start, exon_end in feature.exons:
            exons.append((exon_start, exon_end))
        # minus-strand transcripts get their exons in reverse order
        if minus:
            exons.reverse()
        if key in layout_by_rname:
            logging.error("Duplicate references %s found in bed file" % (key))
        layout_by_rname[key] = (feature.chrom, minus, exons)
    return layout_by_rname
Example #11
0
def build_gene_to_genome_map(line_iter, rname_prefix=None):
    """Map prefixed transcript names to their chromosome, strand and exons.

    Args:
        line_iter: iterable of annotation lines for ``GeneFeature.parse``.
        rname_prefix: prefix added to ``tx_name`` to build each key;
            ``None`` is treated as the empty string.

    Returns:
        dict whose values are ``(g.chrom, strand, exon_vectors)`` tuples:
        ``strand`` is 1 when ``g.strand == '-'`` and 0 otherwise;
        ``exon_vectors`` lists the ``(start, end)`` exon pairs and is
        reversed when ``strand`` is 1.  Duplicate keys are reported via
        ``logging.error`` and the last feature wins.
    """
    prefix = rname_prefix if rname_prefix is not None else ''
    result = {}
    for gene in GeneFeature.parse(line_iter):
        ref_name = prefix + gene.tx_name
        on_minus = 0
        if gene.strand == '-':
            on_minus = 1
        exon_pairs = [(lo, hi) for lo, hi in gene.exons]
        if on_minus:
            # list exons in reverse order for the minus strand
            exon_pairs.reverse()
        already_seen = ref_name in result
        if already_seen:
            logging.error("Duplicate references %s found in bed file" %
                          (ref_name))
        result[ref_name] = (gene.chrom, on_minus, exon_pairs)
    return result
def build_gene_interval_trees(genefile):
    """Build per-chromosome interval trees of transcript spans.

    Features that share ``(chrom, tx_start, tx_end)`` are stored in a
    single interval whose ``value`` is the list of those features.

    Args:
        genefile: path of the annotation file; it is opened and handed
            to ``GeneFeature.parse``.

    Returns:
        defaultdict mapping ``g.chrom`` to an ``IntervalTree``.  The
        strand recorded on an interval is that of the first feature
        seen with those coordinates.
    """
    trees = collections.defaultdict(IntervalTree)
    intervals = {}
    # parse inside the with-block so the annotation file is always closed
    with open(genefile) as fileh:
        for g in GeneFeature.parse(fileh):
            k = (g.chrom, g.tx_start, g.tx_end)
            if k not in intervals:
                # first feature with these coordinates: create the shared
                # list and register a new interval that points at it
                txlist = []
                intervals[k] = txlist
                interval = Interval(g.tx_start, g.tx_end,
                                    strand=g.strand, value=txlist)
                trees[g.chrom].insert_interval(interval)
            else:
                txlist = intervals[k]
            # add isoform to value (list of isoforms that share start/end)
            txlist.append(g)
    return trees
Example #13
0
def build_gene_interval_trees(genefile):
    """Index transcripts by genomic span, grouping identical spans.

    Args:
        genefile: path to the annotation file parsed with
            ``GeneFeature.parse``.

    Returns:
        defaultdict keyed by ``g.chrom`` whose values are
        ``IntervalTree`` objects.  One interval is inserted per distinct
        ``(chrom, tx_start, tx_end)``; its ``value`` is a list that
        collects every feature with exactly those coordinates, and its
        strand comes from the first such feature.
    """
    trees = collections.defaultdict(IntervalTree)
    intervals = {}
    # the context manager releases the file handle when parsing ends
    with open(genefile) as genefh:
        for g in GeneFeature.parse(genefh):
            k = (g.chrom, g.tx_start, g.tx_end)
            if k in intervals:
                txlist = intervals[k]
            else:
                # new span: the list is shared between the lookup dict
                # and the interval stored in the tree
                txlist = intervals[k] = []
                trees[g.chrom].insert_interval(
                    Interval(g.tx_start, g.tx_end,
                             strand=g.strand, value=txlist))
            # add isoform to value (list of isoforms that share start/end)
            txlist.append(g)
    return trees
def build_exon_interval_trees(genefile):
    """Build per-chromosome interval trees of exons.

    Exons that share ``(chrom, start, end)`` are stored in a single
    interval whose ``value`` lists every ``(feature, exon_index)`` pair
    with those coordinates.

    Args:
        genefile: path of the annotation file; it is opened and handed
            to ``GeneFeature.parse``.

    Returns:
        defaultdict mapping ``g.chrom`` to an ``IntervalTree``.  The
        strand recorded on an interval is that of the first feature
        that contributed the exon.
    """
    exon_trees = collections.defaultdict(IntervalTree)
    exon_intervals = {}
    # parse inside the with-block so the annotation file is always closed
    with open(genefile) as fileh:
        for g in GeneFeature.parse(fileh):
            for i, e in enumerate(g.exons):
                k = (g.chrom, e[0], e[1])
                if k not in exon_intervals:
                    # first time this exon is seen: create the shared
                    # list and register a new interval that points at it
                    txlist = []
                    exon_intervals[k] = txlist
                    interval = Interval(e[0], e[1],
                                        strand=g.strand, value=txlist)
                    exon_trees[g.chrom].insert_interval(interval)
                else:
                    txlist = exon_intervals[k]
                # add transcript isoform
                txlist.append((g, i))
    return exon_trees
Example #15
0
def build_tid_to_genome_map(bamfh, line_iter, rname_prefix=None):
    """Describe the genomic layout of each transcript reference by tid.

    Args:
        bamfh: handle passed to ``get_rname_tid_map`` to obtain the
            reference-name -> tid mapping.
        line_iter: iterable handed to ``GeneFeature.parse``.
        rname_prefix: optional string prepended to each ``tx_name`` to
            form the reference name; ``None`` means no prefix.

    Returns:
        dict mapping transcript tid to ``(genome_tid, strand, exons)``,
        where ``genome_tid`` is the map entry for the feature's
        chromosome, ``strand`` is 1 for ``'-'`` and 0 otherwise, and
        ``exons`` is the list of ``(start, end)`` pairs, reversed for
        strand 1.  Features whose reference name is unknown are skipped;
        a repeated tid is logged as an error and overwritten.
    """
    name_to_tid = get_rname_tid_map(bamfh)
    if rname_prefix is None:
        rname_prefix = ''
    layout_by_tid = {}
    for feature in GeneFeature.parse(line_iter):
        ref_name = rname_prefix + feature.tx_name
        if ref_name in name_to_tid:
            tx_tid = name_to_tid[ref_name]
            chrom_tid = name_to_tid[feature.chrom]
            if feature.strand == '-':
                minus = 1
            else:
                minus = 0
            # copy the exons as plain (start, end) tuples
            exons = []
            for exon_start, exon_end in feature.exons:
                exons.append((exon_start, exon_end))
            # minus-strand transcripts get their exons in reverse order
            if minus:
                exons.reverse()
            if tx_tid in layout_by_tid:
                logging.error("Duplicate references %s found in file" % (tx_tid))
            layout_by_tid[tx_tid] = (chrom_tid, minus, exons)
    return layout_by_tid
Example #16
0
def build_exon_interval_trees(genefile):
    """Index exons by genomic span, grouping identical exons.

    Args:
        genefile: path to the annotation file parsed with
            ``GeneFeature.parse``.

    Returns:
        defaultdict keyed by ``g.chrom`` whose values are
        ``IntervalTree`` objects.  One interval is inserted per distinct
        ``(chrom, e[0], e[1])``; its ``value`` is a list of
        ``(feature, exon_index)`` tuples for every exon with exactly
        those coordinates, and its strand comes from the first feature
        that supplied it.
    """
    exon_trees = collections.defaultdict(IntervalTree)
    exon_intervals = {}
    # the context manager releases the file handle when parsing ends
    with open(genefile) as genefh:
        for g in GeneFeature.parse(genefh):
            for i, e in enumerate(g.exons):
                k = (g.chrom, e[0], e[1])
                if k in exon_intervals:
                    txlist = exon_intervals[k]
                else:
                    # new exon: the list is shared between the lookup
                    # dict and the interval stored in the tree
                    txlist = exon_intervals[k] = []
                    exon_trees[g.chrom].insert_interval(
                        Interval(e[0], e[1], strand=g.strand, value=txlist))
                # add transcript isoform
                txlist.append((g, i))
    return exon_trees
Example #17
0
def build_tid_to_genome_map(bamfh, line_iter, rname_prefix=None):
    """Map transcript tids to their chromosome tid, strand and exons.

    Args:
        bamfh: handle from which ``get_rname_tid_map`` derives the
            reference-name -> tid mapping.
        line_iter: iterable of annotation lines for ``GeneFeature.parse``.
        rname_prefix: prefix added to ``tx_name`` to obtain the reference
            name; ``None`` is treated as the empty string.

    Returns:
        dict whose values are ``(genome_tid, strand, exon_vectors)``
        tuples: ``genome_tid`` is looked up from ``g.chrom``, ``strand``
        is 1 when ``g.strand == '-'`` and 0 otherwise, and
        ``exon_vectors`` lists the ``(start, end)`` exon pairs, reversed
        when ``strand`` is 1.  Unknown reference names are skipped;
        duplicate tids are reported via ``logging.error`` and the last
        feature wins.
    """
    tid_of = get_rname_tid_map(bamfh)
    prefix = rname_prefix if rname_prefix is not None else ''
    result = {}
    for gene in GeneFeature.parse(line_iter):
        ref_name = prefix + gene.tx_name
        if ref_name not in tid_of:
            continue
        transcript_tid = tid_of[ref_name]
        chrom_tid = tid_of[gene.chrom]
        on_minus = 0
        if gene.strand == '-':
            on_minus = 1
        exon_pairs = [(lo, hi) for lo, hi in gene.exons]
        if on_minus:
            # list exons in reverse order for the minus strand
            exon_pairs.reverse()
        already_seen = transcript_tid in result
        if already_seen:
            logging.error("Duplicate references %s found in file" % (transcript_tid))
        result[transcript_tid] = (chrom_tid, on_minus, exon_pairs)
    return result
Example #18
0
def build_gene_maps(samfh, genefile):
    """Build transcript lookups by SAM reference id and genomic interval.

    Args:
        samfh: object exposing ``references``, a sequence of reference
            names whose positions serve as reference ids.
        genefile: path to the annotation file parsed with
            ``GeneFeature.parse``.

    Returns:
        ``(gene_genome_map, gene_trees)``.  ``gene_genome_map`` is a list
        as long as ``samfh.references``; entry ``i`` is the feature whose
        name ``config.GENE_REF_PREFIX + tx_name`` is reference ``i``, or
        ``None``.  ``gene_trees`` is a defaultdict from chromosome
        reference id to an ``IntervalTree`` of intervals built from
        ``tx_start``/``tx_end`` with ``value=g.tx_name``.
    """
    rname_tid_map = dict(
        (rname, i) for i, rname in enumerate(samfh.references))
    gene_genome_map = [None] * len(samfh.references)
    gene_trees = collections.defaultdict(IntervalTree)
    # the context manager releases the file handle when parsing ends
    with open(genefile) as genefh:
        for g in GeneFeature.parse(genefh):
            name = config.GENE_REF_PREFIX + g.tx_name
            # both the transcript and its chromosome must be references
            if name not in rname_tid_map:
                continue
            if g.chrom not in rname_tid_map:
                continue
            # store gene by reference id in sam file
            gene_genome_map[rname_tid_map[name]] = g
            # add gene to the interval tree of its chromosome
            gene_interval = Interval(g.tx_start, g.tx_end, chrom=g.chrom,
                                     strand=g.strand, value=g.tx_name)
            gene_trees[rname_tid_map[g.chrom]].insert_interval(gene_interval)
    return gene_genome_map, gene_trees
def build_transcript_cluster_map(line_iter, rname_prefix=None):
    """Group transcripts that are linked through shared exon regions.

    The exons of every transcript are inserted into a ``ClusterTree``
    chosen by chromosome and strand.  All transcripts reported together
    in one region of a tree are merged into one cluster, and merging is
    transitive across regions.

    Args:
        line_iter: iterable handed to ``GeneFeature.parse``.
        rname_prefix: optional string prepended to each ``tx_name`` to
            form the key; ``None`` means no prefix.

    Returns:
        dict mapping ``rname_prefix + tx_name`` to an integer cluster
        id.  Ids are equal for transcripts of the same cluster but are
        not guaranteed to be contiguous.
    """
    # one cluster tree per (chromosome, strand) pair; keying on the pair
    # gives any strand label its own tree instead of failing on labels
    # other than "+" and "-"
    cluster_trees = collections.defaultdict(lambda: ClusterTree(0, 1))
    transcripts = []
    index_cluster_map = {}
    for transcript in GeneFeature.parse(line_iter):
        # insert exons into cluster tree
        cluster_tree = cluster_trees[(transcript.chrom, transcript.strand)]
        i = len(transcripts)
        for start, end in transcript.exons:
            cluster_tree.insert(start, end, i)
        # each transcript is initially in a cluster by itself
        index_cluster_map[i] = set([i])
        transcripts.append(transcript)
    # extract gene clusters (values() works on Python 2 and Python 3)
    for cluster_tree in cluster_trees.values():
        for _start, _end, indexes in cluster_tree.getregions():
            # make new cluster by aggregating all existing
            # clusters with new indexes
            newclust = set(indexes)
            for i in indexes:
                newclust.update(index_cluster_map[i])
            # map every transcript to the new cluster
            for i in newclust:
                index_cluster_map[i] = newclust
    # enumerate all clusters
    prefix = '' if rname_prefix is None else rname_prefix
    transcript_cluster_map = {}
    # all members of a cluster map to the same set object, so a cluster
    # is visited once per member and keeps the id of its last visit;
    # this numbering is kept as-is so existing ids do not change
    for cluster_id, clust in enumerate(index_cluster_map.values()):
        for i in clust:
            transcript_cluster_map[prefix + transcripts[i].tx_name] = cluster_id
    return transcript_cluster_map