def build_tx_name_gene_map(genefile, rname_prefix=None):
    """Map prefixed transcript names to their parsed gene features.

    Args:
        genefile: Path of the gene annotation file.  It is opened for
            reading and handed to ``GeneFeature.parse``.
        rname_prefix: Optional string prepended to each ``tx_name`` to form
            the dictionary key.  ``None`` means no prefix.

    Returns:
        A dict keyed by ``rname_prefix + g.tx_name`` whose values are the
        objects yielded by ``GeneFeature.parse``.  When a key occurs more
        than once, the feature parsed last replaces the earlier one.
    """
    prefix = '' if rname_prefix is None else rname_prefix
    tx_map = {}
    # The context manager closes the annotation file even if parsing fails.
    # The loop stays inside it because parse() may read the file lazily.
    with open(genefile) as fileh:
        for g in GeneFeature.parse(fileh):
            tx_map[prefix + g.tx_name] = g
    return tx_map
def build_tid_tx_cluster_map(bamfh, line_iter, rname_prefix=None):
    """Assign a cluster id to every transcript reference found in the BAM.

    Transcripts are inserted into one ``ClusterTree`` per genome reference
    using their ``tx_start``/``tx_end`` span.  For every region the tree
    reports, the member transcripts are split by strand, and each
    (region, strand) group receives its own cluster id.

    Args:
        bamfh: Alignment file handle passed to ``get_rname_tid_map``.
        line_iter: Iterable of annotation lines given to
            ``GeneFeature.parse``.
        rname_prefix: Optional string prepended to each ``tx_name`` to form
            the reference name looked up in the BAM.  ``None`` means no
            prefix.

    Returns:
        A dict mapping transcript tid -> integer cluster id.  Ids start at 0
        and increase by one per (region, strand) group.

    Raises:
        KeyError: If a retained transcript's ``chrom`` is missing from the
            mapping returned by ``get_rname_tid_map``.
    """
    # presumably reference name -> tid; the helper is not visible here
    rname_tid_map = get_rname_tid_map(bamfh)
    prefix = '' if rname_prefix is None else rname_prefix
    # ClusterTree(0, 1) arguments are kept exactly as in the original
    # (presumably bx-python's mincols/minregions -- TODO confirm).
    cluster_trees = collections.defaultdict(lambda: ClusterTree(0, 1))
    genes = []
    for g in GeneFeature.parse(line_iter):
        # only use genes that are references in the sam file
        if prefix + g.tx_name not in rname_tid_map:
            continue
        genome_tid = rname_tid_map[g.chrom]
        # the position in `genes` doubles as the id stored in the tree
        cluster_trees[genome_tid].insert(g.tx_start, g.tx_end, len(genes))
        genes.append(g)
    # extract gene clusters
    tid_tx_cluster_map = {}
    current_cluster_id = 0
    # .values() (rather than the Python-2-only .iteritems()) runs on both
    # Python 2 and Python 3; the genome tid key is not needed here.
    for tree in cluster_trees.values():
        for _start, _end, indexes in tree.getregions():
            # group overlapping transcripts on same strand together
            strand_tids = collections.defaultdict(set)
            for index in indexes:
                g = genes[index]
                strand_tids[g.strand].add(rname_tid_map[prefix + g.tx_name])
            for tids in strand_tids.values():
                for tid in tids:
                    tid_tx_cluster_map[tid] = current_cluster_id
                # advance once per strand group so that all of its
                # transcripts share the same id
                current_cluster_id += 1
    return tid_tx_cluster_map
def build_gene_maps(samfh, genefile):
    """Build gene lookup structures indexed by SAM reference id.

    Only genes whose prefixed transcript name
    (``config.GENE_REF_PREFIX + tx_name``) AND whose chromosome both appear
    in ``samfh.references`` are kept; all others are skipped silently.

    Args:
        samfh: Alignment file handle exposing a ``references`` sequence.
        genefile: Path of the gene annotation file.  It is opened for
            reading and handed to ``GeneFeature.parse``.

    Returns:
        A ``(gene_genome_map, gene_trees)`` tuple where

        * ``gene_genome_map`` is a list with one slot per entry of
          ``samfh.references``; the slot at a transcript's reference index
          holds its gene feature, every other slot is ``None``.
        * ``gene_trees`` is a ``defaultdict`` mapping a chromosome's
          reference index to an ``IntervalTree`` of
          ``Interval(tx_start, tx_end, chrom=..., strand=..., value=tx_name)``.
    """
    references = samfh.references
    rname_tid_map = dict((rname, tid) for tid, rname in enumerate(references))
    gene_genome_map = [None] * len(references)
    gene_trees = collections.defaultdict(IntervalTree)
    # The context manager closes the annotation file even if parsing fails.
    # The loop stays inside it because parse() may read the file lazily.
    with open(genefile) as fileh:
        for g in GeneFeature.parse(fileh):
            name = config.GENE_REF_PREFIX + g.tx_name
            if name not in rname_tid_map or g.chrom not in rname_tid_map:
                continue
            # store gene by its own reference id in the sam file
            gene_genome_map[rname_tid_map[name]] = g
            # add gene to the interval tree of its chromosome
            gene_interval = Interval(g.tx_start, g.tx_end, chrom=g.chrom,
                                     strand=g.strand, value=g.tx_name)
            gene_trees[rname_tid_map[g.chrom]].insert_interval(gene_interval)
    return gene_genome_map, gene_trees
def build_tid_tx_cluster_map(bamfh, line_iter, rname_prefix=None):
    """Map each transcript tid in the BAM file to a strand-aware cluster id.

    Every retained transcript is inserted into the ``ClusterTree`` of its
    genome reference using ``tx_start``/``tx_end``.  The regions reported by
    each tree are then partitioned by strand, and each (region, strand)
    group is given one cluster id.

    Args:
        bamfh: Alignment file handle passed to ``get_rname_tid_map``.
        line_iter: Iterable of annotation lines given to
            ``GeneFeature.parse``.
        rname_prefix: Optional string prepended to each ``tx_name`` to form
            the reference name looked up in the BAM.  ``None`` means no
            prefix.

    Returns:
        A dict mapping transcript tid -> integer cluster id.  Ids start at 0
        and increase by one per (region, strand) group.

    Raises:
        KeyError: If a retained transcript's ``chrom`` is missing from the
            mapping returned by ``get_rname_tid_map``.
    """
    # presumably reference name -> tid; the helper is not visible here
    rname_tid_map = get_rname_tid_map(bamfh)
    prefix = '' if rname_prefix is None else rname_prefix
    # ClusterTree(0, 1) arguments are kept exactly as in the original
    # (presumably bx-python's mincols/minregions -- TODO confirm).
    cluster_trees = collections.defaultdict(lambda: ClusterTree(0, 1))
    genes = []
    for g in GeneFeature.parse(line_iter):
        # only use genes that are references in the sam file
        if prefix + g.tx_name not in rname_tid_map:
            continue
        genome_tid = rname_tid_map[g.chrom]
        # the position in `genes` doubles as the id stored in the tree
        cluster_trees[genome_tid].insert(g.tx_start, g.tx_end, len(genes))
        genes.append(g)
    # extract gene clusters
    tid_tx_cluster_map = {}
    current_cluster_id = 0
    # .values() (rather than the Python-2-only .iteritems()) runs on both
    # Python 2 and Python 3; the genome tid key is not needed here.
    for tree in cluster_trees.values():
        for _start, _end, indexes in tree.getregions():
            # group overlapping transcripts on same strand together
            strand_tids = collections.defaultdict(set)
            for index in indexes:
                g = genes[index]
                strand_tids[g.strand].add(rname_tid_map[prefix + g.tx_name])
            for tids in strand_tids.values():
                for tid in tids:
                    tid_tx_cluster_map[tid] = current_cluster_id
                # advance once per strand group so that all of its
                # transcripts share the same id
                current_cluster_id += 1
    return tid_tx_cluster_map
def build_genome_tx_trees(genefile):
    """Build one interval tree of transcripts per chromosome name.

    Args:
        genefile: Path of the gene annotation file.  It is opened for
            reading and handed to ``GeneFeature.parse``.

    Returns:
        A ``defaultdict`` mapping ``g.chrom`` to an ``IntervalTree`` holding
        one ``Interval(tx_start, tx_end, strand=..., value=g)`` per parsed
        feature.  Looking up an unseen chromosome creates an empty tree.
    """
    genome_tx_trees = collections.defaultdict(IntervalTree)
    # The context manager closes the annotation file even if parsing fails.
    # The loop stays inside it because parse() may read the file lazily.
    with open(genefile) as fileh:
        for g in GeneFeature.parse(fileh):
            # the feature itself is stored as the interval's value
            interval = Interval(g.tx_start, g.tx_end, strand=g.strand, value=g)
            genome_tx_trees[g.chrom].insert_interval(interval)
    return genome_tx_trees
def build_tid_gene_map(bamfh, genefile, rname_prefix=None):
    """Map BAM reference ids to the gene features they correspond to.

    Args:
        bamfh: Alignment file handle exposing a ``references`` sequence;
            a reference's position in that sequence is its tid.
        genefile: Path of the gene annotation file.  It is opened for
            reading and handed to ``GeneFeature.parse``.
        rname_prefix: Optional string prepended to each ``tx_name`` to form
            the reference name looked up in the BAM.  ``None`` means no
            prefix.

    Returns:
        A dict mapping tid -> gene feature for every feature whose prefixed
        transcript name is one of ``bamfh.references``.  Features without a
        matching reference are skipped.
    """
    rname_tid_map = dict((rname, tid)
                         for tid, rname in enumerate(bamfh.references))
    prefix = '' if rname_prefix is None else rname_prefix
    tid_tx_map = {}
    # The context manager closes the annotation file even if parsing fails.
    # The loop stays inside it because parse() may read the file lazily.
    with open(genefile) as fileh:
        for g in GeneFeature.parse(fileh):
            rname = prefix + g.tx_name
            # only use genes that are references in the sam file
            if rname in rname_tid_map:
                tid_tx_map[rname_tid_map[rname]] = g
    return tid_tx_map
def build_tid_gene_map(bamfh, genefile, rname_prefix=None):
    """Map BAM reference ids to the gene features they correspond to.

    Args:
        bamfh: Alignment file handle passed to ``get_rname_tid_map``.
        genefile: Path of the gene annotation file.  It is opened for
            reading and handed to ``GeneFeature.parse``.
        rname_prefix: Optional string prepended to each ``tx_name`` to form
            the reference name looked up in the BAM.  ``None`` means no
            prefix.

    Returns:
        A dict mapping tid -> gene feature for every feature whose prefixed
        transcript name is a key of the mapping returned by
        ``get_rname_tid_map``.  Features without a match are skipped.
    """
    # presumably reference name -> tid; the helper is not visible here
    rname_tid_map = get_rname_tid_map(bamfh)
    prefix = '' if rname_prefix is None else rname_prefix
    tid_tx_map = {}
    # The context manager closes the annotation file even if parsing fails.
    # The loop stays inside it because parse() may read the file lazily.
    with open(genefile) as fileh:
        for g in GeneFeature.parse(fileh):
            rname = prefix + g.tx_name
            # only use genes that are references in the sam file
            if rname in rname_tid_map:
                tid_tx_map[rname_tid_map[rname]] = g
    return tid_tx_map
def build_gene_to_genome_map(line_iter, rname_prefix=None):
    """Map prefixed transcript names to their genomic exon layout.

    Args:
        line_iter: Iterable of annotation lines given to
            ``GeneFeature.parse``.
        rname_prefix: Optional string prepended to each ``tx_name`` to form
            the dictionary key.  ``None`` means no prefix.

    Returns:
        A dict mapping ``rname_prefix + tx_name`` to a
        ``(chrom, strand, exons)`` tuple, where ``strand`` is 1 for ``'-'``
        and 0 otherwise, and ``exons`` is a list of ``(start, end)`` pairs
        taken from ``g.exons`` and reversed for minus-strand features.
        A repeated key is reported through ``logging.error`` and the later
        feature replaces the earlier one.
    """
    if rname_prefix is None:
        rname_prefix = ''
    mapping = {}
    for feature in GeneFeature.parse(line_iter):
        key = rname_prefix + feature.tx_name
        minus_flag = int(feature.strand == '-')
        exon_pairs = [(start, end) for start, end in feature.exons]
        if minus_flag:
            # minus-strand exons are stored in reverse order
            exon_pairs = exon_pairs[::-1]
        if key in mapping:
            logging.error("Duplicate references %s found in bed file" % (key))
        mapping[key] = (feature.chrom, minus_flag, exon_pairs)
    return mapping
def build_gene_interval_trees(genefile):
    """Build per-chromosome interval trees of transcript spans.

    Features sharing the same ``(chrom, tx_start, tx_end)`` are collapsed
    into a single tree interval whose value is the list of those features.

    Args:
        genefile: Path of the gene annotation file.  It is opened for
            reading and handed to ``GeneFeature.parse``.

    Returns:
        A ``defaultdict`` mapping ``g.chrom`` to an ``IntervalTree``.  Each
        interval's ``value`` is a list of every feature with that exact
        span, in parse order.  The interval's ``strand`` is the strand of
        the first feature seen with that span.
    """
    trees = collections.defaultdict(IntervalTree)
    intervals = {}
    # The context manager closes the annotation file even if parsing fails.
    # The loop stays inside it because parse() may read the file lazily.
    with open(genefile) as fileh:
        for g in GeneFeature.parse(fileh):
            key = (g.chrom, g.tx_start, g.tx_end)
            txlist = intervals.get(key)
            if txlist is None:
                # first feature with this span: create the shared list and
                # register one interval that carries it
                txlist = intervals[key] = []
                interval = Interval(g.tx_start, g.tx_end,
                                    strand=g.strand, value=txlist)
                trees[g.chrom].insert_interval(interval)
            # add isoform to value (list of isoforms that share start/end)
            txlist.append(g)
    return trees
def build_exon_interval_trees(genefile):
    """Build per-chromosome interval trees of exons.

    Exons sharing the same ``(chrom, start, end)`` are collapsed into a
    single tree interval whose value lists every transcript using that exon.

    Args:
        genefile: Path of the gene annotation file.  It is opened for
            reading and handed to ``GeneFeature.parse``.

    Returns:
        A ``defaultdict`` mapping ``g.chrom`` to an ``IntervalTree``.  Each
        interval's ``value`` is a list of ``(feature, exon_index)`` tuples,
        where ``exon_index`` is the exon's position in ``feature.exons``.
        The interval's ``strand`` is the strand of the first feature that
        contributed that exon.
    """
    exon_trees = collections.defaultdict(IntervalTree)
    exon_intervals = {}
    # The context manager closes the annotation file even if parsing fails.
    # The loop stays inside it because parse() may read the file lazily.
    with open(genefile) as fileh:
        for g in GeneFeature.parse(fileh):
            for i, e in enumerate(g.exons):
                key = (g.chrom, e[0], e[1])
                txlist = exon_intervals.get(key)
                if txlist is None:
                    # first time this exon is seen: create the shared list
                    # and register one interval that carries it
                    txlist = exon_intervals[key] = []
                    interval = Interval(e[0], e[1],
                                        strand=g.strand, value=txlist)
                    exon_trees[g.chrom].insert_interval(interval)
                # add transcript isoform
                txlist.append((g, i))
    return exon_trees
def build_tid_to_genome_map(bamfh, line_iter, rname_prefix=None):
    """Map transcript tids to their genomic exon layout.

    Args:
        bamfh: Alignment file handle passed to ``get_rname_tid_map``.
        line_iter: Iterable of annotation lines given to
            ``GeneFeature.parse``.
        rname_prefix: Optional string prepended to each ``tx_name`` to form
            the reference name looked up in the BAM.  ``None`` means no
            prefix.

    Returns:
        A dict mapping transcript tid to a ``(genome_tid, strand, exons)``
        tuple, where ``strand`` is 1 for ``'-'`` and 0 otherwise, and
        ``exons`` is a list of ``(start, end)`` pairs taken from ``g.exons``
        and reversed for minus-strand features.  A repeated tid is reported
        through ``logging.error`` and the later feature replaces the
        earlier one.

    Raises:
        KeyError: If a retained transcript's ``chrom`` is missing from the
            mapping returned by ``get_rname_tid_map``.
    """
    # presumably reference name -> tid; the helper is not visible here
    name_to_tid = get_rname_tid_map(bamfh)
    if rname_prefix is None:
        rname_prefix = ''
    result = {}
    for feature in GeneFeature.parse(line_iter):
        ref_name = rname_prefix + feature.tx_name
        if ref_name not in name_to_tid:
            # transcript is not a reference in the alignment file
            continue
        tx_tid = name_to_tid[ref_name]
        chrom_tid = name_to_tid[feature.chrom]
        minus_flag = int(feature.strand == '-')
        exon_pairs = [(start, end) for start, end in feature.exons]
        if minus_flag:
            # minus-strand exons are stored in reverse order
            exon_pairs = exon_pairs[::-1]
        if tx_tid in result:
            logging.error("Duplicate references %s found in file" % (tx_tid))
        result[tx_tid] = (chrom_tid, minus_flag, exon_pairs)
    return result
def build_exon_interval_trees(genefile):
    """Index every exon of every transcript in per-chromosome trees.

    Exons with an identical ``(chrom, start, end)`` share one tree interval;
    that interval's value accumulates all transcripts using the exon.

    Args:
        genefile: Path of the gene annotation file.  It is opened for
            reading and handed to ``GeneFeature.parse``.

    Returns:
        A ``defaultdict`` mapping ``g.chrom`` to an ``IntervalTree``.  Each
        interval's ``value`` is a list of ``(feature, exon_index)`` tuples,
        where ``exon_index`` is the exon's position in ``feature.exons``.
        The interval's ``strand`` is the strand of the first feature that
        contributed that exon.
    """
    exon_trees = collections.defaultdict(IntervalTree)
    exon_intervals = {}
    # The context manager closes the annotation file even if parsing fails.
    # The loop stays inside it because parse() may read the file lazily.
    with open(genefile) as fileh:
        for g in GeneFeature.parse(fileh):
            for i, e in enumerate(g.exons):
                key = (g.chrom, e[0], e[1])
                txlist = exon_intervals.get(key)
                if txlist is None:
                    # first time this exon is seen: create the shared list
                    # and register one interval that carries it
                    txlist = exon_intervals[key] = []
                    interval = Interval(e[0], e[1],
                                        strand=g.strand, value=txlist)
                    exon_trees[g.chrom].insert_interval(interval)
                # add transcript isoform
                txlist.append((g, i))
    return exon_trees
def build_gene_maps(samfh, genefile):
    """Build gene lookup structures indexed by SAM reference id.

    A gene is kept only when both its prefixed transcript name
    (``config.GENE_REF_PREFIX + tx_name``) and its chromosome are present
    in ``samfh.references``; anything else is skipped silently.

    Args:
        samfh: Alignment file handle exposing a ``references`` sequence.
        genefile: Path of the gene annotation file.  It is opened for
            reading and handed to ``GeneFeature.parse``.

    Returns:
        A ``(gene_genome_map, gene_trees)`` tuple where

        * ``gene_genome_map`` is a list with one slot per entry of
          ``samfh.references``; the slot at a transcript's reference index
          holds its gene feature, every other slot is ``None``.
        * ``gene_trees`` is a ``defaultdict`` mapping a chromosome's
          reference index to an ``IntervalTree`` of
          ``Interval(tx_start, tx_end, chrom=..., strand=..., value=tx_name)``.
    """
    references = samfh.references
    rname_tid_map = dict((rname, tid) for tid, rname in enumerate(references))
    gene_genome_map = [None] * len(references)
    gene_trees = collections.defaultdict(IntervalTree)
    # The context manager closes the annotation file even if parsing fails.
    # The loop stays inside it because parse() may read the file lazily.
    with open(genefile) as fileh:
        for g in GeneFeature.parse(fileh):
            name = config.GENE_REF_PREFIX + g.tx_name
            if name not in rname_tid_map or g.chrom not in rname_tid_map:
                continue
            # store gene by its own reference id in the sam file
            gene_genome_map[rname_tid_map[name]] = g
            # add gene to the interval tree of its chromosome
            gene_interval = Interval(g.tx_start, g.tx_end, chrom=g.chrom,
                                     strand=g.strand, value=g.tx_name)
            gene_trees[rname_tid_map[g.chrom]].insert_interval(gene_interval)
    return gene_genome_map, gene_trees
def build_transcript_cluster_map(line_iter, rname_prefix=None):
    """Cluster transcripts that share overlapping exons on the same strand.

    Every exon is inserted into a ``ClusterTree`` chosen by the transcript's
    chromosome and strand.  Each region reported by a tree merges the
    clusters of all transcripts it contains, so transcripts linked through
    a chain of overlapping exons end up in one cluster.

    Args:
        line_iter: Iterable of annotation lines given to
            ``GeneFeature.parse``.
        rname_prefix: Optional string prepended to each ``tx_name`` to form
            the dictionary key.  ``None`` means no prefix.

    Returns:
        A dict mapping ``rname_prefix + tx_name`` to an integer cluster id.
        Transcripts in the same cluster share an id.  Ids are not
        guaranteed to be contiguous: a cluster is visited once per member
        and its members keep the id assigned on the last visit.

    Raises:
        KeyError: If a transcript's strand is neither ``"+"`` nor ``"-"``.
    """
    prefix = '' if rname_prefix is None else rname_prefix
    # one tree per chromosome and strand; ClusterTree(0, 1) arguments are
    # kept exactly as in the original (presumably bx-python's
    # mincols/minregions -- TODO confirm)
    chrom_strand_cluster_trees = collections.defaultdict(
        lambda: {"+": ClusterTree(0, 1), "-": ClusterTree(0, 1)})
    transcripts = []
    index_cluster_map = {}
    for transcript in GeneFeature.parse(line_iter):
        cluster_tree = \
            chrom_strand_cluster_trees[transcript.chrom][transcript.strand]
        # the position in `transcripts` doubles as the id stored in the tree
        i = len(transcripts)
        for start, end in transcript.exons:
            cluster_tree.insert(start, end, i)
        # each transcript is initially in a cluster by itself
        index_cluster_map[i] = set([i])
        transcripts.append(transcript)
    # .values() (rather than the Python-2-only .itervalues()) runs on both
    # Python 2 and Python 3.
    for strand_cluster_trees in chrom_strand_cluster_trees.values():
        for cluster_tree in strand_cluster_trees.values():
            for _start, _end, indexes in cluster_tree.getregions():
                # union the region's members with the clusters they already
                # belong to, then point every member at the merged set
                newclust = set(indexes)
                for i in indexes:
                    newclust.update(index_cluster_map[i])
                for i in newclust:
                    index_cluster_map[i] = newclust
    # enumerate all clusters
    transcript_cluster_map = {}
    for cluster_id, clust in enumerate(index_cluster_map.values()):
        for i in clust:
            transcript_cluster_map[prefix + transcripts[i].tx_name] = cluster_id
    return transcript_cluster_map