def __init__(self, chromosome, start, end=None, name=None, build=genomeUtils.hg19, strand=None, attributes=None):
    '''
    Create a genome feature that spans [start, stop) of chromosome in build
    (using 0-based BED coordinates).

    :param chromosome: chromosome name; standardized via genomeUtils
    :param start: 0-based start position
    :param end: 0-based exclusive end; defaults to start+1 (a 1 bp feature)
    :param name: feature name; defaults to "chrom_start_end"
    :param build: genome build (default genomeUtils.hg19)
    :param strand: strand, passed through to Interval
    :param attributes: optional dict of extra attributes; a fresh dict is
        created when omitted
    :raises Exception: if the chromosome cannot be standardized
    '''
    temp = genomeUtils.standardizeChromosome(chromosome, build)
    if temp == None:
        raise Exception('Invalid Chromosome: %s' % chromosome)
    chromosome = temp
    start, self.genomeStart = genomeUtils.standardizePosition(chromosome, start, build)
    if end == None:
        # assume that the size is only 1 bp
        end = start + 1
        self.genomeEnd = self.genomeStart + 1
    else:
        end, self.genomeEnd = genomeUtils.standardizePosition(chromosome, end, build)
    self.hashCode = "%s_%i_%i" % (chromosome, start, end)
    if name == None:
        self.name = self.hashCode
    else:
        self.name = name
    self.build = build
    self.strand = strand
    # FIX: the original signature used `attributes={}` — a mutable default
    # shared by every instance constructed without an explicit argument, so
    # one feature's attribute edits leaked into all the others. Using None
    # as the sentinel gives each instance its own dict.
    self.attributes = {} if attributes is None else attributes
    Interval.__init__(self, start=start, end=end, value=self.attributes, chrom=chromosome, strand=strand)
    # I use these objects for nested structures such as exons inside a gene, etc
    self.queryObjects = set()
    self.children = set()
def get_reads_and_ranges(bam_region, cid, chrom, region_start, region_end, strand, options):
    """
    Collect filtered reads overlapping [region_start, region_end] and tally
    per-position counts.

    Returns a tuple (filtered_reads, pos_range):
      filtered_reads -- Intersecter holding the accepted reads' 1-based spans
      pos_range      -- defaultdict position -> [count, count]; the exact
                        meaning of the two slots is defined by the inc_pr /
                        inc_pr_at helpers, which are not visible here

    NOTE(review): cid and strand are accepted but never read in this body —
    presumably kept for interface symmetry with callers; confirm.
    """
    pos_range = defaultdict(lambda: [0, 0])
    filtered_reads = Intersecter()
    read_iterator = filter_reads(
        bam_region, options.chrom_prefix + chrom, region_start, region_end, options
    )
    for read in read_iterator:
        if is_valid_paired(read, region_start, options):
            # Properly paired read: span the whole template, converted to
            # 1-based inclusive coordinates via pos/pnext and the insert size.
            rstart = min(read.pos, read.pnext) + 1
            rend = rstart + abs(read.isize) - 1
            filtered_reads.add_interval(Interval(rstart, rend))
            inc_pr(pos_range, rstart, rend, region_start, region_end)
            # Both fragment endpoints are counted.
            inc_pr_at(pos_range, rstart, region_start, region_end)
            inc_pr_at(pos_range, rend, region_start, region_end)
        elif is_valid_single(read, options):
            # Single read: span length comes from the CIGAR alignment length.
            rstart = read.pos + 1
            rend = rstart + aln_length(read.cigar) - 1
            filtered_reads.add_interval(Interval(rstart, rend))
            inc_pr(pos_range, rstart, rend, region_start, region_end)
            if as_merged(read, options) or as_trimmed(read, options):
                # Merged/trimmed reads define both fragment ends.
                inc_pr_at(pos_range, rstart, region_start, region_end)
                inc_pr_at(pos_range, rend, region_start, region_end)
            elif read.is_reverse:
                # Otherwise only the sequenced end is counted: the 3' end for
                # reverse reads, the 5' end for forward reads.
                inc_pr_at(pos_range, rend, region_start, region_end)
            else:
                inc_pr_at(pos_range, rstart, region_start, region_end)
    return filtered_reads, pos_range
def setUp(self):
    # Root the tree at [50, 59]; every other decade-aligned interval in
    # [0, 100] is then inserted around it (50 is skipped since it is the root).
    tree = IntervalNode(50, 59, Interval(50, 59))
    for lo in range(0, 110, 10):
        if lo == 50:
            continue
        ivl = Interval(lo, lo + 9)
        tree = tree.insert(ivl.start, ivl.end, ivl)
    self.intervals = tree
def setUp(self):
    # Same fixture as the IntervalNode variant, but via IntervalTree:
    # [50, 59] goes in first, then the remaining decade intervals in order.
    tree = IntervalTree()
    tree.add_interval(Interval(50, 59))
    for lo in range(0, 110, 10):
        if lo != 50:
            tree.add_interval(Interval(lo, lo + 9))
    self.intervals = tree
def regionTree2(tmp, resFrag, strand, info, geneid):
    """
    Insert one exon region into a strand- and chromosome-keyed interval forest.

    tmp: sequence with at least (chrom, start, end) in positions 0-2
    resFrag: dict strand -> {chrom -> Intersecter}, updated in place
    info: exon annotation stored under value['exon']
    geneid: gene id stored under value['geneid']
    """
    if strand not in resFrag:
        resFrag[strand] = {}
    # FIX(consistency): the original duplicated the add_interval call in both
    # branches; create the missing tree first and add once, matching the
    # pattern used by regionTree.
    if tmp[0] not in resFrag[strand]:
        resFrag[strand][tmp[0]] = Intersecter()
    resFrag[strand][tmp[0]].add_interval(
        Interval(int(tmp[1]), int(tmp[2]), value={"exon": info, 'geneid': geneid}))
def test_left(self):
    tree = self.intervals
    # The two nearest intervals strictly left of 60, closest first.
    expected = str([Interval(50, 59), Interval(40, 49)])
    self.assertEqual(str(tree.left(60, n=2)), expected)
    # Every decade boundary's closest left neighbour ends one base before it.
    for pos in range(10, 100, 10):
        nearest = tree.left(pos, max_dist=10, n=1)
        self.assertEqual(nearest[0].end, pos - 1)
def setUp(self):
    # Seed the tree with [1, 2], bulk-insert zero-length intervals every ten
    # bases up to self.max, then pile 600 copies of [0, 1] on top to stress
    # duplicate handling.
    root = IntervalNode(1, 2, Interval(1, 2))
    self.max = 1000000
    for pos in range(0, self.max, 10):
        ivl = Interval(pos, pos)
        root = root.insert(ivl.start, ivl.end, ivl)
    for _ in range(600):
        root = root.insert(0, 1, Interval(0, 1))
    self.intervals = root
def load_restriction_fragment(in_file, minfragsize=None, maxfragsize=None, verbose=False):
    """
    Read a BED file of restriction fragments and index them in search trees.

    Adapted from the same-named function in HiC-Pro
    (https://github.com/nservant/HiC-Pro/blob/master/scripts/mapped_2hic_fragments.py)
    by Nicolas Servant, Eric Viara.

    Intervals are zero-based objects. The output is a dict with one
    Intersecter per chromosome; each stored Interval carries the fragment
    name and midpoint in its value dict.

    in_file     = input BED file path [character]
    minfragsize = discard fragments shorter than this [int or None]
    maxfragsize = discard fragments longer than this [int or None]
    verbose     = verbose mode [logical]
    """
    resFrag = {}
    if verbose:
        print("## Loading Restriction File Intervals '" + in_file + "'...")
    bed_handle = open(in_file)
    nline = 0
    for line in bed_handle:
        nline += 1
        bedtab = line.split("\t")
        try:
            chromosome, start, end, name = bedtab[:4]
        except ValueError:
            # FIX: the original concatenated the int line number with '+',
            # which raised a TypeError instead of printing the warning.
            print("Warning : wrong input format in line %d. Not a BED file !?" % nline)
            continue
        # BED files are zero-based, as are Intervals objects
        start = int(start)  # + 1
        end = int(end)
        midPoint = (start + end) / 2
        fragl = abs(end - start)
        name = name.strip()
        ## Discard fragments outside the size range.
        # FIX: fragl is an int, so it must be %-formatted, not '+'-joined.
        if minfragsize is not None and fragl < int(minfragsize):
            print("Warning : fragment %s [%d] outside of range. Discarded" % (name, fragl))
            continue
        if maxfragsize is not None and fragl > int(maxfragsize):
            print("Warning : fragment %s [%d] outside of range. Discarded" % (name, fragl))
            continue
        if chromosome not in resFrag:
            resFrag[chromosome] = Intersecter()
        resFrag[chromosome].add_interval(
            Interval(start, end, value={'name': name, 'midPoint': midPoint}))
    bed_handle.close()
    return resFrag
def load_restriction_fragment(in_file, minfragsize=None, maxfragsize=None, verbose=False):
    """
    Read a BED file and store the intervals in a tree.
    Intervals are zero-based objects. The output object is a hash table with
    one search tree (Intersecter) per chromosome.

    in_file = input file [character]
    verbose = verbose mode [logical]
    minfragsize / maxfragsize = size bounds; fragments outside the range are
        still stored but tagged with value['filter'] = True so callers can
        skip them (despite the final warning saying "discarded").

    Python 2 code (print statements).
    """
    resFrag = {}
    if verbose:
        print "## Loading Restriction File Intervals '", in_file, "'..."
    bed_handle = open(in_file)
    nline = 0
    nfilt = 0  # how many fragments fell outside the size range
    for line in bed_handle:
        nline += 1
        bedtab = line.split("\t")
        try:
            chromosome, start, end, name = bedtab[:4]
        except ValueError:
            print "Warning : wrong input format in line", nline, ". Not a BED file !?"
            continue
        # BED files are zero-based as Intervals objects
        start = int(start)  # + 1
        end = int(end)
        fragl = abs(end - start)
        name = name.strip()
        ## Flag fragments outside the size range (they stay in the tree)
        filt = False
        if minfragsize != None and int(fragl) < int(minfragsize):
            nfilt += 1
            filt = True
        elif maxfragsize != None and int(fragl) > int(maxfragsize):
            nfilt += 1
            filt = True
        if chromosome in resFrag:
            tree = resFrag[chromosome]
            tree.add_interval(Interval(start, end, value={'name': name, 'filter': filt}))
        else:
            tree = Intersecter()
            tree.add_interval(Interval(start, end, value={'name': name, 'filter': filt}))
            resFrag[chromosome] = tree
    if nfilt > 0:
        print "Warning : ", nfilt, "fragment(s) outside of range and discarded. ", nline - nfilt, " remaining."
    bed_handle.close()
    return resFrag
def test_downstream(self):
    tree = self.intervals
    # On the default (+) strand, downstream means larger coordinates.
    for hit in tree.downstream_of_interval(Interval(59, 60), num_intervals=200):
        self.assertTrue(hit.start > 60)
    # On the minus strand, downstream flips to smaller coordinates.
    for hit in tree.downstream_of_interval(Interval(59, 60, strand=-1), num_intervals=200):
        self.assertTrue(hit.start < 59)
def add_pvalues_to_peaks_frame_macs_bf(peaks_frame, experiment_peaks_frame, TTAA_frame, lam_win_size, pseudocounts=0.2, macs_pvalue=True):
    """
    Annotate each peak in peaks_frame with a MACS-style Poisson p-value.

    The local background rate (lambda) is estimated from experiment hops in a
    window of lam_win_size centred on each peak, normalised by the number of
    TTAA sites in the same window; significance is then the Poisson upper tail
    of the peak's observed hop count.

    peaks_frame            -- DataFrame with Chr/Start/End/Center/"Experiment Hops";
                              mutated in place (Lambda Type / Lambda /
                              Poisson pvalue columns added) and returned
    experiment_peaks_frame -- per-hop DataFrame with Chr and Start columns
    TTAA_frame             -- DataFrame of TTAA site positions (Chr, Start)
    lam_win_size           -- background window size in bp
    pseudocounts           -- added to both observed counts and lambda

    NOTE(review): macs_pvalue is accepted but never read in this body, and
    list_of_l_names is built but unused — confirm both are vestigial.
    Python 2 code (print statements).
    """
    print "lab specific hohoho"
    experiment_gnashy_dict = {}
    experiment_dict_of_trees = {}
    TTAA_frame_gbChr_dict = {}
    TTAA_dict_of_trees = {}
    list_of_l_names = [lam_win_size]
    print "Making interval tree for experiment hops..."
    for name, group in experiment_peaks_frame.groupby('Chr'):
        experiment_gnashy_dict[name] = group
        # Re-index by Start so each row's index is its genomic position.
        experiment_gnashy_dict[name].index = experiment_gnashy_dict[name]["Start"]
        #initialize tree
        experiment_dict_of_trees[name] = Intersecter()
        #populate tree with position as interval (each hop spans 3 bp)
        for idx, row in experiment_gnashy_dict[name].iterrows():
            experiment_dict_of_trees[name].add_interval(Interval(int(idx), int(idx) + 3))
    print "Making interval tree for TTAAs..."
    #make interval tree for TTAAs
    for name, group in TTAA_frame.groupby('Chr'):
        TTAA_frame_gbChr_dict[name] = group
        TTAA_frame_gbChr_dict[name].index = TTAA_frame_gbChr_dict[name]["Start"]
        #initialize tree
        TTAA_dict_of_trees[name] = Intersecter()
        #populate tree with position as interval
        for idx, row in TTAA_frame_gbChr_dict[name].iterrows():
            TTAA_dict_of_trees[name].add_interval(Interval(int(idx), int(idx + 3)))
    #go through cluster frame and compute pvalues
    lambda_type_list = []
    lambda_list = []
    pvalue_list = []
    for idx, row in peaks_frame.iterrows():
        #add number of background hops in cluster to frame
        cluster_center = row["Center"]
        #find lambda and compute significance of cluster
        num_TTAAs = len(TTAA_dict_of_trees[row["Chr"]].find(row["Start"], row["End"]))
        #compute lambda for the background window around the peak centre
        num_exp_hops_lam_win_size = len(experiment_dict_of_trees[row["Chr"]].find(cluster_center - (lam_win_size / 2 - 1), cluster_center + lam_win_size / 2))
        num_TTAAs_lam_win_size = len(TTAA_dict_of_trees[row["Chr"]].find(cluster_center - (lam_win_size / 2 - 1), cluster_center + lam_win_size / 2))
        # hops per TTAA site in the window; max(...,1) avoids division by zero
        lambda_win_size = float(num_exp_hops_lam_win_size) / (max(num_TTAAs_lam_win_size, 1))
        lambda_f = lambda_win_size
        lambda_type_list.append(lam_win_size)
        lambda_list.append(lambda_f)
        #compute pvalue and record it (Poisson upper tail with pseudocounts)
        pvalue = 1 - scistat.poisson.cdf((row["Experiment Hops"] + pseudocounts), lambda_f * max(num_TTAAs, 1) + pseudocounts)
        pvalue_list.append(pvalue)
    #make frame from all of the lists
    peaks_frame["Lambda Type"] = lambda_type_list
    peaks_frame["Lambda"] = lambda_list
    peaks_frame["Poisson pvalue"] = pvalue_list
    return peaks_frame
def load_exons_and_genes(genesF):
    """
    Load transcript models from a UCSC-style table into two interval forests.

    Returns (etrees, gtrees):
      etrees -- dict chrom -> IntervalTree of exons, each valued with
                [cluster_id, alignID, strand]
      gtrees -- dict chrom -> IntervalTree of gene extents (the union span of
                all isoforms sharing a cluster_id), valued with cluster_id

    Coordinates are converted to 1-based closed intervals, then widened by one
    on each side because the interval tree treats both endpoints as open.

    Python 2 code (dict.iteritems).
    NOTE(review): header detection is `line[0] == 'c'` — any data line whose
    first field starts with 'c' would also be skipped; confirm input format.
    """
    gtrees = {}
    etrees = {}
    genesIN = open(genesF)
    #map transcript (isoform) name to cluster_id ("gene")
    t_to_gene_map = {}
    #load individual transcripts (isoforms)
    for line in genesIN:
        #skip header
        if line[0] == 'c':
            continue
        line = line.rstrip()
        fields = line.split('\t')
        (cluster_id, tname, refid, strand, tstart, tend) = fields[:6]
        refid = refid.replace("chr", "")
        eStarts = fields[9].split(',')
        eEnds = fields[10].split(',')
        alignID = fields[12]
        #now save the exons as intervals
        if refid not in etrees:
            etrees[refid] = IntervalTree()
        #use 1-based closed interval
        tstart = int(tstart) + 1
        for (eStart, eEnd) in zip(eStarts, eEnds):
            # trailing comma in the BED-style list yields an empty field
            if len(eStart) == 0:
                continue
            #use 1-based closed interval
            eStart = int(eStart) + 1
            #sys.stderr.write("%s %s %s\n"%(eStart,eEnd,cluster_id))
            #must adjust for the open intervals (both) of the interval tree
            itv = Interval(eStart - 1, int(eEnd) + 1, value=[cluster_id, alignID, strand])
            etrees[refid].insert_interval(itv)
        #now map to the cluster_id and figure whether we can increase
        #the longest transcript coordinate span with these coordinates
        tend = int(tend)
        if cluster_id not in t_to_gene_map:
            t_to_gene_map[cluster_id] = [tstart, tend, refid]
        if tstart < t_to_gene_map[cluster_id][0]:
            t_to_gene_map[cluster_id][0] = tstart
        if tend > t_to_gene_map[cluster_id][1]:
            t_to_gene_map[cluster_id][1] = tend
    genesIN.close()
    #now convert the cluster (gene) coordinate extents to intervals
    for (cluster_id, span) in t_to_gene_map.iteritems():
        (st, en, refid) = span
        if refid not in gtrees:
            gtrees[refid] = IntervalTree()
        #sys.stderr.write("%d %d %s\n"%(st,en,cluster_id))
        #must adjust for the open intervals (both) of the interval tree
        itv = Interval(int(st) - 1, int(en) + 1, value=cluster_id)
        gtrees[refid].insert_interval(itv)
    return (etrees, gtrees)
def add_exon(self, rStart0, rEnd1, sStart0, sEnd1, rstrand, score):
    """
    Record one aligned exon, keeping ref_exons in ascending reference order.

    Minus-strand exons arrive in descending reference order and are therefore
    prepended (to scores, ref_exons and seq_exons alike); plus-strand exons
    are appended. Asserts that the new exon does not overlap or reorder the
    exons recorded so far.
    """
    assert rStart0 < rEnd1 and sStart0 < sEnd1
    if rstrand == '-':
        # New exon must lie entirely before the current first exon.
        assert not self.ref_exons or self.ref_exons[0].start >= rEnd1
        self.scores.insert(0, score)
        self.ref_exons.insert(0, Interval(rStart0, rEnd1))
        self.seq_exons.insert(0, Interval(sStart0, sEnd1))
    else:
        # New exon must lie entirely after the current last exon.
        assert not self.ref_exons or self.ref_exons[-1].end <= rStart0
        self.scores.append(score)
        self.ref_exons.append(Interval(rStart0, rEnd1))
        self.seq_exons.append(Interval(sStart0, sEnd1))
def BED_to_interval_tree(BED_file):
    """
    Index the regions of a BED file, one IntervalTree per chromosome.

    :param BED_file: open file handle over BED-formatted lines
    :return: dict mapping chromosome name -> IntervalTree of its regions
    """
    from bx.intervals.intersection import IntervalTree, Interval

    trees = {}
    for line in BED_file:
        if line[0] == "#":
            continue
        fields = line.strip().split()
        chrom = fields[0]
        start_bed, end_bed = int(fields[1]), int(fields[2])
        if chrom not in trees:
            trees[chrom] = IntervalTree()
        trees[chrom].add_interval(Interval(start_bed, end_bed))
    return trees
def convert_BLAST9rec_to_gmapRecord(rec_list):
    """
    Convert a list of BLAST9 records into a single gmapRecord.

    All records must share the same subject id (sID), query id (qID) and
    strand; the subject alignment spans become .ref_exons so the record can
    be written in UCSC format.

    :raises RuntimeError: on an empty list or inconsistent sID/qID/strand
    """
    if not len(rec_list) > 0:
        raise RuntimeError("Cannot convert an empty record list!")
    seqname = rec_list[0].sID
    seqid = rec_list[0].qID
    strand = rec_list[0].strand
    # FIX: the original compared x.sID against the builtin `chr` function
    # instead of seqname, so inconsistent subject ids were never detected.
    if not all(x.sID == seqname for x in rec_list):
        raise RuntimeError(
            "The record list has differing `sID` values - they must all be the same!"
        )
    if not all(x.qID == seqid for x in rec_list):
        raise RuntimeError(
            "The record list has differing `qID` values - they must all be the same!"
        )
    if not all(x.strand == strand for x in rec_list):
        raise RuntimeError(
            "The record list has differing `strand` values - they must all be the same!"
        )
    r = gmapRecord(seqname, coverage=0, identity=0, strand=strand, seqid=seqid)
    r.ref_exons = [Interval(x.sStart, x.sEnd) for x in rec_list]
    return r
def load_bed(read_length, path, flanking):
    """
    Parse a BED file of targets, dropping overlapping or too-small regions.

    :param read_length: read length; targets narrower than 2 * read_length
                        (after flanking) are discarded
    :param path: BED file path
    :param flanking: integer added to each target's start and end
    :return: list of unique, non-overlapping [chrom, start, end] targets
             (FIX: the original docstring claimed a dictionary was returned)
    """
    targets = {}
    unique_targets = []
    bed = BedTool(path)
    for record in bed:
        chrom, start, end = record[0], int(record[1]), int(record[2])
        start -= flanking
        end += flanking
        if end < start:
            # target described on the minus strand: flip the coordinates
            start, end = end, start
        if chrom not in targets:
            targets[chrom] = Intersecter()
        # keep only targets that do not overlap anything already kept and
        # that are wide enough to fit a read pair
        if targets[chrom].find(start, end) == [] and abs(start - end) >= read_length * 2:
            targets[chrom].add_interval(Interval(start, end))
            unique_targets.append([chrom, start, end])
    return unique_targets
def parse_models(bedfile):
    '''Yield, per BED12 record, the list of exon Intervals of that gene model.'''
    reader = csv.reader(open(bedfile), dialect='excel-tab')
    for row in reader:
        chrom = row[0]
        chrom_start = int(row[1])
        geneid = row[3]
        sizes = [int(s) for s in row[10].split(',')]
        starts = [chrom_start + int(s) for s in row[11].split(',')]
        exons = []
        for i in range(len(starts)):
            # First and last exons are flagged as terminal.
            terminal = i == 0 or i == len(starts) - 1
            exon_start = starts[i]
            exons.append(Interval(exon_start, exon_start + sizes[i],
                                  value={'geneid': geneid,
                                         'terminal': terminal,
                                         'chrom': chrom}))
        yield exons
def load_repeats(repeatsF):
    '''
    Load a repeat annotation table into per-chromosome IntervalTrees.

    Coordinates are converted to 1-based closed starts, then each stored
    interval is widened by one on both sides because the tree treats both
    endpoints as open. Duplicate (start, end) spans are inserted only once.
    Each interval's value is [name, type, strand].
    '''
    rtrees = {}
    seen = set()
    handle = open(repeatsF)
    header_skipped = False
    for line in handle:
        # skip the single header line
        if not header_skipped:
            header_skipped = True
            continue
        fields = line.rstrip().split('\t')
        refid, st, en = fields[5:8]
        refid = refid.replace("chr", "")
        strand = fields[9]
        tname = fields[10]
        gtype = fields[11]
        # use 1-based closed interval
        st = int(st) + 1
        key = "%d_%s" % (st, en)
        if key in seen:
            # already present in the interval tree
            continue
        seen.add(key)
        if refid not in rtrees:
            rtrees[refid] = IntervalTree()
        # widen by 1 on each side for the tree's open endpoints
        rtrees[refid].insert_interval(
            Interval(st - 1, int(en) + 1, value=[tname, gtype, strand]))
    handle.close()
    return rtrees
def load_repeats(repeatsF):
    '''
    Read a repeat annotation table into per-chromosome IntervalTrees,
    deduplicating on the raw (start, end) string pair. Each interval's
    value is [name, type].
    '''
    rtrees = {}
    seen = set()
    fh = open(repeatsF)
    for lineno, line in enumerate(fh):
        if lineno == 0:
            continue  # header line
        fields = line.rstrip().split('\t')
        refid, st, en = fields[5:8]
        orient = fields[9]
        tname = fields[10]
        gtype = fields[11]
        key = "%s_%s" % (st, en)
        if key in seen:
            continue  # duplicate span
        seen.add(key)
        tree = rtrees.setdefault(refid, IntervalTree())
        tree.insert_interval(Interval(int(st), int(en), value=[tname, gtype]))
    fh.close()
    return rtrees
def find_overlap_dataframes(query, hits):
    '''
    find overlap between sorted query and hits regions

    :param query: [pd.DataFrame] query peaks; chrom/start/end in the first
                  three columns
    :param hits: [pd.DataFrame] hits regions with chrom, start, end columns
    :return: tuple of (query row positions that overlap something [np.ndarray],
             index values of hits rows that were overlapped [np.ndarray])
    '''
    query_idx, hits_labs = [], []
    tree_hash = {chrom: Intersecter() for chrom in hits.chrom.unique()}
    # FIX(idiom): populate the trees with a plain loop; the original used a
    # list comprehension (`null_res`) purely for its side effects.
    for chrom, start, end in hits.values:
        tree_hash[chrom].add_interval(Interval(start, end))
    for idx, line in enumerate(query.values):
        chrom, start, end = line[0:3]
        overlaps = tree_hash[chrom].find(start, end)
        if not overlaps:
            continue
        # record each overlapped hit as a "chrom:start-end" label
        for ovp in overlaps:
            hits_labs.append(chrom + ':' + str(ovp.start) + '-' + str(ovp.end))
        query_idx.append(idx)
    # map the labels back to hits row indices
    tmp_labels = hits.chrom + ':' + hits.start.astype(str) + '-' + hits.end.astype(str)
    hit_indexs = tmp_labels[tmp_labels.isin(hits_labs)].index.values
    return np.array(query_idx), hit_indexs
def count_reads(transcripts, bam_iter, number_of_counts=1): """ Count the reads in a given transcript :TODO rename :TODO change to cython Arguments --------- transcripts : list list of exons bam_iter : pysam.BamFileIterator gotton after pysam.BamFile.fetch() call """ # Convert this to Cython out_counts = zeros(len(transcripts)) intron_lengths = [] read_vector = [] tree = Intersecter() # Assume exons are position sorted for ti, transcript in enumerate(transcripts): ex_list = [] for j, i in enumerate(transcript): tree.add_interval( Interval(int(i[0]), int(i[1]), value={'anno': ti})) if j != 0: ex_list.append(transcript[j-1][1]\ - transcript[j][0]) intron_lengths.append(ex_list) for read in bam_iter: block_counter = zeros((len(transcripts), )) intron_match = zeros((len(transcripts), )) blocks = read.get_blocks() junction_lengths = [] for i, j in enumerate(blocks): if i != 0: junction_lengths.append(blocks[i - 1][1] - j[0]) else: pass junction_lengths = set(junction_lengths) for i, k in enumerate(blocks): overlap = tree.find(k[0], k[1]) if len(overlap) == 0: break else: for s in overlap: if (k[0] >= s.start) and\ (k[1] <= s.end): block_counter[s.value['anno']] += 1 for ij, il in enumerate(intron_lengths): if set(junction_lengths).issubset(set(il)): intron_match[ij] = 1 else: pass smatch = nrepeat(len(blocks), len(transcripts)) gg = logical_and(block_counter == smatch, intron_match) read_vector.append(gg) out_counts += gg read_matrix = array(read_vector) uniq_r = sum_(read_matrix, axis=1) == 1 #normalization_constant = [for i in transcripts] return (out_counts)
def make_uuid_sets(tables):
    '''UUID interval tree --> UUID intersection graph --> connected components --> UUID sets'''

    def _padded_span(rec):
        # Breakend span widened by 10 bp on each side, normalised so
        # start <= end. (FIX: this logic was duplicated in both passes.)
        start = int(rec['5_Prime_End']) - 10
        end = int(rec['3_Prime_End']) + 10
        if start > end:
            start, end = end, start
        return rec['Chromosome'], start, end

    # Pass 1: index every record's padded span, one Intersecter per chromosome.
    forest = dd(Intersecter)
    for table in tables:
        for rec in getrec(table):
            chrom, start, end = _padded_span(rec)
            forest[chrom].add_interval(Interval(start, end, value=rec['UUID']))

    # Pass 2: connect UUIDs whose padded spans intersect.
    G = nx.Graph()
    for table in tables:
        for rec in getrec(table):
            chrom, start, end = _padded_span(rec)
            for hit in forest[chrom].find(start, end):
                G.add_edge(rec['UUID'], hit.value)
    return list(nx.connected_components(G))
def test_upstream(self):
    tree = self.intervals
    # Plus strand: upstream intervals end before the query's start.
    for hit in tree.upstream_of_interval(Interval(59, 60), num_intervals=200):
        self.assertTrue(hit.end < 59)
    # Minus strand: upstream flips to the right of the query's end.
    for hit in tree.upstream_of_interval(Interval(60, 70, strand=-1), num_intervals=200):
        self.assertTrue(hit.start > 70)
    # A zero-length minus-strand query behaves the same way.
    for hit in tree.upstream_of_interval(Interval(58, 58, strand=-1), num_intervals=200):
        self.assertTrue(hit.start > 59)
def after(self, contig, start, end, num_intervals=1, max_dist=2500):
    '''Get up to *num_intervals* closest intervals after *end* on *contig*,
    as (start, end, value) tuples, searching at most *max_dist* away.

    :raises KeyError: if *contig* is not in the index
    '''
    if contig not in self.mIndex:
        raise KeyError("contig %s not in index" % contig)
    # FIX: the delegate call hard-coded num_intervals=1, so the caller's
    # num_intervals argument was silently ignored and at most one interval
    # was ever returned.
    return [(x.start, x.end, x.value)
            for x in self.mIndex[contig].after_interval(
                Interval(start, end),
                num_intervals=num_intervals,
                max_dist=max_dist)]
def regionTree(tmp, resFrag):
    """
    Insert one region into the per-chromosome interval forest.

    tmp: sequence of at least 3 fields (chrom, start, end, [extras...]);
         any trailing fields become the stored interval's value
    resFrag: dict mapping chromosome -> Intersecter, updated in place
    """
    chrom = tmp[0]
    if chrom not in resFrag:
        resFrag[chrom] = Intersecter()
    resFrag[chrom].add_interval(Interval(int(tmp[1]), int(tmp[2]), tmp[3:]))
def __init__(self, coordinates):
    # One Intersecter per chromosome; each coordinate object is stored as the
    # interval's payload so lookups can recover the original record.
    self.interval_tree = {}
    for coord in coordinates:
        tree = self.interval_tree.get(coord.chr_id)
        if tree is None:
            tree = Intersecter()
            self.interval_tree[coord.chr_id] = tree
        tree.add_interval(Interval(coord.bpstart, coord.bpend, coord))
def parse_gene_coordinate(infile):
    """
    Yield (chromosome, Interval) pairs from a CSV of gene coordinates.

    Each row is expected to look like: geneid, <ignored>, chrom, start, end.
    The Interval's value dict carries the gene id under 'geneid'.
    """
    for line in open(infile):
        cols = line.strip().split(',')
        geneid = cols[0]
        # FIX(idiom): renamed from `chr`, which shadowed the builtin.
        chrom, start, end = cols[2:]
        start = int(start)
        end = int(end)
        yield chrom, Interval(start, end, value={'geneid': geneid})
def init_intersecter(hits):
    '''Return an Intersecter populated with an Interval per hit's first two fields.'''
    tree = Intersecter()
    for hit in hits:
        tree.add_interval(Interval(hit[0], hit[1]))
    return tree
def setUp(self):
    # Exercise all four insertion spellings the tree supports: insert/add
    # with raw coordinates, and insert_interval/add_interval with Interval
    # objects carrying start/end attributes.
    tree = IntervalTree()
    count = 0
    for i in range(1, 1000, 80):
        tree.insert(i, i + 10, dict(value=i * i))
        # add is a synonym for insert.
        tree.add(i + 20, i + 30, dict(astr=str(i * i)))
        tree.insert_interval(
            Interval(i + 40, i + 50, value=dict(astr=str(i * i))))
        tree.add_interval(
            Interval(i + 60, i + 70, value=dict(astr=str(i * i))))
        count += 4
    self.intervals = self.iv = tree
    self.nintervals = count
def test_right(self):
    iv = self.intervals
    # Fixture sanity check (mirrors the assertion in test_left).
    self.assertEqual(str(iv.left(60, n=2)),
                     str([Interval(50, 59), Interval(40, 49)]))

    def get_right_start(b10):
        # Closest interval starting strictly right of b10 + 1.
        r = iv.right(b10 + 1, n=1)
        assert len(r) == 1
        return r[0].start

    # Each decade boundary's right neighbour starts one decade later.
    for i in range(10, 100, 10):
        self.assertEqual(get_right_start(i), i + 10)
    # And within max_dist=10 the nearest right interval starts exactly at i.
    for i in range(0, 100, 10):
        r = iv.right(i - 1, max_dist=10, n=1)
        # Python 2 debug print left in place.
        print r
        self.assertEqual(r[0].start, i)
def test_n(self):
    tree = self.intervals
    for pos in range(0, 90, 10):
        # Both the coordinate flavour and the Interval flavour must return
        # the next two decade-aligned intervals, nearest first.
        by_coord = tree.after(pos, max_dist=20, num_intervals=2)
        by_interval = tree.after_interval(Interval(pos, pos),
                                          max_dist=20, num_intervals=2)
        for found in (by_coord, by_interval):
            self.assertEqual(found[0].start, pos + 10)
            self.assertEqual(found[1].start, pos + 20)