def intersect(a, b):
    """Reduce two single-record GFF3 inputs to the features that hit each other.

    Both inputs are parsed with ``GFF.parse``. An interval tree is built per
    record from ``treeFeatures(record.features)``; every top-level feature of
    one record is then used as a ``(start, end)`` query against the *other*
    record's tree, and each hit is looked up in the other record's
    ``{feature.id: feature}`` map.

    Args:
        a: First GFF3 input, passed straight to ``GFF.parse``.
        b: Second GFF3 input, passed straight to ``GFF.parse``.

    Returns:
        ``(rec_a, rec_b)``. ``rec_a.features`` is replaced by the set of
        ``rec_b`` features found when querying with ``rec_a``'s features, and
        ``rec_b.features`` by the set of ``rec_a`` features found when
        querying with ``rec_b``'s features.

    Raises:
        Exception: If either input holds more than one record.
        IndexError: If either input holds no record at all.
    """
    records_a = list(GFF.parse(a))
    records_b = list(GFF.parse(b))
    if len(records_a) > 1 or len(records_b) > 1:
        raise Exception("Cannot handle multiple GFF3 records in a file, yet")
    rec_a, rec_b = records_a[0], records_b[0]

    tree_a = IntervalTree(list(treeFeatures(rec_a.features)), 1, len(rec_a))
    tree_b = IntervalTree(list(treeFeatures(rec_b.features)), 1, len(rec_b))

    # Only top-level features are indexed by id here.
    # NOTE(review): presumably find_range() returns feature ids — confirm
    # against treeFeatures(), which is defined outside this block.
    by_id_a = {feat.id: feat for feat in rec_a.features}
    by_id_b = {feat.id: feat for feat in rec_b.features}

    def _lookup_hits(queries, other_tree, other_by_id):
        # Query the other record's tree with each feature span and resolve
        # every hit through the other record's id map.
        found = []
        for query in queries:
            span = (int(query.location.start), int(query.location.end))
            found.extend(other_by_id[hit] for hit in other_tree.find_range(span))
        return found

    hits_for_a = _lookup_hits(rec_a.features, tree_b, by_id_b)
    hits_for_b = _lookup_hits(rec_b.features, tree_a, by_id_a)

    # set() removes duplicates produced by several queries hitting one feature.
    rec_a.features = set(hits_for_a)
    rec_b.features = set(hits_for_b)
    return rec_a, rec_b
def test_simple_float():
    """A float range inside the only stored interval returns that interval's value."""
    # One interval [0.0, 1.0] carrying the value 1, in a tree bounded by 0..1.
    tree = IntervalTree([[0.0, 1.0, 1]], 0, 1)
    result = tree.find_range([0.5, 0.6])
    assert result == [1]
def test_empty():
    """Querying outside a degenerate 0..0 tree yields no hits."""
    # A single zero-width interval at 0 carrying None, in a tree bounded by 0..0.
    tree = IntervalTree([[0, 0, None]], 0, 0)
    result = tree.find_range([1, 1])
    assert result == []
def test_simple():
    """A point query on a stored point interval returns its value."""
    # One interval [1, 1] carrying the value 3, in a tree bounded by 0..1.
    tree = IntervalTree([[1, 1, 3]], 0, 1)
    result = tree.find_range([1, 1])
    assert result == [3]
def calc_overlap_between_segments(ordered_segments1, ordered_segments2):
    '''
    Return the total overlap size between two groups of segments.

    Each group is expected to be ordered and disjoint, given as
    [(start1, end1), (start2, end2), ...]. Overlap lengths are counted with
    both endpoints included (end - start + 1). Returns 0 when either group
    is empty.
    '''
    from interval_tree import IntervalTree

    if len(ordered_segments1) == 0 or len(ordered_segments2) == 0:
        return 0

    # The shorter group goes into the tree; the longer one is used as queries.
    if len(ordered_segments1) > len(ordered_segments2):
        indexed, probes = ordered_segments2, ordered_segments1
    else:
        indexed, probes = ordered_segments1, ordered_segments2

    # Because the groups are ordered, the first start / last end bound them.
    lower_bound = min(indexed[0][0], probes[0][0])
    upper_bound = max(indexed[-1][1], probes[-1][1])

    # Each tree entry is (start, end, payload), the payload being the segment
    # tuple itself so that find_range() hands the segment back.
    entries = [seg + (seg, ) for seg in indexed]
    tree = IntervalTree(entries, lower_bound, upper_bound)

    total = 0
    for probe in probes:
        for found in tree.find_range(probe):
            lo = max(probe[0], found[0])
            hi = min(probe[1], found[1])
            assert lo <= hi, 'Reported overlap between %d..%d to %d..%d.' % (probe + found)
            total += hi - lo + 1
    return total
def test_case_1():
    """Insert one line into a 1..10 tree and print the results of two searches."""
    tree = IntervalTree(1, 10)
    tree.insert_line(2, 6)
    # Each search is printed before the next one runs.
    for low, high in ((2, 7), (3, 5)):
        print(tree.search_line(low, high))
    print("over")
def main():
    """Build an interval tree from the intervals file, then process stdin line by line."""
    # Load the raw interval description and convert it to an intervals list.
    raw_intervals = readIntervalsFile()
    tree = IntervalTree(str_to_interval(raw_intervals))

    # Every stdin line is handed, together with the tree, to processLine().
    for line in sys.stdin:
        processLine(tree, line)
def __init__(self, peak_calling_file, inference_file, output):
    """
    :param peak_calling_file: the peak-calling file
    :param inference_file: the CNN prediction output
    :param output: stored unchanged as ``self.output``
    """
    # Inputs as supplied by the caller.
    self.peak_file, self.infer_file = peak_calling_file, inference_file
    # Placeholders; nothing is loaded in the constructor.
    self.peak = self.infer = None
    # Starts out as an empty tree.
    self.interval_tree = IntervalTree()
    self.output = output
def test_float_tree():
    """Point query at 0.001 on three adjacent float intervals returns all three values.

    NOTE(review): only the first interval actually contains 0.001; the
    expectation of [1, 2, 3] presumably pins how the tree buckets small
    float ranges — confirm this is intended.
    """
    intervals = [
        [0.0, 0.01, 1],
        [0.01, 0.02, 2],
        [0.02, 1, 3],
    ]
    tree = IntervalTree(intervals, 0, 2)
    result = tree.find_range([0.001, 0.001])
    assert result == [1, 2, 3]
def _build_gene_interval_trees(genes): if len(genes) == 0: return None segments = [] max_coordinate = 1 for gene in genes: start, end = _get_gene_locus(gene) segments += [(start, end, gene)] max_coordinate = max(max_coordinate, end) return IntervalTree(segments, 1, max_coordinate)
def db_hit(database, query_mass):
    '''
    Look up the database entries whose mass range covers query_mass.

    Every DatabaseEntry in database carries its own begin and end mass; the
    ppm tolerance behind that range is fixed when the entry is created.
    See test_discretisation.py and identification.py for example usage.

    Args:
     - database: a list of DatabaseEntry objects
     - query_mass: the mass to query

    Returns:
     a list of DatabaseEntry objects {e} where
     e.get_begin() < query_mass < e.get_end()
    '''
    # A fresh tree is built from the whole database on every call.
    return IntervalTree(database).search(query_mass)
def build_gene_trees(infile, gene_db):
    """Build per-chromosome gene interval trees from a gzipped gene file.

    The input is read as tab-separated text. Lines starting with ``#`` and
    lines with fewer than six columns are ignored. Column 1 is the
    chromosome, columns 2 and 3 the integer start and stop, and column 6 the
    HGNC symbol; rows with an empty symbol are skipped. When a symbol occurs
    more than once on a chromosome, the first occurrence is kept.

    Args:
        infile: Path to the gzip-compressed gene file.
        gene_db: Path the pickled ``{chromosome: IntervalTree}`` dict is
            written to.

    Raises:
        ValueError: If a start or stop column is not an integer.
    """
    gene_trees = {}
    chromosome_stops = {}

    with gzip.open(infile) as f:
        for line in f:
            # gzip.open() defaults to binary mode, so Python 3 yields bytes
            # and the str operations below would raise TypeError. Python 2
            # lines are already str and are left untouched.
            # NOTE(review): UTF-8 is assumed for the file contents.
            if not isinstance(line, str):
                line = line.decode('utf-8')

            if line.startswith('#'):
                continue

            fields = line.rstrip().split('\t')
            if len(fields) < 6:
                continue

            chrom = fields[0]
            start = int(fields[1])
            stop = int(fields[2])
            hgnc_symbol = fields[5]
            if not hgnc_symbol:
                continue

            # First coordinates seen for a symbol on a chromosome win.
            gene_trees.setdefault(chrom, {}).setdefault(hgnc_symbol, [start, stop])

            # Track an upper bound one past the largest stop seen so far.
            if stop > chromosome_stops.get(chrom, 0):
                chromosome_stops[chrom] = stop + 1

    # Turn each chromosome's {symbol: [start, stop]} map into an interval tree
    # whose entries are [start, stop, symbol].
    interval_trees = {}
    for chrom, genes in gene_trees.items():
        intervals = [
            [coords[0], coords[1], gene_symbol]
            for gene_symbol, coords in genes.items()
        ]
        interval_trees[chrom] = IntervalTree(intervals, 1, chromosome_stops[chrom])

    with open(gene_db, 'wb') as f:
        logger.info("Dumping gene database to {0}.".format(gene_db))
        pickle.dump(interval_trees, f)
        logger.debug("Dumping successful.")
def __init__(self, conf, pathList, transform):
    """Wrap several data folders as one concatenated, index-addressable dataset.

    A ``DataFolder(conf, path, transform)`` is created for every entry of
    ``pathList``. ``self.offsets`` records the running start index of each
    folder, ``self.len`` the combined length, and ``self.binSelector`` is an
    interval tree mapping each folder's ``[first_index, last_index]`` range
    to its position in ``pathList``.
    """
    self.conf = conf
    self.dataList = []
    self.offsets = []
    self.len = 0

    # This can be done better by using pytorch to concat datasets.
    index_ranges = []
    for position, folder in enumerate(pathList):
        dataset = DataFolder(conf, folder, transform)
        self.dataList.append(dataset)

        first_index = self.len
        self.offsets.append(first_index)
        self.len = first_index + len(dataset)

        # [first global index, last global index, folder position]
        index_ranges.append([first_index, self.len - 1, position])

    self.binSelector = IntervalTree(index_ranges, 0, self.len + 1)
def neighbours_in_record(rec_a, rec_b, within=1000, mode='unordered', **kwargs): feat_f = list(treeFeatures(rec_a.features, strand=1)) feat_r = list(treeFeatures(rec_a.features, strand=-1)) if len(feat_f) > 0: tree_f = IntervalTree(feat_f, 1, len(rec_a)) else: tree_f = None if len(feat_r) > 0: tree_r = IntervalTree(feat_r, 1, len(rec_a)) else: tree_r = None rec_a_map = {f.id: f for f in rec_a.features} # rec_b_map = {f.id: f for f in rec_b.features} rec_a_hits_in_b = [] rec_b_hits_in_a = [] for feature in rec_b.features: start = feature.location.start end = feature.location.end if feature.location.strand > 0: start -= within if mode != 'ordered': end += within if tree_f is None: continue hits = tree_f.find_range((start, end)) if len(hits) == 0: continue print start, end, feature.location.strand, feature.id, hits rec_b_hits_in_a.append(feature) for hit in hits: feat_hit = rec_a_map[hit] if feat_hit not in rec_a_hits_in_b: rec_a_hits_in_b.append(feat_hit) else: end += within if mode != 'ordered': start -= within if tree_r is None: continue hits = tree_r.find_range((start, end)) if len(hits) == 0: continue print start, end, feature.location.strand, feature.id, hits rec_b_hits_in_a.append(feature) for hit in hits: feat_hit = rec_a_map[hit] if feat_hit not in rec_a_hits_in_b: rec_a_hits_in_b.append(feat_hit) rec_a.features = rec_a_hits_in_b rec_b.features = rec_b_hits_in_a return rec_a, rec_b
#lens.append(len(res)) else: for i in range(tries): res = tree.find(start, end) res.sort(key=operator.attrgetter('start')) lens.append("%i:%s" % (len(res), [x.start for x in res[-1:]])) #lens.append(len(res)) t1 = time.time() return res, t1 - t0, lens start_max = STOP * 3 while True: intervals = rands(N, start_max=start_max) t0 = time.time() tree = IntervalTree(intervals) t1 = time.time() print "time to build IntervalTree with %i intervals: %.3f" % (N, t1 - t0) t0 = time.time() ints = Intersecter(intervals) t1 = time.time() print "time to build Intersector with %i intervals: %.3f" % (N, t1 - t0) found, t, tree_lens = search(tree, START, STOP, TRIES) print "time to search tree %i times: %.3f. found %i intervals" % ( TRIES, t, len(found)) found, t, brute_lens = search(intervals, START, STOP, TRIES) print "time to search brute %i times: %.3f. found %i intervals" % ( TRIES, t, len(found))
def sort_contours_by_level(contours):
    """Group contours by nesting depth.

    Contours are bucketed by their ``(y_min, y_max)`` interval and packed
    into an interval tree. For the mid-height of every contour, the contours
    returned by the tree query are compared pairwise: when one contour's
    start point lies inside the other (per ``point_in_contour``), the outer
    contour's node becomes the inner one's parent. The resulting node trees
    are then read out level by level.

    Returns a pair:
      * a list of contour lists, one per tree level in the order produced by
        ``LevelOrderGroupIter`` (index 0 holds the roots); levels of separate
        trees are merged by index,
      * the tuple ``(layout_x_max, layout_y_max)`` of the largest ``x_max``
        and ``y_max`` over all contours.
    """
    # TODO: handle pre-closed contours. (Circles, ellipses, etc.)
    contours_by_name = {}
    # Values are lists because several contours can share one height interval.
    contours_by_span = {}
    contour_tree = IntervalTree()
    mid_heights = set()
    nodes_by_name = {}  # contour name -> tree node

    # Running extremes of the layout. x_min is tracked but is not part of
    # the return value.
    layout_x_min = layout_y_min = math.inf
    layout_x_max = layout_y_max = -math.inf

    for contour in contours:
        contours_by_name[contour.name()] = contour

        # This grouping is the input used to build the interval tree.
        span = (contour.y_min, contour.y_max)
        contours_by_span.setdefault(span, []).append(contour)

        layout_y_min = min(layout_y_min, contour.y_min)
        layout_y_max = max(layout_y_max, contour.y_max)
        layout_x_min = min(layout_x_min, contour.x_min)
        layout_x_max = max(layout_x_max, contour.x_max)

        # The contour's vertical midpoint becomes a query height.
        mid_heights.add((contour.y_max - contour.y_min) / 2 + contour.y_min)

    print("Packing Contours into Interval Tree for sorting speedup.")
    contour_tree.build(layout_y_min, layout_y_max, contours_by_span)

    def _node_for(contour):
        # Reuse the node already registered under this name, else a new one.
        return nodes_by_name.get(contour.name(), Node(contour.name()))

    print("Constructing in-out contour relationships.")
    for height in mid_heights:
        # Query results are (key, contour_list) items: drop the keys and
        # flatten the lists into the contours present at this height.
        at_height = [
            member
            for entry in contour_tree.query(height)
            for member in entry[1]
        ]

        # Compare every unordered pair once.
        for a_index, contour_a in enumerate(at_height):
            node_a = _node_for(contour_a)
            for contour_b in at_height[a_index + 1:]:
                point_a = (contour_a.start_x, contour_a.start_y)
                point_b = (contour_b.start_x, contour_b.start_y)

                if point_in_contour(point_a, contour_b):
                    # a starts inside b: b is a's parent.
                    node_b = _node_for(contour_b)
                    node_a.parent = node_b
                elif point_in_contour(point_b, contour_a):
                    # b starts inside a: a is b's parent.
                    node_b = _node_for(contour_b)
                    node_b.parent = node_a
                else:
                    continue
                nodes_by_name[contour_b.name()] = node_b
            nodes_by_name[contour_a.name()] = node_a

    print("Organizing contours by depth")
    # Keyed by level (int); values are the contours living at that level.
    depth_lists = OrderedDict()

    # Contours may form several separate trees. Repeatedly pick any remaining
    # node, climb to its root, and drain that whole tree from the dict.
    while nodes_by_name:
        root = nodes_by_name[next(iter(nodes_by_name))]
        while root.parent is not None:
            root = root.parent

        # https://anytree.readthedocs.io/en/latest/api/anytree.iterators.html#anytree.iterators.levelordergroupiter.LevelOrderGroupIter
        levels = [
            [member.name for member in group]
            for group in LevelOrderGroupIter(root)
        ]
        for depth, names in enumerate(levels):
            level_contours = depth_lists.setdefault(depth, [])
            for contour_name in names:
                level_contours.append(contours_by_name[contour_name])
                del nodes_by_name[contour_name]

    # Per-level contour lists plus the layout's max corner.
    return list(depth_lists.values()), (layout_x_max, layout_y_max)