def intersect(a, b):
    """Cross-reference the features of two single-record GFF3 inputs.

    Both inputs are parsed with ``GFF.parse``; each must contain at most one
    record, otherwise an ``Exception`` is raised.  An interval tree is built
    over each record's features and queried with the integer start/end of
    every feature of the other record.

    Returns the two parsed records ``(rec_a, rec_b)`` with their ``features``
    attribute replaced by a ``set``:

    * ``rec_a.features`` -- the features of ``rec_b`` returned by ``tree_b``
      when queried with the features of ``rec_a``;
    * ``rec_b.features`` -- the features of ``rec_a`` returned by ``tree_a``
      when queried with the features of ``rec_b``.

    NOTE(review): each record ends up holding features taken from the *other*
    record, and as an unordered set -- confirm this is what callers expect.
    """
    def span(feature):
        # Query tuple in the form passed to ``find_range``.
        return (int(feature.location.start), int(feature.location.end))

    records_a = list(GFF.parse(a))
    records_b = list(GFF.parse(b))
    if len(records_a) > 1 or len(records_b) > 1:
        raise Exception("Cannot handle multiple GFF3 records in a file, yet")

    rec_a, rec_b = records_a[0], records_b[0]

    tree_a = IntervalTree(list(treeFeatures(rec_a.features)), 1, len(rec_a))
    tree_b = IntervalTree(list(treeFeatures(rec_b.features)), 1, len(rec_b))

    # The trees return feature ids; these maps turn ids back into features.
    by_id_a = {feat.id: feat for feat in rec_a.features}
    by_id_b = {feat.id: feat for feat in rec_b.features}

    b_features_hit = [
        by_id_b[hit]
        for feature in rec_a.features
        for hit in tree_b.find_range(span(feature))
    ]
    a_features_hit = [
        by_id_a[hit]
        for feature in rec_b.features
        for hit in tree_a.find_range(span(feature))
    ]

    rec_a.features = set(b_features_hit)
    rec_b.features = set(a_features_hit)
    return rec_a, rec_b
def test_simple_float():
    """A float range inside the only stored interval returns its third element."""
    intervals = [[0.0, 1.0, 1]]
    tree = IntervalTree(intervals, 0, 1)
    hits = tree.find_range([0.5, 0.6])
    assert hits == [1]
def test_empty():
    """Querying [1, 1] on a tree holding only [0, 0, None] yields no hits."""
    intervals = [[0, 0, None]]
    tree = IntervalTree(intervals, 0, 0)
    hits = tree.find_range([1, 1])
    assert hits == []
def test_simple():
    """Querying [1, 1] on a tree holding [1, 1, 3] returns the third element, 3."""
    intervals = [[1, 1, 3]]
    tree = IntervalTree(intervals, 0, 1)
    hits = tree.find_range([1, 1])
    assert hits == [3]
class FlotDataset(torch.utils.data.Dataset):
    '''Expose several DataFolder datasets through one flat index.

    One ``DataFolder`` is created per entry of ``pathList``.  Each one
    occupies a contiguous range of global indices, and an ``IntervalTree``
    (``binSelector``) maps a global index back to the position of the
    DataFolder that owns it.
    '''

    def __init__(self, conf, pathList, transform):
        '''Build one DataFolder per path and index their global ranges.

        ``conf`` and ``transform`` are passed unchanged to every
        ``DataFolder``; ``pathList`` is the sequence of folders to load.
        '''
        self.conf = conf
        self.dataList = []
        # offsets[i] is the global index of the first item of dataList[i].
        self.offsets = []
        self.len = 0
        features = []
        for idx, dataFolder in enumerate(pathList):
            # This can be done better by using pytorch to concat datasets.
            self.dataList.append(DataFolder(conf, dataFolder, transform))
            first = self.len
            self.offsets.append(first)
            self.len += len(self.dataList[-1])
            # Interval entry: [first global index, last global index, bin].
            features.append([first, self.len - 1, idx])
        self.binSelector = IntervalTree(features, 0, self.len + 1)

    def __len__(self):
        '''Return the total number of items over all DataFolders.'''
        return self.len

    def __getitem__(self, idx):
        '''Return the item at global index ``idx``.

        When the selector reports no owning bin, the error is reported through
        ``printError`` and ``None`` is returned.
        '''
        hits = self.binSelector.find_range([idx, idx])
        # An empty hit list used to raise IndexError on ``[0]`` before the
        # error branch below could run; treat it like a missing bin instead.
        binIdx = hits[0] if hits else None
        if binIdx is None:
            printError('selected impossible index %s', idx)
            return None
        return self.dataList[binIdx][idx - self.offsets[binIdx]]
def calc_overlap_between_segments(ordered_segments1, ordered_segments2):
    '''
    Calculates the total overlap size between a pair of ordered and disjoint
    groups of segments.

    Each group of segments is given by: [(start1, end1), (start2, end2), ...].
    Segments must be tuples (they are concatenated with other tuples below),
    and both ends are counted as part of the segment, so two identical
    segments (s, e) overlap by e - s + 1.

    Returns 0 when either group is empty.  Raises AssertionError if the
    interval tree reports a segment that does not actually overlap the query.
    '''
    if len(ordered_segments1) == 0 or len(ordered_segments2) == 0:
        return 0

    # Imported only once it is known to be needed, so that empty input does
    # not require the third-party module at all.
    from interval_tree import IntervalTree

    # Index the smaller group and scan the larger one.
    if len(ordered_segments1) > len(ordered_segments2):
        ordered_segments1, ordered_segments2 = ordered_segments2, ordered_segments1

    # The groups are ordered, so the overall bounds sit at their extremities.
    min_value = min(ordered_segments1[0][0], ordered_segments2[0][0])
    max_value = max(ordered_segments1[-1][1], ordered_segments2[-1][1])

    # Each tree entry is (start, end, segment): the segment is its own payload.
    interval_tree1 = IntervalTree(
        [segment + (segment, ) for segment in ordered_segments1],
        min_value, max_value)

    total_overlap = 0
    for segment in ordered_segments2:
        for overlapping_segment in interval_tree1.find_range(segment):
            overlapping_start = max(segment[0], overlapping_segment[0])
            overlapping_end = min(segment[1], overlapping_segment[1])
            # Raised explicitly rather than via ``assert`` so the check is not
            # stripped under ``python -O``, where a bogus hit would silently
            # add a non-positive amount to the total.
            if overlapping_start > overlapping_end:
                raise AssertionError(
                    'Reported overlap between %d..%d to %d..%d.'
                    % (segment + overlapping_segment))
            total_overlap += (overlapping_end - overlapping_start + 1)
    return total_overlap
def test_float_tree():
    """Point query at 0.001 against three back-to-back float intervals.

    The expected result lists the third element of all three stored intervals.
    """
    intervals = [
        [0.0, 0.01, 1],
        [0.01, 0.02, 2],
        [0.02, 1, 3],
    ]
    tree = IntervalTree(intervals, 0, 2)
    hits = tree.find_range([0.001, 0.001])
    assert hits == [1, 2, 3]
def neighbours_in_record(rec_a, rec_b, within=1000, mode='unordered', **kwargs):
    """Find features of ``rec_a`` lying near same-strand features of ``rec_b``.

    Two interval trees are built from ``rec_a.features``, one from
    ``treeFeatures(..., strand=1)`` and one from ``treeFeatures(..., strand=-1)``.
    Every feature of ``rec_b`` is then turned into a search window and looked
    up in the tree matching its strand (a strand that is not strictly
    positive, including ``None``, uses the ``strand=-1`` tree):

    * strand > 0: the window start is moved back by ``within``; the end is
      moved forward by ``within`` unless ``mode == 'ordered'``.
    * otherwise: the window end is moved forward by ``within``; the start is
      moved back by ``within`` unless ``mode == 'ordered'``.

    A line with the window, strand, feature id and hits is printed to stdout
    for every ``rec_b`` feature that has at least one hit.

    ``**kwargs`` is accepted and ignored.

    Returns ``(rec_a, rec_b)`` with ``features`` replaced in place:
    ``rec_a.features`` becomes the list of distinct ``rec_a`` features that
    were hit (first-hit order), and ``rec_b.features`` the list of ``rec_b``
    features that had at least one hit.
    """
    feat_f = list(treeFeatures(rec_a.features, strand=1))
    feat_r = list(treeFeatures(rec_a.features, strand=-1))

    # A tree is only built when there is something to put in it.
    tree_f = IntervalTree(feat_f, 1, len(rec_a)) if len(feat_f) > 0 else None
    tree_r = IntervalTree(feat_r, 1, len(rec_a)) if len(feat_r) > 0 else None

    # The trees return feature ids; this map turns them back into features.
    rec_a_map = {f.id: f for f in rec_a.features}

    rec_a_hits_in_b = []
    rec_b_hits_in_a = []
    ordered = mode == 'ordered'

    for feature in rec_b.features:
        start = feature.location.start
        end = feature.location.end
        strand = feature.location.strand

        # ``None > 0`` is False on Python 2 but a TypeError on Python 3; the
        # explicit guard keeps the Python 2 outcome on both.
        if strand is not None and strand > 0:
            tree = tree_f
            start -= within
            if not ordered:
                end += within
        else:
            tree = tree_r
            end += within
            if not ordered:
                start -= within

        if tree is None:
            continue

        hits = tree.find_range((start, end))
        if len(hits) == 0:
            continue

        # Single-argument print: identical output under the Python 2 print
        # statement and the Python 3 print function.
        print(' '.join(str(value) for value in (start, end, strand, feature.id, hits)))

        rec_b_hits_in_a.append(feature)
        for hit in hits:
            feat_hit = rec_a_map[hit]
            if feat_hit not in rec_a_hits_in_b:
                rec_a_hits_in_b.append(feat_hit)

    rec_a.features = rec_a_hits_in_b
    rec_b.features = rec_b_hits_in_a
    return rec_a, rec_b