Esempio n. 1
0
def intersect(a, b):
    """Cross-reference the features of two single-record GFF3 inputs.

    Both inputs are parsed with ``GFF.parse``; each must yield at most one
    record, otherwise an ``Exception`` is raised. An interval tree is built
    per record from ``treeFeatures(record.features)`` and every top-level
    feature of one record is used to query the tree of the other record.
    Hits are resolved through a ``feature.id -> feature`` map of the
    queried record (presumably ``treeFeatures`` yields the feature id as the
    interval payload -- confirm against its definition).

    NOTE(review): after this call ``rec_a.features`` holds the features of
    *b* that were hit by features of *a*, and ``rec_b.features`` holds the
    features of *a* that were hit by features of *b*. Confirm that this
    cross-assignment is what callers expect.

    :param a: GFF3 source accepted by ``GFF.parse``.
    :param b: GFF3 source accepted by ``GFF.parse``.
    :returns: ``(rec_a, rec_b)`` with ``features`` replaced by sets.
    :raises Exception: if either input contains more than one record.
    """
    records_a = list(GFF.parse(a))
    records_b = list(GFF.parse(b))
    if max(len(records_a), len(records_b)) > 1:
        raise Exception("Cannot handle multiple GFF3 records in a file, yet")

    rec_a, rec_b = records_a[0], records_b[0]

    tree_a = IntervalTree(list(treeFeatures(rec_a.features)), 1, len(rec_a))
    tree_b = IntervalTree(list(treeFeatures(rec_b.features)), 1, len(rec_b))

    # Lookup tables used to turn tree hits back into feature objects.
    by_id_a = dict((feat.id, feat) for feat in rec_a.features)
    by_id_b = dict((feat.id, feat) for feat in rec_b.features)

    def _span(feat):
        # Plain-int (start, end) query tuple for a feature.
        return (int(feat.location.start), int(feat.location.end))

    # Features of b that are hit when querying with the features of a.
    found_in_b = []
    for feat in rec_a.features:
        found_in_b.extend(by_id_b[hit] for hit in tree_b.find_range(_span(feat)))

    # Features of a that are hit when querying with the features of b.
    found_in_a = []
    for feat in rec_b.features:
        found_in_a.extend(by_id_a[hit] for hit in tree_a.find_range(_span(feat)))

    # Sets drop features that were hit more than once.
    rec_a.features = set(found_in_b)
    rec_b.features = set(found_in_a)
    return rec_a, rec_b
Esempio n. 2
0
def test_simple_float():
    """A float range inside the single stored interval returns its payload."""
    unit_interval = [0.0, 1.0, 1]
    tree = IntervalTree([unit_interval], 0, 1)

    hits = tree.find_range([0.5, 0.6])

    assert hits == [1]
Esempio n. 3
0
def test_empty():
    """Querying outside a zero-width tree yields an empty result list."""
    degenerate_interval = [0, 0, None]
    tree = IntervalTree([degenerate_interval], 0, 0)

    hits = tree.find_range([1, 1])

    assert hits == []
Esempio n. 4
0
def test_simple():
    """A point query matching the single stored interval returns its payload."""
    point_interval = [1, 1, 3]
    tree = IntervalTree([point_interval], 0, 1)

    hits = tree.find_range([1, 1])

    assert hits == [3]
Esempio n. 5
0
class FlotDataset(torch.utils.data.Dataset):
    '''Expose a list of data folders as one dataset with a single index space.

    Each entry of ``pathList`` is wrapped in a ``DataFolder``. The folders
    are laid out back to back: folder ``i`` covers the global indices
    ``offsets[i] .. offsets[i] + len(folder) - 1``. An interval tree
    (``binSelector``) maps a global index to the folder that owns it.
    '''
    def __init__(self, conf, pathList, transform):
        '''Build one ``DataFolder`` per path and index their ranges.

        conf      -- configuration object, stored and passed to ``DataFolder``.
        pathList  -- iterable of folder paths, one ``DataFolder`` each.
        transform -- passed through unchanged to every ``DataFolder``.
        '''
        self.conf = conf
        self.dataList = []
        self.offsets = []
        self.len = 0
        features = []
        for idx, dataFolder in enumerate(pathList):
            # This can be done better by using pytorch to concat datasets.
            folder = DataFolder(conf, dataFolder, transform)
            self.dataList.append(folder)
            first = self.len
            self.offsets.append(first)
            self.len += len(folder)
            # [first global index, last global index, folder position]
            features.append([first, self.len - 1, idx])
        self.binSelector = IntervalTree(features, 0, self.len + 1)

    def __len__(self):
        '''Return the total number of items over all folders.'''
        return self.len

    def __getitem__(self, idx):
        '''Return item ``idx`` from whichever folder owns that global index.

        If no folder covers ``idx`` the problem is reported through
        ``printError`` and ``None`` is returned.
        '''
        hits = self.binSelector.find_range([idx, idx])
        # Guard the empty result *before* indexing: taking ``[0]`` of an
        # empty hit list would raise IndexError and skip the error report.
        binIdx = hits[0] if hits else None
        if binIdx is None:
            printError('selected impossible index %s', idx)
            return None
        return self.dataList[binIdx][idx - self.offsets[binIdx]]
Esempio n. 6
0
def calc_overlap_between_segments(ordered_segments1, ordered_segments2):
    '''
    Return the summed overlap size between two groups of segments.

    Each group is an ordered, disjoint sequence of (start, end) tuples:
    [(start1, end1), (start2, end2), ...]. The overlap of two segments is
    counted inclusively, i.e. min(end) - max(start) + 1. If either group is
    empty the result is 0.
    '''

    from interval_tree import IntervalTree

    if len(ordered_segments1) == 0 or len(ordered_segments2) == 0:
        return 0

    # Index the smaller group in the tree and scan the larger one.
    if len(ordered_segments1) > len(ordered_segments2):
        indexed, scanned = ordered_segments2, ordered_segments1
    else:
        indexed, scanned = ordered_segments1, ordered_segments2

    lower_bound = min(indexed[0][0], scanned[0][0])
    upper_bound = max(indexed[-1][1], scanned[-1][1])

    # Each tree entry is (start, end, payload) with the segment itself as
    # payload, so queries hand back the original (start, end) tuples.
    tree_entries = [segment + (segment, ) for segment in indexed]
    tree = IntervalTree(tree_entries, lower_bound, upper_bound)

    total = 0
    for segment in scanned:
        for candidate in tree.find_range(segment):
            left = max(segment[0], candidate[0])
            right = min(segment[1], candidate[1])
            assert left <= right, 'Reported overlap between %d..%d to %d..%d.' % (segment + candidate)
            total += right - left + 1

    return total
Esempio n. 7
0
def test_float_tree():
    """A point query on a tree of three adjacent float intervals.

    NOTE(review): only the first interval numerically contains 0.001, yet
    the expected result lists all three payloads; presumably this reflects
    how the tree buckets float coordinates -- confirm against IntervalTree.
    """
    intervals = [
        [0.0, 0.01, 1],
        [0.01, 0.02, 2],
        [0.02, 1, 3],
    ]
    tree = IntervalTree(intervals, 0, 2)

    hits = tree.find_range([0.001, 0.001])

    assert hits == [1, 2, 3]
Esempio n. 8
0
def neighbours_in_record(rec_a,
                         rec_b,
                         within=1000,
                         mode='unordered',
                         **kwargs):
    """Keep only the features of two records that lie near each other.

    Two interval trees are built from ``rec_a``: one from
    ``treeFeatures(rec_a.features, strand=1)`` and one from
    ``treeFeatures(rec_a.features, strand=-1)``. Every feature of ``rec_b``
    is turned into a search window and looked up in the tree that matches
    its own strand:

    * strand > 0: the window start is moved back by ``within``; the end is
      moved forward by ``within`` as well unless ``mode == 'ordered'``.
    * any other strand (including ``None``): the window end is moved
      forward by ``within``; the start is moved back as well unless
      ``mode == 'ordered'``.

    Hits are resolved through a ``feature.id -> feature`` map of
    ``rec_a.features`` (presumably ``treeFeatures`` yields the feature id as
    the interval payload -- confirm against its definition).

    :param rec_a: record whose features are indexed in the trees.
    :param rec_b: record whose features are used as queries.
    :param within: number of positions by which the query window is widened.
    :param mode: ``'ordered'`` widens the window on one side only; any
        other value widens it on both sides.
    :param kwargs: accepted and ignored.
    :returns: ``(rec_a, rec_b)``; ``rec_a.features`` becomes the list of
        ``rec_a`` features that were hit (each listed once) and
        ``rec_b.features`` the list of ``rec_b`` features with at least
        one hit.
    """
    feat_f = list(treeFeatures(rec_a.features, strand=1))
    feat_r = list(treeFeatures(rec_a.features, strand=-1))

    # A tree is only built when there is something to index.
    tree_f = IntervalTree(feat_f, 1, len(rec_a)) if feat_f else None
    tree_r = IntervalTree(feat_r, 1, len(rec_a)) if feat_r else None

    rec_a_map = {f.id: f for f in rec_a.features}

    rec_a_hits_in_b = []
    rec_b_hits_in_a = []

    for feature in rec_b.features:
        start = feature.location.start
        end = feature.location.end
        strand = feature.location.strand

        # A missing strand is routed to the reverse tree, as Python 2's
        # ``None > 0`` (False) used to do; on Python 3 the bare comparison
        # would raise TypeError.
        if strand is not None and strand > 0:
            tree = tree_f
            start -= within
            if mode != 'ordered':
                end += within
        else:
            tree = tree_r
            end += within
            if mode != 'ordered':
                start -= within

        if tree is None:
            continue

        hits = tree.find_range((start, end))
        if not hits:
            continue

        # Single pre-formatted string: prints the same space-separated line
        # under both the Python 2 print statement and the Python 3 function.
        print("%s %s %s %s %s" % (start, end, strand, feature.id, hits))

        rec_b_hits_in_a.append(feature)
        for hit in hits:
            feat_hit = rec_a_map[hit]
            if feat_hit not in rec_a_hits_in_b:
                rec_a_hits_in_b.append(feat_hit)

    rec_a.features = rec_a_hits_in_b
    rec_b.features = rec_b_hits_in_a
    return rec_a, rec_b