Ejemplo n.º 1
0
def test_union():
    """Check union(), extend() and update() against a one- and two-interval tree."""
    tree = IntervalTree()
    first = Interval(0, 1)
    singleton = {first}

    # union() with an empty tree gives back exactly the supplied interval
    combined = tree.union(singleton)
    assert len(combined) == 1
    assert set(combined).pop() == first

    # extending twice with the same set must not produce a duplicate
    for _ in range(2):
        tree.extend(singleton)
    assert len(tree) == 1
    assert set(tree).pop() == first

    second = Interval(2, 3)
    tree.update([second])
    assert len(tree) == 2
    assert sorted(tree)[1] == second

    # same expectation when the tree is seeded from the set and then extended
    tree = IntervalTree(singleton)
    tree.extend([second])
    assert len(tree) == 2
    assert sorted(tree)[1] == second
Ejemplo n.º 2
0
    def __load(self):
        """Read the JSON file at ``self.path`` and index channel '1' words.

        A non-empty JSON payload replaces ``self.data``.  For every chunk
        whose ``channelTag`` is ``'1'``, each word of the first alternative
        is added to ``self.tree`` as an Interval whose data is the word dict
        itself; the word is also flagged with ``hit = False``.
        """
        with pathlib.Path.open(self.path, 'r') as json_data:
            loaded = json.load(json_data)
            if loaded:
                self.data = loaded

            for chunk in self.data['response']['chunks']:
                if chunk['channelTag'] != '1':
                    continue
                for word in chunk['alternatives'][0]['words']:
                    # the last character of each timestamp is dropped before
                    # the float conversion
                    begin = float(word["startTime"][:-1])
                    finish = float(word["endTime"][:-1])
                    entry = Interval(begin, finish, word)
                    word['hit'] = False
                    self.tree.add(entry)
Ejemplo n.º 3
0
def get_interval(start, stop, value):
    """Build an Interval from its two bounds and a payload.

        Args:
            start(int): first bound handed to Interval
            stop(int): second bound handed to Interval
            value: object stored as the interval's data

        Returns:
            interval(intervaltree.Interval)

    """
    return Interval(start, stop, value)
Ejemplo n.º 4
0
 def __init__(self, seq, fraglen, ignore_n=True):
     """Store the sequence and, optionally, blacklist its runs of 'N'.

     Args:
         seq: the sequence; its length initialises ``chromlen`` and ``len``.
         fraglen: stored unchanged on the instance.
         ignore_n: when true, every run of 'N' (after translating the
             sequence through ``self.COUNT_N``) is added to
             ``self.blacklist`` and its length subtracted from ``self.len``.
     """
     total = len(seq)
     self.seq = seq
     self.chromlen = total
     self.len = total
     self.peak_regions = IntervalTree()
     self.blacklist = IntervalTree()
     self.fraglen = fraglen
     if not ignore_n:
         return

     cursor = 0
     for symbol, run in groupby(self.seq.translate(self.COUNT_N)):
         run_len = sum(1 for _ in run)
         if symbol == 'N':
             self.blacklist.add(Interval(cursor, cursor + run_len))
             self.len -= run_len
         cursor += run_len
Ejemplo n.º 5
0
def test_minimal_sequence():
    """Regression test: a third insert must leave the two-node tree valid."""
    tree = IntervalTree()
    tree.addi(-0.62, 4.38)  # becomes root
    tree.addi(9.24, 14.24)  # right child

    # Expected structure at this point (as shown by tree.print_structure()):
    # Node<-0.62, depth=2, balance=1>
    #  Interval(-0.62, 4.38)
    # >:  Node<9.24, depth=1, balance=0>
    #      Interval(9.24, 14.24)
    top = tree.top_node
    assert top.s_center == {Interval(-0.62, 4.38)}
    assert top.right_node.s_center == {Interval(9.24, 14.24)}
    assert not top.left_node

    tree.verify()

    # Adding this interval used to leave an empty node behind when
    # drotate() failed to promote Intervals properly.
    tree.addi(4.0, 9.0)
    tree.print_structure()
    tree.verify()
Ejemplo n.º 6
0
def make_intervaltree(df: pd.DataFrame) -> IntervalTree:
    """Build an IntervalTree with one interval per non-empty row of ``df``.

    Each row contributes the interval ``[offset - response_size, offset)``
    carrying ``(kind, row index)`` as data.  Rows whose begin equals their
    end are skipped with a printed notice.

    Args:
        df: frame providing the ``offset``, ``response_size`` and ``kind``
            columns.

    Returns:
        The tree built from all retained intervals.

    Raises:
        ValueError: if ``df`` is empty.
    """
    if df.empty:
        # ValueError is still caught by callers that catch Exception
        raise ValueError(
            "Error! Try to make intervaltree from empty dataframe.")

    intervals = []
    for idx, entry in df.iterrows():
        end = entry['offset']
        start = end - entry['response_size']
        if start == end:
            print("Emtpy interval! .. skip!")
            continue
        intervals.append(Interval(start, end, (entry['kind'], idx)))
    return IntervalTree(intervals)
Ejemplo n.º 7
0
    def unconsumed_ranges(self):
        """Return an IntervalTree of the gaps between consumed ranges.

        Each gap is a half-open interval ``[start, end)``: the end value is
        not included.  Gaps are taken between consecutive entries of
        ``self.range_set`` (ordered by ``begin``), plus one trailing gap up
        to ``self.tell()`` when the current position is not covered by
        exactly one range.  With no consumed ranges at all the result is
        empty.
        """
        res = IntervalTree()

        # normal iteration is not in a predictable order
        ranges = sorted(self.range_set, key=lambda x: x.begin)

        prev = None
        for rng in ranges:
            if prev is not None:
                res.add(Interval(prev.end, rng.begin))
            prev = rng

        # means we've seeked past the end.  ``prev`` is None when nothing has
        # been consumed yet; there is then no range end to start a gap from.
        if prev is not None and len(self.range_set[self.tell()]) != 1:
            res.add(Interval(prev.end, self.tell()))

        return res
Ejemplo n.º 8
0
 def allocate(self, start: float, end: float, num_bytes: int, job: Job):
     """Record an allocation of ``num_bytes`` for ``job`` over ``start``..``end``.

     The allocation is stored as an Interval both in ``_job_allocations``
     (keyed by ``job.id``) and in ``_interval_tree``.  Preconditions and
     post-insert consistency are checked with assertions only.
     """
     assert self._scheduler.time <= start <= end
     assert 0 < num_bytes <= self.available_space(start, end)
     # There should be only one interval per job.
     assert job.id not in self._job_allocations

     reservation = Interval(start, end, num_bytes)
     self._job_allocations[job.id] = reservation
     self._interval_tree.add(reservation)

     if __debug__:
         # the per-job map and the tree must agree on emptiness and size
         allocations = self._job_allocations
         tree = self._interval_tree
         assert (not allocations) == bool(tree.is_empty())
         assert len(allocations) == len(tree.all_intervals)
         tree.verify()
Ejemplo n.º 9
0
def get_unique_loci(intervals):
    """Merge overlapping intervals per (genome, seqid) pair.

    ``intervals`` yields ``(genome, seqid, start, end)`` records.  Records
    are grouped by ``(genome, seqid)``, each group is merged with
    ``IntervalTree.merge_overlaps()``, and the surviving intervals are
    returned as a list of ``(genome, seqid, begin, end)`` tuples.
    """
    by_sequence = defaultdict(list)
    for genome, seqid, start, end in intervals:
        by_sequence[genome, seqid].append(Interval(start, end))

    loci = []
    for (genome, seqid), members in by_sequence.items():
        merged = IntervalTree(members)
        merged.merge_overlaps()
        loci.extend((genome, seqid, iv.begin, iv.end) for iv in merged)

    return loci
Ejemplo n.º 10
0
def find_diff(list_a, list_b):
    """Return what is left of the ``list_a`` spans after chopping out ``list_b``.

    Both arguments are sequences whose items expose a begin at index 0 and
    an end at index 1.  The result is a list of ``(begin, end)`` tuples in
    the iteration order of the tree's items.
    """
    remaining = IntervalTree()
    for span in list_a:
        remaining.add(Interval(span[0], span[1]))

    for cut in list_b:
        remaining.chop(cut[0], cut[1])

    return [(piece.begin, piece.end) for piece in remaining.items()]
Ejemplo n.º 11
0
def _fix(interval):
    '''
    Helper function for ``GenomeIntervalTree.from_bed`` and ``.from_table``.

    Data tables may contain intervals with begin >= end. Such intervals lead
    to infinite recursions and other unpleasant behaviour, so they are
    replaced (with a warning) by the point interval [begin, begin+1) that
    keeps the original data. Well-formed intervals are returned unchanged.
    '''
    reversed_coords = interval.begin >= interval.end
    if not reversed_coords:
        return interval

    warnings.warn(
        "Interval with reversed coordinates (begin >= end) detected when reading data. Interval was automatically fixed to point interval [begin, begin+1)."
    )
    return Interval(interval.begin, interval.begin + 1, interval.data)
Ejemplo n.º 12
0
    def get_intervals(self, projection):
        """Group projected points into epsilon-separated intervals.

        Consecutive points stay in the same run while each one is below the
        previous point plus ``epsilon`` (``self.epsilon`` times the value
        range of ``projection``).  Every finished run is stored as an
        Interval whose end is widened by two machine epsilons of
        ``self.precision``.

        Parameters
        ----------
        projection : 1-d array
            Projection array of points on a vector

        Returns
        -------
        IntervalTree
            epsilon separated interval tree
        """
        start = end = projection[0]
        epsilon = (np.max(projection) - np.min(projection)) * self.epsilon
        tree = IntervalTree()

        def flush(run_start, run_end):
            # Store one run; a ValueError raised here is ignored and the run
            # is dropped.
            try:
                run_end += 2 * np.finfo(self.precision).eps
                tree.add(Interval(run_start, run_end))
            except ValueError:
                # NULL interval
                pass

        for point in projection[1:]:
            if point < end + epsilon:
                end = point
                continue
            flush(start, end)
            start = end = point

        # the loop never breaks, so the last open run is always emitted
        flush(start, end)
        return tree
Ejemplo n.º 13
0
def test_merge_equals_reducer_with_initializer():
    """merge_equals() with a list-building reducer and ``[]`` initializer."""
    def collect(acc, item):
        return acc + [item]

    def merge_and_verify(tree):
        tree.merge_equals(data_reducer=collect, data_initializer=[])
        tree.verify()

    # empty tree
    empty = IntervalTree()
    merge_and_verify(empty)
    assert not empty

    # one Interval in tree, no change
    single = IntervalTree.from_tuples([(1, 2, 'hello')])
    merge_and_verify(single)
    assert len(single) == 1
    assert sorted(single) == [Interval(1, 2, ['hello'])]

    # many Intervals in tree, no change
    tree = IntervalTree.from_tuples(data.ivs1.data)
    baseline = IntervalTree.from_tuples(data.ivs1.data)
    merge_and_verify(tree)
    assert len(tree) == len(baseline)
    assert tree != baseline
    expected = [
        Interval(begin, end, [payload])
        for begin, end, payload in sorted(baseline)
    ]
    assert sorted(tree) == expected

    # many Intervals in tree, with change
    tree = IntervalTree.from_tuples(data.ivs1.data)
    baseline = IntervalTree.from_tuples(data.ivs1.data)
    tree.addi(4, 7, 'foo')
    merge_and_verify(tree)
    assert len(tree) == len(baseline)
    assert tree != baseline
    assert not tree.containsi(4, 7, 'foo')
    assert not tree.containsi(4, 7, '[4,7)')
    assert tree.containsi(4, 7, ['[4,7)', 'foo'])
def merge_overlaps_min_frac(self,
                            data_reducer=None,
                            data_initializer=None,
                            min_frac=0.05):
    """
    Same as IntervalTree.merge_overlaps(), except intervals are only
    merged if the fraction of overlap in both intervals is at least the
    cutoff specified by min_frac.

    The tree is rebuilt in place (via ``self.__init__``) and nothing is
    returned. An empty tree is left untouched.

    :param data_reducer: optional callable ``(current, new_data) -> merged``
        used to combine the data of merged intervals. When it is None the
        data of a merged interval is set to None.
    :param data_initializer: optional starting value for each series; a
        copy of it is reduced with the first interval's data. It requires
        ``data_reducer``, which is called unconditionally in that case.
    :param min_frac: minimum value both ``intervals_frac_overlap`` results
        (lower vs. higher and higher vs. lower) must reach for a merge.
    """
    if not self:
        return

    sorted_intervals = sorted(self.all_intervals)  # get sorted intervals
    merged = []
    # use mutable object to allow new_series() to modify it
    current_reduced = [None]
    higher = None  # iterating variable, which new_series() needs access to

    def new_series():
        # Start a new run of merged intervals from the current `higher`.
        if data_initializer is None:
            current_reduced[0] = higher.data
            merged.append(higher)
            return
        else:  # data_initializer is not None
            current_reduced[0] = copy(data_initializer)
            current_reduced[0] = data_reducer(current_reduced[0], higher.data)
            merged.append(
                Interval(higher.begin, higher.end, current_reduced[0]))

    for higher in sorted_intervals:
        if merged:  # series already begun
            # only the most recently emitted interval is a merge candidate
            lower = merged[-1]
            if intervals_frac_overlap(
                    lower, higher) >= min_frac and intervals_frac_overlap(
                        higher, lower) >= min_frac:
                upper_bound = max(lower.end, higher.end)
                if data_reducer is not None:
                    current_reduced[0] = data_reducer(current_reduced[0],
                                                      higher.data)
                else:  # annihilate the data, since we don't know how to merge it
                    current_reduced[0] = None
                merged[-1] = Interval(lower.begin, upper_bound,
                                      current_reduced[0])
            else:
                new_series()
        else:  # not merged; is first of Intervals to merge
            new_series()

    # re-initialise this tree with the merged intervals
    self.__init__(merged)
Ejemplo n.º 15
0
def load_coverage_df(exon_padding, tx_accession, samples):
    """Assemble one coverage frame for a transcript across several samples.

    The transcript is looked up by ``tx_accession`` and its exons are put
    into an IntervalTree that is handed to ``load_coverage`` for every
    sample.  The result holds the ``chrom``, ``pos`` and ``exon_no`` columns
    of the first sample followed by the fourth column of each sample's
    frame, sorted by ``pos``.  ``exon_padding`` is accepted but not used.
    """
    transcript = genes.load_transcripts()[tx_accession]
    exon_tree = IntervalTree(
        [Interval(exon.begin, exon.end) for exon in transcript.exons])

    per_sample = [
        load_coverage(sample, transcript.chrom, exon_tree, transcript)
        for sample in samples
    ]

    first = per_sample[0]
    columns = [first["chrom"], first["pos"], first["exon_no"]]
    columns.extend(frame.iloc[:, 3] for frame in per_sample)

    df_coverage = pd.concat(columns, axis="columns")
    df_coverage.sort_values("pos", inplace=True)
    return df_coverage
Ejemplo n.º 16
0
    def __load(self):
        """Read the JSON file at ``self.path`` and index its results.

        A non-empty JSON payload replaces ``self.data``.  For every object
        in ``self.data`` with a truthy ``"result"`` entry, each result is
        added to ``self.tree`` as an Interval from ``start`` to ``end``
        whose data is the result dict itself; the result is also flagged
        with ``hit = False``.

        Raises:
            TypeError: ('Incorrect engine type!') when walking the data
                raises a TypeError; the original error is kept as the
                cause.
        """
        with pathlib.Path.open(self.path, 'r') as json_data:
            data = json.load(json_data)
            if data:
                self.data = data

            try:
                for obj in self.data:
                    if obj.get("result"):
                        for r in obj["result"]:
                            i = Interval(r["start"], r["end"], r)
                            r['hit'] = False
                            self.tree.add(i)
            except TypeError as err:
                # chain explicitly so the underlying failure stays visible
                raise TypeError('Incorrect engine type!') from err
Ejemplo n.º 17
0
 def nonOverlapGraph(self, DAG=None, leftNode=None, rightNode=None, currVar=None, weights=None):
     """
     Wire one variant and its flanking nodes into the graph.

     :param DAG: directed acyclic graph (networkx graph object)
     :param leftNode: node on the left, Interval object; when falsy a "REF"
                      Interval from 0 to currVar.begin - 1 is used
     :param rightNode: node on the right, Interval object; when falsy a "REF"
                       Interval from currVar.end to currVar.end + 1 is used
     :param currVar: current variant, Interval object whose data is (svId, svType)
     :param weights: dictionary mapping svId to its eight edge weights, dict
     :return: DAG, left node, right node
     """
     svId, svType = currVar.data
     b2l, b2r, b2a, l2r, l2a, r2a, uniqBA, uniqLR = weights[svId]

     left = leftNode or Interval(0, currVar.begin - 1, (svId, "REF"))
     variant = currVar
     right = rightNode or Interval(currVar.end, currVar.end + 1, (svId, "REF"))

     for node in (left, variant, right):
         DAG.add_node(node)
     DAG.add_edge(left, variant, weight=b2l)

     if svType == "DEL":
         DAG.add_edge(left, right, weight=b2a)
         DAG.add_edge(variant, right, weight=r2a)
     elif svType == "DUP":
         duplicate = Interval(currVar.begin, currVar.end, (svId, "DupCopy"))
         DAG.add_edge(variant, right, weight=uniqLR)
         DAG.add_edge(variant, duplicate, weight=l2r)
         DAG.add_edge(duplicate, right, weight=r2a) # undecided
     elif svType == "INV":
         # the flipped node is built with end and begin swapped
         flipped = Interval(currVar.end, currVar.begin, (svId, "InvFlip"))
         DAG.add_edge(variant, right, weight=r2a)
         DAG.add_edge(left, flipped, weight=b2r)
         DAG.add_edge(flipped, right, weight=l2a)
     elif svType == "BND":
         DAG.add_edge(variant, right, weight=r2a)
         DAG.add_edge(left, right, weight=uniqBA)
     return DAG, left, right
Ejemplo n.º 18
0
def exclude_by_qcow2(curr_tree, begin, end, qcow_off):
    """Remove the qcow2-side span ``[qcow_off, qcow_off + (end - begin))`` from ``curr_tree``.

    An Interval for that span (data: ``[q_bgn, q_end, begin, end, 0]``) is
    first passed to ``remove_exist_iv``; when that reports success the
    removal is logged and nothing else happens.  Otherwise the span is
    chopped out of the tree using ``chop_func_cb``.
    """
    length = end - begin
    q_bgn, q_end = qcow_off, qcow_off + length
    # trailing 0 is the is_lower flag
    payload = [q_bgn, q_end, begin, end, 0]

    if remove_exist_iv(curr_tree, Interval(q_bgn, q_end, payload)):
        log_dbg_msg("[exclude_by_qcow2] remove bgn={} qcow={}".format(
            hex(begin), hex(qcow_off)))
        return

    curr_tree.chop(q_bgn, q_end, chop_func_cb)
Ejemplo n.º 19
0
 def get_tx_tips(self, timestamp: Optional[float] = None) -> Set[Interval]:
     """Fetch the transaction tips from the remote stub as a set of Intervals.

     A finite float ``timestamp`` is truncated to int (with a logged
     warning) before it is sent in the ``ListTipsRequest``.  Each item of
     the response is converted to ``Interval(begin, end, data)``.
     """
     self._check_connection()
     needs_truncation = isinstance(timestamp, float) and timestamp != inf
     if needs_truncation:
         self.log.warn(
             'timestamp given in float will be truncated, use int instead')
         timestamp = int(timestamp)

     request = protos.ListTipsRequest(tx_type=protos.TRANSACTION_TYPE,
                                      timestamp=timestamp)
     return {
         Interval(item.begin, item.end, item.data)
         for item in self._stub.ListTips(request)
     }
Ejemplo n.º 20
0
def get_domains(fname):
    """Load domains from a table and index them per chromosome.

    The first three columns of ``fname`` are read as chromosome, start and
    end.  For every chromosome an IntervalTree is built; the first domain
    seen for a chromosome is stored as ``[start, end)``, every following
    row is stored from the previous row's start to its own end.  The data
    of each interval is the 1-based row counter.

    Args:
        fname: anything ``numpy.genfromtxt`` accepts as its input.

    Returns:
        dict mapping the chromosome field (read with dtype ``S10``) to its
        IntervalTree.
    """
    domains = np.genfromtxt(fname,
                            dtype=np.dtype([('chrm', 'S10'),
                                            ('start', np.uint32),
                                            ('end', np.uint32)]),
                            usecols=(0, 1, 2))
    # a single-row table is returned as a 0-d array, which cannot be iterated
    domains = np.atleast_1d(domains)

    domains_dict = {}
    print("Generating intervals object")
    prev_domain = None
    for count, domain in enumerate(domains, start=1):
        chrom = domain["chrm"]
        if chrom not in domains_dict:
            domains_dict[chrom] = IntervalTree()
            prev_domain = None

        # identity test: comparing a numpy record with None via == is unreliable
        if prev_domain is None:
            begin = domain["start"]
        else:
            begin = prev_domain["start"]
        domains_dict[chrom].add(Interval(begin, domain["end"], count))

        prev_domain = domain

    return domains_dict
Ejemplo n.º 21
0
def get_max_coverage(df):
    """Return the best coverage obtainable after discounting one interval.

    Every row of ``df`` is turned into an Interval and the total coverage
    of the resulting tree is computed with ``get_coverage``.  For each row,
    the value ``get_cvg_affect`` reports for that interval against its
    overlapping intervals is subtracted from the total; the maximum of
    these differences is returned.  Progress is printed for every row.

    Returns:
        The maximum reduced coverage.  The plain total coverage is returned
        instead as soon as ``get_ovlp_interval_tree`` yields None for a
        row, or when ``df`` has no rows.
    """
    interval_list = [tuple(row) for row in df.values.tolist()]
    tree = IntervalTree(Interval(*iv) for iv in interval_list)

    total_coverage = get_coverage(tree)
    length = len(interval_list)
    coverage_list = []
    for idx, inter in enumerate(interval_list):
        print("#########")
        print("{} %".format(idx * 100.0 / length))
        print("#########")
        ovlp_interval_tree = get_ovlp_interval_tree(df, inter)
        if ovlp_interval_tree is None:
            return total_coverage
        cvg_affect = get_cvg_affect(ovlp_interval_tree, Interval(*inter))
        coverage_list.append(total_coverage - cvg_affect)

    # max() of an empty list raises; with no rows nothing can be discounted
    if not coverage_list:
        return total_coverage
    return max(coverage_list)
Ejemplo n.º 22
0
def contain(p, ostore, tree, dereference=True):
    """Return all intervals that contain the point p.

    Intervals whose data reference has ``otype == "tree"`` are followed
    recursively: the referenced subtree is fetched from ``ostore`` and
    searched with the same ``dereference`` setting.

    Args:
        p: the query point.
        ostore: object store providing ``get_object(key)``.
        tree: tree queried with ``tree.at(p)``.
        dereference: when true, the data of each returned Interval is the
            object fetched from ``ostore``; otherwise it is the reference.

    Returns:
        A set of Interval objects.
    """
    ivs = set()
    for iv in tree.at(p):
        ref = iv.data
        if ref.otype == "tree":
            # descend, keeping the caller's dereference choice
            subtree = ostore.get_object(ref.key)
            ivs.update(contain(p, ostore, subtree, dereference))
        else:
            data = ostore.get_object(ref.key) if dereference else ref
            ivs.add(Interval(iv.begin, iv.end, data))
    return ivs
Ejemplo n.º 23
0
def find_remaining(
    itrees: Mapping[str, IntervalTree],
    nstretches: Mapping[str, IntervalTree],
    scaffolds: Mapping[str, SeqRecord],
) -> None:
    """Add the uncovered stretches of every scaffold to ``itrees`` in place.

    For each scaffold, the intervals of its contigs and N-stretches are
    merged into a "covered" tree; whatever part of ``[0, len(seq))`` is not
    covered is added to ``itrees[scaffold]``.
    """
    for scaffold, seq in scaffolds.items():
        contigs = itrees[scaffold]
        nstretch = nstretches[scaffold]

        # Rebuild the intervals without their data: intervals carrying data
        # would not be removed by the difference below.
        bare = [Interval(iv.begin, iv.end) for iv in contigs]
        bare += [Interval(iv.begin, iv.end) for iv in nstretch]
        covered = IntervalTree(bare)
        # strict=False also merges adjacent, non-overlapping intervals.
        covered.merge_overlaps(strict=False)

        whole = IntervalTree([Interval(0, len(seq))])
        gaps = whole | covered
        gaps.split_overlaps()
        gaps.difference_update(covered)

        itrees[scaffold].update(gaps)
Ejemplo n.º 24
0
 def find_incbins(self, path: str) -> List[Interval]:
     """Collect the ``.incbin "baserom.gba"`` directives of one file.

     Every stripped line starting with ``.incbin "baserom.gba"`` is split on
     commas.  With exactly three fields, the second and third are parsed as
     hexadecimal address and length and stored as
     ``Interval(addr, addr + length, path)``; otherwise the line is reported
     as invalid on stdout.

     Returns:
         The Intervals in file order.
     """
     found = []
     with open(path, 'r') as file:
         for raw in file:
             line = raw.strip()
             if not line.startswith('.incbin "baserom.gba"'):
                 continue
             fields = line.split(',')
             if len(fields) != 3:
                 print(f'Invalid incbin: {line}')
                 continue
             addr = int(fields[1], 16)
             length = int(fields[2], 16)
             found.append(Interval(addr, addr + length, path))
     return found
Ejemplo n.º 25
0
 def __init__(self, interval_tuples:Iterator[Tuple[Chrom,int,int,GeneName]]):
     '''Index genes per chromosome.

     interval_tuples is like [('22', 12321, 12345, 'APOL1'), ...].  Each gene
     is added to that chromosome's IntervalTree in ``self._its``; its start
     and end positions are also collected per chromosome and wrapped in
     BisectFinder objects (``self._gene_starts`` / ``self._gene_ends``).
     '''
     self._its: Dict[Chrom,IntervalTree] = {}
     starts_by_chrom: Dict[Chrom,List[Tuple[int,GeneName]]] = {}
     ends_by_chrom: Dict[Chrom,List[Tuple[int,GeneName]]] = {}

     for chrom, pos_start, pos_end, gene_name in interval_tuples:
         if chrom not in self._its:
             # first gene on this chromosome: create all three containers
             self._its[chrom] = IntervalTree()
             starts_by_chrom[chrom] = []
             ends_by_chrom[chrom] = []
         self._its[chrom].add(Interval(pos_start, pos_end, gene_name))
         starts_by_chrom[chrom].append((pos_start, gene_name))
         ends_by_chrom[chrom].append((pos_end, gene_name))

     self._gene_starts = {
         chrom: BisectFinder(tuples)
         for chrom, tuples in starts_by_chrom.items()
     }
     self._gene_ends = {
         chrom: BisectFinder(tuples)
         for chrom, tuples in ends_by_chrom.items()
     }
Ejemplo n.º 26
0
    def get_interval_cu(self, cu_id):
        """Collect interval trees and cycle counts for one compute unit.

        Three conditions are queried through ``get_interval_cu_cond``
        (``LIKE "%MEM LD%"``, ``LIKE "%MEM ST%"`` and
        ``NOT LIKE "%MEM LD%"``); each returns a cycle count and a list of
        interval tuples that is turned into an IntervalTree.  The overall
        cycle count is the maximum of ``start + length`` for this ``cu``.

        NOTE(review): the "other" condition only excludes MEM LD, so it
        also matches MEM ST rows; confirm that this is intended.

        Returns:
            dict with the keys ``mem_ld``, ``mem_st``, ``other`` (trees) and
            ``cycle_all``, ``cycle_mem_ld``, ``cycle_mem_st``,
            ``cycle_other`` (cycle counts).
        """
        def to_tree(spans):
            return IntervalTree(Interval(*iv) for iv in spans)

        # MEM LD
        mem_ld_cycle, mem_ld_spans = self.get_interval_cu_cond(
            cu_id, 'LIKE "%MEM LD%"')
        mem_ld_tree = to_tree(mem_ld_spans)

        # MEM ST
        mem_st_cycle, mem_st_spans = self.get_interval_cu_cond(
            cu_id, 'LIKE "%MEM ST%"')
        mem_st_tree = to_tree(mem_st_spans)

        # OTHER
        other_cycle, other_spans = self.get_interval_cu_cond(
            cu_id, 'NOT LIKE "%MEM LD%"')
        other_tree = to_tree(other_spans)

        cycle = self.get_max('inst', 'start + length',
                             ' WHERE cu=' + str(cu_id))

        return {
            'mem_ld': mem_ld_tree,
            'mem_st': mem_st_tree,
            'other': other_tree,
            'cycle_all': cycle,
            'cycle_mem_ld': mem_ld_cycle,
            'cycle_mem_st': mem_st_cycle,
            'cycle_other': other_cycle,
        }
Ejemplo n.º 27
0
def get_interval_tree(df: pd.DataFrame,
                      diff=True,
                      preserve_struct=False,
                      type_equality='default') -> IntervalTree:
    """Turn consecutive rows of ``df`` into an IntervalTree.

    The frame is walked row by row, looking at the index and the first
    column.  For each row after the first, an interval from the previous
    row's index to the current index, carrying the previous row's value,
    is recorded.

    Args:
        df: frame whose index supplies the interval bounds and whose first
            column supplies the values.
        diff: when true, an interval is recorded only if the current value
            differs from the previous one; when false, for every row.
        preserve_struct: when true the previous value is stored as is,
            otherwise it is passed through ``get_struct`` first.
        type_equality: ``'absolute'`` compares values with
            ``utils.b_absolutely_equal``; any other value uses ``==``.

    Returns:
        IntervalTree holding the recorded intervals.
    """
    struct_last = df.iloc[0].values[0]
    index_struct_last = df.index[0]
    intervals_structs = []

    # Rows are only indexed by position, so plain tuples are requested.
    # A non-string name such as True is not a valid namedtuple type name.
    for row in df.iloc[1:, :].itertuples(index=True, name=None):
        index, struct_current = row[0], row[1]

        if not diff:
            record = True
        elif type_equality == 'absolute':
            record = not utils.b_absolutely_equal(struct_current,
                                                  struct_last)
        else:
            record = not (struct_current == struct_last)

        if record:
            payload = (struct_last
                       if preserve_struct else get_struct(struct_last))
            intervals_structs.append(
                Interval(index_struct_last, index, payload))

        struct_last = struct_current
        index_struct_last = index

    return IntervalTree(
        Interval(begin, end, data) for begin, end, data in intervals_structs)
def detect_overlapping_genes(gff_db, min_overlap=0.1):
    """
    Find genes that mutually overlap in the genome annotation.

    For every gene, all genes in its region are fetched and compared
    pairwise. A pair counts as overlapping only when the overlap fraction
    is at least min_overlap in both directions.
    Returns a set of overlapping gene IDs.
    """
    ol_set = set()
    for gene in gff_db.features_of_type('gene', order_by='start'):
        region = (gene.seqid, gene.start, gene.end)
        neighbours = list(gff_db.region(region, featuretype=['gene']))
        if len(neighbours) <= 1:
            continue

        for first, second in combinations(neighbours, 2):
            span_a = Interval(first.start, first.end)
            span_b = Interval(second.start, second.end)
            mutual = (intervals_frac_overlap(span_a, span_b) >= min_overlap
                      and
                      intervals_frac_overlap(span_b, span_a) >= min_overlap)
            if mutual:
                ol_set.update(
                    ol_gene['ID'][0] for ol_gene in (first, second))
    return ol_set
Ejemplo n.º 29
0
def overlap(begin, end, ostore, tree, dereference=True):
    """Return all intervals that overlap the interval (begin, end).

    Intervals whose data reference has ``otype == "tree"`` are followed
    recursively: the referenced subtree is fetched from ``ostore`` and
    searched with the same ``dereference`` setting.

    Args:
        begin: start of the query range.
        end: end of the query range.
        ostore: object store providing ``get_object(key)``.
        tree: tree queried with ``tree.overlap(begin, end)``.
        dereference: when true, the data of each returned Interval is the
            object fetched from ``ostore``; otherwise it is the reference.

    Returns:
        A set of Interval objects.
    """
    ivs = set()
    for iv in tree.overlap(begin, end):
        ref = iv.data
        if ref.otype == "tree":
            # descend, keeping the caller's dereference choice
            subtree = ostore.get_object(ref.key)
            ivs.update(overlap(begin, end, ostore, subtree, dereference))
        else:
            data = ostore.get_object(ref.key) if dereference else ref
            ivs.add(Interval(iv.begin, iv.end, data))
    return ivs
Ejemplo n.º 30
0
def scan_tree(intervals):
    """Return the data of all intervals involved in an overlap.

    An interval tree is built from the supplied genomic intervals and every
    interval is queried against it.  Whenever a query hits 2 or more
    intervals (i.e. itself + 1 other), the data of all hits is collected.

    Args:
        intervals: re-iterable collection of tuples accepted by
            ``Interval(*iv)``, with begin at index 0 and end at index 1.

    Returns:
        A set with the ``data`` of every interval found in such a query.
    """
    tree = IntervalTree(Interval(*iv) for iv in intervals)

    hits = set()
    for g in intervals:
        # query once and reuse the result for both the test and the update
        matches = tree.overlap(g[0], g[1])
        if len(matches) > 1:
            hits.update(iv.data for iv in matches)

    return hits