def interval_tree(start_data, stop_data, buffer_len):
    """Build consensus (start, stop) lists from per-key interval data.

    Each input interval ``[start, stop + 1)`` is shrunk by ``buffer_len / 2``
    on both sides and stored in an ``IntervalTree``.  The tree is then
    extended to the extremes returned by ``get_extremes``, overlapping
    intervals are merged, and any original interval that still touches two
    consensus intervals has the smaller of its two overlaps chopped out.

    Args:
        start_data (dict): key -> sequence of interval starts.
        stop_data (dict): key -> sequence of interval stops, index-aligned
            with ``start_data[key]``; 1 is added to each stop before use.
        buffer_len (number): total buffer removed from every interval
            (half from each end).

    Returns:
        tuple: ``(starts, stops)``, two lists taken from the consensus
        intervals in sorted order.

    Side effects:
        Prints the length of every chopped overlap and the percentage of
        ``chrom_stop - chrom_start`` covered by the consensus intervals.
    """
    starts = []
    stops = []
    t = IntervalTree()
    half_buffer = buffer_len / 2.0

    ## Shrink each interval by the buffer size
    # ``items()`` rather than ``iteritems()``: the latter only exists on
    # Python 2 dicts, while ``items()`` is available on both 2 and 3.
    for key, value in start_data.items():
        for i, raw_start in enumerate(value):
            shrunk_start = raw_start + half_buffer
            shrunk_stop = stop_data[key][i] + 1 - half_buffer
            # Intervals no wider than the buffer vanish entirely.
            if shrunk_start < shrunk_stop:
                t[shrunk_start:shrunk_stop] = (shrunk_start, shrunk_stop)

    ## Add chromosome endpoints without buffer
    chrom_start, chrom_stop = get_extremes(start_data, stop_data)
    if chrom_start < t.begin() + 1:
        t[chrom_start:t.begin() + 1] = (chrom_start, t.begin() + 1)
    # ``t.end()`` is read after the first insertion on purpose: that
    # insertion may have changed the tree bounds.
    if t.end() - 1 < chrom_stop:
        t[t.end() - 1:chrom_stop] = (t.end() - 1, chrom_stop)

    ## Merge intervals that overlap in tree to get consensus
    t.merge_overlaps()

    ## Check that original intervals only overlap with one consensus interval
    for key, value in start_data.items():
        for i, start in enumerate(value):
            stop = stop_data[key][i] + 1
            # The tree is not modified until ``chop`` below, so a single
            # query serves the length checks and the iteration.
            overlapping = t[start:stop]
            if len(overlapping) > 1:
                ## If they overlap with more than one
                ## Remove part of consensus interval
                ## This will never be more than the buffer size/2
                assert (len(overlapping) == 2)
                remove_start = 0
                remove_stop = 0
                min_length = float('inf')
                for interval in overlapping:
                    overlap_start, overlap_stop = get_overlap(
                        (start, stop), (interval[0], interval[1]))
                    if (overlap_stop - overlap_start) < min_length:
                        min_length = overlap_stop - overlap_start
                        remove_start = overlap_start
                        remove_stop = overlap_stop
                print(min_length)
                t.chop(remove_start, remove_stop)
                assert (min_length <= half_buffer)
                assert (len(t[start:stop]) < 2)

    ## Get consensus start and stop points
    chrom_len = chrom_stop - chrom_start
    covered = 0.0
    for interval in sorted(t):
        starts.append(interval[0])
        stops.append(interval[1])
        covered = covered + (interval[1] - interval[0])
    print("The percentage of the chromosome covered is: %s" % '{0:.2f}'.format(
        (covered / chrom_len) * 100.0))
    return (starts, stops)
def test_empty_queries():
    """Every query against a freshly created tree yields an empty result."""
    tree = IntervalTree()
    nothing = set()

    # Size and truthiness of the empty tree.
    assert len(tree) == 0
    assert tree.is_empty()

    # Point and range lookups find nothing.
    assert tree[3] == nothing
    assert tree[4:6] == nothing

    # The bounds of an empty tree collapse to zero.
    assert tree.begin() == 0
    assert tree.end() == 0
    assert tree[tree.begin():tree.end()] == nothing

    # Whole-tree views are empty as well.
    assert tree.items() == nothing
    assert set(tree) == nothing
    assert set(tree.copy()) == nothing
    assert tree.find_nested() == {}

    tree.verify()
def test_list_init():
    """A tree built from a list contains exactly the listed intervals."""
    members = [Interval(-10, 10), Interval(-20.0, -10.0)]
    tree = IntervalTree(members)
    tree.verify()

    assert tree
    assert len(tree) == 2
    assert tree.items() == set(members)

    # Bounds span from the lowest begin to the highest end.
    assert tree.begin() == -20
    assert tree.end() == 10
def test_empty_queries():
    """Queries, bounds and range() of an empty tree are all empty/null."""
    tree = IntervalTree()
    nothing = set()

    # Size and truthiness of the empty tree.
    assert len(tree) == 0
    assert tree.is_empty()

    # Point and range lookups find nothing.
    assert tree[3] == nothing
    assert tree[4:6] == nothing

    # The bounds of an empty tree collapse to zero.
    assert tree.begin() == 0
    assert tree.end() == 0
    assert tree[tree.begin():tree.end()] == nothing

    # Whole-tree views are empty as well.
    assert tree.items() == nothing
    assert set(tree) == nothing
    assert set(tree.copy()) == nothing
    assert tree.find_nested() == {}

    # The covering range is a null interval of length zero.
    assert tree.range().is_null()
    assert tree.range().length() == 0

    tree.verify()
def test_empty_queries():
    """Lookups, overlap/envelop queries and range() on an empty tree."""
    tree = IntervalTree()
    nothing = set()

    # Size and truthiness of the empty tree.
    assert len(tree) == 0
    assert tree.is_empty()

    # Point and range lookups find nothing.
    assert tree[3] == nothing
    assert tree[4:6] == nothing

    # The bounds of an empty tree collapse to zero.
    assert tree.begin() == 0
    assert tree.end() == 0

    # Slice syntax and the explicit query methods agree on "nothing".
    assert tree[tree.begin():tree.end()] == nothing
    assert tree.overlap(tree.begin(), tree.end()) == nothing
    assert tree.envelop(tree.begin(), tree.end()) == nothing

    # Whole-tree views are empty as well.
    assert tree.items() == nothing
    assert set(tree) == nothing
    assert set(tree.copy()) == nothing
    assert tree.find_nested() == {}

    # The covering range is a null interval of length zero.
    assert tree.range().is_null()
    assert tree.range().length() == 0

    tree.verify()
class BratEntity(BratAnnotation):
    """A text-bound brat annotation marking an entity mention.

    An entity has a unique ID, a type (e.g. Person or Organization) and one
    or more character spans, each given as a "start end" offset pair.  In
    standoff format one entity occupies one line::

        T1  Organization 0 4        Sony
        T3  Organization 33 41      Ericsson
        T3  Country 75 81   Sweden

    i.e. "`ID [tab] TYPE START END [tab] TEXT`", where START and END are
    positive integer offsets into the document and `TEXT` is the covered
    text.  A discontinuous mention lists several spans separated by
    semicolons: "`ID [tab] TYPE START END[;START END]* [tab] TEXT`".

    The spans are kept in ``locations`` (an ``IntervalTree``); ``text``
    holds the covered text.
    """

    def __init__(self):
        super().__init__()
        self.text = None  # type: Optional[str]
        self.locations = IntervalTree()  # type: IntervalTree

    def shift(self, offset: int):
        """Return a new entity whose spans are all moved by ``offset``.

        ``id``, ``type`` and ``text`` are carried over unchanged; the data
        attached to each span is kept.
        """
        moved = BratEntity()
        moved.id = self.id
        moved.type = self.type
        moved.text = self.text
        for span in self.locations:
            new_begin = span.begin + offset
            new_end = span.end + offset
            moved.locations[new_begin:new_end] = span.data
        return moved

    def add_span(self, start: int, end: int, data=None):
        """Register the span ``[start, end)`` with optional attached data."""
        self.locations[start:end] = data

    @property
    def total_span(self) -> Tuple[int, int]:
        """Lowest begin and highest end over all registered spans."""
        first = self.locations.begin()
        last = self.locations.end()
        return first, last

    def __eq__(self, other):
        """Entities are equal when id, type, text and spans all match."""
        if not isinstance(other, BratEntity):
            return False
        return (
            self.id == other.id
            and self.type == other.type
            and self.text == other.text
            and self.locations == other.locations
        )

    def __str__(self):
        fields = (self.id, self.type, self.text, self.locations)
        return 'BratEntity[id=%s,type=%s,text=%s,loc=%s]' % fields
def test_generator_init():
    """A tree can be populated straight from a generator of intervals."""
    bounds = [(-10, 10), (-20, -10), (10, 20)]
    tree = IntervalTree(Interval(low, high) for low, high in bounds)
    tree.verify()

    assert tree
    assert len(tree) == 3

    expected = {
        Interval(-20, -10),
        Interval(-10, 10),
        Interval(10, 20),
    }
    assert tree.items() == expected

    # Bounds span from the lowest begin to the highest end.
    assert tree.begin() == -20
    assert tree.end() == 20
def test_generator_init():
    """Constructing from a generator expression stores every interval."""
    pairs = [(-10, 10), (-20, -10), (10, 20)]
    source = (Interval(lo, hi) for lo, hi in pairs)
    tree = IntervalTree(source)
    tree.verify()

    assert tree
    assert len(tree) == 3

    wanted = set()
    wanted.add(Interval(-20, -10))
    wanted.add(Interval(-10, 10))
    wanted.add(Interval(10, 20))
    assert tree.items() == wanted

    # Overall extent of the three adjacent intervals.
    assert tree.begin() == -20
    assert tree.end() == 20
def gen_interval_tree(interval):
    """Build an interval tree, queryable by points and ranges, from a string.

    The text after the first ``'|'`` is split on ``'_'``; each piece is
    split on ``'+'`` or ``','`` into three integers ``node, start, end``.
    Pieces are laid out one after another: every piece is stored at
    ``[start + offset, end + offset)`` with data ``(node, offset)``, and the
    running offset then grows by that piece's ``end``::

        Node      Node           Node
        |---------|--------------|----------|

    Returns:
        tuple: ``(tree, tree.begin())``.
    """
    pieces = interval.split('|', 1)[-1].split('_')

    tree = IntervalTree()
    shift = 0
    for piece in pieces:
        node, start, end = (int(field) for field in re.split('[+,]', piece))
        tree[start + shift:end + shift] = (node, shift)
        shift += end

    return (tree, tree.begin())
def smooth_nucleotide(regions, concat_regions_d, mutations, tukey_filter, simulation_window):
    """Generate a smoothing curve for a list of element's mutations in the nucleotide sequence

    Every region gets a zero array padded on both sides so that the kernel of
    mutations falling outside the region can still be added; the padding is
    trimmed again before the result is returned.  When ``concat_regions_d``
    is non-empty, the trimmed regions are concatenated (sorted order) into a
    single interval and mutations inside regions are smoothed on that
    concatenated sequence instead.

    Args:
        regions (IntervalTree): IntervalTree with genomic positions of an element
        concat_regions_d (dict): keys are start genomic regions, values are positions (index) relative to the start
        mutations (list): list of mutations formatted as namedtuple
        tukey_filter (numpy.ndarray): kde array, length equals smoothing window.
        simulation_window (int): simulation window

    Returns:
        final_smooth_tree (IntervalTree): interval are genomic regions or indexes (concatenate mode),
            data np.array of smoothing score by position
        mutations_in (list): list of mutations in regions
    """
    filter_len = len(tukey_filter)
    # Padding kept on each side of a region while smoothing (always integer).
    flank = (simulation_window + filter_len - 2) // 2
    # Offset from a mutation's index to where its kernel starts.
    half_filter = (filter_len - 1) // 2

    first_smooth_tree = IntervalTree()
    final_smooth_tree = IntervalTree()
    mutations_in = []

    def _add_kernel(interval, mutation):
        # Index of the mutation in the padded array of ``interval``.
        # NOTE(review): assumes the mutation is close enough to the region
        # for ``tukey_begin`` to be non-negative -- confirm against callers.
        new_begin = interval.begin - flank
        index = mutation.position - new_begin
        tukey_begin = index - half_filter
        interval.data[tukey_begin:tukey_begin + filter_len] += tukey_filter

    def _trimmed(data, width):
        # ``data[width:-width]`` would be EMPTY for ``width == 0`` because
        # ``-0 == 0``; an explicit end index is correct for every width.
        return data[width:len(data) - width]

    # Generate smoothing arrays for regions
    for interval in regions:
        # Add extra bases for smoothing of simulated mutations that fall
        # outside regions and tukey_filter
        first_smooth_tree.addi(
            interval.begin, interval.end,
            np.zeros(interval.end - interval.begin + filter_len + simulation_window - 2))

    if not concat_regions_d:
        # Smooth
        for mutation in mutations:
            for interval in first_smooth_tree[mutation.region[0]]:
                _add_kernel(interval, mutation)
            # Get mutations inside regions
            if regions[mutation.position]:
                mutations_in.append(mutation)

        # Remove extra bp
        for interval in first_smooth_tree:
            final_smooth_tree.addi(
                interval.begin, interval.end, _trimmed(interval.data, flank))
    else:
        # Smooth simulated mutations outside regions
        for mutation in mutations:
            if not first_smooth_tree[mutation.position]:
                for interval in first_smooth_tree[mutation.region[0]]:
                    _add_kernel(interval, mutation)

        # Remove extra bp
        for interval in first_smooth_tree:
            final_smooth_tree.addi(
                interval.begin, interval.end, _trimmed(interval.data, flank))

        # Merge sorted regions (one interval == concatenated sequence) and
        # add tukey//2 to both ends
        concat_tree = IntervalTree()
        pieces = [np.zeros(half_filter)]
        pieces.extend(interval.data for interval in sorted(final_smooth_tree))
        pieces.append(np.zeros(half_filter))
        concat_array = np.concatenate(pieces)
        concat_tree.addi(final_smooth_tree.begin(), final_smooth_tree.end(), concat_array)
        final_smooth_tree = IntervalTree()

        # Smooth mutations inside regions
        for mutation in mutations:
            if first_smooth_tree[mutation.position]:
                for interval in concat_tree[mutation.position]:
                    # Get index of mutation in concatenated sequence
                    index = ((mutation.position - mutation.region[0])
                             + concat_regions_d[mutation.region[0]].start)
                    # Smooth mutations
                    interval.data[index:(index + filter_len)] += tukey_filter
                mutations_in.append(mutation)

        # Remove extra bp
        for interval in concat_tree:
            final_smooth_tree.addi(
                interval.begin, interval.end, _trimmed(interval.data, half_filter))

    return final_smooth_tree, mutations_in
class CoordinateTranslator(object):
    """Translate between genomic and coding coordinates of a transcript.

    Three interval trees are maintained:

    * ``_genomic_tree`` -- exons keyed by genomic ``[start, stop + 1)``
    * ``_exon_tree``    -- the same exons keyed by coding
      ``[coding_start, coding_stop + 1)``
    * ``_intron_tree``  -- introns keyed by genomic position, plus two
      "straw" introns covering everything before and after the exons

    Every tree entry carries a ``Leaf`` as its data.
    """

    class Leaf(object):
        """A feature together with the coding range assigned to it."""

        def __init__(self, feature, coding_start, coding_stop):
            self.feature = feature
            self.start = feature.start
            self.stop = feature.stop
            self.coding_start = coding_start
            self.coding_stop = coding_stop

        def __str__(self):
            return 'genomic: [%s, %s], coding: [%s, %s]' % (
                self.start, self.stop, self.coding_start, self.coding_stop)

    def __init__(self, exons, introns, strand, coding_offset, coding_length):
        """Index ``exons`` and ``introns`` for coordinate translation.

        Args:
            exons: sequence of exon features exposing ``start``, ``stop``
                and ``length``; walked in reverse on the '-' strand.
            introns: iterable of intron features exposing ``start``/``stop``.
            strand: '+' or anything else (treated as the minus strand).
            coding_offset: the first exon starts at coding position
                ``-coding_offset``.
            coding_length: used for the coding range of the flanking
                "straw" intron on the far side of the coding sequence.
        """
        self.strand = strand
        self.coding_offset = coding_offset
        self.coding_length = coding_length
        self._exon_tree = IntervalTree()
        self._intron_tree = IntervalTree()
        self._genomic_tree = IntervalTree()

        # ``sys.maxint`` only exists on Python 2; fall back to
        # ``sys.maxsize`` so the class also works on Python 3.
        max_coordinate = getattr(sys, 'maxint', sys.maxsize)

        _coding_start = -self.coding_offset
        for exon in (exons if self.strand == '+' else exons[::-1]):
            leaf = Transcript.CoordinateTranslator.Leaf(
                exon, _coding_start, _coding_start + exon.length - 1)
            self._genomic_tree.addi(leaf.start, leaf.stop + 1, leaf)
            self._exon_tree.addi(leaf.coding_start, leaf.coding_stop + 1, leaf)
            # increment
            _coding_start = leaf.coding_stop + 1

        for intron in introns:
            # introns don't have coding coordinates, so use those of
            # adjacent exons
            leaf_genomic_upstream = \
                list(self._genomic_tree[intron.start - 1])[0].data
            leaf_genomic_downstream = \
                list(self._genomic_tree[intron.stop + 1])[0].data
            # NOTE: always assemble intronic offsets w.r.t. to the
            # 'coding stop' position of the upstream CDS
            if self.strand == '+':
                leaf = Transcript.CoordinateTranslator.Leaf(
                    intron,
                    leaf_genomic_upstream.coding_stop,
                    leaf_genomic_downstream.coding_start)
            else:
                leaf = Transcript.CoordinateTranslator.Leaf(
                    intron,
                    leaf_genomic_downstream.coding_stop,
                    leaf_genomic_upstream.coding_start)
            self._intron_tree.addi(leaf.start, leaf.stop + 1, leaf)

        # add introns that are upstream and downstream to the exon
        # sequence
        # TODO: we may not need this, depending on how we choose to handle
        # [start, stop] ranges that occur outside exon ranges
        before_coding = (-1, 0)
        after_coding = (self.coding_length - 1, self.coding_length)
        if self.strand == '+':
            low_side, high_side = before_coding, after_coding
        else:
            # On the minus strand the genomically-low flank lies after the
            # coding sequence, so the two coding ranges swap.
            low_side, high_side = after_coding, before_coding

        # straw upstream (genomic) intron
        straw0 = Feature(
            '.', 0, self._genomic_tree.begin() - 1, self.strand, None)
        leaf0 = Transcript.CoordinateTranslator.Leaf(
            straw0, low_side[0], low_side[1])
        self._intron_tree.addi(straw0.start, straw0.stop, leaf0)

        # straw downstream (genomic) intron
        straw1 = Feature(
            '.', self._genomic_tree.end() + 1, max_coordinate,
            self.strand, None)
        leaf1 = Transcript.CoordinateTranslator.Leaf(
            straw1, high_side[0], high_side[1])
        self._intron_tree.addi(straw1.start, straw1.stop, leaf1)

    def to_coding_range(self, start, stop, hgvs_format=False):
        """Map the genomic range ``[start, stop]`` to a coding range.

        Returns:
            Transcript.CodingRange built from ``(coding_start, coding_stop,
            intron_coding_offset_start, intron_coding_offset_stop)``.  The
            two offsets are 0 unless the respective end falls in an intron.

        Raises:
            AttributeError: when no exon or intron covers ``start`` (or
                ``stop``); the lookup then yields ``None``.
        """
        # from above, introns have a coding_length == 1
        # TODO: set 'intron' attribute on leaves in '_intron_tree'
        # above
        def _is_intron(leaf):
            return leaf.coding_stop - leaf.coding_start == 1

        def _intron_position(leaf, delta0, delta1):
            # In HGVS mode, anchor on whichever flanking exon is nearer;
            # otherwise always anchor on the leaf's coding start.
            if hgvs_format and delta0 > delta1:
                return leaf.coding_stop, -delta1
            return leaf.coding_start, delta0

        # coding start
        range_coding_start = (list(self._genomic_tree[start] |
                                   self._intron_tree[start]) or [None])[0]
        intron_coding_offset_start = 0
        leaf = range_coding_start.data
        if _is_intron(leaf):
            # NOTE(review): on the minus strand the leaf is looked up at
            # ``start`` but the deltas use ``stop`` -- confirm intended.
            if self.strand == '+':
                delta0 = start - leaf.start + 1
                delta1 = leaf.stop + 1 - start
            else:
                delta0 = leaf.stop + 1 - stop
                delta1 = stop - leaf.start + 1
            coding_start, intron_coding_offset_start = \
                _intron_position(leaf, delta0, delta1)
        elif self.strand == '+':
            coding_start = leaf.coding_start + (start - leaf.start)
        else:
            coding_start = leaf.coding_start + (leaf.stop - stop)

        # coding stop
        range_coding_stop = (list(self._genomic_tree[stop] |
                                  self._intron_tree[stop]) or [None])[0]
        intron_coding_offset_stop = 0
        leaf = range_coding_stop.data
        if _is_intron(leaf):
            if self.strand == '+':
                delta0 = stop - leaf.start + 1
                delta1 = leaf.stop + 1 - stop
            else:
                delta0 = leaf.stop + 1 - start
                delta1 = start - leaf.start + 1
            coding_stop, intron_coding_offset_stop = \
                _intron_position(leaf, delta0, delta1)
        elif self.strand == '+':
            coding_stop = leaf.coding_stop - (leaf.stop - stop)
        else:
            coding_stop = leaf.coding_stop - (start - leaf.start)

        return Transcript.CodingRange(
            coding_start,
            coding_stop,
            intron_coding_offset_start,
            intron_coding_offset_stop)

    def to_genomic_ranges(self, coding_start, coding_stop):
        """Return the genomic ranges of all exons overlapping a coding range.

        Exons are reported in sorted coding order ('+' strand) or reversed
        ('-' strand); the first and last range are clipped to the requested
        coding positions.
        """
        genomic_ranges = []
        list_ranges = sorted(self._exon_tree[coding_start:coding_stop + 1],
                             reverse=self.strand == '-')
        for leaf in [r.data for r in list_ranges]:
            # How far the requested range starts/ends inside this exon.
            clip_at_start = max(coding_start - leaf.coding_start, 0)
            clip_at_stop = max(leaf.coding_stop - coding_stop, 0)
            if self.strand == '+':
                genomic_ranges.append(
                    Transcript.GenomicRange(
                        leaf.start + clip_at_start,
                        leaf.stop - clip_at_stop))
            else:
                genomic_ranges.append(
                    Transcript.GenomicRange(
                        leaf.start + clip_at_stop,
                        leaf.stop - clip_at_start))
        return genomic_ranges

    def __str__(self):
        # The previous implementation read a non-existent ``self._tree``
        # and formatted a lazy ``map`` object; list the exon leaves in
        # coding order instead.
        leaves = [str(iv.data)
                  for iv in sorted(self._exon_tree, key=lambda iv: iv.begin)]
        return 'coding sequences: %s' % leaves
class ExonCoords:
    """Exon intervals of one gene, with chromosome, strand and breakpoint.

    The exons passed to the constructor are copied into a new
    ``IntervalTree``; all five fields are exposed as read/write properties
    backed by underscore-prefixed attributes.
    """

    def __init__(self, chromosome, strand, breakpoint, gene_name, exons: IntervalTree):
        self.chromosome = chromosome
        self.strand = strand
        self.breakpoint = breakpoint
        self.gene_name = gene_name
        # Copy so later changes to the caller's tree do not leak in.
        self.exons = IntervalTree(exons)

    # ------------------------------------------------------------------
    # Alternate constructors
    # ------------------------------------------------------------------
    @classmethod
    def fromTuple(cls, a_tuple):
        """Build from ``(chromosome, strand, breakpoint, gene_name, exons)``."""
        chromosome = a_tuple[0]
        strand = a_tuple[1]
        breakpoint = a_tuple[2]
        gene_name = a_tuple[3]
        exons = a_tuple[4]
        return cls(chromosome, strand, breakpoint, gene_name, exons)

    @classmethod
    def copy_without_exons(cls, exc):
        """Copy every field of ``exc`` except the exons, which start empty."""
        no_exons = IntervalTree()
        return cls(exc.chromosome, exc.strand, exc.breakpoint, exc.gene_name, no_exons)

    @classmethod
    def empty(cls):
        """Return a placeholder instance with no exons."""
        return cls("", 0, -1, "", IntervalTree())

    # ------------------------------------------------------------------
    # Reporting
    # ------------------------------------------------------------------
    def print_properties(self):
        """Print all fields between two banner lines."""
        banner = "#########################################"
        print(banner)
        first = str(self.exons.begin())
        last = str(self.exons.end())
        print("coordinates :", self.chromosome + ":" + first + "-" + last)
        print("gene :", self.gene_name)
        print("strand :", self._strand)
        print("breakpoint :", self._breakpoint)
        print("exons :", self._exons)
        print(banner)

    def print_as_bed(self):
        """Print one tab-separated ``chromosome begin end`` line per exon."""
        chromosome = self.chromosome
        for exon in sorted(self.exons):
            columns = (chromosome, str(exon.begin), str(exon.end))
            print("\t".join(columns))

    def begin(self):
        """Lowest begin position over all exons."""
        exon_tree = self.exons
        return exon_tree.begin()

    # ------------------------------------------------------------------
    # Properties
    # ------------------------------------------------------------------
    @property
    def chromosome(self):
        return self._chromosome

    @chromosome.setter
    def chromosome(self, value):
        self._chromosome = value

    @property
    def strand(self):
        return self._strand

    @strand.setter
    def strand(self, value):
        self._strand = value

    @property
    def breakpoint(self):  # int
        return self._breakpoint

    @breakpoint.setter
    def breakpoint(self, value):
        self._breakpoint = value

    @property
    def gene_name(self):
        return self._gene_name

    @gene_name.setter
    def gene_name(self, value):
        self._gene_name = value

    @property
    def exons(self):  # IntervalTree()
        return self._exons

    @exons.setter
    def exons(self, exons):
        self._exons = exons
class IntervalGraph(object): """Base class for undirected interval graphs. The IntervalGraph class allows any hashable object as a node and can associate key/value attribute pairs with each undirected edge. Each edge must have two integers, begin and end for its interval. Self-loops are allowed but multiple edges (two or more edges with the same nodes, begin and end interval) are not. Two nodes can have more than one edge with different overlapping or non-overlapping intervals. Parameters ---------- attr : keyword arguments, optional (default= no attributes) Attributes to add to graph as key=value pairs. Examples -------- Create an empty graph structure (a "null interval graph") with no nodes and no edges. >>> G = dnx.IntervalGraph() G can be grown in several ways. **Nodes:** Add one node at a time: >>> G.add_node(1) Add the nodes from any container (a list, dict, set or even the lines from a file or the nodes from another graph). Add the nodes from any container (a list, dict, set) >>> G.add_nodes_from([2, 3]) >>> G.add_nodes_from(range(100, 110)) **Edges:** G can also be grown by adding edges. This can be considered the primary way to grow G, since nodes with no edge will not appear in G in most cases. See ``G.to_snapshot()``. Add one edge, which starts at 0 and ends at 10. Keep in mind that the interval is [0, 10). Thus, it does not include the end. >>> G.add_edge(1, 2, 0, 10) a list of edges, >>> G.add_edges_from([(1, 2, 0, 10), (1, 3, 3, 11)]) If some edges connect nodes not yet in the graph, the nodes are added automatically. There are no errors when adding nodes or edges that already exist. **Attributes:** Each interval graph, node, and edge can hold key/value attribute pairs in an associated attribute dictionary (the keys must be hashable). By default these are empty, but can be added or changed using add_edge, add_node. Keep in mind that the edge interval is not an attribute of the edge. 
>>> G = dnx.IntervalGraph(day="Friday") >>> G.graph {'day': 'Friday'} Add node attributes using add_node(), add_nodes_from() >>> G.add_node(1, time='5pm') >>> G.add_nodes_from([3], time='2pm') Add edge attributes using add_edge(), add_edges_from(). >>> G.add_edge(1, 2, 0, 10, weight=4.7 ) >>> G.add_edges_from([(3, 4, 3, 11), (4, 5, 0, 33)], color='red') **Shortcuts:** Here are a couple examples of available shortcuts: >>> 1 in G # check if node in interval graph during any interval True >>> len(G) # number of nodes in the entire interval graph 5 **Subclasses (Advanced):** Edges in interval graphs are represented by Interval Objects and are kept in an IntervalTree. Both are based on intervaltree available in pypi (https://pypi.org/project/intervaltree). IntervalTree allows for fast interval based search through edges, which makes interval graph analyes possible. The Graph class uses a dict-of-dict-of-dict data structure. The outer dict (node_dict) holds adjacency information keyed by node. The next dict (adjlist_dict) represents the adjacency information and holds edge data keyed by interval object. The inner dict (edge_attr_dict) represents the edge data and holds edge attribute values keyed by attribute names. """ def __init__(self, **attr): """Initialize an interval graph with edges, name, or graph attributes. Parameters ---------- attr : keyword arguments, optional (default= no attributes) Attributes to add to graph as key=value pairs. Examples -------- >>> G = dnx.IntervalGraph() >>> G = dnx.IntervalGraph(name='my graph') >>> G.graph {'name': 'my graph'} """ self.tree = IntervalTree() self.graph = {} # dictionary for graph attributes self._adj = {} self._node = {} self.graph.update(attr) @property def name(self): """String identifier of the interval graph. This interval graph attribute appears in the attribute dict IG.graph keyed by the string `"name"`. as well as an attribute (technically a property) `IG.name`. This is entirely user controlled. 
""" return self.graph.get('name', '') @name.setter def name(self, s): self.graph['name'] = s def __str__(self): """Return the interval graph name. Returns ------- name : string The name of the interval graph. Examples -------- >>> G = dnx.IntervalGraph(name='foo') >>> str(G) 'foo' """ return self.name def __len__(self): """Return the number of nodes. Use: 'len(G)'. Returns ------- nnodes : int The number of nodes in the graph. Examples -------- >>> G = dnx.IntervalGraph() >>> G.add_nodes_from([2, 4, 5]) >>> len(G) 3 """ return len(self._node) def __contains__(self, n): """Return True if n is a node, False otherwise. Use: 'n in G'. Examples -------- >>> G = dnx.IntervalGraph() >>> G.add_node(2) >>> 2 in G True """ try: return n in self._node except TypeError: return False def interval(self): """Return a 2-tuple as (begin, end) interval of the entire interval graph. Note that end is non-inclusive. Examples -------- >>> G = dnx.IntervalGraph() >>> G.add_edges_from([(1, 2, 0, 10), (3, 7, 9, 16)]) >>> G.interval() (0, 16) """ return self.tree.begin(), self.tree.end() def add_node(self, node_for_adding, **attr): """Add a single node `node_for_adding` and update node attributes. Parameters ---------- node_for_adding : node A node can be any hashable Python object except None. attr : keyword arguments, optional Set or change node attributes using key=value. See Also -------- add_nodes_from Examples -------- >>> G = dnx.IntervalGraph() >>> G.add_node(1) >>> G.add_node('Hello') >>> G.number_of_nodes() 2 Use keywords set/change node attributes: >>> G.add_node(1, size=10) >>> G.add_node(3, weight=0.4, UTM=('13S', 382871, 3972649)) Notes ----- A hashable object is one that can be used as a key in a Python dictionary. This includes strings, numbers, tuples of strings and numbers, etc. On many platforms hashable items also include mutables such as NetworkX Graphs, though one should be careful that the hash doesn't change on mutables. 
""" if node_for_adding not in self._node: self._adj[node_for_adding] = {} self._node[node_for_adding] = attr else: # update attr even if node already exists self._node[node_for_adding].update(attr) def add_nodes_from(self, nodes_for_adding, **attr): """Add multiple nodes. Parameters ---------- nodes_for_adding : iterable container A container of nodes (list, dict, set, etc.). OR A container of (node, attribute dict) tuples. Node attributes are updated using the attribute dict. attr : keyword arguments, optional (default= no attributes) Update attributes for all nodes in nodes. Node attributes specified in nodes as a tuple take precedence over attributes specified via keyword arguments. See Also -------- add_node Examples -------- >>> G = dnx.IntervalGraph() >>> G.add_nodes_from('Hello') >>> G.has_node('e') True Use keywords to update specific node attributes for every node. >>> G.add_nodes_from([1, 2], size=10) >>> G.add_nodes_from([3, 4], weight=0.4) Use (node, attrdict) tuples to update attributes for specific nodes. >>> G.add_nodes_from([(1, dict(size=11)), (2, {'color':'blue'})]) """ for n in nodes_for_adding: # keep all this inside try/except because # CPython throws TypeError on n not in self._node, # while pre-2.7.5 ironpython throws on self._adj[n] try: if n not in self._node: self._adj[n] = {} self._node[n] = attr.copy() else: self._node[n].update(attr) except TypeError: nn, ndict = n if nn not in self._node: self._adj[nn] = {} self._node[nn] = attr.copy() self._node[nn].update(ndict) else: self._node[nn].update(attr) self._node[nn].update(ndict) def number_of_nodes(self, begin=None, end=None): """Return the number of nodes in the interval graph between the given interval. Parameters ---------- begin: integer, optional (default= beginning of the entire interval graph) Inclusive beginning time of the node appearing in the interval graph. 
end: integer, optional (default= end of the entire interval graph + 1) Non-inclusive ending time of the node appearing in the interval graph. Must be bigger than begin. Note that the default value is shifted up by 1 to make it an inclusive end. Returns ------- nnodes : int The number of nodes in the interval graph. See Also -------- __len__ Examples -------- >>> G = dnx.IntervalGraph() >>> G.add_edges_from([(1, 2, 0, 5), (3, 4, 8, 11)]) >>> len(G) 4 >>> G.number_of_nodes() 4 >>> G.number_of_nodes(begin=6) 2 >>> G.number_of_nodes(begin=5, end=8) # end in non-inclusive 2 >>> G.number_of_nodes(end=8) 4 """ if begin is None and end is None: return len(self._node) if begin is None: begin = self.tree.begin() if end is None: end = self.tree.end() + 1 iedges = self.tree[begin:end] inodes = set() for iv in iedges: inodes.add(iv.data[0]) inodes.add(iv.data[1]) return len(inodes) def has_node(self, n, begin=None, end=None): """Return True if the interval graph contains the node n, during the given interval. Identical to `n in G` when 'begin' and 'end' are not defined. Parameters ---------- n : node begin: integer, optional (default= beginning of the entire interval graph) Inclusive beginning time of the node appearing in the interval graph. end: integer, optional (default= end of the entire interval graph + 1) Non-inclusive ending time of the node appearing in the interval graph. Must be bigger than begin. Note that the default value is shifted up by 1 to make it an inclusive end. 
Examples -------- >>> G = dnx.IntervalGraph() >>> G.add_ndoe(1) >>> G.has_node(1) True It is more readable and simpler to use >>> 0 in G True With interval query: >>> G.add_edge(3, 4, 2, 5) >>> G.has_node(3) True >>> G.has_node(3, begin=2) True >>> G.has_node(3, end=2) # end is non-inclusive False """ try: exists_node = n in self._node except TypeError: exists_node = False if (begin is None and end is None) or not exists_node: return exists_node if begin is None: begin = self.tree.begin() if end is None: end = self.tree.end() + 1 iedges = self._adj[n].keys() for iv in iedges: if iv.overlaps(begin=begin, end=end): return True return False def nodes(self, begin=None, end=None, data=False, default=None): """A NodeDataView of the IntervalGraph nodes. A nodes is considered to be present during an interval, if it has an edge with overlapping interval. Parameters ---------- begin: integer, optional (default= beginning of the entire interval graph) Inclusive beginning time of the node appearing in the interval graph. end: integer, optional (default= end of the entire interval graph + 1) Non-inclusive ending time of the node appearing in the interval graph. Must be bigger than begin. Note that the default value is shifted up by 1 to make it an inclusive end. data : string or bool, optional (default=False) The node attribute returned in 2-tuple (n, dict[data]). If False, return just the nodes n. default : value, optional (default=None) Value used for nodes that don't have the requested attribute. Only relevant if data is not True or False. Returns ------- NodeDataView A NodeDataView iterates over `(n, data)` and has no set operations. When called, if data is False, an iterator over nodes. Otherwise an iterator of 2-tuples (node, attribute value) where data is True. 
Examples -------- There are two simple ways of getting a list of all nodes in the graph: >>> G = dnx.IntervalGraph() >>> G.add_edges_from([(1, 2, 3, 10), (2, 4, 1, 11), (6, 4, 12, 19), (2, 4, 8, 15)]) [1, 2, 4, 6] To get the node data along with the nodes: >>> G.add_nodes_from([(1, {'time': '1pm'}), (2, {'time': '2pm'}), (4, {'time': '4pm'}), (6, {'day': 'Friday'})]) [(1, {'time': '1pm'}), (2, {'time': '2pm'}), (4, {'time': '4pm'}), (6, {'day': 'Friday'})] >>> G.nodes(data="time") [(1, '1pm'), (2, '2pm'), (4, '4pm'), (6, None)] >>> G.nodes(data="time", default="5pm") [(1, '1pm'), (2, '2pm'), (4, '4pm'), (6, '5pm')] To get nodes which appear in a specific interval. nodes without an edge are not considered present. >>> G.nodes(begin=11, data=True) [(2, {'time': '2pm'}), (4, {'time': '4pm'}), (6, {'day': 'Friday'})] >>> G.nodes(begin=4, end=12) # non-inclusive end [1, 2, 4] """ if begin is None and end is None: return NodeDataView(self._node, data=data, default=default) if begin is None: begin = self.tree.begin() if end is None: end = self.tree.end() + 1 iedges = self.tree[begin:end] inodes = set() for iv in iedges: inodes.add(iv.data[0]) inodes.add(iv.data[1]) node_dict = {n: self._node[n] for n in inodes} return NodeDataView(node_dict, data=data, default=default) def remove_node(self, n, begin=None, end=None): """Remove the presence of a node n within the given interval. Removes the presence node n and all adjacent edges within the given interval. If interval is specified, all the edges of n will be removed within that interval. Quiet if n is not in the interval graph. Parameters ---------- n : node A node in the graph begin: integer, optional (default= beginning of the entire interval graph) Inclusive beginning time of the node appearing in the interval graph. end: integer, optional (default= end of the entire interval graph + 1) Non-inclusive ending time of the node appearing in the interval graph. Must be bigger than begin. 
Note that the default value is shifted up by 1 to make it an inclusive end. Examples -------- >>> G.add_edges_from([(1, 2, 3, 10), (2, 4, 1, 11), (6, 4, 12, 19), (2, 4, 8, 15)]) >>> G.add_nodes_from([(1, {'time': '1pm'}), (2, {'time': '2pm'}), (4, {'time': '4pm'})]) >>> G.nodes(begin=4, end=6) [1, 2, 4, 6] >>> G.remove_node(2, begin=4, end=6) >>> G.nodes(begin=4, end=6) [4, 6] >>> G.nodes(data=True) [(1, {'time': '1pm'}), (2, {'time': '2pm'}), (4, {'time': '4pm'}), (6, {})] >>> G.remove_node(2) >>> G.nodes(data=True) [(1, {'time': '1pm'}), (4, {'time': '4pm'}), (6, {})] """ if n not in self._node: return if begin is None and end is None: for iedge in list(self._adj[n].keys()): self.__remove_iedge(iedge) else: if begin is None: begin = self.tree.begin() if end is None: end = self.tree.end() + 1 for iedge in self.tree[begin:end]: if iedge.data[0] == n or iedge.data[1] == n: self.__remove_iedge(iedge) # delete the node and its attributes if no edge left if len(self._adj[n]) == 0: self._adj.pop(n, None) self._node.pop(n, None) def add_edge(self, u, v, begin, end, **attr): """Add an edge between u and v, during interval [begin, end). The nodes u and v will be automatically added if they are not already in the interval graph. Edge attributes can be specified with keywords or by directly accessing the edge's attribute dictionary. See examples below. Parameters ---------- u, v : nodes Nodes can be, for example, strings or numbers. Nodes must be hashable (and not None) Python objects. begin: orderable type Inclusive beginning time of the edge appearing in the interval graph. end: orderable type Non-inclusive ending time of the edge appearing in the interval graph. Must be bigger than begin. attr : keyword arguments, optional Edge data (or labels or objects) can be assigned using keyword arguments. See Also -------- add_edges_from : add a collection of edges Notes ----- Adding an edge that already exists updates the edge data. 
Both begin and end must be the same type across all edges in the interval graph. Also, to create snapshots, both must be integers. Many NetworkX algorithms designed for weighted graphs use an edge attribute (by default `weight`) to hold a numerical value. Examples -------- The following all add the edge e=(1, 2, 3, 10) to graph G: >>> G = dnx.IntervalGraph() >>> e = (1, 2, 3, 10) >>> G.add_edge(1, 2, 3, 10) # explicit two-node form with interval >>> G.add_edge(*e) # single edge as tuple of two nodes and interval >>> G.add_edges_from([(1, 2, 3, 10)]) # add edges from iterable container Associate data to edges using keywords: >>> G.add_edge(1, 2, 3, 10 weight=3) >>> G.add_edge(1, 3, 4, 9, weight=7, capacity=15, length=342.7) """ iedge = self.__get_iedge_in_tree(begin, end, u, v) # if edge exists, just update attr if iedge is not None: # since both point to the same attr, updating one is enough self._adj[u][iedge].update(attr) return iedge = Interval(begin, end, (u, v)) # add nodes if u not in self._node: self._adj[u] = {} self._node[u] = {} if v not in self._node: self._adj[v] = {} self._node[v] = {} # add edge try: self.tree.add(iedge) except ValueError: raise NetworkXError( "IntervalGraph: edge duration must be strictly bigger than zero {0}." .format(iedge)) self._adj[u][iedge] = self._adj[v][iedge] = attr def add_edges_from(self, ebunch_to_add, **attr): """Add all the edges in ebunch_to_add. Parameters ---------- ebunch_to_add : container of edges Each edge given in the container will be added to the interval graph. The edges must be given as as 4-tuples (u, v, being, end). Both begin and end must be orderable and the same type across all edges. attr : keyword arguments, optional Edge data (or labels or objects) can be assigned using keyword arguments. See Also -------- add_edge : add a single edge Notes ----- Adding the same edge (with the same interval) twice has no effect but any edge data will be updated when each duplicate edge is added. 
Examples -------- >>> G = dnx.IntervalGraph() >>> G.add_edges_from([(1, 2, 3, 10), (2, 4, 1, 11)]) # using a list of edge tuples Associate data to edges >>> G.add_edges_from([(1, 2, 3, 10), (2, 4, 1, 11)], weight=3) >>> G.add_edges_from([(3, 4, 2, 19), (1, 4, 1, 3)], label='WN2898') """ for e in ebunch_to_add: if len(e) != 4: raise NetworkXError( "Edge tuple {0} must be a 4-tuple.".format(e)) self.add_edge(e[0], e[1], e[2], e[3], **attr) def has_edge(self, u, v, begin=None, end=None, overlapping=True): """Return True if there exists an edge between u and v in the interval graph, during the given interval. Parameters ---------- u, v : nodes Nodes can be, for example, strings or numbers. Nodes must be hashable (and not None) Python objects. begin : integer, optional (default= beginning of the entire interval graph) Inclusive beginning time of the node appearing in the interval graph. end : integer, optional (default= end of the entire interval graph + 1) Non-inclusive ending time of the node appearing in the interval graph. Must be bigger than begin. Note that the default value is shifted up by 1 to make it an inclusive end. overlapping : bool, optional (default= True) if True, it returns True if there exists an edge between u and v with overlapping interval with `begin` and `end`. if False, it returns true only if there exists an edge between u and v with the exact interval. Note: if False, both `begin` and `end` must be defined, otherwise an exception is raised. 
Raises ------ NetworkXError If `begin` and `end` are not defined and `overlapping= False` Examples -------- >>> G = dnx.IntervalGraph() >>> G.add_edges_from([(1, 2, 3, 10), (2, 4, 1, 11)]) >>> G.has_edge(1, 2) True With specific overlapping interval: >>> G.has_edge(1, 2, begin=2) True >>> G.has_edge(2, 4, begin=12) False Exact interval match: >>> G.has_edge(2, 4, begin=1, end=11) True >>> G.has_edge(2, 4, begin=2, end=11) False """ if begin is None and end is None: for iv in self._adj[u].keys(): if iv.data[0] == v or iv.data[1] == v: return True return False if not overlapping: if begin is None or end is None: raise NetworkXError( "For exact interval match (overlapping=False), both begin and end must be defined." ) return self.__get_iedge_in_tree(u, v, begin, end) is not None if begin is None: begin = self.tree.begin() if end is None: end = self.tree.end() + 1 for iv in self._adj[u].keys(): if (iv.data[0] == v or iv.data[1] == v) and iv.overlaps( begin=begin, end=end): return True return False def edges(self, u=None, v=None, begin=None, end=None, data=False, default=None): """A list of Interval objects of the IntervalGraph edges. All edges which are present within the given interval. All parameters are optional. `u` and `v` can be thought of as constraints. If no node is defined, all edges within the interval are returned. If one node is defined, all edges which have that node as one end, will be returned, and finally if both nodes are defined then all edges between the two nodes are returned. Parameters ---------- u, v : nodes, optional (default=None) Nodes can be, for example, strings or numbers. Nodes must be hashable (and not None) Python objects. If the node does not exist in the graph, a key error is raised. begin: integer, optional (default= beginning of the entire interval graph) Inclusive beginning time of the edge appearing in the interval graph. 
end: integer, optional (default= end of the entire interval graph + 1) Non-inclusive ending time of the edge appearing in the interval graph. Must be bigger than begin. Note that the default value is shifted up by 1 to make it an inclusive end. data : string or bool, optional (default=False) If True, return 2-tuple (Interval object, dict of attributes). If False, return just the Interval objects. If string (name of the attribute), return 2-tuple (Interval object, attribute value). default : value, optional (default=None) Default Value to be used for edges that don't have the requested attribute. Only relevant if `data` is a string (name of an attribute). Returns ------- List of Interval objects An interval object has the following format: (begin, end, (u, v)) When called, if `data` is False, a list of interval objects. If `data` is True, a list of 2-tuples: (Interval, dict of attribute(s) with values), If `data` is a string, a list of 2-tuples (Interval, attribute value). Examples -------- To get a list of all edges: >>> G = dnx.IntervalGraph() >>> G.add_edges_from([(1, 2, 3, 10), (2, 4, 1, 11), (6, 4, 12, 19), (2, 4, 8, 15)]) >>> G.edges() [Interval(8, 15, (2, 4)), Interval(3, 10, (1, 2)), Interval(1, 11, (2, 4)), Interval(12, 19, (6, 4))] To get edges which appear in a specific interval: >>> G.edges(begin=10) [Interval(12, 19, (6, 4)), Interval(1, 11, (2, 4)), Interval(8, 15, (2, 4))] >>> G.edges(end=5) [Interval(3, 10, (1, 2)), Interval(1, 11, (2, 4))] >>> G.edges(begin=2, end=4) [Interval(3, 10, (1, 2)), Interval(1, 11, (2, 4))] To get edges with either of the two nodes being defined: >>> G.edges(u=2) [Interval(3, 10, (1, 2)), Interval(1, 11, (2, 4)), Interval(8, 15, (2, 4))] >>> G.edges(u=2, begin=11) [Interval(1, 11, (2, 4)), Interval(8, 15, (2, 4))] >>> G.edges(u=2, v=4, end=8) [Interval(1, 11, (2, 4))] >>> G.edges(u=1, v=6) [] To get a list of edges with data: >>> G = dnx.IntervalGraph() >>> G.add_edge(1, 3, 1, 4, weight=8, height=18) >>> G.add_edge(1, 2, 
3, 10, weight=10) >>> G.add_edge(2, 6, 2, 10) >>> G.edges(data="weight") [(Interval(2, 8, (2, 3)), None), (Interval(3, 10, (1, 2)), 10), (Interval(1, 4, (1, 3)), 8)] >>> G.edges(data="weight", default=5) [(Interval(2, 8, (2, 3)), 5), (Interval(3, 10, (1, 2)), 10), (Interval(1, 4, (1, 3)), 8)] >>> G.edges(data=True) [(Interval(2, 8, (2, 3)), {}), (Interval(3, 10, (1, 2)), {'weight': 10}), (Interval(1, 4, (1, 3)), {'height': 18, 'weight': 8})] >>> G.edges(u=1, begin=5, end=9, data="weight") [(Interval(3, 10, (1, 2)), 10)] """ # If non of the nodes are defined the interval tree is queried for the list of edges, # otherwise the edges are returned based on the nodes in the self._adj.o if u is None and v is None: if begin is None and end is None: iedges = self.tree.all_intervals # interval filtering else: if begin is None: begin = self.tree.begin() if end is None: end = self.tree.end() + 1 iedges = self.tree[begin:end] else: # Node filtering if u is not None and v is not None: iedges = [ iv for iv in self._adj[u].keys() if iv.data[0] == v or iv.data[1] == v ] elif u is not None: iedges = self._adj[u].keys() else: iedges = self._adj[v].keys() # Interval filtering if begin is not None and end is not None: iedges = [ iv for iv in iedges if iv.end >= begin and iv.begin < end ] elif begin is not None: iedges = [iv for iv in iedges if iv.end >= begin] elif end is not None: iedges = [iv for iv in iedges if iv.begin < end] # Appending attribute data if needed if data is False: return iedges if isinstance(iedges, list) else list(iedges) if data is True: return [(iv, self._adj[iv.data[0]][iv]) for iv in iedges] return [(iv, self._adj[iv.data[0]][iv][data]) if data in self._adj[iv.data[0]][iv].keys() else (iv, default) for iv in iedges] def remove_edge(self, u, v, begin=None, end=None, overlapping=True): """Remove the edge between u and v in the interval graph, during the given interval. Quiet if the specified edge is not present. 
Parameters ---------- u, v : nodes Nodes can be, for example, strings or numbers. Nodes must be hashable (and not None) Python objects. begin : integer, optional (default= beginning of the entire interval graph) Inclusive beginning time of the edge appearing in the interval graph. end : integer, optional (default= end of the entire interval graph + 1) Non-inclusive ending time of the edge appearing in the interval graph. Must be bigger than begin. Note that the default value is shifted up by 1 to make it an inclusive end. overlapping : bool, optional (default= True) if True, remove the edge between u and v with overlapping interval with `begin` and `end`. if False, remove the edge between u and v with the exact interval. Note: if False, both `begin` and `end` must be defined, otherwise an exception is raised. Raises ------ NetworkXError If `begin` and `end` are not defined and `overlapping= False` Examples -------- >>> G = dnx.IntervalGraph() >>> G.add_edges_from([(1, 2, 3, 10), (2, 4, 1, 11), (6, 4, 5, 9), (1, 2, 8, 15)]) >>> G.remove_edge(1, 2) >>> G.has_edge(1, 2) False With specific overlapping interval >>> G = dnx.IntervalGraph() >>> G.add_edges_from([(1, 2, 3, 10), (2, 4, 1, 11), (6, 4, 5, 9), (1, 2, 8, 15)]) >>> G.remove_edge(1, 2, begin=2, end=4) >>> G.has_edge(1, 2, begin=2, end=4) False >>> G.has_edge(1, 2) True Exact interval match >>> G.remove_edge(2, 4, begin=1, end=11, overlapping=False) >>> G.has_edge(2, 4, begin=1, end=11) False """ # remove edge between u and v with the exact given interval if not overlapping: if begin is None or end is None: raise NetworkXError( "For exact interval match (overlapping=False), both begin and end must be defined." 
) iedge = self.__get_iedge_in_tree(u, v, begin, end) if iedge is None: return self.__remove_iedge(iedge) return iedges_to_remove = [] # remove every edge between u and v if begin is None and end is None: for iv in self._adj[u].keys(): if iv.data[0] == v or iv.data[1] == v: iedges_to_remove.append(iv) # remove edge between u and v with overlapping interval with the given interval if begin is None: begin = self.tree.begin() if end is None: end = self.tree.end() + 1 for iv in self._adj[u].keys(): if (iv.data[0] == v or iv.data[1] == v) and iv.overlaps( begin=begin, end=end): iedges_to_remove.append(iv) # removing found iedges for iv in iedges_to_remove: self.__remove_iedge(iv) def __remove_iedge(self, iedge): """Remove the interval edge from the interval graph. Quiet if the specified edge is not present. Parameters ---------- iedge : Interval object Interval edge to be removed. Examples -------- >>> G = dnx.IntervalGraph() >>> G.add_edge(1, 2, 3, 10) >>> iedge = Interval(3, 10, (1, 2)) # Interval(begin, end, (u, v)) >>> G.__remove_iedge(iedge) """ self.tree.discard(iedge) self._adj[iedge.data[0]].pop(iedge, None) self._adj[iedge.data[1]].pop(iedge, None) def __get_iedge_in_tree(self, u, v, begin, end): """Return interval edge if found in the interval graph with the exact interval, otherwise return None. Parameters ---------- u, v : nodes Nodes can be, for example, strings or numbers. Nodes must be hashable (and not None) Python objects. begin : integer Inclusive beginning time of the edge appearing in the interval graph. end : integer Non-inclusive ending time of the edge appearing in the interval graph. Must be bigger than begin. 
Examples -------- >>> G = dnx.IntervalGraph() >>> G.add_edge(1, 2, 3, 10) >>> G.__get_iedge_in_tree(2, 1, 3, 10) Interval(3, 10, (1, 2)) >>> G.__get_iedge_in_tree(2, 1, 4, 10) None """ temp_iedge = Interval(begin, end, (u, v)) if temp_iedge in self.tree: return temp_iedge temp_iedge = Interval(begin, end, (v, u)) if temp_iedge in self.tree: return temp_iedge return None def to_subgraph(self, begin, end, multigraph=False, edge_data=False, edge_interval_data=False, node_data=False): """Return a networkx Graph or MultiGraph which includes all the nodes and edges which have overlapping intervals with the given interval. Parameters ---------- begin: integer Inclusive beginning time of the edge appearing in the interval graph. Must be bigger than begin. end: integer Non-inclusive ending time of the edge appearing in the interval graph. multigraph: bool, optional (default= False) If True, a networkx MultiGraph will be returned. If False, networkx Graph. edge_data: bool, optional (default= False) If True, edges will keep their attributes. edge_interval_data: bool, optional (default= False) If True, each edge's attribute will also include its begin and end interval data. If `edge_data= True` and there already exist edge attributes with names begin and end, they will be overwritten. node_data : bool, optional (default= False) if True, each node's attributes will be included. See Also -------- to_snapshots : divide the interval graph to snapshots Notes ----- If multigraph= False, and edge_data=True or edge_interval_data=True, in case there are multiple edges, only one will show with one of the edge's attributes. Note: nodes with no edges will not appear in any subgraph. 
Examples -------- >>> G = dnx.IntervalGraph() >>> G.add_edges_from([(1, 2, 3, 10), (2, 4, 1, 11), (6, 4, 12, 19), (2, 4, 8, 15)]) >>> H = G.to_subgraph(4, 12) >>> type(H) <class 'networkx.classes.graph.Graph'> >>> list(H.edges(data=True)) [(1, 2, {}), (2, 4, {})] >>> H = G.to_subgraph(4, 12, edge_interval_data=True) >>> type(H) <class 'networkx.classes.graph.Graph'> >>> list(H.edges(data=True)) [(1, 2, {'end': 10, 'begin': 3}), (2, 4, {'end': 15, 'begin': 8})] >>> M = G.to_subgraph(4, 12, multigraph=True, edge_interval_data=True) >>> type(M) <class 'networkx.classes.multigraph.MultiGraph'> >>> list(M.edges(data=True)) [(1, 2, {'end': 10, 'begin': 3}), (2, 4, {'end': 11, 'begin': 1}), (2, 4, {'end': 15, 'begin': 8})] """ if end <= begin: raise NetworkXError( "IntervalGraph: subgraph duration must be strictly bigger than zero: " "begin: {}, end: {}.".format(begin, end)) iedges = self.tree[begin:end] if multigraph: G = MultiGraph() else: G = Graph() if edge_data and edge_interval_data: G.add_edges_from((iedge.data[0], iedge.data[1], dict(self._adj[iedge.data[0]][iedge], begin=iedge.begin, end=iedge.end)) for iedge in iedges) elif edge_data: G.add_edges_from((iedge.data[0], iedge.data[1], self._adj[iedge.data[0]][iedge].copy()) for iedge in iedges) elif edge_interval_data: G.add_edges_from((iedge.data[0], iedge.data[1], { 'begin': iedge.begin, 'end': iedge.end }) for iedge in iedges) else: G.add_edges_from( (iedge.data[0], iedge.data[1]) for iedge in iedges) # include node attributes if node_data: G.add_nodes_from((n, self._node[n].copy()) for n in G.nodes) return G def to_snapshots(self, number_of_snapshots, multigraph=False, edge_data=False, edge_interval_data=False, node_data=False, return_length=False): """Return a list of networkx Graph or MultiGraph objects as snapshots of the interval graph in consecutive order. Parameters ---------- number_of_snapshots : integer Number of snapshots to divide the interval graph into. Must be bigger than 1. 
multigraph : bool, optional (default= False) If True, a networkx MultiGraph will be returned. If False, networkx Graph. edge_data: bool, optional (default= False) If True, edges will keep their attributes. edge_interval_data : bool, optional (default= False) If True, each edge's attribute will also include its begin and end interval data. If `edge_data= True` and there already exist edge attributes with names begin and end, they will be overwritten. node_data : bool, optional (default= False) if True, each node's attributes will be included. return_length : bool, optional (default= False) If true, the length of snapshots will be returned as the second argument. See Also -------- to_subgraph : subgraph based on an interval Notes ----- In order to create snapshots, begin and end interval objects of the interval graph must be numbers. If multigraph= False, and edge_data=True or edge_interval_data=True, in case there are multiple edges, only one will show with one of the edge's attributes. Examples -------- Snapshots of NetworkX Graph >>> G = dnx.IntervalGraph() >>> G.add_edges_from([(1, 2, 3, 10), (2, 4, 1, 11), (6, 4, 12, 19), (2, 4, 8, 15)]) >>> S, l = G.to_snapshots(2, edge_interval_data=True, return_length=True) >>> S [<networkx.classes.graph.Graph object at 0x100000>, <networkx.classes.graph.Graph object at 0x150d00>] >>> l 9.0 >>> for g in S: >>> ... g.edges(data=True)) [(1, 2, {'begin': 3, 'end': 10}), (2, 4, {'begin': 8, 'end': 15})] [(2, 4, {'begin': 8, 'end': 15}), (4, 6, {'begin': 12, 'end': 19})] Snapshots of NetworkX MultiGraph >>> S, l = G.to_snapshots(3, multigraph=True, edge_interval_data=True, return_length=True) >>> S [<networkx.classes.multigraph.MultiGraph object at 0x1060d40b8>, <networkx.classes.multigraph.MultiGraph object at 0x151020c9e8>, <networkx.classes.multigraph.MultiGraph object at 0x151021d390>] >>> l 6.0 >>> for g in S: >>> ... 
g.edges(data=True)) [(1, 2, {'end': 10, 'begin': 3}), (2, 4, {'end': 11, 'begin': 1})] [(1, 2, {'end': 10, 'begin': 3}), (2, 4, {'end': 11, 'begin': 1}), (2, 4, {'end': 15, 'begin': 8}), (4, 6, {'end': 19, 'begin': 12})] [(2, 4, {'end': 15, 'begin': 8}), (4, 6, {'end': 19, 'begin': 12})] """ if number_of_snapshots < 2 or type(number_of_snapshots) is not int: raise NetworkXError( "IntervalGraph: number of snapshots must be an integer and 2 or bigger. " "{0} was passed.".format(number_of_snapshots)) begin, end = self.interval() snapshot_len = (end - begin) / number_of_snapshots snapshots = [] end_inclusive_addition = 0 for i in range(number_of_snapshots): # since to_subgraph is end non-inclusive, shift the end up by 1 to include end in the last snapshot. if i == number_of_snapshots - 1: end_inclusive_addition = 1 snapshots.append( self.to_subgraph(begin + snapshot_len * i, begin + snapshot_len * (i + 1) + end_inclusive_addition, multigraph=multigraph, edge_data=edge_data, edge_interval_data=edge_interval_data, node_data=node_data)) if return_length: return snapshots, snapshot_len return snapshots @staticmethod def load_from_txt(path, delimiter=" ", nodetype=None, comments="#"): """Read interval graph in from path. Every line in the file must be an edge in the following format: "node node begin end". Both interval times must be integers. Nodes can be any hashable objects. Parameters ---------- path : string or file Filename to read. nodetype : Python type, optional Convert nodes to this type. comments : string, optional Marker for comment lines delimiter : string, optional Separator for node labels. The default is whitespace. Returns ------- G: IntervalGraph The graph corresponding to the lines in edge list. Examples -------- >>> G=dnx.IntervalGraph.load_from_txt("my_dygraph.txt") The optional nodetype is a function to convert node strings to nodetype. 
For example >>> G=dnx.IntervalGraph.load_from_txt("my_dygraph.txt", nodetype=int) will attempt to convert all nodes to integer type. Since nodes must be hashable, the function nodetype must return hashable types (e.g. int, float, str, frozenset - or tuples of those, etc.) """ ig = IntervalGraph() with open(path, 'r') as file: for line in file: p = line.find(comments) if p >= 0: line = line[:p] if not len(line): continue line = line.rstrip().split(delimiter) u, v, begin, end = line if nodetype is not None: try: u = nodetype(u) v = nodetype(v) except: raise TypeError( "Failed to convert node to type {0}".format( nodetype)) try: begin = int(begin) end = nodetype(end) except: raise TypeError("Failed to convert time to type int") ig.add_edge(u, v, begin, end) return ig
class TaskSet(object):
    """
    Set of tasks kept both in a priority queue and in an interval tree.

    The interval tree stores one interval per task, spanning the task's
    release to its deadline and carrying the task id as data; it is used to
    detect tasks whose intervals overlap (conflicts).
    """

    def __init__(self):
        self._tasksQueue = TaskUnitPriorityQueue()  # keep r1 < r2 < r3 order.
        self._intervalTree = IntervalTree()

    @property
    def tasks(self):
        return self._tasksQueue.items()

    def add(self, task):
        """
        Register a task; raises DuplicateTaskException if its id is known.
        """
        if self._tasksQueue.contains(task.taskID):
            raise DuplicateTaskException
        self._addTaskToTree(task)
        self._tasksQueue.push(task)

    def _addTaskToTree(self, task):
        """
        Insert the [release, deadline) interval of the task into the tree.
        """
        self._intervalTree.addi(begin=task.release,
                                end=task.deadline,
                                data=task.taskID)

    def remove(self, task):
        """
        Drop the task from the interval tree and from the queue.
        """
        self._intervalTree.discardi(task.release, task.deadline, task.taskID)
        self._tasksQueue.remove(task.taskID)

    def _findLatestInterval(self, intervals):
        """
        Return the interval with the greatest begin (first one wins on ties).
        """
        latest = intervals[0]
        for candidate in intervals[1:]:
            if candidate.begin > latest.begin:
                latest = candidate
        return latest

    def _orIntervals(self, intervalListA, intervalListB):
        """
        Return the union of both interval lists, without duplicates.
        """
        union = set(intervalListA) | set(intervalListB)
        return list(union)

    def _conflictPath(self, interval, intervalTree):
        """
        @param interval The interval to find conflicts with.
        @param intervalTree The intervalTree that contains all intervals

        Follows the chain of overlapping (conflicting) intervals starting at
        the given interval. For example: if A and B conflict and B and C
        conflict and A is the interval we're looking for conflicts with,
        the returned intervals will be A, B, C.

        Another example: if D and E conflict and F and G conflict, and we're
        looking for all conflicts with D, only D and E will be returned as
        F and G are not overlapping with either D and E.

        The given tree is modified while walking the chain.
        """
        overlapping = list(intervalTree.search(interval))

        # the interval only overlaps itself: nothing conflicts with it
        if len(overlapping) == 1 and overlapping[0] == interval:
            return []

        # continue the walk from the latest overlapping interval
        latest = self._findLatestInterval(overlapping)

        # everything overlapping was seen already; drop it from the tree ...
        intervalTree.remove_overlap(interval)

        # ... except the latest one, which is needed for the next step
        intervalTree.add(latest)

        remaining = self._conflictPath(latest, intervalTree)
        return self._orIntervals(overlapping, remaining)

    def _intervalConflictAlreadyDetected(self, interval, conflicts):
        """
        Tell whether the interval already belongs to a detected conflict.
        """
        return any(member == interval
                   for conflict in conflicts
                   for member in conflict)

    def findConflicts(self):
        """
        Split the task set into conflicting and non-conflicting intervals.

        Returns a pair of ConflictSet objects: the conflicts first, then
        the intervals that conflict with nothing.
        """
        tree = self._intervalTree
        lower = tree.begin()
        upper = tree.end()

        detected = []
        conflictObjs = []
        nonConflictsObjs = []

        for interval in sorted(tree[lower:upper]):
            # skip intervals that are part of a conflict found earlier
            if self._intervalConflictAlreadyDetected(interval, detected):
                continue

            # work on a copy: _conflictPath modifies the tree it is given
            path = self._conflictPath(interval, tree.copy())

            if len(path) > 0:
                detected.append(path)
                conflictObjs.append(Conflict(path))
            else:
                nonConflictsObjs.append(Conflict(interval))

        return ConflictSet(conflictObjs), ConflictSet(nonConflictsObjs)

    def __iter__(self):
        return self._tasksQueue
def precise_extension(dict_transcript, dict_exon_signal, gene_col,
                      coverage_stringtie):
    """Extend transcripts towards nearby signals, without running into other genes.

    For every chromosome of `dict_transcript` a new IntervalTree is built that
    holds each transcript, either unchanged or extended by at most 5000
    positions past its end ("+" strand) or before its begin ("-" strand).
    An extension is made when a signal of the same strand is found in that
    window of `dict_exon_signal`; outside the intron branch the signal must
    also satisfy `signal.data[1] * length > coverage_stringtie * 200`.

    Transcripts that already overlap a transcript of another gene on the same
    strand are collected and their first record is written to
    "errors_file.txt" (the file is rewritten on every call).

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation was lost; the pairing of the trailing `else` branches should
    be confirmed against the upstream file.

    Parameters
    ----------
    dict_transcript : dict
        Maps a chromosome to an interval tree of transcripts. The data of a
        transcript is indexed as a list of records; record column 6 is compared
        with "+"/"-", columns 3 and 4 are converted with int() and used as
        positions (presumably GTF fields - TODO confirm).
    dict_exon_signal : dict
        Maps a chromosome to an interval tree of signals. `signal.data[0]` is
        compared with "+"/"-", `signal.data[1]` is multiplied by the signal length.
    gene_col : int
        Index of the record column compared to tell whether two transcripts
        belong to the same gene.
    coverage_stringtie : number
        Multiplied by 200 to obtain the threshold a signal must exceed.

    Returns
    -------
    dict
        Chromosome -> IntervalTree of (possibly extended) transcripts.
    """
    precisely_extended_dict = {}
    overlapped_transcripts = []
    coverage = coverage_stringtie * 200  # Average length of an exon = 200 bp.
    # Flag telling whether the introns of a gene can be the exons of another
    # one. It is hard-coded to False, so the intron branches below never run.
    intron_exon = False
    for chromosome in dict_transcript:
        # Create a new dictionary with the same layout as dict_transcript.
        # NOTE(review): the key is stored as str(chromosome) but read back as
        # `chromosome` below; this only works if the keys already are strings.
        precisely_extended_dict[str(chromosome)] = IntervalTree()
        for transcript in sorted(dict_transcript[chromosome]):
            # 0 means "no transcript of another gene found yet".
            overlap_start = 0
            # The extension flag defaults to False for each transcript.
            extension = False
            # Case where the transcript is from the positive strand.
            if transcript.data[0][6] == "+":
                # Check if there are other transcripts in the area to extend.
                if len(dict_transcript[chromosome][transcript.end +
                                                   1:transcript.end +
                                                   5001]) != 0:
                    exons_it = IntervalTree()
                    introns_it = IntervalTree()
                    max_extension = 0
                    for transcript_in_iv in sorted(
                            dict_transcript[chromosome][transcript.end +
                                                        1:transcript.end +
                                                        5001]):
                        # If the other transcript is from the same strand but
                        # not the same gene, store its exons in an interval
                        # tree and remember where the overlap starts.
                        if transcript_in_iv.data[0][
                                gene_col] != transcript.data[0][
                                    gene_col] and transcript_in_iv.data[0][
                                        6] == "+":
                            if overlap_start == 0:
                                if transcript_in_iv.begin > transcript.end:
                                    overlap_start = transcript_in_iv.begin
                                # If transcripts are already overlapping before
                                # extension, there is an error in the original GTF.
                                else:
                                    overlap_start = transcript.end + 1
                                    overlapped_transcripts.append(transcript)
                            for exon_in_transcript in transcript_in_iv.data:
                                if int(exon_in_transcript[3]) > transcript.end:
                                    exons_it[int(exon_in_transcript[3]) +
                                             1:int(exon_in_transcript[4]
                                                   )] = "exon"
                                else:
                                    continue
                    # Back to the case where there is an overlapping issue.
                    if len(exons_it) > 1:
                        # If there is a signal in the area where the overlap
                        # starts in the stringtie output.
                        if chromosome in dict_exon_signal:
                            if len(dict_exon_signal[chromosome]
                                   [overlap_start:transcript.end +
                                    5001]) != 0 and intron_exon == True:
                                exons_it.merge_overlaps()
                                # Convert the exon interval tree into an intron one.
                                for exon_number, exons in enumerate(
                                        sorted(exons_it)):
                                    if exon_number == 0:
                                        previous_end = exons.end
                                    else:
                                        introns_it[previous_end +
                                                   1:exons.begin] = "intron"
                                        previous_end = exons.end
                                # Check if a signal overlaps the introns and
                                # set the maximal extension accordingly.
                                for signal in sorted(
                                        dict_exon_signal[chromosome]
                                    [overlap_start:introns_it.end()],
                                        reverse=True):
                                    if signal.data[0] == "+":
                                        for intron in sorted(introns_it,
                                                             reverse=True):
                                            if signal.end > intron.begin and signal.begin < intron.end and signal.end <= transcript.end + 5001:
                                                if signal.end < intron.end:
                                                    max_extension = signal.end
                                                else:
                                                    max_extension = intron.end
                                                extension = True
                                                break
                                        if max_extension != 0:
                                            new_transcript_end = max_extension
                                            break
                                        else:
                                            continue
                                # Case where no signal overlaps the introns.
                                if max_extension == 0:
                                    if len(dict_exon_signal[chromosome]
                                           [transcript.end +
                                            1:overlap_start]) != 0:
                                        for signal in sorted(
                                                dict_exon_signal[chromosome]
                                            [transcript.end + 1:overlap_start],
                                                reverse=True):
                                            if signal.data[
                                                    0] == "+" and signal.end <= transcript.end + 5001:
                                                new_transcript_end = signal.end
                                                extension = True
                                                break
                                    else:
                                        extension = False
                            # Case where no signal overlaps the other transcripts.
                            else:
                                if len(
                                        dict_exon_signal[chromosome]
                                    [transcript.end + 1:overlap_start]) != 0:
                                    # Signals are visited from the farthest one
                                    # back towards the transcript.
                                    for signal in sorted(
                                            dict_exon_signal[chromosome]
                                        [transcript.end + 1:overlap_start],
                                            reverse=True):
                                        if signal.data[
                                                0] == "+" and signal.end <= transcript.end + 5001 and signal.end < overlap_start:
                                            if signal.data[1] * (
                                                    signal.end -
                                                    signal.begin) > coverage:
                                                new_transcript_end = signal.end
                                                extension = True
                                                break
                                else:
                                    extension = False
                    elif len(exons_it) == 1:
                        if chromosome in dict_exon_signal:
                            if len(dict_exon_signal[chromosome]
                                   [transcript.end +
                                    1:exons_it.begin()]) != 0:
                                for signal in sorted(
                                        dict_exon_signal[chromosome]
                                    [transcript.end:exons_it.begin()],
                                        reverse=True):
                                    if signal.data[
                                            0] == "+" and signal.end <= exons_it.begin(
                                            ) - 1:
                                        if signal.data[1] * (
                                                signal.end -
                                                signal.begin) > coverage:
                                            new_transcript_end = signal.end
                                            extension = True
                                            break
                            else:
                                extension = False
                        else:
                            extension = False
                else:
                    # No other transcript in the window: if a signal from the
                    # stringtie output lies between the end of the transcript
                    # and the window limit, save the signal's end.
                    if chromosome in dict_exon_signal:
                        if len(dict_exon_signal[chromosome]
                               [transcript.end + 1:transcript.end +
                                5001]) != 0:
                            for signal in sorted(
                                    dict_exon_signal[chromosome]
                                [transcript.end:transcript.end + 5001],
                                    reverse=True):
                                if signal.data[
                                        0] == "+" and signal.end <= transcript.end + 5001:
                                    if signal.data[1] * (
                                            signal.end -
                                            signal.begin) > coverage:
                                        new_transcript_end = signal.end
                                        extension = True
                                        break
                        else:
                            extension = False
                    else:
                        extension = False
                # When extension is true, the end of the transcript is replaced
                # by the signal's end and the result is added to the new dict.
                if extension is True:
                    # Deep copy so that the input transcript is left untouched.
                    modified_transcript = copy.deepcopy(transcript)
                    modified_transcript.data[-1][4] = str(new_transcript_end)
                    modified_transcript.data[-1][1] = "BestScriptEver"
                    modified_transcript.data[-1].append(
                        "extension +" +
                        str(new_transcript_end - transcript.end))
                    precisely_extended_dict[chromosome][
                        int(transcript.begin):int(new_transcript_end
                                                  )] = modified_transcript.data
                # Otherwise, the unmodified transcript is added.
                else:
                    precisely_extended_dict[chromosome][
                        int(transcript.begin):int(transcript.end
                                                  )] = transcript.data
            # Case where the transcript is from the negative strand.
            if transcript.data[0][6] == "-":
                # Check if there are other transcripts in the area to extend.
                if len(dict_transcript[chromosome]
                       [transcript.begin - 5000:transcript.begin]) != 0:
                    exons_it = IntervalTree()
                    introns_it = IntervalTree()
                    max_extension = 0
                    for transcript_in_iv in sorted(
                            dict_transcript[chromosome][transcript.begin -
                                                        5000:transcript.begin],
                            reverse=True):
                        # If the other transcript is from the same strand but
                        # not the same gene, store its exons in an interval
                        # tree and remember where the overlap starts.
                        if transcript_in_iv.data[0][
                                gene_col] != transcript.data[0][
                                    gene_col] and transcript_in_iv.data[0][
                                        6] == "-":
                            if overlap_start == 0:
                                if transcript_in_iv.begin < transcript.begin:
                                    overlap_start = transcript_in_iv.end
                                # If transcripts are already overlapping before
                                # extension, there is an error in the original GTF.
                                else:
                                    overlap_start = transcript.begin - 1
                                    overlapped_transcripts.append(transcript)
                            for exon_in_transcript in transcript_in_iv.data:
                                if int(exon_in_transcript[4]
                                       ) < transcript.begin:
                                    exons_it[int(exon_in_transcript[3]) +
                                             1:int(exon_in_transcript[4]
                                                   )] = "exon"
                                else:
                                    continue
                    # Back to the case where there is an overlapping issue.
                    if len(exons_it) > 1:
                        # If there is a signal in the area where the overlap
                        # starts in the stringtie output.
                        if chromosome in dict_exon_signal:
                            if len(dict_exon_signal[chromosome]
                                   [transcript.begin - 5000:overlap_start +
                                    1]) != 0 and intron_exon == True:
                                exons_it.merge_overlaps()
                                # Convert the exon interval tree into an intron one.
                                for exon_number, exons in enumerate(
                                        sorted(exons_it)):
                                    if exon_number == 0:
                                        previous_end = exons.end
                                    else:
                                        introns_it[previous_end +
                                                   1:exons.begin] = "intron"
                                        previous_end = exons.end
                                # Check if a signal overlaps the introns and
                                # set the maximal extension accordingly.
                                for signal in sorted(
                                        dict_exon_signal[chromosome]
                                    [introns_it.begin():overlap_start + 1]):
                                    if signal.data[0] == "-":
                                        for intron in sorted(introns_it):
                                            if signal.begin < intron.end and signal.end > intron.begin and signal.begin >= transcript.begin - 5000:
                                                if signal.begin > intron.begin:
                                                    max_extension = signal.begin
                                                else:
                                                    max_extension = intron.begin
                                                extension = True
                                                break
                                        if max_extension != 0:
                                            new_transcript_end = max_extension
                                            break
                                        else:
                                            continue
                                # Case where no signal overlaps the introns.
                                if max_extension == 0:
                                    if len(
                                            dict_exon_signal[chromosome]
                                        [overlap_start:transcript.begin]) != 0:
                                        for signal in sorted(
                                                dict_exon_signal[chromosome]
                                            [overlap_start:transcript.begin]):
                                            if signal.data[
                                                    0] == "-" and signal.begin >= transcript.begin - 5001:
                                                new_transcript_end = signal.begin
                                                extension = True
                                                break
                                    else:
                                        extension = False
                            # Case where no signal overlaps the other transcripts.
                            else:
                                if len(dict_exon_signal[chromosome]
                                       [overlap_start:transcript.begin]) != 0:
                                    # Signals are visited from the farthest one
                                    # towards the transcript.
                                    for signal in sorted(
                                            dict_exon_signal[chromosome]
                                        [overlap_start:transcript.begin]):
                                        if signal.data[
                                                0] == "-" and signal.begin >= transcript.begin - 5001 and signal.begin > overlap_start:
                                            if signal.data[1] * (
                                                    signal.end -
                                                    signal.begin) > coverage:
                                                new_transcript_end = signal.begin
                                                extension = True
                                                break
                                else:
                                    extension = False
                    elif len(exons_it) == 1:
                        if chromosome in dict_exon_signal:
                            if len(dict_exon_signal[chromosome]
                                   [exons_it.end():transcript.begin]) != 0:
                                for signal in sorted(
                                        dict_exon_signal[chromosome]
                                    [exons_it.end():transcript.begin]):
                                    if signal.data[
                                            0] == "-" and signal.begin >= exons_it.end(
                                            ) + 1:
                                        if signal.data[1] * (
                                                signal.end -
                                                signal.begin) > coverage:
                                            new_transcript_end = signal.begin
                                            extension = True
                                            break
                            else:
                                extension = False
                        else:
                            extension = False
                else:
                    # No other transcript in the window: if a signal from the
                    # stringtie output lies between the window limit and the
                    # begin of the transcript, save the signal's begin.
                    if chromosome in dict_exon_signal:
                        if len(dict_exon_signal[chromosome]
                               [transcript.begin -
                                5000:transcript.begin]) != 0:
                            for signal in sorted(
                                    dict_exon_signal[chromosome]
                                [transcript.begin - 5000:transcript.begin]):
                                if signal.data[
                                        0] == "-" and signal.begin >= transcript.begin - 5000:
                                    if signal.data[1] * (
                                            signal.end -
                                            signal.begin) > coverage:
                                        new_transcript_end = signal.begin
                                        extension = True
                                        break
                        else:
                            extension = False
                    else:
                        extension = False
                # When extension is true, the begin of the transcript is
                # replaced by the signal's begin and the result is added to
                # the new dict.
                if extension is True:
                    # Deep copy so that the input transcript is left untouched.
                    modified_transcript = copy.deepcopy(transcript)
                    modified_transcript.data[0][3] = str(new_transcript_end)
                    modified_transcript.data[0][1] = "BestScriptEver"
                    modified_transcript.data[0].append("extension " +
                                                       str(new_transcript_end -
                                                           transcript.begin))
                    precisely_extended_dict[chromosome][
                        int(new_transcript_end
                            ):int(transcript.end)] = modified_transcript.data
                # Otherwise, the unmodified transcript is added.
                else:
                    precisely_extended_dict[chromosome][
                        int(transcript.begin):int(transcript.end
                                                  )] = transcript.data
    # Report the transcripts that overlapped another gene before any extension.
    with open("errors_file.txt", "w") as filout:
        for ovlp_transcript in overlapped_transcripts:
            filout.write("{}\n".format(ovlp_transcript.data[0]))
    return precisely_extended_dict
class TemporalNodeCollection(NodeCollection):
    """A collection of temporal nodes.

    Besides what ``NodeCollection`` keeps, the collection owns an
    ``IntervalTree`` (``self._events``) in which every interval carries the
    uid of the node registered for that time span.
    """

    def __init__(self, *args, **kwargs) -> None:
        """Initialize the collection; all arguments go to the base class."""
        # initialize the base class
        super().__init__(*args, **kwargs)

        # initialize an intervaltree to save events (interval data = uid)
        self._events = IntervalTree()

        # class of objects
        self._default_class: Any = TemporalNode

    @singledispatchmethod
    def __getitem__(self, key: Any) -> Any:
        """Return the element stored under ``key`` via the base class."""
        return super().__getitem__(key)

    @__getitem__.register(slice)  # type: ignore
    @__getitem__.register(int)  # type: ignore
    @__getitem__.register(float)  # type: ignore
    def _(self, key: Union[int, float, slice]) -> Any:
        # pylint: disable=arguments-differ
        """Yield temporal views of the nodes registered around ``key``.

        ``key`` is turned into a time window by ``_get_start_end``; every
        interval of the event tree overlapping that window is visited in
        sorted order.  This is a generator: nothing runs before the first
        item is requested.
        """
        query_start, query_end, _ = _get_start_end(key)

        # NOTE: each hit is sliced with its *own* interval bounds, not with
        # the bounds of the query window.
        for begin, finish, uid in sorted(self._events[query_start:query_end]):
            yield from self[uid][begin:finish]

    @property
    def start(self):
        """Smallest begin value stored in the event tree."""
        return self._events.begin()

    @property
    def end(self):
        """Largest end value stored in the event tree."""
        return self._events.end()

    @property
    def events(self):
        """The interval tree holding the temporal events."""
        return self._events

    @singledispatchmethod
    def add(self, *args, **kwargs: Any) -> None:
        """Add multiple nodes; delegates unchanged to the base class."""
        super().add(*args, **kwargs)

    def _add(self, obj: Any, **kwargs: Any) -> None:
        """Add a node and register its time span in the event tree.

        The span is taken from the first two values of ``obj.last()`` and is
        stored with ``obj.uid`` as interval data.
        """
        super()._add(obj, **kwargs)
        begin, finish, _ = obj.last()
        self._events[begin:finish] = obj.uid

    def _if_exist(self, obj: Any, **kwargs: Any) -> None:
        """Record a further event on a node that is already stored.

        The stored element is looked up through ``obj.relations``; the
        remaining keyword arguments are passed to its ``event`` method and
        the span of ``obj.last()`` is registered under the stored uid.
        """
        # 'count' is not used here, but it must not reach element.event().
        kwargs.pop('count', None)

        element = self[obj.relations]
        element.event(**kwargs)

        begin, finish, _ = obj.last()
        self._events[begin:finish] = element.uid

    def _remove(self, obj) -> None:
        """Remove a node together with every interval registered for it."""
        # Snapshot the matches first: the tree must not be mutated while it
        # is being iterated.  No ordering is needed for deletion.
        stale = [interval for interval in self._events
                 if interval.data == obj.uid]
        for interval in stale:
            self._events.remove(interval)

        super()._remove(obj)
class TemporalPathPyObject(PathPyObject):
    """Base class for a temporal object.

    Events are kept in an ``IntervalTree``; the data attached to each
    interval is the attribute mapping handed to :meth:`event` for that span.
    """

    def __init__(self, uid: Optional[str] = None, **kwargs: Any) -> None:
        """Create the object and record ``kwargs`` as its first event."""
        # initialize the parent class
        super().__init__(uid=uid)

        # placeholders; event() below overwrites both from the tree
        self._start = float('-inf')
        self._end = float('inf')

        # interval tree holding the events
        self._events = IntervalTree()

        # register the initial event
        self.event(**kwargs)

        # remember the tree size so _clean_events can detect changes
        self._len_events = len(self._events)

    def __iter__(self):
        """Iterate over all events in sorted order.

        The same object (``self``) is yielded for every interval; before each
        yield ``self._attributes`` is replaced by the interval's data plus
        ``'start'`` and ``'end'`` keys.  After the last interval those two
        keys are removed from the current attributes again.
        """
        self._clean_events()

        for begin, finish, data in sorted(self._events):
            self._attributes = {'start': begin, 'end': finish, **data}
            yield self

        for bound in ('start', 'end'):
            self._attributes.pop(bound, None)

    @singledispatchmethod
    def __getitem__(self, key: Any) -> Any:
        """Return attribute ``key`` of the interval reported by ``last()``.

        ``None`` is returned when that interval's data has no such key.
        """
        self._clean_events()

        _begin, _finish, latest = self.last()
        return latest.get(key, None)

    @__getitem__.register(tuple)  # type: ignore
    def _(self, key: tuple) -> Any:
        """Look up attributes inside the time window given by ``key[0]``.

        The data of all intervals overlapping the window is merged in sorted
        order (later intervals win).  For a two-element key the value of
        ``key[1]`` is returned (``None`` if absent), otherwise the whole
        merged mapping.
        """
        lower, upper, _ = _get_start_end(key[0])

        merged: dict = {}
        for interval in sorted(self._events[lower:upper]):
            merged.update(interval.data.items())

        if len(key) == 2:
            return merged.get(key[1], None)
        return merged

    @__getitem__.register(slice)  # type: ignore
    @__getitem__.register(int)  # type: ignore
    @__getitem__.register(float)  # type: ignore
    def _(self, key: Union[int, float, slice]) -> Any:
        """Iterate over the events overlapping the window given by ``key``.

        Works like ``__iter__`` but only visits the intervals returned by
        slicing the event tree with the window from ``_get_start_end``.
        """
        lower, upper, _ = _get_start_end(key)
        self._clean_events()

        for begin, finish, data in sorted(self._events[lower:upper]):
            self._attributes = {'start': begin, 'end': finish, **data}
            yield self

        for bound in ('start', 'end'):
            self._attributes.pop(bound, None)

    @singledispatchmethod
    def __setitem__(self, key: Any, value: Any) -> None:
        """Set ``key`` to ``value`` over the full extent of the event tree."""
        first = self._events.begin()
        final = self._events.end()
        self.event(start=first, end=final, **{key: value})

    @__setitem__.register(tuple)  # type: ignore
    def _(self, key: tuple, value: Any) -> None:
        """Set ``key[1]`` to ``value`` for the window given by ``key[0]``."""
        lower, upper, _ = _get_start_end(key[0])
        attribute = {key[1]: value}
        self.event(start=lower, end=upper, **attribute)

    @property
    def start(self):
        """``'start'`` of the current attributes, else the stored start."""
        return self.attributes.get('start', self._start)

    @property
    def end(self):
        """``'end'`` of the current attributes, else the stored end."""
        return self.attributes.get('end', self._end)

    def _clean_events(self):
        """Normalize the event tree if its size changed since the last run.

        Overlapping intervals are split and intervals with equal bounds are
        merged, combining their data dicts.
        """
        if len(self._events) == self._len_events:
            # size unchanged since the last clean-up: nothing to do
            return

        # BUG: There is a bug in the intervaltree library
        # merge_equals switches old and new data randomly
        def combine(old, new):
            return {**old, **new}

        # split overlapping intervals
        self._events.split_overlaps()

        # combine the dicts of intervals that now share the same bounds
        self._events.merge_equals(data_reducer=combine)

        # remember the new size of the tree
        self._len_events = len(self._events)

    def event(self, *args, **kwargs) -> None:
        """Add a temporal event, or cut a time span out of the tree.

        With ``active`` true (the default) the remaining attributes are
        stored for the span resolved by ``_get_start_end`` and become the
        current attributes; with ``active`` false that span is chopped out
        of the tree instead.
        """
        # 'active' decides between adding and removing; it is not stored
        active = kwargs.pop('active', True)

        # resolve the time span and the attributes that remain
        begin, finish, attributes = _get_start_end(*args, **kwargs)

        if not active:
            self._events.chop(begin, finish)
        else:
            self._events[begin:finish] = attributes  # type: ignore
            self._attributes = attributes.copy()

        # refresh the cached extent of the tree
        self._start = self._events.begin()
        self._end = self._events.end()

    def last(self):
        """Return ``(begin, end, data)`` of the interval that sorts last.

        Raises ``IndexError`` when the event tree holds no interval.
        """
        ordered = sorted(self._events)
        tail = ordered[-1]
        return tail.begin, tail.end, tail.data