def test_chop_datafunc():
    """Check that chop() passes each surviving fragment through datafunc.

    Every case chops a fresh tree holding the single interval [0, 10) and
    compares the sorted remainder against the expected fragments.
    """

    def datafunc(iv, islower):
        # Indexing the interval with the boolean picks the bound that the
        # chop replaced: iv[True] is iv[1] (end), iv[False] is iv[0] (begin).
        return "oldlimit: {0}, islower: {1}".format(iv[islower], islower)

    lower_data = 'oldlimit: 10, islower: True'
    upper_data = 'oldlimit: 0, islower: False'

    # ((chop_begin, chop_end), expected sorted contents)
    cases = [
        ((3, 7), [Interval(0, 3, lower_data), Interval(7, 10, upper_data)]),
        ((0, 7), [Interval(7, 10, upper_data)]),
        ((5, 10), [Interval(0, 5, lower_data)]),
        ((-5, 15), []),
        ((0, 10), []),
    ]

    for (chop_begin, chop_end), expected in cases:
        t = IntervalTree([Interval(0, 10)])
        t.chop(chop_begin, chop_end, datafunc)
        assert len(t) == len(expected)
        assert sorted(t) == expected
def interval_fragments(start, end, intervals):
    """Return the parts of [start, end) not claimed by any of ``intervals``.

    The whole range starts out as one interval tagged ``'fragment'``; the
    span ``[begin, end)`` of every entry in ``intervals`` is then chopped
    out of it. Whatever is left is returned as a list of Interval objects
    (in no particular order).
    """
    unclaimed = IntervalTree()
    unclaimed.addi(start, end, 'fragment')
    for claimed in intervals:
        unclaimed.chop(claimed.begin, claimed.end)
    return list(unclaimed)
def split_at_nstretch(
    interval: Interval,
    left: Coords,
    right: Coords,
    nstretches: Sequence[Interval],
) -> List[Interval]:
    """Cut ``interval`` at its best N-stretch and label the remaining pieces.

    The N-stretch chosen by ``find_best_nstretch`` is chopped out of
    ``interval``. A remaining piece that still starts at ``interval.begin``
    is tagged with ``[left]``; one that still ends at ``interval.end`` is
    tagged with ``[right]``.

    Returns:
        A list of 0, 1 or 2 new Interval objects whose data is a
        one-element list holding ``left`` or ``right``.
    """
    cut = find_best_nstretch(interval, left, right, nstretches)

    remainder = IntervalTree([interval])
    remainder.chop(cut.begin, cut.end)
    pieces = list(remainder)

    # 0 pieces: the whole interval was N; 1 piece: the N-stretch abuts one
    # end; 2 pieces: the N-stretch was internal.
    assert len(pieces) <= 2, pieces

    labelled = []
    for piece in pieces:
        keeps_begin = piece.begin == interval.begin
        keeps_end = piece.end == interval.end
        # A piece identical to the input would mean nothing was chopped.
        assert not (keeps_begin and keeps_end)

        if keeps_begin:
            side = left
        elif keeps_end:
            side = right
        else:
            side = None
        assert side is not None

        labelled.append(Interval(piece.begin, piece.end, [side]))

    return labelled
def find_intersection_with_interval(self, interval: Interval, sample: str):
    """Find the calls of ``sample`` overlapping ``interval``, truncated to it.

    Every intersecting call is clipped so that nothing extends past
    ``interval.start`` or ``interval.end``. Gaps between calls are not
    filled in by this method.

    Note: we assume that the calls in the callset do not overlap for a
    single sample.

    Args:
        interval: a given interval
        sample: sample from the callset

    Returns:
        A list of ``(Interval, event_type)`` tuples sorted by position.
        When no call intersects, the single tuple
        ``(interval, EventType.NO_CALL)`` is returned.
    """
    assert sample in self.sample_names, "Sample %s is not in the callset" % sample
    calls = self.sample_to_calls_map.get(sample)
    intersecting_calls = calls.find_intersection(interval)
    if not intersecting_calls:
        return [(interval, EventType.NO_CALL)]

    result = IntervalTree([
        TreeInterval(call.interval.start, call.interval.end, call.event_type)
        for call in intersecting_calls
    ])

    # Ask the tree for its extremes instead of sorting it twice. Unlike
    # sorted(result)[-1].end, end() is the true maximum even if calls nest.
    lowest_begin = result.begin()
    highest_end = result.end()

    # Trim everything to the right of the interval, then everything to
    # the left of it.
    result.chop(interval.end, max(interval.end, highest_end))
    result.chop(min(interval.start, lowest_begin), interval.start)

    return [(Interval(interval.chrom, t.begin, t.end), t.data)
            for t in sorted(result)]
def pprint(self, depth=0):
    """Print this node: children recursively, the gaps between them as dumps.

    The range ``[self.begin, self.end)`` is split into the ranges of
    ``self.children`` and the gaps left between them. Child ranges carry
    the child node as interval data; gaps carry ``None``, except that the
    first gap carries this node's comment.

    Args:
        depth: nesting level; the comment is indented by ``depth`` spaces
            and children are printed with ``depth + 1``.
    """
    tree = IntervalTree()
    tree.add(Interval(self.begin, self.end))
    for child in self.children:
        # Carve the child's range out of the gaps, then put it back
        # tagged with the child node itself.
        tree.chop(child.begin, child.end)
        tree.add(Interval(child.begin, child.end, child))
    intervals = sorted(tree.items())

    # If a child exists right where we start, emit a comment for this
    # enveloping structure, otherwise the first gap gets our comment.
    comment = ' ' * depth + self.comment
    if isinstance(intervals[0].data, OhaNode):
        oha_comment(self.begin, comment)
    else:
        intervals[0] = Interval(intervals[0].begin, intervals[0].end, comment)

    for interval in intervals:
        if isinstance(interval.data, OhaNode):
            interval.data.pprint(depth + 1)
        else:
            # Read the gap from its own offset. Seeking to self.begin here
            # made every gap after the first dump the wrong bytes.
            self.fp.seek(interval.begin)
            data = self.fp.read(interval.length())
            oha(data, interval.begin, interval.data)
def interval_tree(start_data, stop_data, buffer_len):
    """Build consensus intervals from per-key start/stop lists.

    Each input interval is shrunk by ``buffer_len / 2`` on both sides,
    the chromosome end points are added back without a buffer, and
    overlapping intervals are merged. Any original interval that still
    touches two consensus intervals has the smaller overlap chopped away.

    Args:
        start_data: mapping of key -> indexable sequence of start positions.
        stop_data: mapping with the same keys; ``stop_data[key][i]`` is the
            stop matching ``start_data[key][i]``. The code adds 1 to every
            stop (presumably stops are inclusive; confirm with the caller).
        buffer_len: total buffer size; half of it is removed from each side.

    Returns:
        A ``(starts, stops)`` tuple of lists with the sorted consensus
        interval boundaries.

    Side effects:
        Prints each chopped overlap length and the coverage percentage.
    """
    starts = []
    stops = []
    t = IntervalTree()
    half_buffer = buffer_len / 2.0

    ## Shrink each interval by the buffer size
    # .items() works on Python 2 and 3; .iteritems() is Python 2 only.
    for key, value in start_data.items():
        for i in range(0, len(value)):
            shrunk_start = value[i] + half_buffer
            shrunk_stop = stop_data[key][i] + 1 - half_buffer
            if shrunk_start < shrunk_stop:
                t[shrunk_start:shrunk_stop] = (shrunk_start, shrunk_stop)

    ## Add chromosome endpoints without buffer
    chrom_start, chrom_stop = get_extremes(start_data, stop_data)
    if chrom_start < t.begin() + 1:
        t[chrom_start:t.begin() + 1] = (chrom_start, t.begin() + 1)
    if t.end() - 1 < chrom_stop:
        t[t.end() - 1:chrom_stop] = (t.end() - 1, chrom_stop)

    ## Merge intervals that overlap in tree to get consensus
    t.merge_overlaps()

    ## Check that original intervals only overlap with one consensus interval
    for key, value in start_data.items():
        for i in range(0, len(value)):
            start = value[i]
            stop = stop_data[key][i] + 1
            if len(t[start:stop]) > 1:
                ## If they overlap with more than one
                ## Remove part of consensus interval
                ## This will never be more than the buffer size/2
                assert (len(t[start:stop]) == 2)
                remove_start = 0
                remove_stop = 0
                min_length = float('inf')
                # Keep the smaller of the two overlaps as the part to drop.
                for interval in t[start:stop]:
                    overlap_start, overlap_stop = get_overlap(
                        (start, stop), (interval[0], interval[1]))
                    if (overlap_stop - overlap_start) < min_length:
                        min_length = overlap_stop - overlap_start
                        remove_start = overlap_start
                        remove_stop = overlap_stop
                print(min_length)
                t.chop(remove_start, remove_stop)
                assert (min_length <= half_buffer)
            assert (len(t[start:stop]) < 2)

    ## Get consensus start and stop points
    chrom_len = chrom_stop - chrom_start
    covered = 0.0
    for interval in sorted(t):
        starts.append(interval[0])
        stops.append(interval[1])
        covered = covered + (interval[1] - interval[0])
    print("The percentage of the chromosome covered is: %s" %
          '{0:.2f}'.format((covered / chrom_len) * 100.0))
    return (starts, stops)
def filter_nstretches(
    itrees: Mapping[str, IntervalTree],
    nstretches: Mapping[str, IntervalTree],
    min_non_overlap: int,
) -> None:
    """Remove contigs without much going on outside N stretches.

    For every N-stretch on every scaffold, each contig overlapping it is
    scored by the total ``find_len_non_overlap`` of its non-N fragments
    against the merged non-N fragments of the other contigs overlapping
    it. Contigs scoring below ``min_non_overlap`` are removed from
    ``itrees[scaffold]`` in place.
    """
    for scaffold, nstretch_tree in nstretches.items():
        # Every N-stretch is a potential break point.
        for nstretch in nstretch_tree:
            contig_tree = itrees[scaffold]

            # Shortest alignments first, so they are the first to be tested.
            candidates = sorted(
                contig_tree.overlap(nstretch),
                key=lambda iv: iv.length(),
            )
            doomed = set()

            for contig in candidates:
                # All N-stretches this contig touches, not only the current one.
                touching_n = nstretches[scaffold].overlap(contig)

                # Every contig overlapping this one (itself included), with
                # the N chunks cut out. The "Coords" data survives the chop,
                # so fragments can still be matched to their contig.
                masked = IntervalTree(contig_tree.overlap(contig))
                for gap in touching_n:
                    masked.chop(gap.begin, gap.end)

                # Fragments belonging to the other contigs, merged together.
                others = IntervalTree(
                    iv for iv in masked if iv.data != contig.data)
                others.merge_overlaps()

                # Fragments belonging to the contig under test.
                own = IntervalTree(
                    iv for iv in masked if iv.data == contig.data)

                # New non-N bases this contig contributes.
                novel_bases = sum(
                    [find_len_non_overlap(frag, others) for frag in own])

                if novel_bases < min_non_overlap:
                    doomed.add(contig)

            # Drop only after the scan so the candidate list stays valid.
            for contig in doomed:
                contig_tree.remove(contig)
    return
def _get_uncovered_intervals(domain: Interval, covered_intervals: IntervalTree) -> IntervalTree:
    """Return the parts of ``domain`` that no covered interval overlaps.

    Starts from a tree holding only ``domain`` and chops out the
    ``[begin, end)`` span of every interval in ``covered_intervals``.
    The result is an IntervalTree (possibly empty), not a list.
    """
    remaining = IntervalTree()
    remaining.add(domain)
    for claimed in covered_intervals:
        remaining.chop(claimed.begin, claimed.end)
    return remaining
def sorted_complement(tree, start=None, end=None) -> list:
    """Return the sorted gaps of ``tree`` within ``[start, end)``.

    Args:
        tree: IntervalTree whose intervals are removed from the domain.
        start: lower bound of the domain; defaults to ``tree.begin()``.
        end: upper bound of the domain; defaults to ``tree.end()``.

    Returns:
        A sorted list of Interval objects covering every part of
        ``[start, end)`` that no interval in ``tree`` overlaps. An empty
        or inverted domain yields an empty list.
    """
    if start is None:
        start = tree.begin()
    if end is None:
        end = tree.end()

    # An empty domain has an empty complement. Without this guard addi()
    # rejects the null interval with a ValueError (e.g. for an empty tree,
    # where begin() and end() are both 0).
    if start >= end:
        return []

    result = IntervalTree()
    result.addi(start, end)  # using input tree bounds
    for iv in tree:
        result.chop(iv[0], iv[1])
    return sorted(result)
def find_diff(list_a, list_b):
    """Return the parts of the ``list_a`` intervals not covered by ``list_b``.

    Both arguments are iterables of items whose first two elements are
    the begin and end of an interval. The result is a list of
    ``(begin, end)`` tuples in the tree's iteration order (unsorted).
    """
    remaining = IntervalTree()
    for span in list_a:
        remaining.addi(span[0], span[1])

    for cut in list_b:
        remaining.chop(cut[0], cut[1])

    return [(piece.begin, piece.end) for piece in remaining.items()]
def _get_unparse_intervals_of_inds(
    dfs_inds_to_include: Sequence[int],
    ast: ObjectChoiceNode,
    unparse: UnparseResult
) -> IntervalTree:
    """Given some indices we wish to include, find the intervals of the
    total unparse string which are covered by those indices.

    The AST is walked depth first. Only even positions (the choice nodes)
    are considered. When the walk enters the included set, the node's span
    is added to the tree; when it leaves the set, the node's span is
    chopped out again. Overlapping spans are merged before returning.
    """
    wanted = set(dfs_inds_to_include)
    covered = IntervalTree()
    inside = False

    for ind, pointer in enumerate(ast.depth_first_iter()):
        # Only take into account the choice nodes. Skip the object nodes.
        if ind % 2 != 0:
            continue
        assert isinstance(pointer.cur_node, ObjectChoiceNode)

        now_inside = ind in wanted
        if now_inside == inside:
            # No transition at this node, nothing to record.
            continue
        # The state flips even if the span below turns out to be unusable.
        inside = now_inside

        span = unparse.pointer_to_span(pointer)
        if span is None or span[1] - span[0] == 0:
            continue
        start, end = span

        if now_inside:
            covered.add(Interval(start, end))
        else:
            covered.chop(start, end)

    covered.merge_overlaps()
    return covered
def regionTable(self):
    """
    Get the "region table", a table indicating
    *base*-coordinate-delimited regions, using the same recarray
    dtype that is returned by BasH5Reader.

    *Note* that the regiontable from the StitchedZmwRead will not in
    general be equivalent to that from a bas.h5, if the BAM files
    were produced using bax2bam, because in our BAM encodings, a
    subread or adapter cannot extend beyond the HQ region.
    Additionally there is no concept of a "region score" for the BAM.
    """
    # Group the query extent of every record by its precise read type.
    intervalsByType = defaultdict(list)
    for record in self.bamRecords:
        readType = _preciseReadType(record)
        intervalsByType[readType].append(
            Interval(record.qStart, record.qEnd))

    # Find an HQ region: the whole ZMW read minus the "SCRAP:L" extents.
    hqIntervalTree = IntervalTree([Interval(0, self.zmwReadLength)])
    for lq in intervalsByType["SCRAP:L"]:
        hqIntervalTree.chop(lq.begin, lq.end)
    hqIntervals = list(hqIntervalTree)
    assert len(hqIntervals) in (0, 1)
    hqInterval = hqIntervals[0] if hqIntervals else Interval(0, 0)

    # Adapters, barcodes, and inserts (and filtered inserts)
    regionTypeMap = {
        "SUBREAD": Region.INSERT_REGION,
        "SCRAP:A": Region.ADAPTER_REGION,
        "SCRAP:B": Region.BARCODE_REGION,
        "SCRAP:F": Region.INSERT_REGION,
    }

    # The HQ row comes first, then one row per interval; the last column is 0.
    regions = [(self.holeNumber, Region.HQ_REGION,
                hqInterval.begin, hqInterval.end, 0)]
    for code, regionType in regionTypeMap.items():
        for interval in intervalsByType[code]:
            regions.append((self.holeNumber, regionType,
                            interval.begin, interval.end, 0))

    return toRecArray(REGION_TABLE_DTYPE, regions)
def regionTable(self):
    """
    Get the "region table", a table indicating
    *base*-coordinate-delimited regions, using the same recarray
    dtype that is returned by BasH5Reader.

    *Note* that the regiontable from the StitchedZmwRead will not in
    general be equivalent to that from a bas.h5, if the BAM files
    were produced using bax2bam, because in our BAM encodings, a
    subread or adapter cannot extend beyond the HQ region.
    Additionally there is no concept of a "region score" for the BAM.
    """
    hole = self.holeNumber

    # Bucket each record's query extent under its precise read type.
    intervalsByType = defaultdict(list)
    for rec in self.bamRecords:
        bucket = intervalsByType[_preciseReadType(rec)]
        bucket.append(Interval(rec.qStart, rec.qEnd))

    # Find an HQ region: start from the full read and cut out every
    # "SCRAP:L" extent; at most one piece may remain.
    hqTree = IntervalTree([Interval(0, self.zmwReadLength)])
    for scrap in intervalsByType["SCRAP:L"]:
        hqTree.chop(scrap.begin, scrap.end)
    leftovers = list(hqTree)
    assert len(leftovers) in (0, 1)
    if leftovers:
        hqInterval = leftovers[0]
    else:
        hqInterval = Interval(0, 0)

    # Adapters, barcodes, and inserts (and filtered inserts)
    regionTypeMap = {
        "SUBREAD": Region.INSERT_REGION,
        "SCRAP:A": Region.ADAPTER_REGION,
        "SCRAP:B": Region.BARCODE_REGION,
        "SCRAP:F": Region.INSERT_REGION,
    }

    regions = [(hole, Region.HQ_REGION, hqInterval.begin, hqInterval.end, 0)]
    regions.extend(
        (hole, regionTypeMap[code], iv.begin, iv.end, 0)
        for code in regionTypeMap
        for iv in intervalsByType[code]
    )
    return toRecArray(REGION_TABLE_DTYPE, regions)
def find(self, size: int, data: Optional[int] = None) -> IntervalTree:
    """Pick free intervals of this pool that add up to ``size``.

    The free space is the range ``[0, self.size)`` minus every interval
    in ``self.used_pool``. Free intervals are taken whole until the
    request is met; the last one is truncated if it would overshoot.
    Nothing is marked as used here.

    Parameters
    ----------
    size : int
        The size (amount) of resources to allocate.
    data : Optional[int]
        The identifier of the "owner" of the found resources, stored as
        the data of the returned intervals.

    Returns
    -------
    IntervalTree
        The selected intervals, or an empty tree when ``self.fits(size)``
        is false.
    """
    allocation = IntervalTree()
    if not self.fits(size):
        return allocation

    # Everything not currently in use.
    free = IntervalTree([Interval(0, self.size, data)])
    for busy in self.used_pool:
        free.chop(busy.begin, busy.end)

    gathered: int = 0
    for chunk in free:
        total = ResourcePool.measure(chunk) + gathered
        if total > size:
            # Taking the whole chunk would overshoot: keep only its head.
            allocation.add(
                Interval(chunk.begin, chunk.begin + size - gathered, data))
            break
        allocation.add(chunk)
        if total == size:
            break
        gathered = total

    return allocation
class IpTree:
    """An IntervalTree wrapper in which a newer interval overrides older ones."""

    def __init__(self):
        self.tree = IntervalTree()

    def add_interval(self, begin, end, data):
        """Insert ``[begin, end)`` unless an existing interval contains it.

        Existing intervals fully inside the new one are removed, and the
        parts of the others that overlap it are chopped away before the
        new interval is added.
        """
        candidate = Interval(begin, end, data)

        for existing in self.tree[candidate.begin:candidate.end]:
            if existing.contains_interval(candidate):
                # Already covered by a wider interval: leave the tree as is.
                return
            if candidate.contains_interval(existing):
                self.tree.remove(existing)

        self.tree.chop(candidate.begin, candidate.end)
        self.tree.add(candidate)

    def update_w(self):
        """Store each interval's length under the ``'w'`` key of its data."""
        for iv in self.tree.all_intervals:
            iv.data['w'] = iv.end - iv.begin

    def get_all(self):
        """Return the tree's set of intervals."""
        return self.tree.all_intervals
def test_chop():
    """Check what remains of [0, 10) after chopping various ranges."""
    # ((chop_begin, chop_end), expected sorted contents)
    cases = [
        ((3, 7), [Interval(0, 3), Interval(7, 10)]),   # hole in the middle
        ((0, 7), [Interval(7, 10)]),                   # head removed
        ((5, 10), [Interval(0, 5)]),                   # tail removed
        ((-5, 15), []),                                # chop wider than tree
        ((0, 10), []),                                 # exact match
    ]

    for (chop_begin, chop_end), expected in cases:
        t = IntervalTree([Interval(0, 10)])
        t.chop(chop_begin, chop_end)
        assert len(t) == len(expected)
        assert sorted(t) == expected
class Day(object):
    """Free and booked time of one day, kept in two interval trees."""

    def __init__(self, start, end, dt):
        # dt is added to the end of every booking when free time is removed.
        self.dt = dt
        self.free = IntervalTree([get_iv(start, end)])
        self.booked = IntervalTree([])

    def is_free(self, interval):
        """Return True if ``interval`` overlaps free time and no booking."""
        if not self.free.overlaps(interval):
            return False
        return not self.booked.overlaps(interval)

    def schedule(self, interval):
        """Book ``interval``, removing it plus ``dt`` from the free time."""
        assert self.is_free(interval),\
            "Attempt to double-book: {} - {}".format(
                m2t(interval.begin), m2t(interval.end))

        self.free.chop(interval.begin, interval.end + self.dt)
        self.booked.add(interval)

    def dumps(self):
        """Return the bookings as text, one tab-indented line per booking."""
        return ''.join(
            "\t{} - {}\t{}\n".format(m2t(iv.begin), m2t(iv.end), iv.data)
            for iv in sorted(self.booked)
        )
class Allocator(Publisher):
    """Track allocation state of an address space and republish events.

    Two interval trees cover ``[0, 2**64)``: ``_aa`` holds the allocation
    state (initially all ``AState.REVOKED``) and ``_am`` the mapping state
    (initially all ``AState.UNMAPD``). Incoming events are checked against
    these trees, inconsistencies are logged, and the events are passed on
    through ``_publish``.
    """
# Initialization ------------------------------------------------------ {{{

    # NOTE(review): __init__ also assigns self._ls, which is not listed
    # here; that only works if a base class provides a __dict__ - confirm.
    __slots__ = ('_aa', '_am', '_arg', '_ts')

    def __init__(self, tslam=None, cliargs=[], **kwargs):
        """Set up the state trees and parse the command-line options.

        tslam is called with no arguments whenever a timestamp is needed
        for a log message. cliargs is the argument list handed to
        argparse. **kwargs is accepted but not used here.
        """
        super().__init__()

        self._ts = tslam

        # Allocation state per address; nothing is allocated at start.
        self._aa = IntervalTree()
        self._aa.add(AddrIval(0, 2**64, AState.REVOKED))

        # Mapping state per address; nothing is mapped at start.
        self._am = IntervalTree()
        self._am.add(AddrIval(0, 2**64, AState.UNMAPD))

        self._ls = {}

# Argument parsing ---------------------------------------------------- {{{

        argp = argparse.ArgumentParser()
        argp.add_argument('--fix',
                          action='store_const', const=True, default=False,
                          help="Automatically insert fixups for reports")
        argp.add_argument('--skip-map',
                          action='store_const', const=True, default=False,
                          help="Ignore map/unmap constraints")
        argp.add_argument('--drop-safe',
                          action='store_const', const=True, default=False,
                          help="Suppress warnings for safely dropped events")
        self._arg = argp.parse_args(cliargs)

# --------------------------------------------------------------------- }}}
# --------------------------------------------------------------------- }}}
# Allocation ---------------------------------------------------------- {{{

    def _allocd(self, begin, end):
        """Record [begin, end) as allocated, logging any conflicts found.

        Warns if the range touches unmapped space (unless --skip-map) and
        logs an error if it overlaps an existing allocation. With --fix, a
        'free' event is published for every overlapped allocation.
        """
        overlaps_a = self._aa[begin:end]
        overlaps_m = self._am[begin:end]

        if not self._arg.skip_map:
            overlaps_unmapped = [o for o in overlaps_m
                                 if o.state == AState.UNMAPD]
            if overlaps_unmapped:
                logging.warning("Allocation ts=%d b=%x e=%x overlaps unmap=%r",
                                self._ts(), begin, end, overlaps_unmapped)

            # XXX fix by mapping pages

        overlaps_allocated = [o for o in overlaps_a
                              if o.state == AState.ALLOCD]
        if overlaps_allocated:
            logging.error("Allocation ts=%d b=%x e=%x overlaps alloc=%r",
                          self._ts(), begin, end, overlaps_allocated)
            if self._arg.fix:
                # NOTE(review): this publishes 'free' while freed() below
                # publishes 'freed' - confirm the event name is intended.
                for oa in overlaps_allocated:
                    self._publish('free', '', oa.begin)

        # Replace whatever state the range had with a single ALLOCD interval.
        self._aa.chop(begin, end)
        self._aa.add(AddrIval(begin, end, AState.ALLOCD))

    def allocd(self, stk, begin, end):
        """Handle an allocation event and forward it to subscribers."""
        self._allocd(begin, end)
        self._publish('allocd', stk, begin, end)

# --------------------------------------------------------------------- }}}
# Freeing ------------------------------------------------------------- {{{

    def _freed(self, addr):
        """Record the object at ``addr`` as freed.

        Returns True if the free should be forwarded to subscribers and
        False if it is dropped. When a fixup is needed (``doalloc``), a
        synthetic 'allocd' event for the freed range is published here.
        """
        doalloc = False
        end = addr + 1  # Will be fixed up later
        overlaps_a = self._aa[addr:end]
        overlaps_m = self._am[addr:end]

        if not self._arg.skip_map:
            overlaps_unmapped = [o for o in overlaps_m
                                 if o.state == AState.UNMAPD]
            if overlaps_unmapped:
                logging.error("Free ts=%d a=%x overlaps unmap=%r",
                              self._ts(), addr, overlaps_unmapped)

        allocations = [o for o in overlaps_a if o.state == AState.ALLOCD]
        overlaps_free = [o for o in overlaps_a if o.state == AState.FREED]

        if overlaps_free != []:
            logging.warning("Free ts=%d a=%x overlaps free=%r",
                            self._ts(), addr, overlaps_free)
            if allocations == [] and len(overlaps_free) == 1 \
                    and self._arg.drop_safe:
                # A plain double free with --drop-safe: drop the event.
                return False
            else:
                # Extend the freed range over free intervals starting at
                # or before addr.
                for of in overlaps_free:
                    if of.begin <= addr:
                        end = max(end, of.end)
                if self._arg.fix:
                    doalloc = True

        if len(allocations) > 1 or \
                (allocations != [] and overlaps_free != []):
            logging.error("Free ts=%d a=%x multiply-attested alloc=%r free=%r",
                          self._ts(), addr, allocations, overlaps_free)
        elif allocations == [] and overlaps_free == []:
            logging.warning("Free ts=%d a=%x no corresponding alloc",
                            self._ts(), addr)
            if self._arg.fix and not self._arg.drop_safe:
                doalloc = True
            else:
                assert doalloc == False
                return False
        else:
            for a in allocations:
                if a.begin != addr:
                    # Likely to leave cruft behind, indicative of serious errors
                    logging.error("Free ts=%d a=%x within alloc=%r",
                                  self._ts(), addr, a)
                else:
                    # The free matches the allocation start: take its extent.
                    end = max(end, a.end)

        # Replace whatever state the range had with a single FREED interval.
        self._aa.chop(addr, end)
        self._aa.add(AddrIval(addr, end, AState.FREED))

        if doalloc:
            self._publish('allocd', '', addr, end)

        return True

    def freed(self, stk, addr):
        """Handle a free event; forwarded only if _freed() accepts it."""
        if addr == 0:
            # Just throw out free(NULL)
            return

        if self._freed(addr):
            self._publish('freed', stk, addr)

# --------------------------------------------------------------------- }}}
# Reallocation -------------------------------------------------------- {{{

    def reallocd(self, stk, begin_old, begin_new, end_new):
        """Handle a realloc as a free of the old block plus a new allocation.

        The return value of _freed() is ignored; the 'reallocd' event is
        always published.
        """
        self._freed(begin_old)
        self._allocd(begin_new, end_new)
        self._publish('reallocd', stk, begin_old, begin_new, end_new)

# --------------------------------------------------------------------- }}}
# Mapping ------------------------------------------------------------- {{{

    def mapd(self, stk, begin, end, prot):
        """Forward a map event unchanged; the mapping tree is not updated."""
        # XXX
        self._publish('mapd', stk, begin, end, prot)

# --------------------------------------------------------------------- }}}
# Unmapping ----------------------------------------------------------- {{{

    def unmapd(self, stk, begin, end):
        """Forward an unmap event unchanged; the mapping tree is not updated."""
        # XXX
        self._publish('unmapd', stk, begin, end)

# --------------------------------------------------------------------- }}}
# Revoking ------------------------------------------------------------ {{{

    def revoked(self, stk, spans):
        """Check each revoked (begin, end) span, then forward the event.

        Spans overlapping live allocations are logged. With --fix, a
        'free' event is published for each such allocation. The
        allocation tree itself is not modified here.
        """
        for (begin, end) in spans:
            overlaps = self._aa[begin:end]
            overlaps_allocated = [o for o in overlaps
                                  if o.state == AState.ALLOCD]
            if overlaps_allocated:
                logging.warning("Revocation ts=%d b=%x e=%x overlaps alloc=%r",
                                self._ts(), begin, end, overlaps_allocated)
                if self._arg.fix:
                    for oa in overlaps_allocated:
                        self._publish('free', '', oa.begin)

                # XXX fix by freeing

        self._publish('revoked', stk, spans)

# --------------------------------------------------------------------- }}}
# Size-measurement pass-thru ------------------------------------------ {{{

    def size_measured(self, sz):
        """Forward a size measurement unchanged."""
        self._publish('size_measured', sz)

    def sweep_size_measured(self, sz):
        """Forward a sweep size measurement unchanged."""
        self._publish('sweep_size_measured', sz)
class TemporalPathPyObject(PathPyObject):
    """Base class for a temporal object.

    Events are stored in an IntervalTree; each interval's data is the
    dict of attributes valid over that interval.
    """

    def __init__(self, uid: Optional[str] = None, **kwargs: Any) -> None:
        """Initialize the temporal object.

        All keyword arguments are forwarded to ``event`` to create the
        first event.
        """

        # initialize the parent class
        super().__init__(uid=uid)

        # default start and end time of the object
        self._start = float('-inf')
        self._end = float('inf')

        # initialize an intervaltree to save events
        self._events = IntervalTree()

        # add new events
        self.event(**kwargs)

        # number of events at the last clean-up, used to detect changes
        self._len_events = len(self._events)

    def __iter__(self):
        """Yield this object once per event, in sorted interval order.

        Before each yield ``self._attributes`` is replaced by the event's
        attributes plus its ``start`` and ``end``. The same object is
        yielded every time, so its state is only valid until the next step.
        """
        self._clean_events()

        # create generator
        for start, end, attributes in sorted(self._events):
            self._attributes = {**{'start': start, 'end': end}, **attributes}
            yield self
        # drop the temporary keys again
        self._attributes.pop('start', None)
        self._attributes.pop('end', None)

    @singledispatchmethod
    def __getitem__(self, key: Any) -> Any:
        """Return attribute ``key`` of the event returned by ``last()``."""
        self._clean_events()
        # get the last element
        _, _, last = self.last()
        return last.get(key, None)

    @__getitem__.register(tuple)  # type: ignore
    def _(self, key: tuple) -> Any:
        """Look up attributes within a time range.

        ``key[0]`` is converted to a start/end pair by ``_get_start_end``.
        The attributes of all events in that range are merged in sorted
        order, later events overriding earlier ones. A 2-tuple returns
        the merged value of ``key[1]`` (or None); any other length
        returns the merged dict.
        """
        start, end, _ = _get_start_end(key[0])
        values = {k: v for _, _, o in sorted(
            self._events[start:end]) for k, v in o.items()}
        return values.get(key[1], None) if len(key) == 2 else values

    @__getitem__.register(slice)  # type: ignore
    @__getitem__.register(int)  # type: ignore
    @__getitem__.register(float)  # type: ignore
    def _(self, key: Union[int, float, slice]) -> Any:
        """Yield this object once per event in the given time range.

        This is a generator: indexing with a number or slice returns an
        iterator that behaves like ``__iter__`` restricted to the range.
        """
        start, end, _ = _get_start_end(key)
        self._clean_events()

        # create generator
        for start, end, attributes in sorted(self._events[start:end]):
            self._attributes = {**{'start': start, 'end': end}, **attributes}
            yield self
        # drop the temporary keys again
        self._attributes.pop('start', None)
        self._attributes.pop('end', None)

    @singledispatchmethod
    def __setitem__(self, key: Any, value: Any) -> None:
        """Set ``key`` to ``value`` over the full extent of the events."""
        self.event(start=self._events.begin(),
                   end=self._events.end(), **{key: value})

    @__setitem__.register(tuple)  # type: ignore
    def _(self, key: tuple, value: Any) -> None:
        """Set ``key[1]`` to ``value`` over the time range given by ``key[0]``."""
        start, end, _ = _get_start_end(key[0])
        self.event(start=start, end=end, **{key[1]: value})

    @property
    def start(self):
        """Start of the object.

        Taken from the current attributes if present, otherwise the start
        of the earliest event.
        """
        return self.attributes.get('start', self._start)

    @property
    def end(self):
        """End of the object.

        Taken from the current attributes if present, otherwise the end
        of the latest event.
        """
        return self.attributes.get('end', self._end)

    def _clean_events(self):
        """Helper function to clean events.

        If the number of events changed since the last clean-up,
        overlapping intervals are split at their boundaries and intervals
        with equal ranges are merged by combining their attribute dicts.
        """

        # BUG: There is a bug in the intervaltree library
        # merge_equals switches old and new data randomly
        def reducer(old, new):
            # on a key clash the value from `new` wins
            return {**old, **new}

        if len(self._events) != self._len_events:
            # split overlapping intervals
            self._events.split_overlaps()

            # combine the dict of the overlapping intervals
            self._events.merge_equals(data_reducer=reducer)

            # update the length of the events
            self._len_events = len(self._events)

    def event(self, *args, **kwargs) -> None:
        """Add a temporal event.

        With ``active=True`` (the default) the remaining keyword
        arguments are stored as the attributes of the interval returned
        by ``_get_start_end``. With ``active=False`` that interval is
        removed from all existing events instead.
        """

        # check if object is active or inactive
        active = kwargs.pop('active', True)

        # get start and end time of the event
        start, end, kwargs = _get_start_end(*args, **kwargs)

        if active:
            self._events[start:end] = kwargs  # type: ignore
            self._attributes = kwargs.copy()
        else:
            self._events.chop(start, end)

        # update start and end times
        self._start = self._events.begin()
        self._end = self._events.end()

    def last(self):
        """Return ``(begin, end, data)`` of the last event in sorted order.

        This is the greatest interval by sort order, which is not
        necessarily the one added most recently.
        """
        interval = sorted(self._events)[-1]
        return interval.begin, interval.end, interval.data
class SegmentProducer(object):
    """Split a download into segments and track which ones are complete.

    ``work_pool`` holds the byte ranges still to be fetched and
    ``completed`` the ranges already written. Work is handed to child
    processes through ``q_work`` and finished intervals come back on
    ``q_complete``. ``completed`` is pickled to the download's state
    file so that an interrupted download can be resumed.
    """

    # Number of completed bytes between two writes of the state file.
    save_interval = SAVE_INTERVAL

    def __init__(self, download, n_procs):
        """Load saved state, set up queues and schedule all segments.

        Args:
            download: download description; its ``size`` must be known.
            n_procs: number of consumer processes to split the work over.
        """
        assert download.size is not None,\
            'Segment producer passed uninitizalied Download!'

        self.download = download
        self.n_procs = n_procs

        # Initialize producer
        self.load_state()
        self._setup_pbar()
        self._setup_queues()
        self._setup_work()
        self.schedule()

    def _setup_pbar(self):
        """Create the progress bar for this download."""
        self.pbar = None
        self.pbar = get_pbar(self.download.ID, self.download.size)

    def _setup_work(self):
        """Compute the segment size from the remaining work."""
        if self.is_complete():
            log.info('File already complete.')
            return

        work_size = self.integrate(self.work_pool)
        # Floor division keeps segment bounds integral on Python 3. The
        # floor of 1 stops schedule() from producing empty segments
        # forever when there are more processes than bytes.
        self.block_size = max(1, work_size // self.n_procs)

    def _setup_queues(self):
        """Create the work and completion queues."""
        if WINDOWS:
            self.q_work = Queue()
            self.q_complete = Queue()
        else:
            manager = Manager()
            self.q_work = manager.Queue()
            self.q_complete = manager.Queue()

    def integrate(self, itree):
        """Return the summed length of all intervals in ``itree``."""
        return sum(i.end - i.begin for i in itree.items())

    def validate_segment_md5sums(self):
        """Re-checksum completed segments and drop the corrupt ones.

        Returns True without checking when segment checksums are
        disabled for this download; otherwise returns None. Checking
        stops at the first segment that has no recorded md5sum.
        """
        if not self.download.check_segment_md5sums:
            return True
        corrupt_segments = 0
        # Sorted copy, so removing from self.completed below is safe.
        intervals = sorted(self.completed.items())

        pbar = ProgressBar(widgets=[
            'Checksumming {}: '.format(self.download.ID), Percentage(), ' ',
            Bar(marker='#', left='[', right=']'), ' ', ETA()])

        with mmap_open(self.download.path) as data:
            for interval in pbar(intervals):
                log.debug('Checking segment md5: {}'.format(interval))
                if not interval.data or 'md5sum' not in interval.data:
                    log.error(STRIP(
                        """User opted to check segment md5sums on restart. 
Previous download did not record segment md5sums (--no-segment-md5sums)."""))
                    return
                chunk = data[interval.begin:interval.end]
                checksum = md5sum(chunk)
                if checksum != interval.data.get('md5sum'):
                    log.debug('Redownloading corrupt segment {}, {}.'.format(
                        interval, checksum))
                    corrupt_segments += 1
                    self.completed.remove(interval)

        if corrupt_segments:
            log.warn('Redownloading {} currupt segments.'.format(
                corrupt_segments))

    def load_state(self):
        """Initialise ``work_pool``/``completed``, resuming if possible.

        The download restarts from scratch unless both the state file
        and the partially downloaded file exist and the state file can
        be loaded.
        """
        # Establish default intervals
        self.work_pool = IntervalTree([Interval(0, self.download.size)])
        self.completed = IntervalTree()
        self.size_complete = 0
        if not os.path.isfile(self.download.state_path)\
           and os.path.isfile(self.download.path):
            log.warn(STRIP(
                """A file named '{} was found but no state file was found at at '{}'. Either this file was downloaded to a different location, the state file was moved, or the state file was deleted. Parcel refuses to claim the file has been successfully downloaded and will restart the download.\n""").format(
                    self.download.path, self.download.state_path))
            return

        if not os.path.isfile(self.download.state_path):
            self.download.setup_file()
            return

        # If there is a file at load_path, attempt to remove
        # downloaded sections from work_pool
        log.info('Found state file {}, attempting to resume download'.format(
            self.download.state_path))

        if not os.path.isfile(self.download.path):
            log.warn(STRIP(
                """State file found at '{}' but no file for {}. 
Restarting entire download.""".format(
                    self.download.state_path, self.download.ID)))
            return

        try:
            # NOTE(security): unpickling executes code embedded in the
            # file; the state file must come from a trusted location.
            with open(self.download.state_path, "rb") as f:
                self.completed = pickle.load(f)
            assert isinstance(self.completed, IntervalTree), \
                "Bad save state: {}".format(self.download.state_path)
        except Exception as e:
            self.completed = IntervalTree()
            log.error('Unable to resume file state: {}'.format(str(e)))
        else:
            self.validate_segment_md5sums()
            self.size_complete = self.integrate(self.completed)
            # Whatever is already complete no longer needs to be fetched.
            for interval in self.completed:
                self.work_pool.chop(interval.begin, interval.end)

    def save_state(self):
        """Atomically write ``completed`` to the download's state file.

        The state is pickled to a temporary file in the state directory
        and then renamed over the state file. A KeyboardInterrupt during
        saving removes the temporary file and is swallowed; any other
        error is logged and re-raised.
        """
        # Bound before the try so the interrupt handler can always test it.
        temp = None
        try:
            # Grab a temp file in the same directory (hopefully avoid
            # cross device links) in order to atomically write our save file
            temp = tempfile.NamedTemporaryFile(
                prefix='.parcel_',
                dir=os.path.abspath(self.download.state_directory),
                delete=False)
            # Write completed state
            pickle.dump(self.completed, temp)
            # Make sure all data is written to disk
            temp.flush()
            os.fsync(temp.fileno())
            temp.close()

            # Rename temp file as our save file, this could fail if
            # the state file and the temp directory are on different devices
            if OS_WINDOWS and os.path.exists(self.download.state_path):
                # If we're on windows, there's not much we can do here
                # except stash the old state file, rename the new one,
                # and back up if there is a problem.
                old_path = os.path.join(tempfile.gettempdir(), ''.join(
                    random.choice(string.ascii_lowercase + string.digits)
                    for _ in range(10)))
                try:
                    # stash the old state file
                    os.rename(self.download.state_path, old_path)
                    # move the new state file into place
                    os.rename(temp.name, self.download.state_path)
                    # if no exception, then delete the old stash
                    os.remove(old_path)
                except Exception as msg:
                    log.error('Unable to write state file: {}'.format(msg))
                    try:
                        os.rename(old_path, self.download.state_path)
                    except Exception as restore_error:
                        # Best effort only, but leave a trace of the failure.
                        log.error(
                            'Unable to restore previous state file: {}'.format(
                                restore_error))
                    raise
            else:
                # If we're not on windows, then we'll just try to
                # atomically rename the file
                os.rename(temp.name, self.download.state_path)

        except KeyboardInterrupt:
            if temp is None:
                log.warn('Keyboard interrupt. removing temp save file')
            else:
                log.warn('Keyboard interrupt. removing temp save file {}'.format(
                    temp.name))
                temp.close()
                # The file is already gone if the rename had completed.
                if os.path.exists(temp.name):
                    os.remove(temp.name)
        except Exception as e:
            log.error('Unable to save state: {}'.format(str(e)))
            raise

    def schedule(self):
        """Queue every remaining segment of the work pool."""
        while True:
            interval = self._get_next_interval()
            log.debug('Returning interval: {}'.format(interval))
            if interval is None:
                return
            self.q_work.put(interval)

    def _get_next_interval(self):
        """Cut the next segment off the work pool.

        Returns an Interval of at most ``block_size`` bytes taken from
        the start of the lowest remaining range, or None when the pool
        is empty.
        """
        if not self.work_pool:
            return None
        # min() gives the same interval as sorted(...)[0] without the sort.
        interval = min(self.work_pool)
        start = interval.begin
        end = min(interval.end, start + self.block_size)
        self.work_pool.chop(start, end)
        return Interval(start, end)

    def print_progress(self):
        """Update the progress bar, ignoring any display errors."""
        if not self.pbar:
            return
        try:
            self.pbar.update(self.size_complete)
        except Exception as e:
            log.debug('Unable to update pbar: {}'.format(str(e)))

    def check_file_exists_and_size(self):
        """Return whether the target exists (and, if regular, has full size)."""
        if self.download.is_regular_file:
            return (os.path.isfile(self.download.path)
                    and os.path.getsize(
                        self.download.path) == self.download.size)
        else:
            log.debug('File is not a regular file, refusing to check size.')
            return (os.path.exists(self.download.path))

    def is_complete(self):
        """Return whether all bytes are completed and the file checks out."""
        return (self.integrate(self.completed) == self.download.size and
                self.check_file_exists_and_size())

    def finish_download(self):
        """Tell the children to stop and wait until they all took notice."""
        # Tell the children there is no more work, each child should
        # pull one NoneType from the queue and exit
        for i in range(self.n_procs):
            self.q_work.put(None)

        # Wait for all the children to exit by checking to make sure
        # that everyone has taken their NoneType from the queue.
        # Otherwise, the segment producer will exit before the
        # children return, causing them to read from a closed queue
        log.debug('Waiting for children to report')
        while not self.q_work.empty():
            time.sleep(0.1)

        # Finish the progressbar
        if self.pbar:
            self.pbar.finish()

    def wait_for_completion(self):
        """Collect finished segments until the download is complete.

        The state file is rewritten whenever at least ``save_interval``
        bytes have completed since the last save, and once more when the
        download finishes. Children are always shut down on exit.
        """
        try:
            since_save = 0
            while not self.is_complete():
                while since_save < self.save_interval:
                    interval = self.q_complete.get()
                    self.completed.add(interval)
                    if self.is_complete():
                        break
                    this_size = interval.end - interval.begin
                    self.size_complete += this_size
                    since_save += this_size
                    self.print_progress()
                since_save = 0
                self.save_state()
        finally:
            self.finish_download()
else: print(feat.featuretype) assert (False) if (geneId == None): print("Warning (coding)") print(cds.id) print(cds.attributes) continue # Skip this CDS assert (geneId != None) if (cds.end - cds.start < 1): continue # Skip this CDS codingRegions.addi(cds.start, cds.end, geneId) nonCodingRegions.chop(cds.start, cds.end) # ----------------------------------------------------- # Collect transcribed regions # ----------------------------------------------------- # Note: standard coding genes have the general structure: gene -> mRNA -> CDS # non-coding genes have the general structure XXXX_gene -> noncoding_exon (XXXX can be tRNA, rRNA, snRNA, snoRNA, ncRNA) for mRNA in db.features_of_type(('mRNA', 'noncoding_exon'), limit=selectedChromosome, completely_within=False): geneId = None for gene in db.parents(mRNA.id): if gene.featuretype == 'gene': geneId = gene.id elif gene.featuretype == 'transposable_element_gene' or gene.featuretype == 'LTR_retrotransposon' or gene.featuretype == 'tRNA_gene' or gene.featuretype == 'ncRNA_gene' or gene.featuretype == 'snoRNA_gene' or gene.featuretype == 'rRNA_gene' or gene.featuretype == 'snRNA_gene' or gene.featuretype == 'telomerase_RNA_gene': # TODO - What to do about transposable element genes?!
class IntervalGroup(BaseTree):
    """Mapping-style wrapper around an ``IntervalTree``.

    Entries are addressed either by a ``(begin, end)`` tuple or by the
    equivalent ``"begin-end"`` string label; the stored value is the ``data``
    payload of the matching interval. Assigning to a range first chops that
    range out of the tree, so a newly set interval replaces whatever it
    covers.
    """

    # Backing store: one Interval per entry.
    _tree: IntervalTree

    @staticmethod
    def compatible_keys(keys):
        """Return True if every key in ``keys`` is a 2-tuple of ints."""
        return all(
            isinstance(key, tuple)
            and len(key) == 2
            and all(isinstance(x, int) for x in key)
            for key in keys
        )

    @classmethod
    def from_dict(cls, d):
        """Build a group from a ``{(begin, end): data}`` mapping."""
        ivs = [Interval(*k, v) for k, v in d.items()]
        return cls(IntervalTree(ivs))

    @classmethod
    def from_label_dict(cls, d):
        """Build a group from a ``{"begin-end": data}`` mapping."""
        ivs = [Interval(*map(int, k.split("-")), v) for k, v in d.items()]
        return cls(IntervalTree(ivs))

    def add_group(self, name, group):
        """Store ``group`` under ``name`` (a label or a key tuple)."""
        self[name] = group

    def key_to_label(self, key):
        """Format a ``(begin, end)`` key as the label ``"begin-end"``."""
        return f"{key[0]}-{key[1]}"

    def label_to_key(self, label):
        """Parse a ``"begin-end"`` label into a tuple of ints.

        Raises:
            ValueError: if a dash-separated part is not an integer.
        """
        # The Python 2 builtin ``apply`` is gone in Python 3; ``map`` performs
        # the intended element-wise conversion.
        return tuple(map(int, label.split("-")))

    def to_label_dict(self):
        """Return ``{"begin-end": data}`` for all intervals, in sorted order."""
        return {f"{iv.begin}-{iv.end}": iv.data for iv in sorted(self._tree)}

    def to_dict(self):
        """Return ``{(begin, end): data}`` for all intervals, in sorted order."""
        return {(iv.begin, iv.end): iv.data for iv in sorted(self._tree)}

    def __init__(self, tree=None, *args, **kwargs):
        """Wrap ``tree``, or a new empty ``IntervalTree`` when it is None.

        Extra positional and keyword arguments are accepted and ignored.

        Raises:
            TypeError: if ``tree`` is not an ``IntervalTree``.
        """
        if tree is None:
            tree = IntervalTree()
        if not isinstance(tree, IntervalTree):
            raise TypeError("tree must be an instance of IntervalTree.")
        self._tree = tree

    def __getitem__(self, key):
        """Look up data by label, point, range, collection of points or slice.

        * ``str``: converted with ``label_to_key`` and handled as a tuple.
        * ``int``: ``self.value(key)``.
        * 2-tuple: ``self.overlap_content(begin, end)``.
        * other iterable: ``self.values_at(key)``.
        * ``slice``: without a step, the overlapping intervals clipped to the
          slice; with a step, the values at ``range(start, stop, step)``.
          Missing bounds default to the bounds of the tree.

        Any other key type yields ``None``.
        """
        if isinstance(key, str):
            key = self.label_to_key(key)
        if isinstance(key, int):
            return self.value(key)
        if isinstance(key, tuple) and len(key) == 2:
            return self.overlap_content(*key)
        if isinstance(key, Iterable):
            return self.values_at(key)
        if isinstance(key, slice):
            # Compare against None explicitly so that 0 is a usable bound, and
            # use the resolved bounds for both the stepped and unstepped case.
            start = self.start if key.start is None else key.start
            stop = self.end if key.stop is None else key.stop
            if key.step is None:
                return self.overlap(start, stop)
            return self.values_at(range(start, stop, key.step))
        return None

    @property
    def start(self):
        """Lower bound of the tree, as reported by ``IntervalTree.begin()``."""
        return self._tree.begin()

    @property
    def end(self):
        """Upper bound of the tree, as reported by ``IntervalTree.end()``."""
        return self._tree.end()

    def __setitem__(self, key, value):
        """Assign ``value`` to the range described by ``key``.

        ``key`` may be a label, a slice, a ``(start, end)`` tuple or a
        ``(start, end, step)`` tuple. Missing bounds default to the bounds of
        the tree. Without a step the whole range receives ``value``; with a
        step, ``value`` is iterated and its items are assigned to the
        consecutive sub-ranges between the points of
        ``range(start, stop, step)``.

        Raises:
            ValueError: if a tuple key has neither 2 nor 3 elements.
            TypeError: if the key is neither a slice nor a tuple.
        """
        if isinstance(key, str):
            key = self.label_to_key(key)
        if isinstance(key, slice):
            start, stop, step = key.start, key.stop, key.step
        elif isinstance(key, tuple):
            if len(key) == 2:
                start, stop = key
                step = None
            elif len(key) == 3:
                start, stop, step = key
            else:
                # Adjacent literals instead of a backslash continuation, which
                # leaked the source indentation into the message.
                raise ValueError(
                    "Setting intervals with tuple must be "
                    "of form (start, end) or (start, end, step)"
                )
        else:
            raise TypeError(
                "Wrong type. Setting intervals can only be done using a "
                "slice or tuple of (start, end) or (start, end, step)"
            )
        if start is None:
            start = self.start
        if stop is None:
            stop = self.end
        if step is None:
            self.set_interval(start, stop, value)
        else:
            indices = list(range(start, stop, step))
            # NOTE(review): the remainder between indices[-1] and stop gets no
            # value -- confirm this is intended.
            for begin, end, val in zip(indices[:-1], indices[1:], value):
                self.set_interval(begin, end, val)

    def __delitem__(self, key):
        """Chop the range described by ``key`` out of the tree.

        ``key`` may be a label, a ``(begin, end)`` tuple or a slice; missing
        slice bounds default to the bounds of the tree and a slice step is
        ignored.

        Raises:
            TypeError: if the key is neither a 2-tuple nor a slice.
        """
        if isinstance(key, str):
            key = self.label_to_key(key)
        if isinstance(key, tuple) and len(key) == 2:
            begin, end = key
        elif isinstance(key, slice):
            # slice objects expose ``stop``; they have no ``end`` attribute.
            begin = self.start if key.start is None else key.start
            end = self.end if key.stop is None else key.stop
        else:
            raise TypeError("Must pass a tuple of (begin,end) or slice.")
        self._tree.chop(begin, end)

    def keys(self):
        """Yield ``(begin, end)`` for every interval, in sorted order."""
        for iv in sorted(self._tree):
            yield iv.begin, iv.end

    def labels(self):
        """Return an iterator over the ``"begin-end"`` labels of all keys."""
        return map(self.key_to_label, self.keys())

    def items(self):
        """Yield ``((begin, end), data)`` for every interval, in sorted order."""
        for iv in sorted(self._tree):
            yield (iv.begin, iv.end), iv.data

    def values(self):
        """Yield the data of every interval, in sorted order."""
        for iv in sorted(self._tree):
            yield iv.data

    def __iter__(self):
        return self.keys()

    def __len__(self):
        return len(self._tree)

    def __bool__(self):
        return bool(len(self._tree))

    def __getstate__(self):
        """Return the intervals as a sorted tuple of plain tuples."""
        return tuple(sorted([tuple(iv) for iv in self._tree]))

    def __setstate__(self, d):
        """Rebuild the tree from the tuples produced by ``__getstate__``."""
        ivs = [Interval(*iv) for iv in d]
        self._tree = IntervalTree(ivs)

    def overlap(self, begin, end):
        """Return intervals overlapping ``[begin, end)``, clipped to that range."""
        hits = sorted(self._tree.overlap(begin, end))
        return [
            Interval(max(iv.begin, begin), min(iv.end, end), iv.data)
            for iv in hits
        ]

    def overlap_content(self, begin, end):
        """Return the data overlapping ``[begin, end)``.

        A single hit is returned bare; otherwise a list of the data of all
        hits (possibly empty) is returned.
        """
        hits = sorted(self._tree.overlap(begin, end))
        if len(hits) == 1:
            return hits[0].data
        return [hit.data for hit in hits]

    def value(self, index):
        """Return the data stored at point ``index``.

        When exactly one interval contains ``index`` its data is returned;
        otherwise the sorted list of matching ``Interval`` objects (possibly
        empty) is returned.
        """
        hits = sorted(self._tree.at(index))
        if len(hits) == 1:
            return hits[0].data
        return hits

    def values_at(self, indices):
        """Return ``[self.value(i) for i in indices]``."""
        return [self.value(i) for i in indices]

    def set_interval(self, begin, end, value):
        """Replace whatever covers ``[begin, end)`` with a single interval."""
        self._tree.chop(begin, end)
        self._tree.addi(begin, end, value)

    def to_df(self, title="tree"):
        """Return a pandas DataFrame with one row per interval.

        Nested ``BaseTree`` values are replaced by NaN in the ``data`` column;
        ``title`` fills the ``parameter`` column.
        """
        import pandas as pd

        ivs = []
        for (begin, end), data in self.items():
            if isinstance(data, BaseTree):
                data = float("nan")
            interval = {
                "label": f"{begin}-{end}",
                "begin": begin,
                "parameter": title,
                "mid": (begin + end) / 2,
                "end": end,
                "data": data,
            }
            ivs.append(interval)
        return pd.DataFrame(ivs)

    def to_native(self):
        """Return a plain ``IntervalTree``, converting nested trees recursively."""
        ivs = []
        for (begin, end), data in self.items():
            if isinstance(data, BaseTree):
                iv = Interval(begin, end, data.to_native())
            else:
                iv = Interval(begin, end, data)
            ivs.append(iv)
        return IntervalTree(ivs)

    def explorer(self, title="tree"):
        """Return an ``IntervalTreeExplorer`` for this group, labelled ``title``."""
        import panel as pn

        pn.extension()
        from ..visualizations import IntervalTreeExplorer

        return IntervalTreeExplorer(tree=self, label=title)
def make_plan(store, base_url, uuid, power_up_before=60, power_down_after=60, power_down_gap=600):
    """
    Build the plan (playlist items, screen layouts and power states)
    for the given program in the store.

    Planning is limited to a window that starts now and ends 4 hours
    from now. An unknown program yields EMPTY_PLAN.
    """

    log.msg('Create plan for program {}...'.format(uuid))

    # Nothing to plan for a program the store does not know.
    if uuid not in store.program:
        log.msg('New plan has {} items.'.format(len(EMPTY_PLAN['items'])))
        return EMPTY_PLAN

    # The 4h window may spill over midnight, so both days are considered.
    now = datetime.now()
    today = now.date()
    tomorrow = today + timedelta(days=1)
    days = (today, tomorrow)

    # Bounds of the planning window as timestamps.
    window_start = mktime(now.timetuple())
    window_end = window_start + 4 * 3600

    # One tree each for playlists, screen layouts and device power.
    playlist_tree = IntervalTree()
    layout_tree = IntervalTree()
    power_tree = IntervalTree()

    # Defaults spanning the whole window: default layout, device in standby.
    layout_tree[window_start:window_end] = DEFAULT_LAYOUT
    power_tree[window_start:window_end] = 'standby'

    # Regular segments drive both the playlist and the layout.
    for day in days:
        for segment in store.segment.filter(program=uuid, day=day.weekday()):
            insert_segment(playlist_tree, day, segment)
            insert_segment(layout_tree, day, segment)

    # Screen layouts, in chronological order.
    layouts = []
    for span in sorted(layout_tree):
        if span.end < window_start:
            continue

        if span.begin > window_end:
            break

        layout = span.data
        layouts.append({
            'start': span.begin,
            'end': span.end,
            'mode': layout['mode'],
            'sidebar': layout['sidebar'],
            'panel': layout['panel'],
        })

    # Events are inserted into the playlist tree only, after the segments.
    for day in days:
        for event in store.event.filter(program=uuid, date=day.isoformat()):
            insert_segment(playlist_tree, day, event)

    # Playlist items, in chronological order.
    items = []
    for span in sorted(playlist_tree):
        if span.end < window_start:
            # Entirely in the past.
            continue

        if span.begin > window_end:
            # Everything from here on is too far in the future.
            break

        cursor = span.begin
        entries = sorted(
            store.item.filter(playlist=span.data['playlist']),
            key=lambda entry: entry['position'],
        )

        for entry in cycle(entries):
            # File record backing this playlist entry.
            media = store.file[entry['file']]

            # NOTE: A duration below 1s would keep this loop from ever
            #       making progress, so it is clamped.
            length = max(1.0, entry['duration'])

            # Never run past the end of the segment.
            stop = min(cursor + length, span.end)

            if stop >= window_start:
                # Only items that have not finished yet are emitted.
                items.append({
                    'start': cursor,
                    'end': stop,
                    'type': media['type'],
                    'url': media['stream_url']
                           if media['stream_url'] is not None
                           else base_url + '/' + media['path'],
                })

            cursor = stop

            if cursor >= span.end or cursor > window_end:
                # Segment exhausted or window left behind.
                break

    # Mark the device as powered on around every planned interval.
    for span in sorted(playlist_tree):
        if span.end < window_start:
            continue

        if span.begin > window_end:
            break

        # Warm-up time before and cool-down time after the segment.
        on_from = span.begin - power_up_before
        on_until = span.end + power_down_after

        power_tree.chop(on_from, on_until)
        power_tree[on_from:on_until] = 'on'

    # Standby gaps shorter than power_down_gap are reported as 'on' to
    # limit equipment wear and improve user experience.
    power = [
        {
            'start': span.begin,
            'end': span.end,
            'power': 'on'
                     if span.data == 'standby'
                     and span.end - span.begin < power_down_gap
                     else span.data,
        }
        for span in sorted(power_tree)
    ]

    log.msg(''' New plan has {} items and {} layouts. '''.strip().format(len(items), len(layouts)))

    return {
        'id': uuid4().hex,
        'items': items,
        'layouts': layouts,
        'power': power,
    }
def _interval_days_by_status(intervals):
    """Sum the length of ``intervals`` per status (``interval.data``), in days."""
    totals = Counter()
    for interval in intervals:
        totals[interval.data] += (
            interval.end - interval.begin).total_seconds() / SECONDS_IN_DAY
    return totals


def _calculate_work_and_wait_time_by_status(issue, lead_time_statuses, work_statuses):
    """Compute the time an issue spent in each lead-time status.

    Every status transition closes the period that began at the previous
    transition (or at ``issue[CREATED_DATE]`` for the first one). Periods
    whose status is in ``lead_time_statuses`` are counted as work time when
    the status is also in ``work_statuses`` and as wait time otherwise.
    Blocked periods -- from a flag transition whose ``'from'`` is None to the
    flag transition that follows it -- are removed from the plain totals and
    kept in the "with block time" totals. A trailing block start with no
    following flag transition removes nothing.

    Args:
        issue: mapping providing ``CREATED_DATE``, ``STATUS_TRANSITIONS`` and
            ``FLAG_TRANSITIONS``; transitions are mappings with ``'from'`` and
            ``'date'``. Dates are subtracted and ``total_seconds()`` is called
            on the difference, so they are presumably datetimes -- verify
            against the caller.
        lead_time_statuses: statuses to report.
        work_statuses: statuses that count as work rather than wait.

    Returns:
        A dict with four mappings (work/wait, without/with block time), each
        holding one entry in days per status in ``lead_time_statuses``.
    """
    work_intervals = IntervalTree()
    wait_intervals = IntervalTree()

    status_entered = issue[CREATED_DATE]
    for transition in issue[STATUS_TRANSITIONS]:
        status = transition['from']
        status_left = transition['date']
        # IntervalTree.add() raises ValueError for null intervals; a period of
        # zero length contributes no time, so it is skipped.
        if status in lead_time_statuses and status_entered != status_left:
            target = work_intervals if status in work_statuses else wait_intervals
            target.add(Interval(status_entered, status_left, status))
        status_entered = status_left

    # Totals including blocked time are taken before the blocked periods are
    # chopped out, so no copies of the trees are needed.
    work_time_by_status_with_block_time = _interval_days_by_status(work_intervals)
    wait_time_by_status_with_block_time = _interval_days_by_status(wait_intervals)

    flag_transitions = issue[FLAG_TRANSITIONS]
    for block_start, block_end in zip(flag_transitions, flag_transitions[1:]):
        if block_start['from'] is None:
            wait_intervals.chop(block_start['date'], block_end['date'])
            work_intervals.chop(block_start['date'], block_end['date'])

    work_time_by_status = _interval_days_by_status(work_intervals)
    wait_time_by_status = _interval_days_by_status(wait_intervals)

    return {
        WORK_TIME_BY_STATUS: {
            '{}_work_time'.format(x): work_time_by_status[x]
            for x in lead_time_statuses
        },
        WAIT_TIME_BY_STATUS: {
            '{}_wait_time'.format(x): wait_time_by_status[x]
            for x in lead_time_statuses
        },
        WORK_TIME_BY_STATUS_WITH_BLOCK_TIME: {
            '{}_work_time_with_block_time'.format(x):
                work_time_by_status_with_block_time[x]
            for x in lead_time_statuses
        },
        WAIT_TIME_BY_STATUS_WITH_BLOCK_TIME: {
            '{}_wait_time_with_block_time'.format(x):
                wait_time_by_status_with_block_time[x]
            for x in lead_time_statuses
        },
    }