def test_merge_overlaps_reducer_with_initializer():
    """merge_overlaps with both data_reducer and data_initializer should
    fold every merged interval's data onto a fresh copy of the initializer,
    so even a lone interval ends up with list-wrapped data."""
    def reducer(old, new):
        # Accumulate each absorbed interval's data onto the running list.
        return old + [new]
    # empty tree
    e = IntervalTree()
    e.merge_overlaps(data_reducer=reducer, data_initializer=[])
    e.verify()
    assert not e

    # One Interval in tree
    o = IntervalTree.from_tuples([(1, 2, 'hello')])
    o.merge_overlaps(data_reducer=reducer, data_initializer=[])
    o.verify()
    assert len(o) == 1
    # The initializer is applied even when nothing merges: data is wrapped.
    assert sorted(o) == [Interval(1, 2, ['hello'])]

    # many Intervals in tree, with gap
    t = IntervalTree.from_tuples(data.ivs1.data)
    t.merge_overlaps(data_reducer=reducer, data_initializer=[])
    t.verify()
    assert len(t) == 2
    assert sorted(t) == [
        Interval(1, 2, ['[1,2)']),
        Interval(4, 15, [
            '[4,7)',
            '[5,9)',
            '[6,10)',
            '[8,10)',
            '[8,15)',
            '[10,12)',
            '[12,14)',
            '[14,15)',
        ])
    ]
def _get_unparse_intervals_of_inds(
    dfs_inds_to_include: Sequence[int],
    ast: ObjectChoiceNode,
    unparse: UnparseResult
) -> IntervalTree:
    """Given some indicies we wish include, find the intervals of the total
    unparse string which are covered by those indicies"""
    include_set = set(dfs_inds_to_include)
    interval_tree = IntervalTree()
    # State machine flag: are we currently inside an included run of nodes?
    currently_including = False
    for ind, pointer in enumerate(ast.depth_first_iter()):
        if ind % 2 != 0:
            # Only take into account the choice nodes. Skip the object nodes
            continue
        assert isinstance(pointer.cur_node, ObjectChoiceNode)
        # Action is chosen lazily and only executed when the include/exclude
        # state actually flips at this node.
        func_need_to_do_here = None
        if ind in include_set:
            if not currently_including:
                # Transition excluded -> included: start covering this span.
                func_need_to_do_here = lambda start, end: interval_tree.add(Interval(start, end))
                currently_including = True
        else:
            if currently_including:
                # Transition included -> excluded: carve this span back out.
                func_need_to_do_here = lambda start, end: interval_tree.chop(start, end)
                currently_including = False
        if func_need_to_do_here:
            span = unparse.pointer_to_span(pointer)
            # NOTE(review): when the span is missing/zero-width the tree
            # operation is skipped but currently_including has already
            # flipped above — confirm this is the intended behavior.
            if span is None or span[1] - span[0] == 0:
                continue
            start, end = span
            func_need_to_do_here(start, end)
    interval_tree.merge_overlaps()
    return interval_tree
def test_merge_overlaps_reducer_with_initializer():
    """merge_overlaps with both data_reducer and data_initializer should
    fold every merged interval's data onto a fresh copy of the initializer.
    Variant using the trees['ivs1'] fixture factory."""
    def reducer(old, new):
        # Accumulate each absorbed interval's data onto the running list.
        return old + [new]
    # empty tree
    e = IntervalTree()
    e.merge_overlaps(data_reducer=reducer, data_initializer=[])
    e.verify()
    assert not e

    # One Interval in tree
    o = IntervalTree.from_tuples([(1, 2, 'hello')])
    o.merge_overlaps(data_reducer=reducer, data_initializer=[])
    o.verify()
    assert len(o) == 1
    # The initializer is applied even when nothing merges: data is wrapped.
    assert sorted(o) == [Interval(1, 2, ['hello'])]

    # many Intervals in tree, with gap
    t = trees['ivs1']()
    t.merge_overlaps(data_reducer=reducer, data_initializer=[])
    t.verify()
    assert len(t) == 2
    assert sorted(t) == [
        Interval(1, 2, ['[1,2)']),
        Interval(4, 15, [
            '[4,7)',
            '[5,9)',
            '[6,10)',
            '[8,10)',
            '[8,15)',
            '[10,12)',
            '[12,14)',
            '[14,15)',
        ])
    ]
def layout_cost(params):
    """Cost function for layout optimisation: compile the document with the
    candidate geometry, render it, and return the bottom-margin height of
    the single resulting page. Returns a flat penalty of 10 when the
    content spills onto a second page."""
    geometry = params_to_geometry(params)
    pdf = DOC.compile(geometry)
    pdf_document = fitz.open(pdf)
    if pdf_document.pageCount > 1:
        # Overflow onto a second page: heavily penalise this geometry.
        return 10
    page1 = pdf_document[-1]
    full_tree_y = IntervalTree()
    tree_y = IntervalTree()
    blks = page1.getTextBlocks()  # Read text blocks of input page
    # Calculate CropBox & displacement
    disp = fitz.Rect(page1.CropBoxPosition, page1.CropBoxPosition)
    croprect = page1.rect + disp
    # Seed with the whole-page vertical extent; text spans will split it.
    full_tree_y.add(Interval(croprect[1], croprect[3]))
    for b in blks:  # loop through the blocks
        r = fitz.Rect(b[:4])  # block rectangle
        # add displacement of original /CropBox
        r += disp
        _, y0, _, y1 = r
        tree_y.add(Interval(y0, y1))
    tree_y.merge_overlaps()
    for i in tree_y:
        full_tree_y.add(i)
    # Splitting isolates the uncovered page regions (the margins).
    full_tree_y.split_overlaps()
    # For top and bottom margins, we only know they are the first and last elements in the list
    full_tree_y_list = list(sorted(full_tree_y))
    # Slice step of len-1 selects exactly the first and last interval.
    # NOTE(review): raises for a single-element list (slice step 0) — confirm
    # that case cannot occur here.
    _, bottom_margin = \
        map(get_interval_width, full_tree_y_list[::len(full_tree_y_list) - 1])
    return bottom_margin
def concatDifferences(diffs):
    """Collapse a list of difference spans by merging overlaps on both sides.

    Each diff is a 4-list [a0, a1, b0, b1] of two coordinate pairs
    (presumably source/destination spans — confirm with caller). The merge
    runs twice: once keyed on the first pair, once on the second, so
    overlaps on either side are concatenated via tupleReducer. Zero-length
    spans cannot live in the IntervalTree and are handled separately.
    """
    if(len(diffs) > 1):
        # Pass 1: merge keyed on the first coordinate pair.
        points = list()
        tree = IntervalTree()
        for diff in diffs:
            if(diff[0] == diff[1]):
                # Zero-length span: keep aside, trees reject empty intervals.
                points.append(diff)
            else:
                tree[diff[0]:diff[1]] = (diff[2], diff[3])
        tree.merge_overlaps(tupleReducer)
        items = tree.items()
        for point in points:
            # Re-add a zero-length span only if no merged interval covers it.
            if(len(tree[point[0]]) == 0):
                items.add((point[0], point[1], (point[2], point[3])))
        # Pass 2: same procedure, keyed on the second coordinate pair.
        points = list()
        tree = IntervalTree()
        for item in items:
            if(item[2][0] == item[2][1]):
                points.append([item[2][0], item[2][1], item[0], item[1]])
            else:
                tree[item[2][0]:item[2][1]] = (item[0], item[1])
        tree.merge_overlaps(tupleReducer)
        items = tree.items()
        for point in points:
            if(len(tree[point[0]]) == 0):
                items.add((point[0], point[1], (point[2], point[3])))
        # Swap coordinates back into the original [a0, a1, b0, b1] layout.
        diffs = list()
        for item in items:
            diffs.append([item[2][0], item[2][1], item[0], item[1]])
    return diffs
def aln_coverage(aln_list): """ Calculate the coverage across the reported alignments for a given read. This will most often involve only a single alignment, but also considers non-overlapping alignments reported by BWA MEM scavenged from the XP tag. Reports the number of bases covered (<=read_len) and the overlap between them (normally 0). :param aln_list: the list of alignments for a read :return: dict {coverage: xx, overlap: yy} """ # using an intervaltree for this tr = IntervalTree() tot = 0 for ti in aln_list: if ti['is_reverse']: # reversed reads must be tallied from the opposite end n = ti['total'] for op, nb in ti['cigartuple']: if op == 0: tr.addi(n - nb, n) tot += nb n -= nb else: # forward mapped reads tally from start position n = 0 for op, nb in ti['cigartuple']: if op == 0: tr.addi(n, n + nb) tot += nb n += nb # lazy means of merging intervals tr.merge_overlaps() cov = sum([i.end - i.begin for i in tr]) return {'coverage': cov, 'overlap': tot - cov, 'has_multi': len(aln_list) > 1}
def store(self, key, start, end, data):
    """Store *data* for the interval [start, end) under *key*, while
    tracking which intervals have already been stored via a serialized
    IntervalTree kept in a redis hash.

    :param key: redis hash key for this series
    :param start: interval begin
    :param end: interval end
    :param data: mapping of position -> value; non-str values are
        msgpack-packed before being written to the sorted set
    """
    # Which intervals have we already stored?
    redis_itree = self.red.hget(key, "itree")
    if redis_itree is None:
        itree = IntervalTree()
    else:
        # NOTE(review): loads/dumps look like pickle — unsafe if the redis
        # contents can be attacker-controlled; confirm the import.
        itree = loads(redis_itree)
    # Add our new interval into the tree
    interval = Interval(start, end)
    itree = itree | IntervalTree([interval])
    itree.merge_overlaps()
    zset = key + "__zset"  # name of the redis sorted set
    self.red.hmset(key, {"itree": dumps(itree), "zset": zset})
    # Start a redis pipeline. All of these actions are committed to the server in batch and within a single
    # transaction, rather than executing each one separately over the network.
    pipe = self.red.pipeline()
    # Store the data. Each key in the data should be a position, and value can be arbitrary data.
    for k, v in iteritems(data):
        v_serial = v
        if not isinstance(v, str):
            v_serial = msgpack.packb(v, use_bin_type=True)
        pipe.zadd(zset, k, v_serial)
    pipe.execute()
def main():
    """Parse an instrumentation trace file and print latency statistics
    (mean, max and a distribution) as one JSON object on stdout.

    Relies on module-level globals: instr_file (input path), bench_name
    (label for the output record) and distribution() (helper)."""
    t = IntervalTree()
    with open(instr_file) as f:
        for l in f:
            l = l.strip()
            # Skip continuation/marker lines and "==" headers.
            if (l.endswith("@") or l.endswith("#") or l.startswith("==")):
                continue
            words = l.split()
            # words[1]/words[2] are the span's start/end timestamps; empty
            # spans are dropped.
            if (words[1] != words[2]):
                overhead = 0
                if (words[-1] == "overhead"):
                    # Lines flagged "overhead" contribute their full duration
                    # as measurement overhead, subtracted from latency below.
                    overhead = int(words[2]) - int(words[1])
                t[int(words[1]):int(words[2])] = overhead
    # Merging overlapping spans sums their overheads.
    t.merge_overlaps(lambda acc, v: acc + v)
    # Latency of each merged span = duration minus accumulated overhead.
    sorted_latencies = sorted(
        list(map(lambda x: x.end - x.begin - x.data, sorted(t))))
    if (len(sorted_latencies) > 0):
        max_latency = sorted_latencies[len(sorted_latencies) - 1]
        avg_latency = sum(sorted_latencies) / len(sorted_latencies)
    else:
        max_latency = 0
        avg_latency = 0
    distr = distribution(sorted_latencies)
    out = {}
    out["name"] = bench_name
    out["mean_latency"] = avg_latency
    out["max_latency"] = max_latency
    out["distr_latency"] = distr
    print(json.dumps(out))
def repeats_main(args):
    """Find repeat regions from within-genome alignments in a PAF file and
    print them as BED records to args.outfile.

    Self-hits (query == target) with overlapping coordinates contribute the
    symmetric difference of the two intervals; all other within-genome hits
    contribute both the query and target intervals. Intervals are merged
    per sequence before output.

    :param args: namespace with inpaf (path), sep (genome-name separator)
        and outfile (writable file object)
    """
    paf = PAF.from_file(args.inpaf)
    repeats = defaultdict(list)
    for p in paf:
        qgenome = get_genome_name(p.query, args.sep)
        tgenome = get_genome_name(p.target, args.sep)
        if qgenome != tgenome:
            # Cross-genome alignments are homology, not repeats; skip.
            continue
        query, qinterval = p.query_as_interval()
        target, tinterval = p.target_as_interval()
        if (query == target) and qinterval.overlaps(tinterval):
            # Self-overlap: keep only the non-shared flanks.
            filtered = sym_diff(qinterval, tinterval)
            repeats[query].extend(filtered)
        else:
            repeats[query].append(qinterval)
            repeats[target].append(tinterval)
    for seqid, intervals in repeats.items():
        itree = IntervalTree(intervals)
        itree.merge_overlaps()
        for interval in itree:
            bed = BED(seqid, interval.begin, interval.end)
            print(bed, file=args.outfile)
    return
def test_merge_overlaps_reducer_wo_initializer():
    """Without a data_initializer the reducer folds data pairwise, so a
    lone interval keeps its original (unwrapped) data.
    Variant using the trees['ivs1'] fixture factory."""
    def reducer(old, new):
        # Join merged intervals' data into one comma-separated string.
        return "%s, %s" % (old, new)
    # empty tree
    e = IntervalTree()
    e.merge_overlaps(data_reducer=reducer)
    e.verify()
    assert not e

    # One Interval in tree
    o = IntervalTree.from_tuples([(1, 2, 'hello')])
    o.merge_overlaps(data_reducer=reducer)
    o.verify()
    assert len(o) == 1
    # No initializer: a single interval's data is left untouched.
    assert sorted(o) == [Interval(1, 2, 'hello')]

    # many Intervals in tree, with gap
    t = trees['ivs1']()
    t.merge_overlaps(data_reducer=reducer)
    t.verify()
    assert len(t) == 2
    assert sorted(t) == [
        Interval(1, 2, '[1,2)'),
        Interval(4, 15, '[4,7), [5,9), [6,10), [8,10), [8,15), [10,12), [12,14), [14,15)')
    ]
def interval_tree(start_data, stop_data, buffer_len):
    """Build consensus (start, stop) lists across samples of intervals.

    NOTE(review): uses dict.iteritems(), i.e. this is Python 2 code.

    :param start_data: dict of key -> list of interval starts
    :param stop_data: dict of key -> parallel list of interval stops
        (stops are treated as inclusive; +1 converts to half-open)
    :param buffer_len: total buffer; each interval is shrunk by
        buffer_len/2 on each side before merging
    :return: (starts, stops) lists describing the consensus intervals
    """
    starts = []
    stops = []
    t = IntervalTree()
    ## Shrink each interval by the buffer size
    for key, value in start_data.iteritems():
        for i in range(0, len(value)):
            shrunk_start = value[i] + buffer_len / 2.0
            shrunk_stop = stop_data[key][i] + 1 - buffer_len / 2.0
            # Intervals swallowed entirely by the buffer are dropped.
            if shrunk_start < shrunk_stop:
                t[shrunk_start:shrunk_stop] = (shrunk_start, shrunk_stop)
    ## Add chromosome endpoints without buffer
    chrom_start, chrom_stop = get_extremes(start_data, stop_data)
    if chrom_start < t.begin() + 1:
        t[chrom_start:t.begin() + 1] = (chrom_start, t.begin() + 1)
    if t.end() - 1 < chrom_stop:
        t[t.end() - 1:chrom_stop] = (t.end() - 1, chrom_stop)
    ## Merge intervals that overlap in tree to get consensus
    t.merge_overlaps()
    ## Check that original intervals only overlap with one consensus interval
    for key, value in start_data.iteritems():
        for i in range(0, len(value)):
            start = value[i]
            stop = stop_data[key][i] + 1
            if len(t[start:stop]) > 1:
                ## If they overlap with more than one
                ## Remove part of consensus interval
                ## This will never be more than the buffer size/2
                assert (len(t[start:stop]) == 2)
                remove_start = 0
                remove_stop = 0
                min_length = float('inf')
                # Chop away the smaller of the two overlaps so the original
                # interval maps to a single consensus interval.
                for interval in t[start:stop]:
                    overlap_start, overlap_stop = get_overlap(
                        (start, stop), (interval[0], interval[1]))
                    if (overlap_stop - overlap_start) < min_length:
                        min_length = overlap_stop - overlap_start
                        remove_start = overlap_start
                        remove_stop = overlap_stop
                print(min_length)
                t.chop(remove_start, remove_stop)
                assert (min_length <= buffer_len / 2.0)
                assert (len(t[start:stop]) < 2)
    ## Get consensus start and stop points
    chrom_len = chrom_stop - chrom_start
    covered = 0.0
    for interval in sorted(t):
        starts.append(interval[0])
        stops.append(interval[1])
        covered = covered + (interval[1] - interval[0])
    print("The percentage of the chromosome covered is: %s" % '{0:.2f}'.format(
        (covered / chrom_len) * 100.0))
    return (starts, stops)
def get_safeguard_speed(self, ego_idx, decision_trajectory, desired_speed, pred_t=4, safe_dis_range=7):
    """Pick a safe speed given predicted collision points along the
    decision trajectory.

    For each collision point in self.collision_points, compute the band of
    speeds that would put the ego vehicle within safe_dis_range of the
    collision at the collision time, and collect those bands in an
    interval tree. If desired_speed falls inside a danger band, fall back
    to a speed outside it; otherwise keep desired_speed.

    :param ego_idx: index of the ego vehicle on the trajectory
    :param decision_trajectory: sequence of waypoints (N x 2 array-like)
    :param desired_speed: speed the planner wants to drive
    :param pred_t: prediction horizon — unused in this body, TODO confirm
    :param safe_dis_range: distance buffer around each collision point
    """
    danger_speed_range = IntervalTree()
    for collision_idx, collision_time in self.collision_points:
        if collision_idx > len(decision_trajectory) - 1:
            continue
        # Arc length from ego position to the collision point.
        dis_sum = np.cumsum(
            np.linalg.norm(np.diff(decision_trajectory, axis=0), axis=1))
        dis = dis_sum[collision_idx - 1] - dis_sum[ego_idx]
        speed_min = max(0, ((dis - safe_dis_range) / collision_time))
        # +0.01 keeps desired_speed itself inside the band when applicable.
        speed_max = min(desired_speed + 0.01, (dis + safe_dis_range) / collision_time)
        if speed_min >= speed_max:
            continue
        # rospy.logdebug("col_idx:%d,col_time:%.2f,ocp_min:%.2f,ocp_max:%.2f,desired_speed:%.2f)",
        #                collision_idx,collision_time,
        #                speed_min,speed_max,desired_speed)
        danger_speed_range[speed_min:speed_max] = (speed_min, speed_max)
    if len(danger_speed_range[desired_speed]) == 0:
        return desired_speed
    else:
        danger_speed_range.merge_overlaps()
        # NOTE(review): selects the begin of the last (highest) merged danger
        # band, i.e. the fastest speed just below it — confirm intended.
        speed = sorted(danger_speed_range)[-1].begin
        return speed
def find_len_non_overlap(interval: Interval, itree: IntervalTree) -> int:
    """Return how many positions of *interval* are NOT covered by *itree*."""
    # Collapse the overlapping hits into disjoint intervals first so that
    # doubly-covered positions are only counted once.
    hits = IntervalTree(itree.overlap(interval))
    hits.merge_overlaps()
    covered = 0
    for hit in hits:
        covered += intersection(interval, hit).length()
    return interval.length() - covered
def test_merge_overlaps_reducer_wo_initializer():
    """Without a data_initializer the reducer folds data pairwise, so a
    lone interval keeps its original (unwrapped) data.
    Variant using the data.ivs1.data fixture tuples."""
    def reducer(old, new):
        # Join merged intervals' data into one comma-separated string.
        return "%s, %s" % (old, new)
    # empty tree
    e = IntervalTree()
    e.merge_overlaps(data_reducer=reducer)
    e.verify()
    assert not e

    # One Interval in tree
    o = IntervalTree.from_tuples([(1, 2, 'hello')])
    o.merge_overlaps(data_reducer=reducer)
    o.verify()
    assert len(o) == 1
    # No initializer: a single interval's data is left untouched.
    assert sorted(o) == [Interval(1, 2, 'hello')]

    # many Intervals in tree, with gap
    t = IntervalTree.from_tuples(data.ivs1.data)
    t.merge_overlaps(data_reducer=reducer)
    t.verify()
    assert len(t) == 2
    assert sorted(t) == [
        Interval(1, 2, '[1,2)'),
        Interval(4, 15, '[4,7), [5,9), [6,10), [8,10), [8,15), [10,12), [12,14), [14,15)')
    ]
class AslrOracle:
    """Oracle for probing address-space layout, with a cache of previously
    probed good/bad ranges so repeated queries can be answered locally."""

    def __init__(self):
        # Total number of oracle queries issued.
        self.queries = 0
        self.InitCache()

    def CheckAddress(self, address):
        # Probe a single page (0x1000 bytes) at *address*.
        # CheckRange is defined elsewhere on this class.
        return self.CheckRange(address, 0x1000)

    def InitCache(self):
        """Reset the cache of previously probed regions."""
        self.cached_queries = 0
        self.good_regions = IntervalTree()
        self.bad_regions = IntervalTree()

    def InsertToCache(self, start, end, valid):
        """Record a probe result for the range [start, end]."""
        if valid:
            # end + 1 stores the interval inclusive of *end*; merging keeps
            # the good-region tree compact.
            self.good_regions.add(Interval(start, end + 1))
            self.good_regions.merge_overlaps()
        else:
            # NOTE(review): bad regions are stored without the +1 and are
            # never merged — confirm the asymmetry is intentional.
            self.bad_regions.add(Interval(start, end))

    def CheckCache(self, start, end):
        """Answer from cache: True if [start, end) is known good, False if
        known bad, None when the cache cannot decide."""
        good_overlaps = self.good_regions.overlap(start, end)
        for overlap in good_overlaps:
            # Only full enclosure proves the whole queried range is good.
            if (overlap[0] <= start) and (overlap[1] >= end):
                self.cached_queries += 1
                return True
        # envelop() finds bad intervals fully inside the queried range.
        bad_overlaps = self.bad_regions.envelop(start, end)
        if len(bad_overlaps) > 0:
            self.cached_queries += 1
            return False
        return None
def get_length(self):
    """Return the gene length: the number of bases covered by the union of
    all exons across every transcript (coordinates treated as inclusive)."""
    exon_union = IntervalTree()
    for transcript in self.transcript.values():
        for exon in transcript.exon:
            exon_union.addi(exon[0], exon[1])
    # Collapse overlapping exons so shared bases count only once.
    exon_union.merge_overlaps()
    # +1 because both exon end-points are inclusive.
    return sum(iv.end - iv.begin + 1 for iv in exon_union)
def filter_nstretches(
    itrees: Mapping[str, IntervalTree],
    nstretches: Mapping[str, IntervalTree],
    min_non_overlap: int,
) -> None:
    """
    Remove contigs without much going on outside N stretches.

    Mutates *itrees* in place: for each scaffold, a contig overlapping an
    N stretch is dropped when the non-N bases it uniquely contributes
    (i.e. not covered by any other overlapping contig) total fewer than
    *min_non_overlap*.
    """
    for scaffold, itree in nstretches.items():
        # Loop through all of the potential breaks
        for nstretch in itree:
            # Find "contigs" that overlap the potential break
            # We do this in sorted order, from smallest to largest alignment
            # that means shorter ones are removed first
            contigs = sorted(itrees[scaffold].overlap(nstretch),
                             key=lambda x: x.length())
            to_drop = set()
            # Loop through the contigs to test.
            for contig in contigs:
                # Find if they overlap with any other n stretches.
                n_overlaps = nstretches[scaffold].overlap(contig)
                # Get an intervaltree of all contigs overlapping this one.
                contig_overlaps = IntervalTree(
                    itrees[scaffold].overlap(contig))
                # Remove all of the n-chunks from the intervals.
                # Note the "Coords" is still duplicated in the data attribute
                for n_overlap in n_overlaps:
                    contig_overlaps.chop(n_overlap.begin, n_overlap.end)
                # Get the intervals that aren't the overlap under
                # consideration.
                contig_overlaps_itree = IntervalTree(o for o in contig_overlaps
                                                     if o.data != contig.data)
                contig_overlaps_itree.merge_overlaps()
                # Get the fragments of the overlap under consideration
                contig_itree = IntervalTree(o for o in contig_overlaps
                                            if o.data == contig.data)
                # For each of the fragments, find how many new Non-N bases it
                # contributes to the contigging.
                len_non_overlap = sum([
                    find_len_non_overlap(f, contig_overlaps_itree)
                    for f in contig_itree
                ])
                # Remove the contig if it doesn't cut the muster
                if len_non_overlap < min_non_overlap:
                    to_drop.add(contig)
            for contig in to_drop:
                itrees[scaffold].remove(contig)
    return
def get_cnv_df(self, pat_tree, mat_tree):
    """Combine the paternal and maternal interval trees into one segment
    table with per-segment major/minor copy-number columns.

    :param pat_tree: IntervalTree of paternal-allele segments
    :param mat_tree: IntervalTree of maternal-allele segments
    :return: DataFrame with Chromosome, Start.bp, End.bp, major, minor
    """
    combined = IntervalTree(list(pat_tree) + list(mat_tree))
    # Split then merge so every resulting segment carries data from both
    # alleles, reduced into major/minor levels.
    combined.split_overlaps()
    combined.merge_overlaps(data_reducer=self.specify_levels)
    rows = [
        [self.chr_name, seg.begin, seg.end, seg.data['major'], seg.data['minor']]
        for seg in combined
    ]
    return pd.DataFrame(
        rows, columns=['Chromosome', 'Start.bp', 'End.bp', 'major', 'minor'])
def get_chrom_features(chrom, txs):
    '''
    Get all merged intervals on given chromosome

    :param chrom: chromosome/sequence name to select
    :param txs: DataFrame with seqname, start and end columns
    :return: IntervalTree of merged, non-empty [start, end) intervals
    '''
    chrom_features = txs[txs.seqname == chrom]
    chrom_features = zip(chrom_features.start, chrom_features.end)
    chrom_tree = IntervalTree()
    # Plain loop instead of a side-effect list comprehension; intervals of
    # zero (or negative) length are skipped, as IntervalTree rejects them.
    for s, e in chrom_features:
        if e - s > 0:
            chrom_tree.addi(s, e)
    chrom_tree.merge_overlaps()
    return chrom_tree
def get_gene_lookup(tx_ref_file):
    '''
    Generate start/end coordinate reference for genes and output as an
    interval tree dictionary. Also output dataframe containing chromosome,
    start and ends for all exons.

    :param tx_ref_file: path to a GTF-style transcript reference;
        '' means "no reference" and returns (None, None, None)
    :return: (ref_trees, ex_trees, ex_ref_out) where ref_trees maps
        chrom -> IntervalTree of gene spans (data = gene name), ex_trees
        maps chrom -> IntervalTree of merged exon bounds, and ex_ref_out
        is a DataFrame of merged exon chrom/start/end rows.
    '''
    # Initialise ALL three outputs up front. The original code left
    # ex_trees unbound here, so the early return below raised NameError.
    ref_trees, ex_trees, ex_ref_out = None, None, None
    if tx_ref_file == '':
        return ref_trees, ex_trees, ex_ref_out

    logging.info('Generating lookup for genes...')
    #TODO: standardise with make_supertranscript for gtf handling
    tx_ref = pd.read_csv(tx_ref_file, comment='#', sep='\t', header=None,
                         low_memory=False)
    # Column 8 holds the GTF attribute string.
    tx_ref['gene_id'] = tx_ref[8].apply(lambda x: get_attribute(x, 'gene_id'))
    tx_ref['gene'] = tx_ref[8].apply(lambda x: get_attribute(x, 'gene_name'))

    # create start/end gene lookup, grouping adjacent rows
    # (this prevents merging distant genes with the same IDs)
    gn_ref = tx_ref[[0, 3, 4, 'gene_id', 'gene']]
    gn_ref.columns = ['chrom', 'start', 'end', 'gene_id', 'gene']
    adj_check = (gn_ref.gene_id != gn_ref.gene_id.shift()).cumsum()
    gn_ref = gn_ref.groupby(['chrom', 'gene_id', 'gene', adj_check],
                            as_index=False, sort=False).agg({'start': min,
                                                             'end': max})
    gn_ref = gn_ref.drop_duplicates()

    # start/end coordinates for gene matching
    ref_trees = {}
    chroms = np.unique(gn_ref.chrom.values)
    for chrom in chroms:
        chr_ref = gn_ref[gn_ref.chrom == chrom].drop_duplicates()
        ref_tree = IntervalTree()
        for s, e, g in zip(chr_ref['start'].values, chr_ref['end'].values,
                           chr_ref['gene'].values):
            if g != '':
                # GTF is 1-based inclusive; the tree is 0-based half-open.
                ref_tree.addi(s - 1, e, g)
        ref_trees[chrom] = ref_tree

    # merged exon boundaries for block annotation
    ex_ref = tx_ref[tx_ref[2] == 'exon']
    ex_ref_out = pd.DataFrame()
    ex_trees = {}
    for chrom in chroms:
        chr_ref = ex_ref[ex_ref[0] == chrom].drop_duplicates()
        ex_tree = IntervalTree()
        for s, e in zip(chr_ref[3].values, chr_ref[4].values):
            ex_tree.addi(s - 1, e)
        ex_tree.merge_overlaps()
        tmp = pd.DataFrame([(chrom, tree[0], tree[1]) for tree in ex_tree],
                           columns=['chrom', 'start', 'end'])
        ex_ref_out = pd.concat([ex_ref_out, tmp], ignore_index=True)
        ex_trees[chrom] = ex_tree

    return ref_trees, ex_trees, ex_ref_out
def total_intersection(itree: IntervalTree, interval: Interval) -> int:
    """Return the total number of positions of *interval* that are covered
    by intervals in *itree* (each position counted once)."""
    if interval.length() <= 0:
        # Degenerate query interval: nothing can be covered.
        return 0
    # Merge the hits first so doubly-covered positions count only once.
    merged = IntervalTree(itree.overlap(interval))
    merged.merge_overlaps()
    return sum(intersect(interval, hit).length() for hit in merged)
def score_document_regions(self, query, doc, fast=False):
    """Highlight the character regions of *doc* that match tokens of
    *query*, returning {field: [[start, end, 1.0], ...]} of merged spans.

    :param query: namedtuple-like query (has ._asdict())
    :param doc: namedtuple-like document (has ._fields)
    :param fast: delegate to the Aho-Corasick implementation when True
    """
    if fast:
        return self.fast_score_document_regions(query, doc)
    # Fetch NLTK data lazily on first use.
    try:
        nltk.data.find("tokenizers/punkt")
    except (LookupError, OSError):
        nltk.download("punkt")
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        nltk.download("stopwords")
    result = {}
    stops = stopwords.words("english") if self.skip_stopwords else None
    qfield_values = []
    specified_qfields = list(filter(None, self.queryfield))
    # Choose a query field to do the highlighting with
    if specified_qfields:
        for fname in specified_qfields:
            qfield_values.append(query._asdict()[fname])
    else:
        # Use the first field in the query that is not the id
        # ._asdict() is an OrderedDict, so this is deterministic
        for fname, fval in query._asdict().items():
            if fname != "query_id":
                qfield_values = [fval]
                break
    assert len(qfield_values)
    for qfield_value in qfield_values:
        for word in word_tokenize(qfield_value):
            word = word.lower()
            # Keep only alphabetic/numeric tokens, minus stopwords.
            if not word.isalpha() and not word.isnumeric():
                continue
            if stops and word.lower() in stops:
                continue
            for dfield, dvalue in zip(doc._fields, doc):
                if not isinstance(dvalue, str):
                    continue  # skip non-strings for now
                if dfield not in result:
                    result[dfield] = []
                # Whole-word matches only (\b anchors both sides).
                for match in re.finditer("\\b" + re.escape(word) + "\\b",
                                         dvalue.lower()):
                    result[dfield].append([match.start(), match.end()])
    # Merge overlapping matches per field into flat, sorted, scored spans.
    for field, values in list(result.items()):
        tree = IntervalTree()
        for start, stop in values:
            tree[start:stop] = 1
        tree.merge_overlaps()
        result[field] = sorted([[i.begin, i.end, 1.0] for i in tree])
    return result
def get_unique_loci(intervals):
    """Merge (genome, seqid, start, end) tuples into non-overlapping loci.

    Intervals are grouped per (genome, seqid) pair, overlaps within each
    group are merged, and the merged spans are returned as tuples in the
    same (genome, seqid, begin, end) layout.
    """
    by_sequence = defaultdict(list)
    for genome, seqid, start, end in intervals:
        by_sequence[(genome, seqid)].append(Interval(start, end))

    merged_loci = list()
    for (genome, seqid), ivs in by_sequence.items():
        tree = IntervalTree(ivs)
        tree.merge_overlaps()
        for iv in tree:
            merged_loci.append((genome, seqid, iv.begin, iv.end))
    return merged_loci
def read(self, length, offset, fh):
    """
    Read data from this GhostFile.

    Only byte ranges recorded in self.__rewritten_intervals are read from
    the backing file; every hole in between reads as NUL bytes, and the
    result is clipped to the logical file size.

    :param length: number of bytes requested
    :param offset: file offset to start reading at
    :param fh: OS-level file descriptor of the backing file
    :return: the requested bytes (holes zero-filled)
    """
    if offset >= self.__filesize or length == 0:
        return b''
    data = b''
    # Rewritten intervals restricted to the requested window.
    intervals = IntervalTree(self.__rewritten_intervals[offset:offset+length])
    intervals.merge_overlaps()
    # Clip interval boundaries exactly to the window edges.
    intervals.slice(offset)
    intervals.slice(offset + length)
    intervals = sorted(intervals[offset:offset+length])
    assert offset < self.__filesize
    assert intervals[0].begin >= offset and intervals[-1].end <= offset + length if len(intervals) > 0 else True
    if len(intervals) == 0:
        # Nothing was rewritten here: the whole window reads as zeros.
        return b'\x00' * min(length, self.__filesize - offset)
    assert len(intervals) > 0
    # Used to fill any hole at the start of the read range
    end_prev_interval = offset
    # Read the data
    for interv in intervals:
        # Fill any hole before this interval
        data += b'\x00' * (interv.begin - end_prev_interval)
        os.lseek(fh, interv.begin, os.SEEK_SET)
        data += os.read(fh, interv.length())
        end_prev_interval = interv.end
    # Fill any hole at the end of the read range
    data += b'\x00' * (offset + length - intervals[-1].end)
    # Never return bytes past the logical end of file.
    if offset + length > self.__filesize:
        data = data[0:self.__filesize-offset]
    assert len(data) <= length
    assert offset + len(data) <= self.__filesize
    return data
def calc_full_cnv(self, phylogeny):
    """Compute CCF-weighted copy-number trees for both alleles.

    Each event's cn_change is scaled by the CCF of its cluster in
    *phylogeny*; overlapping events are split and their weighted changes
    summed per segment via self.sum_levels.

    :param phylogeny: object exposing ccfs, a mapping cluster_num -> CCF
    :return: (pat_tree, mat_tree) IntervalTrees of weighted Events
    """
    pat_tree = IntervalTree()
    for i in self.paternal_tree:
        # Weight the copy-number change by the cancer cell fraction.
        weighted_cn = i.data.cn_change * phylogeny.ccfs[i.data.cluster_num]
        pat_tree[i.begin: i.end] = Event(i.data.type, i.data.allele,
                                         i.data.cluster_num, weighted_cn)
    pat_tree.split_overlaps()
    pat_tree.merge_overlaps(data_reducer=self.sum_levels)
    mat_tree = IntervalTree()
    for i in self.maternal_tree:
        weighted_cn = i.data.cn_change * phylogeny.ccfs[i.data.cluster_num]
        mat_tree[i.begin: i.end] = Event(i.data.type, i.data.allele,
                                         i.data.cluster_num, weighted_cn)
    mat_tree.split_overlaps()
    mat_tree.merge_overlaps(data_reducer=self.sum_levels)
    # could deliver a Chromosome (or child class) instead of just a tree
    return pat_tree, mat_tree
def update(self):
    """Recompute self.hidedIntervals and self.visibleString from
    self.fullString.

    The three regexes in self.secretsRe mark error-annotation tags:
    [0] = <err.*?> ("begin"), [1] = </err><corr> ("center"),
    [2] = </corr> ("end"). Depending on self.mode, either the original
    ("latest" hides err..center spans and end tags) or the corrected text
    is hidden, using a stack to handle nested annotations.
    """
    intervals = IntervalTree()
    stack = []
    for match in self.secretsRe[0].finditer(self.fullString):
        intervals.addi(match.start(), match.end(), "begin")  # <err.*?>
    for match in self.secretsRe[1].finditer(self.fullString):
        intervals.addi(match.start(), match.end(), "center")  # </err><corr>
    for match in self.secretsRe[2].finditer(self.fullString):
        intervals.addi(match.start(), match.end(), "end")  # </corr>
    # Process tags in document order.
    intervals = sorted(intervals)
    hidedIntervals = IntervalTree()
    if(self.mode == "latest"):
        for i in intervals:
            if(i.data == "begin"):
                stack.append(i)
            elif(i.data == "center"):
                token = stack.pop()
                # Only record when the outermost annotation closes.
                if(len(stack) == 0):
                    hidedIntervals.addi(token.begin, i.end, None)
            elif(i.data == "end"):
                hidedIntervals.addi(i.begin, i.end, None)
    else:
        for i in intervals:
            if(i.data == "begin"):
                hidedIntervals.addi(i.begin, i.end, None)
            elif(i.data == "center"):
                stack.append(i)
            elif(i.data == "end"):
                token = stack.pop()
                if(len(stack) == 0):
                    hidedIntervals.addi(token.begin, i.end, None)
    hidedIntervals.merge_overlaps()
    # NOTE(review): hidedIntervals is always an IntervalTree here, so the
    # None check below can never take the else branch.
    if(hidedIntervals != None):
        self.hidedIntervals = sorted(hidedIntervals)
    else:
        self.hidedIntervals = []
    if(len(self.hidedIntervals) == 0):
        self.visibleString = self.fullString
    else:
        # Rebuild the visible text by skipping every hidden interval.
        self.visibleString = ""
        curIndex = 0
        for match in self.hidedIntervals:
            if(curIndex < match[0]):
                self.visibleString += self.fullString[curIndex:match[0]]
            curIndex = match[1]
        self.visibleString += self.fullString[curIndex:]
def sweep_function(binary, start, addr_stack):
    """Recursively disassemble one Thumb function starting at *start*.

    Follows branch targets through a work heap, records each straight-line
    run of instructions as a FunctionBlock in an IntervalTree (keyed by
    address range), collects call targets into *addr_stack*, and finally
    merges overlapping blocks with the module-level `add` reducer.

    :param binary: raw bytes of the image, indexed by address
    :param start: entry address of the function to sweep
    :param addr_stack: set collecting discovered call targets (mutated)
    :return: IntervalTree of FunctionBlock parts for this function
    """
    block_tree = IntervalTree()
    branch_stack = [start]
    heapify(branch_stack)
    while branch_stack:
        addr = heappop(branch_stack)
        part = FunctionBlock(start_addr=addr)
        for inst in THUMB_DISASSEMBLER.disasm(binary[addr:], addr):
            i_addr = inst.address
            if block_tree.overlaps(i_addr):
                # we already visited this address (maybe a loop?)
                break
            part.instructions += 1
            part.stop_addr = i_addr + inst.size
            ops = len(inst.operands)
            reg_reads, reg_writes = inst.regs_access()
            if inst.id in BRANCH_IDS:
                if ARM_REG_LR in reg_reads:
                    # NOTE(review): the comments/condition below look swapped
                    # relative to ARM semantics (CC_AL = always/unconditional);
                    # confirm against the original source's indentation.
                    if inst.cc == ARM_CC_AL:
                        # this is a conditional return
                        continue
                    # this is a unconditional return
                    break
                new_addr = parse_imm(inst.operands[0].imm)
                heappush(branch_stack, new_addr)
                if inst.cc == ARM_CC_AL:
                    # Unconditional branch ends this straight-line block.
                    part.branches += 1
                    break
                else:
                    part.conditional_branches += 1
            elif ops == 1 and inst.id in CALL_IDS:
                # Direct call: remember the target for later sweeping.
                new_addr = parse_imm(inst.operands[0].imm)
                addr_stack.add(new_addr)
                part.calls.add(new_addr)
            elif ops == 2 and inst.id in COND_BRANCH_IDS:
                # e.g. CBZ/CBNZ: the branch target is the second operand.
                new_addr = parse_imm(inst.operands[1].imm)
                heappush(branch_stack, new_addr)
                part.conditional_branches += 1
            elif ARM_REG_PC in reg_writes:
                # assume return for any otherwise unmatched changes of PC
                break
        if part.start_addr < part.stop_addr:
            block_tree[part.start_addr:part.stop_addr] = part
    block_tree.merge_overlaps(add)
    return block_tree
def fast_score_document_regions(self, query, doc):
    """
    Score document regions with Aho-Corasick algorithm.

    Same output shape as score_document_regions — {field: [[start, end,
    1.0], ...]} of merged character spans — but matches all query tokens
    in a single pass per field via a reusable automaton on self.A.

    :param query: iterable of query field strings
    :param doc: namedtuple-like document (has ._fields)
    :return: dict mapping field name to sorted merged spans
    """
    # Fetch NLTK data lazily on first use.
    try:
        nltk.data.find("tokenizers/punkt")
    except (LookupError, OSError):
        nltk.download("punkt")
    try:
        nltk.data.find("corpora/stopwords")
    except LookupError:
        nltk.download("stopwords")
    result = {}
    stops = stopwords.words("english") if self.skip_stopwords else None
    query_tokens = set()
    for qfield_value in query:
        # Keep only alphabetic/numeric, non-stopword tokens.
        query_tokens.update([
            w.lower() for w in word_tokenize(qfield_value)
            if (w.isalpha() or w.isnumeric()) and (
                not stops or not w.lower() in stops)
        ])
    if not hasattr(self, "A"):
        self.A = ahocorasick.Automaton()
    # Rebuild the automaton for this query's tokens.
    self.A.clear()
    for idx, token in enumerate(query_tokens):
        self.A.add_word(token, (idx, token))
    self.A.make_automaton()
    for dfield, dvalue in zip(doc._fields, doc):
        # iter() yields (end_index, payload); recover the start index from
        # the matched word's length.
        matches = [(end_idx - len(match) + 1, end_idx)
                   for end_idx, (_, match) in self.A.iter(dvalue.lower())]
        result[dfield] = matches
    for field, values in list(result.items()):
        tree = IntervalTree()
        for start, stop in values:
            # +1 converts the inclusive end index to a half-open interval.
            tree[start:stop + 1] = 1
        tree.merge_overlaps()
        result[field] = sorted([[i.begin, i.end, 1.0] for i in tree])
    return result
def paf_to_intervals(pafs: Iterable[PAF]) -> Dict[Tuple[str, str], float]:
    """For each pair of aligned sequences, compute the fraction of the
    SHORTER sequence covered by their alignments.

    Pair keys are ordered (shorter, longer), with ties on length broken by
    name order. Self-alignments are skipped.

    :param pafs: iterable of PAF alignment records
    :return: mapping (shorter, longer) -> coverage fraction of the shorter
    """
    pairwise_intervals: Dict[Tuple[str, str], List[Interval]] = defaultdict(list)  # noqa
    lengths: Dict[str, int] = dict()
    for paf in pafs:
        if paf.query == paf.target:
            continue
        lengths[paf.query] = paf.qlen
        lengths[paf.target] = paf.tlen
        # Key on the shorter sequence and take the interval on that side.
        if paf.qlen < paf.tlen:
            _, interval = paf.query_as_interval()
            id_ = (paf.query, paf.target)
        elif paf.tlen < paf.qlen:
            _, interval = paf.target_as_interval()
            id_ = (paf.target, paf.query)
        elif paf.query < paf.target:
            # Equal lengths: fall back to lexicographic name order.
            _, interval = paf.query_as_interval()
            id_ = (paf.query, paf.target)
        else:
            _, interval = paf.target_as_interval()
            id_ = (paf.target, paf.query)
        pairwise_intervals[id_].append(interval)
    pairwise_covs: Dict[Tuple[str, str], float] = dict()
    for (query, target), intervals in pairwise_intervals.items():
        # Merge so multiply-covered bases count once.
        itree = IntervalTree(intervals)
        itree.merge_overlaps()
        ali_length = 0
        for interval in itree:
            ali_length += interval.length()
        contig_length = lengths[query]
        cov = ali_length / contig_length
        pairwise_covs[(query, target)] = cov
    return pairwise_covs
def seq_from_exons_introns(exons, introns, join=True):
    """
    Merges exons and introns and returns sequence
    Note that exons and introns are in different formats
    Exons are tuples of (exn_num, chrom, start, stop, strand, gene) (should be nonoverlapping)
    Introns are tuples of (intron_seq, intron_gcoords) that are nonoverlappping by construction
    We ignore the intron sequence IN CASE the coords overlap
    """
    itree = IntervalTree()
    chroms = set()
    strands = set()
    for exn_num, chrom, start, stop, strand, gene in exons:
        chroms.add(chrom)
        strands.add(strand)
        itree[start:stop] = f"exon_{exn_num}"
    # All exons must live on one chromosome and one strand.
    assert len(chroms) == 1
    chrom = chroms.pop()
    assert len(strands) == 1
    strand = strands.pop()
    for i, gcoord in enumerate(introns[1]):
        # gcoord format: "chrom:start-stop:strand".
        # NOTE(review): this rebinding overwrites chrom/strand with the last
        # intron's (string) values — confirm introns always match the exons.
        chrom, startstop, strand = gcoord.split(":")
        start, stop = map(int, startstop.split("-"))
        itree[start:stop] = f"ri_{i}"
    itree_orig = itree.copy()
    # Concatenate labels of any overlapping pieces; warn if that happened.
    itree.merge_overlaps(lambda x, y: ";".join([x, y]))
    if len(itree) != len(itree_orig):
        logging.warn(f"Contains overlaps: {itree_orig}")
    # The itree sorts everything in 5' to 3' regardless of strand
    seqs = []
    for interval in itree:
        # Actual sequences are rev comped properly
        # seq = GENOME_FA[chrom][interval.begin:interval.end]
        seq = GENOME_FA.get_seq(chrom, interval.begin, interval.end,
                                strand == "-")
        assert seq.seq
        seqs.append(seq.seq)
    return ''.join(seqs) if join else seqs
def merge_overlapping(
    features: Sequence[GFFRecord],
    pad: int
) -> Iterator[Tuple[int, int]]:
    """Pad the feature intervals, merge touching/overlapping ones, and
    yield each distinct (begin, end) span exactly once."""
    padded = pad_intervals(gffrecords_to_intervals(features), pad)
    itree = IntervalTree(padded)
    # strict=False also merges intervals that merely touch end-to-end.
    itree.merge_overlaps(strict=False)
    emitted: Set[Tuple[int, int]] = set()
    for iv in itree:
        span = (iv.begin, iv.end)
        if span in emitted:
            continue
        emitted.add(span)
        yield span
    return
def merge_short_intervals(tree: IntervalTree, min_len: int = 1) -> IntervalTree:
    """Merge short intervals with their neighbour.

    Intervals shorter than *min_len* are stretched toward an adjoining
    neighbour (right first, then left) so a subsequent merge_overlaps
    absorbs them; intervals with no adjoining neighbour on either side are
    dropped. Recurses until no short interval remains.
    """
    merged_tree = IntervalTree()
    for i, iv in enumerate(sorted(tree)):
        if iv.length() >= min_len:
            merged_tree.add(iv)
            continue
        if has_adjoining_neighbour_to_right(iv, tree):
            # Extend end by 2 to force an overlap with the right neighbour,
            # since interval ends are exclusive (touching is not enough).
            merged_tree.add(Interval(iv.begin, iv.end + 2, data=iv.data))
        elif has_adjoining_neighbour_to_left(iv, tree):
            merged_tree.add(Interval(iv.begin - 1, iv.end, data=iv.data))
    merged_tree.merge_overlaps(data_reducer=data_reducer)
    # Merging may have produced new short intervals; repeat until stable.
    if any(iv.length() < min_len for iv in merged_tree):
        return merge_short_intervals(merged_tree, min_len=min_len)
    else:
        return merged_tree
def get_tx_juncs(read):
    '''Get all junctions from the given contig'''
    starts, ends = zip(*read.get_blocks())
    # merge adjacent 'junctions' (i.e. insertions)
    blocks = IntervalTree()
    for begin, end in zip(starts, ends):
        blocks.addi(begin, end)
    # strict=False also merges blocks that merely touch end-to-end.
    blocks.merge_overlaps(strict=False)
    starts = np.sort([iv[0] for iv in blocks])
    ends = np.sort([iv[1] for iv in blocks])
    chroms = [read.reference_name] * (len(starts) - 1)
    # A junction spans from one block's end to the next block's start;
    # keep only gaps at least MIN_GAP wide.
    candidates = zip(chroms, ends[:-1], starts[1:])
    return [junc for junc in candidates if (junc[2] - junc[1]) >= MIN_GAP]
def test_merge_overlaps_empty():
    """merge_overlaps on an empty tree is a no-op and leaves it valid."""
    tree = IntervalTree()
    tree.merge_overlaps()
    tree.verify()
    assert len(tree) == 0
def get_unknown_meanings(self, w=None, option=None):
    """Return a merged IntervalTree of the stretches of
    self._content_coding whose intervals carry no data (falsy .data),
    i.e. the regions whose meaning is still unknown.

    *w* and *option* are accepted for interface compatibility and unused.
    """
    unknown = IntervalTree(
        Interval(iv.begin, iv.end)
        for iv in self._content_coding
        if not iv.data
    )
    unknown.merge_overlaps()
    return unknown
class BitwrappedStream(object):
    """A stream that wraps other streams to provide bit-level access.

    Bytes are read/written through an internal deque of pending bits
    (``self._bits``); ``self.padded`` controls whether bitfields are assumed
    to end on byte boundaries. Consumed byte ranges are tracked in
    ``self.range_set`` (an IntervalTree) so gaps can be reported later.
    """

    # Class-level default; flipped to False once a stream is attached.
    closed = True

    def __init__(self, stream):
        """Init the bit-wrapped stream

        :stream: The normal byte stream
        """
        self._stream = stream
        # Pending (unflushed / unconsumed) bits, FIFO order.
        self._bits = collections.deque()
        self.closed = False

        # assume that bitfields end on an even boundary,
        # otherwise the entire stream will be treated as
        # a bit stream with no padding
        self.padded = True

        # packed left-to-right
        self.direction = BIT_DIR_LEFT_RIGHT

        # Tracks which byte ranges have been consumed (see
        # _update_consumed_ranges / unconsumed_ranges).
        self.range_set = IntervalTree()

    def is_eof(self):
        """Return if the stream has reached EOF or not
        without discarding any unflushed bits

        :returns: True/False
        """
        # Peek one byte, then restore the stream position.
        pos = self._stream.tell()
        byte = self._stream.read(1)
        self._stream.seek(pos, 0)
        return utils.binary(byte) == utils.binary("")

    def close(self):
        """Close the stream, flushing any pending bits first."""
        self.closed = True
        self._flush_bits_to_stream()
        self._stream.close()

    def flush(self):
        """Flush pending bits and the underlying stream."""
        self._flush_bits_to_stream()
        self._stream.flush()

    def isatty(self):
        """Return if the stream is a tty"""
        return self._stream.isatty()

    def read(self, num):
        """Read ``num`` number of bytes from the stream. Note that this will
        automatically resets/ends the current bit-reading if it does not
        end on an even byte AND ``self.padded`` is True. If ``self.padded`` is
        True, then the entire stream is treated as a bitstream.

        :num: number of bytes to read
        :returns: the read bytes, or empty string if EOF has been reached
        """
        start_pos = self.tell()

        if self.padded:
            # we toss out any uneven bytes
            self._bits.clear()
            res = utils.binary(self._stream.read(num))
        else:
            # Unpadded mode: pull bits through the bit buffer instead.
            bits = self.read_bits(num * 8)
            res = bits_to_bytes(bits)
            res = utils.binary(res)

        end_pos = self.tell()
        # Record the byte span we just consumed.
        self._update_consumed_ranges(start_pos, end_pos)

        return res

    def read_bits(self, num):
        """Read ``num`` number of bits from the stream

        :num: number of bits to read
        :returns: a list of ``num`` bits, or an empty list if EOF has been reached
        """
        if num > len(self._bits):
            # Top up the bit buffer with enough whole bytes.
            needed = num - len(self._bits)
            num_bytes = (needed // 8) + 1
            read_bytes = self._stream.read(num_bytes)
            for bit in bytes_to_bits(read_bytes):
                self._bits.append(bit)

        # Drain up to ``num`` bits; may return fewer at EOF.
        res = []
        while len(res) < num and len(self._bits) > 0:
            res.append(self._bits.popleft())
        return res

    def write(self, data):
        """Write data to the stream

        :data: the data to write to the stream
        :returns: None
        """
        if self.padded:
            # flush out any remaining bits first
            if len(self._bits) > 0:
                self._flush_bits_to_stream()
            self._stream.write(data)
        else:
            # nothing to do here
            if len(data) == 0:
                return
            # Unpadded: route the bytes through the bit buffer.
            bits = bytes_to_bits(data)
            self.write_bits(bits)

    def write_bits(self, bits):
        """Write the bits to the stream.

        Add the bits to the existing unflushed bits and write
        complete bytes to the stream.
        """
        for bit in bits:
            self._bits.append(bit)

        # Emit every complete byte; partial bytes stay buffered.
        while len(self._bits) >= 8:
            byte_bits = [self._bits.popleft() for x in six.moves.range(8)]
            byte = bits_to_bytes(byte_bits)
            self._stream.write(byte)

        # there may be unflushed bits leftover and THAT'S OKAY

    def tell(self):
        """Return the current position in the stream (ignoring bit position)

        :returns: int for the position in the stream
        """
        res = self._stream.tell()
        # If bits are pending, the underlying stream has already advanced
        # past the byte we are mid-way through — report the previous byte.
        if len(self._bits) > 0:
            res -= 1
        return res

    def seek(self, pos, seek_type=0):
        """Seek to the specified position in the stream with seek_type.
        Unflushed bits will be discarded in the case of a seek.

        The stream will also keep track of which bytes have and have
        not been consumed so that the dom will capture all of the
        bytes in the stream.

        :pos: offset
        :seek_type: direction
        :returns: TODO
        """
        self._bits.clear()
        return self._stream.seek(pos, seek_type)

    def size(self):
        """Return the size of the stream, or -1 if it cannot be determined.
        """
        pos = self._stream.tell()
        # seek to the end of the stream
        self._stream.seek(0, 2)
        size = self._stream.tell()
        # Restore the original position.
        self._stream.seek(pos, 0)
        return size

    def unconsumed_ranges(self):
        """Return an IntervalTree of unconsumed ranges, of the format
        (start, end] with the end value not being included
        """
        res = IntervalTree()

        prev = None
        # normal iteration is not in a predictable order
        ranges = sorted([x for x in self.range_set], key=lambda x: x.begin)

        for rng in ranges:
            if prev is None:
                prev = rng
                continue
            # Gap between two consumed ranges is unconsumed.
            res.add(Interval(prev.end, rng.begin))
            prev = rng

        # means we've seeked past the end
        # NOTE(review): if range_set is empty, prev is still None here and
        # prev.end would raise AttributeError — confirm callers guarantee at
        # least one consumed range before calling this.
        if len(self.range_set[self.tell()]) != 1:
            res.add(Interval(prev.end, self.tell()))

        return res

    # -----------------------------
    # PRIVATE FUNCTIONS
    # -----------------------------

    def _update_consumed_ranges(self, start_pos, end_pos):
        """Update the ``self.consumed_ranges`` array with which
        byte ranges have been consecutively consumed.
        """
        # end_pos+1 makes adjacent reads overlap so merge_overlaps
        # coalesces them into a single consumed range.
        self.range_set.add(Interval(start_pos, end_pos + 1))
        self.range_set.merge_overlaps()

    def _flush_bits_to_stream(self):
        """Flush the bits to the stream. This is used when
        a few bits have been read and ``self._bits`` contains unconsumed/
        flushed bits when data is to be written to the stream
        """
        if len(self._bits) == 0:
            return 0

        bits = list(self._bits)

        # Pad up to the next byte boundary with zero bits.
        # NOTE(review): when len(bits) is already a multiple of 8, diff is 8
        # and a full extra zero byte is appended — confirm this is intended.
        diff = 8 - (len(bits) % 8)
        padding = [0] * diff

        bits = bits + padding

        self._stream.write(bits_to_bytes(bits))

        self._bits.clear()
nodes_to_process = True start = time.time() more_file = True merge_requested = False loop_count = 0 while nodes_to_process: loop_count += 1 if loop_count > MIN_MEM_NODE_COUNT: loop_count = 0 merge_requested = True #REFILL THE BUFFER if (g_person_node_count+g_wine_node_count) < MIN_MEM_NODE_COUNT: if merge_requested: pt.merge_overlaps() wt.merge_overlaps() merge_requested = False while (g_person_node_count+g_wine_node_count) < MAX_MEM_NODE_COUNT and more_file: line = f.readline() #read in line from input if line: add_line_to_graph(line) else: more_file = False # WINE SECTION wine_node_with_fewest_edges = None wine_node_with_fewest_edges_edge_count = FEWER_COMPARE wine_search_count = 0 for node in nx.dfs_postorder_nodes(fg, "r"): #dfs postorder is magic and should be worshiped. --Andy Weir