def create_triplex_superset_tree(full_file_path):
    """Write a ``.ss.tpx`` file that drops triplexes nested inside another one.

    The lines of the input are grouped per chromosome by
    ``utils.get_tpx_lines_by_chromosome``.  For every chromosome the TTS
    intervals are visited by ascending start (longest first on equal
    starts).  A line is written unless an interval already kept for the same
    chromosome overlaps both its start and its end position.

    :param full_file_path: path of the input ``.tpx`` file; the output path
        is obtained by replacing ``.tpx`` with ``.ss.tpx``.
    :return: None
    """
    print("Creating triplex superset TREE VERSION for file: " + full_file_path)
    content = utils.get_tpx_lines_by_chromosome(full_file_path)

    # ``with`` guarantees the output is flushed and closed even if a line is
    # malformed and parsing raises half-way through.
    with open(full_file_path.replace('.tpx', '.ss.tpx'), 'w') as output_file:
        output_file.write(get_first_line(full_file_path))

        for chrom in content:
            lines = content[chrom]
            print("Processing #" + str(len(lines)) +
                  " triplexes from chromosome " + chrom)

            candidates = set()
            for index, line in enumerate(lines):
                cols = line.split('\t')
                (tfo_chr, _) = cols[0].split(':')
                (tts_chr, tts_start_offset) = cols[3].split(':')
                assert (tfo_chr == tts_chr)
                tts_start_offset = int(tts_start_offset)
                candidates.add((int(cols[4]) + tts_start_offset,
                                int(cols[5]) + tts_start_offset,
                                index))

            if not candidates:
                # nothing to emit for this chromosome
                continue

            # Ascending start, then descending end.  A key function works on
            # both Python 2 and 3, unlike the ``cmp=`` keyword.
            ordered = sorted(candidates, key=lambda iv: (iv[0], -iv[1]))

            # One tree per chromosome: coordinates and line indices of
            # different chromosomes are unrelated and must not be compared.
            tree = intervaltree.IntervalTree()

            # The first interval can never be contained in an earlier one.
            first = ordered[0]
            tree.add(intervaltree.Interval(first[0], first[1], first[2]))
            output_file.write(lines[first[2]])

            for iv in ordered[1:]:
                overlaps_start = set(
                    x.data for x in tree.top_node.search_overlap([iv[0]]))
                # NOTE(review): intervaltree intervals are half-open, so a
                # kept interval with the same end point is not reported for
                # iv[1]; exact duplicates are therefore kept. Confirm intent.
                contained = any(
                    x.data in overlaps_start
                    for x in tree.top_node.search_overlap([iv[1]]))
                if not contained:
                    tree.add(intervaltree.Interval(iv[0], iv[1], iv[2]))
                    output_file.write(lines[iv[2]])
def lookup_symbol_interval(self, name, num):
    """Return the address interval of symbol ``name`` for substage ``num``.

    The symbol location comes from ``mmap_var_loc`` of the stage database.
    It is then shifted by the offset of each relocation of the substage
    whose range ``[rbegin, rbegin + rsize)`` fully contains the current
    (possibly already shifted) location.

    :param name: symbol name passed to ``mmap_var_loc``.
    :param num: substage number passed to ``reloc_names_in_substage``.
    :return: ``intervaltree.Interval`` of the resulting location.
    """
    (lo, hi) = db_info.get(self.stage).mmap_var_loc(name)
    relocs = db_info.get(self.stage).reloc_names_in_substage(num)
    location = intervaltree.Interval(lo, hi)
    reloc_rows = db_info.get(self.stage).reloc_info_by_cardinal(relocs)
    for (rname, rbegin, rsize, roffset) in reloc_rows:
        source_range = intervaltree.Interval(rbegin, rbegin + rsize)
        if not source_range.contains_interval(location):
            continue
        location = intervaltree.Interval(location.begin + roffset,
                                         location.end + roffset)
    return location
def divide_intervals(self, stages, table):
    """Group the address ranges of ``table`` into one merged tree per stage.

    Every row contributes ``[minaddr, maxaddr)`` to the tree of its
    ``substagenum``.  When ``self.interval_type`` equals ``self.CUMULATIVE``
    the range is also added to every later stage number up to
    ``len(stages) - 1``.  Overlapping ranges in each tree are merged before
    returning.

    :param stages: iterable of stage numbers used as keys of the result.
    :param table: object whose ``iterrows()`` yields rows indexable by
        ``"substagenum"``, ``"minaddr"`` and ``"maxaddr"``.
    :return: dict mapping each stage to an ``intervaltree.IntervalTree``.
    """
    divided_intervals = {i: intervaltree.IntervalTree() for i in stages}
    for r in table.iterrows():
        n = r["substagenum"]
        divided_intervals[n].add(
            intervaltree.Interval(r["minaddr"], r["maxaddr"]))
        if self.interval_type == self.CUMULATIVE:
            # propagate the range to all following stages
            for i in range(n + 1, len(stages)):
                divided_intervals[i].add(
                    intervaltree.Interval(r["minaddr"], r["maxaddr"]))
    # .values() works on Python 2 and 3; .itervalues() exists on Python 2 only
    for tree in divided_intervals.values():
        tree.merge_overlaps()
    return divided_intervals
def make_variant(seq, tree, max_indel=5, chrom='contig1'):
    """Draw a random variant that is clear of the intervals already in `tree`.

    Candidates are drawn repeatedly until one does not overlap `tree`; the
    accepted candidate's interval is added to `tree` and its trimmed form is
    returned.  A negative drawn length gives a deletion, a length above one
    gives an insertion, and lengths 0 and 1 give a single-base substitution.
    """
    while True:
        position = np.random.randint(max_indel + 1, len(seq) - max_indel - 1)
        size = np.random.randint(-max_indel, max_indel + 1)
        if size < 0:
            # deletion: reference spans the anchor base plus the deleted run
            ref = seq[position: position + -size + 1]
            alt = seq[position]
        elif size > 1:
            # insertion: anchor base followed by random bases
            ref = seq[position]
            alt = ref + ''.join(np.random.choice(bases, size))
        else:
            # single nucleotide substitution with a different base
            ref = seq[position]
            candidates = [b for b in bases if str(b) != str(ref)]
            alt = np.random.choice(candidates)

        quality = np.random.randint(1, 10)
        variant = medaka.vcf.Variant(chrom, position, ref=ref, alt=alt,
                                     qual=quality,
                                     genotype_data={'GT': '1/1'})
        trimmed = variant.trim()
        stop = trimmed.pos + len(trimmed.ref)
        # the interval starts at the untrimmed position and is padded by one
        span = intervaltree.Interval(variant.pos, stop + 1)
        if tree.overlaps(span):
            continue
        tree.add(span)
        return trimmed
def extract_data_page_slack(page):
    '''
    yield the regions of a data page not covered by the toc or its entries.

    Args:
      page (cim.DataPage): the page from which to extract slack space.

    Yields:
      SlackRegion: built from the page's logical page number, the region's
        start offset, and the bytes of `page.buf` for that region.
    '''
    # begin with the whole page considered slack, then carve out what is used
    unused = intervaltree.IntervalTree(
        [intervaltree.Interval(0, cim.DATA_PAGE_SIZE)])

    toc_length = len(page.toc)
    unused.chop(0, toc_length)

    # a non-empty toc is followed by one empty terminating entry that is not
    # part of the entry list but still belongs to the toc
    if toc_length > 0:
        unused.chop(toc_length, toc_length + 0x10)

    # carve out every entry's data
    for index in range(page.toc.count):
        entry = page.toc[index]
        unused.chop(entry.offset, entry.offset + entry.size)

    for begin, end, _ in sorted(unused):
        if (end - begin) <= cim.DATA_PAGE_SIZE:
            yield SlackRegion(page.logical_page_number, begin,
                              page.buf[begin:end])
def samples_to_bed(args):
    """Write a bed file covering the samples indexed in a datastore file.

    Reads the sample index from ``args.inputs``, collects one interval per
    sample keyed by reference name, merges overlapping or abutting
    intervals, and writes ``contig<TAB>begin<TAB>end`` lines to
    ``args.output``.
    """
    logger = medaka.common.get_named_logger('Variants')
    index = medaka.datastore.DataIndex(args.inputs)
    trees = collections.defaultdict(intervaltree.IntervalTree)

    logger.info("Building interval tree")
    for sample_name, _ in index.samples:
        decoded = medaka.common.Sample.decode_sample_name(sample_name)
        # start/end are strings holding floats (major.minor coordinates);
        # only the major part is kept
        first = int(float(decoded['start']))
        last = int(float(decoded['end']))
        # intervaltree and bed intervals exclude their end coordinate, while
        # a sample includes its last position, hence the +1
        trees[decoded['ref_name']].add(
            intervaltree.Interval(first, last + 1))

    with open(args.output, 'w') as fh:
        for contig, tree in trees.items():
            # strict=False also merges intervals that merely touch:
            # consecutive samples may share a major coordinate and differ
            # only in minor
            tree.merge_overlaps(strict=False)
            logger.info("Writing intervals for {}".format(contig))
            for region in sorted(tree.all_intervals):
                record = "{}\t{}\t{}\n".format(
                    contig, region.begin, region.end)
                fh.write(record)
    logger.info("All done, bed file written to {}".format(args.output))
def write_interval_info(self, hwname, pclo=None, pchi=None,
                        substage_names=[], substage_entries={}):
    """Return the write destinations recorded in the writes table of `hwname`.

    Two result shapes exist:

    * if ``"framac"`` occurs in `hwname`: a list of ``(destlo, desthi)``
      tuples for all rows whose ``writepc`` lies in ``[pclo, pchi)``;
    * otherwise: a dict ``{substage number: IntervalTree}``.  The table is
      read sorted by ``index``; the current substage number starts at 0 and
      advances when a row's ``pc`` falls inside the entry range of the next
      substage.  Each row adds ``[dest, dest + writesize)`` to the tree of
      the current substage, if that substage is listed in `substage_names`.

    :param hwname: name passed to ``self._get_writestable``.
    :param pclo: inclusive lower pc bound (frama-c branch only).
    :param pchi: exclusive upper pc bound (frama-c branch only).
    :param substage_names: substage numbers to collect intervals for.
    :param substage_entries: indexable by substage number, each item being
        the ``(lopc, hipc)`` entry range of that substage.
    :return: list of tuples or dict of interval trees, as described above.
    """
    # NOTE: the mutable defaults are only read here, never mutated; they are
    # kept so the signature stays exactly as callers know it.
    wt = self._get_writestable(hwname)
    if "framac" in hwname:
        # get_rows takes (table, query); the table argument used to be
        # missing here, so the query string was passed as the table.
        query = '(%d <= writepc) & (writepc < %d)' % (pclo, pchi)
        return [(r['destlo'], r['desthi'])
                for r in pytable_utils.get_rows(wt, query)]

    num = 0
    last_substage = len(substage_entries) - 1
    intervals = {n: intervaltree.IntervalTree() for n in substage_names}
    for r in wt.read_sorted('index'):
        pc = r['pc']
        if num < last_substage:
            # check if we found the entrypoint to the next stage
            (lopc, hipc) = substage_entries[num + 1]
            if (lopc <= pc) and (pc < hipc):
                num += 1
        if num in substage_names:
            start = r['dest']
            # the write size is looked up from the first row with this pc
            size = pytable_utils.get_rows(wt, 'pc == %d' % pc)[0]['writesize']
            intervals[num].add(intervaltree.Interval(start, start + size))
    return intervals
def update_writes(self, line, pc, lo, hi, stage, origpc=None, substage=None):
    """Record the write destination range ``[lo, hi)`` in the range table.

    When `pc` is falsy the source location is parsed from `line`
    (``"path:lineno"``); otherwise an empty path and line number 0 are
    stored.  A reversed range (``lo > hi``) is reported on stdout together
    with a stack trace, but is still recorded.

    :param line: ``"path:lineno"`` string, only parsed when `pc` is falsy.
    :param pc: program counter of the write, or a falsy value.
    :param lo: lower bound of the destination range.
    :param hi: upper bound of the destination range.
    :param stage: accepted for interface compatibility; not used here.
    :param origpc: original program counter; defaults to `pc` when falsy.
    :param substage: forwarded as ``substage_name`` of the result entry.
    """
    if not pc:
        (path, lineno) = line.split(':')
        lineno = int(lineno)
    else:
        (path, lineno) = ('', 0)
    if not origpc:
        origpc = pc
    w = WriteDstResult(path, lineno, '',
                       [intervaltree.Interval(lo, hi)],
                       pc, origpc, substage_name=substage)
    if lo > hi:
        # pc can be None when the write is identified by source line only;
        # "%x" would then raise inside what is just a diagnostic
        try:
            where = "%x" % pc
        except TypeError:
            where = str(pc)
        # single-argument print(...) is valid on both Python 2 and 3
        print("%x > %x at %s" % (lo, hi, where))
        traceback.print_stack()
    self.writerangetable.add_dsts_entry(w)
def from_line(cls, line, stage):
    """Build an instance from one line of frama-c destination output.

    The stripped line is matched against ``cls.regexp``.  Groups 1 and 2 are
    the minimum and maximum destination values (the maximum is capped at
    0xFFFFFFFF), group 3 the lvalue, group 4 the source path, group 5 the
    line number and group 6 the call stack.  Paths under the target software
    root, the test suite path, or ``/tmp/tmp*`` are re-rooted under
    ``Main.raw.runtime.temp_target_src_dir``.

    :param line: raw text line to parse.
    :param stage: forwarded unchanged to the constructor.
    :return: a new ``cls`` instance.
    :raises Exception: if the line does not match ``cls.regexp``.
    """
    res = cls.regexp.match(line.strip())
    if res is None:
        # the offending line is now interpolated; the "%s" placeholder used
        # to be emitted literally
        raise Exception("%s is not a framac dst result" % line)

    min_value = int(res.group(1), 0)
    max_value = int(res.group(2), 0)
    if max_value > 0xFFFFFFFF:
        max_value = 0xFFFFFFFF
    lvalue = res.group(3)
    path = res.group(4)

    # somewhat of a hack to get relative path
    root = Main.get_target_cfg().software_cfg.root
    if path.startswith(root):
        path = os.path.relpath(path, root)
        path = os.path.join(Main.raw.runtime.temp_target_src_dir, path)
    elif path.startswith(Main.test_suite_path):  # not sure why this happens
        path = os.path.relpath(path, Main.test_suite_path)
        path = os.path.join(Main.raw.runtime.temp_target_src_dir, path)
    elif path.startswith("/tmp/tmp"):
        # drop the leading "/tmp/<tmpdir>/" components
        path = "/".join(path.split("/")[3:])
        path = os.path.join(Main.raw.runtime.temp_target_src_dir, path)

    lineno = int(res.group(5))
    callstack = res.group(6)
    # somewhat of a hack for muxconf
    return cls(path, lineno, lvalue,
               [intervaltree.Interval(min_value, max_value)],
               callstack=callstack, stage=stage)
def load_bed(self, instream):
    """Replace ``self.trees`` with interval trees read from BED-like lines.

    Each line is split on tabs; the first three fields are the chromosome,
    the start and the end.  One ``IntervalTree`` per chromosome is stored in
    ``self.trees`` and receives ``Interval(int(start), int(end))``.

    :param instream: iterable of tab-separated text lines.
    """
    self.trees = {}
    for record in instream:
        fields = record.split('\t', 3)
        chrom, start0, end1 = fields[:3]
        try:
            tree = self.trees[chrom]
        except KeyError:
            tree = self.trees[chrom] = intervaltree.IntervalTree()
        tree.add(intervaltree.Interval(int(start0), int(end1)))
def _build_semester_lookup():
    """
    Build an interval tree mapping date ranges to semester names.

    Each semester contributes one interval running from its start date to
    its end date plus ONE_DAY, carrying the semester's name as data.
    """
    entries = []
    for semester in Semester.objects.all():
        label = semester.name
        first_day, last_day = Semester.start_end_dates(semester)
        # the tree's upper bound is exclusive, so extend it past the end date
        entries.append(
            intervaltree.Interval(first_day, last_day + ONE_DAY, label))
    return intervaltree.IntervalTree(entries)
def _add_addr_range(self, start, end):
    """Add the address range ``[start, end)`` to ``self.addresses``.

    :param start: range start; must be exactly of type ``int``.
    :param end: range end; must be exactly of type ``int`` and not smaller
        than `start`.
    :raises Exception: if either bound is not an ``int`` or `end` is
        smaller than `start`.
    """
    both_ints = type(start) == int and type(end) == int
    if not both_ints:
        raise Exception("One of start addr (%s) end addr (%s) is not an int for %s" %
                        (start, end, self.short_name))
    if end < start:
        raise Exception("start addr %x must be smaller than end address %x for %s" %
                        (start, end, self.short_name))
    self.addresses.add(intervaltree.Interval(start, end))
def intervaltrees_from_bed(path_to_bed):
    """Build one interval tree per chromosome from a .bed file.

    :param path_to_bed: str, path to .bed file.
    :returns: { str chrom: `intervaltree.IntervalTree` obj }.
    """
    by_chrom = defaultdict(intervaltree.IntervalTree)
    for record in yield_from_bed(path_to_bed):
        chrom, start, stop = record
        region = intervaltree.Interval(begin=start, end=stop)
        by_chrom[chrom].add(region)
    return by_chrom
def setup(self): if not self.attr_exists("setup_done"): self.setup_done = True self.supported_bootloader_cfgs = {} self.hardware_type_cfgs = {} for b in self.supported_bootloaders: self.supported_bootloader_cfgs[b] = self.object_config_lookup( "Bootloader", b) for h in self.types: self.hardware_type_cfgs[h] = self.object_config_lookup( "HardwareConfig", h) self.base_mem_map = os.path.join(Main.hw_info_path, self.name, self.base_mem_map) self.sdskeleton = os.path.join(Main.hw_info_path, self.name, self.sdskeleton) self.tech_reference = os.path.join(Main.hw_info_path, self.name, self.tech_reference) if self.attr_exists("phy_addr_range"): lo = self.phy_addr_range['loaddr'] hi = self.phy_addr_range['hiaddr'] self.phy_addr_range = intervaltree.IntervalTree( [intervaltree.Interval(lo, hi)]) range_intervals = [] if self.attr_exists("ram_ranges"): self.ram_range_names = [r['name'] for r in self.ram_ranges] for r in self.ram_ranges: lo = r['loaddr'] hi = r['hiaddr'] inter = intervaltree.Interval(lo, hi) range_intervals.append(inter) setattr(self, r['name'], inter) else: self.ram_range_names = [] self.ram_ranges = intervaltree.IntervalTree(range_intervals) self.ram_ranges.merge_overlaps() self.non_ram_ranges = self.phy_addr_range for r in self.ram_ranges: self.non_ram_ranges.chop(r.begin, r.end) self.non_ram_ranges.remove_overlap(r)
def intervaltree_from_bed(path_to_bed, chrom):
    """Build an interval tree from the records of one chromosome in a .bed file.

    :param path_to_bed: str, path to .bed file.
    :param chrom: str, chromosome name; records of other chromosomes are
        skipped.
    :returns: `intervaltree.IntervalTree` obj.
    """
    tree = intervaltree.IntervalTree()
    for feature in pybedtools.BedTool(path_to_bed):
        if feature.chrom == chrom:
            tree.add(intervaltree.Interval(begin=feature.start,
                                           end=feature.stop))
    return tree
def sort_intervals(intervals):
    """Count, for each integer 0..23, how many input intervals cover it.

    Equal intervals are counted first, then stored once in an interval tree
    as ``[first, second + 1)`` with their multiplicity as data (so the
    second element is treated as inclusive).

    :param intervals: iterable of hashable sequences whose first two items
        are the interval bounds.
    :return: list of 24 sums, one per point 0..23.
    """
    occurrences = defaultdict(int)
    for span in intervals:
        occurrences[span] += 1

    tree = intervaltree.IntervalTree()
    for span, count in occurrences.items():
        tree.add(intervaltree.Interval(span[0], span[1] + 1, count))

    return [sum(hit[2] for hit in tree.at(point)) for point in range(24)]
def fillGaps(tads, N, counter):
    """Return a copy of `tads` with every uncovered part of [0, N) added.

    Works on a single chromosome, treated as the interval [0, N].  Each gap
    is added with the current `counter` value as its data, and `counter` is
    incremented once per gap.

    :param tads: interval tree of existing intervals (left unmodified).
    :param N: chromosome length.
    :param counter: data value for the first gap.
    :return: tuple ``(counter after the last gap, filled tree)``.
    """
    # start from the whole chromosome and cut out everything that is covered
    uncovered = itr.IntervalTree([itr.Interval(0, N, 1)])
    for tad in tads:
        uncovered.chop(tad.begin, tad.end)

    filled = tads.copy()
    for begin, end, _ in uncovered:
        filled.addi(begin, end, counter)
        counter += 1
    return counter, filled
def region_interval_trees(regions):
    """Build one interval tree per chromosome from `regions`.

    Each region becomes ``Interval(region.start - 1, region.end)`` with the
    region itself attached as data.

    :param regions: iterable of objects with ``chromosome``, ``start`` and
        ``end`` attributes.
    :return: dict mapping chromosome to ``intervaltree.IntervalTree``.
    """
    per_chromosome = defaultdict(list)
    for region in regions:
        # NOTE(review): the -1 looks like a 1-based inclusive to 0-based
        # half-open conversion - confirm against the region type.
        entry = intervaltree.Interval(region.start - 1, region.end,
                                      data=region)
        per_chromosome[region.chromosome].append(entry)

    return {chromosome: intervaltree.IntervalTree(entries)
            for chromosome, entries in per_chromosome.items()}
def _interval_tree_regions(self, regions):
    """Build per-chromosome interval trees holding region indices.

    Each region becomes ``Interval(region.start - 1, region.end)`` whose
    data is the region's position in `regions`.

    :param regions: iterable of objects with ``chromosome``, ``start`` and
        ``end`` attributes.
    :return: dict mapping chromosome to ``intervaltree.IntervalTree``.
    """
    grouped = defaultdict(list)
    for position, region in enumerate(regions):
        grouped[region.chromosome].append(
            intervaltree.Interval(region.start - 1, region.end,
                                  data=position))

    interval_trees = {}
    for chromosome, entries in grouped.items():
        interval_trees[chromosome] = intervaltree.IntervalTree(entries)
    return interval_trees
def complement_intervaltrees(trees, contig_lengths):
    """Return, per contig, the parts of ``[0, length)`` not covered by `trees`.

    :param trees: {str contig: `intervaltree.IntervalTree` objs}
    :param contig_lengths: {str contig: int contig length}
    :returns: {str contig: `intervaltree.IntervalTree` objs}

    """
    complement = collections.defaultdict(intervaltree.IntervalTree)
    # every known contig starts out fully uncovered
    for contig, length in contig_lengths.items():
        whole_contig = intervaltree.Interval(0, length)
        complement[contig].add(whole_contig)
    # then each input interval is cut out of its contig
    for contig, tree in trees.items():
        for covered in tree:
            complement[contig].chop(covered.begin, covered.end)
    return complement
def _relocated(regname, fullname):
    """Return the relocated address interval of region `regname`.

    `fullname` is expected to end in ``.<cardinal>_relocated``.  When the
    region's addresses are resolved, the span from its lowest begin to its
    highest end is shifted by the relocation offset and wrapped by the
    relocation modulus; otherwise the fallback value ``s`` is returned.

    ``allregions``, ``s`` and ``self`` are taken from the enclosing scope,
    which is not visible in this block.

    :param regname: key into ``allregions``.
    :param fullname: dotted name whose last component identifies the
        relocation.
    :raises AttributeError: if the last component of `fullname` does not
        start with digits followed by ``_relocated``.
    """
    r = allregions[regname]
    ret = s
    rend = fullname.rsplit(".", 1)[1]
    # raw strings: "\d" in a plain string literal is an invalid escape
    # sequence and triggers a warning on current Python versions
    cardinalres = re.match(r"([\d])+_relocated", rend)
    # Unused below, but .group() fails loudly when the name does not match.
    # NOTE(review): group(1) captures only the last digit - confirm intent.
    cardinal = cardinalres.group(1)
    # NOTE(review): this is a character class, so it strips every '.', '_'
    # and letter of "relocated", not the literal suffix.
    relocindex = re.sub(r"[._relocated]+", "", rend)
    if r.addresses_resolved:
        start = min(r.addresses).begin
        end = max(r.addresses).end
        (offset, mod) = db_info.get(
            self.stage).reloc_offset_and_mod_from_cardinal(relocindex)
        ret = intervaltree.Interval((start + offset) % mod,
                                    (end + offset) % mod)
    return ret
def update(self, labels):
    """
    Add several labels to this label list at once.

    Every label gets its ``label_list`` set to this object and is inserted
    into ``self.label_tree`` as an interval from ``label.start`` to
    ``label.end`` carrying the label as data.

    Args:
        labels (list): Labels to add.
    """
    def attach(label):
        # claim the label, then wrap it for the tree
        label.label_list = self
        return intervaltree.Interval(label.start, label.end, label)

    self.label_tree.update([attach(label) for label in labels])
def setup(self): if not self.attr_exists("setup_done"): self.populate_path_from_name("hw_info_path", or_default=True, do_setattr=True) self._check_path("hw_info_path") self._update_raw("hw_info_path", self.hw_info_path) self.setup_done = True self.host_cfgs = {} for h in self.hosts: self.host_cfgs[h] = self.object_config_lookup("HostConfig", h) if not self.attr_exists("addr_range"): self.addr_range = Main.default_raw.HardwareClass._hw.addr_range lo = long(self.addr_range[0]) hi = long(self.addr_range[1]) self.addr_range = intervaltree.IntervalTree( [intervaltree.Interval(lo, hi)]) range_intervals = [] self.addr_range_names = [] if self.attr_exists("named_addr_ranges"): for (k, [lo, hi]) in self.named_addr_ranges.iteritems(): self.addr_range_names.append(k) inter = intervaltree.Interval(long(lo), long(hi)) range_intervals.append(inter) setattr(self, k, inter) self._update_raw(k, inter) self.ram_ranges = intervaltree.IntervalTree(range_intervals) self.ram_ranges.merge_overlaps() self.non_ram_ranges = self.addr_range for r in self.ram_ranges: self.non_ram_ranges.chop(r.begin, r.end) self.non_ram_ranges.remove_overlap(r) if not hasattr(self, "default_host"): self.default_host = self.hosts[0]
def calculate_framac_intervals(self, substages):
    """Collect, per substage, the write intervals of the functions traced in it.

    For every row of ``self.trace_intervals_table`` belonging to a substage,
    the function's pc range is obtained from ``self.fun_info`` and the
    ``(lo, hi)`` pairs returned by ``write_interval_info`` for that range
    are added to the substage's tree.

    :param substages: iterable of substage numbers.
    :return: dict mapping substage number to ``intervaltree.IntervalTree``.
    """
    intervals = {n: intervaltree.IntervalTree() for n in substages}
    for num in substages:
        rows = pytable_utils.get_rows(self.trace_intervals_table,
                                      'substagenum == %s' % num)
        for row in rows:
            # lookup writes performed by this function
            (lopc, hipc) = self.fun_info(row['functionname'])
            # NOTE(review): `tracename` is not defined in this function; it
            # must come from an enclosing/module scope - verify.
            res = db_info.get(self.stage).write_interval_info(
                tracename, lopc, hipc)
            written = [intervaltree.Interval(pair[0], pair[1])
                       for pair in res]
            intervals[num] += intervaltree.IntervalTree(written)
    return intervals
def allowed_writes(self, substage):
    """Return the merged address intervals writable in `substage`.

    Rows of ``self.substage_region_policy_table`` that are writable in the
    substage are visited.  A row with ``allowed_symbol`` set contributes the
    interval of its ELF symbol; any other row contributes every
    ``[startaddr, endaddr)`` found for its ``short_name`` in
    ``self.substage_mmap_addr_table``.

    :param substage: substage number.
    :return: ``intervaltree.IntervalTree`` with overlaps and equal
        intervals merged.
    """
    policy_query = "(substagenum == %d) & (writable == True)" % (substage)
    policy_rows = self.substage_region_policy_table.where(policy_query)
    writable = intervaltree.IntervalTree()
    for region in policy_rows:
        if not region['allowed_symbol']:
            mmap_query = 'short_name == "%s"' % region['short_name']
            for row in self.substage_mmap_addr_table.where(mmap_query):
                writable.add(intervaltree.Interval(row['startaddr'],
                                                   row['endaddr']))
        else:
            symbol = region['symbol_elf_name']
            writable.add(self.lookup_symbol_interval(symbol, substage))
    writable.merge_overlaps()
    writable.merge_equals()
    return writable
def make_interval_trees(graph):
    """Build, per source vertex, an interval tree of its outgoing edges.

    Each edge is stored under its ``'v'`` vertex as the unit interval
    ``[v_start, v_start + 1)`` with the edge itself as data.

    :param graph: object whose ``edges`` mapping holds edge dicts with the
        keys ``'v'``, ``'w'`` and ``'v_start'``.
    :return: dict mapping vertex to ``intervaltree.IntervalTree``.
    """
    outgoing = {}
    incoming = {}  # grouped by target vertex as well, though not used below
    for _name, edge in graph.edges.items():
        outgoing.setdefault(edge['v'], []).append(edge)
        incoming.setdefault(edge['w'], []).append(edge)

    unit_intervals = {
        vertex: [intervaltree.Interval(e['v_start'], e['v_start'] + 1, e)
                 for e in edges]
        for vertex, edges in outgoing.items()
    }
    return {vertex: intervaltree.IntervalTree(entries)
            for vertex, entries in unit_intervals.items()}
def resolve_addresses(self, all_regions={}, values={}):
    """Resolve this region's addresses and record whether that succeeded.

    Nothing happens when ``self.addresses_resolved`` is already ``True``.
    Otherwise one of three sources is used:

    * ``self._csv``: the register table is parsed; every row with an
      address adds ``[addr, addr + size)`` to ``self.addresses`` and
      registers a child subregion (``size`` is the row's width divided by
      8, or 4 when no width is given);
    * ``self._raw_addresses`` as a list (or list of lists): delegated to
      ``self._resolve_addr_region``;
    * anything else: delegated to ``self._resolve_special_addr_region``.

    The outcome is stored in ``self.addresses_resolved``.

    :param all_regions: forwarded to the resolver helpers.
    :param values: forwarded to the resolver helpers.
    """
    # NOTE: the mutable defaults are kept so the signature stays unchanged;
    # this function only forwards them and never mutates them itself.
    all_resolved = True
    if self.addresses_resolved is True:
        return
    elif self._csv:
        (f, parsed) = parse_am37x_register_tables.parsecsv(self._csv)
        try:
            addrs = intervaltree.IntervalTree()
            for p in parsed:
                addr = p[parse_am37x_register_tables.TITable.ADDRESS]
                if not addr:
                    continue
                addr = int(addr, 0)
                wid = p[parse_am37x_register_tables.TITable.WIDTH]
                name = p[parse_am37x_register_tables.TITable.NAME]
                # create a unique name without spaces
                name = re.sub(r"[\s]", "", name) + (".%x" % addr)
                # floor division keeps the size an int on Python 3 as well
                size = int(wid) // 8 if wid else 4
                i = intervaltree.Interval(addr, addr + size)
                addrs.add(i)
                raw_perms = p[parse_am37x_register_tables.TITable.TYPE].lower()
                perms = "readonly" if 'w' not in raw_perms else 'global'
                self._raw_subregions[name] = {
                    'addresses': [i.begin, i.end],
                    'include_children': False,
                    'type': perms,
                }
                self.children_names.append("%s.%s" % (self.short_name, name))
            self.addresses = addrs
        finally:
            # release the file handle even when a row fails to parse
            f.close()
        all_resolved = True
    elif (type(self._raw_addresses) == list):
        if type(self._raw_addresses[0]) == list:
            # its a list of lists of subregions
            for a in self._raw_addresses:
                all_resolved = all_resolved and self._resolve_addr_region(
                    a, all_regions, values)
        else:
            all_resolved = self._resolve_addr_region(
                self._raw_addresses, all_regions, values)
    else:
        all_resolved = self._resolve_special_addr_region(
            self._raw_addresses, all_regions, values)
    self.addresses_resolved = all_resolved
def __init__(self, interval_tuples):
    '''Index gene intervals per chromosome.

    interval_tuples is like [('22', 12321, 12345, 'APOL1'), ...].  For each
    chromosome an interval tree of the genes is kept, plus one BisectFinder
    over (start, gene) pairs and one over (end, gene) pairs.
    '''
    self._its = {}
    self._gene_starts = {}
    self._gene_ends = {}
    for chrom, pos_start, pos_end, gene_name in interval_tuples:
        assert isinstance(pos_start, int)
        assert isinstance(pos_end, int)
        if chrom not in self._its:
            # first gene seen on this chromosome
            self._its[chrom] = intervaltree.IntervalTree()
            self._gene_starts[chrom] = []
            self._gene_ends[chrom] = []
        gene = intervaltree.Interval(pos_start, pos_end, gene_name)
        self._its[chrom].add(gene)
        self._gene_starts[chrom].append((pos_start, gene_name))
        self._gene_ends[chrom].append((pos_end, gene_name))
    # replace the plain lists by their finder wrappers
    for chrom in self._its:
        starts = self._gene_starts[chrom]
        self._gene_starts[chrom] = BisectFinder(starts)
        ends = self._gene_ends[chrom]
        self._gene_ends[chrom] = BisectFinder(ends)
def gff2trees(gfffile, feature='CDS'):
    """Build per-chromosome interval trees of merged features from a GFF file.

    Each record yielded by ``cds_from_gff`` is merged with every interval it
    overlaps in its chromosome's tree: the covered span is removed and
    replaced by a single interval from the lowest begin to the highest end,
    carrying the record's running index as data.

    :param gfffile: passed to ``cds_from_gff``.
    :param feature: feature type passed to ``cds_from_gff``.
    :return: dict mapping chromosome to ``intervaltree.IntervalTree``.
    """
    trees = {}
    for serial, cds in enumerate(cds_from_gff(gfffile, feature=feature)):
        contig = cds['chr']
        if contig not in trees:
            trees[contig] = intervaltree.IntervalTree()
        tree = trees[contig]

        # widen the new interval to swallow anything it overlaps
        overlapping = tree[cds['begin']:cds['end']]
        lowest = min([cds['begin']] + [hit[0] for hit in overlapping])
        highest = max([cds['end']] + [hit[1] for hit in overlapping])

        # clear the span, then insert the merged interval
        tree.chop(lowest, highest)
        tree.add(intervaltree.Interval(lowest, highest, serial))
    return trees
def __init__(self, data, row_sentence_bounds, window=5, process_all=False):
    """
    Manage windowed input data (like TIMIT).

    :param data: Numpy matrix. Each row should be an example data
    :param row_sentence_bounds: Numpy matrix with bounds for padding.
        Each row is stored as the interval [first, second + 1).
        TODO add default NONE
    :param window: half-window size
    :param process_all: (default False) if True adds context to all data
        at object initialization. Otherwise the windowed data is created
        in runtime.
    """
    self.window = window
    self.data = data

    # a windowed row holds 2 * window + 1 concatenated frames
    original_shape = self.data.shape
    frames_per_row = 2 * self.window + 1
    self.shape = (original_shape[0], frames_per_row * original_shape[1])

    sentence_spans = [it.Interval(int(bounds[0]), int(bounds[1]) + 1)
                      for bounds in row_sentence_bounds]
    self.tree = it.IntervalTree(sentence_spans)

    if process_all:
        print('adding context to all the dataset', end='- ')
        self.data = self.generate_all()
        print('DONE')
    # set only after the optional generate_all() call, as before
    self.process_all = process_all