def _arrange_genes(gene_data_list):
    """Assign genes to non-overlapping display levels.

    Genes are placed longest-first; each gene lands on the first level
    where it does not overlap any previously placed gene, with new
    levels created on demand.

    Returns a list (one entry per level) of lists of gene IDs.
    """
    ordered = sorted(gene_data_list,
                     key=lambda g: g['end'] - g['start'],
                     reverse=True)
    levels = [intervaltree.IntervalTree()]
    for gene in ordered:
        idx = 0
        # Walk down until a level has room, growing the list as needed;
        # a freshly appended empty tree never overlaps, ending the loop.
        while levels[idx].overlaps(gene['start'], gene['end']):
            idx += 1
            if idx == len(levels):
                levels.append(intervaltree.IntervalTree())
        levels[idx].addi(gene['start'], gene['end'], data=gene)
    return [[iv.data['ID'] for iv in level] for level in levels]
def consolidate_write_table(self, framac=False):
    """Rebuild the consolidated write-range table from the raw write table.

    Rows sharing the same sort key (source 'line' for frama-c data,
    'writepc' otherwise) have their destination ranges accumulated into
    one interval tree and flushed as a single consolidated entry per
    substage.

    :param framac: if True, group rows by 'line' instead of 'writepc'.
    """
    # Only purge when some consolidated table already holds rows.
    populated = False
    for t in self.writerangetable_consolidated.tables.itervalues():  # Python 2 API
        if t.nrows > 0:
            populated = True
            break
    if populated:
        self.writerangetable_consolidated.purge()
    last = None
    sortindex = 'line' if framac else 'writepc'
    intervals = intervaltree.IntervalTree()
    r = None
    substagenums = substage.SubstagesInfo.substage_numbers(self.stage)
    writepc = None
    line = None
    lvalue = None
    dst_not_in_ram = True
    for n in substagenums:
        if n not in self.writerangetable_consolidated.tables.keys():
            self.writerangetable_consolidated._init_table(n)
        if n > 0:  # add last interval
            self._add_intervals_to_table(
                self.writerangetable_consolidated.tables[n], intervals,
                writepc, line, lvalue, dst_not_in_ram, n)
            # Reset per-substage accumulation state.
            last = None
            lvalue = None
            dst_not_in_ram = True
            writepc = None
            line = None
            count = 0
            intervals = intervaltree.IntervalTree()  # clear intervals
        print "writerange[%s] %s" % (n, self.writerangetable.tables[n].nrows)
        for r in self.writerangetable.tables[n].read_sorted(sortindex):
            # NOTE(review): 'count' is first assigned inside the n > 0
            # branch above; if substage numbering starts at 0 this
            # increment would hit an unbound name — confirm.
            count += 1
            if not last == r[sortindex]:
                # Sort key changed: flush the accumulated group.
                if last is not None:
                    self._add_intervals_to_table(
                        self.writerangetable_consolidated.tables[n], intervals,
                        writepc, line, lvalue, dst_not_in_ram, n)
                    intervals = intervaltree.IntervalTree()  # clear intervals
                writepc = r['writepc']
                line = r['line']
                lvalue = r['lvalue']
                dst_not_in_ram = r['dst_not_in_ram']
                last = r[sortindex]
            if last is None:
                last = r[sortindex]
            # Consolidated row is only "not in RAM" if every member row is.
            dst_not_in_ram = dst_not_in_ram and r['dst_not_in_ram']
            intervals.addi(r['dstlo'], r['dsthi'])
        if intervals:
            # Flush the trailing group of this substage.
            self._add_intervals_to_table(
                self.writerangetable_consolidated.tables[n], intervals,
                writepc, line, lvalue, dst_not_in_ram, n)
    self.writerangetable_consolidated.flush_table()
    for n in substagenums:
        print "write range consolidated "\
            "stage %s nrows %s" % (n, self.writerangetable_consolidated.tables[n].nrows)
def get_srs_tree(srs_tuple_list):
    """Build an interval tree mapping state-run inversion sizes to smooth factors.

    :param srs_tuple_list: list of (inversion_size_limit, smooth_factor)
        tuples, or None/empty to use the default factor for all sizes.
    :return: intervaltree.IntervalTree covering [0, inf).
    :raises RuntimeError: on malformed tuples, out-of-range values, or
        duplicate limits.
    """
    # Use default value for all lengths by default
    if srs_tuple_list is None or len(srs_tuple_list) == 0:
        srs_tree = intervaltree.IntervalTree()  # State-run-smooth
        srs_tree[0:np.inf] = DEFAULT_STATE_RUN_SMOOTH
        return srs_tree
    # Check and sort
    for srs_element in srs_tuple_list:
        if len(srs_element) != 2:
            # BUGFIX: the original concatenated str + tuple, which raised
            # TypeError instead of the intended RuntimeError; format instead.
            raise RuntimeError(
                'Element in "state run smooth" tuple list that is not length 2: '
                '{}'.format(srs_element))
    srs_tuple_list = sorted(srs_tuple_list)
    # Create tree
    srs_tree = intervaltree.IntervalTree()  # State-run-smooth
    # Get first limit
    last_inv_lim, last_smooth_factor = srs_tuple_list[0]
    last_inv_lim = int(last_inv_lim)
    last_smooth_factor = int(last_smooth_factor)
    if last_inv_lim < 0:
        raise RuntimeError('State run inversion size limits must be 0 or greater: {}'.format(last_inv_lim))
    # NOTE(review): the first factor is only required to be >= 4 while later
    # factors must be >= 20 — confirm this asymmetry is intentional.
    if last_smooth_factor < 4:
        raise RuntimeError('Not tested with "state run smooth" factor less than 4: {}'.format(last_smooth_factor))
    # Add first limit from 0
    if last_inv_lim > 0:
        srs_tree[0:last_inv_lim] = np.min([last_inv_lim, 20])  # 20 or last_inv_lim, whichever is smaller
    # Process remaining intervals
    for inv_lim, smooth_factor in srs_tuple_list[1:]:
        inv_lim = int(inv_lim)
        smooth_factor = int(smooth_factor)
        # Check
        if smooth_factor < 20:
            raise RuntimeError('Not tested with "state run smooth" factor less than 20: {}'.format(smooth_factor))
        if inv_lim == last_inv_lim:
            raise RuntimeError('Duplicate limit in state run limits: {}'.format(inv_lim))
        # Add to tree
        srs_tree[last_inv_lim:inv_lim] = last_smooth_factor
        # Advance last_inv_lim and last_smooth_factor
        last_inv_lim = inv_lim
        last_smooth_factor = smooth_factor
    # Add sample to infinity
    srs_tree[last_inv_lim:np.inf] = last_smooth_factor
    # Return tree
    return srs_tree
def __init__(self, file=None):
    """Create an empty container, optionally parsing *file* immediately.

    :param file: optional source passed straight through to self.parse().
    """
    self.midi = None  # parsed MIDI object; populated by parse()
    self.notes = list()  # flat list of note records
    self.metas = intervaltree.IntervalTree()  # indexed by second intervals
    self.timeline = intervaltree.IntervalTree(
    )  # indexed by tick intervals
    # Populated/consumed during parse(); exact semantics not visible here.
    self.pending_notes = dict()
    if file:
        self.parse(file)
def removeUnmappable(tads1, tads2):
    """Chop every unmappable region out of both TAD trees.

    Unmappable regions are marked with negative ids; a region flagged in
    either input is removed from both, so the returned trees cover the
    same mappable coordinates.
    """
    # Union of unmappable intervals from both inputs.
    unmappable = itr.IntervalTree(
        [t for t in tads1 if t.data < 0]) | itr.IntervalTree(
        [t for t in tads2 if t.data < 0])
    # Normalize so identical spans collapse into single intervals.
    unmappable.split_overlaps()
    unmappable.merge_equals()
    for region in unmappable:
        for tree in (tads1, tads2):
            tree.chop(region.begin, region.end)
    return tads1, tads2
def generate_interval_tree(TF_file, Json_output_file):
    """Build per-TF, per-chromosome interval trees from the cluster TF bed file.

    NOTE(review): the parameters TF_file / Json_output_file are unused; the
    body reads the module-level names cluster_TF_file and JSON_dict_file and
    fills the module-level master_TF_dict_return — confirm this is intended.

    :return: master_TF_dict_return — dict keyed by TF name; each value maps
        chromosome -> IntervalTree and carries bookkeeping counters/lists.
    """
    with open(cluster_TF_file, "r") as file:
        with open(JSON_dict_file, 'w') as outfile:
            for line in file:
                splitted = line.strip("\n").split("\t", 4)
                chrom, start, end, TF = (splitted[0], splitted[1],
                                         splitted[2], splitted[3])
                cell_line = splitted[4].split()[1]
                line_info = [chrom, str(start), str(end), str(TF)]
                # Generate an interval tree to implement binary search tree
                # algorithm: data structure is IntervalTree, with intervals
                # as its elements (analogous to elements of a list/dict,
                # characters of a string), inserted as my_tree[start:end].
                # BUGFIX: IntervalTree has no appendi() method (it raised
                # AttributeError); the insertion method is addi(begin, end, data).
                if TF in master_TF_dict_return.keys():
                    if chrom in master_TF_dict_return[TF].keys():
                        master_TF_dict_return[TF][chrom].addi(
                            int(start), int(end), "\t".join(line_info))
                    else:
                        master_TF_dict_return[TF][
                            chrom] = intervaltree.IntervalTree()
                        master_TF_dict_return[TF][chrom].addi(
                            int(start), int(end), "\t".join(line_info))
                else:
                    master_TF_dict_return[TF] = {
                        chrom: intervaltree.IntervalTree()
                    }
                    master_TF_dict_return[TF][chrom].addi(
                        int(start), int(end), "\t".join(line_info))
                    # First sighting of this TF: initialize bookkeeping.
                    master_TF_dict_return[TF].update(
                        {"significant_unique_dmr_hits": 0})
                    master_TF_dict_return[TF].update(
                        {"background_unique_dmr_hits": 0})
                    master_TF_dict_return[TF].update(
                        {"TotalTF_coveredBy_sig_unique_dmr_hits": 0})
                    master_TF_dict_return[TF].update(
                        {"TotalTF_coveredBy_bg_unique_dmr_hits": 0})
                    master_TF_dict_return[TF].update(
                        {"custom_overlap_list_sig": []})
                    master_TF_dict_return[TF].update(
                        {"custom_overlap_list_bg": []})
                    master_TF_dict_return[TF].update(
                        {"As_overlap_list_sig": []})
                    master_TF_dict_return[TF].update(
                        {"As_overlap_list_bg": []})
                    master_TF_dict_return[TF].update(
                        {"Bs_overlap_list_sig": []})
                    master_TF_dict_return[TF].update(
                        {"Bs_overlap_list_bg": []})
    # json.dump(master_TF_dict_return, outfile)
    return (master_TF_dict_return)
def __call__(self, gap):
    """Score a whitespace gap using the surrounding separator geometry.

    Separators crossing the gap's axis ("flow") raise the score;
    separators running along it ("obstacles") lower it. Gaps thinner
    than half a unit in either direction score 0.
    """
    if gap.du < 0.5 or gap.dv < 0.5:
        return 0
    k = 5  # outset margin applied to the gap bounds for the query box
    box = shapely.geometry.box(*outset_bounds(gap.bounds, k))
    flow = intervaltree.IntervalTree()  # cross-axis separator coverage
    obst = intervaltree.IntervalTree()  # same-axis separator coverage
    flow_widths = []
    flow_width_weights = []
    for sep in self._separators.query(box):
        intersection = sep.intersection(box)
        if intersection is None or intersection.is_empty:
            continue
        label = self._label(sep.name)
        sep_dir = self._direction[label]
        for segment in extract_segments(intersection):
            minx, miny, maxx, maxy = segment.bounds
            smin = (minx, miny)
            smax = (maxx, maxy)
            if sep_dir == gap.axis:
                # Separator parallel to the gap: obstacle along the u axis.
                uax = gap.axis
                obst.addi(smin[uax], smax[uax] + 1, True)
            else:
                # Separator crossing the gap: flow along the v axis; also
                # record its width for the thickness adjustment below.
                vax = 1 - gap.axis
                flow.addi(smin[vax], smax[vax] + 1, True)
                flow_widths.append(self._separators.width(sep.name))
                flow_width_weights.append(smax[vax] - smin[vax])
    # Merge touching runs so coverage is not double counted.
    flow.merge_overlaps(strict=False)
    obst.merge_overlaps(strict=False)
    flow_score = sum(i.length() for i in flow) / gap.dv
    obst_score = sum(i.length() for i in obst) / gap.du
    if self._thickness_delta and flow_widths:
        # Thick flow separators shift score mass from obstacle to flow.
        w = np.average(flow_widths, weights=flow_width_weights)
        delta_t = self._thickness_delta(w)
        obst_score -= delta_t
        flow_score += delta_t
    score = gap.du * gap.dv  # i.e. largest whitespace area
    score = (score * (1 - obst_score)) * (1 + flow_score)
    return score
def _file_to_tree(filename):
    """Parse *filename* and index its AST definitions by line interval.

    Returns a pair (classes, functions) of interval trees mapping line
    ranges to ClassDef nodes and to (async) function definition nodes.
    """
    with tokenize.open(filename) as source:
        module = ast.parse(source.read(), filename=filename)
    classes = intervaltree.IntervalTree()
    functions = intervaltree.IntervalTree()
    for node in ast.walk(module):
        if isinstance(node, ast.ClassDef):
            begin, finish = Main._compute_interval(node)
            classes[begin:finish] = node
        elif isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
            begin, finish = Main._compute_interval(node)
            functions[begin:finish] = node
    return classes, functions
def calculate_framac_intervals(self, substages):
    """Collect written-address intervals per substage from frama-c trace rows.

    :param substages: iterable of substage numbers.
    :return: dict mapping substage number -> IntervalTree of write ranges.
    """
    intervals = {n: intervaltree.IntervalTree() for n in substages}
    for num in substages:
        for r in pytable_utils.get_rows(self.trace_intervals_table,
                                        'substagenum == %s' % num):
            # lookup writes performed by this function
            f = r['functionname']
            (lopc, hipc) = self.fun_info(f)
            # NOTE(review): 'tracename' is not a parameter or local —
            # presumably a module-level name; confirm it is defined.
            res = db_info.get(self.stage).write_interval_info(
                tracename, lopc, hipc)
            # NOTE: the comprehension's 'r' shadows the outer row variable,
            # which is harmless here but easy to misread.
            intervals[num] += intervaltree.IntervalTree(
                [intervaltree.Interval(r[0], r[1]) for r in res])
    return intervals
def __init__(self):
    """Wrap a fresh M6502 CPU instance and set up callback bookkeeping."""
    # Create underlying C object wrapped so that M6502_delete is called
    # automatically on destruction.
    self._mpu = ffi.gc(lib.M6502_new(ffi.NULL, ffi.NULL, ffi.NULL),
                       lib.M6502_delete)
    # Three interval trees mapping address intervals to callables for read,
    # write and call callbacks.
    self._read_cbs = intervaltree.IntervalTree()
    self._write_cbs = intervaltree.IntervalTree()
    self._call_cbs = intervaltree.IntervalTree()
    # Record a weak reference ourselves in the mapping dict for callbacks.
    _map_dict[self._mpu] = weakref.ref(self)
    self.reset()
def _resolve_special_addr_region(self, handle, allregions, values):
    """Resolve a symbolic address handle into this region's intervals.

    :param handle: 'remainder' (parent addresses minus all siblings),
        'children' (union of resolved children), or a '<region>.<field>'
        reference resolved relative to another region.
    :param allregions: mapping of region short names to region objects.
    :param values: unused here; kept for interface compatibility.
    :return: True if self.addresses was resolved; False (or None when the
        'remainder' preconditions fail) otherwise.
    """
    if handle == 'remainder':
        # Requires the parent resolved and every sibling either resolved
        # or being this region itself.
        # (Removed unused locals 'parent' and 'toremove' from the original.)
        if self.parent and self.parent.addresses_resolved:
            siblings = [
                allregions[n] for n in self.parent.children_names
                if n in allregions.iterkeys()
            ]
            if not all(
                    map(
                        lambda x: x.short_name == self.short_name or
                        x.addresses_resolved, siblings)):
                return False
            if not len(siblings) == len(self.parent.children_names):
                return False
            # Start from the parent's full range and chop out siblings.
            remainder = intervaltree.IntervalTree(self.parent.addresses)
            for s in siblings:
                if s.short_name == self.short_name:
                    continue
                for i in s.addresses:
                    remainder.chop(i.begin, i.end)
            self.addresses = remainder
            return True
    elif handle == 'children':
        res = intervaltree.IntervalTree()
        if len(self.children_names) == 0:
            return False
        for n in self.children_names:
            if n in allregions.iterkeys(
            ) and allregions[n].addresses_resolved:
                res = res | allregions[n].addresses
            else:
                # Any unresolved child blocks resolution.
                return False
        self.addresses = res
        return True
    else:
        reg_name = handle.rsplit(".", 1)[0]
        if reg_name in allregions.iterkeys(
        ) and allregions[reg_name].addresses_resolved:
            res = self._resolve_region_relative(handle, allregions)
            if isinstance(res, type(handle)):
                # A string result signals failure to resolve.
                return False
            else:
                self.addresses = intervaltree.IntervalTree([res])
                return True
        else:
            return False
def __init__(self, arch=ARM, output_directory=None):
    """Set up an Avatar orchestration object.

    :param arch: target architecture descriptor (default ARM); its
        init() hook is invoked with this Avatar instance.
    :param output_directory: where logs/artifacts go; a fresh temporary
        directory suffixed '_avatar' is created when None.
    """
    super(Avatar, self).__init__()
    self.watchmen = Watchmen(self)
    self.arch = arch
    self.arch.init(self)
    self.targets = {}
    self.status = {}
    # Memory map shared across targets, keyed by address intervals.
    self.memory_ranges = intervaltree.IntervalTree()
    self.output_directory = (tempfile.mkdtemp(
        suffix="_avatar") if output_directory is None else output_directory)
    if not path.exists(self.output_directory):
        makedirs(self.output_directory)
    self._close = Event()
    self.queue = queue.Queue()
    # start() presumably launches this object's worker loop — defined in
    # the superclass; confirm there.
    self.start()
    self.log = logging.getLogger('avatar')
    format = '%(asctime)s | %(name)s.%(levelname)s | %(message)s'
    logging.basicConfig(filename='%s/avatar.log' % self.output_directory,
                        level=logging.INFO, format=format)
    self.log.info("Initialized Avatar. Output directory is %s" %
                  self.output_directory)
    # Route Ctrl-C through our wrapper; default handler is shutdown.
    signal.signal(signal.SIGINT, self.sigint_wrapper)
    self.sigint_handler = self.shutdown
    self.loaded_plugins = []
def createIntervalTree(testIntervals):
    """Return an IntervalTree built from (start, end, weight) triples.

    Each triple becomes one interval whose data payload is its weight.
    """
    return intervaltree.IntervalTree(
        intervaltree.Interval(begin, finish, weight)
        for begin, finish, weight in testIntervals)
def write_interval_info(self, hwname, pclo=None, pchi=None,
                        substage_names=[], substage_entries={}):
    """Collect destination intervals of writes, grouped by substage.

    For frama-c tables, returns (destlo, desthi) pairs for writes whose
    pc lies in [pclo, pchi). Otherwise walks the hardware write table in
    index order, advancing the current substage whenever a write's pc
    enters the next substage's entry range.

    :return: list of pairs (framac) or {substage: IntervalTree}.
    """
    wt = self._get_writestable(hwname)
    if "framac" in hwname:
        # BUGFIX: get_rows was called without the table argument here;
        # every other call site passes the table first
        # (cf. calculate_framac_intervals).
        return [(r['destlo'], r['desthi']) for r in pytable_utils.get_rows(
            wt, '(%d <= writepc) & (writepc < %d)' % (pclo, pchi))]
    else:
        fns = substage_entries
        substages = substage_names
        num = 0
        intervals = {n: intervaltree.IntervalTree() for n in substages}
        for r in wt.read_sorted('index'):
            pc = r['pc']
            if num < len(fns) - 1:
                # check if we found the entrypoint to the next stage
                (lopc, hipc) = substage_entries[num + 1]
                if (lopc <= pc) and (pc < hipc):
                    num += 1
            # NOTE(review): 'num' is an index while 'substages' holds the
            # caller-supplied names; this membership test only works if the
            # names are themselves numbers — confirm with callers.
            if num in substages:
                start = r['dest']
                end = start + pytable_utils.get_rows(
                    wt, 'pc == %d' % r['pc'])[0]['writesize']
                intervals[num].add(intervaltree.Interval(start, end))
        return intervals
def make_gene_tree():
    """Index all genes by their [min, max) coordinate span.

    The payload of each interval is a dict holding the gene's
    uniquename under the key 'locus'.
    """
    tree = intervaltree.IntervalTree()
    for record in get_gene_data():
        tree[record['min']:record['max']] = {'locus': record['uniquename']}
    return tree
def extract_data_page_slack(page): ''' extract the slack bytes from the given data page. Args: page (cim.DataPage): the page from which to extract slack space. Yields: SlackRegion: the raw bytes of the slack space. ''' # start by marking the entire page as allocated slack = intervaltree.IntervalTree( [intervaltree.Interval(0, cim.DATA_PAGE_SIZE)]) # remove the toc region slack.chop(0, len(page.toc)) # if there is a toc, then we remove the empty entry at the end # (this is not included in the list of entries, but its part of the toc). if len(page.toc) > 0: slack.chop(len(page.toc), len(page.toc) + 0x10) # and regions for each of the entries for j in range(page.toc.count): entry = page.toc[j] slack.chop(entry.offset, entry.offset + entry.size) for region in sorted(slack): begin, end, _ = region if (end - begin) > cim.DATA_PAGE_SIZE: continue yield SlackRegion(page.logical_page_number, begin, page.buf[begin:end])
def __init__(self, short_name, d, stage, parent=None, values={}):
    """Build a memory-region description from its raw config dict *d*.

    Inheritable fields (type, default perms, include_children,
    reclassifiable) fall back to the parent's values, then the raw
    config is converted and the region's address intervals resolved.

    :param short_name: unique short identifier for this region.
    :param d: raw config mapping for this region.
    :param stage: stage this region belongs to.
    :param parent: enclosing region, or None for a root region.
    :param values: substitution values forwarded to address resolution.
    """
    # Pull inheritable defaults off the parent, if any.
    if parent is None:
        parent_type = None
        parent_default_perms = None
        parent_include_children = None
        parent_reclassifiable = None
    else:
        parent_type = parent.typ
        parent_default_perms = parent.default_perms
        parent_include_children = parent.include_children
        parent_reclassifiable = parent.reclassifiable
    self.stage = stage
    self.addresses = intervaltree.IntervalTree()  # filled by resolve_addresses()
    self.short_name = short_name
    self.name = get_value(d, 'name')
    self._raw_typ = get_value(d, 'type', parent_type).lower()
    self._raw_addresses = get_value(d, 'addresses')
    self._raw_default_perms = get_value(d, 'default_perms',
                                        parent_default_perms)
    self._raw_subregions = get_value(d, 'subregions')
    self._raw_include_children = get_value(d, 'include_children',
                                           parent_include_children)
    self._raw_reclassifiable = get_value(d, 'reclassifiable',
                                         parent_reclassifiable)
    self._csv = get_value(d, 'csv')
    if self._csv:
        self._csv = Main.populate_from_config(self._csv)
    if parent and parent._csv:
        # if parent had csv, don't propagate csv definition
        self._csv = None
    self.contents = get_value(d, 'contents')
    # Children are namespaced under this region's short name.
    self.children_names = [self.short_name + '.' + s
                           for s in self._raw_subregions.iterkeys()]
    self.parent = parent
    self.addresses_resolved = False
    self._convert_from_raw(values)
    self.resolve_addresses(values=values)
    # Classification 0 maps back to the region's own type.
    self.reclassification_rules = {0: self.typ}
def parse_cytoband(lines):
    """Parse iterable with cytoband coordinates

    Args:
        lines(iterable): Strings on format "chr1\t2300000\t5400000\tp36.32\tgpos25"

    Returns:
        cytobands(dict): Dictionary with chromosome names as keys and interval trees as values
    """
    cytobands = {}
    for raw in lines:
        # Skip comment/header lines.
        if raw.startswith("#"):
            continue
        fields = raw.rstrip().split("\t")
        chrom = fields[0].lstrip("chr")
        begin, finish = int(fields[1]), int(fields[2])
        band = fields[3]
        # Fetch (or lazily create) this chromosome's tree, then record
        # the band name over its coordinate span.
        tree = cytobands.setdefault(chrom, intervaltree.IntervalTree())
        tree[begin:finish] = band
    return cytobands
def get_gdb_sym(addr):
    """Resolve *addr* to a symbol via gdb.

    :param addr: address (anything int() accepts).
    :return: IntervalTree mapping the symbol's [start, end + 1) address
        range to its name; empty on a cache hit or no match.
    """
    ret = intervaltree.IntervalTree()
    addr = int(addr)
    global sym_cache
    xs = sym_cache[addr]
    if (len(xs) > 0):
        # Cache hit: report it. NOTE(review): nothing is added to the
        # returned tree on this path — confirm that is intended.
        print("xs = '%s'" % xs)
        for x in xs:
            print("x = '%s'" % x)
    else:
        xaddr = addr
        # Ask gdb to render the address, e.g. "0x1234 <name+offset>".
        nm = gdb.parse_and_eval(f"(void*)({xaddr})")
        m = symre.match(str(nm))
        if (m):
            symsize = 1
            ssz = m.group(2)
            if (ssz is not None):
                # group(2) is "+NNN"; strip the '+' to get the offset.
                symsize = int(ssz[1:])
            eaddr = xaddr
            xaddr -= symsize + 1
            saddr = eaddr - symsize
            ret[saddr:eaddr + 1] = m.group(1)
    # BUGFIX: the tree was built but never returned (callers got None).
    return ret
def filter_introns(introns, genes, options):
    """Drop intron candidates that do not overlap exactly one gene.

    Each intron (per strand), extended by the configured terminal-exon
    append length, is kept only when it overlaps a single gene on the
    same strand/chromosome. Returns the filtered introns array.
    """
    ### build interval trees of all gene starts and ends,
    ### keyed by (strand, chromosome)
    # NOTE: in the original the two arrays below were named the wrong way
    # around (chrms held strands and vice versa); all lookups were
    # consistent with the swap, so only the names are corrected here.
    strand_arr = sp.array([_.strand for _ in genes])
    chrom_arr = sp.array([_.chr for _ in genes])
    gene_trees = dict()
    for strand_val in sp.unique(strand_arr):
        for chrom_val in sp.unique(chrom_arr):
            gene_trees[(strand_val, chrom_val)] = it.IntervalTree()
            members = sp.where((strand_arr == strand_val) &
                               (chrom_arr == chrom_val))[0]
            for i in members:
                gene_trees[(strand_val, chrom_val)][genes[i].start:genes[i].stop] = i
    ### match all introns agains trees and remove elements overlapping
    ### more than one gene on the same chr/strand
    cnt_tot = 0
    cnt_rem = 0
    offset = options.intron_edges['append_new_terminal_exons_len']
    for si, s in enumerate(['+', '-']):
        for i in range(introns.shape[0]):
            if introns[i, si].shape[0] == 0:
                continue
            keep = []
            cnt_tot += introns[i, si].shape[0]
            for j in range(introns[i, si].shape[0]):
                gene_hits = gene_trees[(s, genes[i].chr)].overlap(
                    introns[i, si][j, 0] - offset,
                    introns[i, si][j, 1] + offset)
                if len(gene_hits) == 1:
                    keep.append(j)
            if len(keep) < introns[i, si].shape[0]:
                cnt_rem += (introns[i, si].shape[0] - len(keep))
                introns[i, si] = introns[i, si][keep, :]
    print('removed %i of %i (%.2f percent) introns overlapping to no or multiple genes' % (cnt_rem, cnt_tot, cnt_rem / float(max(cnt_tot, 1)) * 100))
    return introns
def __init__(self, path, lineno, lvalue, values, pc=None, origpc=None,
             substage_name=None, callstack="", stage=None):
    """Record a frama-c write result and map it to a substage.

    When no substage name is given but a callstack is, the substage is
    inferred by walking the callstack innermost-first and matching each
    frame against the substage names from the policy file.

    :param values: iterable of intervals written; stored in an IntervalTree.
    :param callstack: "a->b->c" style call chain from the analyzer.
    """
    self.path = path
    self.pc = pc
    self.origpc = origpc
    self.lineno = lineno
    self.values = intervaltree.IntervalTree()
    for v in values:
        self.values.add(v)
    self.lvalue = lvalue
    self.stage = stage
    if substage_name is None and callstack:
        policy = getattr(Main.raw.policies.substages_file,
                         self.stage.stagename)
        # NOTE(review): this get_config() result is discarded — confirm
        # whether the call is needed for side effects or is dead code.
        get_config('policy_file', self.stage)
        self.substages = substage.SubstagesInfo.substage_names_from_file(
            policy)
        self.substages[0] = "frama_go"
        called_fns = callstack.split("->")
        called_fns = filter(len, called_fns)  # drop empty frames (Py2 list)
        called_fns.reverse()
        # Innermost call first; the first known substage name wins.
        for f in called_fns:
            if f in self.substages:
                substage_name = self.substages.index(f)
                break
    self.substage = substage_name
def tupletree(table, start='start', stop='stop', value=None):
    """
    Construct an interval tree for the given table, where each node in
    the tree is a row of the table.
    """
    import intervaltree
    tree = intervaltree.IntervalTree()
    rows = iter(table)
    hdr = next(rows)
    fields = list(map(text_type, hdr))
    assert start in fields, 'start field not recognised'
    assert stop in fields, 'stop field not recognised'
    getstart = itemgetter(fields.index(start))
    getstop = itemgetter(fields.index(stop))
    # Payload is either the whole row (as a tuple) or selected fields.
    if value is None:
        getvalue = tuple
    else:
        valueindices = asindices(hdr, value)
        assert len(valueindices) > 0, 'invalid value field specification'
        getvalue = itemgetter(*valueindices)
    # Each remaining row becomes one interval keyed by its start/stop.
    for row in rows:
        tree.addi(getstart(row), getstop(row), getvalue(row))
    return tree
def facettupletrees(table, key, start='start', stop='stop', value=None):
    """
    Construct faceted interval trees for the given table, where each node
    in the tree is a row of the table.
    """
    import intervaltree
    rows = iter(table)
    hdr = next(rows)
    fields = list(map(text_type, hdr))
    assert start in fields, 'start field not recognised'
    assert stop in fields, 'stop field not recognised'
    getstart = itemgetter(fields.index(start))
    getstop = itemgetter(fields.index(stop))
    # Payload is either the whole row (as a tuple) or selected fields.
    if value is None:
        getvalue = tuple
    else:
        valueindices = asindices(hdr, value)
        assert len(valueindices) > 0, 'invalid value field specification'
        getvalue = itemgetter(*valueindices)
    keyindices = asindices(hdr, key)
    assert len(keyindices) > 0, 'invalid key'
    getkey = itemgetter(*keyindices)
    # One tree per distinct facet key; each row is added to its key's tree.
    trees = dict()
    for row in rows:
        facet = getkey(row)
        tree = trees.setdefault(facet, intervaltree.IntervalTree())
        tree.addi(getstart(row), getstop(row), getvalue(row))
    return trees
def _assign_blocks_to_contigs(contig_intervals_file_distance,
                              block_interval_tree):
    """
    For each contig, create an interval tree that stores the sequence
    interval stored in each block (for all blocks that contain part of the
    contig), as well as the offset of the start of that block.

    :param contig_intervals_file_distance: A dictionary of intervals, keyed
        by contig name, storing the locations in the file spanned by each
        contig.
    :param block_interval_tree: An interval tree storing the start and end
        locations in the uncompressed file spanned by each compressed block,
        as well as the offset of the block start.
    :return: Return a dictionary of such interval trees keyed by contig name.
    """
    start_time = datetime.datetime.now()
    verbose_print('\tAssigning compressed blocks to sequence contigs ...')
    sequence_blocks = {}
    for contig in sorted(contig_intervals_file_distance):
        if contig not in sequence_blocks:
            sequence_blocks[contig] = intervaltree.IntervalTree()
        # Every compressed block overlapping this contig's file span
        # is re-based so its bounds are relative to the contig start.
        for block_interval in block_interval_tree.search(
                *contig_intervals_file_distance[contig]):
            block_start_text_distance = block_interval.begin - contig_intervals_file_distance[
                contig][0]
            block_end_text_distance = block_interval.end - contig_intervals_file_distance[
                contig][0]
            sequence_blocks[contig].addi(block_start_text_distance,
                                         block_end_text_distance,
                                         block_interval.data)
    verbose_print('\t\tDone in {}.'.format(datetime.datetime.now() -
                                           start_time))
    return sequence_blocks
def Calc(args):
    """Stream args.input_fn and emit only rows covered by args.bed_fn.

    Builds one interval tree per contig from the gzip'd BED file, then
    writes to stdout each input row whose position falls inside a BED
    interval on the same contig.

    NOTE(review): pipe output is consumed without decoding and written
    back verbatim — this code assumes Python 2 string semantics.
    """
    tree = {}
    f = subprocess.Popen(shlex.split("gzip -fdc %s" % (args.bed_fn)),
                         stdout=subprocess.PIPE, bufsize=8388608)
    for row in f.stdout:
        row = row.split()
        name = row[0]
        if name not in tree:
            tree[name] = intervaltree.IntervalTree()
        begin = int(row[1])
        # End is taken as int(row[2]) - 1; zero-length results are widened
        # to 1 — presumably to keep intervals non-null. TODO confirm.
        end = int(row[2]) - 1
        if end == begin:
            end += 1
        tree[name].addi(begin, end)
    f.stdout.close()
    f.wait()
    f = subprocess.Popen(shlex.split("gzip -fdc %s" % (args.input_fn)),
                         stdout=subprocess.PIPE, bufsize=8388608)
    for row in f.stdout:
        ctgName, pos = [(row.split()[i]) for i in [0, 1]]
        pos = int(pos)
        # Skip contigs absent from the BED, then positions not covered
        # by any BED interval.
        if ctgName not in tree:
            continue
        if len(tree[ctgName].search(pos)) == 0:
            continue
        sys.stdout.write(row)
        sys.stdout.flush()
    f.stdout.close()
    f.wait()
def variants(self):
    """Yield diploid variants.

    :yields `Variant` objs
    """
    for chrom, ln in loose_version_sort(self.chroms):
        sys.stderr.write("[" + str(datetime.now().strftime('%m-%d-%Y %H:%M:%S')) + "]" + " INFO: MERGING VARIANTS IN CONTIG: " + str(chrom) + "\n")
        sys.stderr.flush()
        merged = []
        # NOTE(review): trees[0]/trees[1] below assume exactly two input
        # VCFs (haplotypes 0 and 1); further inputs would be ignored.
        trees = [vcf._tree[chrom] for vcf in self.vcfs]
        # assign haplotype so that otherwise identical variants in both
        # trees are not treated as identical (we need to be able to
        # distinguish between 0/1 and 1/1)
        for h, tree in enumerate(trees):
            for i in tree.all_intervals:
                i.data.info['mhap'] = h
        comb = intervaltree.IntervalTree(
            trees[0].all_intervals.union(trees[1].all_intervals))
        # if strict, merge only overlapping intervals (not adjacent ones)
        comb.merge_overlaps(
            strict=self.only_overlapping,
            data_initializer=list(),
            data_reducer=lambda x, y: x + [y])
        ref_seq = self.fasta.fetch(chrom).upper()
        for interval in comb.all_intervals:
            merged.append(_merge_variants(
                interval, trees, ref_seq,
                detailed_info=self.detailed_info,
                discard_phase=self.discard_phase))
        # Emit this contig's variants in positional order.
        yield from sorted(merged, key=lambda x: x.pos)
def nr_interval_merge(df_chr, overlap=0.5):
    """
    Reduce a dataframe to non-redundant intervals based on reciprocal overlap.

    All records in the dataframe must be on the same chromosome.

    :param df_chr: DataFrame of one chromosome (requires POS/END columns).
    :param overlap: Reciprocal overlap threshold (0, 1].

    :return: Dataframe subset using the first record in a unique interval.
    """
    index_list = list()  # Dataframe indices to return
    interval_tree = intervaltree.IntervalTree()  # Tree of intervals
    # Iterate rows
    for index, row in df_chr.iterrows():
        ri_match = False
        # Find matches
        for interval in interval_tree[row['POS']:row['END']]:
            # BUGFIX: the threshold was hard-coded to 0.50, silently
            # ignoring the 'overlap' parameter.
            if reciprocal_overlap(row['POS'], row['END'],
                                  interval.begin, interval.end) >= overlap:
                ri_match = True
                break
        # Append to non-redundant records if no match
        if not ri_match:
            index_list.append(index)
        # All records are added to the tree
        interval_tree[row['POS']:row['END']] = True
    return df_chr.loc[index_list]
def main(name, bed, src, target):
    """Annotate streamed BED-like records with overlaps from *bed*.

    Records overlapping an interval in *bed* get 'name=value' appended
    to their annotation column (after ';' or an existing trailing ';');
    every record is echoed to stdout either way.
    """
    logging.info('parsing {}...'.format(bed))
    lookup = {}
    for idx, raw in enumerate(open(bed, 'r')):
        chrom, begin, finish, note = raw.strip('\n').split('\t')[:4]
        if chrom not in lookup:
            lookup[chrom] = intervaltree.IntervalTree()
        lookup[chrom][int(begin):int(finish)] = note
    logging.info('parsing {}: done.'.format(bed))
    logging.info('reading from stdin...')
    yes = no = 0
    for idx, raw in enumerate(src):
        chrom, begin, finish, note = raw.strip('\n').split('\t')[:4]
        hits = lookup[chrom][int(begin):int(finish)] if chrom in lookup else None
        if not hits:
            # Unknown chromosome or no overlap: pass through unchanged.
            sys.stdout.write(raw)
            no += 1
            continue
        joined = ','.join(set(hit.data for hit in hits))
        if note.endswith(';'):
            sys.stdout.write('{}\t{}\t{}\t{}{}={}\n'.format(
                chrom, begin, finish, note, name, joined))
        else:
            sys.stdout.write('{}\t{}\t{}\t{};{}={}\n'.format(
                chrom, begin, finish, note, name, joined))
        yes += 1
    logging.info('%i overlaps. %i with no overlap.', yes, no)
def variants(self):
    """Yield diploid variants.

    :yields `medaka.vcf.Variant` objs
    """
    for chrom in medaka.common.loose_version_sort(self.chroms):
        self.logger.info('Merging variants in chrom {}'.format(chrom))
        merged = []
        # NOTE(review): trees[0]/trees[1] below assume exactly two input
        # VCFs (haplotypes 0 and 1); further inputs would be ignored.
        trees = [vcf._tree[chrom] for vcf in self.vcfs]
        # assign haplotype so that otherwise identical variants in both
        # trees are not treated as identical (we need to be able to
        # distinguish between 0/1 and 1/1)
        for h, tree in enumerate(trees):
            for i in tree.all_intervals:
                i.data.info['mhap'] = h
        comb = intervaltree.IntervalTree(
            trees[0].all_intervals.union(trees[1].all_intervals))
        # if strict, merge only overlapping intervals (not adjacent ones)
        comb.merge_overlaps(
            strict=self.only_overlapping,
            data_initializer=list(),
            data_reducer=lambda x, y: x + [y])
        ref_seq = self.fasta.fetch(chrom).upper()
        for interval in comb.all_intervals:
            merged.append(_merge_variants(
                interval, trees, ref_seq,
                detailed_info=self.detailed_info,
                discard_phase=self.discard_phase))
        # Emit this contig's variants in positional order.
        yield from sorted(merged, key=lambda x: x.pos)
def __init__(self, refFile, chrom="chr"):
    """Open *refFile* as a FASTA reference and reset allele state.

    :param refFile: path to an indexed FASTA file (opened via pysam).
    :param chrom: chromosome label (default "chr").
    """
    self.chrom = chrom
    self.ref_fasta = pysam.FastaFile(refFile)
    # AltAllele.Key -> AltAllele
    self.alts = {}
    # Reference intervals recorded so far.
    self.refs = intervaltree.IntervalTree()
    # Position bounds; unset until first use.
    self.first_pos = None
    self.last_pos = None