def _arrange_genes(gene_data_list):
        """
        Given an iterable of gene data dictionaries,
        returns a list of lists of gene names that
        should be displayed at various levels.
        """
        gene_data_list = sorted(gene_data_list,
                                key=lambda x: x['end'] - x['start'],
                                reverse=True)

        display_levels = [
            intervaltree.IntervalTree(),
        ]

        for gene_data in gene_data_list:
            found_home = False
            level_idx = 0
            while not found_home:
                if level_idx >= len(display_levels):
                    display_levels.append(intervaltree.IntervalTree())
                if display_levels[level_idx].overlaps(gene_data['start'],
                                                      gene_data['end']):
                    level_idx += 1
                else:
                    display_levels[level_idx].addi(gene_data['start'],
                                                   gene_data['end'],
                                                   data=gene_data)
                    found_home = True

        return [[gene_interval.data['ID'] for gene_interval in this_level]
                for this_level in display_levels]
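
A minimal usage sketch for the function above, using hypothetical gene records; it assumes intervaltree is imported, since the snippet does not show its imports.

import intervaltree

genes = [
    {'ID': 'geneA', 'start': 0, 'end': 100},
    {'ID': 'geneB', 'start': 50, 'end': 140},   # overlaps geneA, pushed to the next level
    {'ID': 'geneC', 'start': 200, 'end': 250},  # no overlap, stays on the first level
]
print(_arrange_genes(genes))
# -> something like [['geneA', 'geneC'], ['geneB']]; order within a level is not guaranteed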
Example #2
 def consolidate_write_table(self, framac=False):
     populated = False
     for t in self.writerangetable_consolidated.tables.itervalues():
         if t.nrows > 0:
             populated = True
             break
     if populated:
         self.writerangetable_consolidated.purge()
     last = None
     sortindex = 'line' if framac else 'writepc'
     intervals = intervaltree.IntervalTree()
     r = None
     substagenums = substage.SubstagesInfo.substage_numbers(self.stage)
     writepc = None
     line = None
     lvalue = None
     dst_not_in_ram = True
     for n in substagenums:
         if n not in self.writerangetable_consolidated.tables.keys():
             self.writerangetable_consolidated._init_table(n)
         if n > 0:  # add last interval
             self._add_intervals_to_table(
                 self.writerangetable_consolidated.tables[n], intervals,
                 writepc, line, lvalue, dst_not_in_ram, n)
         last = None
         lvalue = None
         dst_not_in_ram = True
         writepc = None
         line = None
         count = 0
         intervals = intervaltree.IntervalTree()  # clear intervals
         print "writerange[%s] %s" % (n,
                                      self.writerangetable.tables[n].nrows)
         for r in self.writerangetable.tables[n].read_sorted(sortindex):
             count += 1
             if last != r[sortindex]:
                 if last is not None:
                     self._add_intervals_to_table(
                         self.writerangetable_consolidated.tables[n],
                         intervals, writepc, line, lvalue, dst_not_in_ram,
                         n)
                 intervals = intervaltree.IntervalTree()  # clear intervals
                 writepc = r['writepc']
                 line = r['line']
                 lvalue = r['lvalue']
                 dst_not_in_ram = r['dst_not_in_ram']
                 last = r[sortindex]
             if last is None:
                 last = r[sortindex]
             dst_not_in_ram = dst_not_in_ram and r['dst_not_in_ram']
             intervals.addi(r['dstlo'], r['dsthi'])
         if intervals:
             self._add_intervals_to_table(
                 self.writerangetable_consolidated.tables[n], intervals,
                 writepc, line, lvalue, dst_not_in_ram, n)
     self.writerangetable_consolidated.flush_table()
     for n in substagenums:
         print "write range consolidated "\
             "stage %s nrows %s" % (n,
                                    self.writerangetable_consolidated.tables[n].nrows)
Example #3
def get_srs_tree(srs_tuple_list):

    # Use default value for all lengths by default
    if srs_tuple_list is None or len(srs_tuple_list) == 0:
        srs_tree = intervaltree.IntervalTree()  # State-run-smooth
        srs_tree[0:np.inf] = DEFAULT_STATE_RUN_SMOOTH
        return srs_tree

    # Check and sort
    for srs_element in srs_tuple_list:
        if len(srs_element) != 2:
            raise RuntimeError('Element in "state run smooth" tuple list that is not length 2: {}'.format(srs_element))

    srs_tuple_list = sorted(srs_tuple_list)

    # Create tree
    srs_tree = intervaltree.IntervalTree()  # State-run-smooth

    # Get first limit
    last_inv_lim, last_smooth_factor = srs_tuple_list[0]
    last_inv_lim = int(last_inv_lim)
    last_smooth_factor = int(last_smooth_factor)

    if last_inv_lim < 0:
        raise RuntimeError('State run inversion size limits must be 0 or greater: {}'.format(last_inv_lim))

    if last_smooth_factor < 4:
        raise RuntimeError('Not tested with "state run smooth" factor less than 4: {}'.format(last_smooth_factor))

    # Add first limit from 0
    if last_inv_lim > 0:
        srs_tree[0:last_inv_lim] = np.min([last_inv_lim, 20])  # 20 or last_inv_lim, whichever is smaller

    # Process remaining intervals
    for inv_lim, smooth_factor in srs_tuple_list[1:]:
        inv_lim = int(inv_lim)
        smooth_factor = int(smooth_factor)

        # Check
        if smooth_factor < 20:
            raise RuntimeError('Not tested with "state run smooth" factor less than 20: {}'.format(smooth_factor))

        if inv_lim == last_inv_lim:
            raise RuntimeError('Duplicate limit in state run limits: {}'.format(inv_lim))

        # Add to tree
        srs_tree[last_inv_lim:inv_lim] = last_smooth_factor

        # Advance last_inv_lim and last_smooth_factor
        last_inv_lim = inv_lim
        last_smooth_factor = smooth_factor

    # Add sample to infinity
    srs_tree[last_inv_lim:np.inf] = last_smooth_factor

    # Return tree
    return srs_tree
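
A small usage sketch for get_srs_tree, assuming numpy and intervaltree are imported as the function requires; DEFAULT_STATE_RUN_SMOOTH is only consulted when the tuple list is empty, so it is not needed here.

import numpy as np
import intervaltree

srs_tree = get_srs_tree([(10, 4), (100, 25)])
# resulting intervals: [0, 10) -> 10, [10, 100) -> 4, [100, inf) -> 25
print(next(iter(srs_tree[50])).data)   # -> 4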
Example #4
 def __init__(self, file=None):
     self.midi = None
     self.notes = list()
     self.metas = intervaltree.IntervalTree()  # indexed by second intervals
     self.timeline = intervaltree.IntervalTree()  # indexed by tick intervals
     self.pending_notes = dict()
     if file:
         self.parse(file)
Example #5
def removeUnmappable(tads1,tads2):
    # unmappable regions are marked with negative ids
    u1 = itr.IntervalTree([t for t in tads1 if t.data < 0])
    u2 = itr.IntervalTree([t for t in tads2 if t.data < 0])
    u = u1 | u2
    u.split_overlaps()
    u.merge_equals()
    for gap in u:
        tads1.chop(gap.begin,gap.end)
        tads2.chop(gap.begin,gap.end)
    return tads1, tads2
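
A toy sketch of removeUnmappable with hypothetical TAD intervals, where negative data values mark unmappable regions; it assumes intervaltree is imported as itr, as in the snippet.

import intervaltree as itr

tads1 = itr.IntervalTree([itr.Interval(0, 100, 1), itr.Interval(100, 200, -1)])
tads2 = itr.IntervalTree([itr.Interval(0, 150, 2), itr.Interval(150, 200, -2)])
tads1, tads2 = removeUnmappable(tads1, tads2)
# the union of unmappable regions, [100, 200), is chopped out of both trees,
# so each tree is left with a single interval covering [0, 100)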
Example #6
def generate_interval_tree(TF_file, Json_output_file):
    master_TF_dict_return = {}  # keyed by TF name, then by chromosome
    with open(TF_file, "r") as file:
        with open(Json_output_file, 'w') as outfile:
            for line in file:
                splitted = line.strip("\n").split("\t", 4)
                chrom, start, end, TF = splitted[0], splitted[1], splitted[
                    2], splitted[3]
                cell_line = splitted[4].split()[1]
                line_info = [chrom, str(start), str(end), str(TF)]
                # Build an interval tree per TF and chromosome so overlapping
                # regions can be found efficiently; intervals are inserted
                # with tree.addi(start, end, data).
                if TF in master_TF_dict_return.keys():
                    if chrom in master_TF_dict_return[TF].keys():
                        master_TF_dict_return[TF][chrom].addi(
                            int(start), int(end), "\t".join(line_info))
                    else:
                        master_TF_dict_return[TF][
                            chrom] = intervaltree.IntervalTree()
                        master_TF_dict_return[TF][chrom].addi(
                            int(start), int(end), "\t".join(line_info))
                        #master_TF_dict_return[TF].update({chrom = intervaltree.IntervalTree()}
                else:
                    master_TF_dict_return[TF] = {
                        chrom: intervaltree.IntervalTree()
                    }
                    master_TF_dict_return[TF][chrom].addi(
                        int(start), int(end), "\t".join(line_info))

                    master_TF_dict_return[TF].update(
                        {"significant_unique_dmr_hits": 0})
                    master_TF_dict_return[TF].update(
                        {"background_unique_dmr_hits": 0})
                    master_TF_dict_return[TF].update(
                        {"TotalTF_coveredBy_sig_unique_dmr_hits": 0})
                    master_TF_dict_return[TF].update(
                        {"TotalTF_coveredBy_bg_unique_dmr_hits": 0})

                    master_TF_dict_return[TF].update(
                        {"custom_overlap_list_sig": []})
                    master_TF_dict_return[TF].update(
                        {"custom_overlap_list_bg": []})
                    master_TF_dict_return[TF].update(
                        {"As_overlap_list_sig": []})
                    master_TF_dict_return[TF].update(
                        {"As_overlap_list_bg": []})
                    master_TF_dict_return[TF].update(
                        {"Bs_overlap_list_sig": []})
                    master_TF_dict_return[TF].update(
                        {"Bs_overlap_list_bg": []})

            #json.dump(master_TF_dict_return, outfile)
            return (master_TF_dict_return)
Example #7
    def __call__(self, gap):
        if gap.du < 0.5 or gap.dv < 0.5:
            return 0

        k = 5
        box = shapely.geometry.box(*outset_bounds(gap.bounds, k))

        flow = intervaltree.IntervalTree()
        obst = intervaltree.IntervalTree()

        flow_widths = []
        flow_width_weights = []

        for sep in self._separators.query(box):
            intersection = sep.intersection(box)
            if intersection is None or intersection.is_empty:
                continue

            label = self._label(sep.name)
            sep_dir = self._direction[label]

            for segment in extract_segments(intersection):
                minx, miny, maxx, maxy = segment.bounds
                smin = (minx, miny)
                smax = (maxx, maxy)

                if sep_dir == gap.axis:
                    uax = gap.axis
                    obst.addi(smin[uax], smax[uax] + 1, True)
                else:
                    vax = 1 - gap.axis
                    flow.addi(smin[vax], smax[vax] + 1, True)

                    flow_widths.append(self._separators.width(sep.name))
                    flow_width_weights.append(smax[vax] - smin[vax])

        flow.merge_overlaps(strict=False)
        obst.merge_overlaps(strict=False)

        flow_score = sum(i.length() for i in flow) / gap.dv
        obst_score = sum(i.length() for i in obst) / gap.du

        if self._thickness_delta and flow_widths:
            w = np.average(flow_widths, weights=flow_width_weights)
            delta_t = self._thickness_delta(w)
            obst_score -= delta_t
            flow_score += delta_t

        score = gap.du * gap.dv  # i.e. largest whitespace area
        score = (score * (1 - obst_score)) * (1 + flow_score)

        return score
Example #8
 def _file_to_tree(filename):
     with tokenize.open(filename) as file:
         parsed = ast.parse(file.read(), filename=filename)
     classes = intervaltree.IntervalTree()
     tree = intervaltree.IntervalTree()
     for node in ast.walk(parsed):
         if isinstance(node, (ast.ClassDef)):
             start, end = Main._compute_interval(node)
             classes[start:end] = node
         if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
             start, end = Main._compute_interval(node)
             tree[start:end] = node
     return classes, tree
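
Since Main._compute_interval is not shown here, this stand-alone sketch illustrates the same technique: index function definitions by line span in an IntervalTree and look up which one encloses a given line (node.end_lineno requires Python 3.8+).

import ast
import intervaltree

source = "def f():\n    x = 1\n    return x\n\ndef g():\n    return 2\n"
tree = intervaltree.IntervalTree()
for node in ast.walk(ast.parse(source)):
    if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
        tree[node.lineno:node.end_lineno + 1] = node.name
print(sorted(iv.data for iv in tree[2]))   # -> ['f']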
Example #9
    def calculate_framac_intervals(self, substages):

        intervals = {n: intervaltree.IntervalTree() for n in substages}
        for num in substages:
            for r in pytable_utils.get_rows(self.trace_intervals_table,
                                            'substagenum == %s' % num):
                # lookup writes performed by this function
                f = r['functionname']
                (lopc, hipc) = self.fun_info(f)
                res = db_info.get(self.stage).write_interval_info(
                    tracename, lopc, hipc)
                intervals[num] += intervaltree.IntervalTree(
                    [intervaltree.Interval(r[0], r[1]) for r in res])

        return intervals
Example #10
    def __init__(self):
        # Create underlying C object wrapped so that M6502_delete is called
        # automatically on destruction.
        self._mpu = ffi.gc(lib.M6502_new(ffi.NULL, ffi.NULL, ffi.NULL),
                           lib.M6502_delete)

        # Three interval trees mapping address intervals to callables for read,
        # write and call callbacks.
        self._read_cbs = intervaltree.IntervalTree()
        self._write_cbs = intervaltree.IntervalTree()
        self._call_cbs = intervaltree.IntervalTree()

        # Record a weak reference ourselves in the mapping dict for callbacks.
        _map_dict[self._mpu] = weakref.ref(self)
        self.reset()
Example #11
 def _resolve_special_addr_region(self, handle, allregions, values):
     if handle == 'remainder':
         parent = self.parent
         if self.parent and self.parent.addresses_resolved:
             siblings = [
                 allregions[n] for n in self.parent.children_names
                 if n in allregions.iterkeys()
             ]
             if not all(x.short_name == self.short_name or x.addresses_resolved
                        for x in siblings):
                 return False
             if not len(siblings) == len(self.parent.children_names):
                 return False
             remainder = intervaltree.IntervalTree(self.parent.addresses)
             for s in siblings:
                 if s.short_name == self.short_name:
                     continue
                 for i in s.addresses:
                     remainder.chop(i.begin, i.end)
             toremove = []
             self.addresses = remainder
             return True
     elif handle == 'children':
         res = intervaltree.IntervalTree()
         if len(self.children_names) == 0:
             return False
         for n in self.children_names:
             if n in allregions and allregions[n].addresses_resolved:
                 res = res | allregions[n].addresses
             else:
                 return False
         self.addresses = res
         return True
     else:
         reg_name = handle.rsplit(".", 1)[0]
         if reg_name in allregions and allregions[reg_name].addresses_resolved:
             res = self._resolve_region_relative(handle, allregions)
             if isinstance(res, type(handle)):
                 return False
             else:
                 self.addresses = intervaltree.IntervalTree([res])
                 return True
         else:
             return False
Example #12
    def __init__(self, arch=ARM, output_directory=None):
        super(Avatar, self).__init__()

        self.watchmen = Watchmen(self)
        self.arch = arch
        self.arch.init(self)
        self.targets = {}
        self.status = {}
        self.memory_ranges = intervaltree.IntervalTree()

        self.output_directory = (tempfile.mkdtemp(suffix="_avatar")
                                 if output_directory is None
                                 else output_directory)
        if not path.exists(self.output_directory):
            makedirs(self.output_directory)

        self._close = Event()
        self.queue = queue.Queue()
        self.start()

        self.log = logging.getLogger('avatar')
        format = '%(asctime)s | %(name)s.%(levelname)s | %(message)s'
        logging.basicConfig(filename='%s/avatar.log' % self.output_directory,
                            level=logging.INFO,
                            format=format)
        self.log.info("Initialized Avatar. Output directory is %s" %
                      self.output_directory)

        signal.signal(signal.SIGINT, self.sigint_wrapper)
        self.sigint_handler = self.shutdown
        self.loaded_plugins = []
Example #13
def createIntervalTree(testIntervals):
    tree = intervaltree.IntervalTree()

    for start, end, weight in testIntervals:
        tree.addi(start, end, weight)

    return tree
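
A quick usage sketch with made-up weighted intervals, assuming intervaltree is imported:

tree = createIntervalTree([(0, 10, 1.5), (5, 20, 2.0)])
print(len(tree[7]))   # -> 2, both weighted intervals cover position 7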
Example #14
    def write_interval_info(self,
                            hwname,
                            pclo=None,
                            pchi=None,
                            substage_names=[],
                            substage_entries={}):
        wt = self._get_writestable(hwname)
        if "framac" in hwname:
            return [(r['destlo'], r['desthi']) for r in pytable_utils.get_rows(
                wt, '(%d <= writepc) & (writepc < %d)' % (pclo, pchi))]
        else:
            fns = substage_entries
            substages = substage_names
            num = 0
            intervals = {n: intervaltree.IntervalTree() for n in substages}

            for r in wt.read_sorted('index'):
                pc = r['pc']
                if num < len(fns) - 1:
                    # check if we found the entrypoint to the next stage
                    (lopc, hipc) = substage_entries[num + 1]
                    if (lopc <= pc) and (pc < hipc):
                        num += 1
                if num in substages:
                    start = r['dest']
                    end = start + pytable_utils.get_rows(
                        wt, 'pc == %d' % r['pc'])[0]['writesize']
                    intervals[num].add(intervaltree.Interval(start, end))
            return intervals
Example #15
def make_gene_tree():
    gene_tree = intervaltree.IntervalTree()
    gene_data = get_gene_data()
    for gene in gene_data:
        # print(gene['min'], gene['max'])
        gene_tree[gene['min']:gene['max']] = dict(locus=gene['uniquename'])
    return gene_tree
Example #16
def extract_data_page_slack(page):
    '''
    extract the slack bytes from the given data page.
    
    Args:
        page (cim.DataPage): the page from which to extract slack space.

    Yields:
        SlackRegion: the raw bytes of the slack space.
    '''

    # start by marking the entire page as allocated
    slack = intervaltree.IntervalTree(
        [intervaltree.Interval(0, cim.DATA_PAGE_SIZE)])

    # remove the toc region
    slack.chop(0, len(page.toc))

    # if there is a toc, then we remove the empty entry at the end
    # (this is not included in the list of entries, but it's part of the toc).
    if len(page.toc) > 0:
        slack.chop(len(page.toc), len(page.toc) + 0x10)

    # and regions for each of the entries
    for j in range(page.toc.count):
        entry = page.toc[j]
        slack.chop(entry.offset, entry.offset + entry.size)

    for region in sorted(slack):
        begin, end, _ = region
        if (end - begin) > cim.DATA_PAGE_SIZE:
            continue

        yield SlackRegion(page.logical_page_number, begin, page.buf[begin:end])
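
The chop-based bookkeeping used above, shown in isolation with a hypothetical page size and usage table so it runs without the cim module:

import intervaltree

PAGE_SIZE = 0x2000  # hypothetical page size
slack = intervaltree.IntervalTree([intervaltree.Interval(0, PAGE_SIZE)])
# remove the regions known to be in use; whatever remains is slack
for used_start, used_size in [(0x0, 0x40), (0x40, 0x10), (0x100, 0x200)]:
    slack.chop(used_start, used_start + used_size)
print(sorted((iv.begin, iv.end) for iv in slack))   # -> [(80, 256), (768, 8192)]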
Example #17
 def __init__(self, short_name, d, stage, parent=None, values={}):
     if parent is None:
         parent_type = None
         parent_default_perms = None
         parent_include_children = None
         parent_reclassifiable = None
     else:
         parent_type = parent.typ
         parent_default_perms = parent.default_perms
         parent_include_children = parent.include_children
         parent_reclassifiable = parent.reclassifiable
     self.stage = stage
     self.addresses = intervaltree.IntervalTree()
     self.short_name = short_name
     self.name = get_value(d, 'name')
     self._raw_typ = get_value(d, 'type', parent_type).lower()
     self._raw_addresses = get_value(d, 'addresses')
     self._raw_default_perms = get_value(d, 'default_perms', parent_default_perms)
     self._raw_subregions = get_value(d, 'subregions')
     self._raw_include_children = get_value(d, 'include_children', parent_include_children)
     self._raw_reclassifiable = get_value(d, 'reclassifiable', parent_reclassifiable)
     self._csv = get_value(d, 'csv')
     if self._csv:
         self._csv = Main.populate_from_config(self._csv)            
     if parent and parent._csv:
         # if parent had csv, don't propagate csv definition
         self._csv = None
     self.contents = get_value(d, 'contents')
     self.children_names = [self.short_name + '.' + s for s in self._raw_subregions.iterkeys()]
     self.parent = parent
     self.addresses_resolved = False
     self._convert_from_raw(values)
     self.resolve_addresses(values=values)
     self.reclassification_rules = {0: self.typ}
Example #18
def parse_cytoband(lines):
    """Parse iterable with cytoband coordinates

    Args:
        lines(iterable): Strings on format "chr1\t2300000\t5400000\tp36.32\tgpos25"

    Returns:
        cytobands(dict): Dictionary with chromosome names as keys and
                         interval trees as values
    """
    cytobands = {}
    for line in lines:
        if line.startswith("#"):
            continue
        line = line.rstrip()
        splitted_line = line.split("\t")
        chrom = splitted_line[0].lstrip("chr")
        start = int(splitted_line[1])
        stop = int(splitted_line[2])
        name = splitted_line[3]
        if chrom in cytobands:
            # Add interval to existing tree
            cytobands[chrom][start:stop] = name
        else:
            # Create a new interval tree
            new_tree = intervaltree.IntervalTree()
            # create the interval
            new_tree[start:stop] = name
            # Add the interval tree
            cytobands[chrom] = new_tree

    return cytobands
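
A minimal usage sketch for the parser above, using two hypothetical cytoband lines; it assumes intervaltree is imported.

lines = [
    "#chrom\tstart\tend\tname\tstain",
    "chr1\t0\t2300000\tp36.33\tgneg",
    "chr1\t2300000\t5400000\tp36.32\tgpos25",
]
cytobands = parse_cytoband(lines)
print(next(iter(cytobands["1"][3000000])).data)   # -> 'p36.32'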
Example #19
def get_gdb_sym(addr):
    ret = intervaltree.IntervalTree()
    addr = int(addr)
    global sym_cache
    xs = sym_cache[addr]
    if (len(xs) > 0):
        print("xs = '%s'" % xs)
        for x in xs:
            print("x = '%s'" % x)
    else:
        xaddr = addr
        nm = gdb.parse_and_eval(f"(void*)({xaddr})")
        m = symre.match(str(nm))
        if (m):
            symsize = 1
            ssz = m.group(2)
            if (ssz is not None):
                symsize = int(ssz[1:])
            eaddr = xaddr
            xaddr -= symsize + 1
            saddr = eaddr - symsize
            #            print("saddr = 0x%x" % saddr )
            #            print("eaddr = 0x%x" % eaddr )
            #            ret.append( ( saddr, eaddr, m.group(1) ) )
            ret[saddr:eaddr + 1] = m.group(1)
Example #20
def filter_introns(introns, genes, options):
    
    ### build interval trees of all genes starts and ends
    chrms = sp.array([_.strand for _ in genes])    # NB: despite its name, holds strand values
    strands = sp.array([_.chr for _ in genes])     # NB: despite its name, holds chromosome names
    gene_trees = dict()
    for c in sp.unique(chrms):
        for s in sp.unique(strands):
            gene_trees[(c, s)] = it.IntervalTree()
            c_idx = sp.where((chrms == c) & (strands == s))[0]
            for i in c_idx:
                gene_trees[(c, s)][genes[i].start:genes[i].stop] = i

    ### match all introns against the gene trees and remove introns that
    ### overlap either no gene or more than one gene on the same chr/strand
    cnt_tot = 0
    cnt_rem = 0
    strand_list = ['+', '-']
    offset = options.intron_edges['append_new_terminal_exons_len']
    for si, s in enumerate(strand_list):
        for i in range(introns.shape[0]):
            if introns[i, si].shape[0] == 0:
                continue
            k_idx = []
            cnt_tot += introns[i, si].shape[0]
            for j in range(introns[i, si].shape[0]):
                if len(gene_trees[(s, genes[i].chr)].overlap(introns[i, si][j, 0] - offset, introns[i, si][j, 1] + offset)) == 1:
                    k_idx.append(j)
            if len(k_idx) < introns[i, si].shape[0]:
                cnt_rem += (introns[i, si].shape[0] - len(k_idx))
                introns[i, si] = introns[i, si][k_idx, :]
    print('removed %i of %i (%.2f percent) introns overlapping to no or multiple genes' % (cnt_rem, cnt_tot, cnt_rem / float(max(cnt_tot, 1)) * 100))

    return introns
Example #21
 def __init__(self,
              path,
              lineno,
              lvalue,
              values,
              pc=None,
              origpc=None,
              substage_name=None,
              callstack="",
              stage=None):
     self.path = path
     self.pc = pc
     self.origpc = origpc
     self.lineno = lineno
     self.values = intervaltree.IntervalTree()
     for v in values:
         self.values.add(v)
     self.lvalue = lvalue
     self.stage = stage
     if substage_name is None and callstack:
         policy = getattr(Main.raw.policies.substages_file,
                          self.stage.stagename)
         get_config('policy_file', self.stage)
         self.substages = substage.SubstagesInfo.substage_names_from_file(
             policy)
         self.substages[0] = "frama_go"
         called_fns = callstack.split("->")
         called_fns = filter(len, called_fns)
         called_fns.reverse()
         for f in called_fns:
             if f in self.substages:
                 substage_name = self.substages.index(f)
                 break
     self.substage = substage_name
Example #22
def tupletree(table, start='start', stop='stop', value=None):
    """
    Construct an interval tree for the given table, where each node in the tree
    is a row of the table.

    """

    import intervaltree
    tree = intervaltree.IntervalTree()
    it = iter(table)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    assert start in flds, 'start field not recognised'
    assert stop in flds, 'stop field not recognised'
    getstart = itemgetter(flds.index(start))
    getstop = itemgetter(flds.index(stop))
    if value is None:
        getvalue = tuple
    else:
        valueindices = asindices(hdr, value)
        assert len(valueindices) > 0, 'invalid value field specification'
        getvalue = itemgetter(*valueindices)
    for row in it:
        tree.addi(getstart(row), getstop(row), getvalue(row))
    return tree
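
A small usage sketch; besides intervaltree, the function expects itemgetter and text_type in scope (they look like petl helpers), so minimal stand-ins are supplied for the path exercised here (value=None).

from operator import itemgetter
text_type = str   # stand-in for the compat helper the function expects

table = [('name', 'start', 'stop'),
         ('a', 1, 5),
         ('b', 4, 9)]
tree = tupletree(table)
print(sorted(iv.data for iv in tree[4]))   # -> [('a', 1, 5), ('b', 4, 9)]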
Example #23
def facettupletrees(table, key, start='start', stop='stop', value=None):
    """
    Construct faceted interval trees for the given table, where each node in
    the tree is a row of the table.

    """

    import intervaltree
    it = iter(table)
    hdr = next(it)
    flds = list(map(text_type, hdr))
    assert start in flds, 'start field not recognised'
    assert stop in flds, 'stop field not recognised'
    getstart = itemgetter(flds.index(start))
    getstop = itemgetter(flds.index(stop))
    if value is None:
        getvalue = tuple
    else:
        valueindices = asindices(hdr, value)
        assert len(valueindices) > 0, 'invalid value field specification'
        getvalue = itemgetter(*valueindices)
    keyindices = asindices(hdr, key)
    assert len(keyindices) > 0, 'invalid key'
    getkey = itemgetter(*keyindices)

    trees = dict()
    for row in it:
        k = getkey(row)
        if k not in trees:
            trees[k] = intervaltree.IntervalTree()
        trees[k].addi(getstart(row), getstop(row), getvalue(row))
    return trees
Example #24
    def _assign_blocks_to_contigs(contig_intervals_file_distance,
                                  block_interval_tree):
        """
        For each contig, create an interval tree that stores the sequence interval stored in each block
        (for all blocks that contain part of the contig), as well as the offset of the start of that block.
        :param contig_intervals_file_distance: A dictionary of intervals, keyed by contig name,
            storing the locations in the file spanned by each contig.
        :param block_interval_tree:  An interval tree storing the start and end locations in the uncompressed
            file spanned by each compressed block, as well as the offset of the block start.
        :return: Return a dictionary of such interval trees keyed by contig name.
        """
        start_time = datetime.datetime.now()
        verbose_print('\tAssigning compressed blocks to sequence contigs ...')

        sequence_blocks = {}

        for contig in sorted(contig_intervals_file_distance):

            if contig not in sequence_blocks:
                sequence_blocks[contig] = intervaltree.IntervalTree()

            for block_interval in block_interval_tree.search(
                    *contig_intervals_file_distance[contig]):
                block_start_text_distance = block_interval.begin - contig_intervals_file_distance[
                    contig][0]
                block_end_text_distance = block_interval.end - contig_intervals_file_distance[
                    contig][0]
                sequence_blocks[contig].addi(block_start_text_distance,
                                             block_end_text_distance,
                                             block_interval.data)

        verbose_print('\t\tDone in {}.'.format(datetime.datetime.now() -
                                               start_time))
        return sequence_blocks
Example #25
def Calc(args):

    tree = {}
    f = subprocess.Popen(shlex.split("gzip -fdc %s" % (args.bed_fn)),
                         stdout=subprocess.PIPE,
                         bufsize=8388608)
    for row in f.stdout:
        row = row.split()
        name = row[0]
        if name not in tree:
            tree[name] = intervaltree.IntervalTree()
        begin = int(row[1])
        end = int(row[2]) - 1
        if end == begin: end += 1
        tree[name].addi(begin, end)
    f.stdout.close()
    f.wait()

    f = subprocess.Popen(shlex.split("gzip -fdc %s" % (args.input_fn)),
                         stdout=subprocess.PIPE,
                         bufsize=8388608)
    for row in f.stdout:
        ctgName, pos = [(row.split()[i]) for i in [0, 1]]
        pos = int(pos)
        if ctgName not in tree:
            continue
        if len(tree[ctgName].search(pos)) == 0:
            continue
        sys.stdout.write(row)
    sys.stdout.flush()
    f.stdout.close()
    f.wait()
Example #26
    def variants(self):
        """Yield diploid variants.

        :yields `Variant` objs

        """
        for chrom, ln in loose_version_sort(self.chroms):
            sys.stderr.write("[" + str(datetime.now().strftime('%m-%d-%Y %H:%M:%S')) + "]" + " INFO: MERGING VARIANTS IN CONTIG: " + str(chrom) + "\n")
            sys.stderr.flush()
            merged = []
            trees = [vcf._tree[chrom] for vcf in self.vcfs]
            # assign haplotype so that otherwise identical variants in both
            # trees are not treated as identical (we need to be able to
            # distinguish between 0/1 and 1/1)
            for h, tree in enumerate(trees):
                for i in tree.all_intervals:
                    i.data.info['mhap'] = h
            comb = intervaltree.IntervalTree(
                trees[0].all_intervals.union(trees[1].all_intervals))
            # if strict, merge only overlapping intervals (not adjacent ones)
            comb.merge_overlaps(
                strict=self.only_overlapping,
                data_initializer=list(),
                data_reducer=lambda x, y: x + [y])
            ref_seq = self.fasta.fetch(chrom).upper()
            for interval in comb.all_intervals:
                merged.append(_merge_variants(
                    interval, trees, ref_seq,
                    detailed_info=self.detailed_info,
                    discard_phase=self.discard_phase))
            yield from sorted(merged, key=lambda x: x.pos)
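
The merge pattern above in isolation: pool the intervals of two trees, then collapse overlapping ones while accumulating their payloads into a list via data_initializer and data_reducer.

import intervaltree

t1 = intervaltree.IntervalTree([intervaltree.Interval(0, 10, 'a')])
t2 = intervaltree.IntervalTree([intervaltree.Interval(5, 15, 'b'),
                                intervaltree.Interval(20, 30, 'c')])
comb = intervaltree.IntervalTree(t1.all_intervals.union(t2.all_intervals))
comb.merge_overlaps(strict=False,
                    data_initializer=list(),
                    data_reducer=lambda x, y: x + [y])
print(sorted((iv.begin, iv.end, iv.data) for iv in comb))
# -> [(0, 15, ['a', 'b']), (20, 30, ['c'])]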
Example #27
def nr_interval_merge(df_chr, overlap=0.5):
    """
    Reduce a dataframe to non-redundant intervals based on reciprocal overlap. All records in the dataframe must be
    on the same chromosome.

    :param df_chr: DataFrame of one chromosome.
    :param overlap: Reciprocal overlap (0, 1].

    :return: Dataframe subset using the first record in a unique interval.
    """

    index_list = list()  # Dataframe indices to return

    interval_tree = intervaltree.IntervalTree()  # Tree of intervals

    # Iterate rows
    for index, row in df_chr.iterrows():
        ri_match = False

        # Find matches
        for interval in interval_tree[row['POS']:row['END']]:
            if reciprocal_overlap(row['POS'], row['END'], interval.begin, interval.end) >= overlap:
                ri_match = True
                break

        # Append to non-redundant records if no match
        if not ri_match:
            index_list.append(index)

        # All records are added to the tree
        interval_tree[row['POS']:row['END']] = True

    return df_chr.loc[index_list]
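
A usage sketch with a toy DataFrame; reciprocal_overlap is not shown above, so a hypothetical stand-in (shared length divided by the longer interval) is defined here.

import intervaltree
import pandas as pd

def reciprocal_overlap(s1, e1, s2, e2):
    # hypothetical stand-in for the project's helper
    return (min(e1, e2) - max(s1, s2)) / max(e1 - s1, e2 - s2)

df_chr = pd.DataFrame({'POS': [100, 110, 500], 'END': [200, 210, 600]})
print(nr_interval_merge(df_chr).index.tolist())   # -> [0, 2]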
Example #28
def main(name, bed, src, target):
    logging.info('parsing {}...'.format(bed))
    tree = {}
    for idx, line in enumerate(open(bed, 'r')):
        chrom, start, finish, annotation = line.strip('\n').split('\t')[:4]
        if chrom not in tree:
            tree[chrom] = intervaltree.IntervalTree()
        tree[chrom][int(start):int(finish)] = annotation
    logging.info('parsing {}: done.'.format(bed))

    logging.info('reading from stdin...')
    yes = no = 0
    for idx, line in enumerate(src):
        chrom, start, finish, annotation = line.strip('\n').split('\t')[:4]
        if chrom not in tree:
            sys.stdout.write(line)
            no += 1
            continue
        overlaps = tree[chrom][int(start):int(finish)]
        if len(overlaps) == 0:
            sys.stdout.write(line)
            no += 1
        else:
            values = set([overlap.data for overlap in overlaps])
            value = ','.join(values)
            if annotation.endswith(';'):
                sys.stdout.write('{}\t{}\t{}\t{}{}={}\n'.format(
                    chrom, start, finish, annotation, name, value))
            else:
                sys.stdout.write('{}\t{}\t{}\t{};{}={}\n'.format(
                    chrom, start, finish, annotation, name, value))
            yes += 1

    logging.info('%i overlaps. %i with no overlap.', yes, no)
Example #29
    def variants(self):
        """Yield diploid variants.

        :yields `medaka.vcf.Variant` objs

        """
        for chrom in medaka.common.loose_version_sort(self.chroms):
            self.logger.info('Merging variants in chrom {}'.format(chrom))
            merged = []
            trees = [vcf._tree[chrom] for vcf in self.vcfs]
            # assign haplotype so that otherwise identical variants in both
            # trees are not treated as identical (we need to be able to
            # distinguish between 0/1 and 1/1)
            for h, tree in enumerate(trees):
                for i in tree.all_intervals:
                    i.data.info['mhap'] = h
            comb = intervaltree.IntervalTree(
                trees[0].all_intervals.union(trees[1].all_intervals))
            # if strict, merge only overlapping intervals (not adjacent ones)
            comb.merge_overlaps(
                strict=self.only_overlapping,
                data_initializer=list(),
                data_reducer=lambda x, y: x + [y])
            ref_seq = self.fasta.fetch(chrom).upper()
            for interval in comb.all_intervals:
                merged.append(_merge_variants(
                    interval, trees, ref_seq,
                    detailed_info=self.detailed_info,
                    discard_phase=self.discard_phase))
            yield from sorted(merged, key=lambda x: x.pos)
Example #30
 def __init__(self, refFile, chrom="chr"):
     self.ref_fasta = pysam.FastaFile(refFile)
     self.alts = {}  # AltAllele.Key -> AltAllele
     self.refs = intervaltree.IntervalTree()
     self.chrom = chrom
     self.first_pos = None
     self.last_pos = None