Esempio n. 1
0
def create_triplex_superset_tree(full_file_path):
    """Write the non-contained triplexes of a ``.tpx`` file to ``.ss.tpx``.

    The lines of the input are grouped by chromosome (via
    ``utils.get_tpx_lines_by_chromosome``).  Within each chromosome the TTS
    intervals are ordered by start (ascending) and, for equal starts, by end
    (descending).  A line is written to the output only if no previously
    kept interval of the same chromosome overlaps both its start and its end
    position.

    :param full_file_path: path of the input ``.tpx`` file; the output path
        is this path with ``'.tpx'`` replaced by ``'.ss.tpx'``.
    :return: None
    """
    print("Creating triplex superset TREE VERSION for file: " + full_file_path)
    content = utils.get_tpx_lines_by_chromosome(full_file_path)

    def sort_key(iv):
        # Start ascending; for equal starts the larger end comes first.
        # A key function replaces the Python-2-only ``cmp=`` argument.
        return (iv[0], -iv[1])

    # ``with`` guarantees the output file is closed even if a line fails
    # to parse.
    with open(full_file_path.replace('.tpx', '.ss.tpx'), 'w') as output_file:
        output_file.write(get_first_line(full_file_path))

        for chrom, lines in content.items():
            print("Processing #" + str(len(lines)) +
                  " triplexes from chromosome " + chrom)
            if not lines:
                # Nothing to keep; avoids indexing an empty list below.
                continue

            intervals = set()
            for index, line in enumerate(lines):
                cols = line.split('\t')
                (tfo_chr, _) = cols[0].split(':')
                (tts_chr, tts_start_offset) = cols[3].split(':')
                tts_start_offset = int(tts_start_offset)
                intervals.add((int(cols[4]) + tts_start_offset,
                               int(cols[5]) + tts_start_offset,
                               index))
                assert (tfo_chr == tts_chr)

            sorted_intervals = sorted(intervals, key=sort_key)

            # The interval data are line indexes local to this chromosome,
            # so the tree must not carry intervals over from another
            # chromosome.
            tree = intervaltree.IntervalTree()

            # add first interval always
            first = sorted_intervals[0]
            tree.add(intervaltree.Interval(first[0], first[1], first[2]))
            output_file.write(lines[first[2]])

            for iv in sorted_intervals[1:]:
                overlaps_start = set(
                    x.data for x in tree.top_node.search_overlap([iv[0]]))
                # Contained when one kept interval overlaps both end points.
                contained = any(
                    iv_overlap.data in overlaps_start
                    for iv_overlap in tree.top_node.search_overlap([iv[1]]))
                if not contained:
                    tree.add(intervaltree.Interval(iv[0], iv[1], iv[2]))
                    output_file.write(lines[iv[2]])
 def lookup_symbol_interval(self, name, num):
     """Return the address interval of ``name`` after applying relocations.

     The interval starts as the location reported by ``mmap_var_loc``.  For
     every relocation record of substage ``num`` whose source range contains
     the current interval, the interval is shifted by that record's offset.
     """
     (lo, hi) = db_info.get(self.stage).mmap_var_loc(name)
     reloc_names = db_info.get(self.stage).reloc_names_in_substage(num)
     relocations = db_info.get(self.stage).reloc_info_by_cardinal(reloc_names)
     for (_rname, rbegin, rsize, roffset) in relocations:
         source_range = intervaltree.Interval(rbegin, rbegin + rsize)
         if not source_range.contains_interval(intervaltree.Interval(lo, hi)):
             continue
         # Later records are tested against the already shifted interval.
         lo += roffset
         hi += roffset
     return intervaltree.Interval(lo, hi)
 def divide_intervals(self, stages, table):
     """Group the address ranges of ``table`` into one tree per stage.

     Each row contributes ``[minaddr, maxaddr)`` to the tree of its
     ``substagenum``.  When ``self.interval_type`` equals
     ``self.CUMULATIVE`` the range is also added to every later index up to
     ``len(stages) - 1``.  Overlaps are merged before returning.
     """
     trees = dict((stage, intervaltree.IntervalTree()) for stage in stages)
     for row in table.iterrows():
         substage = row["substagenum"]
         lo, hi = row["minaddr"], row["maxaddr"]
         trees[substage].add(intervaltree.Interval(lo, hi))
         if self.interval_type != self.CUMULATIVE:
             continue
         for later in range(substage + 1, len(stages)):
             trees[later].add(intervaltree.Interval(lo, hi))
     for tree in trees.itervalues():
         tree.merge_overlaps()
     return trees
Esempio n. 4
0
def make_variant(seq, tree, max_indel=5, chrom='contig1'):
    """Draw a random variant that does not overlap intervals in ``tree``.

    Candidates (deletion, insertion or single-base substitution) are drawn
    until one is found whose interval does not overlap the tree; that
    interval is added to the tree and the trimmed variant is returned.
    """
    while True:
        vpos = np.random.randint(max_indel + 1, len(seq) - max_indel - 1)
        indel_len = np.random.randint(-max_indel, max_indel + 1)
        anchor = seq[vpos]
        if indel_len < 0:
            # deletion: reference spans the deleted bases plus the anchor
            ref, alt = seq[vpos: vpos + abs(indel_len) + 1], anchor
        elif indel_len > 1:
            # insertion of ``indel_len`` random bases after the anchor
            ref = anchor
            alt = ref + ''.join(np.random.choice(bases, indel_len))
        else:
            # lengths 0 and 1 both produce a single nucleotide substitution
            ref = anchor
            alt = np.random.choice([b for b in bases if str(b) != str(ref)])
        qual = np.random.randint(1, 10)
        v = medaka.vcf.Variant(chrom, vpos, ref=ref, alt=alt, qual=qual,
                               genotype_data={'GT': '1/1'})
        vtrimmed = v.trim()
        end = vtrimmed.pos + len(vtrimmed.ref)
        # The interval starts at the untrimmed position and ends one past
        # the trimmed reference.
        interval = intervaltree.Interval(v.pos, end + 1)
        if tree.overlaps(interval):
            continue
        tree.add(interval)
        return vtrimmed
Esempio n. 5
0
def extract_data_page_slack(page):
    '''
    extract the slack bytes from the given data page.

    Args:
        page (cim.DataPage): the page from which to extract slack space.

    Yields:
        SlackRegion: the raw bytes of the slack space.
    '''
    toc_len = len(page.toc)

    # begin with the whole page as one candidate region and carve the used
    # parts out of it
    slack = intervaltree.IntervalTree()
    slack.add(intervaltree.Interval(0, cim.DATA_PAGE_SIZE))

    # the toc itself is in use
    slack.chop(0, toc_len)

    # a non-empty toc is followed by an empty terminating entry that is not
    # listed among the entries but still belongs to the toc
    if toc_len > 0:
        slack.chop(toc_len, toc_len + 0x10)

    # every toc entry marks a used region
    for j in range(page.toc.count):
        entry = page.toc[j]
        slack.chop(entry.offset, entry.offset + entry.size)

    for begin, end, _ in sorted(slack):
        # ignore any region longer than a page
        if (end - begin) <= cim.DATA_PAGE_SIZE:
            yield SlackRegion(page.logical_page_number, begin,
                              page.buf[begin:end])
Esempio n. 6
0
def samples_to_bed(args):
    """Write a bed file covering the samples of a datastore file.

    Reads the sample index of ``args.inputs``, merges the per-contig sample
    ranges and writes one ``contig<TAB>begin<TAB>end`` line per merged range
    to ``args.output``.
    """
    logger = medaka.common.get_named_logger('Variants')
    index = medaka.datastore.DataIndex(args.inputs)

    trees = collections.defaultdict(intervaltree.IntervalTree)
    logger.info("Building interval tree")
    for sample_name, _ in index.samples:
        decoded = medaka.common.Sample.decode_sample_name(sample_name)
        # start/end are string representations of floats (major.minor)
        first = int(float(decoded['start']))
        last = int(float(decoded['end']))
        # A sample includes its last position, whereas intervaltree and bed
        # intervals are end-exclusive, hence the +1.
        trees[decoded['ref_name']].add(
            intervaltree.Interval(first, last + 1))

    with open(args.output, 'w') as fh:
        for contig, tree in trees.items():
            # strict=False also merges samples that merely abut in major
            # coordinates (their overlap may lie in minor coordinates).
            tree.merge_overlaps(strict=False)
            logger.info("Writing intervals for {}".format(contig))
            fh.writelines(
                "{}\t{}\t{}\n".format(contig, iv.begin, iv.end)
                for iv in sorted(tree.all_intervals))

    logger.info("All done, bed file written to {}".format(args.output))
Esempio n. 7
0
    def write_interval_info(self,
                            hwname,
                            pclo=None,
                            pchi=None,
                            substage_names=None,
                            substage_entries=None):
        """Return the destination ranges of the writes recorded for ``hwname``.

        When ``hwname`` contains ``"framac"`` the result is a list of
        ``(destlo, desthi)`` tuples taken from the rows whose ``writepc``
        satisfies ``pclo <= writepc < pchi``.

        Otherwise the writes table is walked in ``index`` order.  The current
        substage number starts at 0 and advances whenever a row's ``pc``
        falls inside the entry range of the next substage; each write made
        while the current number is in ``substage_names`` is added to that
        substage's tree as ``[dest, dest + writesize)``.

        :param hwname: name used to look up the writes table.
        :param pclo: lower pc bound (inclusive), used by the framac branch.
        :param pchi: upper pc bound (exclusive), used by the framac branch.
        :param substage_names: substage numbers to collect (default: none).
        :param substage_entries: ``(lopc, hipc)`` entry ranges indexed by
            substage number (default: none).
        :return: list of tuples (framac) or dict of substage number ->
            ``intervaltree.IntervalTree``.
        """
        # ``None`` defaults avoid sharing one mutable object between calls.
        if substage_names is None:
            substage_names = []
        if substage_entries is None:
            substage_entries = {}

        wt = self._get_writestable(hwname)
        if "framac" in hwname:
            # get_rows needs the table as well as the condition, exactly as
            # in the per-pc lookup further down.
            condition = '(%d <= writepc) & (writepc < %d)' % (pclo, pchi)
            return [(r['destlo'], r['desthi'])
                    for r in pytable_utils.get_rows(wt, condition)]

        num = 0
        intervals = {n: intervaltree.IntervalTree() for n in substage_names}
        for r in wt.read_sorted('index'):
            pc = r['pc']
            if num < len(substage_entries) - 1:
                # check if we found the entrypoint to the next stage
                (lopc, hipc) = substage_entries[num + 1]
                if lopc <= pc < hipc:
                    num += 1
            if num in substage_names:
                start = r['dest']
                end = start + pytable_utils.get_rows(
                    wt, 'pc == %d' % r['pc'])[0]['writesize']
                intervals[num].add(intervaltree.Interval(start, end))
        return intervals
 def update_writes(self,
                   line,
                   pc,
                   lo,
                   hi,
                   stage,
                   origpc=None,
                   substage=None):
     """Record a write to ``[lo, hi)`` in the write-range table.

     When ``pc`` is falsy the source location is parsed from ``line``
     (``path:lineno``); otherwise an empty path and line 0 are used.
     ``origpc`` falls back to ``pc``.  An inverted range is reported on
     stdout together with a stack trace but is still recorded.  ``stage``
     is not used by this method.
     """
     if pc:
         path, lineno = '', 0
     else:
         path, lineno_text = line.split(':')
         lineno = int(lineno_text)
     entry = WriteDstResult(path,
                            lineno,
                            '', [intervaltree.Interval(lo, hi)],
                            pc,
                            origpc or pc,
                            substage_name=substage)
     if lo > hi:
         print("%x > %x at %x" % (lo, hi, pc))
         traceback.print_stack()
     self.writerangetable.add_dsts_entry(entry)
 def from_line(cls, line, stage):
     """Build an instance of ``cls`` from one framac dst result line.

     The stripped line is matched against ``cls.regexp``.  Groups 1 and 2
     give the lower and upper address (the upper one is capped at
     ``0xFFFFFFFF``), group 3 the lvalue, group 4 the source path, group 5
     the line number and group 6 the call stack.  A path located under the
     target software root, under the test suite path or under ``/tmp/tmp``
     is re-rooted into the temporary target source directory.

     :param line: raw text line to parse.
     :param stage: stage passed through to the constructor.
     :return: the value of ``cls(...)`` for the parsed line.
     :raises Exception: if the line does not match ``cls.regexp``.
     """
     res = cls.regexp.match(line.strip())
     if res is None:
         # Interpolate the offending line; the message used to be raised
         # with its "%s" placeholder left unformatted.
         raise Exception("%s is not a framac dst result" % (line,))

     min_value = int(res.group(1), 0)
     max_value = min(int(res.group(2), 0), 0xFFFFFFFF)
     lvalue = res.group(3)
     path = res.group(4)

     # somewhat of a hack to get relative path
     root = Main.get_target_cfg().software_cfg.root
     if path.startswith(root):
         relative = os.path.relpath(path, root)
     elif path.startswith(
             Main.test_suite_path):  # not sure why this happens
         relative = os.path.relpath(path, Main.test_suite_path)
     elif path.startswith("/tmp/tmp"):
         relative = "/".join(path.split("/")[3:])
     else:
         relative = None
     if relative is not None:
         path = os.path.join(Main.raw.runtime.temp_target_src_dir, relative)

     lineno = int(res.group(5))
     callstack = res.group(6)
     # somewhat of a hack for muxconf
     return cls(path,
                lineno,
                lvalue, [intervaltree.Interval(min_value, max_value)],
                callstack=callstack,
                stage=stage)
Esempio n. 10
0
 def load_bed(self, instream):
     """Fill ``self.trees`` with one interval tree per chromosome.

     Every line of ``instream`` is split on tabs; its first three fields are
     the chromosome name and the integer begin/end of the interval.
     """
     trees = {}
     self.trees = trees
     for record in instream:
         fields = record.split('\t', 3)
         chrom, start0, end1 = fields[:3]
         try:
             tree = trees[chrom]
         except KeyError:
             tree = trees[chrom] = intervaltree.IntervalTree()
         tree.add(intervaltree.Interval(int(start0), int(end1)))
Esempio n. 11
0
def _build_semester_lookup():
    """
    Build an interval tree mapping date ranges to semester names.

    Each semester contributes the interval from its start date to one day
    past its end date, with the semester name stored as the interval data.
    """
    def as_interval(semester):
        name = semester.name
        first_day, last_day = Semester.start_end_dates(semester)
        return intervaltree.Interval(first_day, last_day + ONE_DAY, name)

    return intervaltree.IntervalTree(
        as_interval(semester) for semester in Semester.objects.all())
Esempio n. 12
0
 def _add_addr_range(self, start, end):
     """Add ``[start, end)`` to ``self.addresses``.

     Both bounds must be exactly of type ``int`` and ``end`` must not be
     smaller than ``start``; otherwise an ``Exception`` is raised.
     """
     both_ints = type(start) == int and type(end) == int
     if not both_ints:
         raise Exception("One of start addr (%s) end addr (%s) is not an int for %s" %
                         (start, end, self.short_name))
     if end < start:
         raise Exception("start addr %x must be smaller than end address %x for %s" %
                         (start, end, self.short_name))
     self.addresses.add(intervaltree.Interval(start, end))
Esempio n. 13
0
def intervaltrees_from_bed(path_to_bed):
    """Build one interval tree per chromosome from a .bed file.

    :param path_to_bed: str, path to .bed file.
    :returns: { str chrom: `intervaltree.IntervalTree` obj }.
    """
    trees = defaultdict(intervaltree.IntervalTree)
    for record in yield_from_bed(path_to_bed):
        chrom, start, stop = record
        interval = intervaltree.Interval(begin=start, end=stop)
        trees[chrom].add(interval)
    return trees
Esempio n. 14
0
    def setup(self):
        """Resolve configuration references, paths and address ranges once.

        Does nothing when ``setup_done`` already exists.  Otherwise looks up
        the bootloader and hardware-type configurations, makes the three
        hardware-info file attributes absolute, converts the physical and
        RAM address ranges to interval trees, and derives
        ``non_ram_ranges`` by removing the RAM ranges from the physical
        range.
        """
        if self.attr_exists("setup_done"):
            return
        self.setup_done = True

        self.supported_bootloader_cfgs = {}
        self.hardware_type_cfgs = {}
        for bootloader in self.supported_bootloaders:
            cfg = self.object_config_lookup("Bootloader", bootloader)
            self.supported_bootloader_cfgs[bootloader] = cfg

        for hw_type in self.types:
            cfg = self.object_config_lookup("HardwareConfig", hw_type)
            self.hardware_type_cfgs[hw_type] = cfg

        def in_hw_dir(leaf):
            # Path of ``leaf`` inside this hardware's info directory.
            return os.path.join(Main.hw_info_path, self.name, leaf)

        self.base_mem_map = in_hw_dir(self.base_mem_map)
        self.sdskeleton = in_hw_dir(self.sdskeleton)
        self.tech_reference = in_hw_dir(self.tech_reference)

        if self.attr_exists("phy_addr_range"):
            whole = intervaltree.Interval(self.phy_addr_range['loaddr'],
                                          self.phy_addr_range['hiaddr'])
            self.phy_addr_range = intervaltree.IntervalTree([whole])

        ram_specs = self.ram_ranges if self.attr_exists("ram_ranges") else []
        self.ram_range_names = [spec['name'] for spec in ram_specs]
        range_intervals = []
        for spec in ram_specs:
            inter = intervaltree.Interval(spec['loaddr'], spec['hiaddr'])
            range_intervals.append(inter)
            # each named range is also exposed as an attribute
            setattr(self, spec['name'], inter)

        self.ram_ranges = intervaltree.IntervalTree(range_intervals)
        self.ram_ranges.merge_overlaps()
        # NOTE(review): this binds the same tree object as phy_addr_range,
        # so the chops below alter both attributes - confirm this is
        # intended.
        self.non_ram_ranges = self.phy_addr_range
        for ram in self.ram_ranges:
            self.non_ram_ranges.chop(ram.begin, ram.end)
            self.non_ram_ranges.remove_overlap(ram)
Esempio n. 15
0
def intervaltree_from_bed(path_to_bed, chrom):
    """Build an interval tree from the .bed entries of one chromosome.

    :param path_to_bed: str, path to .bed file.
    :param chrom: str, chromosome name.
    :returns: `intervaltree.IntervalTree` obj.
    """
    tree = intervaltree.IntervalTree()
    for feature in pybedtools.BedTool(path_to_bed):
        if feature.chrom != chrom:
            continue
        tree.add(intervaltree.Interval(begin=feature.start, end=feature.stop))
    return tree
Esempio n. 16
0
def sort_intervals(intervals):
    """Count how many of the given intervals cover each point 0..23.

    ``intervals`` is an iterable of hashable ``(first, last)`` pairs whose
    ``last`` is treated as inclusive; duplicates are counted separately.
    Returns a list of 24 totals, one per integer point.
    """
    occurrences = defaultdict(int)
    for span in intervals:
        occurrences[span] += 1

    tree = intervaltree.IntervalTree()
    for span, count in occurrences.items():
        # +1 because tree intervals exclude their end point
        tree.add(intervaltree.Interval(span[0], span[1] + 1, count))

    return [sum(hit[2] for hit in tree.at(point)) for point in range(24)]
Esempio n. 17
0
def fillGaps(tads,N,counter):
    """Return ``tads`` extended with intervals covering its gaps in [0, N].

    Works for a single chromosome.  Each gap interval gets the next value
    of ``counter`` as data; the advanced counter is returned together with
    the filled copy of ``tads``.
    """
    # start from the whole chromosome and cut every tad out of it
    whole = itr.Interval(0, N, 1)
    uncovered = itr.IntervalTree([whole])
    for tad in tads:
        uncovered.chop(tad.begin, tad.end)

    # what remains are the gaps; add them to a copy of the input
    filled = tads.copy()
    next_id = counter
    for gap in uncovered:
        filled.addi(gap.begin, gap.end, next_id)
        next_id += 1
    return next_id, filled
Esempio n. 18
0
def region_interval_trees(regions):
    """Return a dict mapping chromosome name to a tree of its regions.

    Each region becomes the interval ``[start - 1, end)`` carrying the
    region object itself as data.
    """
    by_chromosome = defaultdict(list)
    for region in regions:
        as_interval = intervaltree.Interval(region.start - 1, region.end,
                                            data=region)
        by_chromosome[region.chromosome].append(as_interval)

    return {
        chromosome: intervaltree.IntervalTree(members)
        for chromosome, members in by_chromosome.items()
    }
Esempio n. 19
0
    def _interval_tree_regions(self, regions):
        """Return a dict mapping chromosome name to a tree of region indexes.

        Each region becomes the interval ``[start - 1, end)`` whose data is
        the position of the region within ``regions``.
        """
        grouped = defaultdict(list)
        for position, region in enumerate(regions):
            grouped[region.chromosome].append(
                intervaltree.Interval(region.start - 1, region.end,
                                      data=position))

        interval_trees = {}
        for chromosome, members in grouped.items():
            interval_trees[chromosome] = intervaltree.IntervalTree(members)
        return interval_trees
Esempio n. 20
0
def complement_intervaltrees(trees, contig_lengths):
    """Return, per contig, the parts of ``[0, length)`` not covered by input.

    :param trees: {str contig: `intervaltree.IntervalTree` objs}
    :param contig_lengths: {str contig: int contig length}
    :returns: {str contig: `intervaltree.IntervalTree` objs}
    """
    comp = collections.defaultdict(intervaltree.IntervalTree)
    # start with every contig fully covered ...
    for contig, length in contig_lengths.items():
        whole_contig = intervaltree.Interval(0, length)
        comp[contig].add(whole_contig)
    # ... then cut out each interval present in the input
    for contig, tree in trees.items():
        for begin, end, _ in tree:
            comp[contig].chop(begin, end)
    return comp
Esempio n. 21
0
 def _relocated(regname, fullname):
     """Return the relocated address interval for region ``regname``.

     ``fullname`` must contain a dot; the part after its last dot must start
     with digits followed by ``_relocated``.  If the region's addresses are
     resolved, the span from its smallest interval's begin to its largest
     interval's end is shifted by the relocation offset, modulo ``mod``.
     Otherwise ``s`` is returned unchanged.

     NOTE(review): ``s``, ``self`` and ``allregions`` are not defined in this
     block; presumably they come from an enclosing scope - confirm.
     """
     r = allregions[regname]
     # fallback result when the region's addresses are not resolved
     ret = s
     # text after the last "." of fullname
     rend = fullname.rsplit(".", 1)[1]
     # the repeated one-character group keeps only the last digit matched
     cardinalres = re.match("([\d])+_relocated", rend)
     # ``cardinal`` is not used below, but this call raises AttributeError
     # when ``rend`` does not match the pattern
     cardinal = cardinalres.group(1)
     # the pattern is a character class: every run of ".", "_" or the letters
     # of "relocated" is removed from rend
     relocindex = re.sub("[._relocated]+", "", rend)
     if r.addresses_resolved:
         start = min(r.addresses).begin
         end = max(r.addresses).end
         (offset,
          mod) = db_info.get(self.stage).reloc_offset_and_mod_from_cardinal(relocindex)
         # shift both ends by the offset and wrap them at ``mod``
         ret = intervaltree.Interval((start + offset) % mod, (end + offset) % mod)
     return ret
Esempio n. 22
0
    def update(self, labels):
        """
        Attach the given labels to this list and add them to its label tree.

        Args:
            labels (list): Labels to add.
        """
        def attach(label):
            # point the label back at this list before wrapping it
            label.label_list = self
            return intervaltree.Interval(label.start, label.end, label)

        self.label_tree.update([attach(label) for label in labels])
    def setup(self):
        """Resolve paths, host configurations and address ranges once.

        Does nothing when ``setup_done`` already exists.  Otherwise resolves
        ``hw_info_path``, looks up every host configuration, converts the
        overall and the named address ranges to intervals, derives
        ``non_ram_ranges`` by removing the named ranges from the overall
        range, and defaults ``default_host`` to the first host.
        """
        if self.attr_exists("setup_done"):
            return

        self.populate_path_from_name("hw_info_path",
                                     or_default=True,
                                     do_setattr=True)
        self._check_path("hw_info_path")
        self._update_raw("hw_info_path", self.hw_info_path)

        self.setup_done = True
        self.host_cfgs = {}
        for host in self.hosts:
            self.host_cfgs[host] = self.object_config_lookup("HostConfig",
                                                             host)

        if not self.attr_exists("addr_range"):
            self.addr_range = Main.default_raw.HardwareClass._hw.addr_range
        lo = long(self.addr_range[0])
        hi = long(self.addr_range[1])
        whole = intervaltree.Interval(lo, hi)
        self.addr_range = intervaltree.IntervalTree([whole])

        range_intervals = []
        self.addr_range_names = []
        if self.attr_exists("named_addr_ranges"):
            for (range_name, (lo, hi)) in self.named_addr_ranges.iteritems():
                self.addr_range_names.append(range_name)
                inter = intervaltree.Interval(long(lo), long(hi))
                range_intervals.append(inter)
                # expose the range as an attribute and in the raw config
                setattr(self, range_name, inter)
                self._update_raw(range_name, inter)

        self.ram_ranges = intervaltree.IntervalTree(range_intervals)
        self.ram_ranges.merge_overlaps()
        # NOTE(review): this binds the same tree object as addr_range, so
        # the chops below alter both attributes - confirm this is intended.
        self.non_ram_ranges = self.addr_range
        for ram in self.ram_ranges:
            self.non_ram_ranges.chop(ram.begin, ram.end)
            self.non_ram_ranges.remove_overlap(ram)

        if not hasattr(self, "default_host"):
            self.default_host = self.hosts[0]
    def calculate_framac_intervals(self, substages):
        """Return, per substage, a tree of the ranges written by its functions.

        For every row of the trace-intervals table belonging to a substage,
        the pc range of the row's function is looked up and the write ranges
        reported for that pc range are added to the substage's tree.
        """
        # NOTE(review): ``tracename`` is not defined in this method;
        # presumably it is a module-level name - confirm.
        trees = dict((num, intervaltree.IntervalTree()) for num in substages)
        for num in substages:
            rows = pytable_utils.get_rows(self.trace_intervals_table,
                                          'substagenum == %s' % num)
            for row in rows:
                # lookup writes performed by this function
                (lopc, hipc) = self.fun_info(row['functionname'])
                writes = db_info.get(self.stage).write_interval_info(
                    tracename, lopc, hipc)
                written = intervaltree.IntervalTree(
                    [intervaltree.Interval(w[0], w[1]) for w in writes])
                trees[num] += written

        return trees
 def allowed_writes(self, substage):
     """Return a tree of the address ranges writable in ``substage``.

     Every writable policy row of the substage contributes either the
     interval of its symbol (when ``allowed_symbol`` is set) or all mmap
     address ranges sharing its ``short_name``.  Overlapping and equal
     intervals are merged.
     """
     policy_query = "(substagenum == %d) & (writable == True)" % (substage)
     policy_rows = self.substage_region_policy_table.where(policy_query)
     writable = intervaltree.IntervalTree()
     for region in policy_rows:
         if region['allowed_symbol']:
             symbol = region['symbol_elf_name']
             writable.add(self.lookup_symbol_interval(symbol, substage))
             continue
         mmap_query = 'short_name == "%s"' % region['short_name']
         for row in self.substage_mmap_addr_table.where(mmap_query):
             writable.add(intervaltree.Interval(row['startaddr'],
                                                row['endaddr']))
     writable.merge_overlaps()
     writable.merge_equals()
     return writable
Esempio n. 26
0
def make_interval_trees(graph):
    """Return, per source vertex, a tree of its outgoing edges.

    Every edge is stored as the unit interval
    ``[v_start, v_start + 1)`` in the tree of its ``'v'`` vertex, with the
    edge itself as data.
    """
    out_edges = {}
    in_edges = {}  # grouped by target vertex; not used for the result
    for _edge_name, edge in graph.edges.items():
        out_edges.setdefault(edge['v'], []).append(edge)
        in_edges.setdefault(edge['w'], []).append(edge)

    interval_dict = {
        v: [intervaltree.Interval(e['v_start'], e['v_start'] + 1, e)
            for e in edges]
        for v, edges in out_edges.items()
    }

    return {
        v: intervaltree.IntervalTree(unit_intervals)
        for v, unit_intervals in interval_dict.items()
    }
Esempio n. 27
0
 def resolve_addresses(self, all_regions={}, values={}):
     """Resolve this region's addresses and record whether that succeeded.

     Returns immediately when ``self.addresses_resolved`` is already True.
     Otherwise one of three sources is used:

     * ``self._csv``: the register table is parsed; every row with an
       address becomes an interval of ``width / 8`` bytes (4 when the width
       is empty) and a raw subregion named ``<name>.<hex addr>``.
     * ``self._raw_addresses`` is a list (or a list of lists): each entry is
       handed to ``self._resolve_addr_region``.
     * anything else is handed to ``self._resolve_special_addr_region``.

     The outcome is stored in ``self.addresses_resolved``.

     :param all_regions: passed through to the resolver helpers.
     :param values: passed through to the resolver helpers.
     :return: None
     """
     # NOTE: the mutable defaults are kept as-is because they are forwarded
     # to helpers not shown here; this method itself never mutates them.
     all_resolved = True
     if self.addresses_resolved is True:
         return
     elif self._csv:
         tables = parse_am37x_register_tables
         (f, parsed) = tables.parsecsv(self._csv)
         # Close the CSV handle even if a row fails to parse.
         try:
             addrs = intervaltree.IntervalTree()
             for p in parsed:
                 addr = p[tables.TITable.ADDRESS]
                 if not addr:
                     continue
                 addr = int(addr, 0)
                 wid = p[tables.TITable.WIDTH]
                 name = p[tables.TITable.NAME]
                 # create a unique name without spaces
                 name = re.sub(r"\s", "", name) + (".%x" % addr)
                 # Floor division keeps the size an int on Python 3 as well.
                 size = int(wid) // 8 if wid else 4
                 i = intervaltree.Interval(addr, addr + size)
                 addrs.add(i)
                 raw_perms = p[tables.TITable.TYPE].lower()
                 perms = "readonly" if 'w' not in raw_perms else 'global'
                 self._raw_subregions[name] = {
                     'addresses': [i.begin, i.end],
                     'include_children': False,
                     'type': perms,
                 }
                 self.children_names.append("%s.%s" % (self.short_name, name))
             self.addresses = addrs
         finally:
             f.close()
         all_resolved = True
     elif (type(self._raw_addresses) == list):
         if type(self._raw_addresses[0]
                 ) == list:  # its a list of lists of subregions
             for a in self._raw_addresses:
                 # short-circuits: after the first failure the remaining
                 # subregions are not attempted
                 all_resolved = all_resolved and self._resolve_addr_region(
                     a, all_regions, values)
         else:
             all_resolved = self._resolve_addr_region(
                 self._raw_addresses, all_regions, values)
     else:
         all_resolved = self._resolve_special_addr_region(
             self._raw_addresses, all_regions, values)
     self.addresses_resolved = all_resolved
Esempio n. 28
0
 def __init__(self, interval_tuples):
     '''Index gene intervals per chromosome.

     interval_tuples is like [('22', 12321, 12345, 'APOL1'), ...].  For each
     chromosome an interval tree of the genes is kept, plus one BisectFinder
     over the gene start positions and one over the gene end positions.
     '''
     trees = self._its = {}
     starts = self._gene_starts = {}
     ends = self._gene_ends = {}
     for (chrom, pos_start, pos_end, gene_name) in interval_tuples:
         assert isinstance(pos_start, int)
         assert isinstance(pos_end, int)
         if chrom not in trees:
             trees[chrom] = intervaltree.IntervalTree()
             starts[chrom] = []
             ends[chrom] = []
         trees[chrom].add(
             intervaltree.Interval(pos_start, pos_end, gene_name))
         starts[chrom].append((pos_start, gene_name))
         ends[chrom].append((pos_end, gene_name))
     # replace the collected lists by finder objects
     for chrom in trees:
         starts[chrom] = BisectFinder(starts[chrom])
         ends[chrom] = BisectFinder(ends[chrom])
Esempio n. 29
0
def gff2trees(gfffile, feature='CDS'):
    """Return a dict of per-chromosome trees of merged features from a GFF.

    Each feature yielded by ``cds_from_gff`` is merged with the intervals it
    overlaps in its chromosome's tree; the merged interval carries the
    running index of the feature as data.
    """
    trees = {}
    for serial, cds in enumerate(cds_from_gff(gfffile, feature=feature)):
        chrom = cds['chr']
        if chrom not in trees:
            trees[chrom] = intervaltree.IntervalTree()
        tree = trees[chrom]

        # extend the feature to the union with everything it overlaps
        overlapping = tree[cds['begin']:cds['end']]
        merged_begin = min([cds['begin']] + [iv[0] for iv in overlapping])
        merged_end = max([cds['end']] + [iv[1] for iv in overlapping])

        # clear the merged span, then insert it as a single interval
        tree.chop(merged_begin, merged_end)
        tree.add(intervaltree.Interval(merged_begin, merged_end, serial))
    return trees
Esempio n. 30
0
    def __init__(self, data, row_sentence_bounds, window=5, process_all=False):
        """
        Manage windowed input data (like TIMIT).

        :param data: Numpy matrix. Each row should be an example data
        :param row_sentence_bounds:  Numpy matrix with bounds for padding. TODO add default NONE
        :param window: half-window size
        :param process_all: (default False) if True adds context to all data at object initialization.
                            Otherwise the windowed data is created in runtime.
        """
        self.window = window
        self.data = data
        rows, features = data.shape[0], data.shape[1]
        # each row is widened to hold itself plus ``window`` rows per side
        self.shape = (rows, (2 * window + 1) * features)
        # bounds are inclusive, tree intervals are end-exclusive, hence +1
        sentence_spans = [
            it.Interval(int(bound[0]), int(bound[1]) + 1)
            for bound in row_sentence_bounds
        ]
        self.tree = it.IntervalTree(sentence_spans)
        if process_all:
            print('adding context to all the dataset', end='- ')
            self.data = self.generate_all()
            print('DONE')
        self.process_all = process_all