Ejemplo n.º 1
0
 def calculate_non_overlapping_range_with(self, occupied):
     """Return the part of this block's occurrence ranges that is still free.

     Every start position from ``self.block_occurrences()`` is turned into
     a range of ``self.minimum_block_length`` positions.  Those ranges are
     then compared with ``occupied``.

     :param occupied: range set of positions that are already taken;
         passed to ``RangeSet.intersection``.
     :returns: the complete occurrence range set when nothing overlaps
         with ``occupied``; otherwise a range set in which each contiguous
         occurrence range is cut off at the start of the first overlap
         located at or after its own start; ``None`` when no range is left
         after cutting.
     :raises PartialOverlapException: when the first remaining contiguous
         range is longer than ``self.minimum_block_length``.
     """
     # convert block occurrences into ranges
     potential_block_range = RangeSet()
     for occurrence in self.block_occurrences():
         potential_block_range.add_range(
             occurrence, occurrence + self.minimum_block_length)
     # check the intersection with the already occupied ranges
     block_intersection = potential_block_range.intersection(occupied)
     if not block_intersection:
         # no overlap, return complete block_range
         return potential_block_range
     # There is overlap with the occupied range, so every occurrence has to
     # be trimmed.  The overlap start positions do not change while we loop,
     # so they are collected once instead of once per occurrence.
     overlap_starts = [
         overlap[0] for overlap in block_intersection.contiguous()
     ]
     real_block_range = RangeSet()
     for candidate in potential_block_range.contiguous():
         start = candidate[0]
         # find-first: the first overlap that begins at or after this
         # occurrence; None when there is no such overlap
         stop = next((pos for pos in overlap_starts if pos >= start), None)
         # start == stop means the occurrence is occupied from its very
         # first position, so nothing of it can be kept
         if stop is not None and stop != start:
             real_block_range.add_range(start, stop)
     if not real_block_range:
         # There is complete overlap, so return None
         return None
     # Sanity check: the first slice must not be larger than the potential
     # block length (this happens when the cut-off point was found beyond
     # the end of the occurrence itself)
     first_range = next(real_block_range.contiguous())
     if first_range[-1] - first_range[0] + 1 > self.minimum_block_length:
         raise PartialOverlapException()
     return real_block_range
Ejemplo n.º 2
0
 def calculate_non_overlapping_range_with(self, occupied):
     """Return the part of this block's occurrence ranges that is still free.

     Every start position from ``self.block_occurrences()`` is turned into
     a range of ``self.minimum_block_length`` positions.  Those ranges are
     then compared with ``occupied``.

     :param occupied: range set of positions that are already taken;
         passed to ``RangeSet.intersection``.
     :returns: the complete occurrence range set when nothing overlaps
         with ``occupied``; otherwise a range set in which each contiguous
         occurrence range is cut off at the start of the first overlap
         located at or after its own start; ``None`` when no range is left
         after cutting.
     :raises PartialOverlapException: when the first remaining contiguous
         range is longer than ``self.minimum_block_length``.
     """
     # convert block occurrences into ranges
     potential_block_range = RangeSet()
     for occurrence in self.block_occurrences():
         potential_block_range.add_range(
             occurrence, occurrence + self.minimum_block_length)
     # check the intersection with the already occupied ranges
     block_intersection = potential_block_range.intersection(occupied)
     if not block_intersection:
         # no overlap, return complete block_range
         return potential_block_range
     # There is overlap with the occupied range, so every occurrence has to
     # be trimmed.  The overlap start positions do not change while we loop,
     # so they are collected once instead of once per occurrence.
     overlap_starts = [
         overlap[0] for overlap in block_intersection.contiguous()
     ]
     real_block_range = RangeSet()
     for candidate in potential_block_range.contiguous():
         start = candidate[0]
         # find-first: the first overlap that begins at or after this
         # occurrence; None when there is no such overlap
         stop = next((pos for pos in overlap_starts if pos >= start), None)
         # start == stop means the occurrence is occupied from its very
         # first position, so nothing of it can be kept
         if stop is not None and stop != start:
             real_block_range.add_range(start, stop)
     if not real_block_range:
         # There is complete overlap, so return None
         return None
     # Sanity check: the first slice must not be larger than the potential
     # block length.  The built-in next() is used instead of the iterator's
     # .next() method, which only exists on Python 2.
     first_range = next(real_block_range.contiguous())
     if first_range[-1] - first_range[0] + 1 > self.minimum_block_length:
         raise PartialOverlapException()
     return real_block_range
    def _get_non_overlapping_repeating_blocks(self):
        """Pick blocks from the LCP intervals so that no two of them overlap.

        The intervals returned by
        ``self.token_index.split_lcp_array_into_intervals()`` are all
        potential blocks, but some of them overlap.  They are taken from a
        priority queue one by one (the order is whatever the intervals'
        own comparison defines; per the original note: number of
        witnesses first, then block length) and handled as follows:

        * no overlap with what is already occupied: the whole interval
          becomes a block;
        * complete overlap: the interval is skipped;
        * partial overlap: only the free stretches that begin at one of
          the interval's occurrence start positions are kept, all of them
          are shortened to the length of the shortest one, and the result
          becomes a block when at least two such stretches exist.

        :returns: list of ``Block`` objects in the order they were selected.
        """
        potential_blocks = self.token_index.split_lcp_array_into_intervals()
        queue = PriorityQueue()
        for interval in potential_blocks:
            queue.put(interval)

        occupied = RangeSet()
        real_blocks = []

        while not queue.empty():
            item = queue.get()
            potential_block_range = item._as_range()
            # check the intersection with the already occupied ranges
            block_intersection = potential_block_range.intersection(occupied)
            if not block_intersection:
                # nothing is taken yet: select the interval as it is
                occupied.union_update(potential_block_range)
                real_blocks.append(Block(potential_block_range))
                continue

            if block_intersection == potential_block_range:
                # complete overlap; skip
                continue

            # partial overlap: look at what is still free
            occurrence_difference = potential_block_range.difference(
                block_intersection)

            # Left partial overlap: a free stretch that does not begin at an
            # occurrence start has lost its head and is filtered out.
            # Start positions are looked up once per free stretch, so they
            # are put in a set (assumes they are hashable positions, like
            # the range elements they are compared with).
            start_positions = frozenset(item.block_occurrences())
            resulting_difference = RangeSet()
            count = 0
            for free_range in occurrence_difference.contiguous():
                if free_range[0] in start_positions:
                    # add_range() takes an exclusive stop, hence the + 1
                    resulting_difference.add_range(
                        free_range[0], free_range[-1] + 1)
                    count += 1

            # a repeating block needs at least two occurrences
            if count < 2:
                continue

            # Right partial overlap: some stretches may have lost their
            # tail, so all of them are shortened to the shortest one
            # (never longer than the interval's own length).
            kept_ranges = list(resulting_difference.contiguous())
            minimum_length = min(
                [item.length] + [len(kept) for kept in kept_ranges])

            result = RangeSet()
            for kept in kept_ranges:
                result.add_range(kept[0], kept[0] + minimum_length)

            occupied.union_update(result)
            real_blocks.append(Block(result))

        return real_blocks
Ejemplo n.º 4
0
    def _get_non_overlapping_repeating_blocks(self):
        """Pick blocks from the LCP intervals so that no two of them overlap.

        The intervals returned by
        ``self.token_index.split_lcp_array_into_intervals()`` are all
        potential blocks, but some of them overlap.  They are taken from a
        priority queue one by one (the order is whatever the intervals'
        own comparison defines; per the original note: number of
        witnesses first, then block length) and handled as follows:

        * no overlap with what is already occupied: the whole interval
          becomes a block;
        * complete overlap: the interval is skipped;
        * partial overlap: only the free stretches that begin at one of
          the interval's occurrence start positions are kept, all of them
          are shortened to the length of the shortest one, and the result
          becomes a block when at least two such stretches exist.

        :returns: list of ``Block`` objects in the order they were selected.
        """
        potential_blocks = self.token_index.split_lcp_array_into_intervals()
        queue = PriorityQueue()
        for interval in potential_blocks:
            queue.put(interval)

        occupied = RangeSet()
        real_blocks = []

        while not queue.empty():
            item = queue.get()
            potential_block_range = item._as_range()
            # check the intersection with the already occupied ranges
            block_intersection = potential_block_range.intersection(occupied)
            if not block_intersection:
                # nothing is taken yet: select the interval as it is
                occupied.union_update(potential_block_range)
                real_blocks.append(Block(potential_block_range))
                continue

            if block_intersection == potential_block_range:
                # complete overlap; skip
                continue

            # partial overlap: look at what is still free
            occurrence_difference = potential_block_range.difference(
                block_intersection)

            # Left partial overlap: a free stretch that does not begin at an
            # occurrence start has lost its head and is filtered out.
            # Start positions are looked up once per free stretch, so they
            # are put in a set (assumes they are hashable positions, like
            # the range elements they are compared with).
            start_positions = frozenset(item.block_occurrences())
            resulting_difference = RangeSet()
            count = 0
            for free_range in occurrence_difference.contiguous():
                if free_range[0] in start_positions:
                    # add_range() takes an exclusive stop, hence the + 1
                    resulting_difference.add_range(
                        free_range[0], free_range[-1] + 1)
                    count += 1

            # a repeating block needs at least two occurrences
            if count < 2:
                continue

            # Right partial overlap: some stretches may have lost their
            # tail, so all of them are shortened to the shortest one
            # (never longer than the interval's own length).
            kept_ranges = list(resulting_difference.contiguous())
            minimum_length = min(
                [item.length] + [len(kept) for kept in kept_ranges])

            result = RangeSet()
            for kept in kept_ranges:
                result.add_range(kept[0], kept[0] + minimum_length)

            occupied.union_update(result)
            real_blocks.append(Block(result))

        return real_blocks
Ejemplo n.º 5
0
 def test_contiguous(self):
     # An empty range set has no contiguous pieces at all.
     empty_set = RangeSet()
     self.assertEqual([], list(map(str, empty_set.contiguous())))
     # A mixed set is split into its unbroken runs, in ascending order;
     # single values show up as one-element pieces.
     mixed_set = RangeSet("1,3-9,14-21,30-39,42")
     expected_pieces = ['1', '3-9', '14-21', '30-39', '42']
     actual_pieces = list(map(str, mixed_set.contiguous()))
     self.assertEqual(expected_pieces, actual_pieces)
Ejemplo n.º 6
0
 def test_contiguous(self):
     # contiguous() must split the set into its unbroken runs, in ascending
     # order; single values show up as one-element pieces.
     mixed_set = RangeSet("1,3-9,14-21,30-39,42")
     expected_pieces = ['1', '3-9', '14-21', '30-39', '42']
     actual_pieces = list(map(str, mixed_set.contiguous()))
     self.assertEqual(expected_pieces, actual_pieces)