Ejemplo n.º 1
0
 def get_compute_rangeset(self):
     """Return a RangeSet holding the ids of all compute children.

     Each entry of ``self.get_children()`` is matched against
     ``self._comp_regex``; entries that do not match are ignored. For a
     match, the ``id`` named group is converted to ``str``, parsed as a
     RangeSet and merged into the result.
     """
     compute_ids = RangeSet()
     for name in self.get_children():
         found = re.match(self._comp_regex, name)
         if not found:
             # Not a compute child: nothing to collect.
             continue
         id_text = str(found.groupdict()['id'])
         compute_ids.union_update(RangeSet(id_text))
     return compute_ids
 def get_compute_rangeset(self):
     """Build the RangeSet of compute ids found among the children.

     Children come from ``self.get_children()`` and are matched against
     ``self._comp_regex``. The ``id`` named group of every match is
     stringified, parsed as a RangeSet and unioned into the result, which
     is returned (empty when nothing matches).
     """
     collected = RangeSet()
     # Lazy generator: each child is matched only when the loop reaches it.
     hits = (re.match(self._comp_regex, entry) for entry in self.get_children())
     for hit in hits:
         if hit:
             groups = hit.groupdict()
             collected.union_update(RangeSet(str(groups["id"])))
     return collected
Ejemplo n.º 3
0
 def testUpdate(self):
     """test RangeSet.update() and RangeSet.union_update()"""
     base_spec = "1-100,102,105-242,800"
     merged_spec = "1-100,102,105-800,1924-1984"
     base_len, extra_len = 240, 618

     target = RangeSet(base_spec)
     self.assertEqual(len(target), base_len)
     extra = RangeSet("243-799,1924-1984")
     self.assertEqual(len(extra), extra_len)

     # update() merges in place and keeps the object a plain RangeSet.
     target.update(extra)
     self.assertEqual(type(target), RangeSet)
     self.assertEqual(target.padding, None)
     self.assertEqual(len(target), base_len + extra_len)
     self.assertEqual(str(target), merged_spec)

     # union_update() on a fresh copy must give the same merged result.
     target = RangeSet(base_spec)
     target.union_update(extra)
     self.assertEqual(len(target), base_len + extra_len)
     self.assertEqual(str(target), merged_spec)
Ejemplo n.º 4
0
 def testUpdate(self):
     """test RangeSet.update() and RangeSet.union_update()"""
     initial = "1-100,102,105-242,800"
     expected = "1-100,102,105-800,1924-1984"
     initial_count = 240
     added_count = 618

     rangeset = RangeSet(initial)
     self.assertEqual(len(rangeset), initial_count)
     addition = RangeSet("243-799,1924-1984")
     self.assertEqual(len(addition), added_count)

     # First path: update() modifies the receiver in place.
     rangeset.update(addition)
     self.assertEqual(type(rangeset), RangeSet)
     self.assertEqual(rangeset.padding, None)
     self.assertEqual(len(rangeset), initial_count + added_count)
     self.assertEqual(str(rangeset), expected)

     # Second path: union_update() starting again from the initial set.
     rangeset = RangeSet(initial)
     rangeset.union_update(addition)
     self.assertEqual(len(rangeset), initial_count + added_count)
     self.assertEqual(str(rangeset), expected)
Ejemplo n.º 5
0
 def _get_non_overlapping_repeating_blocks(self):
     """Select repeating blocks whose ranges do not overlap one another.

     Candidates are the LCP intervals of the collation's extended suffix
     array, handed to ``self.filter_potential_blocks`` first (its return
     value is unused, so it presumably filters the list in place).
     Candidates are visited in descending order of number of occurrences,
     minimum block length and number of siblings. Each one claims the range
     that ``calculate_non_overlapping_range_with`` reports against the
     positions already taken, if that range is non-empty. A candidate that
     raises ``PartialOverlapException`` is shortened by one and retried
     while its ``minimum_block_length`` is greater than 1.

     Returns the list of selected ``Block`` objects, in selection order.
     """
     suffix_array = self.collation.to_extended_suffix_array()
     candidates = suffix_array.split_lcp_array_into_intervals()
     self.filter_potential_blocks(candidates)

     # Highest priority first on all three attributes.
     priority = attrgetter("number_of_occurrences", "minimum_block_length",
                           "number_of_siblings")
     ranked = sorted(candidates, key=priority, reverse=True)

     taken = RangeSet()
     selected = []
     for candidate in ranked:
         try:
             free_range = candidate.calculate_non_overlapping_range_with(taken)
             if free_range:
                 taken.union_update(free_range)
                 selected.append(Block(free_range))
         except PartialOverlapException:
             while candidate.minimum_block_length > 1:
                 # Retry one shorter: lower the LCP entries of the interval
                 # and the recorded length before asking again.
                 for lcp_index in range(candidate.start + 1, candidate.end + 1):
                     candidate.LCP[lcp_index] -= 1
                 candidate.length -= 1
                 try:
                     free_range = candidate.calculate_non_overlapping_range_with(taken)
                     if not free_range:
                         # Nothing usable at this length; shrink again.
                         continue
                     taken.union_update(free_range)
                     selected.append(Block(free_range))
                     break
                 except PartialOverlapException:
                     # Still in conflict at this length; shrink again.
                     pass
     return selected
Ejemplo n.º 6
0
    def get_non_overlapping_repeating_blocks(self):
        """Pick repeating blocks so that no two selected ranges overlap.

        The LCP intervals of the collation's extended suffix array are the
        candidates; they are passed to ``self.filter_potential_blocks``
        (return value unused, so presumably an in-place filter). They are
        then sorted in descending order of number of occurrences, minimum
        block length and number of siblings. Each candidate contributes the
        non-empty range that ``calculate_non_overlapping_range_with``
        returns for the positions occupied so far. When that call raises
        ``PartialOverlapException``, the candidate is shortened one step at
        a time and retried while ``minimum_block_length`` stays above 1.

        Returns a list of ``Block`` objects in the order they were selected.
        """
        esa = self.collation.to_extended_suffix_array()
        intervals = esa.split_lcp_array_into_intervals()
        self.filter_potential_blocks(intervals)

        # Sort key: occurrences, then block length, then siblings.
        sort_key = attrgetter("number_of_occurrences", "minimum_block_length", "number_of_siblings")
        by_priority = sorted(intervals, key=sort_key, reverse=True)

        # Positions claimed so far and the blocks that claimed them.
        claimed = RangeSet()
        chosen = []
        for interval in by_priority:
            try:
                available = interval.calculate_non_overlapping_range_with(claimed)
                if available:
                    claimed.union_update(available)
                    chosen.append(Block(available))
            except PartialOverlapException:
                while interval.minimum_block_length > 1:
                    # Retry with a length one less than before.
                    for position in range(interval.start + 1, interval.end + 1):
                        interval.LCP[position] -= 1
                    interval.length -= 1
                    try:
                        available = interval.calculate_non_overlapping_range_with(claimed)
                        if not available:
                            # Empty result: try an even shorter length.
                            continue
                        claimed.union_update(available)
                        chosen.append(Block(available))
                        break
                    except PartialOverlapException:
                        # Conflict persists; the loop shortens once more.
                        pass
        return chosen
    def _get_non_overlapping_repeating_blocks(self):
        """Select non-overlapping blocks from the token index's LCP intervals.

        Every LCP interval from ``self.token_index`` is a potential block, but
        potential blocks may overlap. Intervals are taken from a priority
        queue and compared against the positions already occupied:

        * no overlap: the whole interval range becomes a block;
        * complete overlap: the interval is skipped;
        * partial overlap: the free remainder is trimmed as described in the
          inline comments and kept only if at least two occurrences survive.

        Returns the list of selected ``Block`` objects, in selection order.
        """
        potential_blocks = self.token_index.split_lcp_array_into_intervals()
        # Queue order is defined by the intervals' own comparison; per the
        # original note it is 1) number of witnesses, 2) block length --
        # TODO confirm, the comparison is not visible here.
        queue = PriorityQueue()
        for interval in potential_blocks:
            queue.put(interval)

        occupied = RangeSet()
        real_blocks = []

        while not queue.empty():
            item = queue.get()
            potential_block_range = item._as_range()
            # Positions of this candidate that are already taken.
            block_intersection = potential_block_range.intersection(occupied)
            if not block_intersection:
                occupied.union_update(potential_block_range)
                real_blocks.append(Block(potential_block_range))
                continue

            # Complete overlap: nothing left to claim.
            if block_intersection == potential_block_range:
                continue

            # Partial overlap: work with the part that is still free.
            occurrence_difference = potential_block_range.difference(
                block_intersection)

            # Left partial overlap: keep only free runs that still begin at
            # one of the block's occurrence start positions.
            start_pos = item.block_occurrences()
            resulting_difference = RangeSet()
            count = 0
            # ``span`` rather than ``range`` so the builtin is not shadowed.
            for span in occurrence_difference.contiguous():
                if span[0] in start_pos:
                    resulting_difference.add_range(span[0], span[-1] + 1)
                    count += 1

            # Fewer than two kept occurrences: skip this interval.
            if count < 2:
                continue

            # Right partial overlap: every kept occurrence is cut to the
            # shortest remaining run, never longer than the original length.
            # NOTE(review): kept runs that are adjacent may merge into one
            # contiguous run here -- confirm against RangeSet semantics.
            minimum_length = min(
                [item.length]
                + [len(span) for span in resulting_difference.contiguous()])

            result = RangeSet()
            for span in resulting_difference.contiguous():
                result.add_range(span[0], span[0] + minimum_length)

            occupied.union_update(result)
            real_blocks.append(Block(result))

        return real_blocks
Ejemplo n.º 8
0
    def _get_non_overlapping_repeating_blocks(self):
        """Choose the definitive, non-overlapping blocks among the LCP intervals.

        The LCP intervals obtained from ``self.token_index`` are all
        potential blocks, some of which overlap. They are drained from a
        priority queue, and each one is checked against the positions
        occupied by blocks selected earlier:

        * if it touches no occupied position, its full range is selected;
        * if it is entirely occupied, it is dropped;
        * otherwise only its free part is considered, trimmed as described
          in the inline comments, and selected when at least two
          occurrences remain.

        Returns the list of selected ``Block`` objects, in selection order.
        """
        potential_blocks = self.token_index.split_lcp_array_into_intervals()
        # The queue relies on the intervals' own ordering; the original note
        # states 1) number of witnesses, 2) block length -- TODO confirm,
        # the comparison is defined outside this method.
        queue = PriorityQueue()
        for interval in potential_blocks:
            queue.put(interval)

        occupied = RangeSet()
        real_blocks = []

        while not queue.empty():
            item = queue.get()
            potential_block_range = item._as_range()
            # Overlap between this candidate and what is already selected.
            block_intersection = potential_block_range.intersection(occupied)
            if not block_intersection:
                occupied.union_update(potential_block_range)
                real_blocks.append(Block(potential_block_range))
                continue

            # Fully covered by earlier selections: skip.
            if block_intersection == potential_block_range:
                continue

            # Partially covered: continue with the free positions only.
            occurrence_difference = potential_block_range.difference(block_intersection)

            # Left partial overlap: a free run is kept only when it starts
            # at one of the block's occurrence start positions.
            start_pos = item.block_occurrences()
            resulting_difference = RangeSet()
            count = 0
            # Loop variable is ``run``: naming it ``range`` hid the builtin.
            for run in occurrence_difference.contiguous():
                if run[0] in start_pos:
                    resulting_difference.add_range(run[0], run[-1] + 1)
                    count += 1

            # Fewer than two kept occurrences: skip this interval.
            if count < 2:
                continue

            # Right partial overlap: cut all kept occurrences to the
            # shortest remaining run, capped at the interval's own length.
            # NOTE(review): adjacent kept runs may be reported as a single
            # contiguous run here -- confirm against RangeSet semantics.
            minimum_length = min(
                [item.length]
                + [len(run) for run in resulting_difference.contiguous()])

            result = RangeSet()
            for run in resulting_difference.contiguous():
                result.add_range(run[0], run[0] + minimum_length)

            occupied.union_update(result)
            real_blocks.append(Block(result))

        return real_blocks