def get_compute_rangeset(self):
    """Collect the ids of all compute children into a single RangeSet.

    Each child returned by ``self.get_children()`` is matched against
    ``self._comp_regex``; children that do not match are ignored.

    Returns:
        RangeSet: union of the ``id`` group of every matching child.
    """
    compute_ids = RangeSet()
    for name in self.get_children():
        found = re.match(self._comp_regex, name)
        if not found:
            # not a compute child
            continue
        groups = found.groupdict()
        compute_ids.union_update(RangeSet(str(groups['id'])))
    return compute_ids
def get_compute_rangeset(self):
    """Return a RangeSet holding the id of every compute child.

    A child counts as a compute child when ``self._comp_regex`` matches it
    (``re.match``, i.e. anchored at the start); the ``id`` named group of
    the match is what gets added to the result.
    """
    result = RangeSet()
    # Lazy generator: each child is matched right before it is consumed.
    hits = (re.match(self._comp_regex, child) for child in self.get_children())
    for hit in hits:
        if hit:
            captured = hit.groupdict()
            result.union_update(RangeSet(str(captured["id"])))
    return result
def testUpdate(self):
    """Test RangeSet.update() and RangeSet.union_update()."""
    base_spec = "1-100,102,105-242,800"
    merged_spec = "1-100,102,105-800,1924-1984"
    base_len = 240
    extra_len = 618

    r1 = RangeSet(base_spec)
    self.assertEqual(len(r1), base_len)
    r2 = RangeSet("243-799,1924-1984")
    self.assertEqual(len(r2), extra_len)

    # merge through update()
    r1.update(r2)
    self.assertEqual(type(r1), RangeSet)
    self.assertEqual(r1.padding, None)
    self.assertEqual(len(r1), base_len + extra_len)
    self.assertEqual(str(r1), merged_spec)

    # the same merge through union_update(), starting from a fresh set
    r1 = RangeSet(base_spec)
    r1.union_update(r2)
    self.assertEqual(len(r1), base_len + extra_len)
    self.assertEqual(str(r1), merged_spec)
def testUpdate(self):
    """Test RangeSet.update() and RangeSet.union_update()."""

    def check_merged(rset):
        # Expected size and folded string form once r2 has been merged in.
        self.assertEqual(len(rset), 240+618)
        self.assertEqual(str(rset), "1-100,102,105-800,1924-1984")

    def fresh_r1():
        return RangeSet("1-100,102,105-242,800")

    r1 = fresh_r1()
    self.assertEqual(len(r1), 240)
    r2 = RangeSet("243-799,1924-1984")
    self.assertEqual(len(r2), 618)

    r1.update(r2)
    self.assertEqual(type(r1), RangeSet)
    self.assertEqual(r1.padding, None)
    check_merged(r1)

    r1 = fresh_r1()
    r1.union_update(r2)
    check_merged(r1)
def _get_non_overlapping_repeating_blocks(self):
    """Greedily pick repeating blocks whose ranges do not overlap.

    Candidate blocks are the LCP intervals of the collation's extended
    suffix array, after ``self.filter_potential_blocks`` has been applied
    (its return value is ignored, so it presumably prunes the list in
    place -- TODO confirm).

    Candidates are visited from highest to lowest priority.  A candidate
    that raises ``PartialOverlapException`` is shortened by one and retried
    for as long as its ``minimum_block_length`` stays above 1.

    Returns:
        list: ``Block`` objects, in the order in which they were selected.
    """
    extended_suffix_array = self.collation.to_extended_suffix_array()
    potential_blocks = extended_suffix_array.split_lcp_array_into_intervals()
    self.filter_potential_blocks(potential_blocks)
    # step 3: sort the blocks based on depth (number of repetitions) first,
    # second length of LCP interval,
    # third sort on parent LCP interval occurrences.
    # (descending on all three keys because of reverse=True)
    sorted_blocks_on_priority = sorted(
        potential_blocks,
        key=attrgetter("number_of_occurrences", "minimum_block_length",
                       "number_of_siblings"),
        reverse=True)
    # step 4: select the definitive blocks
    # `occupied` accumulates every range that has already been claimed.
    occupied = RangeSet()
    real_blocks = []
    for potential_block in sorted_blocks_on_priority:
        # print(potential_block.info())
        try:
            non_overlapping_range = potential_block.calculate_non_overlapping_range_with(
                occupied)
            # A falsy range means there is nothing left to claim for this block.
            if non_overlapping_range:
                # print("Selecting: "+str(potential_block))
                occupied.union_update(non_overlapping_range)
                real_blocks.append(Block(non_overlapping_range))
        except PartialOverlapException:
            # print("Skip due to conflict: "+str(potential_block))
            while potential_block.minimum_block_length > 1:
                # retry with a different length: one less
                # NOTE: this mutates the block in place -- every LCP entry of
                # the interval (start+1 .. end) and the block length drop by 1.
                for idx in range(potential_block.start + 1,
                                 potential_block.end + 1):
                    potential_block.LCP[idx] -= 1
                potential_block.length -= 1
                try:
                    non_overlapping_range = potential_block.calculate_non_overlapping_range_with(
                        occupied)
                    if non_overlapping_range:
                        # print("Retried and selecting: "+str(potential_block))
                        occupied.union_update(non_overlapping_range)
                        real_blocks.append(Block(non_overlapping_range))
                        # the shortened block has been placed; stop retrying
                        break
                except PartialOverlapException:
                    # print("Retried and failed again")
                    # still conflicting; the while condition decides whether
                    # the block can be shortened once more
                    pass
    return real_blocks
def get_non_overlapping_repeating_blocks(self):
    """Greedily pick repeating blocks whose ranges do not overlap.

    Candidate blocks are the LCP intervals of the collation's extended
    suffix array, after ``self.filter_potential_blocks`` has been applied
    (its return value is ignored, so it presumably prunes the list in
    place -- TODO confirm).

    Candidates are visited from highest to lowest priority.  A candidate
    that raises ``PartialOverlapException`` is shortened by one and retried
    for as long as its ``minimum_block_length`` stays above 1.

    Returns:
        list: ``Block`` objects, in the order in which they were selected.
    """
    extended_suffix_array = self.collation.to_extended_suffix_array()
    potential_blocks = extended_suffix_array.split_lcp_array_into_intervals()
    self.filter_potential_blocks(potential_blocks)
    # step 3: sort the blocks based on depth (number of repetitions) first,
    # second length of LCP interval,
    # third sort on parent LCP interval occurrences.
    # (descending on all three keys because of reverse=True)
    sorted_blocks_on_priority = sorted(potential_blocks, key=attrgetter("number_of_occurrences", "minimum_block_length", "number_of_siblings"), reverse=True)
    # step 4: select the definitive blocks
    # `occupied` accumulates every range that has already been claimed.
    occupied = RangeSet()
    real_blocks = []
    for potential_block in sorted_blocks_on_priority:
        # print(potential_block.info())
        try:
            non_overlapping_range = potential_block.calculate_non_overlapping_range_with(occupied)
            # A falsy range means there is nothing left to claim for this block.
            if non_overlapping_range:
                # print("Selecting: "+str(potential_block))
                occupied.union_update(non_overlapping_range)
                real_blocks.append(Block(non_overlapping_range))
        except PartialOverlapException:
            # print("Skip due to conflict: "+str(potential_block))
            while potential_block.minimum_block_length > 1:
                # retry with a different length: one less
                # NOTE: this mutates the block in place -- every LCP entry of
                # the interval (start+1 .. end) and the block length drop by 1.
                for idx in range(potential_block.start+1, potential_block.end+1):
                    potential_block.LCP[idx] -= 1
                potential_block.length -= 1
                try:
                    non_overlapping_range = potential_block.calculate_non_overlapping_range_with(occupied)
                    if non_overlapping_range:
                        # print("Retried and selecting: "+str(potential_block))
                        occupied.union_update(non_overlapping_range)
                        real_blocks.append(Block(non_overlapping_range))
                        # the shortened block has been placed; stop retrying
                        break
                except PartialOverlapException:
                    # print("Retried and failed again")
                    # still conflicting; the while condition decides whether
                    # the block can be shortened once more
                    pass
    return real_blocks
def _get_non_overlapping_repeating_blocks(self):
    """Select the definitive, non-overlapping blocks from the LCP intervals.

    Every LCP interval of ``self.token_index`` is a potential block, but
    potential blocks may overlap.  The intervals are drained from a
    priority queue and each one is handled in one of three ways:

    * no overlap with what is already occupied: accepted as is;
    * completely covered by what is already occupied: skipped;
    * partially covered: trimmed to the free stretches that still start at
      one of the block's occurrences, all cut to the same length, and
      accepted only when at least two such stretches remain.

    Returns:
        list: the selected ``Block`` objects, in selection order.
    """
    # The LCP intervals calculated from the extended suffix array are all
    # potential blocks.
    potential_blocks = self.token_index.split_lcp_array_into_intervals()

    # The ordering comes from the interval objects' own comparison; per the
    # original author's note it is 1) number of witnesses 2) block length.
    pending = PriorityQueue()
    for interval in potential_blocks:
        pending.put(interval)

    occupied = RangeSet()
    real_blocks = []

    while not pending.empty():
        item = pending.get()
        potential_block_range = item._as_range()
        block_intersection = potential_block_range.intersection(occupied)

        if not block_intersection:
            # nothing claimed yet in this range: take the whole block
            occupied.union_update(potential_block_range)
            real_blocks.append(Block(potential_block_range))
            continue

        if block_intersection == potential_block_range:
            # complete overlap: nothing left to claim
            continue

        # Partial overlap: look at what is still free.
        occurrence_difference = potential_block_range.difference(
            block_intersection)

        # Left partial overlap: keep only the free stretches that still
        # begin at a start position of the block.
        start_pos = item.block_occurrences()
        resulting_difference = RangeSet()
        count = 0
        for span in occurrence_difference.contiguous():
            first = span[0]
            if first in start_pos:
                resulting_difference.add_range(first, span[-1] + 1)
                count += 1
        if count < 2:
            # fewer than two occurrences left, so it no longer repeats
            continue

        # Right partial overlap: every occurrence is cut down to the
        # shortest stretch that is left (never longer than the block).
        spans = list(resulting_difference.contiguous())
        minimum_length = min([item.length] + [len(span) for span in spans])

        result = RangeSet()
        for span in spans:
            result.add_range(span[0], span[0] + minimum_length)
        occupied.union_update(result)
        real_blocks.append(Block(result))

    return real_blocks
def _get_non_overlapping_repeating_blocks(self):
    """Return the potential blocks that can coexist without overlapping.

    The LCP intervals of ``self.token_index`` are all potential blocks, and
    some of them overlap.  They are taken from a priority queue one at a
    time and checked against the ranges claimed so far:

    * disjoint from the claimed ranges: the block is selected unchanged;
    * fully inside the claimed ranges: the block is dropped;
    * partly inside: the block is reduced to its free stretches that still
      begin at one of its occurrences, trimmed to a common length, and
      selected only if at least two stretches are left.

    Returns:
        list: the selected ``Block`` objects, in selection order.
    """
    potential_blocks = self.token_index.split_lcp_array_into_intervals()

    # Queue order is whatever the interval objects' comparison defines;
    # the original note says 1) number of witnesses 2) block length.
    queue = PriorityQueue()
    for interval in potential_blocks:
        queue.put(interval)

    occupied = RangeSet()
    real_blocks = []

    while not queue.empty():
        item = queue.get()
        potential_block_range = item._as_range()
        # intersection with the already occupied ranges
        block_intersection = potential_block_range.intersection(occupied)

        if not block_intersection:
            # free: select the block as a whole
            occupied.union_update(potential_block_range)
            real_blocks.append(Block(potential_block_range))
            continue

        if block_intersection == potential_block_range:
            # complete overlap; skip
            continue

        # partial overlap
        occurrence_difference = potential_block_range.difference(block_intersection)

        # Left partial overlap: a free stretch only counts when it starts
        # at one of the block's start positions.
        start_pos = item.block_occurrences()
        resulting_difference = RangeSet()
        count = 0
        for stretch in occurrence_difference.contiguous():
            if stretch[0] in start_pos:
                resulting_difference.add_range(stretch[0], stretch[-1] + 1)
                count += 1
        if count < 2:
            # a block needs at least two occurrences
            continue

        # Right partial overlap: the minimum allowed length is the shortest
        # remaining stretch, capped by the block's own length.
        stretches = list(resulting_difference.contiguous())
        minimum_length = min(
            [item.length] + [len(stretch) for stretch in stretches])

        result = RangeSet()
        for stretch in stretches:
            result.add_range(stretch[0], stretch[0] + minimum_length)
        occupied.union_update(result)
        real_blocks.append(Block(result))

    return real_blocks