def test_non_overlapping_blocks_Hermans(self):
    # Two witnesses that are identical except for the extra "K !" in W1.
    # The scorer is built from an explicitly created token index, and the
    # two shared runs of tokens must both be reported as blocks.
    collation = Collation()
    for sigil, text in (
        ("W1", "a b c d F g h i ! K ! q r s t"),
        ("W2", "a b c d F g h i ! q r s t"),
    ):
        collation.add_plain_witness(sigil, text)

    scorer = Scorer(TokenIndex.create_token_index(collation))
    found = scorer._get_non_overlapping_repeating_blocks()

    # Each spec lists the token positions of one block, one range per witness.
    expected_specs = (
        "0-8, 16-24",    # a b c d F g h i !
        "11-14, 25-28",  # q r s t
    )
    for spec in expected_specs:
        self.assertIn(Block(RangeSet(spec)), found)
def test_non_overlapping_blocks_Hermans(self):
    # Same witnesses as the classic Hermans case, but the scorer is built
    # directly from the collation. Both shared token runs must be reported
    # as blocks.
    witnesses = (
        ("W1", "a b c d F g h i ! K ! q r s t"),
        ("W2", "a b c d F g h i ! q r s t"),
    )
    collation = Collation()
    for sigil, text in witnesses:
        collation.add_plain_witness(sigil, text)

    found = Scorer(collation)._get_non_overlapping_repeating_blocks()

    # Each spec lists the token positions of one block, one range per witness.
    expected_specs = (
        "0-8, 17-25",    # a b c d F g h i !
        "11-14, 26-29",  # q r s t
    )
    for spec in expected_specs:
        self.assertIn(Block(RangeSet(spec)), found)
def test_blocks_failing_transposition_use_case_old_algorithm(self):
    # Transposition case: the two witnesses swap "cat" and "dog" around a
    # shared "and". The result must be exactly these three blocks, in this
    # order.
    collation = Collation()
    for sigil, text in (
        ("W1", "the cat and the dog"),
        ("W2", "the dog and the cat"),
    ):
        collation.add_plain_witness(sigil, text)

    found = Scorer(collation)._get_non_overlapping_repeating_blocks()

    expected = [
        Block(RangeSet(spec))
        for spec in ("0-1, 9-10", "3-4, 6-7", "2, 8")
    ]
    self.assertEqual(expected, found)
def test_2(self):
    # Three witnesses containing the repeated phrase "in the"; W2 and W3
    # additionally end with a trailing "in the" that W1 lacks.
    collation = Collation()
    for sigil, text in (
        ("W1", "in the in the bleach"),
        ("W2", "in the in the bleach in the"),
        ("W3", "in the in the bleach in the"),
    ):
        collation.add_plain_witness(sigil, text)

    scorer = Scorer(TokenIndex.create_token_index(collation))
    found = scorer._get_non_overlapping_repeating_blocks()

    # Each spec lists the token positions of one block, one range per
    # witness the block occurs in.
    expected_specs = (
        "0-4, 6-10, 14-18",  # in the in the bleach
        "11-12, 19-20",      # in the
    )
    for spec in expected_specs:
        self.assertIn(Block(RangeSet(spec)), found)
def test_blocks_Hermans_case_three_witnesses(self):
    # Hermans case extended with a third witness that reads "E" where the
    # other two read "F". The scorer is built from an explicitly created
    # token index.
    witnesses = (
        ("W1", "a b c d F g h i ! K ! q r s t"),
        ("W2", "a b c d F g h i ! q r s t"),
        ("W3", "a b c d E g h i ! q r s t"),
    )
    collation = Collation()
    for sigil, text in witnesses:
        collation.add_plain_witness(sigil, text)

    scorer = Scorer(TokenIndex.create_token_index(collation))
    found = scorer._get_non_overlapping_repeating_blocks()

    # Each spec lists the token positions of one block, one range per
    # witness the block occurs in.
    expected_specs = (
        "0-3, 16-19, 30-33",    # a b c d
        "5-7, 21-23, 35-37",    # g h i
        "10-14, 24-28, 38-42",  # ! q r s t
        "4, 20",                # F
    )
    for spec in expected_specs:
        self.assertIn(Block(RangeSet(spec)), found)
def test_blocks_Hermans_case_three_witnesses(self):
    # Hermans case extended with a third witness that reads "E" where the
    # other two read "F". The scorer is built directly from the collation.
    collation = Collation()
    for sigil, text in (
        ("W1", "a b c d F g h i ! K ! q r s t"),
        ("W2", "a b c d F g h i ! q r s t"),
        ("W3", "a b c d E g h i ! q r s t"),
    ):
        collation.add_plain_witness(sigil, text)

    found = Scorer(collation)._get_non_overlapping_repeating_blocks()

    # Each spec lists the token positions of one block, one range per
    # witness the block occurs in.
    expected_specs = (
        "0-3, 17-20, 32-35",    # a b c d
        "5-7, 22-24, 37-39",    # g h i
        "10-14, 25-29, 40-44",  # ! q r s t
        "4, 21",                # F
    )
    for spec in expected_specs:
        self.assertIn(Block(RangeSet(spec)), found)
def test_blocks_splitting_token_case(self):
    # W1 repeats the token "c" after the shared run "a c b"; the shared run
    # must still be reported as one block.
    collation = Collation()
    for sigil, text in (("W1", "a c b c"), ("W2", "a c b")):
        collation.add_plain_witness(sigil, text)

    found = Scorer(collation)._get_non_overlapping_repeating_blocks()

    # a c b
    self.assertIn(Block(RangeSet("0-2, 5-7")), found)
def test_non_overlapping_blocks_black_cat(self):
    # Two identical witnesses: the only block is the whole text, once per
    # witness.
    collation = Collation()
    for sigil in ("W1", "W2"):
        collation.add_plain_witness(sigil, "the black cat")

    found = Scorer(collation)._get_non_overlapping_repeating_blocks()

    self.assertEqual([Block(RangeSet("0-2, 4-6"))], found)
def _get_non_overlapping_repeating_blocks(self):
    """Pick a set of mutually non-overlapping blocks from the LCP intervals.

    Every interval returned by
    ``self.token_index.split_lcp_array_into_intervals()`` is a candidate
    block, but candidates may overlap one another. They are pushed onto a
    priority queue and handled in the order defined by the intervals' own
    comparison (intended, per the original notes, to rank by number of
    witnesses first and block length second -- the ordering itself lives on
    the interval class, not here).

    For each candidate, compared against the positions already claimed:

    * no overlap: the whole candidate becomes a block;
    * complete overlap: the candidate is dropped;
    * partial overlap: only the unclaimed contiguous segments that begin at
      one of the candidate's occurrence start positions are kept. At least
      two such segments are required; they are all trimmed to the length of
      the shortest one (never longer than the candidate's own length) and
      the trimmed result becomes a block.

    Returns:
        A list of ``Block`` objects in the order they were selected.
    """
    candidates = PriorityQueue()
    for lcp_interval in self.token_index.split_lcp_array_into_intervals():
        candidates.put(lcp_interval)

    claimed = RangeSet()
    selected = []
    while not candidates.empty():
        candidate = candidates.get()
        candidate_range = candidate._as_range()
        overlap = candidate_range.intersection(claimed)

        if not overlap:
            # Nothing claimed yet in this range: take the candidate as is.
            claimed.union_update(candidate_range)
            selected.append(Block(candidate_range))
            continue

        if overlap == candidate_range:
            # Every position is already claimed: nothing left to select.
            continue

        # Partial overlap: work only with the still-unclaimed positions.
        unclaimed = candidate_range.difference(overlap)
        occurrence_starts = candidate.block_occurrences()

        # Left-side overlap filter: a segment is usable only when it still
        # begins where an occurrence of the candidate begins.
        usable = RangeSet()
        usable_count = 0
        for segment in unclaimed.contiguous():
            if segment[0] in occurrence_starts:
                usable.add_range(segment[0], segment[-1] + 1)
                usable_count += 1

        # A block needs at least two occurrences.
        if usable_count < 2:
            continue

        # Right-side overlap: occurrences may have been cut short by
        # different amounts, so trim all of them to the shortest survivor.
        segments = list(usable.contiguous())
        common_length = min(
            [candidate.length] + [len(segment) for segment in segments]
        )

        trimmed = RangeSet()
        for segment in segments:
            trimmed.add_range(segment[0], segment[0] + common_length)

        claimed.union_update(trimmed)
        selected.append(Block(trimmed))

    return selected