def test_non_overlapping_blocks_Hermans(self):
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     algorithm = Scorer(TokenIndex.create_token_index(collation))
     blocks = algorithm._get_non_overlapping_repeating_blocks()
     self.assertIn(Block(RangeSet("0-8, 16-24")),
                   blocks)  # a b c d F g h i !
     self.assertIn(Block(RangeSet("11-14, 25-28")), blocks)  # q r s t
 def test_non_overlapping_blocks_Hermans(self):
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     algorithm = Scorer(collation)
     blocks = algorithm._get_non_overlapping_repeating_blocks()
     self.assertIn(Block(RangeSet("0-8, 17-25")),
                   blocks)  # a b c d F g h i !
     self.assertIn(Block(RangeSet("11-14, 26-29")), blocks)  # q r s t
Esempio n. 3
0
 def test_blocks_failing_transposition_use_case_old_algorithm(self):
     collation = Collation()
     collation.add_plain_witness("W1", "the cat and the dog")
     collation.add_plain_witness("W2", "the dog and the cat")
     algorithm = Scorer(collation)
     blocks = algorithm._get_non_overlapping_repeating_blocks()
     block1 = Block(RangeSet("0-1, 9-10"))
     block2 = Block(RangeSet("3-4, 6-7"))
     block3 = Block(RangeSet("2, 8"))
     self.assertEqual([block1, block2, block3], blocks)
 def test_2(self):
     collation = Collation()
     collation.add_plain_witness("W1", "in the in the bleach")
     collation.add_plain_witness("W2", "in the in the bleach in the")
     collation.add_plain_witness("W3", "in the in the bleach in the")
     algorithm = Scorer(TokenIndex.create_token_index(collation))
     blocks = algorithm._get_non_overlapping_repeating_blocks()
     self.assertIn(Block(RangeSet("0-4, 6-10, 14-18")),
                   blocks)  # in the in the bleach
     self.assertIn(Block(RangeSet("11-12, 19-20")), blocks)  # in the
 def test_blocks_Hermans_case_three_witnesses(self):
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     collation.add_plain_witness("W3", "a b c d E g h i ! q r s t")
     algorithm = Scorer(TokenIndex.create_token_index(collation))
     blocks = algorithm._get_non_overlapping_repeating_blocks()
     self.assertIn(Block(RangeSet("0-3, 16-19, 30-33")), blocks)  # a b c d
     self.assertIn(Block(RangeSet("5-7, 21-23, 35-37")), blocks)  # g h i
     self.assertIn(Block(RangeSet("10-14, 24-28, 38-42")),
                   blocks)  # ! q r s t
     self.assertIn(Block(RangeSet("4, 20")), blocks)  # F
 def test_blocks_Hermans_case_three_witnesses(self):
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     collation.add_plain_witness("W3", "a b c d E g h i ! q r s t")
     algorithm = Scorer(collation)
     blocks = algorithm._get_non_overlapping_repeating_blocks()
     self.assertIn(Block(RangeSet("0-3, 17-20, 32-35")), blocks)  # a b c d
     self.assertIn(Block(RangeSet("5-7, 22-24, 37-39")), blocks)  # g h i
     self.assertIn(Block(RangeSet("10-14, 25-29, 40-44")),
                   blocks)  # ! q r s t
     self.assertIn(Block(RangeSet("4, 21")), blocks)  # F
Esempio n. 7
0
 def test_blocks_splitting_token_case(self):
     collation = Collation()
     collation.add_plain_witness("W1", "a c b c")
     collation.add_plain_witness("W2", "a c b")
     algorithm = Scorer(collation)
     blocks = algorithm._get_non_overlapping_repeating_blocks()
     block1 = Block(RangeSet("0-2, 5-7")) # a c b
     self.assertIn(block1, blocks)
Esempio n. 8
0
 def test_non_overlapping_blocks_black_cat(self):
     collation = Collation()
     collation.add_plain_witness("W1", "the black cat")
     collation.add_plain_witness("W2", "the black cat")
     algorithm = Scorer(collation)
     blocks = algorithm._get_non_overlapping_repeating_blocks()
     block1 = Block(RangeSet("0-2, 4-6"))
     self.assertEqual([block1], blocks)
    def _get_non_overlapping_repeating_blocks(self):
        # The LCP intervals that are calculated from the extend suffix array are all potential blocks.
        # However some potential blocks overlap. To decide the definitive blocks we sort the potential blocks on the
        # amount of witnesses they occur in.
        potential_blocks = self.token_index.split_lcp_array_into_intervals()
        # we add all the intervals to a priority queue based on 1) number of witnesses 2) block length
        queue = PriorityQueue()
        for interval in potential_blocks:
            queue.put(interval)

        occupied = RangeSet()
        real_blocks = []

        while not queue.empty():
            item = queue.get()
            # print(item)
            # test intersection with occupied
            potential_block_range = item._as_range()
            # check the intersection with the already occupied ranges
            block_intersection = potential_block_range.intersection(occupied)
            if not block_intersection:
                # print("Selected!")
                occupied.union_update(potential_block_range)
                real_blocks.append(Block(potential_block_range))
                continue

            # check complete overlap or partial
            if block_intersection == potential_block_range:
                # print("complete overlap; skip")
                continue

            # print("partial overlap!")
            occurrence_difference = potential_block_range.difference(
                block_intersection)
            # print(occurrence_difference)

            # check on left partial overlap
            # filter it

            # determine start positions
            start_pos = item.block_occurrences()

            # print(start_pos)
            resulting_difference = RangeSet()
            count = 0
            for range in occurrence_difference.contiguous():
                if range[0] in start_pos:
                    resulting_difference.add_range(range[0], range[-1] + 1)
                    count += 1
            # print(resulting_difference)

            if count < 2:
                continue

            # in case of right partial overlap
            # calculate the minimum allowed range

            minimum_length = item.length
            for range in resulting_difference.contiguous():
                if len(range) < minimum_length:
                    minimum_length = len(range)

            # print(minimum_length)

            result = RangeSet()
            for range in resulting_difference.contiguous():
                result.add_range(range[0], range[0] + minimum_length)
            # print("Selecting partial result: "+str(result))

            occupied.union_update(result)
            real_blocks.append(Block(result))

        return real_blocks