def test_hermans_witness_order_independence_case_two_witnesses(self):
     """Collate the two-witness Hermans case and check both alignment table rows."""
     collation = Collation()
     collation.add_plain_witness("A", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("B", "a b c d F g h i ! q r s t")
     alignment_table = collate(collation)
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(["a b c d F g h i!", "K!", "q r s t"], alignment_table.rows[0].to_list())
     self.assertEqual(["a b c d F g h i!", "-", "q r s t"], alignment_table.rows[1].to_list())
 def testDoubleTransposition1(self):
     """Collate two witnesses whose segments appear in opposite order and check both rows."""
     collation = Collation()
     collation.add_plain_witness("A", "the cat is black")
     collation.add_plain_witness("B", "black is the cat")
     alignment_table = collate(collation)
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(["the cat", "is", "black"], alignment_table.rows[0].to_list())
     self.assertEqual(["black", "is", "the cat"], alignment_table.rows[1].to_list())
 def test_non_overlapping_blocks_overlap_case(self):
     """The block spanning token ranges 0-4 and 7-11 must be among the scorer's blocks."""
     witnesses = Collation()
     witnesses.add_plain_witness("W1", "in the in the bleach")
     witnesses.add_plain_witness("W2", "in the in the bleach in the")
     found_blocks = Scorer(witnesses)._get_non_overlapping_repeating_blocks()
     # Expected block text: "in the in the bleach"
     expected_block = Block(RangeSet("0-4, 7-11"))
     self.assertIn(expected_block, found_blocks)
 def testBeckett(self):
     """Collate two sentences that differ by one word and check both alignment table rows."""
     collation = Collation()
     collation.add_plain_witness("1", "The same clock as when for example Magee once died.")
     collation.add_plain_witness("2", "The same as when for example Magee once died.")
     table = collate(collation)
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(["The same", "clock", "as when for example Magee once died."], table.rows[0].to_list())
     self.assertEqual(["The same", None, "as when for example Magee once died."], table.rows[1].to_list())
 def test_superbase_generation_multiple_short_witnesses(self):
     """Run the edit graph aligner over three one-token witnesses; passes if nothing raises."""
     witnesses = Collation()
     for siglum, text in (("A", "a"), ("B", "b"), ("C", "c")):
         witnesses.add_plain_witness(siglum, text)
     EditGraphAligner(witnesses).collate(VariantGraph())
 def test_non_overlapping_blocks_Hermans(self):
     """Both expected blocks must appear among the scorer's non-overlapping repeating blocks."""
     witnesses = Collation()
     witnesses.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     witnesses.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     found_blocks = Scorer(witnesses)._get_non_overlapping_repeating_blocks()
     expected_ranges = (
         "0-8, 17-25",  # a b c d F g h i !
         "11-14, 26-29",  # q r s t
     )
     for ranges in expected_ranges:
         self.assertIn(Block(RangeSet(ranges)), found_blocks)
Esempio n. 7
0
 def test_witness_ranges_hermans_case(self):
     """Check the token range the prepared token index reports for each witness."""
     witnesses = Collation()
     witnesses.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     witnesses.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     index = TokenIndex(witnesses.witnesses)
     index.prepare()
     expected_ranges = (("W1", "0-14"), ("W2", "16-28"))
     for siglum, token_range in expected_ranges:
         self.assertEqual(RangeSet(token_range), index.get_range_for_witness(siglum))
 def test_exact_matching(self):
     """Collate with default settings and compare each row's list of strings."""
     witnesses = Collation()
     witnesses.add_plain_witness("A", "I bought this glass , because it matches those dinner plates")
     witnesses.add_plain_witness("B", "I bought those glasses")
     table = collate(witnesses)
     expected_rows = (
         ["I bought ", "this glass , because it matches ", "those ", "dinner plates"],
         ["I bought ", None, "those ", "glasses"],
     )
     for index, expected in enumerate(expected_rows):
         self.assertEqual(expected, table.rows[index].to_list_of_strings())
Esempio n. 9
0
 def test_token_array_hermans_case(self):
     """Check the token array held by the prepared token index for two witnesses."""
     witnesses = Collation()
     witnesses.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     witnesses.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     index = TokenIndex(witnesses.witnesses)
     index.prepare()
     # The "$" entry is meant to separate the witnesses.
     expected_tokens = "a b c d F g h i ! K ! q r s t $0 a b c d F g h i ! q r s t"
     self.assertTokenArray(expected_tokens, index)
Esempio n. 10
0
 def test_non_overlapping_blocks_Hermans(self):
     """Both expected blocks must be found by a scorer built from a token index."""
     witnesses = Collation()
     witnesses.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     witnesses.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     scorer = Scorer(TokenIndex.create_token_index(witnesses))
     found_blocks = scorer._get_non_overlapping_repeating_blocks()
     expected_ranges = (
         "0-8, 16-24",  # a b c d F g h i !
         "11-14, 25-28",  # q r s t
     )
     for ranges in expected_ranges:
         self.assertIn(Block(RangeSet(ranges)), found_blocks)
Esempio n. 11
0
 def test_non_overlapping_blocks_Hermans(self):
     """Both expected blocks must be returned by DekkerSuffixAlgorithm for the two witnesses."""
     witnesses = Collation()
     witnesses.add_witness("W1", "a b c d F g h i ! K ! q r s t")
     witnesses.add_witness("W2", "a b c d F g h i ! q r s t")
     found_blocks = DekkerSuffixAlgorithm(witnesses).get_non_overlapping_repeating_blocks()
     expected_ranges = (
         "0-8, 16-24",  # a b c d F g h i !
         "11-14, 25-28",  # q r s t
     )
     for ranges in expected_ranges:
         self.assertIn(Block(RangeSet(ranges)), found_blocks)
 def test_non_overlapping_blocks_black_cat(self):
     """Two identical witnesses must yield exactly one block covering ranges 0-2 and 4-6."""
     witnesses = Collation()
     for siglum in ("W1", "W2"):
         witnesses.add_plain_witness(siglum, "the black cat")
     found_blocks = Scorer(witnesses)._get_non_overlapping_repeating_blocks()
     expected_blocks = [Block(RangeSet("0-2, 4-6"))]
     self.assertEqual(expected_blocks, found_blocks)
 def test_blocks_splitting_token_case(self):
     """The block covering ranges 0-2 and 5-7 must be among the scorer's blocks."""
     witnesses = Collation()
     witnesses.add_plain_witness("W1", "a c b c")
     witnesses.add_plain_witness("W2", "a c b")
     found_blocks = Scorer(witnesses)._get_non_overlapping_repeating_blocks()
     # Expected block text: "a c b"
     expected_block = Block(RangeSet("0-2, 5-7"))
     self.assertIn(expected_block, found_blocks)
 def test_near_matching(self):
     """Collate with near_match=True and check both alignment table rows."""
     collation = Collation()
     collation.add_plain_witness("A", "I bought this glass, because it matches those dinner plates")
     collation.add_plain_witness("B", "I bought those glasses")
     alignment_table = collate(collation, near_match=True)
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(["I bought", "this glass, because it matches those dinner plates"],
                      alignment_table.rows[0].to_list())
     self.assertEqual(["I bought", "those glasses"], alignment_table.rows[1].to_list())
Esempio n. 15
0
 def test_2(self):
     """Both expected blocks must be found for three witnesses with a repeated phrase."""
     witnesses = Collation()
     witnesses.add_plain_witness("W1", "in the in the bleach")
     witnesses.add_plain_witness("W2", "in the in the bleach in the")
     witnesses.add_plain_witness("W3", "in the in the bleach in the")
     scorer = Scorer(TokenIndex.create_token_index(witnesses))
     found_blocks = scorer._get_non_overlapping_repeating_blocks()
     expected_ranges = (
         "0-4, 6-10, 14-18",  # in the in the bleach
         "11-12, 19-20",  # in the
     )
     for ranges in expected_ranges:
         self.assertIn(Block(RangeSet(ranges)), found_blocks)
 def test_hermans_witness_order_independence_case_three_witnesses(self):
     """Collate the three-witness Hermans case and check every row's list of strings."""
     collation = Collation()
     collation.add_plain_witness("A", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("B", "a b c d F g h i ! q r s t")
     collation.add_plain_witness("C", "a b c d E g h i ! q r s t")
     alignment_table = collate(collation)
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(["a b c d ", "F ", "g h i ", "! K ", "! q r s t"], alignment_table.rows[0].to_list_of_strings())
     self.assertEqual(["a b c d ", "F ", "g h i ", None, "! q r s t"], alignment_table.rows[1].to_list_of_strings())
     self.assertEqual(["a b c d ", "E ", "g h i ", None, "! q r s t"], alignment_table.rows[2].to_list_of_strings())
    def test_align_with_longest_match(self):
        """Collate a repetitive witness against a short one and check how each row is segmented."""
        witnesses = Collation()
        witnesses.add_plain_witness("A", "a g a g c t a g t")
        witnesses.add_plain_witness("B", "a g c t")
        table = collate(witnesses)
        print("alignment_table=\n", table)
        expected_rows = (
            ['a g ', 'a g c t ', 'a g t'],
            [None, 'a g c t', None],
        )
        for index, expected in enumerate(expected_rows):
            self.assertEqual(expected, table.rows[index].to_list_of_strings())
 def test_witness_order(self):
     """Collate three short witnesses and check all three alignment table rows."""
     collation = Collation()
     collation.add_plain_witness("A", "x a y")
     collation.add_plain_witness("B", "x b y")
     collation.add_plain_witness("C", "x a b y")
     alignment_table = collate(collation)
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     self.assertEqual(["x", "a", "-", "y"], alignment_table.rows[0].to_list())
     self.assertEqual(["x", "-", "b", "y"], alignment_table.rows[1].to_list())
     self.assertEqual(["x", "a", "b", "y"], alignment_table.rows[2].to_list())
 def test_block_witnesses_Hermans_case_two_witnesses(self):
     """Check the debug output of the block witness built for each of the two witnesses."""
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     algorithm = Scorer(collation)
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     block_witness = algorithm._get_block_witness(collation.witnesses[0])
     self.assertEqual(["a b c d F g h i !", "q r s t"], block_witness.debug())
     block_witness = algorithm._get_block_witness(collation.witnesses[1])
     self.assertEqual(["a b c d F g h i !", "q r s t"], block_witness.debug())
    def test_non_overlapping_blocks_Hermans(self):
        """Collate the two-witness Hermans case and check each row's list of strings."""
        witnesses = Collation()
        witnesses.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
        witnesses.add_plain_witness("W2", "a b c d F g h i ! q r s t")
        table = collate(witnesses)
        print("alignment_table=\n", table)
        expected_rows = (
            ['a b c d F g h i ', '! K ', '! q r s t'],
            ['a b c d F g h i ', None, '! q r s t'],
        )
        for index, expected in enumerate(expected_rows):
            self.assertEqual(expected, table.rows[index].to_list_of_strings())
Esempio n. 21
0
 def test_superbase(self):
     """Collate two witnesses into a variant graph and check the aligner's new superbase."""
     witnesses = Collation()
     witnesses.add_plain_witness("A", "X a b c d e f X g h i Y Z j k")
     witnesses.add_plain_witness("B", "a b c Y d e f Y Z g h i X j k")
     edit_graph_aligner = EditGraphAligner(witnesses)
     edit_graph_aligner.collate(VariantGraph(), witnesses)
     expected_superbase = "X a b c Y d e f X Y Z g h i Y Z X j k"
     self.assertSuperbaseEquals(expected_superbase, edit_graph_aligner.new_superbase)
 def test_heuristic_function_everything_equals(self):
     """Build the heuristic table for two identical witnesses and check its first three rows."""
     witnesses = Collation()
     for siglum in ("A", "B"):
         witnesses.add_plain_witness(siglum, "everything equal")
     astar_aligner = ExperimentalAstarAligner(witnesses)
     astar_aligner._create_heuristic_table(witnesses.witnesses[0].tokens(), witnesses.witnesses[1])
     expected_rows = ([0, 1, 2], [1, 0, 1], [2, 1, 0])
     for index, expected in enumerate(expected_rows):
         self.assertEqual(expected, astar_aligner.heuristic_table[index])
Esempio n. 23
0
 def test_superbase_generation_multiple_short_witnesses(self):
     """Collate three one-token witnesses and check the aligner's new superbase."""
     witnesses = Collation()
     for siglum, text in (("A", "a"), ("B", "b"), ("C", "c")):
         witnesses.add_plain_witness(siglum, text)
     edit_graph_aligner = EditGraphAligner(witnesses)
     edit_graph_aligner.collate(VariantGraph(), witnesses)
     self.assertSuperbaseEquals("a b c", edit_graph_aligner.new_superbase)
 def test_blocks_failing_transposition_use_case_old_algorithm(self):
     """The scorer must return exactly the three expected blocks, in this order."""
     witnesses = Collation()
     witnesses.add_plain_witness("W1", "the cat and the dog")
     witnesses.add_plain_witness("W2", "the dog and the cat")
     found_blocks = Scorer(witnesses)._get_non_overlapping_repeating_blocks()
     expected_blocks = [
         Block(RangeSet(ranges))
         for ranges in ("0-1, 9-10", "3-4, 6-7", "2, 8")
     ]
     self.assertEqual(expected_blocks, found_blocks)
Esempio n. 25
0
 def test_lcp_intervals_number_of_witnesses_Hermans_case(self):
     """The second LCP interval of the three-witness token index must report depth 3."""
     witnesses = Collation()
     witnesses.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     witnesses.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     witnesses.add_plain_witness("W3", "a b c d E g h i ! q r s t")
     index = TokenIndex(witnesses.witnesses)
     index.prepare()
     lcp_intervals = index.split_lcp_array_into_intervals()
     # Interval 1 is the potential block "! q r s t".
     candidate = lcp_intervals[1]
     self.assertEqual(3, candidate.get_depth())
Esempio n. 26
0
    def test_near_matching_middle(self):
        # Three candidates, closest is middle, match rank 2 0 1 (0 is closest)
        # Should go to the middle; incorrectly goes right
        self.maxDiff = None
        scheduler = Scheduler()
        collation = Collation()
        collation.add_plain_witness("A", "abcd 0123 efgh")
        collation.add_plain_witness("B", "abcd 0xxx 012x 01xx efgh")
        alignment_table = str(collate(collation, near_match=True, segmentation=False, scheduler=scheduler))
        # Find the rightmost rank with a gap (rank 4); this is activeRank
        # Find the first witness with a gap at that rank (A)
        # Find first token to the left of the gap for the first gappy witness ("0123" in A at rank 2)
        #   and check whether to move it
        # Calculate strength of match for all columns from the token's current rank (2) through activeRank (4)
        #   parameters are token string and rank to check
        self.assertTask("build column for rank", ["0123", "2"], scheduler[0])
        self.assertTask("build column for rank", ["0123", "3"], scheduler[1])
        self.assertTask("build column for rank", ["0123", "4"], scheduler[2])
        # The best (max()) fit of "0123" among all ranks between current rank 2 and activeRank 4
        #   is at rank 3, so move "0123" from current rank 2 to rank 3
        self.assertTask("move node from prior rank to rank with best match", ["0123", "2", "3"], scheduler[3])
        # No more gaps at activeRank 4, no gaps at rank 3, so move to next rank with a gap
        #   (rank 2, gap in A), with "abcd" at rank 1
        self.assertTask("build column for rank", ["abcd", "1"], scheduler[4])
        self.assertTask("build column for rank", ["abcd", "2"], scheduler[5])
        # Don't move it because it's closer to current location
        # No more gaps at rank 2, non gaps at rank 1, no more ranks
        self.assertEquals(6, len(scheduler))
        expected = """\
+---+------+------+------+------+------+
| A | abcd | -    | 0123 | -    | efgh |
| B | abcd | 0xxx | 012x | 01xx | efgh |
+---+------+------+------+------+------+"""
        self.assertEqual(expected, alignment_table)

        def test_near_matching_clash(self):
            # If the previous rank has a vertex with more than one witness, where at least
            # one witness is a candidate for being moved, don't move it if any of the
            # witnesses has a node at the new rank.
            #
            # If there were only A and B, we'd move cce away from bbb to align with cce.
            # Witness C should prevent this.
            self.maxDiff = None
            collation = Collation()
            collation.add_plain_witness("A", "aaa bbb ccc ddd")
            collation.add_plain_witness("B", "aaa cce ddd")
            collation.add_plain_witness("C", "aaa cce ccc ddd")
            alignment_table = str(collate(collation, near_match=True, segmentation=False))
            expected = """\
    +---+-----+-----+-----+-----+
    | A | aaa | bbb | ccc | ddd |
    | B | aaa | cce | -   | ddd |
    | C | aaa | cce | ccc | ddd |
    +---+-----+-----+-----+-----+"""
            self.assertEqual(expected, alignment_table)
Esempio n. 27
0
    def test_near_matching_accidentally_incorrect_short(self):
        collation = Collation()
        collation.add_plain_witness("A", "over this dog")
        collation.add_plain_witness("B", "over there that dog")
        alignment_table = str(collate(collation, near_match=True, segmentation=False))
        expected = """\
+---+------+-------+------+-----+
| A | over | -     | this | dog |
| B | over | there | that | dog |
+---+------+-------+------+-----+"""
        self.assertEqual(expected, alignment_table)
 def test_lcp_intervals_failing_use_case_old_algorithm(self):
     """Check the parent LCP intervals and the child intervals recorded under key 7."""
     witnesses = Collation()
     witnesses.add_plain_witness("W1", "the cat and the dog")
     witnesses.add_plain_witness("W2", "the dog and the cat")
     parents, children = witnesses.get_lcp_intervals()
     for interval in ((1, 2), (3, 4), (5, 6), (7, 10)):
         self.assertIn(interval, parents)
     for interval in ((7, 8), (9, 10)):
         self.assertIn(interval, children[7])
 def test_blocks_Hermans_case_three_witnesses(self):
     """All four expected blocks must be among the scorer's blocks for three witnesses."""
     witnesses = Collation()
     witnesses.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     witnesses.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     witnesses.add_plain_witness("W3", "a b c d E g h i ! q r s t")
     found_blocks = Scorer(witnesses)._get_non_overlapping_repeating_blocks()
     expected_ranges = (
         "0-3, 17-20, 32-35",  # a b c d
         "5-7, 22-24, 37-39",  # g h i
         "10-14, 25-29, 40-44",  # ! q r s t
         "4, 21",  # F
     )
     for ranges in expected_ranges:
         self.assertIn(Block(RangeSet(ranges)), found_blocks)
Esempio n. 30
0
    def test_near_matching_accidentally_correct_long(self):
        collation = Collation()
        collation.add_plain_witness("A", "The brown fox jumps over this dog.")
        collation.add_plain_witness("B", "The brown fox jumps over that there dog.")
        alignment_table = str(collate(collation, near_match=True, segmentation=False))
        expected = """\
+---+-----+-------+-----+-------+------+------+-------+-----+---+
| A | The | brown | fox | jumps | over | this | -     | dog | . |
| B | The | brown | fox | jumps | over | that | there | dog | . |
+---+-----+-------+-----+-------+------+------+-------+-----+---+"""
        self.assertEqual(expected, alignment_table)
Esempio n. 31
0
     def test_near_matching_clash(self):
         # If the previous rank has a vertex with more than one witness, where at least
         # one witness is a candidate for being moved, don't move it if any of the
         # witnesses has a node at the new rank.
         #
         # If there were only A and B, we'd move cce away from bbb to align with cce.
         # Witness C should prevent this.
         self.maxDiff = None
         collation = Collation()
         collation.add_plain_witness("A", "aaa bbb ccc ddd")
         collation.add_plain_witness("B", "aaa cce ddd")
         collation.add_plain_witness("C", "aaa cce ccc ddd")
         alignment_table = str(
             collate(collation, near_match=True, segmentation=False))
         expected = """\
 +---+-----+-----+-----+-----+
 | A | aaa | bbb | ccc | ddd |
 | B | aaa | cce | -   | ddd |
 | C | aaa | cce | ccc | ddd |
 +---+-----+-----+-----+-----+"""
         self.assertEqual(expected, alignment_table)
Esempio n. 32
0
    def test_1(self):
        """Collate witnesses "a", "b" and "a b" and check each row's list of strings."""
        witnesses = Collation()
        for siglum, text in (("A", "a"), ("B", "b"), ("C", "a b")):
            witnesses.add_plain_witness(siglum, text)

        table = collate(witnesses)
        print("alignment_table=\n", table)
        expected_rows = (
            ['a', None],
            [None, 'b'],
            ['a ', 'b'],
        )
        for index, expected in enumerate(expected_rows):
            self.assertEqual(expected, table.rows[index].to_list_of_strings())
Esempio n. 33
0
    def test_duplicated_tokens_in_witness2(self):
        collation = Collation()
        collation.add_plain_witness("A", "a")
        collation.add_plain_witness("B", "b")
        collation.add_plain_witness("C", "c")
        collation.add_plain_witness("D", "a b c a b c")

        # alignment_table = collate(collation)
        # self.assertEqual(['a', None, None, None], alignment_table.rows[0].to_list_of_strings())
        # self.assertEqual([None, 'b', None, None], alignment_table.rows[1].to_list_of_strings())
        # self.assertEqual([None, None, 'c', None], alignment_table.rows[2].to_list_of_strings())
        # self.assertEqual(['a ', 'b ', 'c ', 'a b c'], alignment_table.rows[3].to_list_of_strings())

        expected_tei = """<?xml version="1.0" ?>
<cx:apparatus xmlns="http://www.tei-c.org/ns/1.0" xmlns:cx="http://interedition.eu/collatex/ns/1.0">
	<app>
		<rdg wit="#D">a b c</rdg>
	</app>
	 
	<app>
		<rdg wit="#A">a</rdg>
		<rdg wit="#D">a</rdg>
	</app>
	 
	<app>
		<rdg wit="#B">b</rdg>
		<rdg wit="#D">b</rdg>
	</app>
	 
	<app>
		<rdg wit="#C #D">c</rdg>
	</app>
</cx:apparatus>
"""

        # alignment_table = collate(collation)
        # print("alignment_table=\n",alignment_table)

        output_tei = collate(collation, output="tei", indent=True)
        self.assertEqual(expected_tei, output_tei)
Esempio n. 34
0
    def test_2(self):
        """Collate three witnesses with a repeated phrase and check each row's list of strings."""
        witnesses = Collation()
        witnesses.add_plain_witness("W1", "in the in the bleach")
        witnesses.add_plain_witness("W2", "in the in the bleach in the")
        witnesses.add_plain_witness("W3", "in the in the bleach in the")

        table = collate(witnesses)
        print("alignment_table=\n", table)
        expected_rows = (
            ['in the in the bleach', None],
            ['in the in the bleach ', 'in the'],
            ['in the in the bleach ', 'in the'],
        )
        for index, expected in enumerate(expected_rows):
            self.assertEqual(expected, table.rows[index].to_list_of_strings())
Esempio n. 35
0
    def test_blocks_Hermans_case_three_witnesses(self):
        """Collate the three-witness Hermans case and check each row's list of strings."""
        witnesses = Collation()
        witnesses.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
        witnesses.add_plain_witness("W2", "a b c d F g h i ! q r s t")
        witnesses.add_plain_witness("W3", "a b c d E g h i ! q r s t")

        table = collate(witnesses)
        print("alignment_table=\n", table)
        expected_rows = (
            ['a b c d ', 'F ', 'g h i ', '! K ', '! q r s t'],
            ['a b c d ', 'F ', 'g h i ', None, '! q r s t'],
            ['a b c d ', 'E ', 'g h i ', None, '! q r s t'],
        )
        for index, expected in enumerate(expected_rows):
            self.assertEqual(expected, table.rows[index].to_list_of_strings())
 def test_block_witnesses_Hermans_case(self):
     """Check the debug output of the block witness built for each of the three witnesses."""
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     collation.add_plain_witness("W3", "a b c d E g h i ! q r s t")
     algorithm = Scorer(collation)
     # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
     block_witness1 = algorithm._get_block_witness(collation.witnesses[0])
     self.assertEqual(["a b c d", "F", "g h i", "! q r s t"],
                      block_witness1.debug())
     block_witness2 = algorithm._get_block_witness(collation.witnesses[1])
     self.assertEqual(["a b c d", "F", "g h i", "! q r s t"],
                      block_witness2.debug())
     block_witness3 = algorithm._get_block_witness(collation.witnesses[2])
     self.assertEqual(["a b c d", "g h i", "! q r s t"],
                      block_witness3.debug())
 def testPretokenizedWitness(self):
     """Collate witnesses given as pre-tokenized dicts; check row lengths and one token's text."""
     witness_a = {
         "id": "A",
         "tokens": [
             {"t": "A", "ref": 123},
             {"t": "black", "adj": True},
             {"t": "cat", "id": "xyz"},
             {"t": "bird", "id": "abc"},
         ],
     }
     witness_b = {
         "id": "B",
         "tokens": [
             {"t": "A"},
             {"t": "white", "adj": True},
             {"t": "mousedog bird", "adj": False},
         ],
     }
     collation = Collation.create_from_dict({"witnesses": [witness_a, witness_b]})
     table = collate(collation, segmentation=False)
     for index in (0, 1):
         self.assertEqual(len(table.rows[index].to_list()), 4)
     # The second witness should have a token that reads 'mousedog bird'.
     self.assertIn("mousedog bird", str(table.rows[1].to_list()))
Esempio n. 38
0
    def test_near_matching_accidentally_incorrect_long(self):
        self.maxDiff = None
        scheduler = Scheduler()
        collation = Collation()
        collation.add_plain_witness("A", "The brown fox jumps over this dog.")
        collation.add_plain_witness("B", "The brown fox jumps over there that dog.")
        alignment_table = str(collate(collation, near_match=True, segmentation=False, scheduler=scheduler))
        self.assertTask("build column for rank", ["this", "6"], scheduler[0])
        self.assertTask("build column for rank", ["this", "7"], scheduler[1])
        self.assertTask("move node from prior rank to rank with best match", ["this", "6", "7"], scheduler[2])
        self.assertTask("build column for rank", ["over", "5"], scheduler[3])
        self.assertTask("build column for rank", ["over", "6"], scheduler[4])
        self.assertEqual(5, len(scheduler))
        expected = """\
+---+-----+-------+-----+-------+------+-------+------+-----+---+
| A | The | brown | fox | jumps | over | -     | this | dog | . |
| B | The | brown | fox | jumps | over | there | that | dog | . |
+---+-----+-------+-----+-------+------+-------+------+-----+---+"""
        self.assertEqual(expected, alignment_table)
Esempio n. 39
0
 def testOmission(self):
     """Check the g value of every cell in the aligner's 3x4 table after collating."""
     witnesses = Collation()
     witnesses.add_plain_witness("A", "a b c")
     witnesses.add_plain_witness("B", "b c")
     edit_graph_aligner = EditGraphAligner(witnesses)
     edit_graph_aligner.collate(VariantGraph())
     cells = edit_graph_aligner.table
     #         self.debug_table(aligner, table)
     expected_g = (
         (0, -1, -2, -3),
         (-1, -2, -1, -2),
         (-2, -3, -2, -1),
     )
     # Cells are checked row by row, left to right.
     for row_index, expected_row in enumerate(expected_g):
         for column_index, expected_value in enumerate(expected_row):
             self.assertEqual(expected_value, cells[row_index][column_index].g)
Esempio n. 40
0
    def test_near_matching_middle(self):
        # Three candidates, closest is middle, match rank 2 0 1 (0 is closest)
        # Should go to the middle; incorrectly goes right
        self.maxDiff = None
        scheduler = Scheduler()
        collation = Collation()
        collation.add_plain_witness("A", "abcd 0123 efgh")
        collation.add_plain_witness("B", "abcd 0xxx 012x 01xx efgh")
        alignment_table = str(
            collate(collation,
                    near_match=True,
                    segmentation=False,
                    scheduler=scheduler))
        # Find the rightmost rank with a gap (rank 4); this is activeRank
        # Find the first witness with a gap at that rank (A)
        # Find first token to the left of the gap for the first gappy witness ("0123" in A at rank 2)
        #   and check whether to move it
        # Calculate strength of match for all columns from the token's current rank (2) through activeRank (4)
        #   parameters are token string and rank to check
        self.assertTask("build column for rank", ["0123", "2"], scheduler[0])
        self.assertTask("build column for rank", ["0123", "3"], scheduler[1])
        self.assertTask("build column for rank", ["0123", "4"], scheduler[2])
        # The best (max()) fit of "0123" among all ranks between current rank 2 and activeRank 4
        #   is at rank 3, so move "0123" from current rank 2 to rank 3
        self.assertTask("move node from prior rank to rank with best match",
                        ["0123", "2", "3"], scheduler[3])
        # No more gaps at activeRank 4, no gaps at rank 3, so move to next rank with a gap
        #   (rank 2, gap in A), with "abcd" at rank 1
        self.assertTask("build column for rank", ["abcd", "1"], scheduler[4])
        self.assertTask("build column for rank", ["abcd", "2"], scheduler[5])
        # Don't move it because it's closer to current location
        # No more gaps at rank 2, non gaps at rank 1, no more ranks
        self.assertEquals(6, len(scheduler))
        expected = """\
+---+------+------+------+------+------+
| A | abcd | -    | 0123 | -    | efgh |
| B | abcd | 0xxx | 012x | 01xx | efgh |
+---+------+------+------+------+------+"""
        self.assertEqual(expected, alignment_table)
 def test_longer_example(self):
     """Collate two longer sentences; passes if collate raises nothing."""
     witnesses = Collation()
     sentences = (
         ("A", "The quick brown fox jumps over the dog."),
         ("B", "The brown fox jumps over the lazy dog."),
     )
     for siglum, text in sentences:
         witnesses.add_plain_witness(siglum, text)
     collate(witnesses)
    return chunk


# Fetch the text for xmlid from the print and manuscript files.
# NOTE(review): chunk() is only partly visible here — confirm what it returns
# when the paragraph is missing (the check below expects an empty string).
print_chunk = chunk(print_file, xmlid)
ms_chunk = chunk(ms_file, xmlid)

# Report when both files yielded an empty chunk; execution continues afterwards.
if print_chunk == '' and ms_chunk == '':
    print(('\n\t[renew_collation_on_paragraph.py] Paragraph {}'
           ' NOT FOUND in file {}\n\tIs {} the right file?\n\n').format(
               xmlid, juxta_file, juxta_file))

###########
# Collate #
###########

# Collate the two chunks as plain-text witnesses and request TEI output,
# unsegmented, with near matching and without indentation.
collation = Collation()
collation.add_plain_witness(print_siglum, print_chunk)
collation.add_plain_witness(ms_siglum, ms_chunk)
output_string = collate(collation,
                        output='tei',
                        segmentation=False,
                        near_match=True,
                        indent=False)

################################
# Add empty <rdg> when missing #
################################

element = etree.fromstring(output_string)
for app in element.findall('.//t:app', ns):
    if len(app) == 1:
Esempio n. 43
0
'''
Created on Apr 20, 2014

Darwin Integration test

@author: Ronald Haentjens Dekker
'''
import json
from collatex import Collation, collate

if __name__ == '__main__':
    # Read the source JSON data into a dictionary. The context manager
    # closes the file even when json.load() raises on malformed input,
    # which the previous manual open()/close() pair did not guarantee.
    with open('darwin_chapter1_para1.json') as json_data:
        data = json.load(json_data)

    # Generate the collation object from the dictionary.
    collation = Collation.create_from_dict(data)

    # Collate the witnesses and print the result to stdout.
    print(collate(collation))
    
#     write_dot(graph.graph, "rawoutput") 
Esempio n. 44
0
 def test_lcp_child_intervals_darwin(self):
     """Check child LCP intervals computed from a hard-coded LCP array.

     The array is passed straight to ``Collation.get_lcp_intervals`` via its
     ``lcp`` argument, so no witnesses are added to the collation. Judging by
     the test name, the fixture presumably comes from the Darwin sample text
     (not verifiable from this test alone).
     """
     # Precomputed LCP values, stored as a signed-int array ('i' typecode).
     lcp_array = array('i', [0, 0, 0, 96, 96, 0, 151, 9, 1, 105, 105, 0, 83, 83, 0, 95, 95, 0, 39, 39, 0, 24, 24, 0, 108, 1, 0, 232, 32, 0, 181, 39, 0, 185, 43, 1, 33, 33, 0, 159, 17, 0, 160, 18, 0, 106, 106, 0, 60, 60, 0, 171, 29, 1, 215, 15, 1, 122, 15, 1, 57, 57, 1, 153, 11, 1, 165, 23, 0, 9, 9, 1, 170, 28, 0, 214, 14, 0, 62, 62, 1, 191, 49, 0, 61, 61, 0, 148, 6, 1, 8, 8, 0, 19, 19, 0, 123, 16, 0, 75, 75, 1, 90, 90, 1, 28, 28, 0, 167, 25, 1, 112, 5, 1, 132, 25, 0, 50, 50, 0, 31, 31, 0, 77, 77, 0, 97, 97, 0, 6, 6, 0, 38, 38, 0, 63, 63, 0, 30, 30, 0, 80, 80, 0, 154, 12, 0, 145, 3, 0, 129, 22, 3, 67, 67, 0, 88, 88, 0, 45, 45, 0, 217, 17, 1, 22, 22, 0, 166, 24, 0, 25, 25, 0, 201, 1, 0, 155, 13, 1, 120, 13, 0, 175, 33, 0, 195, 53, 0, 135, 28, 0, 10, 10, 0, 147, 5, 0, 138, 31, 0, 161, 19, 1, 73, 73, 0, 198, 1, 0, 86, 86, 0, 74, 74, 1, 111, 4, 0, 210, 10, 0, 84, 84, 0, 42, 42, 0, 199, 2, 0, 119, 12, 0, 46, 46, 1, 202, 2, 0, 71, 71, 0, 40, 40, 0, 142, 0, 0, 52, 52, 0, 168, 26, 2, 113, 6, 1, 163, 21, 0, 133, 26, 0, 3, 3, 0, 186, 44, 1, 101, 101, 0, 193, 51, 2, 227, 27, 0, 107, 1, 37, 37, 1, 140, 33, 0, 1, 205, 5, 0, 47, 47, 0, 127, 20, 1, 65, 65, 0, 230, 30, 0, 41, 41, 0, 91, 91, 0, 1, 1, 0, 200, 3, 0, 156, 14, 0, 0, 76, 76, 0, 109, 2, 1, 182, 40, 0, 68, 68, 1, 14, 14, 0, 126, 19, 0, 34, 34, 1, 192, 50, 1, 85, 85, 1, 128, 21, 2, 66, 66, 1, 183, 41, 1, 220, 20, 1, 5, 5, 1, 212, 12, 2, 174, 32, 2, 226, 26, 1, 59, 59, 0, 16, 16, 0, 218, 18, 0, 23, 23, 1, 11, 11, 0, 36, 36, 1, 178, 36, 0, 51, 51, 0, 213, 13, 1, 190, 48, 0, 2, 2, 1, 222, 22, 1, 188, 46, 0, 78, 78, 0, 53, 53, 0, 197, 0, 0, 136, 29, 1, 219, 19, 1, 12, 12, 0, 114, 7, 0, 89, 89, 0, 172, 30, 2, 216, 16, 0, 21, 21, 0, 209, 9, 0, 81, 81, 0, 102, 102, 0, 134, 27, 0, 98, 98, 0, 131, 24, 0, 4, 4, 0, 35, 35, 0, 179, 37, 0, 224, 24, 0, 82, 82, 0, 72, 72, 0, 139, 32, 0, 125, 18, 0, 103, 103, 0, 121, 14, 0, 189, 47, 0, 184, 42, 0, 7, 7, 1, 17, 17, 0, 207, 7, 0, 221, 21, 0, 20, 20, 0, 196, 54, 0, 79, 79, 1, 204, 4, 1, 144, 
2, 2, 94, 94, 1, 56, 56, 0, 211, 11, 1, 194, 52, 3, 228, 28, 1, 157, 15, 1, 69, 69, 1, 54, 54, 1, 115, 8, 1, 173, 31, 1, 225, 25, 1, 177, 35, 1, 100, 100, 0, 203, 3, 0, 150, 8, 0, 104, 104, 0, 143, 1, 1, 93, 93, 0, 118, 11, 0, 29, 29, 1, 64, 64, 1, 146, 4, 1, 137, 30, 1, 229, 29, 2, 70, 70, 1, 44, 44, 1, 49, 49, 1, 117, 10, 0, 152, 10, 0, 130, 23, 1, 26, 26, 1, 110, 3, 1, 158, 16, 0, 124, 17, 0, 206, 6, 0, 141, 34, 1, 92, 92, 0, 32, 32, 1, 27, 27, 0, 58, 58, 0, 162, 20, 0, 13, 13, 0, 187, 45, 1, 223, 23, 0, 43, 43, 0, 48, 48, 0, 176, 34, 0, 99, 99, 0, 149, 7, 1, 231, 31, 1, 180, 38, 0, 18, 18, 0, 55, 55, 0, 169, 27, 2, 164, 22, 1, 208, 8, 1, 116, 9, 0, 87, 87, 0, 15, 15])
     collation = Collation()
     # Only the second return value (the child intervals) is inspected here.
     _, child_lcp_intervals = collation.get_lcp_intervals(lcp=lcp_array)
     # Spot-check the entries stored under 5 and under 513.
     self.assertEqual([(5, 7), (8, 10)], child_lcp_intervals[5])
     self.assertEqual([(513, 515),(516, 518),(519, 521),(522, 524),(525,527)], child_lcp_intervals[513])
Esempio n. 45
0
 def testThisMorningExample(self):
     """Collate the "this morning" sentence pair with transposition detection on.

     The test makes no assertions; it only requires the call to complete.
     """
     text_a = "This morning the cat observed little birds in the trees."
     text_b = "The cat was observing birds in the little trees this morning, it observed birds for two hours."
     morning = Collation()
     morning.add_plain_witness("A", text_a)
     morning.add_plain_witness("B", text_b)
     result = collate(morning, detect_transpositions=True)
Esempio n. 46
0
    def test_near_matching_rank_0(self):
        # find_prior_node() should check ranks back through 0, not 1
        collation = Collation()
        collation.add_plain_witness("A", "this")
        collation.add_plain_witness("B", "there thin")
        output = str(collate(collation, near_match=True, segmentation=False))
        expected = """\
+---+-------+------+
| A | -     | this |
| B | there | thin |
+---+-------+------+"""
        self.assertEqual(expected, output)

        #     def test_near_matching_middle(self):
        #         # Three candidates, closest is middle, match rank 2 0 1 (0 is closest)
        #         # Should go to the middle; incorrectly goes right
        #         self.maxDiff = None
        #         scheduler = Scheduler()
        #         collation = Collation()
        #         collation.add_plain_witness("A", "abcd 0123 efgh")
        #         collation.add_plain_witness("B", "abcd 0xxx 012x 01xx efgh")
        #         alignment_table = str(collate(collation, near_match=True, segmentation=False))
        #         # Find the rightmost rank with a gap (rank 4); this is activeRank
        #         # Find the first witness with a gap at that rank (A)
        #         # Find first token to the left of the gap for the first gappy witness ("0123" in A at rank 2)
        #         #   and check whether to move it
        #         # Calculate strength of match for all columns from the token's current rank (2) through activeRank (4)
        #         #   parameters are token string and rank to check
        #         self.assertTask("build column for rank", ["0123", "2"], scheduler[0])
        #         self.assertTask("build column for rank", ["0123", "3"], scheduler[1])
        #         self.assertTask("build column for rank", ["0123", "4"], scheduler[2])
        #         # The best (max()) fit of "0123" among all ranks between current rank 2 and activeRank 4
        #         #   is at rank 3, so move "0123" from current rank 2 to rank 3
        #         self.assertTask("move node from prior rank to rank with best match", ["0123", "2", "3"], scheduler[3])
        #         # No more gaps at activeRank 4, no gaps at rank 3, so move to next rank with a gap
        #         #   (rank 2, gap in A), with "abcd" at rank 1
        #         self.assertTask("build column for rank", ["abcd", "1"], scheduler[4])
        #         self.assertTask("build column for rank", ["abcd", "2"], scheduler[5])
        #         # Don't move it because it's closer to current location
        #         # No more gaps at rank 2, non gaps at rank 1, no more ranks
        #         self.assertEqual(6, len(scheduler))
        #         expected = """\
        # +---+------+------+------+------+------+
        # | A | abcd | -    | 0123 | -    | efgh |
        # | B | abcd | 0xxx | 012x | 01xx | efgh |
        # +---+------+------+------+------+------+"""
        #         self.assertEqual(expected, alignment_table)

        def test_near_matching_clash(self):
            # If the previous rank has a vertex with more than one witness, where at least
            # one witness is a candidate for being moved, don't move it if any of the
            # witnesses has a node at the new rank.
            #
            # If there were only A and B, we'd move cce away from bbb to align with cce.
            # Witness C should prevent this.
            self.maxDiff = None
            collation = Collation()
            collation.add_plain_witness("A", "aaa bbb ccc ddd")
            collation.add_plain_witness("B", "aaa cce ddd")
            collation.add_plain_witness("C", "aaa cce ccc ddd")
            alignment_table = str(
                collate(collation, near_match=True, segmentation=False))
            expected = """\
    +---+-----+-----+-----+-----+
    | A | aaa | bbb | ccc | ddd |
    | B | aaa | cce | -   | ddd |
    | C | aaa | cce | ccc | ddd |
    +---+-----+-----+-----+-----+"""
            self.assertEqual(expected, alignment_table)
Esempio n. 47
0
 def test_near_matching(self):
     """collate() with near_match=True and no segmentation argument must raise SegmentationError."""
     witnesses = Collation()
     witnesses.add_plain_witness("A", "I bought this glass , because it matches those dinner plates")
     witnesses.add_plain_witness("B", "I bought those glasses")
     # Context-manager form of assertRaises: the call under test is written
     # as an ordinary call inside the block.
     with self.assertRaises(SegmentationError):
         collate(witnesses, near_match=True)
Esempio n. 48
0
    def test_near_matching_three_witnesses(self):
        self.maxDiff = None
        scheduler = Scheduler()
        collation = Collation()
        collation.add_plain_witness("A", "abcd 012345 efgh")
        collation.add_plain_witness(
            "B", "abcd 0xxxxx 01xxxx 01234x 012xxx 0123xx efgh")
        collation.add_plain_witness("C", "abcd 01xxxx zz23xx efgh")
        alignment_table = str(
            collate(collation,
                    near_match=True,
                    segmentation=False,
                    scheduler=scheduler))
        # Find the rightmost rank with a gap (rank 6); this is activeRank
        # Find the first witness (alphabetically by siglum) with a gap at that rank (A)
        # Get the first token to the left of the gap for the first gappy witness ("012345" in A at rank 2)
        #   and check whether to move it
        # Calculate strength of match for all columns from current rank (2) through activeRank (6), inclusive
        self.assertTask("build column for rank", ["012345", "2"], scheduler[0])
        self.assertTask("build column for rank", ["012345", "3"], scheduler[1])
        self.assertTask("build column for rank", ["012345", "4"], scheduler[2])
        self.assertTask("build column for rank", ["012345", "5"], scheduler[3])
        self.assertTask("build column for rank", ["012345", "6"], scheduler[4])
        # The best (max()) fit of "012345" among all ranks between current rank 2 and activeRank 6
        #   is at rank 4, so move "012345" from current rank 2 to rank 4
        self.assertTask("move node from prior rank to rank with best match",
                        ["012345", "2", "4"], scheduler[5])
        # Find next (alphabetically) witness with a gap at activeRank (still 6), which is witness C
        # Get the first token to the left of the gap ("zz23xx" in C at rank 4)
        #   and check whether to move it
        # Calculate strength of match for all columns from current rank (4) through activeRank (6), inclusive
        self.assertTask("build column for rank", ["zz23xx", "4"], scheduler[6])
        self.assertTask("build column for rank", ["zz23xx", "5"], scheduler[7])
        self.assertTask("build column for rank", ["zz23xx", "6"], scheduler[8])
        # The best (max()) fit of "zz23xx" among all ranks between current rank 4 and activeRank 6
        #   is at rank 6, so move "zz23xx" from current rank 4 to rank 6
        self.assertTask("move node from prior rank to rank with best match",
                        ["zz23xx", "4", "6"], scheduler[9])
        # No more gaps at rank 6, so advance to rank 5, which has gaps in witnesses A and C
        # First gap (alphabetically by siglum) at rank 5 is in witness A, where left node is "012345" at rank 4
        self.assertTask("build column for rank", ["012345", "4"],
                        scheduler[10])
        self.assertTask("build column for rank", ["012345", "5"],
                        scheduler[11])
        # Match is closest at current rank 4, so don't move the node
        # Next gap at rank 5 is in witness C, where left node is "01xxxx" at rank 3
        self.assertTask("build column for rank", ["01xxxx", "3"],
                        scheduler[12])
        self.assertTask("build column for rank", ["01xxxx", "4"],
                        scheduler[13])
        self.assertTask("build column for rank", ["01xxxx", "5"],
                        scheduler[14])
        # Exact match at current rank 3, so don't move it
        # No more gaps at rank 5, so advance to rank 4, which has a gap in witness C,
        #   where left node is "01xxxx" at rank 3
        self.assertTask("build column for rank", ["01xxxx", "3"],
                        scheduler[15])
        self.assertTask("build column for rank", ["01xxxx", "4"],
                        scheduler[16])
        # Exact match at rank 3, so don't move it
        # No more gaps at rank 4, so advance to rank 3, where only gap is in witness A, with "abcd" at rank 1
        self.assertTask("build column for rank", ["abcd", "1"], scheduler[17])
        self.assertTask("build column for rank", ["abcd", "2"], scheduler[18])
        self.assertTask("build column for rank", ["abcd", "3"], scheduler[19])
        # Exact match at rank 1, so don't move it
        # No more gaps at rank 3, so advance to rank 2, with gaps in witnesses A and C and "abcd" at rank 1
        # Check witness A first
        self.assertTask("build column for rank", ["abcd", "1"], scheduler[20])
        self.assertTask("build column for rank", ["abcd", "2"], scheduler[21])
        # Exact match at rank 1, so don't move it
        # Check witness C
        self.assertTask("build column for rank", ["abcd", "1"], scheduler[22])
        self.assertTask("build column for rank", ["abcd", "2"], scheduler[23])
        # Exact match at rank 1, so don't move it
        # No more gaps at rank 2, no gaps at rank 1
        self.assertEquals(24, len(scheduler))
        expected = """\
+---+------+--------+--------+--------+--------+--------+------+
| A | abcd | -      | -      | 012345 | -      | -      | efgh |
| B | abcd | 0xxxxx | 01xxxx | 01234x | 012xxx | 0123xx | efgh |
| C | abcd | -      | 01xxxx | -      | -      | zz23xx | efgh |
+---+------+--------+--------+--------+--------+--------+------+"""
        self.assertEqual(expected, alignment_table)