def test_hermans_witness_order_independence_case_two_witnesses(self):
    # Collates witnesses A and B and checks the token list of each table row.
    collation = Collation()
    collation.add_plain_witness("A", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("B", "a b c d F g h i ! q r s t")
    alignment_table = collate(collation)
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(["a b c d F g h i!", "K!", "q r s t"], alignment_table.rows[0].to_list())
    self.assertEqual(["a b c d F g h i!", "-", "q r s t"], alignment_table.rows[1].to_list())
def testDoubleTransposition1(self):
    # Collates two witnesses whose phrases appear in swapped order and checks
    # the cells of both rows.
    collation = Collation()
    collation.add_plain_witness("A", "the cat is black")
    collation.add_plain_witness("B", "black is the cat")
    alignment_table = collate(collation)
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(["the cat", "is", "black"], alignment_table.rows[0].to_list())
    self.assertEqual(["black", "is", "the cat"], alignment_table.rows[1].to_list())
def test_non_overlapping_blocks_overlap_case(self):
    # Checks that the block covering ranges 0-4 and 7-11 is among the
    # non-overlapping repeating blocks reported by Scorer.
    witnesses = Collation()
    witnesses.add_plain_witness("W1", "in the in the bleach")
    witnesses.add_plain_witness("W2", "in the in the bleach in the")
    found = Scorer(witnesses)._get_non_overlapping_repeating_blocks()
    # in the in the bleach
    wanted = Block(RangeSet("0-4, 7-11"))
    self.assertIn(wanted, found)
def testBeckett(self):
    # Collates two witnesses that differ by the single word "clock" and checks
    # that the second row carries None in the corresponding cell.
    collation = Collation()
    collation.add_plain_witness("1", "The same clock as when for example Magee once died.")
    collation.add_plain_witness("2", "The same as when for example Magee once died.")
    table = collate(collation)
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(["The same", "clock", "as when for example Magee once died."], table.rows[0].to_list())
    self.assertEqual(["The same", None, "as when for example Magee once died."], table.rows[1].to_list())
def test_superbase_generation_multiple_short_witnesses(self):
    # Runs EditGraphAligner.collate() over three one-token witnesses; the test
    # makes no assertions, so it only checks that nothing raises.
    collation = Collation()
    for siglum, text in (("A", "a"), ("B", "b"), ("C", "c")):
        collation.add_plain_witness(siglum, text)
    EditGraphAligner(collation).collate(VariantGraph())
def test_non_overlapping_blocks_Hermans(self):
    # Checks that both expected blocks are among the non-overlapping repeating
    # blocks reported by Scorer for the two-witness Hermans case.
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    found = Scorer(collation)._get_non_overlapping_repeating_blocks()
    expected_ranges = (
        "0-8, 17-25",    # a b c d F g h i !
        "11-14, 26-29",  # q r s t
    )
    for ranges in expected_ranges:
        self.assertIn(Block(RangeSet(ranges)), found)
def test_witness_ranges_hermans_case(self):
    # Checks the token range that a prepared TokenIndex reports per witness.
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    index = TokenIndex(collation.witnesses)
    index.prepare()
    for siglum, span in (("W1", "0-14"), ("W2", "16-28")):
        self.assertEqual(RangeSet(span), index.get_range_for_witness(siglum))
def test_exact_matching(self):
    # Collates A and B with default options and checks each row's list of strings.
    collation = Collation()
    collation.add_plain_witness("A", "I bought this glass , because it matches those dinner plates")
    collation.add_plain_witness("B", "I bought those glasses")
    table = collate(collation)
    expected_a = ["I bought ", "this glass , because it matches ", "those ", "dinner plates"]
    expected_b = ["I bought ", None, "those ", "glasses"]
    self.assertEqual(expected_a, table.rows[0].to_list_of_strings())
    self.assertEqual(expected_b, table.rows[1].to_list_of_strings())
def test_token_array_hermans_case(self):
    # Checks the token array of a prepared TokenIndex built from both witnesses.
    collation = Collation()
    for siglum, text in (
        ("W1", "a b c d F g h i ! K ! q r s t"),
        ("W2", "a b c d F g h i ! q r s t"),
    ):
        collation.add_plain_witness(siglum, text)
    index = TokenIndex(collation.witnesses)
    index.prepare()
    # $ is meant to separate witnesses here
    expected_tokens = "a b c d F g h i ! K ! q r s t $0 a b c d F g h i ! q r s t"
    self.assertTokenArray(expected_tokens, index)
def test_non_overlapping_blocks_Hermans(self):
    # Builds the Scorer from a token index and checks that both expected
    # blocks are among its non-overlapping repeating blocks.
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    scorer = Scorer(TokenIndex.create_token_index(collation))
    found = scorer._get_non_overlapping_repeating_blocks()
    expected_ranges = (
        "0-8, 16-24",    # a b c d F g h i !
        "11-14, 25-28",  # q r s t
    )
    for ranges in expected_ranges:
        self.assertIn(Block(RangeSet(ranges)), found)
def test_non_overlapping_blocks_Hermans(self):
    # Checks that both expected blocks are reported by DekkerSuffixAlgorithm.
    collation = Collation()
    collation.add_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_witness("W2", "a b c d F g h i ! q r s t")
    found = DekkerSuffixAlgorithm(collation).get_non_overlapping_repeating_blocks()
    # a b c d F g h i !
    first_block = Block(RangeSet("0-8, 16-24"))
    self.assertIn(first_block, found)
    # q r s t
    second_block = Block(RangeSet("11-14, 25-28"))
    self.assertIn(second_block, found)
def test_non_overlapping_blocks_black_cat(self):
    # Two identical witnesses must yield exactly one block (ranges 0-2 and 4-6).
    collation = Collation()
    for siglum in ("W1", "W2"):
        collation.add_plain_witness(siglum, "the black cat")
    found = Scorer(collation)._get_non_overlapping_repeating_blocks()
    expected = [Block(RangeSet("0-2, 4-6"))]
    self.assertEqual(expected, found)
def test_blocks_splitting_token_case(self):
    # Checks that the block for "a c b" is among the reported blocks.
    collation = Collation()
    collation.add_plain_witness("W1", "a c b c")
    collation.add_plain_witness("W2", "a c b")
    scorer = Scorer(collation)
    found = scorer._get_non_overlapping_repeating_blocks()
    # a c b
    wanted = Block(RangeSet("0-2, 5-7"))
    self.assertIn(wanted, found)
def test_near_matching(self):
    # Collates A and B with near_match=True and checks the cells of both rows.
    collation = Collation()
    collation.add_plain_witness("A", "I bought this glass, because it matches those dinner plates")
    collation.add_plain_witness("B", "I bought those glasses")
    alignment_table = collate(collation, near_match=True)
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(
        ["I bought", "this glass, because it matches those dinner plates"],
        alignment_table.rows[0].to_list())
    self.assertEqual(["I bought", "those glasses"], alignment_table.rows[1].to_list())
def test_2(self):
    # Three witnesses: checks that both expected blocks are among the
    # non-overlapping repeating blocks computed from the token index.
    collation = Collation()
    collation.add_plain_witness("W1", "in the in the bleach")
    for siglum in ("W2", "W3"):
        collation.add_plain_witness(siglum, "in the in the bleach in the")
    scorer = Scorer(TokenIndex.create_token_index(collation))
    found = scorer._get_non_overlapping_repeating_blocks()
    expected_ranges = (
        "0-4, 6-10, 14-18",  # in the in the bleach
        "11-12, 19-20",      # in the
    )
    for ranges in expected_ranges:
        self.assertIn(Block(RangeSet(ranges)), found)
def test_hermans_witness_order_independence_case_three_witnesses(self):
    # Collates witnesses A, B and C and checks each row's list of strings.
    collation = Collation()
    collation.add_plain_witness("A", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("B", "a b c d F g h i ! q r s t")
    collation.add_plain_witness("C", "a b c d E g h i ! q r s t")
    alignment_table = collate(collation)
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(["a b c d ", "F ", "g h i ", "! K ", "! q r s t"],
                     alignment_table.rows[0].to_list_of_strings())
    self.assertEqual(["a b c d ", "F ", "g h i ", None, "! q r s t"],
                     alignment_table.rows[1].to_list_of_strings())
    self.assertEqual(["a b c d ", "E ", "g h i ", None, "! q r s t"],
                     alignment_table.rows[2].to_list_of_strings())
def test_align_with_longest_match(self):
    # Collates a long witness against a short one, prints the table, and
    # checks each row's list of strings.
    collation = Collation()
    collation.add_plain_witness("A", "a g a g c t a g t")
    collation.add_plain_witness("B", "a g c t")
    table = collate(collation)
    print("alignment_table=\n", table)
    expected_a = ['a g ', 'a g c t ', 'a g t']
    expected_b = [None, 'a g c t', None]
    self.assertEqual(expected_a, table.rows[0].to_list_of_strings())
    self.assertEqual(expected_b, table.rows[1].to_list_of_strings())
def test_witness_order(self):
    # Collates witnesses A, B and C and checks the cells of all three rows.
    collation = Collation()
    collation.add_plain_witness("A", "x a y")
    collation.add_plain_witness("B", "x b y")
    collation.add_plain_witness("C", "x a b y")
    alignment_table = collate(collation)
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(["x", "a", "-", "y"], alignment_table.rows[0].to_list())
    self.assertEqual(["x", "-", "b", "y"], alignment_table.rows[1].to_list())
    self.assertEqual(["x", "a", "b", "y"], alignment_table.rows[2].to_list())
def test_block_witnesses_Hermans_case_two_witnesses(self):
    # Checks the debug() output of the block witness that Scorer builds for
    # each of the two witnesses.
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    algorithm = Scorer(collation)
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    block_witness = algorithm._get_block_witness(collation.witnesses[0])
    self.assertEqual(["a b c d F g h i !", "q r s t"], block_witness.debug())
    block_witness = algorithm._get_block_witness(collation.witnesses[1])
    self.assertEqual(["a b c d F g h i !", "q r s t"], block_witness.debug())
def test_non_overlapping_blocks_Hermans(self):
    # Collates the two-witness Hermans case, prints the table, and checks each
    # row's list of strings.
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    table = collate(collation)
    print("alignment_table=\n", table)
    expected_w1 = ['a b c d F g h i ', '! K ', '! q r s t']
    expected_w2 = ['a b c d F g h i ', None, '! q r s t']
    self.assertEqual(expected_w1, table.rows[0].to_list_of_strings())
    self.assertEqual(expected_w2, table.rows[1].to_list_of_strings())
def test_superbase(self):
    # Collates A and B with EditGraphAligner and checks the aligner's
    # new_superbase afterwards.
    collation = Collation()
    collation.add_plain_witness("A", "X a b c d e f X g h i Y Z j k")
    collation.add_plain_witness("B", "a b c Y d e f Y Z g h i X j k")
    aligner = EditGraphAligner(collation)
    aligner.collate(VariantGraph(), collation)
    expected = "X a b c Y d e f X Y Z g h i Y Z X j k"
    self.assertSuperbaseEquals(expected, aligner.new_superbase)
def test_heuristic_function_everything_equals(self):
    # Builds the heuristic table for two identical witnesses and checks its
    # first three rows.
    collation = Collation()
    for siglum in ("A", "B"):
        collation.add_plain_witness(siglum, "everything equal")
    aligner = ExperimentalAstarAligner(collation)
    aligner._create_heuristic_table(collation.witnesses[0].tokens(), collation.witnesses[1])
    expected_rows = ([0, 1, 2], [1, 0, 1], [2, 1, 0])
    for row_index, expected in enumerate(expected_rows):
        self.assertEqual(expected, aligner.heuristic_table[row_index])
def test_superbase_generation_multiple_short_witnesses(self):
    # Collates three one-token witnesses and checks the resulting superbase.
    collation = Collation()
    for siglum, text in (("A", "a"), ("B", "b"), ("C", "c")):
        collation.add_plain_witness(siglum, text)
    aligner = EditGraphAligner(collation)
    aligner.collate(VariantGraph(), collation)
    self.assertSuperbaseEquals("a b c", aligner.new_superbase)
def test_blocks_failing_transposition_use_case_old_algorithm(self):
    # Checks the exact, ordered list of blocks Scorer reports for two
    # witnesses whose "cat" and "dog" phrases are swapped.
    collation = Collation()
    collation.add_plain_witness("W1", "the cat and the dog")
    collation.add_plain_witness("W2", "the dog and the cat")
    found = Scorer(collation)._get_non_overlapping_repeating_blocks()
    expected = [
        Block(RangeSet(ranges))
        for ranges in ("0-1, 9-10", "3-4, 6-7", "2, 8")
    ]
    self.assertEqual(expected, found)
def test_lcp_intervals_number_of_witnesses_Hermans_case(self):
    # Checks that the second LCP interval of the prepared token index has depth 3.
    collation = Collation()
    for siglum, text in (
        ("W1", "a b c d F g h i ! K ! q r s t"),
        ("W2", "a b c d F g h i ! q r s t"),
        ("W3", "a b c d E g h i ! q r s t"),
    ):
        collation.add_plain_witness(siglum, text)
    index = TokenIndex(collation.witnesses)
    index.prepare()
    # ! q r s t
    candidate = index.split_lcp_array_into_intervals()[1]
    self.assertEqual(3, candidate.get_depth())
def test_near_matching_middle(self):
    # Three candidates, closest is middle, match rank 2 0 1 (0 is closest)
    # Should go to the middle; incorrectly goes right
    self.maxDiff = None
    scheduler = Scheduler()
    collation = Collation()
    collation.add_plain_witness("A", "abcd 0123 efgh")
    collation.add_plain_witness("B", "abcd 0xxx 012x 01xx efgh")
    alignment_table = str(collate(collation, near_match=True, segmentation=False, scheduler=scheduler))
    # Find the rightmost rank with a gap (rank 4); this is activeRank
    # Find the first witness with a gap at that rank (A)
    # Find first token to the left of the gap for the first gappy witness ("0123" in A at rank 2)
    #   and check whether to move it
    # Calculate strength of match for all columns from the token's current rank (2) through activeRank (4)
    #   parameters are token string and rank to check
    self.assertTask("build column for rank", ["0123", "2"], scheduler[0])
    self.assertTask("build column for rank", ["0123", "3"], scheduler[1])
    self.assertTask("build column for rank", ["0123", "4"], scheduler[2])
    # The best (max()) fit of "0123" among all ranks between current rank 2 and activeRank 4
    #   is at rank 3, so move "0123" from current rank 2 to rank 3
    self.assertTask("move node from prior rank to rank with best match", ["0123", "2", "3"], scheduler[3])
    # No more gaps at activeRank 4, no gaps at rank 3, so move to next rank with a gap
    #   (rank 2, gap in A), with "abcd" at rank 1
    self.assertTask("build column for rank", ["abcd", "1"], scheduler[4])
    self.assertTask("build column for rank", ["abcd", "2"], scheduler[5])
    # Don't move it because it's closer to current location
    # No more gaps at rank 2, no gaps at rank 1, no more ranks
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(6, len(scheduler))
    # NOTE(review): cell padding inside the table looks collapsed in this copy
    # (rows are narrower than the border) - confirm against real collate() output.
    expected = """\
+---+------+------+------+------+------+
| A | abcd | - | 0123 | - | efgh |
| B | abcd | 0xxx | 012x | 01xx | efgh |
+---+------+------+------+------+------+"""
    self.assertEqual(expected, alignment_table)

def test_near_matching_clash(self):
    # If the previous rank has a vertex with more than one witness, where at least
    # one witness is a candidate for being moved, don't move it if any of the
    # witnesses has a node at the new rank.
    #
    # If there were only A and B, we'd move cce away from bbb to align with cce.
    # Witness C should prevent this.
    self.maxDiff = None
    collation = Collation()
    collation.add_plain_witness("A", "aaa bbb ccc ddd")
    collation.add_plain_witness("B", "aaa cce ddd")
    collation.add_plain_witness("C", "aaa cce ccc ddd")
    alignment_table = str(collate(collation, near_match=True, segmentation=False))
    # NOTE(review): cell padding inside the table looks collapsed in this copy
    # (rows are narrower than the border) - confirm against real collate() output.
    expected = """\
+---+-----+-----+-----+-----+
| A | aaa | bbb | ccc | ddd |
| B | aaa | cce | - | ddd |
| C | aaa | cce | ccc | ddd |
+---+-----+-----+-----+-----+"""
    self.assertEqual(expected, alignment_table)
def test_near_matching_accidentally_incorrect_short(self):
    # Collates A and B with near matching (no segmentation) and compares the
    # rendered table with the expected text.
    collation = Collation()
    collation.add_plain_witness("A", "over this dog")
    collation.add_plain_witness("B", "over there that dog")
    rendered = str(collate(collation, near_match=True, segmentation=False))
    # NOTE(review): cell padding inside the table looks collapsed in this copy
    # (rows are narrower than the border) - confirm against real collate() output.
    wanted = """\
+---+------+-------+------+-----+
| A | over | - | this | dog |
| B | over | there | that | dog |
+---+------+-------+------+-----+"""
    self.assertEqual(wanted, rendered)
def test_lcp_intervals_failing_use_case_old_algorithm(self):
    # Checks selected parent LCP intervals and the child intervals stored
    # under index 7.
    collation = Collation()
    collation.add_plain_witness("W1", "the cat and the dog")
    collation.add_plain_witness("W2", "the dog and the cat")
    parents, children = collation.get_lcp_intervals()
    for interval in ((1, 2), (3, 4), (5, 6), (7, 10)):
        self.assertIn(interval, parents)
    for interval in ((7, 8), (9, 10)):
        self.assertIn(interval, children[7])
def test_blocks_Hermans_case_three_witnesses(self):
    # Checks that all four expected blocks are among the non-overlapping
    # repeating blocks reported by Scorer for the three-witness Hermans case.
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    collation.add_plain_witness("W3", "a b c d E g h i ! q r s t")
    found = Scorer(collation)._get_non_overlapping_repeating_blocks()
    expected_ranges = (
        "0-3, 17-20, 32-35",     # a b c d
        "5-7, 22-24, 37-39",     # g h i
        "10-14, 25-29, 40-44",   # ! q r s t
        "4, 21",                 # F
    )
    for ranges in expected_ranges:
        self.assertIn(Block(RangeSet(ranges)), found)
def test_near_matching_accidentally_correct_long(self):
    # Collates A and B with near matching (no segmentation) and compares the
    # rendered table with the expected text.
    collation = Collation()
    collation.add_plain_witness("A", "The brown fox jumps over this dog.")
    collation.add_plain_witness("B", "The brown fox jumps over that there dog.")
    rendered = str(collate(collation, near_match=True, segmentation=False))
    # NOTE(review): cell padding inside the table looks collapsed in this copy
    # (rows are narrower than the border) - confirm against real collate() output.
    wanted = """\
+---+-----+-------+-----+-------+------+------+-------+-----+---+
| A | The | brown | fox | jumps | over | this | - | dog | . |
| B | The | brown | fox | jumps | over | that | there | dog | . |
+---+-----+-------+-----+-------+------+------+-------+-----+---+"""
    self.assertEqual(wanted, rendered)
def test_near_matching_clash(self):
    # If the previous rank has a vertex with more than one witness, where at least
    # one witness is a candidate for being moved, don't move it if any of the
    # witnesses has a node at the new rank.
    #
    # If there were only A and B, we'd move cce away from bbb to align with cce.
    # Witness C should prevent this.
    self.maxDiff = None
    collation = Collation()
    for siglum, text in (
        ("A", "aaa bbb ccc ddd"),
        ("B", "aaa cce ddd"),
        ("C", "aaa cce ccc ddd"),
    ):
        collation.add_plain_witness(siglum, text)
    rendered = str(collate(collation, near_match=True, segmentation=False))
    # NOTE(review): cell padding inside the table looks collapsed in this copy
    # (rows are narrower than the border) - confirm against real collate() output.
    wanted = """\
+---+-----+-----+-----+-----+
| A | aaa | bbb | ccc | ddd |
| B | aaa | cce | - | ddd |
| C | aaa | cce | ccc | ddd |
+---+-----+-----+-----+-----+"""
    self.assertEqual(wanted, rendered)
def test_1(self):
    # Collates witnesses "a", "b" and "a b", prints the table, and checks each
    # row's list of strings.
    collation = Collation()
    for siglum, text in (("A", "a"), ("B", "b"), ("C", "a b")):
        collation.add_plain_witness(siglum, text)
    table = collate(collation)
    print("alignment_table=\n", table)
    expected_rows = (['a', None], [None, 'b'], ['a ', 'b'])
    for row_index, expected in enumerate(expected_rows):
        self.assertEqual(expected, table.rows[row_index].to_list_of_strings())
def test_duplicated_tokens_in_witness2(self):
    # Collates witnesses A, B, C and D with output="tei", indent=True and
    # compares the result with expected_tei.
    collation = Collation()
    collation.add_plain_witness("A", "a")
    collation.add_plain_witness("B", "b")
    collation.add_plain_witness("C", "c")
    collation.add_plain_witness("D", "a b c a b c")
    # Disabled alignment-table variant of this check, kept for reference:
    # alignment_table = collate(collation)
    # self.assertEqual(['a', None, None, None], alignment_table.rows[0].to_list_of_strings())
    # self.assertEqual([None, 'b', None, None], alignment_table.rows[1].to_list_of_strings())
    # self.assertEqual([None, None, 'c', None], alignment_table.rows[2].to_list_of_strings())
    # self.assertEqual(['a ', 'b ', 'c ', 'a b c'], alignment_table.rows[3].to_list_of_strings())
    # NOTE(review): this literal presumably spanned several indented lines (the
    # call below asks for indent=True) and the xmlns values look stripped; the
    # whitespace cannot be recovered from this copy - confirm against real output.
    expected_tei = """<?xml version="1.0" ?> <cx:apparatus xmlns="" xmlns:cx=""> <app> <rdg wit="#D">a b c</rdg> </app> <app> <rdg wit="#A">a</rdg> <rdg wit="#D">a</rdg> </app> <app> <rdg wit="#B">b</rdg> <rdg wit="#D">b</rdg> </app> <app> <rdg wit="#C #D">c</rdg> </app> </cx:apparatus> """
    # alignment_table = collate(collation)
    # print("alignment_table=\n",alignment_table)
    output_tei = collate(collation, output="tei", indent=True)
    self.assertEqual(expected_tei, output_tei)
def test_2(self):
    # Collates three witnesses, prints the table, and checks each row's list
    # of strings.
    collation = Collation()
    collation.add_plain_witness("W1", "in the in the bleach")
    for siglum in ("W2", "W3"):
        collation.add_plain_witness(siglum, "in the in the bleach in the")
    table = collate(collation)
    print("alignment_table=\n", table)
    expected_rows = (
        ['in the in the bleach', None],
        ['in the in the bleach ', 'in the'],
        ['in the in the bleach ', 'in the'],
    )
    for row_index, expected in enumerate(expected_rows):
        self.assertEqual(expected, table.rows[row_index].to_list_of_strings())
def test_blocks_Hermans_case_three_witnesses(self):
    # Collates the three-witness Hermans case, prints the table, and checks
    # each row's list of strings.
    collation = Collation()
    for siglum, text in (
        ("W1", "a b c d F g h i ! K ! q r s t"),
        ("W2", "a b c d F g h i ! q r s t"),
        ("W3", "a b c d E g h i ! q r s t"),
    ):
        collation.add_plain_witness(siglum, text)
    table = collate(collation)
    print("alignment_table=\n", table)
    expected_rows = (
        ['a b c d ', 'F ', 'g h i ', '! K ', '! q r s t'],
        ['a b c d ', 'F ', 'g h i ', None, '! q r s t'],
        ['a b c d ', 'E ', 'g h i ', None, '! q r s t'],
    )
    for row_index, expected in enumerate(expected_rows):
        self.assertEqual(expected, table.rows[row_index].to_list_of_strings())
def test_block_witnesses_Hermans_case(self):
    # Checks the debug() output of the block witness that Scorer builds for
    # each of the three witnesses.
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    collation.add_plain_witness("W3", "a b c d E g h i ! q r s t")
    algorithm = Scorer(collation)
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    block_witness1 = algorithm._get_block_witness(collation.witnesses[0])
    self.assertEqual(["a b c d", "F", "g h i", "! q r s t"], block_witness1.debug())
    block_witness2 = algorithm._get_block_witness(collation.witnesses[1])
    self.assertEqual(["a b c d", "F", "g h i", "! q r s t"], block_witness2.debug())
    block_witness3 = algorithm._get_block_witness(collation.witnesses[2])
    self.assertEqual(["a b c d", "g h i", "! q r s t"], block_witness3.debug())
def testPretokenizedWitness(self):
    # Builds a collation from a pretokenized dict and checks that both rows
    # have four cells and that witness B keeps its multi-word token.
    tokens_a = [
        {"t": "A", "ref": 123},
        {"t": "black", "adj": True},
        {"t": "cat", "id": "xyz"},
        {"t": "bird", "id": "abc"},
    ]
    tokens_b = [
        {"t": "A"},
        {"t": "white", "adj": True},
        {"t": "mousedog bird", "adj": False},
    ]
    pretokenized_witness = {
        "witnesses": [
            {"id": "A", "tokens": tokens_a},
            {"id": "B", "tokens": tokens_b},
        ]
    }
    c = Collation.create_from_dict(pretokenized_witness)
    result = collate(c, segmentation=False)
    for row_index in (0, 1):
        self.assertEqual(len(result.rows[row_index].to_list()), 4)
    # The second witness should have a token that reads 'mousedog bird'.
    self.assertIn("mousedog bird", str(result.rows[1].to_list()))
def test_near_matching_accidentally_incorrect_long(self):
    # Collates A and B with near matching, checks the five tasks recorded by
    # the scheduler in order, then compares the rendered table.
    self.maxDiff = None
    scheduler = Scheduler()
    collation = Collation()
    collation.add_plain_witness("A", "The brown fox jumps over this dog.")
    collation.add_plain_witness("B", "The brown fox jumps over there that dog.")
    rendered = str(collate(collation, near_match=True, segmentation=False, scheduler=scheduler))
    build = "build column for rank"
    move = "move node from prior rank to rank with best match"
    expected_tasks = (
        (build, ["this", "6"]),
        (build, ["this", "7"]),
        (move, ["this", "6", "7"]),
        (build, ["over", "5"]),
        (build, ["over", "6"]),
    )
    for position, (task_name, task_args) in enumerate(expected_tasks):
        self.assertTask(task_name, task_args, scheduler[position])
    self.assertEqual(5, len(scheduler))
    # NOTE(review): cell padding inside the table looks collapsed in this copy
    # (rows are narrower than the border) - confirm against real collate() output.
    wanted = """\
+---+-----+-------+-----+-------+------+-------+------+-----+---+
| A | The | brown | fox | jumps | over | - | this | dog | . |
| B | The | brown | fox | jumps | over | there | that | dog | . |
+---+-----+-------+-----+-------+------+-------+------+-----+---+"""
    self.assertEqual(wanted, rendered)
def testOmission(self):
    # Collates "a b c" against "b c" and checks the g value of every cell in
    # the aligner's 3x4 table, row by row.
    collation = Collation()
    collation.add_plain_witness("A", "a b c")
    collation.add_plain_witness("B", "b c")
    aligner = EditGraphAligner(collation)
    aligner.collate(VariantGraph())
    table = aligner.table
    # self.debug_table(aligner, table)
    expected_g = (
        (0, -1, -2, -3),
        (-1, -2, -1, -2),
        (-2, -3, -2, -1),
    )
    for y, expected_row in enumerate(expected_g):
        for x, expected_value in enumerate(expected_row):
            self.assertEqual(expected_value, table[y][x].g)
def test_near_matching_middle(self):
    # Three candidates, closest is middle, match rank 2 0 1 (0 is closest)
    # Should go to the middle; incorrectly goes right
    self.maxDiff = None
    scheduler = Scheduler()
    collation = Collation()
    collation.add_plain_witness("A", "abcd 0123 efgh")
    collation.add_plain_witness("B", "abcd 0xxx 012x 01xx efgh")
    alignment_table = str(
        collate(collation, near_match=True, segmentation=False, scheduler=scheduler))
    # Find the rightmost rank with a gap (rank 4); this is activeRank
    # Find the first witness with a gap at that rank (A)
    # Find first token to the left of the gap for the first gappy witness ("0123" in A at rank 2)
    #   and check whether to move it
    # Calculate strength of match for all columns from the token's current rank (2) through activeRank (4)
    #   parameters are token string and rank to check
    self.assertTask("build column for rank", ["0123", "2"], scheduler[0])
    self.assertTask("build column for rank", ["0123", "3"], scheduler[1])
    self.assertTask("build column for rank", ["0123", "4"], scheduler[2])
    # The best (max()) fit of "0123" among all ranks between current rank 2 and activeRank 4
    #   is at rank 3, so move "0123" from current rank 2 to rank 3
    self.assertTask("move node from prior rank to rank with best match", ["0123", "2", "3"], scheduler[3])
    # No more gaps at activeRank 4, no gaps at rank 3, so move to next rank with a gap
    #   (rank 2, gap in A), with "abcd" at rank 1
    self.assertTask("build column for rank", ["abcd", "1"], scheduler[4])
    self.assertTask("build column for rank", ["abcd", "2"], scheduler[5])
    # Don't move it because it's closer to current location
    # No more gaps at rank 2, no gaps at rank 1, no more ranks
    # assertEqual replaces the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(6, len(scheduler))
    # NOTE(review): cell padding inside the table looks collapsed in this copy
    # (rows are narrower than the border) - confirm against real collate() output.
    expected = """\
+---+------+------+------+------+------+
| A | abcd | - | 0123 | - | efgh |
| B | abcd | 0xxx | 012x | 01xx | efgh |
+---+------+------+------+------+------+"""
    self.assertEqual(expected, alignment_table)
def test_longer_example(self):
    # Collates two longer witnesses; the test makes no assertions, so it only
    # checks that collate() does not raise.
    collation = Collation()
    for siglum, text in (
        ("A", "The quick brown fox jumps over the dog."),
        ("B", "The brown fox jumps over the lazy dog."),
    ):
        collation.add_plain_witness(siglum, text)
    collate(collation)
return chunk print_chunk = chunk(print_file, xmlid) ms_chunk = chunk(ms_file, xmlid) if print_chunk == '' and ms_chunk == '': print(('\n\t[] Paragraph {}' ' NOT FOUND in file {}\n\tIs {} the right file?\n\n').format( xmlid, juxta_file, juxta_file)) ########### # Collate # ########### collation = Collation() collation.add_plain_witness(print_siglum, print_chunk) collation.add_plain_witness(ms_siglum, ms_chunk) output_string = collate(collation, output='tei', segmentation=False, near_match=True, indent=False) ################################ # Add empty <rdg> when missing # ################################ element = etree.fromstring(output_string) for app in element.findall('.//t:app', ns): if len(app) == 1:
'''
Created on Apr 20, 2014

Darwin Integration test

@author: Ronald Haentjens Dekker
'''
import json

from collatex import Collation, collate

if __name__ == '__main__':
    # Read the source JSON data into a dictionary. The with-block closes the
    # file even when json.load() raises, which the manual open()/close() pair
    # did not guarantee.
    with open('darwin_chapter1_para1.json') as json_data:
        data = json.load(json_data)
    #pprint(data)
    # generate collation object from dictionary
    collation = Collation.create_from_dict(data)
    print(collate(collation))
    # write_dot(graph.graph, "rawoutput")
def test_lcp_child_intervals_darwin(self):
    # Passes a hard-coded integer array to Collation.get_lcp_intervals(lcp=...)
    # and checks the child intervals stored under indices 5 and 513.
    # NOTE(review): presumably this array was captured from the Darwin sample
    # text (per the test name) - the source of the data is not visible here.
    lcp_array = array('i', [0, 0, 0, 96, 96, 0, 151, 9, 1, 105, 105, 0, 83, 83, 0, 95, 95, 0, 39, 39, 0, 24, 24, 0, 108, 1, 0, 232, 32, 0, 181, 39, 0, 185, 43, 1, 33, 33, 0, 159, 17, 0, 160, 18, 0, 106, 106, 0, 60, 60, 0, 171, 29, 1, 215, 15, 1, 122, 15, 1, 57, 57, 1, 153, 11, 1, 165, 23, 0, 9, 9, 1, 170, 28, 0, 214, 14, 0, 62, 62, 1, 191, 49, 0, 61, 61, 0, 148, 6, 1, 8, 8, 0, 19, 19, 0, 123, 16, 0, 75, 75, 1, 90, 90, 1, 28, 28, 0, 167, 25, 1, 112, 5, 1, 132, 25, 0, 50, 50, 0, 31, 31, 0, 77, 77, 0, 97, 97, 0, 6, 6, 0, 38, 38, 0, 63, 63, 0, 30, 30, 0, 80, 80, 0, 154, 12, 0, 145, 3, 0, 129, 22, 3, 67, 67, 0, 88, 88, 0, 45, 45, 0, 217, 17, 1, 22, 22, 0, 166, 24, 0, 25, 25, 0, 201, 1, 0, 155, 13, 1, 120, 13, 0, 175, 33, 0, 195, 53, 0, 135, 28, 0, 10, 10, 0, 147, 5, 0, 138, 31, 0, 161, 19, 1, 73, 73, 0, 198, 1, 0, 86, 86, 0, 74, 74, 1, 111, 4, 0, 210, 10, 0, 84, 84, 0, 42, 42, 0, 199, 2, 0, 119, 12, 0, 46, 46, 1, 202, 2, 0, 71, 71, 0, 40, 40, 0, 142, 0, 0, 52, 52, 0, 168, 26, 2, 113, 6, 1, 163, 21, 0, 133, 26, 0, 3, 3, 0, 186, 44, 1, 101, 101, 0, 193, 51, 2, 227, 27, 0, 107, 1, 37, 37, 1, 140, 33, 0, 1, 205, 5, 0, 47, 47, 0, 127, 20, 1, 65, 65, 0, 230, 30, 0, 41, 41, 0, 91, 91, 0, 1, 1, 0, 200, 3, 0, 156, 14, 0, 0, 76, 76, 0, 109, 2, 1, 182, 40, 0, 68, 68, 1, 14, 14, 0, 126, 19, 0, 34, 34, 1, 192, 50, 1, 85, 85, 1, 128, 21, 2, 66, 66, 1, 183, 41, 1, 220, 20, 1, 5, 5, 1, 212, 12, 2, 174, 32, 2, 226, 26, 1, 59, 59, 0, 16, 16, 0, 218, 18, 0, 23, 23, 1, 11, 11, 0, 36, 36, 1, 178, 36, 0, 51, 51, 0, 213, 13, 1, 190, 48, 0, 2, 2, 1, 222, 22, 1, 188, 46, 0, 78, 78, 0, 53, 53, 0, 197, 0, 0, 136, 29, 1, 219, 19, 1, 12, 12, 0, 114, 7, 0, 89, 89, 0, 172, 30, 2, 216, 16, 0, 21, 21, 0, 209, 9, 0, 81, 81, 0, 102, 102, 0, 134, 27, 0, 98, 98, 0, 131, 24, 0, 4, 4, 0, 35, 35, 0, 179, 37, 0, 224, 24, 0, 82, 82, 0, 72, 72, 0, 139, 32, 0, 125, 18, 0, 103, 103, 0, 121, 14, 0, 189, 47, 0, 184, 42, 0, 7, 7, 1, 17, 17, 0, 207, 7, 0, 221, 21, 0, 20, 20, 0,
        196, 54, 0, 79, 79, 1, 204, 4, 1, 144, 2, 2, 94, 94, 1, 56, 56, 0, 211, 11, 1, 194, 52, 3, 228, 28, 1, 157, 15, 1, 69, 69, 1, 54, 54, 1, 115, 8, 1, 173, 31, 1, 225, 25, 1, 177, 35, 1, 100, 100, 0, 203, 3, 0, 150, 8, 0, 104, 104, 0, 143, 1, 1, 93, 93, 0, 118, 11, 0, 29, 29, 1, 64, 64, 1, 146, 4, 1, 137, 30, 1, 229, 29, 2, 70, 70, 1, 44, 44, 1, 49, 49, 1, 117, 10, 0, 152, 10, 0, 130, 23, 1, 26, 26, 1, 110, 3, 1, 158, 16, 0, 124, 17, 0, 206, 6, 0, 141, 34, 1, 92, 92, 0, 32, 32, 1, 27, 27, 0, 58, 58, 0, 162, 20, 0, 13, 13, 0, 187, 45, 1, 223, 23, 0, 43, 43, 0, 48, 48, 0, 176, 34, 0, 99, 99, 0, 149, 7, 1, 231, 31, 1, 180, 38, 0, 18, 18, 0, 55, 55, 0, 169, 27, 2, 164, 22, 1, 208, 8, 1, 116, 9, 0, 87, 87, 0, 15, 15])
    # No witnesses are added: the intervals are computed from the supplied array.
    collation = Collation()
    # Only the second element of the returned pair (child intervals) is checked.
    _, child_lcp_intervals = collation.get_lcp_intervals(lcp=lcp_array)
    self.assertEqual([(5, 7), (8, 10)], child_lcp_intervals[5])
    self.assertEqual([(513, 515),(516, 518),(519, 521),(522, 524),(525,527)], child_lcp_intervals[513])
def testThisMorningExample(self):
    # Collates two witnesses with detect_transpositions=True; the test makes
    # no assertions, so it only checks that collate() does not raise.
    collation = Collation()
    collation.add_plain_witness(
        "A", "This morning the cat observed little birds in the trees.")
    collation.add_plain_witness(
        "B",
        "The cat was observing birds in the little trees this morning, it observed birds for two hours.")
    collate(collation, detect_transpositions=True)
def test_near_matching_rank_0(self):
    # find_prior_node() should check ranks back through 0, not 1
    collation = Collation()
    collation.add_plain_witness("A", "this")
    collation.add_plain_witness("B", "there thin")
    rendered = str(collate(collation, near_match=True, segmentation=False))
    # NOTE(review): cell padding inside the table looks collapsed in this copy
    # (rows are narrower than the border) - confirm against real collate() output.
    wanted = """\
+---+-------+------+
| A | - | this |
| B | there | thin |
+---+-------+------+"""
    self.assertEqual(wanted, rendered)

# def test_near_matching_middle(self):
#     # Three candidates, closest is middle, match rank 2 0 1 (0 is closest)
#     # Should go to the middle; incorrectly goes right
#     self.maxDiff = None
#     scheduler = Scheduler()
#     collation = Collation()
#     collation.add_plain_witness("A", "abcd 0123 efgh")
#     collation.add_plain_witness("B", "abcd 0xxx 012x 01xx efgh")
#     alignment_table = str(collate(collation, near_match=True, segmentation=False))
#     # Find the rightmost rank with a gap (rank 4); this is activeRank
#     # Find the first witness with a gap at that rank (A)
#     # Find first token to the left of the gap for the first gappy witness ("0123" in A at rank 2)
#     #   and check whether to move it
#     # Calculate strength of match for all columns from the token's current rank (2) through activeRank (4)
#     #   parameters are token string and rank to check
#     self.assertTask("build column for rank", ["0123", "2"], scheduler[0])
#     self.assertTask("build column for rank", ["0123", "3"], scheduler[1])
#     self.assertTask("build column for rank", ["0123", "4"], scheduler[2])
#     # The best (max()) fit of "0123" among all ranks between current rank 2 and activeRank 4
#     #   is at rank 3, so move "0123" from current rank 2 to rank 3
#     self.assertTask("move node from prior rank to rank with best match", ["0123", "2", "3"], scheduler[3])
#     # No more gaps at activeRank 4, no gaps at rank 3, so move to next rank with a gap
#     #   (rank 2, gap in A), with "abcd" at rank 1
#     self.assertTask("build column for rank", ["abcd", "1"], scheduler[4])
#     self.assertTask("build column for rank", ["abcd", "2"], scheduler[5])
#     # Don't move it because it's closer to current location
#     # No more gaps at rank 2, non gaps at rank 1, no more ranks
#     self.assertEqual(6, len(scheduler))
#     expected = """\
# +---+------+------+------+------+------+
# | A | abcd | - | 0123 | - | efgh |
# | B | abcd | 0xxx | 012x | 01xx | efgh |
# +---+------+------+------+------+------+"""
#     self.assertEqual(expected, alignment_table)

def test_near_matching_clash(self):
    # If the previous rank has a vertex with more than one witness, where at least
    # one witness is a candidate for being moved, don't move it if any of the
    # witnesses has a node at the new rank.
    #
    # If there were only A and B, we'd move cce away from bbb to align with cce.
    # Witness C should prevent this.
    self.maxDiff = None
    collation = Collation()
    for siglum, text in (
        ("A", "aaa bbb ccc ddd"),
        ("B", "aaa cce ddd"),
        ("C", "aaa cce ccc ddd"),
    ):
        collation.add_plain_witness(siglum, text)
    rendered = str(collate(collation, near_match=True, segmentation=False))
    # NOTE(review): cell padding inside the table looks collapsed in this copy
    # (rows are narrower than the border) - confirm against real collate() output.
    wanted = """\
+---+-----+-----+-----+-----+
| A | aaa | bbb | ccc | ddd |
| B | aaa | cce | - | ddd |
| C | aaa | cce | ccc | ddd |
+---+-----+-----+-----+-----+"""
    self.assertEqual(wanted, rendered)
def test_near_matching(self):
    # collate() is called with near_match=True and no segmentation argument;
    # the test expects that call to raise SegmentationError.
    witnesses = Collation()
    witnesses.add_plain_witness(
        "A", "I bought this glass , because it matches those dinner plates")
    witnesses.add_plain_witness("B", "I bought those glasses")
    # The call happens inside the context manager so assertRaises() can
    # observe the exception.
    with self.assertRaises(SegmentationError):
        collate(witnesses, near_match=True)
def test_near_matching_three_witnesses(self):
    # Checks, task by task, what the near-matching pass records on the
    # scheduler for a three-witness collation, then checks the rendered table.
    self.maxDiff = None
    scheduler = Scheduler()
    collation = Collation()
    collation.add_plain_witness("A", "abcd 012345 efgh")
    collation.add_plain_witness(
        "B", "abcd 0xxxxx 01xxxx 01234x 012xxx 0123xx efgh")
    collation.add_plain_witness("C", "abcd 01xxxx zz23xx efgh")
    alignment_table = str(
        collate(collation, near_match=True, segmentation=False, scheduler=scheduler))
    # Find the rightmost rank with a gap (rank 6); this is activeRank
    # Find the first witness (alphabetically by siglum) with a gap at that rank (A)
    # Get the first token to the left of the gap for the first gappy witness ("012345" in A at rank 2)
    #   and check whether to move it
    # Calculate strength of match for all columns from current rank (2) through activeRank (6), inclusive
    self.assertTask("build column for rank", ["012345", "2"], scheduler[0])
    self.assertTask("build column for rank", ["012345", "3"], scheduler[1])
    self.assertTask("build column for rank", ["012345", "4"], scheduler[2])
    self.assertTask("build column for rank", ["012345", "5"], scheduler[3])
    self.assertTask("build column for rank", ["012345", "6"], scheduler[4])
    # The best (max()) fit of "012345" among all ranks between current rank 2 and activeRank 6
    #   is at rank 4, so move "012345" from current rank 2 to rank 4
    self.assertTask("move node from prior rank to rank with best match", ["012345", "2", "4"], scheduler[5])
    # Find next (alphabetically) witness with a gap at activeRank (still 6), which is witness C
    # Get the first token to the left of the gap ("zz23xx" in C at rank 4)
    #   and check whether to move it
    # Calculate strength of match for all columns from current rank (4) through activeRank (6), inclusive
    self.assertTask("build column for rank", ["zz23xx", "4"], scheduler[6])
    self.assertTask("build column for rank", ["zz23xx", "5"], scheduler[7])
    self.assertTask("build column for rank", ["zz23xx", "6"], scheduler[8])
    # The best (max()) fit of "zz23xx" among all ranks between current rank 4 and activeRank 6
    #   is at rank 6, so move "zz23xx" from current rank 4 to rank 6
    self.assertTask("move node from prior rank to rank with best match", ["zz23xx", "4", "6"], scheduler[9])
    # No more gaps at rank 6, so advance to rank 5, which has gaps in witnesses A and C
    # First gap (alphabetically by siglum) at rank 5 is in witness A, where left node is "012345" at rank 4
    self.assertTask("build column for rank", ["012345", "4"], scheduler[10])
    self.assertTask("build column for rank", ["012345", "5"], scheduler[11])
    # Match is closest at current rank 4, so don't move the node
    # Next gap at rank 5 is in witness C, where left node is "01xxxx" at rank 3
    self.assertTask("build column for rank", ["01xxxx", "3"], scheduler[12])
    self.assertTask("build column for rank", ["01xxxx", "4"], scheduler[13])
    self.assertTask("build column for rank", ["01xxxx", "5"], scheduler[14])
    # Exact match at current rank 3, so don't move it
    # No more gaps at rank 5, so advance to rank 4, which has a gap in witness C,
    #   where left node is "01xxxx" at rank 3
    self.assertTask("build column for rank", ["01xxxx", "3"], scheduler[15])
    self.assertTask("build column for rank", ["01xxxx", "4"], scheduler[16])
    # Exact match at rank 3, so don't move it
    # No more gaps at rank 4, so advance to rank 3, where only gap is in witness A, with "abcd" at rank 1
    self.assertTask("build column for rank", ["abcd", "1"], scheduler[17])
    self.assertTask("build column for rank", ["abcd", "2"], scheduler[18])
    self.assertTask("build column for rank", ["abcd", "3"], scheduler[19])
    # Exact match at rank 1, so don't move it
    # No more gaps at rank 3, so advance to rank 2, with gaps in witnesses A and C and "abcd" at rank 1
    # Check witness A first
    self.assertTask("build column for rank", ["abcd", "1"], scheduler[20])
    self.assertTask("build column for rank", ["abcd", "2"], scheduler[21])
    # Exact match at rank 1, so don't move it
    # Check witness C
    self.assertTask("build column for rank", ["abcd", "1"], scheduler[22])
    self.assertTask("build column for rank", ["abcd", "2"], scheduler[23])
    # Exact match at rank 1, so don't move it
    # No more gaps at rank 2, no gaps at rank 1
    # assertEqual, not the deprecated assertEquals alias (removed in Python 3.12).
    self.assertEqual(24, len(scheduler))
    expected = """\
+---+------+--------+--------+--------+--------+--------+------+
| A | abcd | -      | -      | 012345 | -      | -      | efgh |
| B | abcd | 0xxxxx | 01xxxx | 01234x | 012xxx | 0123xx | efgh |
| C | abcd | -      | 01xxxx | -      | -      | zz23xx | efgh |
+---+------+--------+--------+--------+--------+--------+------+"""
    self.assertEqual(expected, alignment_table)