class DecisionTreeTest(TestCase):
    """Tests for the decision tree that is built on top of the token index."""

    def setUp(self):
        # we need to create witnesses
        # 1: a, b, c, d, e
        # 2: a, e, c, d
        # 3: a, d, b
        a = Witness({'id': 'A', 'content': "a b c d e"})
        b = Witness({'id': 'B', 'content': "a e c d"})
        c = Witness({'id': 'C', 'content': "a d b"})
        self.witnesses = [a, b, c]
        self.tokenindex = TokenIndex(self.witnesses)
        self.tokenindex.prepare()

    def test_maximum_score(self):
        # from the token index we need to calculate the maximum amount of matches
        lcp_intervals = self.tokenindex.split_lcp_array_into_intervals()
        possible_matches = calculate_maximum(lcp_intervals)
        # assertEqual: assertEquals is a deprecated alias (removed in Python 3.12)
        self.assertEqual(12, possible_matches)

    def test_decision_tree(self):
        tree = DecisionTree(self.witnesses)
        root = tree.root
        self.assertEqual((0, 0, 0), root.coordinates)
        # we need three scores: (current score), (minimum global score, maximum global score)
        self.assertEqual(0, root.current_score)
        self.assertEqual(0, root.minimum_global_score)
        self.assertEqual(12, root.maximum_global_score)
def __init__(self, witnesses):
    """Build a prepared token index for *witnesses* and create the root node.

    The root decision node is seeded with the global maximum score, i.e.
    the theoretical maximum number of matches derived from the LCP intervals.
    """
    self.witnesses = witnesses
    self.tokenindex = TokenIndex(witnesses)
    self.tokenindex.prepare()
    self.lcp_intervals = self.tokenindex.split_lcp_array_into_intervals()
    maximum_matches = calculate_maximum(self.lcp_intervals)
    self.root = DecisionNode(maximum_matches)
class DecisionTreeTest(TestCase):
    """Tests for the decision tree that is built on top of the token index."""

    def setUp(self):
        # we need to create witnesses
        # 1: a, b, c, d, e
        # 2: a, e, c, d
        # 3: a, d, b
        a = Witness({'id': 'A', 'content': "a b c d e"})
        b = Witness({'id': 'B', 'content': "a e c d"})
        c = Witness({'id': 'C', 'content': "a d b"})
        self.witnesses = [a, b, c]
        self.tokenindex = TokenIndex(self.witnesses)
        self.tokenindex.prepare()

    def test_maximum_score(self):
        # from the token index we need to calculate the maximum amount of matches
        lcp_intervals = self.tokenindex.split_lcp_array_into_intervals()
        possible_matches = calculate_maximum(lcp_intervals)
        # assertEqual: assertEquals is a deprecated alias (removed in Python 3.12)
        self.assertEqual(12, possible_matches)

    def test_decision_tree(self):
        tree = DecisionTree(self.witnesses)
        root = tree.root
        self.assertEqual((0, 0, 0), root.coordinates)
        # we need three scores: (current score), (minimum global score, maximum global score)
        self.assertEqual(0, root.current_score)
        self.assertEqual(0, root.minimum_global_score)
        self.assertEqual(12, root.maximum_global_score)
def testTokenArrayMarkersWithThreeWitnesses(self):
    """The token array separates witnesses with $-markers ($0, $1, ...)."""
    witness_data = (("W1", "interesting nice huh"),
                    ("W2", "very nice right"),
                    ("W3", "especially interesting"))
    collation = Collation()
    for sigil, content in witness_data:
        collation.add_plain_witness(sigil, content)
    index = TokenIndex(collation.witnesses)
    index.prepare()
    self.assertTokenArray("interesting nice huh $0 very nice right $1 especially interesting", index)
def test_witness_ranges_hermans_case(self):
    """Each witness occupies its own range of positions in the token array."""
    collation = Collation()
    for sigil, content in (("W1", "a b c d F g h i ! K ! q r s t"),
                           ("W2", "a b c d F g h i ! q r s t")):
        collation.add_plain_witness(sigil, content)
    index = TokenIndex(collation.witnesses)
    index.prepare()
    # position 15 is the witness-separator marker, hence the gap
    self.assertEqual(RangeSet("0-14"), index.get_range_for_witness("W1"))
    self.assertEqual(RangeSet("16-28"), index.get_range_for_witness("W2"))
def test_token_array_hermans_case(self):
    """Witness tokens are concatenated into one array, $-separated."""
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    index = TokenIndex(collation.witnesses)
    index.prepare()
    # $ is meant to separate witnesses here
    expected = "a b c d F g h i ! K ! q r s t $0 a b c d F g h i ! q r s t"
    self.assertTokenArray(expected, index)
def test_lcp_intervals_number_of_witnesses_Hermans_case(self):
    """The '! q r s t' interval is shared by all three witnesses."""
    collation = Collation()
    for sigil, content in (("W1", "a b c d F g h i ! K ! q r s t"),
                           ("W2", "a b c d F g h i ! q r s t"),
                           ("W3", "a b c d E g h i ! q r s t")):
        collation.add_plain_witness(sigil, content)
    index = TokenIndex(collation.witnesses)
    index.prepare()
    potential_block = index.split_lcp_array_into_intervals()[1]  # ! q r s t
    self.assertEqual(3, potential_block.get_depth())
def test_lcp_intervals_number_of_witnesses_Hermans_case(self):
    """The '! q r s t' interval counts all three witnesses."""
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    collation.add_plain_witness("W3", "a b c d E g h i ! q r s t")
    index = TokenIndex(collation.witnesses)
    index.prepare()
    intervals = index.split_lcp_array_into_intervals()
    # second interval corresponds to: ! q r s t
    self.assertEqual(3, intervals[1].number_of_witnesses)
def test_token_array_hermans_case(self):
    """Two witnesses collapse into one $-separated token array."""
    collation = Collation()
    for sigil, content in (("W1", "a b c d F g h i ! K ! q r s t"),
                           ("W2", "a b c d F g h i ! q r s t")):
        collation.add_plain_witness(sigil, content)
    token_index = TokenIndex(collation.witnesses)
    token_index.prepare()
    # $ is meant to separate witnesses here
    self.assertTokenArray(
        "a b c d F g h i ! K ! q r s t $0 a b c d F g h i ! q r s t",
        token_index)
def testTokenArrayMarkersWithThreeWitnesses(self):
    """$0 and $1 markers must be inserted between the three witnesses."""
    collation = Collation()
    collation.add_plain_witness("W1", "interesting nice huh")
    collation.add_plain_witness("W2", "very nice right")
    collation.add_plain_witness("W3", "especially interesting")
    index = TokenIndex(collation.witnesses)
    index.prepare()
    expected = "interesting nice huh $0 very nice right $1 especially interesting"
    self.assertTokenArray(expected, index)
class DecisionTree(object):
    """Decision tree over the alignment search space.

    Builds a prepared token index over the witnesses and seeds the root
    decision node with the global maximum score (the theoretical maximum
    number of matches derivable from the LCP intervals).
    """

    def __init__(self, witnesses):
        self.witnesses = witnesses
        self.tokenindex = TokenIndex(witnesses)
        self.tokenindex.prepare()
        self.lcp_intervals = self.tokenindex.split_lcp_array_into_intervals()
        global_maximum_score = calculate_maximum(self.lcp_intervals)
        self.root = DecisionNode(global_maximum_score)
        # removed a redundant trailing `pass`: it is dead code after a
        # non-empty method body
def testCaseDanielStoekl(self):
    """The LCP array is deterministic even though the suffix array may vary."""
    collation = Collation()
    for sigil, content in (("W1", "a b c d e"),
                           ("W2", "a e c d"),
                           ("W3", "a d b")):
        collation.add_plain_witness(sigil, content)
    index = TokenIndex(collation.witnesses)
    index.prepare()
    # Note: the suffix array can have multiple forms;
    # the outcome of sorting is not guaranteed.
    # However the LCP array is fixed, so we can assert on it.
    expected_lcp = array('i', [0, 0, 0, 1, 1, 0, 1, 0, 2, 0, 1, 1, 0, 1])
    self.assertEqual(expected_lcp, index.get_lcp_array())
def test_filter_potential_blocks(self):
    """The 'a' interval spans both witnesses with three occurrences."""
    collation = Collation()
    collation.add_plain_witness("W1", "a a")
    collation.add_plain_witness("w2", "a")
    index = TokenIndex(collation.witnesses)
    index.prepare()
    # expectations:
    # there is one interval with length 1, 3 occurrences, in 2 witnesses
    a_interval = index.split_lcp_array_into_intervals()[0]  # a
    self.assertEqual(2, a_interval.number_of_witnesses)
    self.assertEqual(1, a_interval.length)
    self.assertEqual(3, a_interval.number_of_occurrences)
def testCaseDanielStoekl(self):
    """The LCP array for the Daniel Stoekl example has one fixed form."""
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d e")
    collation.add_plain_witness("W2", "a e c d")
    collation.add_plain_witness("W3", "a d b")
    token_index = TokenIndex(collation.witnesses)
    token_index.prepare()
    # Note: the suffix array can have multiple forms
    # outcome of sorting is not guaranteed
    # however the LCP array is fixed so we can assert on it
    # assertEqual: assertEquals is a deprecated alias (removed in Python 3.12)
    self.assertEqual(
        array('i', [0, 0, 0, 1, 1, 0, 1, 0, 2, 0, 1, 1, 0, 1]),
        token_index.get_lcp_array())
def setUp(self):
    """Create three witnesses and a prepared token index.

    Witness contents:
      A: a b c d e
      B: a e c d
      C: a d b
    """
    witness_data = [('A', "a b c d e"), ('B', "a e c d"), ('C', "a d b")]
    self.witnesses = [Witness({'id': wid, 'content': content})
                      for wid, content in witness_data]
    self.tokenindex = TokenIndex(self.witnesses)
    self.tokenindex.prepare()
def test_filter_potential_blocks(self):
    """The 'a' block: depth 2 (both witnesses), length 1, 3 instances."""
    collation = Collation()
    collation.add_plain_witness("W1", "a a")
    collation.add_plain_witness("w2", "a")
    index = TokenIndex(collation.witnesses)
    index.prepare()
    # expectations:
    # one interval with length 1, 3 occurrences, seen in 2 witnesses
    a_block = index.split_lcp_array_into_intervals()[0]  # a
    self.assertEqual(2, a_block.get_depth())
    self.assertEqual(1, a_block.length)
    self.assertEqual(3, len(a_block.get_all_instances()))
def __init__(self, collation, near_match=False, debug_scores=False, detect_transpositions=False, properties_filter=None):
    """Set up the aligner for *collation*.

    :param collation: the collation whose witnesses will be aligned
    :param near_match: passed through to the Scorer (fuzzy matching)
    :param debug_scores: when True, callers may dump score tables
    :param detect_transpositions: enables transposition detection after alignment
    :param properties_filter: passed through to the Scorer to filter matches
    """
    self.collation = collation
    self.debug_scores = debug_scores
    self.detect_transpositions = detect_transpositions
    # the scorer is built on top of the (not yet prepared) token index
    self.token_index = TokenIndex(collation.witnesses)
    self.scorer = Scorer(self.token_index, near_match=near_match, properties_filter=properties_filter)
    # the alignment strategy used by collate()
    self.align_function = self._align_table
def testCaseDanielStoeklLCPIntervals(self):
    """Check all five LCP intervals for the Daniel Stoekl example."""
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d e")
    collation.add_plain_witness("W2", "a e c d")
    collation.add_plain_witness("W3", "a d b")
    token_index = TokenIndex(collation.witnesses)
    token_index.prepare()
    blocks = token_index.split_lcp_array_into_intervals()
    self.assertLCP_Interval(2, 1, 3, 3, blocks[0])   # a
    self.assertLCP_Interval(5, 1, 2, 2, blocks[1])   # b
    self.assertLCP_Interval(7, 2, 2, 2, blocks[2])   # c d
    self.assertLCP_Interval(9, 1, 3, 3, blocks[3])   # d
    self.assertLCP_Interval(12, 1, 2, 2, blocks[4])  # e
    # assertEqual: assertEquals is a deprecated alias (removed in Python 3.12)
    self.assertEqual(5, len(blocks))
def testCaseDanielStoeklLCPIntervals(self):
    """Exactly five LCP intervals are produced for the Daniel Stoekl example."""
    collation = Collation()
    for sigil, content in (("W1", "a b c d e"),
                           ("W2", "a e c d"),
                           ("W3", "a d b")):
        collation.add_plain_witness(sigil, content)
    index = TokenIndex(collation.witnesses)
    index.prepare()
    blocks = index.split_lcp_array_into_intervals()
    expected = ((2, 1, 3, 3),   # a
                (5, 1, 2, 2),   # b
                (7, 2, 2, 2),   # c d
                (9, 1, 3, 3),   # d
                (12, 1, 2, 2))  # e
    for args, block in zip(expected, blocks):
        self.assertLCP_Interval(*args, block)
    self.assertEqual(5, len(blocks))
def test_non_overlapping_blocks_overlap_case(self):
    """Overlapping repetitions collapse into one non-overlapping block."""
    collation = Collation()
    collation.add_plain_witness("W1", "in the in the bleach")
    collation.add_plain_witness("W2", "in the in the bleach in the")
    scorer = Scorer(TokenIndex.create_token_index(collation))
    blocks = scorer._get_non_overlapping_repeating_blocks()
    # in the in the bleach
    self.assertIn(Block(RangeSet("0-4, 6-10")), blocks)
def test_split_lcp_intervals_descending_LCP(self):
    """A descending LCP step (20 -> 4) yields exactly two intervals."""
    lcp_array = array('i', [0, 20, 20, 20, 4])
    sa_array = array('i', [0, 1, 2, 3, 4])  # FAKED!
    intervals = TokenIndex.for_test(sa_array, lcp_array).split_lcp_array_into_intervals()
    self.assertIntervalIn(0, 20, 4, intervals)
    self.assertIntervalIn(0, 4, 5, intervals)
    self.assertEqual(2, len(intervals), "More items: " + str(intervals))
def __init__(self, collation, near_match=False, debug_scores=False, detect_transpositions=False, properties_filter=None):
    """Set up the aligner state for *collation*.

    :param collation: the collation whose witnesses will be aligned
    :param near_match: accepted for interface compatibility; not stored here
    :param debug_scores: when True, callers may dump score tables
    :param detect_transpositions: enables transposition detection after alignment
    :param properties_filter: stored and later passed to the MatchCube
    """
    self.scorer = Scorer()
    self.collation = collation
    self.debug_scores = debug_scores
    self.detect_transpositions = detect_transpositions
    self.properties_filter = properties_filter
    self.token_index = TokenIndex(collation.witnesses)
    # token array position -> variant graph vertex
    self.token_position_to_vertex = {}
    # bookkeeping for additions/omissions collected during alignment
    self.added_witness = []
    self.omitted_base = []
    self.vertex_array = []
    # the score table; (re)built for every aligned witness
    self.cells = [[]]
def test_non_overlapping_blocks_Hermans(self):
    """Two repeating blocks are found in the two-witness Hermans case."""
    collation = Collation()
    for sigil, content in (("W1", "a b c d F g h i ! K ! q r s t"),
                           ("W2", "a b c d F g h i ! q r s t")):
        collation.add_plain_witness(sigil, content)
    scorer = Scorer(TokenIndex.create_token_index(collation))
    blocks = scorer._get_non_overlapping_repeating_blocks()
    self.assertIn(Block(RangeSet("0-8, 16-24")), blocks)    # a b c d F g h i !
    self.assertIn(Block(RangeSet("11-14, 25-28")), blocks)  # q r s t
def test_split_lcp_intervals_descending_LCP(self):
    """A drop in the LCP profile must close one interval and open another."""
    lcp = array('i', [0, 20, 20, 20, 4])
    sa = array('i', [0, 1, 2, 3, 4])  # FAKED!
    token_index = TokenIndex.for_test(sa, lcp)
    split_intervals = token_index.split_lcp_array_into_intervals()
    for args in ((0, 20, 4), (0, 4, 5)):
        self.assertIntervalIn(*args, split_intervals)
    self.assertEqual(2, len(split_intervals), "More items: " + str(split_intervals))
def test_split_lcp_intervals_ascending_then_descending_LCP(self):
    """An ascending-then-descending LCP profile yields five intervals."""
    lcp_array = array('i', [0, 10, 149, 93, 7, 1])
    sa_array = array('i', [0, 1, 2, 3, 4, 5])  # FAKED!
    token_index = TokenIndex.for_test(sa_array, lcp_array)
    split_intervals = token_index.split_lcp_array_into_intervals()
    for args in ((0, 10, 4), (1, 149, 2), (1, 93, 3), (0, 7, 5), (0, 1, 6)):
        self.assertIntervalIn(*args, split_intervals)
    self.assertEqual(5, len(split_intervals), "More items: " + str(split_intervals))
def test_blocks_Hermans_case_three_witnesses(self):
    """Four repeating blocks are found across the three witnesses."""
    collation = Collation()
    for sigil, content in (("W1", "a b c d F g h i ! K ! q r s t"),
                           ("W2", "a b c d F g h i ! q r s t"),
                           ("W3", "a b c d E g h i ! q r s t")):
        collation.add_plain_witness(sigil, content)
    scorer = Scorer(TokenIndex.create_token_index(collation))
    blocks = scorer._get_non_overlapping_repeating_blocks()
    for ranges in ("0-3, 16-19, 30-33",    # a b c d
                   "5-7, 21-23, 35-37",    # g h i
                   "10-14, 24-28, 38-42",  # ! q r s t
                   "4, 20"):               # F
        self.assertIn(Block(RangeSet(ranges)), blocks)
def test_split_lcp_intervals_ascending_then_descending_LCP(self):
    """LCP profile 10,149,93,7,1 must produce exactly five intervals."""
    lcp = array('i', [0, 10, 149, 93, 7, 1])
    sa = array('i', [0, 1, 2, 3, 4, 5])  # FAKED!
    intervals = TokenIndex.for_test(sa, lcp).split_lcp_array_into_intervals()
    self.assertIntervalIn(0, 10, 4, intervals)
    self.assertIntervalIn(1, 149, 2, intervals)
    self.assertIntervalIn(1, 93, 3, intervals)
    self.assertIntervalIn(0, 7, 5, intervals)
    self.assertIntervalIn(0, 1, 6, intervals)
    self.assertEqual(5, len(intervals), "More items: " + str(intervals))
def test_split_lcp_intervals_ascending_descending_ascending(self):
    """A wavy LCP profile produces seven (partly nested) intervals."""
    lcp = array('i', [0, 4, 143, 87, 1, 1, 12, 93, 93, 37])
    sa = array('i', range(10))  # FAKED!
    intervals = TokenIndex.for_test(sa, lcp).split_lcp_array_into_intervals()
    for args in ((1, 143, 2), (1, 87, 3), (0, 4, 4), (6, 93, 3),
                 (0, 1, 10), (5, 12, 5), (6, 37, 4)):
        self.assertIntervalIn(*args, intervals)
def setUp(self):
    """Build three witnesses (A, B, C) and a prepared token index."""
    # witness contents:
    # A: a, b, c, d, e
    # B: a, e, c, d
    # C: a, d, b
    witness_a = Witness({'id': 'A', 'content': "a b c d e"})
    witness_b = Witness({'id': 'B', 'content': "a e c d"})
    witness_c = Witness({'id': 'C', 'content': "a d b"})
    self.witnesses = [witness_a, witness_b, witness_c]
    self.tokenindex = TokenIndex(self.witnesses)
    self.tokenindex.prepare()
def test_witness_ranges_hermans_case(self):
    """Each witness occupies a separate range of positions in the token array."""
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    token_index = TokenIndex(collation.witnesses)
    token_index.prepare()
    # assertEqual: assertEquals is a deprecated alias (removed in Python 3.12)
    self.assertEqual(RangeSet("0-14"), token_index.get_range_for_witness("W1"))
    self.assertEqual(RangeSet("16-28"), token_index.get_range_for_witness("W2"))
class EditGraphAligner(CollationAlgorithm):
    """Aligner that matches each witness against the variant graph using a
    Needleman-Wunsch style table driven by a MatchCube of potential matches."""

    def __init__(self, collation, near_match=False, debug_scores=False, detect_transpositions=False, properties_filter=None):
        self.scorer = Scorer()
        self.collation = collation
        self.debug_scores = debug_scores
        self.detect_transpositions = detect_transpositions
        self.properties_filter = properties_filter
        self.token_index = TokenIndex(collation.witnesses)
        # token array position -> variant graph vertex
        self.token_position_to_vertex = {}
        self.added_witness = []
        self.omitted_base = []
        self.vertex_array = []
        # the score table; (re)built for every aligned witness
        self.cells = [[]]

    def collate(self, graph):
        """Collate all witnesses of the collation into the variant graph.

        :type graph: VariantGraph
        """
        # prepare the token index
        self.token_index.prepare()
        self.vertex_array = [None] * len(self.token_index.token_array)
        # Build the variant graph for the first witness:
        # this is easy, generate a vertex for every token
        first_witness = self.collation.witnesses[0]
        tokens = first_witness.tokens()
        token_to_vertex = self.merge(graph, first_witness.sigil, tokens)
        self.update_token_position_to_vertex(token_to_vertex)
        self.update_token_to_vertex_array(tokens, first_witness, self.token_position_to_vertex)
        # align witness 2 - n
        for x in range(1, len(self.collation.witnesses)):
            witness = self.collation.witnesses[x]
            tokens = witness.tokens()
            variant_graph_ranking = VariantGraphRanking.of(graph)
            variant_graph_ranks = list(set(map(lambda v: variant_graph_ranking.byVertex.get(v), graph.vertices())))
            # we leave in the rank of the start vertex, but remove the rank of the end vertex
            variant_graph_ranks.pop()
            # now the vertical stuff
            tokens_as_index_list = self.as_index_list(tokens)
            match_cube = MatchCube(self.token_index, witness, self.vertex_array, variant_graph_ranking, self.properties_filter)
            self.fill_needleman_wunsch_table(variant_graph_ranks, tokens_as_index_list, match_cube)
            aligned = self.align_matching_tokens(match_cube)
            # merge
            witness_token_to_generated_vertex = self.merge(graph, witness.sigil, witness.tokens(), aligned)
            token_to_vertex.update(witness_token_to_generated_vertex)
            self.update_token_position_to_vertex(token_to_vertex, aligned)
            # restrict the position -> vertex map to this witness's range
            witness_token_position_to_vertex = {}
            for p in self.token_index.get_range_for_witness(witness.sigil):
                witness_token_position_to_vertex[p] = self.token_position_to_vertex[p]
            self.update_token_to_vertex_array(tokens, witness, witness_token_position_to_vertex)
        if self.detect_transpositions:
            detector = TranspositionDetection(self)
            detector.detect()

    @staticmethod
    def as_index_list(tokens):
        """Return [0, 1, ..., len(tokens)]: token indices with a leading 0 gap slot.

        Replaces a manual counter loop with the equivalent comprehension;
        works for any iterable of tokens.
        """
        return [0] + [i + 1 for i, _ in enumerate(tokens)]

    def fill_needleman_wunsch_table(self, variant_graph_ranks, tokens_as_index_list, match_cube):
        """Fill self.cells (rows = witness token indices, cols = graph ranks)."""
        self.cells = [[None for _ in range(len(variant_graph_ranks))] for _ in range(len(tokens_as_index_list))]
        scorer = Scorer(match_cube)
        # init 0,0
        self.cells[0][0] = Score(ScoreType.empty, 0, 0, None, 0)
        # fill the first row with gaps
        for x in range(1, len(variant_graph_ranks)):
            self.cells[0][x] = scorer.gap(x, 0, self.cells[0][x - 1])
        # fill the first column with gaps
        for y in range(1, len(tokens_as_index_list)):
            self.cells[y][0] = scorer.gap(0, y, self.cells[y - 1][0])
        _debug_cells(self.cells)
        # fill the rest of the cells in a y by x fashion
        for y in range(1, len(tokens_as_index_list)):
            for x in range(1, len(variant_graph_ranks)):
                previous_y = y - 1
                previous_x = x - 1
                from_upper_left = scorer.score(x, y, self.cells[previous_y][previous_x])
                from_left = scorer.gap(x, y, self.cells[y][previous_x])
                from_upper = self.calculate_from_upper(scorer, y, x, previous_y, match_cube)
                self.cells[y][x] = max(from_upper_left, from_left, from_upper, key=lambda s: s.global_score)

    def calculate_from_upper(self, scorer, y, x, previous_y, match_cube):
        """Score arriving from the cell above: a match-score if the upper cell
        is a match in the cube, otherwise a gap."""
        if match_cube.has_match(previous_y - 1, x - 1):
            return scorer.score(x, y, self.cells[previous_y][x])
        return scorer.gap(x, y, self.cells[previous_y][x])

    def align_matching_tokens(self, cube):
        """Walk the scored cells and collect token -> vertex pairs for match cells.

        Each vertex is aligned at most once; later matches to an already
        matched vertex are skipped.
        """
        aligned = {}
        matched_vertices = []
        for score in ScoreIterator(self.cells):
            if score.type == ScoreType.match:
                rank = score.x - 1
                match = cube.get_match(score.y - 1, rank)
                if match.vertex not in matched_vertices:
                    aligned[match.token] = match.vertex
                    matched_vertices.append(match.vertex)
        return aligned

    def update_token_to_vertex_array(self, tokens, witness, witness_token_position_to_vertex):
        """Copy this witness's position -> vertex mapping into the flat vertex array."""
        for token_position in self.token_index.get_range_for_witness(witness.sigil):
            self.vertex_array[token_position] = witness_token_position_to_vertex[token_position]

    def update_token_position_to_vertex(self, token_to_vertex, aligned=None):
        """Record the token-array position of every token in the given maps.

        :param aligned: optional extra token -> vertex map; defaults to empty.
            (Fixed: the original used the mutable default argument ``{}``,
            a Python anti-pattern.)
        """
        if aligned is None:
            aligned = {}
        for token in token_to_vertex:
            position = token.token_data['_token_array_position']
            self.token_position_to_vertex[position] = token_to_vertex[token]
        for token in aligned:
            position = token.token_data['_token_array_position']
            self.token_position_to_vertex[position] = aligned[token]
class EditGraphAligner(CollationAlgorithm):
    """Aligns each witness against a growing superbase using an edit-graph
    (Needleman-Wunsch style) table that is filled diagonally."""

    def __init__(self, collation, near_match=False, debug_scores=False, detect_transpositions=False, properties_filter=None):
        self.collation = collation
        self.debug_scores = debug_scores
        self.detect_transpositions = detect_transpositions
        # the scorer is built on top of the (not yet prepared) token index
        self.token_index = TokenIndex(collation.witnesses)
        self.scorer = Scorer(self.token_index, near_match=near_match, properties_filter=properties_filter)
        self.align_function = self._align_table
        # tokens gathered during traceback (additions from the witness,
        # omissions from the superbase)
        self.added_witness = []
        self.omitted_base = []

    def collate(self, graph, collation):
        """Collate all witnesses into the variant graph.

        :type graph: VariantGraph
        :type collation: Collation
        """
        # prepare the token index
        self.token_index.prepare()
        # Build the variant graph for the first witness
        # this is easy: generate a vertex for every token
        first_witness = collation.witnesses[0]
        tokens = first_witness.tokens()
        token_to_vertex = self.merge(graph, first_witness.sigil, tokens)
        # let the scorer prepare the first witness
        self.scorer.prepare_witness(first_witness)
        # construct superbase
        superbase = tokens
        # align witness 2 - n
        for x in range(1, len(collation.witnesses)):
            next_witness = collation.witnesses[x]
            # let the scorer prepare the next witness
            self.scorer.prepare_witness(next_witness)
            # alignment = token -> vertex
            alignment = self.align_function(superbase, next_witness, token_to_vertex)
            # merge
            token_to_vertex.update(self.merge(graph, next_witness.sigil, next_witness.tokens(), alignment))
            # change superbase
            superbase = self.new_superbase
        if self.detect_transpositions:
            detector = TranspositionDetection(self)
            detector.detect()
        if self.debug_scores:
            self._debug_edit_graph_table(self.table)

    def _align_table(self, superbase, witness, token_to_vertex):
        """Fill the edit-graph table for (superbase x witness) and trace back
        from the lower-right corner; returns a witness-token -> vertex map."""
        if not superbase:
            raise Exception("Superbase is empty!")
        self.tokens_witness_a = superbase
        self.tokens_witness_b = witness.tokens()
        self.length_witness_a = len(self.tokens_witness_a)
        self.length_witness_b = len(self.tokens_witness_b)
        self.table = [[EditGraphNode() for _ in range(self.length_witness_a + 1)] for _ in range(self.length_witness_b + 1)]
        # per diagonal calculate the score (taking into account the three surrounding nodes)
        self.traverse_diagonally()
        alignment = {}
        self.additions = []
        self.omissions = []
        self.new_superbase = []
        # start lower right cell
        x = self.length_witness_a
        y = self.length_witness_b
        # work our way to the upper left
        while x > 0 and y > 0:
            cell = self.table[y][x]
            self._process_cell(token_to_vertex, self.tokens_witness_a, self.tokens_witness_b, alignment, x, y)
            # examine neighbor nodes
            nodes_to_examine = set()
            nodes_to_examine.add(self.table[y][x - 1])
            nodes_to_examine.add(self.table[y - 1][x])
            nodes_to_examine.add(self.table[y - 1][x - 1])
            # calculate the maximum scoring parent node
            # NOTE(review): the lambda parameter shadows the outer loop variable `x`
            parent_node = max(nodes_to_examine, key=lambda x: x.g)
            # move position
            if self.table[y - 1][x - 1] == parent_node:
                # another match or replacement
                if not cell.match:
                    # replacement: record both sides
                    self.omitted_base.insert(0, self.tokens_witness_a[x - 1])
                    self.added_witness.insert(0, self.tokens_witness_b[y - 1])
                y -= 1
                x -= 1
            else:
                if self.table[y - 1][x] == parent_node:
                    # addition?
                    self.added_witness.insert(0, self.tokens_witness_b[y - 1])
                    y -= 1
                else:
                    if self.table[y][x - 1] == parent_node:
                        # omission?
                        self.omitted_base.insert(0, self.tokens_witness_a[x - 1])
                        x -= 1
        # process additions/omissions at the beginning of the superbase/witness
        if x > 0:
            self.omitted_base = self.tokens_witness_a[0:x] + self.omitted_base
        if y > 0:
            self.added_witness = self.tokens_witness_b[0:y] + self.added_witness
        self.add_to_superbase()
        return alignment

    def add_to_superbase(self):
        """Flush pending additions/omissions onto the front of the new superbase."""
        if self.omitted_base or self.added_witness:
            # update superbase with additions, omissions
            self.new_superbase = self.added_witness + self.new_superbase
            self.new_superbase = self.omitted_base + self.new_superbase
            self.added_witness = []
            self.omitted_base = []

    def _process_cell(self, token_to_vertex, witness_a, witness_b, alignment, x, y):
        """If the current cell is a match, record the alignment and extend the superbase."""
        cell = self.table[y][x]
        if cell.match:
            # process segments
            self.add_to_superbase()
            # process alignment
            token = witness_a[x - 1]
            token2 = witness_b[y - 1]
            vertex = token_to_vertex[token]
            alignment[token2] = vertex
            self.new_superbase.insert(0, token)
        return cell

    # This function traverses the table diagonally and scores each cell.
    # Original function from Mark Byers; translated from C into Python.
    def traverse_diagonally(self):
        m = self.length_witness_b + 1
        n = self.length_witness_a + 1
        for _slice in range(0, m + n - 1, 1):
            z1 = 0 if _slice < n else _slice - n + 1
            z2 = 0 if _slice < m else _slice - m + 1
            j = _slice - z2
            while j >= z1:
                x = _slice - j
                y = j
                self.score_cell(y, x)
                j -= 1

    def score_cell(self, y, x):
        """Score one cell from its already-scored left/upper/upper-left neighbors."""
        # initialize root node score to zero (no edit operations have
        # been performed)
        if y == 0 and x == 0:
            self.table[y][x].g = 0
            return
        # examine neighbor nodes
        nodes_to_examine = set()
        # fetch existing score from the left node if possible
        if x > 0:
            nodes_to_examine.add(self.table[y][x - 1])
        if y > 0:
            nodes_to_examine.add(self.table[y - 1][x])
        if x > 0 and y > 0:
            nodes_to_examine.add(self.table[y - 1][x - 1])
        # calculate the maximum scoring parent node
        parent_node = max(nodes_to_examine, key=lambda x: x.g)
        # 0 = diagonal move; 1 = horizontal/vertical move
        if parent_node == self.table[y - 1][x - 1]:
            edit_operation = 0
        else:
            edit_operation = 1
        token_a = self.tokens_witness_a[x - 1]
        token_b = self.tokens_witness_b[y - 1]
        self.scorer.score_cell(self.table[y][x], parent_node, token_a, token_b, y, x, edit_operation)

    def _debug_edit_graph_table(self, table):
        """Pretty-print the score table; returns the PrettyTable for inspection."""
        # print the table horizontal
        x = PrettyTable()
        x.header = False
        for y in range(0, len(table)):
            cells = table[y]
            x.add_row(cells)
        # alignment can only be set after the field names are known.
        # since add_row sets the field names, it has to be set after x.add_row(cells)
        x.align = "l"
        print(x)
        return x
class EditGraphAligner(CollationAlgorithm):
    """Superbase-based aligner: every witness is aligned against the current
    superbase via an edit-graph table filled along its anti-diagonals."""

    def __init__(self, collation, near_match=False, debug_scores=False, detect_transpositions=False, properties_filter=None):
        self.collation = collation
        self.debug_scores = debug_scores
        self.detect_transpositions = detect_transpositions
        # scorer wraps the (not yet prepared) token index
        self.token_index = TokenIndex(collation.witnesses)
        self.scorer = Scorer(self.token_index, near_match=near_match, properties_filter=properties_filter)
        self.align_function = self._align_table
        # traceback bookkeeping: witness additions / superbase omissions
        self.added_witness = []
        self.omitted_base = []

    def collate(self, graph, collation):
        """Collate all witnesses into the variant graph.

        :type graph: VariantGraph
        :type collation: Collation
        """
        # prepare the token index
        self.token_index.prepare()
        # Build the variant graph for the first witness
        # this is easy: generate a vertex for every token
        first_witness = collation.witnesses[0]
        tokens = first_witness.tokens()
        token_to_vertex = self.merge(graph, first_witness.sigil, tokens)
        # let the scorer prepare the first witness
        self.scorer.prepare_witness(first_witness)
        # construct superbase
        superbase = tokens
        # align witness 2 - n
        for x in range(1, len(collation.witnesses)):
            next_witness = collation.witnesses[x]
            # let the scorer prepare the next witness
            self.scorer.prepare_witness(next_witness)
            # alignment = token -> vertex
            alignment = self.align_function(superbase, next_witness, token_to_vertex)
            # merge
            token_to_vertex.update(self.merge(graph, next_witness.sigil, next_witness.tokens(), alignment))
            # change superbase
            superbase = self.new_superbase
        if self.detect_transpositions:
            detector = TranspositionDetection(self)
            detector.detect()
        if self.debug_scores:
            self._debug_edit_graph_table(self.table)

    def _align_table(self, superbase, witness, token_to_vertex):
        """Build and score the edit-graph table for (superbase x witness),
        then trace back from the lower-right corner collecting the alignment."""
        if not superbase:
            raise Exception("Superbase is empty!")
        self.tokens_witness_a = superbase
        self.tokens_witness_b = witness.tokens()
        self.length_witness_a = len(self.tokens_witness_a)
        self.length_witness_b = len(self.tokens_witness_b)
        self.table = [[EditGraphNode() for _ in range(self.length_witness_a+1)] for _ in range(self.length_witness_b+1)]
        # per diagonal calculate the score (taking into account the three surrounding nodes)
        self.traverse_diagonally()
        alignment = {}
        self.additions = []
        self.omissions = []
        self.new_superbase = []
        # start lower right cell
        x = self.length_witness_a
        y = self.length_witness_b
        # work our way to the upper left
        while x > 0 and y > 0:
            cell = self.table[y][x]
            self._process_cell(token_to_vertex, self.tokens_witness_a, self.tokens_witness_b, alignment, x, y)
            # examine neighbor nodes
            nodes_to_examine = set()
            nodes_to_examine.add(self.table[y][x-1])
            nodes_to_examine.add(self.table[y-1][x])
            nodes_to_examine.add(self.table[y-1][x-1])
            # calculate the maximum scoring parent node
            # NOTE(review): the lambda parameter shadows the outer loop variable `x`
            parent_node = max(nodes_to_examine, key=lambda x: x.g)
            # move position
            if self.table[y-1][x-1] == parent_node:
                # another match or replacement
                if not cell.match:
                    # replacement: record both sides
                    self.omitted_base.insert(0, self.tokens_witness_a[x-1])
                    self.added_witness.insert(0, self.tokens_witness_b[y-1])
                y -= 1
                x -= 1
            else:
                if self.table[y-1][x] == parent_node:
                    # addition?
                    self.added_witness.insert(0, self.tokens_witness_b[y - 1])
                    y -= 1
                else:
                    if self.table[y][x-1] == parent_node:
                        # omission?
                        self.omitted_base.insert(0, self.tokens_witness_a[x - 1])
                        x -= 1
        # process additions/omissions at the beginning of the superbase/witness
        if x > 0:
            self.omitted_base = self.tokens_witness_a[0:x] + self.omitted_base
        if y > 0:
            self.added_witness = self.tokens_witness_b[0:y] + self.added_witness
        self.add_to_superbase()
        return alignment

    def add_to_superbase(self):
        """Flush pending additions/omissions onto the front of the new superbase."""
        if self.omitted_base or self.added_witness:
            # update superbase with additions, omissions
            self.new_superbase = self.added_witness + self.new_superbase
            self.new_superbase = self.omitted_base + self.new_superbase
            self.added_witness = []
            self.omitted_base = []

    def _process_cell(self, token_to_vertex, witness_a, witness_b, alignment, x, y):
        """If the current cell is a match, record the alignment and extend the superbase."""
        cell = self.table[y][x]
        if cell.match:
            # process segments
            self.add_to_superbase()
            # process alignment
            token = witness_a[x-1]
            token2 = witness_b[y-1]
            vertex = token_to_vertex[token]
            alignment[token2] = vertex
            self.new_superbase.insert(0, token)
        return cell

    # This function traverses the table diagonally and scores each cell.
    # Original function from Mark Byers; translated from C into Python.
    def traverse_diagonally(self):
        m = self.length_witness_b+1
        n = self.length_witness_a+1
        for _slice in range(0, m + n - 1, 1):
            z1 = 0 if _slice < n else _slice - n + 1
            z2 = 0 if _slice < m else _slice - m + 1
            j = _slice - z2
            while j >= z1:
                x = _slice - j
                y = j
                self.score_cell(y, x)
                j -= 1

    def score_cell(self, y, x):
        """Score one cell from its already-scored left/upper/upper-left neighbors."""
        # initialize root node score to zero (no edit operations have
        # been performed)
        if y == 0 and x == 0:
            self.table[y][x].g = 0
            return
        # examine neighbor nodes
        nodes_to_examine = set()
        # fetch existing score from the left node if possible
        if x > 0:
            nodes_to_examine.add(self.table[y][x-1])
        if y > 0:
            nodes_to_examine.add(self.table[y-1][x])
        if x > 0 and y > 0:
            nodes_to_examine.add(self.table[y-1][x-1])
        # calculate the maximum scoring parent node
        parent_node = max(nodes_to_examine, key=lambda x: x.g)
        # 0 = diagonal move; 1 = horizontal/vertical move
        if parent_node == self.table[y-1][x-1]:
            edit_operation = 0
        else:
            edit_operation = 1
        token_a = self.tokens_witness_a[x-1]
        token_b = self.tokens_witness_b[y-1]
        self.scorer.score_cell(self.table[y][x], parent_node, token_a, token_b, y, x, edit_operation)

    def _debug_edit_graph_table(self, table):
        """Pretty-print the score table; returns the PrettyTable for inspection."""
        # print the table horizontal
        x = PrettyTable()
        x.header = False
        for y in range(0, len(table)):
            cells = table[y]
            x.add_row(cells)
        # alignment can only be set after the field names are known.
        # since add_row sets the field names, it has to be set after x.add_row(cells)
        x.align = "l"
        print(x)
        return x