def test_non_overlapping_blocks_overlap_case(self):
    """The repeated phrase overlaps itself; the longest repeat must still be found whole."""
    collation = Collation()
    collation.add_plain_witness("W1", "in the in the bleach")
    collation.add_plain_witness("W2", "in the in the bleach in the")
    found_blocks = Scorer(collation)._get_non_overlapping_repeating_blocks()
    # "in the in the bleach" occupies token ranges 0-4 and 7-11
    self.assertIn(Block(RangeSet("0-4, 7-11")), found_blocks)
def test_non_overlapping_blocks_black_cat(self):
    """Two identical witnesses yield exactly one repeating block."""
    collation = Collation()
    collation.add_plain_witness("W1", "the black cat")
    collation.add_plain_witness("W2", "the black cat")
    scorer = Scorer(collation)
    expected_block = Block(RangeSet("0-2, 4-6"))
    self.assertEqual([expected_block], scorer._get_non_overlapping_repeating_blocks())
def test_non_overlapping_blocks_overlap_case(self):
    """Overlap case, token-index API: the full repeated phrase is one block."""
    collation = Collation()
    collation.add_plain_witness("W1", "in the in the bleach")
    collation.add_plain_witness("W2", "in the in the bleach in the")
    scorer = Scorer(TokenIndex.create_token_index(collation))
    result = scorer._get_non_overlapping_repeating_blocks()
    # in the in the bleach
    self.assertIn(Block(RangeSet("0-4, 6-10")), result)
def test_non_overlapping_blocks_Hermans(self):
    """Hermans case, token-index API: two non-overlapping repeats are found."""
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    scorer = Scorer(TokenIndex.create_token_index(collation))
    found = scorer._get_non_overlapping_repeating_blocks()
    # a b c d F g h i !
    self.assertIn(Block(RangeSet("0-8, 16-24")), found)
    # q r s t
    self.assertIn(Block(RangeSet("11-14, 25-28")), found)
def test_blocks_splitting_token_case(self):
    """A token repeated outside the block ('c') must not split the 'a c b' block."""
    collation = Collation()
    collation.add_plain_witness("W1", "a c b c")
    collation.add_plain_witness("W2", "a c b")
    repeating = Scorer(collation)._get_non_overlapping_repeating_blocks()
    # a c b
    self.assertIn(Block(RangeSet("0-2, 5-7")), repeating)
def test_non_overlapping_blocks_Hermans(self):
    """Hermans case: two non-overlapping repeats across the two witnesses."""
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    found = Scorer(collation)._get_non_overlapping_repeating_blocks()
    # a b c d F g h i !
    self.assertIn(Block(RangeSet("0-8, 17-25")), found)
    # q r s t
    self.assertIn(Block(RangeSet("11-14, 26-29")), found)
def test_2(self):
    """Three witnesses: the shared phrase and the extra trailing 'in the' are both blocks."""
    collation = Collation()
    collation.add_plain_witness("W1", "in the in the bleach")
    for sigil in ("W2", "W3"):
        collation.add_plain_witness(sigil, "in the in the bleach in the")
    scorer = Scorer(TokenIndex.create_token_index(collation))
    result = scorer._get_non_overlapping_repeating_blocks()
    # in the in the bleach
    self.assertIn(Block(RangeSet("0-4, 6-10, 14-18")), result)
    # in the
    self.assertIn(Block(RangeSet("11-12, 19-20")), result)
def test_filter_potential_blocks(self):
    """Candidate blocks that do not occur in every witness are filtered out entirely."""
    collation = Collation()
    collation.add_plain_witness("W1", "a a")
    # NOTE(review): sigil "w2" is lower-case, unlike "W2" elsewhere — confirm intended
    collation.add_plain_witness("w2", "a")
    extended_suffix_array = collation.to_extended_suffix_array()
    candidates = extended_suffix_array.split_lcp_array_into_intervals()
    Scorer(collation).filter_potential_blocks(candidates)
    self.assertFalse(candidates)
def test_block_witnesses_Hermans_case_two_witnesses(self):
    """Both witnesses should expose the same two block segments.

    Fix: ``assertEquals`` is a deprecated alias (removed in Python 3.12);
    use ``assertEqual`` instead.
    """
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    algorithm = Scorer(collation)
    block_witness = algorithm._get_block_witness(collation.witnesses[0])
    self.assertEqual(["a b c d F g h i !", "q r s t"], block_witness.debug())
    block_witness = algorithm._get_block_witness(collation.witnesses[1])
    self.assertEqual(["a b c d F g h i !", "q r s t"], block_witness.debug())
def test_blocks_failing_transposition_use_case_old_algorithm(self):
    """Transposed phrases ('the cat' / 'the dog') yield exactly these three blocks, in order."""
    collation = Collation()
    collation.add_plain_witness("W1", "the cat and the dog")
    collation.add_plain_witness("W2", "the dog and the cat")
    actual = Scorer(collation)._get_non_overlapping_repeating_blocks()
    expected = [
        Block(RangeSet("0-1, 9-10")),
        Block(RangeSet("3-4, 6-7")),
        Block(RangeSet("2, 8")),
    ]
    self.assertEqual(expected, actual)
def __init__(self, collation, near_match=False, astar=False, debug_scores=False):
    """Set up the experimental aligner; alignment always uses the a* search.

    :param collation: the Collation whose witnesses will be aligned
    :param near_match: forwarded to the Scorer
    :param astar: accepted but never read here — alignment is unconditionally
        a* (NOTE(review): confirm the ignored flag is intentional; compare the
        variant that branches on it)
    :param debug_scores: flag stored for later use
    """
    self.collation = collation
    self.debug_scores = debug_scores
    self.scorer = Scorer(collation, near_match)
    print("INFO: Aligning using a* search algorithm. BETA quality.")
    self.align_function = self._align_astar
def test_blocks_Hermans_case_three_witnesses(self):
    """Hermans case with three witnesses (token-index API): four repeats expected."""
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    collation.add_plain_witness("W3", "a b c d E g h i ! q r s t")
    scorer = Scorer(TokenIndex.create_token_index(collation))
    found = scorer._get_non_overlapping_repeating_blocks()
    self.assertIn(Block(RangeSet("0-3, 16-19, 30-33")), found)    # a b c d
    self.assertIn(Block(RangeSet("5-7, 21-23, 35-37")), found)    # g h i
    self.assertIn(Block(RangeSet("10-14, 24-28, 38-42")), found)  # ! q r s t
    self.assertIn(Block(RangeSet("4, 20")), found)                # F
def test_blocks_Hermans_case_three_witnesses(self):
    """Hermans case with three witnesses: four repeats expected."""
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    collation.add_plain_witness("W3", "a b c d E g h i ! q r s t")
    found = Scorer(collation)._get_non_overlapping_repeating_blocks()
    self.assertIn(Block(RangeSet("0-3, 17-20, 32-35")), found)    # a b c d
    self.assertIn(Block(RangeSet("5-7, 22-24, 37-39")), found)    # g h i
    self.assertIn(Block(RangeSet("10-14, 25-29, 40-44")), found)  # ! q r s t
    self.assertIn(Block(RangeSet("4, 21")), found)                # F
def test_block_witnesses_Hermans_case(self):
    """Each of the three witnesses exposes its expected block segments.

    Fix: ``assertEquals`` is a deprecated alias (removed in Python 3.12);
    use ``assertEqual`` instead.
    """
    collation = Collation()
    collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
    collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
    collation.add_plain_witness("W3", "a b c d E g h i ! q r s t")
    algorithm = Scorer(collation)
    block_witness1 = algorithm._get_block_witness(collation.witnesses[0])
    self.assertEqual(["a b c d", "F", "g h i", "! q r s t"], block_witness1.debug())
    block_witness2 = algorithm._get_block_witness(collation.witnesses[1])
    self.assertEqual(["a b c d", "F", "g h i", "! q r s t"], block_witness2.debug())
    # W3 has "E" where W1/W2 have "F", so "F" is not a block for this witness
    block_witness3 = algorithm._get_block_witness(collation.witnesses[2])
    self.assertEqual(["a b c d", "g h i", "! q r s t"], block_witness3.debug())
def __init__(self, collation, near_match=False, debug_scores=False, detect_transpositions=False,
             properties_filter=None):
    """Table-based aligner built on a TokenIndex of the collation's witnesses.

    :param collation: the Collation whose witnesses will be aligned
    :param near_match: forwarded to the Scorer
    :param debug_scores: flag stored for later use
    :param detect_transpositions: flag stored for later use
    :param properties_filter: optional predicate forwarded to the Scorer
    """
    self.collation = collation
    self.debug_scores = debug_scores
    self.detect_transpositions = detect_transpositions
    # index over all witness tokens; the scorer works against this index
    self.token_index = TokenIndex(collation.witnesses)
    self.scorer = Scorer(self.token_index, near_match=near_match, properties_filter=properties_filter)
    self.align_function = self._align_table
def __init__(self, collation, near_match=False, astar=False, debug_scores=False):
    """Select the alignment strategy: edit-graph table (default) or a* search.

    :param collation: the Collation whose witnesses will be aligned
    :param near_match: forwarded to the Scorer
    :param astar: when True, use the experimental a* search instead of the table
    :param debug_scores: flag stored for later use
    """
    self.collation = collation
    self.debug_scores = debug_scores
    self.scorer = Scorer(collation, near_match)
    if not astar:
        self.align_function = self._align_table
    else:
        print("INFO: Aligning using a* search algorithm. BETA quality.")
        self.align_function = self._align_astar
class ExperimentalAstarAligner(CollationAlgorithm):
    """Aligner that merges witnesses into a variant graph via an experimental a* edit-graph search."""

    def __init__(self, collation, near_match=False, astar=False, debug_scores=False):
        # NOTE(review): the `astar` parameter is never read — this aligner
        # unconditionally uses a*; confirm intentional.
        self.collation = collation
        self.debug_scores = debug_scores
        self.scorer = Scorer(collation, near_match)
        print("INFO: Aligning using a* search algorithm. BETA quality.")
        self.align_function = self._align_astar

    def collate(self, graph, collation):
        '''
        Align all witnesses into the variant graph one at a time, growing a
        "superbase" (self.new_superbase) that accumulates everything aligned so far.

        :type graph: VariantGraph
        :type collation: Collation
        '''
        # Build the variant graph for the first witness
        # this is easy: generate a vertex for every token
        first_witness = collation.witnesses[0]
        tokens = first_witness.tokens()
        token_to_vertex = self.merge(graph, first_witness.sigil, tokens)

        # let the scorer prepare the first witness
        self.scorer.prepare_witness(first_witness)

        # construct superbase
        superbase = tokens

        # align witness 2 - n
        for x in range(1, len(collation.witnesses)):
            next_witness = collation.witnesses[x]

            # let the scorer prepare the next witness
            self.scorer.prepare_witness(next_witness)

            # # FOR VERIFICATION!
            # alignment = self._align_table(superbase, next_witness, token_to_vertex)
            # self.table2 = self.table

            # alignment = token -> vertex
            alignment = self.align_function(superbase, next_witness, token_to_vertex)

            # merge
            token_to_vertex.update(
                self.merge(graph, next_witness.sigil, next_witness.tokens(), alignment))

            # print("actual")
            # self._debug_edit_graph_table(self.table)
            # print("expected")
            # self._debug_edit_graph_table(self.table2)

            # change superbase
            superbase = self.new_superbase

        if self.debug_scores:
            self._debug_edit_graph_table(self.table)

    def _debug_edit_graph_table(self, table):
        """Print the edit-graph table row by row (debug helper); returns the PrettyTable."""
        # print the table horizontal
        x = PrettyTable()
        x.header = False
        for y in range(0, len(table)):
            cells = table[y]
            x.add_row(cells)
        # alignment can only be set after the field names are known.
        # since add_row sets the field names, it has to be set after x.add_row(cells)
        x.align = "l"
        print(x)
        return x

    # method is here for debug purposes
    # at no time in real life a complete table is needed
    def _create_heuristic_table(self, superbase, witness):
        """Precompute the a* heuristic value for every cell of the full edit-graph table."""
        self.tokens_witness_a = superbase
        self.tokens_witness_b = witness.tokens()
        self.length_witness_a = len(self.tokens_witness_a)
        self.length_witness_b = len(self.tokens_witness_b)
        aligner = AstarEditGraphAligner(self.tokens_witness_a, self.tokens_witness_b, self.scorer)
        self.table = [[AstarEditGraphNode(aligner, y, x)
                       for x in range(self.length_witness_a + 1)]
                      for y in range(self.length_witness_b + 1)]
        self.heuristic_table = [[0 for x in range(self.length_witness_a + 1)]
                                for y in range(self.length_witness_b + 1)]
        for y in range(self.length_witness_b + 1):
            for x in range(self.length_witness_a + 1):
                # TODO: I could create node dynamically
                # TODO: creating empty integer is also not really needed
                self.heuristic_table[y][x] = aligner.heuristic(self.table[y][x])

    def _align_astar(self, superbase, witness, token_to_vertex, control_table=None):
        """Run the a* search and convert the resulting path into a token->vertex alignment.

        Side effect: rebuilds self.new_superbase from matched tokens plus the
        unmatched segments between matches.
        """
        self.tokens_witness_a = superbase
        self.tokens_witness_b = witness.tokens()
        self.length_witness_a = len(self.tokens_witness_a)
        self.length_witness_b = len(self.tokens_witness_b)
        self.control_table = control_table
        aligner = AstarEditGraphAligner(self.tokens_witness_a, self.tokens_witness_b, self.scorer)
        self.table = [[AstarEditGraphNode(aligner, y, x)
                       for x in range(self.length_witness_a + 1)]
                      for y in range(self.length_witness_b + 1)]
        aligner.table = self.table
        start = self.table[0][0]
        path = aligner.search(start, self.control_table)
        self._debug_path = path
        # transform path into an alignment
        alignment = {}

        # segment stuff
        # note we traverse from left to right!
        self.last_x = 0
        self.last_y = 0
        self.new_superbase = []

        for element in path:
            # print(element.y, element.x)
            if element.match == True:
                # process segments
                self.newer_add_to_superbase(self.tokens_witness_a, self.tokens_witness_b,
                                            element.x, element.y)
                self.last_x = element.x
                self.last_y = element.y
                # process alignment
                token = self.tokens_witness_a[element.x - 1]
                token2 = self.tokens_witness_b[element.y - 1]
                vertex = token_to_vertex[token]
                alignment[token2] = vertex

                # add match to superbase
                self.new_superbase.append(token)

        # process additions/omissions in the begin of the superbase/witness
        self.newer_add_to_superbase(self.tokens_witness_a, self.tokens_witness_b,
                                    self.length_witness_a, self.length_witness_b)

        return alignment

    def newer_add_to_superbase(self, witness_a, witness_b, x, y):
        """Append the unmatched tokens between the previous match and (x, y) to the superbase."""
        if x - self.last_x - 1 > 0 or y - self.last_y - 1 > 0:
            # create new segment
            omitted_base = witness_a[self.last_x:x - 1]
            added_witness = witness_b[self.last_y:y - 1]
            self.new_superbase += omitted_base
            self.new_superbase += added_witness
class ExperimentalAstarAligner(CollationAlgorithm):
    """Aligner that merges witnesses into a variant graph via an experimental a* edit-graph search."""

    def __init__(self, collation, near_match=False, astar=False, debug_scores=False):
        # NOTE(review): the `astar` parameter is never read — this aligner
        # unconditionally uses a*; confirm intentional.
        self.collation = collation
        self.debug_scores = debug_scores
        self.scorer = Scorer(collation, near_match)
        print("INFO: Aligning using a* search algorithm. BETA quality.")
        self.align_function = self._align_astar

    def collate(self, graph, collation):
        '''
        Align all witnesses into the variant graph one at a time, growing a
        "superbase" (self.new_superbase) that accumulates everything aligned so far.

        :type graph: VariantGraph
        :type collation: Collation
        '''
        # Build the variant graph for the first witness
        # this is easy: generate a vertex for every token
        first_witness = collation.witnesses[0]
        tokens = first_witness.tokens()
        token_to_vertex = self.merge(graph, first_witness.sigil, tokens)

        # let the scorer prepare the first witness
        self.scorer.prepare_witness(first_witness)

        # construct superbase
        superbase = tokens

        # align witness 2 - n
        for x in range(1, len(collation.witnesses)):
            next_witness = collation.witnesses[x]

            # let the scorer prepare the next witness
            self.scorer.prepare_witness(next_witness)

            # # FOR VERIFICATION!
            # alignment = self._align_table(superbase, next_witness, token_to_vertex)
            # self.table2 = self.table

            # alignment = token -> vertex
            alignment = self.align_function(superbase, next_witness, token_to_vertex)

            # merge
            token_to_vertex.update(self.merge(graph, next_witness.sigil, next_witness.tokens(), alignment))

            # print("actual")
            # self._debug_edit_graph_table(self.table)
            # print("expected")
            # self._debug_edit_graph_table(self.table2)

            # change superbase
            superbase = self.new_superbase

        if self.debug_scores:
            self._debug_edit_graph_table(self.table)

    def _debug_edit_graph_table(self, table):
        """Print the edit-graph table row by row (debug helper); returns the PrettyTable."""
        # print the table horizontal
        x = PrettyTable()
        x.header = False
        for y in range(0, len(table)):
            cells = table[y]
            x.add_row(cells)
        # alignment can only be set after the field names are known.
        # since add_row sets the field names, it has to be set after x.add_row(cells)
        x.align = "l"
        print(x)
        return x

    # method is here for debug purposes
    # at no time in real life a complete table is needed
    def _create_heuristic_table(self, superbase, witness):
        """Precompute the a* heuristic value for every cell of the full edit-graph table."""
        self.tokens_witness_a = superbase
        self.tokens_witness_b = witness.tokens()
        self.length_witness_a = len(self.tokens_witness_a)
        self.length_witness_b = len(self.tokens_witness_b)
        aligner = AstarEditGraphAligner(self.tokens_witness_a, self.tokens_witness_b, self.scorer)
        self.table = [[AstarEditGraphNode(aligner, y, x) for x in range(self.length_witness_a + 1)]
                      for y in range(self.length_witness_b + 1)]
        self.heuristic_table = [[0 for x in range(self.length_witness_a + 1)]
                                for y in range(self.length_witness_b + 1)]
        for y in range(self.length_witness_b + 1):
            for x in range(self.length_witness_a + 1):
                # TODO: I could create node dynamically
                # TODO: creating empty integer is also not really needed
                self.heuristic_table[y][x] = aligner.heuristic(self.table[y][x])

    def _align_astar(self, superbase, witness, token_to_vertex, control_table=None):
        """Run the a* search and convert the resulting path into a token->vertex alignment.

        Side effect: rebuilds self.new_superbase from matched tokens plus the
        unmatched segments between matches.
        """
        self.tokens_witness_a = superbase
        self.tokens_witness_b = witness.tokens()
        self.length_witness_a = len(self.tokens_witness_a)
        self.length_witness_b = len(self.tokens_witness_b)
        self.control_table = control_table
        aligner = AstarEditGraphAligner(self.tokens_witness_a, self.tokens_witness_b, self.scorer)
        self.table = [[AstarEditGraphNode(aligner, y, x) for x in range(self.length_witness_a + 1)]
                      for y in range(self.length_witness_b + 1)]
        aligner.table = self.table
        start = self.table[0][0]
        path = aligner.search(start, self.control_table)
        self._debug_path = path
        # transform path into an alignment
        alignment = {}

        # segment stuff
        # note we traverse from left to right!
        self.last_x = 0
        self.last_y = 0
        self.new_superbase = []

        for element in path:
            # print(element.y, element.x)
            if element.match == True:
                # process segments
                self.newer_add_to_superbase(self.tokens_witness_a, self.tokens_witness_b,
                                            element.x, element.y)
                self.last_x = element.x
                self.last_y = element.y
                # process alignment
                token = self.tokens_witness_a[element.x - 1]
                token2 = self.tokens_witness_b[element.y - 1]
                vertex = token_to_vertex[token]
                alignment[token2] = vertex

                # add match to superbase
                self.new_superbase.append(token)

        # process additions/omissions in the begin of the superbase/witness
        self.newer_add_to_superbase(self.tokens_witness_a, self.tokens_witness_b,
                                    self.length_witness_a, self.length_witness_b)

        return alignment

    def newer_add_to_superbase(self, witness_a, witness_b, x, y):
        """Append the unmatched tokens between the previous match and (x, y) to the superbase."""
        if x - self.last_x - 1 > 0 or y - self.last_y - 1 > 0:
            # create new segment
            omitted_base = witness_a[self.last_x:x - 1]
            added_witness = witness_b[self.last_y:y - 1]
            self.new_superbase += omitted_base
            self.new_superbase += added_witness
class EditGraphAligner(CollationAlgorithm):
    """Aligner that scores a full edit-graph table diagonally and traces back the best path.

    Optionally (``astar=True``) uses the experimental a* search instead of the table.
    """

    def __init__(self, collation, near_match=False, astar=False, debug_scores=False):
        self.collation = collation
        self.debug_scores = debug_scores
        self.scorer = Scorer(collation, near_match)
        if not astar:
            self.align_function = self._align_table
        else:
            print("INFO: Aligning using a* search algorithm. BETA quality.")
            self.align_function = self._align_astar

    def collate(self, graph, collation):
        '''
        Align all witnesses into the variant graph one at a time, growing a
        "superbase" (self.new_superbase) that accumulates everything aligned so far.

        :type graph: VariantGraph
        :type collation: Collation
        '''
        # Build the variant graph for the first witness
        # this is easy: generate a vertex for every token
        first_witness = collation.witnesses[0]
        tokens = first_witness.tokens()
        token_to_vertex = self.merge(graph, first_witness.sigil, tokens)

        # let the scorer prepare the first witness
        self.scorer.prepare_witness(first_witness)

        # construct superbase
        superbase = tokens

        # align witness 2 - n
        for x in range(1, len(collation.witnesses)):
            next_witness = collation.witnesses[x]

            # let the scorer prepare the next witness
            self.scorer.prepare_witness(next_witness)

            # # FOR VERIFICATION!
            # alignment = self._align_table(superbase, next_witness, token_to_vertex)
            # self.table2 = self.table

            # alignment = token -> vertex
            alignment = self.align_function(superbase, next_witness, token_to_vertex)

            # merge
            token_to_vertex.update(
                self.merge(graph, next_witness.sigil, next_witness.tokens(), alignment))

            # print("actual")
            # self._debug_edit_graph_table(self.table)
            # print("expected")
            # self._debug_edit_graph_table(self.table2)

            # change superbase
            superbase = self.new_superbase

        if self.debug_scores:
            self._debug_edit_graph_table(self.table)

    def _align_astar(self, superbase, witness, token_to_vertex, control_table=None):
        """Run the a* search and convert the resulting path into a token->vertex alignment.

        Side effect: rebuilds self.new_superbase from matched tokens plus the
        unmatched segments between matches.
        """
        self.tokens_witness_a = superbase
        self.tokens_witness_b = witness.tokens()
        self.length_witness_a = len(self.tokens_witness_a)
        self.length_witness_b = len(self.tokens_witness_b)
        self.control_table = control_table
        aligner = AstarEditGraphAligner(self.tokens_witness_a, self.tokens_witness_b, self.scorer)
        self.table = [[AstarEditGraphNode(aligner, y, x)
                       for x in range(self.length_witness_a + 1)]
                      for y in range(self.length_witness_b + 1)]
        aligner.table = self.table
        start = self.table[0][0]
        path = aligner.search(start, self.control_table)
        # transform path into an alignment
        alignment = {}

        # segment stuff
        # note we traverse from left to right!
        self.last_x = 0
        self.last_y = 0
        self.new_superbase = []

        for element in path:
            # print(element.y, element.x)
            if element.match == True:
                # process segments
                self.newer_add_to_superbase(self.tokens_witness_a, self.tokens_witness_b,
                                            element.x, element.y)
                self.last_x = element.x
                self.last_y = element.y
                # process alignment
                token = self.tokens_witness_a[element.x - 1]
                token2 = self.tokens_witness_b[element.y - 1]
                vertex = token_to_vertex[token]
                alignment[token2] = vertex

                # add match to superbase
                self.new_superbase.append(token)

        # process additions/omissions in the begin of the superbase/witness
        self.newer_add_to_superbase(self.tokens_witness_a, self.tokens_witness_b,
                                    self.length_witness_a, self.length_witness_b)

        return alignment

    def newer_add_to_superbase(self, witness_a, witness_b, x, y):
        """Append the unmatched tokens between the previous match and (x, y) to the superbase."""
        if x - self.last_x - 1 > 0 or y - self.last_y - 1 > 0:
            # create new segment
            omitted_base = witness_a[self.last_x:x - 1]
            added_witness = witness_b[self.last_y:y - 1]
            self.new_superbase += omitted_base
            self.new_superbase += added_witness

    def _align_table(self, superbase, witness, token_to_vertex):
        """Score the full edit-graph table, then trace back from the lower-right cell
        to produce the token->vertex alignment and rebuild self.new_superbase."""
        self.tokens_witness_a = superbase
        self.tokens_witness_b = witness.tokens()
        self.length_witness_a = len(self.tokens_witness_a)
        self.length_witness_b = len(self.tokens_witness_b)
        self.table = [[EditGraphNode() for _ in range(self.length_witness_a + 1)]
                      for _ in range(self.length_witness_b + 1)]

        # per diagonal calculate the score (taking into account the three surrounding nodes)
        self.traverse_diagonally()

        alignment = {}
        # segment stuff
        # note we traverse from right to left!
        self.last_x = self.length_witness_a
        self.last_y = self.length_witness_b
        self.new_superbase = []

        # start lower right cell
        x = self.length_witness_a
        y = self.length_witness_b
        # work our way to the upper left
        while x > 0 and y > 0:
            self._process_cell(token_to_vertex, self.tokens_witness_a, self.tokens_witness_b, alignment, x, y)
            # examine neighbor nodes
            nodes_to_examine = set()
            nodes_to_examine.add(self.table[y][x - 1])
            nodes_to_examine.add(self.table[y - 1][x])
            nodes_to_examine.add(self.table[y - 1][x - 1])
            # calculate the maximum scoring parent node
            # NOTE(review): the lambda parameter shadows the outer `x` — harmless here, but confusing
            parent_node = max(nodes_to_examine, key=lambda x: x.g)
            # move position
            if self.table[y - 1][x - 1] == parent_node:
                # another match or replacement
                y = y - 1
                x = x - 1
            else:
                if self.table[y - 1][x] == parent_node:
                    #omission?
                    y = y - 1
                else:
                    if self.table[y][x - 1] == parent_node:
                        #addition?
                        x = x - 1
        # process additions/omissions in the begin of the superbase/witness
        self.add_to_superbase(self.tokens_witness_a, self.tokens_witness_b, 0, 0)
        return alignment

    def add_to_superbase(self, witness_a, witness_b, x, y):
        """Prepend the unmatched tokens between (x, y) and the previous match to the superbase
        (used by the right-to-left table traceback)."""
        # print self.last_x - x - 1, self.last_y - y - 1
        if self.last_x - x - 1 > 0 or self.last_y - y - 1 > 0:
            # print x, self.last_x, y, self.last_y
            # create new segment
            omitted_base = witness_a[x:self.last_x - 1]
            # print omitted_base
            added_witness = witness_b[y:self.last_y - 1]
            # print added_witness
            self.new_superbase = added_witness + self.new_superbase
            self.new_superbase = omitted_base + self.new_superbase

    def _process_cell(self, token_to_vertex, witness_a, witness_b, alignment, x, y):
        """If the cell at (y, x) is a match, record the alignment and update the superbase."""
        cell = self.table[y][x]
        # process segments
        if cell.match == True:
            self.add_to_superbase(witness_a, witness_b, x, y)
            self.last_x = x
            self.last_y = y
        # process alignment
        if cell.match == True:
            token = witness_a[x - 1]
            token2 = witness_b[y - 1]
            vertex = token_to_vertex[token]
            alignment[token2] = vertex
            # print("match")
            # print(token2)
            self.new_superbase.insert(0, token)
        return cell

    # This function traverses the table diagonally and scores each cell.
    # Original function from Mark Byers; translated from C into Python.
    def traverse_diagonally(self):
        m = self.length_witness_b + 1
        n = self.length_witness_a + 1
        for _slice in range(0, m + n - 1, 1):
            z1 = 0 if _slice < n else _slice - n + 1
            z2 = 0 if _slice < m else _slice - m + 1
            j = _slice - z2
            while j >= z1:
                x = _slice - j
                y = j
                self.score_cell(y, x)
                j -= 1

    def score_cell(self, y, x):
        """Score cell (y, x) from its best-scoring neighbor via the Scorer."""
        # initialize root node score to zero (no edit operations have
        # been performed)
        if y == 0 and x == 0:
            self.table[y][x].g = 0
            return
        # examine neighbor nodes
        nodes_to_examine = set()
        # fetch existing score from the left node if possible
        if x > 0:
            nodes_to_examine.add(self.table[y][x - 1])
        if y > 0:
            nodes_to_examine.add(self.table[y - 1][x])
        if x > 0 and y > 0:
            nodes_to_examine.add(self.table[y - 1][x - 1])
        # calculate the maximum scoring parent node
        parent_node = max(nodes_to_examine, key=lambda x: x.g)
        # edit_operation 0 = diagonal step (match/replacement), 1 = horizontal/vertical step
        if parent_node == self.table[y - 1][x - 1]:
            edit_operation = 0
        else:
            edit_operation = 1
        token_a = self.tokens_witness_a[x - 1]
        token_b = self.tokens_witness_b[y - 1]
        self.scorer.score_cell(self.table[y][x], parent_node, token_a, token_b, y, x, edit_operation)

    def _debug_edit_graph_table(self, table):
        """Print the edit-graph table row by row (debug helper); returns the PrettyTable."""
        # print the table horizontal
        x = PrettyTable()
        x.header = False
        for y in range(0, len(table)):
            cells = table[y]
            x.add_row(cells)
        # alignment can only be set after the field names are known.
        # since add_row sets the field names, it has to be set after x.add_row(cells)
        x.align = "l"
        print(x)
        return x
class EditGraphAligner(CollationAlgorithm):
    """Aligner that scores a full edit-graph table diagonally and traces back the best path.

    Optionally (``astar=True``) uses the experimental a* search instead of the table.
    """

    def __init__(self, collation, near_match=False, astar=False, debug_scores=False):
        self.collation = collation
        self.debug_scores = debug_scores
        self.scorer = Scorer(collation, near_match)
        if not astar:
            self.align_function = self._align_table
        else:
            print("INFO: Aligning using a* search algorithm. BETA quality.")
            self.align_function = self._align_astar

    def collate(self, graph, collation):
        '''
        Align all witnesses into the variant graph one at a time, growing a
        "superbase" (self.new_superbase) that accumulates everything aligned so far.

        :type graph: VariantGraph
        :type collation: Collation
        '''
        # Build the variant graph for the first witness
        # this is easy: generate a vertex for every token
        first_witness = collation.witnesses[0]
        tokens = first_witness.tokens()
        token_to_vertex = self.merge(graph, first_witness.sigil, tokens)

        # let the scorer prepare the first witness
        self.scorer.prepare_witness(first_witness)

        # construct superbase
        superbase = tokens

        # align witness 2 - n
        for x in range(1, len(collation.witnesses)):
            next_witness = collation.witnesses[x]

            # let the scorer prepare the next witness
            self.scorer.prepare_witness(next_witness)

            # # FOR VERIFICATION!
            # alignment = self._align_table(superbase, next_witness, token_to_vertex)
            # self.table2 = self.table

            # alignment = token -> vertex
            alignment = self.align_function(superbase, next_witness, token_to_vertex)

            # merge
            token_to_vertex.update(self.merge(graph, next_witness.sigil, next_witness.tokens(), alignment))

            # print("actual")
            # self._debug_edit_graph_table(self.table)
            # print("expected")
            # self._debug_edit_graph_table(self.table2)

            # change superbase
            superbase = self.new_superbase

        if self.debug_scores:
            self._debug_edit_graph_table(self.table)

    def _align_astar(self, superbase, witness, token_to_vertex, control_table=None):
        """Run the a* search and convert the resulting path into a token->vertex alignment.

        Side effect: rebuilds self.new_superbase from matched tokens plus the
        unmatched segments between matches.
        """
        self.tokens_witness_a = superbase
        self.tokens_witness_b = witness.tokens()
        self.length_witness_a = len(self.tokens_witness_a)
        self.length_witness_b = len(self.tokens_witness_b)
        self.control_table = control_table
        aligner = AstarEditGraphAligner(self.tokens_witness_a, self.tokens_witness_b, self.scorer)
        self.table = [[AstarEditGraphNode(aligner, y, x) for x in range(self.length_witness_a+1)]
                      for y in range(self.length_witness_b+1)]
        aligner.table = self.table
        start = self.table[0][0]
        path = aligner.search(start, self.control_table)
        # transform path into an alignment
        alignment = {}

        # segment stuff
        # note we traverse from left to right!
        self.last_x = 0
        self.last_y = 0
        self.new_superbase=[]

        for element in path:
            # print(element.y, element.x)
            if element.match == True:
                # process segments
                self.newer_add_to_superbase(self.tokens_witness_a, self.tokens_witness_b,
                                            element.x, element.y)
                self.last_x = element.x
                self.last_y = element.y
                # process alignment
                token = self.tokens_witness_a[element.x-1]
                token2 = self.tokens_witness_b[element.y-1]
                vertex = token_to_vertex[token]
                alignment[token2] = vertex

                # add match to superbase
                self.new_superbase.append(token)

        # process additions/omissions in the begin of the superbase/witness
        self.newer_add_to_superbase(self.tokens_witness_a, self.tokens_witness_b,
                                    self.length_witness_a, self.length_witness_b)

        return alignment

    def newer_add_to_superbase(self, witness_a, witness_b, x, y):
        """Append the unmatched tokens between the previous match and (x, y) to the superbase."""
        if x - self.last_x - 1 > 0 or y - self.last_y - 1 > 0:
            # create new segment
            omitted_base = witness_a[self.last_x:x - 1]
            added_witness = witness_b[self.last_y:y - 1]
            self.new_superbase += omitted_base
            self.new_superbase += added_witness

    def _align_table(self, superbase, witness, token_to_vertex):
        """Score the full edit-graph table, then trace back from the lower-right cell
        to produce the token->vertex alignment and rebuild self.new_superbase."""
        self.tokens_witness_a = superbase
        self.tokens_witness_b = witness.tokens()
        self.length_witness_a = len(self.tokens_witness_a)
        self.length_witness_b = len(self.tokens_witness_b)
        self.table = [[EditGraphNode() for _ in range(self.length_witness_a+1)]
                      for _ in range(self.length_witness_b+1)]

        # per diagonal calculate the score (taking into account the three surrounding nodes)
        self.traverse_diagonally()

        alignment = {}
        # segment stuff
        # note we traverse from right to left!
        self.last_x = self.length_witness_a
        self.last_y = self.length_witness_b
        self.new_superbase=[]

        # start lower right cell
        x = self.length_witness_a
        y = self.length_witness_b
        # work our way to the upper left
        while x > 0 and y > 0:
            self._process_cell(token_to_vertex, self.tokens_witness_a, self.tokens_witness_b, alignment, x, y)
            # examine neighbor nodes
            nodes_to_examine = set()
            nodes_to_examine.add(self.table[y][x-1])
            nodes_to_examine.add(self.table[y-1][x])
            nodes_to_examine.add(self.table[y-1][x-1])
            # calculate the maximum scoring parent node
            # NOTE(review): the lambda parameter shadows the outer `x` — harmless here, but confusing
            parent_node = max(nodes_to_examine, key=lambda x: x.g)
            # move position
            if self.table[y-1][x-1] == parent_node:
                # another match or replacement
                y = y -1
                x = x -1
            else:
                if self.table[y-1][x] == parent_node:
                    #omission?
                    y = y -1
                else:
                    if self.table[y][x-1] == parent_node:
                        #addition?
                        x = x -1
        # process additions/omissions in the begin of the superbase/witness
        self.add_to_superbase(self.tokens_witness_a, self.tokens_witness_b, 0, 0)
        return alignment

    def add_to_superbase(self, witness_a, witness_b, x, y):
        """Prepend the unmatched tokens between (x, y) and the previous match to the superbase
        (used by the right-to-left table traceback)."""
        # print self.last_x - x - 1, self.last_y - y - 1
        if self.last_x - x - 1 > 0 or self.last_y - y - 1 > 0:
            # print x, self.last_x, y, self.last_y
            # create new segment
            omitted_base = witness_a[x:self.last_x - 1]
            # print omitted_base
            added_witness = witness_b[y:self.last_y - 1]
            # print added_witness
            self.new_superbase = added_witness + self.new_superbase
            self.new_superbase = omitted_base + self.new_superbase

    def _process_cell(self, token_to_vertex, witness_a, witness_b, alignment, x, y):
        """If the cell at (y, x) is a match, record the alignment and update the superbase."""
        cell = self.table[y][x]
        # process segments
        if cell.match == True:
            self.add_to_superbase(witness_a, witness_b, x, y)
            self.last_x = x
            self.last_y = y
        # process alignment
        if cell.match == True:
            token = witness_a[x-1]
            token2 = witness_b[y-1]
            vertex = token_to_vertex[token]
            alignment[token2] = vertex
            # print("match")
            # print(token2)
            self.new_superbase.insert(0, token)
        return cell

    # This function traverses the table diagonally and scores each cell.
    # Original function from Mark Byers; translated from C into Python.
    def traverse_diagonally(self):
        m = self.length_witness_b+1
        n = self.length_witness_a+1
        for _slice in range(0, m + n - 1, 1):
            z1 = 0 if _slice < n else _slice - n + 1;
            z2 = 0 if _slice < m else _slice - m + 1;
            j = _slice - z2
            while j >= z1:
                x = _slice - j
                y = j
                self.score_cell(y, x)
                j -= 1

    def score_cell(self, y, x):
        """Score cell (y, x) from its best-scoring neighbor via the Scorer."""
        # initialize root node score to zero (no edit operations have
        # been performed)
        if y == 0 and x == 0:
            self.table[y][x].g = 0
            return
        # examine neighbor nodes
        nodes_to_examine = set()
        # fetch existing score from the left node if possible
        if x > 0:
            nodes_to_examine.add(self.table[y][x-1])
        if y > 0:
            nodes_to_examine.add(self.table[y-1][x])
        if x > 0 and y > 0:
            nodes_to_examine.add(self.table[y-1][x-1])
        # calculate the maximum scoring parent node
        parent_node = max(nodes_to_examine, key=lambda x: x.g)
        # edit_operation 0 = diagonal step (match/replacement), 1 = horizontal/vertical step
        if parent_node == self.table[y-1][x-1]:
            edit_operation = 0
        else:
            edit_operation = 1
        token_a = self.tokens_witness_a[x-1]
        token_b = self.tokens_witness_b[y-1]
        self.scorer.score_cell(self.table[y][x], parent_node, token_a, token_b, y, x, edit_operation)

    def _debug_edit_graph_table(self, table):
        """Print the edit-graph table row by row (debug helper); returns the PrettyTable."""
        # print the table horizontal
        x = PrettyTable()
        x.header=False
        for y in range(0, len(table)):
            cells = table[y]
            x.add_row(cells)
        # alignment can only be set after the field names are known.
        # since add_row sets the field names, it has to be set after x.add_row(cells)
        x.align="l"
        print(x)
        return x
class EditGraphAligner(CollationAlgorithm):
    """Align witnesses against a growing superbase.

    A Needleman-Wunsch style edit-graph table is filled diagonal by
    diagonal (so every cell's three neighbours are scored first) and then
    backtracked from the lower-right corner to produce the alignment and
    the next superbase.
    """

    def __init__(self, collation, near_match=False, debug_scores=False, detect_transpositions=False, properties_filter=None):
        self.collation = collation
        self.debug_scores = debug_scores
        self.detect_transpositions = detect_transpositions
        # token index and scorer are built once and reused for every witness
        self.token_index = TokenIndex(collation.witnesses)
        self.scorer = Scorer(self.token_index, near_match=near_match, properties_filter=properties_filter)
        self.align_function = self._align_table
        # segments gathered during backtracking; flushed by add_to_superbase
        self.added_witness = []
        self.omitted_base = []

    def collate(self, graph, collation):
        """Build the variant graph for all witnesses of the collation.

        :type graph: VariantGraph
        :type collation: Collation
        """
        # prepare the token index
        self.token_index.prepare()

        # The first witness is trivial: generate a vertex for every token.
        first_witness = collation.witnesses[0]
        tokens = first_witness.tokens()
        token_to_vertex = self.merge(graph, first_witness.sigil, tokens)
        # let the scorer prepare the first witness
        self.scorer.prepare_witness(first_witness)
        # the superbase starts as the first witness and grows with every
        # witness that is aligned against it
        superbase = tokens
        # align witness 2 - n
        for i in range(1, len(collation.witnesses)):
            next_witness = collation.witnesses[i]
            # let the scorer prepare the next witness
            self.scorer.prepare_witness(next_witness)
            # alignment maps witness tokens -> variant-graph vertices
            alignment = self.align_function(superbase, next_witness, token_to_vertex)
            # merge the aligned witness into the variant graph
            token_to_vertex.update(self.merge(graph, next_witness.sigil, next_witness.tokens(), alignment))
            # change superbase
            superbase = self.new_superbase
        if self.detect_transpositions:
            detector = TranspositionDetection(self)
            detector.detect()
        if self.debug_scores:
            self._debug_edit_graph_table(self.table)

    def _align_table(self, superbase, witness, token_to_vertex):
        """Score the edit-graph table for superbase vs. witness and backtrack.

        Returns the alignment dict (witness token -> vertex) and leaves the
        extended superbase in self.new_superbase.
        Raises Exception when the superbase is empty.
        """
        if not superbase:
            raise Exception("Superbase is empty!")
        self.tokens_witness_a = superbase
        self.tokens_witness_b = witness.tokens()
        self.length_witness_a = len(self.tokens_witness_a)
        self.length_witness_b = len(self.tokens_witness_b)
        self.table = [[EditGraphNode() for _ in range(self.length_witness_a + 1)]
                      for _ in range(self.length_witness_b + 1)]
        # per diagonal calculate the score (taking into account the three surrounding nodes)
        self.traverse_diagonally()

        alignment = {}
        self.additions = []
        self.omissions = []
        self.new_superbase = []
        # backtrack from the lower-right cell to the upper left
        x = self.length_witness_a
        y = self.length_witness_b
        while x > 0 and y > 0:
            cell = self.table[y][x]
            self._process_cell(token_to_vertex, self.tokens_witness_a, self.tokens_witness_b, alignment, x, y)
            # Pick the best-scoring parent. The neighbours are examined in a
            # fixed order (diagonal, above, left) so that score ties are broken
            # deterministically; the previous set-based max() picked an
            # arbitrary element among equal scores. The exhaustive
            # if/elif/else also guarantees progress on every iteration.
            diagonal = self.table[y - 1][x - 1]
            above = self.table[y - 1][x]
            left = self.table[y][x - 1]
            parent_node = max((diagonal, above, left), key=lambda node: node.g)
            if parent_node is diagonal:
                # match or replacement
                if not cell.match:
                    # replacement: remember both tokens as a variant segment
                    self.omitted_base.insert(0, self.tokens_witness_a[x - 1])
                    self.added_witness.insert(0, self.tokens_witness_b[y - 1])
                y -= 1
                x -= 1
            elif parent_node is above:
                # addition: token only present in the witness
                self.added_witness.insert(0, self.tokens_witness_b[y - 1])
                y -= 1
            else:
                # omission: token only present in the superbase
                self.omitted_base.insert(0, self.tokens_witness_a[x - 1])
                x -= 1
        # process additions/omissions at the beginning of the superbase/witness
        if x > 0:
            self.omitted_base = self.tokens_witness_a[0:x] + self.omitted_base
        if y > 0:
            self.added_witness = self.tokens_witness_b[0:y] + self.added_witness
        self.add_to_superbase()
        return alignment

    def add_to_superbase(self):
        """Flush the pending omitted/added segments to the front of the new superbase."""
        if self.omitted_base or self.added_witness:
            # update superbase with additions, omissions
            self.new_superbase = self.added_witness + self.new_superbase
            self.new_superbase = self.omitted_base + self.new_superbase
            self.added_witness = []
            self.omitted_base = []

    def _process_cell(self, token_to_vertex, witness_a, witness_b, alignment, x, y):
        """On a match cell: flush pending segments, align the witness token to
        the vertex of the base token, and prepend the base token to the new
        superbase. Returns the cell."""
        cell = self.table[y][x]
        if cell.match:
            # process segments
            self.add_to_superbase()
            # process alignment
            token = witness_a[x - 1]
            token2 = witness_b[y - 1]
            vertex = token_to_vertex[token]
            alignment[token2] = vertex
            self.new_superbase.insert(0, token)
        return cell

    # This function traverses the table diagonally and scores each cell.
    # Original function from Mark Byers; translated from C into Python.
    def traverse_diagonally(self):
        m = self.length_witness_b + 1
        n = self.length_witness_a + 1
        for _slice in range(0, m + n - 1):
            # z1/z2 clip the anti-diagonal to the table bounds
            z1 = 0 if _slice < n else _slice - n + 1
            z2 = 0 if _slice < m else _slice - m + 1
            for j in range(_slice - z2, z1 - 1, -1):
                self.score_cell(j, _slice - j)

    def score_cell(self, y, x):
        """Score table[y][x] from its best-scoring existing neighbour."""
        # root node: no edit operations have been performed yet
        if y == 0 and x == 0:
            self.table[y][x].g = 0
            return
        # Collect the existing neighbours in a fixed order (diagonal first)
        # so that score ties are broken deterministically.
        nodes_to_examine = []
        if x > 0 and y > 0:
            nodes_to_examine.append(self.table[y - 1][x - 1])
        if y > 0:
            nodes_to_examine.append(self.table[y - 1][x])
        if x > 0:
            nodes_to_examine.append(self.table[y][x - 1])
        parent_node = max(nodes_to_examine, key=lambda node: node.g)
        # 0 = diagonal step (match/replacement), 1 = gap (addition/omission).
        # Border cells (x == 0 or y == 0) have no diagonal neighbour and are
        # always gaps; the previous code compared against a wrapped [-1] index.
        if x > 0 and y > 0 and parent_node is self.table[y - 1][x - 1]:
            edit_operation = 0
        else:
            edit_operation = 1
        # NOTE(review): for border cells these indices wrap to -1 and hand the
        # *last* token to the scorer, as the original code did — confirm the
        # scorer ignores tokens for gap cells.
        token_a = self.tokens_witness_a[x - 1]
        token_b = self.tokens_witness_b[y - 1]
        self.scorer.score_cell(self.table[y][x], parent_node, token_a, token_b, y, x, edit_operation)

    def _debug_edit_graph_table(self, table):
        """Print the score table horizontally; returns the PrettyTable."""
        x = PrettyTable()
        x.header = False
        for row in table:
            x.add_row(row)
        # alignment can only be set after the field names are known;
        # add_row sets the field names, so align has to be set afterwards
        x.align = "l"
        print(x)
        return x
class EditGraphAligner(CollationAlgorithm):
    """Align witnesses against a growing superbase via an edit-graph table
    that is scored diagonal by diagonal and then backtracked."""

    def __init__(self, collation, near_match=False, debug_scores=False, detect_transpositions=False, properties_filter=None):
        self.collation = collation
        self.debug_scores = debug_scores
        self.detect_transpositions = detect_transpositions
        # token index and scorer are built once and reused for every witness
        self.token_index = TokenIndex(collation.witnesses)
        self.scorer = Scorer(self.token_index, near_match=near_match, properties_filter=properties_filter)
        self.align_function = self._align_table
        # segments gathered during backtracking; flushed by add_to_superbase
        self.added_witness = []
        self.omitted_base = []

    def collate(self, graph, collation):
        """
        :type graph: VariantGraph
        :type collation: Collation
        """
        # prepare the token index
        self.token_index.prepare()

        # Build the variant graph for the first witness
        # this is easy: generate a vertex for every token
        first_witness = collation.witnesses[0]
        tokens = first_witness.tokens()
        token_to_vertex = self.merge(graph, first_witness.sigil, tokens)
        # let the scorer prepare the first witness
        self.scorer.prepare_witness(first_witness)
        # construct superbase
        superbase = tokens
        # align witness 2 - n
        for x in range(1, len(collation.witnesses)):
            next_witness = collation.witnesses[x]
            # let the scorer prepare the next witness
            self.scorer.prepare_witness(next_witness)
            # # FOR VERIFICATION!
            # alignment = self._align_table(superbase, next_witness, token_to_vertex)
            # self.table2 = self.table
            # alignment = token -> vertex
            alignment = self.align_function(superbase, next_witness, token_to_vertex)
            # merge
            token_to_vertex.update(
                self.merge(graph, next_witness.sigil, next_witness.tokens(), alignment))
            # print("actual")
            # self._debug_edit_graph_table(self.table)
            # print("expected")
            # self._debug_edit_graph_table(self.table2)
            # change superbase
            superbase = self.new_superbase
        if self.detect_transpositions:
            detector = TranspositionDetection(self)
            detector.detect()
        if self.debug_scores:
            self._debug_edit_graph_table(self.table)

    def _align_table(self, superbase, witness, token_to_vertex):
        # Score the table for superbase vs. witness, then backtrack from the
        # lower-right corner; returns the alignment (witness token -> vertex)
        # and leaves the extended superbase in self.new_superbase.
        if not superbase:
            raise Exception("Superbase is empty!")
        # print(""+str(superbase)+":"+str(witness.tokens()))
        self.tokens_witness_a = superbase
        self.tokens_witness_b = witness.tokens()
        self.length_witness_a = len(self.tokens_witness_a)
        self.length_witness_b = len(self.tokens_witness_b)
        self.table = [[
            EditGraphNode() for _ in range(self.length_witness_a + 1)
        ] for _ in range(self.length_witness_b + 1)]

        # per diagonal calculate the score (taking into account the three surrounding nodes)
        self.traverse_diagonally()

        alignment = {}
        self.additions = []
        self.omissions = []
        self.new_superbase = []

        # start lower right cell
        x = self.length_witness_a
        y = self.length_witness_b
        # work our way to the upper left
        while x > 0 and y > 0:
            cell = self.table[y][x]
            self._process_cell(token_to_vertex, self.tokens_witness_a, self.tokens_witness_b, alignment, x, y)
            # examine neighbor nodes
            nodes_to_examine = set()
            nodes_to_examine.add(self.table[y][x - 1])
            nodes_to_examine.add(self.table[y - 1][x])
            nodes_to_examine.add(self.table[y - 1][x - 1])
            # calculate the maximum scoring parent node
            # NOTE(review): max() over a *set* picks an arbitrary element when
            # scores tie — tie-breaking is hash-order dependent; confirm
            # EditGraphNode equality semantics before relying on determinism.
            parent_node = max(nodes_to_examine, key=lambda x: x.g)
            # move position
            if self.table[y - 1][x - 1] == parent_node:
                # another match or replacement
                if not cell.match:
                    # replacement: remember both tokens as a variant segment
                    self.omitted_base.insert(0, self.tokens_witness_a[x - 1])
                    self.added_witness.insert(0, self.tokens_witness_b[y - 1])
                y -= 1
                x -= 1
            else:
                if self.table[y - 1][x] == parent_node:
                    # addition?
                    self.added_witness.insert(0, self.tokens_witness_b[y - 1])
                    y -= 1
                else:
                    if self.table[y][x - 1] == parent_node:
                        # omission?
                        self.omitted_base.insert(0, self.tokens_witness_a[x - 1])
                        x -= 1
        # process additions/omissions in the begin of the superbase/witness
        if x > 0:
            self.omitted_base = self.tokens_witness_a[0:x] + self.omitted_base
        if y > 0:
            self.added_witness = self.tokens_witness_b[0:y] + self.added_witness
        self.add_to_superbase()
        return alignment

    def add_to_superbase(self):
        # Flush the pending omitted/added segments to the front of the new
        # superbase and reset the pending lists.
        if self.omitted_base or self.added_witness:
            # update superbase with additions, omissions
            self.new_superbase = self.added_witness + self.new_superbase
            self.new_superbase = self.omitted_base + self.new_superbase
            self.added_witness = []
            self.omitted_base = []

    def _process_cell(self, token_to_vertex, witness_a, witness_b, alignment, x, y):
        # On a match cell: flush pending segments, align the witness token to
        # the base token's vertex, and prepend the base token to the new
        # superbase. Returns the cell in all cases.
        cell = self.table[y][x]
        if cell.match:
            # process segments
            self.add_to_superbase()
            # process alignment
            token = witness_a[x - 1]
            token2 = witness_b[y - 1]
            vertex = token_to_vertex[token]
            alignment[token2] = vertex
            self.new_superbase.insert(0, token)
        return cell

    # This function traverses the table diagonally and scores each cell.
    # Original function from Mark Byers; translated from C into Python.
    def traverse_diagonally(self):
        m = self.length_witness_b + 1
        n = self.length_witness_a + 1
        for _slice in range(0, m + n - 1, 1):
            # z1/z2 clip the anti-diagonal to the table bounds
            z1 = 0 if _slice < n else _slice - n + 1
            z2 = 0 if _slice < m else _slice - m + 1
            j = _slice - z2
            while j >= z1:
                x = _slice - j
                y = j
                self.score_cell(y, x)
                j -= 1

    def score_cell(self, y, x):
        # Score table[y][x] from its best-scoring existing neighbour and
        # delegate the actual scoring to self.scorer.
        # initialize root node score to zero (no edit operations have
        # been performed)
        if y == 0 and x == 0:
            self.table[y][x].g = 0
            return
        # examine neighbor nodes
        nodes_to_examine = set()
        # fetch existing score from the left node if possible
        if x > 0:
            nodes_to_examine.add(self.table[y][x - 1])
        if y > 0:
            nodes_to_examine.add(self.table[y - 1][x])
        if x > 0 and y > 0:
            nodes_to_examine.add(self.table[y - 1][x - 1])
        # calculate the maximum scoring parent node
        # NOTE(review): set-based max() is hash-order dependent on ties; the
        # lambda parameter also shadows the outer x.
        parent_node = max(nodes_to_examine, key=lambda x: x.g)
        # edit_operation 0 = diagonal step (match/replacement), 1 = gap.
        # NOTE(review): on border cells (x == 0 or y == 0) the y-1/x-1 indices
        # wrap to -1, comparing against the last row/column and fetching the
        # *last* token below — presumably harmless for gap cells; verify
        # against the scorer.
        if parent_node == self.table[y - 1][x - 1]:
            edit_operation = 0
        else:
            edit_operation = 1
        token_a = self.tokens_witness_a[x - 1]
        token_b = self.tokens_witness_b[y - 1]
        self.scorer.score_cell(self.table[y][x], parent_node, token_a, token_b, y, x, edit_operation)

    def _debug_edit_graph_table(self, table):
        # Render the score table with PrettyTable, print it, and return the
        # PrettyTable object for inspection.
        # print the table horizontal
        x = PrettyTable()
        x.header = False
        for y in range(0, len(table)):
            cells = table[y]
            x.add_row(cells)
        # alignment can only be set after the field names are known.
        # since add_row sets the field names, it has to be set after x.add_row(cells)
        x.align = "l"
        print(x)
        return x