Example #1
0
 def test_witness_ranges_hermans_case(self):
     """Check the token-array range reported for each of two witnesses.

     The W1 text has 15 space-separated items and the W2 text has 13; the
     expected ranges are 0-14 and 16-28, so position 15 belongs to neither
     witness (presumably a separator entry added by TokenIndex -- confirm
     against TokenIndex.prepare).
     """
     witness_texts = (
         ("W1", "a b c d F g h i ! K ! q r s t"),
         ("W2", "a b c d F g h i ! q r s t"),
     )
     expected_ranges = (("W1", "0-14"), ("W2", "16-28"))

     collation = Collation()
     for sigil, text in witness_texts:
         collation.add_plain_witness(sigil, text)

     index = TokenIndex(collation.witnesses)
     index.prepare()

     # Checked in witness order; the first failing comparison aborts the test.
     for sigil, span in expected_ranges:
         self.assertEqual(RangeSet(span), index.get_range_for_witness(sigil))
Example #2
0
 def test_witness_ranges_hermans_case(self):
     """Check the token-array range reported for each of two witnesses.

     The W1 text has 15 space-separated items and the W2 text has 13; the
     expected ranges are 0-14 and 16-28, so position 15 belongs to neither
     witness (presumably a separator entry added by TokenIndex -- confirm
     against TokenIndex.prepare).
     """
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     token_index = TokenIndex(collation.witnesses)
     token_index.prepare()
     # assertEqual rather than the assertEquals alias: the alias is
     # deprecated and no longer exists in unittest from Python 3.12 on.
     self.assertEqual(RangeSet("0-14"),
                      token_index.get_range_for_witness("W1"))
     self.assertEqual(RangeSet("16-28"),
                      token_index.get_range_for_witness("W2"))
Example #3
0
class EditGraphAligner(CollationAlgorithm):
    def __init__(self,
                 collation,
                 near_match=False,
                 debug_scores=False,
                 detect_transpositions=False,
                 properties_filter=None):
        """Set up an aligner for the witnesses of ``collation``.

        :param collation: object whose ``witnesses`` are indexed here and
            aligned by ``collate``.
        :param near_match: accepted for callers, but not read anywhere in
            the visible part of this class.
        :param debug_scores: stored as ``self.debug_scores``.
        :param detect_transpositions: when true, ``collate`` runs a
            ``TranspositionDetection`` pass after merging each witness.
        :param properties_filter: handed to every ``MatchCube`` that
            ``collate`` builds.
        """
        # Collaborators, constructed in the same order as before.
        self.scorer = Scorer()
        self.token_index = TokenIndex(collation.witnesses)

        # Caller-supplied configuration.
        self.collation = collation
        self.properties_filter = properties_filter
        self.detect_transpositions = detect_transpositions
        self.debug_scores = debug_scores

        # Working state, filled in while collating.
        self.token_position_to_vertex = {}  # token-array position -> vertex
        self.vertex_array = []  # resized to the token array by collate()
        self.cells = [[]]  # score table, rebuilt for every witness
        self.added_witness = []
        self.omitted_base = []

    def collate(self, graph):
        """Align every witness of the collation into ``graph``.

        The first witness is merged as-is. Every later witness is scored
        against the current graph (``fill_needleman_wunsch_table``), its
        matches are extracted (``align_matching_tokens``) and passed to
        ``merge``, and the position -> vertex bookkeeping is refreshed
        before the next witness is handled.

        :type graph: VariantGraph
        """
        # The token array only exists once the index has been prepared.
        self.token_index.prepare()
        self.vertex_array = [None] * len(self.token_index.token_array)

        # Seed the graph with the first witness.
        base_witness = self.collation.witnesses[0]
        tokens = base_witness.tokens()
        token_to_vertex = self.merge(graph, base_witness.sigil, tokens)
        self.update_token_position_to_vertex(token_to_vertex)
        self.update_token_to_vertex_array(tokens, base_witness,
                                          self.token_position_to_vertex)

        # Align witnesses 2..n against the growing graph.
        for witness_number in range(1, len(self.collation.witnesses)):
            witness = self.collation.witnesses[witness_number]
            tokens = witness.tokens()

            # Horizontal axis: the distinct ranks currently in the graph.
            ranking = VariantGraphRanking.of(graph)
            distinct_ranks = list(
                {ranking.byVertex.get(vertex) for vertex in graph.vertices()})
            # One entry is dropped so that the end vertex gets no column
            # (the start vertex keeps its rank). Within this class only the
            # length of this list is consumed.
            distinct_ranks.pop()

            # Vertical axis: one index per token plus a leading 0.
            token_indices = self.as_index_list(tokens)

            match_cube = MatchCube(self.token_index, witness,
                                   self.vertex_array, ranking,
                                   self.properties_filter)
            self.fill_needleman_wunsch_table(distinct_ranks, token_indices,
                                             match_cube)
            aligned = self.align_matching_tokens(match_cube)

            # Merge the witness into the graph, reusing aligned vertices.
            new_vertices = self.merge(graph, witness.sigil, witness.tokens(),
                                      aligned)
            token_to_vertex.update(new_vertices)
            self.update_token_position_to_vertex(token_to_vertex, aligned)

            # Restrict the global position map to this witness' range.
            witness_positions = {
                position: self.token_position_to_vertex[position]
                for position in self.token_index.get_range_for_witness(
                    witness.sigil)
            }
            self.update_token_to_vertex_array(tokens, witness,
                                              witness_positions)

            if self.detect_transpositions:
                TranspositionDetection(self).detect()

    @staticmethod
    def as_index_list(tokens):
        tokens_as_index_list = [0]
        counter = 1
        for t in tokens:
            tokens_as_index_list.append(counter)
            counter += 1
        return tokens_as_index_list

    def fill_needleman_wunsch_table(self, variant_graph_ranks,
                                    tokens_as_index_list, match_cube):
        """Rebuild ``self.cells`` as a score table for one witness.

        ``self.cells[y][x]`` has one row per entry of
        ``tokens_as_index_list`` and one column per entry of
        ``variant_graph_ranks``; only the lengths of both lists are used.

        :param variant_graph_ranks: sized sequence defining the table width.
        :param tokens_as_index_list: sized sequence defining the table height.
        :param match_cube: passed to ``Scorer`` and to
            ``calculate_from_upper``.
        """
        width = len(variant_graph_ranks)
        height = len(tokens_as_index_list)
        self.cells = [[None] * width for _ in range(height)]
        scorer = Scorer(match_cube)

        # Origin cell.
        self.cells[0][0] = Score(ScoreType.empty, 0, 0, None, 0)

        # Top row: every cell is a gap chained onto its left neighbour.
        top_row = self.cells[0]
        for x in range(1, width):
            top_row[x] = scorer.gap(x, 0, top_row[x - 1])

        # Left column: every cell is a gap chained onto the cell above it.
        for y in range(1, height):
            self.cells[y][0] = scorer.gap(0, y, self.cells[y - 1][0])

        _debug_cells(self.cells)

        # Interior, row by row: keep the best of the three predecessors.
        # On equal global_score the earlier candidate wins, in the order
        # diagonal, left, upper.
        for y in range(1, height):
            above = y - 1
            for x in range(1, width):
                left = x - 1
                diagonal_score = scorer.score(x, y, self.cells[above][left])
                left_score = scorer.gap(x, y, self.cells[y][left])
                upper_score = self.calculate_from_upper(
                    scorer, y, x, above, match_cube)
                candidates = (diagonal_score, left_score, upper_score)
                self.cells[y][x] = max(candidates,
                                       key=lambda s: s.global_score)

    def calculate_from_upper(self, scorer, y, x, previous_y, match_cube):
        upper_is_match = match_cube.has_match(previous_y - 1, x - 1)
        if upper_is_match:
            return scorer.score(x, y, self.cells[previous_y][x])
        else:
            return scorer.gap(x, y, self.cells[previous_y][x])

    def align_matching_tokens(self, cube):
        """Collect the token -> vertex pairs for the matches in ``self.cells``.

        ``ScoreIterator`` yields scores from the filled table; only those of
        type ``ScoreType.match`` are considered. Each vertex is used at most
        once: the first match that reaches a vertex wins.

        Additions, omissions and replacements are not recorded here yet;
        transposition detection will eventually want them too.

        :param cube: match cube queried with ``get_match(y - 1, x - 1)``.
        :return: dict mapping matched tokens to their vertices.
        """
        token_to_vertex = {}
        claimed_vertices = []
        for cell in ScoreIterator(self.cells):
            if cell.type != ScoreType.match:
                continue
            # Table coordinates are shifted by one relative to the cube.
            hit = cube.get_match(cell.y - 1, cell.x - 1)
            vertex = hit.vertex
            if vertex in claimed_vertices:
                continue
            token_to_vertex[hit.token] = vertex
            claimed_vertices.append(vertex)
        return token_to_vertex

    def update_token_to_vertex_array(self, tokens, witness,
                                     witness_token_position_to_vertex):
        # we need to update the token -> vertex map
        # that information is stored in protected map
        # print("> witness_token_position_to_vertex =", witness_token_position_to_vertex)
        # t = list(witness_token_to_vertex)[0]
        # #print("> list(witness_token_to_vertex)[0] =", t)
        # #print("> t.token_string =", t.token_string)
        # #print("> t.token_data =", t.token_data)
        # print("> witness_token_position_to_vertex =", witness_token_position_to_vertex)
        for token_position in self.token_index.get_range_for_witness(
                witness.sigil):
            # print("> token_position =", token_position)
            vertex = witness_token_position_to_vertex[token_position]
            self.vertex_array[token_position] = vertex

    def update_token_position_to_vertex(self, token_to_vertex, aligned={}):
        for token in token_to_vertex:
            # print("> token =", token)
            position = token.token_data['_token_array_position']
            # print("> position =", position)
            self.token_position_to_vertex[position] = token_to_vertex[token]
        for token in aligned:
            # print("> token =", token)
            position = token.token_data['_token_array_position']
            # print("> position =", position)
            self.token_position_to_vertex[position] = aligned[token]