Example #1
class DecisionTreeTest(TestCase):
    def setUp(self):
        # we need to create witnesses
        # 1: a, b, c, d, e
        # 2: a, e, c, d
        # 3: a, d, b

        a = Witness({'id': 'A', 'content': "a b c d e"})
        b = Witness({'id': 'B', 'content': "a e c d"})
        c = Witness({'id': 'C', 'content': "a d b"})

        self.witnesses = [a, b, c]
        self.tokenindex = TokenIndex(self.witnesses)
        self.tokenindex.prepare()

    def test_maximum_score(self):
        # from the token index we need to calculate the maximum number of matches
        lcp_intervals = self.tokenindex.split_lcp_array_into_intervals()
        possible_matches = calculate_maximum(lcp_intervals)

        # print(possible_matches)
        self.assertEqual(12, possible_matches)

    def test_decision_tree(self):
        tree = DecisionTree(self.witnesses)
        root = tree.root
        self.assertEqual((0, 0, 0), root.coordinates)
        # we track three scores: the current score, plus the minimum and maximum global scores
        self.assertEqual(0, root.current_score)
        self.assertEqual(0, root.minimum_global_score)
        self.assertEqual(12, root.maximum_global_score)
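A DecisionNode class is assumed by the test above but never shown in these examples. A minimal sketch reconstructed from the assertions; the attribute names and the hard-coded three-witness coordinate tuple are assumptions, not the library's actual code:

class DecisionNode(object):
    # hypothetical sketch; the real DecisionNode may differ
    def __init__(self, maximum_global_score):
        # one coordinate per witness; nothing is aligned at the root
        self.coordinates = (0, 0, 0)  # assumes three witnesses
        self.current_score = 0
        self.minimum_global_score = 0
        # upper bound on the number of matches still attainable
        self.maximum_global_score = maximum_global_score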
Example #2
 def __init__(self, witnesses):
     self.witnesses = witnesses
     self.tokenindex = TokenIndex(witnesses)
     self.tokenindex.prepare()
     self.lcp_intervals = self.tokenindex.split_lcp_array_into_intervals()
     global_maximum_score = calculate_maximum(self.lcp_intervals)
     self.root = DecisionNode(global_maximum_score)
Example #3
class DecisionTreeTest(TestCase):

    def setUp(self):
        # we need to create witnesses
        # 1: a, b, c, d, e
        # 2: a, e, c, d
        # 3: a, d, b

        a = Witness({'id':'A', 'content':"a b c d e"})
        b = Witness({'id':'B', 'content':"a e c d"})
        c = Witness({'id':'C', 'content':"a d b"})

        self.witnesses = [a, b, c]
        self.tokenindex = TokenIndex(self.witnesses)
        self.tokenindex.prepare()

    def test_maximum_score(self):
        # from the token index we need to calculate the maximum number of matches
        lcp_intervals = self.tokenindex.split_lcp_array_into_intervals()
        possible_matches = calculate_maximum(lcp_intervals)

        # print(possible_matches)
        self.assertEqual(12, possible_matches)

    def test_decision_tree(self):
        tree = DecisionTree(self.witnesses)
        root = tree.root
        self.assertEqual((0, 0, 0), root.coordinates)
        # we track three scores: the current score, plus the minimum and maximum global scores
        self.assertEqual(0, root.current_score)
        self.assertEqual(0, root.minimum_global_score)
        self.assertEqual(12, root.maximum_global_score)
Example #4
 def testTokenArrayMarkersWithThreeWitnesses(self):
     collation = Collation()
     collation.add_plain_witness("W1", "interesting nice huh")
     collation.add_plain_witness("W2", "very nice right")
     collation.add_plain_witness("W3", "especially interesting")
     token_index = TokenIndex(collation.witnesses)
     token_index.prepare()
     self.assertTokenArray("interesting nice huh $0 very nice right $1 especially interesting", token_index)
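assertTokenArray is a custom helper on the test class, not part of unittest. A plausible sketch, assuming token_index.token_array holds tokens whose str() form is the token text (including the $0/$1 witness separators):

 def assertTokenArray(self, expected, token_index):
     # rejoin the token array into a single string and compare
     self.assertEqual(expected,
                      " ".join(str(token) for token in token_index.token_array))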
Example #5
 def test_witness_ranges_hermans_case(self):
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     token_index = TokenIndex(collation.witnesses)
     token_index.prepare()
     self.assertEqual(RangeSet("0-14"), token_index.get_range_for_witness("W1"))
     self.assertEqual(RangeSet("16-28"), token_index.get_range_for_witness("W2"))
Example #6
 def test_token_array_hermans_case(self):
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     token_index = TokenIndex(collation.witnesses)
     token_index.prepare()
     # $ is meant to separate witnesses here
     self.assertTokenArray("a b c d F g h i ! K ! q r s t $0 a b c d F g h i ! q r s t", token_index)
Example #7
 def test_lcp_intervals_number_of_witnesses_Hermans_case(self):
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     collation.add_plain_witness("W3", "a b c d E g h i ! q r s t")
     token_index = TokenIndex(collation.witnesses)
     token_index.prepare()
     intervals = token_index.split_lcp_array_into_intervals()
     potential_block = intervals[1] # ! q r s t
     self.assertEqual(3, potential_block.get_depth())
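This example and Example #8 below make the same assertion through two different APIs (get_depth() versus number_of_witnesses), which suggests the following relationship on an LCP interval; this is an assumption, not confirmed by these snippets:

 def get_depth(self):
     # depth = the number of distinct witnesses the interval occurs in
     return self.number_of_witnesses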
Example #8
 def test_lcp_intervals_number_of_witnesses_Hermans_case(self):
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     collation.add_plain_witness("W3", "a b c d E g h i ! q r s t")
     token_index = TokenIndex(collation.witnesses)
     token_index.prepare()
     intervals = token_index.split_lcp_array_into_intervals()
     potential_block = intervals[1]  # ! q r s t
     self.assertEqual(3, potential_block.number_of_witnesses)
Example #9
 def test_token_array_hermans_case(self):
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     token_index = TokenIndex(collation.witnesses)
     token_index.prepare()
     # $ is meant to separate witnesses here
     self.assertTokenArray(
         "a b c d F g h i ! K ! q r s t $0 a b c d F g h i ! q r s t",
         token_index)
Example #10
 def testTokenArrayMarkersWithThreeWitnesses(self):
     collation = Collation()
     collation.add_plain_witness("W1", "interesting nice huh")
     collation.add_plain_witness("W2", "very nice right")
     collation.add_plain_witness("W3", "especially interesting")
     token_index = TokenIndex(collation.witnesses)
     token_index.prepare()
     self.assertTokenArray(
         "interesting nice huh $0 very nice right $1 especially interesting",
         token_index)
Example #11
class DecisionTree(object):

    def __init__(self, witnesses):
        self.witnesses = witnesses
        self.tokenindex = TokenIndex(witnesses)
        self.tokenindex.prepare()
        self.lcp_intervals = self.tokenindex.split_lcp_array_into_intervals()
        global_maximum_score = calculate_maximum(self.lcp_intervals)
        self.root = DecisionNode(global_maximum_score)

Example #12
 def testCaseDanielStoekl(self):
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d e")
     collation.add_plain_witness("W2", "a e c d")
     collation.add_plain_witness("W3", "a d b")
     token_index = TokenIndex(collation.witnesses)
     token_index.prepare()
     # Note: the suffix array can take multiple forms,
     # since the outcome of the sorting is not guaranteed;
     # however, the LCP array is fixed, so we can assert on it
     self.assertEqual(array('i', [0, 0, 0, 1, 1, 0, 1, 0, 2, 0, 1, 1, 0, 1]), token_index.get_lcp_array())
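For readers new to the structure: lcp[i] is the length of the longest common prefix of the suffixes at positions i-1 and i of the sorted suffix array, with lcp[0] = 0 by convention. A standalone illustration on a tiny token array (not collatex code):

def lcp_of(a, b):
    # longest common prefix length of two token sequences
    n = 0
    for x, y in zip(a, b):
        if x != y:
            break
        n += 1
    return n

tokens = "a b a".split()
suffixes = sorted(tokens[i:] for i in range(len(tokens)))
lcp = [0] + [lcp_of(suffixes[i - 1], suffixes[i])
             for i in range(1, len(suffixes))]
# sorted suffixes: ['a'], ['a', 'b', 'a'], ['b', 'a']  ->  lcp == [0, 1, 0]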
Example #13
 def test_filter_potential_blocks(self):
     collation = Collation()
     collation.add_plain_witness("W1", "a a")
     collation.add_plain_witness("W2", "a")
     token_index = TokenIndex(collation.witnesses)
     token_index.prepare()
     intervals = token_index.split_lcp_array_into_intervals()
     # expectations
     # there is one interval: length 1, 3 occurrences, in 2 witnesses
     a_interval = intervals[0]  # a
     self.assertEqual(2, a_interval.number_of_witnesses)
     self.assertEqual(1, a_interval.length)
     self.assertEqual(3, a_interval.number_of_occurrences)
Example #14
 def testCaseDanielStoekl(self):
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d e")
     collation.add_plain_witness("W2", "a e c d")
     collation.add_plain_witness("W3", "a d b")
     token_index = TokenIndex(collation.witnesses)
     token_index.prepare()
     # Note: the suffix array can take multiple forms,
     # since the outcome of the sorting is not guaranteed;
     # however, the LCP array is fixed, so we can assert on it
     self.assertEqual(
         array('i', [0, 0, 0, 1, 1, 0, 1, 0, 2, 0, 1, 1, 0, 1]),
         token_index.get_lcp_array())
Example #15
    def setUp(self):
        # we need to create witnesses
        # 1: a, b, c, d, e
        # 2: a, e, c, d
        # 3: a, d, b

        a = Witness({'id': 'A', 'content': "a b c d e"})
        b = Witness({'id': 'B', 'content': "a e c d"})
        c = Witness({'id': 'C', 'content': "a d b"})

        self.witnesses = [a, b, c]
        self.tokenindex = TokenIndex(self.witnesses)
        self.tokenindex.prepare()
Example #16
 def test_filter_potential_blocks(self):
     collation = Collation()
     collation.add_plain_witness("W1", "a a")
     collation.add_plain_witness("W2", "a")
     token_index = TokenIndex(collation.witnesses)
     token_index.prepare()
     blocks = token_index.split_lcp_array_into_intervals()
     # expectations
     # there is one interval: length 1, 3 occurrences, in 2 witnesses
     a_block = blocks[0] # a
     self.assertEqual(2, a_block.get_depth())
     self.assertEqual(1, a_block.length)
     self.assertEqual(3, len(a_block.get_all_instances()))
Example #17
 def __init__(self,
              collation,
              near_match=False,
              debug_scores=False,
              detect_transpositions=False,
              properties_filter=None):
     self.collation = collation
     self.debug_scores = debug_scores
     self.detect_transpositions = detect_transpositions
     self.token_index = TokenIndex(collation.witnesses)
     self.scorer = Scorer(self.token_index,
                          near_match=near_match,
                          properties_filter=properties_filter)
     self.align_function = self._align_table
Example #18
 def testCaseDanielStoeklLCPIntervals(self):
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d e")
     collation.add_plain_witness("W2", "a e c d")
     collation.add_plain_witness("W3", "a d b")
     token_index = TokenIndex(collation.witnesses)
     token_index.prepare()
     blocks = token_index.split_lcp_array_into_intervals()
     self.assertLCP_Interval(2, 1, 3, 3, blocks[0])  # a
     self.assertLCP_Interval(5, 1, 2, 2, blocks[1])  # b
     self.assertLCP_Interval(7, 2, 2, 2, blocks[2])  # c d
     self.assertLCP_Interval(9, 1, 3, 3, blocks[3])  # d
     self.assertLCP_Interval(12, 1, 2, 2, blocks[4])  # e
     self.assertEqual(5, len(blocks))
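assertLCP_Interval is another custom helper that the test class has to supply. Judging from the call sites, its arguments read as (start, length, depth, number of occurrences, interval). A plausible sketch; all attribute names are assumptions pieced together from the other examples in this list:

 def assertLCP_Interval(self, start, length, depth, number_of_occurrences, lcp_interval):
     self.assertEqual(start, lcp_interval.start)
     self.assertEqual(length, lcp_interval.length)
     self.assertEqual(depth, lcp_interval.get_depth())
     self.assertEqual(number_of_occurrences, lcp_interval.number_of_occurrences)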
Example #19
 def testCaseDanielStoeklLCPIntervals(self):
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d e")
     collation.add_plain_witness("W2", "a e c d")
     collation.add_plain_witness("W3", "a d b")
     token_index = TokenIndex(collation.witnesses)
     token_index.prepare()
     blocks = token_index.split_lcp_array_into_intervals()
     self.assertLCP_Interval(2, 1, 3, 3, blocks[0])  # a
     self.assertLCP_Interval(5, 1, 2, 2, blocks[1])  # b
     self.assertLCP_Interval(7, 2, 2, 2, blocks[2])  # c d
     self.assertLCP_Interval(9, 1, 3, 3, blocks[3])  # d
     self.assertLCP_Interval(12, 1, 2, 2, blocks[4])  # e
     self.assertEqual(5, len(blocks))
Example #20
 def __init__(self, collation, near_match=False, debug_scores=False, detect_transpositions=False, properties_filter=None):
     self.collation = collation
     self.debug_scores = debug_scores
     self.detect_transpositions = detect_transpositions
     self.token_index = TokenIndex(collation.witnesses)
     self.scorer = Scorer(self.token_index, near_match=near_match, properties_filter=properties_filter)
     self.align_function = self._align_table
Example #21
 def test_non_overlapping_blocks_overlap_case(self):
     collation = Collation()
     collation.add_plain_witness("W1", "in the in the bleach")
     collation.add_plain_witness("W2", "in the in the bleach in the")
     algorithm = Scorer(TokenIndex.create_token_index(collation))
     blocks = algorithm._get_non_overlapping_repeating_blocks()
     self.assertIn(Block(RangeSet("0-4, 6-10")), blocks) # in the in the bleach
Example #22
 def test_split_lcp_intervals_descending_LCP(self):
     lcp_array = array('i', [0, 20, 20, 20, 4])
     sa_array = array('i', [0, 1, 2, 3, 4]) # FAKED!
     token_index = TokenIndex.for_test(sa_array, lcp_array)
     split_intervals = token_index.split_lcp_array_into_intervals()
     self.assertIntervalIn(0, 20, 4, split_intervals)
     self.assertIntervalIn(0, 4, 5, split_intervals)
     self.assertEqual(2, len(split_intervals), "More items: "+str(split_intervals))
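assertIntervalIn, used by the split_lcp_intervals tests, also has to be supplied by the test class. A sketch consistent with the call sites, where the arguments read as (start, prefix length, number of occurrences); the attribute names are assumptions:

 def assertIntervalIn(self, start, length, number_of_occurrences, intervals):
     for interval in intervals:
         if (interval.start == start and interval.length == length
                 and interval.number_of_occurrences == number_of_occurrences):
             return
     self.fail("Interval (%d, %d, %d) not found in %s"
               % (start, length, number_of_occurrences, intervals))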
Example #23
 def test_non_overlapping_blocks_overlap_case(self):
     collation = Collation()
     collation.add_plain_witness("W1", "in the in the bleach")
     collation.add_plain_witness("W2", "in the in the bleach in the")
     algorithm = Scorer(TokenIndex.create_token_index(collation))
     blocks = algorithm._get_non_overlapping_repeating_blocks()
     self.assertIn(Block(RangeSet("0-4, 6-10")),
                   blocks)  # in the in the bleach
Example #24
 def __init__(self,
              collation,
              near_match=False,
              debug_scores=False,
              detect_transpositions=False,
              properties_filter=None):
     self.scorer = Scorer()
     self.collation = collation
     self.debug_scores = debug_scores
     self.detect_transpositions = detect_transpositions
     self.properties_filter = properties_filter
     self.token_index = TokenIndex(collation.witnesses)
     self.token_position_to_vertex = {}
     self.added_witness = []
     self.omitted_base = []
     self.vertex_array = []
     self.cells = [[]]
Example #25
 def test_non_overlapping_blocks_Hermans(self):
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     algorithm = Scorer(TokenIndex.create_token_index(collation))
     blocks = algorithm._get_non_overlapping_repeating_blocks()
     self.assertIn(Block(RangeSet("0-8, 16-24")), blocks) # a b c d F g h i !
     self.assertIn(Block(RangeSet("11-14, 25-28")), blocks) # q r s t
Example #26
 def test_non_overlapping_blocks_Hermans(self):
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     algorithm = Scorer(TokenIndex.create_token_index(collation))
     blocks = algorithm._get_non_overlapping_repeating_blocks()
     self.assertIn(Block(RangeSet("0-8, 16-24")),
                   blocks)  # a b c d F g h i !
     self.assertIn(Block(RangeSet("11-14, 25-28")), blocks)  # q r s t
Example #27
 def test_split_lcp_intervals_descending_LCP(self):
     lcp_array = array('i', [0, 20, 20, 20, 4])
     sa_array = array('i', [0, 1, 2, 3, 4])  # FAKED!
     token_index = TokenIndex.for_test(sa_array, lcp_array)
     split_intervals = token_index.split_lcp_array_into_intervals()
     self.assertIntervalIn(0, 20, 4, split_intervals)
     self.assertIntervalIn(0, 4, 5, split_intervals)
     self.assertEqual(2, len(split_intervals),
                      "More items: " + str(split_intervals))
Example #28
 def test_split_lcp_intervals_ascending_then_descending_LCP(self):
     lcp_array = array('i', [0, 10, 149, 93, 7, 1])
     sa_array = array('i', [0, 1, 2, 3, 4, 5]) # FAKED!
     token_index = TokenIndex.for_test(sa_array, lcp_array)
     split_intervals = token_index.split_lcp_array_into_intervals()
     self.assertIntervalIn(0, 10, 4, split_intervals)
     self.assertIntervalIn(1, 149, 2, split_intervals)
     self.assertIntervalIn(1, 93, 3, split_intervals)
     self.assertIntervalIn(0, 7, 5, split_intervals)
     self.assertIntervalIn(0, 1, 6, split_intervals)
     self.assertEqual(5, len(split_intervals), "More items: "+str(split_intervals))
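TokenIndex.for_test lets these tests inject a hand-built (here faked) suffix array and LCP array without tokenizing any witnesses. A plausible sketch; the constructor call and the attribute names are assumptions:

 @classmethod
 def for_test(cls, sa_array, lcp_array):
     # bypass prepare(): install the arrays directly
     token_index = cls([])
     token_index.SA = sa_array          # attribute name assumed
     token_index.lcp_array = lcp_array  # attribute name assumed
     return token_index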
Example #29
 def test_blocks_Hermans_case_three_witnesses(self):
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     collation.add_plain_witness("W3", "a b c d E g h i ! q r s t")
     algorithm = Scorer(TokenIndex.create_token_index(collation))
     blocks = algorithm._get_non_overlapping_repeating_blocks()
     self.assertIn(Block(RangeSet("0-3, 16-19, 30-33")), blocks) # a b c d
     self.assertIn(Block(RangeSet("5-7, 21-23, 35-37")), blocks) # g h i
     self.assertIn(Block(RangeSet("10-14, 24-28, 38-42")), blocks) # ! q r s t
     self.assertIn(Block(RangeSet("4, 20")), blocks) # F
Example #30
 def test_split_lcp_intervals_ascending_then_descending_LCP(self):
     lcp_array = array('i', [0, 10, 149, 93, 7, 1])
     sa_array = array('i', [0, 1, 2, 3, 4, 5])  # FAKED!
     token_index = TokenIndex.for_test(sa_array, lcp_array)
     split_intervals = token_index.split_lcp_array_into_intervals()
     self.assertIntervalIn(0, 10, 4, split_intervals)
     self.assertIntervalIn(1, 149, 2, split_intervals)
     self.assertIntervalIn(1, 93, 3, split_intervals)
     self.assertIntervalIn(0, 7, 5, split_intervals)
     self.assertIntervalIn(0, 1, 6, split_intervals)
     self.assertEqual(5, len(split_intervals),
                      "More items: " + str(split_intervals))
Example #31
 def test_split_lcp_intervals_ascending_descending_ascending(self):
     lcp_array = array('i', [0, 4, 143, 87, 1, 1, 12, 93, 93, 37])
     sa_array = array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])  # FAKED!
     token_index = TokenIndex.for_test(sa_array, lcp_array)
     split_intervals = token_index.split_lcp_array_into_intervals()
     self.assertIntervalIn(1, 143, 2, split_intervals)
     self.assertIntervalIn(1, 87, 3, split_intervals)
     self.assertIntervalIn(0, 4, 4, split_intervals)
     self.assertIntervalIn(6, 93, 3, split_intervals)
     self.assertIntervalIn(0, 1, 10, split_intervals)
     self.assertIntervalIn(5, 12, 5, split_intervals)
     self.assertIntervalIn(6, 37, 4, split_intervals)
Example #32
 def test_blocks_Hermans_case_three_witnesses(self):
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     collation.add_plain_witness("W3", "a b c d E g h i ! q r s t")
     algorithm = Scorer(TokenIndex.create_token_index(collation))
     blocks = algorithm._get_non_overlapping_repeating_blocks()
     self.assertIn(Block(RangeSet("0-3, 16-19, 30-33")), blocks)  # a b c d
     self.assertIn(Block(RangeSet("5-7, 21-23, 35-37")), blocks)  # g h i
     self.assertIn(Block(RangeSet("10-14, 24-28, 38-42")),
                   blocks)  # ! q r s t
     self.assertIn(Block(RangeSet("4, 20")), blocks)  # F
Example #33
 def test_split_lcp_intervals_ascending_descending_ascending(self):
     lcp_array = array('i', [0, 4, 143, 87, 1, 1, 12, 93, 93, 37])
     sa_array = array('i', [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])  # FAKED!
     token_index = TokenIndex.for_test(sa_array, lcp_array)
     split_intervals = token_index.split_lcp_array_into_intervals()
     self.assertIntervalIn(1, 143, 2, split_intervals)
     self.assertIntervalIn(1, 87, 3, split_intervals)
     self.assertIntervalIn(0, 4, 4, split_intervals)
     self.assertIntervalIn(6, 93, 3, split_intervals)
     self.assertIntervalIn(0, 1, 10, split_intervals)
     self.assertIntervalIn(5, 12, 5, split_intervals)
     self.assertIntervalIn(6, 37, 4, split_intervals)
Example #34
    def setUp(self):
        # we need to create witnesses
        # 1: a, b, c, d, e
        # 2: a, e, c, d
        # 3: a, d, b

        a = Witness({'id':'A', 'content':"a b c d e"})
        b = Witness({'id':'B', 'content':"a e c d"})
        c = Witness({'id':'C', 'content':"a d b"})

        self.witnesses = [a, b, c]
        self.tokenindex = TokenIndex(self.witnesses)
        self.tokenindex.prepare()
Example #35
 def test_witness_ranges_hermans_case(self):
     collation = Collation()
     collation.add_plain_witness("W1", "a b c d F g h i ! K ! q r s t")
     collation.add_plain_witness("W2", "a b c d F g h i ! q r s t")
     token_index = TokenIndex(collation.witnesses)
     token_index.prepare()
     self.assertEqual(RangeSet("0-14"),
                      token_index.get_range_for_witness("W1"))
     self.assertEqual(RangeSet("16-28"),
                      token_index.get_range_for_witness("W2"))
Example #36
class EditGraphAligner(CollationAlgorithm):
    def __init__(self,
                 collation,
                 near_match=False,
                 debug_scores=False,
                 detect_transpositions=False,
                 properties_filter=None):
        self.scorer = Scorer()
        self.collation = collation
        self.debug_scores = debug_scores
        self.detect_transpositions = detect_transpositions
        self.properties_filter = properties_filter
        self.token_index = TokenIndex(collation.witnesses)
        self.token_position_to_vertex = {}
        self.added_witness = []
        self.omitted_base = []
        self.vertex_array = []
        self.cells = [[]]

    def collate(self, graph):
        """
        :type graph: VariantGraph
        """
        # prepare the token index
        self.token_index.prepare()
        self.vertex_array = [None] * len(self.token_index.token_array)

        # Build the variant graph for the first witness
        # this is easy: generate a vertex for every token
        first_witness = self.collation.witnesses[0]
        tokens = first_witness.tokens()
        token_to_vertex = self.merge(graph, first_witness.sigil, tokens)
        # print("> token_to_vertex=", token_to_vertex)
        self.update_token_position_to_vertex(token_to_vertex)
        self.update_token_to_vertex_array(tokens, first_witness,
                                          self.token_position_to_vertex)

        # align witness 2 - n
        for x in range(1, len(self.collation.witnesses)):
            witness = self.collation.witnesses[x]
            tokens = witness.tokens()
            # print("\nwitness", witness.sigil)

            variant_graph_ranking = VariantGraphRanking.of(graph)
            # print("> x =", x, ", variant_graph_ranking =", variant_graph_ranking.byRank)
            variant_graph_ranks = list(
                set(
                    map(lambda v: variant_graph_ranking.byVertex.get(v),
                        graph.vertices())))
            # we leave in the rank of the start vertex, but remove the rank of the end vertex
            variant_graph_ranks.pop()

            # now the vertical stuff
            tokens_as_index_list = self.as_index_list(tokens)

            match_cube = MatchCube(self.token_index, witness,
                                   self.vertex_array, variant_graph_ranking,
                                   self.properties_filter)
            # print("> match_cube.matches=", match_cube.matches)
            self.fill_needleman_wunsch_table(variant_graph_ranks,
                                             tokens_as_index_list, match_cube)

            aligned = self.align_matching_tokens(match_cube)
            # print("> aligned=", aligned)
            # print("self.token_index.token_array=", self.token_index.token_array)
            # alignment = self.align_function(superbase, next_witness, token_to_vertex, match_cube)

            # merge
            witness_token_to_generated_vertex = self.merge(
                graph, witness.sigil, witness.tokens(), aligned)
            # print("> witness_token_to_generated_vertex =", witness_token_to_generated_vertex)
            token_to_vertex.update(witness_token_to_generated_vertex)
            # print("> token_to_vertex =", token_to_vertex)
            self.update_token_position_to_vertex(token_to_vertex, aligned)
            witness_token_position_to_vertex = {}
            for p in self.token_index.get_range_for_witness(witness.sigil):
                # print("> p= ", p)
                witness_token_position_to_vertex[
                    p] = self.token_position_to_vertex[p]
            self.update_token_to_vertex_array(
                tokens, witness, witness_token_position_to_vertex)
            # print("> vertex_array =", self.vertex_array)

            #             print("actual")
            #             self._debug_edit_graph_table(self.table)
            #             print("expected")
            #             self._debug_edit_graph_table(self.table2)

            # change superbase
            # superbase = self.new_superbase

            if self.detect_transpositions:
                detector = TranspositionDetection(self)
                detector.detect()

                # if self.debug_scores:
                #     self._debug_edit_graph_table(self.table)

    @staticmethod
    def as_index_list(tokens):
        # one index per token, plus a leading 0 for the empty prefix
        return list(range(len(tokens) + 1))

    def fill_needleman_wunsch_table(self, variant_graph_ranks,
                                    tokens_as_index_list, match_cube):
        self.cells = [[None for row in range(0, len(variant_graph_ranks))]
                      for col in range(0, len(tokens_as_index_list))]
        scorer = Scorer(match_cube)

        # init 0,0
        self.cells[0][0] = Score(ScoreType.empty, 0, 0, None, 0)

        # fill the first row with gaps
        for x in range(1, len(variant_graph_ranks)):
            previous_x = x - 1
            self.cells[0][x] = scorer.gap(x, 0, self.cells[0][previous_x])

        # fill the first column with gaps
        for y in range(1, len(tokens_as_index_list)):
            # print("\nself.cells.len = ", len(self.cells), " x ", len(self.cells[0]))
            # print("y=", y)
            # print("self.cells[y][0]=", self.cells[y][0])
            previous_y = y - 1
            # print("previous_y=", previous_y)
            # print("self.cells[previous_y][0]=", self.cells[previous_y][0])
            self.cells[y][0] = scorer.gap(0, y, self.cells[previous_y][0])

        _debug_cells(self.cells)

        # fill the remaining cells
        # fill the rest of the cells in a y by x fashion
        for y in range(1, len(tokens_as_index_list)):
            for x in range(1, len(variant_graph_ranks)):
                previous_y = y - 1
                previous_x = x - 1
                from_upper_left = scorer.score(
                    x, y, self.cells[previous_y][previous_x])
                from_left = scorer.gap(x, y, self.cells[y][previous_x])
                from_upper = self.calculate_from_upper(scorer, y, x,
                                                       previous_y, match_cube)
                max_score = max(from_upper_left,
                                from_left,
                                from_upper,
                                key=lambda s: s.global_score)
                self.cells[y][x] = max_score

    def calculate_from_upper(self, scorer, y, x, previous_y, match_cube):
        upper_is_match = match_cube.has_match(previous_y - 1, x - 1)
        if upper_is_match:
            return scorer.score(x, y, self.cells[previous_y][x])
        else:
            return scorer.gap(x, y, self.cells[previous_y][x])

    def align_matching_tokens(self, cube):
        # using the score iterator, find all the matches;
        # later, for transposition detection, we also want to keep
        # track of all the additions, omissions, and replacements
        aligned = {}
        scores = ScoreIterator(self.cells)
        matched_vertices = []
        for score in scores:
            if score.type == ScoreType.match:
                rank = score.x - 1
                match = cube.get_match(score.y - 1, rank)
                if match.vertex not in matched_vertices:
                    aligned[match.token] = match.vertex
                    matched_vertices.append(match.vertex)
        return aligned

    def update_token_to_vertex_array(self, tokens, witness,
                                     witness_token_position_to_vertex):
        # we need to update the token -> vertex map;
        # that information is stored in a protected map
        # print("> witness_token_position_to_vertex =", witness_token_position_to_vertex)
        # t = list(witness_token_to_vertex)[0]
        # #print("> list(witness_token_to_vertex)[0] =", t)
        # #print("> t.token_string =", t.token_string)
        # #print("> t.token_data =", t.token_data)
        # print("> witness_token_position_to_vertex =", witness_token_position_to_vertex)
        for token_position in self.token_index.get_range_for_witness(
                witness.sigil):
            # print("> token_position =", token_position)
            vertex = witness_token_position_to_vertex[token_position]
            self.vertex_array[token_position] = vertex

    def update_token_position_to_vertex(self, token_to_vertex, aligned=None):
        # use None instead of a mutable default argument
        if aligned is None:
            aligned = {}
        for token in token_to_vertex:
            # print("> token =", token)
            position = token.token_data['_token_array_position']
            # print("> position =", position)
            self.token_position_to_vertex[position] = token_to_vertex[token]
        for token in aligned:
            # print("> token =", token)
            position = token.token_data['_token_array_position']
            # print("> position =", position)
            self.token_position_to_vertex[position] = aligned[token]
Example #37
class EditGraphAligner(CollationAlgorithm):
    def __init__(self,
                 collation,
                 near_match=False,
                 debug_scores=False,
                 detect_transpositions=False,
                 properties_filter=None):
        self.collation = collation
        self.debug_scores = debug_scores
        self.detect_transpositions = detect_transpositions
        self.token_index = TokenIndex(collation.witnesses)
        self.scorer = Scorer(self.token_index,
                             near_match=near_match,
                             properties_filter=properties_filter)
        self.align_function = self._align_table
        self.added_witness = []
        self.omitted_base = []

    def collate(self, graph, collation):
        """
        :type graph: VariantGraph
        :type collation: Collation
        """
        # prepare the token index
        self.token_index.prepare()

        # Build the variant graph for the first witness
        # this is easy: generate a vertex for every token
        first_witness = collation.witnesses[0]
        tokens = first_witness.tokens()
        token_to_vertex = self.merge(graph, first_witness.sigil, tokens)

        # let the scorer prepare the first witness
        self.scorer.prepare_witness(first_witness)

        # construct superbase
        superbase = tokens

        # align witness 2 - n
        for x in range(1, len(collation.witnesses)):
            next_witness = collation.witnesses[x]

            # let the scorer prepare the next witness
            self.scorer.prepare_witness(next_witness)

            #             # VOOR CONTROLE!
            #             alignment = self._align_table(superbase, next_witness, token_to_vertex)
            #             self.table2 = self.table

            # alignment = token -> vertex
            alignment = self.align_function(superbase, next_witness,
                                            token_to_vertex)

            # merge
            token_to_vertex.update(
                self.merge(graph, next_witness.sigil, next_witness.tokens(),
                           alignment))

            #             print("actual")
            #             self._debug_edit_graph_table(self.table)
            #             print("expected")
            #             self._debug_edit_graph_table(self.table2)

            # change superbase
            superbase = self.new_superbase

            if self.detect_transpositions:
                detector = TranspositionDetection(self)
                detector.detect()

        if self.debug_scores:
            self._debug_edit_graph_table(self.table)

    def _align_table(self, superbase, witness, token_to_vertex):
        if not superbase:
            raise Exception("Superbase is empty!")

        # print(""+str(superbase)+":"+str(witness.tokens()))
        self.tokens_witness_a = superbase
        self.tokens_witness_b = witness.tokens()
        self.length_witness_a = len(self.tokens_witness_a)
        self.length_witness_b = len(self.tokens_witness_b)
        self.table = [[
            EditGraphNode() for _ in range(self.length_witness_a + 1)
        ] for _ in range(self.length_witness_b + 1)]

        # per diagonal calculate the score (taking into account the three surrounding nodes)
        self.traverse_diagonally()

        alignment = {}
        self.additions = []
        self.omissions = []
        self.new_superbase = []

        # start lower right cell
        x = self.length_witness_a
        y = self.length_witness_b
        # work our way to the upper left
        while x > 0 and y > 0:
            cell = self.table[y][x]
            self._process_cell(token_to_vertex, self.tokens_witness_a,
                               self.tokens_witness_b, alignment, x, y)
            # examine neighbor nodes
            nodes_to_examine = set()
            nodes_to_examine.add(self.table[y][x - 1])
            nodes_to_examine.add(self.table[y - 1][x])
            nodes_to_examine.add(self.table[y - 1][x - 1])
            # calculate the maximum scoring parent node
            parent_node = max(nodes_to_examine, key=lambda node: node.g)
            # move position
            if self.table[y - 1][x - 1] == parent_node:
                # another match or replacement
                if not cell.match:
                    self.omitted_base.insert(0, self.tokens_witness_a[x - 1])
                    self.added_witness.insert(0, self.tokens_witness_b[y - 1])
                    # print("replacement:"+str(self.tokens_witness_a[x-1])+":"+str(self.tokens_witness_b[y-1]))
                # else:
                # print("match:"+str(self.tokens_witness_a[x-1]))
                y -= 1
                x -= 1
            else:
                if self.table[y - 1][x] == parent_node:
                    # addition?
                    self.added_witness.insert(0, self.tokens_witness_b[y - 1])
                    # print("added:" + str(self.tokens_witness_b[y - 1]))
                    y -= 1
                else:
                    if self.table[y][x - 1] == parent_node:
                        # omission?
                        self.omitted_base.insert(0,
                                                 self.tokens_witness_a[x - 1])
                        # print("omitted:" + str(self.tokens_witness_a[x - 1]))
                        x -= 1

        # process additions/omissions in the begin of the superbase/witness
        if x > 0:
            self.omitted_base = self.tokens_witness_a[0:x] + self.omitted_base
        if y > 0:
            self.added_witness = self.tokens_witness_b[0:y] + self.added_witness
        self.add_to_superbase()
        return alignment

    def add_to_superbase(self):
        if self.omitted_base or self.added_witness:
            # print("update superbase:" + str(self.omitted_base) + ":" + str(self.added_witness))
            # update superbase with additions, omissions
            self.new_superbase = self.added_witness + self.new_superbase
            self.new_superbase = self.omitted_base + self.new_superbase
            self.added_witness = []
            self.omitted_base = []

    def _process_cell(self, token_to_vertex, witness_a, witness_b, alignment,
                      x, y):
        cell = self.table[y][x]
        if cell.match:
            # process segments
            self.add_to_superbase()
            # process alignment
            token = witness_a[x - 1]
            token2 = witness_b[y - 1]
            vertex = token_to_vertex[token]
            alignment[token2] = vertex
            #             print("match")
            #             print(token2)
            self.new_superbase.insert(0, token)
        return cell

    # This function traverses the table diagonally and scores each cell.
    # Original function from Mark Byers; translated from C into Python.
    def traverse_diagonally(self):
        m = self.length_witness_b + 1
        n = self.length_witness_a + 1
        for _slice in range(0, m + n - 1, 1):
            z1 = 0 if _slice < n else _slice - n + 1
            z2 = 0 if _slice < m else _slice - m + 1
            j = _slice - z2
            while j >= z1:
                x = _slice - j
                y = j
                self.score_cell(y, x)
                j -= 1

    def score_cell(self, y, x):
        # initialize root node score to zero (no edit operations have
        # been performed)
        if y == 0 and x == 0:
            self.table[y][x].g = 0
            return
        # examine neighbor nodes
        nodes_to_examine = set()
        # fetch existing score from the left node if possible
        if x > 0:
            nodes_to_examine.add(self.table[y][x - 1])
        if y > 0:
            nodes_to_examine.add(self.table[y - 1][x])
        if x > 0 and y > 0:
            nodes_to_examine.add(self.table[y - 1][x - 1])
        # calculate the maximum scoring parent node
        parent_node = max(nodes_to_examine, key=lambda node: node.g)
        if parent_node == self.table[y - 1][x - 1]:
            edit_operation = 0
        else:
            edit_operation = 1
        token_a = self.tokens_witness_a[x - 1]
        token_b = self.tokens_witness_b[y - 1]
        self.scorer.score_cell(self.table[y][x], parent_node, token_a, token_b,
                               y, x, edit_operation)

    def _debug_edit_graph_table(self, table):
        # print the table horizontal
        x = PrettyTable()
        x.header = False
        for y in range(0, len(table)):
            cells = table[y]
            x.add_row(cells)
        # alignment can only be set after the field names are known.
        # since add_row sets the field names, it has to be set after x.add_row(cells)
        x.align = "l"
        print(x)
        return x
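traverse_diagonally visits the table in anti-diagonal order, so every cell is scored after its left, upper, and upper-left neighbours. A standalone trace of the visit order for a 3x4 table, using the same slice arithmetic as the method above:

m, n = 3, 4  # length_witness_b + 1, length_witness_a + 1
order = []
for _slice in range(0, m + n - 1):
    z1 = 0 if _slice < n else _slice - n + 1
    z2 = 0 if _slice < m else _slice - m + 1
    j = _slice - z2
    while j >= z1:
        order.append((j, _slice - j))  # (y, x)
        j -= 1
# order == [(0, 0), (1, 0), (0, 1), (2, 0), (1, 1), (0, 2),
#           (2, 1), (1, 2), (0, 3), (2, 2), (1, 3), (2, 3)]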
Example #38
class EditGraphAligner(CollationAlgorithm):
    def __init__(self, collation, near_match=False, debug_scores=False, detect_transpositions=False, properties_filter=None):
        self.collation = collation
        self.debug_scores = debug_scores
        self.detect_transpositions = detect_transpositions
        self.token_index = TokenIndex(collation.witnesses)
        self.scorer = Scorer(self.token_index, near_match=near_match, properties_filter=properties_filter)
        self.align_function = self._align_table
        self.added_witness = []
        self.omitted_base = []

    def collate(self, graph, collation):
        """
        :type graph: VariantGraph
        :type collation: Collation
        """
        # prepare the token index
        self.token_index.prepare()

        # Build the variant graph for the first witness
        # this is easy: generate a vertex for every token
        first_witness = collation.witnesses[0]
        tokens = first_witness.tokens()
        token_to_vertex = self.merge(graph, first_witness.sigil, tokens)

        # let the scorer prepare the first witness
        self.scorer.prepare_witness(first_witness)
        
        # construct superbase
        superbase = tokens
        
        # align witness 2 - n
        for x in range(1, len(collation.witnesses)):
            next_witness = collation.witnesses[x]
        
            # let the scorer prepare the next witness
            self.scorer.prepare_witness(next_witness)
            
#             # VOOR CONTROLE!
#             alignment = self._align_table(superbase, next_witness, token_to_vertex)
#             self.table2 = self.table
            
            # alignment = token -> vertex
            alignment = self.align_function(superbase, next_witness, token_to_vertex)
        
            # merge
            token_to_vertex.update(self.merge(graph, next_witness.sigil, next_witness.tokens(), alignment))

#             print("actual")
#             self._debug_edit_graph_table(self.table)
#             print("expected")
#             self._debug_edit_graph_table(self.table2)
            
            # change superbase
            superbase = self.new_superbase

            if self.detect_transpositions:
                detector = TranspositionDetection(self)
                detector.detect()

        if self.debug_scores:
            self._debug_edit_graph_table(self.table)
        

    def _align_table(self, superbase, witness, token_to_vertex):
        if not superbase:
            raise Exception("Superbase is empty!")

        # print(""+str(superbase)+":"+str(witness.tokens()))
        self.tokens_witness_a = superbase
        self.tokens_witness_b = witness.tokens()
        self.length_witness_a = len(self.tokens_witness_a)
        self.length_witness_b = len(self.tokens_witness_b)
        self.table = [[EditGraphNode() for _ in range(self.length_witness_a+1)] for _ in range(self.length_witness_b+1)]

        # per diagonal calculate the score (taking into account the three surrounding nodes)
        self.traverse_diagonally()

        alignment = {}
        self.additions = []
        self.omissions = []
        self.new_superbase=[]
        
        # start lower right cell
        x = self.length_witness_a
        y = self.length_witness_b
        # work our way to the upper left
        while x > 0 and y > 0:
            cell = self.table[y][x]
            self._process_cell(token_to_vertex, self.tokens_witness_a, self.tokens_witness_b, alignment, x, y)
            # examine neighbor nodes
            nodes_to_examine = set()
            nodes_to_examine.add(self.table[y][x-1])
            nodes_to_examine.add(self.table[y-1][x])
            nodes_to_examine.add(self.table[y-1][x-1])
            # calculate the maximum scoring parent node
            parent_node = max(nodes_to_examine, key=lambda node: node.g)
            # move position
            if self.table[y-1][x-1] == parent_node:
                # another match or replacement
                if not cell.match:
                    self.omitted_base.insert(0, self.tokens_witness_a[x-1])
                    self.added_witness.insert(0, self.tokens_witness_b[y-1])
                    # print("replacement:"+str(self.tokens_witness_a[x-1])+":"+str(self.tokens_witness_b[y-1]))
                # else:
                    # print("match:"+str(self.tokens_witness_a[x-1]))
                y -= 1
                x -= 1
            else:
                if self.table[y-1][x] == parent_node:
                    # addition?
                    self.added_witness.insert(0, self.tokens_witness_b[y - 1])
                    # print("added:" + str(self.tokens_witness_b[y - 1]))
                    y -= 1
                else:
                    if self.table[y][x-1] == parent_node:
                        # omission?
                        self.omitted_base.insert(0, self.tokens_witness_a[x - 1])
                        # print("omitted:" + str(self.tokens_witness_a[x - 1]))
                        x -= 1

        # process additions/omissions in the begin of the superbase/witness
        if x > 0:
            self.omitted_base = self.tokens_witness_a[0:x] + self.omitted_base
        if y > 0:
            self.added_witness = self.tokens_witness_b[0:y] + self.added_witness
        self.add_to_superbase()
        return alignment
        
    def add_to_superbase(self):
        if self.omitted_base or self.added_witness:
            # print("update superbase:" + str(self.omitted_base) + ":" + str(self.added_witness))
            # update superbase with additions, omissions
            self.new_superbase = self.added_witness + self.new_superbase
            self.new_superbase = self.omitted_base + self.new_superbase
            self.added_witness = []
            self.omitted_base = []

    def _process_cell(self, token_to_vertex, witness_a, witness_b, alignment, x, y):
        cell = self.table[y][x]
        if cell.match:
            # process segments
            self.add_to_superbase()
            # process alignment
            token = witness_a[x-1]
            token2 = witness_b[y-1]
            vertex = token_to_vertex[token]
            alignment[token2] = vertex
#             print("match")
#             print(token2)
            self.new_superbase.insert(0, token)
        return cell

    # This function traverses the table diagonally and scores each cell.
    # Original function from Mark Byers; translated from C into Python.
    def traverse_diagonally(self):
        m = self.length_witness_b+1
        n = self.length_witness_a+1
        for _slice in range(0, m + n - 1, 1):
            z1 = 0 if _slice < n else _slice - n + 1
            z2 = 0 if _slice < m else _slice - m + 1
            j = _slice - z2
            while j >= z1:
                x = _slice - j
                y = j
                self.score_cell(y, x)
                j -= 1

    def score_cell(self, y, x):
        # initialize root node score to zero (no edit operations have
        # been performed)
        if y == 0 and x == 0:
            self.table[y][x].g = 0
            return 
        # examine neighbor nodes
        nodes_to_examine = set()
        # fetch existing score from the left node if possible
        if x > 0:
            nodes_to_examine.add(self.table[y][x-1])
        if y > 0:
            nodes_to_examine.add(self.table[y-1][x])
        if x > 0 and y > 0:
            nodes_to_examine.add(self.table[y-1][x-1])
        # calculate the maximum scoring parent node
        parent_node = max(nodes_to_examine, key=lambda node: node.g)
        if parent_node == self.table[y-1][x-1]:
            edit_operation = 0
        else:
            edit_operation = 1
        token_a = self.tokens_witness_a[x-1]
        token_b = self.tokens_witness_b[y-1]
        self.scorer.score_cell(self.table[y][x], parent_node, token_a, token_b, y, x, edit_operation)

    def _debug_edit_graph_table(self, table):
        # print the table horizontal
        x = PrettyTable()
        x.header=False
        for y in range(0, len(table)):
            cells = table[y]
            x.add_row(cells)
        # alignment can only be set after the field names are known.
        # since add_row sets the field names, it has to be set after x.add_row(cells)
        x.align="l"
        print(x)
        return x
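A minimal usage sketch for the aligners above, assuming the module layout of the collatex Python port (the import paths are assumptions, and note that collate() takes only the graph in Example #36 but the graph plus the collation in Examples #37 and #38):

from collatex.core_classes import Collation, VariantGraph

collation = Collation()
collation.add_plain_witness("W1", "a b c d e")
collation.add_plain_witness("W2", "a e c d")

graph = VariantGraph()
aligner = EditGraphAligner(collation)
aligner.collate(graph, collation)  # Example #37/#38 signature
# the alignment can now be read from the populated variant graph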