def createExampleToReport():
    """Build a small example tree, preprocess it for LCA and print it.

    The tree has a root with two inner children ("inner2" and "inner8").
    "inner2" holds leaf 3 and the inner node "inner4" (leaves 5, 6, 7);
    "inner8" holds leaves 9 and 10. After ``lca.LCA().preprocess`` has run
    on the root, the result of ``fancyprintLCA()`` is printed.
    """
    root = Node(aId="root")
    inner_2 = Node(aId="inner2")
    inner_8 = Node(aId="inner8")
    root.add_child(inner_2)

    # Left subtree: one direct leaf followed by a nested inner node.
    leaf_3 = Node(aId=3)
    inner_2.add_child(leaf_3)
    inner_4 = Node(aId="inner4")
    inner_2.add_child(inner_4)

    inner_4_leaves = [Node(aId=leaf_id) for leaf_id in (5, 6, 7)]
    for leaf in inner_4_leaves:
        inner_4.add_child(leaf)

    # Right subtree is attached only after the left one is complete.
    root.add_child(inner_8)
    inner_8_leaves = [Node(aId=leaf_id) for leaf_id in (9, 10)]
    for leaf in inner_8_leaves:
        inner_8.add_child(leaf)

    lca_al = lca.LCA()
    lca_al.preprocess(root)
    print(root.fancyprintLCA())
def test_tree_one():
    """Check LCA answers, by node id, on the tree from ``createTreeOne``."""
    tree, nodes = createTreeOne()
    lca_al = lca.LCA()
    lca_al.preprocess(tree)

    # (index of first node, index of second node, id of the expected LCA)
    cases = (
        (0, 1, "inner1"),
        (2, 3, "inner2"),
        (0, 3, "root"),
        (3, 0, "root"),  # same pair with the arguments swapped
        (4, 5, "root"),
    )
    for first, second, expected_id in cases:
        assert lca_al.query(nodes[first], nodes[second]).id == expected_id
def test_tree_two():
    """Check LCA answers, by ``PREORDER`` number, on the ``createTreeTwo`` tree."""
    tree, nodes = createTreeTwo()
    lca_al = lca.LCA()
    lca_al.preprocess(tree)

    # (index of first node, index of second node, PREORDER of the expected LCA)
    cases = (
        (7, 8, 5),
        (0, 1, 2),
        (1, 3, 1),
        (1, 4, 1),
        (1, 5, 1),
        (0, 5, 1),
        (4, 5, 8),
    )
    for first, second, expected_preorder in cases:
        found = lca_al.query(nodes[first], nodes[second])
        assert found.PREORDER == expected_preorder
def check_correctness(string):
    """Cross-check ``lca.LCA`` against ``farach.naive_lca`` on a suffix tree.

    The suffix tree of ``string`` is built with
    ``farach.construct_suffix_tree``; then, for every ordered pair of its
    leaves, the result of ``lca.LCA().query`` is asserted equal to the
    result of ``farach.naive_lca``.

    Args:
        string: the input text; it is converted with ``farach.str2int``
            before the tree is constructed.

    Raises:
        AssertionError: if the two LCA implementations disagree on a pair.
    """
    string = farach.str2int(string)
    constructed_tree = farach.construct_suffix_tree(string)

    # Map the id of every node whose id does not contain 'inner' to the
    # node itself; naive_lca takes this mapping as an argument.
    id2node = {}

    def register(node):
        if 'inner' not in str(node.id):
            id2node[node.id] = node

    constructed_tree.traverse(register)

    # update_leaf_list is a method: it has to be *called* for the leaf
    # lists to be refreshed before ``leaflist`` is read.
    constructed_tree.update_leaf_list()
    leaflist = constructed_tree.leaflist

    lca_al = lca.LCA()
    lca_al.preprocess(constructed_tree)

    # Exhaustive comparison over all ordered pairs, including (x, x).
    for first in leaflist:
        for second in leaflist:
            expected = farach.naive_lca(first, second, constructed_tree,
                                        id2node)
            assert expected == lca_al.query(first, second)
def test_tree_four():
    """Check LCA answers on the suffix tree of 'mississippi'.

    Each case queries the LCA of two leaves (addressed by their index in
    the root's ``leaflist``) and compares the string form of the LCA's own
    ``leaflist`` with the expected value.
    """
    string = farach.str2int('mississippi')
    constructed_tree = farach.construct_suffix_tree(string)

    # update_leaf_list is a method: it has to be *called* for the leaf
    # lists to be refreshed before ``leaflist`` is read.
    constructed_tree.update_leaf_list()
    leaflist = constructed_tree.leaflist

    lca_al = lca.LCA()
    lca_al.preprocess(constructed_tree)

    # (index of first leaf, index of second leaf, str() of the LCA's leaflist)
    cases = (
        (1, 2, "[node2, node5]"),
        (0, 2, "[node1, node2, node5, node8, node11, node4, node7, "
               "node3, node6, node10, node9, node12]"),
        (6, 8, "[node4, node7, node3, node6]"),
        (9, 10, "[node10, node9]"),
    )
    for first, second, expected in cases:
        found = lca_al.query(leaflist[first], leaflist[second])
        assert str(found.leaflist) == expected
def test_tree_three():
    """Check LCA answers on the suffix tree of '12121'.

    Each case queries the LCA of two leaves (addressed by their index in
    the root's ``leaflist``) and compares the string form of the LCA's own
    ``leaflist`` with the expected value.
    """
    string = farach.str2int('12121')
    constructed_tree = farach.construct_suffix_tree(string)

    # update_leaf_list is a method: it has to be *called* for the leaf
    # lists to be refreshed before ``leaflist`` is read.
    constructed_tree.update_leaf_list()
    leaflist = constructed_tree.leaflist

    lca_al = lca.LCA()
    lca_al.preprocess(constructed_tree)

    # (index of first leaf, index of second leaf, str() of the LCA's leaflist)
    cases = (
        (0, 1, "[node1, node3]"),
        (2, 3, "[node1, node3, node5, node2, node4, node6]"),
        (0, 2, "[node1, node3, node5]"),
        (3, 4, "[node2, node4]"),
        (0, 5, "[node1, node3, node5, node2, node4, node6]"),
    )
    for first, second, expected in cases:
        found = lca_al.query(leaflist[first], leaflist[second])
        assert str(found.leaflist) == expected
def compute_lcp_tree(t_overmerged):
    """Annotate ``t_overmerged`` in place with suffix links and LCP depths.

    Every node that carries an ``lca_even``/``lca_odd`` pair contributes
    one pair of nodes. For each pair, the LCA of the two nodes receives an
    attribute ``suffix_link`` pointing to the LCA of the two nodes whose
    ids are one larger, i.e. the node representing the string of the
    current node minus its first character. Afterwards every node
    reachable through suffix links gets an attribute ``lcp_depth``: 0 for
    the root, otherwise one more than the ``lcp_depth`` of its suffix link
    target.

    Running time: intended to be O(n); this relies on ``fast_lca.LCA``
    answering each query in constant time after preprocessing
    (NOTE(review): not verifiable from this function alone).

    Args:
        t_overmerged: root node of the overmerged tree. It must provide
            ``traverse`` and ``bfs``, each taking a per-node callback.

    Returns:
        None; the tree is modified in place.
    """
    lca_nodepairs = []
    id2node = {}

    def collect(node):
        # One pass gathers both the node pairs that need a suffix link and
        # the id -> node mapping for nodes whose id does not contain 'inner'.
        if hasattr(node, 'lca_even'):
            lca_nodepairs.append((node.lca_even, node.lca_odd))
        if 'inner' not in str(node.id):
            id2node[node.id] = node

    t_overmerged.traverse(collect)

    # ---------------------------------------
    # CREATE LCP TREE
    # ---------------------------------------
    lca_f = fast_lca.LCA()
    lca_f.preprocess(t_overmerged)

    for node1, node2 in lca_nodepairs:
        # The pair is looked up by id so that the query runs on the nodes
        # registered for this tree.
        pair_lca = lca_f.query(id2node[node1.id], id2node[node2.id])

        if (pair_lca.id == 'root'
                or node1.id + 1 not in id2node
                or node2.id + 1 not in id2node):
            # we cannot create a suffix link from root as it is undefined,
            # nor when one of the successor nodes does not exist
            continue

        node1_next = id2node[node1.id + 1]
        node2_next = id2node[node2.id + 1]
        pair_lca.suffix_link = lca_f.query(node1_next, node2_next)

    # ---------------------------------------
    # ADD LCP DEPTH TO ALL NODES
    # ---------------------------------------
    def assign_lcp_depth(node):
        if hasattr(node, 'lcp_depth'):
            # already computed, either directly or while resolving an
            # earlier node whose suffix link chain passed through here
            return node.lcp_depth
        if hasattr(node, 'suffix_link'):
            # The recursive call returns the target's depth immediately
            # when it is known and computes it first otherwise; each node
            # is still only computed once because of the check above.
            node.lcp_depth = assign_lcp_depth(node.suffix_link) + 1
            return node.lcp_depth
        # nodes without a suffix link get no lcp_depth
        return None

    t_overmerged.lcp_depth = 0
    t_overmerged.bfs(assign_lcp_depth)
def T_even(t_odd, inputstr):
    """Build the even suffix tree from the odd suffix tree ``t_odd``.

    The construction has three phases, marked (i)-(iii) in the body:

    (i)   derive the ordering of the even suffixes from the leaf order of
          ``t_odd``: every even suffix is one character followed by an odd
          suffix, so the pairs (character, odd suffix) are radix sorted;
    (ii)  compute the lcp of each pair of adjacent even suffixes using LCA
          queries on ``t_odd``;
    (iii) insert the even suffixes, in order, into a new tree, using the
          lcp values to find where each new leaf branches off.

    Args:
        t_odd: root node of the odd suffix tree. It must provide ``dfs``
            and ``traverse``, each taking a per-node callback. Leaf ids are
            used as 1-based suffix positions.
        inputstr: the indexable input; each element read in phase (i) must
            be accepted by ``int()``.

    Returns:
        The root ``Node`` of the even tree, after ``update_leaf_list()``
        has been called on it.

    Raises:
        IndexError: if no even suffix is found (``even_suffixes`` is empty).
    """
    S = inputstr
    n = len(S)

    # (i)
    # find the lexicographical ordering of the even suffixes
    odd_leaves = []

    def collect_leaf(node):
        if node.is_leaf():
            odd_leaves.append(node)

    t_odd.dfs(collect_leaf)
    odd_suffix_ordering = [leaf.id for leaf in odd_leaves]

    # even_suffixes is a list of tuples (x[2i], suffix[2i + 1]) to radix sort
    even_suffixes = [(int(S[suffix - 2]), suffix)
                     for suffix in odd_suffix_ordering if suffix != 1]
    radixsort.sort(even_suffixes, 0)
    even_suffixes = [pair[1] - 1 for pair in even_suffixes]

    # In case S is of even length, the even suffix at position n is not
    # found by counting all odd suffixes down by one (n + 1 is not an odd
    # suffix). It consists of the terminating character only, which by
    # definition ranks last, so it is appended as the last even suffix.
    if n % 2 == 0:
        even_suffixes.append(n)

    # (ii)
    # compute lcp for adjacent even suffixes
    lca_f = fast_lca.LCA()
    lca_f.preprocess(t_odd)

    # id -> node for the nodes of t_odd whose id does not contain 'inner'
    odd_id2node = {}

    def register(node):
        if 'inner' not in str(node.id):
            odd_id2node[node.id] = node

    t_odd.traverse(register)

    lcp = {}
    for i, j in zip(even_suffixes, even_suffixes[1:]):
        curr_lcp = 0
        if S[i - 1] == S[j - 1] and i < n and j < n:
            if j + 1 in odd_id2node and i + 1 in odd_id2node:
                # same first character: the rest of the lcp is the string
                # depth of the LCA of the two following odd suffixes
                lca_parent = lca_f.query(odd_id2node[i + 1],
                                         odd_id2node[j + 1])
                curr_lcp = lca_parent.str_length + 1
            else:
                curr_lcp = 1
        lcp[(i, j)] = curr_lcp

    # (iii)
    # construct T_even using information from (i) and (ii)
    root = Node(aId='root')
    even_id2node = {}

    def new_leaf(suffix):
        # A leaf for suffix position `suffix` has string length
        # n - suffix + 1; it is registered so that later iterations can
        # find it.
        leaf = Node(n - suffix + 1, suffix)
        even_id2node[suffix] = leaf
        return leaf

    root.add_child(new_leaf(even_suffixes[0]))

    for idx in range(1, len(even_suffixes)):
        prev_suf = even_suffixes[idx - 1]
        curr_suf = even_suffixes[idx]
        curr_lcp = lcp[(prev_suf, curr_suf)]
        prev_lcp = None
        if idx > 1:
            prev_lcp = lcp[(even_suffixes[idx - 2], prev_suf)]

        if curr_lcp == 0:
            # nothing shared with the previous suffix: new child of root
            root.add_child(new_leaf(curr_suf))
            continue

        prev_node = even_id2node[prev_suf]

        if not prev_lcp:
            # The previous leaf is the rightmost child of its parent and
            # shares nothing with its own predecessor: put an inner node
            # of length curr_lcp in its place and hang both leaves below.
            innernode = Node(curr_lcp, 'inner')
            new_node = new_leaf(curr_suf)
            prev_node.parent.children[-1] = innernode
            innernode.parent = prev_node.parent
            # important! prev_node must be added before new_node to
            # keep lexicographic ordering of children
            innernode.add_child(prev_node)
            innernode.add_child(new_node)
            continue

        # We need to attach the new node somewhere on the path from the
        # root to the parent of prev_node; walk up through parent edges
        # until the difference between the two lcps is used up.
        remaining_until_insertion = prev_lcp - curr_lcp
        insertion_node = prev_node.parent
        while remaining_until_insertion > 0:
            len_of_edge = (insertion_node.str_length
                           - insertion_node.parent.str_length)
            remaining_until_insertion -= len_of_edge
            insertion_node = insertion_node.parent

        if remaining_until_insertion == 0:
            # the branching point is exactly an existing node
            insertion_node.add_child(new_leaf(curr_suf))
        else:
            # The branching point lies inside the edge to the rightmost
            # child: replace that child by an inner node holding both the
            # child and the new leaf.
            displaced_child = insertion_node.children.pop()
            split_idx = abs(remaining_until_insertion)
            innernode = Node(displaced_child.parent.str_length + split_idx,
                             'inner')
            new_node = new_leaf(curr_suf)
            insertion_node.add_child(innernode)
            innernode.add_child(displaced_child)
            innernode.add_child(new_node)

    root.update_leaf_list()
    return root