def ukkonen(text, n):
    """Build a suffix tree of ``text + '$'`` on-line, one position at a time.

    The tree starts as a root with a single leaf labelled ``text[1:]`` (taken
    after the ``'$'`` terminator has been appended) and is then extended once
    for every position ``i`` in ``2 .. n + 1``.  The current position in the
    tree is kept as the pair ``(head, shift)``: a node plus a number of
    characters along the edge leaving it that starts with ``text[i - shift]``.

    NOTE(review): the first leaf is ``text[1:]`` and the loop starts at 2, so
    ``text[0]`` looks like a placeholder and ``n`` like the length of the real
    string -- confirm against the callers.
    NOTE(review): the statement nesting below was reconstructed from a
    flattened source line -- confirm it against the original file.

    Args:
        text: Indexable sequence of characters; ``'$'`` is appended locally.
        n: Upper bound used for the position loop ``range(2, n + 2)``.

    Returns:
        A ``(root, S)`` tuple: the root node and the dictionary ``S`` mapping
        nodes to nodes that is followed as ``S[head]`` during construction
        (it starts as ``{root: root}``).
    """
    text = text + '$'
    root, leaf = trie.TrieNode(''), trie.TrieNode(text[1:])
    root.add_child(leaf)
    S, head, shift = {root: root}, root, 0
    for i in range(2, n + 2):
        # invariant: S[v] is defined for all v != head(i - 1)
        # Edge on which the current position lies, found by its first
        # character text[i - shift].
        child = head.children.get(text[i - shift])
        # text[i] cannot be read from the current position when there is no
        # such edge, the position is past the edge label, or the next label
        # character differs.
        if (child is None or shift >= len(child.label)
                or text[i] != child.label[shift]):
            previous_head = None
            while shift > 0 or text[i] not in head.children:
                # Inside an edge (shift > 0) the edge is split after `shift`
                # characters; otherwise head itself receives the new leaf.
                v = (break_node(head, head.children[text[i - shift]], shift)
                     if shift > 0 else head)
                v.add_child(trie.TrieNode(text[i:]))
                # At the root the position shortens by one character; this may
                # take shift to -1, which the `shift >= 0` guard below handles.
                if head == root:
                    shift -= 1
                # The node handled in the previous round is linked to this one.
                if previous_head is not None:
                    S[previous_head] = v
                previous_head, head = v, S[head]
                if shift > 0:
                    # fast_find is defined elsewhere; its result is used as
                    # the new (head, shift) pair for text[i - shift:i].
                    head, shift = fast_find(head, text[i - shift:i], split=False)
            # The last node handled in the loop is linked to the final head.
            if previous_head is not None:
                S[previous_head] = head
        # Advance the position by text[i]: move into the child when that
        # consumes its whole label, otherwise just lengthen shift.
        child = head.children.get(text[i - shift]) if shift >= 0 else None
        if child is not None and len(child.label) == shift + 1:
            head, shift = head.children[text[i - shift]], 0
        else:
            shift += 1
    return root, S
def _get_suffix_tree(t):
    """Run the odd/even merge pipeline on ``t`` and return the merged arrays.

    A one-element input short-circuits to ``([1], [0])``.  Otherwise the odd
    and even suffix trees are built, over-merged into a single tree, annotated
    with odd/even pairs, d-links and depths in the d-tree, and the result of
    ``_merge_suffix_arrays`` is returned unchanged.

    Args:
        t: The input sequence (only ``len(t)`` and ``t[0:]`` are used here;
            everything else is delegated to helpers defined elsewhere).

    Returns:
        ``([1], [0])`` for a single element, otherwise whatever
        ``_merge_suffix_arrays`` returns (presumably a suffix array / LCP
        array pair, matching the shortcut -- confirm against that helper).
    """
    if len(t) == 1:
        return [1], [0]

    # Stand-alone two-node tree; `root` is only used below as the initial key
    # of the depth map.
    root = trie.TrieNode('')
    leaf = trie.TrieNode(t[0:])
    root.node_type = NODETYPE.NONE
    leaf.node_type = NODETYPE.NONE
    root.add_child(leaf)

    odd_tree, odd_sa, odd_lcp = _get_odd_suffix_tree(t)
    even_tree, even_sa, even_lcp = _get_even_suffix_tree(t, odd_tree, odd_sa)

    size = len(t)
    merged = _get_faulty_merge(odd_tree, even_tree, size)
    merged.odd_even_pairs = {}
    _initialise_odd_even_pairs_recurse(merged, size, merged.odd_even_pairs)

    ancestor_oracle = lca.LCA(merged)

    # A d-link is stored only for keys other than size + 1 whose odd and even
    # entries are both truthy and both smaller than size.
    d_links = {}
    for key, (odd_entry, even_entry) in merged.odd_even_pairs.items():
        if (odd_entry and even_entry and key != size + 1
                and odd_entry < size and even_entry < size):
            d_links[key] = ancestor_oracle.query(odd_entry + 1, even_entry + 1)
    merged.d_links = d_links

    merged.depth_in_d_tree = {root: 0}
    for pair in merged.odd_even_pairs:
        _compute_depth_in_d_tree(pair, merged.depth_in_d_tree, merged.d_links)

    return _merge_suffix_arrays(
        odd_sa, even_sa, odd_lcp, even_lcp, ancestor_oracle, t,
        merged.depth_in_d_tree)
def from_suffix_array_and_lcp(SA, LCP, text, n):
    """Rebuild a suffix tree from a suffix array and its LCP array.

    Suffixes are inserted in suffix-array order.  For each one the code walks
    up from the previously inserted node until the node depth is at most the
    LCP value, then either hangs a new leaf there or first splits the
    rightmost edge at the LCP depth.

    Index scheme written by this function: the root gets ``n + 2``, the node
    of a suffix gets ``n + 2 - len(suffix)``, and every split node that
    receives a new child gets the next value of a counter starting at
    ``n + 3``.

    Args:
        SA: Suffix start positions; ``text[SA[k] - 1:]`` is the k-th suffix.
        LCP: ``LCP[k]`` is paired with ``SA[k + 1]`` via ``zip``.
        text: The indexed sequence.
        n: Base value for the index scheme above.

    Returns:
        The root node of the constructed tree.
    """
    root = trie.TrieNode('')
    first_leaf = trie.TrieNode(text[SA[0] - 1:])
    root.set_depth(0)
    root.add_child(first_leaf)
    first_leaf.set_depth(len(first_leaf.label))
    root.index = n + 2
    first_leaf.index = n + 2 - first_leaf.depth

    internal_index = root.index + 1
    previous = first_leaf
    for start, common in zip(SA[1:], LCP):
        suffix = text[start - 1:]

        # Climb until the node is no deeper than the common prefix.
        anchor = previous
        while anchor.depth > common:
            anchor = anchor.parent

        if anchor.depth != common:
            # The common prefix ends inside the edge to the rightmost child:
            # split that edge at depth `common`.
            _, last_child = max(anchor.children.items())
            middle = break_node(anchor, last_child, common - anchor.depth)
            middle.set_depth(common)
            tail = suffix[common:]
            if tail:
                inserted = trie.TrieNode(tail)
                middle.add_child(inserted)
                middle.index = internal_index
                internal_index += 1
            else:
                # The suffix ends exactly at the split point.
                inserted = middle
        elif anchor.depth == len(suffix):
            # The whole suffix is already spelled by an existing node.
            inserted = anchor
        else:
            inserted = trie.TrieNode(suffix[anchor.depth:])
            anchor.add_child(inserted)

        inserted.index = n + 2 - len(suffix)
        inserted.set_depth(len(suffix))
        previous = inserted
    return root
def naive(text, n):
    """Build a suffix tree of ``text + '$'`` by inserting suffixes one by one.

    The tree starts as a root with one leaf labelled with everything after
    index 0 of the terminated text.  For each start position in
    ``2 .. n + 1`` the suffix is located from the root with ``slow_find``
    (defined elsewhere), and the last ``remaining`` characters of the
    terminated text are attached as a new leaf under the returned node.

    Args:
        text: The input string; ``'$'`` is appended locally.
        n: Upper bound used for the position loop ``range(2, n + 2)``.

    Returns:
        The root node of the constructed tree.
    """
    terminated = text + '$'
    root = trie.TrieNode('')
    first_leaf = trie.TrieNode(terminated[1:])
    root.add_child(first_leaf)
    for start in range(2, n + 2):
        attach_point, unmatched = slow_find(root, terminated[start:])
        attach_point.add_child(trie.TrieNode(terminated[-unmatched:]))
    return root
def get_suffix_tree(text):
    """Build the suffix tree of ``text`` together with its suffix and LCP arrays.

    A one-element input returns a root with a single leaf plus ``[1]`` and
    ``[0]``.  Otherwise the odd and even suffix trees are built, over-merged,
    annotated (odd/even pairs, d-links, depths in the d-tree), the two array
    pairs are merged, and the final tree is rebuilt from the merged arrays.

    Args:
        text: The input sequence.

    Returns:
        A ``(tree_root, suffix_array, lcp_array)`` tuple.
    """
    root = trie.TrieNode('')
    only_leaf = trie.TrieNode(text[0:])
    root.add_child(only_leaf)
    if len(text) == 1:
        return root, [1], [0]

    odd_tree, odd_sa, odd_lcp = get_odd_suffix_tree(text)
    even_tree, even_sa, even_lcp = get_even_suffix_tree(text, odd_tree, odd_sa)

    merged = get_faulty_merge(odd_tree, even_tree, text)
    initialise_odd_even_pairs(merged, len(text))

    ancestor_oracle = LCA()
    ancestor_oracle.preprocess(merged)
    compute_d_links(ancestor_oracle, len(text), merged)
    compute_depths_in_d_tree(merged)

    suffix_array, lcp_array = merge_suffix_arrays(
        odd_sa, even_sa, odd_lcp, even_lcp, ancestor_oracle, text,
        merged.depth_in_d_tree)
    tree = suffix_and_lcp_array_to_tree(suffix_array, lcp_array, text)
    return tree, suffix_array, lcp_array
def suffix_and_lcp_array_to_tree(A, LCP, text):
    """Rebuild a suffix tree from a suffix array and its LCP array.

    Suffixes are inserted in the order given by ``A``.  For each one the code
    walks up from the previously inserted node until the node depth is at
    most the LCP value, then either hangs a new leaf there or first splits the
    rightmost edge at the LCP depth.

    Index scheme written by this function: the root gets ``len(text) + 1``,
    the node of a suffix gets ``len(text) + 1 - len(suffix)``, and every split
    node that receives a new child gets the next value of a counter starting
    at ``len(text) + 2``.

    Args:
        A: Suffix start positions; ``text[A[i] - 1:]`` is the i-th suffix.
        LCP: ``LCP[i - 1]`` is the value used when inserting ``A[i]``.
        text: The indexed sequence.

    Returns:
        The root node of the constructed tree.

    Raises:
        IndexError: If ``LCP`` has fewer than ``len(A) - 1`` entries.
    """
    next_index = len(text) + 1
    root = trie.TrieNode('')
    leaf = trie.TrieNode(text[A[0] - 1:])
    root.set_depth(0)
    root.add_child(leaf)
    leaf.set_depth(len(leaf.label))
    leaf.index = len(text) + 1 - leaf.depth
    root.index = next_index
    next_index += 1
    last_node = leaf
    for i in range(1, len(A)):
        suffix = text[A[i] - 1:]
        # Looked up once per suffix; indexing (rather than zip) keeps the
        # IndexError on a too-short LCP array.
        lcp = LCP[i - 1]

        # Climb until the node is no deeper than the common prefix.
        current_node = last_node
        while current_node.depth > lcp:
            current_node = current_node.parent

        if current_node.depth == lcp:
            if current_node.depth == len(suffix):
                # The whole suffix is already spelled by an existing node.
                new_node = current_node
            else:
                new_node = trie.TrieNode(suffix[current_node.depth:])
                current_node.add_child(new_node)
        else:
            # Only the child with the largest key is needed, so a linear max
            # replaces sorting all children.
            rightmost_child = max(current_node.children.items())[1]
            split_node = suffix_tree.break_node(
                current_node, rightmost_child, lcp - current_node.depth)
            split_node.set_depth(lcp)
            remainder = suffix[lcp:]
            if remainder:
                new_node = trie.TrieNode(remainder)
                split_node.add_child(new_node)
                split_node.index = next_index
                next_index += 1
            else:
                # The suffix ends exactly at the split point.
                new_node = split_node
        new_node.index = len(text) - len(suffix) + 1
        new_node.set_depth(len(suffix))
        last_node = new_node
    return root
def mccreight(text, n):
    """Build a suffix tree of ``text + '$'`` inserting suffixes left to right.

    The tree starts as a root with a single leaf labelled ``text[1:]`` (taken
    after the ``'$'`` terminator has been appended).  Each of the following
    ``n`` rounds computes a start node ``v`` for the next suffix from the
    previous ``head`` and the map ``S``, records ``S[head] = v``, and then
    descends from ``v`` with ``slow_find`` to find the new ``head`` under
    which the next leaf is attached.

    NOTE(review): ``fast_find`` and ``slow_find`` are defined elsewhere; the
    descriptions here cover only how their results are used in this function.

    Args:
        text: The input string; ``'$'`` is appended locally.
        n: Number of rounds is ``n`` (``range(2, n + 2)``).

    Returns:
        A ``(root, S)`` tuple: the root node and the node-to-node dictionary
        ``S`` filled in during construction.
    """
    text = text + '$'
    root, leaf = trie.TrieNode(''), trie.TrieNode(text[1:])
    root.add_child(leaf)
    S, head = {}, root
    for _ in range(2, n + 2):
        # invariant: S[v] is defined for all v != head(i - 1)
        if head == root:
            # exception 1: tree with a single leaf
            # Nothing to re-scan; the search string is the previous leaf's
            # label without its first character, starting at the root.
            beta, gamma, v = '', head.children[leaf.label[0]].label[1:], root
        else:
            if head.parent == root:
                # exception 2: head.parent is the root
                # The label of the edge into head loses its first character.
                beta = head.parent.children[head.label[0]].label[1:]
            else:
                beta = head.parent.children[head.label[0]].label
            # gamma is the label of the previously inserted leaf under head.
            gamma = head.children[leaf.label[0]].label
            # Re-scan beta starting from S[head.parent]; split=True is passed
            # through to fast_find.
            v, _ = fast_find(S[head.parent], beta, split=True)
        S[head] = v
        head, remaining = slow_find(v, gamma)
        # The last `remaining` characters of text form the new leaf label.
        leaf = trie.TrieNode(text[-remaining:])
        head.add_child(leaf)
    return root, S
def weiner(text, n):
    """Build a suffix tree of ``text + '$'`` inserting suffixes right to left.

    Positions ``i`` run from ``n + 1`` down to 1.  In each round the code
    climbs from the previously inserted leaf (``head``) until it reaches the
    root or a node ``v`` for which ``link`` has an entry for the character
    ``text[i]``, then locates the node ``u`` under which the new leaf
    ``text[depth:]`` is attached.

    NOTE(review): ``fast_find`` and ``slow_find`` are defined elsewhere; the
    descriptions here cover only how their results are used in this function.

    Args:
        text: The input string; ``'$'`` is appended locally.
        n: Positions ``n + 1 .. 1`` are processed; ``n + 2`` is used as the
            starting value of ``depth`` in every round.

    Returns:
        A ``(root, link)`` tuple: the root node and the dictionary ``link``
        keyed by ``(node, character)`` pairs (it starts as
        ``{(root, ''): root}``).
    """
    text = text + '$'
    root = trie.TrieNode('')
    link, head = {(root, ''): root}, root
    for i in range(n + 1, 0, -1):
        # invariant: link[v][c] = u for u and v such that word(u) = c word(v)
        # Climb from the last leaf; depth shrinks by the label length of
        # every node left behind.
        v, depth = head, n + 2
        while v != root and link.get((v, text[i])) is None:
            v, depth = v.parent, depth - len(v.label)
        u = link.get((v, text[i]))
        # A further descent is needed when no link exists or when the linked
        # node already has a child starting with text[depth].
        if u is None or text[depth] in u.children:
            if u is None:
                u, remaining = slow_find(root, text[depth - 1:])
            else:
                u, remaining = slow_find(u, text[depth:])
            # Move v down by the part of text[depth:] that slow_find consumed.
            v, _ = fast_find(v, text[depth:-remaining], False)
            depth = len(text) - remaining
            if u != root:
                link[(v, text[i])] = u
        leaf = trie.TrieNode(text[depth:])
        u.add_child(leaf)
        head = leaf
    return root, link
def _get_faulty_merge_recurse(node, odd, even, n):
    """Merge the children of ``odd`` and ``even`` below ``node``, recursively.

    The children of both input nodes are walked in sorted key order.  A key
    present on one side only is copied (typed ``ODD`` or ``EVEN``) and merged
    against an empty placeholder.  A key present on both sides is merged: with
    labels of different length the shorter label wins and the longer edge is
    split at that length; with labels of equal length the node is typed
    ``BOTH``.  Only label lengths are compared, never label contents.

    Args:
        node: Output node that receives the merged children; if it has no
            parent its index is set to ``n + 1``.
        odd: Node of the odd tree (may be modified by edge splitting).
        even: Node of the even tree (may be modified by edge splitting).
        n: Passed through to ``_create_node``; also the threshold used to pick
            the source node in the ``BOTH`` case.
    """
    if node.parent is None:
        node.index = n + 1

    # Sorted child lists of the odd and the even node; the output children
    # are created in the same order.
    odd_items = sorted(odd.children.items())
    even_items = sorted(even.children.items())
    odd_count, even_count = len(odd_items), len(even_items)
    odd_pos = even_pos = 0

    while odd_pos < odd_count or even_pos < even_count:
        odd_key, odd_child = (
            odd_items[odd_pos] if odd_pos < odd_count else (None, None))
        even_key, even_child = (
            even_items[even_pos] if even_pos < even_count else (None, None))
        placeholder = trie.TrieNode('')
        placeholder.node_type = NODETYPE.NONE

        # Decide which side(s) contribute the next output child.
        if odd_pos == odd_count:
            use_odd, use_even = False, True
        elif even_pos == even_count or odd_key < even_key:
            use_odd, use_even = True, False
        elif odd_key == even_key:
            use_odd, use_even = True, True
        else:
            use_odd, use_even = False, True

        if use_odd and use_even:
            if len(even_child.label) > len(odd_child.label):
                # Odd label is shorter: split the even edge at its length.
                merged = _create_node(node, odd_child, NODETYPE.ODD, n)
                remainder = sufftree.break_node(
                    even, even_child, len(odd_child.label))
                remainder.node_type = NODETYPE.NONE
                _get_faulty_merge_recurse(merged, odd_child, remainder, n)
            elif len(even_child.label) < len(odd_child.label):
                # Even label is shorter: split the odd edge at its length.
                merged = _create_node(node, even_child, NODETYPE.EVEN, n)
                remainder = sufftree.break_node(
                    odd, odd_child, len(even_child.label))
                remainder.node_type = NODETYPE.NONE
                _get_faulty_merge_recurse(merged, remainder, even_child, n)
            else:
                source = odd_child if even_child.index > n else even_child
                merged = _create_node(node, source, NODETYPE.BOTH, n)
                _get_faulty_merge_recurse(merged, odd_child, even_child, n)
        elif use_odd:
            merged = _create_node(node, odd_child, NODETYPE.ODD, n)
            _get_faulty_merge_recurse(merged, odd_child, placeholder, n)
        else:
            merged = _create_node(node, even_child, NODETYPE.EVEN, n)
            _get_faulty_merge_recurse(merged, placeholder, even_child, n)

        if use_odd:
            odd_pos += 1
        if use_even:
            even_pos += 1
def get_faulty_merge(To, Te, text):
    """Merge the odd tree ``To`` and the even tree ``Te`` into a new tree.

    Creates an empty root, resets the shared index counter stored on
    ``get_faulty_merge_recurse`` to ``len(text) + 2`` and delegates the actual
    merge to that function.

    Args:
        To: Root of the odd suffix tree.
        Te: Root of the even suffix tree.
        text: The indexed sequence; passed through to the recursion.

    Returns:
        The root of the merged tree.
    """
    merged_root = trie.TrieNode('')
    # Function attribute used as a counter shared by the whole recursion.
    get_faulty_merge_recurse.next_index = len(text) + 2
    get_faulty_merge_recurse(merged_root, To, Te, text)
    return merged_root
def get_faulty_merge_recurse(node, odd, even, text):
    """Merge the children of ``odd`` and ``even`` below ``node``, recursively.

    The children of both input nodes are walked in sorted key order.  A key
    present on one side only is copied (flagged ``odd`` or ``even``) and
    merged against a fresh empty node.  A key present on both sides is merged:
    with labels of different length the shorter label wins and the longer edge
    is split at that length; with labels of equal length the node is flagged
    both ``odd`` and ``even``.  Only label lengths are compared, never label
    contents.

    Args:
        node: Output node that receives the merged children; if it has no
            parent its index is set to ``len(text) + 1``.
        odd: Node of the odd tree (may be modified by edge splitting).
        even: Node of the even tree (may be modified by edge splitting).
        text: The indexed sequence; only its length is used here.
    """
    text_len = len(text)

    def assign_index(target, source):
        # Indices up to len(text) are copied over; any other node gets the
        # next value of the counter shared through the function attribute.
        if source.index <= text_len:
            target.index = source.index
            return
        target.index = get_faulty_merge_recurse.next_index
        get_faulty_merge_recurse.next_index += 1

    if node.parent is None:
        node.index = text_len + 1

    # The two sorts below are not linear; they are kept for clarity.
    # Farach's algorithm works on child lists that are already sorted in the
    # odd, even and resulting tree, but the current API stores children in a
    # dictionary, hence the calls to sorted(); with lists instead of dicts
    # they could simply be omitted.
    odd_items = sorted(odd.children.items())
    even_items = sorted(even.children.items())
    odd_pos = even_pos = 0

    while odd_pos < len(odd_items) or even_pos < len(even_items):
        odd_key, odd_child = (
            odd_items[odd_pos] if odd_pos < len(odd_items) else (None, None))
        even_key, even_child = (
            even_items[even_pos] if even_pos < len(even_items)
            else (None, None))

        if odd_pos == len(odd_items):
            # Odd side exhausted: copy the even child.
            merged = trie.TrieNode(even_child.label)
            assign_index(merged, even_child)
            node.add_child(merged)
            merged.even = True
            get_faulty_merge_recurse(
                merged, trie.TrieNode(""), even_child, text)
            even_pos += 1
        elif even_pos == len(even_items):
            # Even side exhausted: copy the odd child.
            merged = trie.TrieNode(odd_child.label)
            assign_index(merged, odd_child)
            node.add_child(merged)
            merged.odd = True
            get_faulty_merge_recurse(
                merged, odd_child, trie.TrieNode(""), text)
            odd_pos += 1
        elif odd_key != even_key:
            if odd_key < even_key:
                merged = trie.TrieNode(odd_child.label)
                assign_index(merged, odd_child)
                node.add_child(merged)
                get_faulty_merge_recurse(
                    merged, odd_child, trie.TrieNode(""), text)
                odd_pos += 1
                merged.odd = True
            else:
                merged = trie.TrieNode(even_child.label)
                assign_index(merged, even_child)
                node.add_child(merged)
                get_faulty_merge_recurse(
                    merged, trie.TrieNode(""), even_child, text)
                even_pos += 1
                merged.even = True
        else:
            odd_len = len(odd_child.label)
            even_len = len(even_child.label)
            if even_len > odd_len:
                # Odd label is shorter: it becomes the merged label and the
                # even edge gets an artificial node so the recursion lines up.
                merged = trie.TrieNode(odd_child.label)
                node.add_child(merged)
                merged.odd = True
                assign_index(merged, odd_child)
                remainder = suffix_tree.break_node(even, even_child, odd_len)
                get_faulty_merge_recurse(merged, odd_child, remainder, text)
            elif even_len < odd_len:
                # Even label is shorter: mirror image of the case above.
                merged = trie.TrieNode(even_child.label)
                node.add_child(merged)
                merged.even = True
                assign_index(merged, even_child)
                remainder = suffix_tree.break_node(odd, odd_child, even_len)
                get_faulty_merge_recurse(merged, remainder, even_child, text)
            else:
                # Same first character and same label length.
                merged = trie.TrieNode(odd_child.label)
                assign_index(
                    merged,
                    even_child if even_child.index <= text_len else odd_child)
                node.add_child(merged)
                merged.odd = True
                merged.even = True
                get_faulty_merge_recurse(merged, odd_child, even_child, text)
            odd_pos += 1
            even_pos += 1
def break_node(parent, child, index):
    """Split the edge into ``child`` by inserting a node after ``index`` items.

    A new node labelled with the first ``index`` items of ``child.label`` is
    created, ``child`` keeps the rest of its label and becomes a child of the
    new node, and the new node is added as a child of ``parent``.

    Args:
        parent: Node under which the new node is registered.
        child: Node whose label is shortened in place.
        index: Number of leading label items moved to the new node.

    Returns:
        The newly created middle node.
    """
    upper_label, lower_label = child.label[:index], child.label[index:]
    middle = trie.TrieNode(upper_label)
    child.label = lower_label
    middle.add_child(child)
    # NOTE(review): presumably add_child replaces the old entry for `child`
    # under `parent`, since both labels start alike -- confirm in trie.
    parent.add_child(middle)
    return middle
def _get_faulty_merge(To, Te, n):
    """Merge the odd tree ``To`` and the even tree ``Te`` into a new tree.

    Creates an empty root typed ``NODETYPE.NONE``, resets the shared index
    counter stored on ``_get_faulty_merge_recurse`` to ``n + 2`` and delegates
    the actual merge to that function.

    Args:
        To: Root of the odd suffix tree.
        Te: Root of the even suffix tree.
        n: Passed through to the recursion; also the base of the counter.

    Returns:
        The root of the merged tree.
    """
    merged_root = trie.TrieNode('')
    merged_root.node_type = NODETYPE.NONE
    # Function attribute used as a counter shared by the whole recursion.
    _get_faulty_merge_recurse.next_index = n + 2
    _get_faulty_merge_recurse(merged_root, To, Te, n)
    return merged_root
def _create_node(parent, other, node_type, n):
    """Create a typed copy of ``other`` and attach it below ``parent``.

    The new node takes ``other.label``, gets ``node_type`` assigned, has its
    index set by ``_set_index`` (defined elsewhere) and is then added as a
    child of ``parent``.

    Args:
        parent: Node that receives the new child.
        other: Node whose label is copied and which is passed to
            ``_set_index`` as the index source.
        node_type: Value stored in the new node's ``node_type`` attribute.
        n: Passed through to ``_set_index``.

    Returns:
        The newly created node.
    """
    clone = trie.TrieNode(other.label)
    clone.node_type = node_type
    _set_index(clone, other, n)
    parent.add_child(clone)
    return clone