def create_trie(post_list):
    """Build a character trie holding every distinct word in ``post_list``.

    Args:
        post_list: Mapping whose keys are document IDs and whose values are
            iterables of the words in that document.

    Returns:
        The root ``Trie.Node``, created with an empty letter. Every node
        carries two parallel lists: ``children_letters`` (kept in sorted
        order by this function) and ``children_nodes``, where the node at
        position ``k`` is the child for the letter at position ``k``. The
        node reached by the last character of each word has ``is_terminal``
        set to ``True``.
    """
    # Collapse duplicates across documents so each word is inserted once.
    all_words = {word for words in post_list.values() for word in words}

    root = Trie.Node('', [], [])
    for word in all_words:
        current = root
        for char in word:
            # Does the current node already have a child for this char?
            # Trie.find is expected to return -1 when it does not.
            index = Trie.find(current.children_letters, char)
            if index == -1:
                # This sequence has not been added yet. Compute the sorted
                # insertion point once and use it for BOTH parallel lists so
                # they cannot drift apart. Because char is absent, this is
                # the same slot bisect.insort would have picked.
                pos = bisect.bisect_left(current.children_letters, char)
                node = Trie.Node(char, [], [])
                current.children_letters.insert(pos, char)
                current.children_nodes.insert(pos, node)
                current = node
            else:
                # This sequence already exists, so just descend into it.
                current = current.children_nodes[index]
        # Mark the end of the word. An empty word never leaves the root, and
        # the root is deliberately left unmarked in that case.
        if word:
            current.is_terminal = True
    return root
def test_is_on_edge(self):
    """Check ``Node.is_on_edge`` against a small three-node tree.

    Tree under test (root holds the empty word):

        \
         0001 - 1001
             \
              - 0011
    """
    root = Trie.Node(word.epsilon, None)
    branch = Trie.Node(word(0b0001, 4), root)
    root.set_child(branch)
    child1 = Trie.Node(word(0b1001, 4), branch)
    branch.set_child(child1)
    child2 = Trie.Node(word(0b0011, 4), branch)
    branch.set_child(child2)

    def expect_on_edge(node, query):
        # The query must be reported as lying on the edge into ``node``.
        self.assertTrue(node.is_on_edge(query))

    def expect_off_edge(node, query):
        # The query must NOT be reported as lying on the edge into ``node``.
        self.assertFalse(node.is_on_edge(query))

    # edge cases: queries against the root
    expect_off_edge(root, word(0b1, 1))
    expect_on_edge(root, word.epsilon)

    # on edge
    expect_on_edge(branch, word(0b0, 1))
    expect_on_edge(branch, word(0b0001, 4))
    expect_off_edge(branch, word(0b0000, 4))

    expect_on_edge(child1, word(0b000110, 6))
    expect_on_edge(child1, word(0b00011001, 8))
    expect_off_edge(child1, word(0b000100, 6))
    expect_off_edge(child1, word(0b00010000, 8))

    # too long
    expect_off_edge(branch, word(0b00011, 5))
    expect_off_edge(branch, word(0b000010, 6))

    # too short
    expect_off_edge(branch, word.epsilon)
    expect_off_edge(child1, word(0b000, 3))

    # no matching prefix
    expect_off_edge(child1, word(0b01001, 5))