예제 #1
0
def create_trie(post_list):
    """Build a character trie holding every distinct word in ``post_list``.

    Args:
        post_list: Mapping whose keys are document IDs and whose values are
            iterables of the words contained in that document.

    Returns:
        The root ``Trie.Node`` (labelled with the empty string). Each node
        keeps two parallel lists: ``children_letters`` (kept sorted) and
        ``children_nodes``, where ``children_nodes[i]`` is the child reached
        through ``children_letters[i]``. The node reached by the last
        character of a word has ``is_terminal`` set to ``True``.
    """
    # De-duplicate first so that every word is walked through the trie once.
    all_words = set()
    for words in post_list.values():
        all_words.update(words)

    # NOTE(review): the two list arguments are presumably the initial
    # children_letters / children_nodes -- confirm against Trie.Node.
    root = Trie.Node('', [], [])
    for word in all_words:
        current = root
        for char in word:
            letters = current.children_letters
            # Trie.find yields -1 when `char` is not among the children.
            index = Trie.find(letters, char)
            if index == -1:
                # Unseen sequence: compute the sorted position once and use
                # it for both lists so letter and node stay index-aligned.
                pos = bisect.bisect_left(letters, char)
                letters.insert(pos, char)
                node = Trie.Node(char, [], [])
                current.children_nodes.insert(pos, node)
                current = node
            else:
                # Sequence already present: just descend into that child.
                current = current.children_nodes[index]
        # Mark the node of the final character; an empty word never leaves
        # the root and, as before, does not mark it terminal.
        if word:
            current.is_terminal = True

    return root
예제 #2
0
    def test_is_on_edge(self):
        # Exercises Node.is_on_edge against a small three-node fixture.
        # Edge labels look like (bits, bit-length) words; queries below are
        # written as the bits read from the root -- TODO confirm with `word`.
        #
        #  \
        #   0001 - 1001
        #       \
        #        - 0011
        root = Trie.Node(word.epsilon, None)

        branch = Trie.Node(word(0b0001, 4), root)
        root.set_child(branch)

        child1 = Trie.Node(word(0b1001, 4), branch)
        branch.set_child(child1)

        child2 = Trie.Node(word(0b0011, 4), branch)
        branch.set_child(child2)

        # --- edge cases on the root node ---
        self.assertFalse(root.is_on_edge(word(0b1, 1)))
        self.assertTrue(root.is_on_edge(word.epsilon))

        # --- queries relative to the edge leading into `branch` ---
        self.assertTrue(branch.is_on_edge(word(0b0, 1)))
        self.assertTrue(branch.is_on_edge(word(0b0001, 4)))
        self.assertFalse(branch.is_on_edge(word(0b0000, 4)))

        # --- queries relative to the edge leading into `child1` ---
        self.assertTrue(child1.is_on_edge(word(0b000110, 6)))
        self.assertTrue(child1.is_on_edge(word(0b00011001, 8)))
        self.assertFalse(child1.is_on_edge(word(0b000100, 6)))
        self.assertFalse(child1.is_on_edge(word(0b00010000, 8)))

        # --- too long: query extends past the end of the node's edge ---
        self.assertFalse(branch.is_on_edge(word(0b00011, 5)))
        self.assertFalse(branch.is_on_edge(word(0b000010, 6)))

        # --- too short: query stops before the node's edge begins ---
        self.assertFalse(branch.is_on_edge(word.epsilon))
        self.assertFalse(child1.is_on_edge(word(0b000, 3)))

        # --- no matching prefix ---
        self.assertFalse(child1.is_on_edge(word(0b01001, 5)))