Beispiel #1
0
 def __init__(self, html):
     '''Store the input and prepare an empty tree plus the parsing regexes.

     html -- the input html doc; kept unchanged on the instance
             (NOTE(review): presumably an iterable of lines -- confirm
             against the code that consumes self.html)
     '''
     self.html = html
     # The tree starts as a single empty node that is also the cursor.
     self.root = TreeNode.Node()
     self.curr = self.root

     # (attribute name, regex source) pairs; each is compiled and stored
     # on the instance under the given name, in the order listed.
     regexSources = (
         # a "<...>" or "</...>" tag that does not begin with "<!"
         ('tagPattern', r'(?!<!)</?[^>]+>'),
         # name="..." text (greedy: runs up to the last double quote)
         ('attrPattern', r'\w+=\".*\"'),
         # a whitespace character that directly follows a double quote
         ('attrListPattern', r'(?<=\")\s'),
         # the word directly in front of ="
         ('attrNamePattern', r'\w+(?=\=\")'),
         # the quoted text directly after =
         ('attrContentPattern', r'(?<=\=)\".*\"'),
         # everything between a ">" and a later "</" (greedy)
         ('contentPattern', r'(?<=>).*(?=</)'),
         # "<" followed by a word: the beginning of an opening tag
         ('startPattern', r'<\w+'),
         # "</" followed by a word: the beginning of a closing tag
         ('closePattern', r'</\w+'),
         # the word directly after "<"
         ('namePattern', r'(?<=<)\w+'),
         # the word directly after "</"
         ('closeNamePattern', r'(?<=</)\w+'),
     )
     for attrName, source in regexSources:
         setattr(self, attrName, re.compile(source))
def build(pre, inord, in_l, in_r, pre_l, pre_r):
    '''Rebuild a binary tree from its preorder and inorder traversals.

    pre, inord   -- preorder and inorder sequences of the same tree
    in_l, in_r   -- inclusive bounds of the current subtree inside inord
    pre_l, pre_r -- inclusive bounds of the current subtree inside pre

    Returns the TreeNode.Node rooting the subtree, or None when either
    window is empty.  Raises ValueError when pre[pre_l] does not occur in
    inord[in_l:in_r + 1], i.e. the two traversals do not agree.
    '''
    if pre_l > pre_r or in_l > in_r:
        return None

    # The first preorder element of the window is the subtree's root.
    root_val = pre[pre_l]
    root = TreeNode.Node(root_val)

    # Look for the root only inside the current inorder window.  Searching
    # the whole sequence can return an index outside [in_l, in_r] when a
    # value occurs more than once, which makes left_size negative and can
    # send the recursion into an endless loop.
    # NOTE: huge optimization possible here if you use a hash map of
    # values to array indices in inord
    root_idx = inord.index(root_val, in_l, in_r + 1)
    left_size = root_idx - in_l

    # Left subtree: the left_size preorder items right after the root.
    root.left = build(pre, inord, in_l, root_idx - 1,
                      pre_l + 1, pre_l + left_size)

    # Right subtree: whatever remains of both windows.
    root.right = build(pre, inord, root_idx + 1, in_r,
                       pre_l + left_size + 1, pre_r)

    return root
Beispiel #3
0
    def buildTree(self):
        '''Build the DOM tree by scanning self.html one line at a time.

        Every tag found on a line is handled in order:

        * an opening tag creates a new TreeNode.Node, named after that tag
          and carrying that tag's attributes, as a child of self.curr.
          Names accepted by self.isSpecialNames stay leaf children and the
          line is passed to self.addContentToAllParent; any other node
          receives the first contentPattern match of the line (if any) and
          becomes the new self.curr.
        * a closing tag moves self.curr to the parent of the node returned
          by self.findStartTag(name) and appends the line to its content.

        Relies on getAttr, isSpecialNames, addContentToAllParent and
        findStartTag, which are defined outside this block.
        '''
        for eachLine in self.html:
            for tag in re.findall(self.tagPattern, eachLine):
                if re.match(self.startPattern, tag):
                    # Read the element name from this tag only.  Scanning
                    # the whole line would create one node per name on the
                    # line for every opening tag, each with this tag's
                    # attributes.
                    nameMatch = re.search(self.namePattern, tag)
                    if nameMatch:
                        name = nameMatch.group()
                        node = TreeNode.Node()
                        attr = self.getAttr(tag)
                        node.name = name
                        # Attach the tag's attributes exactly once.
                        node.appendAttrList(attr)
                        # judge if the tag is a special node, if the answer
                        # is yes ignore content part and the tag must be a
                        # child node; if the tag is not a special node,
                        # change the node to curr node
                        if self.isSpecialNames(name):
                            node.parent = self.curr
                            self.curr.appendChild(node)
                            self.addContentToAllParent(eachLine, node)
                        else:
                            content = re.findall(self.contentPattern,
                                                 eachLine)
                            if content:
                                node.appendContent(content[0])
                            node.parent = self.curr
                            self.curr.appendChild(node)
                            self.curr = node

                # if the node is a close node, search all the way to top
                # to find its start tag and close it
                if re.match(self.closePattern, tag):
                    # As above, take the name from this closing tag only so
                    # each closing tag is processed a single time.
                    closeMatch = re.search(self.closeNamePattern, tag)
                    if closeMatch:
                        opened = self.findStartTag(closeMatch.group())
                        self.curr = opened.parent
                        self.curr.appendContent(eachLine)
def buildTree(rel, att, od):
    '''Build a tree from the (key, pointer) entries of rel/att and register it.

    rel, att -- passed to tn.getAttList to obtain the entries; both are
                also recorded in the directory file
    od       -- forwarded unchanged to every tn.insert call
                (NOTE(review): presumably the order of the tree -- confirm)

    Side effects: calls root.__write__(), appends [rel, att, root.node_page]
    to the JSON list stored in INDEX_PATH + 'directory.txt', and prints
    root.__print__().  Returns None.
    '''
    entries = tn.getAttList(rel, att)

    # create root node and put the first entry straight into it
    root = tn.Node(tn.getPage())
    first = entries[0]
    root = tn.insert(root, first[0], first[1], od, root)

    # every later entry is inserted at the node that tn.search finds for it
    for position in range(1, len(entries)):
        entry = entries[position]
        key, pointer = entry[0], entry[1]
        target = tn.search(root, key)[0]
        root = tn.insert(target, key, pointer, od, root)

    root.__write__()

    directory_file = INDEX_PATH + 'directory.txt'
    with open(directory_file) as f:
        directory = json.load(f)

    # update directory
    with open(directory_file, 'w') as f:
        directory.append([rel, att, root.node_page])
        f.write(json.dumps(directory))
    print(root.__print__())