def parse_node(elements, start_i, end_i):
    """Convert the token span ``elements[start_i..end_i]`` into a node list.

    Both bounds are inclusive. For a nested element they delimit the
    tokens sitting between a start tag and its end tag::

              start_i        end_i
        <tag>................</tag>

    Handling per token type:

    * ``TextNode``       -- appended to the result unchanged.
    * ``StartTag``       -- becomes a ``Node`` whose ``childNodes`` are
      parsed recursively from the tokens up to the index returned by
      ``findEnd``; scanning resumes right after that index.
    * ``SelfClosingTag`` -- becomes a ``Node(True)`` carrying the tag name
      and attributes (no children are assigned).
    * ``EndTag``         -- skipped.

    Returns the list of top-level nodes found in the span (empty when
    ``start_i > end_i``).

    Raises ``ValueError`` for a token of any other type.
    """
    collected = []
    pos = start_i

    while pos <= end_i:
        token = elements[pos]

        if isinstance(token, TextNode):
            collected.append(token)
            pos += 1
            continue

        if isinstance(token, StartTag):
            # findEnd is expected to give the index of the matching end
            # tag; everything strictly between belongs to this element.
            close_pos = findEnd(elements, pos)
            element = Node()
            element.tagname = token.tagname
            element.attributes = token.attributes
            element.childNodes = parse_node(elements, pos + 1, close_pos - 1)
            collected.append(element)
            pos = close_pos + 1
            continue

        if isinstance(token, SelfClosingTag):
            leaf = Node(True)
            leaf.tagname = token.tagname
            leaf.attributes = token.attributes
            collected.append(leaf)
            pos += 1
            continue

        if isinstance(token, EndTag):
            # An end tag seen here is simply stepped over.
            pos += 1
            continue

        raise ValueError("unknown element", token)

    return collected
def parse(xml):
    """Parse *xml* into an ``XMLDocument``.

    The input is tokenized with ``tokenize``; every token before the first
    ``StartTag`` is skipped and that tag becomes the document root, with
    its children built by ``parse_node`` from the tokens up to the index
    returned by ``findEnd``.

    If the token stream contains no ``StartTag`` at all, the first
    ``SelfClosingTag`` (e.g. ``<root/>``) is used as a childless root
    instead. This fallback only applies to inputs that previously could
    not be parsed, so results for documents with a regular root are
    unchanged.

    When the first token is a ``Prolog`` it is stored on the returned
    document as ``doc.prolog``.

    Raises:
        ValueError: if no root element (start tag or self-closing tag)
            is present, including for an empty token stream.
    """
    tokens = tokenize(xml)

    # Skip anything before the root node.
    start_i = next(
        (i for i, tok in enumerate(tokens) if isinstance(tok, StartTag)),
        None,
    )

    if start_i is not None:
        end_i = findEnd(tokens, start_i)
        root = Node()
        root.tagname = tokens[start_i].tagname
        root.attributes = tokens[start_i].attributes
        root.childNodes = parse_node(tokens, start_i + 1, end_i - 1)
    else:
        # No regular start tag: accept an empty-element tag as the root,
        # built the same way parse_node builds self-closing nodes.
        empty_i = next(
            (
                i
                for i, tok in enumerate(tokens)
                if isinstance(tok, SelfClosingTag)
            ),
            None,
        )
        if empty_i is None:
            # Fail with a clear message rather than an incidental
            # out-of-range error further down.
            raise ValueError("no root element found")
        root = Node(True)
        root.tagname = tokens[empty_i].tagname
        root.attributes = tokens[empty_i].attributes

    doc = XMLDocument(root)
    # tokens is guaranteed non-empty here because a root token was found.
    if isinstance(tokens[0], Prolog):
        doc.prolog = tokens[0]
    return doc