def __invoke_callback(self, page_generator, element, callback):
    """Run *callback* on *element* and apply the replacement it requests.

    The callback is invoked as ``callback(page_generator, element)``.  It
    asks for *element* to be replaced by raising one of the exceptions
    below; if it returns normally, the document is left untouched here.

    * ``ReplaceWithNothing`` -- *element* is removed from its parent.
    * ``ReplaceWithText`` -- *element* is swapped for a text node that
      holds ``exc.text``.
    * ``ReplaceWithNode`` -- *element* is swapped for ``exc.node``, which
      is imported into the element's document first when it belongs to a
      different one.  When ``exc.fix_namespaces`` is true, the XHTML
      namespace on the new node is substituted by the empty namespace
      and the namespaces are normalized.
    * ``ReplaceWithHTML`` -- ``exc.html`` is parsed with ``TagSoupToXml``
      (``exc.omit_comments`` is passed through to the parser) and the
      children of the resulting ``<body>`` are inserted in place of
      *element*.

    Raises:
        AssertionError: if the parsed HTML has no ``<body>`` element
            directly under the document element, or if that element
            already carries a namespace.
    """
    try:
        callback(page_generator, element)
    except ReplaceWithNothing:
        # Drop the placeholder without putting anything in its place.
        element.parentNode.removeChild(element)
    except ReplaceWithText as request:
        # Swap the placeholder for a plain text node.
        owner_document = element.ownerDocument
        replacement = owner_document.createTextNode(request.text)
        element.parentNode.replaceChild(replacement, element)
    except ReplaceWithNode as request:
        # Swap the placeholder for the supplied node; a node that lives in
        # another document has to be imported (deep copy) first.
        replacement = request.node
        owner_document = element.ownerDocument
        if replacement.ownerDocument is not owner_document:
            replacement = owner_document.importNode(replacement, True)
        element.parentNode.replaceChild(replacement, element)
        if request.fix_namespaces:
            # page_generator.content uses HTML without specifying a
            # namespace, so bring the new node in line with that.
            substitute_namespaces(
                replacement, {XHTML_NAMESPACE: EMPTY_NAMESPACE}
            )
            normalize_namespaces(replacement, strip_dups=True)
    except ReplaceWithHTML as request:
        # Turn the HTML fragment into a DOM document via TagSoupToXml.
        parser = TagSoupToXml(omit_comments=request.omit_comments)
        parser.feed(request.html)
        parser.close()
        parsed = parser.todocument()

        # Locate the first <body> element directly under the root.
        body = next(
            (
                child
                for child in parsed.documentElement.childNodes
                if child.nodeType == child.ELEMENT_NODE
                and child.localName == 'body'
            ),
            None,
        )
        if body is None:
            raise AssertionError("<body> element not found")

        # At this stage, the HTML code has no namespace assigned yet.
        assert body.namespaceURI == EMPTY_NAMESPACE

        # Put a copy of every <body> child in front of the placeholder,
        # then remove the placeholder itself.
        owner_document = element.ownerDocument
        parent = element.parentNode
        for child in body.childNodes:
            parent.insertBefore(
                owner_document.importNode(child, True), element
            )
        parent.removeChild(element)
def load_content(self):
    """Parse the page's source file into ``self.content``.

    The file named by ``self.path_info.source_filename`` is read as
    UTF-8 text, converted from HTML tag soup to XML with ``TagSoupToXml``
    (comments are omitted), and stored as a DOM document in
    ``self.content``.  XHTML namespace declarations on the document
    element are then replaced by the empty namespace and the namespaces
    are normalized.

    The ``'load_content:before'`` and ``'load_content:after'`` filters
    are invoked around the whole operation.  Nothing is returned.
    """
    self.invoke_filters('load_content:before')

    # Convert HTML tag soup to XML; commented-out parts are omitted from
    # the final output.
    p = TagSoupToXml(omit_comments=True)

    # Read inside a context manager so the file handle is closed
    # deterministically instead of being left to the garbage collector.
    # TODO: support other encodings? (is that safe?)
    with open(
        self.path_info.source_filename, "rt", encoding="UTF-8"
    ) as source_file:
        markup = source_file.read()
    p.feed(markup)
    p.close()

    # Keep the resulting DOM document on the instance.
    self.content = p.todocument()

    # Drop any "http://www.w3.org/1999/xhtml" namespace declarations
    substitute_namespaces(
        self.content.documentElement, {XHTML_NAMESPACE: EMPTY_NAMESPACE}
    )
    normalize_namespaces(self.content.documentElement, strip_dups=True)

    self.invoke_filters('load_content:after')