def __invoke_callback(self, page_generator, element, callback):
    """Run *callback* on *element* and apply the replacement it asks for.

    The callback is invoked as ``callback(page_generator, element)``.  It
    requests a replacement by raising one of the ``ReplaceWith*`` exceptions,
    each of which is handled here by editing the DOM tree around *element*.
    When the callback returns normally, the tree is left untouched.

    Raises AssertionError if the HTML supplied through ReplaceWithHTML does
    not parse into a document containing a <body> element.
    """
    try:
        callback(page_generator, element)

    except ReplaceWithNothing:
        # Drop the element from the tree altogether.
        element.parentNode.removeChild(element)

    except ReplaceWithText as request:
        # Swap the element for a text node carrying the requested text.
        replacement = element.ownerDocument.createTextNode(request.text)
        element.parentNode.replaceChild(replacement, element)

    except ReplaceWithNode as request:
        # Swap the element for the supplied node, importing it first when it
        # belongs to a different document.
        owner = element.ownerDocument
        replacement = request.node
        if replacement.ownerDocument is not owner:
            replacement = owner.importNode(replacement, True)
        element.parentNode.replaceChild(replacement, element)

        if request.fix_namespaces:
            # page_generator.content uses HTML without specifying a namespace,
            # so map the XHTML namespace onto the empty one.
            substitute_namespaces(replacement, {XHTML_NAMESPACE: EMPTY_NAMESPACE})
            normalize_namespaces(replacement, strip_dups=True)

    except ReplaceWithHTML as request:
        # Swap the element for the nodes obtained by parsing the given HTML.

        # Turn the tag soup into a DOM document.
        soup_parser = TagSoupToXml(omit_comments=request.omit_comments)
        soup_parser.feed(request.html)
        soup_parser.close()
        parsed = soup_parser.todocument()

        # Locate the first <body> element directly under the document root.
        body = next(
            (
                child
                for child in parsed.documentElement.childNodes
                if child.nodeType == child.ELEMENT_NODE
                and child.localName == 'body'
            ),
            None,
        )
        if body is None:
            raise AssertionError("<body> element not found")

        # Freshly parsed HTML has not been assigned a namespace yet.
        assert body.namespaceURI == EMPTY_NAMESPACE

        # Insert copies of the <body> children ahead of the placeholder, then
        # take the placeholder itself out of the tree.
        parent = element.parentNode
        owner = element.ownerDocument
        for child in body.childNodes:
            parent.insertBefore(owner.importNode(child, True), element)
        parent.removeChild(element)
def load_content(self):
    """Parse the page's source file and store the result in ``self.content``.

    The file named by ``self.path_info.source_filename`` is read as UTF-8,
    converted from HTML tag soup to a DOM document, and stripped of XHTML
    namespace declarations.  The 'load_content:before' and
    'load_content:after' filters are invoked around the work.
    """
    self.invoke_filters('load_content:before')

    # Convert HTML tag soup to XML
    p = TagSoupToXml(omit_comments=True)   # Omit commented-out parts in the final output

    # Read inside a `with` block so the file handle is closed promptly, even
    # if reading or decoding fails.
    # TODO: support other encodings? (is that safe?)
    with open(self.path_info.source_filename, "rt", encoding="UTF-8") as source_file:
        p.feed(source_file.read())
    p.close()

    # Keep the parsed DOM document
    self.content = p.todocument()

    # Drop any "http://www.w3.org/1999/xhtml" namespace declarations
    substitute_namespaces(self.content.documentElement, {XHTML_NAMESPACE: EMPTY_NAMESPACE})
    normalize_namespaces(self.content.documentElement, strip_dups=True)

    self.invoke_filters('load_content:after')
def _early_process_entry(self, page_generator, entry):
    """Perform early in-place processing of an entry.

    Parses the Atom entry stored in ``entry['atom:entry']``, then:

    * copies the entry's id, published and updated values into ``entry``
      (creating an <updated> element from the published date when missing);
    * adds <title> and <link rel="alternate"> elements when absent;
    * rewrites links in the entry to absolute URLs;
    * embeds ``entry['summary']`` (when non-empty) and ``entry['body']`` as
      XHTML <summary> / <content> elements and removes those keys;
    * writes the serialized result back to ``entry['atom:entry']``.

    Raises FGValueError when more than one matching <link rel="alternate">
    element without an hreflang is present.  The <id> and <published>
    elements are unpacked as one-element sequences, so an entry that does
    not have exactly one of each fails at that point.
    """
    entryDocument = minidom.parseString(entry['atom:entry'])
    entryElement = entryDocument.documentElement
    page_content_type = self._framework.plugins['vars'].vars['page_content_type']

    def append_xhtml_child(tag_name, markup):
        """Append an Atom <tag_name type="xhtml"> element wrapping *markup*."""
        fragmentDocument = minidom.parseString(markup)

        # Rewrite URLs in the fragment
        rewrite_links(fragmentDocument.documentElement, HTML_CRITERIA,
                      entry['path_info'].target_url,
                      entry['path_info'].base_url,
                      always_absolute=True)

        # Create the Atom wrapper element
        wrapperElement = entryElement.ownerDocument.createElementNS(ATOM_NAMESPACE, tag_name)
        wrapperElement.setAttribute('type', 'xhtml')
        entryElement.appendChild(wrapperElement)

        # Create XHTML <div> element
        divElement = entryElement.ownerDocument.createElementNS(XHTML_NAMESPACE, 'div')
        divElement.setAttributeNS(XMLNS_NAMESPACE, 'xmlns', XHTML_NAMESPACE)
        wrapperElement.appendChild(divElement)

        # Add data (importNode copies, so the fragment is not modified here)
        for n in fragmentDocument.documentElement.childNodes:
            divElement.appendChild(divElement.ownerDocument.importNode(n, True))

        # Elements with no namespace become XHTML elements
        substitute_namespaces(divElement, {EMPTY_NAMESPACE: XHTML_NAMESPACE})

        # Clean up the temporary document
        fragmentDocument.unlink()

    # Extract the 'id' of the entry
    (idElement,) = getChildElementsNS(entryElement, ATOM_NAMESPACE, 'id')
    entry['id'] = getChildText(idElement).strip()

    # Extract and normalize the 'published' date of the entry
    (publishedElement,) = getChildElementsNS(entryElement, ATOM_NAMESPACE, 'published')
    entry['published'] = atom_datetime_to_utc(getChildText(publishedElement).strip())

    # Extract and normalize the 'updated' date of the entry; create it if it doesn't exist.
    ee = tuple(getChildElementsNS(entryElement, ATOM_NAMESPACE, 'updated'))
    if ee:
        (updatedElement,) = ee  # there should be only one
    else:
        # Create an <updated> element using the 'published' date
        updatedElement = entryDocument.createElementNS(ATOM_NAMESPACE, 'updated')
        replaceChildText(updatedElement, entry['published'])
        entryElement.appendChild(updatedElement)
    entry['updated'] = atom_datetime_to_utc(getChildText(updatedElement).strip())

    # Create a <title> element if one does not already exist.
    ee = tuple(getChildElementsNS(entryElement, ATOM_NAMESPACE, 'title'))
    if not ee:
        titleElement = entryDocument.createElementNS(ATOM_NAMESPACE, 'title')
        titleElement.setAttribute('type', 'text')
        titleElement.appendChild(entryDocument.createTextNode(entry['title']))
        entryElement.appendChild(titleElement)

    # Create a <link rel="alternate"> element if one does not already exist.
    linkElement = None
    for e in getChildElementsNS(entryElement, ATOM_NAMESPACE, 'link'):
        rel = e.getAttribute('rel')
        link_type = e.getAttribute('type')
        hreflang = e.getAttribute('hreflang')
        if rel == "alternate" and link_type == page_content_type and not hreflang:
            if linkElement is not None:
                raise FGValueError('Conflicting <link rel="alternate" type=%r hreflang=%r> entries in %s' % (
                    page_content_type, hreflang, page_generator.path_info.source_filename,))
            linkElement = e
    if linkElement is None:
        linkElement = entryDocument.createElementNS(ATOM_NAMESPACE, 'link')
        linkElement.setAttribute('rel', 'alternate')
        linkElement.setAttribute('href', page_generator.path_info.target_url)
        linkElement.setAttribute('type', page_content_type)
        entryElement.appendChild(linkElement)

    # Rewrite URLs in the atom:entry element
    rewrite_links(entryElement, ATOM_CRITERIA,
                  page_generator.path_info.target_url,
                  page_generator.path_info.base_url,
                  always_absolute=True)

    # Add a <summary> element, if applicable
    if entry['summary']:
        append_xhtml_child('summary', entry['summary'])
        del entry['summary']

    # Add a <content> element
    append_xhtml_child('content', entry['body'])
    del entry['body']

    # Perform xmlns normalization
    normalize_namespaces(entryDocument.documentElement, strip_dups=True)

    # Update the new atom:entry document
    entry['atom:entry'] = entryDocument.toxml()