Example #1
0
    def __invoke_callback(self, page_generator, element, callback):
        """Call ``callback(page_generator, element)`` and apply the replacement it requests.

        The callback asks for a replacement by raising one of the
        ``ReplaceWith*`` exceptions handled below.  When it returns normally,
        ``element`` is left where it is.
        """
        try:
            callback(page_generator, element)

        except ReplaceWithNothing:
            # Drop the element from its parent.
            element.parentNode.removeChild(element)

        except ReplaceWithText as exc:
            # Swap the element for a text node carrying exc.text.
            replacement_text = element.ownerDocument.createTextNode(exc.text)
            element.parentNode.replaceChild(replacement_text, element)

        except ReplaceWithNode as exc:
            # Swap the element for exc.node, importing it first when it
            # belongs to a different document.
            replacement = exc.node
            if replacement.ownerDocument is not element.ownerDocument:
                replacement = element.ownerDocument.importNode(replacement, True)
            element.parentNode.replaceChild(replacement, element)
            if exc.fix_namespaces:
                # page_generator.content holds HTML with no namespace set,
                # so map the XHTML namespace onto the empty one.
                substitute_namespaces(replacement, {XHTML_NAMESPACE: EMPTY_NAMESPACE})
                normalize_namespaces(replacement, strip_dups=True)

        except ReplaceWithHTML as exc:
            # Swap the element for the nodes obtained by parsing exc.html.

            # Run the HTML through TagSoupToXml and get a DOM document.
            soup_parser = TagSoupToXml(omit_comments=exc.omit_comments)
            soup_parser.feed(exc.html)
            soup_parser.close()
            parsed = soup_parser.todocument()

            # Locate the first <body> element child of the document element.
            body = next(
                (child for child in parsed.documentElement.childNodes
                 if child.nodeType == child.ELEMENT_NODE and child.localName == 'body'),
                None)
            if body is None:
                raise AssertionError("<body> element not found")

            # The freshly parsed HTML is expected to carry no namespace yet.
            assert body.namespaceURI == EMPTY_NAMESPACE

            # Insert a deep copy of each <body> child in front of the
            # placeholder, then remove the placeholder itself.
            for child in body.childNodes:
                imported = element.ownerDocument.importNode(child, True)
                element.parentNode.insertBefore(imported, element)
            element.parentNode.removeChild(element)
Example #2
0
    def load_content(self):
        """Load the source file into ``self.content`` as a DOM document.

        Steps, in order:

        1. Invoke the ``'load_content:before'`` filters.
        2. Read ``self.path_info.source_filename`` as UTF-8 text and feed it
           to ``TagSoupToXml`` (created with ``omit_comments=True``).
        3. Store the resulting document in ``self.content`` and replace the
           XHTML namespace with the empty namespace on its document element.
        4. Invoke the ``'load_content:after'`` filters.
        """
        self.invoke_filters('load_content:before')

        # Convert HTML tag soup to XML
        p = TagSoupToXml(omit_comments=True)  # Omit commented-out parts in the final output

        # Read through a context manager so the file handle is closed even if
        # reading or decoding fails.
        # TODO: support other encodings? (is that safe?)
        with open(self.path_info.source_filename, "rt", encoding="UTF-8") as source_file:
            source_text = source_file.read()
        p.feed(source_text)
        p.close()

        # Keep the parsed DOM document
        self.content = p.todocument()

        # Drop any "http://www.w3.org/1999/xhtml" namespace declarations
        substitute_namespaces(self.content.documentElement, {XHTML_NAMESPACE: EMPTY_NAMESPACE})
        normalize_namespaces(self.content.documentElement, strip_dups=True)

        self.invoke_filters('load_content:after')
Example #3
0
    def _early_process_entry(self, page_generator, entry):
        """Perform early in-place processing of an entry.

        Parses ``entry['atom:entry']`` and then:

        * stores the entry's ``id``, ``published`` and ``updated`` values in
          ``entry`` (an <updated> element is created from the published date
          when the Atom entry has none);
        * adds a <title> element built from ``entry['title']`` and a
          <link rel="alternate"> element pointing at
          ``page_generator.path_info.target_url`` when they are missing;
        * rewrites links in the Atom entry;
        * moves ``entry['summary']`` (when non-empty) and ``entry['body']``
          into Atom <summary> / <content> elements of type "xhtml", deleting
          those keys from ``entry``;
        * writes the serialized document back to ``entry['atom:entry']``.

        Raises:
            FGValueError: if more than one matching <link rel="alternate">
                element without ``hreflang`` is present.
            ValueError: from tuple unpacking, if the entry does not have
                exactly one <id> or <published> element, or has more than one
                <updated> element.
        """

        entryDocument = minidom.parseString(entry['atom:entry'])
        entryElement = entryDocument.documentElement
        page_content_type = self._framework.plugins['vars'].vars['page_content_type']

        # Extract the 'id' of the entry
        (idElement,) = getChildElementsNS(entryElement, ATOM_NAMESPACE, 'id')
        entry['id'] = getChildText(idElement).strip()

        # Extract and normalize the 'published' date of the entry
        (publishedElement,) = getChildElementsNS(entryElement, ATOM_NAMESPACE, 'published')
        entry['published'] = atom_datetime_to_utc(getChildText(publishedElement).strip())

        # Extract and normalize the 'updated' date of the entry; Create it if it doesn't exist.
        ee = tuple(getChildElementsNS(entryElement, ATOM_NAMESPACE, 'updated'))
        if ee:
            (updatedElement,) = ee  # there should be only one
        else:
            # Create an <updated> element using the 'published' date
            updatedElement = entryDocument.createElementNS(ATOM_NAMESPACE, 'updated')
            replaceChildText(updatedElement, entry['published'])
            entryElement.appendChild(updatedElement)
        entry['updated'] = atom_datetime_to_utc(getChildText(updatedElement).strip())

        # Create a <title> element if one does not already exist.
        ee = tuple(getChildElementsNS(entryElement, ATOM_NAMESPACE, 'title'))
        if not ee:
            titleElement = entryDocument.createElementNS(ATOM_NAMESPACE, 'title')
            titleElement.setAttribute('type', 'text')
            titleElement.appendChild(entryDocument.createTextNode(entry['title']))
            entryElement.appendChild(titleElement)

        # Create a <link rel="alternate"> element if one does not already exist.
        linkElement = None
        for e in getChildElementsNS(entryElement, ATOM_NAMESPACE, 'link'):
            rel = e.getAttribute('rel')
            link_type = e.getAttribute('type')  # not named `type`: avoid shadowing the builtin
            hreflang = e.getAttribute('hreflang')
            if rel == "alternate" and link_type == page_content_type and not hreflang:
                if linkElement is not None:
                    raise FGValueError('Conflicting <link rel="alternate" type=%r hreflang=%r> entries in %s' % (
                        page_content_type, hreflang, page_generator.path_info.source_filename,))
                linkElement = e
        if linkElement is None:
            linkElement = entryDocument.createElementNS(ATOM_NAMESPACE, 'link')
            linkElement.setAttribute('rel', 'alternate')
            linkElement.setAttribute('href', page_generator.path_info.target_url)
            linkElement.setAttribute('type', page_content_type)
            entryElement.appendChild(linkElement)

        # Rewrite URLs in the atom:entry element
        rewrite_links(entryElement, ATOM_CRITERIA, page_generator.path_info.target_url, page_generator.path_info.base_url, always_absolute=True)

        def append_xhtml_element(tag_name, entry_key):
            """Move the XML in ``entry[entry_key]`` into a new Atom <tag_name type="xhtml"> child."""
            fragmentDocument = minidom.parseString(entry[entry_key])

            # Rewrite URLs in the fragment
            rewrite_links(fragmentDocument.documentElement, HTML_CRITERIA,
                entry['path_info'].target_url, entry['path_info'].base_url, always_absolute=True)

            # Create the Atom wrapper element
            wrapperElement = entryDocument.createElementNS(ATOM_NAMESPACE, tag_name)
            wrapperElement.setAttribute('type', 'xhtml')
            entryElement.appendChild(wrapperElement)

            # Create XHTML <div> element
            divElement = entryDocument.createElementNS(XHTML_NAMESPACE, 'div')
            divElement.setAttributeNS(XMLNS_NAMESPACE, 'xmlns', XHTML_NAMESPACE)
            wrapperElement.appendChild(divElement)

            # Add data (deep copies, so the fragment document is left intact)
            for n in fragmentDocument.documentElement.childNodes:
                divElement.appendChild(entryDocument.importNode(n, True))

            # Elements with no namespace become XHTML elements
            substitute_namespaces(divElement, {EMPTY_NAMESPACE: XHTML_NAMESPACE})

            # Clean up: release the temporary document and drop the raw source
            fragmentDocument.unlink()
            del entry[entry_key]

        # Add a <summary> element, if applicable
        if entry['summary']:
            append_xhtml_element('summary', 'summary')

        # Add a <content> element
        append_xhtml_element('content', 'body')

        # Perform xmlns normalization
        normalize_namespaces(entryDocument.documentElement, strip_dups=True)

        # Update the new atom:entry document
        entry['atom:entry'] = entryDocument.toxml()

        # The DOM is no longer needed once serialized; release it like the
        # temporary fragment documents above.
        entryDocument.unlink()