Exemple #1
0
    def rewrite_links(self, node, callback_func):
        """Rewrite every link found in a DOM subtree, in place.

        `node` must be a document node or an element node.  A document
        node is handled by delegating to its document element.

        Every match reported by `self.match_element` for an element is
        passed through `callback_func(url, criterion)`, where `url` is the
        current URL and `criterion` is the matched criterion.  The value
        returned by the callback replaces the URL:

        - for an element match, the URL is the element's (stripped) child
          text, and the replacement becomes its new child text;
        - for an attribute match, the URL is the value of the matched
          (namespaceURI, localName) attribute, which is overwritten.

        Child elements are then processed recursively.

        Raises AssertionError if `node` has an unsupported node type or if
        `self.match_element` reports an unknown match type.
        """
        assert node.nodeType in (node.DOCUMENT_NODE, node.ELEMENT_NODE)
        if node.nodeType == node.DOCUMENT_NODE:
            # Documents carry no links themselves; start at the root element.
            return self.rewrite_links(node.documentElement, callback_func)

        # Apply the callback to each match reported for this element.
        for (kind, target, criterion) in self.match_element(node):
            if kind == self.MATCHED_ELEMENT:
                # The URL is the text content of the element itself.
                old_url = getChildText(node).strip()
                new_url = callback_func(old_url, criterion)
                replaceChildText(node, new_url)
                continue

            if kind == self.MATCHED_ATTRIBUTE:
                # The URL is stored in a namespace-qualified attribute.
                (attr_namespace, attr_name) = target
                old_url = node.getAttributeNS(attr_namespace, attr_name)
                new_url = callback_func(old_url, criterion)
                node.setAttributeNS(attr_namespace, attr_name, new_url)
                continue

            raise AssertionError("Unrecognized m_type")

        # Recurse into child elements; other node types are skipped.
        for child in node.childNodes:
            if child.nodeType != node.ELEMENT_NODE:
                continue
            self.rewrite_links(child, callback_func)
Exemple #2
0
    def _load_content(self, page_generator):
        """Collect the Atom entry data of a page and store it.

        Reads `page_generator.content`, which must contain exactly one
        <head> and exactly one <body> element (the tuple unpacking raises
        ValueError otherwise).

        If <head> contains no atom:entry element, the stored entry data
        for the page is cleared via `self._clear_entry_data` and nothing
        else happens.  Otherwise a dict with the keys 'atom:entry',
        'summary', 'body', 'title' and 'path_info' is built, passed to
        `self._early_process_entry`, and then written out with
        `self._write_entry_data`.

        Raises FGValueError if the page has more than one atom:entry
        element or more than one element with class="feed-summary".
        """
        content = page_generator.content

        # Locate the single <head> and the single <body> of the page.
        (head,) = page_generator.content.getElementsByTagName('head')
        (body,) = page_generator.content.getElementsByTagName('body')

        # Look for the <atom:entry> element inside <head>.
        atom_entries = head.getElementsByTagNameNS(ATOM_NAMESPACE, 'entry')
        if not atom_entries:
            # The page is not a feed entry; drop any stored data for it.
            self._clear_entry_data(page_generator)
            return
        if len(atom_entries) > 1:
            # A page may describe at most one feed entry.
            raise FGValueError("Too many Atom entries in %s" % (page_generator.path_info.source_filename,))
        (atom_entry,) = atom_entries

        entry = {}

        # Serialize the <atom:entry> element.  It is first imported into a
        # scratch document and passed through normalize_namespaces() so the
        # serialized form includes its namespace information.
        scratch = minidom.parseString('<dummy/>')
        imported_entry = scratch.importNode(atom_entry, True)
        scratch.documentElement.appendChild(imported_entry)
        normalize_namespaces(imported_entry)
        entry['atom:entry'] = imported_entry.toxml()

        # Collect the class="feed-summary" elements.  This is called with
        # remove=True and must run before the body is serialized below.
        summaries = list(find_elements_with_class(content, "feed-summary", remove=True))
        if len(summaries) > 1:
            # Only one summary per page is allowed.
            raise FGValueError('Too many elements have class="feed-summary" in %s' % (page_generator.path_info.source_filename,))
        # None marks a page without a summary.
        entry['summary'] = summaries[0].toxml() if summaries else None

        # Serialized page body.
        entry['body'] = body.toxml()

        # Page title: text of the single <title> child of <head>.
        (title,) = getChildElementsNS(head, EMPTY_NAMESPACE, 'title')
        entry['title'] = getChildText(title)

        # Path information of the page.
        entry['path_info'] = page_generator.path_info

        # Early in-place processing, then persist the entry.
        self._early_process_entry(page_generator, entry)
        self._write_entry_data(page_generator, entry)
Exemple #3
0
def _filter_set_title(pg):
    """Copy the title text of `pg.content` into the <title> of `pg.page`.

    The title text is read from the single <title> child of the single
    <head> child of `pg.content.documentElement`.  Every existing child of
    the single <title> element found in `pg.page` is removed and replaced
    by one text node holding that text.

    Raises ValueError (from the tuple unpacking) if any of these lookups
    does not yield exactly one element.
    """
    # Extract title text from pg.content
    (head_element,) = getChildElementsNS(pg.content.documentElement, EMPTY_NAMESPACE, 'head')
    (source_title_element,) = getChildElementsNS(head_element, EMPTY_NAMESPACE, 'title')
    title_text = getChildText(source_title_element)

    # Replace children of <title> element in pg.page
    (title_element,) = pg.page.getElementsByTagName('title')
    # Iterate over a snapshot of the children: removeChild() shrinks the
    # live childNodes list, so iterating that list directly skips every
    # other child and leaves stale nodes behind when <title> has more than
    # one child.
    for n in list(title_element.childNodes):
        title_element.removeChild(n)
    title_element.appendChild(pg.page.createTextNode(title_text))
Exemple #4
0
 def _handle_maxima_element(self, page_generator, element):
     """Return the placeholder for the Maxima expression held by `element`.

     The expression is the child text of `element`.  The placeholder is
     requested with force_img=True only when the element's 'force'
     attribute equals 'img'.  `page_generator` is not used here.
     """
     expression = getChildText(element)
     force_img = element.getAttribute('force') == 'img'
     return self.maxima_expression_placeholder(expression, force_img=force_img)
Exemple #5
0
 def _handle_math_element(self, page_generator, element):
     """Pass the math source held by `element` to `self.math_placeholder`.

     The source is the child text of `element`.  force_img=True is passed
     only when the element's 'force' attribute equals 'img'.
     `page_generator` is not used here.

     The result of `self.math_placeholder` is discarded, so this method
     returns None.
     """
     # NOTE(review): the sibling _handle_maxima_element returns its
     # placeholder while this method does not -- confirm whether
     # math_placeholder delivers its result another way (e.g. by raising)
     # or whether a `return` is missing here.
     source = getChildText(element)
     force_img = element.getAttribute('force') == 'img'
     self.math_placeholder(source, force_img=force_img)
Exemple #6
0
    def _handle_news_element(self, page_generator, c_newsElement):
        """Expand a news element into one <div> per Atom feed entry.

        The Atom feed is read from `self._feed_path_info.output_filename`.
        For each atom:entry child of the feed's root element (at most
        `limit` of them, when the news element has a non-empty 'limit'
        attribute) the template child of `c_newsElement` is copied into a
        fresh <div> via `self.__copy_template_to_result`, with these
        parameters:

        - 'title':      text of the entry's atom:title (must be type="text")
        - 'published':  text of the first atom:published element
        - 'updated':    text of the first atom:updated element
        - 'href':       href of the first atom:link with rel="alternate",
                        type equal to the 'page_content_type' variable and
                        no hreflang
        - 'summaryDiv': the single element child of the entry's
                        atom:summary (must be type="xhtml")

        `page_generator` is not used here.

        This method never returns normally: the result is delivered by
        raising ReplaceWithNode with the root <div> of the result document.

        Raises ValueError if `self._feed_url` is None (or if a lookup that
        expects exactly one element finds another count, or 'limit' is not
        an integer), RuntimeError if an entry has no suitable link, and
        AssertionError if a title or summary has an unexpected type.
        """
        if self._feed_url is None:
            raise ValueError("news-here element found before set_news_feed called")

        # Find the template element
        (c_templateElement,) = getChildElementsNS(c_newsElement, NEWS_NAMESPACE, 'template')

        # Create the result document
        result_doc = minidom.parseString("<div/>")

        # Load the Atom feed.  The file is closed as soon as it has been
        # read instead of being left open until garbage collection.
        with open(self._feed_path_info.output_filename, "rb") as feed_file:
            feed = minidom.parseString(feed_file.read())

        # Get the content-type of page links
        page_content_type = self._framework.plugins['vars'].vars['page_content_type']

        # Get the maximum number of entries (if any); a missing or empty
        # attribute means "no limit".
        limit = c_newsElement.getAttribute('limit')
        if not limit:
            limit = None
        else:
            limit = int(limit)

        for i, f_entryElement in enumerate(getChildElementsNS(feed.documentElement, ATOM_NAMESPACE, 'entry')):
            # Don't output more than the specified number of articles
            if limit is not None and i >= limit:
                break

            params = {}

            # Get entry title
            (f_titleElement,) = getChildElementsNS(f_entryElement, ATOM_NAMESPACE, 'title')
            assert f_titleElement.getAttribute('type') == 'text'
            params['title'] = getChildText(f_titleElement)

            # Get entry publication/update dates (first matching element)
            params['published'] = getChildText(tuple(getChildElementsNS(f_entryElement, ATOM_NAMESPACE, 'published'))[0])
            params['updated'] = getChildText(tuple(getChildElementsNS(f_entryElement, ATOM_NAMESPACE, 'updated'))[0])

            # Get entry <link rel="alternate" type=page_content_type> that
            # has no hreflang; the first match wins.
            for f_linkElement in getChildElementsNS(f_entryElement, ATOM_NAMESPACE, 'link'):
                rel = f_linkElement.getAttribute('rel')
                link_type = f_linkElement.getAttribute('type')
                hreflang = f_linkElement.getAttribute('hreflang')
                if rel == "alternate" and link_type == page_content_type and not hreflang:
                    params['href'] = f_linkElement.getAttribute('href')
                    break
            else:
                raise RuntimeError("link not found")

            # Get entry summary: the single element child of <atom:summary>
            (f_summaryElement,) = getChildElementsNS(f_entryElement, ATOM_NAMESPACE, 'summary')
            assert f_summaryElement.getAttribute('type') == 'xhtml'
            (f_summaryDiv,) = (n for n in f_summaryElement.childNodes if n.nodeType == n.ELEMENT_NODE)
            params['summaryDiv'] = f_summaryDiv

            # Create per-entry <div> element
            r_divElement = result_doc.createElement('div')
            result_doc.documentElement.appendChild(r_divElement)

            # Copy the template to the result
            self.__copy_template_to_result(c_templateElement, r_divElement, params)

        # Namespace normalization
        normalize_namespaces(result_doc.documentElement)

        # Replace the placeholder
        raise ReplaceWithNode(result_doc.documentElement)
Exemple #7
0
    def _early_process_entry(self, page_generator, entry):
        """Perform early in-place processing of an entry.

        `entry` is a dict.  Keys read: 'atom:entry' (serialized atom:entry
        element), 'title', 'summary', 'body' and 'path_info'.

        On return:

        - 'id', 'published' and 'updated' hold values extracted from the
          atom:entry element (the dates passed through
          `atom_datetime_to_utc`);
        - 'atom:entry' holds the re-serialized element, completed with an
          <updated>, <title> and <link rel="alternate"> element where they
          were missing, a <summary type="xhtml"> element when
          entry['summary'] is non-empty, and a <content type="xhtml">
          element built from entry['body'], with URLs rewritten by
          `rewrite_links(..., always_absolute=True)`;
        - 'body' has been deleted, and so has 'summary' when it was
          non-empty (an empty/None 'summary' is left untouched).

        Raises FGValueError if the entry contains more than one matching
        <link rel="alternate"> element, and ValueError (from the tuple
        unpacking) if <id>, <published> or <updated> is not unique.
        """

        entryDocument = minidom.parseString(entry['atom:entry'])
        entryElement = entryDocument.documentElement
        page_content_type = self._framework.plugins['vars'].vars['page_content_type']

        def append_xhtml_element(tag_name, xml_source):
            # Append <atom:tag_name type="xhtml"><div xmlns=XHTML>...</div>
            # to the entry, where "..." is a copy of the children of the
            # root element of the serialized fragment `xml_source`.
            sourceDocument = minidom.parseString(xml_source)

            # Rewrite URLs in the fragment
            rewrite_links(sourceDocument.documentElement, HTML_CRITERIA,
                entry['path_info'].target_url, entry['path_info'].base_url, always_absolute=True)

            # Create the Atom wrapper element
            wrapperElement = entryDocument.createElementNS(ATOM_NAMESPACE, tag_name)
            wrapperElement.setAttribute('type', 'xhtml')
            entryElement.appendChild(wrapperElement)

            # Create XHTML <div> element
            divElement = entryDocument.createElementNS(XHTML_NAMESPACE, 'div')
            divElement.setAttributeNS(XMLNS_NAMESPACE, 'xmlns', XHTML_NAMESPACE)
            wrapperElement.appendChild(divElement)

            # Add data.  importNode() copies each child, so the source
            # child list is not modified while it is being iterated.
            for n in sourceDocument.documentElement.childNodes:
                divElement.appendChild(entryDocument.importNode(n, True))

            # Elements with no namespace become XHTML elements
            substitute_namespaces(divElement, {EMPTY_NAMESPACE: XHTML_NAMESPACE})

            # Clean up the temporary document
            sourceDocument.unlink()

        # Extract the 'id' of the entry
        (idElement,) = getChildElementsNS(entryElement, ATOM_NAMESPACE, 'id')
        entry['id'] = getChildText(idElement).strip()

        # Extract and normalize the 'published' date of the entry
        (publishedElement,) = getChildElementsNS(entryElement, ATOM_NAMESPACE, 'published')
        entry['published'] = atom_datetime_to_utc(getChildText(publishedElement).strip())

        # Extract and normalize the 'updated' date of the entry; Create it if it doesn't exist.
        updatedElements = tuple(getChildElementsNS(entryElement, ATOM_NAMESPACE, 'updated'))
        if updatedElements:
            (updatedElement,) = updatedElements  # there should be only one
        else:
            # Create an <updated> element using the 'published' date
            updatedElement = entryDocument.createElementNS(ATOM_NAMESPACE, 'updated')
            replaceChildText(updatedElement, entry['published'])
            entryElement.appendChild(updatedElement)
        entry['updated'] = atom_datetime_to_utc(getChildText(updatedElement).strip())

        # Create a <title> element if one does not already exist.
        if not tuple(getChildElementsNS(entryElement, ATOM_NAMESPACE, 'title')):
            titleElement = entryDocument.createElementNS(ATOM_NAMESPACE, 'title')
            titleElement.setAttribute('type', 'text')
            titleElement.appendChild(entryDocument.createTextNode(entry['title']))
            entryElement.appendChild(titleElement)

        # Create a <link rel="alternate"> element if one does not already exist.
        # Only links with the page content type and no hreflang count.
        linkElement = None
        for e in getChildElementsNS(entryElement, ATOM_NAMESPACE, 'link'):
            rel = e.getAttribute('rel')
            link_type = e.getAttribute('type')
            hreflang = e.getAttribute('hreflang')
            if rel == "alternate" and link_type == page_content_type and not hreflang:
                if linkElement is not None:
                    raise FGValueError('Conflicting <link rel="alternate" type=%r hreflang=%r> entries in %s' % (
                        page_content_type, hreflang, page_generator.path_info.source_filename,))
                linkElement = e
        if linkElement is None:
            linkElement = entryDocument.createElementNS(ATOM_NAMESPACE, 'link')
            linkElement.setAttribute('rel', 'alternate')
            linkElement.setAttribute('href', page_generator.path_info.target_url)
            linkElement.setAttribute('type', page_content_type)
            entryElement.appendChild(linkElement)

        # Rewrite URLs in the atom:entry element
        rewrite_links(entryElement, ATOM_CRITERIA, page_generator.path_info.target_url, page_generator.path_info.base_url, always_absolute=True)

        # Add a <summary> element, if applicable
        if entry['summary']:
            append_xhtml_element('summary', entry['summary'])
            del entry['summary']

        # Add a <content> element
        append_xhtml_element('content', entry['body'])
        del entry['body']

        # Perform xmlns normalization
        normalize_namespaces(entryDocument.documentElement, strip_dups=True)

        # Update the new atom:entry document
        entry['atom:entry'] = entryDocument.toxml()