Ejemplo n.º 1
0
def _filter_set_title(pg):
    """Copy the <title> text from pg.content into pg.page's <title> element."""
    # Extract title text from pg.content.  Tuple-unpacking enforces that
    # exactly one <head> and one <title> exist (ValueError otherwise).
    (head_element,) = getChildElementsNS(pg.content.documentElement, EMPTY_NAMESPACE, 'head')
    (title_element,) = getChildElementsNS(head_element, EMPTY_NAMESPACE, 'title')
    title_text = getChildText(title_element)

    # Replace children of <title> element in pg.page.
    (title_element,) = pg.page.getElementsByTagName('title')
    # BUGFIX: iterate over a snapshot of childNodes.  removeChild() mutates
    # the live childNodes list, so iterating it directly skips every other
    # node and leaves stale children behind.
    for n in list(title_element.childNodes):
        title_element.removeChild(n)
    title_element.appendChild(pg.page.createTextNode(title_text))
Ejemplo n.º 2
0
    def _load_content(self, page_generator):
        """Extract the Atom entry data from a generated page and persist it."""
        content = page_generator.content

        # Data collected from this page.
        entry = {}

        # Locate the <head> and <body> elements (exactly one of each).
        (head_el,) = content.getElementsByTagName('head')
        (body_el,) = content.getElementsByTagName('body')

        # Locate the <atom:entry> element inside <head>.
        found = head_el.getElementsByTagNameNS(ATOM_NAMESPACE, 'entry')
        if not found:
            # No Atom feed entry on this page; discard any stale data.
            self._clear_entry_data(page_generator)
            return
        if len(found) > 1:
            # There should only be one atom:entry
            raise FGValueError("Too many Atom entries in %s" % (page_generator.path_info.source_filename,))
        (entry_el,) = found

        # Serialize the <atom:entry> element with all namespace information
        # included, by grafting a deep copy onto a scratch document.
        scratch_doc = minidom.parseString('<dummy/>')
        imported_entry = scratch_doc.importNode(entry_el, True)
        scratch_doc.documentElement.appendChild(imported_entry)
        normalize_namespaces(imported_entry)
        entry['atom:entry'] = imported_entry.toxml()

        # Locate the page summary (if any) marked with class="feed-summary",
        # stripping the marker class as a side effect.
        summaries = list(find_elements_with_class(content, "feed-summary", remove=True))
        if len(summaries) > 1:
            # There should only be one element with class="feed-summary"
            raise FGValueError('Too many elements have class="feed-summary" in %s' % (page_generator.path_info.source_filename,))
        entry['summary'] = summaries[0].toxml() if summaries else None

        # Store the page body.
        entry['body'] = body_el.toxml()

        # Store the page title.
        (title_el,) = getChildElementsNS(head_el, EMPTY_NAMESPACE, 'title')
        entry['title'] = getChildText(title_el)

        # Store the path_info.
        entry['path_info'] = page_generator.path_info

        # Early in-place processing, then write the entry to disk.
        self._early_process_entry(page_generator, entry)
        self._write_entry_data(page_generator, entry)
Ejemplo n.º 3
0
def _filter_copy_head(pg):
    """Merge the <head> contents of pg.content into pg.page's <head>."""
    # Find <head> elements of both pages (exactly one each).
    (srcHeadElement,) = pg.content.getElementsByTagName('head')
    (destHeadElement,) = pg.page.getElementsByTagName('head')

    # Import <head> element from pg.content into pg.page's DOM (but don't add
    # it to the document tree just yet).
    tmpHeadElement = pg.page.importNode(srcHeadElement, True)

    # Remove <title>, since it's handled elsewhere.
    # BUGFIX: the original `(n,) = ...` raised ValueError when no <title>
    # existed, so its `if n:` guard was dead code.  Tolerate zero titles,
    # matching the guard's evident intent.
    for n in tuple(getChildElementsNS(tmpHeadElement, EMPTY_NAMESPACE, 'title')):
        n.parentNode.removeChild(n)

    # Remove other items from <head>
    pg.invoke_filters('generate_page:filter_head', tmpHeadElement)

    # Copy the remaining nodes.
    # BUGFIX: appendChild() reparents each node, removing it from
    # tmpHeadElement's live childNodes list; iterating that list directly
    # skipped every other node.  Iterate a snapshot instead.
    for n in list(tmpHeadElement.childNodes):
        destHeadElement.appendChild(n)
Ejemplo n.º 4
0
    def _handle_news_element(self, page_generator, c_newsElement):
        """Expand a news-here element into a <div> of recent feed entries.

        Reads the previously generated Atom feed and instantiates the
        element's <template> once per entry (up to the optional 'limit'
        attribute).  Always raises ReplaceWithNode with the generated
        document element; raises ValueError if set_news_feed was not called.
        """
        if self._feed_url is None:
            raise ValueError("news-here element found before set_news_feed called")

        # Find the template element (exactly one expected).
        (c_templateElement,) = getChildElementsNS(c_newsElement, NEWS_NAMESPACE, 'template')

        # Create the result document.
        result_doc = minidom.parseString("<div/>")

        # Load the Atom feed.  BUGFIX: use a context manager so the file
        # handle is closed even if parsing fails (the original leaked it).
        with open(self._feed_path_info.output_filename, "rb") as feed_file:
            feed = minidom.parseString(feed_file.read())

        # Get the content-type of page links.
        page_content_type = self._framework.plugins['vars'].vars['page_content_type']

        # Get the maximum number of entries, if any (empty attribute means
        # unlimited; note "0" is a valid limit meaning "show nothing").
        limit_attr = c_newsElement.getAttribute('limit')
        limit = int(limit_attr) if limit_attr else None

        for i, f_entryElement in enumerate(getChildElementsNS(feed.documentElement, ATOM_NAMESPACE, 'entry')):
            # Don't output more than the specified number of articles.
            if limit is not None and i >= limit:
                break

            params = {}

            # Get entry title (must be a plain-text title).
            (f_titleElement,) = getChildElementsNS(f_entryElement, ATOM_NAMESPACE, 'title')
            assert f_titleElement.getAttribute('type') == 'text'
            params['title'] = getChildText(f_titleElement)

            # Get entry publication/update dates.
            params['published'] = getChildText(tuple(getChildElementsNS(f_entryElement, ATOM_NAMESPACE, 'published'))[0])
            params['updated'] = getChildText(tuple(getChildElementsNS(f_entryElement, ATOM_NAMESPACE, 'updated'))[0])

            # Get entry <link rel="alternate"> matching the page content type.
            for e in getChildElementsNS(f_entryElement, ATOM_NAMESPACE, 'link'):
                rel = e.getAttribute('rel')
                link_type = e.getAttribute('type')  # renamed: don't shadow builtin type()
                hreflang = e.getAttribute('hreflang')
                if rel == "alternate" and link_type == page_content_type and not hreflang:
                    params['href'] = e.getAttribute('href')
                    break
            else:
                raise RuntimeError("link not found")

            # Get entry summary (must be xhtml wrapping a single <div>).
            (f_summaryElement,) = getChildElementsNS(f_entryElement, ATOM_NAMESPACE, 'summary')
            assert f_summaryElement.getAttribute('type') == 'xhtml'
            (f_summaryDiv,) = (n for n in f_summaryElement.childNodes if n.nodeType == n.ELEMENT_NODE)
            params['summaryDiv'] = f_summaryDiv

            # Create per-entry <div> element.
            r_divElement = result_doc.createElement('div')
            result_doc.documentElement.appendChild(r_divElement)

            # Copy the template to the result.
            self.__copy_template_to_result(c_templateElement, r_divElement, params)

        # Namespace normalization.
        normalize_namespaces(result_doc.documentElement)

        # Replace the placeholder with the generated content.
        raise ReplaceWithNode(result_doc.documentElement)
Ejemplo n.º 5
0
    def _attach_xhtml_construct(self, entryElement, entry, source_key, atom_name):
        """Convert serialized XHTML in entry[source_key] into an Atom text construct.

        Parses the fragment, rewrites its links relative to the entry's
        path_info, and appends <atom_name type="xhtml"><div xmlns=XHTML>...</div>
        to entryElement.  Consumes (deletes) entry[source_key].
        """
        fragmentDocument = minidom.parseString(entry[source_key])

        # Rewrite URLs in the fragment.
        rewrite_links(fragmentDocument.documentElement, HTML_CRITERIA,
            entry['path_info'].target_url, entry['path_info'].base_url, always_absolute=True)

        # Create the Atom construct element (<summary> or <content>).
        constructElement = entryElement.ownerDocument.createElementNS(ATOM_NAMESPACE, atom_name)
        constructElement.setAttribute('type', 'xhtml')
        entryElement.appendChild(constructElement)

        # Create the XHTML <div> wrapper.
        divElement = entryElement.ownerDocument.createElementNS(XHTML_NAMESPACE, 'div')
        divElement.setAttributeNS(XMLNS_NAMESPACE, 'xmlns', XHTML_NAMESPACE)
        constructElement.appendChild(divElement)

        # Copy the fragment's children into the <div> (importNode copies, so
        # the source childNodes list is not mutated during iteration).
        for n in fragmentDocument.documentElement.childNodes:
            divElement.appendChild(divElement.ownerDocument.importNode(n, True))

        # Elements with no namespace become XHTML elements.
        substitute_namespaces(divElement, {EMPTY_NAMESPACE: XHTML_NAMESPACE})

        # Clean up.
        fragmentDocument.unlink()
        del entry[source_key]

    def _early_process_entry(self, page_generator, entry):
        """Perform early in-place processing of an entry.

        Normalizes id/published/updated, creates missing <title> and
        <link rel="alternate"> elements, converts the stored summary/body
        HTML into Atom <summary>/<content> constructs, and re-serializes
        entry['atom:entry'].
        """
        entryDocument = minidom.parseString(entry['atom:entry'])
        entryElement = entryDocument.documentElement
        page_content_type = self._framework.plugins['vars'].vars['page_content_type']

        # Extract the 'id' of the entry (exactly one <id> expected).
        (idElement,) = getChildElementsNS(entryElement, ATOM_NAMESPACE, 'id')
        entry['id'] = getChildText(idElement).strip()

        # Extract and normalize the 'published' date of the entry.
        (publishedElement,) = getChildElementsNS(entryElement, ATOM_NAMESPACE, 'published')
        entry['published'] = atom_datetime_to_utc(getChildText(publishedElement).strip())

        # Extract and normalize the 'updated' date; create it from
        # 'published' if it doesn't exist.
        ee = tuple(getChildElementsNS(entryElement, ATOM_NAMESPACE, 'updated'))
        if ee:
            (updatedElement,) = ee  # there should be only one
        else:
            updatedElement = entryDocument.createElementNS(ATOM_NAMESPACE, 'updated')
            replaceChildText(updatedElement, entry['published'])
            entryElement.appendChild(updatedElement)
        entry['updated'] = atom_datetime_to_utc(getChildText(updatedElement).strip())

        # Create a <title> element if one does not already exist.
        if not tuple(getChildElementsNS(entryElement, ATOM_NAMESPACE, 'title')):
            titleElement = entryDocument.createElementNS(ATOM_NAMESPACE, 'title')
            titleElement.setAttribute('type', 'text')
            titleElement.appendChild(entryDocument.createTextNode(entry['title']))
            entryElement.appendChild(titleElement)

        # Find the <link rel="alternate"> element matching this page's
        # content type; create one if it does not already exist.
        linkElement = None
        for e in getChildElementsNS(entryElement, ATOM_NAMESPACE, 'link'):
            rel = e.getAttribute('rel')
            link_type = e.getAttribute('type')  # renamed: don't shadow builtin type()
            hreflang = e.getAttribute('hreflang')
            if rel == "alternate" and link_type == page_content_type and not hreflang:
                if linkElement is not None:
                    raise FGValueError('Conflicting <link rel="alternate" type=%r hreflang=%r> entries in %s' % (
                        page_content_type, hreflang, page_generator.path_info.source_filename,))
                linkElement = e
        if not linkElement:
            linkElement = entryDocument.createElementNS(ATOM_NAMESPACE, 'link')
            linkElement.setAttribute('rel', 'alternate')
            linkElement.setAttribute('href', page_generator.path_info.target_url)
            linkElement.setAttribute('type', page_content_type)
            entryElement.appendChild(linkElement)

        # Rewrite URLs in the atom:entry element.
        rewrite_links(entryElement, ATOM_CRITERIA, page_generator.path_info.target_url, page_generator.path_info.base_url, always_absolute=True)

        # Add a <summary> element, if applicable.  (The original duplicated
        # this logic inline for summary and body, wrapped the body copy in a
        # pointless 'if True:', and assigned a never-used 'data = None'.)
        if entry['summary']:
            self._attach_xhtml_construct(entryElement, entry, 'summary', 'summary')

        # Add a <content> element from the page body.
        self._attach_xhtml_construct(entryElement, entry, 'body', 'content')

        # Perform xmlns normalization.
        normalize_namespaces(entryDocument.documentElement, strip_dups=True)

        # Update the new atom:entry document.
        entry['atom:entry'] = entryDocument.toxml()
Ejemplo n.º 6
0
    def handle_make_atom_feed(self, target_url):
        """Write the data we've collected so far as an Atom feed

        Usage: make-atom-feed TARGET_RELATIVE_URL
        """

        tp = TypicalPaths(self._framework, target_url)
        data_dir = self._get_feed_data_dir()

        def is_update_needed():
            """Return True if the output feed is missing or older than its inputs."""
            try:
                output_mtime = os.lstat(tp.output_filename).st_mtime
            except EnvironmentError as exc:
                if exc.errno != errno.ENOENT:
                    raise
                # The output file doesn't exist, so an update is needed
                return True

            # Output file exists.  Check timestamps.
            source_mtime = os.lstat(tp.source_filename).st_mtime
            if output_mtime < source_mtime:
                # The source file was modified, so an update is needed.
                return True

            for basename in fnmatch.filter(os.listdir(data_dir), "entry-*-stamp"):
                entry_mtime = os.lstat(os.path.join(data_dir, basename)).st_mtime
                if output_mtime < entry_mtime:
                    # one of the entries is newer than the output file, so an update is needed
                    return True

            return False

        if not is_update_needed():
            # No update needed
            print("skipping %s" % (tp.output_filename,))
            return

        # Make sure the output directory exists
        self._framework.plugins['StillWeb.BasicCommands'].ensure_path(tp.output_dir, tp.pathtuple[:-1])

        print("making %s (using %s)" % (tp.output_filename, tp.source_filename))

        # Load the entries
        entries = []
        regex = re.compile(r"^entry-([^-.]*)-stamp$")
        for basename in os.listdir(data_dir):
            m = regex.search(basename)
            if not m:
                continue
            filename = os.path.join(data_dir, "entry-%s-data" % (m.group(1),))
            try:
                # NOTE: pickle is acceptable here because these data files
                # are produced by this program itself, not untrusted input.
                # BUGFIX: 'with' closes the file even when pickle.load
                # raises (the original leaked the handle on that path).
                with open(filename, "rb") as f:
                    entry = pickle.load(f)
            except EnvironmentError as exc:
                if exc.errno != errno.ENOENT:
                    raise
                # Stamp file exists but the data file is gone; skip it.
                continue
            entries.append(entry)

        # Check for duplicate ids
        ids = {}
        for entry in entries:
            if entry['id'] in ids:
                raise FGValueError("Duplicate id %r in %s (already defined in %s)" % (entry['id'], entry['path_info'].source_filename, ids[entry['id']]))
            ids[entry['id']] = entry['path_info'].source_filename

        # Skip entries whose publication dates are in the future
        now = atom_datetime_to_utc(datetime.datetime.utcnow().isoformat() + "Z")
        unpublished_entries = [i for i, entry in enumerate(entries)
                               if entry['published'] > now]
        # Delete from the end so earlier indices stay valid.
        for i in reversed(unpublished_entries):
            # BUGFIX: the original printed the stale 'entry' loop variable
            # (always the last loaded entry) instead of the one being skipped.
            print("%s: skipping %s ('published' in the future)" % (tp.output_filename, entries[i]['path_info'].output_filename,))
            del entries[i]

        # Sort the entries by their publication date, newest first.
        entries.sort(key=lambda entry: atom_datetime_to_sort_key(entry['published']), reverse=True)

        # Find the most recent update
        if not entries:
            raise FGValueError("Refusing to make empty feed")
        most_recent_update = entries[0]['updated']

        # Load and parse the template file.  BUGFIX: close the file handle.
        with open(tp.source_filename, "rb") as template_file:
            feedDocument = minidom.parseString(template_file.read())
        feedElement = feedDocument.documentElement
        assert (feedElement.namespaceURI, feedElement.localName) == (ATOM_NAMESPACE, "feed")

        # Set <updated> to the newest entry's <updated> (or <published>) field
        if tuple(getChildElementsNS(feedDocument, ATOM_NAMESPACE, "updated")):
            raise FGValueError("Template contains auto-generated <updated> field")
        updatedElement = feedDocument.createElementNS(ATOM_NAMESPACE, 'updated')
        feedElement.appendChild(updatedElement)
        replaceChildText(updatedElement, most_recent_update)

        # Create a <link rel="self"> element if one does not already exist.
        for linkElement in getChildElementsNS(feedElement, ATOM_NAMESPACE, "link"):
            if linkElement.getAttribute('rel') == 'self':
                break
        else:
            linkElement = feedDocument.createElementNS(ATOM_NAMESPACE, 'link')
            linkElement.setAttribute('rel', 'self')
            linkElement.setAttribute('type', ATOM_CONTENT_TYPE)
            linkElement.setAttribute('href', tp.target_url)
            feedElement.appendChild(linkElement)

        # Do URL path substitution
        rewrite_links(feedElement, ATOM_CRITERIA, tp.target_url, tp.base_url, always_absolute=True)

        # Add the entries
        for entry in entries:
            # Create an <entry> element
            entryElement = feedDocument.importNode(minidom.parseString(entry['atom:entry']).documentElement, True)
            assert (entryElement.namespaceURI, entryElement.localName) == (ATOM_NAMESPACE, 'entry')
            feedElement.appendChild(entryElement)

        # Write the feed to the output file, removing any partial output on
        # failure so a broken feed is never left behind.
        if os.path.exists(tp.output_filename):
            os.unlink(tp.output_filename)
        try:
            with open(tp.output_filename, "wb") as output_file:
                output_file.write(feedDocument.toxml("UTF-8"))
        except BaseException:
            os.unlink(tp.output_filename)
            raise