Esempio n. 1
0
    def _early_process_entry(self, page_generator, entry):
        """Perform early in-place processing of an entry.

        The serialized Atom entry stored in ``entry['atom:entry']`` is parsed,
        completed, and written back to the same key.

        Keys written to ``entry``:
          * ``'id'``, ``'published'``, ``'updated'`` -- taken from the
            corresponding Atom child elements (dates are passed through
            ``atom_datetime_to_utc``).
          * ``'atom:entry'`` -- the re-serialized document.

        Keys removed from ``entry``:
          * ``'summary'`` (only when it held a truthy value) and ``'body'``,
            after their markup has been embedded in the Atom document.

        Elements added when missing: ``<updated>`` (copied from the published
        date), ``<title>`` (from ``entry['title']``) and a
        ``<link rel="alternate">`` pointing at the page's target URL.

        Raises FGValueError if more than one matching alternate link exists.
        The single-element tuple unpacking of ``<id>``, ``<published>`` and
        ``<updated>`` fails (presumably with ValueError) unless exactly one
        such element is present.
        """

        entryDocument = minidom.parseString(entry['atom:entry'])
        try:
            entryElement = entryDocument.documentElement
            page_content_type = self._framework.plugins['vars'].vars['page_content_type']

            # Extract the 'id' of the entry
            (idElement,) = getChildElementsNS(entryElement, ATOM_NAMESPACE, 'id')
            entry['id'] = getChildText(idElement).strip()

            # Extract and normalize the 'published' date of the entry
            (publishedElement,) = getChildElementsNS(entryElement, ATOM_NAMESPACE, 'published')
            entry['published'] = atom_datetime_to_utc(getChildText(publishedElement).strip())

            # Extract and normalize the 'updated' date of the entry; create it if it doesn't exist.
            updatedElements = tuple(getChildElementsNS(entryElement, ATOM_NAMESPACE, 'updated'))
            if updatedElements:
                (updatedElement,) = updatedElements  # there should be only one
            else:
                # Create an <updated> element using the 'published' date
                updatedElement = entryDocument.createElementNS(ATOM_NAMESPACE, 'updated')
                replaceChildText(updatedElement, entry['published'])
                entryElement.appendChild(updatedElement)
            entry['updated'] = atom_datetime_to_utc(getChildText(updatedElement).strip())

            # Create a <title> element if one does not already exist.
            if not tuple(getChildElementsNS(entryElement, ATOM_NAMESPACE, 'title')):
                titleElement = entryDocument.createElementNS(ATOM_NAMESPACE, 'title')
                titleElement.setAttribute('type', 'text')
                titleElement.appendChild(entryDocument.createTextNode(entry['title']))
                entryElement.appendChild(titleElement)

            # Create a <link rel="alternate"> element if one does not already exist.
            # Only links of the page's own content type and without an hreflang
            # count as "the" alternate link; two of them is an error.
            linkElement = None
            for candidate in getChildElementsNS(entryElement, ATOM_NAMESPACE, 'link'):
                rel = candidate.getAttribute('rel')
                link_type = candidate.getAttribute('type')
                hreflang = candidate.getAttribute('hreflang')
                if rel == "alternate" and link_type == page_content_type and not hreflang:
                    if linkElement is not None:
                        raise FGValueError('Conflicting <link rel="alternate" type=%r hreflang=%r> entries in %s' % (
                            page_content_type, hreflang, page_generator.path_info.source_filename,))
                    linkElement = candidate
            if linkElement is None:
                linkElement = entryDocument.createElementNS(ATOM_NAMESPACE, 'link')
                linkElement.setAttribute('rel', 'alternate')
                linkElement.setAttribute('href', page_generator.path_info.target_url)
                linkElement.setAttribute('type', page_content_type)
                entryElement.appendChild(linkElement)

            # Rewrite URLs in the atom:entry element
            rewrite_links(entryElement, ATOM_CRITERIA,
                page_generator.path_info.target_url, page_generator.path_info.base_url,
                always_absolute=True)

            # Add a <summary> element, if applicable
            if entry['summary']:
                self._append_xhtml_child(entryElement, 'summary', entry['summary'], entry['path_info'])
                del entry['summary']

            # Add a <content> element (always)
            self._append_xhtml_child(entryElement, 'content', entry['body'], entry['path_info'])
            del entry['body']

            # Perform xmlns normalization
            normalize_namespaces(entryDocument.documentElement, strip_dups=True)

            # Update the new atom:entry document
            entry['atom:entry'] = entryDocument.toxml()
        finally:
            # Break the DOM's reference cycles now that only the serialized
            # string is needed (mirrors the handling of the summary/body documents).
            entryDocument.unlink()

    def _append_xhtml_child(self, entryElement, tag_name, markup, path_info):
        """Embed *markup* in *entryElement* as an Atom element of type "xhtml".

        *markup* is parsed as an XML document; its links are rewritten using
        ``path_info.target_url`` / ``path_info.base_url``, and the children of
        its root element are copied into an XHTML ``<div>`` wrapped in a new
        Atom element named *tag_name* (e.g. ``'summary'`` or ``'content'``).
        """
        sourceDocument = minidom.parseString(markup)
        try:
            # Rewrite URLs in the source markup before copying it over
            rewrite_links(sourceDocument.documentElement, HTML_CRITERIA,
                path_info.target_url, path_info.base_url, always_absolute=True)

            ownerDocument = entryElement.ownerDocument

            # Create the Atom wrapper element (<summary> / <content>)
            wrapperElement = ownerDocument.createElementNS(ATOM_NAMESPACE, tag_name)
            wrapperElement.setAttribute('type', 'xhtml')
            entryElement.appendChild(wrapperElement)

            # Create XHTML <div> element
            divElement = ownerDocument.createElementNS(XHTML_NAMESPACE, 'div')
            divElement.setAttributeNS(XMLNS_NAMESPACE, 'xmlns', XHTML_NAMESPACE)
            wrapperElement.appendChild(divElement)

            # Add data; importNode() copies, so the source child list is not
            # modified while it is being iterated.
            for node in sourceDocument.documentElement.childNodes:
                divElement.appendChild(ownerDocument.importNode(node, True))

            # Elements with no namespace become XHTML elements
            substitute_namespaces(divElement, {EMPTY_NAMESPACE: XHTML_NAMESPACE})
        finally:
            sourceDocument.unlink()
Esempio n. 2
0
def _filter_rewrite_links(pg):
    """Run ``rewrite_links`` over ``pg.page`` using ``HTML_CRITERIA``.

    The target and base URLs are taken from ``pg.path_info``.
    """
    page = pg.page
    url_arguments = {
        'target_url': pg.path_info.target_url,
        'base_url': pg.path_info.base_url,
    }
    rewrite_links(page, HTML_CRITERIA, **url_arguments)
Esempio n. 3
0
    def handle_make_atom_feed(self, target_url):
        """Write the data we've collected so far as an Atom feed

        Usage: make-atom-feed TARGET_RELATIVE_URL
        """
        # NOTE: the docstring above is left untouched because it carries a
        # "Usage:" line -- presumably it doubles as command help; confirm
        # before rewording it.
        #
        # Overview:
        #   1. Skip the work if the output file is newer than both the feed
        #      template and every "entry-*-stamp" file in the feed data dir.
        #   2. Load the pickled entries, reject duplicate ids, and drop
        #      entries whose 'published' date is in the future.
        #   3. Parse the template, add <updated> and <link rel="self">,
        #      append the entries, and write the result as UTF-8.
        #
        # Raises FGValueError for duplicate ids, an empty feed, or a template
        # that already contains an <updated> element.

        tp = TypicalPaths(self._framework, target_url)
        data_dir = self._get_feed_data_dir()

        def is_update_needed():
            # Check if the feed needs to be updated
            try:
                output_mtime = os.lstat(tp.output_filename).st_mtime
            except EnvironmentError as exc:
                if exc.errno != errno.ENOENT:
                    raise
                # The output file doesn't exist, so an update is needed
                return True

            # Output file exists.  Check timestamps.
            source_mtime = os.lstat(tp.source_filename).st_mtime
            if output_mtime < source_mtime:
                # The source file was modified, so an update is needed.
                return True

            for basename in fnmatch.filter(os.listdir(data_dir), "entry-*-stamp"):
                entry_mtime = os.lstat(os.path.join(data_dir, basename)).st_mtime
                if output_mtime < entry_mtime:
                    # one of the entries is newer than the output file, so an update is needed
                    return True

            return False

        if not is_update_needed():
            # No update needed
            print("skipping %s" % (tp.output_filename,))
            return

        # Make sure the output directory exists
        self._framework.plugins['StillWeb.BasicCommands'].ensure_path(tp.output_dir, tp.pathtuple[:-1])

        print("making %s (using %s)" % (tp.output_filename, tp.source_filename))

        # Load the entries
        entries = []
        regex = re.compile(r"^entry-([^-.]*)-stamp$")
        for basename in os.listdir(data_dir):
            m = regex.search(basename)
            if not m:
                continue
            rootword = m.group(1)
            filename = os.path.join(data_dir, "entry-%s-data" % (rootword,))
            try:
                # NOTE(review): pickle must only be used on trusted data; this
                # assumes the feed data directory is written solely by this tool.
                with open(filename, "rb") as f:
                    entry = pickle.load(f)
            except EnvironmentError as exc:
                if exc.errno != errno.ENOENT:
                    raise
                # A stamp without a data file: nothing to load for it.
                continue
            entries.append(entry)

        # Check for duplicate ids
        ids = {}
        for entry in entries:
            if entry['id'] in ids:
                raise FGValueError("Duplicate id %r in %s (already defined in %s)" % (entry['id'], entry['path_info'].source_filename, ids[entry['id']]))
            ids[entry['id']] = entry['path_info'].source_filename

        # Skip entries whose publication dates are in the future
        now = atom_datetime_to_utc(datetime.datetime.utcnow().isoformat() + "Z")
        published_entries = []
        for entry in entries:
            if entry['published'] > now:
                # Report the entry that is actually being skipped.
                print("%s: skipping %s ('published' in the future)" % (tp.output_filename, entry['path_info'].output_filename,))
            else:
                published_entries.append(entry)
        entries = published_entries

        # Sort the entries by their publication date, newest first.
        entries.sort(key=lambda entry: atom_datetime_to_sort_key(entry['published']), reverse=True)

        # Find the most recent update
        if not entries:
            raise FGValueError("Refusing to make empty feed")
        # NOTE(review): this is the 'updated' value of the most recently
        # *published* entry, which is not necessarily the latest 'updated'
        # value across all entries -- confirm this is intended.
        most_recent_update = entries[0]['updated']

        # Load and parse the template file
        with open(tp.source_filename, "rb") as template_file:
            feedDocument = minidom.parseString(template_file.read())
        feedElement = feedDocument.documentElement
        assert (feedElement.namespaceURI, feedElement.localName) == (ATOM_NAMESPACE, "feed")

        # Set <updated> to the newest entry's <updated> (or <published>) field.
        # The check inspects the <feed> element: the Document node's only
        # element child is <feed> itself, so scanning the Document would never
        # find a template-supplied <updated>.
        if tuple(getChildElementsNS(feedElement, ATOM_NAMESPACE, "updated")):
            raise FGValueError("Template contains auto-generated <updated> field")
        updatedElement = feedDocument.createElementNS(ATOM_NAMESPACE, 'updated')
        feedElement.appendChild(updatedElement)
        replaceChildText(updatedElement, most_recent_update)

        # Create a <link rel="self"> element if one does not already exist.
        for linkElement in getChildElementsNS(feedElement, ATOM_NAMESPACE, "link"):
            if linkElement.getAttribute('rel') == 'self':
                break
        else:
            linkElement = feedDocument.createElementNS(ATOM_NAMESPACE, 'link')
            linkElement.setAttribute('rel', 'self')
            linkElement.setAttribute('type', ATOM_CONTENT_TYPE)
            linkElement.setAttribute('href', tp.target_url)
            feedElement.appendChild(linkElement)

        # Do URL path substitution
        rewrite_links(feedElement, ATOM_CRITERIA, tp.target_url, tp.base_url, always_absolute=True)

        # Add the entries
        for entry in entries:
            # Create an <entry> element by copying the entry's root element
            # into the feed document; the temporary document is then released.
            entryDocument = minidom.parseString(entry['atom:entry'])
            try:
                entryElement = feedDocument.importNode(entryDocument.documentElement, True)
            finally:
                entryDocument.unlink()
            assert (entryElement.namespaceURI, entryElement.localName) == (ATOM_NAMESPACE, 'entry')
            feedElement.appendChild(entryElement)

        # Write the feed to the output file
        if os.path.exists(tp.output_filename):
            os.unlink(tp.output_filename)
        output_file = open(tp.output_filename, "wb")
        try:
            # The file is closed before any cleanup below runs.
            with output_file:
                output_file.write(feedDocument.toxml("UTF-8"))
        except BaseException:
            # Don't leave a partially written feed behind.
            os.unlink(tp.output_filename)
            raise