Beispiel #1
0
def _strip_author_html(node, strip_html):
    # Replace node.author_detail.name with str(strip_html(name)) when present.
    if node.has_key('author_detail') and \
        node.author_detail.has_key('name'):
        node.author_detail['name'] = \
            str(strip_html(node.author_detail.name))


def _is_future_date(node, key, now):
    # True when node holds a non-empty value under key that sorts after now.
    return bool(node.has_key(key) and node[key] and node[key] > now)


def scrub(feed_uri, data):
    """Clean up a parsed feed in place, driven by the per-feed configuration.

    The following steps run in order, each controlled by a ``config`` lookup
    keyed on ``feed_uri``:

    1. drop the elements listed by ``config.ignore_in_feed``;
    2. override the declared type of titles, summaries and content;
    3. strip markup from author names when ``config.name_type`` mentions
       ``html``;
    4. drop dates (``ignore_date``) or whole entries (``ignore_entry``) that
       lie in the future, per ``config.future_dates``;
    5. rewrite ``base`` per ``config.xml_base``, resolve relative URIs and
       run every html/xhtml value through the html5lib sanitizer.

    Args:
        feed_uri: identifier handed to every ``config`` lookup.
        data: parsed feed exposing ``feed`` and ``entries`` (presumably a
            feedparser result -- confirm against the caller). It is mutated
            in place and ``data.entries`` may be rebound to a filtered list.

    Returns:
        None.
    """

    # some data is not trustworthy
    for tag in config.ignore_in_feed(feed_uri).split():
        # any tag mentioning 'lang' is treated as the 'language' element
        if tag.find('lang') >= 0: tag = 'language'
        if data.feed.has_key(tag): del data.feed[tag]
        for entry in data.entries:
            if entry.has_key(tag): del entry[tag]
            if entry.has_key(tag + "_detail"): del entry[tag + "_detail"]
            if entry.has_key(tag + "_parsed"): del entry[tag + "_parsed"]
            for key in entry.keys():
                if not key.endswith('_detail'): continue
                # iterate over a copy so the detail dict can shrink safely
                for detail in entry[key].copy():
                    if detail == tag: del entry[key][detail]

    # adjust title types
    title_type = config.title_type(feed_uri)
    if title_type:
        # type_map translates known names; unknown names are used verbatim
        title_type = type_map.get(title_type, title_type)
        for entry in data.entries:
            if entry.has_key('title_detail'):
                entry.title_detail['type'] = title_type

    # adjust summary types
    summary_type = config.summary_type(feed_uri)
    if summary_type:
        summary_type = type_map.get(summary_type, summary_type)
        for entry in data.entries:
            if entry.has_key('summary_detail'):
                entry.summary_detail['type'] = summary_type

    # adjust content types (only the first content element is touched)
    content_type = config.content_type(feed_uri)
    if content_type:
        content_type = type_map.get(content_type, content_type)
        for entry in data.entries:
            if entry.has_key('content'):
                entry.content[0]['type'] = content_type

    # some people put html in author names
    if config.name_type(feed_uri).find('html') >= 0:
        # imported lazily, only when this option is enabled
        from shell.tmpl import stripHtml
        _strip_author_html(data.feed, stripHtml)
        for entry in data.entries:
            _strip_author_html(entry, stripHtml)
            if entry.has_key('source'):
                _strip_author_html(entry.source, stripHtml)

    # handle dates in the future
    future_dates = config.future_dates(feed_uri).lower()
    if future_dates == 'ignore_date':
        now = time.gmtime()
        if _is_future_date(data.feed, 'updated_parsed', now):
            del data.feed['updated_parsed']
        for entry in data.entries:
            if _is_future_date(entry, 'published_parsed', now):
                del entry['published_parsed']
                # the unparsed twin may be absent; do not fail on it
                entry.pop('published', None)
            if _is_future_date(entry, 'updated_parsed', now):
                del entry['updated_parsed']
                entry.pop('updated', None)
    elif future_dates == 'ignore_entry':
        now = time.gmtime()
        if _is_future_date(data.feed, 'updated_parsed', now):
            del data.feed['updated_parsed']
        # keep only entries whose published and updated dates are not ahead
        data.entries = [
            entry for entry in data.entries
            if not _is_future_date(entry, 'published_parsed', now)
            and not _is_future_date(entry, 'updated_parsed', now)
        ]

    scrub_xmlbase = config.xml_base(feed_uri)

    # resolve relative URIs and sanitize
    for entry in data.entries + [data.feed]:
        for key in entry.keys():
            if key == 'content' and not entry.has_key('content_detail'):
                node = entry.content[0]
            elif key.endswith('_detail'):
                node = entry[key]
            else:
                continue

            # only typed html/xhtml nodes that carry a value are processed
            if not node.has_key('type'): continue
            if 'html' not in node['type']: continue
            if not node.has_key('value'): continue

            if node.has_key('base'):
                if scrub_xmlbase:
                    if scrub_xmlbase == 'feed_alternate':
                        # prefer the entry's source link, then the feed link
                        if entry.has_key('source') and \
                            entry.source.has_key('link'):
                            node['base'] = entry.source.link
                        elif data.feed.has_key('link'):
                            node['base'] = data.feed.link
                    elif scrub_xmlbase == 'entry_alternate':
                        if entry.has_key('link'):
                            node['base'] = entry.link
                    else:
                        # any other setting is joined onto the existing base
                        node['base'] = feedparser._urljoin(
                            node['base'], scrub_xmlbase)

                node['value'] = feedparser._resolveRelativeURIs(
                    node.value, node.base, 'utf-8', node.type)

            # Run this through HTML5's sanitizer
            doc = None
            if 'xhtml' in node['type']:
                try:
                    from xml.dom import minidom
                    doc = minidom.parseString(node['value'])
                except Exception:
                    # not well-formed XML: downgrade to html and let the
                    # html5lib parser below handle it (best effort), without
                    # swallowing KeyboardInterrupt/SystemExit
                    node['type'] = 'text/html'

            if doc is None:
                from html5lib import html5parser, treebuilders
                p = html5parser.HTMLParser(
                    tree=treebuilders.getTreeBuilder('dom'))
                doc = p.parseFragment(node['value'], encoding='utf-8')

            from html5lib import treewalkers, serializer
            from html5lib.filters import sanitizer
            walker = sanitizer.Filter(treewalkers.getTreeWalker('dom')(doc))
            xhtml = serializer.XHTMLSerializer(inject_meta_charset=False)
            tree = xhtml.serialize(walker, encoding='utf-8')

            node['value'] = ''.join([str(token) for token in tree])
Beispiel #2
0
def serialize_xhtml(input, options):
    """Render ``input`` as XHTML using the supplied serializer options.

    Option names are coerced with ``str`` so they can be passed as keyword
    arguments to ``serializer.XHTMLSerializer``. The ``"encoding"`` option
    (``None`` when absent) is additionally forwarded to ``render``.
    """
    kwargs = dict((str(name), value) for name, value in options.items())
    xhtml = serializer.XHTMLSerializer(**kwargs)
    return xhtml.render(JsonWalker(input), kwargs.get("encoding"))
 def serialize_xhtml(self, input, options):
     """Serialize ``input`` to XHTML and return the joined output.

     Option names are coerced with ``str`` so they can be passed as keyword
     arguments to ``serializer.XHTMLSerializer``. The ``"encoding"`` option
     (``None`` when absent) is additionally forwarded to ``serialize``, and
     the tokens it yields are concatenated onto a unicode string.
     """
     # items() rather than the Python-2-only iteritems(): it behaves the
     # same here and also works on Python 3 dicts.
     options = dict((str(k), v) for k, v in options.items())
     return u''.join(
         serializer.XHTMLSerializer(**options).serialize(
             JsonWalker(input), options.get("encoding", None)))