Beispiel #1
0
def pre_process_story(entry, encoding):
    """Normalize a parsed feed entry in place and return it.

    Rewrites ``entry['published']`` as a naive ``datetime``, stringifies a
    dict ``guid``, picks the longer of content/summary as
    ``entry['story_content']``, appends markup for media enclosures, fills
    in a fallback ``guid``, and strips tags from ``title`` and ``author``.

    Args:
        entry: Mapping-like feed entry, read with ``.get()`` and mutated by
            item assignment (presumably a feedparser entry -- confirm
            against callers).
        encoding: Encoding handed to ``feedparser._sanitizeHTML`` and used
            to decode ``text/plain`` content that is not already unicode.

    Returns:
        The same ``entry`` object that was passed in.
    """
    # Use a single UTC reference for every fallback and comparison below so
    # the result does not depend on the server's local timezone.
    # NOTE(review): assumes the *_parsed time tuples are UTC, as feedparser
    # documents -- confirm for any other producer of `entry`.
    now = datetime.datetime.utcnow()

    # Prefer the published date over the updated date.
    # NOTE(review): 'g_parsed' is not a key feedparser sets; it is kept only
    # as a fallback in case some caller populates it.
    publish_date = (entry.get('published_parsed') or entry.get('g_parsed')
                    or entry.get('updated_parsed'))
    if publish_date:
        # First six fields of the time tuple: year .. second.
        publish_date = datetime.datetime(*publish_date[:6])
    if not publish_date and entry.get('published'):
        try:
            publish_date = dateutil.parser.parse(
                entry.get('published')).replace(tzinfo=None)
        except (ValueError, TypeError, OverflowError):
            # Unparseable date string: fall through to the "now" fallback.
            pass

    if publish_date:
        entry['published'] = publish_date
    else:
        # No usable date: use the current time plus a random 0-59s offset
        # (presumably so undated stories do not share one timestamp).
        entry['published'] = now + datetime.timedelta(
            seconds=randint(0, 59))

    # Dates before 2000 are treated as bogus and replaced.
    if entry['published'] < datetime.datetime(2000, 1, 1):
        entry['published'] = now

    # Future dated stories get forced to current date
    if entry['published'] > now:
        entry['published'] = now + datetime.timedelta(
            seconds=randint(0, 59))

    if isinstance(entry.get('guid'), dict):
        entry['guid'] = unicode(entry['guid'])

    # Normalize story content/summary: keep whichever is longer.
    summary = entry.get('summary') or ""
    content = ""
    if not summary and 'summary_detail' in entry:
        # `or ""` guards against an explicit None value, which would
        # otherwise break the len() comparison below.
        summary = entry['summary_detail'].get('value') or ""
    if entry.get('content'):
        content = entry['content'][0].get('value') or ""
    if len(content) > len(summary):
        entry['story_content'] = content.strip()
    else:
        entry['story_content'] = summary.strip()
    if not entry['story_content'] and entry.get('subtitle'):
        entry['story_content'] = entry.get('subtitle')

    if 'summary_detail' in entry and entry['summary_detail'].get(
            'type', None) == 'text/plain':
        try:
            entry['story_content'] = feedparser._sanitizeHTML(
                entry['story_content'], encoding, 'text/plain')
            if encoding and not isinstance(entry['story_content'], unicode):
                entry['story_content'] = entry['story_content'].decode(
                    encoding, 'ignore')
        except UnicodeEncodeError:
            # Best effort: keep whatever content we had before the error.
            pass

    # Add each media enclosure as a Download link (first 15 of each list).
    # NOTE(review): media_url / media_type come from the feed and are
    # interpolated into HTML unescaped -- confirm story_content is sanitized
    # downstream.
    for media_content in chain(
            (entry.get('media_content') or [])[:15],
            (entry.get('links') or [])[:15]):
        media_url = media_content.get('url', '')
        media_type = media_content.get('type', media_content.get('medium', ''))
        # Only extend non-empty content, and never repeat a URL that is
        # already present in it.
        if media_url and media_type and entry[
                'story_content'] and media_url not in entry['story_content']:
            media_type_name = media_type.split('/')[0]
            if 'audio' in media_type:
                # Falls through to also add the Download link below.
                entry['story_content'] += """<br><br>
                    <audio controls="controls" preload="none">
                        <source src="%(media_url)s" type="%(media_type)s" />
                    </audio>""" % {
                    'media_url': media_url,
                    'media_type': media_type
                }
            elif 'video' in media_type:
                # Falls through to also add the Download link below.
                entry['story_content'] += """<br><br>
                    <video controls="controls" preload="none">
                        <source src="%(media_url)s" type="%(media_type)s" />
                    </video>""" % {
                    'media_url': media_url,
                    'media_type': media_type
                }
            elif 'image' in media_type:
                entry[
                    'story_content'] += """<br><br><img src="%s" />""" % media_url
                continue
            elif media_content.get(
                    'rel', '') == 'alternative' or 'text' in media_content.get(
                        'type', ''):
                continue
            elif media_type_name == 'application':
                continue
            entry['story_content'] += """<br><br>
                Download %(media_type)s: <a href="%(media_url)s">%(media_url)s</a>""" % {
                'media_type': media_type_name,
                'media_url': media_url,
            }

    # Fall back through id, link and finally the publish date for a guid.
    entry['guid'] = entry.get('guid') or entry.get('id') or entry.get(
        'link') or str(entry.get('published'))

    if not entry.get('title'):
        entry['title'] = ""

    entry['title'] = strip_tags(entry.get('title'))
    entry['author'] = strip_tags(entry.get('author'))

    entry['story_content'] = attach_media_scripts(entry['story_content'])

    return entry
Beispiel #2
0
def pre_process_story(entry, encoding):
    """Normalize a parsed feed entry in place and return it.

    Rewrites ``entry['published']`` as a naive ``datetime``, stringifies a
    dict ``guid``, picks the longer of content/summary as
    ``entry['story_content']``, appends markup for media enclosures, fills
    in a fallback ``guid`` and ``title``, and strips tags from ``title``
    and ``author``.

    Args:
        entry: Mapping-like feed entry, read with ``.get()`` and mutated by
            item assignment (presumably a feedparser entry -- confirm
            against callers).
        encoding: Encoding handed to ``feedparser._sanitizeHTML`` and used
            to decode ``text/plain`` content that is not already unicode.

    Returns:
        The same ``entry`` object that was passed in.
    """
    # Use a single UTC reference for every fallback and comparison below so
    # the result does not depend on the server's local timezone.
    # NOTE(review): assumes the *_parsed time tuples are UTC, as feedparser
    # documents -- confirm for any other producer of `entry`.
    now = datetime.datetime.utcnow()

    publish_date = entry.get('published_parsed') or entry.get('updated_parsed')
    if publish_date:
        # First six fields of the time tuple: year .. second.
        publish_date = datetime.datetime(*publish_date[:6])
    if not publish_date and entry.get('published'):
        try:
            publish_date = dateutil.parser.parse(entry.get('published')).replace(tzinfo=None)
        except (ValueError, TypeError, OverflowError):
            # Unparseable date string: fall through to the "now" fallback.
            pass

    if publish_date:
        entry['published'] = publish_date
    else:
        entry['published'] = now

    # Dates before 2000 are treated as bogus and replaced.
    if entry['published'] < datetime.datetime(2000, 1, 1):
        entry['published'] = now

    # Stories dated more than a day ahead get forced to the current time.
    if entry['published'] > now + datetime.timedelta(days=1):
        entry['published'] = now

    if isinstance(entry.get('guid'), dict):
        entry['guid'] = unicode(entry['guid'])

    # Normalize story content/summary: keep whichever is longer.
    summary = entry.get('summary') or ""
    content = ""
    if not summary and 'summary_detail' in entry:
        # `or ""` guards against an explicit None value, which would
        # otherwise break the len() comparison below.
        summary = entry['summary_detail'].get('value') or ""
    if entry.get('content'):
        content = entry['content'][0].get('value') or ""
    if len(content) > len(summary):
        entry['story_content'] = content.strip()
    else:
        entry['story_content'] = summary.strip()

    if 'summary_detail' in entry and entry['summary_detail'].get('type', None) == 'text/plain':
        try:
            entry['story_content'] = feedparser._sanitizeHTML(entry['story_content'], encoding, 'text/plain')
            if encoding and not isinstance(entry['story_content'], unicode):
                entry['story_content'] = entry['story_content'].decode(encoding, 'ignore')
        except UnicodeEncodeError:
            # Best effort: keep whatever content we had before the error.
            pass

    # Add each media enclosure as a Download link (first 5 of each list).
    # NOTE(review): media_url / media_type come from the feed and are
    # interpolated into HTML unescaped -- confirm story_content is sanitized
    # downstream.
    for media_content in chain((entry.get('media_content') or [])[:5], (entry.get('links') or [])[:5]):
        media_url = media_content.get('url', '')
        media_type = media_content.get('type', '')
        # Only extend non-empty content, and never repeat a URL that is
        # already present in it.
        if media_url and media_type and entry['story_content'] and media_url not in entry['story_content']:
            media_type_name = media_type.split('/')[0]
            if 'audio' in media_type:
                # Falls through to also add the Download link below.
                entry['story_content'] += """<br><br>
                    <audio controls="controls" preload="none">
                        <source src="%(media_url)s" type="%(media_type)s" />
                    </audio>"""  % {
                        'media_url': media_url,
                        'media_type': media_type
                    }
            elif 'image' in media_type:
                entry['story_content'] += """<br><br><img src="%s" />"""  % media_url
                continue
            elif media_content.get('rel') == 'alternative' or 'text' in media_type:
                continue
            elif media_type_name == 'application':
                continue
            entry['story_content'] += """<br><br>
                Download %(media_type)s: <a href="%(media_url)s">%(media_url)s</a>"""  % {
                'media_type': media_type_name,
                'media_url': media_url,
            }

    # Fall back through id, link and finally the publish date for a guid.
    entry['guid'] = entry.get('guid') or entry.get('id') or entry.get('link') or str(entry.get('published'))

    # Untitled stories borrow the first 80 characters of their tag-stripped
    # content, or failing that their link.
    if not entry.get('title') and entry.get('story_content'):
        story_title = strip_tags(entry['story_content'])
        if len(story_title) > 80:
            story_title = story_title[:80] + '...'
        entry['title'] = story_title
    if not entry.get('title') and entry.get('link'):
        entry['title'] = entry['link']

    entry['title'] = strip_tags(entry.get('title'))
    entry['author'] = strip_tags(entry.get('author'))

    entry['story_content'] = attach_media_scripts(entry['story_content'])

    return entry