import datetime
from itertools import chain
from random import randint

import dateutil.parser
import feedparser

# strip_tags and attach_media_scripts are assumed to be provided by the
# surrounding module; they are referenced below but not defined here.


def pre_process_story(entry, encoding):
    publish_date = entry.get('published_parsed') or entry.get('updated_parsed')
    if publish_date:
        publish_date = datetime.datetime(*publish_date[:6])
    if not publish_date and entry.get('published'):
        try:
            publish_date = dateutil.parser.parse(
                entry.get('published')).replace(tzinfo=None)
        except (ValueError, TypeError, OverflowError):
            pass

    if publish_date:
        entry['published'] = publish_date
    else:
        # No usable date on the entry: fall back to "now", jittered by up to a
        # minute so stories from the same fetch don't all share one timestamp.
        entry['published'] = datetime.datetime.utcnow() + datetime.timedelta(
            seconds=randint(0, 59))

    if entry['published'] < datetime.datetime(2000, 1, 1):
        entry['published'] = datetime.datetime.utcnow()

    # Future dated stories get forced to current date
    # if entry['published'] > datetime.datetime.now() + datetime.timedelta(days=1):
    if entry['published'] > datetime.datetime.now():
        entry['published'] = datetime.datetime.now() + datetime.timedelta(
            seconds=randint(0, 59))

    # entry_link = entry.get('link') or ''
    # protocol_index = entry_link.find("://")
    # if protocol_index != -1:
    #     entry['link'] = (entry_link[:protocol_index+3]
    #                      + urlquote(entry_link[protocol_index+3:]))
    # else:
    #     entry['link'] = urlquote(entry_link)
    if isinstance(entry.get('guid'), dict):
        entry['guid'] = unicode(entry['guid'])

    # Normalize story content/summary: keep whichever of content/summary is longer.
    summary = entry.get('summary') or ""
    content = ""
    if not summary and 'summary_detail' in entry:
        summary = entry['summary_detail'].get('value', '')
    if entry.get('content'):
        content = entry['content'][0].get('value', '')

    if len(content) > len(summary):
        entry['story_content'] = content.strip()
    else:
        entry['story_content'] = summary.strip()

    if not entry['story_content'] and entry.get('subtitle'):
        entry['story_content'] = entry.get('subtitle')

    if 'summary_detail' in entry and entry['summary_detail'].get(
            'type', None) == 'text/plain':
        try:
            entry['story_content'] = feedparser._sanitizeHTML(
                entry['story_content'], encoding, 'text/plain')
            if encoding and not isinstance(entry['story_content'], unicode):
                entry['story_content'] = entry['story_content'].decode(
                    encoding, 'ignore')
        except UnicodeEncodeError:
            pass

    # Add each media enclosure as an inline player (audio/video), an inline
    # image, or a Download link, skipping anything already in the story content.
    for media_content in chain(entry.get('media_content', [])[:15],
                               entry.get('links', [])[:15]):
        media_url = media_content.get('url', '')
        media_type = media_content.get('type', media_content.get('medium', ''))
        if media_url and media_type and entry['story_content'] and \
                media_url not in entry['story_content']:
            media_type_name = media_type.split('/')[0]
            if 'audio' in media_type and media_url:
                entry['story_content'] += """<br><br>
                    <audio controls="controls" preload="none">
                        <source src="%(media_url)s" type="%(media_type)s" />
                    </audio>""" % {
                    'media_url': media_url,
                    'media_type': media_type
                }
            elif 'video' in media_type and media_url:
                entry['story_content'] += """<br><br>
                    <video controls="controls" preload="none">
                        <source src="%(media_url)s" type="%(media_type)s" />
                    </video>""" % {
                    'media_url': media_url,
                    'media_type': media_type
                }
            elif 'image' in media_type and media_url and \
                    media_url not in entry['story_content']:
                entry['story_content'] += """<br><br><img src="%s" />""" % media_url
                continue
            elif media_content.get('rel', '') == 'alternative' or \
                    'text' in media_content.get('type', ''):
                continue
            elif media_type_name in ['application']:
                continue
            entry['story_content'] += """<br><br>
                Download %(media_type)s: <a href="%(media_url)s">%(media_url)s</a>""" % {
                'media_type': media_type_name,
                'media_url': media_url,
            }

    entry['guid'] = entry.get('guid') or entry.get('id') or entry.get(
        'link') or str(entry.get('published'))

    if not entry.get('title'):
        entry['title'] = ""

    entry['title'] = strip_tags(entry.get('title'))
    entry['author'] = strip_tags(entry.get('author'))
    entry['story_content'] = attach_media_scripts(entry['story_content'])

    return entry
def pre_process_story(entry, encoding):
    publish_date = entry.get('published_parsed') or entry.get('updated_parsed')
    if publish_date:
        publish_date = datetime.datetime(*publish_date[:6])
    if not publish_date and entry.get('published'):
        try:
            publish_date = dateutil.parser.parse(
                entry.get('published')).replace(tzinfo=None)
        except (ValueError, TypeError, OverflowError):
            pass

    if publish_date:
        entry['published'] = publish_date
    else:
        entry['published'] = datetime.datetime.utcnow()

    if entry['published'] < datetime.datetime(2000, 1, 1):
        entry['published'] = datetime.datetime.utcnow()

    if entry['published'] > datetime.datetime.now() + datetime.timedelta(days=1):
        entry['published'] = datetime.datetime.now()

    # entry_link = entry.get('link') or ''
    # protocol_index = entry_link.find("://")
    # if protocol_index != -1:
    #     entry['link'] = (entry_link[:protocol_index+3]
    #                      + urlquote(entry_link[protocol_index+3:]))
    # else:
    #     entry['link'] = urlquote(entry_link)
    if isinstance(entry.get('guid'), dict):
        entry['guid'] = unicode(entry['guid'])

    # Normalize story content/summary
    summary = entry.get('summary') or ""
    content = ""
    if not summary and 'summary_detail' in entry:
        summary = entry['summary_detail'].get('value', '')
    if entry.get('content'):
        content = entry['content'][0].get('value', '')

    if len(content) > len(summary):
        entry['story_content'] = content.strip()
    else:
        entry['story_content'] = summary.strip()

    if 'summary_detail' in entry and entry['summary_detail'].get(
            'type', None) == 'text/plain':
        try:
            entry['story_content'] = feedparser._sanitizeHTML(
                entry['story_content'], encoding, 'text/plain')
            if encoding and not isinstance(entry['story_content'], unicode):
                entry['story_content'] = entry['story_content'].decode(
                    encoding, 'ignore')
        except UnicodeEncodeError:
            pass

    # Add each media enclosure as a Download link
    for media_content in chain(entry.get('media_content', [])[:5],
                               entry.get('links', [])[:5]):
        media_url = media_content.get('url', '')
        media_type = media_content.get('type', '')
        if media_url and media_type and entry['story_content'] and \
                media_url not in entry['story_content']:
            media_type_name = media_type.split('/')[0]
            if 'audio' in media_type and media_url:
                entry['story_content'] += """<br><br>
                    <audio controls="controls" preload="none">
                        <source src="%(media_url)s" type="%(media_type)s" />
                    </audio>""" % {
                    'media_url': media_url,
                    'media_type': media_type
                }
            elif 'image' in media_type and media_url:
                entry['story_content'] += """<br><br><img src="%s" />""" % media_url
                continue
            elif media_content.get('rel') == 'alternative' or \
                    'text' in media_content.get('type'):
                continue
            elif media_type_name in ['application']:
                continue
            entry['story_content'] += """<br><br>
                Download %(media_type)s: <a href="%(media_url)s">%(media_url)s</a>""" % {
                'media_type': media_type_name,
                'media_url': media_url,
            }

    entry['guid'] = entry.get('guid') or entry.get('id') or entry.get(
        'link') or str(entry.get('published'))

    if not entry.get('title') and entry.get('story_content'):
        story_title = strip_tags(entry['story_content'])
        if len(story_title) > 80:
            story_title = story_title[:80] + '...'
        entry['title'] = story_title
    if not entry.get('title') and entry.get('link'):
        entry['title'] = entry['link']

    entry['title'] = strip_tags(entry.get('title'))
    entry['author'] = strip_tags(entry.get('author'))
    entry['story_content'] = attach_media_scripts(entry['story_content'])

    return entry
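# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of how pre_process_story might be applied to entries
# returned by feedparser. The feed URL is a placeholder and the encoding is
# taken from feedparser's parse result; both are assumptions, not values
# from the original source.
if __name__ == '__main__':
    parsed = feedparser.parse('http://example.com/feed.xml')
    for raw_entry in parsed.entries:
        story = pre_process_story(raw_entry, parsed.get('encoding'))
        print story['published'], story['guid'], story['title']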