def update_posts_for_feed_task(partner):
    """
    Load and parse the RSS or ATOM feed associated with the given feed url,
    and for each entry, parse out the individual entries and save each one
    as a partner_feeds.Post.
    """
    from feedparser import parse
    from partner_feeds.models import Post
    import timelib
    import re
    import time

    feed = parse(partner.feed_url)

    for entry in feed.entries:
        p = Post()
        try:
            p.partner_id = partner.id
            p.title = entry.title
            p.subheader = entry.summary

            try:
                p.author = entry.author
            except AttributeError:
                pass

            try:
                p.guid = entry.id
            except AttributeError:
                p.guid = entry.link

            p.url = entry.link

            # Try to get the date of the entry; otherwise, fall back to
            # the date of the feed.
            try:
                entry_date = re.sub(r'\|', '', entry.date)
                # Convert to a timestamp.
                entry_date = timelib.strtotime(entry_date)
                # Convert to a time.struct_time (in the local timezone).
                entry_date = time.localtime(entry_date)
                # Convert to MySQL datetime format.
                entry_date = time.strftime("%Y-%m-%d %H:%M:%S", entry_date)
                p.date = entry_date
            except AttributeError:
                p.date = time.strftime("%Y-%m-%d %H:%M:%S", feed.date)

            p.save()
        except AttributeError:
            # needs logging
            pass
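# All of the revisions in this file assume a Django model roughly like the
# sketch below. This is inferred, not taken from the source: the field names
# come from the attribute assignments in the functions, while the field types
# and lengths are guesses (the 500-character URL limit matches the max_length
# check in the later revisions).
from django.db import models


class Post(models.Model):
    partner_id = models.IntegerField()
    title = models.CharField(max_length=255)
    subheader = models.TextField(blank=True)
    author = models.CharField(max_length=255, blank=True)
    guid = models.CharField(max_length=500)
    url = models.URLField(max_length=500)
    date = models.DateTimeField()
    # Fields used only by the final revision:
    description = models.TextField(blank=True)
    image_url = models.URLField(max_length=500, blank=True)
    byline = models.CharField(max_length=255, blank=True)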
# Assumed module-level imports for this revision, inferred from the names
# used in the body; utc_time_struct_to_local_time_struct is a project
# helper, sketched after the next revision.
import sys
import logging
from datetime import datetime
from time import strftime

from django.conf import settings
from django.core.exceptions import ObjectDoesNotExist
from feedparser import parse
from raven import Client

from partner_feeds.models import Post

logger = logging.getLogger(__name__)


def update_posts_for_feed_task(partner):
    """
    Load and parse the RSS or ATOM feed associated with the given feed url,
    and for each entry, parse out the individual entries and save each one
    as a partner_feeds.Post.
    """
    logger.debug("Updating posts for partner feed: {} - {}.".format(
        partner, partner.pk))
    current_datetime = datetime.now()
    number_of_new_posts = 0
    feed = parse(partner.feed_url)

    for entry in feed.entries:
        p = Post()
        exception_data = {'entry': entry}
        try:
            p.partner_id = partner.id
            p.title = entry.title
            # Skip entries without a title.
            if not p.title:
                continue

            if hasattr(entry, 'summary'):
                p.subheader = entry.summary
            else:
                p.subheader = ''

            try:
                p.author = entry.author
            except AttributeError:
                pass

            try:
                p.guid = entry.id
            except AttributeError:
                p.guid = entry.link

            # Check whether this entry has already been saved.
            try:
                Post.objects.get(guid=p.guid, partner_id=partner.id)
                logger.debug(
                    "Preexisting partner_feed.Post with partner id: {}, guid: {}.".format(
                        partner.id, p.guid))
                # TODO check to see if the story has been updated
            except ObjectDoesNotExist:
                logger.debug(
                    "partner_feed.Post does not exist with partner id: {}, guid: {}".format(
                        partner.id, p.guid))

                # Skip if the URL is too long for the database field.
                max_length = 500
                if len(entry.link) > max_length:
                    logger.debug(
                        "Entry link is longer than {}. Skipping entry link {}.".format(
                            max_length, entry.link))
                    continue
                p.url = entry.link

                # Try to get the date of the entry; otherwise, use the
                # current date.
                if getattr(entry, 'published_parsed', None):
                    p.date = strftime(
                        "%Y-%m-%d %H:%M:%S",
                        utc_time_struct_to_local_time_struct(entry.published_parsed))
                elif getattr(entry, 'updated_parsed', None):
                    p.date = strftime(
                        "%Y-%m-%d %H:%M:%S",
                        utc_time_struct_to_local_time_struct(entry.updated_parsed))
                else:
                    p.date = current_datetime

                logger.debug(
                    "Saving partner_feed.Post with partner id: {}, guid: {}".format(
                        partner.id, p.guid))
                p.save()
                logger.debug(
                    "Finished saving partner_feed.Post with partner id: {}, guid: {}".format(
                        partner.id, p.guid))
                number_of_new_posts += 1
        except Exception:
            client = Client(dsn=settings.RAVEN_CONFIG['dsn'])
            client.captureException(exc_info=sys.exc_info(),
                                    data=exception_data)

    # Return the number of added posts.
    return number_of_new_posts
# This revision assumes the same module-level imports as the previous one,
# with the per-call Client replaced by a shared module-level instance,
# presumably something like:
#
#     raven_client = Client(dsn=settings.RAVEN_CONFIG['dsn'])


def update_posts_for_feed_task(partner):
    """
    Load and parse the RSS or ATOM feed associated with the given feed url,
    and for each entry, parse out the individual entries and save each one
    as a partner_feeds.Post.
    """
    logger.debug(u"Updating posts for partner feed: {} - {}.".format(
        partner, partner.pk))
    current_datetime = datetime.now()
    number_of_new_posts = 0
    feed = parse(partner.feed_url)

    for entry in feed.entries:
        p = Post()
        exception_data = {'entry': entry}
        try:
            p.partner_id = partner.id
            p.title = entry.title
            # Skip entries without a title.
            if not p.title:
                continue

            if hasattr(entry, 'summary'):
                p.subheader = entry.summary
            else:
                p.subheader = ''

            try:
                p.author = entry.author
            except AttributeError:
                pass

            try:
                p.guid = entry.id
            except AttributeError:
                p.guid = entry.link

            # Check whether this entry has already been saved.
            try:
                Post.objects.get(guid=p.guid, partner_id=partner.id)
                logger.debug(
                    u"Preexisting partner_feed.Post with partner id: {}, guid: {}.".format(
                        partner.id, p.guid))
                # TODO check to see if the story has been updated
            except ObjectDoesNotExist:
                logger.debug(
                    u"partner_feed.Post does not exist with partner id: {}, guid: {}".format(
                        partner.id, p.guid))

                # Skip if the URL is too long for the database field.
                max_length = 500
                if len(entry.link) > max_length:
                    logger.debug(
                        u"Entry link is longer than {}. Skipping entry link {}.".format(
                            max_length, entry.link))
                    continue
                p.url = entry.link

                # Try to get the date of the entry; otherwise, use the
                # current date.
                if getattr(entry, 'published_parsed', None):
                    p.date = strftime(
                        "%Y-%m-%d %H:%M:%S",
                        utc_time_struct_to_local_time_struct(entry.published_parsed))
                elif getattr(entry, 'updated_parsed', None):
                    p.date = strftime(
                        "%Y-%m-%d %H:%M:%S",
                        utc_time_struct_to_local_time_struct(entry.updated_parsed))
                else:
                    p.date = current_datetime

                logger.debug(
                    u"Saving partner_feed.Post with partner id: {}, guid: {}".format(
                        partner.id, p.guid))
                p.save()
                logger.debug(
                    u"Finished saving partner_feed.Post with partner id: {}, guid: {}".format(
                        partner.id, p.guid))
                number_of_new_posts += 1
        except Exception:
            raven_client.captureException(exc_info=sys.exc_info(),
                                          data=exception_data)

    # Return the number of added posts.
    return number_of_new_posts
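# utc_time_struct_to_local_time_struct is not shown in any of these
# revisions; it is presumably a small project helper. A plausible sketch,
# assuming it converts a UTC time.struct_time (as produced by feedparser's
# *_parsed fields) into the local timezone:

import calendar
import time


def utc_time_struct_to_local_time_struct(utc_struct):
    # Interpret the struct as UTC to get a POSIX timestamp, then
    # re-express that timestamp in the local timezone.
    return time.localtime(calendar.timegm(utc_struct))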
def update_posts_for_feed(partner):
    """
    Load and parse the RSS or ATOM feed associated with the given feed url,
    and for each entry, parse out the individual entries and save each one
    as a partner_feeds.Post.

    feedparser does a good job normalizing the data, but for a couple of
    fields we need to do a little more work.
    """
    from feedparser import parse
    from partner_feeds.models import Post, Partner
    import timelib
    import time
    from datetime import datetime
    from django.utils.text import get_text_list

    feed = parse(partner.feed_url)

    for entry in feed.entries:
        # Required: title and link; skip the entry if it doesn't have both.
        if 'title' in entry and 'link' in entry:
            p = Post(partner_id=partner.id, title=entry.title)

            # Links and GUID
            if 'id' in entry:
                p.guid = entry.id
            else:
                p.guid = entry.link
            p.url = entry.link

            # Date
            if 'date' in entry:
                entry_date = entry.date
            elif 'published' in entry:
                entry_date = entry.published
            elif 'date' in feed:
                entry_date = feed.date
            else:
                entry_date = None

            # entry.date and entry.published appear to be strings while
            # feed.date is a time.struct_time for some reason.
            if entry_date is not None and not isinstance(entry_date,
                                                         time.struct_time):
                # Convert to a timestamp, then to a time.struct_time in the
                # local timezone.
                entry_date = timelib.strtotime(entry_date)
                entry_date = time.localtime(entry_date)

            if entry_date is not None:
                # Convert to MySQL datetime format.
                entry_date = time.strftime("%Y-%m-%d %H:%M:%S", entry_date)
            else:
                # No date anywhere in the entry or the feed; fall back to
                # the current time.
                entry_date = time.strftime("%Y-%m-%d %H:%M:%S")
            p.date = entry_date

            # feedparser doesn't seem to save the ATOM summary tag to
            # entry.description, but the summary is saved as one of the
            # rows in the entry.content list.
            #
            # To find the summary, we loop through the list and use the
            # smallest field.
            if 'content' in entry and len(entry.content) > 1:
                summary = entry.content.pop(0)['value']
                for content in entry.content:
                    if len(content['value']) < len(summary):
                        summary = content['value']
                p.description = summary
            elif 'description' in entry:
                p.description = entry.description

            if 'media_content' in entry and 'url' in entry.media_content[0]:
                p.image_url = entry.media_content[0]['url']

            if 'authors' in entry and entry.authors[0]:
                authors = [a['name'] for a in entry.authors if 'name' in a]
                p.byline = get_text_list(authors, 'and')
            elif 'author' in entry:
                p.byline = entry.author

            p.save()

    # Set the current time as when the partner feed was last retrieved.
    # Needs to be an UPDATE and not a SAVE, or else we will get an
    # infinite loop.
    Partner.objects.filter(pk=partner.pk).update(
        date_feed_updated=datetime.now())
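# A hypothetical driver for the final revision above, assuming a Partner
# model with feed_url and date_feed_updated fields (names taken from the
# code; the wiring itself is an illustration, not part of the source):

from partner_feeds.models import Partner


def update_all_partner_feeds():
    # Refresh every partner feed in one pass; date_feed_updated is bumped
    # inside update_posts_for_feed via a queryset UPDATE.
    for partner in Partner.objects.all():
        update_posts_for_feed(partner)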