import datetime
import logging
import time

from celery import shared_task
from django.core.cache import cache
from django.db import IntegrityError
from django.utils import timezone
from django.utils.timezone import get_current_timezone, make_aware
from pytz.exceptions import AmbiguousTimeError

logger = logging.getLogger(__name__)


@shared_task  # registration assumed: the original decorator is not shown
def update_site_feed(feed, site_id):
    '''This function handles the feed update of a site and is, in a way,
    recursive, since at the end it will call another apply_async on itself.'''
    from feeds.models import Post, Site
    from feeds.utils import get_sanitized_html

    # Update task_id for this site
    site = Site.objects.get(id=site_id)
    site.task_id = update_site_feed.request.id
    site.save()

    # Update this site's info
    if 'feed' not in feed:
        logger.warn(u"Site {} feed did not return feed information".format(site.id))
        if 'feed_error' in feed:
            logger.error('Site {} has a broken feed URL'.format(site.id))
            # TODO: Create a task to use site.url to discover its new feed location
            site.feed_errors += 1
            site.save()
    else:
        info = feed['feed']
        if 'title' in info:
            site.title = info['title']
        # For some reason, some Google Alerts return an invalid FQDN after
        # parsing, so we must check that the link starts with "http"
        if 'link' in info and info['link'].startswith('http'):
            site.url = info['link']
        if site.feed_errors > 0:
            site.feed_errors = 0
        site.save()

    # Create posts
    if 'entries' not in feed:
        logger.warn(u"Site {} feed did not return any post".format(site.id))
    else:
        new_posts_found = 0
        for entry in feed['entries']:
            # Without a link we can't save this post
            if 'link' not in entry:
                continue
            url = entry['link']
            title = entry.get('title', '')

            # Try to get content; feedparser may return a list of content blocks
            if isinstance(entry.get('content'), list):
                # Pick the first non-empty block, defaulting to an empty string
                content = u''
                for block in entry['content']:
                    if block:
                        content = block
                        break
            else:
                content = entry.get('content')
            if not content and 'description' in entry:
                content = entry['description']
            if isinstance(content, dict):
                content = content.get('value')
            # Still no content found, let's try using the summary
            if not content and entry.get('summary'):
                content = entry['summary']

            # Parse the content to avoid broken HTML and script tags
            content = get_sanitized_html(content)

            author = entry.get('author')

            if entry.get('published_parsed'):
                created_at = datetime.datetime.fromtimestamp(
                    time.mktime(entry['published_parsed'])
                )
                try:
                    created_at = make_aware(created_at, get_current_timezone())
                except AmbiguousTimeError:
                    logger.error('Failed when trying to make {} aware'.format(created_at))
                    created_at = timezone.now()
            else:
                created_at = timezone.now()

            try:
                post, created = site.posts.get_or_create(
                    url_hash=Post.hashurl(url),
                    defaults={
                        'title': title,
                        'url': url,
                        'content': content,
                        'author': author,
                        'created_at': created_at,
                    }
                )
            except IntegrityError:
                # Raised when two posts have the same URL
                logger.warn('Final URL {} is duplicated'.format(url))
            else:
                if created:
                    new_posts_found += 1

        logger.info(
            'Site {site_id} got {new} new posts from {total} in feed'.format(
                site_id=site.id, new=new_posts_found, total=len(feed['entries'])
            )
        )

    # Update when this site is to run again
    next_update = site.set_next_update(save=False)
    logger.info("Site's {} next update at {}".format(site.id, next_update))
    site.last_update = timezone.now()
    site.save()
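
# The docstring above mentions that the task calls apply_async on itself,
# but that call is not shown in this version. A minimal sketch of how the
# rescheduling could look, assuming `get_feed_for` is a hypothetical helper
# that fetches and parses the site's feed (the variant below uses
# site.getfeed() for that) and `next_update` is the datetime returned by
# site.set_next_update(); apply_async and eta are standard Celery API:
#
#     update_site_feed.apply_async(
#         args=[get_feed_for(site), site.id],
#         eta=next_update,  # run no earlier than the computed next update
#     )
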
@shared_task  # registration assumed: the original decorator is not shown
def update_site_feed(site):
    '''This function handles the feed update of a site and is, in a way,
    recursive, since at the end it will call another apply_async on itself.'''
    # Avoid running two instances at a time. cache.add is atomic, so only
    # one worker can grab the key; it will not run again for 60 seconds.
    cachekey = SITE_WORKER_CACHE_KEY.format(id=site.id)
    if not cache.add(cachekey, '1', 60):
        logger.warn('Worker for site {} still running'.format(site.id))
        return

    from feeds.models import Post

    # Update task_id for this site
    site.task_id = update_site_feed.request.id
    site.save()

    feed = site.getfeed()

    # Update this site's info
    if 'feed' not in feed:
        logger.warn(u"Site {} feed did not return feed information".format(site.id))
        if 'feed_error' in feed:
            logger.error('Site {} has a broken feed URL'.format(site.id))
            # TODO: Create a task to use site.url to discover its new feed location
            site.feed_errors += 1
            site.save()
    else:
        info = feed['feed']
        if 'title' in info:
            site.title = info['title']
        if 'link' in info:
            site.url = info['link']
        if site.feed_errors > 0:
            site.feed_errors = 0
        site.save()

    # Create posts
    if 'entries' not in feed:
        logger.warn(u"Site {} feed did not return any post".format(site.id))
    else:
        new_posts_found = 0
        for entry in feed['entries']:
            # Without a link we can't save this post
            if 'link' not in entry:
                continue
            url = entry['link']
            title = entry.get('title', '')

            # Try to get content; feedparser may return a list of content blocks
            if isinstance(entry.get('content'), list):
                # Pick the first non-empty block, defaulting to an empty string
                content = u''
                for block in entry['content']:
                    if block:
                        content = block
                        break
            else:
                content = entry.get('content')
            if not content and 'description' in entry:
                content = entry['description']
            if isinstance(content, dict):
                content = content.get('value')

            author = entry.get('author')

            if entry.get('published_parsed'):
                created_at = datetime.datetime.fromtimestamp(
                    time.mktime(entry['published_parsed'])
                )
                try:
                    created_at = make_aware(created_at, get_current_timezone())
                except AmbiguousTimeError:
                    logger.error('Failed when trying to make {} aware'.format(created_at))
                    created_at = timezone.now()
            else:
                created_at = timezone.now()

            try:
                post, created = site.posts.get_or_create(
                    url_hash=Post.hashurl(url),
                    defaults={
                        'title': title,
                        'url': url,
                        'content': content,
                        'author': author,
                    }
                )
            except IntegrityError:
                # Raised when two posts have the same URL
                pass
            else:
                if created:
                    new_posts_found += 1
                    post.created_at = created_at
                    post.save()

        logger.info(
            'Site {site_id} got {new} new posts from {total} in feed'.format(
                site_id=site.id, new=new_posts_found, total=len(feed['entries'])
            )
        )
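
# SITE_WORKER_CACHE_KEY is a project-level constant defined elsewhere; a
# plausible shape (an assumption, the real definition may differ):
#
#     SITE_WORKER_CACHE_KEY = 'feeds:site-worker:{id}'
#
# Enqueueing this variant uses standard Celery API, for example:
#
#     update_site_feed.apply_async(args=[site], countdown=60)
#
# countdown delays execution by the given number of seconds, while the
# cache.add lock above keeps at most one worker per site running at once.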