def process(self):
    """Run ``_process()`` inside a database savepoint and return its result.

    The savepoint is committed when ``_process()`` reports ``FEED_OK`` or
    ``FEED_SAME`` and rolled back for every other status or on any
    exception.

    Returns:
        A ``(ret_feed, ret_entries)`` pair: the feed status code and the
        per-entry counters produced by ``_process()``.  When an unexpected
        exception escapes ``_process()`` the pair is
        ``(FEED_ERREXC, dict())``.
    """
    tsp = transaction.savepoint()
    try:
        ret_feed, ret_entries = self._process()
        # FEED_SAME is a successful outcome, not a validation failure:
        # _process() saves feed fields (e.g. last_checked, etag) before
        # returning it, and rolling back here would discard those writes.
        if ret_feed not in (FEED_OK, FEED_SAME):
            raise FeedValidationError()
    except FeedValidationError:  # no extra noise necessary
        transaction.savepoint_rollback(tsp)
    except:
        # Bare except kept on purpose: whatever escaped _process() is
        # logged through print_exc() and reported as FEED_ERREXC, and the
        # savepoint is always rolled back.
        print_exc(self.feed.id)
        ret_feed, ret_entries = FEED_ERREXC, dict()
        transaction.savepoint_rollback(tsp)
    else:
        transaction.savepoint_commit(tsp)
    return ret_feed, ret_entries
def _process(self):
    """Download the feed, validate the response and process its entries.

    Returns:
        A ``(feed_status, counters)`` pair.  ``counters`` maps each of
        ``ENTRY_NEW``, ``ENTRY_UPDATED``, ``ENTRY_SAME`` and ``ENTRY_ERR``
        to the number of entries that ended up in that state.
        ``feed_status`` is one of:

        * ``FEED_ERRPARSE`` - ``feedparser.parse()`` raised;
        * ``FEED_SAME`` - HTTP 304, or no entry was new or updated;
        * ``FEED_ERRFETCH`` - HTTP status >= 400, or a bozo feed while
          ``feed.skip_errors`` is not set;
        * ``FEED_INVALID`` - the share of unknown guids exceeds
          ``options.max_diff`` percent;
        * ``FEED_OK`` - at least one entry was new or updated.
    """
    feed = self.feed
    counters = dict.fromkeys((ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR), 0)

    # Errors are reported when no report_after interval is configured, the
    # feed has never been checked, or the last check is older than that
    # interval.
    report_after = self.options.report_after
    last_checked = feed.last_checked
    if not report_after or not last_checked:
        report_errors = True
    else:
        report_errors = last_checked + report_after < timezone.now()

    try:
        self.fpf = feedparser.parse(
            feed.feed_url,
            agent=USER_AGENT,
            etag="" if self.options.force else feed.etag,
        )
    except KeyboardInterrupt:
        raise
    except:
        if report_errors:
            log.error("Feed cannot be parsed: {0} (#{1})".format(feed.feed_url, feed.id))
        return FEED_ERRPARSE, counters
    fpf = self.fpf

    if hasattr(fpf, "status"):
        http_status = fpf.status
        log.extra("[{0}] HTTP status {1}: {2}".format(feed.id, http_status, feed.feed_url))
        if http_status == 304:
            log.extra(
                "[{0}] Feed has not changed since last check: {1}".format(feed.id, feed.feed_url)
            )
            # Fast-path: only the last_checked timestamp is refreshed
            feed.last_checked = timezone.now()
            feed.save()
            return FEED_SAME, counters
        if http_status >= 400:
            if report_errors:
                log.warn("[{0}] HTTP error {1}: {2}".format(feed.id, http_status, feed.feed_url))
            return FEED_ERRFETCH, counters

    if fpf.bozo:
        bozo = getattr(fpf, "bozo_exception", "unknown error")
        if feed.skip_errors:
            if report_errors:
                log.info("[{0}] Skipped feed error: {1} ({2})".format(feed.id, feed.feed_url, bozo))
        else:
            if report_errors:
                log.warn("[{0}] Failed to fetch feed: {1} ({2})".format(feed.id, feed.feed_url, bozo))
            return FEED_ERRFETCH, counters

    # Copy feed-level metadata from the parsed document onto the model.
    parsed_meta = fpf.feed
    feed.title = parsed_meta.get("title", "")[:200]
    feed.tagline = parsed_meta.get("tagline", "")
    feed.link = parsed_meta.get("link", "")
    feed.last_checked = timezone.now()

    info_lines = [
        " {0}: {1}".format(key, getattr(feed, key))
        for key in ("title", "tagline", "link", "last_checked")
    ]
    log.debug(
        "[{0}] Feed info for: {1}\n{2}".format(feed.id, feed.feed_url, "\n".join(info_lines))
    )

    guids = filter(None, it.imap(self._get_guid, fpf.entries))
    if not guids:
        self.postdict = dict()
    else:
        from feedjack.models import Post

        known = {
            post.guid: post
            for post in Post.objects.filter(feed=feed.id, guid__in=guids)
        }
        self.postdict = known
        max_diff = self.options.max_diff
        if max_diff:
            # Do not calculate diff for empty (probably just-added) feeds
            if not known and Post.objects.filter(feed=feed.id).count() == 0:
                diff = 0
            else:
                diff = op.truediv(len(guids) - len(known), len(guids)) * 100
            if diff > max_diff:
                log.warn(
                    "[{0}] Feed validation failed: {1} (diff: {2}% > {3}%)".format(
                        feed.id, feed.feed_url, round(diff, 1), max_diff
                    )
                )
                return FEED_INVALID, counters

    feed.save()  # etag/mtime aren't updated yet

    # Every entry gets its own savepoint so one failure does not undo the rest.
    for entry in fpf.entries:
        entry_sp = transaction.savepoint()
        try:
            outcome = self.process_entry(entry)
        except:
            print_exc(feed.id)
            outcome = ENTRY_ERR
            transaction.savepoint_rollback(entry_sp)
        else:
            transaction.savepoint_commit(entry_sp)
        counters[outcome] += 1

    if not counters[ENTRY_ERR]:
        # etag/mtime are stored only when no entry failed
        feed.etag = fpf.get("etag") or ""
        try:
            feed.last_modified = feedparser_ts(fpf.modified_parsed)
        except AttributeError:
            pass
        feed.save()

    if counters[ENTRY_NEW] or counters[ENTRY_UPDATED]:
        return FEED_OK, counters
    return FEED_SAME, counters
def _process(self):
    """Download the feed, validate the response and process its entries.

    Returns:
        A ``(feed_status, counters)`` pair.  ``counters`` maps each of
        ``ENTRY_NEW``, ``ENTRY_UPDATED``, ``ENTRY_SAME`` and ``ENTRY_ERR``
        to the number of entries that ended up in that state.
        ``feed_status`` is one of:

        * ``FEED_ERRPARSE`` - ``feedparser.parse()`` raised;
        * ``FEED_SAME`` - the server answered HTTP 304;
        * ``FEED_ERRFETCH`` - HTTP status >= 400, or a bozo feed while
          ``feed.skip_errors`` is not set;
        * ``FEED_INVALID`` - the share of unknown guids exceeds
          ``options.max_diff`` percent;
        * ``FEED_OK`` - the entries were processed.
    """
    feed = self.feed
    counters = dict.fromkeys((ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR), 0)

    log.info('[{0}] Processing feed {1}'.format(feed.id, feed.feed_url))

    try:
        self.fpf = feedparser.parse(
            feed.feed_url,
            agent=USER_AGENT,
            etag='' if self.options.force else feed.etag,
        )
    except KeyboardInterrupt:
        raise
    except:
        log.error('Feed cannot be parsed: {0} (#{1})'.format(feed.feed_url, feed.id))
        return FEED_ERRPARSE, counters
    fpf = self.fpf

    if hasattr(fpf, 'status'):
        http_status = fpf.status
        log.extra('[{0}] HTTP status {1}: {2}'.format(feed.id, http_status, feed.feed_url))
        if http_status == 304:
            log.extra(
                '[{0}] Feed has not changed since last check: {1}'.format(feed.id, feed.feed_url)
            )
            return FEED_SAME, counters
        if http_status >= 400:
            log.warn('[{0}] HTTP error {1}: {2}'.format(feed.id, http_status, feed.feed_url))
            return FEED_ERRFETCH, counters

    if fpf.bozo:
        bozo = getattr(fpf, 'bozo_exception', 'unknown error')
        if feed.skip_errors:
            log.info('[{0}] Skipped feed error: {1} ({2})'.format(feed.id, feed.feed_url, bozo))
        else:
            log.warn('[{0}] Failed to fetch feed: {1} ({2})'.format(feed.id, feed.feed_url, bozo))
            return FEED_ERRFETCH, counters

    # Copy feed-level metadata from the parsed document onto the model.
    parsed_meta = fpf.feed
    feed.title = parsed_meta.get('title', '')[0:254]
    feed.tagline = parsed_meta.get('tagline', '')
    feed.link = parsed_meta.get('link', '')
    feed.last_checked = datetime.now()

    info_lines = [
        ' {0}: {1}'.format(key, getattr(feed, key))
        for key in ('title', 'tagline', 'link', 'last_checked')
    ]
    log.debug(
        '[{0}] Feed info for: {1}\n{2}'.format(feed.id, feed.feed_url, '\n'.join(info_lines))
    )

    guids = filter(None, it.imap(self._get_guid, fpf.entries))
    if not guids:
        self.postdict = dict()
    else:
        from feedjack.models import Post

        stored_posts = Post.objects.filter(feed=feed.id).filter(guid__in=guids)
        known = {post.guid: post for post in stored_posts}
        self.postdict = known
        max_diff = self.options.max_diff
        if max_diff:
            # Percentage of fetched guids that are not stored for this feed yet
            diff = op.truediv(len(guids) - len(known), len(guids)) * 100
            if diff > max_diff:
                log.warn(
                    '[{0}] Feed validation failed: {1} (diff: {2}% > {3}%)'.format(
                        feed.id, feed.feed_url, round(diff, 1), max_diff
                    )
                )
                return FEED_INVALID, counters

    feed.save()  # etag/mtime aren't updated yet

    # Every entry gets its own savepoint so one failure does not undo the rest.
    for entry in fpf.entries:
        entry_sp = transaction.savepoint()
        try:
            outcome = self.process_entry(entry)
        except:
            print_exc(feed.id)
            outcome = ENTRY_ERR
            transaction.savepoint_rollback(entry_sp)
        else:
            transaction.savepoint_commit(entry_sp)
        counters[outcome] += 1

    if not counters[ENTRY_ERR]:
        # etag/mtime are stored only when no entry failed
        feed.etag = fpf.get('etag') or ''
        try:
            feed.last_modified = mtime(fpf.modified)
        except AttributeError:
            pass
        feed.save()

    return FEED_OK, counters