Beispiel #1
0
 def process(self):
     """Run ``_process`` inside a savepoint and settle it by outcome.

     The savepoint is committed only when ``_process`` reports ``FEED_OK``.
     Any other status rolls it back, as does a ``FeedValidationError`` or
     any other exception.  Unexpected exceptions are reported through
     ``print_exc`` and replaced by ``FEED_ERREXC`` with an empty dict.

     Returns the ``(feed_status, entry_results)`` pair.
     """
     savepoint = transaction.savepoint()
     try:
         feed_status, entry_results = self._process()
     except FeedValidationError:  # no extra noise necessary
         transaction.savepoint_rollback(savepoint)
     except:
         # Catch-all boundary: report, substitute an error result, undo changes.
         print_exc(self.feed.id)
         feed_status, entry_results = FEED_ERREXC, dict()
         transaction.savepoint_rollback(savepoint)
     else:
         if feed_status != FEED_OK:
             # Anything short of FEED_OK discards whatever was written.
             transaction.savepoint_rollback(savepoint)
         else:
             transaction.savepoint_commit(savepoint)
     return feed_status, entry_results
Beispiel #2
0
    def _process(self):
        """Fetch the feed, validate the response and process its entries.

        Returns a ``(feed_status, counts)`` tuple.  ``feed_status`` is one of
        the ``FEED_*`` constants and ``counts`` maps each ``ENTRY_*`` constant
        to the number of entries that finished in that state.  Parse, fetch
        and validation failures return early with every count still at zero.
        """

        counts = dict.fromkeys((ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR), 0)

        # Problems are reported unless a report_after interval is configured
        # and the feed was last checked within that interval.
        if not self.options.report_after or not self.feed.last_checked:
            report_errors = True
        else:
            report_errors = self.feed.last_checked + self.options.report_after < timezone.now()

        try:
            # An empty etag is sent when the "force" option is set.
            etag = "" if self.options.force else self.feed.etag
            self.fpf = feedparser.parse(self.feed.feed_url, agent=USER_AGENT, etag=etag)
        except KeyboardInterrupt:
            raise
        except:
            if report_errors:
                log.error("Feed cannot be parsed: {0} (#{1})".format(self.feed.feed_url, self.feed.id))
            return FEED_ERRPARSE, counts

        if hasattr(self.fpf, "status"):
            http_status = self.fpf.status
            log.extra("[{0}] HTTP status {1}: {2}".format(self.feed.id, http_status, self.feed.feed_url))

            if http_status == 304:
                unchanged_msg = "[{0}] Feed has not changed since last check: {1}"
                log.extra(unchanged_msg.format(self.feed.id, self.feed.feed_url))
                # Fast-path: just update last_checked timestamp
                self.feed.last_checked = timezone.now()
                self.feed.save()
                return FEED_SAME, counts

            if http_status >= 400:
                if report_errors:
                    log.warn("[{0}] HTTP error {1}: {2}".format(self.feed.id, http_status, self.feed.feed_url))
                return FEED_ERRFETCH, counts

        if self.fpf.bozo:
            bozo_error = getattr(self.fpf, "bozo_exception", "unknown error")
            if self.feed.skip_errors:
                # Feed is configured to tolerate parser complaints: note and go on.
                if report_errors:
                    log.info(
                        "[{0}] Skipped feed error: {1} ({2})".format(self.feed.id, self.feed.feed_url, bozo_error)
                    )
            else:
                if report_errors:
                    log.warn(
                        "[{0}] Failed to fetch feed: {1} ({2})".format(self.feed.id, self.feed.feed_url, bozo_error)
                    )
                return FEED_ERRFETCH, counts

        parsed_feed = self.fpf.feed
        self.feed.title = parsed_feed.get("title", "")[:200]
        self.feed.tagline = parsed_feed.get("tagline", "")
        self.feed.link = parsed_feed.get("link", "")
        self.feed.last_checked = timezone.now()

        info_lines = "\n".join(
            "  {0}: {1}".format(name, getattr(self.feed, name))
            for name in ("title", "tagline", "link", "last_checked")
        )
        log.debug("[{0}] Feed info for: {1}\n{2}".format(self.feed.id, self.feed.feed_url, info_lines))

        # Keep only entries for which _get_guid yields a truthy value.
        guids = [guid for guid in it.imap(self._get_guid, self.fpf.entries) if guid]
        if not guids:
            self.postdict = dict()
        else:
            from feedjack.models import Post

            # Index the already-stored posts of this feed by guid.
            stored_posts = {}
            for post in Post.objects.filter(feed=self.feed.id, guid__in=guids):
                stored_posts[post.guid] = post
            self.postdict = stored_posts

            if self.options.max_diff:
                # Do not calculate diff for empty (probably just-added) feeds
                feed_is_empty = not self.postdict and Post.objects.filter(feed=self.feed.id).count() == 0
                if feed_is_empty:
                    diff = 0
                else:
                    # Share of fetched guids without a stored post, in percent.
                    unseen = len(guids) - len(self.postdict)
                    diff = op.truediv(unseen, len(guids)) * 100
                if diff > self.options.max_diff:
                    log.warn(
                        "[{0}] Feed validation failed: {1} (diff: {2}% > {3}%)".format(
                            self.feed.id, self.feed.feed_url, round(diff, 1), self.options.max_diff
                        )
                    )
                    return FEED_INVALID, counts

        self.feed.save()  # etag/mtime aren't updated yet

        # Each entry gets its own savepoint so one failure only undoes itself.
        for entry in self.fpf.entries:
            savepoint = transaction.savepoint()
            try:
                outcome = self.process_entry(entry)
            except:
                print_exc(self.feed.id)
                outcome = ENTRY_ERR
                transaction.savepoint_rollback(savepoint)
            else:
                transaction.savepoint_commit(savepoint)
            counts[outcome] += 1

        if not counts[ENTRY_ERR]:  # etag/mtime updated only if there's no errors
            self.feed.etag = self.fpf.get("etag") or ""
            try:
                self.feed.last_modified = feedparser_ts(self.fpf.modified_parsed)
            except AttributeError:
                pass
            self.feed.save()

        if counts[ENTRY_NEW] or counts[ENTRY_UPDATED]:
            return FEED_OK, counts
        return FEED_SAME, counts
Beispiel #3
0
    def _process(self):
        '''Fetch the feed, validate the response and process its entries.

        Returns a ``(feed_status, counts)`` tuple.  ``feed_status`` is one of
        the ``FEED_*`` constants and ``counts`` maps each ``ENTRY_*`` constant
        to the number of entries that finished in that state.  Parse, fetch
        and validation failures return early with every count still at zero;
        once the entry loop has run the status is always ``FEED_OK``.
        '''

        counts = dict.fromkeys((ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR), 0)

        log.info('[{0}] Processing feed {1}'.format(self.feed.id, self.feed.feed_url))

        try:
            # An empty etag is sent when the "force" option is set.
            etag = '' if self.options.force else self.feed.etag
            self.fpf = feedparser.parse(self.feed.feed_url, agent=USER_AGENT, etag=etag)
        except KeyboardInterrupt:
            raise
        except:
            parse_msg = 'Feed cannot be parsed: {0} (#{1})'
            log.error(parse_msg.format(self.feed.feed_url, self.feed.id))
            return FEED_ERRPARSE, counts

        if hasattr(self.fpf, 'status'):
            http_status = self.fpf.status
            log.extra('[{0}] HTTP status {1}: {2}'.format(self.feed.id, http_status, self.feed.feed_url))

            if http_status == 304:
                unchanged_msg = '[{0}] Feed has not changed since last check: {1}'
                log.extra(unchanged_msg.format(self.feed.id, self.feed.feed_url))
                return FEED_SAME, counts

            if http_status >= 400:
                log.warn('[{0}] HTTP error {1}: {2}'.format(self.feed.id, http_status, self.feed.feed_url))
                return FEED_ERRFETCH, counts

        if self.fpf.bozo:
            bozo_error = getattr(self.fpf, 'bozo_exception', 'unknown error')
            if self.feed.skip_errors:
                # Feed is configured to tolerate parser complaints: note and go on.
                log.info('[{0}] Skipped feed error: {1} ({2})'.format(self.feed.id, self.feed.feed_url, bozo_error))
            else:
                log.warn('[{0}] Failed to fetch feed: {1} ({2})'.format(self.feed.id, self.feed.feed_url, bozo_error))
                return FEED_ERRFETCH, counts

        parsed_feed = self.fpf.feed
        self.feed.title = parsed_feed.get('title', '')[:254]
        self.feed.tagline = parsed_feed.get('tagline', '')
        self.feed.link = parsed_feed.get('link', '')
        self.feed.last_checked = datetime.now()

        info_lines = '\n'.join(
            '  {0}: {1}'.format(name, getattr(self.feed, name))
            for name in ('title', 'tagline', 'link', 'last_checked')
        )
        log.debug('[{0}] Feed info for: {1}\n{2}'.format(self.feed.id, self.feed.feed_url, info_lines))

        # Keep only entries for which _get_guid yields a truthy value.
        guids = [guid for guid in it.imap(self._get_guid, self.fpf.entries) if guid]
        if not guids:
            self.postdict = dict()
        else:
            from feedjack.models import Post

            # Index the already-stored posts of this feed by guid.
            feed_posts = Post.objects.filter(feed=self.feed.id)
            stored_posts = {}
            for post in feed_posts.filter(guid__in=guids):
                stored_posts[post.guid] = post
            self.postdict = stored_posts

            if self.options.max_diff:
                # Share of fetched guids without a stored post, in percent.
                unseen = len(guids) - len(self.postdict)
                diff = op.truediv(unseen, len(guids)) * 100
                if diff > self.options.max_diff:
                    log.warn(
                        '[{0}] Feed validation failed: {1} (diff: {2}% > {3}%)'.format(
                            self.feed.id, self.feed.feed_url, round(diff, 1), self.options.max_diff
                        )
                    )
                    return FEED_INVALID, counts

        self.feed.save()  # etag/mtime aren't updated yet

        # Each entry gets its own savepoint so one failure only undoes itself.
        for entry in self.fpf.entries:
            savepoint = transaction.savepoint()
            try:
                outcome = self.process_entry(entry)
            except:
                print_exc(self.feed.id)
                outcome = ENTRY_ERR
                transaction.savepoint_rollback(savepoint)
            else:
                transaction.savepoint_commit(savepoint)
            counts[outcome] += 1

        if not counts[ENTRY_ERR]:  # etag/mtime updated only if there's no errors
            self.feed.etag = self.fpf.get('etag') or ''
            try:
                self.feed.last_modified = mtime(self.fpf.modified)
            except AttributeError:
                pass
            self.feed.save()

        return FEED_OK, counts