Example #1
0
    def process(self):
        """ Downloads and parses a feed.
        """
        start = time.time()
        self.refresh_feed()

        ret_values = {ENTRY_NEW: 0, ENTRY_UPDATED: 0, ENTRY_SAME: 0, ENTRY_ERR: 0}

        # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title))

        if hasattr(self.fpf, "status"):
            if self.options["verbose"]:
                if self.fpf.bozo and self.fpf.status != 304:
                    logging.debug(
                        u"   ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)"
                        % (self.feed.title[:30], self.fpf.bozo_exception, len(self.fpf.entries))
                    )

            if self.fpf.status == 304:
                self.feed = self.feed.save()
                self.feed.save_feed_history(304, "Not modified")
                return FEED_SAME, ret_values

            if self.fpf.status in (302, 301):
                if not self.fpf.href.endswith("feedburner.com/atom.xml"):
                    self.feed.feed_address = self.fpf.href
                if not self.feed.known_good:
                    self.feed.fetched_once = True
                    logging.debug(
                        "   ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..."
                        % (self.feed.title[:30], self.fpf.status)
                    )
                    self.feed = self.feed.schedule_feed_fetch_immediately()
                if not self.fpf.entries:
                    self.feed = self.feed.save()
                    self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
                    return FEED_ERRHTTP, ret_values
            if self.fpf.status >= 400:
                logging.debug(
                    "   ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..."
                    % (self.feed.title[:30], self.fpf.status)
                )
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(self.fpf.status, "HTTP Error")
                self.feed = self.feed.save()
                return FEED_ERRHTTP, ret_values

        if not self.fpf.entries:
            if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
                logging.debug(
                    "   ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..."
                    % (self.feed.title[:30], len(self.fpf.entries))
                )
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(552, "Non-xml feed", self.fpf.bozo_exception)
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values
            elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
                logging.debug(
                    "   ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..."
                    % (self.feed.title[:30], len(self.fpf.entries))
                )
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(553, "SAX Exception", self.fpf.bozo_exception)
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values

        # the feed has changed (or it is the first time we parse it)
        # saving the etag and last_modified fields
        self.feed.etag = self.fpf.get("etag")
        if self.feed.etag:
            self.feed.etag = self.feed.etag[:255]
        # some times this is None (it never should) *sigh*
        if self.feed.etag is None:
            self.feed.etag = ""

        try:
            self.feed.last_modified = mtime(self.fpf.modified)
        except:
            pass

        self.fpf.entries = self.fpf.entries[:50]

        if self.fpf.feed.get("title"):
            self.feed.feed_title = self.fpf.feed.get("title")
        tagline = self.fpf.feed.get("tagline", self.feed.data.feed_tagline)
        if tagline:
            self.feed.data.feed_tagline = utf8encode(tagline)
            self.feed.data.save()
        if not self.feed.feed_link_locked:
            self.feed.feed_link = self.fpf.feed.get("link") or self.fpf.feed.get("id") or self.feed.feed_link

        guids = []
        for entry in self.fpf.entries:
            if entry.get("id", ""):
                guids.append(entry.get("id", ""))
            elif entry.get("link"):
                guids.append(entry.link)
            elif entry.get("title"):
                guids.append(entry.title)
        self.feed = self.feed.save()

        # Compare new stories to existing stories, adding and updating
        start_date = datetime.datetime.utcnow()
        # end_date = datetime.datetime.utcnow()
        story_guids = []
        stories = []
        for entry in self.fpf.entries:
            story = pre_process_story(entry)
            if story.get("published") < start_date:
                start_date = story.get("published")
            # if story.get('published') > end_date:
            #     end_date = story.get('published')
            stories.append(story)
            story_guids.append(story.get("guid") or story.get("link"))

        existing_stories = list(
            MStory.objects(
                # story_guid__in=story_guids,
                story_date__gte=start_date,
                story_feed_id=self.feed_id,
            ).limit(len(story_guids))
        )

        # MStory.objects(
        #     (Q(story_date__gte=start_date) & Q(story_date__lte=end_date))
        #     | (Q(story_guid__in=story_guids)),
        #     story_feed=self.feed
        # ).order_by('-story_date')
        ret_values = self.feed.add_update_stories(stories, existing_stories, verbose=self.options["verbose"])

        if (
            (not self.feed.is_push or self.options.get("force"))
            and hasattr(self.fpf, "feed")
            and hasattr(self.fpf.feed, "links")
            and self.fpf.feed.links
        ):
            hub_url = None
            self_url = self.feed.feed_address
            for link in self.fpf.feed.links:
                if link["rel"] == "hub":
                    hub_url = link["href"]
                elif link["rel"] == "self":
                    self_url = link["href"]
            if hub_url and self_url and not settings.DEBUG:
                logging.debug(u"   ---> [%-30s] ~BB~FWSubscribing to PuSH hub: %s" % (self.feed.title[:30], hub_url))
                PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url)

        logging.debug(
            u"   ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s"
            % (
                self.feed.title[:30],
                "~FG~SB" if ret_values[ENTRY_NEW] else "",
                ret_values[ENTRY_NEW],
                "~FY~SB" if ret_values[ENTRY_UPDATED] else "",
                ret_values[ENTRY_UPDATED],
                "~SB" if ret_values[ENTRY_SAME] else "",
                ret_values[ENTRY_SAME],
                "~FR~SB" if ret_values[ENTRY_ERR] else "",
                ret_values[ENTRY_ERR],
                len(self.fpf.entries),
            )
        )
        self.feed.update_all_statistics(full=bool(ret_values[ENTRY_NEW]), force=self.options["force"])
        self.feed.trim_feed()
        self.feed.save_feed_history(200, "OK")

        if self.options["verbose"]:
            logging.debug(
                u"   ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss" % (self.feed.title[:30], time.time() - start)
            )

        return FEED_OK, ret_values
Example #2
0
    def process(self):
        """ Downloads and parses a feed.
        """
        start = time.time()
        self.refresh_feed()
        
        ret_values = dict(new=0, updated=0, same=0, error=0)

        if hasattr(self.fpf, 'status'):
            if self.options['verbose']:
                if self.fpf.bozo and self.fpf.status != 304:
                    logging.debug(u'   ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % (
                                  self.feed.title[:30],
                                  self.fpf.bozo_exception,
                                  len(self.fpf.entries)))
                    
            if self.fpf.status == 304:
                self.feed = self.feed.save()
                self.feed.save_feed_history(304, "Not modified")
                return FEED_SAME, ret_values
            
            # 302: Temporary redirect: ignore
            # 301: Permanent redirect: save it
            if self.fpf.status == 301:
                if not self.fpf.href.endswith('feedburner.com/atom.xml'):
                    self.feed.feed_address = self.fpf.href
                if not self.feed.known_good:
                    self.feed.fetched_once = True
                    logging.debug("   ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (self.feed.title[:30], self.fpf.status))
                    self.feed = self.feed.schedule_feed_fetch_immediately()
                if not self.fpf.entries:
                    self.feed = self.feed.save()
                    self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
                    return FEED_ERRHTTP, ret_values
            if self.fpf.status >= 400:
                logging.debug("   ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." % (self.feed.title[:30], self.fpf.status))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(self.fpf.status, "HTTP Error")
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRHTTP, ret_values

        if not self.fpf.entries:
            if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
                logging.debug("   ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries)))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(552, 'Non-xml feed', self.fpf.bozo_exception)
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values
            elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
                logging.debug("   ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries)))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(553, 'SAX Exception', self.fpf.bozo_exception)
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values
                
        # the feed has changed (or it is the first time we parse it)
        # saving the etag and last_modified fields
        self.feed.etag = self.fpf.get('etag')
        if self.feed.etag:
            self.feed.etag = self.feed.etag[:255]
        # some times this is None (it never should) *sigh*
        if self.feed.etag is None:
            self.feed.etag = ''

        try:
            self.feed.last_modified = mtime(self.fpf.modified)
        except:
            self.feed.last_modified = None
            pass
        
        self.fpf.entries = self.fpf.entries[:100]
        
        if self.fpf.feed.get('title'):
            self.feed.feed_title = strip_tags(self.fpf.feed.get('title'))
        tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline)
        if tagline:
            self.feed.data.feed_tagline = utf8encode(tagline)
            self.feed.data.save()
        if not self.feed.feed_link_locked:
            self.feed.feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link
        
        self.feed = self.feed.save()

        # Compare new stories to existing stories, adding and updating
        start_date = datetime.datetime.utcnow()
        story_hashes = []
        stories = []
        for entry in self.fpf.entries:
            story = pre_process_story(entry)
            if story.get('published') < start_date:
                start_date = story.get('published')
            story['story_hash'] = MStory.feed_guid_hash_unsaved(self.feed.pk, story.get('guid'))
            stories.append(story)
            story_hashes.append(story.get('story_hash'))

        existing_stories = dict((s.story_hash, s) for s in MStory.objects(
            story_hash__in=story_hashes,
            # story_date__gte=start_date,
            # story_feed_id=self.feed.pk
        ))
        
        ret_values = self.feed.add_update_stories(stories, existing_stories,
                                                  verbose=self.options['verbose'],
                                                  updates_off=self.options['updates_off'])

        if (hasattr(self.fpf, 'feed') and 
            hasattr(self.fpf.feed, 'links') and self.fpf.feed.links):
            hub_url = None
            self_url = self.feed.feed_address
            for link in self.fpf.feed.links:
                if link['rel'] == 'hub' and not hub_url:
                    hub_url = link['href']
                elif link['rel'] == 'self':
                    self_url = link['href']
            push_expired = False
            if self.feed.is_push:
                try:
                    push_expired = self.feed.push.lease_expires < datetime.datetime.now()
                except PushSubscription.DoesNotExist:
                    self.feed.is_push = False
            if (hub_url and self_url and not settings.DEBUG and
                self.feed.active_subscribers > 0 and
                (push_expired or not self.feed.is_push or self.options.get('force'))):
                logging.debug(u'   ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s' % (
                              self.feed.title[:30],
                              "~SKRe-~SN" if push_expired else "", hub_url))
                try:
                    PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url)
                except TimeoutError:
                    logging.debug(u'   ---> [%-30s] ~BB~FW~FRTimed out~FW subscribing to PuSH hub: %s' % (
                                  self.feed.title[:30], hub_url))                    
            elif (self.feed.is_push and 
                  (self.feed.active_subscribers <= 0 or not hub_url)):
                logging.debug(u'   ---> [%-30s] ~BB~FWTurning off PuSH, no hub found' % (
                              self.feed.title[:30]))
                self.feed.is_push = False
                self.feed = self.feed.save()
        
        logging.debug(u'   ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s' % (
                      self.feed.title[:30], 
                      '~FG~SB' if ret_values['new'] else '', ret_values['new'],
                      '~FY~SB' if ret_values['updated'] else '', ret_values['updated'],
                      '~SB' if ret_values['same'] else '', ret_values['same'],
                      '~FR~SB' if ret_values['error'] else '', ret_values['error'],
                      len(self.fpf.entries)))
        self.feed.update_all_statistics(full=bool(ret_values['new']), force=self.options['force'])
        if ret_values['new']:
            self.feed.trim_feed()
            self.feed.expire_redis()
        self.feed.save_feed_history(200, "OK")
        
        if self.options['verbose']:
            logging.debug(u'   ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' % (
                          self.feed.title[:30], time.time() - start))
        
        return FEED_OK, ret_values
Example #3
0
    def process(self):
        """ Downloads and parses a feed.
        """
        start = time.time()
        self.refresh_feed()
        
        ret_values = dict(new=0, updated=0, same=0, error=0)

        # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title))

        if hasattr(self.fpf, 'status'):
            if self.options['verbose']:
                if self.fpf.bozo and self.fpf.status != 304:
                    logging.debug(u'   ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % (
                                  self.feed.title[:30],
                                  self.fpf.bozo_exception,
                                  len(self.fpf.entries)))
                    
            if self.fpf.status == 304:
                self.feed = self.feed.save()
                self.feed.save_feed_history(304, "Not modified")
                return FEED_SAME, ret_values
            
            if self.fpf.status in (302, 301):
                if not self.fpf.href.endswith('feedburner.com/atom.xml'):
                    self.feed.feed_address = self.fpf.href
                if not self.feed.known_good:
                    self.feed.fetched_once = True
                    logging.debug("   ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (self.feed.title[:30], self.fpf.status))
                    self.feed = self.feed.schedule_feed_fetch_immediately()
                if not self.fpf.entries:
                    self.feed = self.feed.save()
                    self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
                    return FEED_ERRHTTP, ret_values
            if self.fpf.status >= 400:
                logging.debug("   ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." % (self.feed.title[:30], self.fpf.status))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(self.fpf.status, "HTTP Error")
                self.feed = self.feed.save()
                return FEED_ERRHTTP, ret_values

        if not self.fpf.entries:
            if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
                logging.debug("   ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries)))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(552, 'Non-xml feed', self.fpf.bozo_exception)
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values
            elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
                logging.debug("   ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries)))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(553, 'SAX Exception', self.fpf.bozo_exception)
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values
                
        # the feed has changed (or it is the first time we parse it)
        # saving the etag and last_modified fields
        self.feed.etag = self.fpf.get('etag')
        if self.feed.etag:
            self.feed.etag = self.feed.etag[:255]
        # some times this is None (it never should) *sigh*
        if self.feed.etag is None:
            self.feed.etag = ''

        try:
            self.feed.last_modified = mtime(self.fpf.modified)
        except:
            pass
        
        self.fpf.entries = self.fpf.entries[:50]
        
        if self.fpf.feed.get('title'):
            self.feed.feed_title = self.fpf.feed.get('title')
        tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline)
        if tagline:
            self.feed.data.feed_tagline = utf8encode(tagline)
            self.feed.data.save()
        if not self.feed.feed_link_locked:
            self.feed.feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link
        
        self.feed = self.feed.save()

        # Compare new stories to existing stories, adding and updating
        start_date = datetime.datetime.utcnow()
        story_guids = []
        stories = []
        for entry in self.fpf.entries:
            story = pre_process_story(entry)
            if story.get('published') < start_date:
                start_date = story.get('published')
            stories.append(story)
            story_guids.append(story.get('guid'))

        existing_stories = list(MStory.objects(
            # story_guid__in=story_guids,
            story_date__gte=start_date,
            story_feed_id=self.feed.pk
        ).limit(max(int(len(story_guids)*1.5), 10)))
        
        ret_values = self.feed.add_update_stories(stories, existing_stories,
                                                  verbose=self.options['verbose'])

        if ((not self.feed.is_push or self.options.get('force'))
            and hasattr(self.fpf, 'feed') and 
            hasattr(self.fpf.feed, 'links') and self.fpf.feed.links):
            hub_url = None
            self_url = self.feed.feed_address
            for link in self.fpf.feed.links:
                if link['rel'] == 'hub':
                    hub_url = link['href']
                elif link['rel'] == 'self':
                    self_url = link['href']
            if hub_url and self_url and not settings.DEBUG:
                logging.debug(u'   ---> [%-30s] ~BB~FWSubscribing to PuSH hub: %s' % (
                              self.feed.title[:30], hub_url))
                PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url)
        
        logging.debug(u'   ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s' % (
                      self.feed.title[:30], 
                      '~FG~SB' if ret_values['new'] else '', ret_values['new'],
                      '~FY~SB' if ret_values['updated'] else '', ret_values['updated'],
                      '~SB' if ret_values['same'] else '', ret_values['same'],
                      '~FR~SB' if ret_values['error'] else '', ret_values['error'],
                      len(self.fpf.entries)))
        self.feed.update_all_statistics(full=bool(ret_values['new']), force=self.options['force'])
        if ret_values['new']:
            self.feed.trim_feed()
        self.feed.save_feed_history(200, "OK")
        
        if self.options['verbose']:
            logging.debug(u'   ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' % (
                          self.feed.title[:30], time.time() - start))
        
        return FEED_OK, ret_values
Example #4
0
    def process(self, first_run=True):
        """ Downloads and parses a feed.
        """
        self.refresh_feed()
        
        ret_values = {
            ENTRY_NEW:0,
            ENTRY_UPDATED:0,
            ENTRY_SAME:0,
            ENTRY_ERR:0}

        # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title))
        
        self.feed.fetched_once = True
        self.feed.last_update = datetime.datetime.utcnow()

        if hasattr(self.fpf, 'status'):
            if self.options['verbose']:
                logging.debug(u'   ---> [%-30s] Fetched feed, HTTP status %d: %s (bozo: %s)' % (unicode(self.feed)[:30],
                                                     self.fpf.status,
                                                     self.feed.feed_address,
                                                     self.fpf.bozo))
                if self.fpf.bozo and self.fpf.status != 304:
                    logging.debug(u'   ---> [%-30s] BOZO exception: %s (%s entries)' % (
                                  unicode(self.feed)[:30],
                                  self.fpf.bozo_exception,
                                  len(self.fpf.entries)))
            if self.fpf.status == 304:
                self.feed.save()
                self.feed.save_feed_history(304, "Not modified")
                return FEED_SAME, ret_values
            
            if self.fpf.status in (302, 301):
                if not self.fpf.href.endswith('feedburner.com/atom.xml'):
                    self.feed.feed_address = self.fpf.href
                if first_run:
                    self.feed.schedule_feed_fetch_immediately()
                if not self.fpf.entries:
                    self.feed.save()
                    self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
                    return FEED_ERRHTTP, ret_values
                
            if self.fpf.status >= 400:
                logging.debug("   ---> [%-30s] HTTP Status code: %s. Checking address..." % (unicode(self.feed)[:30], self.fpf.status))
                fixed_feed = self.feed.check_feed_address_for_feed_link()
                if not fixed_feed:
                    self.feed.save_feed_history(self.fpf.status, "HTTP Error")
                else:
                    self.feed.schedule_feed_fetch_immediately()
                self.feed.save()
                return FEED_ERRHTTP, ret_values
                                    
        if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
            if not self.fpf.entries:
                logging.debug("   ---> [%-30s] Feed is Non-XML. %s entries. Checking address..." % (unicode(self.feed)[:30], len(self.fpf.entries)))
                fixed_feed = self.feed.check_feed_address_for_feed_link()
                if not fixed_feed:
                    self.feed.save_feed_history(502, 'Non-xml feed', self.fpf.bozo_exception)
                else:
                    self.feed.schedule_feed_fetch_immediately()
                self.feed.save()
                return FEED_ERRPARSE, ret_values
        elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
            logging.debug("   ---> [%-30s] Feed is Bad XML (SAX). %s entries. Checking address..." % (unicode(self.feed)[:30], len(self.fpf.entries)))
            if not self.fpf.entries:
                fixed_feed = self.feed.check_feed_address_for_feed_link()
                if not fixed_feed:
                    self.feed.save_feed_history(503, 'SAX Exception', self.fpf.bozo_exception)
                else:
                    self.feed.schedule_feed_fetch_immediately()
                self.feed.save()
                return FEED_ERRPARSE, ret_values
                
        # the feed has changed (or it is the first time we parse it)
        # saving the etag and last_modified fields
        self.feed.etag = self.fpf.get('etag')
        if self.feed.etag:
            self.feed.etag = self.feed.etag[:255]
        # some times this is None (it never should) *sigh*
        if self.feed.etag is None:
            self.feed.etag = ''

        try:
            self.feed.last_modified = mtime(self.fpf.modified)
        except:
            pass
        
        self.fpf.entries = self.fpf.entries[:50]
        
        self.feed.feed_title = self.fpf.feed.get('title', self.feed.feed_title)
        tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline)
        if tagline:
            self.feed.data.feed_tagline = utf8encode(tagline)
            self.feed.data.save()
        self.feed.feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link
        
        self.feed.last_update = datetime.datetime.utcnow()
        
        guids = []
        for entry in self.fpf.entries:
            if entry.get('id', ''):
                guids.append(entry.get('id', ''))
            elif entry.get('link'):
                guids.append(entry.link)
            elif entry.get('title'):
                guids.append(entry.title)
        self.feed.save()

        # Compare new stories to existing stories, adding and updating
        start_date = datetime.datetime.utcnow()
        # end_date = datetime.datetime.utcnow()
        story_guids = []
        for entry in self.fpf.entries:
            story = pre_process_story(entry)
            if story.get('published') < start_date:
                start_date = story.get('published')
            # if story.get('published') > end_date:
            #     end_date = story.get('published')
            story_guids.append(story.get('guid') or story.get('link'))
        existing_stories = MStory.objects(
            # story_guid__in=story_guids,
            story_date__gte=start_date,
            story_feed_id=self.feed.pk
        ).limit(len(story_guids))
        
        # MStory.objects(
        #     (Q(story_date__gte=start_date) & Q(story_date__lte=end_date))
        #     | (Q(story_guid__in=story_guids)),
        #     story_feed=self.feed
        # ).order_by('-story_date')
        ret_values = self.feed.add_update_stories(self.fpf.entries, existing_stories)
        
        logging.debug(u'   ---> [%-30s] Parsed Feed: %s' % (
                      unicode(self.feed)[:30], 
                      u' '.join(u'%s=%d' % (self.entry_trans[key],
                              ret_values[key]) for key in self.entry_keys),))
        self.feed.update_all_statistics()
        self.feed.trim_feed()
        self.feed.save_feed_history(200, "OK")
        
        return FEED_OK, ret_values
Example #5
0
    def process(self):
        """ Downloads and parses a feed.
        """
        start = time.time()
        self.refresh_feed()

        ret_values = dict(new=0, updated=0, same=0, error=0)

        if hasattr(self.fpf, 'status'):
            if self.options['verbose']:
                if self.fpf.bozo and self.fpf.status != 304:
                    logging.debug(
                        u'   ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)'
                        % (self.feed.title[:30], self.fpf.bozo_exception,
                           len(self.fpf.entries)))

            if self.fpf.status == 304:
                self.feed = self.feed.save()
                self.feed.save_feed_history(304, "Not modified")
                return FEED_SAME, ret_values

            # 302: Temporary redirect: ignore
            # 301: Permanent redirect: save it
            if self.fpf.status == 301:
                if not self.fpf.href.endswith('feedburner.com/atom.xml'):
                    self.feed.feed_address = self.fpf.href
                if not self.feed.known_good:
                    self.feed.fetched_once = True
                    logging.debug(
                        "   ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..."
                        % (self.feed.title[:30], self.fpf.status))
                    self.feed = self.feed.schedule_feed_fetch_immediately()
                if not self.fpf.entries:
                    self.feed = self.feed.save()
                    self.feed.save_feed_history(self.fpf.status,
                                                "HTTP Redirect")
                    return FEED_ERRHTTP, ret_values
            if self.fpf.status >= 400:
                logging.debug(
                    "   ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..."
                    % (self.feed.title[:30], self.fpf.status))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address(
                    )
                if not fixed_feed:
                    self.feed.save_feed_history(self.fpf.status, "HTTP Error")
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRHTTP, ret_values

        if not self.fpf.entries:
            if self.fpf.bozo and isinstance(self.fpf.bozo_exception,
                                            feedparser.NonXMLContentType):
                logging.debug(
                    "   ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..."
                    % (self.feed.title[:30], len(self.fpf.entries)))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address(
                    )
                if not fixed_feed:
                    self.feed.save_feed_history(552, 'Non-xml feed',
                                                self.fpf.bozo_exception)
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values
            elif self.fpf.bozo and isinstance(
                    self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
                logging.debug(
                    "   ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..."
                    % (self.feed.title[:30], len(self.fpf.entries)))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address(
                    )
                if not fixed_feed:
                    self.feed.save_feed_history(553, 'SAX Exception',
                                                self.fpf.bozo_exception)
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values

        # the feed has changed (or it is the first time we parse it)
        # saving the etag and last_modified fields
        self.feed.etag = self.fpf.get('etag')
        if self.feed.etag:
            self.feed.etag = self.feed.etag[:255]
        # some times this is None (it never should) *sigh*
        if self.feed.etag is None:
            self.feed.etag = ''

        try:
            self.feed.last_modified = mtime(self.fpf.modified)
        except:
            self.feed.last_modified = None
            pass

        self.fpf.entries = self.fpf.entries[:100]

        if self.fpf.feed.get('title'):
            self.feed.feed_title = strip_tags(self.fpf.feed.get('title'))
        tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline)
        if tagline:
            self.feed.data.feed_tagline = utf8encode(tagline)
            self.feed.data.save()
        if not self.feed.feed_link_locked:
            self.feed.feed_link = self.fpf.feed.get(
                'link') or self.fpf.feed.get('id') or self.feed.feed_link

        self.feed = self.feed.save()

        # Compare new stories to existing stories, adding and updating
        start_date = datetime.datetime.utcnow()
        story_hashes = []
        stories = []
        for entry in self.fpf.entries:
            story = pre_process_story(entry)
            if story.get('published') < start_date:
                start_date = story.get('published')
            story['story_hash'] = MStory.feed_guid_hash_unsaved(
                self.feed.pk, story.get('guid'))
            stories.append(story)
            story_hashes.append(story.get('story_hash'))

        existing_stories = dict((s.story_hash, s) for s in MStory.objects(
            story_hash__in=story_hashes,
            # story_date__gte=start_date,
            # story_feed_id=self.feed.pk
        ))

        ret_values = self.feed.add_update_stories(
            stories,
            existing_stories,
            verbose=self.options['verbose'],
            updates_off=self.options['updates_off'])

        if (hasattr(self.fpf, 'feed') and hasattr(self.fpf.feed, 'links')
                and self.fpf.feed.links):
            hub_url = None
            self_url = self.feed.feed_address
            for link in self.fpf.feed.links:
                if link['rel'] == 'hub' and not hub_url:
                    hub_url = link['href']
                elif link['rel'] == 'self':
                    self_url = link['href']
            push_expired = False
            if self.feed.is_push:
                try:
                    push_expired = self.feed.push.lease_expires < datetime.datetime.now(
                    )
                except PushSubscription.DoesNotExist:
                    self.feed.is_push = False
            if (hub_url and self_url and not settings.DEBUG
                    and self.feed.active_subscribers > 0
                    and (push_expired or not self.feed.is_push
                         or self.options.get('force'))):
                logging.debug(
                    u'   ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s' %
                    (self.feed.title[:30], "~SKRe-~SN" if push_expired else "",
                     hub_url))
                try:
                    PushSubscription.objects.subscribe(self_url,
                                                       feed=self.feed,
                                                       hub=hub_url)
                except TimeoutError:
                    logging.debug(
                        u'   ---> [%-30s] ~BB~FW~FRTimed out~FW subscribing to PuSH hub: %s'
                        % (self.feed.title[:30], hub_url))
            elif (self.feed.is_push
                  and (self.feed.active_subscribers <= 0 or not hub_url)):
                logging.debug(
                    u'   ---> [%-30s] ~BB~FWTurning off PuSH, no hub found' %
                    (self.feed.title[:30]))
                self.feed.is_push = False
                self.feed = self.feed.save()

        logging.debug(
            u'   ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s'
            % (self.feed.title[:30], '~FG~SB' if ret_values['new'] else '',
               ret_values['new'], '~FY~SB' if ret_values['updated'] else '',
               ret_values['updated'], '~SB' if ret_values['same'] else '',
               ret_values['same'], '~FR~SB' if ret_values['error'] else '',
               ret_values['error'], len(self.fpf.entries)))
        self.feed.update_all_statistics(full=bool(ret_values['new']),
                                        force=self.options['force'])
        if ret_values['new']:
            self.feed.trim_feed()
            self.feed.expire_redis()
        self.feed.save_feed_history(200, "OK")

        if self.options['verbose']:
            logging.debug(u'   ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' %
                          (self.feed.title[:30], time.time() - start))

        return FEED_OK, ret_values
Example #6
0
    def process(self):
        """ Downloads and parses a feed.
        """
        start = time.time()
        self.refresh_feed()

        ret_values = {
            ENTRY_NEW: 0,
            ENTRY_UPDATED: 0,
            ENTRY_SAME: 0,
            ENTRY_ERR: 0
        }

        # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title))

        if hasattr(self.fpf, 'status'):
            if self.options['verbose']:
                if self.fpf.bozo and self.fpf.status != 304:
                    logging.debug(
                        u'   ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)'
                        % (self.feed.title[:30], self.fpf.bozo_exception,
                           len(self.fpf.entries)))

            if self.fpf.status == 304:
                self.feed = self.feed.save()
                self.feed.save_feed_history(304, "Not modified")
                return FEED_SAME, ret_values

            if self.fpf.status in (302, 301):
                if not self.fpf.href.endswith('feedburner.com/atom.xml'):
                    self.feed.feed_address = self.fpf.href
                if not self.feed.known_good:
                    self.feed.fetched_once = True
                    logging.debug(
                        "   ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..."
                        % (self.feed.title[:30], self.fpf.status))
                    self.feed = self.feed.schedule_feed_fetch_immediately()
                if not self.fpf.entries:
                    self.feed = self.feed.save()
                    self.feed.save_feed_history(self.fpf.status,
                                                "HTTP Redirect")
                    return FEED_ERRHTTP, ret_values
            if self.fpf.status >= 400:
                logging.debug(
                    "   ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..."
                    % (self.feed.title[:30], self.fpf.status))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(self.fpf.status, "HTTP Error")
                self.feed = self.feed.save()
                return FEED_ERRHTTP, ret_values

        if not self.fpf.entries:
            if self.fpf.bozo and isinstance(self.fpf.bozo_exception,
                                            feedparser.NonXMLContentType):
                logging.debug(
                    "   ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..."
                    % (self.feed.title[:30], len(self.fpf.entries)))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(552, 'Non-xml feed',
                                                self.fpf.bozo_exception)
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values
            elif self.fpf.bozo and isinstance(
                    self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
                logging.debug(
                    "   ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..."
                    % (self.feed.title[:30], len(self.fpf.entries)))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(553, 'SAX Exception',
                                                self.fpf.bozo_exception)
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values

        # the feed has changed (or it is the first time we parse it)
        # saving the etag and last_modified fields
        self.feed.etag = self.fpf.get('etag')
        if self.feed.etag:
            self.feed.etag = self.feed.etag[:255]
        # some times this is None (it never should) *sigh*
        if self.feed.etag is None:
            self.feed.etag = ''

        try:
            self.feed.last_modified = mtime(self.fpf.modified)
        except:
            pass

        self.fpf.entries = self.fpf.entries[:50]

        if self.fpf.feed.get('title'):
            self.feed.feed_title = self.fpf.feed.get('title')
        tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline)
        if tagline:
            self.feed.data.feed_tagline = utf8encode(tagline)
            self.feed.data.save()
        if not self.feed.feed_link_locked:
            self.feed.feed_link = self.fpf.feed.get(
                'link') or self.fpf.feed.get('id') or self.feed.feed_link

        guids = []
        for entry in self.fpf.entries:
            if entry.get('id', ''):
                guids.append(entry.get('id', ''))
            elif entry.get('link'):
                guids.append(entry.link)
            elif entry.get('title'):
                guids.append(entry.title)
        self.feed = self.feed.save()

        # Compare new stories to existing stories, adding and updating
        start_date = datetime.datetime.utcnow()
        # end_date = datetime.datetime.utcnow()
        story_guids = []
        stories = []
        for entry in self.fpf.entries:
            story = pre_process_story(entry)
            if story.get('published') < start_date:
                start_date = story.get('published')
            # if story.get('published') > end_date:
            #     end_date = story.get('published')
            stories.append(story)
            story_guids.append(story.get('guid') or story.get('link'))

        existing_stories = list(
            MStory.objects(
                # story_guid__in=story_guids,
                story_date__gte=start_date,
                story_feed_id=self.feed_id).limit(len(story_guids)))

        # MStory.objects(
        #     (Q(story_date__gte=start_date) & Q(story_date__lte=end_date))
        #     | (Q(story_guid__in=story_guids)),
        #     story_feed=self.feed
        # ).order_by('-story_date')
        ret_values = self.feed.add_update_stories(
            stories, existing_stories, verbose=self.options['verbose'])

        if ((not self.feed.is_push or self.options.get('force'))
                and hasattr(self.fpf, 'feed')
                and hasattr(self.fpf.feed, 'links') and self.fpf.feed.links):
            hub_url = None
            self_url = self.feed.feed_address
            for link in self.fpf.feed.links:
                if link['rel'] == 'hub':
                    hub_url = link['href']
                elif link['rel'] == 'self':
                    self_url = link['href']
            if hub_url and self_url and not settings.DEBUG:
                logging.debug(
                    u'   ---> [%-30s] ~BB~FWSubscribing to PuSH hub: %s' %
                    (self.feed.title[:30], hub_url))
                PushSubscription.objects.subscribe(self_url,
                                                   feed=self.feed,
                                                   hub=hub_url)

        logging.debug(
            u'   ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s'
            % (self.feed.title[:30], '~FG~SB' if ret_values[ENTRY_NEW] else '',
               ret_values[ENTRY_NEW], '~FY~SB'
               if ret_values[ENTRY_UPDATED] else '', ret_values[ENTRY_UPDATED],
               '~SB' if ret_values[ENTRY_SAME] else '', ret_values[ENTRY_SAME],
               '~FR~SB' if ret_values[ENTRY_ERR] else '',
               ret_values[ENTRY_ERR], len(self.fpf.entries)))
        self.feed.update_all_statistics(full=bool(ret_values[ENTRY_NEW]),
                                        force=self.options['force'])
        self.feed.trim_feed()
        self.feed.save_feed_history(200, "OK")

        if self.options['verbose']:
            logging.debug(u'   ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' %
                          (self.feed.title[:30], time.time() - start))

        return FEED_OK, ret_values
Example #7
0
    def process(self, first_run=True):
        """ Downloads and parses a feed.
        """
        self.refresh_feed()

        ret_values = {
            ENTRY_NEW: 0,
            ENTRY_UPDATED: 0,
            ENTRY_SAME: 0,
            ENTRY_ERR: 0
        }

        # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title))

        self.feed.fetched_once = True
        self.feed.last_update = datetime.datetime.utcnow()

        if hasattr(self.fpf, 'status'):
            if self.options['verbose']:
                logging.debug(
                    u'   ---> [%-30s] Fetched feed, HTTP status %d: %s (bozo: %s)'
                    % (unicode(self.feed)[:30], self.fpf.status,
                       self.feed.feed_address, self.fpf.bozo))
                if self.fpf.bozo and self.fpf.status != 304:
                    logging.debug(
                        u'   ---> [%-30s] BOZO exception: %s (%s entries)' %
                        (unicode(self.feed)[:30], self.fpf.bozo_exception,
                         len(self.fpf.entries)))
            if self.fpf.status == 304:
                self.feed.save()
                self.feed.save_feed_history(304, "Not modified")
                return FEED_SAME, ret_values

            if self.fpf.status in (302, 301):
                if not self.fpf.href.endswith('feedburner.com/atom.xml'):
                    self.feed.feed_address = self.fpf.href
                if first_run:
                    self.feed.has_feed_exception = True
                    self.feed.schedule_feed_fetch_immediately()
                if not self.fpf.entries:
                    self.feed.save()
                    self.feed.save_feed_history(self.fpf.status,
                                                "HTTP Redirect")
                    return FEED_ERRHTTP, ret_values

            if self.fpf.status >= 400:
                logging.debug(
                    "   ---> [%-30s] HTTP Status code: %s. Checking address..."
                    % (unicode(self.feed)[:30], self.fpf.status))
                fixed_feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(self.fpf.status, "HTTP Error")
                else:
                    self.feed.has_feed_exception = True
                    self.feed.schedule_feed_fetch_immediately()
                self.feed.save()
                return FEED_ERRHTTP, ret_values

        if self.fpf.bozo and isinstance(self.fpf.bozo_exception,
                                        feedparser.NonXMLContentType):
            logging.debug(
                "   ---> [%-30s] Feed is Non-XML. %s entries.%s Checking address..."
                % (unicode(self.feed)[:30], len(
                    self.fpf.entries), ' Not' if self.fpf.entries else ''))
            if not self.fpf.entries:
                fixed_feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(502, 'Non-xml feed',
                                                self.fpf.bozo_exception)
                else:
                    self.feed.has_feed_exception = True
                    self.feed.schedule_feed_fetch_immediately()
                self.feed.save()
                return FEED_ERRPARSE, ret_values
        elif self.fpf.bozo and isinstance(self.fpf.bozo_exception,
                                          xml.sax._exceptions.SAXException):
            logging.debug(
                "   ---> [%-30s] Feed has SAX/XML parsing issues. %s entries.%s Checking address..."
                % (unicode(self.feed)[:30], len(
                    self.fpf.entries), ' Not' if self.fpf.entries else ''))
            if not self.fpf.entries:
                fixed_feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(503, 'SAX Exception',
                                                self.fpf.bozo_exception)
                else:
                    self.feed.has_feed_exception = True
                    self.feed.schedule_feed_fetch_immediately()
                self.feed.save()
                return FEED_ERRPARSE, ret_values

        # the feed has changed (or it is the first time we parse it)
        # saving the etag and last_modified fields
        self.feed.etag = self.fpf.get('etag')
        if self.feed.etag:
            self.feed.etag = self.feed.etag[:255]
        # some times this is None (it never should) *sigh*
        if self.feed.etag is None:
            self.feed.etag = ''

        try:
            self.feed.last_modified = mtime(self.fpf.modified)
        except:
            pass

        self.fpf.entries = self.fpf.entries[:50]

        self.feed.feed_title = self.fpf.feed.get('title', self.feed.feed_title)
        tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline)
        if tagline:
            self.feed.data.feed_tagline = utf8encode(tagline)
            self.feed.data.save()
        if not self.feed.feed_link_locked:
            self.feed.feed_link = self.fpf.feed.get(
                'link') or self.fpf.feed.get('id') or self.feed.feed_link

        self.feed.last_update = datetime.datetime.utcnow()

        guids = []
        for entry in self.fpf.entries:
            if entry.get('id', ''):
                guids.append(entry.get('id', ''))
            elif entry.get('link'):
                guids.append(entry.link)
            elif entry.get('title'):
                guids.append(entry.title)
        self.feed.save()

        # Compare new stories to existing stories, adding and updating
        start_date = datetime.datetime.utcnow()
        # end_date = datetime.datetime.utcnow()
        story_guids = []
        for entry in self.fpf.entries:
            story = pre_process_story(entry)
            if story.get('published') < start_date:
                start_date = story.get('published')
            # if story.get('published') > end_date:
            #     end_date = story.get('published')
            story_guids.append(story.get('guid') or story.get('link'))

        existing_stories = list(
            MStory.objects(
                # story_guid__in=story_guids,
                story_date__gte=start_date,
                story_feed_id=self.feed.pk).limit(len(story_guids)))

        # MStory.objects(
        #     (Q(story_date__gte=start_date) & Q(story_date__lte=end_date))
        #     | (Q(story_guid__in=story_guids)),
        #     story_feed=self.feed
        # ).order_by('-story_date')
        ret_values = self.feed.add_update_stories(
            self.fpf.entries,
            existing_stories,
            verbose=self.options['verbose'])

        logging.debug(
            u'   ---> [%-30s] ~FYParsed Feed: new=~FG~SB%s~SN~FY up=~FY~SB%s~SN same=~FY%s err=~FR~SB%s'
            % (unicode(self.feed)[:30], ret_values[ENTRY_NEW],
               ret_values[ENTRY_UPDATED], ret_values[ENTRY_SAME],
               ret_values[ENTRY_ERR]))
        self.feed.update_all_statistics(full=bool(ret_values[ENTRY_NEW]))
        self.feed.trim_feed()
        self.feed.save_feed_history(200, "OK")

        return FEED_OK, ret_values
Example #8
0
    def process(self):
        """ Downloads and parses a feed.
        """
        start = time.time()
        self.refresh_feed()

        ret_values = dict(new=0, updated=0, same=0, error=0)

        # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title))

        if hasattr(self.fpf, "status"):
            if self.options["verbose"]:
                if self.fpf.bozo and self.fpf.status != 304:
                    logging.debug(
                        u"   ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)"
                        % (self.feed.title[:30], self.fpf.bozo_exception, len(self.fpf.entries))
                    )

            if self.fpf.status == 304:
                self.feed = self.feed.save()
                self.feed.save_feed_history(304, "Not modified")
                return FEED_SAME, ret_values

            # 302: Temporary redirect: ignore
            # 301: Permanent redirect: save it
            if self.fpf.status == 301:
                if not self.fpf.href.endswith("feedburner.com/atom.xml"):
                    self.feed.feed_address = self.fpf.href
                if not self.feed.known_good:
                    self.feed.fetched_once = True
                    logging.debug(
                        "   ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..."
                        % (self.feed.title[:30], self.fpf.status)
                    )
                    self.feed = self.feed.schedule_feed_fetch_immediately()
                if not self.fpf.entries:
                    self.feed = self.feed.save()
                    self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
                    return FEED_ERRHTTP, ret_values
            if self.fpf.status >= 400:
                logging.debug(
                    "   ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..."
                    % (self.feed.title[:30], self.fpf.status)
                )
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(self.fpf.status, "HTTP Error")
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRHTTP, ret_values

        if not self.fpf.entries:
            if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
                logging.debug(
                    "   ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..."
                    % (self.feed.title[:30], len(self.fpf.entries))
                )
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(552, "Non-xml feed", self.fpf.bozo_exception)
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values
            elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
                logging.debug(
                    "   ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..."
                    % (self.feed.title[:30], len(self.fpf.entries))
                )
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(553, "SAX Exception", self.fpf.bozo_exception)
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values

        # the feed has changed (or it is the first time we parse it)
        # saving the etag and last_modified fields
        self.feed.etag = self.fpf.get("etag")
        if self.feed.etag:
            self.feed.etag = self.feed.etag[:255]
        # some times this is None (it never should) *sigh*
        if self.feed.etag is None:
            self.feed.etag = ""

        try:
            self.feed.last_modified = mtime(self.fpf.modified)
        except:
            self.feed.last_modified = None
            pass

        self.fpf.entries = self.fpf.entries[:50]

        if self.fpf.feed.get("title"):
            self.feed.feed_title = self.fpf.feed.get("title")
        tagline = self.fpf.feed.get("tagline", self.feed.data.feed_tagline)
        if tagline:
            self.feed.data.feed_tagline = utf8encode(tagline)
            self.feed.data.save()
        if not self.feed.feed_link_locked:
            self.feed.feed_link = self.fpf.feed.get("link") or self.fpf.feed.get("id") or self.feed.feed_link

        self.feed = self.feed.save()

        # Compare new stories to existing stories, adding and updating
        start_date = datetime.datetime.utcnow()
        story_guids = []
        stories = []
        for entry in self.fpf.entries:
            story = pre_process_story(entry)
            if story.get("published") < start_date:
                start_date = story.get("published")
            stories.append(story)
            story_guids.append(story.get("guid"))

        existing_stories = dict(
            (s.story_guid, s)
            for s in MStory.objects(
                # story_guid__in=story_guids,
                story_date__gte=start_date,
                story_feed_id=self.feed.pk,
            ).limit(max(int(len(story_guids) * 1.5), 10))
        )

        ret_values = self.feed.add_update_stories(stories, existing_stories, verbose=self.options["verbose"])

        if hasattr(self.fpf, "feed") and hasattr(self.fpf.feed, "links") and self.fpf.feed.links:
            hub_url = None
            self_url = self.feed.feed_address
            for link in self.fpf.feed.links:
                if link["rel"] == "hub" and not hub_url:
                    hub_url = link["href"]
                elif link["rel"] == "self":
                    self_url = link["href"]
            push_expired = self.feed.is_push and self.feed.push.lease_expires < datetime.datetime.now()
            if (
                hub_url
                and self_url
                and not settings.DEBUG
                and self.feed.active_subscribers > 0
                and (push_expired or not self.feed.is_push or self.options.get("force"))
            ):
                logging.debug(
                    u"   ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s"
                    % (self.feed.title[:30], "~SKRe-~SN" if push_expired else "", hub_url)
                )
                PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url)
            elif self.feed.is_push and (self.feed.active_subscribers <= 0 or not hub_url):
                logging.debug(u"   ---> [%-30s] ~BB~FWTurning off PuSH, no hub found" % (self.feed.title[:30]))
                self.feed.is_push = False
                self.feed = self.feed.save()

        logging.debug(
            u"   ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s"
            % (
                self.feed.title[:30],
                "~FG~SB" if ret_values["new"] else "",
                ret_values["new"],
                "~FY~SB" if ret_values["updated"] else "",
                ret_values["updated"],
                "~SB" if ret_values["same"] else "",
                ret_values["same"],
                "~FR~SB" if ret_values["error"] else "",
                ret_values["error"],
                len(self.fpf.entries),
            )
        )
        self.feed.update_all_statistics(full=bool(ret_values["new"]), force=self.options["force"])
        if ret_values["new"]:
            self.feed.trim_feed()
            self.feed.expire_redis()
        self.feed.save_feed_history(200, "OK")

        if self.options["verbose"]:
            logging.debug(
                u"   ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss" % (self.feed.title[:30], time.time() - start)
            )

        return FEED_OK, ret_values
Example #9
0
    def process(self):
        """ Downloads and parses a feed.
        """
        start = time.time()
        self.refresh_feed()

        ret_values = dict(new=0, updated=0, same=0, error=0)

        if hasattr(self.fpf, "status"):
            if self.options["verbose"]:
                if self.fpf.bozo and self.fpf.status != 304:
                    logging.debug(
                        u"   ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)"
                        % (self.feed.title[:30], self.fpf.bozo_exception, len(self.fpf.entries))
                    )

            if self.fpf.status == 304:
                self.feed = self.feed.save()
                self.feed.save_feed_history(304, "Not modified")
                return FEED_SAME, ret_values

            # 302: Temporary redirect: ignore
            # 301: Permanent redirect: save it (after 20 tries)
            if self.fpf.status == 301:
                if self.fpf.href.endswith("feedburner.com/atom.xml"):
                    return FEED_ERRHTTP, ret_values
                redirects, non_redirects = self.feed.count_redirects_in_history("feed")
                self.feed.save_feed_history(self.fpf.status, "HTTP Redirect (%d to go)" % (20 - len(redirects)))
                if len(redirects) >= 20 or len(non_redirects) == 0:
                    self.feed.feed_address = self.fpf.href
                if not self.feed.known_good:
                    self.feed.fetched_once = True
                    logging.debug(
                        "   ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..."
                        % (self.feed.title[:30], self.fpf.status)
                    )
                    self.feed = self.feed.schedule_feed_fetch_immediately()
                if not self.fpf.entries:
                    self.feed = self.feed.save()
                    self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
                    return FEED_ERRHTTP, ret_values
            if self.fpf.status >= 400:
                logging.debug(
                    "   ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..."
                    % (self.feed.title[:30], self.fpf.status)
                )
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(self.fpf.status, "HTTP Error")
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRHTTP, ret_values

        if not self.fpf.entries:
            if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
                logging.debug(
                    "   ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..."
                    % (self.feed.title[:30], len(self.fpf.entries))
                )
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(552, "Non-xml feed", self.fpf.bozo_exception)
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values
            elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
                logging.debug(
                    "   ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..."
                    % (self.feed.title[:30], len(self.fpf.entries))
                )
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed, feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(553, "SAX Exception", self.fpf.bozo_exception)
                else:
                    self.feed = feed
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values

        # the feed has changed (or it is the first time we parse it)
        # saving the etag and last_modified fields
        original_etag = self.feed.etag
        self.feed.etag = self.fpf.get("etag")
        if self.feed.etag:
            self.feed.etag = self.feed.etag[:255]
        # some times this is None (it never should) *sigh*
        if self.feed.etag is None:
            self.feed.etag = ""
        if self.feed.etag != original_etag:
            self.feed.save(update_fields=["etag"])

        original_last_modified = self.feed.last_modified
        try:
            self.feed.last_modified = mtime(self.fpf.modified)
        except:
            self.feed.last_modified = None
            pass
        if self.feed.last_modified != original_last_modified:
            self.feed.save(update_fields=["last_modified"])

        self.fpf.entries = self.fpf.entries[:100]

        original_title = self.feed.feed_title
        if self.fpf.feed.get("title"):
            self.feed.feed_title = strip_tags(self.fpf.feed.get("title"))
        if self.feed.feed_title != original_title:
            self.feed.save(update_fields=["feed_title"])

        tagline = self.fpf.feed.get("tagline", self.feed.data.feed_tagline)
        if tagline:
            original_tagline = self.feed.data.feed_tagline
            self.feed.data.feed_tagline = utf8encode(tagline)
            if self.feed.data.feed_tagline != original_tagline:
                self.feed.data.save(update_fields=["feed_tagline"])

        if not self.feed.feed_link_locked:
            new_feed_link = self.fpf.feed.get("link") or self.fpf.feed.get("id") or self.feed.feed_link
            if new_feed_link != self.feed.feed_link:
                logging.debug(
                    "   ---> [%-30s] ~SB~FRFeed's page is different: %s to %s"
                    % (self.feed.title[:30], self.feed.feed_link, new_feed_link)
                )
                redirects, non_redirects = self.feed.count_redirects_in_history("page")
                self.feed.save_page_history(301, "HTTP Redirect (%s to go)" % (20 - len(redirects)))
                if len(redirects) >= 20 or len(non_redirects) == 0:
                    self.feed.feed_link = new_feed_link
                    self.feed.save(update_fields=["feed_link"])

        # Determine if stories aren't valid and replace broken guids
        guids_seen = set()
        permalinks_seen = set()
        for entry in self.fpf.entries:
            guids_seen.add(entry.get("guid"))
            permalinks_seen.add(Feed.get_permalink(entry))
        guid_difference = len(guids_seen) != len(self.fpf.entries)
        single_guid = len(guids_seen) == 1
        replace_guids = single_guid and guid_difference
        permalink_difference = len(permalinks_seen) != len(self.fpf.entries)
        single_permalink = len(permalinks_seen) == 1
        replace_permalinks = single_permalink and permalink_difference

        # Compare new stories to existing stories, adding and updating
        start_date = datetime.datetime.utcnow()
        story_hashes = []
        stories = []
        for entry in self.fpf.entries:
            story = pre_process_story(entry)
            if story.get("published") < start_date:
                start_date = story.get("published")
            if replace_guids:
                if replace_permalinks:
                    new_story_guid = unicode(story.get("published"))
                    if self.options["verbose"]:
                        logging.debug(
                            u"   ---> [%-30s] ~FBReplacing guid (%s) with timestamp: %s"
                            % (self.feed.title[:30], story.get("guid"), new_story_guid)
                        )
                    story["guid"] = new_story_guid
                else:
                    new_story_guid = Feed.get_permalink(story)
                    if self.options["verbose"]:
                        logging.debug(
                            u"   ---> [%-30s] ~FBReplacing guid (%s) with permalink: %s"
                            % (self.feed.title[:30], story.get("guid"), new_story_guid)
                        )
                    story["guid"] = new_story_guid
            story["story_hash"] = MStory.feed_guid_hash_unsaved(self.feed.pk, story.get("guid"))
            stories.append(story)
            story_hashes.append(story.get("story_hash"))

        existing_stories = dict(
            (s.story_hash, s)
            for s in MStory.objects(
                story_hash__in=story_hashes,
                # story_date__gte=start_date,
                # story_feed_id=self.feed.pk
            )
        )

        ret_values = self.feed.add_update_stories(
            stories, existing_stories, verbose=self.options["verbose"], updates_off=self.options["updates_off"]
        )

        if hasattr(self.fpf, "feed") and hasattr(self.fpf.feed, "links") and self.fpf.feed.links:
            hub_url = None
            self_url = self.feed.feed_address
            for link in self.fpf.feed.links:
                if link["rel"] == "hub" and not hub_url:
                    hub_url = link["href"]
                elif link["rel"] == "self":
                    self_url = link["href"]
            push_expired = False
            if self.feed.is_push:
                try:
                    push_expired = self.feed.push.lease_expires < datetime.datetime.now()
                except PushSubscription.DoesNotExist:
                    self.feed.is_push = False
            if (
                hub_url
                and self_url
                and not settings.DEBUG
                and self.feed.active_subscribers > 0
                and (push_expired or not self.feed.is_push or self.options.get("force"))
            ):
                logging.debug(
                    u"   ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s"
                    % (self.feed.title[:30], "~SKRe-~SN" if push_expired else "", hub_url)
                )
                try:
                    PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url)
                except TimeoutError:
                    logging.debug(
                        u"   ---> [%-30s] ~BB~FW~FRTimed out~FW subscribing to PuSH hub: %s"
                        % (self.feed.title[:30], hub_url)
                    )
            elif self.feed.is_push and (self.feed.active_subscribers <= 0 or not hub_url):
                logging.debug(u"   ---> [%-30s] ~BB~FWTurning off PuSH, no hub found" % (self.feed.title[:30]))
                self.feed.is_push = False
                self.feed = self.feed.save()

        logging.debug(
            u"   ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s"
            % (
                self.feed.title[:30],
                "~FG~SB" if ret_values["new"] else "",
                ret_values["new"],
                "~FY~SB" if ret_values["updated"] else "",
                ret_values["updated"],
                "~SB" if ret_values["same"] else "",
                ret_values["same"],
                "~FR~SB" if ret_values["error"] else "",
                ret_values["error"],
                len(self.fpf.entries),
            )
        )
        self.feed.update_all_statistics(full=bool(ret_values["new"]), force=self.options["force"])
        if ret_values["new"]:
            self.feed.trim_feed()
            self.feed.expire_redis()
        self.feed.save_feed_history(200, "OK")

        if self.options["verbose"]:
            logging.debug(
                u"   ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss" % (self.feed.title[:30], time.time() - start)
            )

        return FEED_OK, ret_values
Example #10
0
    def process(self):
        """ Downloads and parses a feed.
        """
        start = time.time()
        self.refresh_feed()
        
        ret_values = dict(new=0, updated=0, same=0, error=0)

        # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title))

        if hasattr(self.fpf, 'status'):
            if self.options['verbose']:
                if self.fpf.bozo and self.fpf.status != 304:
                    logging.debug(u'   ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % (
                                  self.feed.title[:30],
                                  self.fpf.bozo_exception,
                                  len(self.fpf.entries)))
                    
            if self.fpf.status == 304:
                self.feed = self.feed.save()
                self.feed.save_feed_history(304, "Not modified")
                return FEED_SAME, ret_values
            
            if self.fpf.status in (302, 301):
                if not self.fpf.href.endswith('feedburner.com/atom.xml'):
                    self.feed.feed_address = self.fpf.href
                if not self.feed.known_good:
                    self.feed.fetched_once = True
                    logging.debug("   ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (self.feed.title[:30], self.fpf.status))
                    self.feed = self.feed.schedule_feed_fetch_immediately()
                if not self.fpf.entries:
                    self.feed = self.feed.save()
                    self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
                    return FEED_ERRHTTP, ret_values
            if self.fpf.status >= 400:
                logging.debug("   ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." % (self.feed.title[:30], self.fpf.status))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(self.fpf.status, "HTTP Error")
                self.feed = self.feed.save()
                return FEED_ERRHTTP, ret_values

        if not self.fpf.entries:
            if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
                logging.debug("   ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries)))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(552, 'Non-xml feed', self.fpf.bozo_exception)
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values
            elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
                logging.debug("   ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries)))
                fixed_feed = None
                if not self.feed.known_good:
                    fixed_feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(553, 'SAX Exception', self.fpf.bozo_exception)
                self.feed = self.feed.save()
                return FEED_ERRPARSE, ret_values
                
        # the feed has changed (or it is the first time we parse it)
        # saving the etag and last_modified fields
        self.feed.etag = self.fpf.get('etag')
        if self.feed.etag:
            self.feed.etag = self.feed.etag[:255]
        # some times this is None (it never should) *sigh*
        if self.feed.etag is None:
            self.feed.etag = ''

        try:
            self.feed.last_modified = mtime(self.fpf.modified)
        except:
            pass
        
        self.fpf.entries = self.fpf.entries[:50]
        
        if self.fpf.feed.get('title'):
            self.feed.feed_title = self.fpf.feed.get('title')
        tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline)
        if tagline:
            self.feed.data.feed_tagline = utf8encode(tagline)
            self.feed.data.save()
        if not self.feed.feed_link_locked:
            self.feed.feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link