def process(self): """ Downloads and parses a feed. """ start = time.time() self.refresh_feed() ret_values = {ENTRY_NEW: 0, ENTRY_UPDATED: 0, ENTRY_SAME: 0, ENTRY_ERR: 0} # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title)) if hasattr(self.fpf, "status"): if self.options["verbose"]: if self.fpf.bozo and self.fpf.status != 304: logging.debug( u" ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)" % (self.feed.title[:30], self.fpf.bozo_exception, len(self.fpf.entries)) ) if self.fpf.status == 304: self.feed = self.feed.save() self.feed.save_feed_history(304, "Not modified") return FEED_SAME, ret_values if self.fpf.status in (302, 301): if not self.fpf.href.endswith("feedburner.com/atom.xml"): self.feed.feed_address = self.fpf.href if not self.feed.known_good: self.feed.fetched_once = True logging.debug( " ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (self.feed.title[:30], self.fpf.status) ) self.feed = self.feed.schedule_feed_fetch_immediately() if not self.fpf.entries: self.feed = self.feed.save() self.feed.save_feed_history(self.fpf.status, "HTTP Redirect") return FEED_ERRHTTP, ret_values if self.fpf.status >= 400: logging.debug( " ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." % (self.feed.title[:30], self.fpf.status) ) fixed_feed = None if not self.feed.known_good: fixed_feed = self.feed.check_feed_link_for_feed_address() if not fixed_feed: self.feed.save_feed_history(self.fpf.status, "HTTP Error") self.feed = self.feed.save() return FEED_ERRHTTP, ret_values if not self.fpf.entries: if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType): logging.debug( " ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries)) ) fixed_feed = None if not self.feed.known_good: fixed_feed = self.feed.check_feed_link_for_feed_address() if not fixed_feed: self.feed.save_feed_history(552, "Non-xml feed", self.fpf.bozo_exception) self.feed = self.feed.save() return FEED_ERRPARSE, ret_values elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException): logging.debug( " ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..." 
% (self.feed.title[:30], len(self.fpf.entries)) ) fixed_feed = None if not self.feed.known_good: fixed_feed = self.feed.check_feed_link_for_feed_address() if not fixed_feed: self.feed.save_feed_history(553, "SAX Exception", self.fpf.bozo_exception) self.feed = self.feed.save() return FEED_ERRPARSE, ret_values # the feed has changed (or it is the first time we parse it) # saving the etag and last_modified fields self.feed.etag = self.fpf.get("etag") if self.feed.etag: self.feed.etag = self.feed.etag[:255] # some times this is None (it never should) *sigh* if self.feed.etag is None: self.feed.etag = "" try: self.feed.last_modified = mtime(self.fpf.modified) except: pass self.fpf.entries = self.fpf.entries[:50] if self.fpf.feed.get("title"): self.feed.feed_title = self.fpf.feed.get("title") tagline = self.fpf.feed.get("tagline", self.feed.data.feed_tagline) if tagline: self.feed.data.feed_tagline = utf8encode(tagline) self.feed.data.save() if not self.feed.feed_link_locked: self.feed.feed_link = self.fpf.feed.get("link") or self.fpf.feed.get("id") or self.feed.feed_link guids = [] for entry in self.fpf.entries: if entry.get("id", ""): guids.append(entry.get("id", "")) elif entry.get("link"): guids.append(entry.link) elif entry.get("title"): guids.append(entry.title) self.feed = self.feed.save() # Compare new stories to existing stories, adding and updating start_date = datetime.datetime.utcnow() # end_date = datetime.datetime.utcnow() story_guids = [] stories = [] for entry in self.fpf.entries: story = pre_process_story(entry) if story.get("published") < start_date: start_date = story.get("published") # if story.get('published') > end_date: # end_date = story.get('published') stories.append(story) story_guids.append(story.get("guid") or story.get("link")) existing_stories = list( MStory.objects( # story_guid__in=story_guids, story_date__gte=start_date, story_feed_id=self.feed_id, ).limit(len(story_guids)) ) # MStory.objects( # (Q(story_date__gte=start_date) & Q(story_date__lte=end_date)) # | (Q(story_guid__in=story_guids)), # story_feed=self.feed # ).order_by('-story_date') ret_values = self.feed.add_update_stories(stories, existing_stories, verbose=self.options["verbose"]) if ( (not self.feed.is_push or self.options.get("force")) and hasattr(self.fpf, "feed") and hasattr(self.fpf.feed, "links") and self.fpf.feed.links ): hub_url = None self_url = self.feed.feed_address for link in self.fpf.feed.links: if link["rel"] == "hub": hub_url = link["href"] elif link["rel"] == "self": self_url = link["href"] if hub_url and self_url and not settings.DEBUG: logging.debug(u" ---> [%-30s] ~BB~FWSubscribing to PuSH hub: %s" % (self.feed.title[:30], hub_url)) PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url) logging.debug( u" ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s" % ( self.feed.title[:30], "~FG~SB" if ret_values[ENTRY_NEW] else "", ret_values[ENTRY_NEW], "~FY~SB" if ret_values[ENTRY_UPDATED] else "", ret_values[ENTRY_UPDATED], "~SB" if ret_values[ENTRY_SAME] else "", ret_values[ENTRY_SAME], "~FR~SB" if ret_values[ENTRY_ERR] else "", ret_values[ENTRY_ERR], len(self.fpf.entries), ) ) self.feed.update_all_statistics(full=bool(ret_values[ENTRY_NEW]), force=self.options["force"]) self.feed.trim_feed() self.feed.save_feed_history(200, "OK") if self.options["verbose"]: logging.debug( u" ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss" % (self.feed.title[:30], time.time() - start) ) return FEED_OK, ret_values
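# Illustrative sketch, not part of the fetcher above: the HTTP status handling in
# process() reduces to a dispatch from feedparser's reported status to a fetch result.
# classify_http_status is a hypothetical helper name; the real code also saves feed
# history, may rewrite feed_address on redirects, and only bails out of a redirect
# when the redirected response carried no entries.
def classify_http_status(status, has_entries=True):
    if status == 304:
        return FEED_SAME          # Not Modified: nothing to parse
    if status in (301, 302) and not has_entries:
        return FEED_ERRHTTP       # redirect with an empty body: record it and refetch later
    if status >= 400:
        return FEED_ERRHTTP       # hard HTTP error: try to discover a new feed address
    return None                   # fall through and parse the entries normally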
def add_update_stories(self, stories, existing_stories):
    ret_values = {ENTRY_NEW: 0, ENTRY_UPDATED: 0, ENTRY_SAME: 0, ENTRY_ERR: 0}

    for story in stories:
        story = pre_process_story(story)
        if story.get("title"):
            story_contents = story.get("content")
            story_tags = self.get_tags(story)
            if story_contents is not None:
                story_content = story_contents[0]["value"]
            else:
                story_content = story.get("summary")
            existing_story, story_has_changed = self._exists_story(story, story_content, existing_stories)
            if existing_story is None:
                s = MStory(
                    story_feed_id=self.pk,
                    story_date=story.get("published"),
                    story_title=story.get("title"),
                    story_content=story_content,
                    story_author_name=story.get("author"),
                    story_permalink=story.get("link"),
                    story_guid=story.get("guid") or story.get("id") or story.get("link"),
                    story_tags=story_tags,
                )
                try:
                    s.save()
                    ret_values[ENTRY_NEW] += 1
                    cache.set("updated_feed:%s" % self.id, 1)
                except (IntegrityError, OperationError):
                    ret_values[ENTRY_ERR] += 1
                    # logging.info('Saving new story, IntegrityError: %s - %s: %s' % (self.feed_title, story.get('title'), e))
            elif existing_story and story_has_changed:
                # update story
                # logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story.story_content), len(story_content)))
                original_content = None
                if existing_story.story_original_content_z:
                    original_content = zlib.decompress(existing_story.story_original_content_z)
                elif existing_story.story_content_z:
                    original_content = zlib.decompress(existing_story.story_content_z)
                # print 'Type: %s %s' % (type(original_content), type(story_content))
                if story_content and len(story_content) > 10:
                    diff = HTMLDiff(unicode(original_content), story_content)
                    story_content_diff = diff.getDiff()
                else:
                    story_content_diff = original_content
                # logging.debug("\t\tDiff: %s %s %s" % diff.getStats())
                # logging.debug("\t\tDiff content: %s" % diff.getDiff())
                if existing_story.story_title != story.get("title"):
                    # logging.debug('\tExisting title / New: : \n\t\t- %s\n\t\t- %s' % (existing_story.story_title, story.get('title')))
                    pass
                existing_story.story_feed = self.pk
                existing_story.story_date = story.get("published")
                existing_story.story_title = story.get("title")
                existing_story.story_content = story_content_diff
                existing_story.story_original_content = original_content
                existing_story.story_author_name = story.get("author")
                existing_story.story_permalink = story.get("link")
                existing_story.story_guid = story.get("guid") or story.get("id") or story.get("link")
                existing_story.story_tags = story_tags
                try:
                    existing_story.save()
                    ret_values[ENTRY_UPDATED] += 1
                    cache.set("updated_feed:%s" % self.id, 1)
                except (IntegrityError, OperationError):
                    ret_values[ENTRY_ERR] += 1
                    logging.info(
                        "Saving updated story, IntegrityError: %s - %s" % (self.feed_title, story.get("title"))
                    )
            else:
                ret_values[ENTRY_SAME] += 1
                # logging.debug("Unchanged story: %s " % story.get('title'))

    return ret_values
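# Illustrative sketch of the update branch above (hypothetical helper name): when a
# story already exists but its content changed, the stored story_content becomes an
# HTMLDiff against the previously saved original, so edits show up as marked-up
# changes. Very short or empty new content falls back to the stored original.
def choose_updated_content(original_content, new_content):
    if new_content and len(new_content) > 10:
        return HTMLDiff(unicode(original_content), new_content).getDiff()
    return original_content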
def process(self): """ Downloads and parses a feed. """ start = time.time() self.refresh_feed() ret_values = dict(new=0, updated=0, same=0, error=0) # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title)) if hasattr(self.fpf, 'status'): if self.options['verbose']: if self.fpf.bozo and self.fpf.status != 304: logging.debug(u' ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % ( self.feed.title[:30], self.fpf.bozo_exception, len(self.fpf.entries))) if self.fpf.status == 304: self.feed = self.feed.save() self.feed.save_feed_history(304, "Not modified") return FEED_SAME, ret_values if self.fpf.status in (302, 301): if not self.fpf.href.endswith('feedburner.com/atom.xml'): self.feed.feed_address = self.fpf.href if not self.feed.known_good: self.feed.fetched_once = True logging.debug(" ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (self.feed.title[:30], self.fpf.status)) self.feed = self.feed.schedule_feed_fetch_immediately() if not self.fpf.entries: self.feed = self.feed.save() self.feed.save_feed_history(self.fpf.status, "HTTP Redirect") return FEED_ERRHTTP, ret_values if self.fpf.status >= 400: logging.debug(" ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." % (self.feed.title[:30], self.fpf.status)) fixed_feed = None if not self.feed.known_good: fixed_feed = self.feed.check_feed_link_for_feed_address() if not fixed_feed: self.feed.save_feed_history(self.fpf.status, "HTTP Error") self.feed = self.feed.save() return FEED_ERRHTTP, ret_values if not self.fpf.entries: if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType): logging.debug(" ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries))) fixed_feed = None if not self.feed.known_good: fixed_feed = self.feed.check_feed_link_for_feed_address() if not fixed_feed: self.feed.save_feed_history(552, 'Non-xml feed', self.fpf.bozo_exception) self.feed = self.feed.save() return FEED_ERRPARSE, ret_values elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException): logging.debug(" ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..." 
% (self.feed.title[:30], len(self.fpf.entries))) fixed_feed = None if not self.feed.known_good: fixed_feed = self.feed.check_feed_link_for_feed_address() if not fixed_feed: self.feed.save_feed_history(553, 'SAX Exception', self.fpf.bozo_exception) self.feed = self.feed.save() return FEED_ERRPARSE, ret_values # the feed has changed (or it is the first time we parse it) # saving the etag and last_modified fields self.feed.etag = self.fpf.get('etag') if self.feed.etag: self.feed.etag = self.feed.etag[:255] # some times this is None (it never should) *sigh* if self.feed.etag is None: self.feed.etag = '' try: self.feed.last_modified = mtime(self.fpf.modified) except: pass self.fpf.entries = self.fpf.entries[:50] if self.fpf.feed.get('title'): self.feed.feed_title = self.fpf.feed.get('title') tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline) if tagline: self.feed.data.feed_tagline = utf8encode(tagline) self.feed.data.save() if not self.feed.feed_link_locked: self.feed.feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link self.feed = self.feed.save() # Compare new stories to existing stories, adding and updating start_date = datetime.datetime.utcnow() story_guids = [] stories = [] for entry in self.fpf.entries: story = pre_process_story(entry) if story.get('published') < start_date: start_date = story.get('published') stories.append(story) story_guids.append(story.get('guid')) existing_stories = list(MStory.objects( # story_guid__in=story_guids, story_date__gte=start_date, story_feed_id=self.feed.pk ).limit(max(int(len(story_guids)*1.5), 10))) ret_values = self.feed.add_update_stories(stories, existing_stories, verbose=self.options['verbose']) if ((not self.feed.is_push or self.options.get('force')) and hasattr(self.fpf, 'feed') and hasattr(self.fpf.feed, 'links') and self.fpf.feed.links): hub_url = None self_url = self.feed.feed_address for link in self.fpf.feed.links: if link['rel'] == 'hub': hub_url = link['href'] elif link['rel'] == 'self': self_url = link['href'] if hub_url and self_url and not settings.DEBUG: logging.debug(u' ---> [%-30s] ~BB~FWSubscribing to PuSH hub: %s' % ( self.feed.title[:30], hub_url)) PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url) logging.debug(u' ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s' % ( self.feed.title[:30], '~FG~SB' if ret_values['new'] else '', ret_values['new'], '~FY~SB' if ret_values['updated'] else '', ret_values['updated'], '~SB' if ret_values['same'] else '', ret_values['same'], '~FR~SB' if ret_values['error'] else '', ret_values['error'], len(self.fpf.entries))) self.feed.update_all_statistics(full=bool(ret_values['new']), force=self.options['force']) if ret_values['new']: self.feed.trim_feed() self.feed.save_feed_history(200, "OK") if self.options['verbose']: logging.debug(u' ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' % ( self.feed.title[:30], time.time() - start)) return FEED_OK, ret_values
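# Sketch of why etag/last_modified are persisted above (an assumption about the fetch
# side, not code from this file): on the next fetch they can be handed back to the
# server as conditional-GET headers, so an unchanged feed answers 304 and process()
# takes the early FEED_SAME exit. The [:255] truncation presumably matches the etag
# column size.
def conditional_get_headers(feed):
    headers = {}
    if feed.etag:
        headers['If-None-Match'] = feed.etag
    if feed.last_modified:
        headers['If-Modified-Since'] = feed.last_modified.strftime('%a, %d %b %Y %H:%M:%S GMT')
    return headers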
def add_update_stories(self, stories, existing_stories, db):
    ret_values = {ENTRY_NEW: 0, ENTRY_UPDATED: 0, ENTRY_SAME: 0, ENTRY_ERR: 0}

    for story in stories:
        story = pre_process_story(story)
        if story.get('title'):
            story_contents = story.get('content')
            story_tags = self.get_tags(story)
            if story_contents is not None:
                story_content = story_contents[0]['value']
            else:
                story_content = story.get('summary')
            existing_story, story_has_changed = self._exists_story(story, story_content, existing_stories)
            if existing_story is None:
                # pub_date = datetime.datetime.timetuple(story.get('published'))
                # logging.debug('- New story: %s %s' % (pub_date, story.get('title')))
                s = MStory(story_feed_id=self.pk,
                           story_date=story.get('published'),
                           story_title=story.get('title'),
                           story_content=story_content,
                           story_author_name=story.get('author'),
                           story_permalink=story.get('link'),
                           story_guid=story.get('guid') or story.get('id') or story.get('link'),
                           story_tags=story_tags)
                try:
                    s.save()
                    ret_values[ENTRY_NEW] += 1
                    cache.set('updated_feed:%s' % self.id, 1)
                except (IntegrityError, OperationError):
                    ret_values[ENTRY_ERR] += 1
                    # print('Saving new story, IntegrityError: %s - %s: %s' % (self.feed_title, story.get('title'), e))
            elif existing_story and story_has_changed:
                # update story
                # logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story.story_content), len(story_content)))
                original_content = None
                if existing_story.get('story_original_content_z'):
                    original_content = zlib.decompress(existing_story.get('story_original_content_z'))
                elif existing_story.get('story_content_z'):
                    original_content = zlib.decompress(existing_story.get('story_content_z'))
                # print 'Type: %s %s' % (type(original_content), type(story_content))
                # Guard against None/empty summaries before diffing (matches the MStory-based version above)
                if story_content and len(story_content) > 10:
                    diff = HTMLDiff(unicode(original_content), story_content)
                    story_content_diff = diff.getDiff()
                else:
                    story_content_diff = original_content
                # logging.debug("\t\tDiff: %s %s %s" % diff.getStats())
                # logging.debug("\t\tDiff content: %s" % diff.getDiff())
                if existing_story.get('story_title') != story.get('title'):
                    # logging.debug('\tExisting title / New: : \n\t\t- %s\n\t\t- %s' % (existing_story.story_title, story.get('title')))
                    pass
                existing_story['story_feed'] = self.pk
                existing_story['story_date'] = story.get('published')
                existing_story['story_title'] = story.get('title')
                existing_story['story_content'] = story_content_diff
                existing_story['story_original_content'] = original_content
                existing_story['story_author_name'] = story.get('author')
                existing_story['story_permalink'] = story.get('link')
                existing_story['story_guid'] = story.get('guid') or story.get('id') or story.get('link')
                existing_story['story_tags'] = story_tags
                try:
                    db.stories.update({'_id': existing_story['_id']}, existing_story)
                    ret_values[ENTRY_UPDATED] += 1
                    cache.set('updated_feed:%s' % self.id, 1)
                except (IntegrityError, OperationError):
                    ret_values[ENTRY_ERR] += 1
                    # print('Saving updated story, IntegrityError: %s - %s' % (self.feed_title, story.get('title')))
            else:
                ret_values[ENTRY_SAME] += 1
                # logging.debug("Unchanged story: %s " % story.get('title'))

    return ret_values
def process(self, first_run=True): """ Downloads and parses a feed. """ self.refresh_feed() ret_values = { ENTRY_NEW: 0, ENTRY_UPDATED: 0, ENTRY_SAME: 0, ENTRY_ERR: 0 } # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title)) self.feed.fetched_once = True self.feed.last_update = datetime.datetime.utcnow() if hasattr(self.fpf, 'status'): if self.options['verbose']: logging.debug( u' ---> [%-30s] Fetched feed, HTTP status %d: %s (bozo: %s)' % (unicode(self.feed)[:30], self.fpf.status, self.feed.feed_address, self.fpf.bozo)) if self.fpf.bozo and self.fpf.status != 304: logging.debug( u' ---> [%-30s] BOZO exception: %s (%s entries)' % (unicode(self.feed)[:30], self.fpf.bozo_exception, len(self.fpf.entries))) if self.fpf.status == 304: self.feed.save() self.feed.save_feed_history(304, "Not modified") return FEED_SAME, ret_values if self.fpf.status in (302, 301): self.feed.feed_address = self.fpf.href if first_run: self.feed.schedule_feed_fetch_immediately() if not self.fpf.entries: self.feed.save() self.feed.save_feed_history(self.fpf.status, "HTTP Redirect") return FEED_ERRHTTP, ret_values if self.fpf.status >= 400: logging.debug( " ---> [%-30s] HTTP Status code: %s. Checking address..." % (unicode(self.feed)[:30], self.fpf.status)) fixed_feed = self.feed.check_feed_address_for_feed_link() if not fixed_feed: self.feed.save_feed_history(self.fpf.status, "HTTP Error") else: self.feed.schedule_feed_fetch_immediately() self.feed.save() return FEED_ERRHTTP, ret_values if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType): if not self.fpf.entries: logging.debug( " ---> [%-30s] Feed is Non-XML. %s entries. Checking address..." % (unicode(self.feed)[:30], len(self.fpf.entries))) fixed_feed = self.feed.check_feed_address_for_feed_link() if not fixed_feed: self.feed.save_feed_history(502, 'Non-xml feed', self.fpf.bozo_exception) else: self.feed.schedule_feed_fetch_immediately() self.feed.save() return FEED_ERRPARSE, ret_values elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException): logging.debug( " ---> [%-30s] Feed is Bad XML (SAX). %s entries. Checking address..." 
% (unicode(self.feed)[:30], len(self.fpf.entries))) if not self.fpf.entries: fixed_feed = self.feed.check_feed_address_for_feed_link() if not fixed_feed: self.feed.save_feed_history(503, 'SAX Exception', self.fpf.bozo_exception) else: self.feed.schedule_feed_fetch_immediately() self.feed.save() return FEED_ERRPARSE, ret_values # the feed has changed (or it is the first time we parse it) # saving the etag and last_modified fields self.feed.etag = self.fpf.get('etag') if self.feed.etag: self.feed.etag = self.feed.etag[:255] # some times this is None (it never should) *sigh* if self.feed.etag is None: self.feed.etag = '' try: self.feed.last_modified = mtime(self.fpf.modified) except: pass self.feed.feed_title = self.fpf.feed.get('title', self.feed.feed_title) self.feed.data.feed_tagline = self.fpf.feed.get( 'tagline', self.feed.data.feed_tagline) self.feed.feed_link = self.fpf.feed.get('link') or self.fpf.feed.get( 'id') or self.feed.feed_link self.feed.last_update = datetime.datetime.utcnow() guids = [] for entry in self.fpf.entries: if entry.get('id', ''): guids.append(entry.get('id', '')) elif entry.title: guids.append(entry.title) elif entry.link: guids.append(entry.link) self.feed.save() # Compare new stories to existing stories, adding and updating start_date = datetime.datetime.utcnow() # end_date = datetime.datetime.utcnow() story_guids = [] for entry in self.fpf.entries: story = pre_process_story(entry) if story.get('published') < start_date: start_date = story.get('published') # if story.get('published') > end_date: # end_date = story.get('published') story_guids.append(story.get('guid') or story.get('link')) existing_stories = MStory.objects( # story_guid__in=story_guids, story_date__gte=start_date, story_feed_id=self.feed.pk).limit(len(story_guids)) # MStory.objects( # (Q(story_date__gte=start_date) & Q(story_date__lte=end_date)) # | (Q(story_guid__in=story_guids)), # story_feed=self.feed # ).order_by('-story_date') ret_values = self.feed.add_update_stories(self.fpf.entries, existing_stories) logging.debug(u' ---> [%-30s] Parsed Feed: %s' % ( unicode(self.feed)[:30], u' '.join(u'%s=%d' % (self.entry_trans[key], ret_values[key]) for key in self.entry_keys), )) self.feed.update_all_statistics() self.feed.trim_feed() self.feed.save_feed_history(200, "OK") return FEED_OK, ret_values
def process(self): """ Downloads and parses a feed. """ start = time.time() self.refresh_feed() ret_values = dict(new=0, updated=0, same=0, error=0) # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title)) if hasattr(self.fpf, 'status'): if self.options['verbose']: if self.fpf.bozo and self.fpf.status != 304: logging.debug( u' ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % (self.feed.title[:30], self.fpf.bozo_exception, len(self.fpf.entries))) if self.fpf.status == 304: self.feed = self.feed.save() self.feed.save_feed_history(304, "Not modified") return FEED_SAME, ret_values if self.fpf.status in (302, 301): if not self.fpf.href.endswith('feedburner.com/atom.xml'): self.feed.feed_address = self.fpf.href if not self.feed.known_good: self.feed.fetched_once = True logging.debug( " ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (self.feed.title[:30], self.fpf.status)) self.feed = self.feed.schedule_feed_fetch_immediately() if not self.fpf.entries: self.feed = self.feed.save() self.feed.save_feed_history(self.fpf.status, "HTTP Redirect") return FEED_ERRHTTP, ret_values if self.fpf.status >= 400: logging.debug( " ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." % (self.feed.title[:30], self.fpf.status)) fixed_feed = None if not self.feed.known_good: fixed_feed = self.feed.check_feed_link_for_feed_address() if not fixed_feed: self.feed.save_feed_history(self.fpf.status, "HTTP Error") self.feed = self.feed.save() return FEED_ERRHTTP, ret_values if not self.fpf.entries: if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType): logging.debug( " ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries))) fixed_feed = None if not self.feed.known_good: fixed_feed = self.feed.check_feed_link_for_feed_address() if not fixed_feed: self.feed.save_feed_history(552, 'Non-xml feed', self.fpf.bozo_exception) self.feed = self.feed.save() return FEED_ERRPARSE, ret_values elif self.fpf.bozo and isinstance( self.fpf.bozo_exception, xml.sax._exceptions.SAXException): logging.debug( " ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..." 
% (self.feed.title[:30], len(self.fpf.entries))) fixed_feed = None if not self.feed.known_good: fixed_feed = self.feed.check_feed_link_for_feed_address() if not fixed_feed: self.feed.save_feed_history(553, 'SAX Exception', self.fpf.bozo_exception) self.feed = self.feed.save() return FEED_ERRPARSE, ret_values # the feed has changed (or it is the first time we parse it) # saving the etag and last_modified fields self.feed.etag = self.fpf.get('etag') if self.feed.etag: self.feed.etag = self.feed.etag[:255] # some times this is None (it never should) *sigh* if self.feed.etag is None: self.feed.etag = '' try: self.feed.last_modified = mtime(self.fpf.modified) except: self.feed.last_modified = None pass self.fpf.entries = self.fpf.entries[:50] if self.fpf.feed.get('title'): self.feed.feed_title = self.fpf.feed.get('title') tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline) if tagline: self.feed.data.feed_tagline = utf8encode(tagline) self.feed.data.save() if not self.feed.feed_link_locked: self.feed.feed_link = self.fpf.feed.get( 'link') or self.fpf.feed.get('id') or self.feed.feed_link self.feed = self.feed.save() # Compare new stories to existing stories, adding and updating start_date = datetime.datetime.utcnow() story_guids = [] stories = [] for entry in self.fpf.entries: story = pre_process_story(entry) if story.get('published') < start_date: start_date = story.get('published') stories.append(story) story_guids.append(story.get('guid')) existing_stories = dict((s.story_guid, s) for s in MStory.objects( # story_guid__in=story_guids, story_date__gte=start_date, story_feed_id=self.feed.pk).limit( max(int(len(story_guids) * 1.5), 10))) ret_values = self.feed.add_update_stories( stories, existing_stories, verbose=self.options['verbose']) if (hasattr(self.fpf, 'feed') and hasattr(self.fpf.feed, 'links') and self.fpf.feed.links): hub_url = None self_url = self.feed.feed_address for link in self.fpf.feed.links: if link['rel'] == 'hub' and not hub_url: hub_url = link['href'] elif link['rel'] == 'self': self_url = link['href'] push_expired = self.feed.is_push and self.feed.push.lease_expires < datetime.datetime.now( ) if (hub_url and self_url and not settings.DEBUG and (push_expired or not self.feed.is_push or self.options.get('force'))): logging.debug( u' ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s' % (self.feed.title[:30], "~SKRe-~SN" if push_expired else "", hub_url)) PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url) elif self.feed.is_push and not hub_url: logging.debug( u' ---> [%-30s] ~BB~FWTurning off PuSH, no hub found' % (self.feed.title[:30])) self.feed.is_push = False self.feed = self.feed.save() logging.debug( u' ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s' % (self.feed.title[:30], '~FG~SB' if ret_values['new'] else '', ret_values['new'], '~FY~SB' if ret_values['updated'] else '', ret_values['updated'], '~SB' if ret_values['same'] else '', ret_values['same'], '~FR~SB' if ret_values['error'] else '', ret_values['error'], len(self.fpf.entries))) self.feed.update_all_statistics(full=bool(ret_values['new']), force=self.options['force']) if ret_values['new']: self.feed.trim_feed() self.feed.save_feed_history(200, "OK") if self.options['verbose']: logging.debug(u' ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' % (self.feed.title[:30], time.time() - start)) return FEED_OK, ret_values
def process(self): """ Downloads and parses a feed. """ start = time.time() self.refresh_feed() ret_values = dict(new=0, updated=0, same=0, error=0) if hasattr(self.fpf, 'status'): if self.options['verbose']: if self.fpf.bozo and self.fpf.status != 304: logging.debug(u' ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % ( self.feed.title[:30], self.fpf.bozo_exception, len(self.fpf.entries))) if self.fpf.status == 304: self.feed = self.feed.save() self.feed.save_feed_history(304, "Not modified") return FEED_SAME, ret_values # 302: Temporary redirect: ignore # 301: Permanent redirect: save it (after 20 tries) if self.fpf.status == 301: if self.fpf.href.endswith('feedburner.com/atom.xml'): return FEED_ERRHTTP, ret_values redirects, non_redirects = self.feed.count_redirects_in_history('feed') self.feed.save_feed_history(self.fpf.status, "HTTP Redirect (%d to go)" % (20-len(redirects))) if len(redirects) >= 20 or len(non_redirects) == 0: self.feed.feed_address = self.fpf.href if not self.feed.known_good: self.feed.fetched_once = True logging.debug(" ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (self.feed.title[:30], self.fpf.status)) self.feed = self.feed.schedule_feed_fetch_immediately() if not self.fpf.entries: self.feed = self.feed.save() self.feed.save_feed_history(self.fpf.status, "HTTP Redirect") return FEED_ERRHTTP, ret_values if self.fpf.status >= 400: logging.debug(" ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." % (self.feed.title[:30], self.fpf.status)) fixed_feed = None if not self.feed.known_good: fixed_feed, feed = self.feed.check_feed_link_for_feed_address() if not fixed_feed: self.feed.save_feed_history(self.fpf.status, "HTTP Error") else: self.feed = feed self.feed = self.feed.save() return FEED_ERRHTTP, ret_values if not self.fpf.entries: if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType): logging.debug(" ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries))) fixed_feed = None if not self.feed.known_good: fixed_feed, feed = self.feed.check_feed_link_for_feed_address() if not fixed_feed: self.feed.save_feed_history(552, 'Non-xml feed', self.fpf.bozo_exception) else: self.feed = feed self.feed = self.feed.save() return FEED_ERRPARSE, ret_values elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException): logging.debug(" ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..." 
% (self.feed.title[:30], len(self.fpf.entries))) fixed_feed = None if not self.feed.known_good: fixed_feed, feed = self.feed.check_feed_link_for_feed_address() if not fixed_feed: self.feed.save_feed_history(553, 'SAX Exception', self.fpf.bozo_exception) else: self.feed = feed self.feed = self.feed.save() return FEED_ERRPARSE, ret_values # the feed has changed (or it is the first time we parse it) # saving the etag and last_modified fields original_etag = self.feed.etag self.feed.etag = self.fpf.get('etag') if self.feed.etag: self.feed.etag = self.feed.etag[:255] # some times this is None (it never should) *sigh* if self.feed.etag is None: self.feed.etag = '' if self.feed.etag != original_etag: self.feed.save(update_fields=['etag']) original_last_modified = self.feed.last_modified try: self.feed.last_modified = mtime(self.fpf.modified) except: self.feed.last_modified = None pass if self.feed.last_modified != original_last_modified: self.feed.save(update_fields=['last_modified']) self.fpf.entries = self.fpf.entries[:100] original_title = self.feed.feed_title if self.fpf.feed.get('title'): self.feed.feed_title = strip_tags(self.fpf.feed.get('title')) if self.feed.feed_title != original_title: self.feed.save(update_fields=['feed_title']) tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline) if tagline: original_tagline = self.feed.data.feed_tagline self.feed.data.feed_tagline = smart_unicode(tagline) if self.feed.data.feed_tagline != original_tagline: self.feed.data.save(update_fields=['feed_tagline']) if not self.feed.feed_link_locked: new_feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link if new_feed_link != self.feed.feed_link: logging.debug(" ---> [%-30s] ~SB~FRFeed's page is different: %s to %s" % (self.feed.title[:30], self.feed.feed_link, new_feed_link)) redirects, non_redirects = self.feed.count_redirects_in_history('page') self.feed.save_page_history(301, "HTTP Redirect (%s to go)" % (20-len(redirects))) if len(redirects) >= 20 or len(non_redirects) == 0: self.feed.feed_link = new_feed_link self.feed.save(update_fields=['feed_link']) # Determine if stories aren't valid and replace broken guids guids_seen = set() permalinks_seen = set() for entry in self.fpf.entries: guids_seen.add(entry.get('guid')) permalinks_seen.add(Feed.get_permalink(entry)) guid_difference = len(guids_seen) != len(self.fpf.entries) single_guid = len(guids_seen) == 1 replace_guids = single_guid and guid_difference permalink_difference = len(permalinks_seen) != len(self.fpf.entries) single_permalink = len(permalinks_seen) == 1 replace_permalinks = single_permalink and permalink_difference # Compare new stories to existing stories, adding and updating start_date = datetime.datetime.utcnow() story_hashes = [] stories = [] for entry in self.fpf.entries: story = pre_process_story(entry) if story.get('published') < start_date: start_date = story.get('published') if replace_guids: if replace_permalinks: new_story_guid = unicode(story.get('published')) if self.options['verbose']: logging.debug(u' ---> [%-30s] ~FBReplacing guid (%s) with timestamp: %s' % ( self.feed.title[:30], story.get('guid'), new_story_guid)) story['guid'] = new_story_guid else: new_story_guid = Feed.get_permalink(story) if self.options['verbose']: logging.debug(u' ---> [%-30s] ~FBReplacing guid (%s) with permalink: %s' % ( self.feed.title[:30], story.get('guid'), new_story_guid)) story['guid'] = new_story_guid story['story_hash'] = MStory.feed_guid_hash_unsaved(self.feed.pk, story.get('guid')) 
stories.append(story) story_hashes.append(story.get('story_hash')) existing_stories = dict((s.story_hash, s) for s in MStory.objects( story_hash__in=story_hashes, # story_date__gte=start_date, # story_feed_id=self.feed.pk )) ret_values = self.feed.add_update_stories(stories, existing_stories, verbose=self.options['verbose'], updates_off=self.options['updates_off']) if (hasattr(self.fpf, 'feed') and hasattr(self.fpf.feed, 'links') and self.fpf.feed.links): hub_url = None self_url = self.feed.feed_address for link in self.fpf.feed.links: if link['rel'] == 'hub' and not hub_url: hub_url = link['href'] elif link['rel'] == 'self': self_url = link['href'] push_expired = False if self.feed.is_push: try: push_expired = self.feed.push.lease_expires < datetime.datetime.now() except PushSubscription.DoesNotExist: self.feed.is_push = False if (hub_url and self_url and not settings.DEBUG and self.feed.active_subscribers > 0 and (push_expired or not self.feed.is_push or self.options.get('force'))): logging.debug(u' ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s' % ( self.feed.title[:30], "~SKRe-~SN" if push_expired else "", hub_url)) try: PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url) except TimeoutError: logging.debug(u' ---> [%-30s] ~BB~FW~FRTimed out~FW subscribing to PuSH hub: %s' % ( self.feed.title[:30], hub_url)) elif (self.feed.is_push and (self.feed.active_subscribers <= 0 or not hub_url)): logging.debug(u' ---> [%-30s] ~BB~FWTurning off PuSH, no hub found' % ( self.feed.title[:30])) self.feed.is_push = False self.feed = self.feed.save() logging.debug(u' ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s' % ( self.feed.title[:30], '~FG~SB' if ret_values['new'] else '', ret_values['new'], '~FY~SB' if ret_values['updated'] else '', ret_values['updated'], '~SB' if ret_values['same'] else '', ret_values['same'], '~FR~SB' if ret_values['error'] else '', ret_values['error'], len(self.fpf.entries))) self.feed.update_all_statistics(has_new_stories=bool(ret_values['new']), force=self.options['force']) if ret_values['new']: self.feed.trim_feed() self.feed.expire_redis() self.feed.save_feed_history(200, "OK") if self.options['verbose']: logging.debug(u' ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' % ( self.feed.title[:30], time.time() - start)) return FEED_OK, ret_values
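# Illustrative predicate (hypothetical name) for the PuSH hub handling above: subscribe
# when the feed advertises hub and self links, the feed still has active subscribers,
# and either the current lease has expired, the feed is not yet a push feed, or the
# fetch was forced. settings.DEBUG short-circuits the whole thing.
def should_subscribe_to_hub(hub_url, self_url, feed, push_expired, force=False, debug=False):
    return bool(hub_url and self_url and not debug
                and feed.active_subscribers > 0
                and (push_expired or not feed.is_push or force))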
def process(self): """ Downloads and parses a feed. """ start = time.time() self.refresh_feed() ret_values = dict(new=0, updated=0, same=0, error=0) if hasattr(self.fpf, 'status'): if self.options['verbose']: if self.fpf.bozo and self.fpf.status != 304: logging.debug(u' ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)' % ( self.feed.title[:30], self.fpf.bozo_exception, len(self.fpf.entries))) if self.fpf.status == 304: # 304 stands for resource not modified self.feed = self.feed.save() self.feed.save_feed_history(304, "Not modified") return FEED_SAME, ret_values # 302: Temporary redirect: ignore # 301: Permanent redirect: save it if self.fpf.status == 301: if not self.fpf.href.endswith('feedburner.com/atom.xml'): self.feed.feed_address = self.fpf.href if not self.feed.known_good: self.feed.fetched_once = True logging.debug(" ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (self.feed.title[:30], self.fpf.status)) self.feed = self.feed.schedule_feed_fetch_immediately() if not self.fpf.entries: self.feed = self.feed.save() self.feed.save_feed_history(self.fpf.status, "HTTP Redirect") return FEED_ERRHTTP, ret_values if self.fpf.status >= 400: logging.debug(" ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." % (self.feed.title[:30], self.fpf.status)) fixed_feed = None if not self.feed.known_good: fixed_feed, feed = self.feed.check_feed_link_for_feed_address() if not fixed_feed: self.feed.save_feed_history(self.fpf.status, "HTTP Error") else: self.feed = feed self.feed = self.feed.save() return FEED_ERRHTTP, ret_values if not self.fpf.entries: if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType): logging.debug(" ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries))) fixed_feed = None if not self.feed.known_good: fixed_feed, feed = self.feed.check_feed_link_for_feed_address() if not fixed_feed: self.feed.save_feed_history(552, 'Non-xml feed', self.fpf.bozo_exception) else: self.feed = feed self.feed = self.feed.save() return FEED_ERRPARSE, ret_values elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException): logging.debug(" ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..." 
% (self.feed.title[:30], len(self.fpf.entries))) fixed_feed = None if not self.feed.known_good: fixed_feed, feed = self.feed.check_feed_link_for_feed_address() if not fixed_feed: self.feed.save_feed_history(553, 'SAX Exception', self.fpf.bozo_exception) else: self.feed = feed self.feed = self.feed.save() return FEED_ERRPARSE, ret_values # the feed has changed (or it is the first time we parse it) # saving the etag and last_modified fields self.feed.etag = self.fpf.get('etag') if self.feed.etag: self.feed.etag = self.feed.etag[:255] # some times this is None (it never should) *sigh* if self.feed.etag is None: self.feed.etag = '' try: self.feed.last_modified = mtime(self.fpf.modified) except: self.feed.last_modified = None pass self.fpf.entries = self.fpf.entries[:100] if self.fpf.feed.get('title'): self.feed.feed_title = strip_tags(self.fpf.feed.get('title')) self.feed.feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link self.feed = self.feed.save() # Determine if stories aren't valid and replace broken guids # if guid is single among many entries: # if permalink also is single among many entries: # replace the guid with published # else if permalink is not: # replace the guid with permalink guids_seen = set() permalinks_seen = set() for entry in self.fpf.entries: guids_seen.add(entry.get('guid')) permalinks_seen.add(Feed.get_permalink(entry)) guid_difference = len(guids_seen) != len(self.fpf.entries) # means guid is duplicated. single_guid = len(guids_seen) == 1 replace_guids = single_guid and guid_difference # means guid is single but entries not. permalink_difference = len(permalinks_seen) != len(self.fpf.entries) single_permalink = len(permalinks_seen) == 1 replace_permalinks = single_permalink and permalink_difference # Compare new stories to existing stories, adding and updating start_date = datetime.datetime.utcnow() story_hashes = [] stories = [] for entry in self.fpf.entries: story = pre_process_story(entry) if story.get('published') < start_date: start_date = story.get('published') if replace_guids: if replace_permalinks: new_story_guid = unicode(story.get('published')) if self.options['verbose']: logging.debug(u' ---> [%-30s] ~FBReplacing guid (%s) with timestamp: %s' % ( self.feed.title[:30], story.get('guid'), new_story_guid)) story['guid'] = new_story_guid else: new_story_guid = Feed.get_permalink(story) if self.options['verbose']: logging.debug(u' ---> [%-30s] ~FBReplacing guid (%s) with permalink: %s' % ( self.feed.title[:30], story.get('guid'), new_story_guid)) story['guid'] = new_story_guid story['story_hash'] = MStory.feed_guid_hash_unsaved(self.feed.pk, story.get('guid')) stories.append(story) story_hashes.append(story.get('story_hash')) # find the existing_stories with story_hash in story_hashes. 
existing_stories = dict((s.story_hash, s) for s in MStory.objects( story_hash__in=story_hashes, # story_date__gte=start_date, # story_feed_id=self.feed.pk )) ret_values = self.feed.add_update_stories(stories, existing_stories, verbose=self.options['verbose'],) logging.debug(u' ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s' % ( self.feed.title[:30], '~FG~SB' if ret_values['new'] else '', ret_values['new'], '~FY~SB' if ret_values['updated'] else '', ret_values['updated'], '~SB' if ret_values['same'] else '', ret_values['same'], '~FR~SB' if ret_values['error'] else '', ret_values['error'], len(self.fpf.entries))) # If there is new story, update all statistics self.feed.update_all_statistics(full=bool(ret_values['new'])) self.feed.save_feed_history(200, "OK") if self.options['verbose']: logging.debug(u' ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss' % ( self.feed.title[:30], time.time() - start)) return FEED_OK, ret_values
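# Illustrative sketch (hypothetical helper) of the broken-guid handling above: when all
# entries in a feed share a single guid, the guid is useless as a story key, so it is
# replaced by the permalink; if the permalink is also shared by every entry, the
# published timestamp is used as a last resort.
def pick_replacement_guid(story, replace_guids, replace_permalinks):
    if not replace_guids:
        return story.get('guid')
    if replace_permalinks:
        return unicode(story.get('published'))   # timestamp as last resort
    return Feed.get_permalink(story)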
def process(self, first_run=True): """ Downloads and parses a feed. """ self.refresh_feed() ret_values = { ENTRY_NEW:0, ENTRY_UPDATED:0, ENTRY_SAME:0, ENTRY_ERR:0} # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title)) self.feed.fetched_once = True self.feed.last_update = datetime.datetime.utcnow() if hasattr(self.fpf, 'status'): if self.options['verbose']: logging.debug(u' ---> [%-30s] Fetched feed, HTTP status %d: %s (bozo: %s)' % (unicode(self.feed)[:30], self.fpf.status, self.feed.feed_address, self.fpf.bozo)) if self.fpf.bozo and self.fpf.status != 304: logging.debug(u' ---> [%-30s] BOZO exception: %s (%s entries)' % ( unicode(self.feed)[:30], self.fpf.bozo_exception, len(self.fpf.entries))) if self.fpf.status == 304: self.feed.save() self.feed.save_feed_history(304, "Not modified") return FEED_SAME, ret_values if self.fpf.status in (302, 301): if not self.fpf.href.endswith('feedburner.com/atom.xml'): self.feed.feed_address = self.fpf.href if first_run: self.feed.schedule_feed_fetch_immediately() if not self.fpf.entries: self.feed.save() self.feed.save_feed_history(self.fpf.status, "HTTP Redirect") return FEED_ERRHTTP, ret_values if self.fpf.status >= 400: logging.debug(" ---> [%-30s] HTTP Status code: %s. Checking address..." % (unicode(self.feed)[:30], self.fpf.status)) fixed_feed = self.feed.check_feed_address_for_feed_link() if not fixed_feed: self.feed.save_feed_history(self.fpf.status, "HTTP Error") else: self.feed.schedule_feed_fetch_immediately() self.feed.save() return FEED_ERRHTTP, ret_values if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType): if not self.fpf.entries: logging.debug(" ---> [%-30s] Feed is Non-XML. %s entries. Checking address..." % (unicode(self.feed)[:30], len(self.fpf.entries))) fixed_feed = self.feed.check_feed_address_for_feed_link() if not fixed_feed: self.feed.save_feed_history(502, 'Non-xml feed', self.fpf.bozo_exception) else: self.feed.schedule_feed_fetch_immediately() self.feed.save() return FEED_ERRPARSE, ret_values elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException): logging.debug(" ---> [%-30s] Feed is Bad XML (SAX). %s entries. Checking address..." 
% (unicode(self.feed)[:30], len(self.fpf.entries))) if not self.fpf.entries: fixed_feed = self.feed.check_feed_address_for_feed_link() if not fixed_feed: self.feed.save_feed_history(503, 'SAX Exception', self.fpf.bozo_exception) else: self.feed.schedule_feed_fetch_immediately() self.feed.save() return FEED_ERRPARSE, ret_values # the feed has changed (or it is the first time we parse it) # saving the etag and last_modified fields self.feed.etag = self.fpf.get('etag') if self.feed.etag: self.feed.etag = self.feed.etag[:255] # some times this is None (it never should) *sigh* if self.feed.etag is None: self.feed.etag = '' try: self.feed.last_modified = mtime(self.fpf.modified) except: pass self.fpf.entries = self.fpf.entries[:50] self.feed.feed_title = self.fpf.feed.get('title', self.feed.feed_title) tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline) if tagline: self.feed.data.feed_tagline = utf8encode(tagline) self.feed.data.save() self.feed.feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link self.feed.last_update = datetime.datetime.utcnow() guids = [] for entry in self.fpf.entries: if entry.get('id', ''): guids.append(entry.get('id', '')) elif entry.get('link'): guids.append(entry.link) elif entry.get('title'): guids.append(entry.title) self.feed.save() # Compare new stories to existing stories, adding and updating start_date = datetime.datetime.utcnow() # end_date = datetime.datetime.utcnow() story_guids = [] for entry in self.fpf.entries: story = pre_process_story(entry) if story.get('published') < start_date: start_date = story.get('published') # if story.get('published') > end_date: # end_date = story.get('published') story_guids.append(story.get('guid') or story.get('link')) existing_stories = MStory.objects( # story_guid__in=story_guids, story_date__gte=start_date, story_feed_id=self.feed.pk ).limit(len(story_guids)) # MStory.objects( # (Q(story_date__gte=start_date) & Q(story_date__lte=end_date)) # | (Q(story_guid__in=story_guids)), # story_feed=self.feed # ).order_by('-story_date') ret_values = self.feed.add_update_stories(self.fpf.entries, existing_stories) logging.debug(u' ---> [%-30s] Parsed Feed: %s' % ( unicode(self.feed)[:30], u' '.join(u'%s=%d' % (self.entry_trans[key], ret_values[key]) for key in self.entry_keys),)) self.feed.update_all_statistics() self.feed.trim_feed() self.feed.save_feed_history(200, "OK") return FEED_OK, ret_values
def process(self): """ Downloads and parses a feed. """ start = time.time() self.refresh_feed() ret_values = dict(new=0, updated=0, same=0, error=0) # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title)) if hasattr(self.fpf, "status"): if self.options["verbose"]: if self.fpf.bozo and self.fpf.status != 304: logging.debug( u" ---> [%-30s] ~FRBOZO exception: %s ~SB(%s entries)" % (self.feed.title[:30], self.fpf.bozo_exception, len(self.fpf.entries)) ) if self.fpf.status == 304: self.feed = self.feed.save() self.feed.save_feed_history(304, "Not modified") return FEED_SAME, ret_values # 302: Temporary redirect: ignore # 301: Permanent redirect: save it if self.fpf.status == 301: if not self.fpf.href.endswith("feedburner.com/atom.xml"): self.feed.feed_address = self.fpf.href if not self.feed.known_good: self.feed.fetched_once = True logging.debug( " ---> [%-30s] ~SB~SK~FRFeed is %s'ing. Refetching..." % (self.feed.title[:30], self.fpf.status) ) self.feed = self.feed.schedule_feed_fetch_immediately() if not self.fpf.entries: self.feed = self.feed.save() self.feed.save_feed_history(self.fpf.status, "HTTP Redirect") return FEED_ERRHTTP, ret_values if self.fpf.status >= 400: logging.debug( " ---> [%-30s] ~SB~FRHTTP Status code: %s. Checking address..." % (self.feed.title[:30], self.fpf.status) ) fixed_feed = None if not self.feed.known_good: fixed_feed, feed = self.feed.check_feed_link_for_feed_address() if not fixed_feed: self.feed.save_feed_history(self.fpf.status, "HTTP Error") else: self.feed = feed self.feed = self.feed.save() return FEED_ERRHTTP, ret_values if not self.fpf.entries: if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType): logging.debug( " ---> [%-30s] ~SB~FRFeed is Non-XML. %s entries. Checking address..." % (self.feed.title[:30], len(self.fpf.entries)) ) fixed_feed = None if not self.feed.known_good: fixed_feed, feed = self.feed.check_feed_link_for_feed_address() if not fixed_feed: self.feed.save_feed_history(552, "Non-xml feed", self.fpf.bozo_exception) else: self.feed = feed self.feed = self.feed.save() return FEED_ERRPARSE, ret_values elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException): logging.debug( " ---> [%-30s] ~SB~FRFeed has SAX/XML parsing issues. %s entries. Checking address..." 
% (self.feed.title[:30], len(self.fpf.entries)) ) fixed_feed = None if not self.feed.known_good: fixed_feed, feed = self.feed.check_feed_link_for_feed_address() if not fixed_feed: self.feed.save_feed_history(553, "SAX Exception", self.fpf.bozo_exception) else: self.feed = feed self.feed = self.feed.save() return FEED_ERRPARSE, ret_values # the feed has changed (or it is the first time we parse it) # saving the etag and last_modified fields self.feed.etag = self.fpf.get("etag") if self.feed.etag: self.feed.etag = self.feed.etag[:255] # some times this is None (it never should) *sigh* if self.feed.etag is None: self.feed.etag = "" try: self.feed.last_modified = mtime(self.fpf.modified) except: self.feed.last_modified = None pass self.fpf.entries = self.fpf.entries[:50] if self.fpf.feed.get("title"): self.feed.feed_title = self.fpf.feed.get("title") tagline = self.fpf.feed.get("tagline", self.feed.data.feed_tagline) if tagline: self.feed.data.feed_tagline = utf8encode(tagline) self.feed.data.save() if not self.feed.feed_link_locked: self.feed.feed_link = self.fpf.feed.get("link") or self.fpf.feed.get("id") or self.feed.feed_link self.feed = self.feed.save() # Compare new stories to existing stories, adding and updating start_date = datetime.datetime.utcnow() story_guids = [] stories = [] for entry in self.fpf.entries: story = pre_process_story(entry) if story.get("published") < start_date: start_date = story.get("published") stories.append(story) story_guids.append(story.get("guid")) existing_stories = dict( (s.story_guid, s) for s in MStory.objects( # story_guid__in=story_guids, story_date__gte=start_date, story_feed_id=self.feed.pk, ).limit(max(int(len(story_guids) * 1.5), 10)) ) ret_values = self.feed.add_update_stories(stories, existing_stories, verbose=self.options["verbose"]) if hasattr(self.fpf, "feed") and hasattr(self.fpf.feed, "links") and self.fpf.feed.links: hub_url = None self_url = self.feed.feed_address for link in self.fpf.feed.links: if link["rel"] == "hub" and not hub_url: hub_url = link["href"] elif link["rel"] == "self": self_url = link["href"] push_expired = self.feed.is_push and self.feed.push.lease_expires < datetime.datetime.now() if ( hub_url and self_url and not settings.DEBUG and self.feed.active_subscribers > 0 and (push_expired or not self.feed.is_push or self.options.get("force")) ): logging.debug( u" ---> [%-30s] ~BB~FW%sSubscribing to PuSH hub: %s" % (self.feed.title[:30], "~SKRe-~SN" if push_expired else "", hub_url) ) PushSubscription.objects.subscribe(self_url, feed=self.feed, hub=hub_url) elif self.feed.is_push and (self.feed.active_subscribers <= 0 or not hub_url): logging.debug(u" ---> [%-30s] ~BB~FWTurning off PuSH, no hub found" % (self.feed.title[:30])) self.feed.is_push = False self.feed = self.feed.save() logging.debug( u" ---> [%-30s] ~FYParsed Feed: %snew=%s~SN~FY %sup=%s~SN same=%s%s~SN %serr=%s~SN~FY total=~SB%s" % ( self.feed.title[:30], "~FG~SB" if ret_values["new"] else "", ret_values["new"], "~FY~SB" if ret_values["updated"] else "", ret_values["updated"], "~SB" if ret_values["same"] else "", ret_values["same"], "~FR~SB" if ret_values["error"] else "", ret_values["error"], len(self.fpf.entries), ) ) self.feed.update_all_statistics(full=bool(ret_values["new"]), force=self.options["force"]) if ret_values["new"]: self.feed.trim_feed() self.feed.expire_redis() self.feed.save_feed_history(200, "OK") if self.options["verbose"]: logging.debug( u" ---> [%-30s] ~FBTIME: feed parse in ~FM%.4ss" % (self.feed.title[:30], time.time() - start) ) return 
FEED_OK, ret_values
def process(self): """ Downloads and parses a feed. """ ret_values = { ENTRY_NEW:0, ENTRY_UPDATED:0, ENTRY_SAME:0, ENTRY_ERR:0} # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title)) if hasattr(self.fpf, 'status'): if self.options['verbose']: logging.debug(u' ---> [%-30s] Fetched feed, HTTP status %d: %s (bozo: %s)' % (unicode(self.feed)[:30], self.fpf.status, self.feed.feed_address, self.fpf.bozo)) if self.fpf.bozo and self.fpf.status != 304: logging.debug(u' ---> [%-30s] BOZO exception: %s' % ( unicode(self.feed)[:30], self.fpf.bozo_exception,)) if self.fpf.status == 304: self.feed.save() self.feed.save_feed_history(304, "Not modified") return FEED_SAME, ret_values if self.fpf.status >= 400: self.feed.save() self.feed.save_feed_history(self.fpf.status, "HTTP Error") return FEED_ERRHTTP, ret_values if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType): if not self.fpf.entries: logging.debug(" ---> [%-30s] Feed is Non-XML. Checking address..." % unicode(self.feed)[:30]) fixed_feed = self.feed.check_feed_address_for_feed_link() if not fixed_feed: self.feed.save_feed_history(502, 'Non-xml feed', self.fpf.bozo_exception) return FEED_ERRPARSE, ret_values elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException): logging.debug(" ---> [%-30s] Feed is Bad XML (SAX). Checking address..." % unicode(self.feed)[:30]) if not self.fpf.entries: fixed_feed = self.feed.check_feed_address_for_feed_link() if not fixed_feed: self.feed.save_feed_history(503, 'SAX Exception', self.fpf.bozo_exception) return FEED_ERRPARSE, ret_values # the feed has changed (or it is the first time we parse it) # saving the etag and last_modified fields self.feed.etag = self.fpf.get('etag') if self.feed.etag: self.feed.etag = self.feed.etag[:255] # some times this is None (it never should) *sigh* if self.feed.etag is None: self.feed.etag = '' try: self.feed.last_modified = mtime(self.fpf.modified) except: pass self.feed.feed_title = self.fpf.feed.get('title', self.feed.feed_title) self.feed.feed_tagline = self.fpf.feed.get('tagline', self.feed.feed_tagline) self.feed.feed_link = self.fpf.feed.get('link', self.feed.feed_link) self.feed.last_update = datetime.datetime.now() guids = [] for entry in self.fpf.entries: if entry.get('id', ''): guids.append(entry.get('id', '')) elif entry.title: guids.append(entry.title) elif entry.link: guids.append(entry.link) self.lock.acquire() try: self.feed.save() finally: self.lock.release() # Compare new stories to existing stories, adding and updating # start_date = datetime.datetime.now() # end_date = datetime.datetime.now() story_guids = [] for entry in self.fpf.entries: story = pre_process_story(entry) # if story.get('published') < start_date: # start_date = story.get('published') # if story.get('published') > end_date: # end_date = story.get('published') story_guids.append(story.get('guid') or story.get('link')) existing_stories = self.db.stories.find({ 'story_feed_id': self.feed.pk, # 'story_date': {'$gte': start_date}, 'story_guid': {'$in': story_guids} }).limit(len(story_guids)) # MStory.objects( # (Q(story_date__gte=start_date) & Q(story_date__lte=end_date)) # | (Q(story_guid__in=story_guids)), # story_feed=self.feed # ).order_by('-story_date') ret_values = self.feed.add_update_stories(self.fpf.entries, existing_stories, self.db) logging.debug(u' ---> [%-30s] Parsed Feed: %s' % ( unicode(self.feed)[:30], u' '.join(u'%s=%d' % (self.entry_trans[key], ret_values[key]) for key in 
self.entry_keys),)) self.feed.update_all_statistics(lock=self.lock) self.feed.trim_feed() self.feed.save_feed_history(200, "OK") return FEED_OK, ret_values
def add_update_stories(self, stories, existing_stories): ret_values = { ENTRY_NEW: 0, ENTRY_UPDATED: 0, ENTRY_SAME: 0, ENTRY_ERR: 0 } for story in stories: story = pre_process_story(story) if story.get('title'): story_contents = story.get('content') story_tags = self.get_tags(story) if story_contents is not None: story_content = story_contents[0]['value'] else: story_content = story.get('summary') existing_story, story_has_changed = self._exists_story( story, story_content, existing_stories) if existing_story is None: s = MStory(story_feed_id=self.pk, story_date=story.get('published'), story_title=story.get('title'), story_content=story_content, story_author_name=story.get('author'), story_permalink=story.get('link'), story_guid=story.get('guid') or story.get('id') or story.get('link'), story_tags=story_tags) try: s.save() ret_values[ENTRY_NEW] += 1 cache.set('updated_feed:%s' % self.id, 1) except (IntegrityError, OperationError): ret_values[ENTRY_ERR] += 1 # logging.info('Saving new story, IntegrityError: %s - %s: %s' % (self.feed_title, story.get('title'), e)) elif existing_story and story_has_changed: # update story # logging.debug('- Updated story in feed (%s - %s): %s / %s' % (self.feed_title, story.get('title'), len(existing_story.story_content), len(story_content))) original_content = None if existing_story.story_original_content_z: original_content = zlib.decompress( existing_story.story_original_content_z) elif existing_story.story_content_z: original_content = zlib.decompress( existing_story.story_content_z) # print 'Type: %s %s' % (type(original_content), type(story_content)) if story_content and len(story_content) > 10: diff = HTMLDiff(unicode(original_content), story_content) story_content_diff = diff.getDiff() else: story_content_diff = original_content # logging.debug("\t\tDiff: %s %s %s" % diff.getStats()) # logging.debug("\t\tDiff content: %s" % diff.getDiff()) if existing_story.story_title != story.get('title'): # logging.debug('\tExisting title / New: : \n\t\t- %s\n\t\t- %s' % (existing_story.story_title, story.get('title'))) pass existing_story.story_feed = self.pk existing_story.story_date = story.get('published') existing_story.story_title = story.get('title') existing_story.story_content = story_content_diff existing_story.story_original_content = original_content existing_story.story_author_name = story.get('author') existing_story.story_permalink = story.get('link') existing_story.story_guid = story.get('guid') or story.get( 'id') or story.get('link') existing_story.story_tags = story_tags try: existing_story.save() ret_values[ENTRY_UPDATED] += 1 cache.set('updated_feed:%s' % self.id, 1) except (IntegrityError, OperationError): ret_values[ENTRY_ERR] += 1 logging.info( 'Saving updated story, IntegrityError: %s - %s' % (self.feed_title, story.get('title'))) else: ret_values[ENTRY_SAME] += 1 # logging.debug("Unchanged story: %s " % story.get('title')) return ret_values
            guids.append(entry.title)

    self.feed = self.feed.save()

    # Compare new stories to existing stories, adding and updating
    start_date = datetime.datetime.utcnow()
    story_guids = []
    stories = []
    for entry in self.fpf.entries:
        story = pre_process_story(entry)
        if story.get('published') < start_date:
            start_date = story.get('published')
        stories.append(story)
        story_guids.append(story.get('guid'))
    existing_stories = list(MStory.objects(
        # story_guid__in=story_guids,
        story_date__gte=start_date,
        story_feed_id=self.feed.pk
    ).limit(max(int(len(story_guids) * 1.5), 10)))

    ret_values = self.feed.add_update_stories(stories, existing_stories,
                                              verbose=self.options['verbose'])

    if ((not self.feed.is_push or self.options.get('force'))
def process(self, first_run=True):
    """ Downloads and parses a feed.
    """
    self.refresh_feed()

    ret_values = {ENTRY_NEW: 0, ENTRY_UPDATED: 0, ENTRY_SAME: 0, ENTRY_ERR: 0}
    # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title))

    self.feed.fetched_once = True
    self.feed.last_update = datetime.datetime.utcnow()

    if hasattr(self.fpf, "status"):
        if self.options["verbose"]:
            logging.debug(u" ---> [%-30s] Fetched feed, HTTP status %d: %s (bozo: %s)" % (
                unicode(self.feed)[:30],
                self.fpf.status,
                self.feed.feed_address,
                self.fpf.bozo))
            if self.fpf.bozo and self.fpf.status != 304:
                logging.debug(u" ---> [%-30s] BOZO exception: %s (%s entries)" % (
                    unicode(self.feed)[:30],
                    self.fpf.bozo_exception,
                    len(self.fpf.entries)))
        if self.fpf.status == 304:
            self.feed.save()
            self.feed.save_feed_history(304, "Not modified")
            return FEED_SAME, ret_values
        if self.fpf.status in (302, 301):
            self.feed.feed_address = self.fpf.href
            if first_run:
                self.feed.schedule_feed_fetch_immediately()
            if not self.fpf.entries:
                self.feed.save()
                self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
                return FEED_ERRHTTP, ret_values
        if self.fpf.status >= 400:
            self.feed.save()
            self.feed.save_feed_history(self.fpf.status, "HTTP Error")
            return FEED_ERRHTTP, ret_values

    if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
        if not self.fpf.entries:
            logging.debug(" ---> [%-30s] Feed is Non-XML. %s entries. Checking address..." % (
                unicode(self.feed)[:30], len(self.fpf.entries)))
            fixed_feed = self.feed.check_feed_address_for_feed_link()
            if not fixed_feed:
                self.feed.save_feed_history(502, "Non-xml feed", self.fpf.bozo_exception)
            else:
                self.feed.schedule_feed_fetch_immediately()
            self.feed.save()
            return FEED_ERRPARSE, ret_values
    elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
        logging.debug(" ---> [%-30s] Feed is Bad XML (SAX). %s entries. Checking address..." % (
            unicode(self.feed)[:30], len(self.fpf.entries)))
        if not self.fpf.entries:
            fixed_feed = self.feed.check_feed_address_for_feed_link()
            if not fixed_feed:
                self.feed.save_feed_history(503, "SAX Exception", self.fpf.bozo_exception)
            else:
                self.feed.schedule_feed_fetch_immediately()
            self.feed.save()
            return FEED_ERRPARSE, ret_values

    # the feed has changed (or it is the first time we parse it)
    # saving the etag and last_modified fields
    self.feed.etag = self.fpf.get("etag")
    if self.feed.etag:
        self.feed.etag = self.feed.etag[:255]
    # some times this is None (it never should) *sigh*
    if self.feed.etag is None:
        self.feed.etag = ""

    try:
        self.feed.last_modified = mtime(self.fpf.modified)
    except:
        pass

    self.feed.feed_title = self.fpf.feed.get("title", self.feed.feed_title)
    self.feed.feed_tagline = self.fpf.feed.get("tagline", self.feed.feed_tagline)
    self.feed.feed_link = self.fpf.feed.get("link", self.feed.feed_link)
    self.feed.last_update = datetime.datetime.utcnow()

    guids = []
    for entry in self.fpf.entries:
        if entry.get("id", ""):
            guids.append(entry.get("id", ""))
        elif entry.title:
            guids.append(entry.title)
        elif entry.link:
            guids.append(entry.link)
    self.feed.save()

    # Compare new stories to existing stories, adding and updating
    # start_date = datetime.datetime.utcnow()
    # end_date = datetime.datetime.utcnow()
    story_guids = []
    for entry in self.fpf.entries:
        story = pre_process_story(entry)
        # if story.get('published') < start_date:
        #     start_date = story.get('published')
        # if story.get('published') > end_date:
        #     end_date = story.get('published')
        story_guids.append(story.get("guid") or story.get("link"))
    existing_stories = settings.MONGODB.stories.find({
        "story_feed_id": self.feed.pk,
        # 'story_date': {'$gte': start_date},
        "story_guid": {"$in": story_guids},
    }).limit(len(story_guids))
    # MStory.objects(
    #     (Q(story_date__gte=start_date) & Q(story_date__lte=end_date))
    #     | (Q(story_guid__in=story_guids)),
    #     story_feed=self.feed
    # ).order_by('-story_date')
    ret_values = self.feed.add_update_stories(self.fpf.entries, existing_stories)

    logging.debug(u" ---> [%-30s] Parsed Feed: %s" % (
        unicode(self.feed)[:30],
        u" ".join(u"%s=%d" % (self.entry_trans[key], ret_values[key]) for key in self.entry_keys),))
    self.feed.update_all_statistics(lock=self.lock)
    self.feed.trim_feed()
    self.feed.save_feed_history(200, "OK")

    return FEED_OK, ret_values
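# Both process() variants build their "Parsed Feed" log line from
# self.entry_trans and self.entry_keys. An illustrative sketch of what those
# attributes could look like; the constant values and labels below are
# placeholders chosen only to make the example self-contained.
ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR = range(4)  # placeholder values

entry_keys = (ENTRY_NEW, ENTRY_UPDATED, ENTRY_SAME, ENTRY_ERR)
entry_trans = {
    ENTRY_NEW: 'new',
    ENTRY_UPDATED: 'updated',
    ENTRY_SAME: 'same',
    ENTRY_ERR: 'error',
}
# Example: ' '.join('%s=%d' % (entry_trans[key], counts[key]) for key in entry_keys)
# yields "new=3 updated=1 same=20 error=0" for counts = {0: 3, 1: 1, 2: 20, 3: 0}.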