def count_unreads_for_subscribers(self, feed):
    # Only recalculate for subscribers who have been seen within the unread window.
    UNREAD_CUTOFF = datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD)
    user_subs = UserSubscription.objects.filter(feed=feed,
                                                active=True,
                                                user__profile__last_seen_on__gte=UNREAD_CUTOFF)\
                                        .order_by('-last_read_date')
    logging.debug(u' ---> [%-30s] Computing scores: %s (%s/%s/%s) subscribers' % (
                  unicode(feed)[:30], user_subs.count(),
                  feed.num_subscribers, feed.active_subscribers, feed.premium_subscribers))

    # Load the feed's recent stories from the slave DB if one is configured,
    # otherwise from the primary Mongo store.
    if self.options['slave_db']:
        slave_db = self.options['slave_db']
        stories_db_orig = slave_db.stories.find({
            "story_feed_id": feed.pk,
            "story_date": {
                "$gte": UNREAD_CUTOFF,
            },
        })
        stories_db = []
        for story in stories_db_orig:
            stories_db.append(bunch(story))
    else:
        stories_db = MStory.objects(story_feed_id=feed.pk,
                                    story_date__gte=UNREAD_CUTOFF)

    # Flag every subscription for recalculation, then optionally recompute scores now.
    for sub in user_subs:
        cache.delete('usersub:%s' % sub.user_id)
        sub.needs_unread_recalc = True
        sub.save()

    if self.options['compute_scores']:
        for sub in user_subs:
            silent = False if self.options['verbose'] >= 2 else True
            sub.calculate_feed_scores(silent=silent, stories_db=stories_db)

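# The slave_db branch above wraps raw pymongo documents in `bunch` so they can be read
# with attribute access, like MStory objects. `bunch` itself is not shown in this excerpt;
# a minimal sketch of such a helper (an assumption, not necessarily the shipped one):

class bunch(dict):
    """Dict wrapper that also exposes its keys as attributes (sketch)."""
    def __getattr__(self, name):
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)

    def __setattr__(self, name, value):
        self[name] = value
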
def load_river_stories(request):
    limit = 18
    offset = 0
    start = datetime.datetime.utcnow()
    user = get_user(request)
    feed_ids = [int(feed_id) for feed_id in request.REQUEST.getlist('feeds') if feed_id]
    original_feed_ids = list(feed_ids)
    page = int(request.REQUEST.get('page', 1))
    read_stories_count = int(request.REQUEST.get('read_stories_count', 0))
    new_flag = request.REQUEST.get('new_flag', False)
    bottom_delta = datetime.timedelta(days=settings.DAYS_OF_UNREAD)

    if not feed_ids:
        logging.user(request, "~FCLoading empty river stories: page %s" % (page))
        return dict(stories=[])

    # Fetch all stories at and before the page number.
    # Not a single page, because reading stories can move them up in the unread order.
    # `read_stories_count` is an optimization; it works best when all 25 stories before have been read.
    limit = limit * page - read_stories_count

    # Read stories to exclude
    read_stories = MUserStory.objects(user_id=user.pk, feed_id__in=feed_ids).only('story_id')
    read_stories = [rs.story_id for rs in read_stories]

    # Determine mark_as_read dates for all feeds to ignore all stories before this date.
    # max_feed_count = 0
    feed_counts = {}
    feed_last_reads = {}
    for feed_id in feed_ids:
        try:
            usersub = UserSubscription.objects.get(feed__pk=feed_id, user=user)
        except UserSubscription.DoesNotExist:
            continue
        if not usersub: continue
        feed_counts[feed_id] = (usersub.unread_count_negative * 1 +
                                usersub.unread_count_neutral * 10 +
                                usersub.unread_count_positive * 20)
        # if feed_counts[feed_id] > max_feed_count:
        #     max_feed_count = feed_counts[feed_id]
        feed_last_reads[feed_id] = int(time.mktime(usersub.mark_read_date.timetuple()))
    feed_counts = sorted(feed_counts.items(), key=itemgetter(1))[:50]
    feed_ids = [f[0] for f in feed_counts]
    feed_last_reads = dict([(str(feed_id), feed_last_reads[feed_id]) for feed_id in feed_ids
                            if feed_id in feed_last_reads])
    feed_counts = dict(feed_counts)

    # After excluding read stories, all that's left are stories
    # past the mark_read_date. Everything returned is guaranteed to be unread.
    mstories = MStory.objects(
        story_guid__nin=read_stories,
        story_feed_id__in=feed_ids,
        # story_date__gte=start - bottom_delta,
    ).map_reduce("""function() {
            var d = feed_last_reads[this[~story_feed_id]];
            if (this[~story_date].getTime()/1000 > d) {
                emit(this[~id], this);
            }
        }""",
        """function(key, values) {
            return values[0];
        }""",
        output='inline',
        scope={
            'feed_last_reads': feed_last_reads,
        }
    )
    mstories = [story.value for story in mstories if story and story.value]

    mstories = sorted(mstories, cmp=lambda x, y: cmp(story_score(y, bottom_delta),
                                                     story_score(x, bottom_delta)))

    # story_feed_counts = defaultdict(int)
    # mstories_pruned = []
    # for story in mstories:
    #     print story['story_title'], story_feed_counts[story['story_feed_id']]
    #     if story_feed_counts[story['story_feed_id']] >= 3: continue
    #     mstories_pruned.append(story)
    #     story_feed_counts[story['story_feed_id']] += 1

    stories = []
    for i, story in enumerate(mstories):
        if i < offset: continue
        if i >= offset + limit: break
        stories.append(bunch(story))
    stories = Feed.format_stories(stories)
    found_feed_ids = list(set([story['story_feed_id'] for story in stories]))

    # Find starred stories
    starred_stories = MStarredStory.objects(
        user_id=user.pk,
        story_feed_id__in=found_feed_ids
    ).only('story_guid', 'starred_date')
    starred_stories = dict([(story.story_guid, story.starred_date)
                            for story in starred_stories])

    # Intelligence classifiers for all feeds involved
    def sort_by_feed(classifiers):
        feed_classifiers = defaultdict(list)
        for classifier in classifiers:
            feed_classifiers[classifier.feed_id].append(classifier)
        return feed_classifiers
    classifier_feeds = sort_by_feed(MClassifierFeed.objects(user_id=user.pk, feed_id__in=found_feed_ids))
    classifier_authors = sort_by_feed(MClassifierAuthor.objects(user_id=user.pk, feed_id__in=found_feed_ids))
    classifier_titles = sort_by_feed(MClassifierTitle.objects(user_id=user.pk, feed_id__in=found_feed_ids))
    classifier_tags = sort_by_feed(MClassifierTag.objects(user_id=user.pk, feed_id__in=found_feed_ids))

    classifiers = {}
    for feed_id in found_feed_ids:
        classifiers[feed_id] = get_classifiers_for_user(user, feed_id,
                                                        classifier_feeds[feed_id],
                                                        classifier_authors[feed_id],
                                                        classifier_titles[feed_id],
                                                        classifier_tags[feed_id])

    # Just need to format stories
    for story in stories:
        story_date = localtime_for_timezone(story['story_date'], user.profile.timezone)
        now = localtime_for_timezone(datetime.datetime.now(), user.profile.timezone)
        story['short_parsed_date'] = format_story_link_date__short(story_date, now)
        story['long_parsed_date'] = format_story_link_date__long(story_date, now)
        story['read_status'] = 0
        if story['id'] in starred_stories:
            story['starred'] = True
            starred_date = localtime_for_timezone(starred_stories[story['id']], user.profile.timezone)
            story['starred_date'] = format_story_link_date__long(starred_date, now)
        story['intelligence'] = {
            'feed': apply_classifier_feeds(classifier_feeds[story['story_feed_id']], story['story_feed_id']),
            'author': apply_classifier_authors(classifier_authors[story['story_feed_id']], story),
            'tags': apply_classifier_tags(classifier_tags[story['story_feed_id']], story),
            'title': apply_classifier_titles(classifier_titles[story['story_feed_id']], story),
        }

    diff = datetime.datetime.utcnow() - start
    timediff = float("%s.%.2s" % (diff.seconds, (diff.microseconds / 1000)))
    logging.user(request, "~FCLoading river stories: page %s - ~SB%s/%s "
                          "stories ~SN(%s/%s/%s feeds) ~FB(%s seconds)" %
                          (page, len(stories), len(mstories), len(found_feed_ids),
                           len(feed_ids), len(original_feed_ids), timediff))

    if new_flag:
        return dict(stories=stories, classifiers=classifiers)
    else:
        logging.user(request, "~BR~FCNo new flag on river")
        return dict(stories=stories)

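# For illustration: the JavaScript map function passed to map_reduce() above emits a story
# only when its story_date, taken as epoch seconds, is newer than that feed's last mark-read
# time from the `feed_last_reads` mapping supplied via `scope`. A rough Python equivalent of
# that per-story filter (a sketch only; field names as used above, and mirroring how
# feed_last_reads is built with time.mktime) could look like this:

import time

def passes_last_read_filter(story, feed_last_reads):
    """Sketch: True if the story postdates its feed's mark_read_date, else False."""
    last_read_epoch = feed_last_reads.get(str(story['story_feed_id']))
    if last_read_epoch is None:
        # Matches the JS behaviour: a missing entry never compares greater, so the story is dropped.
        return False
    story_epoch = int(time.mktime(story['story_date'].timetuple()))
    return story_epoch > last_read_epoch
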
def load_river_stories(request):
    limit = 18
    offset = 0
    start = datetime.datetime.utcnow()
    user = get_user(request)
    feed_ids = [int(feed_id) for feed_id in request.REQUEST.getlist('feeds') if feed_id]
    original_feed_ids = list(feed_ids)
    page = int(request.REQUEST.get('page', 0)) + 1
    read_stories_count = int(request.REQUEST.get('read_stories_count', 0))
    bottom_delta = datetime.timedelta(days=settings.DAYS_OF_UNREAD)

    if not feed_ids:
        logging.user(request.user, "~FCLoading empty river stories: page %s" % (page))
        return dict(stories=[])

    # Fetch all stories at and before the page number.
    # Not a single page, because reading stories can move them up in the unread order.
    # `read_stories_count` is an optimization; it works best when all 25 stories before have been read.
    limit = limit * page - read_stories_count

    # Read stories to exclude
    read_stories = MUserStory.objects(user_id=user.pk, feed_id__in=feed_ids).only('story')
    read_stories = [rs.story.id for rs in read_stories]

    # Determine mark_as_read dates for all feeds to ignore all stories before this date.
    # max_feed_count = 0
    feed_counts = {}
    feed_last_reads = {}
    for feed_id in feed_ids:
        try:
            usersub = UserSubscription.objects.get(feed__pk=feed_id, user=user)
        except UserSubscription.DoesNotExist:
            continue
        if not usersub: continue
        feed_counts[feed_id] = (usersub.unread_count_negative * 1 +
                                usersub.unread_count_neutral * 10 +
                                usersub.unread_count_positive * 20)
        # if feed_counts[feed_id] > max_feed_count:
        #     max_feed_count = feed_counts[feed_id]
        feed_last_reads[feed_id] = int(time.mktime(usersub.mark_read_date.timetuple()))
    feed_counts = sorted(feed_counts.items(), key=itemgetter(1))[:50]
    feed_ids = [f[0] for f in feed_counts]
    feed_last_reads = dict([(str(feed_id), feed_last_reads[feed_id]) for feed_id in feed_ids])
    feed_counts = dict(feed_counts)

    # After excluding read stories, all that's left are stories
    # past the mark_read_date. Everything returned is guaranteed to be unread.
    mstories = MStory.objects(
        id__nin=read_stories,
        story_feed_id__in=feed_ids,
        story_date__gte=start - bottom_delta
    ).map_reduce("""function() {
            var d = feed_last_reads[this[~story_feed_id]];
            if (this[~story_date].getTime()/1000 > d) {
                emit(this[~id], this);
            }
        }""",
        """function(key, values) {
            return values[0];
        }""",
        output='inline',
        scope={
            'feed_last_reads': feed_last_reads,
        }
    )
    mstories = [story.value for story in mstories]

    mstories = sorted(mstories, cmp=lambda x, y: cmp(story_score(y, bottom_delta),
                                                     story_score(x, bottom_delta)))

    # story_feed_counts = defaultdict(int)
    # mstories_pruned = []
    # for story in mstories:
    #     print story['story_title'], story_feed_counts[story['story_feed_id']]
    #     if story_feed_counts[story['story_feed_id']] >= 3: continue
    #     mstories_pruned.append(story)
    #     story_feed_counts[story['story_feed_id']] += 1

    stories = []
    for i, story in enumerate(mstories):
        if i < offset: continue
        if i >= offset + limit: break
        stories.append(bunch(story))
    stories = Feed.format_stories(stories)
    found_feed_ids = list(set([story['story_feed_id'] for story in stories]))

    # Find starred stories
    starred_stories = MStarredStory.objects(
        user_id=user.pk,
        story_feed_id__in=found_feed_ids
    ).only('story_guid', 'starred_date')
    starred_stories = dict([(story.story_guid, story.starred_date)
                            for story in starred_stories])

    # Intelligence classifiers for all feeds involved
    def sort_by_feed(classifiers):
        feed_classifiers = defaultdict(list)
        for classifier in classifiers:
            feed_classifiers[classifier.feed_id].append(classifier)
        return feed_classifiers
    classifier_feeds = sort_by_feed(MClassifierFeed.objects(user_id=user.pk, feed_id__in=found_feed_ids))
    classifier_authors = sort_by_feed(MClassifierAuthor.objects(user_id=user.pk, feed_id__in=found_feed_ids))
    classifier_titles = sort_by_feed(MClassifierTitle.objects(user_id=user.pk, feed_id__in=found_feed_ids))
    classifier_tags = sort_by_feed(MClassifierTag.objects(user_id=user.pk, feed_id__in=found_feed_ids))

    # Just need to format stories
    for story in stories:
        story_date = localtime_for_timezone(story['story_date'], user.profile.timezone)
        now = localtime_for_timezone(datetime.datetime.now(), user.profile.timezone)
        story['short_parsed_date'] = format_story_link_date__short(story_date, now)
        story['long_parsed_date'] = format_story_link_date__long(story_date, now)
        story['read_status'] = 0
        if story['id'] in starred_stories:
            story['starred'] = True
            starred_date = localtime_for_timezone(starred_stories[story['id']], user.profile.timezone)
            story['starred_date'] = format_story_link_date__long(starred_date, now)
        story['intelligence'] = {
            'feed': apply_classifier_feeds(classifier_feeds[story['story_feed_id']], story['story_feed_id']),
            'author': apply_classifier_authors(classifier_authors[story['story_feed_id']], story),
            'tags': apply_classifier_tags(classifier_tags[story['story_feed_id']], story),
            'title': apply_classifier_titles(classifier_titles[story['story_feed_id']], story),
        }

    diff = datetime.datetime.utcnow() - start
    timediff = float("%s.%.2s" % (diff.seconds, (diff.microseconds / 1000)))
    logging.user(request.user, "~FCLoading river stories: page %s - ~SB%s/%s "
                               "stories ~SN(%s/%s/%s feeds) ~FB(%s seconds)" %
                               (page, len(stories), len(mstories), len(found_feed_ids),
                                len(feed_ids), len(original_feed_ids), timediff))

    return dict(stories=stories)

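# Aside: both load_river_stories variants order the map/reduce output with a Python 2
# cmp-style comparator. A key-based formulation of the same descending story_score
# ordering (a sketch, reusing the story_score helper and cutoff delta already used above):

def sort_stories_by_score(mstories, cutoff_delta):
    """Return stories sorted from highest to lowest story_score (sketch)."""
    return sorted(mstories, key=lambda story: story_score(story, cutoff_delta), reverse=True)

# Usage in context would be: mstories = sort_stories_by_score(mstories, bottom_delta)
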
def process(self, first_run=True):
    """ Downloads and parses a feed. """
    self.refresh_feed()

    ret_values = {
        ENTRY_NEW: 0,
        ENTRY_UPDATED: 0,
        ENTRY_SAME: 0,
        ENTRY_ERR: 0}

    # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title))

    self.feed.fetched_once = True
    self.feed.last_update = datetime.datetime.utcnow()

    if hasattr(self.fpf, 'status'):
        if self.options['verbose']:
            logging.debug(u' ---> [%-30s] Fetched feed, HTTP status %d: %s (bozo: %s)' % (
                          unicode(self.feed)[:30],
                          self.fpf.status,
                          self.feed.feed_address,
                          self.fpf.bozo))
            if self.fpf.bozo and self.fpf.status != 304:
                logging.debug(u' ---> [%-30s] BOZO exception: %s (%s entries)' % (
                              unicode(self.feed)[:30],
                              self.fpf.bozo_exception,
                              len(self.fpf.entries)))
        if self.fpf.status == 304:
            self.feed.save()
            self.feed.save_feed_history(304, "Not modified")
            return FEED_SAME, ret_values

        if self.fpf.status in (302, 301):
            if not self.fpf.href.endswith('feedburner.com/atom.xml'):
                self.feed.feed_address = self.fpf.href
            if first_run:
                self.feed.has_feed_exception = True
                self.feed.schedule_feed_fetch_immediately()
            if not self.fpf.entries:
                self.feed.save()
                self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
                return FEED_ERRHTTP, ret_values

        if self.fpf.status >= 400:
            logging.debug(" ---> [%-30s] HTTP Status code: %s. Checking address..." % (
                          unicode(self.feed)[:30], self.fpf.status))
            fixed_feed = self.feed.check_feed_link_for_feed_address()
            if not fixed_feed:
                self.feed.save_feed_history(self.fpf.status, "HTTP Error")
            else:
                self.feed.has_feed_exception = True
                self.feed.schedule_feed_fetch_immediately()
            self.feed.save()
            return FEED_ERRHTTP, ret_values

    if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
        logging.debug(" ---> [%-30s] Feed is Non-XML. %s entries.%s Checking address..." % (
                      unicode(self.feed)[:30], len(self.fpf.entries), ' Not' if self.fpf.entries else ''))
        if not self.fpf.entries:
            fixed_feed = self.feed.check_feed_link_for_feed_address()
            if not fixed_feed:
                self.feed.save_feed_history(502, 'Non-xml feed', self.fpf.bozo_exception)
            else:
                self.feed.has_feed_exception = True
                self.feed.schedule_feed_fetch_immediately()
            self.feed.save()
            return FEED_ERRPARSE, ret_values
    elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
        logging.debug(" ---> [%-30s] Feed has SAX/XML parsing issues. %s entries.%s Checking address..." % (
                      unicode(self.feed)[:30], len(self.fpf.entries), ' Not' if self.fpf.entries else ''))
        if not self.fpf.entries:
            fixed_feed = self.feed.check_feed_link_for_feed_address()
            if not fixed_feed:
                self.feed.save_feed_history(503, 'SAX Exception', self.fpf.bozo_exception)
            else:
                self.feed.has_feed_exception = True
                self.feed.schedule_feed_fetch_immediately()
            self.feed.save()
            return FEED_ERRPARSE, ret_values

    # the feed has changed (or it is the first time we parse it)
    # saving the etag and last_modified fields
    self.feed.etag = self.fpf.get('etag')
    if self.feed.etag:
        self.feed.etag = self.feed.etag[:255]
    # sometimes this is None (it never should be) *sigh*
    if self.feed.etag is None:
        self.feed.etag = ''

    try:
        self.feed.last_modified = mtime(self.fpf.modified)
    except:
        pass

    self.fpf.entries = self.fpf.entries[:50]

    self.feed.feed_title = self.fpf.feed.get('title', self.feed.feed_title)
    tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline)
    if tagline:
        self.feed.data.feed_tagline = utf8encode(tagline)
        self.feed.data.save()
    if not self.feed.feed_link_locked:
        self.feed.feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link
    self.feed.last_update = datetime.datetime.utcnow()

    guids = []
    for entry in self.fpf.entries:
        if entry.get('id', ''):
            guids.append(entry.get('id', ''))
        elif entry.get('link'):
            guids.append(entry.link)
        elif entry.get('title'):
            guids.append(entry.title)
    self.feed.save()

    # Compare new stories to existing stories, adding and updating
    start_date = datetime.datetime.utcnow()
    # end_date = datetime.datetime.utcnow()
    story_guids = []
    for entry in self.fpf.entries:
        story = pre_process_story(entry)
        if story.get('published') < start_date:
            start_date = story.get('published')
        # if story.get('published') > end_date:
        #     end_date = story.get('published')
        story_guids.append(story.get('guid') or story.get('link'))

    if self.options['slave_db']:
        slave_db = self.options['slave_db']

        stories_db_orig = slave_db.stories.find({
            "story_feed_id": self.feed.pk,
            "story_date": {
                "$gte": start_date,
            },
        }).limit(len(story_guids))

        existing_stories = []
        for story in stories_db_orig:
            existing_stories.append(bunch(story))
    else:
        existing_stories = list(MStory.objects(
            # story_guid__in=story_guids,
            story_date__gte=start_date,
            story_feed_id=self.feed.pk
        ).limit(len(story_guids)))

    # MStory.objects(
    #     (Q(story_date__gte=start_date) & Q(story_date__lte=end_date))
    #     | (Q(story_guid__in=story_guids)),
    #     story_feed=self.feed
    # ).order_by('-story_date')

    ret_values = self.feed.add_update_stories(self.fpf.entries, existing_stories,
                                              verbose=self.options['verbose'])

    logging.debug(u' ---> [%-30s] ~FYParsed Feed: new=~FG~SB%s~SN~FY up=~FY~SB%s~SN same=~FY%s err=~FR~SB%s' % (
                  unicode(self.feed)[:30],
                  ret_values[ENTRY_NEW], ret_values[ENTRY_UPDATED],
                  ret_values[ENTRY_SAME], ret_values[ENTRY_ERR]))
    self.feed.update_all_statistics()
    self.feed.trim_feed()
    self.feed.save_feed_history(200, "OK")

    return FEED_OK, ret_values

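# `mtime`, called above to set feed.last_modified, is not defined in this excerpt. Since
# feedparser exposes the parsed Last-Modified header as a time.struct_time in fpf.modified,
# a helper along these lines (an assumption, not necessarily the shipped implementation)
# would produce the datetime being stored:

import datetime
import time

def mtime(ttime):
    """Convert a feedparser time.struct_time into a datetime (sketch)."""
    return datetime.datetime.fromtimestamp(time.mktime(ttime))
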
    mstories = sorted(mstories, cmp=lambda x, y: cmp(story_score(y, days_to_keep_unreads),
                                                     story_score(x, days_to_keep_unreads)))

    # Prune the river to only include a set number of stories per feed
    # story_feed_counts = defaultdict(int)
    # mstories_pruned = []
    # for story in mstories:
    #     print story['story_title'], story_feed_counts[story['story_feed_id']]
    #     if story_feed_counts[story['story_feed_id']] >= 3: continue
    #     mstories_pruned.append(story)
    #     story_feed_counts[story['story_feed_id']] += 1

    stories = []
    for i, story in enumerate(mstories):
        if i < offset: continue
        if i >= limit: break
        stories.append(bunch(story))
    stories = Feed.format_stories(stories)
    found_feed_ids = list(set([story['story_feed_id'] for story in stories]))

    # Find starred stories
    try:
        starred_stories = MStarredStory.objects(
            user_id=user.pk,
            story_feed_id__in=found_feed_ids
        ).only('story_guid', 'starred_date')
        starred_stories = dict([(story.story_guid, story.starred_date)
                                for story in starred_stories])
    except OperationFailure:
        logging.info(" ***> Starred stories failure")
        starred_stories = {}