Example #1
    def count_unreads_for_subscribers(self, feed):
        UNREAD_CUTOFF = datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD)
        user_subs = UserSubscription.objects.filter(feed=feed, 
                                                    active=True,
                                                    user__profile__last_seen_on__gte=UNREAD_CUTOFF)\
                                            .order_by('-last_read_date')
        logging.debug(u'   ---> [%-30s] Computing scores: %s (%s/%s/%s) subscribers' % (
                      unicode(feed)[:30], user_subs.count(),
                      feed.num_subscribers, feed.active_subscribers, feed.premium_subscribers))
        
        if self.options['slave_db']:
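            # A read-replica ("slave") Mongo connection was passed in via options;
            # query it directly with pymongo instead of going through the MStory model.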
            slave_db = self.options['slave_db']

            stories_db_orig = slave_db.stories.find({
                "story_feed_id": feed.pk,
                "story_date": {
                    "$gte": UNREAD_CUTOFF,
                },
            })
            stories_db = []
            for story in stories_db_orig:
                stories_db.append(bunch(story))
        else:
            stories_db = MStory.objects(story_feed_id=feed.pk,
                                        story_date__gte=UNREAD_CUTOFF)
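        # Bust each subscriber's cached `usersub` entry and flag the subscription
        # so its unread counts get recalculated.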
        for sub in user_subs:
            cache.delete('usersub:%s' % sub.user_id)
            sub.needs_unread_recalc = True
            sub.save()
            
        if self.options['compute_scores']:
            for sub in user_subs:
                silent = self.options['verbose'] < 2
                sub.calculate_feed_scores(silent=silent, stories_db=stories_db)
Example #2
def load_river_stories(request):
    limit              = 18
    offset             = 0
    start              = datetime.datetime.utcnow()
    user               = get_user(request)
    feed_ids           = [int(feed_id) for feed_id in request.REQUEST.getlist('feeds') if feed_id]
    original_feed_ids  = list(feed_ids)
    page               = int(request.REQUEST.get('page', 1))
    read_stories_count = int(request.REQUEST.get('read_stories_count', 0))
    new_flag           = request.REQUEST.get('new_flag', False)
    bottom_delta       = datetime.timedelta(days=settings.DAYS_OF_UNREAD)
    
    if not feed_ids: 
        logging.user(request, "~FCLoading empty river stories: page %s" % (page))
        return dict(stories=[])
    
    # Fetch all stories at and before the requested page, not just a single page,
    # because reading stories can shift them around in the unread ordering.
    # `read_stories_count` is an optimization that works best when every story on
    # the preceding pages has already been read.
    limit = limit * page - read_stories_count
    
    # Read stories to exclude
    read_stories = MUserStory.objects(user_id=user.pk, feed_id__in=feed_ids).only('story_id')
    read_stories = [rs.story_id for rs in read_stories]
    
    # Determine each feed's mark_as_read date so stories published before it can be ignored.
    # max_feed_count     = 0
    feed_counts     = {}
    feed_last_reads = {}
    for feed_id in feed_ids:
        try:
            usersub = UserSubscription.objects.get(feed__pk=feed_id, user=user)
        except UserSubscription.DoesNotExist:
            continue
        if not usersub: continue
        feed_counts[feed_id] = (usersub.unread_count_negative * 1 + 
                                usersub.unread_count_neutral * 10 +
                                usersub.unread_count_positive * 20)
        # if feed_counts[feed_id] > max_feed_count:
        #     max_feed_count = feed_counts[feed_id]
        feed_last_reads[feed_id] = int(time.mktime(usersub.mark_read_date.timetuple()))
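    # Keep only the 50 feeds with the lowest weighted unread counts. The
    # feed_last_reads keys are stringified because they are passed into the
    # JavaScript map/reduce scope below, where object keys are strings.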
    feed_counts = sorted(feed_counts.items(), key=itemgetter(1))[:50]
    feed_ids = [f[0] for f in feed_counts]
    feed_last_reads = dict([(str(feed_id), feed_last_reads[feed_id]) for feed_id in feed_ids
                            if feed_id in feed_last_reads])
    feed_counts = dict(feed_counts)

    # Exclude read stories up front; the map step below then drops anything older
    # than each feed's mark_read_date, so everything returned is guaranteed to be unread.
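    # The per-feed cutoff timestamps are handed to the JavaScript via `scope` as
    # feed_last_reads; the reduce function simply returns the first value, since
    # each story id is a unique key.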
    mstories = MStory.objects(
        story_guid__nin=read_stories,
        story_feed_id__in=feed_ids,
        # story_date__gte=start - bottom_delta
    ).map_reduce("""function() {
            var d = feed_last_reads[this[~story_feed_id]];
            if (this[~story_date].getTime()/1000 > d) {
                emit(this[~id], this);
            }
        }""",
        """function(key, values) {
            return values[0];
        }""",
        output='inline',
        scope={
            'feed_last_reads': feed_last_reads
        }
    )
    mstories = [story.value for story in mstories if story and story.value]

    mstories = sorted(mstories, cmp=lambda x, y: cmp(story_score(y, bottom_delta), story_score(x, bottom_delta)))

    # story_feed_counts = defaultdict(int)
    # mstories_pruned = []
    # for story in mstories:
    #     print story['story_title'], story_feed_counts[story['story_feed_id']]
    #     if story_feed_counts[story['story_feed_id']] >= 3: continue
    #     mstories_pruned.append(story)
    #     story_feed_counts[story['story_feed_id']] += 1
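    # Take the requested window out of the globally sorted unread list
    # (offset is always 0 here, so this is effectively the first `limit` stories).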
    stories = []
    for i, story in enumerate(mstories):
        if i < offset: continue
        if i >= offset + limit: break
        stories.append(bunch(story))
    stories = Feed.format_stories(stories)
    found_feed_ids = list(set([story['story_feed_id'] for story in stories]))
    
    # Find starred stories
    starred_stories = MStarredStory.objects(
        user_id=user.pk,
        story_feed_id__in=found_feed_ids
    ).only('story_guid', 'starred_date')
    starred_stories = dict([(story.story_guid, story.starred_date) 
                            for story in starred_stories])
    
    # Intelligence classifiers for all feeds involved
    def sort_by_feed(classifiers):
        feed_classifiers = defaultdict(list)
        for classifier in classifiers:
            feed_classifiers[classifier.feed_id].append(classifier)
        return feed_classifiers
    classifier_feeds   = sort_by_feed(MClassifierFeed.objects(user_id=user.pk, feed_id__in=found_feed_ids))
    classifier_authors = sort_by_feed(MClassifierAuthor.objects(user_id=user.pk, feed_id__in=found_feed_ids))
    classifier_titles  = sort_by_feed(MClassifierTitle.objects(user_id=user.pk, feed_id__in=found_feed_ids))
    classifier_tags    = sort_by_feed(MClassifierTag.objects(user_id=user.pk, feed_id__in=found_feed_ids))
    
    classifiers = {}
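    # Collect each feed's trained classifiers; these are returned to the client
    # only when new_flag is set.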
    for feed_id in found_feed_ids:
        classifiers[feed_id] = get_classifiers_for_user(user, feed_id, classifier_feeds[feed_id], 
                                                        classifier_authors[feed_id],
                                                        classifier_titles[feed_id],
                                                        classifier_tags[feed_id])
    
    # Just need to format stories
    for story in stories:
        story_date = localtime_for_timezone(story['story_date'], user.profile.timezone)
        now = localtime_for_timezone(datetime.datetime.now(), user.profile.timezone)
        story['short_parsed_date'] = format_story_link_date__short(story_date, now)
        story['long_parsed_date']  = format_story_link_date__long(story_date, now)
        story['read_status'] = 0
        if story['id'] in starred_stories:
            story['starred'] = True
            starred_date = localtime_for_timezone(starred_stories[story['id']], user.profile.timezone)
            story['starred_date'] = format_story_link_date__long(starred_date, now)
        story['intelligence'] = {
            'feed':   apply_classifier_feeds(classifier_feeds[story['story_feed_id']], story['story_feed_id']),
            'author': apply_classifier_authors(classifier_authors[story['story_feed_id']], story),
            'tags':   apply_classifier_tags(classifier_tags[story['story_feed_id']], story),
            'title':  apply_classifier_titles(classifier_titles[story['story_feed_id']], story),
        }
    
    diff = datetime.datetime.utcnow() - start
    timediff = round(diff.seconds + diff.microseconds / 1000000.0, 2)
    logging.user(request, "~FCLoading river stories: page %s - ~SB%s/%s "
                               "stories ~SN(%s/%s/%s feeds) ~FB(%s seconds)" % 
                               (page, len(stories), len(mstories), len(found_feed_ids), 
                               len(feed_ids), len(original_feed_ids), timediff))
    
    if new_flag:
        return dict(stories=stories, classifiers=classifiers)
    else:
        logging.user(request, "~BR~FCNo new flag on river")
        return dict(stories=stories)
Example #3
        raise e

    mstories = sorted(mstories, cmp=lambda x, y: cmp(story_score(y, bottom_delta), story_score(x, bottom_delta)))

    # story_feed_counts = defaultdict(int)
    # mstories_pruned = []
    # for story in mstories:
    #     print story['story_title'], story_feed_counts[story['story_feed_id']]
    #     if story_feed_counts[story['story_feed_id']] >= 3: continue
    #     mstories_pruned.append(story)
    #     story_feed_counts[story['story_feed_id']] += 1
    stories = []
    for i, story in enumerate(mstories):
        if i < offset: continue
        if i >= offset + limit: break
        stories.append(bunch(story))
    stories = Feed.format_stories(stories)
    found_feed_ids = list(set([story['story_feed_id'] for story in stories]))
    
    # Find starred stories
    starred_stories = MStarredStory.objects(
        user_id=user.pk,
        story_feed_id__in=found_feed_ids
    ).only('story_guid', 'starred_date')
    starred_stories = dict([(story.story_guid, story.starred_date) 
                            for story in starred_stories])
    
    # Intelligence classifiers for all feeds involved
    def sort_by_feed(classifiers):
        feed_classifiers = defaultdict(list)
        for classifier in classifiers:
Example #4
def load_river_stories(request):
    limit              = 18
    offset             = 0
    start              = datetime.datetime.utcnow()
    user               = get_user(request)
    feed_ids           = [int(feed_id) for feed_id in request.REQUEST.getlist('feeds') if feed_id]
    original_feed_ids  = list(feed_ids)
    page               = int(request.REQUEST.get('page', 0))+1
    read_stories_count = int(request.REQUEST.get('read_stories_count', 0))
    bottom_delta       = datetime.timedelta(days=settings.DAYS_OF_UNREAD)
    
    if not feed_ids: 
        logging.user(request.user, "~FCLoading empty river stories: page %s" % (page))
        return dict(stories=[])
    
    # Fetch all stories at and before the requested page, not just a single page,
    # because reading stories can shift them around in the unread ordering.
    # `read_stories_count` is an optimization that works best when every story on
    # the preceding pages has already been read.
    limit = limit * page - read_stories_count
    
    # Read stories to exclude
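    # (`story` appears to be a document reference in this version; collect the
    # referenced story ids for the id__nin filter below.)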
    read_stories = MUserStory.objects(user_id=user.pk, feed_id__in=feed_ids).only('story')
    read_stories = [rs.story.id for rs in read_stories]
    
    # Determine each feed's mark_as_read date so stories published before it can be ignored.
    # max_feed_count     = 0
    feed_counts     = {}
    feed_last_reads = {}
    for feed_id in feed_ids:
        try:
            usersub = UserSubscription.objects.get(feed__pk=feed_id, user=user)
        except UserSubscription.DoesNotExist:
            continue
        if not usersub: continue
        feed_counts[feed_id] = (usersub.unread_count_negative * 1 + 
                                usersub.unread_count_neutral * 10 +
                                usersub.unread_count_positive * 20)
        # if feed_counts[feed_id] > max_feed_count:
        #     max_feed_count = feed_counts[feed_id]
        feed_last_reads[feed_id] = int(time.mktime(usersub.mark_read_date.timetuple()))
    feed_counts = sorted(feed_counts.items(), key=itemgetter(1))[:50]
    feed_ids = [f[0] for f in feed_counts]
    feed_last_reads = dict([(str(feed_id), feed_last_reads[feed_id]) for feed_id in feed_ids])
    feed_counts = dict(feed_counts)
    
    # Exclude read stories up front; the map step below then drops anything older
    # than each feed's mark_read_date, so everything returned is guaranteed to be unread.
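    # Stories older than the DAYS_OF_UNREAD window are excluded outright via story_date__gte.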
    mstories = MStory.objects(
        id__nin=read_stories,
        story_feed_id__in=feed_ids,
        story_date__gte=start - bottom_delta
    ).map_reduce("""function() {
            var d = feed_last_reads[this[~story_feed_id]];
            if (this[~story_date].getTime()/1000 > d) {
                emit(this[~id], this);
            }
        }""",
        """function(key, values) {
            return values[0];
        }""",
        output='inline',
        scope={
            'feed_last_reads': feed_last_reads
        }
    )
    mstories = [story.value for story in mstories]

    mstories = sorted(mstories, cmp=lambda x, y: cmp(story_score(y, bottom_delta), story_score(x, bottom_delta)))

    # story_feed_counts = defaultdict(int)
    # mstories_pruned = []
    # for story in mstories:
    #     print story['story_title'], story_feed_counts[story['story_feed_id']]
    #     if story_feed_counts[story['story_feed_id']] >= 3: continue
    #     mstories_pruned.append(story)
    #     story_feed_counts[story['story_feed_id']] += 1
    stories = []
    for i, story in enumerate(mstories):
        if i < offset: continue
        if i >= offset + limit: break
        stories.append(bunch(story))
    stories = Feed.format_stories(stories)
    found_feed_ids = list(set([story['story_feed_id'] for story in stories]))
    
    # Find starred stories
    starred_stories = MStarredStory.objects(
        user_id=user.pk,
        story_feed_id__in=found_feed_ids
    ).only('story_guid', 'starred_date')
    starred_stories = dict([(story.story_guid, story.starred_date) 
                            for story in starred_stories])
    
    # Intelligence classifiers for all feeds involved
    def sort_by_feed(classifiers):
        feed_classifiers = defaultdict(list)
        for classifier in classifiers:
            feed_classifiers[classifier.feed_id].append(classifier)
        return feed_classifiers
    classifier_feeds   = sort_by_feed(MClassifierFeed.objects(user_id=user.pk, feed_id__in=found_feed_ids))
    classifier_authors = sort_by_feed(MClassifierAuthor.objects(user_id=user.pk, feed_id__in=found_feed_ids))
    classifier_titles  = sort_by_feed(MClassifierTitle.objects(user_id=user.pk, feed_id__in=found_feed_ids))
    classifier_tags    = sort_by_feed(MClassifierTag.objects(user_id=user.pk, feed_id__in=found_feed_ids))
    
    # Just need to format stories
    for story in stories:
        story_date = localtime_for_timezone(story['story_date'], user.profile.timezone)
        now = localtime_for_timezone(datetime.datetime.now(), user.profile.timezone)
        story['short_parsed_date'] = format_story_link_date__short(story_date, now)
        story['long_parsed_date']  = format_story_link_date__long(story_date, now)
        story['read_status'] = 0
        if story['id'] in starred_stories:
            story['starred'] = True
            starred_date = localtime_for_timezone(starred_stories[story['id']], user.profile.timezone)
            story['starred_date'] = format_story_link_date__long(starred_date, now)
        story['intelligence'] = {
            'feed':   apply_classifier_feeds(classifier_feeds[story['story_feed_id']], story['story_feed_id']),
            'author': apply_classifier_authors(classifier_authors[story['story_feed_id']], story),
            'tags':   apply_classifier_tags(classifier_tags[story['story_feed_id']], story),
            'title':  apply_classifier_titles(classifier_titles[story['story_feed_id']], story),
        }
    
    diff = datetime.datetime.utcnow() - start
    timediff = round(diff.seconds + diff.microseconds / 1000000.0, 2)
    logging.user(request.user, "~FCLoading river stories: page %s - ~SB%s/%s "
                               "stories ~SN(%s/%s/%s feeds) ~FB(%s seconds)" % 
                               (page, len(stories), len(mstories), len(found_feed_ids), 
                               len(feed_ids), len(original_feed_ids), timediff))
    
    return dict(stories=stories)
Example #5
    def process(self, first_run=True):
        """ Downloads and parses a feed.
        """
        self.refresh_feed()
        
        ret_values = {
            ENTRY_NEW:0,
            ENTRY_UPDATED:0,
            ENTRY_SAME:0,
            ENTRY_ERR:0}
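        # Per-entry outcome counters: replaced by the result of add_update_stories()
        # below, but returned zeroed on the early not-modified/error exits.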

        # logging.debug(u' ---> [%d] Processing %s' % (self.feed.id, self.feed.feed_title))
        
        self.feed.fetched_once = True
        self.feed.last_update = datetime.datetime.utcnow()

        if hasattr(self.fpf, 'status'):
            if self.options['verbose']:
                logging.debug(u'   ---> [%-30s] Fetched feed, HTTP status %d: %s (bozo: %s)' % (unicode(self.feed)[:30],
                                                     self.fpf.status,
                                                     self.feed.feed_address,
                                                     self.fpf.bozo))
                if self.fpf.bozo and self.fpf.status != 304:
                    logging.debug(u'   ---> [%-30s] BOZO exception: %s (%s entries)' % (
                                  unicode(self.feed)[:30],
                                  self.fpf.bozo_exception,
                                  len(self.fpf.entries)))
            if self.fpf.status == 304:
                self.feed.save()
                self.feed.save_feed_history(304, "Not modified")
                return FEED_SAME, ret_values
            
            if self.fpf.status in (302, 301):
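                # Follow the redirect by adopting the new address, unless it points
                # at the generic feedburner.com/atom.xml; on a first run, flag the
                # feed exception and schedule an immediate re-fetch.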
                if not self.fpf.href.endswith('feedburner.com/atom.xml'):
                    self.feed.feed_address = self.fpf.href
                if first_run:
                    self.feed.has_feed_exception = True
                    self.feed.schedule_feed_fetch_immediately()
                if not self.fpf.entries:
                    self.feed.save()
                    self.feed.save_feed_history(self.fpf.status, "HTTP Redirect")
                    return FEED_ERRHTTP, ret_values
                
            if self.fpf.status >= 400:
                logging.debug("   ---> [%-30s] HTTP Status code: %s. Checking address..." % (unicode(self.feed)[:30], self.fpf.status))
                fixed_feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(self.fpf.status, "HTTP Error")
                else:
                    self.feed.has_feed_exception = True
                    self.feed.schedule_feed_fetch_immediately()
                self.feed.save()
                return FEED_ERRHTTP, ret_values
                                    
        if self.fpf.bozo and isinstance(self.fpf.bozo_exception, feedparser.NonXMLContentType):
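            # feedparser marks non-XML responses as "bozo"; if no entries could be
            # parsed, try to discover a real feed address from the feed's link
            # before giving up.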
            logging.debug("   ---> [%-30s] Feed is Non-XML. %s entries.%s Checking address..." % (unicode(self.feed)[:30], len(self.fpf.entries), ' Not' if self.fpf.entries else ''))
            if not self.fpf.entries:
                fixed_feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(502, 'Non-xml feed', self.fpf.bozo_exception)
                else:
                    self.feed.has_feed_exception = True
                    self.feed.schedule_feed_fetch_immediately()
                self.feed.save()
                return FEED_ERRPARSE, ret_values
        elif self.fpf.bozo and isinstance(self.fpf.bozo_exception, xml.sax._exceptions.SAXException):
            logging.debug("   ---> [%-30s] Feed has SAX/XML parsing issues. %s entries.%s Checking address..." % (unicode(self.feed)[:30], len(self.fpf.entries), ' Not' if self.fpf.entries else ''))
            if not self.fpf.entries:
                fixed_feed = self.feed.check_feed_link_for_feed_address()
                if not fixed_feed:
                    self.feed.save_feed_history(503, 'SAX Exception', self.fpf.bozo_exception)
                else:
                    self.feed.has_feed_exception = True
                    self.feed.schedule_feed_fetch_immediately()
                self.feed.save()
                return FEED_ERRPARSE, ret_values
                
        # The feed has changed (or this is the first time we've parsed it),
        # so save the etag and last_modified fields.
        self.feed.etag = self.fpf.get('etag')
        if self.feed.etag:
            self.feed.etag = self.feed.etag[:255]
        # sometimes this is None (it never should be) *sigh*
        if self.feed.etag is None:
            self.feed.etag = ''

        try:
            self.feed.last_modified = mtime(self.fpf.modified)
        except:
            pass
        
        self.fpf.entries = self.fpf.entries[:50]
        
        self.feed.feed_title = self.fpf.feed.get('title', self.feed.feed_title)
        tagline = self.fpf.feed.get('tagline', self.feed.data.feed_tagline)
        if tagline:
            self.feed.data.feed_tagline = utf8encode(tagline)
            self.feed.data.save()
        if not self.feed.feed_link_locked:
            self.feed.feed_link = self.fpf.feed.get('link') or self.fpf.feed.get('id') or self.feed.feed_link
        
        self.feed.last_update = datetime.datetime.utcnow()
        
        guids = []
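        # Prefer the entry's id as its GUID, falling back to its link, then its title.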
        for entry in self.fpf.entries:
            if entry.get('id', ''):
                guids.append(entry.get('id', ''))
            elif entry.get('link'):
                guids.append(entry.link)
            elif entry.get('title'):
                guids.append(entry.title)
        self.feed.save()

        # Compare new stories to existing stories, adding and updating
        start_date = datetime.datetime.utcnow()
        # end_date = datetime.datetime.utcnow()
        story_guids = []
        for entry in self.fpf.entries:
            story = pre_process_story(entry)
            if story.get('published') < start_date:
                start_date = story.get('published')
            # if story.get('published') > end_date:
            #     end_date = story.get('published')
            story_guids.append(story.get('guid') or story.get('link'))
        
        if self.options['slave_db']:
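            # When a read replica was supplied, pull the candidate existing stories
            # from it directly with pymongo; bunch() wraps each raw dict, presumably
            # so it can be accessed like a document downstream.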
            slave_db = self.options['slave_db']
            stories_db_orig = slave_db.stories.find({
                "story_feed_id": self.feed.pk,
                "story_date": {
                    "$gte": start_date,
                },
            }).limit(len(story_guids))
            existing_stories = []
            for story in stories_db_orig:
                existing_stories.append(bunch(story))
        else:
            existing_stories = list(MStory.objects(
                # story_guid__in=story_guids,
                story_date__gte=start_date,
                story_feed_id=self.feed.pk
            ).limit(len(story_guids)))
        
        # MStory.objects(
        #     (Q(story_date__gte=start_date) & Q(story_date__lte=end_date))
        #     | (Q(story_guid__in=story_guids)),
        #     story_feed=self.feed
        # ).order_by('-story_date')
        ret_values = self.feed.add_update_stories(self.fpf.entries, existing_stories, verbose=self.options['verbose'])
        
        logging.debug(u'   ---> [%-30s] ~FYParsed Feed: new=~FG~SB%s~SN~FY up=~FY~SB%s~SN same=~FY%s err=~FR~SB%s' % (
                      unicode(self.feed)[:30], 
                      ret_values[ENTRY_NEW], ret_values[ENTRY_UPDATED], ret_values[ENTRY_SAME], ret_values[ENTRY_ERR]))
        self.feed.update_all_statistics()
        self.feed.trim_feed()
        self.feed.save_feed_history(200, "OK")
        
        return FEED_OK, ret_values
Example #6
                                                     story_score(x, days_to_keep_unreads)))

    # Prune the river to only include a set number of stories per feed
    # story_feed_counts = defaultdict(int)
    # mstories_pruned = []
    # for story in mstories:
    #     print story['story_title'], story_feed_counts[story['story_feed_id']]
    #     if story_feed_counts[story['story_feed_id']] >= 3: continue
    #     mstories_pruned.append(story)
    #     story_feed_counts[story['story_feed_id']] += 1
    
    stories = []
    for i, story in enumerate(mstories):
        if i < offset: continue
        if i >= limit: break
        stories.append(bunch(story))
    stories = Feed.format_stories(stories)
    found_feed_ids = list(set([story['story_feed_id'] for story in stories]))
    
    # Find starred stories
    try:
        starred_stories = MStarredStory.objects(
            user_id=user.pk,
            story_feed_id__in=found_feed_ids
        ).only('story_guid', 'starred_date')
        starred_stories = dict([(story.story_guid, story.starred_date) 
                                for story in starred_stories])
    except OperationFailure:
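        # If Mongo rejects the query, degrade gracefully to an empty starred-story
        # set rather than failing the whole request.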
        logging.info(" ***> Starred stories failure")
        starred_stories = {}