def reindex_stories(): db = pymongo.Connection().newsblur count = MStory.objects().count() print "Mongo DB stories: %s" % count p = 0.0 i = 0 feeds = Feed.objects.all().order_by('-average_stories_per_month') feed_count = feeds.count() f = 0 for feed in feeds: f += 1 print "%s/%s: %s" % ( f, feed_count, feed, ) sys.stdout.flush() for story in MStory.objects(story_feed_id=feed.pk): i += 1.0 if round(i / count * 100) != p: p = round(i / count * 100) print '%s%%' % p if isinstance(story.id, unicode): story.story_guid = story.id story.id = pymongo.objectid.ObjectId() try: story.save() except OperationError, e: print " ***> OperationError: %s" % e except e: print ' ***> Unknown Error: %s' % e db.stories.remove({"_id": story.story_guid})
def test_load_feeds__gawker(self):
    """Updating the gawker feed twice must de-duplicate: a near-identical
    second fixture leaves the story count unchanged."""
    self.client.login(username='******', password='******')

    management.call_command('loaddata', 'gawker1.json', verbosity=0)
    feed = Feed.objects.get(feed_link__contains='gawker')
    self.assertEquals(MStory.objects(story_feed_id=feed.pk).count(), 0)

    feed.update(force=True)
    self.assertEquals(MStory.objects(story_feed_id=feed.pk).count(), 38)

    # Second fixture differs by a single character of story content.
    management.call_command('loaddata', 'gawker2.json', verbosity=0)
    feed.update(force=True)
    self.assertEquals(MStory.objects(story_feed_id=feed.pk).count(), 38)

    response = self.client.get(reverse('load-single-feed', kwargs=dict(feed_id=1)))
    payload = json.decode(response.content)
    self.assertEquals(len(payload['stories']), 6)
def test_load_feeds__gawker(self):
    """Updating the same feed twice must not duplicate stories."""
    self.client.login(username='******', password='******')

    management.call_command('loaddata', 'gawker1.json', verbosity=0, skip_checks=False)
    feed = Feed.objects.get(pk=10)
    self.assertEqual(MStory.objects(story_feed_id=feed.pk).count(), 0)

    feed.update(force=True)
    self.assertEqual(MStory.objects(story_feed_id=feed.pk).count(), 38)

    # The second fixture changes a single character of story content.
    management.call_command('loaddata', 'gawker2.json', verbosity=0, skip_checks=False)
    feed.update(force=True)
    self.assertEqual(MStory.objects(story_feed_id=feed.pk).count(), 38)

    response = self.client.get(reverse('load-single-feed', kwargs=dict(feed_id=10)))
    payload = json.decode(response.content)
    self.assertEqual(len(payload['stories']), 6)
def test_load_feeds__gothamist(self):
    """Refreshing twice (second fixture tweaks one title character) must
    keep the gothamist story count stable at 42."""
    self.client.login(username='******', password='******')

    management.call_command('loaddata', 'gothamist_aug_2009_1.json', verbosity=0)
    feed = Feed.objects.get(feed_link__contains='gothamist')
    self.assertEquals(MStory.objects(story_feed_id=feed.pk).count(), 0)

    management.call_command('refresh_feed', force=1, feed=4, single_threaded=True, daemonize=False)
    self.assertEquals(MStory.objects(story_feed_id=feed.pk).count(), 42)

    url = reverse('load-single-feed', kwargs=dict(feed_id=4))
    content = json.decode(self.client.get(url).content)
    self.assertEquals(len(content['stories']), 6)

    # Second fixture: one changed character in a title; no duplicates allowed.
    management.call_command('loaddata', 'gothamist_aug_2009_2.json', verbosity=0)
    management.call_command('refresh_feed', force=1, feed=4, single_threaded=True, daemonize=False)
    self.assertEquals(MStory.objects(story_feed_id=feed.pk).count(), 42)

    content = json.decode(self.client.get(url).content)
    self.assertEquals(len(content['stories']), 6)
def mark_story_as_unread(request):
    """Mark a single story unread for the logged-in user.

    If the story falls outside the subscription's mark-as-read window, the
    window is rewound to just before the story, and every newer story that
    sat inside the old window is explicitly marked read — so only this one
    story flips to unread.
    """
    story_id = request.POST['story_id']
    feed_id = int(request.POST['feed_id'])
    usersub = UserSubscription.objects.select_related('feed').get(user=request.user, feed=feed_id)
    if not usersub.needs_unread_recalc:
        usersub.needs_unread_recalc = True
        usersub.save()
    data = dict(code=0, payload=dict(story_id=story_id))
    logging.user(request, "~FY~SBUnread~SN story in feed: %s" % (usersub.feed))
    story = MStory.objects(story_feed_id=feed_id, story_guid=story_id)[0]
    if story.story_date < usersub.mark_read_date:
        # Story is outside the mark as read range, so invert all stories before.
        newer_stories = MStory.objects(story_feed_id=story.story_feed_id,
                                       story_date__gte=story.story_date,
                                       story_date__lte=usersub.mark_read_date
                                       ).only('story_guid')
        newer_stories = [s.story_guid for s in newer_stories]
        # Rewind the window to one minute before this story.
        usersub.mark_read_date = story.story_date - datetime.timedelta(minutes=1)
        usersub.needs_unread_recalc = True
        usersub.save()
        # Mark stories as read only after the mark_read_date has been moved, otherwise
        # these would be ignored.
        data = usersub.mark_story_ids_as_read(newer_stories, request=request)
    # Drop any read-receipt for this story so it shows as unread again.
    m = MUserStory.objects(story_id=story_id, user_id=request.user.pk, feed_id=feed_id)
    m.delete()
    return data
def test_load_feeds__gawker(self):
    """Two refreshes of the gawker fixture must not create duplicate stories."""
    self.client.login(username='******', password='******')

    management.call_command('loaddata', 'gawker1.json', verbosity=0)
    feed = Feed.objects.get(feed_link__contains='gawker')
    self.assertEquals(MStory.objects(story_feed_id=feed.pk).count(), 0)

    management.call_command('refresh_feed', force=1, feed=1, single_threaded=True, daemonize=False)
    self.assertEquals(MStory.objects(story_feed_id=feed.pk).count(), 38)

    # Second fixture: a single changed character in one story's content.
    management.call_command('loaddata', 'gawker2.json', verbosity=0)
    management.call_command('refresh_feed', force=1, feed=1, single_threaded=True, daemonize=False)
    self.assertEquals(MStory.objects(story_feed_id=feed.pk).count(), 38)

    response = self.client.post('/reader/load_single_feed', {"feed_id": 1})
    payload = json.decode(response.content)
    self.assertEquals(len(payload['stories']), 30)
def reindex_stories(): count = MStory.objects().count() print "Mongo DB stories: %s" % count p = 0.0 i = 0 feeds = Feed.objects.all().order_by('-average_stories_per_month') feed_count = feeds.count() f = 0 for feed in feeds: f += 1 print "%s/%s: %s" % (f, feed_count, feed,) sys.stdout.flush() for story in MStory.objects(story_feed_id=feed.pk): i += 1.0 if round(i / count * 100) != p: p = round(i / count * 100) print '%s%%' % p if isinstance(story.id, unicode) and story.id: story.story_guid = story.id story.id = pymongo.objectid.ObjectId() try: story.save() except mongoengine.queryset.OperationError: print 'Dupe!' continue
def test_load_feeds__slashdot(self):
    """Slashdot fixture refreshed twice keeps exactly 38 stories."""
    self.client.login(username='******', password='******')

    management.call_command('loaddata', 'slashdot1.json', verbosity=0)
    feed = Feed.objects.get(feed_link__contains='slashdot')
    self.assertEquals(MStory.objects(story_feed_id=feed.pk).count(), 0)

    management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)
    self.assertEquals(MStory.objects(story_feed_id=feed.pk).count(), 38)

    # Second fixture: one changed character in a story title.
    management.call_command('loaddata', 'slashdot2.json', verbosity=0)
    management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)
    self.assertEquals(MStory.objects(story_feed_id=feed.pk).count(), 38)

    response = self.client.post('/reader/feed', {"feed_id": 5})
    payload = json.decode(response.content)
    self.assertEquals(len(payload['stories']), 30)
def test_load_feeds__gothamist(self):
    """Gothamist: ``feed.update()`` twice (second fixture tweaks one title
    character) must keep the story count at 42."""
    self.client.login(username='******', password='******')

    management.call_command('loaddata', 'gothamist_aug_2009_1.json', verbosity=0)
    feed = Feed.objects.get(feed_link__contains='gothamist')
    self.assertEquals(MStory.objects(story_feed_id=feed.pk).count(), 0)

    feed.update(force=True)
    self.assertEquals(MStory.objects(story_feed_id=feed.pk).count(), 42)

    url = reverse('load-single-feed', kwargs=dict(feed_id=4))
    content = json.decode(self.client.get(url).content)
    self.assertEquals(len(content['stories']), 6)

    # One changed character in a title; nothing should duplicate.
    management.call_command('loaddata', 'gothamist_aug_2009_2.json', verbosity=0)
    feed.update(force=True)
    self.assertEquals(MStory.objects(story_feed_id=feed.pk).count(), 42)

    content = json.decode(self.client.get(url).content)
    self.assertEquals(len(content['stories']), 6)
def bootstrap_stories(): print "Mongo DB stories: %s" % MStory.objects().count() # db.stories.drop() print "Dropped! Mongo DB stories: %s" % MStory.objects().count() print "Stories: %s" % Story.objects.all().count() pprint(db.stories.index_information()) feeds = Feed.objects.all().order_by('-average_stories_per_month') feed_count = feeds.count() i = 0 for feed in feeds: i += 1 print "%s/%s: %s (%s stories)" % (i, feed_count, feed, Story.objects.filter(story_feed=feed).count()) sys.stdout.flush() stories = Story.objects.filter(story_feed=feed).values() for story in stories: # story['story_tags'] = [tag.name for tag in Tag.objects.filter(story=story['id'])] try: story['story_tags'] = json.decode(story['story_tags']) except: continue del story['id'] del story['story_author_id'] try: MStory(**story).save() except: continue print "\nMongo DB stories: %s" % MStory.objects().count()
def reindex_stories(): db = pymongo.Connection().newsblur count = MStory.objects().count() print "Mongo DB stories: %s" % count p = 0.0 i = 0 feeds = Feed.objects.all().order_by('-average_stories_per_month') feed_count = feeds.count() f = 0 for feed in feeds: f += 1 print "%s/%s: %s" % (f, feed_count, feed,) sys.stdout.flush() for story in MStory.objects(story_feed_id=feed.pk): i += 1.0 if round(i / count * 100) != p: p = round(i / count * 100) print '%s%%' % p if isinstance(story.id, unicode): story.story_guid = story.id story.id = pymongo.objectid.ObjectId() try: story.save() except OperationError, e: print " ***> OperationError: %s" % e except e: print ' ***> Unknown Error: %s' % e db.stories.remove({"_id": story.story_guid})
def bootstrap_stories(): print "Mongo DB stories: %s" % MStory.objects().count() # db.stories.drop() print "Dropped! Mongo DB stories: %s" % MStory.objects().count() print "Stories: %s" % Story.objects.all().count() pprint(db.stories.index_information()) feeds = Feed.objects.all().order_by('-average_stories_per_month') feed_count = feeds.count() i = 0 for feed in feeds: i += 1 print "%s/%s: %s (%s stories)" % ( i, feed_count, feed, Story.objects.filter(story_feed=feed).count()) sys.stdout.flush() stories = Story.objects.filter(story_feed=feed).values() for story in stories: # story['story_tags'] = [tag.name for tag in Tag.objects.filter(story=story['id'])] try: story['story_tags'] = json.decode(story['story_tags']) except: continue del story['id'] del story['story_author_id'] try: MStory(**story).save() except: continue print "\nMongo DB stories: %s" % MStory.objects().count()
def test_load_feeds__slashdot(self):
    """Refreshing slashdot twice — the second fixture alters one title
    character — must leave exactly 38 stories."""
    self.client.login(username='******', password='******')

    management.call_command('loaddata', 'slashdot1.json', verbosity=0)
    feed = Feed.objects.get(feed_link__contains='slashdot')
    self.assertEquals(MStory.objects(story_feed_id=feed.pk).count(), 0)

    management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)
    self.assertEquals(MStory.objects(story_feed_id=feed.pk).count(), 38)

    management.call_command('loaddata', 'slashdot2.json', verbosity=0)
    management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)
    self.assertEquals(MStory.objects(story_feed_id=feed.pk).count(), 38)

    response = self.client.post('/reader/feed', {"feed_id": 5})
    payload = json.decode(response.content)
    self.assertEquals(len(payload['stories']), 30)
def test_load_feeds__motherjones(self):
    """Mother Jones: unread counts track reads and survive a near-identical
    refresh of the feed."""
    self.client.login(username='******', password='******')

    management.call_command('loaddata', 'motherjones1.json', verbosity=0, skip_checks=False)
    feed = Feed.objects.get(feed_link__contains='motherjones')
    self.assertEqual(MStory.objects(story_feed_id=feed.pk).count(), 0)

    management.call_command('refresh_feed', force=1, feed=feed.pk, daemonize=False, skip_checks=False)
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEqual(stories.count(), 10)

    content = json.decode(self.client.get(reverse('load-feeds')).content)
    self.assertEqual(content['feeds'][str(feed.pk)]['nt'], 10)

    # Reading one story drops the neutral unread count by one.
    self.client.post(reverse('mark-story-as-read'),
                     {'story_id': stories[0].story_guid, 'feed_id': feed.pk})
    content = json.decode(self.client.get(reverse('refresh-feeds')).content)
    self.assertEqual(content['feeds'][str(feed.pk)]['nt'], 9)

    # Second fixture: one changed character in a title; no new stories.
    management.call_command('loaddata', 'motherjones2.json', verbosity=0, skip_checks=False)
    management.call_command('refresh_feed', force=1, feed=feed.pk, daemonize=False, skip_checks=False)
    self.assertEqual(MStory.objects(story_feed_id=feed.pk).count(), 10)

    url = reverse('load-single-feed', kwargs=dict(feed_id=feed.pk))
    feed = json.decode(self.client.get(url).content)
    self.assertEqual(len(feed['stories']), 6)

    content = json.decode(self.client.get(reverse('refresh-feeds')).content)
    self.assertEqual(content['feeds'][str(feed['feed_id'])]['nt'], 9)
def test_load_feeds__google(self): # Freezegun the date to 2017-04-30 self.client.login(username='******', password='******') old_story_guid = "blog.google:443/topics/inside-google/google-earths-incredible-3d-imagery-explained/" management.call_command('loaddata', 'google1.json', verbosity=1) print Feed.objects.all() feed = Feed.objects.get(pk=766) print " Testing test_load_feeds__google: %s" % feed stories = MStory.objects(story_feed_id=feed.pk) self.assertEquals(stories.count(), 0) management.call_command('refresh_feed', force=False, feed=766, single_threaded=True, daemonize=False) stories = MStory.objects(story_feed_id=feed.pk) self.assertEquals(stories.count(), 20) response = self.client.get( reverse('load-feeds') + "?update_counts=true") content = json.decode(response.content) self.assertEquals(content['feeds']['766']['nt'], 20) old_story = MStory.objects.get(story_feed_id=feed.pk, story_guid__contains=old_story_guid) self.client.post(reverse('mark-story-hashes-as-read'), {'story_hash': old_story.story_hash}) response = self.client.get(reverse('refresh-feeds')) content = json.decode(response.content) self.assertEquals(content['feeds']['766']['nt'], 19) management.call_command('loaddata', 'google2.json', verbosity=1) management.call_command('refresh_feed', force=False, feed=766, single_threaded=True, daemonize=False) stories = MStory.objects(story_feed_id=feed.pk) self.assertEquals(stories.count(), 20) url = reverse('load-single-feed', kwargs=dict(feed_id=766)) response = self.client.get(url) # pprint([c['story_title'] for c in json.decode(response.content)]) feed = json.decode(response.content) # Test: 1 changed char in title self.assertEquals(len(feed['stories']), 6) response = self.client.get(reverse('refresh-feeds')) content = json.decode(response.content) self.assertEquals(content['feeds']['766']['nt'], 19)
def test_load_feeds__slashdot(self):
    """Slashdot: unread counts follow a read and survive a near-identical
    refresh of the feed."""
    self.client.login(username='******', password='******')
    old_story_guid = "tag:google.com,2005:reader/item/4528442633bc7b2b"

    management.call_command('loaddata', 'slashdot1.json', verbosity=0)
    feed = Feed.objects.get(feed_link__contains='slashdot')
    self.assertEquals(MStory.objects(story_feed_id=feed.pk).count(), 0)

    management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)
    self.assertEquals(MStory.objects(story_feed_id=feed.pk).count(), 38)

    content = json.decode(self.client.get(reverse('load-feeds')).content)
    self.assertEquals(content['feeds']['5']['nt'], 38)

    # Reading one story drops the neutral unread count by one.
    self.client.post(reverse('mark-story-as-read'), {'story_id': old_story_guid, 'feed_id': 5})
    content = json.decode(self.client.get(reverse('refresh-feeds')).content)
    self.assertEquals(content['feeds']['5']['nt'], 37)

    # Second fixture: one changed character in a title; no new stories.
    management.call_command('loaddata', 'slashdot2.json', verbosity=0)
    management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)
    self.assertEquals(MStory.objects(story_feed_id=feed.pk).count(), 38)

    url = reverse('load-single-feed', kwargs=dict(feed_id=5))
    feed = json.decode(self.client.get(url).content)
    self.assertEquals(len(feed['stories']), 6)

    content = json.decode(self.client.get(reverse('refresh-feeds')).content)
    self.assertEquals(content['feeds']['5']['nt'], 37)
def count_unreads_for_subscribers(self, feed):
    """Flag recent subscribers of ``feed`` for an unread recount and, when
    ``compute_scores`` is set, rescore them against the feed's recent stories.

    Story hashes present in the redis sorted set ``zF:<feed_id>`` but missing
    from the (possibly lagging) default mongo read are re-fetched from the
    primary so scoring sees every story.
    """
    user_subs = UserSubscription.objects.filter(
        feed=feed, active=True, user__profile__last_seen_on__gte=feed.unread_cutoff
    ).order_by("-last_read_date")
    if not user_subs.count():
        return
    for sub in user_subs:
        if not sub.needs_unread_recalc:
            sub.needs_unread_recalc = True
            sub.save()
    if self.options["compute_scores"]:
        r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
        stories = MStory.objects(story_feed_id=feed.pk, story_date__gte=feed.unread_cutoff)
        stories = Feed.format_stories(stories, feed.pk)
        # Hashes redis knows about inside the unread window (+1 day slack).
        story_hashes = r.zrangebyscore(
            "zF:%s" % feed.pk, int(feed.unread_cutoff.strftime("%s")), int(time.time() + 60 * 60 * 24)
        )
        missing_story_hashes = set(story_hashes) - set([s["story_hash"] for s in stories])
        if missing_story_hashes:
            # The secondary hasn't caught up; pull stragglers from the primary.
            missing_stories = MStory.objects(
                story_feed_id=feed.pk, story_hash__in=missing_story_hashes
            ).read_preference(pymongo.ReadPreference.PRIMARY)
            missing_stories = Feed.format_stories(missing_stories, feed.pk)
            stories = missing_stories + stories
            logging.debug(
                u" ---> [%-30s] ~FYFound ~SB~FC%s(of %s)/%s~FY~SN un-secondaried stories while computing scores"
                % (feed.title[:30], len(missing_stories), len(missing_story_hashes), len(stories))
            )
        # Short-lived cache so concurrent scorers can reuse this story batch.
        cache.set("S:%s" % feed.pk, stories, 60)
        logging.debug(
            u" ---> [%-30s] ~FYComputing scores: ~SB%s stories~SN with ~SB%s subscribers ~SN(%s/%s/%s)"
            % (
                feed.title[:30],
                len(stories),
                user_subs.count(),
                feed.num_subscribers,
                feed.active_subscribers,
                feed.premium_subscribers,
            )
        )
        self.calculate_feed_scores_with_stories(user_subs, stories)
    elif self.options.get("mongodb_replication_lag"):
        logging.debug(
            u" ---> [%-30s] ~BR~FYSkipping computing scores: ~SB%s seconds~SN of mongodb lag"
            % (feed.title[:30], self.options.get("mongodb_replication_lag"))
        )
def mark_feed_read(self):
    """Mark every story in this subscription's feed as read.

    Read/mark dates are pushed one minute past the newest story (or to now
    for an empty feed) so nothing currently in the feed counts as unread.
    """
    now = datetime.datetime.now()
    if MStory.objects(story_feed_id=self.feed.pk).first():
        latest_story_date = MStory.objects(story_feed_id=self.feed.pk).order_by('-story_date')[0].story_date \
            + datetime.timedelta(minutes=1)
    else:
        latest_story_date = now
    self.last_read_date = max(now, latest_story_date)
    self.mark_read_date = max(now, latest_story_date)
    self.unread_count_negative = 0
    self.unread_count_positive = 0
    self.unread_count_neutral = 0
    self.unread_count_updated = max(now, latest_story_date)
    # BUG FIX: was ``self.needs_unread_relcalc`` (typo) — it silently set a
    # brand-new attribute and left the real recalc flag untouched.
    self.needs_unread_recalc = False
    self.save()
def imagesearch(request, page_num):
    """Search stories for ``q`` and collect up to 50 image links to render.

    NOTE(review): the template context is ``locals()``, so every local name
    in this function is part of the page's contract — rename with care.
    """
    if page_num == '':
        page_num = '1'
    page_num = int(page_num)
    num_per_page = 15
    q = request.GET.get('q', None)
    if q:
        image_server = settings.FDFS_HTTP_SERVER
        # Cap candidate stories at the top 500 search hits.
        index_stories = SearchStory.query(q)[:500]
        response_images = []
        for index_story in index_stories:
            story = MStory.objects(id=index_story['db_id']).first()
            if story and story.image_ids:
                for image_id in story.image_ids:
                    # Ids of 20 chars or fewer are skipped — presumably
                    # malformed/placeholder ids; TODO confirm.
                    if len(image_id) > 20:
                        # print image_id
                        image = MImage.objects(id=image_id).first()
                        imagedict = dict(
                            image_url=image_server + image.image_remote_id,
                            story_url=story.story_guid,
                            story_title=story.story_title,
                        )
                        response_images.append(imagedict)
                        # Stop as soon as 50 images are gathered.
                        if len(response_images) >= 50:
                            return render(request, 'imagesearch.html', locals())
    return render(request, 'imagesearch.html', locals())
def count_unreads_for_subscribers(self, feed):
    """Flag each active, recently-seen subscriber of ``feed`` for an unread
    recount and, when ``compute_scores`` is set, recompute their scores.

    If a slave (read replica) mongo handle is supplied in ``options``, the
    recent stories are fetched from it raw and wrapped in ``bunch`` objects
    so they can stand in for MStory documents.
    """
    UNREAD_CUTOFF = datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD)
    user_subs = UserSubscription.objects.filter(feed=feed, active=True, user__profile__last_seen_on__gte=UNREAD_CUTOFF)\
        .order_by('-last_read_date')
    logging.debug(u' ---> [%-30s] Computing scores: %s (%s/%s/%s) subscribers' % (
        unicode(feed)[:30], user_subs.count(),
        feed.num_subscribers, feed.active_subscribers, feed.premium_subscribers))
    if self.options['slave_db']:
        # Offload the story read to the replica.
        slave_db = self.options['slave_db']
        stories_db_orig = slave_db.stories.find({
            "story_feed_id": feed.pk,
            "story_date": {
                "$gte": UNREAD_CUTOFF,
            },
        })
        stories_db = []
        for story in stories_db_orig:
            stories_db.append(bunch(story))
    else:
        stories_db = MStory.objects(story_feed_id=feed.pk, story_date__gte=UNREAD_CUTOFF)
    for sub in user_subs:
        # Invalidate the cached subscription and flag it for recount.
        cache.delete('usersub:%s' % sub.user_id)
        sub.needs_unread_recalc = True
        sub.save()
    if self.options['compute_scores']:
        for sub in user_subs:
            # verbose >= 2 enables per-subscription logging.
            silent = False if self.options['verbose'] >= 2 else True
            sub.calculate_feed_scores(silent=silent, stories_db=stories_db)
def more_like_this(request):
    """Return stories similar to ``story_hash``, paginated and date-sorted.

    Pagination comes from ``page``/``limit``; ``order`` of "newest" sorts
    descending by story date, anything else ascending.
    """
    # get_user() retained for its request-user resolution; its result was
    # previously only consumed by dead code (see below).
    user = get_user(request)
    get_post = getattr(request, request.method)
    order = get_post.get('order', 'newest')
    page = int(get_post.get('page', 1))
    limit = int(get_post.get('limit', 10))
    offset = limit * (page - 1)
    story_hash = get_post.get('story_hash')
    # BUG FIX: a UserSubscription query used to build ``feed_ids`` here, but
    # its result was immediately overwritten by split_story_hash() — that
    # dead database round-trip has been removed.
    feed_ids, _ = MStory.split_story_hash(story_hash)
    story_ids = SearchStory.more_like_this([feed_ids], story_hash, order,
                                           offset=offset, limit=limit)
    stories_db = MStory.objects(story_hash__in=story_ids).order_by(
        '-story_date' if order == "newest" else 'story_date')
    stories = Feed.format_stories(stories_db)
    return {
        "stories": stories,
    }
def mark_feed_read(self):
    """Zero out unread counters, anchoring read dates one second past the
    newest story (or to now for an empty feed).  No-op if nothing is unread."""
    counts_clear = (self.unread_count_negative == 0
                    and self.unread_count_neutral == 0
                    and self.unread_count_positive == 0)
    if counts_clear and not self.needs_unread_recalc:
        return

    now = datetime.datetime.utcnow()
    # Use the latest story to anchor the last-read time.
    latest_story = MStory.objects(story_feed_id=self.feed.pk).order_by('-story_date').only('story_date').limit(1)
    if latest_story and len(latest_story) >= 1:
        latest_story_date = latest_story[0]['story_date'] + datetime.timedelta(seconds=1)
    else:
        latest_story_date = now

    self.last_read_date = latest_story_date
    self.mark_read_date = latest_story_date
    self.unread_count_negative = 0
    self.unread_count_positive = 0
    self.unread_count_neutral = 0
    self.unread_count_updated = now
    self.oldest_unread_story_date = now
    self.needs_unread_recalc = False
    # No longer removing old user read stories, since they're needed for social,
    # and they get cleaned up automatically when new stories come in.
    # MUserStory.delete_old_stories(self.user_id, self.feed_id)
    self.save()
def mark_story_as_read(request):
    """Record a read-receipt (MUserStory) for each posted story id."""
    story_ids = request.REQUEST.getlist('story_id')
    feed_id = int(request.REQUEST['feed_id'])
    usersub = UserSubscription.objects.select_related('feed').get(user=request.user, feed=feed_id)
    if not usersub.needs_unread_recalc:
        usersub.needs_unread_recalc = True
        usersub.save()
    data = dict(code=0, payload=story_ids)

    if len(story_ids) > 1:
        logging.debug(" ---> [%s] Read %s stories in feed: %s" % (request.user, len(story_ids), usersub.feed))
    else:
        logging.debug(" ---> [%s] Read story in feed: %s" % (request.user, usersub.feed))

    for story_id in story_ids:
        story = MStory.objects(story_feed_id=feed_id, story_guid=story_id)[0]
        receipt = MUserStory(story=story, user_id=request.user.pk,
                             feed_id=feed_id, read_date=datetime.datetime.utcnow())
        try:
            receipt.save()
        except OperationError:
            # Already marked read; a duplicate receipt is harmless.
            logging.info(' ---> [%s] *** Marked story as read: Duplicate Story -> %s' % (request.user, story_id))
    return data
def count_unreads_for_subscribers(self, feed):
    """Flag each active, recently-seen subscriber of ``feed`` for an unread
    recount, and (when requested) recompute their feed scores."""
    UNREAD_CUTOFF = datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD)
    user_subs = UserSubscription.objects.filter(
        feed=feed, active=True, user__profile__last_seen_on__gte=UNREAD_CUTOFF
    ).order_by("-last_read_date")
    logging.debug(
        u" ---> [%-30s] Computing scores: %s (%s/%s/%s) subscribers"
        % (unicode(feed)[:30], user_subs.count(), feed.num_subscribers,
           feed.active_subscribers, feed.premium_subscribers)
    )
    recent_stories = MStory.objects(story_feed_id=feed.pk, story_date__gte=UNREAD_CUTOFF)
    # Invalidate each subscriber's cache entry and flag them for recount.
    for sub in user_subs:
        cache.delete("usersub:%s" % sub.user_id)
        sub.needs_unread_recalc = True
        sub.save()
    if self.options["compute_scores"]:
        for sub in user_subs:
            # verbose >= 2 enables per-subscription logging.
            sub.calculate_feed_scores(silent=self.options["verbose"] < 2,
                                      stories_db=recent_stories)
def count_unreads_for_subscribers(self, feed):
    """Mark active recent subscribers for unread recalculation, then
    batch-recompute their scores unless mongo replication is lagging."""
    UNREAD_CUTOFF = datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD)
    user_subs = UserSubscription.objects.filter(
        feed=feed, active=True, user__profile__last_seen_on__gte=UNREAD_CUTOFF
    ).order_by("-last_read_date")
    for sub in user_subs:
        if sub.needs_unread_recalc:
            continue
        sub.needs_unread_recalc = True
        sub.save()
    if self.options["compute_scores"]:
        stories_db = MStory.objects(story_feed_id=feed.pk, story_date__gte=UNREAD_CUTOFF)
        logging.debug(
            u" ---> [%-30s] ~FYComputing scores: ~SB%s stories~SN with ~SB%s subscribers ~SN(%s/%s/%s)"
            % (feed.title[:30], stories_db.count(), user_subs.count(),
               feed.num_subscribers, feed.active_subscribers, feed.premium_subscribers)
        )
        self.calculate_feed_scores_with_stories(user_subs, stories_db)
    elif self.options.get("mongodb_replication_lag"):
        logging.debug(
            u" ---> [%-30s] ~BR~FYSkipping computing scores: ~SB%s seconds~SN of mongodb lag"
            % (feed.title[:30], self.options.get("mongodb_replication_lag"))
        )
def mark_story_as_unread(request):
    """Mark one story unread for the logged-in user by deleting its
    read-receipt, falling back to a duplicate feed's subscription when the
    posted feed id has been merged away.
    """
    story_id = request.POST['story_id']
    feed_id = int(request.POST['feed_id'])
    try:
        usersub = UserSubscription.objects.select_related('feed').get(user=request.user, feed=feed_id)
    except Feed.DoesNotExist:
        # NOTE(review): the ``get`` above raises UserSubscription.DoesNotExist,
        # not Feed.DoesNotExist — verify this fallback is ever reached.
        duplicate_feed = DuplicateFeed.objects.filter(duplicate_feed_id=feed_id)
        if duplicate_feed:
            try:
                usersub = UserSubscription.objects.get(user=request.user,
                                                       feed=duplicate_feed[0].feed)
            except Feed.DoesNotExist:
                return dict(code=-1)
    if not usersub.needs_unread_recalc:
        usersub.needs_unread_recalc = True
        usersub.save()
    data = dict(code=0, payload=dict(story_id=story_id))
    logging.user(request.user, "~FY~SBUnread~SN story in feed: %s" % (usersub.feed))
    story = MStory.objects(story_feed_id=feed_id, story_guid=story_id)[0]
    # Deleting the MUserStory read-receipt makes the story unread again.
    m = MUserStory.objects(story=story, user_id=request.user.pk, feed_id=feed_id)
    m.delete()
    return data
def imagesearch(request, page_num):
    """Search stories for ``q`` and gather up to 50 image links to render.

    NOTE(review): the template context is ``locals()``, so every local name
    in this function is part of the page's contract — rename with care.
    """
    if page_num == '':
        page_num = '1'
    page_num = int(page_num)
    num_per_page = 15
    q = request.GET.get('q', None)
    if q:
        image_server = settings.FDFS_HTTP_SERVER
        # Cap candidate stories at the top 500 search hits.
        index_stories = SearchStory.query(q)[:500]
        response_images = []
        for index_story in index_stories:
            story = MStory.objects(id=index_story['db_id']).first()
            if story and story.image_ids:
                for image_id in story.image_ids:
                    # Ids of 20 chars or fewer are skipped — presumably
                    # malformed/placeholder ids; TODO confirm.
                    if len(image_id) > 20:
                        # print image_id
                        image = MImage.objects(id=image_id).first()
                        imagedict = dict(
                            image_url=image_server + image.image_remote_id,
                            story_url=story.story_guid,
                            story_title=story.story_title,
                        )
                        response_images.append(imagedict)
                        # Stop as soon as 50 images are gathered.
                        if len(response_images) >= 50:
                            return render(request, 'imagesearch.html', locals())
    return render(request, 'imagesearch.html', locals())
def mark_feed_read(self):
    """Reset all unread counters; read dates land one second past the newest
    story (or at now for an empty feed).  Returns True when work was done,
    None when there was nothing unread."""
    nothing_unread = (self.unread_count_negative == 0
                      and self.unread_count_neutral == 0
                      and self.unread_count_positive == 0
                      and not self.needs_unread_recalc)
    if nothing_unread:
        return

    now = datetime.datetime.utcnow()
    # Use the latest story to anchor the last-read time.
    newest = MStory.objects(story_feed_id=self.feed.pk).order_by('-story_date').only('story_date').limit(1)
    if newest and len(newest) >= 1:
        anchor = newest[0]['story_date'] + datetime.timedelta(seconds=1)
    else:
        anchor = now

    self.last_read_date = anchor
    self.mark_read_date = anchor
    self.unread_count_negative = 0
    self.unread_count_positive = 0
    self.unread_count_neutral = 0
    self.unread_count_updated = now
    self.oldest_unread_story_date = now
    self.needs_unread_recalc = False
    self.save()
    return True
def count_unreads_for_subscribers(self, feed):
    """Flag recent subscribers for a recount and recompute scores from a
    primary-read copy of the feed's recent stories."""
    UNREAD_CUTOFF = datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD)
    user_subs = UserSubscription.objects.filter(
        feed=feed, active=True,
        user__profile__last_seen_on__gte=UNREAD_CUTOFF).order_by('-last_read_date')
    if not user_subs.count():
        return
    for sub in user_subs:
        if sub.needs_unread_recalc:
            continue
        sub.needs_unread_recalc = True
        sub.save()
    if self.options['compute_scores']:
        # Read from the primary so freshly-written stories are included.
        stories = MStory.objects(
            story_feed_id=feed.pk,
            story_date__gte=UNREAD_CUTOFF).read_preference(pymongo.ReadPreference.PRIMARY)
        stories = Feed.format_stories(stories, feed.pk)
        logging.debug(u' ---> [%-30s] ~FYComputing scores: ~SB%s stories~SN with ~SB%s subscribers ~SN(%s/%s/%s)' % (
            feed.title[:30], len(stories), user_subs.count(), feed.num_subscribers,
            feed.active_subscribers, feed.premium_subscribers))
        self.calculate_feed_scores_with_stories(user_subs, stories)
    elif self.options.get('mongodb_replication_lag'):
        logging.debug(u' ---> [%-30s] ~BR~FYSkipping computing scores: ~SB%s seconds~SN of mongodb lag' % (
            feed.title[:30], self.options.get('mongodb_replication_lag')))
def bootstrap_userstories(): print "Mongo DB userstories: %s" % MUserStory.objects().count() # db.userstories.drop() print "Dropped! Mongo DB userstories: %s" % MUserStory.objects().count() print "UserStories: %s" % UserStory.objects.all().count() pprint(db.userstories.index_information()) userstories = UserStory.objects.all().values() for userstory in userstories: try: story = Story.objects.get(pk=userstory['story_id']) except Story.DoesNotExist: continue try: userstory['story'] = MStory.objects( story_feed_id=story.story_feed.pk, story_guid=story.story_guid)[0] except: print '!', continue print '.', del userstory['id'] del userstory['opinion'] del userstory['story_id'] try: MUserStory(**userstory).save() except: print '\n\n!\n\n' continue print "\nMongo DB userstories: %s" % MUserStory.objects().count()
def count_unreads_for_subscribers(self, feed):
    """Queue unread recounts for active recent subscribers and, when scoring
    is enabled, recompute scores from primary-read recent stories."""
    cutoff = datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD)
    user_subs = UserSubscription.objects.filter(
        feed=feed, active=True,
        user__profile__last_seen_on__gte=cutoff).order_by('-last_read_date')
    if not user_subs.count():
        return

    for sub in user_subs:
        if not sub.needs_unread_recalc:
            sub.needs_unread_recalc = True
            sub.save()

    if self.options['compute_scores']:
        # Primary read avoids missing stories on a lagging secondary.
        recent = MStory.objects(
            story_feed_id=feed.pk,
            story_date__gte=cutoff).read_preference(pymongo.ReadPreference.PRIMARY)
        formatted = Feed.format_stories(recent, feed.pk)
        logging.debug(
            u' ---> [%-30s] ~FYComputing scores: ~SB%s stories~SN with ~SB%s subscribers ~SN(%s/%s/%s)'
            % (feed.title[:30], len(formatted), user_subs.count(),
               feed.num_subscribers, feed.active_subscribers, feed.premium_subscribers))
        self.calculate_feed_scores_with_stories(user_subs, formatted)
    elif self.options.get('mongodb_replication_lag'):
        logging.debug(
            u' ---> [%-30s] ~BR~FYSkipping computing scores: ~SB%s seconds~SN of mongodb lag'
            % (feed.title[:30], self.options.get('mongodb_replication_lag')))
def mark_feed_read(self):
    """Mark the whole feed read: zero counters and push read dates one
    second past the newest story (or to now if the feed is empty)."""
    now = datetime.datetime.utcnow()
    # Use the latest story to anchor the last-read time.
    newest = MStory.objects(story_feed_id=self.feed.pk).order_by('-story_date').only('story_date').limit(1)
    if newest and len(newest) >= 1:
        read_date = newest[0]['story_date'] + datetime.timedelta(seconds=1)
    else:
        read_date = now
    self.last_read_date = read_date
    self.mark_read_date = read_date
    self.unread_count_negative = 0
    self.unread_count_positive = 0
    self.unread_count_neutral = 0
    self.unread_count_updated = now
    self.oldest_unread_story_date = now
    self.needs_unread_recalc = False
    # No longer removing old user read stories, since they're needed for social,
    # and they get cleaned up automatically when new stories come in.
    # MUserStory.delete_old_stories(self.user_id, self.feed_id)
    self.save()
def mark_story_as_read(request):
    """Mark one or more stories in a feed as read for the requesting user.

    Reads `story_id` (repeatable) and `feed_id` from the request and returns
    a dict with `code` (0 on success, -1 when no subscription resolves) and
    the story ids as `payload`.
    """
    story_ids = request.REQUEST.getlist("story_id")
    feed_id = int(request.REQUEST["feed_id"])
    try:
        usersub = UserSubscription.objects.select_related("feed").get(user=request.user, feed=feed_id)
    except (UserSubscription.DoesNotExist, Feed.DoesNotExist):
        # UserSubscription.objects.get raises UserSubscription.DoesNotExist;
        # the original only caught Feed.DoesNotExist, so a missing sub blew up.
        # The feed may also have been merged away; check duplicates.
        duplicate_feed = DuplicateFeed.objects.filter(duplicate_feed_id=feed_id)
        if duplicate_feed:
            try:
                usersub = UserSubscription.objects.get(user=request.user,
                                                       feed=duplicate_feed[0].feed)
            except (UserSubscription.DoesNotExist, Feed.DoesNotExist):
                return dict(code=-1)
        else:
            # Previously fell through here and raised NameError on `usersub`.
            return dict(code=-1)
    if not usersub.needs_unread_recalc:
        usersub.needs_unread_recalc = True
        usersub.save()
    data = dict(code=0, payload=story_ids)
    if len(story_ids) > 1:
        logging.info(" ---> [%s] ~FYRead %s stories in feed: %s" % (request.user, len(story_ids), usersub.feed))
    else:
        logging.info(" ---> [%s] ~FYRead story in feed: %s" % (request.user, usersub.feed))
    for story_id in story_ids:
        try:
            story = MStory.objects(story_feed_id=feed_id, story_guid=story_id)[0]
        except IndexError:
            # Story was deleted (e.g. by the feed fetcher) between load and
            # read; skip instead of raising.
            continue
        now = datetime.datetime.utcnow()
        m = MUserStory(story=story, user_id=request.user.pk, feed_id=feed_id, read_date=now)
        try:
            m.save()
        except OperationError:
            # Already marked read; keep the request idempotent.
            logging.info(" ---> [%s] ~BRMarked story as read: Duplicate Story -> %s" % (request.user, story_id))
    return data
def bootstrap_userstories(): print "Mongo DB userstories: %s" % MUserStory.objects().count() # db.userstories.drop() print "Dropped! Mongo DB userstories: %s" % MUserStory.objects().count() print "UserStories: %s" % UserStory.objects.all().count() pprint(db.userstories.index_information()) userstories = UserStory.objects.all().values() for userstory in userstories: try: story = Story.objects.get(pk=userstory['story_id']) except Story.DoesNotExist: continue try: userstory['story'] = MStory.objects(story_feed_id=story.story_feed.pk, story_guid=story.story_guid)[0] except: print '!', continue print '.', del userstory['id'] del userstory['opinion'] del userstory['story_id'] try: MUserStory(**userstory).save() except: print '\n\n!\n\n' continue print "\nMongo DB userstories: %s" % MUserStory.objects().count()
def mark_story_as_unread(request):
    """Mark a single story as unread by deleting the user's read record.

    Returns a dict with `code` (0 on success, -1 when no subscription
    resolves) and the story id as `payload`.
    """
    story_id = request.POST['story_id']
    feed_id = int(request.POST['feed_id'])
    try:
        usersub = UserSubscription.objects.select_related('feed').get(user=request.user, feed=feed_id)
    except (UserSubscription.DoesNotExist, Feed.DoesNotExist):
        # UserSubscription.objects.get raises UserSubscription.DoesNotExist;
        # the original only caught Feed.DoesNotExist, so a missing sub blew up.
        duplicate_feed = DuplicateFeed.objects.filter(duplicate_feed_id=feed_id)
        if duplicate_feed:
            try:
                usersub = UserSubscription.objects.get(user=request.user,
                                                       feed=duplicate_feed[0].feed)
            except (UserSubscription.DoesNotExist, Feed.DoesNotExist):
                return dict(code=-1)
        else:
            # Previously fell through here and raised NameError on `usersub`.
            return dict(code=-1)
    if not usersub.needs_unread_recalc:
        usersub.needs_unread_recalc = True
        usersub.save()
    data = dict(code=0, payload=dict(story_id=story_id))
    logging.user(request, "~FY~SBUnread~SN story in feed: %s" % (usersub.feed))
    try:
        story = MStory.objects(story_feed_id=feed_id, story_guid=story_id)[0]
    except IndexError:
        # Story has been deleted (e.g. by the feed fetcher); nothing to unread.
        return data
    m = MUserStory.objects(story=story, user_id=request.user.pk, feed_id=feed_id)
    m.delete()
    return data
def mark_feed_read(self):
    """Mark the whole feed as read for this subscription.

    Returns True when counts were reset; returns None (no-op) when the feed
    is already fully read and no recalculation is pending.
    """
    # Short-circuit: nothing to do if all counts are already zero.
    if (self.unread_count_negative == 0
            and self.unread_count_neutral == 0
            and self.unread_count_positive == 0
            and not self.needs_unread_recalc):
        return
    now = datetime.datetime.utcnow()
    # Use the latest story to get last read time.
    latest_story = MStory.objects(story_feed_id=self.feed.pk).order_by('-story_date').only('story_date').limit(1)
    if latest_story and len(latest_story) >= 1:
        # One second past the newest story so it is counted as read.
        latest_story_date = latest_story[0]['story_date']\
                            + datetime.timedelta(seconds=1)
    else:
        latest_story_date = now
    self.last_read_date = latest_story_date
    self.mark_read_date = latest_story_date
    self.unread_count_negative = 0
    self.unread_count_positive = 0
    self.unread_count_neutral = 0
    self.unread_count_updated = now
    self.oldest_unread_story_date = now
    self.needs_unread_recalc = False
    self.save()
    return True
def mark_feed_read(self, cutoff_date=None):
    """Mark the feed as read for this subscription, optionally only up to
    `cutoff_date`.

    When a cutoff is supplied, counts are not zeroed here; instead
    `needs_unread_recalc` is set so stories newer than the cutoff are
    recounted later. Returns True when anything changed, None otherwise.
    """
    # Short-circuit: nothing to do if all counts are already zero.
    if (self.unread_count_negative == 0
            and self.unread_count_neutral == 0
            and self.unread_count_positive == 0
            and not self.needs_unread_recalc):
        return
    recount = True
    # Use the latest story to get last read time.
    if cutoff_date:
        # Explicit cutoff: nudge past it so the story at the cutoff reads as read.
        cutoff_date = cutoff_date + datetime.timedelta(seconds=1)
    else:
        latest_story = MStory.objects(story_feed_id=self.feed.pk).order_by('-story_date').only('story_date').limit(1)
        if latest_story and len(latest_story) >= 1:
            cutoff_date = (latest_story[0]['story_date']
                           + datetime.timedelta(seconds=1))
        else:
            cutoff_date = datetime.datetime.utcnow()
        # Whole feed read: no stories can remain unread, so skip recount.
        recount = False
    self.last_read_date = cutoff_date
    self.mark_read_date = cutoff_date
    self.oldest_unread_story_date = cutoff_date
    if not recount:
        self.unread_count_negative = 0
        self.unread_count_positive = 0
        self.unread_count_neutral = 0
        self.unread_count_updated = datetime.datetime.utcnow()
        self.needs_unread_recalc = False
    else:
        # Partial read-up-to-cutoff: defer counting to the recalc pass.
        self.needs_unread_recalc = True
    self.save()
    return True
def test_load_feeds__google(self): # Freezegun the date to 2017-04-30 self.client.login(username='******', password='******') old_story_guid = "blog.google:443/topics/inside-google/google-earths-incredible-3d-imagery-explained/" management.call_command('loaddata', 'google1.json', verbosity=1) print Feed.objects.all() feed = Feed.objects.get(pk=766) print " Testing test_load_feeds__google: %s" % feed stories = MStory.objects(story_feed_id=feed.pk) self.assertEquals(stories.count(), 0) management.call_command('refresh_feed', force=False, feed=766, single_threaded=True, daemonize=False) stories = MStory.objects(story_feed_id=feed.pk) self.assertEquals(stories.count(), 20) response = self.client.get(reverse('load-feeds')+"?update_counts=true") content = json.decode(response.content) self.assertEquals(content['feeds']['766']['nt'], 20) old_story = MStory.objects.get(story_feed_id=feed.pk, story_guid__contains=old_story_guid) self.client.post(reverse('mark-story-hashes-as-read'), {'story_hash': old_story.story_hash}) response = self.client.get(reverse('refresh-feeds')) content = json.decode(response.content) self.assertEquals(content['feeds']['766']['nt'], 19) management.call_command('loaddata', 'google2.json', verbosity=1) management.call_command('refresh_feed', force=False, feed=766, single_threaded=True, daemonize=False) stories = MStory.objects(story_feed_id=feed.pk) self.assertEquals(stories.count(), 20) url = reverse('load-single-feed', kwargs=dict(feed_id=766)) response = self.client.get(url) # pprint([c['story_title'] for c in json.decode(response.content)]) feed = json.decode(response.content) # Test: 1 changed char in title self.assertEquals(len(feed['stories']), 6) response = self.client.get(reverse('refresh-feeds')) content = json.decode(response.content) self.assertEquals(content['feeds']['766']['nt'], 19)
def mark_story_as_read(request):
    """Mark one or more stories in a feed as read for the requesting user.

    Reads `story_id` (repeatable) and `feed_id` from the request. Returns a
    dict with `code` (0 on success, -1 when no subscription resolves) and
    the story ids as `payload`.
    """
    story_ids = request.REQUEST.getlist('story_id')
    feed_id = int(request.REQUEST['feed_id'])
    try:
        usersub = UserSubscription.objects.select_related('feed').get(
            user=request.user, feed=feed_id)
    except (UserSubscription.DoesNotExist, Feed.DoesNotExist):
        # The feed may have been merged away; check the duplicate-feed map.
        duplicate_feed = DuplicateFeed.objects.filter(
            duplicate_feed_id=feed_id)
        if duplicate_feed:
            try:
                usersub = UserSubscription.objects.get(
                    user=request.user, feed=duplicate_feed[0].feed)
            except (UserSubscription.DoesNotExist, Feed.DoesNotExist):
                return dict(code=-1)
        else:
            return dict(code=-1)
    if not usersub.needs_unread_recalc:
        usersub.needs_unread_recalc = True
        usersub.save()
    data = dict(code=0, payload=story_ids)
    if len(story_ids) > 1:
        logging.user(
            request.user,
            "~FYRead %s stories in feed: %s" % (len(story_ids), usersub.feed))
    else:
        logging.user(request.user, "~FYRead story in feed: %s" % (usersub.feed))
    for story_id in story_ids:
        try:
            story = MStory.objects(story_feed_id=feed_id, story_guid=story_id)[0]
        except IndexError:
            # Story has been deleted, probably by feed_fetcher.
            continue
        now = datetime.datetime.utcnow()
        date = now if now > story.story_date else story.story_date  # For handling future stories
        m = MUserStory(story=story, user_id=request.user.pk, feed_id=feed_id, read_date=date)
        try:
            m.save()
        except OperationError:
            # Already marked read: update the existing record's read date
            # instead of failing, keeping the request idempotent.
            logging.user(
                request.user,
                "~BRMarked story as read: Duplicate Story -> %s" % (story_id))
            m = MUserStory.objects.get(story=story, user_id=request.user.pk, feed_id=feed_id)
            m.read_date = date
            m.save()
    return data
def count_unreads_for_subscribers(self, feed):
    """Flag active, recently-seen subscribers of `feed` for unread
    recalculation and, when `compute_scores` is set, recompute their feed
    scores — backfilling from the Mongo primary any recent stories that the
    secondary read missed.
    """
    user_subs = UserSubscription.objects.filter(feed=feed, active=True,
                                                user__profile__last_seen_on__gte=feed.unread_cutoff)\
                                        .order_by('-last_read_date')
    if not user_subs.count():
        return
    for sub in user_subs:
        if not sub.needs_unread_recalc:
            sub.needs_unread_recalc = True
            sub.save()
    if self.options['compute_scores']:
        r = redis.Redis(connection_pool=settings.REDIS_STORY_HASH_POOL)
        stories = MStory.objects(story_feed_id=feed.pk, story_date__gte=feed.unread_cutoff)
        stories = Feed.format_stories(stories, feed.pk)
        # Redis zF:<feed> holds recent story hashes scored by time; anything
        # Redis knows about that the Mongo read lacks is considered
        # "un-secondaried" and re-fetched from the primary below.
        story_hashes = r.zrangebyscore('zF:%s' % feed.pk, int(feed.unread_cutoff.strftime('%s')), int(time.time() + 60 * 60 * 24))
        missing_story_hashes = set(story_hashes) - set([s['story_hash'] for s in stories])
        if missing_story_hashes:
            missing_stories = MStory.objects(story_feed_id=feed.pk, story_hash__in=missing_story_hashes)\
                                    .read_preference(pymongo.ReadPreference.PRIMARY)
            missing_stories = Feed.format_stories(missing_stories, feed.pk)
            stories = missing_stories + stories
            logging.debug(u' ---> [%-30s] ~FYFound ~SB~FC%s(of %s)/%s~FY~SN un-secondaried stories while computing scores' % (feed.title[:30], len(missing_stories), len(missing_story_hashes), len(stories)))
        # Cache formatted stories briefly so scoring passes can reuse them.
        cache.set("S:%s" % feed.pk, stories, 60)
        logging.debug(u' ---> [%-30s] ~FYComputing scores: ~SB%s stories~SN with ~SB%s subscribers ~SN(%s/%s/%s)' % (feed.title[:30], len(stories), user_subs.count(), feed.num_subscribers, feed.active_subscribers, feed.premium_subscribers))
        self.calculate_feed_scores_with_stories(user_subs, stories)
    elif self.options.get('mongodb_replication_lag'):
        logging.debug(u' ---> [%-30s] ~BR~FYSkipping computing scores: ~SB%s seconds~SN of mongodb lag' % (feed.title[:30], self.options.get('mongodb_replication_lag')))
def handle(self, *args, **options):
    """Find feed pairs that look like duplicates (same link, title, and
    tagline) and merge each pair when a sample of the duplicate's stories
    also exists on the original feed.
    """
    cursor = connection.cursor()
    # Self-join feeds on identical link/title, joined to feeddata for the
    # tagline comparison; f2.id > f.id keeps each pair once with the older
    # feed as the original.
    cursor.execute(
        """SELECT DISTINCT f.id AS original_id, f2.id AS duplicate_id, f.feed_address AS original_feed_address, f2.feed_address AS duplicate_feed_address, f.feed_title AS original_feed_title, f2.feed_title AS duplicate_feed_title, f.feed_link AS original_feed_link, f2.feed_link AS duplicate_feed_link, fd2.feed_tagline AS original_feed_tagline, fd.feed_tagline AS duplicate_feed_tagline FROM feeds f, feeds f2 INNER JOIN rss_feeds_feeddata fd ON fd.feed_id = f.feed_id INNER JOIN rss_feeds_feeddata fd2 ON fd2.feed_id = f2.feed_id WHERE f2.id > f.id AND fd.feed_tagline = fd2.feed_tagline AND f.feed_link = f2.feed_link AND f.feed_title = f2.feed_title ORDER BY original_id ASC;""")
    feed_fields = ('original_id', 'duplicate_id', 'original_feed_address',
                   'duplicate_feed_address')
    skips = 0
    merges = 0
    for feeds_values in cursor.fetchall():
        feeds = dict(zip(feed_fields, feeds_values))
        # Sample a small slice ([5:8]) of the duplicate's stories and only
        # merge when every sampled guid also exists on the original.
        duplicate_stories = MStory.objects(
            story_feed_id=feeds['duplicate_id']).only('story_guid')[5:8]
        duplicate_story_ids = [
            story.story_guid for story in duplicate_stories
        ]
        original_stories = MStory.objects(
            story_feed_id=feeds['original_id'],
            story_guid__in=duplicate_story_ids)
        if duplicate_stories.count() == original_stories.count():
            merges += 1
            merge_feeds(feeds['original_id'], feeds['duplicate_id'])
        else:
            # print duplicate_stories
            # print duplicate_story_ids
            # print original_stories
            # print "Skipping: %s" % feeds
            skips += 1
    print "Skips: %s, Merges: %s" % (skips, merges)
def calculate_metrics():
    """Return total story and read-story counts from Mongo."""
    # Imported inside the function, presumably to avoid an import cycle at
    # module load time — TODO confirm.
    from apps.rss_feeds.models import MStory
    from apps.reader.models import MUserStory
    story_total = MStory.objects().count()
    read_total = MUserStory.objects().count()
    return {'stories': story_total, 'read_stories': read_total}
def calculate_metrics(self):
    """Return total story and read-story counts from Mongo."""
    # Imported inside the method, presumably to avoid an import cycle at
    # module load time — TODO confirm.
    from apps.rss_feeds.models import MStory
    from apps.reader.models import MUserStory
    story_total = MStory.objects().count()
    read_total = MUserStory.objects().count()
    return {'stories': story_total, 'read_stories': read_total}
def test_load_feeds__slashdot(self):
    """Load two slashdot fixtures, refresh feed 5 twice, mark one story
    read, and verify story and unread counts at each step.

    Uses assertEqual throughout; assertEquals is a deprecated alias.
    """
    self.client.login(username='******', password='******')
    old_story_guid = "{'original-id': u'http://yro.slashdot.org/story/09/09/05/0112254/Court-Allows-Microsoft-To-Sell-Word-During-Appeal?from=rss', 'gr:original-id': u'http://yro.slashdot.org/story/09/09/05/0112254/Court-Allows-Microsoft-To-Sell-Word-During-Appeal?from=rss'}"
    new_story_guid = "{'original-id': u'http://yro.slashdot.org/story/09/09/05/0112254/Court-Allows-Microsoft-To-Sell-Word-During-Appeal?from=rss!!', 'gr:original-id': u'http://yro.slashdot.org/story/09/09/05/0112254/Court-Allows-Microsoft-To-Sell-Word-During-Appeal?from=rss!!'}"
    management.call_command('loaddata', 'slashdot1.json', verbosity=0)
    feed = Feed.objects.get(feed_link__contains='slashdot')
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEqual(stories.count(), 0)
    management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEqual(stories.count(), 38)
    response = self.client.get(reverse('load-feeds'))
    content = json.decode(response.content)
    self.assertEqual(content['feeds']['5']['nt'], 38)
    self.client.post(reverse('mark-story-as-read'), {'story_id': old_story_guid, 'feed_id': 5})
    response = self.client.get(reverse('refresh-feeds'))
    content = json.decode(response.content)
    self.assertEqual(content['feeds']['5']['nt'], 37)
    management.call_command('loaddata', 'slashdot2.json', verbosity=0)
    management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEqual(stories.count(), 38)
    url = reverse('load-single-feed', kwargs=dict(feed_id=5))
    response = self.client.get(url)
    # pprint([c['story_title'] for c in json.decode(response.content)])
    feed = json.decode(response.content)
    # Test: 1 changed char in title
    self.assertEqual(len(feed['stories']), 12)
    response = self.client.get(reverse('refresh-feeds'))
    content = json.decode(response.content)
    self.assertEqual(content['feeds']['5']['nt'], 37)
def test_load_feeds__slashdot(self):
    """Load two slashdot fixtures, refresh feed 5 twice, mark one story
    read, and verify story and unread counts at each step.

    Uses assertEqual throughout; assertEquals is a deprecated alias.
    """
    self.client.login(username='******', password='******')
    old_story_guid = "tag:google.com,2005:reader/item/4528442633bc7b2b"
    management.call_command('loaddata', 'slashdot1.json', verbosity=0)
    feed = Feed.objects.get(feed_link__contains='slashdot')
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEqual(stories.count(), 0)
    management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEqual(stories.count(), 38)
    response = self.client.get(reverse('load-feeds'))
    content = json.decode(response.content)
    self.assertEqual(content['feeds']['5']['nt'], 38)
    self.client.post(reverse('mark-story-as-read'), {'story_id': old_story_guid, 'feed_id': 5})
    response = self.client.get(reverse('refresh-feeds'))
    content = json.decode(response.content)
    self.assertEqual(content['feeds']['5']['nt'], 37)
    management.call_command('loaddata', 'slashdot2.json', verbosity=0)
    management.call_command('refresh_feed', force=1, feed=5, single_threaded=True, daemonize=False)
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEqual(stories.count(), 38)
    url = reverse('load-single-feed', kwargs=dict(feed_id=5))
    response = self.client.get(url)
    # pprint([c['story_title'] for c in json.decode(response.content)])
    feed = json.decode(response.content)
    # Test: 1 changed char in title
    self.assertEqual(len(feed['stories']), 6)
    response = self.client.get(reverse('refresh-feeds'))
    content = json.decode(response.content)
    self.assertEqual(content['feeds']['5']['nt'], 37)
def test_load_feeds__motherjones(self):
    """Load two motherjones fixtures, refresh the feed twice, mark one
    story read, and verify story and unread counts at each step.

    Uses assertEqual throughout; assertEquals is a deprecated alias.
    """
    self.client.login(username='******', password='******')
    management.call_command('loaddata', 'motherjones1.json', verbosity=0)
    feed = Feed.objects.get(feed_link__contains='motherjones')
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEqual(stories.count(), 0)
    management.call_command('refresh_feed', force=1, feed=feed.pk, single_threaded=True, daemonize=False)
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEqual(stories.count(), 10)
    response = self.client.get(reverse('load-feeds'))
    content = json.decode(response.content)
    self.assertEqual(content['feeds'][str(feed.pk)]['nt'], 10)
    self.client.post(reverse('mark-story-as-read'), {'story_id': stories[0].story_guid, 'feed_id': feed.pk})
    response = self.client.get(reverse('refresh-feeds'))
    content = json.decode(response.content)
    self.assertEqual(content['feeds'][str(feed.pk)]['nt'], 9)
    management.call_command('loaddata', 'motherjones2.json', verbosity=0)
    management.call_command('refresh_feed', force=1, feed=feed.pk, single_threaded=True, daemonize=False)
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEqual(stories.count(), 10)
    url = reverse('load-single-feed', kwargs=dict(feed_id=feed.pk))
    response = self.client.get(url)
    # pprint([c['story_title'] for c in json.decode(response.content)])
    feed = json.decode(response.content)
    # Test: 1 changed char in title
    self.assertEqual(len(feed['stories']), 6)
    response = self.client.get(reverse('refresh-feeds'))
    content = json.decode(response.content)
    self.assertEqual(content['feeds'][str(feed['feed_id'])]['nt'], 9)
def mark_feed_read(self):
    """Mark the whole feed as read for this subscription: zero the unread
    counts, move the read watermark past the newest story, and purge the
    user's marked-as-read records for this feed."""
    now = datetime.datetime.utcnow()
    # Use the latest story to get last read time. Fetch it once with
    # first() instead of the original's two queries (an existence check
    # followed by a re-query for the date).
    latest_story = MStory.objects(story_feed_id=self.feed.pk)\
                         .order_by('-story_date').only('story_date').first()
    if latest_story:
        # One second past the newest story so it is counted as read.
        latest_story_date = latest_story['story_date']\
                            + datetime.timedelta(seconds=1)
    else:
        latest_story_date = now
    self.last_read_date = latest_story_date
    self.mark_read_date = latest_story_date
    self.unread_count_negative = 0
    self.unread_count_positive = 0
    self.unread_count_neutral = 0
    # NOTE(review): other variants of this method set unread_count_updated
    # to `now`; this one uses the story date — preserved as-is, confirm intent.
    self.unread_count_updated = latest_story_date
    self.needs_unread_recalc = False
    MUserStory.delete_marked_as_read_stories(self.user.pk, self.feed.pk)
    self.save()
def compress_stories(): count = MStory.objects().count() print "Mongo DB stories: %s" % count p = 0.0 i = 0 feeds = Feed.objects.all().order_by('-average_stories_per_month') feed_count = feeds.count() f = 0 for feed in feeds: f += 1 print "%s/%s: %s" % (f, feed_count, feed,) sys.stdout.flush() for story in MStory.objects(story_feed_id=feed.pk): i += 1.0 if round(i / count * 100) != p: p = round(i / count * 100) print '%s%%' % p story.save()
def test_load_feeds__gothamist(self):
    """Load two gothamist fixtures, refresh feed 4 twice, and verify story
    counts and the single-feed page size.

    Uses assertEqual throughout; assertEquals is a deprecated alias.
    """
    self.client.login(username='******', password='******')
    management.call_command('loaddata', 'gothamist_aug_2009_1.json', verbosity=0)
    feed = Feed.objects.get(feed_link__contains='gothamist')
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEqual(stories.count(), 0)
    management.call_command('refresh_feed', force=1, feed=4, single_threaded=True, daemonize=False)
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEqual(stories.count(), 42)
    response = self.client.post('/reader/load_single_feed', {"feed_id": 4})
    content = json.decode(response.content)
    self.assertEqual(len(content['stories']), 30)
    management.call_command('loaddata', 'gothamist_aug_2009_2.json', verbosity=0)
    management.call_command('refresh_feed', force=1, feed=4, single_threaded=True, daemonize=False)
    stories = MStory.objects(story_feed_id=feed.pk)
    self.assertEqual(stories.count(), 42)
    response = self.client.get('/reader/load_single_feed', {"feed_id": 4})
    # print [c['story_title'] for c in json.decode(response.content)]
    content = json.decode(response.content)
    # Test: 1 changed char in title
    self.assertEqual(len(content['stories']), 30)
def process_feed_wrapper(self, feed_queue):
    """Fetch and process each feed id in `feed_queue`: run FetchFeed, then
    ProcessFeed, and on new entries recompute every subscriber's feed scores
    and invalidate related caches.
    """
    UNREAD_CUTOFF = datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD)
    delta = None
    current_process = multiprocessing.current_process()
    # Worker identity for logging; "X" when not running in a pool.
    identity = "X"
    if current_process._identity:
        identity = current_process._identity[0]
    for feed_id in feed_queue:
        ret_entries = {ENTRY_NEW: 0, ENTRY_UPDATED: 0, ENTRY_SAME: 0, ENTRY_ERR: 0}
        start_time = datetime.datetime.utcnow()
        feed = self.refresh_feed(feed_id)
        try:
            ffeed = FetchFeed(feed_id, self.options)
            ret_feed, fetched_feed = ffeed.fetch()
            if (fetched_feed and ret_feed == FEED_OK) or self.options["force"]:
                pfeed = ProcessFeed(feed_id, fetched_feed, self.options)
                ret_feed, ret_entries = pfeed.process()
                # Re-load the feed: processing may have changed it.
                feed = self.refresh_feed(feed_id)
                if ret_entries.get(ENTRY_NEW) or self.options["force"] or not feed.fetched_once:
                    if not feed.fetched_once:
                        feed.fetched_once = True
                        feed.save()
                    MUserStory.delete_old_stories(feed_id=feed.pk)
                    user_subs = UserSubscription.objects.filter(feed=feed)
                    logging.debug(u" ---> [%-30s] Computing scores for all feed subscribers: %s subscribers" % (unicode(feed)[:30], user_subs.count()))
                    # Stories within the unread window are shared across all
                    # subscribers' score calculations.
                    stories_db = MStory.objects(story_feed_id=feed.pk, story_date__gte=UNREAD_CUTOFF)
                    for sub in user_subs:
                        cache.delete("usersub:%s" % sub.user_id)
                        silent = False if self.options["verbose"] >= 2 else True
                        sub.calculate_feed_scores(silent=silent, stories_db=stories_db)
                    cache.delete("feed_stories:%s-%s-%s" % (feed.id, 0, 25))
                    # if ret_entries.get(ENTRY_NEW) or ret_entries.get(ENTRY_UPDATED) or self.options['force']:
                    #     feed.get_stories(force=True)
        except KeyboardInterrupt:
            break
        except urllib2.HTTPError, e:
            feed.save_feed_history(e.code, e.msg, e.fp.read())
            fetched_feed = None
        except Feed.DoesNotExist, e:
            # NOTE(review): this `return` abandons the rest of feed_queue when
            # one feed has vanished; `continue` may have been intended — confirm.
            logging.debug(" ---> [%-30s] Feed is now gone..." % (unicode(feed)[:30]))
            return
def handle(self, *args, **options):
    """Find feed pairs that look like duplicates (same link, title, and
    tagline) and merge each pair when a sample of the duplicate's stories
    also exists on the original feed.
    """
    cursor = connection.cursor()
    # Self-join feeds on identical link/title, joined to feeddata for the
    # tagline comparison; f2.id > f.id keeps each pair once with the older
    # feed as the original.
    cursor.execute("""SELECT DISTINCT f.id AS original_id, f2.id AS duplicate_id, f.feed_address AS original_feed_address, f2.feed_address AS duplicate_feed_address, f.feed_title AS original_feed_title, f2.feed_title AS duplicate_feed_title, f.feed_link AS original_feed_link, f2.feed_link AS duplicate_feed_link, fd2.feed_tagline AS original_feed_tagline, fd.feed_tagline AS duplicate_feed_tagline FROM feeds f, feeds f2 INNER JOIN rss_feeds_feeddata fd ON fd.feed_id = f.feed_id INNER JOIN rss_feeds_feeddata fd2 ON fd2.feed_id = f2.feed_id WHERE f2.id > f.id AND fd.feed_tagline = fd2.feed_tagline AND f.feed_link = f2.feed_link AND f.feed_title = f2.feed_title ORDER BY original_id ASC;""")
    feed_fields = ('original_id', 'duplicate_id', 'original_feed_address', 'duplicate_feed_address')
    skips = 0
    merges = 0
    for feeds_values in cursor.fetchall():
        feeds = dict(zip(feed_fields, feeds_values))
        # Sample a small slice ([5:8]) of the duplicate's stories and only
        # merge when every sampled guid also exists on the original.
        duplicate_stories = MStory.objects(story_feed_id=feeds['duplicate_id']).only('story_guid')[5:8]
        duplicate_story_ids = [story.story_guid for story in duplicate_stories]
        original_stories = MStory.objects(story_feed_id=feeds['original_id'], story_guid__in=duplicate_story_ids)
        if duplicate_stories.count() == original_stories.count():
            merges += 1
            merge_feeds(feeds['original_id'], feeds['duplicate_id'])
        else:
            # print duplicate_stories
            # print duplicate_story_ids
            # print original_stories
            # print "Skipping: %s" % feeds
            skips += 1
    print "Skips: %s, Merges: %s" % (skips, merges)
def mark_story_as_starred(request):
    """Save a copy of a story as a starred story for the requesting user.

    Returns {'code': 1} on success, {'code': -1} when the story is missing.
    """
    code = 1
    feed_id = int(request.POST['feed_id'])
    story_id = request.POST['story_id']
    story = MStory.objects(story_feed_id=feed_id, story_guid=story_id).limit(1)
    if story:
        # Copy the raw story document, dropping null keys and values.
        story_db = dict([(k, v) for k, v in story[0]._data.items()
                         if k is not None and v is not None])
        # Use UTC for consistency with every other timestamp in this module
        # (was naive local time via datetime.datetime.now()).
        now = datetime.datetime.utcnow()
        story_values = dict(user_id=request.user.pk, starred_date=now, **story_db)
        MStarredStory.objects.create(**story_values)
        logging.user(request.user, "~FCStarring: ~SB%s" % (story[0].story_title[:50]))
    else:
        code = -1
    return {'code': code}
def run(self, feed_pk, **kwargs):
    """Fetch reference images for every story of the feed `feed_pk`."""
    from apps.rss_feeds.models import MStory
    # Iterate the queryset directly: the previous `if len(stories):` guard
    # forced an extra queryset evaluation only to skip an already-empty loop.
    for story in MStory.objects(story_feed_id=feed_pk):
        # start = time.time()
        story.fetch_reference_images()
        # num_valid_urls = 0
        # for image_id in story.image_ids:
        #     if len(image_id) > 20:
        #         num_valid_urls += 1
        # delta = time.time() - start
        # logging.info('Process ~FY%d~FW[~FB%d~FW] urls in ~FG%.4s~FW seconds.' % (
        #     num_valid_urls,len(story.image_urls),delta))
    logging.info('---> ~FYProcess feed %d done!~FW'%feed_pk)
def send_story_email(request):
    """Email a story to a recipient on behalf of the logged-in user.

    Reads story/feed ids, recipient address, sender name/email, and optional
    comments from the POST body. Returns a dict with `code` (1 on success,
    -1 on validation failure) and a human-readable `message`.
    """
    code = 1
    message = 'OK'
    story_id = request.POST['story_id']
    feed_id = request.POST['feed_id']
    to_address = request.POST['to']
    from_name = request.POST['from_name']
    from_email = request.POST['from_email']
    comments = request.POST['comments']
    comments = comments[:2048]  # Separated due to PyLint
    from_address = '*****@*****.**'
    if not email_re.match(to_address):
        code = -1
        message = 'You need to send the email to a valid email address.'
    elif not email_re.match(from_email):
        code = -1
        message = 'You need to provide your email address.'
    elif not from_name:
        code = -1
        message = 'You need to provide your name.'
    else:
        story = MStory.objects(story_feed_id=feed_id, story_guid=story_id)[0]
        story = Feed.format_story(story, feed_id, text=True)
        feed = Feed.objects.get(pk=story['story_feed_id'])
        # The templates take their entire context from locals(): story, feed,
        # comments, from_name, etc. all above must stay in scope here.
        text = render_to_string('mail/email_story_text.xhtml', locals())
        html = render_to_string('mail/email_story_html.xhtml', locals())
        subject = "%s is sharing a story with you: \"%s\"" % (from_name, story['story_title'])
        # Header injection guard: collapse newlines out of the subject.
        subject = subject.replace('\n', ' ')
        msg = EmailMultiAlternatives(subject, text,
                                     from_email='NewsBlur <%s>' % from_address,
                                     to=[to_address],
                                     cc=['%s <%s>' % (from_name, from_email)],
                                     headers={'Reply-To': '%s <%s>' % (from_name, from_email)})
        msg.attach_alternative(html, "text/html")
        msg.send()
        logging.user(request, '~BMSharing story by email: ~FY~SB%s~SN~BM~FY/~SB%s' % (story['story_title'][:50], feed.feed_title[:50]))
    return {'code': code, 'message': message}
def count_unreads_for_subscribers(self, feed):
    """Flag recently-seen active subscribers of `feed` for unread
    recalculation and, when `compute_scores` is set, recalculate each
    subscription's scores against stories newer than the unread cutoff.
    """
    UNREAD_CUTOFF = datetime.datetime.utcnow() - datetime.timedelta(days=settings.DAYS_OF_UNREAD)
    user_subs = UserSubscription.objects.filter(feed=feed, active=True,
                                                user__profile__last_seen_on__gte=UNREAD_CUTOFF)\
                                        .order_by('-last_read_date')
    logging.debug(u' ---> [%-30s] Computing scores: %s (%s/%s/%s) subscribers' % (unicode(feed)[:30], user_subs.count(), feed.num_subscribers, feed.active_subscribers, feed.premium_subscribers))
    # One shared story queryset is reused for every subscriber's calculation.
    stories_db = MStory.objects(story_feed_id=feed.pk, story_date__gte=UNREAD_CUTOFF)
    for sub in user_subs:
        cache.delete('usersub:%s' % sub.user_id)
        sub.needs_unread_recalc = True
        sub.save()
    if self.options['compute_scores']:
        for sub in user_subs:
            silent = False if self.options['verbose'] >= 2 else True
            sub.calculate_feed_scores(silent=silent, stories_db=stories_db)
def test_train(self):
    """Build phrases from brownstoner story titles, train a Bayes classifier
    on hand-labelled titles, and assert its guesses on new titles."""
    # user = User.objects.all()
    # feed = Feed.objects.all()
    management.call_command('loaddata', 'brownstoner.json', verbosity=0, commit=False)
    management.call_command('refresh_feed', force=1, feed=1, single_threaded=True, daemonize=False)
    management.call_command('loaddata', 'brownstoner2.json', verbosity=0, commit=False)
    management.call_command('refresh_feed', force=1, feed=1, single_threaded=True, daemonize=False)
    stories = MStory.objects(story_feed_id=1)[:53]
    phrasefilter = PhraseFilter()
    for story in stories:
        # print story.story_title, story.id
        phrasefilter.run(story.story_title, story.id)
    phrasefilter.pare_phrases()
    phrases = phrasefilter.get_phrases()
    print phrases
    tokenizer = Tokenizer(phrases)
    classifier = Bayes(tokenizer)  # FisherClassifier(user[0], feed[0], phrases)
    # Repeated training strengthens a label; mixed labels below make
    # "Development Watch" lean bad while staying ambiguous.
    classifier.train('good', 'House of the Day: 393 Pacific St.')
    classifier.train('good', 'House of the Day: 393 Pacific St.')
    classifier.train('good', 'Condo of the Day: 393 Pacific St.')
    classifier.train('good', 'Co-op of the Day: 393 Pacific St. #3')
    classifier.train('good', 'Co-op of the Day: 393 Pacific St. #3')
    classifier.train('good', 'Development Watch: 393 Pacific St. #3')
    classifier.train('bad', 'Development Watch: 393 Pacific St. #3')
    classifier.train('bad', 'Development Watch: 393 Pacific St. #3')
    classifier.train('bad', 'Development Watch: 393 Pacific St. #3')
    classifier.train('bad', 'Streetlevel: 393 Pacific St. #3')
    guess = dict(classifier.guess('Co-op of the Day: 413 Atlantic'))
    self.assertTrue(guess['good'] > .99)
    self.assertTrue('bad' not in guess)
    guess = dict(classifier.guess('House of the Day: 413 Atlantic'))
    self.assertTrue(guess['good'] > .99)
    self.assertTrue('bad' not in guess)
    guess = dict(classifier.guess('Development Watch: Yatta'))
    self.assertTrue(guess['bad'] > .7)
    self.assertTrue(guess['good'] < .3)
    guess = dict(classifier.guess('Development Watch: 393 Pacific St.'))
    self.assertTrue(guess['bad'] > .7)
    self.assertTrue(guess['good'] < .3)
    guess = dict(classifier.guess('Streetlevel: 123 Carlton St.'))
    self.assertTrue(guess['bad'] > .99)
    self.assertTrue('good' not in guess)
    # Titles with no trained phrases should yield no opinion either way.
    guess = classifier.guess('Extra, Extra')
    self.assertTrue('bad' not in guess)
    self.assertTrue('good' not in guess)
    guess = classifier.guess('Nothing doing: 393 Pacific St.')
    self.assertTrue('bad' not in guess)
    self.assertTrue('good' not in guess)