def run():
    """Re-save every feed's storys, migrating old story rows, in one transaction."""
    with transaction.atomic():
        ids = [x.id for x in Feed.objects.only('id').all()]
        LOG.info('total %s feeds', len(ids))
        # progress bar over all feeds; ascii mode keeps logs terminal-safe
        for fid in tqdm.tqdm(ids, ncols=80, ascii=True):
            old_storys = query_old_storys_by_feed(fid)
            Story.bulk_save_by_feed(fid, old_storys)
def test_delete_by_retention(self):
    """Old storys beyond the retention count get deleted, capped by limit per call."""
    first_batch = self.storys[:30]
    changed = Story.bulk_save_by_feed(self.feed_id, first_batch, batch_size=10)
    self.assertEqual(len(changed), 30)
    self.assert_feed_total_storys(30)
    self.assert_total_story_infos(0)
    # overlapping range: 20..30 already exist, 30..50 are new
    second_batch = self.storys[20:50]
    changed = STORY_SERVICE.bulk_save_by_feed(self.feed_id, second_batch, batch_size=10)
    self.assertEqual(len(changed), 20)
    self.assert_feed_total_storys(50)
    self.assert_total_story_infos(20)
    # first deletion pass is capped by limit=10
    deleted = STORY_SERVICE.delete_by_retention(self.feed_id, retention=10, limit=10)
    self.assertEqual(deleted, 10)
    self.assert_feed_total_storys(50)
    self.assert_total_story_infos(20)
    # second pass removes the remaining 30 above the retention threshold
    deleted = STORY_SERVICE.delete_by_retention(self.feed_id, retention=10, limit=50)
    self.assertEqual(deleted, 30)
    self.assert_feed_total_storys(50)
    self.assert_total_story_infos(10)
def test_story_dt_and_content_length(self):
    """Verify datetime and content_length bookkeeping across repeated saves.

    After the initial save, dt_created and dt_published must stay frozen,
    while dt_updated and content_length must track each subsequent update.
    """
    dt = timezone.datetime(2019, 6, 1, 12, 12, 12, tzinfo=timezone.utc)
    # NOTE: plain literals — the original used f-strings with no placeholders (F541)
    story = {
        'unique_id': 'blog.example.com/1',
        'title': 'test story 1',
        'dt_published': dt,
        'dt_updated': dt,
    }
    modified = Story.bulk_save_by_feed(self.feed_id, [validate_story(story)], batch_size=10)
    self.assertEqual(len(modified), 1)
    self.assert_feed_total_storys(1)
    self.assert_total_story_infos(0)
    # capture the values that must remain stable across updates
    dt_created = modified[0].dt_created
    dt_published = modified[0].dt_published
    assert modified[0].dt_updated == dt
    # two successive content updates; each advances dt and grows the content
    for days, updated_content in [(1, 'updated_content 1'), (2, 'updated_content 22')]:
        dt = dt + timezone.timedelta(days=days)
        story.update(
            content=updated_content,
            content_hash_base64=compute_hash_base64(updated_content),
            dt_published=dt,
            dt_updated=dt,
        )
        modified = STORY_SERVICE.bulk_save_by_feed(
            self.feed_id, [validate_story(story)], batch_size=10)
        self.assertEqual(len(modified), 1)
        self.assert_feed_total_storys(1)
        self.assert_total_story_infos(1)
        assert modified[0].dt_created == dt_created
        assert modified[0].dt_published == dt_published
        assert modified[0].dt_updated == dt
        assert modified[0].content_length == len(updated_content)
def do_update_feed(
    ctx: ActorContext,
    feed_id: T.int,
    feed: FeedSchema,
    is_refresh: T.bool.default(False).desc('Deprecated'),
):
    """Apply a freshly-fetched feed dict to the stored Feed and save its storys.

    If the feed URL changed to one that already exists, the current feed is
    merged into the existing one and processing stops. Otherwise feed fields
    are updated, storys are bulk-saved, and fetch/image-detection tasks are
    dispatched for modified storys.

    Note: `is_refresh` is accepted for API compatibility but unused here.
    """
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            # the new URL may collide with another feed; if so, merge and bail out
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            if target_feed:
                LOG.info(f'merge feed#{feed.id} url={feed.url} into '
                         f'feed#{target_feed.id} url={target_feed.url}')
                target_feed.merge(feed)
                return
        # copy non-empty fields from the fetched dict onto the model
        for k, v in feed_dict.items():
            if v != '' and v is not None:
                setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if not feed.dt_updated:
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.status = FeedStatus.READY
        feed.save()
        # backfill missing story timestamps before bulk save
        for s in storys:
            if not s['dt_updated']:
                s['dt_updated'] = now
            if not s['dt_published']:
                # set dt_published to now - 30d to avoid these storys
                # take over mushroom page, i.e. Story.query_recent_by_user
                s['dt_published'] = now_sub_30d
        modified_storys = Story.bulk_save_by_feed(feed.id, storys)
        LOG.info(
            'feed#%s save storys total=%s num_modified=%s',
            feed.id, len(storys), len(modified_storys)
        )
    # re-read the feed: bulk_save_by_feed may have changed feed counters
    feed.refresh_from_db()
    if modified_storys:
        feed.unfreeze()
    need_fetch_story = _is_feed_need_fetch_storys(feed)
    for story in modified_storys:
        if not story.link:
            continue
        # fetch full text for non-fulltext storys; otherwise just scan images
        if need_fetch_story and (not is_fulltext_story(feed, story)):
            ctx.tell('worker_rss.fetch_story', dict(
                url=story.link,
                story_id=str(story.id)
            ))
        else:
            _detect_story_images(ctx, story)
def test_mix_bulk_save_by_feed(self):
    """Saving via Story then STORY_SERVICE counts modified storys consistently."""
    batch = self.storys[:30]
    saved = Story.bulk_save_by_feed(self.feed_id, batch, batch_size=10)
    self.assertEqual(len(saved), 30)
    self.assert_feed_total_storys(30)
    self.assert_total_story_infos(0)
    # 10..30 carry updated content, 30..50 are brand new — all 40 count as modified
    batch = self.updated_storys[10:30] + self.storys[30:50]
    saved = STORY_SERVICE.bulk_save_by_feed(self.feed_id, batch, batch_size=10)
    self.assertEqual(len(saved), 40)
    self.assert_feed_total_storys(50)
    self.assert_total_story_infos(40)
    # 40..50 are unchanged duplicates, only 50..60 are modified
    batch = self.storys[40:60]
    saved = STORY_SERVICE.bulk_save_by_feed(self.feed_id, batch, batch_size=10)
    self.assertEqual(len(saved), 10)
    self.assert_feed_total_storys(60)
    self.assert_total_story_infos(50)
def do_update_feed(
    ctx: ActorContext,
    feed_id: T.int,
    feed: FeedSchema,
    is_refresh: T.bool.default(False),
):
    """Apply a freshly-fetched feed dict to the stored Feed and save its storys.

    If the feed URL changed to one that already exists, the current feed is
    merged into the existing one and processing stops. Feed fields are only
    written (and dt_updated bumped) when something actually changed; storys
    are bulk-saved, then fetch/image-detection tasks are dispatched for each
    modified story.
    """
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            # the new URL may collide with another feed; if so, merge and bail out
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            if target_feed:
                LOG.info(f'merge feed#{feed.id} url={feed.url} into '
                         f'feed#{target_feed.id} url={target_feed.url}')
                target_feed.merge(feed)
                return
        # only update dt_updated if has storys or feed fields updated
        is_feed_updated = bool(storys)
        for k, v in feed_dict.items():
            if k == 'dt_updated':
                continue
            # 'warnings' is allowed to be set even when empty/None
            if (v != '' and v is not None) or k in {'warnings'}:
                old_v = getattr(feed, k, None)
                if v != old_v:
                    is_feed_updated = True
                    setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if is_feed_updated:
            # set dt_updated to now, not trust rss date
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.status = FeedStatus.READY
        feed.save()
        # backfill missing story timestamps before bulk save
        for s in storys:
            if not s['dt_updated']:
                s['dt_updated'] = now
            if not s['dt_published']:
                # set dt_published to now - 30d to avoid these storys
                # take over mushroom page, i.e. Story.query_recent_by_user
                s['dt_published'] = now_sub_30d
        modified_storys = Story.bulk_save_by_feed(feed.id, storys, is_refresh=is_refresh)
        LOG.info('feed#%s save storys total=%s num_modified=%s',
                 feed.id, len(storys), len(modified_storys))
    # re-read the feed: bulk_save_by_feed may have changed feed counters
    feed.refresh_from_db()
    if modified_storys:
        feed.unfreeze()
    need_fetch_story = _is_feed_need_fetch_storys(feed, modified_storys)
    for story in modified_storys:
        if not story.link:
            continue
        # fetch full text for non-fulltext storys; otherwise just scan images
        if need_fetch_story and (not is_fulltext_story(story)):
            # sentence count lets the fetch worker judge whether the fetched
            # page adds content over the feed summary
            text = processor.story_html_to_text(story.content)
            num_sub_sentences = len(split_sentences(text))
            ctx.tell('worker_rss.fetch_story', dict(
                url=story.link,
                use_proxy=feed.use_proxy,
                story_id=str(story.id),
                num_sub_sentences=num_sub_sentences,
            ))
        else:
            _detect_story_images(ctx, story)