def test_delete_by_retention(self):
    # storys 0..30 go through the old Story model, so no StoryInfo rows are created
    storys_0_30 = self.storys[:30]
    modified = Story.bulk_save_by_feed(self.feed_id, storys_0_30, batch_size=10)
    self.assertEqual(len(modified), 30)
    self.assert_feed_total_storys(30)
    self.assert_total_story_infos(0)
    # storys 20..50 go through STORY_SERVICE; only the 20 new offsets are modified
    storys_20_50 = self.storys[20:50]
    modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id, storys_20_50, batch_size=10)
    self.assertEqual(len(modified), 20)
    self.assert_feed_total_storys(50)
    self.assert_total_story_infos(20)
    # retention=10 keeps only the newest storys; deletion is capped by limit
    n = STORY_SERVICE.delete_by_retention(self.feed_id, retention=10, limit=10)
    self.assertEqual(n, 10)
    self.assert_feed_total_storys(50)
    self.assert_total_story_infos(20)
    n = STORY_SERVICE.delete_by_retention(self.feed_id, retention=10, limit=50)
    self.assertEqual(n, 30)
    self.assert_feed_total_storys(50)
    self.assert_total_story_infos(10)
def do_update_story(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
    content: T.str,
    summary: T.str,
    has_mathjax: T.bool.optional,
    url: T.url,
):
    story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    if not story:
        LOG.error('story#%s,%s not found', feed_id, offset)
        return
    if not is_fulltext_content(content):
        story_text = processor.story_html_to_text(story.content)
        text = processor.story_html_to_text(content)
        if not is_summary(story_text, text):
            msg = 'fetched story#%s,%s url=%r is not fulltext of feed story content'
            LOG.info(msg, feed_id, offset, url)
            return
    data = dict(
        link=url,
        content=content,
        summary=summary,
        has_mathjax=has_mathjax,
    )
    STORY_SERVICE.update_story(feed_id, offset, data)
    _detect_story_images(ctx, story)
def _update_story(
    story: CommonStory,
    story_content_info: StoryContentInfo,
    content: str,
    summary: str,
    url: str,
    has_mathjax: bool = None,
    sentence_count: int = None,
) -> FulltextAcceptStrategy:
    new_info = StoryContentInfo(content)
    accept = decide_accept_fulltext(new_info, story_content_info)
    if accept == FulltextAcceptStrategy.REJECT:
        msg = 'fetched story#%s,%s url=%r is not fulltext of feed story content'
        LOG.info(msg, story.feed_id, story.offset, url)
        return accept
    if accept == FulltextAcceptStrategy.APPEND:
        content = (story.content or '') + '\n<hr/>\n' + (content or '')
    data = dict(
        link=url,
        content=content,
        summary=summary,
        has_mathjax=has_mathjax,
        sentence_count=sentence_count,
    )
    STORY_SERVICE.update_story(story.feed_id, story.offset, data)
    return accept
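# A minimal sketch of the FulltextAcceptStrategy contract assumed by _update_story.
# Only REJECT and APPEND appear in the code above; the member for "accept the fetched
# content as-is" is hypothetical and may be named differently in the real enum.
import enum


class _FulltextAcceptStrategySketch(enum.Enum):
    REJECT = 'reject'  # fetched page is not the fulltext; keep the feed content
    APPEND = 'append'  # fetched text complements the feed content; append after an <hr/>
    ACCEPT = 'accept'  # hypothetical: fetched text replaces the stored content outright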
def test_story_dt_and_content_length(self):
    dt = timezone.datetime(2019, 6, 1, 12, 12, 12, tzinfo=timezone.utc)
    story = {
        'unique_id': 'blog.example.com/1',
        'title': 'test story 1',
        'dt_published': dt,
        'dt_updated': dt,
    }
    modified = Story.bulk_save_by_feed(self.feed_id, [validate_story(story)], batch_size=10)
    self.assertEqual(len(modified), 1)
    self.assert_feed_total_storys(1)
    self.assert_total_story_infos(0)
    dt_created = modified[0].dt_created
    dt_published = modified[0].dt_published
    assert modified[0].dt_updated == dt

    dt = dt + timezone.timedelta(days=1)
    updated_content = 'updated_content 1'
    story.update(
        content=updated_content,
        content_hash_base64=compute_hash_base64(updated_content),
        dt_published=dt,
        dt_updated=dt,
    )
    modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id, [validate_story(story)], batch_size=10)
    self.assertEqual(len(modified), 1)
    self.assert_feed_total_storys(1)
    self.assert_total_story_infos(1)
    assert modified[0].dt_created == dt_created
    assert modified[0].dt_published == dt_published
    assert modified[0].dt_updated == dt
    assert modified[0].content_length == len(updated_content)

    dt = dt + timezone.timedelta(days=2)
    updated_content = 'updated_content 22'
    story.update(
        content=updated_content,
        content_hash_base64=compute_hash_base64(updated_content),
        dt_published=dt,
        dt_updated=dt,
    )
    modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id, [validate_story(story)], batch_size=10)
    self.assertEqual(len(modified), 1)
    self.assert_feed_total_storys(1)
    self.assert_total_story_infos(1)
    assert modified[0].dt_created == dt_created
    assert modified[0].dt_published == dt_published
    assert modified[0].dt_updated == dt
    assert modified[0].content_length == len(updated_content)
def test_update_story(self):
    storys_0_20 = self.storys[:20]
    modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id, storys_0_20, batch_size=10)
    self.assertEqual(len(modified), 20)
    self.assert_feed_total_storys(20)
    self.assert_total_story_infos(20)
    story_10 = self.updated_storys[10]
    data = {k: story_10[k] for k in ['content', 'summary', 'dt_published']}
    STORY_SERVICE.update_story(self.feed_id, 10, data)
    content_data = {'content': data['content']}
    STORY_SERVICE.update_story(self.feed_id, 10, content_data)
def do_clean_by_retention(ctx: ActorContext):
    retention = CONFIG.feed_story_retention
    feeds = Feed.take_retention_feeds(retention=retention, limit=50)
    LOG.info('found {} feeds need clean by retention'.format(len(feeds)))
    for feed in feeds:
        feed_id = feed['feed_id']
        url = feed['url']
        n = STORY_SERVICE.delete_by_retention(feed_id, retention=retention)
        LOG.info(f'deleted {n} storys of feed#{feed_id} {url} by retention')
def do_sync_story_fulltext(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
) -> T.dict(
    feed_id=T.int,
    offset=T.int.min(0),
    use_proxy=T.bool,
    url=T.url,
    response_status=T.int,
    accept=T_ACCEPT,
):
    with log_django_context_metric('harbor_rss.sync_story_fulltext:read'):
        feed = Feed.get_by_pk(feed_id, detail='+use_proxy')
        story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
        assert story, f'story#{feed_id},{offset} not found'
        story_content_info = StoryContentInfo(story.content)
        num_sub_sentences = len(split_sentences(story_content_info.text))
    ret = dict(
        feed_id=feed_id,
        offset=offset,
        url=story.link,
        use_proxy=feed.use_proxy,
        accept=FulltextAcceptStrategy.REJECT.value,
    )
    try:
        result = ctx.ask('worker_rss.fetch_story', dict(
            url=story.link,
            use_proxy=feed.use_proxy,
            feed_id=feed_id,
            offset=offset,
            num_sub_sentences=num_sub_sentences,
        ))
    except _TIMEOUT_ERRORS as ex:
        LOG.error(f'Ask worker_rss.fetch_story timeout: {ex}')
        ret.update(response_status=FeedResponseStatus.CONNECTION_TIMEOUT)
        return ret
    else:
        ret.update(
            response_status=result['response_status'],
            use_proxy=result['use_proxy'],
        )
        if not result['content']:
            return ret
    with log_django_context_metric('harbor_rss.sync_story_fulltext:write'):
        accept = _update_story(
            story=story,
            story_content_info=story_content_info,
            content=result['content'],
            summary=None,  # no need to update summary
            url=result['url'],
            sentence_count=result['sentence_count'],
        )
        ret.update(accept=accept.value)
    return ret
def _replace_story_images(feed_id, offset):
    story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    image_processor = StoryImageProcessor(story.link, story.content)
    image_indexs = image_processor.parse()
    image_urls = _image_urls_of_indexs(image_indexs)
    if not image_urls:
        return
    image_statuses = ImageInfo.batch_detect_images(image_urls)
    image_replaces = {}
    for url, status in image_statuses.items():
        if status in IMAGE_REFERER_DENY_STATUS:
            new_url_data = encode_image_url(url, story.link)
            image_replaces[url] = '/api/v1/image/{}?{}'.format(new_url_data, RSSANT_IMAGE_TAG)
    LOG.info(f'story#{feed_id},{offset} {story.link} '
             f'replace {len(image_replaces)} referer deny images')
    # image_processor.process will (1) fix relative urls and (2) replace image urls,
    # so call it regardless of whether image_replaces is empty
    content = image_processor.process(image_indexs, image_replaces)
    STORY_SERVICE.update_story(feed_id, offset, {'content': content})
def _create_test_story(
    feed: Feed,
    ident: str,
    title: str,
    content: str,
    summary: str = None,
    **kwargs,
):
    story_url = urljoin(feed.url, f'/story/{ident}')
    if summary is None:
        summary = content[:80]
    story_entry = dict(
        ident=ident,
        title=title,
        url=story_url,
        content=content,
        summary=summary,
        **kwargs,
    )
    story_entry = validate_story(story_entry)
    story = get_story_of_feed_entry(story_entry)
    STORY_SERVICE.bulk_save_by_feed(feed.id, [story], is_refresh=True)
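# Hypothetical usage of the helper above, e.g. inside a test setup. The feed fixture
# and the concrete field values are assumptions; only _create_test_story's signature
# comes from the code above.
def _example_setup_stories(feed: Feed):
    for i in range(3):
        _create_test_story(
            feed,
            ident=f'example-{i}',
            title=f'Example story {i}',
            content=f'<p>hello world {i}</p>' * 10,
        )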
def test_bulk_save_by_feed_refresh(self):
    storys_0_20 = self.storys[:20]
    modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id, storys_0_20, batch_size=10)
    self.assertEqual(len(modified), 20)
    self.assert_feed_total_storys(20)
    self.assert_total_story_infos(20)
    # saving identical storys again is a no-op
    modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id, storys_0_20, batch_size=10)
    self.assertEqual(len(modified), 0)
    self.assert_feed_total_storys(20)
    self.assert_total_story_infos(20)
    # is_refresh=True forces the unchanged storys to be saved again
    modified = STORY_SERVICE.bulk_save_by_feed(
        self.feed_id, storys_0_20, batch_size=10, is_refresh=True)
    self.assertEqual(len(modified), 20)
    self.assert_feed_total_storys(20)
    self.assert_total_story_infos(20)
def test_new_bulk_save_by_feed(self):
    storys_0_30 = self.storys[:30]
    modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id, storys_0_30, batch_size=10)
    self.assertEqual(len(modified), 30)
    self.assert_feed_total_storys(30)
    self.assert_total_story_infos(30)
    storys_20_50 = self.storys[20:50]
    modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id, storys_20_50, batch_size=10)
    self.assertEqual(len(modified), 20)
    self.assert_feed_total_storys(50)
    self.assert_total_story_infos(50)
    updated_storys_30_50 = self.updated_storys[30:50]
    modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id, updated_storys_30_50, batch_size=10)
    self.assertEqual(len(modified), 20)
    self.assert_feed_total_storys(50)
    self.assert_total_story_infos(50)
def test_mix_bulk_save_by_feed(self):
    # storys 0..30 are saved via the old Story model, so no StoryInfo rows yet
    storys_0_30 = self.storys[:30]
    modified = Story.bulk_save_by_feed(self.feed_id, storys_0_30, batch_size=10)
    self.assertEqual(len(modified), 30)
    self.assert_feed_total_storys(30)
    self.assert_total_story_infos(0)
    # updated storys 10..30 plus new storys 30..50 via STORY_SERVICE: all 40 are modified
    storys_10_50 = self.updated_storys[10:30] + self.storys[30:50]
    modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id, storys_10_50, batch_size=10)
    self.assertEqual(len(modified), 40)
    self.assert_feed_total_storys(50)
    self.assert_total_story_infos(40)
    # storys 40..60 overlap 10 unchanged storys, so only the 10 new ones are modified
    storys_40_60 = self.storys[40:60]
    modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id, storys_40_60, batch_size=10)
    self.assertEqual(len(modified), 10)
    self.assert_feed_total_storys(60)
    self.assert_total_story_infos(50)
def do_update_story(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
    content: T.str,
    summary: T.str,
    has_mathjax: T.bool.optional,
    url: T.url,
    response_status: T.int.optional,
    sentence_count: T.int.min(0).optional,
):
    story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    if not story:
        LOG.error('story#%s,%s not found', feed_id, offset)
        return
    _update_story(
        story=story,
        story_content_info=StoryContentInfo(story.content),
        content=content,
        summary=summary,
        url=url,
        has_mathjax=has_mathjax,
        sentence_count=sentence_count,
    )
def do_update_feed(
    ctx: ActorContext,
    feed_id: T.int,
    feed: FeedSchema,
    is_refresh: T.bool.default(False),
):
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            # FIXME: feed merge cannot correctly handle feed redirects.
            # For now keep the old subscription; solve this properly later.
            # if target_feed:
            #     LOG.info(f'merge feed#{feed.id} url={feed.url} into '
            #              f'feed#{target_feed.id} url={target_feed.url}')
            #     target_feed.merge(feed)
            #     return
            if target_feed:
                LOG.warning(
                    f'FIXME: redirect feed#{feed.id} url={feed.url!r} into '
                    f'feed#{target_feed.id} url={target_feed.url!r}')
            feed_dict.pop('url')
        # only update dt_updated when storys were saved or feed fields actually changed
        is_feed_updated = bool(storys)
        for k, v in feed_dict.items():
            if k == 'dt_updated':
                continue
            if (v != '' and v is not None) or k in {'warnings'}:
                old_v = getattr(feed, k, None)
                if v != old_v:
                    is_feed_updated = True
                    setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if is_feed_updated:
            # set dt_updated to now, do not trust the rss date
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.reverse_url = reverse_url(feed.url)
        feed.status = FeedStatus.READY
        feed.save()
    # save storys, bulk_save_by_feed has its own standalone transaction
    for s in storys:
        if not s['dt_updated']:
            s['dt_updated'] = now
        if not s['dt_published']:
            # set dt_published to now - 30d so these storys do not take over
            # the mushroom page, i.e. Story.query_recent_by_user
            s['dt_published'] = now_sub_30d
    modified_storys = STORY_SERVICE.bulk_save_by_feed(
        feed.id, storys, is_refresh=is_refresh)
    LOG.info('feed#%s save storys total=%s num_modified=%s',
             feed.id, len(storys), len(modified_storys))
    feed = Feed.get_by_pk(feed_id)
    is_freezed = feed.freeze_level is None or feed.freeze_level > 1
    if modified_storys and is_freezed:
        Feed.unfreeze_by_id(feed_id)
    need_fetch_story = _is_feed_need_fetch_storys(feed, modified_storys)
    for story in modified_storys:
        if not story.link:
            continue
        if need_fetch_story and (not _is_fulltext_story(story)):
            text = processor.story_html_to_text(story.content)
            num_sub_sentences = len(split_sentences(text))
            ctx.tell('worker_rss.fetch_story', dict(
                url=story.link,
                use_proxy=feed.use_proxy,
                feed_id=story.feed_id,
                offset=story.offset,
                num_sub_sentences=num_sub_sentences,
            ))