def test_decide_accept_fulltext(name, expect_accept):
    """Check decide_accept_fulltext() against the expected strategy for a fixture pair."""
    rss_content = _clean_story_html(
        (_data_dir / f'{name}_rss.html').read_text())
    web_content = _clean_story_html(
        (_data_dir / f'{name}_web.html').read_text(), readability=True)
    result = decide_accept_fulltext(
        StoryContentInfo(web_content), StoryContentInfo(rss_content))
    assert result == expect_accept
def test_rss_is_summary(name, expect_is_summary):
    """Check is_summary() for a named rss/web HTML fixture pair."""
    rss_content = _clean_story_html(
        (_data_dir / f'{name}_rss.html').read_text())
    web_content = _clean_story_html(
        (_data_dir / f'{name}_web.html').read_text(), readability=True)
    result = is_summary(
        StoryContentInfo(rss_content).text,
        StoryContentInfo(web_content).text,
    )
    assert result == expect_is_summary
def _update_story(
        story: CommonStory,
        story_content_info: StoryContentInfo,
        content: str,
        summary: str,
        url: str,
        has_mathjax: 'bool | None' = None,
        sentence_count: 'int | None' = None,
) -> FulltextAcceptStrategy:
    """Decide whether the fetched fulltext should be stored, and persist it.

    Params:
        story: the existing story record to update.
        story_content_info: content info of the story's current (RSS) content.
        content: the newly fetched fulltext HTML.
        summary: new summary, or None to leave the summary untouched.
        url: the URL the fulltext was fetched from; stored as the story link.
        has_mathjax: whether the content contains MathJax markup, if known.
        sentence_count: sentence count of the fetched content, if known.

    Returns:
        The FulltextAcceptStrategy that was applied. REJECT means the
        fetched content was not accepted and the story was left unchanged.

    Note: has_mathjax/sentence_count were annotated `bool`/`int` with a None
    default (implicit Optional, deprecated per PEP 484); annotations fixed
    to `bool | None` / `int | None` — behavior unchanged.
    """
    new_info = StoryContentInfo(content)
    accept = decide_accept_fulltext(new_info, story_content_info)
    if accept == FulltextAcceptStrategy.REJECT:
        msg = 'fetched story#%s,%s url=%r is not fulltext of feed story content'
        LOG.info(msg, story.feed_id, story.offset, url)
        return accept
    if accept == FulltextAcceptStrategy.APPEND:
        # keep the existing (partial) content and append the fetched fulltext
        content = (story.content or '') + '\n<hr/>\n' + (content or '')
    data = dict(
        link=url,
        content=content,
        summary=summary,
        has_mathjax=has_mathjax,
        sentence_count=sentence_count,
    )
    STORY_SERVICE.update_story(story.feed_id, story.offset, data)
    return accept
def do_sync_story_fulltext(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
) -> T.dict(
    feed_id=T.int,
    offset=T.int.min(0),
    use_proxy=T.bool,
    url=T.url,
    response_status=T.int,
    accept=T_ACCEPT,
):
    """Fetch the webpage of story#feed_id,offset via worker_rss and store its fulltext.

    Returns a dict describing the outcome; ``accept`` is the
    FulltextAcceptStrategy value that was applied (REJECT when nothing
    was stored, including timeout and empty-content cases).
    """
    with log_django_context_metric('harbor_rss.sync_story_fulltext:read'):
        feed = Feed.get_by_pk(feed_id, detail='+use_proxy')
        story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
        assert story, f'story#{feed_id},{offset} not found'
        story_content_info = StoryContentInfo(story.content)
        # sentence count of the current content; passed to the worker so it
        # can judge whether the fetched page adds anything over what we have
        num_sub_sentences = len(split_sentences(story_content_info.text))
    # default result: REJECT until a fetch succeeds and is accepted
    ret = dict(
        feed_id=feed_id,
        offset=offset,
        url=story.link,
        use_proxy=feed.use_proxy,
        accept=FulltextAcceptStrategy.REJECT.value,
    )
    try:
        result = ctx.ask(
            'worker_rss.fetch_story', dict(
                url=story.link,
                use_proxy=feed.use_proxy,
                feed_id=feed_id,
                offset=offset,
                num_sub_sentences=num_sub_sentences,
            ))
    except _TIMEOUT_ERRORS as ex:
        LOG.error(f'Ask worker_rss.fetch_story timeout: {ex}')
        ret.update(response_status=FeedResponseStatus.CONNECTION_TIMEOUT)
        return ret
    else:
        ret.update(
            response_status=result['response_status'],
            use_proxy=result['use_proxy'],
        )
        if not result['content']:
            # fetch failed or the page yielded no content: report the
            # response status but leave the story unchanged
            return ret
    with log_django_context_metric('harbor_rss.sync_story_fulltext:write'):
        accept = _update_story(
            story=story,
            story_content_info=story_content_info,
            content=result['content'],
            summary=None,  # not need update summary
            url=result['url'],
            sentence_count=result['sentence_count'],
        )
        ret.update(accept=accept.value)
    return ret
def do_process_story_webpage(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
    url: T.url,
    text: T.str.maxlen(_MAX_STORY_HTML_LENGTH),
    num_sub_sentences: T.int.optional,
) -> SCHEMA_FETCH_STORY_RESULT:
    """Clean a fetched story webpage, extract readable fulltext and a summary,
    then hand the result to harbor_rss.update_story (when not called via ask).

    num_sub_sentences is the sentence count of the existing RSS content; when
    given, extractions that are not fulltext and no longer than the existing
    content are rejected.
    """
    # Content-extraction libraries considered:
    # https://github.com/dragnet-org/dragnet
    # https://github.com/misja/python-boilerpipe
    # https://github.com/dalab/web2text
    # https://github.com/grangier/python-goose
    # https://github.com/buriy/python-readability
    # https://github.com/codelucas/newspaper
    DEFAULT_RESULT = dict(feed_id=feed_id, offset=offset, url=url)
    text = text.strip()
    if not text:
        return DEFAULT_RESULT
    text = story_html_clean(text)
    content = story_readability(text)
    content = process_story_links(content, url)
    content_info = StoryContentInfo(content)
    # plain-text form of the extracted content, truncated to the storage limit
    text_content = shorten(content_info.text, width=_MAX_STORY_CONTENT_LENGTH)
    num_sentences = len(split_sentences(text_content))
    if len(content) > _MAX_STORY_CONTENT_LENGTH:
        msg = 'too large story#%s,%s size=%s url=%r, will only save plain text'
        LOG.warning(msg, feed_id, offset, len(content), url)
        content = text_content
    # If the fetched content is shorter than the RSS content, it is not the
    # correct fulltext.
    if num_sub_sentences is not None:
        if not is_fulltext_content(content_info):
            if num_sentences <= num_sub_sentences:
                msg = 'fetched story#%s,%s url=%s num_sentences=%s less than num_sub_sentences=%s'
                LOG.info(msg, feed_id, offset, url, num_sentences, num_sub_sentences)
                return DEFAULT_RESULT
    summary = shorten(text_content, width=_MAX_STORY_SUMMARY_LENGTH)
    if not summary:
        return DEFAULT_RESULT
    result = dict(
        **DEFAULT_RESULT,
        content=content,
        summary=summary,
        sentence_count=num_sentences,
    )
    if not ctx.message.is_ask:
        # fire-and-forget persistence when invoked as a plain message
        ctx.hope('harbor_rss.update_story', result)
    return result
def do_update_story(ctx: ActorContext,
                    feed_id: T.int,
                    offset: T.int,
                    content: T.str,
                    summary: T.str,
                    has_mathjax: T.bool.optional,
                    url: T.url,
                    response_status: T.int.optional,
                    sentence_count: T.int.min(0).optional):
    """Persist fetched content/summary for story#feed_id,offset via _update_story."""
    story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    if not story:
        LOG.error('story#%s,%s not found', feed_id, offset)
        return
    current_info = StoryContentInfo(story.content)
    _update_story(
        story=story,
        story_content_info=current_info,
        content=content,
        summary=summary,
        url=url,
        has_mathjax=has_mathjax,
        sentence_count=sentence_count,
    )
def test_is_fulltext_content(name, expect_is_fulltext):
    """Verify is_fulltext_content() on a named HTML fixture."""
    raw_html = (_data_dir / f'{name}.html').read_text()
    use_readability = name.endswith('web')
    cleaned = _clean_story_html(raw_html, readability=use_readability)
    assert is_fulltext_content(StoryContentInfo(cleaned)) == expect_is_fulltext
def _is_fulltext_story(story): if story.iframe_url or story.audio_url or story.image_url: return True return is_fulltext_content(StoryContentInfo(story.content))