Beispiel #1
0
def test_decide_accept_fulltext(name, expect_accept):
    """Check the accept decision for one pair of HTML fixtures.

    Loads ``{name}_rss.html`` and ``{name}_web.html`` from ``_data_dir``,
    cleans both (the web page additionally with ``readability=True``) and
    asserts that ``decide_accept_fulltext`` returns ``expect_accept``.
    """
    rss_content = _clean_story_html(
        (_data_dir / f'{name}_rss.html').read_text())
    web_content = _clean_story_html(
        (_data_dir / f'{name}_web.html').read_text(),
        readability=True,
    )
    # The web page info is passed first, the RSS info second.
    web_info = StoryContentInfo(web_content)
    rss_info = StoryContentInfo(rss_content)
    assert decide_accept_fulltext(web_info, rss_info) == expect_accept
Beispiel #2
0
def test_rss_is_summary(name, expect_is_summary):
    """Check ``is_summary`` on the text of one RSS/web fixture pair.

    Both ``{name}_rss.html`` and ``{name}_web.html`` are read from
    ``_data_dir`` and cleaned (the web page with ``readability=True``);
    their extracted texts are then compared with ``is_summary``.
    """
    def _load_cleaned(kind, **clean_options):
        # Read one fixture file and run it through the HTML cleaner.
        raw_html = (_data_dir / f'{name}_{kind}.html').read_text()
        return _clean_story_html(raw_html, **clean_options)

    rss_cleaned = _load_cleaned('rss')
    web_cleaned = _load_cleaned('web', readability=True)
    summary_candidate = StoryContentInfo(rss_cleaned).text
    full_candidate = StoryContentInfo(web_cleaned).text
    assert is_summary(summary_candidate, full_candidate) == expect_is_summary
Beispiel #3
0
def _update_story(
    story: CommonStory,
    story_content_info: StoryContentInfo,
    content: str,
    summary: str,
    url: str,
    has_mathjax: bool = None,
    sentence_count: int = None
) -> FulltextAcceptStrategy:
    """Store fetched content for a story unless it is rejected.

    The fetched ``content`` is compared against the story's current content
    info via ``decide_accept_fulltext``:

    * ``REJECT``: nothing is written; the rejection is logged.
    * ``APPEND``: the fetched content is appended to the existing story
      content, separated by an ``<hr/>``, and then saved.
    * any other strategy: the fetched content is saved as given.

    Returns the strategy that was chosen.
    """
    strategy = decide_accept_fulltext(
        StoryContentInfo(content), story_content_info)
    if strategy == FulltextAcceptStrategy.REJECT:
        LOG.info(
            'fetched story#%s,%s url=%r is not fulltext of feed story content',
            story.feed_id, story.offset, url,
        )
        return strategy
    if strategy == FulltextAcceptStrategy.APPEND:
        # Either side may be empty/None; treat that as an empty string.
        existing_part = story.content or ''
        fetched_part = content or ''
        content = existing_part + '\n<hr/>\n' + fetched_part
    changes = {
        'link': url,
        'content': content,
        'summary': summary,
        'has_mathjax': has_mathjax,
        'sentence_count': sentence_count,
    }
    STORY_SERVICE.update_story(story.feed_id, story.offset, changes)
    return strategy
Beispiel #4
0
def do_sync_story_fulltext(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
) -> T.dict(
        feed_id=T.int,
        offset=T.int.min(0),
        use_proxy=T.bool,
        url=T.url,
        response_status=T.int,
        accept=T_ACCEPT,
):
    """Fetch a story's web page via the worker and store it if accepted.

    Steps:

    1. Load the feed and the story (the story must exist).
    2. Ask ``worker_rss.fetch_story`` to fetch the story link, passing the
       sentence count of the current story text as ``num_sub_sentences``.
    3. If the worker returned content, hand it to ``_update_story`` and
       report the resulting accept strategy.

    The returned dict starts with ``accept`` set to ``REJECT``; it is only
    changed when ``_update_story`` ran. On an ask timeout the
    ``response_status`` is ``CONNECTION_TIMEOUT``.
    """
    with log_django_context_metric('harbor_rss.sync_story_fulltext:read'):
        feed = Feed.get_by_pk(feed_id, detail='+use_proxy')
        story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    assert story, f'story#{feed_id},{offset} not found'
    old_info = StoryContentInfo(story.content)
    num_sub_sentences = len(split_sentences(old_info.text))
    ret = {
        'feed_id': feed_id,
        'offset': offset,
        'url': story.link,
        'use_proxy': feed.use_proxy,
        'accept': FulltextAcceptStrategy.REJECT.value,
    }
    try:
        result = ctx.ask(
            'worker_rss.fetch_story',
            {
                'url': story.link,
                'use_proxy': feed.use_proxy,
                'feed_id': feed_id,
                'offset': offset,
                'num_sub_sentences': num_sub_sentences,
            },
        )
    except _TIMEOUT_ERRORS as ex:
        LOG.error(f'Ask worker_rss.fetch_story timeout: {ex}')
        ret['response_status'] = FeedResponseStatus.CONNECTION_TIMEOUT
        return ret
    # The worker reports the status it got and whether a proxy was used.
    ret['response_status'] = result['response_status']
    ret['use_proxy'] = result['use_proxy']
    if not result['content']:
        # Nothing usable was fetched; keep the default REJECT.
        return ret
    with log_django_context_metric('harbor_rss.sync_story_fulltext:write'):
        strategy = _update_story(
            story=story,
            story_content_info=old_info,
            content=result['content'],
            summary=None,  # the summary does not need to be updated here
            url=result['url'],
            sentence_count=result['sentence_count'],
        )
        ret['accept'] = strategy.value
    return ret
Beispiel #5
0
def do_process_story_webpage(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
    url: T.url,
    text: T.str.maxlen(_MAX_STORY_HTML_LENGTH),
    num_sub_sentences: T.int.optional,
) -> SCHEMA_FETCH_STORY_RESULT:
    """Extract story content and a summary from a fetched web page.

    The HTML in ``text`` is cleaned, run through ``story_readability`` and
    link processing, and then measured. A result without ``content`` (only
    ``feed_id``, ``offset`` and ``url``) is returned when:

    * the page text is blank,
    * ``num_sub_sentences`` is given, the extracted content is not judged
      to be fulltext, and it has no more sentences than
      ``num_sub_sentences``, or
    * the summary comes out empty.

    Otherwise the result also carries ``content``, ``summary`` and
    ``sentence_count``. When the message is not an ask, the result is
    additionally sent to ``harbor_rss.update_story``.
    """
    # Reference links kept from the original author:
    # https://github.com/dragnet-org/dragnet
    # https://github.com/misja/python-boilerpipe
    # https://github.com/dalab/web2text
    # https://github.com/grangier/python-goose
    # https://github.com/buriy/python-readability
    # https://github.com/codelucas/newspaper
    bare_result = dict(feed_id=feed_id, offset=offset, url=url)
    text = text.strip()
    if not text:
        return bare_result
    cleaned_html = story_html_clean(text)
    content = process_story_links(story_readability(cleaned_html), url)
    content_info = StoryContentInfo(content)
    text_content = shorten(content_info.text, width=_MAX_STORY_CONTENT_LENGTH)
    num_sentences = len(split_sentences(text_content))
    content_size = len(content)
    if content_size > _MAX_STORY_CONTENT_LENGTH:
        LOG.warning(
            'too large story#%s,%s size=%s url=%r, will only save plain text',
            feed_id, offset, content_size, url,
        )
        content = text_content
    # If the fetched content is not longer than the RSS content, it is not
    # the correct full text.
    not_longer_than_rss = (
        num_sub_sentences is not None
        and not is_fulltext_content(content_info)
        and num_sentences <= num_sub_sentences
    )
    if not_longer_than_rss:
        LOG.info(
            'fetched story#%s,%s url=%s num_sentences=%s less than num_sub_sentences=%s',
            feed_id, offset, url, num_sentences, num_sub_sentences,
        )
        return bare_result
    summary = shorten(text_content, width=_MAX_STORY_SUMMARY_LENGTH)
    if not summary:
        return bare_result
    result = {
        **bare_result,
        'content': content,
        'summary': summary,
        'sentence_count': num_sentences,
    }
    if not ctx.message.is_ask:
        ctx.hope('harbor_rss.update_story', result)
    return result
Beispiel #6
0
def do_update_story(ctx: ActorContext, feed_id: T.int, offset: T.int,
                    content: T.str, summary: T.str,
                    has_mathjax: T.bool.optional, url: T.url,
                    response_status: T.int.optional,
                    sentence_count: T.int.min(0).optional):
    """Apply fetched content to an existing story.

    Looks the story up by ``feed_id`` and ``offset``. If it exists, the
    update is delegated to ``_update_story`` using the story's current
    content as the baseline; otherwise an error is logged and nothing is
    written. ``ctx`` and ``response_status`` are accepted but not used in
    the body.
    """
    story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    if story:
        current_info = StoryContentInfo(story.content)
        _update_story(
            story=story,
            story_content_info=current_info,
            content=content,
            summary=summary,
            url=url,
            has_mathjax=has_mathjax,
            sentence_count=sentence_count,
        )
    else:
        LOG.error('story#%s,%s not found', feed_id, offset)
Beispiel #7
0
def test_is_fulltext_content(name, expect_is_fulltext):
    """Check ``is_fulltext_content`` on the ``{name}.html`` fixture.

    Readability is applied during cleaning only for fixtures whose name
    ends with ``web``.
    """
    raw_html = (_data_dir / f'{name}.html').read_text()
    use_readability = name.endswith('web')
    cleaned_html = _clean_story_html(raw_html, readability=use_readability)
    content_info = StoryContentInfo(cleaned_html)
    assert is_fulltext_content(content_info) == expect_is_fulltext
Beispiel #8
0
def _is_fulltext_story(story):
    """Tell whether a story counts as fulltext.

    Returns ``True`` as soon as the story has an iframe, audio or image
    URL; otherwise returns the result of ``is_fulltext_content`` for the
    story's content.
    """
    has_media_url = story.iframe_url or story.audio_url or story.image_url
    if not has_media_url:
        return is_fulltext_content(StoryContentInfo(story.content))
    return True