Example #1
def do_process_story_webpage(
        ctx: ActorContext,
        story_id: T.int,
        url: T.url,
        text: T.str.maxlen(5 * 1024 * 1024),
):
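    # candidate libraries for extracting main article content from HTML: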
    # https://github.com/dragnet-org/dragnet
    # https://github.com/misja/python-boilerpipe
    # https://github.com/dalab/web2text
    # https://github.com/grangier/python-goose
    # https://github.com/buriy/python-readability
    # https://github.com/codelucas/newspaper
    text = text.strip()
    if not text:
        return
    content = story_readability(text)
    content = process_story_links(content, url)
    summary = shorten(story_html_to_text(content), width=300)
    if not summary:
        return
    ctx.hope(
        'harbor_rss.update_story',
        dict(
            story_id=story_id,
            content=content,
            summary=summary,
            url=url,
        ))
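A note on the helpers used above: story_html_to_text is assumed to strip markup and return plain text, and shorten(..., width=300) may be a thin wrapper over textwrap.shorten; both names come from the example, but this implementation is a minimal stand-in, not the project's actual code:
import lxml.html

def story_html_to_text(html):
    # hypothetical stand-in: parse the HTML and return its plain text content
    if not html or not html.strip():
        return ''
    return lxml.html.fromstring(html).text_content()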
Example #2
def do_save_feed_creation_result(
        ctx: ActorContext,
        feed_creation_id: T.int,
        messages: T.list(T.str),
        feed: FeedSchema.optional,
):
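    # do all feed-creation bookkeeping in a single DB transaction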
    with transaction.atomic():
        feed_dict = feed
        try:
            feed_creation = FeedCreation.get_by_pk(feed_creation_id)
        except FeedCreation.DoesNotExist:
            LOG.warning(f'feed creation {feed_creation_id} does not exist')
            return
        if feed_creation.status == FeedStatus.READY:
            LOG.info(f'feed creation {feed_creation_id} is ready')
            return
        feed_creation.message = '\n\n'.join(messages)
        feed_creation.dt_updated = timezone.now()
        if not feed_dict:
            feed_creation.status = FeedStatus.ERROR
            feed_creation.save()
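            # remember the failed url so later lookups resolve to NOT_FOUND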
            FeedUrlMap(source=feed_creation.url,
                       target=FeedUrlMap.NOT_FOUND).save()
            return
        url = feed_dict['url']
        feed = Feed.get_first_by_url(url)
        if not feed:
            now = timezone.now()
            feed = Feed(url=url,
                        status=FeedStatus.READY,
                        reverse_url=reverse_url(url),
                        dt_updated=now,
                        dt_checked=now,
                        dt_synced=now)
            feed.save()
        feed_creation.status = FeedStatus.READY
        feed_creation.feed_id = feed.id
        feed_creation.save()
        user_feed = UserFeed.objects.filter(user_id=feed_creation.user_id,
                                            feed_id=feed.id).first()
        if user_feed:
            LOG.info('UserFeed#{} user_id={} feed_id={} already exists'.format(
                user_feed.id, feed_creation.user_id, feed.id))
        else:
            user_feed = UserFeed(
                user_id=feed_creation.user_id,
                feed_id=feed.id,
                is_from_bookmark=feed_creation.is_from_bookmark,
            )
            user_feed.save()
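        # map the submitted url (and, if different, the canonical url) to the canonical feed url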
        FeedUrlMap(source=feed_creation.url, target=feed.url).save()
        if feed.url != feed_creation.url:
            FeedUrlMap(source=feed.url, target=feed.url).save()
    ctx.hope('harbor_rss.update_feed',
             dict(
                 feed_id=feed.id,
                 feed=validate_feed_output(feed_dict),
             ))
Example #3
def _retry_feed_creations(ctx: ActorContext, feed_creation_id_urls):
    feed_creation_ids = [fc_id for fc_id, _ in feed_creation_id_urls]
    FeedCreation.bulk_set_pending(feed_creation_ids)
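    # let the retried find_feed messages expire after one hour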
    expire_at = time.time() + 60 * 60
    for feed_creation_id, url in feed_creation_id_urls:
        ctx.hope('worker_rss.find_feed', dict(
            feed_creation_id=feed_creation_id,
            url=url,
        ), expire_at=expire_at)
Example #4
def do_check_feed(ctx: ActorContext):
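    # add up to 10% random jitter so feed checks spread out over time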
    rand_sec = random.random() * CHECK_FEED_SECONDS / 10
    outdate_seconds = CHECK_FEED_SECONDS + rand_sec
    feeds = Feed.take_outdated_feeds(outdate_seconds)
    expire_at = time.time() + outdate_seconds
    LOG.info('found {} feeds that need sync'.format(len(feeds)))
    for feed in feeds:
        ctx.hope('worker_rss.sync_feed', dict(
            feed_id=feed['feed_id'],
            url=feed['url'],
        ), expire_at=expire_at)
Example #5
def do_process_story_webpage(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
    url: T.url,
    text: T.str.maxlen(_MAX_STORY_HTML_LENGTH),
    num_sub_sentences: T.int.optional,
) -> SCHEMA_FETCH_STORY_RESULT:
    # https://github.com/dragnet-org/dragnet
    # https://github.com/misja/python-boilerpipe
    # https://github.com/dalab/web2text
    # https://github.com/grangier/python-goose
    # https://github.com/buriy/python-readability
    # https://github.com/codelucas/newspaper
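    # returned as-is when the page yields no usable content or summary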
    DEFAULT_RESULT = dict(feed_id=feed_id, offset=offset, url=url)
    text = text.strip()
    if not text:
        return DEFAULT_RESULT
    text = story_html_clean(text)
    content = story_readability(text)
    content = process_story_links(content, url)
    content_info = StoryContentInfo(content)
    text_content = shorten(content_info.text, width=_MAX_STORY_CONTENT_LENGTH)
    num_sentences = len(split_sentences(text_content))
    if len(content) > _MAX_STORY_CONTENT_LENGTH:
        msg = 'story#%s,%s too large: size=%s url=%r, will only save plain text'
        LOG.warning(msg, feed_id, offset, len(content), url)
        content = text_content
    # if the fetched content is shorter than the RSS content, it is not the correct full text
    if num_sub_sentences is not None:
        if not is_fulltext_content(content_info):
            if num_sentences <= num_sub_sentences:
                msg = 'fetched story#%s,%s url=%s num_sentences=%s not more than num_sub_sentences=%s'
                LOG.info(msg, feed_id, offset, url, num_sentences,
                         num_sub_sentences)
                return DEFAULT_RESULT
    summary = shorten(text_content, width=_MAX_STORY_SUMMARY_LENGTH)
    if not summary:
        return DEFAULT_RESULT
    result = dict(
        **DEFAULT_RESULT,
        content=content,
        summary=summary,
        sentence_count=num_sentences,
    )
    if not ctx.message.is_ask:
        ctx.hope('harbor_rss.update_story', result)
    return result
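The sentence-count comparison above depends on split_sentences; a naive regex-based stand-in, assuming it splits on terminal punctuation including CJK full stops (a sketch, not the project's implementation):
import re

def split_sentences(text):
    # hypothetical stand-in: split on sentence-ending punctuation,
    # including CJK full stops, and drop empty fragments
    parts = re.split(r'[.!?。!?]+', text)
    return [p.strip() for p in parts if p.strip()]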
Example #6
def do_process_story_webpage(
    ctx: ActorContext,
    story_id: T.int,
    url: T.url,
    text: T.str.maxlen(_MAX_STORY_HTML_LENGTH),
    num_sub_sentences: T.int.optional,
):
    # https://github.com/dragnet-org/dragnet
    # https://github.com/misja/python-boilerpipe
    # https://github.com/dalab/web2text
    # https://github.com/grangier/python-goose
    # https://github.com/buriy/python-readability
    # https://github.com/codelucas/newspaper
    text = text.strip()
    if not text:
        return
    text = story_html_clean(text)
    content = story_readability(text)
    content = process_story_links(content, url)
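    # oversized content: keep only the shortened plain text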
    if len(content) > _MAX_STORY_CONTENT_LENGTH:
        msg = 'story#%s too large: size=%s url=%r, will only save plain text'
        LOG.warning(msg, story_id, len(content), url)
        content = shorten(story_html_to_text(content),
                          width=_MAX_STORY_CONTENT_LENGTH)
    # if the fetched content is shorter than the RSS content, it is not the correct full text
    if num_sub_sentences is not None:
        if not is_fulltext_content(content):
            num_sentences = len(split_sentences(story_html_to_text(content)))
            if num_sentences <= num_sub_sentences:
                msg = 'fetched story#%s url=%s num_sentences=%s not more than num_sub_sentences=%s'
                LOG.info(msg, story_id, url, num_sentences, num_sub_sentences)
                return
    summary = shorten(story_html_to_text(content),
                      width=_MAX_STORY_SUMMARY_LENGTH)
    if not summary:
        return
    ctx.hope(
        'harbor_rss.update_story',
        dict(
            story_id=story_id,
            content=content,
            summary=summary,
            url=url,
        ))
Example #7
def do_register(ctx: ActorContext, node: NodeSpecSchema):
    LOG.info(f'register node {node}')
    ctx.registery.add(node)
    ctx.hope('registery.check', dict(node=node))
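Across these examples, ctx.hope(dst, content, expire_at=None) behaves like a one-way send: Example #5 only calls it when ctx.message.is_ask is false, which suggests fire-and-forget semantics. A toy stand-in illustrating that reading (an assumption, not the actual ActorContext):
import time

class ToyActorContext:
    # toy stand-in for ActorContext; attribute names here are assumptions
    def __init__(self):
        self.outbox = []

    def hope(self, dst, content=None, expire_at=None):
        # fire-and-forget: enqueue a one-way message; no reply is awaited,
        # and a consumer may drop the message once expire_at has passed
        self.outbox.append(dict(dst=dst, content=content, expire_at=expire_at))

ctx = ToyActorContext()
ctx.hope('registery.check', dict(node={'name': 'node-1'}), expire_at=time.time() + 60)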