def do_process_story_webpage(
    ctx: ActorContext,
    story_id: T.int,
    url: T.url,
    text: T.str.maxlen(5 * 1024 * 1024),
):
    # https://github.com/dragnet-org/dragnet
    # https://github.com/misja/python-boilerpipe
    # https://github.com/dalab/web2text
    # https://github.com/grangier/python-goose
    # https://github.com/buriy/python-readability
    # https://github.com/codelucas/newspaper
    text = text.strip()
    if not text:
        return
    content = story_readability(text)
    content = process_story_links(content, url)
    summary = shorten(story_html_to_text(content), width=300)
    if not summary:
        return
    ctx.hope('harbor_rss.update_story', dict(
        story_id=story_id,
        content=content,
        summary=summary,
        url=url,
    ))
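
# Hedged sketch of the story_readability helper used above, assuming it wraps
# readability-lxml (https://github.com/buriy/python-readability, one of the
# libraries linked in the comments); the real helper may differ.
from readability import Document


def story_readability_sketch(html: str) -> str:
    """Extract the main article body from a raw HTML page."""
    doc = Document(html)
    # summary() returns the cleaned article content as an HTML fragment;
    # html_partial=True skips the wrapping <html><body> tags.
    return doc.summary(html_partial=True) or ''
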
def do_save_feed_creation_result(
    ctx: ActorContext,
    feed_creation_id: T.int,
    messages: T.list(T.str),
    feed: FeedSchema.optional,
):
    with transaction.atomic():
        feed_dict = feed
        try:
            feed_creation = FeedCreation.get_by_pk(feed_creation_id)
        except FeedCreation.DoesNotExist:
            LOG.warning(f'feed creation {feed_creation_id} does not exist')
            return
        if feed_creation.status == FeedStatus.READY:
            LOG.info(f'feed creation {feed_creation_id} is ready')
            return
        feed_creation.message = '\n\n'.join(messages)
        feed_creation.dt_updated = timezone.now()
        if not feed_dict:
            feed_creation.status = FeedStatus.ERROR
            feed_creation.save()
            FeedUrlMap(source=feed_creation.url, target=FeedUrlMap.NOT_FOUND).save()
            return
        url = feed_dict['url']
        feed = Feed.get_first_by_url(url)
        if not feed:
            now = timezone.now()
            feed = Feed(
                url=url,
                status=FeedStatus.READY,
                reverse_url=reverse_url(url),
                dt_updated=now,
                dt_checked=now,
                dt_synced=now,
            )
            feed.save()
        feed_creation.status = FeedStatus.READY
        feed_creation.feed_id = feed.id
        feed_creation.save()
        user_feed = UserFeed.objects.filter(
            user_id=feed_creation.user_id, feed_id=feed.id).first()
        if user_feed:
            LOG.info('UserFeed#{} user_id={} feed_id={} already exists'.format(
                user_feed.id, feed_creation.user_id, feed.id))
        else:
            user_feed = UserFeed(
                user_id=feed_creation.user_id,
                feed_id=feed.id,
                is_from_bookmark=feed_creation.is_from_bookmark,
            )
            user_feed.save()
        FeedUrlMap(source=feed_creation.url, target=feed.url).save()
        if feed.url != feed_creation.url:
            FeedUrlMap(source=feed.url, target=feed.url).save()
    ctx.hope('harbor_rss.update_feed', dict(
        feed_id=feed.id,
        feed=validate_feed_output(feed_dict),
    ))
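
# The FeedUrlMap rows written above let any previously-seen source URL resolve
# straight to the canonical feed. A hedged sketch of the lookup side, assuming
# a find_target classmethod (hypothetical; only FeedUrlMap.NOT_FOUND appears
# in the code above):
def resolve_feed_url_sketch(url: str):
    """Map a submitted URL to its canonical feed URL, or None if unknown."""
    target = FeedUrlMap.find_target(url)  # hypothetical lookup helper
    if not target or target == FeedUrlMap.NOT_FOUND:
        return None
    return target
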
def _retry_feed_creations(ctx: ActorContext, feed_creation_id_urls):
    feed_creation_ids = [id for (id, url) in feed_creation_id_urls]
    FeedCreation.bulk_set_pending(feed_creation_ids)
    expire_at = time.time() + 60 * 60
    for feed_creation_id, url in feed_creation_id_urls:
        ctx.hope('worker_rss.find_feed', dict(
            feed_creation_id=feed_creation_id,
            url=url,
        ), expire_at=expire_at)
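
# Hedged sketch of what FeedCreation.bulk_set_pending might do with Django's
# ORM: one bulk UPDATE flipping the retried rows back to pending.
# FeedStatus.PENDING is an assumption; only READY and ERROR appear here.
def bulk_set_pending_sketch(feed_creation_ids):
    FeedCreation.objects.filter(id__in=feed_creation_ids)\
        .update(status=FeedStatus.PENDING, dt_updated=timezone.now())
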
def do_check_feed(ctx: ActorContext):
    rand_sec = random.random() * CHECK_FEED_SECONDS / 10
    outdate_seconds = CHECK_FEED_SECONDS + rand_sec
    feeds = Feed.take_outdated_feeds(outdate_seconds)
    expire_at = time.time() + outdate_seconds
    LOG.info('found {} feeds to sync'.format(len(feeds)))
    for feed in feeds:
        ctx.hope('worker_rss.sync_feed', dict(
            feed_id=feed['feed_id'],
            url=feed['url'],
        ), expire_at=expire_at)
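
# Hedged sketch of Feed.take_outdated_feeds: select feeds whose last check is
# older than outdate_seconds ago, shaped as the dicts consumed in the loop
# above. The dt_checked field name comes from the Feed(...) constructor in
# do_save_feed_creation_result; the real query may also cap the batch size.
from datetime import timedelta


def take_outdated_feeds_sketch(outdate_seconds):
    dt_before = timezone.now() - timedelta(seconds=outdate_seconds)
    feeds = Feed.objects.filter(dt_checked__lt=dt_before).values('id', 'url')
    return [{'feed_id': x['id'], 'url': x['url']} for x in feeds]
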
def do_process_story_webpage(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
    url: T.url,
    text: T.str.maxlen(_MAX_STORY_HTML_LENGTH),
    num_sub_sentences: T.int.optional,
) -> SCHEMA_FETCH_STORY_RESULT:
    # https://github.com/dragnet-org/dragnet
    # https://github.com/misja/python-boilerpipe
    # https://github.com/dalab/web2text
    # https://github.com/grangier/python-goose
    # https://github.com/buriy/python-readability
    # https://github.com/codelucas/newspaper
    DEFAULT_RESULT = dict(feed_id=feed_id, offset=offset, url=url)
    text = text.strip()
    if not text:
        return DEFAULT_RESULT
    text = story_html_clean(text)
    content = story_readability(text)
    content = process_story_links(content, url)
    content_info = StoryContentInfo(content)
    text_content = shorten(content_info.text, width=_MAX_STORY_CONTENT_LENGTH)
    num_sentences = len(split_sentences(text_content))
    if len(content) > _MAX_STORY_CONTENT_LENGTH:
        msg = 'too large story#%s,%s size=%s url=%r, will only save plain text'
        LOG.warning(msg, feed_id, offset, len(content), url)
        content = text_content
    # if the fetched content is shorter than the RSS content,
    # it is not the correct full text
    if num_sub_sentences is not None:
        if not is_fulltext_content(content_info):
            if num_sentences <= num_sub_sentences:
                msg = 'fetched story#%s,%s url=%s num_sentences=%s less than num_sub_sentences=%s'
                LOG.info(msg, feed_id, offset, url, num_sentences, num_sub_sentences)
                return DEFAULT_RESULT
    summary = shorten(text_content, width=_MAX_STORY_SUMMARY_LENGTH)
    if not summary:
        return DEFAULT_RESULT
    result = dict(
        **DEFAULT_RESULT,
        content=content,
        summary=summary,
        sentence_count=num_sentences,
    )
    if not ctx.message.is_ask:
        ctx.hope('harbor_rss.update_story', result)
    return result
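
# The num_sentences heuristic above needs a sentence splitter. A minimal
# sketch, assuming a regex split on Latin and CJK sentence punctuation (the
# real split_sentences may be considerably smarter):
import re

_SENTENCE_SEP = re.compile(r'[.!?。!?]+\s*')


def split_sentences_sketch(text: str) -> list:
    return [s for s in _SENTENCE_SEP.split(text) if s.strip()]
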
def do_process_story_webpage(
    ctx: ActorContext,
    story_id: T.int,
    url: T.url,
    text: T.str.maxlen(_MAX_STORY_HTML_LENGTH),
    num_sub_sentences: T.int.optional,
):
    # https://github.com/dragnet-org/dragnet
    # https://github.com/misja/python-boilerpipe
    # https://github.com/dalab/web2text
    # https://github.com/grangier/python-goose
    # https://github.com/buriy/python-readability
    # https://github.com/codelucas/newspaper
    text = text.strip()
    if not text:
        return
    text = story_html_clean(text)
    content = story_readability(text)
    content = process_story_links(content, url)
    if len(content) > _MAX_STORY_CONTENT_LENGTH:
        msg = 'too large story#%s size=%s url=%r, will only save plain text'
        LOG.warning(msg, story_id, len(content), url)
        content = shorten(story_html_to_text(content), width=_MAX_STORY_CONTENT_LENGTH)
    # if the fetched content is shorter than the RSS content,
    # it is not the correct full text
    if num_sub_sentences is not None:
        if not is_fulltext_content(content):
            num_sentences = len(split_sentences(story_html_to_text(content)))
            if num_sentences <= num_sub_sentences:
                msg = 'fetched story#%s url=%s num_sentences=%s less than num_sub_sentences=%s'
                LOG.info(msg, story_id, url, num_sentences, num_sub_sentences)
                return
    summary = shorten(story_html_to_text(content), width=_MAX_STORY_SUMMARY_LENGTH)
    if not summary:
        return
    ctx.hope('harbor_rss.update_story', dict(
        story_id=story_id,
        content=content,
        summary=summary,
        url=url,
    ))
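
# Hedged note on the summary above: if shorten is stdlib textwrap.shorten (an
# assumption; it may be a custom helper), the text is collapsed to single
# spaces and cut at a word boundary with a placeholder suffix:
import textwrap


def make_summary_sketch(content: str) -> str:
    text = story_html_to_text(content)
    return textwrap.shorten(text, width=_MAX_STORY_SUMMARY_LENGTH, placeholder='...')
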
def do_register(ctx: ActorContext, node: NodeSpecSchema):
    LOG.info(f'register node {node}')
    ctx.registery.add(node)
    ctx.hope('registery.check', dict(node=node))