def do_update_story(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
    content: T.str,
    summary: T.str,
    has_mathjax: T.bool.optional,
    url: T.url,
):
    story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    if not story:
        LOG.error('story#%s,%s not found', feed_id, offset)
        return
    if not is_fulltext_content(content):
        story_text = processor.story_html_to_text(story.content)
        text = processor.story_html_to_text(content)
        if not is_summary(story_text, text):
            msg = 'fetched story#%s,%s url=%r is not fulltext of feed story content'
            LOG.info(msg, feed_id, offset, url)
            return
    data = dict(
        link=url,
        content=content,
        summary=summary,
        has_mathjax=has_mathjax,
    )
    STORY_SERVICE.update_story(feed_id, offset, data)
    _detect_story_images(ctx, story)
def do_update_story(
    ctx: ActorContext,
    story_id: T.int,
    content: T.str,
    summary: T.str,
    has_mathjax: T.bool.optional,
    url: T.url,
):
    story = Story.objects.get(pk=story_id)
    if not is_fulltext_content(content):
        story_text = processor.story_html_to_text(story.content)
        text = processor.story_html_to_text(content)
        if not is_summary(story_text, text):
            msg = 'fetched story#%s url=%r is not fulltext of feed story content'
            LOG.info(msg, story_id, url)
            return
    with transaction.atomic():
        story.refresh_from_db()
        story.link = url
        story.content = content
        story.summary = summary
        if has_mathjax is not None:
            story.has_mathjax = has_mathjax
        story.save()
    _detect_story_images(ctx, story)
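# is_fulltext_content and is_summary are imported helpers not shown in this
# listing. A rough sketch of the check the guard above relies on: proceed only
# when the feed's story text looks like a summary (a shorter prefix) of the
# fetched page text. The name and heuristic below are hypothetical, for
# illustration only.
def is_summary_sketch(story_text: str, fetched_text: str) -> bool:
    # the fetched page should be strictly longer and contain the start of
    # the feed text; the real detection is likely fuzzier than this
    return len(fetched_text) > len(story_text) and story_text[:200] in fetched_text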
def _get_storys(entries: list):
    storys = deque(maxlen=300)  # limit num storys
    while entries:
        data = entries.pop()
        story = {}
        content = ''
        if data["content"]:
            # both content and summary may appear in the content list; pick the longest
            for x in data["content"]:
                value = x["value"]
                if value and len(value) > len(content):
                    content = value
        if not content:
            content = data["description"]
        if not content:
            content = data["summary"]
        story['has_mathjax'] = story_has_mathjax(content)
        link = normlize_url(data["link"])
        valid_link = ''
        if link:
            try:
                valid_link = validate_url(link)
            except Invalid:
                LOG.warning(f'invalid story link {link!r}')
        story['link'] = valid_link
        content = story_html_clean(content)
        if len(content) >= 1024 * 1024:
            msg = 'too large story link=%r content length=%s, will only save plain text!'
            LOG.warning(msg, link, len(content))
            content = story_html_to_text(content)
        content = process_story_links(content, valid_link)
        story['content'] = content
        summary = data["summary"]
        if not summary:
            summary = content
        summary = shorten(story_html_to_text(summary), width=300)
        story['summary'] = summary
        title = shorten(data["title"] or link or summary, 200)
        unique_id = shorten(data['id'] or link or title, 200)
        content_hash_base64 = compute_hash_base64(content, summary, title)
        story['title'] = title
        story['content_hash_base64'] = content_hash_base64
        story['unique_id'] = unique_id
        story['author'] = shorten(data["author"], 200)
        story['dt_published'] = _get_dt_published(data)
        story['dt_updated'] = _get_dt_updated(data)
        storys.append(story)
    return list(storys)
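# For reference, a minimal sketch of the entry dict shape _get_storys expects,
# inferred from the field accesses above (feedparser-style entries). The exact
# keys produced upstream, and the date fields read by _get_dt_published /
# _get_dt_updated (omitted here), are assumptions.
_example_entry = {
    "id": "https://blog.example.com/post-1",
    "title": "Example Post",
    "link": "https://blog.example.com/post-1",
    "author": "alice",
    "summary": "<p>A short teaser.</p>",
    "description": "",
    "content": [{"value": "<p>The full article body ...</p>"}],
}
# storys = _get_storys([_example_entry])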
async def do_fetch_story(
    ctx: ActorContext,
    story_id: T.int,
    url: T.url,
    use_proxy: T.bool.default(False),
    num_sub_sentences: T.int.optional,
):
    LOG.info(f'fetch story#{story_id} url={unquote(url)} begin')
    options = _get_proxy_options()
    options.update(allow_private_address=CONFIG.allow_private_address)
    if DNS_SERVICE.is_resolved_url(url):
        use_proxy = False
    async with AsyncFeedReader(**options) as reader:
        use_proxy = use_proxy and reader.has_rss_proxy
        url_content = await _fetch_story(reader, story_id, url, use_proxy=use_proxy)
    if not url_content:
        return
    url, content = url_content
    if len(content) >= _MAX_STORY_HTML_LENGTH:
        content = story_html_clean(content)
        if len(content) >= _MAX_STORY_HTML_LENGTH:
            msg = 'too large story#%s size=%s url=%r'
            LOG.warning(msg, story_id, len(content), url)
            content = story_html_to_text(content)[:_MAX_STORY_HTML_LENGTH]
    await ctx.hope(
        'worker_rss.process_story_webpage',
        dict(
            story_id=story_id,
            url=url,
            text=content,
            num_sub_sentences=num_sub_sentences,
        ))
def do_process_story_webpage(
    ctx: ActorContext,
    story_id: T.int,
    url: T.url,
    text: T.str.maxlen(5 * 1024 * 1024),
):
    # https://github.com/dragnet-org/dragnet
    # https://github.com/misja/python-boilerpipe
    # https://github.com/dalab/web2text
    # https://github.com/grangier/python-goose
    # https://github.com/buriy/python-readability
    # https://github.com/codelucas/newspaper
    text = text.strip()
    if not text:
        return
    content = story_readability(text)
    content = process_story_links(content, url)
    summary = shorten(story_html_to_text(content), width=300)
    if not summary:
        return
    ctx.hope(
        'harbor_rss.update_story',
        dict(
            story_id=story_id,
            content=content,
            summary=summary,
            url=url,
        ))
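# story_readability is not shown in this listing; a minimal sketch of what it
# might look like built on python-readability (one of the extractors linked
# above). That this is the library actually used here is an assumption.
from readability import Document

def story_readability_sketch(html: str) -> str:
    """Extract the main article fragment from a web page as HTML."""
    doc = Document(html)
    # html_partial=True returns the article fragment without a wrapping
    # <html><body> document
    return doc.summary(html_partial=True) or ''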
async def do_fetch_story(
    ctx: ActorContext,
    story_id: T.int,
    url: T.url,
    use_proxy: T.bool.default(False),
):
    LOG.info(f'fetch story#{story_id} url={unquote(url)} begin')
    async with AsyncFeedReader(**_get_proxy_options()) as reader:
        use_proxy = use_proxy and reader.has_rss_proxy
        status, response = await reader.read(url, use_proxy=use_proxy)
    if response and response.url:
        url = str(response.url)
    LOG.info(
        f'fetch story#{story_id} url={unquote(url)} status={status} finished')
    if not (response and status == 200):
        return
    if not response.rssant_text:
        msg = 'story#%s url=%s response text is empty!'
        LOG.error(msg, story_id, unquote(url))
        return
    content = response.rssant_text
    if len(content) >= 1024 * 1024:
        content = story_html_clean(content)
        if len(content) >= 1024 * 1024:
            msg = 'too large story#%s size=%s url=%r'
            LOG.warning(msg, story_id, len(content), url)
            content = story_html_to_text(content)
    await ctx.hope('worker_rss.process_story_webpage', dict(
        story_id=story_id,
        url=url,
        text=content,
    ))
def do_process_story_webpage(
    ctx: ActorContext,
    story_id: T.int,
    url: T.url,
    text: T.str.maxlen(_MAX_STORY_HTML_LENGTH),
    num_sub_sentences: T.int.optional,
):
    # https://github.com/dragnet-org/dragnet
    # https://github.com/misja/python-boilerpipe
    # https://github.com/dalab/web2text
    # https://github.com/grangier/python-goose
    # https://github.com/buriy/python-readability
    # https://github.com/codelucas/newspaper
    text = text.strip()
    if not text:
        return
    text = story_html_clean(text)
    content = story_readability(text)
    content = process_story_links(content, url)
    if len(content) > _MAX_STORY_CONTENT_LENGTH:
        msg = 'too large story#%s size=%s url=%r, will only save plain text'
        LOG.warning(msg, story_id, len(content), url)
        content = shorten(story_html_to_text(content), width=_MAX_STORY_CONTENT_LENGTH)
    # if the fetched content is shorter than the RSS content, it is not the real fulltext
    if num_sub_sentences is not None:
        if not is_fulltext_content(content):
            num_sentences = len(split_sentences(story_html_to_text(content)))
            if num_sentences <= num_sub_sentences:
                msg = 'fetched story#%s url=%s num_sentences=%s less than num_sub_sentences=%s'
                LOG.info(msg, story_id, url, num_sentences, num_sub_sentences)
                return
    summary = shorten(story_html_to_text(content), width=_MAX_STORY_SUMMARY_LENGTH)
    if not summary:
        return
    ctx.hope(
        'harbor_rss.update_story',
        dict(
            story_id=story_id,
            content=content,
            summary=summary,
            url=url,
        ))
async def do_fetch_story(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int.min(0),
    url: T.url,
    use_proxy: T.bool.default(False),
    num_sub_sentences: T.int.optional,
) -> SCHEMA_FETCH_STORY_RESULT:
    LOG.info(f'fetch story#{feed_id},{offset} url={unquote(url)} begin')
    options = _proxy_helper.get_proxy_options()
    if DNS_SERVICE.is_resolved_url(url):
        use_proxy = False
    # make timeout less than actor default 30s to avoid ask timeout
    options.update(request_timeout=25)
    async with AsyncFeedReader(**options) as reader:
        use_proxy = use_proxy and reader.has_proxy
        url, content, response = await _fetch_story(
            reader, feed_id, offset, url, use_proxy=use_proxy)
    DEFAULT_RESULT = dict(
        feed_id=feed_id, offset=offset, url=url,
        response_status=response.status, use_proxy=response.use_proxy)
    if not content:
        return DEFAULT_RESULT
    if len(content) >= _MAX_STORY_HTML_LENGTH:
        content = story_html_clean(content)
        if len(content) >= _MAX_STORY_HTML_LENGTH:
            msg = 'too large story#%s,%s size=%s url=%r'
            LOG.warning(msg, feed_id, offset, len(content), url)
            content = story_html_to_text(content)[:_MAX_STORY_HTML_LENGTH]
    msg_func = ctx.ask if ctx.message.is_ask else ctx.hope
    result = await msg_func(
        'worker_rss.process_story_webpage',
        dict(
            feed_id=feed_id,
            offset=offset,
            url=url,
            text=content,
            num_sub_sentences=num_sub_sentences,
        ))
    if not ctx.message.is_ask:
        return DEFAULT_RESULT
    result.update(DEFAULT_RESULT)
    return result
async def do_fetch_story(
    ctx: ActorContext,
    story_id: T.int,
    url: T.url,
    use_proxy: T.bool.default(False),
):
    LOG.info(f'fetch story#{story_id} url={unquote(url)} begin')
    options = _get_proxy_options()
    options.update(allow_private_address=CONFIG.allow_private_address)
    async with AsyncFeedReader(**options) as reader:
        use_proxy = use_proxy and reader.has_rss_proxy
        response = await reader.read(url, use_proxy=use_proxy)
    if response and response.url:
        url = str(response.url)
    LOG.info(
        f'fetch story#{story_id} url={unquote(url)} status={response.status} finished'
    )
    if not (response and response.ok):
        return
    if not response.content:
        msg = 'story#%s url=%s response text is empty!'
        LOG.error(msg, story_id, unquote(url))
        return
    try:
        content = response.content.decode(response.encoding)
    except UnicodeDecodeError as ex:
        LOG.warning('fetch story unicode decode error=%s url=%r', ex, url)
        content = response.content.decode(response.encoding, errors='ignore')
    if len(content) >= 1024 * 1024:
        content = story_html_clean(content)
        if len(content) >= 1024 * 1024:
            msg = 'too large story#%s size=%s url=%r'
            LOG.warning(msg, story_id, len(content), url)
            content = story_html_to_text(content)
    await ctx.hope('worker_rss.process_story_webpage', dict(
        story_id=story_id,
        url=url,
        text=content,
    ))
def _get_storys(entries: list):
    storys = deque(maxlen=300)  # limit num storys
    while entries:
        data = entries.pop()
        story = {}
        story['unique_id'] = shorten(_get_story_unique_id(data), 200)
        content = ''
        if data["content"]:
            # both content and summary may appear in the content list; pick the longest
            for x in data["content"]:
                value = x["value"]
                if value and len(value) > len(content):
                    content = value
        if not content:
            content = data["description"]
        if not content:
            content = data["summary"]
        story['has_mathjax'] = story_has_mathjax(content)
        content = story_html_clean(content)
        content = process_story_links(content, data["link"])
        story['content'] = content
        summary = data["summary"]
        if not summary:
            summary = content
        # TODO: performance
        summary = shorten(story_html_to_text(summary), width=300)
        story['summary'] = summary
        story['link'] = data["link"]
        title = shorten(data["title"] or story['link'] or story['unique_id'], 200)
        content_hash_base64 = compute_hash_base64(content, summary, title)
        story['title'] = title
        story['content_hash_base64'] = content_hash_base64
        story['author'] = shorten(data["author"], 200)
        story['dt_published'] = _get_dt_published(data)
        story['dt_updated'] = _get_dt_updated(data)
        storys.append(story)
    return list(storys)
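# compute_hash_base64 is used above to detect changed story content; a
# hashlib/base64 sketch of one plausible implementation. The actual hash
# algorithm and encoding used by the project are assumptions here.
import base64
import hashlib

def compute_hash_base64_sketch(*parts: str) -> str:
    h = hashlib.sha1()
    for part in parts:
        h.update((part or '').encode('utf-8'))
    return base64.standard_b64encode(h.digest()).decode()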
def _compute_sentence_count(content: str) -> int:
    return len(split_sentences(story_html_to_text(content)))
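# split_sentences is defined elsewhere; a naive regex sketch of the behavior
# this helper relies on, splitting on Latin and CJK sentence punctuation.
# The real tokenizer is likely more careful; treat this as illustrative.
import re

_SENTENCE_SEP = re.compile(r'[.!?。!?]+\s*')

def split_sentences_sketch(text: str) -> list:
    return [s for s in _SENTENCE_SEP.split(text) if s.strip()]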
def do_update_feed(
    ctx: ActorContext,
    feed_id: T.int,
    feed: FeedSchema,
    is_refresh: T.bool.default(False),
):
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            # FIXME: feed merge cannot correctly handle subscription redirects.
            # For this case, keep the old subscription for now and fix it thoroughly later.
            # if target_feed:
            #     LOG.info(f'merge feed#{feed.id} url={feed.url} into '
            #              f'feed#{target_feed.id} url={target_feed.url}')
            #     target_feed.merge(feed)
            #     return
            if target_feed:
                LOG.warning(
                    f'FIXME: redirect feed#{feed.id} url={feed.url!r} into '
                    f'feed#{target_feed.id} url={target_feed.url!r}')
                feed_dict.pop('url')
        # only update dt_updated if has storys or feed fields updated
        is_feed_updated = bool(storys)
        for k, v in feed_dict.items():
            if k == 'dt_updated':
                continue
            if (v != '' and v is not None) or k in {'warnings'}:
                old_v = getattr(feed, k, None)
                if v != old_v:
                    is_feed_updated = True
                    setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if is_feed_updated:
            # set dt_updated to now, not trust rss date
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.reverse_url = reverse_url(feed.url)
        feed.status = FeedStatus.READY
        feed.save()
    # save storys, bulk_save_by_feed has standalone transaction
    for s in storys:
        if not s['dt_updated']:
            s['dt_updated'] = now
        if not s['dt_published']:
            # set dt_published to now - 30d to avoid these storys
            # take over mushroom page, i.e. Story.query_recent_by_user
            s['dt_published'] = now_sub_30d
    modified_storys = STORY_SERVICE.bulk_save_by_feed(
        feed.id, storys, is_refresh=is_refresh)
    LOG.info('feed#%s save storys total=%s num_modified=%s',
             feed.id, len(storys), len(modified_storys))
    feed = Feed.get_by_pk(feed_id)
    is_freezed = feed.freeze_level is None or feed.freeze_level > 1
    if modified_storys and is_freezed:
        Feed.unfreeze_by_id(feed_id)
    need_fetch_story = _is_feed_need_fetch_storys(feed, modified_storys)
    for story in modified_storys:
        if not story.link:
            continue
        if need_fetch_story and (not _is_fulltext_story(story)):
            text = processor.story_html_to_text(story.content)
            num_sub_sentences = len(split_sentences(text))
            ctx.tell(
                'worker_rss.fetch_story',
                dict(
                    url=story.link,
                    use_proxy=feed.use_proxy,
                    feed_id=story.feed_id,
                    offset=story.offset,
                    num_sub_sentences=num_sub_sentences,
                ))
def summary(self):
    if self.dt_created < _DATE_LAST_HTML_SUMMARY:
        return story_html_to_text(self._story.summary)
    return self._story.summary
def convert_summary(summary):
    return story_html_to_text(summary)
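# story_html_to_text is imported from the processor module; a minimal
# lxml-based sketch of the behavior convert_summary depends on. The real
# implementation likely strips scripts and normalizes whitespace as well.
import lxml.html

def story_html_to_text_sketch(html: str) -> str:
    if not html:
        return ''
    return lxml.html.fromstring(html).text_content().strip()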
def do_update_feed(
    ctx: ActorContext,
    feed_id: T.int,
    feed: FeedSchema,
    is_refresh: T.bool.default(False),
):
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            if target_feed:
                LOG.info(f'merge feed#{feed.id} url={feed.url} into '
                         f'feed#{target_feed.id} url={target_feed.url}')
                target_feed.merge(feed)
                return
        # only update dt_updated if has storys or feed fields updated
        is_feed_updated = bool(storys)
        for k, v in feed_dict.items():
            if k == 'dt_updated':
                continue
            if v != '' and v is not None:
                old_v = getattr(feed, k, None)
                if v != old_v:
                    is_feed_updated = True
                    setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if is_feed_updated:
            # set dt_updated to now, not trust rss date
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.status = FeedStatus.READY
        feed.save()
        for s in storys:
            if not s['dt_updated']:
                s['dt_updated'] = now
            if not s['dt_published']:
                # set dt_published to now - 30d to avoid these storys
                # take over mushroom page, i.e. Story.query_recent_by_user
                s['dt_published'] = now_sub_30d
        modified_storys = Story.bulk_save_by_feed(feed.id, storys, is_refresh=is_refresh)
        LOG.info(
            'feed#%s save storys total=%s num_modified=%s',
            feed.id, len(storys), len(modified_storys)
        )
    feed.refresh_from_db()
    if modified_storys:
        feed.unfreeze()
    need_fetch_story = _is_feed_need_fetch_storys(feed, modified_storys)
    for story in modified_storys:
        if not story.link:
            continue
        if need_fetch_story and (not is_fulltext_story(story)):
            text = processor.story_html_to_text(story.content)
            num_sub_sentences = len(split_sentences(text))
            ctx.tell('worker_rss.fetch_story', dict(
                url=story.link,
                use_proxy=feed.use_proxy,
                story_id=str(story.id),
                num_sub_sentences=num_sub_sentences,
            ))
        else:
            _detect_story_images(ctx, story)