def _do_find(url, max_trys, printer, rss_proxy_url, rss_proxy_token):
    def message_handler(msg):
        print(msg)

    finder = FeedFinder(
        url,
        max_trys=max_trys,
        rss_proxy_url=rss_proxy_url,
        rss_proxy_token=rss_proxy_token,
        message_handler=message_handler,
    )
    with finder:
        found = finder.find()
    if found:
        response, raw_result = found
        printer('-> {}'.format(response))
        result = FeedParser().parse(raw_result)
        printer("-> {}".format(result))
        printer('-' * 79)
        printer(pretty_format_json(result.feed))
        for i, story in enumerate(result.storys):
            printer('{:03d}{}'.format(i, '-' * 76))
            story['content'] = shorten(story['content'], 60)
            story['summary'] = shorten(story['summary'], 60)
            printer(pretty_format_json(story))
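# NOTE: illustrative sketch only. The real shorten() used above is not shown in
# these snippets; this hypothetical stand-in assumes it truncates text to a
# given width, roughly like textwrap.shorten without word-boundary handling.
def shorten_sketch(text, width, placeholder='...'):
    if not text:
        return text
    if len(text) <= width:
        return text
    # keep room for the placeholder suffix
    return text[:max(0, width - len(placeholder))] + placeholder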
def _parse_found(parsed):
    feed = AttrDict()
    res = parsed.response
    feed.use_proxy = parsed.use_proxy
    feed.url = _get_url(res)
    feed.content_length = len(res.content)
    feed.content_hash_base64 = compute_hash_base64(res.content)
    parsed_feed = parsed.feed
    feed.title = shorten(parsed_feed["title"], 200)
    link = parsed_feed["link"]
    if not link.startswith('http'):
        # Some feeds' link attribute is not a URL; fall back to author_detail's
        # href instead, e.g. 'http://www.cnblogs.com/grenet/'
        author_detail = parsed_feed['author_detail']
        if author_detail:
            link = author_detail['href']
        if not link.startswith('http'):
            link = feed.url
    feed.link = link
    feed.author = shorten(parsed_feed["author"], 200)
    feed.icon = parsed_feed["icon"] or parsed_feed["logo"]
    feed.description = parsed_feed["description"] or parsed_feed["subtitle"]
    feed.dt_updated = _get_dt_updated(parsed_feed)
    feed.etag = _get_etag(res)
    feed.last_modified = _get_last_modified(res)
    feed.encoding = res.encoding
    feed.version = shorten(parsed.version, 200)
    entries = list(parsed.entries)  # entries will be modified by _get_storys
    del parsed, res, parsed_feed  # release memory in advance
    feed.storys = _get_storys(entries)
    return validate_feed(feed)
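# NOTE: illustrative sketch only. compute_hash_base64() is assumed to hash its
# arguments and return a short base64 digest; the real implementation and hash
# algorithm are not shown in these snippets.
import base64
import hashlib

def compute_hash_base64_sketch(*args) -> str:
    h = hashlib.sha1()
    for arg in args:
        if isinstance(arg, str):
            arg = arg.encode('utf-8')
        h.update(arg or b'')
    return base64.urlsafe_b64encode(h.digest()).decode().rstrip('=')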
def do_process_story_webpage(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
    url: T.url,
    text: T.str.maxlen(_MAX_STORY_HTML_LENGTH),
    num_sub_sentences: T.int.optional,
) -> SCHEMA_FETCH_STORY_RESULT:
    # Candidate readability / content-extraction libraries:
    # https://github.com/dragnet-org/dragnet
    # https://github.com/misja/python-boilerpipe
    # https://github.com/dalab/web2text
    # https://github.com/grangier/python-goose
    # https://github.com/buriy/python-readability
    # https://github.com/codelucas/newspaper
    DEFAULT_RESULT = dict(feed_id=feed_id, offset=offset, url=url)
    text = text.strip()
    if not text:
        return DEFAULT_RESULT
    text = story_html_clean(text)
    content = story_readability(text)
    content = process_story_links(content, url)
    content_info = StoryContentInfo(content)
    text_content = shorten(content_info.text, width=_MAX_STORY_CONTENT_LENGTH)
    num_sentences = len(split_sentences(text_content))
    if len(content) > _MAX_STORY_CONTENT_LENGTH:
        msg = 'too large story#%s,%s size=%s url=%r, will only save plain text'
        LOG.warning(msg, feed_id, offset, len(content), url)
        content = text_content
    # If the fetched webpage content is shorter than the RSS content,
    # it is not the real full text.
    if num_sub_sentences is not None:
        if not is_fulltext_content(content_info):
            if num_sentences <= num_sub_sentences:
                msg = 'fetched story#%s,%s url=%s num_sentences=%s less than num_sub_sentences=%s'
                LOG.info(msg, feed_id, offset, url, num_sentences, num_sub_sentences)
                return DEFAULT_RESULT
    summary = shorten(text_content, width=_MAX_STORY_SUMMARY_LENGTH)
    if not summary:
        return DEFAULT_RESULT
    result = dict(
        **DEFAULT_RESULT,
        content=content,
        summary=summary,
        sentence_count=num_sentences,
    )
    if not ctx.message.is_ask:
        ctx.hope('harbor_rss.update_story', result)
    return result
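# NOTE: hypothetical sketch of a sentence splitter in the spirit of the
# split_sentences() call above; the real tokenizer is not shown in these
# snippets. It splits on common English and CJK sentence terminators.
import re

_SENTENCE_SEP_SKETCH = re.compile(r'[.!?。!?]+\s*')

def split_sentences_sketch(text: str) -> list:
    return [s for s in _SENTENCE_SEP_SKETCH.split(text or '') if s.strip()]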
def _get_storys(entries: list):
    storys = deque(maxlen=300)  # limit num storys
    while entries:
        data = entries.pop()
        story = {}
        content = ''
        if data["content"]:
            # both content and summary may be in the content list, pick the longest
            for x in data["content"]:
                value = x["value"]
                if value and len(value) > len(content):
                    content = value
        if not content:
            content = data["description"]
        if not content:
            content = data["summary"]
        story['has_mathjax'] = story_has_mathjax(content)
        link = normlize_url(data["link"])
        valid_link = ''
        if link:
            try:
                valid_link = validate_url(link)
            except Invalid:
                LOG.warning(f'invalid story link {link!r}')
        story['link'] = valid_link
        content = story_html_clean(content)
        if len(content) >= 1024 * 1024:
            msg = 'too large story link=%r content length=%s, will only save plain text!'
            LOG.warning(msg, link, len(content))
            content = story_html_to_text(content)
        content = process_story_links(content, valid_link)
        story['content'] = content
        summary = data["summary"]
        if not summary:
            summary = content
        summary = shorten(story_html_to_text(summary), width=300)
        story['summary'] = summary
        title = shorten(data["title"] or link or summary, 200)
        unique_id = shorten(data['id'] or link or title, 200)
        content_hash_base64 = compute_hash_base64(content, summary, title)
        story['title'] = title
        story['content_hash_base64'] = content_hash_base64
        story['unique_id'] = unique_id
        story['author'] = shorten(data["author"], 200)
        story['dt_published'] = _get_dt_published(data)
        story['dt_updated'] = _get_dt_updated(data)
        storys.append(story)
    return list(storys)
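# NOTE: illustrative guess at what story_has_mathjax() might check; the real
# implementation is not shown in these snippets. It looks for common TeX
# delimiters in the story HTML.
import re

_MATHJAX_SKETCH_RE = re.compile(r'(\$\$|\\\(|\\\[|\\begin\{)')

def story_has_mathjax_sketch(content: str) -> bool:
    return bool(content and _MATHJAX_SKETCH_RE.search(content))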
def do_process_story_webpage(
    ctx: ActorContext,
    story_id: T.int,
    url: T.url,
    text: T.str.maxlen(5 * 1024 * 1024),
):
    # Candidate readability / content-extraction libraries:
    # https://github.com/dragnet-org/dragnet
    # https://github.com/misja/python-boilerpipe
    # https://github.com/dalab/web2text
    # https://github.com/grangier/python-goose
    # https://github.com/buriy/python-readability
    # https://github.com/codelucas/newspaper
    text = text.strip()
    if not text:
        return
    content = story_readability(text)
    content = process_story_links(content, url)
    summary = shorten(story_html_to_text(content), width=300)
    if not summary:
        return
    ctx.hope(
        'harbor_rss.update_story',
        dict(
            story_id=story_id,
            content=content,
            summary=summary,
            url=url,
        ))
def _parse_story(self, story: dict, feed_url: str):
    ident = story['ident'][:200]
    title = story_html_to_text(story['title'])[:200]
    url = normlize_url(story['url'] or story['ident'], base_url=feed_url)
    try:
        valid_url = validate_url(url)
    except Invalid:
        valid_url = None
    base_url = valid_url or feed_url
    image_url = normlize_url(story['image_url'], base_url=base_url)
    author_name = story_html_to_text(story['author_name'])[:100]
    author_url = normlize_url(story['author_url'], base_url=base_url)
    author_avatar_url = normlize_url(story['author_avatar_url'], base_url=base_url)
    content = self._process_content(story['content'], link=base_url)
    summary = story_html_clean(story['summary'])
    summary = shorten(story_html_to_text(summary), width=300)
    has_mathjax = story_has_mathjax(content)
    return dict(
        ident=ident,
        title=title,
        url=valid_url,
        content=content,
        summary=summary,
        has_mathjax=has_mathjax,
        image_url=image_url,
        dt_published=story['dt_published'],
        dt_updated=story['dt_updated'],
        author_name=author_name,
        author_url=author_url,
        author_avatar_url=author_avatar_url,
    )
def _do_parse(
    url: str,
    printer,
    checksum,
    save_checksum,
    proxy_url,
    rss_proxy_url,
    rss_proxy_token,
):
    if not url.startswith('http://') and not url.startswith('https://'):
        response_file = FeedResponseFile(url)
        response = response_file.read()
    else:
        reader = FeedReader(
            proxy_url=proxy_url,
            rss_proxy_url=rss_proxy_url,
            rss_proxy_token=rss_proxy_token,
        )
        with reader:
            response = reader.read(url, use_proxy=reader.has_proxy)
    print('-> {}'.format(response))
    if not response.ok:
        return
    if checksum:
        with open(checksum, 'rb') as f:
            data = f.read()
        checksum = FeedChecksum.load(data)
        print('-> {}'.format(checksum))
    else:
        checksum = None
    raw_result = RawFeedParser().parse(response)
    if raw_result.warnings:
        print('Warning: ' + '; '.join(raw_result.warnings))
    result = FeedParser(checksum=checksum).parse(raw_result)
    print("-> {}".format(result))
    printer('-' * 79)
    printer(pretty_format_json(result.feed))
    for i, story in enumerate(result.storys):
        printer('{:03d}{}'.format(i, '-' * 76))
        story['content'] = shorten(story['content'], 60)
        story['summary'] = shorten(story['summary'], 60)
        printer(pretty_format_json(story))
    if save_checksum:
        print('-> save {}'.format(save_checksum))
        data = result.checksum.dump()
        with open(save_checksum, 'wb') as f:
            f.write(data)
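# NOTE: round-trip sketch based only on the FeedChecksum calls visible in
# _do_parse above (dump() -> bytes, load(bytes) -> FeedChecksum); the file
# path and helper names here are illustrative.
def _save_checksum_sketch(checksum, path='feed.checksum'):
    with open(path, 'wb') as f:
        f.write(checksum.dump())

def _load_checksum_sketch(path='feed.checksum'):
    with open(path, 'rb') as f:
        return FeedChecksum.load(f.read())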
def do_process_story_webpage(
    ctx: ActorContext,
    story_id: T.int,
    url: T.url,
    text: T.str.maxlen(_MAX_STORY_HTML_LENGTH),
    num_sub_sentences: T.int.optional,
):
    # Candidate readability / content-extraction libraries:
    # https://github.com/dragnet-org/dragnet
    # https://github.com/misja/python-boilerpipe
    # https://github.com/dalab/web2text
    # https://github.com/grangier/python-goose
    # https://github.com/buriy/python-readability
    # https://github.com/codelucas/newspaper
    text = text.strip()
    if not text:
        return
    text = story_html_clean(text)
    content = story_readability(text)
    content = process_story_links(content, url)
    if len(content) > _MAX_STORY_CONTENT_LENGTH:
        msg = 'too large story#%s size=%s url=%r, will only save plain text'
        LOG.warning(msg, story_id, len(content), url)
        content = shorten(story_html_to_text(content), width=_MAX_STORY_CONTENT_LENGTH)
    # If the fetched webpage content is shorter than the RSS content,
    # it is not the real full text.
    if num_sub_sentences is not None:
        if not is_fulltext_content(content):
            num_sentences = len(split_sentences(story_html_to_text(content)))
            if num_sentences <= num_sub_sentences:
                msg = 'fetched story#%s url=%s num_sentences=%s less than num_sub_sentences=%s'
                LOG.info(msg, story_id, url, num_sentences, num_sub_sentences)
                return
    summary = shorten(story_html_to_text(content), width=_MAX_STORY_SUMMARY_LENGTH)
    if not summary:
        return
    ctx.hope(
        'harbor_rss.update_story',
        dict(
            story_id=story_id,
            content=content,
            summary=summary,
            url=url,
        ))
def _parse_story(self, story: dict, feed_url: str):
    ident = story['ident'][:200]
    title = story_html_to_text(story['title'])[:200]
    if not title:
        title = ident  # when title is empty after clean
    url = normalize_url(story['url'] or story['ident'], base_url=feed_url)
    try:
        valid_url = validate_url(url)
    except Invalid:
        valid_url = None
    base_url = valid_url or feed_url
    image_url = normalize_url(story['image_url'], base_url=base_url)
    audio_url = normalize_url(story['audio_url'], base_url=base_url)
    author_name = story_html_to_text(story['author_name'])[:100]
    author_url = normalize_url(story['author_url'], base_url=base_url)
    author_avatar_url = normalize_url(story['author_avatar_url'], base_url=base_url)
    iframe_url = None
    content, attach = self._process_content(story['content'], link=base_url)
    if attach:
        iframe_url = attach.iframe_url
        if (not audio_url) and attach.audio_url:
            audio_url = attach.audio_url
        if (not image_url) and attach.image_url:
            image_url = attach.image_url
    if story['summary']:
        summary = story_html_clean(story['summary'])
    else:
        summary = content
    summary = shorten(story_html_to_text(summary), width=_MAX_SUMMARY_LENGTH)
    # TODO: summary with links
    has_mathjax = story_has_mathjax(content)
    return dict(
        ident=ident,
        title=title,
        url=valid_url,
        content=content,
        summary=summary,
        has_mathjax=has_mathjax,
        image_url=image_url,
        iframe_url=iframe_url,
        audio_url=audio_url,
        dt_published=story['dt_published'],
        dt_updated=story['dt_updated'],
        author_name=author_name,
        author_url=author_url,
        author_avatar_url=author_avatar_url,
    )
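# NOTE: sketch of the attachment object implied by the attribute accesses in
# _parse_story above (iframe_url, audio_url, image_url); the real class
# returned by _process_content may differ.
from dataclasses import dataclass
from typing import Optional

@dataclass
class StoryAttachSketch:
    iframe_url: Optional[str] = None
    audio_url: Optional[str] = None
    image_url: Optional[str] = None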
def _get_storys(entries: list):
    storys = deque(maxlen=300)  # limit num storys
    while entries:
        data = entries.pop()
        story = {}
        story['unique_id'] = shorten(_get_story_unique_id(data), 200)
        content = ''
        if data["content"]:
            # both content and summary may be in the content list, pick the longest
            for x in data["content"]:
                value = x["value"]
                if value and len(value) > len(content):
                    content = value
        if not content:
            content = data["description"]
        if not content:
            content = data["summary"]
        story['has_mathjax'] = story_has_mathjax(content)
        content = story_html_clean(content)
        content = process_story_links(content, data["link"])
        story['content'] = content
        summary = data["summary"]
        if not summary:
            summary = content  # TODO: performance
        summary = shorten(story_html_to_text(summary), width=300)
        story['summary'] = summary
        story['link'] = data["link"]
        title = shorten(data["title"] or story['link'] or story['unique_id'], 200)
        content_hash_base64 = compute_hash_base64(content, summary, title)
        story['title'] = title
        story['content_hash_base64'] = content_hash_base64
        story['author'] = shorten(data["author"], 200)
        story['dt_published'] = _get_dt_published(data)
        story['dt_updated'] = _get_dt_updated(data)
        storys.append(story)
    return list(storys)
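# NOTE: hypothetical stand-in for _get_story_unique_id(), guessed from the
# older version of _get_storys above (which used data['id'] or link or title);
# the real helper is not shown in these snippets.
def _get_story_unique_id_sketch(data: dict) -> str:
    return data.get('id') or data.get('link') or data.get('title') or ''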