def parse_blogs(site_dir: str, config: ES_CONFIG) -> None:
    filenames = glob(site_dir + '/20*/**/*index.html', recursive=True)
    paragraphStatsCollector = ParagraphStatsCollector()
    middlewares: Middlewares = [
        pa_sanitize_ws,
        pa_chunk_long,
        pa_remove_empty,
        pa_cat_short,
        pa_remove_ptag,
        paragraphStatsCollector,
        pa_log,
        ESMiddleware(config),
    ]
    blogParser = BlogParser(middlewares)
    # filenames = ['./site/2017/04/11/custom-intellisense-with-monaco-editor/index.html']
    log.info(pformat(filenames, indent=2))
    for filename in filenames:
        blogParser.parse_file(filename)
    formattedStats = paragraphStatsCollector.formatted()
    log.info(bannerfy(f"Gathered Statistics:\n{formattedStats}"))
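# For context, each entry in `middlewares` above is applied in order to the
# parsed paragraphs of a post. The sketch below is a hypothetical illustration
# of that pipeline pattern: it assumes the pa_* callables map a list of
# paragraph strings to a new list; the real Middlewares type in this repo may
# differ.
from typing import Callable, List

ParagraphMiddleware = Callable[[List[str]], List[str]]

def pa_sanitize_ws_sketch(paragraphs: List[str]) -> List[str]:
    # Collapse runs of whitespace inside each paragraph (illustrative only).
    return [" ".join(p.split()) for p in paragraphs]

def run_middlewares(paragraphs: List[str],
                    middlewares: List[ParagraphMiddleware]) -> List[str]:
    # Feed the output of each middleware into the next, the way BlogParser is
    # assumed to consume the list built in parse_blogs.
    for middleware in middlewares:
        paragraphs = middleware(paragraphs)
    return paragraphs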
def _get_blog_data(self, *args, **kwargs):
    '''Fetch blog data.'''
    if not self.can_access:
        return
    # Fetch the blog count from the main page first.
    self._get_main_page_data()
    blogs_url = "https://h5.qzone.qq.com/proxy/domain/b.qzone.qq.com/cgi-bin/blognew/get_abs"
    blogs_payload = {
        "inCharset": "utf-8",
        "outCharset": "utf-8",
        "format": "jsonp",
        "hostUin": self._account_info.target_uin,
        "uin": self._account_info.self_uin,
        "g_tk": self._account_info.g_tk,
        "pos": "%d" % 0,
        "num": "%d" % 0,
        "blogType": "0",
        "reqInfo": "1"
    }
    single_blog_url = "https://h5.qzone.qq.com/proxy/domain/b.qzone.qq.com/cgi-bin/blognew/blog_output_data"
    single_blog_payload = {
        "uin": "%s" % self._account_info.target_uin,
        "blogid": "",
        "numperpage": "15",
        "inCharset": "utf-8",
        "outCharset": "utf-8",
        "ref": "qzone"
    }
    # Fetch at most 100 blogs per request.
    num = 100
    loop_num = math.ceil(self._account_info.blog_num / num)
    total_num = 0
    for i in range(loop_num):
        pos = i * num
        current_num = num if i < loop_num - 1 else self._account_info.blog_num - i * num
        blogs_payload["pos"] = "%d" % pos
        blogs_payload["num"] = "%d" % current_num
        r = self._account_info.get_url(blogs_url, params=blogs_payload)
        json_data = get_json_data_from_response(r.text)
        blogs_info = BlogsInfo(json_data, pos, pos + current_num, self._directory)
        blogs_info.export()
        for blog in json_data["data"]["list"]:
            title = blog["title"]
            print("processing blog:", title)
            blog_id = blog["blogId"]
            category = blog["cate"]
            comment_num = blog["commentNum"]
            blog_info = BlogInfo(category, title, blog_id, comment_num)
            statistical_json_data = self._get_blog_comment_data(blog_info)
            single_blog_payload["blogid"] = "%s" % blog_id
            temp = self._account_info.get_url(single_blog_url, params=single_blog_payload)
            read = 0
            try:
                new_data = statistical_json_data["data"][0]["current"]["newdata"]
                if new_data and len(new_data) > 0:
                    read = new_data["RZRD"]
            except Exception as e:
                print("get read num error")
                print(e)
                logging.exception(blog_info)
                logging.exception(e)
            single_blog = BlogParser(self._directory, blog_info, temp.text, read)
            single_blog.export()
        total_num += len(json_data["data"]["list"])
        print("fetched %d blog(s) so far" % total_num)
    if total_num != self._account_info.blog_num:
        logging.warning(
            "qq %s: did not get enough blogs, got: %d, expected: %d" %
            (self._account_info.target_uin, total_num, self._account_info.blog_num))
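# get_json_data_from_response() is called above but not defined here. The
# get_abs endpoint is requested with format=jsonp, so the response is assumed
# to be a JSONP wrapper such as `_Callback({...});`. The helper below is a
# hedged sketch of how that wrapper could be stripped before parsing; the
# project's real implementation may handle more edge cases.
import json
import re

def get_json_data_from_response_sketch(text: str) -> dict:
    # Grab the outermost {...} block from the JSONP response and parse it as JSON.
    match = re.search(r"\{.*\}", text, re.S)
    if match is None:
        raise ValueError("no JSON object found in response")
    return json.loads(match.group(0))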