def get_blog_info(directory, encoded_category, encoded_filename, matched_result):
    ''' Build the basic record (category, title, id, comment count, read
    count) for one blog file on disk.

    The title is item 1 of ``matched_result`` and the id is item 2
    converted to ``int``. The file at
    ``directory/encoded_category/encoded_filename`` is read as UTF-8 and
    searched with ``NUMBER_PATTERN``: group 1 becomes the read count and
    group 2 the comment count. Both counts stay 0 when nothing matches.

    Returns a ``BlogInfo`` whose category and title have been passed
    through ``recover_file_name``.

    NOTE(review): the original indentation was lost; this assumes the
    ``BlogInfo`` is built whether or not the counters matched -- confirm.
    '''
    # Parse the title/id before touching the file, as the original did.
    title = matched_result[1]
    identifier = int(matched_result[2])

    path = os.path.join(directory, encoded_category, encoded_filename)
    with open(path, "r", encoding="utf-8") as source:
        text = source.read()

    counters = NUMBER_PATTERN.search(text)
    if counters:
        reads, comments = int(counters[1]), int(counters[2])
    else:
        reads, comments = 0, 0

    return BlogInfo(
        recover_file_name(encoded_category),
        recover_file_name(title),
        identifier,
        comments,
        reads,
    )
def _get_blog_data(self, *args, **kwargs):
    ''' Fetch the blog list page by page and export every blog found.

    Returns immediately when ``self.can_access`` is false. Otherwise the
    list endpoint is queried in pages of at most 100 entries; each page
    is exported through ``BlogsInfo``, and each listed blog is fetched
    individually and exported through ``BlogParser`` together with its
    read count (0 when the count cannot be extracted). A warning is
    logged when the number of listed entries differs from
    ``self._account_info.blog_num``.

    ``*args`` and ``**kwargs`` are accepted but not used.

    NOTE(review): the original indentation was lost; the per-page
    progress print is assumed to sit inside the page loop -- confirm.
    '''
    if not self.can_access:
        return

    # Get the number of blogs
    self._get_main_page_data()

    list_url = "https://h5.qzone.qq.com/proxy/domain/b.qzone.qq.com/cgi-bin/blognew/get_abs"
    list_params = {
        "inCharset": "utf-8",
        "outCharset": "utf-8",
        "format": "jsonp",
        "hostUin": self._account_info.target_uin,
        "uin": self._account_info.self_uin,
        "g_tk": self._account_info.g_tk,
        # "pos" and "num" are placeholders, overwritten for every page.
        "pos": "0",
        "num": "0",
        "blogType": "0",
        "reqInfo": "1",
    }

    detail_url = "https://h5.qzone.qq.com/proxy/domain/b.qzone.qq.com/cgi-bin/blognew/blog_output_data"
    detail_params = {
        "uin": "%s" % self._account_info.target_uin,
        # "blogid" is a placeholder, overwritten for every blog.
        "blogid": "",
        "numperpage": "15",
        "inCharset": "utf-8",
        # NOTE(review): the trailing space is in the original value and
        # is kept unchanged -- confirm whether it is intentional.
        "outCharset": "utf-8 ",
        "ref": "qzone",
    }

    # At most 100 blogs per request
    page_size = 100
    page_count = math.ceil(self._account_info.blog_num / page_size)
    fetched = 0

    for page in range(page_count):
        offset = page * page_size
        if page < page_count - 1:
            wanted = page_size
        else:
            # Last page: only ask for what is left.
            wanted = self._account_info.blog_num - page * page_size

        list_params["pos"] = "%d" % offset
        list_params["num"] = "%d" % wanted
        response = self._account_info.get_url(list_url, params=list_params)
        json_data = get_json_data_from_response(response.text)

        BlogsInfo(json_data, offset, offset + wanted, self._directory).export()

        for entry in json_data["data"]["list"]:
            title = entry["title"]
            print("process blog:", title)
            blog_id = entry["blogId"]
            category = entry["cate"]
            comment_num = entry["commentNum"]
            blog_info = BlogInfo(category, title, blog_id, comment_num)

            # The read count is looked up in the value returned here.
            stats = self._get_blog_comment_data(blog_info)

            detail_params["blogid"] = "%s" % blog_id
            detail = self._account_info.get_url(detail_url, params=detail_params)

            read = 0
            try:
                new_data = stats["data"][0]["current"]["newdata"]
                if new_data:
                    if len(new_data) > 0:
                        read = new_data["RZRD"]
            except Exception as e:
                # Best effort: a missing read count must not stop the export.
                print("get read num error")
                print(e)
                logging.exception(blog_info)
                logging.exception(e)

            BlogParser(self._directory, blog_info, detail.text, read).export()

        fetched += len(json_data["data"]["list"])
        print("current get %d blog(s)" % fetched)

    if fetched != self._account_info.blog_num:
        logging.warning(
            "qq %s: not get encough blog, get: %d, should get: %d"
            % (self._account_info.target_uin, fetched, self._account_info.blog_num))