def _get_blog_comment_data(self, blog_info):
    '''Export all comment pages of one blog post and return its like data.

    Comments are requested in pages of at most 50. Every page response is
    passed to ``BlogComment(...).export()``. When the number of comments
    actually received differs from ``blog_info.comment_num`` a warning is
    logged.

    Returns whatever ``self._get_like_data`` returns for the blog's unikey,
    or ``None`` when ``self.can_access`` is falsy.
    '''
    if not self.can_access:
        return

    account = self._account_info
    like_json = self._get_like_data(
        "http://user.qzone.qq.com/%s/blog/%s" % (account.target_uin,
                                                 blog_info.blog_id))

    expected = blog_info.comment_num
    print("process blog comment, [%s]\tid: %s\tcomment_num: %d" %
          (blog_info.title, blog_info.blog_id, expected))

    endpoint = "https://h5.qzone.qq.com/proxy/domain/b.qzone.qq.com/cgi-bin/blognew/get_comment_list"
    # At most 50 comments can be requested per call.
    page_size = 50
    params = {
        "inCharset": "gb2312",
        "outCharset": "gb2312",
        "format": "jsonp",
        "uin": account.target_uin,
        "g_tk": account.g_tk,
        "start": "%d" % 0,
        "num": "%d" % page_size,
        "topicId": "%s_%s" % (account.target_uin, blog_info.blog_id)
    }

    page_count = math.ceil(expected / page_size)
    received = 0
    for offset in range(0, page_count * page_size, page_size):
        # Every page is full except possibly the last one.
        batch = min(page_size, expected - offset)
        upper = offset + batch
        params["start"] = "%d" % offset
        params["num"] = "%d" % batch
        print("blog: %s, comment: [%d, %d)" % (blog_info.title, offset, upper))

        response = account.get_url(endpoint, params=params)
        page_json = get_json_data_from_response(response.text)
        page_data = page_json["data"]
        if "comments" in page_data:
            received += len(page_data["comments"])

        exporter = BlogComment(page_json, offset, upper, blog_info,
                               self._directory)
        exporter.export()
        random_sleep(0, 1)

    if received != expected:
        message = ("qq %s: not get correct blog comment, "
                   "blog: %s, comment get: %d, comment should get: %d"
                   % (account.target_uin, blog_info.title, received, expected))
        logging.warning(message)

    return like_json
def export(self, need_download_media=False):
    '''Export every shuoshuo in ``self.json_data`` and save the JSON.

    For each entry of ``json_data["msglist"]`` this method:

    * appends the entry's tid to the tid file in ``self.directory_path``;
    * re-fetches the entry through ``_parse_single_shuoshuo`` when the
      number of comments present differs from ``cmtnum``;
    * attaches the complete picture list when the entry shows exactly 9
      pictures but reports more than 9 in total;
    * fetches the full text when ``has_more_con`` is set;
    * downloads media.

    By default only media not published by the logged-in account is
    downloaded; pass ``need_download_media=True`` to download everything.
    '''
    if "msglist" in self.json_data:
        # A null "msglist" is handled like an empty one; the tid file is
        # still opened (and therefore created) as before.
        msglist = self.json_data["msglist"] or []
        tid_file = os.path.join(self.directory_path,
                                QzoneFileName.SHUOSHUO_TID)
        with open(tid_file, "a", encoding="utf-8") as f:
            for i, msg in enumerate(msglist):
                print("%05d\t" % ShuoShuoParser._shuoshuo_count,
                      "process shuoshuo, tid:", msg["tid"])
                f.write("%s\n" % msg["tid"])

                need_sleep = False
                comment_num = msg["cmtnum"]
                comment_list = msg.get("commentlist")
                if comment_num > 0 and comment_list:
                    if len(comment_list) != comment_num:
                        # The list is incomplete: fetch the single
                        # shuoshuo again and replace the entry in place.
                        need_sleep = True
                        msg = msglist[i] = self._parse_single_shuoshuo(
                            msg["tid"], comment_num)
                        comment_list = msg.get("commentlist") or []
                    for comment in comment_list:
                        if comment["uin"] != self._account_info.self_uin \
                                or need_download_media:
                            export_media_url(comment, self.directory_path)

                # More than 9 pictures: only 9 are listed inline, so the
                # complete list is fetched once and stored as option data.
                pictures = msg.get(QzoneType.PICTURE)
                pic_total = msg.get(QzoneKey.PIC_TOTAL)
                if pictures and pic_total \
                        and len(pictures) == 9 and pic_total > 9:
                    floatview_data = self._parse_all_picture(msg)
                    msg[QzoneKey.OPTION_DATA] = {
                        QzoneKey.SHUOSHUO_FLOATVIEW: floatview_data,
                    }

                # The full text has to be fetched separately.
                if msg.get("has_more_con"):
                    self._parse_full_content(msg)

                if self._account_info.target_uin != self._account_info.self_uin \
                        or need_download_media:
                    export_media_url(msg, self.directory_path)

                ShuoShuoParser._shuoshuo_count += 1
                if need_sleep:
                    random_sleep(0, 1)

    self.save(self._filename)
def export(self, need_download_media=False):
    '''Export every shuoshuo in ``self.json_data`` and save the JSON.

    Each tid is appended to the tid file, incomplete comment lists are
    re-fetched through ``_parse_single_shuoshuo``, and media URLs of
    comments and content are exported.

    By default only media not published by the logged-in account is
    downloaded; ``need_download_media=True`` forces the download.
    '''
    if "msglist" in self.json_data:
        entries = self.json_data["msglist"]
        tid_path = os.path.join(self.directory_path, config.SHUOSHUO_TID_FILE)
        with open(tid_path, "a", encoding="utf-8") as tid_out:
            for index, entry in enumerate(entries):
                print("%05d\t" % ShuoShuoParser._shuoshuo_count,
                      "process shuoshuo, tid:", entry["tid"])
                tid_out.write("%s\n" % entry["tid"])

                refetched = False
                expected_comments = entry["cmtnum"]
                if expected_comments > 0 and "commentlist" in entry \
                        and entry["commentlist"]:
                    if len(entry["commentlist"]) != expected_comments:
                        # Incomplete list: fetch the entry again and
                        # replace it in place.
                        refetched = True
                        entry = self._parse_single_shuoshuo(
                            entry["tid"], expected_comments)
                        entries[index] = entry
                    for comment in entry["commentlist"]:
                        if comment["uin"] != self._account_info.self_uin \
                                or need_download_media:
                            export_comment_media_url(comment,
                                                     self.directory_path)

                if self._account_info.target_uin != self._account_info.self_uin \
                        or need_download_media:
                    for media_type in config.MEDIA_TYPE:
                        if media_type not in entry:
                            continue
                        for media in entry[media_type]:
                            export_content_media_url(media, media_type,
                                                     self.directory_path)

                ShuoShuoParser._shuoshuo_count += 1
                if refetched:
                    random_sleep(0, 1)

    self.save(self._file_name)
def _delete_all_shuoshuo(self):
    '''Delete every shuoshuo whose tid is listed in the tid file.

    Nothing is done unless ``self._account_info.is_self()`` is true.
    Deleting too frequently triggers a captcha, hence the pause after
    each deletion.
    '''
    if not self._account_info.is_self():
        return

    tid_path = os.path.join(self._directory, QzonePath.SHUOSHUO,
                            QzoneFileName.SHUOSHUO_TID)
    with open(tid_path, "r", encoding="utf-8") as tid_lines:
        for ordinal, raw_line in enumerate(tid_lines, start=1):
            tid = raw_line.strip("\n")
            self._delete_shuoshuo(tid)
            print(ordinal, "delete tid:", tid)
            random_sleep(0, 1)
def _get_shuoshuo_like_data(self):
    '''Fetch like data for every shuoshuo listed in the tid file.

    One unikey is built per tid and passed to ``self._get_like_data``.
    Nothing is done when ``self.can_access`` is falsy.
    '''
    if not self.can_access:
        return

    tid_path = os.path.join(self._directory, QzonePath.SHUOSHUO,
                            QzoneFileName.SHUOSHUO_TID)
    with open(tid_path, "r", encoding="utf-8") as tid_lines:
        for raw_line in tid_lines:
            tid = raw_line.strip("\n")
            mood_key = "http://user.qzone.qq.com/%s/mood/%s" % (
                self._account_info.target_uin, tid)
            self._get_like_data(mood_key)
            random_sleep(1, 2)
def _get_message_board(self, *args, **kwargs):
    '''Fetch and export all message-board entries of the target account.

    The first request returns the first 20 messages together with the
    total count; the remaining messages are then requested page by page.
    Every successful page is passed to ``MsgBoardParser(...).export()``.
    Each request is attempted up to ``RETRY_TIMES`` times; a page that
    never returns code 0 is skipped (the first page aborts the export).
    A warning is logged when fewer or more messages than the reported
    total were received.

    ``*args`` and ``**kwargs`` are accepted and ignored. Returns ``None``.
    '''
    if not self.can_access:
        return

    url_pattern = "https://user.qzone.qq.com/proxy/domain/m.qzone.qq.com/cgi-bin/new/get_msgb"
    num = 20
    pos = 0
    payload = {
        "format": "jsonp",
        "inCharset": "utf-8",
        "outCharset": "utf-8",
        "uin": self._account_info.self_uin,
        "hostUin": self._account_info.target_uin,
        "start": "%d" % pos,
        "num": "%d" % num,
        "g_tk": self._account_info.g_tk,
    }

    def fetch_page():
        '''Request the current payload; return the JSON with code 0 or None.'''
        for _ in range(RETRY_TIMES):
            r = self._account_info.get_url(url_pattern, params=payload)
            page = get_json_data_from_response(r.text)
            if page["code"] == 0:
                return page
            random_sleep(1, 2)
        return None

    # Fetch the first 20 messages and the total number of messages.
    json_data = fetch_page()
    if json_data is None:
        logging.warning("qq %s: can not get first page of msg_board",
                        self._account_info.target_uin)
        return

    # "commentList" may be absent or null, exactly like on later pages.
    current_num = len(json_data["data"].get("commentList") or [])
    msg_parser = MsgBoardParser(json_data, pos, pos + current_num,
                                self._directory)
    msg_parser.export()

    msg_num = json_data["data"]["total"]
    total_num = current_num
    print("current get msgboard num", total_num)

    # Process the remaining messages.
    loop_num = math.ceil(msg_num / num)
    for page_index in range(1, loop_num):
        pos = page_index * num
        # Every page is full except possibly the last one.
        current_num = num if page_index < loop_num - 1 else msg_num - pos
        payload["start"] = "%d" % pos
        payload["num"] = "%d" % current_num

        json_data = fetch_page()
        if json_data is None:
            continue

        total_num += len(json_data["data"].get("commentList") or [])
        msg_parser = MsgBoardParser(json_data, pos, pos + current_num,
                                    self._directory)
        msg_parser.export()
        print("current get msgboard num", total_num)
        random_sleep(0, 1)

    if total_num != msg_num:
        logging.warning(
            "qq %s: not get correct msg in msg_board, get: %d\t, should get: %d",
            self._account_info.target_uin, total_num, msg_num)
def _get_like_data(self, unikey):
    '''Fetch the like data of ``unikey`` and record it in the data-count file.

    The like count is requested first. When it is positive, the detailed
    like list is requested page by page (60 entries per request) until a
    page is empty or malformed, or the accumulated ``total_number``
    reaches the like count. Both results are stored under ``unikey`` in
    the JSON file ``QzoneExporter.DATA_COUNT_FILE`` inside
    ``self._directory``; the file is read and rewritten while holding
    ``QzoneExporter._lock``.

    Returns the parsed like-count response, or ``None`` when
    ``self.can_access`` is falsy.
    '''
    if not self.can_access:
        return

    print("process like data:", unikey)

    data_count_file = os.path.join(self._directory,
                                   QzoneExporter.DATA_COUNT_FILE)
    with QzoneExporter._lock:
        if not os.path.exists(data_count_file):
            with open(data_count_file, "w", encoding="utf-8") as f:
                f.write("{\n}")

        with open(data_count_file, "r", encoding="utf-8") as f:
            json_data = json.load(f)

        # Fetch the like count.
        like_count_url = "https://user.qzone.qq.com/proxy/domain/r.qzone.qq.com/cgi-bin/user/qz_opcnt2"
        payload = {
            "fupdate": "1",
            "unikey": unikey,
            "g_tk": self._account_info.g_tk
        }
        r = self._account_info.get_url(like_count_url, params=payload)
        unikey_json_data = get_json_data_from_response(r.text)
        try:
            like_count = unikey_json_data["data"][0]["current"]["likedata"][
                "cnt"]
        except (LookupError, TypeError) as e:
            # The response does not have the expected shape; treat it as
            # "no likes" but keep the raw response in the log.
            print(e)
            logging.exception("unikey: %s, unexpected like count data: %s",
                              unikey, unikey_json_data)
            like_count = 0

        entry = json_data[unikey] = {}
        entry[QzoneExporter.LIKE_COUNT_KEY] = unikey_json_data

        if like_count <= 0:
            print(unikey, "has no like data")
            entry[QzoneExporter.LIKE_DETAILED_KEY] = "no like detailed data"
        else:
            # Fetch the detailed like list.
            like_data_url = "https://user.qzone.qq.com/proxy/domain/users.qzone.qq.com/cgi-bin/likes/get_like_list_app"
            begin_uin = "0"
            query_count = 60
            if_first_page = 1
            total_num = 0
            payload = {
                "uin": self._account_info.self_uin,
                "unikey": unikey,
                "begin_uin": begin_uin,
                "query_count": "%d" % query_count,
                "if_first_page": "%d" % if_first_page,
                "g_tk": self._account_info.g_tk
            }
            detailed = entry[QzoneExporter.LIKE_DETAILED_KEY] = []
            while True:
                payload["begin_uin"] = begin_uin
                payload["if_first_page"] = "%d" % if_first_page
                r = self._account_info.get_url(like_data_url, params=payload)

                # Keep only the JSON object inside the response wrapper.
                temp = r.text
                temp = temp[temp.find("{"):temp.rfind("}") + 1]

                # Garbled Chinese text: the characters have to be encoded
                # as iso8859 first and then decoded again.
                try:
                    temp_bytes = temp.encode("iso8859")
                except UnicodeEncodeError:
                    # The text holds characters outside Latin-1, so it
                    # cannot be the Latin-1 mojibake this step repairs;
                    # use it unchanged instead of crashing.
                    pass
                else:
                    try:
                        temp = temp_bytes.decode("utf-8")
                    except UnicodeDecodeError:
                        try:
                            temp = temp_bytes.decode("gb2312")
                        except UnicodeDecodeError:
                            logging.exception("=====\nerror: %s\n=====", temp)
                            print("decode error, break")
                            break

                try:
                    like_json_data = json.loads(temp)
                except ValueError:
                    # Empty or malformed page: stop paging but still save
                    # what has been collected so far.
                    logging.exception(
                        "unikey: %s, invalid like list data: %s", unikey, temp)
                    break
                detailed.append(like_json_data)

                like_data = like_json_data.get("data")
                if not like_data:
                    break
                like_uin_info = like_data.get("like_uin_info")
                if not like_uin_info:
                    break

                # The next page starts after the last uin of this page.
                begin_uin = like_uin_info[-1]["fuin"]
                if_first_page = 0
                current_get_num = like_data.get("total_number", 0)
                total_num += current_get_num
                if current_get_num <= 0 or total_num >= like_count:
                    break
                random_sleep(1, 2)

        with open(data_count_file, "w", encoding="utf-8") as f:
            json.dump(json_data, f, ensure_ascii=False, indent=4)

    return unikey_json_data
def _get_album_photo_data(self, album_info, need_get_comment=False):
    '''Fetch and export the photo data of one album.

    Photos are listed in pages of up to 500. For every page the list
    response is exported through ``PhotoParser``; when the page contains
    photos, the floatview response (original image and video URLs) is
    exported as well. With ``need_get_comment`` set, the comments of each
    photo that reports any are collected and exported through
    ``PhotoComment`` in batches of more than 100, with the remainder
    exported at the end.

    Nothing is done when ``self.can_access`` is falsy. Returns ``None``.
    '''
    if not self.can_access:
        return

    print("process album, name: %s\tid: %s" %
          (album_info.name, album_info.id))

    list_photo_url = "https://h5.qzone.qq.com/proxy/domain/photo.qzone.qq.com/fcgi-bin/cgi_list_photo"
    floatview_photo_list = "https://h5.qzone.qq.com/proxy/domain/photo.qzone.qq.com/fcgi-bin/cgi_floatview_photo_list_v2"

    num = 500
    list_photo_payload = {
        "inCharset": "utf-8",
        "outCharset": "utf-8",
        "g_tk": self._account_info.g_tk,
        "hostUin": self._account_info.target_uin,
        "uin": self._account_info.self_uin,
        "topicId": album_info.id,
        "pageStart": "%d" % 0,
        "pageNum": "%d" % num,
    }
    # Template only: every request works on its own copy so that values
    # set for one request never leak into the next one.
    floatview_photo_payload = {
        "g_tk": self._account_info.g_tk,
        "topicId": album_info.id,
        "hostUin": self._account_info.target_uin,
        "uin": self._account_info.self_uin,
        "fupdate": "1",
        "plat": "qzone",
        "source": "qzone",
        "cmtNum": "99",  # required
        "sortOrder": "1",
        "need_private_comment": "1",
        "inCharset": "utf-8",
        "outCharset": "utf-8",
        "appid": "4",
        "isFirst": "1",
        "picKey": "unknown",
        "postNum": "0"  # number of following photos to fetch
    }

    # Comments collected but not exported yet.
    single_comment_data = {"data": {"comments": []}}
    comment_exported_num = 0
    total_comment_num = 0

    loop_num = math.ceil(album_info.photo_num / num)
    for i in range(loop_num):
        start = i * num
        # Every page is full except possibly the last one.
        current_num = num if i < loop_num - 1 else album_info.photo_num - start
        list_photo_payload["pageStart"] = "%d" % start
        list_photo_payload["pageNum"] = "%d" % current_num
        r = self._account_info.get_url(list_photo_url,
                                       params=list_photo_payload)
        json_data = get_json_data_from_response(r.text)
        photo_parser = PhotoParser(json_data, start, start + current_num,
                                   self._directory, album_info.directory)
        photo_parser.export()

        # A missing or null "photoList" is handled like an empty page.
        photo_list = json_data["data"].get("photoList") or []

        # Fetch the original image and video URLs.
        if photo_list:
            page_payload = dict(floatview_photo_payload,
                                picKey=photo_list[0]["lloc"],
                                postNum="%d" % (current_num - 1))
            r = self._account_info.get_url(floatview_photo_list,
                                           params=page_payload)
            floatview_json_data = get_json_data_from_response(r.text)
            photo_parser = PhotoParser(floatview_json_data, start,
                                       start + current_num, self._directory,
                                       album_info.directory, True)
            photo_parser.export()

        # Fetch the comment data.
        if need_get_comment:
            for photo in photo_list:
                pic_comment_num = photo.get("forum") or 0
                if pic_comment_num == 0:
                    continue
                print("find %d comment(s) in %s" %
                      (pic_comment_num, photo["lloc"]))

                # The reported comment count may be wrong, so never ask
                # for fewer than 99 comments.
                comment_payload = dict(
                    floatview_photo_payload,
                    cmtNum="%d" % max(pic_comment_num, 99),
                    picKey=photo["lloc"],
                    postNum="0")
                r = self._account_info.get_url(floatview_photo_list,
                                               params=comment_payload)
                floatview_json_data = get_json_data_from_response(r.text)
                single = floatview_json_data["data"].get("single")
                if not single:
                    continue

                comment_data = single.get("comments") or []
                single_comment_data["data"]["comments"] += comment_data
                total_comment_num += len(comment_data)

                # Flush once more than 100 comments are pending.
                if total_comment_num > 100 + comment_exported_num:
                    photo_comment = PhotoComment(
                        single_comment_data, comment_exported_num,
                        total_comment_num, self._directory,
                        album_info.directory, self._account_info)
                    photo_comment.export()
                    single_comment_data["data"]["comments"] = []
                    comment_exported_num = total_comment_num
                random_sleep(0, 1)
        random_sleep(1, 2)

    # Export the remaining comment data.
    if need_get_comment:
        if comment_exported_num < total_comment_num:
            photo_comment = PhotoComment(single_comment_data,
                                         comment_exported_num,
                                         total_comment_num, self._directory,
                                         album_info.directory,
                                         self._account_info)
            photo_comment.export()
        print("get %d comment(s) in %s" %
              (total_comment_num, album_info.name))

    print(str(album_info), "photo data done")
def _get_list_album_data(self, get_like_data=False, *args, **kwargs):
    '''Fetch the album list, export it, then export every album.

    The album list is requested in pages of 100 and merged into the first
    response, which is exported through ``AlbumListInfo``. For every album
    the album comments and the photo data are fetched; photo comments are
    requested only when ``_get_album_comment_data`` returned 0. With
    ``get_like_data`` set, the like data of each album is fetched too.

    Each list request is attempted up to ``RETRY_TIMES`` times; a page
    that never returns code 0 is skipped (the first page aborts the
    export). ``*args`` and ``**kwargs`` are accepted and ignored.
    Returns ``None``.
    '''
    if not self.can_access:
        return

    album_list_url = "https://h5.qzone.qq.com/proxy/domain/photo.qzone.qq.com/fcgi-bin/fcg_list_album_v3"
    pos = 0
    num = 100
    payload = {
        "g_tk": self._account_info.g_tk,
        "uin": self._account_info.self_uin,
        "hostUin": self._account_info.target_uin,
        "inCharset": "utf-8",
        "outCharset": "utf-8",
        "source": "qzone",
        "plat": "qzone",
        "pageStart": "%d" % pos,
        "pageNum": "%d" % num
    }

    def fetch_page():
        '''Request the current payload; return the JSON with code 0 or None.'''
        for _ in range(RETRY_TIMES):
            r = self._account_info.get_url(album_list_url, params=payload)
            page = get_json_data_from_response(r.text)
            if page["code"] == 0:
                return page
            random_sleep(1, 2)
        return None

    json_data = fetch_page()
    if json_data is None:
        return
    data = json_data["data"]

    # The list is stored under one of two keys; try the "sort" key first
    # and fall back to the "class" key.
    album_list_mode_key = QzoneKey.ALBUM_LIST_MODE_SORT_KEY
    if album_list_mode_key not in data:
        album_list_mode_key = QzoneKey.ALBUM_LIST_MODE_CLASS_KEY
        if album_list_mode_key not in data:
            logging.warning("album list data not found in %s", data)
            return

    if not data[album_list_mode_key]:
        logging.warning("%s\nalbum list data is null", data)

    self._album_num = albums_num = data["albumsInUser"]
    print("total album num", self._album_num)

    total_num = QzoneExporter._get_album_list_data_len(data)

    loop_num = math.ceil(albums_num / num)
    for page_index in range(1, loop_num):
        pos = page_index * num
        # Every page is full except possibly the last one.
        current_num = num if page_index < loop_num - 1 else albums_num - pos
        payload["pageStart"] = "%d" % pos
        payload["pageNum"] = "%d" % current_num

        temp_json_data = fetch_page()
        if temp_json_data is None:
            continue

        temp_data = temp_json_data["data"]
        if album_list_mode_key in temp_data:
            page_albums = temp_data[album_list_mode_key]
            if not page_albums:
                print("album is null, break")
                break
            total_num += QzoneExporter._get_album_list_data_len(temp_data)
            if data[album_list_mode_key]:
                data[album_list_mode_key] += page_albums
            else:
                # The first page had a null or empty list; start from
                # this page instead of failing on "+=".
                data[album_list_mode_key] = page_albums

        print("current get num", total_num)
        random_sleep(0, 1)

    album_list_info = AlbumListInfo(json_data, self._directory)
    album_list_info.export()

    if not album_list_info.json_data["data"][album_list_mode_key]:
        return

    album_list_data = get_album_list_data(album_list_info.json_data["data"])
    for album_data in album_list_data:
        album_info = AlbumInfo(album_data)
        print(str(album_info))

        album_comment_num = self._get_album_comment_data(album_info)
        # Photo comments are only requested when the album-level comment
        # fetch returned 0.
        self._get_album_photo_data(album_info, album_comment_num == 0)

        if get_like_data:
            unikey = "http://user.qzone.qq.com/%s/photo/%s" % (
                self._account_info.target_uin, album_info.id)
            self._get_like_data(unikey)

        random_sleep(1, 2)