def _parse_personal_blogs(self, next_params: Optional[str] = None):
    """Fetch the logged-in user's posts page by page and store them.

    Each call requests one page from ``self._blogs_url``, appends every
    entry to ``self._blogs_data`` and, while more pages are expected,
    calls itself with the cursor of the last entry. When paging stops,
    the collected posts are handed to ``self.data_model``.

    Args:
        next_params: Cursor sent as the ``before`` query parameter. It is
            the ``verifyCreatedAt`` value of the last entry of the
            previous page; ``None`` requests the first page.

    Raises:
        LoginException: If the response body is empty or its ``m`` field
            is not ``"ok"``. The task is marked as failed first.
    """
    # Imported locally so the method does not rely on a module-level import.
    from urllib.parse import urlencode

    page_size = 20
    req_data: dict = {
        "src": "web",
        "uid": self._login_uid,
        "device_id": self._login_client_id,
        "token": self._login_token,
        "targetUid": self._login_uid,
        "type": "post",
        "limit": str(page_size),
        "order": "createdAt",
    }
    if next_params is not None:
        req_data.update(before=next_params)

    # urlencode escapes every value, so a token or cursor containing
    # characters such as "&", "=" or "+" cannot corrupt the query string.
    blogs_url: str = f"{self._blogs_url}?{urlencode(req_data)}"
    response = self.make_request(url=blogs_url,
                                 headers=self._common_headers)

    succeeded = False
    if response.content.decode() != "":
        self._response_data = response.json()
        succeeded = (self._response_data is not None
                     and self._response_data["m"] == "ok")
    if not succeeded:
        # An empty body and a non-"ok" reply are both treated as failures,
        # so the task never ends silently without a result.
        logger.error("查询个人博客失败!")
        self.update_task_status(task_id=self._task_id,
                                data=str(PROCESS_STATUS_FAIL))
        raise LoginException()

    page = self._response_data["d"]
    next_page_variable = None
    for personal_blog in page["entrylist"]:
        blog_create_time = datetime_str_change_fmt(
            time_str=personal_blog["createdAt"],
            prev_fmt="%Y-%m-%dT%H:%M:%S.%fZ",
        )
        blog_data: Dict = {
            "blogId": personal_blog["objectId"],
            "blogTitle": personal_blog["title"],
            "blogHref": personal_blog["originalUrl"],
            "blogViewers": personal_blog["viewsCount"],
            "blogCreateTime": blog_create_time,
        }
        self._blogs_data.append(blog_data)
        # The last entry of the page becomes the cursor for the next one.
        next_page_variable = personal_blog["verifyCreatedAt"]

    # Only continue when this page produced a cursor. Without it the next
    # request would start again from the first page and never terminate.
    if page["total"] > page_size and next_page_variable is not None:
        time.sleep(0.5)  # small pause between consecutive page requests
        self._parse_personal_blogs(next_params=next_page_variable)
        return

    logger.debug(self._blogs_data)
    self.data_model.set_personal_blogs_data(data=self._blogs_data)
    logger.info("获取个人博客数据成功!")
def login(self):
    """Log in, or reuse cached cookies, then start the next parse stage.

    With cached cookies the cookie header is set and the personal-blogs
    stage runs directly. Otherwise two POST requests are made through
    ``self._session``: a ``checkHVC`` pre-check followed by ``doLogin``.
    On success the cookies are serialised, stored through
    ``set_cookies`` and the login-result stage runs.

    Raises:
        LoginException: If either request does not return HTTP 200, does
            not return JSON, or its ``message`` field is not
            ``"success"``. Only the final ``doLogin`` rejection also
            marks the task as failed.
    """
    if self._login_cookies is not None:
        # Cached cookies: skip both login requests.
        self._request_headers.update(Cookie=self._login_cookies)
        self.parse_data_with_method(
            method=BaseSpiderParseMethodType.PersonalBlogs)
        return

    # Step 1: pre-check request.
    precheck = self.make_request_with_session(
        session=self._session,
        url=f"{self._login_main_url}/v1/api/riskControl/checkHVC",
        headers=self._request_headers,
        json=self._check_hvc_data,
        method="POST",
    )
    if precheck.status_code != 200:
        logger.error("登录 --> 登录预认证失败!")
        raise LoginException()
    if check_is_json(data=precheck.content.decode()) is not True:
        logger.error("登录 --> 登录预认证数据返回失败!")
        raise LoginException()
    precheck_body = precheck.json()
    if precheck_body["message"] != "success":
        logger.error(f"登录 --> 登录预认证失败!返回结果: {precheck_body}")
        raise LoginException()
    logger.debug(f"预认证返回结果: {precheck_body}")

    # Step 2: the actual login request.
    login_reply = self.make_request_with_session(
        session=self._session,
        url=f"{self._login_main_url}/v1/register/pc/login/doLogin",
        headers=self._request_headers,
        json=self._login_data,
        method="POST",
    )
    if login_reply.status_code != 200:
        logger.error("登录 --> 登录请求失败!")
        raise LoginException()
    if check_is_json(data=login_reply.content.decode()) is not True:
        logger.error("登录 --> 登录请求返回失败!")
        raise LoginException()
    login_body = login_reply.json()

    if login_body["message"] != "success":
        logger.error(f"登录 --> 登录异常!返回结果: {login_body}")
        self.update_task_status(task_id=self._task_id,
                                data=str(PROCESS_STATUS_FAIL))
        raise LoginException()

    logger.debug(f"登录返回结果: {login_body}")
    self._username = login_body["username"]
    logger.info(f"登录 --> 登录成功!当前用户名: {self._login_username}")

    # Persist the cookies and attach them to the request headers.
    cookie_pairs = login_reply.cookies.items()
    self._login_cookies = CookieUtils(cookie_list=cookie_pairs).to_str()
    self.set_cookies(spider_name=self._spider_name,
                     cookies=self._login_cookies)
    self._request_headers.update(Cookie=self._login_cookies)
    self.parse_data_with_method(
        method=BaseSpiderParseMethodType.LoginResult)
def _parse_login_data(self):
    """Scrape the profile page at ``self._user_url`` and store the result.

    The user name, description, avatar URL and two counters are read via
    XPath, stored through ``self.data_model.set_personal_data``, and the
    personal-blogs stage is started afterwards.

    Raises:
        LoginException: If the profile page does not return HTTP 200.
        ParseDataException: If an ``IndexError`` is raised inside the
            parsing block, e.g. an expected element is missing.
    """
    page = self.make_request(url=self._user_url,
                             headers=self._common_headers)
    if page.status_code != 200:
        logger.error("打开个人主页失败!")
        self.update_task_status(task_id=self._task_id,
                                data=str(PROCESS_STATUS_FAIL))
        raise LoginException()

    selector = etree.HTML(page.content.decode())

    def first_match(expression):
        # First XPath hit; raises IndexError when nothing matches.
        return selector.xpath(expression)[0]

    try:
        # Personal data.
        username = first_match(
            "//h2[@class='profile__heading--name']/text()").strip()
        description_parts = selector.xpath(
            "//div[@class='profile__desc']/p/text()")
        description = "".join(description_parts)
        avatar_img = first_match(
            "//div[@class='profile__heading--avatar-warp']/a/img/@src")
        followee = first_match(
            "//a[contains(@href, 'followed')]/span[@class='h5']/text()"
        ).replace(" 人", "")
        follower = first_match(
            "//a[contains(@href, 'following')]/span[@class='h5']/text()"
        ).replace(" 人", "")
        # TODO: liked articles are only available mixed into the personal
        # activity feed, so they are not parsed for now.
        like_blogs = 0

        personal_data: Dict = dict(
            username=username,
            description=description,
            avatarImg=avatar_img,
            followee=followee,
            follower=follower,
            likeBlogs=like_blogs,
        )
        logger.debug(personal_data)
        self.data_model.set_personal_data(data=personal_data)
        logger.info("获取个人主页数据成功!")
        self.parse_data_with_method(
            method=BaseSpiderParseMethodType.PersonalBlogs)
    except IndexError:
        logger.error("解析个人主页数据失败!")
        self.update_task_status(task_id=self._task_id,
                                data=str(PROCESS_STATUS_FAIL))
        raise ParseDataException()
def login(self):
    """Log in, or restore cached login parameters, then continue parsing.

    Without cached cookies a POST login request is sent. A non-empty
    response body counts as success: its JSON is kept in
    ``self._response_data``, the cookies are stored through
    ``set_cookies`` and the login-result stage runs.

    With cached cookies the value stored under ``"<spider_name>:params"``
    is read. If it yields ``uid``, ``token`` and ``device_id``, the
    personal-blogs stage runs directly; otherwise the login-result stage
    runs instead.

    Raises:
        LoginException: If the login response body is empty.
    """
    if self._login_cookies is None:
        login_url, login_data = self._check_username()
        response = self.make_request(
            url=login_url,
            headers=self._common_headers,
            method="POST",
            json=login_data,
        )
        if response.content.decode() == "":
            logger.error("登录失败!")
            raise LoginException()

        logger.info("登录成功!")
        self._response_data = response.json()
        self._login_cookies = CookieUtils(
            cookie_list=response.cookies.items()).to_str()
        logger.debug(self._login_cookies)
        self.set_cookies(spider_name=self._spider_name,
                         cookies=self._login_cookies)
        self.parse_data_with_method(
            method=BaseSpiderParseMethodType.LoginResult)
        return

    get_result = self.get_data(spider_name=f"{self._spider_name}:params")
    if get_result is None:
        self.parse_data_with_method(
            method=BaseSpiderParseMethodType.LoginResult)
        return

    try:
        # Parse "key=value" items by exact key. Substring matching could
        # pick the wrong item when a value happens to contain "uid" or
        # "token", and str.replace could strip text inside the value.
        # The first and last "&"-separated items are skipped, as before.
        login_params = {}
        for item in get_result.split("&")[1:-1]:
            key, separator, value = item.partition("=")
            if separator:
                login_params.setdefault(key, value)  # first one wins

        self._login_uid = login_params["uid"]
        self._login_token = login_params["token"]
        self._login_client_id = login_params["device_id"]
        # The blog stage stays inside the try block on purpose: any
        # failure there also falls back to the login-result stage.
        self.parse_data_with_method(
            method=BaseSpiderParseMethodType.PersonalBlogs)
    except Exception as err:
        logger.error(f"解析 Redis 返回数据失败! 错误原因: {err}")
        self.parse_data_with_method(
            method=BaseSpiderParseMethodType.LoginResult)
def login(self):
    """Log in with username and password, or reuse cached cookies.

    With cached cookies only the request headers are prepared and the
    personal-blogs stage runs. Otherwise a token is fetched, the login
    form is POSTed, and the main page is requested again to read the
    link to the user's profile. That link is saved in ``self._user_url``
    and through ``set_data`` before the login-result stage runs.

    Raises:
        LoginException: If no token is available, if the login request
            or the main-page request does not return HTTP 200, or if the
            profile link is not found. The task is marked as failed
            first.
    """

    def apply_request_headers():
        # SegmentFault decides the post-login redirect from the trailing
        # slash of the referer, hence the extra "/" appended below.
        self._common_headers.update({
            "cookie": self._cookies,
            "origin": self._main_url,
            "referer": self._main_url + "/",
            "x-requested-with": "XMLHttpRequest",
        })

    def abort_login():
        # Marks the task as failed; always raises.
        self.update_task_status(task_id=self._task_id,
                                data=str(PROCESS_STATUS_FAIL))
        raise LoginException()

    if self._cookies is not None:
        apply_request_headers()
        self.parse_data_with_method(
            method=BaseSpiderParseMethodType.PersonalBlogs)
        return

    token = self._get_token(url=self._main_url)
    if token is None:
        abort_login()

    login_url = f"{self._main_url}/api/user/login?_={token}"
    apply_request_headers()
    response = self.make_request(
        url=login_url,
        headers=self._common_headers,
        data={
            "remember": "1",
            "username": self._login_username,
            "password": self._login_password,
        },
        method="POST",
    )
    if response.status_code != 200:
        self._cookies = None
        abort_login()

    main_response = self.make_request(url=self._main_url,
                                      headers=self._common_headers)
    # Check the main-page response itself. Re-checking the login response
    # here would let a failed page fetch be parsed as if it had succeeded.
    if main_response.status_code != 200:
        abort_login()

    selector = etree.HTML(main_response.content.decode())
    user_href = selector.xpath(
        "//a[@class='avatar-* dropdownBtn user-avatar']/@href")
    if not user_href:
        logger.error("获取个人页面链接失败!")
        abort_login()

    logger.info("登录成功!")
    self._user_url = f"{self._main_url}{user_href[0]}"
    self.set_data(
        spider_name=f"{self._spider_name}:user_url",
        data=self._user_url,
    )
    self.parse_data_with_method(
        method=BaseSpiderParseMethodType.LoginResult)
def _parse_personal_blogs(self, next_params: Optional[int] = None):
    """Scrape the user's article list page by page and store the result.

    Each call downloads ``<user_url>/articles?page=<n>``, appends every
    listed article to ``self._blogs_data`` and, while a "next" pager
    element exists, calls itself for the following page. After the last
    page the data is handed to ``self.data_model`` and the finish stage
    is started.

    Args:
        next_params: 1-based page number; ``None`` means the first page.

    Raises:
        LoginException: If a page does not return HTTP 200.
        ParseDataException: If parsing a page, storing the data or the
            finish stage raises an unexpected exception.
    """
    if next_params is None:
        next_params = 1

    blogs_url: str = f"{self._user_url}/articles?page={next_params}"
    blogs_response = self.make_request(url=blogs_url,
                                       headers=self._common_headers)
    if blogs_response.status_code != 200:
        logger.error("获取个人博客数据失败!")
        self.update_task_status(task_id=self._task_id,
                                data=str(PROCESS_STATUS_FAIL))
        raise LoginException()

    selector = etree.HTML(blogs_response.content.decode())
    try:
        for blog in selector.xpath(
                "//ul[@class='profile-mine__content']/li"):
            # TODO: the view count requires opening each article page,
            # which is not supported yet; it is reported as 0.
            href_suffix = blog.xpath("div[@class='row']/div/a/@href")[0]
            blog_href = f"{self._main_url}{href_suffix}"
            blog_id = href_suffix.split("/")[-1]
            blog_title = blog.xpath("div[@class='row']/div/a/text()")[0]

            # Normalise the displayed publication time.
            time_str = blog.xpath(
                "div[@class='row']/div/span[@class='profile-mine__content--date']/text()"
            )[0].rstrip()
            blog_time = handle_different_time_str(time_str=time_str)

            blog_data: Dict = {
                "blogId": blog_id,
                "blogTitle": blog_title,
                "blogHref": blog_href,
                "blogViewers": 0,
                "blogCreateTime": blog_time,
            }
            self._blogs_data.append(blog_data)

        if selector.xpath("//li[@class='next']"):
            time.sleep(1.5)  # pause between consecutive page requests
            self._parse_personal_blogs(next_params=next_params + 1)
        else:
            logger.debug(self._blogs_data)
            self.data_model.set_personal_blogs_data(data=self._blogs_data)
            logger.info("获取个人博客数据成功!")
            # End of the task.
            self.parse_data_with_method(
                method=BaseSpiderParseMethodType.Finish)
    except (ParseDataException, LoginException):
        # Raised by a deeper page of this recursion, which has already
        # logged the error and marked the task as failed. Pass it through
        # so it is not reported again at every level or retyped.
        # NOTE(review): the finish stage may raise these too; confirm it
        # reports its own failures.
        raise
    except Exception:
        logger.error("解析个人博客数据异常!")
        self.update_task_status(task_id=self._task_id,
                                data=str(PROCESS_STATUS_FAIL))
        raise ParseDataException()
def login(self):
    """Log in with a signed, encrypted form, or reuse cached cookies.

    With cached cookies the cookie header is set, the stored
    ``"<spider_name>:token"`` value is loaded into
    ``self._login_user_url_token`` and the login-result stage runs.

    Otherwise, when ``self._init_login()`` succeeds, the login form is
    signed with HMAC-SHA1, encrypted by the bundled JavaScript routine
    and POSTed through ``self._session``. A reply containing ``user_id``
    fills ``self._login_cookies`` and ``self._login_user_info``. Finally
    the login-result stage runs if user info is available.

    Raises:
        LoginException: If the reply is not JSON, reports an error,
            lacks both ``user_id`` and ``error``, or if no user info is
            available afterwards.
    """
    if self._login_cookies is not None:
        # Only the shared headers receive the cookie; the session object
        # is not modified on this path.
        self._common_headers.update(Cookie=self._login_cookies)
        self._login_user_url_token = self.get_data(
            spider_name=f"{self._spider_name}:token")
        self.parse_data_with_method(
            method=BaseSpiderParseMethodType.LoginResult)
        return

    if self._init_login():
        grant_type = "password"
        client_id = "c3cef7c66a1843f8b3a9e6a1e3160e20"
        source = "com.zhihu.web"
        timestamp = str(int(time.time() * 1000))
        # The signature covers grant type, client id, source and the
        # millisecond timestamp, concatenated in this order.
        signature = hmac_encrypt_sha1(
            key=b"d1b964811afb40118a12068ff74a12f4",
            encrypt_str=f"{grant_type}{client_id}{source}{timestamp}",
        )
        form_fields: dict = {
            "client_id": client_id,
            "grant_type": grant_type,
            "source": source,
            "username": self._login_username,
            "password": self._login_password,
            "lang": "en",
            "ref_source": "other_https://www.zhihu.com/signin",
            "utm_source": "",
            "captcha": "",
            "timestamp": timestamp,
            "signature": signature,
        }
        encryptor = compile_js(js_str=zhihu_encrypt_js_code)
        encrypted_form = encryptor.call("encrypt", urlencode(form_fields))
        response = self.make_request_with_session(
            session=self._session,
            url=self._login_url,
            data=encrypted_form,
            headers=self._login_headers,
            method="POST",
        )

        if not check_is_json(data=response.content.decode()):
            logger.error("登录 --> 失败")
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise LoginException()

        reply = response.json()
        if reply.get("user_id"):
            logger.debug(reply)
            self._login_cookies = reply["cookie"]
            self._session.cookies.update(self._login_cookies)
            logger.info(f"登录 --> 登录成功!当前用户:{self._login_username}")
            user_info = {"username": self._login_username}
            user_info.update(reply)
            self._login_user_info = user_info
        elif reply.get("error"):
            failure = reply["error"]
            error_code = failure["code"]
            error_msg = failure["message"]
            if error_code == 100005:
                log_text = "登录 --> 用户名或密码错误!登录失败!"
            elif error_code == 120005:
                log_text = f"登录 --> 登录失败!错误信息:{error_code}"
            else:
                log_text = f"登录 --> 其他错误!错误信息:{error_msg}"
            logger.error(log_text)
            raise LoginException()
        else:
            logger.error("登录 --> 获取登录后的用户信息失败!登录失败!")
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise LoginException()

    # Also reached when _init_login() returned a falsy value.
    if self._login_user_info is None:
        logger.error("登录 --> 获取用户数据失败!")
        raise LoginException()
    self.parse_data_with_method(
        method=BaseSpiderParseMethodType.LoginResult)
def _parse_login_data(self):
    """Collect profile, relation and favourites data, then store it.

    Three requests are made against ``self._personal_main_url``:
    ``/api/user/show`` (POST) for description and avatar,
    ``/api/relation/get`` for the two relation counters, and
    ``/api/favorite/folderList`` for the favourites folders. IDs of
    non-empty folders are appended to ``self._personal_collection_ids``.
    The result goes to ``self.data_model.set_personal_data`` and the
    personal-blogs stage is started.

    Raises:
        LoginException: If the profile request does not return HTTP 200
            with a JSON body.
        ParseDataException: If the profile reply's ``message`` is not
            ``"成功"``, or if the relation or favourites request does
            not return HTTP 200 with a JSON body.
    """

    def abort(message, exc_type):
        # Log, mark the task as failed, then raise the given exception.
        logger.error(message)
        self.update_task_status(task_id=self._task_id,
                                data=str(PROCESS_STATUS_FAIL))
        raise exc_type()

    # --- profile ---------------------------------------------------------
    profile_reply = self.make_request(
        url=f"{self._personal_main_url}/api/user/show",
        headers=self._request_headers,
        json={"username": self._login_username},
        method="POST",
    )
    if not (profile_reply.status_code == 200
            and check_is_json(data=profile_reply.content.decode()) is True):
        abort("获取个人数据异常!", LoginException)
    profile_json = profile_reply.json()
    if profile_json["message"] != "成功":
        abort("获取个人数据失败!", ParseDataException)

    profile = profile_json["data"]
    personal_data: Dict = {
        "username": self._username,
        "description": profile["selfdesc"],
        "avatarImg": profile["avatarurl"],
        "followee": 0,
        "follower": 0,
        "likeBlogs": 0,
    }

    # --- relation counters -----------------------------------------------
    relation_reply = self.make_request(
        url=f"{self._personal_main_url}/api/relation/get?username={self._username}",
        headers=self._request_headers,
    )
    if not (relation_reply.status_code == 200
            and check_is_json(data=relation_reply.content.decode()) is True):
        abort("获取个人关注数据异常!", ParseDataException)
    relation_json = relation_reply.json()
    if relation_json["message"] == "成功":
        counters = relation_json["data"]
        personal_data["followee"] = counters["fans_num"]
        personal_data["follower"] = counters["follow_num"]

    # --- favourites folders ----------------------------------------------
    favorites_reply = self.make_request(
        url=f"{self._personal_main_url}/api/favorite/folderList",
        headers=self._request_headers,
    )
    if not (favorites_reply.status_code == 200
            and check_is_json(favorites_reply.content.decode()) is True):
        abort("获取个人收藏数据异常!", ParseDataException)
    favorites_json = favorites_reply.json()
    if favorites_json["message"] == "成功":
        total_favorites = 0
        for folder in favorites_json["data"]["result"]:
            if folder["FavoriteNum"] == 0:
                continue  # empty folders are neither counted nor recorded
            total_favorites += folder["FavoriteNum"]
            self._personal_collection_ids.append(folder["ID"])
        personal_data["likeBlogs"] = total_favorites

    # --- persist ---------------------------------------------------------
    logger.debug(personal_data)
    self.data_model.set_personal_data(data=personal_data)
    self.parse_data_with_method(
        method=BaseSpiderParseMethodType.PersonalBlogs)