Beispiel #1
0
    def _parse_personal_blogs(self, next_params: Optional[str] = None):
        """Fetch the logged-in user's posts page by page and store them.

        Requests ``self._blogs_url`` with the login credentials as query
        parameters, appends one summary dict per returned entry to
        ``self._blogs_data`` and recurses with the last entry's
        ``verifyCreatedAt`` as the ``before`` cursor while the reported
        total exceeds the page size. When paging stops, the collected list
        is passed to ``self.data_model.set_personal_blogs_data``.

        Args:
            next_params: Cursor sent as the ``before`` query parameter;
                ``None`` requests the first page.

        Raises:
            LoginException: If the response body is empty.
        """
        page_size: int = 20
        req_data: dict = {
            "src": "web",
            "uid": self._login_uid,
            "device_id": self._login_client_id,
            "token": self._login_token,
            "targetUid": self._login_uid,
            "type": "post",
            "limit": str(page_size),
            "order": "createdAt",
        }
        if next_params is not None:
            req_data["before"] = next_params

        # Values are interpolated verbatim (no percent-encoding).
        url_params: str = "?" + "&".join(
            f"{key}={value}" for key, value in req_data.items())
        blogs_url: str = f"{self._blogs_url}{url_params}"
        response = self.make_request(url=blogs_url,
                                     headers=self._common_headers)

        if response.content.decode() == "":
            logger.error("查询个人博客失败!")
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise LoginException()

        self._response_data = response.json()
        if self._response_data is None or self._response_data["m"] != "ok":
            # NOTE(review): a reply whose "m" field is not "ok" is ignored
            # without logging or raising -- confirm this is intended.
            return

        next_page_variable = None
        entry_list = self._response_data["d"]["entrylist"]
        for personal_blog in entry_list:
            blog_create_time = datetime_str_change_fmt(
                time_str=personal_blog["createdAt"],
                prev_fmt="%Y-%m-%dT%H:%M:%S.%fZ",
            )
            blog_data: Dict = {
                "blogId": personal_blog["objectId"],
                "blogTitle": personal_blog["title"],
                "blogHref": personal_blog["originalUrl"],
                "blogViewers": personal_blog["viewsCount"],
                "blogCreateTime": blog_create_time,
            }
            self._blogs_data.append(blog_data)
            # The last entry seen becomes the cursor for the next page.
            next_page_variable = personal_blog["verifyCreatedAt"]

        # Only keep paging while a cursor exists: recursing with a None
        # cursor would re-request the first page and never terminate.
        has_more = (self._response_data["d"]["total"] > page_size
                    and next_page_variable is not None)
        if has_more:
            time.sleep(0.5)
            self._parse_personal_blogs(next_params=next_page_variable)
        else:
            logger.debug(self._blogs_data)
            self.data_model.set_personal_blogs_data(data=self._blogs_data)
            logger.info("获取个人博客数据成功!")
Beispiel #2
0
    def login(self):
        """Log in through the pre-auth and login endpoints, or reuse cookies.

        When ``self._login_cookies`` is already set it is written to the
        ``Cookie`` request header and the personal-blogs step is started.
        Otherwise ``self._check_hvc_data`` is POSTed to the ``checkHVC``
        endpoint, followed by ``self._login_data`` to the ``doLogin``
        endpoint. On success the returned username is stored, the response
        cookies are serialised and persisted via ``set_cookies``, and the
        login-result step is started.

        Raises:
            LoginException: If either request returns a non-200 status or
                a non-JSON body, or if either JSON ``message`` is not
                ``"success"``.
        """
        # Cached cookies: skip the login round-trips entirely.
        if self._login_cookies is not None:
            self._request_headers.update(Cookie=self._login_cookies)
            self.parse_data_with_method(
                method=BaseSpiderParseMethodType.PersonalBlogs)
            return

        # Step 1: pre-authentication request.
        pre_auth_resp = self.make_request_with_session(
            session=self._session,
            url=f"{self._login_main_url}/v1/api/riskControl/checkHVC",
            headers=self._request_headers,
            json=self._check_hvc_data,
            method="POST",
        )
        if pre_auth_resp.status_code != 200:
            logger.error("登录 --> 登录预认证失败!")
            raise LoginException()
        if check_is_json(data=pre_auth_resp.content.decode()) is not True:
            logger.error("登录 --> 登录预认证数据返回失败!")
            raise LoginException()

        hvc_json_response = pre_auth_resp.json()
        if hvc_json_response["message"] != "success":
            logger.error(f"登录 --> 登录预认证失败!返回结果: {hvc_json_response}")
            raise LoginException()
        logger.debug(f"预认证返回结果: {hvc_json_response}")

        # Step 2: the actual login request.
        auth_resp = self.make_request_with_session(
            session=self._session,
            url=f"{self._login_main_url}/v1/register/pc/login/doLogin",
            headers=self._request_headers,
            json=self._login_data,
            method="POST",
        )
        if auth_resp.status_code != 200:
            logger.error("登录 --> 登录请求失败!")
            raise LoginException()
        if check_is_json(data=auth_resp.content.decode()) is not True:
            logger.error("登录 --> 登录请求返回失败!")
            raise LoginException()

        login_json_response = auth_resp.json()
        if login_json_response["message"] != "success":
            # Only this failure path also marks the task as failed.
            logger.error(f"登录 --> 登录异常!返回结果: {login_json_response}")
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise LoginException()

        logger.debug(f"登录返回结果: {login_json_response}")
        self._username = login_json_response["username"]
        logger.info(f"登录 --> 登录成功!当前用户名: {self._login_username}")

        # Serialise the response cookies, persist them, and reuse them as
        # the Cookie header for subsequent requests.
        cookie_pairs = auth_resp.cookies.items()
        self._login_cookies = CookieUtils(cookie_list=cookie_pairs).to_str()
        self.set_cookies(spider_name=self._spider_name,
                         cookies=self._login_cookies)
        self._request_headers.update(Cookie=self._login_cookies)
        self.parse_data_with_method(
            method=BaseSpiderParseMethodType.LoginResult)
Beispiel #3
0
    def _parse_login_data(self):
        """Scrape the profile page at ``self._user_url`` and store the result.

        Extracts the user name, description, avatar URL and the two
        follow counters via XPath, hands them to
        ``self.data_model.set_personal_data`` and then starts the
        personal-blogs step.

        Raises:
            LoginException: If the profile page does not return HTTP 200.
            ParseDataException: If an ``IndexError`` is raised inside the
                guarded section (e.g. an expected element is missing).
        """
        profile_resp = self.make_request(url=self._user_url,
                                         headers=self._common_headers)
        if profile_resp.status_code != 200:
            logger.error("打开个人主页失败!")
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise LoginException()

        page = etree.HTML(profile_resp.content.decode())

        def first_match(expression: str):
            # Raises IndexError when the expression matches nothing.
            return page.xpath(expression)[0]

        try:
            # Personal data. The liked-articles count stays 0 for now.
            # TODO: liked articles are only available inside the mixed
            # personal activity feed, so they are not parsed yet.
            personal_data: Dict = {
                "username": first_match(
                    "//h2[@class='profile__heading--name']/text()").strip(),
                "description": "".join(
                    page.xpath("//div[@class='profile__desc']/p/text()")),
                "avatarImg": first_match(
                    "//div[@class='profile__heading--avatar-warp']/a/img/@src"
                ),
                "followee": first_match(
                    "//a[contains(@href, 'followed')]/span[@class='h5']/text()"
                ).replace(" 人", ""),
                "follower": first_match(
                    "//a[contains(@href, 'following')]/span[@class='h5']/text()"
                ).replace(" 人", ""),
                "likeBlogs": 0,
            }
            logger.debug(personal_data)
            self.data_model.set_personal_data(data=personal_data)
            logger.info("获取个人主页数据成功!")
            self.parse_data_with_method(
                method=BaseSpiderParseMethodType.PersonalBlogs)
        except IndexError:
            logger.error("解析个人主页数据失败!")
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise ParseDataException()
Beispiel #4
0
 def login(self):
     """Log in with a POST request, or resume from cached login parameters.

     Without cached cookies: POSTs the URL/payload pair returned by
     ``self._check_username()``, stores the JSON reply and the serialised
     response cookies, and starts the login-result step.

     With cached cookies: reads ``"<spider_name>:params"`` via
     ``get_data``. If nothing is stored, or the stored string cannot be
     parsed, the login-result step is started; otherwise uid, token and
     device id are extracted from it and the personal-blogs step is
     started.

     Raises:
         LoginException: If the login response body is empty.
     """
     if self._login_cookies is None:
         login_url, login_data = self._check_username()
         response = self.make_request(
             url=login_url,
             headers=self._common_headers,
             method="POST",
             json=login_data,
         )
         if response.content.decode() == "":
             logger.error("登录失败!")
             raise LoginException()

         logger.info("登录成功!")
         self._response_data = response.json()
         cookie_pairs = response.cookies.items()
         self._login_cookies = CookieUtils(cookie_list=cookie_pairs).to_str()
         logger.debug(self._login_cookies)
         self.set_cookies(spider_name=self._spider_name,
                          cookies=self._login_cookies)
         self.parse_data_with_method(
             method=BaseSpiderParseMethodType.LoginResult)
         return

     cached_params: str = self.get_data(
         spider_name=f"{self._spider_name}:params")
     if cached_params is None:
         self.parse_data_with_method(
             method=BaseSpiderParseMethodType.LoginResult)
         return

     try:
         # The first and last "&"-separated segments are discarded.
         login_params = cached_params.split("&")[1:-1]

         def value_of(key: str) -> str:
             # First segment containing ``key``, with "key=" stripped;
             # raises IndexError when no segment matches.
             matches = [segment for segment in login_params
                        if key in segment]
             return matches[0].replace(f"{key}=", "")

         self._login_uid = value_of("uid")
         self._login_token = value_of("token")
         self._login_client_id = value_of("device_id")
         self.parse_data_with_method(
             method=BaseSpiderParseMethodType.PersonalBlogs)
     except Exception as err:
         # Any failure falls back to the login-result step.
         logger.error(f"解析 Redis 返回数据失败! 错误原因: {err}")
         self.parse_data_with_method(
             method=BaseSpiderParseMethodType.LoginResult)
Beispiel #5
0
    def login(self):
        """Log in with username/password, or reuse cached cookies.

        With cached cookies (``self._cookies`` set): installs the request
        headers and starts the personal-blogs step.

        Without: obtains a token via ``self._get_token``, POSTs the
        credentials to ``/api/user/login``, then fetches the main page
        and reads the user's profile link from it. The profile URL is
        stored in ``self._user_url``, persisted via ``set_data`` and the
        login-result step is started.

        Raises:
            LoginException: If no token is returned, if the login POST or
                the main-page request does not return HTTP 200, or if the
                profile link cannot be found on the main page.
        """

        def apply_request_headers() -> None:
            # SegmentFault decides the post-login redirect from the
            # referer's trailing slash, hence the extra "/" appended.
            self._common_headers.update({
                "cookie": self._cookies,
                "origin": self._main_url,
                "referer": self._main_url + "/",
                "x-requested-with": "XMLHttpRequest",
            })

        if self._cookies is not None:
            apply_request_headers()
            self.parse_data_with_method(
                method=BaseSpiderParseMethodType.PersonalBlogs)
            return

        token = self._get_token(url=self._main_url)
        if token is None:
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise LoginException()
        login_params: str = f"_={token}"
        login_url = f"{self._main_url}/api/user/login?{login_params}"

        apply_request_headers()
        response = self.make_request(
            url=login_url,
            headers=self._common_headers,
            data={
                "remember": "1",
                "username": self._login_username,
                "password": self._login_password,
            },
            method="POST",
        )
        if response.status_code != 200:
            self._cookies = None
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise LoginException()

        main_response = self.make_request(url=self._main_url,
                                          headers=self._common_headers)
        # Check the main-page response itself; re-checking the login
        # response here would let a failed page fetch slip through.
        if main_response.status_code != 200:
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise LoginException()

        selector = etree.HTML(main_response.content.decode())
        user_href = selector.xpath(
            "//a[@class='avatar-* dropdownBtn user-avatar']/@href")
        if len(user_href) == 0:
            logger.error("获取个人页面链接失败!")
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise LoginException()

        logger.info("登录成功!")
        self._user_url = f"{self._main_url}{user_href[0]}"
        self.set_data(
            spider_name=f"{self._spider_name}:user_url",
            data=self._user_url,
        )
        self.parse_data_with_method(
            method=BaseSpiderParseMethodType.LoginResult)
Beispiel #6
0
    def _parse_personal_blogs(self, next_params: Optional[int] = None):
        """Scrape every page of the user's article list and store the result.

        Starting at page ``next_params`` (page 1 when ``None``), requests
        ``<user_url>/articles?page=N``, appends one summary dict per list
        item to ``self._blogs_data`` and moves on to the next page while
        the page contains a ``li.next`` element. After the last page the
        collected list is passed to
        ``self.data_model.set_personal_blogs_data`` and the finish step
        is started exactly once.

        Pages are fetched iteratively rather than recursively, so the
        finish step is not repeated for every page and a failure on a
        later page is reported once with its own exception type.

        Args:
            next_params: 1-based page number to start from.

        Raises:
            LoginException: If a page request does not return HTTP 200.
            ParseDataException: If extracting a page, storing the result
                or running the finish step raises.
        """
        page: int = 1 if next_params is None else next_params

        while True:
            blogs_url: str = f"{self._user_url}/articles?page={page}"
            blogs_response = self.make_request(url=blogs_url,
                                               headers=self._common_headers)
            if blogs_response.status_code != 200:
                logger.error("获取个人博客数据失败!")
                self.update_task_status(task_id=self._task_id,
                                        data=str(PROCESS_STATUS_FAIL))
                raise LoginException()

            selector = etree.HTML(blogs_response.content.decode())
            try:
                for blog in selector.xpath(
                        "//ul[@class='profile-mine__content']/li"):
                    # TODO: the view count requires opening each article
                    # page, which is not supported yet; 0 is stored.
                    href_suffix = blog.xpath(
                        "div[@class='row']/div/a/@href")[0]
                    blog_title = blog.xpath(
                        "div[@class='row']/div/a/text()")[0]
                    # Normalise the displayed publication time.
                    time_str = blog.xpath(
                        "div[@class='row']/div/span[@class='profile-mine__content--date']/text()"
                    )[0].rstrip()

                    blog_data: Dict = {
                        "blogId": href_suffix.split("/")[-1],
                        "blogTitle": blog_title,
                        "blogHref": f"{self._main_url}{href_suffix}",
                        "blogViewers": 0,
                        "blogCreateTime": handle_different_time_str(
                            time_str=time_str),
                    }
                    self._blogs_data.append(blog_data)

                if not selector.xpath("//li[@class='next']"):
                    # Last page: publish the result and end the task.
                    logger.debug(self._blogs_data)
                    self.data_model.set_personal_blogs_data(
                        data=self._blogs_data)
                    logger.info("获取个人博客数据成功!")
                    self.parse_data_with_method(
                        method=BaseSpiderParseMethodType.Finish)
                    return
            except Exception as err:
                logger.error("解析个人博客数据异常!")
                self.update_task_status(task_id=self._task_id,
                                        data=str(PROCESS_STATUS_FAIL))
                raise ParseDataException() from err

            # Pause between page requests.
            time.sleep(1.5)
            page += 1
Beispiel #7
0
    def login(self):
        """Log in with a signed, encrypted form POST, or reuse cached cookies.

        With cached cookies (``self._login_cookies`` set): writes them to
        the ``Cookie`` header, loads the stored ``"<spider_name>:token"``
        value into ``self._login_user_url_token`` and starts the
        login-result step.

        Without: runs ``self._init_login()``, builds the login form with
        an HMAC-SHA1 signature over grant type, client id, source and a
        millisecond timestamp, encrypts the url-encoded form through the
        compiled JS ``encrypt`` function and POSTs it to
        ``self._login_url``. On success the returned cookies are applied
        to the session, ``self._login_user_info`` is filled from the
        response and the login-result step is started.

        Raises:
            LoginException: If ``_init_login`` fails, the response is not
                JSON, the response carries an ``error`` object, or no
                user info is available after the request.
        """
        if self._login_cookies is not None:
            self._common_headers.update(Cookie=self._login_cookies)
            self._login_user_url_token = self.get_data(
                spider_name=f"{self._spider_name}:token")
            self.parse_data_with_method(
                method=BaseSpiderParseMethodType.LoginResult)
            return

        if not self._init_login():
            logger.error("登录 --> 失败")
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise LoginException()

        grant_type: str = "password"
        client_id: str = "c3cef7c66a1843f8b3a9e6a1e3160e20"
        source: str = "com.zhihu.web"
        timestamp: str = str(int(time.time() * 1000))
        signature: str = hmac_encrypt_sha1(
            key=b"d1b964811afb40118a12068ff74a12f4",
            encrypt_str=f"{grant_type}{client_id}{source}{timestamp}",
        )
        post_data: dict = {
            "client_id": client_id,
            "grant_type": grant_type,
            "source": source,
            "username": self._login_username,
            "password": self._login_password,
            "lang": "en",
            "ref_source": "other_https://www.zhihu.com/signin",
            "utm_source": "",
            "captcha": "",
            "timestamp": timestamp,
            "signature": signature,
        }
        # The form is url-encoded, then encrypted by the site's JS routine.
        js_code = compile_js(js_str=zhihu_encrypt_js_code)
        data = js_code.call("encrypt", urlencode(post_data))
        response = self.make_request_with_session(
            session=self._session,
            url=self._login_url,
            data=data,
            headers=self._login_headers,
            method="POST",
        )
        if not check_is_json(data=response.content.decode()):
            logger.error("登录 --> 获取登录后的用户信息失败!登录失败!")
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise LoginException()

        json_response = response.json()
        if json_response.get("user_id"):
            logger.debug(json_response)
            self._login_cookies = json_response["cookie"]
            self._session.cookies.update(self._login_cookies)
            logger.info(f"登录 --> 登录成功!当前用户:{self._login_username}")
            self._login_user_info = {"username": self._login_username}
            self._login_user_info.update(json_response)
        elif json_response.get("error"):
            error_code: int = json_response["error"]["code"]
            error_msg: str = json_response["error"]["message"]
            if error_code == 100005:
                logger.error("登录 --> 用户名或密码错误!登录失败!")
            elif error_code == 120005:
                # Log the server's message; the code is already known
                # in this branch and carries no extra information.
                logger.error(f"登录 --> 登录失败!错误信息:{error_msg}")
            else:
                logger.error(f"登录 --> 其他错误!错误信息:{error_msg}")
            raise LoginException()

        # A reply with neither "user_id" nor "error" leaves the user info
        # untouched and is rejected below if none is set.
        if self._login_user_info is None:
            logger.error("登录 --> 获取用户数据失败!")
            raise LoginException()
        self.parse_data_with_method(
            method=BaseSpiderParseMethodType.LoginResult)
Beispiel #8
0
    def _parse_login_data(self):
        """Collect profile, relation and favorites data and store the result.

        Issues three requests against ``self._personal_main_url``:
        ``/api/user/show`` (POST) for description and avatar,
        ``/api/relation/get`` for the two relation counters, and
        ``/api/favorite/folderList`` for the favorites folders. Folder
        ids with a non-zero ``FavoriteNum`` are appended to
        ``self._personal_collection_ids`` and their counts summed. The
        assembled dict is passed to ``self.data_model.set_personal_data``
        and the personal-blogs step is started.

        Raises:
            LoginException: If the profile request returns a non-200
                status or a non-JSON body.
            ParseDataException: If the profile ``message`` is not "成功",
                or the relation or favorites request returns a non-200
                status or a non-JSON body.
        """
        success_message = "成功"

        # --- Profile ---
        profile_resp = self.make_request(
            url=f"{self._personal_main_url}/api/user/show",
            headers=self._request_headers,
            json={"username": self._login_username},
            method="POST",
        )
        profile_ok = (
            profile_resp.status_code == 200
            and check_is_json(data=profile_resp.content.decode()) is True)
        if not profile_ok:
            logger.error("获取个人数据异常!")
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise LoginException()

        profile_json = profile_resp.json()
        if profile_json["message"] != success_message:
            logger.error("获取个人数据失败!")
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise ParseDataException()

        profile = profile_json["data"]
        personal_data: Dict = {
            "username": self._username,
            "description": profile["selfdesc"],
            "avatarImg": profile["avatarurl"],
            "followee": 0,
            "follower": 0,
            "likeBlogs": 0,
        }

        # --- Relation counters ---
        relation_url: str = f"{self._personal_main_url}/api/relation/get?username={self._username}"
        relation_resp = self.make_request(url=relation_url,
                                          headers=self._request_headers)
        relation_ok = (
            relation_resp.status_code == 200
            and check_is_json(data=relation_resp.content.decode()) is True)
        if not relation_ok:
            logger.error("获取个人关注数据异常!")
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise ParseDataException()

        relation_json = relation_resp.json()
        if relation_json["message"] == success_message:
            # NOTE(review): "followee" is filled from fans_num and
            # "follower" from follow_num -- confirm this mapping.
            counters = relation_json["data"]
            personal_data["followee"] = counters["fans_num"]
            personal_data["follower"] = counters["follow_num"]

        # --- Favorites folders ---
        favorites_resp = self.make_request(
            url=f"{self._personal_main_url}/api/favorite/folderList",
            headers=self._request_headers,
        )
        favorites_ok = (
            favorites_resp.status_code == 200
            and check_is_json(favorites_resp.content.decode()) is True)
        if not favorites_ok:
            logger.error("获取个人收藏数据异常!")
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise ParseDataException()

        favorites_json = favorites_resp.json()
        if favorites_json["message"] == success_message:
            favorites_total = 0
            for folder in favorites_json["data"]["result"]:
                favorite_num = folder["FavoriteNum"]
                if favorite_num == 0:
                    # Empty folders are neither counted nor remembered.
                    continue
                favorites_total += favorite_num
                self._personal_collection_ids.append(folder["ID"])
            personal_data["likeBlogs"] = favorites_total

        # Persist the result and move on to the blog list.
        logger.debug(personal_data)
        self.data_model.set_personal_data(data=personal_data)
        self.parse_data_with_method(
            method=BaseSpiderParseMethodType.PersonalBlogs)