Esempio n. 1
0
    def login(self):
        """
        Log in, or reuse the cookies this spider already holds.

        With cached cookies the ``Cookie`` request header is set and the
        personal-blogs parse step is started directly. Without them a
        pre-authentication request (``checkHVC``) is sent first, followed by
        the actual login request (``doLogin``); on success the returned
        username and the response cookies are stored and the login-result
        parse step is started.

        :raises LoginException: if either request returns a non-200 status,
            a non-JSON body, or a ``message`` other than ``"success"``
        """
        # Fast path: cookies are already available, no handshake needed.
        if self._login_cookies is not None:
            self._request_headers.update(Cookie=self._login_cookies)
            self.parse_data_with_method(
                method=BaseSpiderParseMethodType.PersonalBlogs)
            return

        # Step 1: pre-authentication (risk-control check).
        pre_auth = self.make_request_with_session(
            session=self._session,
            url=f"{self._login_main_url}/v1/api/riskControl/checkHVC",
            headers=self._request_headers,
            json=self._check_hvc_data,
            method="POST",
        )
        if pre_auth.status_code != 200:
            logger.error("登录 --> 登录预认证失败!")
            raise LoginException()
        if check_is_json(data=pre_auth.content.decode()) is not True:
            logger.error("登录 --> 登录预认证数据返回失败!")
            raise LoginException()

        pre_auth_payload = pre_auth.json()
        if pre_auth_payload["message"] != "success":
            logger.error(f"登录 --> 登录预认证失败!返回结果: {pre_auth_payload}")
            raise LoginException()
        logger.debug(f"预认证返回结果: {pre_auth_payload}")

        # Step 2: the login request itself.
        sign_in = self.make_request_with_session(
            session=self._session,
            url=f"{self._login_main_url}/v1/register/pc/login/doLogin",
            headers=self._request_headers,
            json=self._login_data,
            method="POST",
        )
        if sign_in.status_code != 200:
            logger.error("登录 --> 登录请求失败!")
            raise LoginException()
        if check_is_json(data=sign_in.content.decode()) is not True:
            logger.error("登录 --> 登录请求返回失败!")
            raise LoginException()

        sign_in_payload = sign_in.json()
        if sign_in_payload["message"] != "success":
            # Only this failure path marks the task as failed.
            logger.error(f"登录 --> 登录异常!返回结果: {sign_in_payload}")
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise LoginException()

        logger.debug(f"登录返回结果: {sign_in_payload}")
        self._username = sign_in_payload["username"]
        logger.info(f"登录 --> 登录成功!当前用户名: {self._login_username}")

        # Persist the session cookies and reuse them for later requests.
        cookie_pairs = sign_in.cookies.items()
        self._login_cookies = CookieUtils(cookie_list=cookie_pairs).to_str()
        self.set_cookies(spider_name=self._spider_name,
                         cookies=self._login_cookies)
        self._request_headers.update(Cookie=self._login_cookies)
        self.parse_data_with_method(
            method=BaseSpiderParseMethodType.LoginResult)
Esempio n. 2
0
    def _test_cookies(self, cookies: Optional[str] = None) -> bool:
        """
        Check whether a set of Zhihu cookies still belongs to a logged-in user.

        The ``/api/v4/me`` endpoint is requested with the given cookies (or
        the spider's stored login cookies when none are passed).

        :param cookies: cookies to test; a ``dict`` is serialised through
            ``CookieUtils``, a ``str`` is sent as-is
        :return: ``True`` when the response is JSON without an ``error``
            entry, ``False`` otherwise
        """
        probe_url: str = f"{self._main_url}/api/v4/me?include=visits_count"
        probe_headers: Dict = self.get_default_headers()

        stored_cookies = self._login_cookies
        candidate = stored_cookies if cookies is None else cookies

        if isinstance(candidate, dict):
            serialised = CookieUtils(cookie_list=candidate.items()).to_str()
            probe_headers.update(Cookie=serialised)
        elif isinstance(candidate, str):
            probe_headers.update(Cookie=candidate)

        probe = self.make_request(url=probe_url, headers=probe_headers)
        response_usable = (probe.status_code == 200
                           and check_is_json(probe.content.decode()) is True)
        if not response_usable:
            logger.error("当前知乎登录状态: 已退出!")
            # The scheduled job for this spider is removed on this path.
            self._async_task.remove_async_scheduler(job_id=self._spider_name)
            return False

        profile = probe.json()
        if profile.get("error"):
            logger.error("当前知乎账号登录状态: 已退出!")
            return False

        logger.info(
            f"当前知乎账号为: {self._login_username} 用户 ID: {profile['id']}, 状态: 已登录"
        )
        return True
Esempio n. 3
0
                def inner_spider(c_id: int, page_no: int = 1):
                    """
                    Collect the articles of one favourites folder, page by page.

                    Each article found is appended to ``self._like_blogs_data``.
                    The function calls itself for the next page until a page
                    comes back with an empty list (or the response ``message``
                    is not the success marker).

                    :param c_id: ID of the favourites folder to crawl
                    :param page_no: page number to request
                    :raises ParseDataException: on a non-200 status or a
                        non-JSON response body
                    """
                    like_blogs_url: str = (
                        f"{self._personal_main_url}/api/favorite/listByFolder?"
                        f"folderID={c_id}&page={page_no}&pageSize=10&sources=")
                    inner_response = self.make_request(
                        url=like_blogs_url, headers=self._request_headers)
                    if (inner_response.status_code != 200 or check_is_json(
                            data=inner_response.content.decode()) is not True):
                        logger.error("获取个人收藏夹数据失败!")
                        raise ParseDataException

                    inner_json_response = inner_response.json()
                    if inner_json_response["message"] == "成功":
                        collection_data_list = inner_json_response["data"][
                            "list"]
                        # Truthiness also covers a null list in the payload.
                        if collection_data_list:
                            # Log the folder actually being crawled (the
                            # parameter), not a name from the enclosing scope.
                            logger.info(
                                f"正在获取收藏夹: {c_id} 的第 {page_no} 的数据!")
                            for collection_data in collection_data_list:
                                blog_data: Dict = {
                                    "blogId": collection_data["ID"],
                                    "blogTitle": collection_data["Title"],
                                    "blogHref": collection_data["URL"],
                                    # This endpoint's view count is not read;
                                    # a constant 0 is stored instead.
                                    "blogViewers": 0,
                                    "blogCreateTime":
                                    collection_data["DatelineString"],
                                }
                                self._like_blogs_data.append(blog_data)

                            # Request the next page after a short pause.
                            time.sleep(0.5)
                            inner_spider(c_id=c_id, page_no=page_no + 1)
Esempio n. 4
0
                def inner_spider(c_id: int,
                                 next_url: Optional[str] = None) -> bool:
                    """
                    Crawl all items of one Zhihu favourites list.

                    Every item is appended to ``self._blogs_collection_data``.
                    Pages are followed through ``paging.next`` until
                    ``paging.is_end`` is ``True``.

                    :param c_id: ID of the favourites list
                    :param next_url: URL of the page to start from; the first
                        page is requested when omitted
                    :return: ``True`` once the last page was processed,
                        ``False`` if a response body is not JSON
                    """
                    req_params: str = (
                        "data[*].created,content.comment_count,"
                        "suggest_edit,is_normal,thumbnail_extra_info,"
                        "thumbnail,description,content,voteup_count,"
                        "created,updated,upvoted_followees,voting,"
                        "review_info,is_labeled,label_info,"
                        "relationship.is_authorized,voting,is_author,"
                        "is_thanked,is_nothelp,is_recognized;"
                        "data[*].author.badge[?(type=best_answerer)].topics")

                    while True:
                        if next_url is None:
                            page_url: str = (
                                f"{self._main_url}/api/v4/favlists/"
                                f"{c_id}/items?include={req_params}"
                                f"&offset=0&limit=20")
                        else:
                            page_url = next_url

                        page_response = self.make_request(
                            url=page_url, headers=self.get_default_headers())
                        if not check_is_json(
                                data=page_response.content.decode()):
                            logger.error("解析个人收藏博客数据失败!")
                            return False

                        page_json = page_response.json()
                        for item in page_json["data"]:
                            created_at = datetime_str_change_fmt(
                                time_str=item["created"],
                                prev_fmt="%Y-%m-%dT%H:%M:%SZ")
                            entry = item["content"]

                            entry_id = entry["id"]
                            entry_href = entry["url"]
                            # For "answer" entries the title and link are
                            # taken from the question they belong to.
                            if entry["type"] == "answer":
                                entry_title = entry["question"]["title"]
                                entry_href = entry["question"]["url"]
                            else:
                                entry_title = entry["title"]

                            # Pack the record in the common blog shape.
                            self._blogs_collection_data.append({
                                "blogId": entry_id,
                                "blogTitle": entry_title,
                                "blogHref": entry_href,
                                "blogViewers": entry["voteup_count"],
                                "blogCreateTime": created_at,
                            })

                        if page_json["paging"]["is_end"] is True:
                            logger.info(f"收藏夹 ID: {c_id} 数据爬取完毕!")
                            return True

                        # Pause between pages, then follow the next link.
                        time.sleep(1.5)
                        next_url = page_json["paging"]["next"]
Esempio n. 5
0
    def _test_cookies(self, cookies: Optional[str] = None) -> bool:
        """
        Check whether the stored Juejin login is still valid.

        The query string saved under ``"<spider_name>:params"`` is appended
        to the ``getUserInfo`` endpoint and the answer is inspected.

        :param cookies: accepted for interface compatibility; not read here
        :return: ``True`` when the response JSON has ``s == 1``; ``False``
            when no query string is stored, the request fails, or ``s``
            has another value
        """
        stored_query = self.get_data(spider_name=f"{self._spider_name}:params")
        if stored_query is None:
            return False

        probe_url: str = (
            f"https://user-storage-api-ms.juejin.im/v1/getUserInfo{stored_query}"
        )
        probe_headers: Dict = self.get_default_headers()
        probe = self.make_request(url=probe_url, headers=probe_headers)

        response_usable = (probe.status_code == 200
                           and check_is_json(probe.content.decode()) is True)
        if not response_usable:
            logger.error("当前掘金账号登录状态: 已退出!")
            # The scheduled job for this spider is removed on this path.
            self._async_task.remove_async_scheduler(job_id=self._spider_name)
            return False

        if probe.json()["s"] != 1:
            logger.error("当前掘金账号登录状态: 已退出!")
            return False

        logger.info(f"当前掘金账号为: {self._login_username}, 状态: 已登录")
        return True
Esempio n. 6
0
    def _parse_personal_blogs(self, next_params: Optional[str] = None):
        """
        Collect the logged-in user's Zhihu articles, following pagination.

        Each article is appended to ``self._blogs_data``; once the API
        reports the last page the collected list is handed to the data model.

        :param next_params: full URL of the page to fetch; the first page is
            requested when omitted
        :raises ParseDataException: if a response body is not JSON
        """
        include_params: str = (
            "data[*].comment_count,suggest_edit,"
            "is_normal,thumbnail_extra_info,"
            "thumbnail,can_comment,comment_permission,"
            "admin_closed_comment,content,voteup_count,"
            "created,updated,upvoted_followees,voting,"
            "review_info,is_labeled,label_info;"
            "data[*].author.badge[?(type=best_answerer)].topics")

        if next_params is not None:
            self._blogs_url = next_params
        else:
            self._blogs_url = (
                f"{self._main_url}/api/v4/members/"
                f"{self._login_user_url_token}/articles?include={include_params}"
                f"&offset=0&limit=20&sort_by=created")

        response = self.make_request(url=self._blogs_url,
                                     headers=self.get_default_headers())
        if not check_is_json(response.content.decode()):
            logger.error("获取个人博客数据失败!")
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise ParseDataException()

        page = response.json()
        for article in page["data"]:
            # Zhihu's upvote count stands in for the viewer count.
            self._blogs_data.append({
                "blogId": article["id"],
                "blogTitle": article["title"],
                "blogHref": article["url"],
                "blogViewers": article["voteup_count"],
                "blogCreateTime":
                timestamp_to_datetime_str(timestamp=article["created"]),
            })

        if page["paging"]["is_end"] is True:
            # Last page: publish everything collected so far.
            logger.debug(self._blogs_data)
            self.data_model.set_personal_blogs_data(data=self._blogs_data)
            logger.info("获取个人博客数据成功!")
            return

        time.sleep(0.5)
        self._parse_personal_blogs(next_params=page["paging"]["next"])
Esempio n. 7
0
    def _test_cookies(self, cookies: Optional[str] = None) -> bool:
        """
        Check whether a CSDN cookie string still belongs to a logged-in user.

        The favourites ``folderList`` endpoint is requested with the given
        cookies (or the spider's stored login cookies when none are passed).

        :param cookies: cookie header value to test
        :return: ``True`` when the response JSON has ``code == 200``,
            ``False`` otherwise
        """
        probe_url: str = f"{self._personal_main_url}/api/favorite/folderList"
        probe_headers: Dict = self.get_default_headers()

        stored_cookies = self._login_cookies
        probe_headers.update(
            Cookie=stored_cookies if cookies is None else cookies)

        probe = self.make_request(url=probe_url, headers=probe_headers)
        response_usable = (probe.status_code == 200
                           and check_is_json(probe.content.decode()) is True)
        if not response_usable:
            logger.error("当前 CSDN 账号登录状态: 已退出!")
            # The scheduled job for this spider is removed on this path.
            self._async_task.remove_async_scheduler(job_id=self._spider_name)
            return False

        if probe.json()["code"] != 200:
            logger.error("当前 CSDN 账号登录状态: 已退出!")
            return False

        logger.info(f"当前 CSDN 账号为: {self._login_username}, 状态: 已登录")
        return True
Esempio n. 8
0
    def _init_login(self) -> bool:
        """
        Prepare the session for login by solving the captcha.

        The sign-in page is visited, the captcha endpoint is queried, the
        captcha image is fetched and recognised with ``ZhihuCaptcha``, and
        the recognised text is posted back for verification.

        :return: ``True`` only when the verification response reports
            ``success``; every other outcome (including the captcha
            endpoint reporting that no captcha is shown) returns ``False``
        """
        self._session.headers.update(self._common_headers)
        # Visit the sign-in page first; its response is not inspected.
        self.make_request_with_session(session=self._session,
                                       url=self._signin_url,
                                       headers=self._common_headers)

        captcha_state = self.make_request_with_session(
            session=self._session,
            url=self._captcha_url,
            headers=self._captcha_headers)
        if not check_is_json(data=captcha_state.content.decode()):
            logger.error("登录 --> 获取验证码初始化失败")
            return False

        if not captcha_state.json()["show_captcha"]:
            logger.error(
                f"登录 --> 获取验证码接口数据发生变化!当前接口返回结果:{captcha_state.content.decode()}"
            )
            return False

        self._captcha_headers.update(origin="https://www.zhihu.com")
        # NOTE(review): the PUT below sends _common_headers, not the
        # _captcha_headers updated just above — confirm this is intended.
        captcha_image = self.make_request_with_session(
            session=self._session,
            url=self._captcha_url,
            headers=self._common_headers,
            method="PUT",
        )
        if not check_is_json(data=captcha_image.content.decode()):
            logger.error("登录 --> 获取验证码失败")
            return False

        encoded_image = captcha_image.json()["img_base64"]
        img_base64 = encoded_image.replace("\\n", "")

        # Captcha recognition.
        recogniser = ZhihuCaptcha()
        pillow_image = image_base64_to_pillow(img_str=img_base64)
        captcha_code = recogniser.predict(img=pillow_image)

        form_fields: dict = {"input_text": captcha_code}
        form_body = MultipartEncoder(fields=form_fields,
                                     boundary="----WebKitFormBoundary")
        verify_headers = {
            "content-type": form_body.content_type,
            "origin": "https://www.zhihu.com",
            "referer": "https://www.zhihu.com/signin",
            "x-requested-with": "fetch",
        }
        # Pause here so the requests are not sent too quickly.
        time.sleep(2)

        verification = self.make_request_with_session(
            session=self._session,
            url=self._captcha_url,
            data=form_body,
            headers=verify_headers,
            method="POST",
        )
        if not check_is_json(verification.content.decode()):
            logger.error("登录 --> 验证码校验请求失败")
            return False

        if verification.json().get("success"):
            return True

        logger.error(
            f"验证码校验请求错误!当前接口返回结果:{verification.content.decode()}"
        )
        return False
Esempio n. 9
0
    def _parse_personal_collect_blogs(self):
        """
        Collect the items of all of the user's Zhihu favourites lists.

        The list of favourites is fetched first; every list is then crawled
        page by page and its items are appended to
        ``self._blogs_collection_data``. Afterwards the data is handed to
        the data model and the ``Finish`` step is triggered.

        :raises ParseDataException: if the favourites overview response is
            not JSON
        """
        include_params: str = (
            "data[*].updated_time,answer_count,follower_count,"
            "creator,description,is_following,comment_count,created_time")
        self._collection_url = (
            f"{self._main_url}/api/v4/people/"
            f"{self._login_user_url_token}/collections?include={include_params}"
            f"&offset=0&limit=20")
        response = self.make_request(url=self._collection_url,
                                     headers=self.get_default_headers())
        if not check_is_json(data=response.content.decode()):
            logger.error("获取个人收藏博客数据失败!")
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise ParseDataException()

        overview = response.json()
        collections_id: List = [entry["id"] for entry in overview["data"]]
        if not collections_id:
            logger.info("个人收藏博客获取完毕!数据为空!")
            self.parse_data_with_method(
                method=BaseSpiderParseMethodType.Finish)
            return

        # A closure does the per-list crawling.
        def inner_spider(c_id: int,
                         next_url: Optional[str] = None) -> bool:
            """
            Crawl one favourites list starting at ``next_url``.

            :return: ``True`` when the last page was processed, ``False``
                if a response body is not JSON
            """
            req_params: str = (
                "data[*].created,content.comment_count,"
                "suggest_edit,is_normal,thumbnail_extra_info,"
                "thumbnail,description,content,voteup_count,"
                "created,updated,upvoted_followees,voting,"
                "review_info,is_labeled,label_info,"
                "relationship.is_authorized,voting,is_author,"
                "is_thanked,is_nothelp,is_recognized;"
                "data[*].author.badge[?(type=best_answerer)].topics")
            if next_url is not None:
                page_url = next_url
            else:
                page_url = (f"{self._main_url}/api/v4/favlists/"
                            f"{c_id}/items?include={req_params}"
                            f"&offset=0&limit=20")

            page_response = self.make_request(
                url=page_url, headers=self.get_default_headers())
            if not check_is_json(data=page_response.content.decode()):
                logger.error("解析个人收藏博客数据失败!")
                return False

            page_json = page_response.json()
            for item in page_json["data"]:
                created_at = datetime_str_change_fmt(
                    time_str=item["created"],
                    prev_fmt="%Y-%m-%dT%H:%M:%SZ")
                entry = item["content"]

                entry_id = entry["id"]
                entry_href = entry["url"]
                # For "answer" entries the title and link are taken from
                # the question they belong to.
                if entry["type"] == "answer":
                    entry_title = entry["question"]["title"]
                    entry_href = entry["question"]["url"]
                else:
                    entry_title = entry["title"]

                # Pack the record in the common blog shape.
                self._blogs_collection_data.append({
                    "blogId": entry_id,
                    "blogTitle": entry_title,
                    "blogHref": entry_href,
                    "blogViewers": entry["voteup_count"],
                    "blogCreateTime": created_at,
                })

            if page_json["paging"]["is_end"] is True:
                logger.info(f"收藏夹 ID: {c_id} 数据爬取完毕!")
                return True

            time.sleep(1.5)
            return inner_spider(
                c_id=c_id,
                next_url=page_json["paging"]["next"],
            )

        for index, collection_id in enumerate(collections_id):
            logger.info(
                f"正在爬取第{index}个收藏夹数据...当前收藏夹 ID: {collection_id}")
            # Stop crawling further lists as soon as one of them fails.
            if inner_spider(c_id=collection_id) is not True:
                break

        logger.info(
            f"个人收藏博客获取完毕!数据长度: {len(self._blogs_collection_data)}")
        self.data_model.set_personal_like_blogs_data(
            data=self._blogs_collection_data)
        self.parse_data_with_method(
            method=BaseSpiderParseMethodType.Finish)
Esempio n. 10
0
    def _parse_login_data(self):
        """
        Fetch the logged-in user's Zhihu profile and publish it.

        The session cookies are stored, ``/api/v4/me`` is requested, the
        user's ``url_token`` is saved, and a signed request to the followees
        endpoint is attempted to obtain the followee count. The resulting
        profile is handed to the data model and the personal-blogs step is
        started.

        The followee lookup is best-effort: if the ``d_c0`` cookie is
        missing or the endpoint does not answer with a ``paging.totals``
        value, the followee count stays ``0`` instead of aborting the parse.

        :raises ParseDataException: if the profile response is not JSON
        """
        include_params: str = "ad_type,available_message_types," "default_notifications_count," "follow_notifications_count," "vote_thank_notifications_count," "messages_count," "email,account_status,is_bind_phone," "visits_count,answer_count,articles_count," "gender,follower_count"
        self._personal_url: str = f"{self._main_url}/api/v4/me?include={include_params}"
        # This step is important: persist the session cookies before use.
        request_cookie: str = CookieUtils(
            cookie_list=self._session.cookies.items()).to_str()
        self.set_cookies(spider_name=f"zhihu:{self._login_username}",
                         cookies=request_cookie)
        response = self.make_request_with_session(session=self._session,
                                                  url=self._personal_url,
                                                  headers=self._common_headers)
        if not check_is_json(data=response.content.decode()):
            logger.error("查询 --> 获取个人数据失败!")
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise ParseDataException(message="获取个人数据失败!")

        json_response = response.json()
        self._login_user_url_token = json_response["url_token"]
        self.set_data(
            spider_name=f"{self._spider_name}:token",
            data=self._login_user_url_token,
        )
        self._common_headers.update(Cookie=request_cookie)

        # Zhihu's peculiar signature parameters.
        # TODO: Zhihu has encrypted its API requests again, so the API call
        # below no longer works; still to be solved.
        resp1 = self._session.get('https://www.zhihu.com',
                                  headers=self.get_default_headers())
        followee_count: int = 0
        d_c0 = resp1.cookies.get('d_c0')
        if d_c0 is None:
            # Without d_c0 the request cannot be signed; keep the default.
            logger.debug("d_c0 cookie not present, skipping followee lookup")
        else:
            headers = self.get_default_headers()
            search_api: str = f"/api/v4/members/{self._login_user_url_token}/followees"
            req_url: str = f"{self._main_url}{search_api}"
            f = "+".join((
                "3_2.0",
                search_api,
                req_url,
                d_c0,
            ))

            js_code = compile_js(js_str=zhihu_zse86_js_code)
            sign = js_code.call("b", md5_str(encrypt_str=f))

            headers.update({
                "content-type": "application/json",
                "accept": "*/*",
                "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
                "cache-control":
                "no-cache, no-store, must-revalidate, private, max-age=0",
                "pragma": "no-cache",
                "sec-fetch-dest": "empty",
                "sec-fetch-mode": "navigate",
                "sec-fetch-site": "same-origin",
                "cookie": f"d_c0={d_c0}",
                "x-api-version": "3.0.91",
                "x-app-za": "OS=Web",
                "x-requested-with": "fetch",
                "x-zse-83": "3_2.0",
                "x-zse-86": f"1.0_{sign}",
            })
            followee_response = self.make_request(url=req_url, headers=headers)
            if check_is_json(data=followee_response.text):
                followee_json = followee_response.json()
                # An error body is JSON too but has no "paging" section.
                paging = (followee_json.get("paging")
                          if isinstance(followee_json, dict) else None)
                if isinstance(paging, dict):
                    followee_count = paging.get("totals", 0)
                else:
                    logger.debug(
                        "followee response has no paging data, using 0")

        personal_data: Dict = {
            "username": json_response["name"],
            "description": json_response["headline"],
            "avatarImg": json_response["avatar_url_template"],
            "followee": followee_count,
            "follower": json_response["follower_count"],
            "likeBlogs": 0,
        }
        # Push the data.
        logger.debug(personal_data)
        self.data_model.set_personal_data(data=personal_data)
        logger.info("查询 --> 获取个人数据成功!")
        self.parse_data_with_method(
            method=BaseSpiderParseMethodType.PersonalBlogs)
Esempio n. 11
0
    def login(self):
        """
        Log in to Zhihu, or reuse the cookies this spider already holds.

        With cached cookies the ``Cookie`` header and the stored URL token
        are restored and the login-result parse step is started. Otherwise
        the captcha preparation (``_init_login``) runs, a signed and
        encrypted credential form is posted, and on success the returned
        cookies and user info are stored before the login-result parse step
        is started.

        :raises LoginException: if the preparation fails, the login response
            is not JSON, the response carries an ``error`` entry, or no
            user info is available afterwards
        """
        # Fast path: reuse the cached cookies and the stored URL token.
        if self._login_cookies is not None:
            self._common_headers.update(Cookie=self._login_cookies)
            self._login_user_url_token = self.get_data(
                spider_name=f"{self._spider_name}:token")
            self.parse_data_with_method(
                method=BaseSpiderParseMethodType.LoginResult)
            return

        if not self._init_login():
            logger.error("登录 --> 失败")
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise LoginException()

        grant_type: str = "password"
        client_id: str = "c3cef7c66a1843f8b3a9e6a1e3160e20"
        source: str = "com.zhihu.web"
        timestamp: str = str(int(time.time() * 1000))
        # The signature is an HMAC-SHA1 over the concatenated form fields.
        signature: str = hmac_encrypt_sha1(
            key=b"d1b964811afb40118a12068ff74a12f4",
            encrypt_str=f"{grant_type}{client_id}{source}{timestamp}",
        )
        form_fields: dict = {
            "client_id": client_id,
            "grant_type": grant_type,
            "source": source,
            "username": self._login_username,
            "password": self._login_password,
            "lang": "en",
            "ref_source": "other_https://www.zhihu.com/signin",
            "utm_source": "",
            "captcha": "",
            "timestamp": timestamp,
            "signature": signature,
        }
        # The url-encoded form is passed through the site's JS encryptor.
        encryptor = compile_js(js_str=zhihu_encrypt_js_code)
        encrypted_form = encryptor.call("encrypt", urlencode(form_fields))
        response = self.make_request_with_session(
            session=self._session,
            url=self._login_url,
            data=encrypted_form,
            headers=self._login_headers,
            method="POST",
        )
        if not check_is_json(data=response.content.decode()):
            logger.error("登录 --> 获取登录后的用户信息失败!登录失败!")
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise LoginException()

        json_response = response.json()
        if json_response.get("user_id"):
            logger.debug(json_response)
            self._login_cookies = json_response["cookie"]
            self._session.cookies.update(self._login_cookies)
            logger.info(f"登录 --> 登录成功!当前用户:{self._login_username}")
            user_info = {"username": self._login_username}
            self._login_user_info = user_info
            self._login_user_info.update(json_response)
        elif json_response.get("error"):
            error_entry = json_response["error"]
            error_code: int = error_entry["code"]
            error_msg: str = error_entry["message"]
            # Every error code ends the login; only the log line differs.
            if error_code == 100005:
                logger.error("登录 --> 用户名或密码错误!登录失败!")
            elif error_code == 120005:
                logger.error(f"登录 --> 登录失败!错误信息:{error_code}")
            else:
                logger.error(f"登录 --> 其他错误!错误信息:{error_msg}")
            raise LoginException()

        if self._login_user_info is None:
            logger.error("登录 --> 获取用户数据失败!")
            raise LoginException()

        self.parse_data_with_method(
            method=BaseSpiderParseMethodType.LoginResult)
Esempio n. 12
0
    def _parse_personal_blogs(self, page_no: int = 0):
        """
        Collect the logged-in user's CSDN articles, following pagination.

        The endpoint used is
        https://bizapi.csdn.net/blog-console-api/v1/article/list . Requests
        to https://bizapi.csdn.net must carry signed headers: a custom
        string-to-sign is encrypted with a private key using HMAC-SHA256
        and sent as ``x-ca-signature`` together with a random ``x-ca-nonce``.

        Each article is appended to ``self._blogs_data``. Paging continues
        while the enabled-article count exceeds one page and the current
        page came back full; when paging ends the collected list is handed
        to the data model.

        :param page_no: page to request; ``0`` requests the list without an
            explicit ``page`` parameter
        :raises ParseDataException: on a non-200 status, a non-JSON body, or
            a ``msg`` other than ``"success"``
        """
        page_size: int = 20
        api_main_url: str = "https://bizapi.csdn.net"
        api_suffix: str = f"/blog-console-api/v1/article/list?pageSize={page_size}"
        if page_no != 0:
            # The query parameters must be separated by "&".
            api_suffix = (f"/blog-console-api/v1/article/list"
                          f"?page={page_no}&pageSize={page_size}")

        # Build x-ca-nonce: every "x"/"y" in the template becomes a random
        # hex digit, every other character is copied unchanged.
        nonce_parts: List[str] = []
        for nonce_char in self._api_nonce_template:
            n = int(16 * random.random())
            if nonce_char in "xy":
                nonce_parts.append(hex(n if n == 3 else n | 8)[2:])
            else:
                nonce_parts.append(nonce_char)
        nonce: str = "".join(nonce_parts)

        # Build x-ca-signature over method, accept header, signed headers
        # and the request path including its query string.
        encrypt_str: str = f"GET\napplication/json, text/plain, */*\n\n\n\n" f"x-ca-key:{self._api_key}\nx-ca-nonce:{nonce}\n{api_suffix}"
        signature = hmac_encrypt_sha256_base64(key=self._api_encrypt_key,
                                               encrypt_str=encrypt_str)
        api_headers: Dict = self.get_default_headers()
        api_headers.update({
            "Cookie": self._login_cookies,
            "x-ca-key": self._api_key,
            "x-ca-nonce": nonce,
            "x-ca-signature": signature,
            "x-ca-signature-headers": "x-ca-key,x-ca-nonce",
            "origin": "https://mp.csdn.net",
            "referer": "https://mp.csdn.net/console/article",
            "accept": "application/json, text/plain, */*",
        })
        api_response = self.make_request(url=f"{api_main_url}{api_suffix}",
                                         headers=api_headers)

        if (api_response.status_code != 200 or
                check_is_json(data=api_response.content.decode()) is not True):
            logger.error("获取个人博客数据失败!")
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise ParseDataException()

        api_json_response = api_response.json()
        if api_json_response["msg"] != "success":
            logger.error("获取个人博客数据失败!")
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise ParseDataException()

        blogs_list_data = api_json_response["data"]["list"]
        if not blogs_list_data:
            if self._blogs_data:
                # Paging ran past the last page: publish what earlier pages
                # collected instead of dropping it.
                logger.debug(self._blogs_data)
                self.data_model.set_personal_blogs_data(data=self._blogs_data)
                logger.info("获取个人博客数据成功!")
            else:
                logger.info("获取个人博客数据成功!个人博客数据为空!")
            return

        for blogs in blogs_list_data:
            # e.g. https://blog.csdn.net/sinat_28177969/article/details/54138163
            blog_id: str = blogs["ArticleId"]
            blog_href: str = f"{self._blog_main_url}/{self._username}/article/details/{blog_id}"
            blog_create_time = datetime_str_change_fmt(
                time_str=blogs["PostTime"], prev_fmt="%Y年%m月%d日 %H:%M:%S")

            blog_data: Dict = {
                "blogId": blog_id,
                "blogTitle": blogs["Title"],
                "blogHref": blog_href,
                "blogViewers": blogs["ViewCount"],
                "blogCreateTime": blog_create_time,
            }
            self._blogs_data.append(blog_data)

        # Articles have several states; only the "enable" count is used,
        # since articles in other states cannot be accessed externally.
        # A page shorter than page_size is the last one, so stop there
        # rather than paging until an empty list comes back.
        enabled_total = api_json_response["data"]["count"]["enable"]
        has_more = (enabled_total > page_size
                    and len(blogs_list_data) >= page_size)
        if has_more:
            time.sleep(1)
            # NOTE(review): if the API's page index is 1-based, page=1 may
            # repeat the unpaged first request — confirm against the API.
            self._parse_personal_blogs(page_no=page_no + 1)
        else:
            logger.debug(self._blogs_data)
            self.data_model.set_personal_blogs_data(data=self._blogs_data)
            logger.info("获取个人博客数据成功!")
Esempio n. 13
0
    def _parse_login_data(self):
        """Fetch the logged-in user's profile summary and store it.

        Three requests are made against ``self._personal_main_url``:

        1. ``POST /api/user/show`` -- description and avatar.
        2. ``GET /api/relation/get`` -- following / fan counts.
        3. ``GET /api/favorite/folderList`` -- favorite folders; their
           ``FavoriteNum`` values are summed into ``likeBlogs`` and the ID
           of every non-empty folder is appended to
           ``self._personal_collection_ids``.

        The assembled dict is handed to ``self.data_model.set_personal_data``
        and the personal-blogs parsing step is then started through
        ``self.parse_data_with_method``.

        For requests 2 and 3, a JSON answer whose ``message`` is not the
        success marker is tolerated: the corresponding counters stay at 0.

        Raises:
            LoginException: the profile request did not return HTTP 200
                with a JSON body.
            ParseDataException: the profile answer reports failure, or the
                relation / favorites request did not return HTTP 200 with
                a JSON body.
        """

        def _is_usable(response) -> bool:
            # Usable means HTTP 200 *and* a body that parses as JSON; the
            # JSON check is skipped when the status is already wrong.
            return (response.status_code == 200
                    and check_is_json(data=response.content.decode()) is True)

        def _fail(log_message: str, exc_cls) -> None:
            # Log, mark the task as failed, then abort with ``exc_cls``.
            logger.error(log_message)
            self.update_task_status(task_id=self._task_id,
                                    data=str(PROCESS_STATUS_FAIL))
            raise exc_cls()

        # --- 1. basic profile -------------------------------------------
        profile_response = self.make_request(
            url=f"{self._personal_main_url}/api/user/show",
            headers=self._request_headers,
            json={"username": self._login_username},
            method="POST",
        )
        if not _is_usable(profile_response):
            _fail("获取个人数据异常!", LoginException)

        profile_json = profile_response.json()
        if profile_json["message"] != "成功":
            _fail("获取个人数据失败!", ParseDataException)

        profile = profile_json["data"]
        personal_data: Dict = {
            "username": self._username,
            "description": profile["selfdesc"],
            "avatarImg": profile["avatarurl"],
            "followee": 0,
            "follower": 0,
            "likeBlogs": 0,
        }

        # --- 2. following / fan counts ----------------------------------
        relation_response = self.make_request(
            url=f"{self._personal_main_url}/api/relation/get?username={self._username}",
            headers=self._request_headers,
        )
        if not _is_usable(relation_response):
            _fail("获取个人关注数据异常!", ParseDataException)

        relation_json = relation_response.json()
        if relation_json["message"] == "成功":
            relation = relation_json["data"]
            # "followee" = accounts this user follows (follow_num);
            # "follower" = accounts following this user (fans_num).
            # The two were previously stored the wrong way round.
            personal_data["followee"] = relation["follow_num"]
            personal_data["follower"] = relation["fans_num"]

        # --- 3. favorite folders ----------------------------------------
        favorites_response = self.make_request(
            url=f"{self._personal_main_url}/api/favorite/folderList",
            headers=self._request_headers,
        )
        if not _is_usable(favorites_response):
            _fail("获取个人收藏数据异常!", ParseDataException)

        favorites_json = favorites_response.json()
        if favorites_json["message"] == "成功":
            liked_total = 0
            for folder in favorites_json["data"]["result"]:
                favorite_num = folder["FavoriteNum"]
                # Only non-empty folders count and are remembered by ID.
                if favorite_num != 0:
                    liked_total += favorite_num
                    self._personal_collection_ids.append(folder["ID"])
            personal_data["likeBlogs"] = liked_total

        # Persist the result, then continue with the personal blogs step.
        logger.debug(personal_data)
        self.data_model.set_personal_data(data=personal_data)
        self.parse_data_with_method(
            method=BaseSpiderParseMethodType.PersonalBlogs)