Example #1
 def __init__(self) -> None:
     super().__init__()
     self.webSubParser = WebSubParser()
Example #2
 def __init__(self) -> None:
     """百度搜索解析器"""
     super().__init__()
     self.webSubParser = WebSubParser()
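
Examples 1 and 2 only wire a shared WebSubParser into the spider's constructor. For context, a minimal instantiation sketch, assuming the enclosing Parser(BaseSpider) class shown in Example 3 below; the import path is hypothetical:

# Minimal sketch; the import path is hypothetical, and Parser, BaseSpider
# and WebSubParser are internal classes referenced by the snippets.
from baiduspider.parser import Parser  # hypothetical import path

parser = Parser()
# The sub-parser is now ready for the embedded-block helpers
# (parse_news_block, parse_video_block, ...) that parse_web() calls.
assert parser.webSubParser is not None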
Example #3
class Parser(BaseSpider):
    def __init__(self) -> None:
        super().__init__()
        self.webSubParser = WebSubParser()

    def parse_web(self, content: str, exclude: list) -> dict:
        """解析百度网页搜索的页面源代码.

        Args:
            content (str): 已经转换为UTF-8编码的百度网页搜索HTML源码.
            exclude (list): 要屏蔽的控件.

        Returns:
            dict: 解析后的结果
        """
        soup = BeautifulSoup(content, "html.parser")
        if soup.find("div", id="content_left") is None:
            return {"results": [], "pages": 0}
        # Get the total number of search results
        num = int(
            str(soup.find("span", class_="nums_text").text).strip(
                "百度为您找到相关结果约").strip("个").replace(",", ""))
        # Define the pre-results (calculator, related searches, etc.)
        pre_results = []
        # Pre-process the news block
        if "news" not in exclude:
            news = soup.find("div",
                             class_="result-op",
                             tpl="sp_realtime_bigpic5",
                             srcid="19")
            news_detail = self.webSubParser.parse_news_block(news)
        # Pre-process the short-video block
        if "video" not in exclude:
            video = soup.find("div", class_="op-short-video-pc")
            video_results = self.webSubParser.parse_video_block(video)
        # Pre-process the calculator block
        if "calc" not in exclude:
            calc = soup.find("div", class_="op_new_cal_screen")
        # Pre-process related searches
        if "related" not in exclude:
            try:
                _related = soup.find("div",
                                     id="rs").find("table").find_all("th")
            except AttributeError:
                _related = []
            related = []
            # Append the related searches one by one
            for _ in _related:
                if _.text:
                    related.append(_.text)
        # Pre-process the Baike (encyclopedia) block
        if "baike" not in exclude:
            baike = soup.find("div", class_="c-container", tpl="bk_polysemy")
            baike = self.webSubParser.parse_baike_block(baike)
        # Pre-process the Tieba block
        if "tieba" not in exclude:
            tieba = BeautifulSoup(content, "html.parser").find("div",
                                                               srcid="10")
            tieba = self.webSubParser.parse_tieba_block(tieba)
        # Pre-process the blog block
        article_tags = BeautifulSoup(content, "html.parser").findAll("article")
        if "blog" not in exclude:
            blog = None
            for tmp in article_tags:
                if tmp["class"][-1].startswith("open-source-software-blog"):
                    blog = tmp
                    break
            blog = self.webSubParser.parse_blog_block(blog)
        # Pre-process the Gitee block
        if "gitee" not in exclude:
            gitee = None
            for tmp in article_tags:
                if tmp["class"][-1].startswith("osc-gitee"):
                    gitee = tmp
                    break
            gitee = self.webSubParser.parse_gitee_block(gitee)
        # Load Tieba
        if "tieba" not in exclude and tieba:
            pre_results.append(dict(type="tieba", result=tieba))
        # Load blogs
        if "blog" not in exclude and blog:
            pre_results.append(dict(type="blog", result=blog))
        # Load Gitee
        if "gitee" not in exclude and gitee:
            pre_results.append(dict(type="gitee", result=gitee))
        # Load the total number of search results
        if num != 0:
            pre_results.append(dict(type="total", result=num))
        # Load the calculator
        if "calc" not in exclude and calc:
            pre_results.append(
                dict(
                    type="calc",
                    process=str(
                        calc.find("p",
                                  class_="op_new_val_screen_process").find(
                                      "span").text),
                    result=str(
                        calc.find("p", class_="op_new_val_screen_result").find(
                            "span").text),
                ))
        # Load related searches
        if "related" not in exclude and related:
            pre_results.append(dict(type="related", results=related))
        # Load news
        if "news" not in exclude and news_detail:
            pre_results.append(dict(type="news", results=news_detail))
        # Load short videos
        if "video" not in exclude and video_results:
            pre_results.append(dict(type="video", results=video_results))
        # Load Baike
        if "baike" not in exclude and baike:
            pre_results.append(dict(type="baike", result=baike))
        # Pre-process the raw results
        soup = BeautifulSoup(content, "html.parser")
        results = soup.findAll("div", class_="result")
        res = []
        for result in results:
            des = None
            try:
                result["tpl"]
            except KeyError:
                # Skip results without a template id
                continue
            soup = BeautifulSoup(self._minify(str(result)), "html.parser")
            # Link
            href = soup.find("a").get("href").strip()
            # Title
            title = self._format(str(soup.find("a").text))
            # Time
            try:
                time = self._format(
                    soup.findAll("div", class_="c-abstract")[0].find(
                        "span", class_="newTimeFactor_before_abs").text)
            except (AttributeError, IndexError):
                time = None
            try:
                # Description
                des = soup.find_all("div", class_="c-abstract")[0].text
                soup = BeautifulSoup(str(result), "html.parser")
                des = self._format(des).lstrip(str(time)).strip()
            except IndexError:
                try:
                    des = des.replace("mn", "")
                except (UnboundLocalError, AttributeError):
                    des = None
            if time:
                time = time.split("-")[0].strip()
            # Because Baidu's result links are encrypted, each one would have
            # to be visited to resolve it; for performance reasons, the link
            # analysis below is skipped for now
            # if href is not None:
            #     try:
            #         # For performance, use a 1-second timeout here
            #         r = requests.get(href, timeout=1)
            #         href = r.url
            #     except:
            #         # Fetching the page failed; fall back to the original
            #         # encrypted link
            #         href = href
            #     # Parse the link
            #     if href:
            #         parse = urlparse(href)
            #         domain = parse.netloc
            #         prepath = parse.path.split('/')
            #         path = []
            #         for loc in prepath:
            #             if loc != '':
            #                 path.append(loc)
            #     else:
            #         domain = None
            #         path = None
            try:
                result["tpl"]
            except KeyError:
                # Debug output for results whose template id is missing
                print(result.prettify())
            is_not_special = (result["tpl"] not in [
                "short_video_pc",
                "sp_realtime_bigpic5",
                "bk_polysemy",
                "tieba_general",
            ] and result.find("article") is None)
            domain = None
            if is_not_special:  # Make sure this is not a special result type
                # Get the visible domain
                try:
                    domain = (result.find("div", class_="c-row").find(
                        "div", class_="c-span-last").find(
                            "div", class_="se_st_footer").find(
                                "a", class_="c-showurl").text)
                except Exception:
                    try:
                        domain = (result.find("div", class_="c-row").find(
                            "div", class_="c-span-last").find(
                                "p", class_="op-bk-polysemy-move").find(
                                    "span", class_="c-showurl").text)
                    except Exception:
                        try:
                            domain = (result.find("div",
                                                  class_="se_st_footer").find(
                                                      "a",
                                                      class_="c-showurl").text)
                        except Exception:
                            domain = None
                if domain:
                    domain = domain.replace(" ", "")
            # Add to the results
            if title and href and is_not_special:
                res.append({
                    "title": title,
                    "des": des,
                    "origin": domain,
                    "url": href,
                    "time": time,
                    "type": "result",
                })
        soup = BeautifulSoup(content, "html.parser")
        soup = BeautifulSoup(str(soup.findAll("div", id="page")[0]),
                             "html.parser")
        # Pagination
        pages_ = soup.findAll("span", class_="pc")
        pages = []
        for _ in pages_:
            pages.append(int(_.text))
        # When there is only one page of results, Baidu hides the bottom
        # pagination bar, so default to 1 here; otherwise max() below would
        # raise a ValueError on the empty list
        if not pages:
            pages = [1]
        # Assemble the final result
        result = pre_results
        result.extend(res)
        return {
            "results": result,
            # Maximum page number
            "pages": max(pages),
        }

    @handle_err
    def parse_pic(self, content: str) -> dict:
        """解析百度图片搜索的页面源代码.

        Args:
            content (str): 已经转换为UTF-8编码的百度图片搜索HTML源码

        Returns:
            dict: 解析后的结果
        """
        # Load the data embedded in the JavaScript
        # The JavaScript object literal is close enough to JSON (JavaScript
        # Object Notation) to load it directly with the json module,
        # after pre-processing to strip the function calls and stray brackets
        try:
            data = json.loads(
                content.split("flip.setData('imgData', ")
                [1].split("flip.setData(")[0].split("]);")[0].replace(
                    ");",
                    "").replace("<\\/strong>",
                                "</strong>").replace("\\'",
                                                     "'").replace('\\"', "'"),
                strict=False,
            )
        except (IndexError, AttributeError) as err:
            raise ParseError("Invalid HTML content.") from err
        except Exception as err:
            raise ParseError(str(err)) from err
        results = []
        for _ in data["data"][:-1]:
            if _:
                # Title
                title = str(_["fromPageTitle"]).encode("utf-8").decode("utf-8")
                # Strip HTML out of the title
                title = unescape(self._remove_html(title))
                # Link
                url = _["objURL"]
                # Source domain
                host = _["fromURLHost"]
                # Build the result
                result = {"title": title, "url": url, "host": host}
                results.append(result)  # Add to the results
        # Get the pagination
        bs = BeautifulSoup(content, "html.parser")
        pages_ = bs.find("div", id="page").findAll("span", class_="pc")
        pages = []
        for _ in pages_:
            pages.append(int(_.text))
        return {
            "results": results,
            # Take the largest page number
            "pages": max(pages),
        }

    def parse_zhidao(self, content: str) -> dict:
        """解析百度知道搜索的页面源代码.

        Args:
            content (str): 已经转换为UTF-8编码的百度知道搜索HTML源码

        Returns:
            dict: 解析后的结果
        """
        bs = BeautifulSoup(self._minify(content), "html.parser")
        # All search results
        list_ = bs.find("div", class_="list").findAll("dl")
        results = []
        for item in list_:
            # Skip promoted corporate answers
            if "ec-oad" in item["class"]:
                continue
            # Title
            title = item.find("dt").text.strip("\n")
            # Link
            try:
                url = item.find("dt").find("a")["href"]
            except KeyError:
                url = item.find("dt").find("a")["data-href"]
            # Description
            des = item.find("dd", class_="dd").text
            tmp = item.find("dd", class_="explain").findAll("span",
                                                            class_="mr-8")
            # Publication date
            date = item.find("dd", class_="explain").find("span",
                                                          class_="mr-7").text
            # Total number of answers
            count = int(str(tmp[-1].text).strip("\n").strip("个回答"))
            # Build the result
            result = {
                "title": title,
                "des": des,
                "date": date,
                "count": count,
                "url": url,
            }
            results.append(result)  # Add to the results
        # Get the pagination
        wrap = bs.find("div", class_="pager")
        pages_ = wrap.findAll("a")[:-2]
        if "下一页" in pages_[-1].text:
            total = pages_[-2].text
        else:
            total = pages_[-1].text
        return {
            "results": results,
            # Take the largest page number
            "pages": int(total),
        }

    def parse_video(self, content: str) -> dict:
        """解析百度视频搜索的页面源代码.

        Args:
            content (str): 已经转换为UTF-8编码的百度视频搜索HTML源码

        Returns:
            dict: 解析后的结果
        """
        bs = BeautifulSoup(content, "html.parser")
        # Locate the result div
        data = bs.findAll("li", class_="result")
        results = []
        for res in data:
            # Title
            title = res.find("a")["title"]
            # Link
            url = "https://v.baidu.com" + res.find("a")["href"]
            # Cover image link
            img = res.find("img", class_="img-normal-layer")["src"]
            # Duration
            time = res.find("span", class_="info").text
            # Build the result
            result = {"title": title, "url": url, "img": img, "time": time}
            results.append(result)  # Add to the results
        # Pagination
        wrap = bs.find("div", class_="page-wrap")
        pages_ = wrap.findAll("a", class_="filter-item")[:-1]
        try:
            pages = int(pages_[-1].text)
        except (IndexError, ValueError):
            pages = 0
        return {"results": results, "pages": pages}

    def parse_news(self, content: str) -> dict:
        """解析百度资讯搜索的页面源代码.

        Args:
            content (str): 已经转换为UTF-8编码的百度资讯搜索HTML源码

        Returns:
            dict: 解析后的结果
        """
        bs = BeautifulSoup(self._format(content), "html.parser")
        # Search result container
        data = (bs.find("div", id="content_left").findAll("div")[1].findAll(
            "div", class_="result-op"))
        results = []
        for res in data:
            # Title
            title = self._format(res.find("h3").find("a").text)
            # Link
            url = res.find("h3").find("a")["href"]
            # Description
            des = (res.find("div", class_="c-span-last").find(
                "span", class_="c-color-text").text)
            # Author
            author = (res.find("div", class_="c-span-last").find(
                "div", class_="news-source").find("span",
                                                  class_="c-gap-right").text)
            # Publication date
            date = (res.find("div", class_="c-span-last").find(
                "div", class_="news-source").find("span",
                                                  class_="c-color-gray2").text)
            # Build the result
            result = {
                "title": title,
                "author": author,
                "date": date,
                "des": des,
                "url": url,
            }
            results.append(result)  # Add to the results
        # Get the total number of pages
        pages_ = bs.find("div", id="page").findAll("a")
        # Filter out the prev/next page buttons
        if "< 上一页" in pages_[0].text:
            pages_ = pages_[1:]
        if "下一页 >" in pages_[-1].text:
            pages_ = pages_[:-1]
        return {"results": results, "pages": int(pages_[-1].text)}

    def parse_wenku(self, content: str) -> dict:  # pragma: no cover
        """解析百度文库搜索的页面源代码,目前不可用。

        Args:
            content (str): 已经转换为UTF-8编码的百度文库搜索HTML源码

        Returns:
            dict: 解析后的结果
        """
        bs = BeautifulSoup(content, "html.parser")
        data = bs.findAll("dl")
        results = []
        for res in data:
            dt = res.find("dt")
            type_ = self._format(
                dt.find("p", class_="fl").find("span",
                                               class_="ic")["title"]).upper()
            tmp = dt.find("p", class_="fl").find("a")
            title = self._format(tmp.text)
            url = tmp["href"]
            try:
                quality = float(
                    self._format(
                        res.find("p",
                                 class_="fr").findAll("span",
                                                      class_="ib")[1].text))
            except (AttributeError, IndexError, ValueError):
                quality = None
            dd = res.find("dd", class_="clearfix").find("div",
                                                        class_="summary-box")
            des = self._format(dd.find("p", class_="summary").text)
            try:
                dd_tags = res.find("dd", class_="tag-tips")
                tags = []
                for a in dd_tags.findAll("a"):
                    tags.append(self._format(a.text))
            except AttributeError:
                tags = []
            detail = dd.find("div", class_="detail").find("div",
                                                          class_="detail-info")
            date = self._format(detail.text.split("|")[0])
            pages = int(
                self._format(
                    detail.text.split("|")[1].replace("共",
                                                      "").replace("页", "")))
            downloads = int(
                self._format(detail.text.split("|")[2].strip("次下载")))
            result = {
                "title": title,
                "type": type_,
                "url": url,
                "des": des,
                "date": date,
                "pages": pages,
                "downloads": downloads,
            }
            results.append(result)
        pages_ = bs.find("div", class_="page-content").findAll("a")
        if "尾页" in pages_[-1].text:
            total = int(
                int(pages_[-1]["href"].split("&")[-1].strip("pn=")) / 10 + 1)
        else:
            total = int(
                bs.find("div", class_="page-content").find("span",
                                                           class_="cur").text)
        return {"results": results, "pages": total}

    def parse_jingyan(self, content: str) -> dict:
        """解析百度经验搜索的页面源代码.

        Args:
            content (str): 已经转换为UTF-8编码的百度经验搜索HTML源码

        Returns:
            dict: 解析后的结果
        """
        # Minify the code
        code = self._minify(content)
        bs = BeautifulSoup(code, "html.parser")
        # Load the search results
        data = bs.find("div", class_="search-list").findAll("dl")
        results = []
        for res in data:
            # Title
            title = self._format(res.find("dt").find("a").text)
            # Link
            url = "https://jingyan.baidu.com/" + res.find("dt").find(
                "a")["href"]
            # Description
            des = self._format(
                res.find("dd").find("div", class_="summary").find(
                    "span", class_="abstract").text)
            # Get the publication date and category from `<span class="cate"/>`
            tmp = self._format(
                res.find("dd").find("div", class_="summary").find(
                    "span", class_="cate").text).split("-")
            # Publication date
            date = self._format(tmp[1])
            # Category
            category = self._format(tmp[-1]).strip("分类:")
            # Number of supporting votes
            votes = int(
                self._format(
                    res.find("dt").find("span",
                                        class_="succ-times").text).strip("得票"))
            # Build the result
            result = {
                "title": title,
                "url": url,
                "des": des,
                "date": date,
                "category": category,
                "votes": votes,
            }
            results.append(result)  # Add the result to the collection
        # Get the pagination
        pages_ = bs.find("div", id="pg").findAll("a")[-1]
        # Neither on the last page nor past it
        if "尾页" in pages_.text:
            # Get the last page number and add one
            total = int(
                int(pages_["href"].split("&")[-1].strip("pn=")) / 10) + 1
        # On the last page or past it
        else:
            # Re-fetch the pagination
            pages_ = bs.find("div", id="pg").findAll("a")[1]
            # Get the last page number and add one
            total = int(
                int(pages_["href"].split("&")[-1].strip("pn=")) / 10) + 1
        return {"results": results, "pages": total}

    def parse_baike(self, content: str) -> dict:
        """解析百度百科搜索的页面源代码.

        Args:
            content (str): 已经转换为UTF-8编码的百度百科搜索HTML源码

        Returns:
            dict: 解析后的结果
        """
        code = self._minify(content)
        # Create the BeautifulSoup object
        soup = (BeautifulSoup(code, "html.parser").find(
            "div", class_="body-wrapper").find("div", class_="searchResult"))
        # Get the total number of Baike entries
        total = int(
            soup.find(
                "div",
                class_="result-count").text.strip("百度百科为您找到相关词条约").strip("个"))
        # Get all results
        container = soup.findAll("dd")
        results = []
        for res in container:
            # Link
            url = "https://baike.baidu.com" + self._format(
                res.find("a", class_="result-title")["href"])
            # Title
            title = self._format(res.find("a", class_="result-title").text)
            # Description
            des = self._format(res.find("p", class_="result-summary").text)
            # Last-updated date
            date = self._format(res.find("span", class_="result-date").text)
            # Build the result
            results.append({
                "title": title,
                "des": des,
                "date": date,
                "url": url
            })
        return {"results": results, "pages": total}
Example #4
class Parser(BaseSpider):
    def __init__(self) -> None:
        """百度搜索解析器"""
        super().__init__()
        self.webSubParser = WebSubParser()

    def parse_web(self, content: str, exclude: list) -> dict:
        """解析百度网页搜索的页面源代码.

        Args:
            content (str): 已经转换为UTF-8编码的百度网页搜索HTML源码.
            exclude (list): 要屏蔽的控件.

        Returns:
            dict: 解析后的结果
        """
        soup = BeautifulSoup(content, "html.parser")
        if soup.find("div", id="content_left") is None:
            return {"results": [], "pages": 0, "total": 0}
        # Get the total number of search results
        tmp1 = soup.findAll("div", class_="result-molecule")
        idx_ = 0
        ele = None
        while ele is None and idx_ < len(tmp1):
            # Scan each result-molecule div for the hit-count span
            for t in tmp1[idx_].findAll("span"):
                if "百度为您找到相关结果" in t.text:
                    ele = t
                    break
            idx_ += 1
        num = int(
            str(ele.text).strip("百度为您找到相关结果").strip("约").strip("个").replace(
                ",", ""))
        # Define the pre-results (calculator, related searches, etc.)
        pre_results = []
        # Pre-process the news block
        if "news" not in exclude:
            news = soup.find("div", class_="result-op", srcid="19")
            news_detail = self.webSubParser.parse_news_block(news)
        # Pre-process the short-video block
        if "video" not in exclude:
            video = soup.find("div", class_="op-short-video-pc")
            video_results = self.webSubParser.parse_video_block(video)
        # Pre-process the calculator block
        if "calc" not in exclude:
            calc = soup.find("div", class_="op_new_cal_screen")
        # Pre-process related searches
        if "related" not in exclude:
            try:
                _related = soup.findAll("table")[-1].findAll("td")
            except AttributeError:
                _related = []
            related = []
            # Append the related searches one by one
            for _ in _related:
                if _.text:
                    related.append(self._format(_.text))
        # Pre-process the Baike (encyclopedia) block
        if "baike" not in exclude:
            baike = soup.find("div", class_="c-container", tpl="bk_polysemy")
            baike = self.webSubParser.parse_baike_block(baike)
        # Pre-process the Tieba block
        if "tieba" not in exclude:
            tieba = BeautifulSoup(content, "html.parser").find("div",
                                                               srcid="10")
            tieba = self.webSubParser.parse_tieba_block(tieba)
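        # Pre-process the music block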
        if "music" not in exclude:
            music = BeautifulSoup(content,
                                  "html.parser").find("div",
                                                      class_="result-op",
                                                      tpl="yl_music_song")
            music = self.webSubParser.parse_music_block(music)
        # Pre-process the blog block
        article_tags = BeautifulSoup(content, "html.parser").findAll("article")
        if "blog" not in exclude:
            blog = None
            for tmp in article_tags:
                if tmp["class"][-1].startswith("open-source-software-blog"):
                    blog = tmp
                    break
            blog = self.webSubParser.parse_blog_block(blog)
        # Pre-process the Gitee block
        if "gitee" not in exclude:
            gitee = None
            for tmp in article_tags:
                if tmp["class"][-1].startswith("osc-gitee"):
                    gitee = tmp
                    break
            gitee = self.webSubParser.parse_gitee_block(gitee)
        # Load Tieba
        if "tieba" not in exclude and tieba:
            pre_results.append(dict(type="tieba", result=tieba))
        # Load blogs
        if "blog" not in exclude and blog:
            pre_results.append(dict(type="blog", result=blog))
        # Load Gitee
        if "gitee" not in exclude and gitee:
            pre_results.append(dict(type="gitee", result=gitee))
        # Load the total number of search results
        # (moved into the root dict instead)
        # if num != 0:
        #     pre_results.append(dict(type="total", result=num))
        # Load the calculator
        if "calc" not in exclude and calc:
            pre_results.append(
                dict(
                    type="calc",
                    process=str(
                        calc.find("p",
                                  class_="op_new_val_screen_process").find(
                                      "span").text),
                    result=str(
                        calc.find("p", class_="op_new_val_screen_result").find(
                            "span").text),
                ))
        # Load related searches
        if "related" not in exclude and related:
            pre_results.append(dict(type="related", results=related))
        # Load news
        if "news" not in exclude and news_detail:
            pre_results.append(dict(type="news", results=news_detail))
        # Load short videos
        if "video" not in exclude and video_results:
            pre_results.append(dict(type="video", results=video_results))
        # Load Baike
        if "baike" not in exclude and baike:
            pre_results.append(dict(type="baike", result=baike))
        # Load music
        if "music" not in exclude and music:
            pre_results.append(dict(type="music", result=music))
        # Pre-process the raw results
        soup = BeautifulSoup(content, "html.parser")
        results = soup.findAll("div", class_="result")
        res = []
        for result in results:
            des = None
            try:
                result["tpl"]
            except KeyError:
                # Skip results without a template id
                continue
            soup = BeautifulSoup(self._minify(str(result)), "html.parser")
            # Link
            href = soup.find("a").get("href").strip()
            # Title
            title = self._format(str(soup.find("a").text))
            # Time
            try:
                time = self._format(
                    soup.findAll("div", class_="c-abstract")[0].find(
                        "span", class_="newTimeFactor_before_abs").text)
            except (AttributeError, IndexError):
                time = None
            try:
                # Description
                des = soup.find_all("div", class_="c-abstract")[0].text
                soup = BeautifulSoup(str(result), "html.parser")
                des = self._format(des).lstrip(str(time)).strip()
            except IndexError:
                try:
                    des = des.replace("mn", "")
                except (UnboundLocalError, AttributeError):
                    des = None
            if time:
                time = time.split("-")[0].strip()
            # Because Baidu's result links are encrypted, each one would have
            # to be visited to resolve it; for performance reasons, the link
            # analysis below is skipped for now
            # if href is not None:
            #     try:
            #         # For performance, use a 1-second timeout here
            #         r = requests.get(href, timeout=1)
            #         href = r.url
            #     except:
            #         # Fetching the page failed; fall back to the original
            #         # encrypted link
            #         href = href
            #     # Parse the link
            #     if href:
            #         parse = urlparse(href)
            #         domain = parse.netloc
            #         prepath = parse.path.split('/')
            #         path = []
            #         for loc in prepath:
            #             if loc != '':
            #                 path.append(loc)
            #     else:
            #         domain = None
            #         path = None
            is_not_special = (result["tpl"] not in [
                "short_video_pc",
                "sp_realtime_bigpic5",
                "bk_polysemy",
                "tieba_general",
                "yl_music_song",
            ] and result.find("article") is None)
            domain = None
            if is_not_special:  # Make sure this is not a special result type
                # Get the visible domain
                try:
                    domain = (result.find("div", class_="c-row").find(
                        "div", class_="c-span-last").find(
                            "div", class_="se_st_footer").find(
                                "a", class_="c-showurl").text)
                except Exception:
                    try:
                        domain = (result.find("div", class_="c-row").find(
                            "div", class_="c-span-last").find(
                                "p", class_="op-bk-polysemy-move").find(
                                    "span", class_="c-showurl").text)
                    except Exception:
                        try:
                            domain = (result.find("div",
                                                  class_="se_st_footer").find(
                                                      "a",
                                                      class_="c-showurl").text)
                        except Exception:
                            domain = None
                if domain:
                    domain = domain.replace(" ", "")
            # Add to the results
            if title and href and is_not_special:
                res.append({
                    "title": title,
                    "des": des,
                    "origin": domain,
                    "url": href,
                    "time": time,
                    "type": "result",
                })
        soup = BeautifulSoup(content, "html.parser")
        soup = BeautifulSoup(str(soup.findAll("div", id="page")[0]),
                             "html.parser")
        # Pagination
        # pages_ = soup.findAll("span", class_="pc")
        # pages = []
        # for _ in pages_:
        #     pages.append(int(_.text))
        # # When there is only one page of results, Baidu hides the bottom
        # # pagination bar, so default to 1; otherwise max() would raise
        # # a ValueError on the empty list
        # if not pages:
        #     pages = [1]
        # Assemble the final result
        result = pre_results
        result.extend(res)
        return {
            "results": result,
            # Maximum page number
            # "pages": max(pages),
            "total": num,
        }

    @handle_err
    def parse_pic(self, content: str) -> dict:
        """解析百度图片搜索的页面源代码.

        Args:
            content (str): 已经转换为UTF-8编码的百度图片搜索HTML源码

        Returns:
            dict: 解析后的结果
        """
        # Load the data embedded in the JavaScript
        # The JavaScript object literal is close enough to JSON (JavaScript
        # Object Notation) to load it directly with the json module,
        # after pre-processing to strip the function calls and stray brackets
        try:
            data = json.loads(
                content.split("flip.setData('imgData', ")
                [1].split("flip.setData(")[0].split("]);")[0].replace(
                    ");",
                    "").replace("<\\/strong>",
                                "</strong>").replace("\\'",
                                                     "'").replace('\\"', "'"),
                strict=False,
            )
        except (IndexError, AttributeError) as err:
            raise ParseError("Invalid HTML content.") from err
        except Exception as err:
            raise ParseError(str(err)) from err
        soup = BeautifulSoup(content, "html.parser")
        total = int(
            soup.find(
                "div",
                id="resultInfo").text.split("约")[-1].split("张")[0].replace(
                    ",", ""))
        results = []
        for _ in data["data"][:-1]:
            if _:
                # Title
                title = str(_["fromPageTitle"]).encode("utf-8").decode("utf-8")
                # Strip HTML out of the title
                title = unescape(self._remove_html(title))
                # Link
                url = _["objURL"]
                # Source domain
                host = _["fromURLHost"]
                # Build the result
                result = {"title": title, "url": url, "host": host}
                results.append(result)  # Add to the results
        # Get the pagination
        # bs = BeautifulSoup(content, "html.parser")
        # pages_ = bs.find("div", id="page").findAll("span", class_="pc")
        # pages = []
        # for _ in pages_:
        #     pages.append(int(_.text))
        return {
            "results": results,
            # Take the largest page number
            # "pages": max(pages),
            "total": total,
        }

    def parse_zhidao(self, content: str) -> dict:
        """解析百度知道搜索的页面源代码.

        Args:
            content (str): 已经转换为UTF-8编码的百度知道搜索HTML源码

        Returns:
            dict: 解析后的结果
        """
        bs = BeautifulSoup(self._minify(content), "html.parser")
        # Total number of search results
        total = int(
            bs.find("div", class_="wgt-picker").find(
                "span", class_="f-lighter").text.split("共", 1)[-1].split(
                    "条结果", 1)[0].replace(",", ""))
        # All search results
        list_ = bs.find("div", class_="list").findAll("dl")
        results = []
        for item in list_:
            # Skip promoted corporate answers
            if "ec-oad" in item["class"]:
                continue
            # Title
            title = item.find("dt").text.strip("\n")
            # Link
            try:
                url = item.find("dt").find("a")["href"]
            except KeyError:
                url = item.find("dt").find("a")["data-href"]
            if item.find("dd", class_="video-content") is not None:
                # Question
                __ = item.find("dd", class_="summary")
                question = __.text.strip("问:") if __ is not None else None
                item = item.find("div", class_="right")
                tmp = item.findAll("div", class_="video-text")
                # # Description
                # des = self._format(tmp[2].text)
                answer = None
                # Answerer
                answerer = tmp[0].text.strip("\n").strip("回答:\u2002")
                # Publication date
                date = self._format(tmp[1].text.strip("时间:"))
                # Total number of answers
                count = None
                # Number of upvotes
                try:
                    agree = int(tmp[2].text.strip("获赞:\u2002").strip("次"))
                except ValueError:
                    agree = 0
                    answer = tmp[2].text.strip()
                type_ = "video"
            else:
                # Answer
                __ = item.find("dd", class_="answer")
                answer = __.text.strip("答:") if __ is not None else None
                # Question
                __ = item.find("dd", class_="summary")
                question = __.text.strip("问:") if __ is not None else None
                tmp = item.find("dd", class_="explain").findAll("span",
                                                                class_="mr-8")
                # Publication date
                date = (item.find("dd",
                                  class_="explain").find("span",
                                                         class_="mr-7").text)
                # Total number of answers
                try:
                    count = int(str(tmp[-1].text).strip("\n").strip("个回答"))
                except (IndexError, ValueError):
                    count = None
                # Answerer
                answerer = tmp[-2].text.strip("\n").strip("回答者:\xa0")
                # Number of upvotes
                __ = item.find("dd", class_="explain").find("span",
                                                            class_="ml-10")
                agree = int(__.text.strip()) if __ is not None else 0
                type_ = "normal"
            # Build the result
            result = {
                "title": title,
                "question": question,
                "answer": answer,
                "date": date,
                "count": count,
                "url": url,
                "agree": agree,
                "answerer": answerer,
                # "type": type_
            }
            results.append(result)  # Add to the results
        # Get the pagination
        # wrap = bs.find("div", class_="pager")
        # pages_ = wrap.findAll("a")[:-2]
        # if "下一页" in pages_[-1].text:
        #     pages = pages_[-2].text
        # else:
        #     pages = pages_[-1].text
        return {
            "results": results,
            # Take the largest page number
            # "pages": int(pages),
            "total": total,
        }

    def parse_video(self, content: str) -> dict:
        """解析百度视频搜索的页面源代码.

        Args:
            content (str): 已经转换为UTF-8编码的百度视频搜索HTML源码

        Returns:
            dict: 解析后的结果
        """
        bs = BeautifulSoup(content, "html.parser")
        # Locate the result div
        data = bs.findAll("div", class_="video_short")
        if len(data) == 0:
            return {"results": None}
        results = []
        for res in data:
            __ = res.find("div", class_="video_small_intro")
            _ = __.find("a")
            # Title
            title = self._format(_.text)
            # Link
            url = _["href"]
            # Cover image link
            img = res.find("img",
                           class_="border-radius")["src"].rsplit("?", 1)[0]
            # Duration
            length_ = res.find("span", class_="video_play_timer").text
            _ = [int(i) for i in length_.split(":")]
            if len(_) < 3:
                length_ = time(minute=_[0], second=_[1])
            else:
                length_ = time(_[0], _[1], _[2])
            # Description
            try:
                des = __.find("div", class_="c-color-text").text
            except AttributeError:
                des = None
            # Source
            try:
                origin = self._format(
                    __.find("span", class_="wetSource").text).strip("来源:")
            except AttributeError:
                origin = None
            # Publication time
            pub_time: str = __.findAll(
                "span", class_="c-font-normal")[-1].text.strip("发布时间:")
            try:
                __ = [int(i) for i in pub_time.split("-")]
            except ValueError:
                __ = self._convert_time(pub_time, True)
            pub_time = datetime(__[0], __[1], __[2])
            # Build the result
            result = {
                "title": title,
                "url": url,
                "img": img,
                "length": length_,
                "des": des,
                "origin": origin,
                "pub_time": pub_time,
            }
            results.append(result)  # Add to the results
        return {"results": results}

    def parse_news(self, content: str) -> dict:
        """解析百度资讯搜索的页面源代码.

        Args:
            content (str): 已经转换为UTF-8编码的百度资讯搜索HTML源码

        Returns:
            dict: 解析后的结果
        """
        bs = BeautifulSoup(self._format(content), "html.parser")
        # Total number of search results
        total = int(
            bs.find("div", id="wrapper_wrapper").find(
                "span",
                class_="nums").text.split("资讯",
                                          1)[-1].split("篇",
                                                       1)[0].replace(",", ""))
        # Search result container
        data = (bs.find("div", id="content_left").findAll("div")[1].findAll(
            "div", class_="result-op"))
        results = []
        for res in data:
            # Title
            title = self._format(res.find("h3").find("a").text)
            # Link
            url = res.find("h3").find("a")["href"]
            # Description
            des = (res.find("div", class_="c-span-last").find(
                "span", class_="c-color-text").text)
            _ = res.find("div", class_="c-span-last")
            # Author
            author = _.find("span", class_="c-gap-right").text
            # Publication date
            try:
                date = _.find("span", class_="c-color-gray2").text
            except AttributeError:
                date = None
            # Cover image
            try:
                cover = res.find(
                    "div", class_="c-img-radius-large").find("img")["src"]
            except (AttributeError, TypeError):
                cover = None
            # Build the result
            result = {
                "title": title,
                "author": author,
                "date": date,
                "des": des,
                "url": url,
                "cover": cover,
            }
            results.append(result)  # Add to the results
        # Get the total number of pages
        # pages_ = bs.find("div", id="page").findAll("a")
        # # Filter out the prev/next page buttons
        # if "< 上一页" in pages_[0].text:
        #     pages_ = pages_[1:]
        # if "下一页 >" in pages_[-1].text:
        #     pages_ = pages_[:-1]
        return {"results": results, "total": total}

    def __get_wenku_doc_type(self, t: int) -> str:
        if t in [1, 4, 10, 11, 9]:
            return "DOC"
        if t in [3, 6, 14, 15]:
            return "PPT"
        if t in [2, 5]:
            return "XLS"
        if t in [12]:
            return "VSD"
        if t in [7]:
            return "PDF"
        if t in [8]:
            return "TXT"
        if t in [16]:
            return "EPUB"
        if t in [19]:
            return "CAD"
        if t in [13]:
            return "RTF"
        if t in [20]:
            return "XMIND"
        return "UNKNOWN"

    def parse_wenku(self, content: str) -> dict:
        """解析百度文库搜索的页面源代码。

        Args:
            content (str): 已经转换为UTF-8编码的百度文库搜索API接口JSON数据

        Returns:
            dict: 解析后的结果
        """
        results = []
        pages = 0
        _ = json.loads(content)
        if _["status"]["msg"] != "success":
            raise RuntimeError
        for res in _["data"]["normalResult"]:
            info = res["docInfo"]
            author = res["authorInfo"]
            title = (info["title"].replace("<em>",
                                           "").replace("</em>", "").replace(
                                               " - 百度文库", ""))
            des = info["content"].replace("<em>", "").replace("</em>", "")
            pub_date = strftime("%Y-%m-%d", localtime(int(info["createTime"])))
            page_num = info["pageNum"]
            score = info["qualityScore"]
            downloads = info["downloadCount"]
            url = info["url"]
            is_vip = info["flag"] == 28
            u_name = author["uname"]
            u_url = f"https://wenku.baidu.com/u/{u_name}?uid={author['uid']}"
            results.append({
                "title": title,
                "des": des,
                "pub_date": pub_date,
                "pages": page_num,
                "quality": score,
                "downloads": downloads,
                "url": url,
                "is_vip": is_vip,
                "uploader": {
                    "name": u_name,
                    "url": u_url
                },
            })
        pages = math.ceil(
            (_["data"]["total"] - len(_["data"]["normalResult"])) / 10 + 1)
        return {"results": results, "pages": pages}

    def parse_jingyan(self, content: str) -> dict:
        """解析百度经验搜索的页面源代码.

        Args:
            content (str): 已经转换为UTF-8编码的百度经验搜索HTML源码

        Returns:
            dict: 解析后的结果
        """
        # Minify the code
        code = self._minify(content)
        bs = BeautifulSoup(code, "html.parser")
        total = int(
            bs.find("div", class_="result-num").text.split("约", 1)[-1].split(
                "个", 1)[0].replace(",", ""))
        # Load the search results
        data = bs.find("div", class_="search-list").findAll("dl")
        results = []
        for res in data:
            # Title
            title = self._format(res.find("dt").find("a").text)
            # Link
            url = "https://jingyan.baidu.com/" + res.find("dt").find(
                "a")["href"]
            # Description
            des = self._format(
                res.find("dd").find("div", class_="summary").find(
                    "span", class_="abstract").text)
            # Get the publication date and category from `<span class="cate"/>`
            _ = res.find("dd").find("div",
                                    class_="summary").find("span",
                                                           class_="cate")
            tmp = self._format(_.text).split("-")
            # Publication date
            pub_date = self._format(tmp[1]).replace("/", "-")
            # Category
            category = self._format(tmp[-1]).strip("分类:").split(">")
            # Publisher
            publisher = {
                "name": self._format(_.find("a").text),
                "url": "https://jingyan.baidu.com" + _.find("a")["href"],
            }
            # Number of supporting votes
            votes = int(
                self._format(
                    res.find("dt").find("span",
                                        class_="succ-times").text).strip("得票"))
            # Whether the article is marked as original content
            try:
                res.find("span", class_="i-original").text
                original = True
            except AttributeError:
                original = False
            # Whether the article is marked as outstanding
            try:
                res.find("span", class_="i-good-exp").text
                outstanding = True
            except AttributeError:
                outstanding = False
            # Build the result
            result = {
                "title": title,
                "url": url,
                "des": des,
                "pub_date": pub_date,
                "category": category,
                "votes": votes,
                "publisher": publisher,
                "is_original": original,
                "is_outstanding": outstanding,
            }
            results.append(result)  # Add the result to the collection
        # Get the pagination
        # pages_ = bs.find("div", class_="pager-wrap").findAll("a", class_="pg-btn")
        # if not pages_:
        #     return {"results": results, "pages": 1}
        # if "下一页" in pages_[-1].text:
        #     pages_ = pages_[:-1]
        # pages = int(self._format(pages_[-1].text))
        return {"results": results, "total": total}

    def parse_baike(self, content: str) -> dict:
        """解析百度百科搜索的页面源代码.

        Args:
            content (str): 已经转换为UTF-8编码的百度百科搜索HTML源码

        Returns:
            dict: 解析后的结果
        """
        code = self._minify(content)
        # Create the BeautifulSoup object
        soup = (BeautifulSoup(code, "html.parser").find(
            "div", class_="body-wrapper").find("div", class_="searchResult"))
        # Get the total number of Baike entries
        total = int(
            soup.find(
                "div",
                class_="result-count").text.strip("百度百科为您找到相关词条约").strip("个"))
        # Get all results
        container = soup.findAll("dd")
        results = []
        for res in container:
            # Link
            url = "https://baike.baidu.com" + self._format(
                res.find("a", class_="result-title")["href"])
            # Title
            title = self._format(res.find("a", class_="result-title").text)
            # Description
            des = self._format(res.find("p", class_="result-summary").text)
            # Last-updated date
            upd_date = self._format(
                res.find("span", class_="result-date").text)
            # Build the result
            results.append({
                "title": title,
                "des": des,
                "upd_date": upd_date,
                "url": url
            })
        return {"results": results, "total": total}