Example 1
    def parse_pic(self, content: str) -> dict:
        """解析百度图片搜索的页面源代码.

        Args:
            content (str): 已经转换为UTF-8编码的百度图片搜索HTML源码

        Returns:
            dict: 解析后的结果
        """
        # 从JavaScript中加载数据
        # 因为JavaScript很像JSON(JavaScript Object Notation),所以直接用json加载就行了
        # 还有要预处理一下,把函数和无用的括号过滤掉
        try:
            data = json.loads(
                content.split("flip.setData('imgData', ")[1]
                .split("flip.setData(")[0]
                .split("]);")[0]
                .replace(");", "")
                .replace("<\\/strong>", "</strong>")
                .replace("\\'", "'")
                .replace('\\"', "'"),
                strict=False,
            )
        except (IndexError, AttributeError):
            # The expected JavaScript data block is missing from the page
            raise ParseError("Invalid HTML content.")
        except Exception as err:
            # Any other failure while decoding the embedded JSON
            raise ParseError(str(err))
        results = []
        for _ in data["data"][:-1]:
            if _:
                # Title
                title = str(_["fromPageTitle"]).encode("utf-8").decode("utf-8")
                # Strip HTML tags and entities from the title
                title = unescape(self._remove_html(title))
                # Image URL
                url = _["objURL"]
                # Source host
                host = _["fromURLHost"]
                # Build the result entry
                result = {"title": title, "url": url, "host": host}
                results.append(result)  # collect the entry
        # Extract pagination
        bs = BeautifulSoup(content, "html.parser")
        pages_ = bs.find("div", id="page").findAll("span", class_="pc")
        pages = []
        for _ in pages_:
            pages.append(int(_.text))
        return {
            "results": results,
            # Highest page number
            "pages": max(pages),
        }
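
Both image parsers in this listing rely on the same trick: Baidu embeds the image data in a flip.setData('imgData', {...}); call, and the JavaScript object literal is close enough to JSON to be handed to json.loads once the call wrapper is trimmed away. Below is a minimal standalone sketch of that extraction; the page string is a made-up stand-in for the relevant part of a real results page, not actual Baidu output.

import json

# Made-up stand-in for the part of the page that carries the image data.
page = "...flip.setData('imgData', {\"data\": [{\"objURL\": \"http://example.com/a.jpg\"}]});..."

# Keep what follows the setData call, drop the closing ');', and parse as JSON.
raw = page.split("flip.setData('imgData', ")[1].split(");")[0]
img_data = json.loads(raw)
print(img_data["data"][0]["objURL"])  # -> http://example.com/a.jpg
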
Example 2
    def parse_pic(self, content: str) -> dict:
        """解析百度图片搜索的页面源代码

        Args:
            content (str): 已经转换为UTF-8编码的百度图片搜索HTML源码

        Returns:
            dict: 解析后的结果
        """
        # 从JavaScript中加载数据
        # 因为JavaScript很像JSON(JavaScript Object Notation),所以直接用json加载就行了
        # 还有要预处理一下,把函数和无用的括号过滤掉
        try:
            # Pre-process the embedded JavaScript into a JSON-compatible string
            raw = (content.split("flip.setData('imgData', ")[1]
                   .split('flip.setData(')[0]
                   .split(']);')[0]
                   .replace(');', '')
                   .replace('<\\/strong>', '</strong>')
                   .replace("\\'", "'")
                   .replace('\\"', "'"))
        except (IndexError, AttributeError):
            raise ParseError('Invalid HTML content.')
        try:
            data = json.loads(raw)
        except Exception as err:
            # Dump the pre-processed string to help debug unexpected JSON errors
            print(raw)
            raise ParseError(str(err))
        results = []
        for _ in data['data'][:-1]:
            if _:
                # Title
                title = str(_['fromPageTitle']).encode('utf-8').decode('utf-8')
                # Strip HTML tags and entities from the title
                title = unescape(self._remove_html(title))
                # Image URL
                url = _['objURL']
                # Source host
                host = _['fromURLHost']
                # Build the result entry
                result = {
                    'title': title,
                    'url': url,
                    'host': host
                }
                results.append(result)  # collect the entry
        # Extract pagination
        bs = BeautifulSoup(content, 'html.parser')
        pages_ = bs.find('div', id='page').findAll('span', class_='pc')
        pages = []
        for _ in pages_:
            pages.append(int(_.text))
        return {
            'results': results,
            # Highest page number
            'pages': max(pages)
        }
Example 3
    def parse_web(self, content: str) -> dict:
        """解析百度网页搜索的页面源代码

        Args:
            content (str): 已经转换为UTF-8编码的百度网页搜索HTML源码

        Returns:
            dict: 解析后的结果
        """
        soup = BeautifulSoup(content, 'html.parser')
        if soup.find('div', id='content_left') is None:
            raise ParseError('Invalid HTML content.')
        # Try to get the total number of search results
        try:
            num = int(
                str(soup.find('span', class_='nums_text').text).strip(
                    '百度为您找到相关结果约').strip('个').replace(',', ''))
        except:
            num = 0
        # Look for the calculator widget
        calc = soup.find('div', class_='op_new_cal_screen')
        # Define pre-results (calculator, related searches, etc.)
        pre_results = []
        # Pre-process related searches
        try:
            _related = soup.find('div', id='rs').find('table').find_all('th')
        except:
            _related = []
        related = []
        # Pre-process news
        news = soup.find('div',
                         class_='result-op',
                         tpl='sp_realtime_bigpic5',
                         srcid='19')
        # Check whether a news block is present
        try:
            news_title = self._format(
                news.find('h3', class_='t').find('a').text)
        except:
            news_title = None
            news_detail = []
        else:
            news_rows = news.findAll('div', class_='c-row')
            news_detail = []
            prev_row = None
            for row in news_rows:
                try:
                    row_title = self._format(row.find('a').text)
                except AttributeError:
                    prev_row['des'] = self._format(row.text)
                    continue
                row_time = self._format(
                    row.find('span', class_='c-color-gray2').text)
                row_author = self._format(
                    row.find('span', class_='c-color-gray').text)
                row_url = self._format(row.find('a')['href'])
                news_detail.append({
                    'title': row_title,
                    'time': row_time,
                    'author': row_author,
                    'url': row_url,
                    'des': None
                })
                prev_row = news_detail[-1]
        # Pre-process short videos
        video = soup.find('div', class_='op-short-video-pc')
        if video:
            video_rows = video.findAll('div', class_='c-row')
            video_results = []
            for row in video_rows:
                row_res = []
                videos = row.findAll('div', class_='c-span6')
                for v in videos:
                    v_link = v.find('a')
                    v_title = v_link['title']
                    v_url = self._format(v_link['href'])
                    v_img = v_link.find('img')['src']
                    v_len = self._format(
                        v.find('div',
                               class_='op-short-video-pc-duration-wrap').text)
                    v_from = self._format(
                        v.find('div', class_='op-short-video-pc-clamp1').text)
                    row_res.append({
                        'title': v_title,
                        'url': v_url,
                        'cover': v_img,
                        'length': v_len,
                        'origin': v_from
                    })
                video_results += row_res
        else:
            video_results = []
        # Append each related search term
        for _ in _related:
            if _.text:
                related.append(_.text)
        # Pre-process Baike (encyclopedia)
        baike = soup.find('div', class_='c-container', tpl='bk_polysemy')
        if baike:
            b_title = self._format(baike.find('h3').text)
            b_url = baike.find('a')['href']
            b_des = self._format(
                baike.find('div', class_='c-span-last').find('p').text)
            try:
                b_cover = baike.find('div',
                                     class_='c-span6').find('img')['src']
                b_cover_type = 'image'
            except (TypeError, AttributeError):
                try:
                    b_cover = baike.find(
                        'video', class_='op-bk-polysemy-video')['data-src']
                    b_cover_type = 'video'
                except TypeError:
                    b_cover = None
                    b_cover_type = None
            baike = {
                'title': b_title,
                'url': b_url,
                'des': b_des,
                'cover': b_cover,
                'cover-type': b_cover_type
            }
        # Load the total result count
        if num != 0:
            pre_results.append(dict(type='total', result=num))
        # Load the calculator result
        if calc:
            pre_results.append(
                dict(type='calc',
                     process=str(
                         calc.find('p',
                                   class_='op_new_val_screen_process').find(
                                       'span').text),
                     result=str(
                         calc.find('p',
                                   class_='op_new_val_screen_result').find(
                                       'span').text)))
        # Load related searches
        if related:
            pre_results.append(dict(type='related', results=related))
        # Load news
        if news_detail:
            pre_results.append(dict(type='news', results=news_detail))
        # Load short videos
        if video_results:
            pre_results.append(dict(type='video', results=video_results))
        # Load Baike
        if baike:
            pre_results.append(dict(type='baike', result=baike))
        # Pre-process the page source for the organic results
        try:
            soup = BeautifulSoup(content, 'html.parser')
        # Error handling
        except IndexError:
            raise ParseError(
                'Failed to generate BeautifulSoup object for the given source code content.'
            )
        results = soup.findAll('div', class_='result')
        res = []
        for result in results:
            soup = BeautifulSoup(self._minify(str(result)), 'html.parser')
            # Link
            href = soup.find('a').get('href').strip()
            # Title
            title = self._format(str(soup.find('a').text))
            # Time
            try:
                time = self._format(
                    soup.findAll('div', class_='c-abstract')[0].find(
                        'span', class_='newTimeFactor_before_abs').text)
            except (AttributeError, IndexError):
                time = None
            try:
                # Description
                des = soup.find_all('div', class_='c-abstract')[0].text
                soup = BeautifulSoup(str(result), 'html.parser')
                des = self._format(des).lstrip(str(time)).strip()
            except IndexError:
                try:
                    des = des.replace('mn', '')
                except (UnboundLocalError, AttributeError):
                    des = None
            if time:
                time = time.split('-')[0].strip()
            # Baidu result links are obfuscated, so resolving them requires visiting each one.
            # Link analysis is skipped here for performance reasons.
            # if href is not None:
            #     try:
            #         # 1-second timeout for performance reasons
            #         r = requests.get(href, timeout=1)
            #         href = r.url
            #     except:
            #         # Fetching the page failed; fall back to the original obfuscated link
            #         href = href
            #     # Analyse the link
            #     if href:
            #         parse = urlparse(href)
            #         domain = parse.netloc
            #         prepath = parse.path.split('/')
            #         path = []
            #         for loc in prepath:
            #             if loc != '':
            #                 path.append(loc)
            #     else:
            #         domain = None
            #         path = None
            try:
                is_not_special = result['tpl'] not in [
                    'short_video_pc', 'sp_realtime_bigpic5', 'bk_polysemy'
                ]
            except KeyError:
                is_not_special = False
            if is_not_special:  # make sure the result is not a special type
                # Get the visible domain
                try:
                    domain = result.find('div', class_='c-row').find(
                        'div',
                        class_='c-span-last').find('div',
                                                   class_='se_st_footer').find(
                                                       'a',
                                                       class_='c-showurl').text
                except Exception:
                    try:
                        domain = result.find('div', class_='c-row').find(
                            'div', class_='c-span-last').find(
                                'p', class_='op-bk-polysemy-move').find(
                                    'span', class_='c-showurl').text
                    except Exception:
                        try:
                            domain = result.find('div',
                                                 class_='se_st_footer').find(
                                                     'a',
                                                     class_='c-showurl').text
                        except:
                            domain = None
                if domain:
                    domain = domain.replace(' ', '')
            else:
                domain = None
            # Add the result
            if title and href and is_not_special:
                res.append({
                    'title': title,
                    'des': des,
                    'origin': domain,
                    'url': href,
                    'time': time,
                    'type': 'result'
                })
        soup = BeautifulSoup(content, 'html.parser')
        try:
            soup = BeautifulSoup(str(soup.findAll('div', id='page')[0]),
                                 'html.parser')
            # Pagination
            pages_ = soup.findAll('span', class_='pc')
        except IndexError:
            pages_ = []
        pages = []
        for _ in pages_:
            pages.append(int(_.text))
        # When there is only one page of results, Baidu hides the bottom pagination bar,
        # so default to a single page here; otherwise `max(pages)` on an empty list would fail.
        if not pages:
            pages = [1]
        # Assemble the final result
        result = pre_results
        result.extend(res)
        return {
            'results': result,
            # Highest page number
            'pages': max(pages)
        }
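
The pagination handling at the end of parse_web (and of the image parsers above) boils down to collecting the page numbers from the span.pc elements inside div#page and taking the maximum, falling back to 1 when Baidu omits the bar for single-page result sets. A small self-contained sketch of that step, using a hand-written snippet in place of a real page:

from bs4 import BeautifulSoup

# Hand-written pagination markup using the same id/class names the parsers look for.
html = ('<div id="page">'
        '<span class="pc">1</span><span class="pc">2</span><span class="pc">3</span>'
        '</div>')

soup = BeautifulSoup(html, 'html.parser')
page_div = soup.find('div', id='page')
pages = [int(span.text) for span in page_div.find_all('span', class_='pc')] if page_div else []
# Single-page result sets have no pagination bar, so fall back to page 1.
print(max(pages) if pages else 1)  # -> 3
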
Example 4
    def parse_web(self, content: str, exclude: list) -> dict:
        """解析百度网页搜索的页面源代码.

        Args:
            content (str): 已经转换为UTF-8编码的百度网页搜索HTML源码.
            exclude (list): 要屏蔽的控件.

        Returns:
            dict: 解析后的结果
        """
        soup = BeautifulSoup(content, "html.parser")
        if soup.find("div", id="content_left") is None:
            raise ParseError("Invalid HTML content.")
        # Get the total number of search results
        num = int(
            str(soup.find("span", class_="nums_text").text).strip(
                "百度为您找到相关结果约").strip("个").replace(",", ""))
        # Define pre-results (calculator, related searches, etc.)
        pre_results = []
        # Pre-process news
        if "news" not in exclude:
            news = soup.find("div",
                             class_="result-op",
                             tpl="sp_realtime_bigpic5",
                             srcid="19")
            news_detail = self.webSubParser.parse_news_block(news)
        # Pre-process short videos
        if "video" not in exclude:
            video = soup.find("div", class_="op-short-video-pc")
            video_results = self.webSubParser.parse_video_block(video)
        # Pre-process the calculator
        if "calc" not in exclude:
            calc = soup.find("div", class_="op_new_cal_screen")
        # Pre-process related searches
        if "related" not in exclude:
            try:
                _related = soup.find("div",
                                     id="rs").find("table").find_all("th")
            except AttributeError:
                _related = []
            related = []
            # Append each related search term
            for _ in _related:
                if _.text:
                    related.append(_.text)
        # Pre-process Baike (encyclopedia)
        if "baike" not in exclude:
            baike = soup.find("div", class_="c-container", tpl="bk_polysemy")
            baike = self.webSubParser.parse_baike_block(baike)
        # Pre-process Tieba
        if "tieba" not in exclude:
            tieba = BeautifulSoup(content, "html.parser").find("div",
                                                               srcid="10")
            tieba = self.webSubParser.parse_tieba_block(tieba)
        # Pre-process blogs
        article_tags = BeautifulSoup(content, "html.parser").findAll("article")
        if "blog" not in exclude:
            blog = None
            for tmp in article_tags:
                if tmp["class"][-1].startswith("open-source-software-blog"):
                    blog = tmp
                    break
            blog = self.webSubParser.parse_blog_block(blog)
        # Pre-process Gitee
        if "gitee" not in exclude:
            gitee = None
            for tmp in article_tags:
                if tmp["class"][-1].startswith("osc-gitee"):
                    gitee = tmp
                    break
            gitee = self.webSubParser.parse_gitee_block(gitee)
        # Load Tieba
        if "tieba" not in exclude and tieba:
            pre_results.append(dict(type="tieba", result=tieba))
        # Load blogs
        if "blog" not in exclude and blog:
            pre_results.append(dict(type="blog", result=blog))
        # Load Gitee
        if "gitee" not in exclude and gitee:
            pre_results.append(dict(type="gitee", result=gitee))
        # Load the total result count
        if num != 0:
            pre_results.append(dict(type="total", result=num))
        # Load the calculator result
        if "calc" not in exclude and calc:
            pre_results.append(
                dict(
                    type="calc",
                    process=str(
                        calc.find("p",
                                  class_="op_new_val_screen_process").find(
                                      "span").text),
                    result=str(
                        calc.find("p", class_="op_new_val_screen_result").find(
                            "span").text),
                ))
        # Load related searches
        if "related" not in exclude and related:
            pre_results.append(dict(type="related", results=related))
        # Load news
        if "news" not in exclude and news_detail:
            pre_results.append(dict(type="news", results=news_detail))
        # Load short videos
        if "video" not in exclude and video_results:
            pre_results.append(dict(type="video", results=video_results))
        # Load Baike
        if "baike" not in exclude and baike:
            pre_results.append(dict(type="baike", result=baike))
        # Pre-process the page source for the organic results
        soup = BeautifulSoup(content, "html.parser")
        results = soup.findAll("div", class_="result")
        res = []
        for result in results:
            des = None
            # Skip results that carry no template id
            try:
                result["tpl"]
            except KeyError:
                continue
            soup = BeautifulSoup(self._minify(str(result)), "html.parser")
            # Link
            href = soup.find("a").get("href").strip()
            # Title
            title = self._format(str(soup.find("a").text))
            # Time
            try:
                time = self._format(
                    soup.findAll("div", class_="c-abstract")[0].find(
                        "span", class_="newTimeFactor_before_abs").text)
            except (AttributeError, IndexError):
                time = None
            try:
                # Description
                des = soup.find_all("div", class_="c-abstract")[0].text
                soup = BeautifulSoup(str(result), "html.parser")
                des = self._format(des).lstrip(str(time)).strip()
            except IndexError:
                try:
                    des = des.replace("mn", "")
                except (UnboundLocalError, AttributeError):
                    des = None
            if time:
                time = time.split("-")[0].strip()
            # Baidu result links are obfuscated, so resolving them requires visiting each one.
            # Link analysis is skipped here for performance reasons.
            # if href is not None:
            #     try:
            #         # 1-second timeout for performance reasons
            #         r = requests.get(href, timeout=1)
            #         href = r.url
            #     except:
            #         # Fetching the page failed; fall back to the original obfuscated link
            #         href = href
            #     # Analyse the link
            #     if href:
            #         parse = urlparse(href)
            #         domain = parse.netloc
            #         prepath = parse.path.split('/')
            #         path = []
            #         for loc in prepath:
            #             if loc != '':
            #                 path.append(loc)
            #     else:
            #         domain = None
            #         path = None
            try:
                result["tpl"]
            except KeyError:
                # Should not happen: results without a template id were skipped above
                print(result.prettify())
            is_not_special = (result["tpl"] not in [
                "short_video_pc",
                "sp_realtime_bigpic5",
                "bk_polysemy",
                "tieba_general",
            ] and result.find("article") is None)
            domain = None
            if is_not_special:  # make sure the result is not a special type
                # Get the visible domain
                try:
                    domain = (result.find("div", class_="c-row").find(
                        "div", class_="c-span-last").find(
                            "div", class_="se_st_footer").find(
                                "a", class_="c-showurl").text)
                except Exception:
                    try:
                        domain = (result.find("div", class_="c-row").find(
                            "div", class_="c-span-last").find(
                                "p", class_="op-bk-polysemy-move").find(
                                    "span", class_="c-showurl").text)
                    except Exception:
                        try:
                            domain = (result.find("div",
                                                  class_="se_st_footer").find(
                                                      "a",
                                                      class_="c-showurl").text)
                        except:
                            domain = None
                if domain:
                    domain = domain.replace(" ", "")
            # Add the result
            if title and href and is_not_special:
                res.append({
                    "title": title,
                    "des": des,
                    "origin": domain,
                    "url": href,
                    "time": time,
                    "type": "result",
                })
        soup = BeautifulSoup(content, "html.parser")
        soup = BeautifulSoup(str(soup.findAll("div", id="page")[0]),
                             "html.parser")
        # Pagination
        pages_ = soup.findAll("span", class_="pc")
        pages = []
        for _ in pages_:
            pages.append(int(_.text))
        # When there is only one page of results, Baidu hides the bottom pagination bar,
        # so default to a single page here; otherwise `max(pages)` on an empty list would fail.
        if not pages:
            pages = [1]
        # Assemble the final result
        result = pre_results
        result.extend(res)
        return {
            "results": result,
            # Highest page number
            "pages": max(pages),
        }
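
For context, here is a hypothetical call site for the parse_web variant above. The helper name show_organic_results, the parser instance and the html variable are assumptions for illustration; only the method signature and the shape of the returned dict come from the code itself.

def show_organic_results(parser, html):
    """parser: instance of the class defining parse_web; html: UTF-8 decoded page source."""
    parsed = parser.parse_web(html, exclude=["news", "video"])  # skip the news and video widgets
    for item in parsed["results"]:
        # Widget blocks carry types such as "total", "related" or "baike";
        # organic hits are tagged "result".
        if item.get("type") == "result":
            print(item["title"], item["origin"], item["url"])
    print("pages:", parsed["pages"])
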
Example 5
    def parse_advertising(self, content: str) -> dict:
        """解析百度网页搜索的页面源代码.

        Args:
            content (str): 已经转换为UTF-8编码的百度网页搜索HTML源码.
            exclude (list): 要屏蔽的控件.

        Returns:
            dict: 解析后的结果
        """
        soup = BeautifulSoup(content, "html.parser")
        if soup.find("div", id="content_left") is None:
            raise ParseError("Invalid HTML content.")
        # Get the total number of search results
        num = int(
            str(soup.find("span", class_="nums_text").text)
            .strip("百度为您找到相关结果约")
            .strip("个")
            .replace(",", "")
        )

        soup = BeautifulSoup(content, "html.parser")
        # Promoted (ad) results are identified by the presence of these attributes
        results = soup.findAll("div", attrs={"cmatchid": True, "data-ecimtimesign": True})

        res = []
        for result in results:
            des = None
            try:
                result["cmatchid"]
            except:
                continue
            soup = BeautifulSoup(self._minify(str(result)), "html.parser")
            # Link: there may be several promoted links, but only the first is taken; the rest carry the same information
            href = soup.find("a").get("href").strip()
            # Title
            title = self._format(str(soup.find("a").text))
            # Time
            try:
                time = self._format(
                    soup.findAll("div", class_="c-abstract")[0]
                    .find("span", class_="newTimeFactor_before_abs")
                    .text
                )
            except (AttributeError, IndexError):
                time = None
            try:
                # Description
                des = soup.find_all("div", class_="c-abstract")[0].text
                soup = BeautifulSoup(str(result), "html.parser")
                des = self._format(des).lstrip(str(time)).strip()
            except IndexError:
                try:
                    des = des.replace("mn", "")
                except (UnboundLocalError, AttributeError):
                    des = None
            if time:
                time = time.split("-")[0].strip()
            # Baidu result links are obfuscated, so resolving them requires visiting each one.
            # Link analysis is skipped here for performance reasons.
            # if href is not None:
            #     try:
            #         # 1-second timeout for performance reasons
            #         r = requests.get(href, timeout=1)
            #         href = r.url
            #     except:
            #         # Fetching the page failed; fall back to the original obfuscated link
            #         href = href
            #     # Analyse the link
            #     if href:
            #         parse = urlparse(href)
            #         domain = parse.netloc
            #         prepath = parse.path.split('/')
            #         path = []
            #         for loc in prepath:
            #             if loc != '':
            #                 path.append(loc)
            #     else:
            #         domain = None
            #         path = None
            # Debug check: dump promoted blocks that carry no "tpl" attribute
            try:
                result["tpl"]
            except KeyError:
                print(result.prettify())
            # is_not_special = (
            #     result["tpl"]
            #     not in [
            #         "short_video_pc",
            #         "sp_realtime_bigpic5",
            #         "bk_polysemy",
            #         "tieba_general",
            #     ]
            #     and result.find("article") is None
            # )
            domain = None
            # if is_not_special:  # make sure the result is not a special type
            if True:
                # Get the visible domain
                try:
                    domain = (
                        result.find("div", class_="c-row")
                        .find("div", class_="c-span-last")
                        .find("div", class_="se_st_footer")
                        .find("a", class_="c-showurl")
                        .text
                    )
                except Exception:
                    try:
                        domain = (
                            result.find("div", class_="c-row")
                            .find("div", class_="c-span-last")
                            .find("p", class_="op-bk-polysemy-move")
                            .find("span", class_="c-showurl")
                            .text
                        )
                    except Exception:
                        try:
                            domain = (
                                result.find("div", class_="se_st_footer")
                                .find("a", class_="c-showurl")
                                .text
                            )
                        except:
                            domain = None
                if domain:
                    domain = domain.replace(" ", "")
            # Add the result
            if title and href:
                res.append(
                    {
                        "title": title,
                        "des": des,
                        "origin": domain,
                        "url": href,
                        "time": time,
                    }
                )

        soup = BeautifulSoup(content, "html.parser")
        soup = BeautifulSoup(str(soup.findAll("div", id="page")[0]), "html.parser")
        # Pagination
        pages_ = soup.findAll("span", class_="pc")
        pages = []
        for _ in pages_:
            pages.append(int(_.text))
        # When there is only one page of results, Baidu hides the bottom pagination bar,
        # so default to a single page here; otherwise `max(pages)` on an empty list would fail.
        if not pages:
            pages = [1]
        # Assemble the final result
        result = res
        return {
            "results": result,
            # Highest page number
            "pages": max(pages),
        }
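
The selection step in parse_advertising is worth isolating: promoted blocks carry no stable class or tpl value, so they are matched purely by the presence of the cmatchid and data-ecimtimesign attributes, which BeautifulSoup supports via attrs={...: True}. A minimal sketch on made-up markup:

from bs4 import BeautifulSoup

# Made-up markup: the promoted block is recognised only by its attributes.
html = ('<div cmatchid="225" data-ecimtimesign="1">'
        '<a href="http://example.com">Ad title</a></div>'
        '<div class="result" tpl="se_com_default"><a href="#">Organic hit</a></div>')

soup = BeautifulSoup(html, 'html.parser')
# attrs={...: True} matches tags that simply carry the attribute, whatever its value.
ads = soup.find_all('div', attrs={'cmatchid': True, 'data-ecimtimesign': True})
for ad in ads:
    print(ad.find('a').text, ad['cmatchid'])  # -> Ad title 225
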