예제 #1
0
 def setUp(self):
     """Prepare a fresh spider instance and expected fixtures before each test."""
     # Import the package from the current working directory (repo root) so the
     # locally checked-out baiduspider is used rather than an installed copy.
     # NOTE: the path append must run before the imports below.
     sys.path.append(os.path.abspath('.'))
     from baiduspider import BaiduSpider
     from baiduspider.errors import ParseError  # imported to verify availability; not used directly here
     self.spider = BaiduSpider()
     # Base URL of the saved HTML test assets, served via jsDelivr.
     self.assets_base_url = 'https://cdn.jsdelivr.net/gh/BaiduSpider/BaiduSpiderTestAssets/pic'
     # One entry expected to appear in the parsed picture-search results.
     self.normal_res = {
         'host': 'www.cwq.com',
         'title': 'python中文社区',
         'url': 'http://img.cwq.com/201611/581c95c35ca62.png'
     }
예제 #2
0
class PicTestCase(TestCase):
    def __init__(self, methodName):
        """Test Baidu picture search.

        This suite exercises BaiduSpider.`search_pic`.
        """
        super().__init__(methodName)

    def setUp(self):
        """Prepare a fresh spider instance and expected fixtures before each test."""
        # Import the package from the current working directory so the local
        # checkout is used; the path append must run before the imports below.
        sys.path.append(os.path.abspath('.'))
        from baiduspider import BaiduSpider
        from baiduspider.errors import ParseError  # imported to verify availability; not used directly here
        self.spider = BaiduSpider()
        # Base URL of the saved HTML test assets, served via jsDelivr.
        self.assets_base_url = 'https://cdn.jsdelivr.net/gh/BaiduSpider/BaiduSpiderTestAssets/pic'
        # One entry expected to appear in the parsed picture-search results.
        self.normal_res = {
            'host': 'www.cwq.com',
            'title': 'python中文社区',
            'url': 'http://img.cwq.com/201611/581c95c35ca62.png'
        }

    def __get_asset(self, name):
        # Download the saved results page `test_pic_<name>.html` as text.
        return requests.get('{base_url}/test_pic_{name}.html'.format(
            base_url=self.assets_base_url, name=name)).text

    def test_pic_normal(self):
        """Test an ordinary picture-search results page."""
        asset = self.__get_asset('normal')
        result = self.spider.parser.parse_pic(asset)
        self.assertIn(self.normal_res, result['results'])

    def test_spider_request(self):
        """Test that the spider can fetch a live results page."""
        result = self.spider.search_web('Python')
        self.assertIsNotNone(result['results'])
예제 #3
0
    def parse(self, response):
        """Parse one page of Baidu Zhidao search results and schedule follow-ups.

        Yields a SplashRequest per zhidao.baidu answer link found on the page,
        then a SplashRequest for the next results page while results keep coming.
        """
        # BUG FIX: the original computed `int(response.meta["pn"]) | 1` — a
        # *bitwise* OR, which forced every even page number to the next odd one
        # (2 -> 3, 4 -> 5), silently skipping half of the result pages. Use a
        # plain default of 1 when "pn" is missing or falsy instead.
        pn = int(response.meta.get("pn") or 1)
        spider = BaiduSpider()
        result = spider.search_zhidao(query=self.keyword, pn=pn)

        results = result["results"]
        # Loop variable renamed from `result` to stop shadowing the response above.
        for entry in results:
            url = entry["url"].replace("http:", "https:")
            # Only crawl answers actually hosted on zhidao.baidu.com.
            if "zhidao.baidu" in url:
                yield SplashRequest(url,
                                    self.parse_zhidao,
                                    endpoint="execute",
                                    args={"lua_source": lua_script},
                                    meta={"origin_url": url})

        # Keep paginating as long as the current page returned any results.
        if results:
            next_pn = pn + 1
            yield SplashRequest(self.index_url,
                                self.parse,
                                meta={"pn": next_pn})
예제 #4
0
import re
from time import time
from typing import List, Dict, Union, Iterator

import yaml
from baiduspider import BaiduSpider, ParseError

from safeSearch.error import QueryTooLongException

# Module-level singletons: one shared spider instance and the timestamp of the
# most recent query (presumably consumed by rate-limiting logic elsewhere in
# this module — not visible here).
SPIDER = BaiduSpider()
LAST_QUERY_TIME = time()

# def build_baidu_url(word: str, sites: list = None) -> str:
#     base_url = "https://www.baidu.com/s?wd={wd}".format(wd=word)
#     site_filter = " site:(" + " | ".join(sites) + ")" if sites else ""
#
#     return base_url + site_filter

# def build_google_url(word: str, sites: list = None) -> str:
#     base_url = "https://www.google.com/search?q={q}".format(q=word)
#     site_filter = "+inurl:(+" + "+|+".join(sites) + ")" if sites else ""
#
#     return base_url + site_filter


def split_site_filter(word: str, site: list = None) -> List[str]:
    """Used to recursively split site filter conditions to avoid long query"""

    last_valid_query = ""
    for i in range(len(site)):
        if i == 0:
예제 #5
0
#!/usr/bin/python
# coding:utf-8
"""Scrape Baidu web-search results for the query '博士 自杀' and export them via pandas."""

from baiduspider import BaiduSpider  # import BaiduSpider
from pprint import pprint  # import pretty-print
import pandas as pd

# Fetch Baidu search results; the search keyword is '博士 自杀'
# pprint(BaiduSpider().search_web('博士 自杀', pn=4))
spider = BaiduSpider()  # reuse one spider instance instead of building one per request
new_result = []
for i in range(1, 1010):
    # FIX: the original re-ran search_web() for every single result on the page
    # (once in the length check, then once per inner-loop index), downloading the
    # same page dozens of times and risking inconsistent snapshots between
    # fetches. Fetch each page exactly once and reuse the parsed results.
    results = spider.search_web('博士 自杀', pn=i)["results"]
    length = len(results)
    print(length)
    # Preserve the original slice: the first entry and the last two are skipped
    # (presumably non-article rows such as totals/related — TODO confirm intent).
    new_result.extend(results[1:length - 1])

df = pd.DataFrame(new_result)
# Fixed column order for the exported table.
order = ['time', 'title', 'des', 'origin', 'url', 'type']
print(df)
df = df[order]
# Map result keys to Chinese column headers.
columns_map = {
    'time': '时间',
    'title': "标题",
    'des': "描述",
    'origin': "来源",
    'url': "链接",
    'type': "种类"
}
df.rename(columns=columns_map, inplace=True)
df.fillna(' ', inplace=True)
import platform
import pprint

import requests
# 导入BaiduSpider
from baiduspider import BaiduSpider
from tqdm import tqdm, trange

# from data_process_timeout import null_callback, time_out
from data_process_utils import (char_unify_convertor, convert_cn_colon_to_en,
                                convert_en_punct_to_cn, del_spaces,
                                replace_1_with_l, replace_l_with_1,
                                rm_pinyin_yinjie)

# Instantiate a single shared BaiduSpider
spider = BaiduSpider()

# Target the competition data's "train" split
target_file_name = 'train'
file_to_write = f'{target_file_name}_searched_results.pkl'

# Adjust this path to match the actual data-directory layout
target_file = f'../official_data/{target_file_name}.csv'

# Memo (cache) file for queries that have already been scraped
search_scraper_memo_path = f'search_scraper_memo_{target_file_name}.pkl'

# 多个站点搜索,均记录,分别统计
# 实际上发现:mofangge爬取速度过慢,有时间的话可以做;百度知道上的答案格式不统一,后续提取表达式和答案较为困难
search_site_lists = [
    'zybang.com',
    # 'mofangge.com',
예제 #7
0
    def setUp(self):
        """Create a fresh BaiduSpider instance before each test."""
        from baiduspider import BaiduSpider

        self.spider = BaiduSpider()
예제 #8
0
class SpiderTestCase(TestCase):
    """
    Live tests for BaiduSpider, one per supported search vertical.
    """

    def setUp(self):
        """Create a fresh BaiduSpider instance before each test."""
        from baiduspider import BaiduSpider

        self.spider = BaiduSpider()

    def test_search_web(self):
        """Validate web-search results (tieba excluded) across the first pages."""
        for pn in range(1, 10):
            python = self.spider.search_web("python", exclude=["tieba"], pn=pn)
            total = False
            # print(python)
            for i in python["results"]:
                # Check each entry for the fields its declared type should carry.
                if i["type"] == "total":
                    total = "total" in python  # the response carries two "total"s
                    self.assertTrue(type(i["result"]) == int)
                elif i["type"] == "related":
                    self.assertEqual(type(i["results"]), list)
                elif i["type"] == "calc":
                    self.assertIn("process", i)
                elif i["type"] == "news":
                    self.are_in(["author", "time", "title", "url", "des"], i["results"])
                elif i["type"] == "video":
                    self.are_in(
                        ["cover", "origin", "length", "url", "title"], i["results"]
                    )
                elif i["type"] == "baike":
                    self.are_in(
                        ["cover", "cover-type", "des", "url", "title"], i["result"]
                    )
                elif i["type"] == "blog":
                    self.are_in(["blogs", "url", "title"], i["result"])
                elif i["type"] == "gitee":
                    self.are_in(
                        [
                            "star",
                            "fork",
                            "watch",
                            "url",
                            "title",
                            "license",
                            "lang",
                            "status",
                        ],
                        i["result"],
                    )
                elif i["type"] == "result":
                    self.are_in(["des", "origin", "title", "url"], i)
                else:
                    # Any unrecognized result type fails the test outright.
                    self.assertTrue(False)
                # print(i['type'])
                # The excluded "tieba" type must never appear.
                self.assertNotEqual(i["type"], "tieba")
            self.assertTrue(total)

    def test_search_pic(self):
        """Validate picture-search results."""
        python = self.spider.search_pic("python")
        self.assertIn("total", python)
        self.assertEqual(type(python["total"]), int)
        for i in python["results"]:
            self.assertTrue(i["title"])
            self.assertTrue(i["url"])
            self.assertIn("host", i)

    def test_search_video(self):
        """Validate video-search results."""
        python = self.spider.search_video("python")
        self.assertIn("total", python)
        self.assertEqual(type(python["total"]), int)
        for i in python["results"]:
            self.assertTrue(i["title"])
            self.assertTrue(i["url"])
            self.assertIn("img", i)
            self.assertIn("time", i)

    def test_search_news(self):
        """Validate news-search results."""
        python = self.spider.search_news("python")
        self.assertEqual(type(python["total"]), int)
        for i in python["results"]:
            self.are_in(["author", "des", "date", "title", "url"], i)

    def test_search_wenku(self):
        """Validate wenku (document) search results; tolerate a known parser crash."""
        try:
            python = self.spider.search_wenku("python")
            self.assertEqual(type(python["total"]), int)
            for i in python["results"]:
                self.are_in(
                    ["downloads", "pages", "date", "des", "title", "url", "type"], i
                )
        except UnboundLocalError:
            # Known upstream failure mode inside search_wenku; treated as a skip.
            pass

    def test_search_jingyan(self):
        """Validate jingyan (experience) search results."""
        python = self.spider.search_jingyan("python")
        self.assertEqual(type(python["total"]), int)
        for i in python["results"]:
            self.are_in(["title", "url", "des", "date", "category", "votes"], i)

    def test_search_baike(self):
        """Validate baike (encyclopedia) search results."""
        python = self.spider.search_baike("python")
        self.assertEqual(type(python["total"]), int)
        for i in python["results"]:
            self.are_in(["title", "des", "date", "url"], i)

    def test_zhidao(self):
        """Validate zhidao (Q&A) search results."""
        python = self.spider.search_zhidao("python")
        self.assertEqual(type(python["total"]), int)
        for i in python["results"]:
            self.are_in(["title", "des", "date", "url", "count"], i)

    def are_in(self, members: list, container: list):
        """Assert that every item in *members* is present in *container*."""
        for i in members:
            self.assertIn(i, container)
"""
@author: wanghongliang
@file: baidu_video.py
@time: 2021/1/28 9:34 
"""
from baiduspider import BaiduSpider
from pprint import pprint

# 实例化BaiduSpider
spider = BaiduSpider()

# 搜索网页
# pprint(spider.search_web(query='Python'))

# pprint(spider.search_pic(query='person', pn=1))

pprint(spider.search_video(query='car', pn=1))
예제 #10
0
    def setUp(self):
        """Prepare a spider instance and all expected web-search fixtures."""
        # Import the package from the current working directory so the local
        # checkout is used; the path append must run before the imports below.
        sys.path.append(os.path.abspath("."))
        from baiduspider import BaiduSpider
        from baiduspider.errors import ParseError, UnknownError

        self.spider = BaiduSpider()
        # Base URL of the saved HTML test assets (pinned to @master), via jsDelivr.
        self.assets_base_url = (
            "https://cdn.jsdelivr.net/gh/BaiduSpider/BaiduSpiderTestAssets@master/web"
        )
        # Expected entry for an ordinary web result.
        self.normal_res = {
            "title": "Welcome to Python.org",
            "des":
            "The official home of the Python Programming Language... # Python 3: Simple output (with Unicode) >>> print(\"Hello, I'm Python!\") Hello, I'm Python!",
            "url":
            "http://www.baidu.com/link?url=yC-vpJc3cGCINc7SrFvV0A5-mBa3lrOseRMxZzZxXmlh1TqtxC8jgrOPHgSJi7_O",
            "time": None,
            "type": "result",
            "origin": "www.python.org/",
        }
        # Expected entry inside a video sub-result block.
        self.video_res = {
            "title": "python在excel中神运用,亮瞎眼的操作哦",
            "url":
            "https://baijiahao.baidu.com/s?id=1659418735845772463&wfr=content",
            "cover":
            "https://vdposter.bdstatic.com/5ecdac23471e6248259e256427ea66c3.jpeg?x-bce-process=image/resize,m_fill,w_242,h_182/format,f_jpg/quality,Q_100",
            "length": "05:41",
            "origin": "好看视频",
        }
        # Expected entry inside a news sub-result block.
        self.news_res = {
            "author": "国际在线",
            "time": "9分钟前",
            "title": "特朗普确诊新冠!",
            "url":
            "http://www.baidu.com/link?url=_APr4uGsSQzeq7MRkeoxLZlS6TfL8np6zzDnQqVuM9_Kwby5rypESvXHhX5ByEBChsusU4ZO_0p4smy0iz4iP0Kh2QsACY9s1_Fa1YACavW",
            "des": None,
        }
        # Expected baike (encyclopedia) result with an image cover.
        self.baike_img_res = {
            "title":
            "Python(计算机程序设计语言)_百度百科",
            "des":
            "Python是一种跨平台的计算机程序设计语言。 是一个高层次的结合了解释性、编译性、互动性和面向对象的脚本语言。最初被设计用于编写自动化脚本(shell),随着版本的不断更新和语言...",
            "cover":
            "https://dss0.bdstatic.com/6Ox1bjeh1BF3odCf/it/u=783017482,219941889&fm=74&app=80&f=JPEG&size=f121,90?sec=1880279984&t=b639fbc82a72772a726d11888a54d8f6",
            "cover-type":
            "image",
            "url":
            "http://www.baidu.com/link?url=Clp7kAWYKDauuI0IomD4-yj3EPlzvzhtUsU8eODlD2b6rCmZ0R1mH3RgeuVxJ0QerYWOj1f2cI3gvqJPnDiaNa",
        }
        # Expected baike result with a video cover.
        self.baike_video_res = {
            "title":
            "我(汉语汉字)_百度百科",
            "des":
            "我,汉语常用字,读作wǒ,最早见于甲骨文,其甲骨文本义指奴隶社会里一种用来行刑杀人和肢解牲口的凶器,后由本义衍生出“手持大戉,呐喊示威”等意;但到了战国时代,“我”字本义所代表的凶器被后起的更优良的凶器淘汰,于是“我”字在汉...",
            "cover":
            "http://www.baidu.com/link?url=6VGNfYIuPl2uh-HOGwQnK04K4WL2MICdv6ZpoEIhhgxAUanK2l1aTp_6oC51mpYh8LKEem911tdb4pgp3fNK3UN6GPDqFg-iXcmj9aHzQ4xEodjoO0fsgst1Mf3XAW_DW4idF_QXDhBW_R-vskbcZK",
            "cover-type":
            "video",
            "url":
            "http://www.baidu.com/link?url=2_SPS_eUtRUiJS3eT5pvwHmstP1QBW8YXGzDxc3QRRb0xqWBNkIRbL-S8isFYHztETZv59iF_iDPV5ognLjNna",
        }
        # Expected baike result with no cover at all.
        self.baike_none_res = {
            "title":
            "ASTM A106无缝钢管_百度百科",
            "des":
            "ASTM A106无缝钢管是属于美标的无缝钢管,材质是普通碳钢系列。",
            "cover":
            None,
            "cover-type":
            None,
            "url":
            "http://www.baidu.com/link?url=uLJ0kfXAXVu14FztaB4KMU7N4yN5lJikkRBI3b8LeUGGCn-8UoyHbYjo1jyXpVEB95B3htArzho5yreAGJS0SElyhz1euRHtbIb8hzpLESe_Q3Zqrt-U8RJARsapbJ4WLSxyjusGQK-ft_Xflkboz_",
        }
        # Expected calculator widget result.
        self.calc_res = {
            "process": "20^5+107*13",
            "result": "3 201 391",
            "type": "calc",
        }
        # Expected "related searches" suggestions.
        self.related_res = [
            "python有什么用",
            "python为什么叫爬虫",
            "python教程",
            "Python官网",
            "python爬虫教程",
            "python和java",
            "Python代码",
            "Python软件",
            "Python3",
        ]
        # Expected page counts, totals, and error classes for the other tests.
        self.pages_res = 10
        self.pages_single_res = 1
        self.total_res = 74700000
        self.invalid_res = ParseError
        self.spider_invalid_param_res = ParseError
        self.spider_unknown_error_res = UnknownError
        self.no_related_res = []
        self.no_pager_res = 1
예제 #11
0
class WebTestCase(TestCase):
    def __init__(self, methodName):
        """Test Baidu web search.

        This suite exercises BaiduSpider.`search_web`.
        """
        super().__init__(methodName)

    def setUp(self):
        """Prepare a spider instance and all expected web-search fixtures."""
        # Import the package from the current working directory so the local
        # checkout is used; the path append must run before the imports below.
        sys.path.append(os.path.abspath("."))
        from baiduspider import BaiduSpider
        from baiduspider.errors import ParseError, UnknownError

        self.spider = BaiduSpider()
        # Base URL of the saved HTML test assets (pinned to @master), via jsDelivr.
        self.assets_base_url = (
            "https://cdn.jsdelivr.net/gh/BaiduSpider/BaiduSpiderTestAssets@master/web"
        )
        # Expected entry for an ordinary web result.
        self.normal_res = {
            "title": "Welcome to Python.org",
            "des":
            "The official home of the Python Programming Language... # Python 3: Simple output (with Unicode) >>> print(\"Hello, I'm Python!\") Hello, I'm Python!",
            "url":
            "http://www.baidu.com/link?url=yC-vpJc3cGCINc7SrFvV0A5-mBa3lrOseRMxZzZxXmlh1TqtxC8jgrOPHgSJi7_O",
            "time": None,
            "type": "result",
            "origin": "www.python.org/",
        }
        # Expected entry inside a video sub-result block.
        self.video_res = {
            "title": "python在excel中神运用,亮瞎眼的操作哦",
            "url":
            "https://baijiahao.baidu.com/s?id=1659418735845772463&wfr=content",
            "cover":
            "https://vdposter.bdstatic.com/5ecdac23471e6248259e256427ea66c3.jpeg?x-bce-process=image/resize,m_fill,w_242,h_182/format,f_jpg/quality,Q_100",
            "length": "05:41",
            "origin": "好看视频",
        }
        # Expected entry inside a news sub-result block.
        self.news_res = {
            "author": "国际在线",
            "time": "9分钟前",
            "title": "特朗普确诊新冠!",
            "url":
            "http://www.baidu.com/link?url=_APr4uGsSQzeq7MRkeoxLZlS6TfL8np6zzDnQqVuM9_Kwby5rypESvXHhX5ByEBChsusU4ZO_0p4smy0iz4iP0Kh2QsACY9s1_Fa1YACavW",
            "des": None,
        }
        # Expected baike (encyclopedia) result with an image cover.
        self.baike_img_res = {
            "title":
            "Python(计算机程序设计语言)_百度百科",
            "des":
            "Python是一种跨平台的计算机程序设计语言。 是一个高层次的结合了解释性、编译性、互动性和面向对象的脚本语言。最初被设计用于编写自动化脚本(shell),随着版本的不断更新和语言...",
            "cover":
            "https://dss0.bdstatic.com/6Ox1bjeh1BF3odCf/it/u=783017482,219941889&fm=74&app=80&f=JPEG&size=f121,90?sec=1880279984&t=b639fbc82a72772a726d11888a54d8f6",
            "cover-type":
            "image",
            "url":
            "http://www.baidu.com/link?url=Clp7kAWYKDauuI0IomD4-yj3EPlzvzhtUsU8eODlD2b6rCmZ0R1mH3RgeuVxJ0QerYWOj1f2cI3gvqJPnDiaNa",
        }
        # Expected baike result with a video cover.
        self.baike_video_res = {
            "title":
            "我(汉语汉字)_百度百科",
            "des":
            "我,汉语常用字,读作wǒ,最早见于甲骨文,其甲骨文本义指奴隶社会里一种用来行刑杀人和肢解牲口的凶器,后由本义衍生出“手持大戉,呐喊示威”等意;但到了战国时代,“我”字本义所代表的凶器被后起的更优良的凶器淘汰,于是“我”字在汉...",
            "cover":
            "http://www.baidu.com/link?url=6VGNfYIuPl2uh-HOGwQnK04K4WL2MICdv6ZpoEIhhgxAUanK2l1aTp_6oC51mpYh8LKEem911tdb4pgp3fNK3UN6GPDqFg-iXcmj9aHzQ4xEodjoO0fsgst1Mf3XAW_DW4idF_QXDhBW_R-vskbcZK",
            "cover-type":
            "video",
            "url":
            "http://www.baidu.com/link?url=2_SPS_eUtRUiJS3eT5pvwHmstP1QBW8YXGzDxc3QRRb0xqWBNkIRbL-S8isFYHztETZv59iF_iDPV5ognLjNna",
        }
        # Expected baike result with no cover at all.
        self.baike_none_res = {
            "title":
            "ASTM A106无缝钢管_百度百科",
            "des":
            "ASTM A106无缝钢管是属于美标的无缝钢管,材质是普通碳钢系列。",
            "cover":
            None,
            "cover-type":
            None,
            "url":
            "http://www.baidu.com/link?url=uLJ0kfXAXVu14FztaB4KMU7N4yN5lJikkRBI3b8LeUGGCn-8UoyHbYjo1jyXpVEB95B3htArzho5yreAGJS0SElyhz1euRHtbIb8hzpLESe_Q3Zqrt-U8RJARsapbJ4WLSxyjusGQK-ft_Xflkboz_",
        }
        # Expected calculator widget result.
        self.calc_res = {
            "process": "20^5+107*13",
            "result": "3 201 391",
            "type": "calc",
        }
        # Expected "related searches" suggestions.
        self.related_res = [
            "python有什么用",
            "python为什么叫爬虫",
            "python教程",
            "Python官网",
            "python爬虫教程",
            "python和java",
            "Python代码",
            "Python软件",
            "Python3",
        ]
        # Expected page counts, totals, and error classes for the other tests.
        self.pages_res = 10
        self.pages_single_res = 1
        self.total_res = 74700000
        self.invalid_res = ParseError
        self.spider_invalid_param_res = ParseError
        self.spider_unknown_error_res = UnknownError
        self.no_related_res = []
        self.no_pager_res = 1

    def __get_asset(self, name):
        # Download the saved results page `test_web_<name>.html` as text.
        return requests.get("{base_url}/test_web_{name}.html".format(
            base_url=self.assets_base_url, name=name)).text

    def test_normal_result(self):
        """Test ordinary web-search results."""
        asset = self.__get_asset("normal")
        result = self.spider.parser.parse_web(asset)
        self.assertIn(self.normal_res, result["results"])

    def test_video_result(self):
        """Test the video sub-results block."""
        asset = self.__get_asset("video")
        result = self.spider.parser.parse_web(asset)
        res = []
        for r in result["results"]:
            if r["type"] == "video":
                res = r
                break
        self.assertIn(self.video_res, res["results"])

    def test_news_result(self):
        """Test the news sub-results block."""
        asset = self.__get_asset("news")
        result = self.spider.parser.parse_web(asset)
        res = []
        for r in result["results"]:
            if r["type"] == "news":
                res = r
                break
        self.assertIn(self.news_res, res["results"])

    def test_baike_img_result(self):
        """Test baike results whose cover is an image."""
        asset = self.__get_asset("baike-img")
        result = self.spider.parser.parse_web(asset)
        res = {}
        for r in result["results"]:
            if r["type"] == "baike":
                res = r
                break
        self.assertEqual(self.baike_img_res, res["result"])

    def test_baike_video_result(self):
        """Test baike results whose cover is a video."""
        asset = self.__get_asset("baike-video")
        result = self.spider.parser.parse_web(asset)
        res = {}
        for r in result["results"]:
            if r["type"] == "baike":
                res = r
                break
        self.assertEqual(self.baike_video_res, res["result"])

    def test_baike_none_result(self):
        """Test baike results that have no cover."""
        asset = self.__get_asset("baike-none")
        result = self.spider.parser.parse_web(asset)
        res = {}
        for r in result["results"]:
            if r["type"] == "baike":
                res = r
                break
        self.assertEqual(self.baike_none_res, res["result"])

    def test_calc_result(self):
        """Test calculator widget results."""
        asset = self.__get_asset("calc")
        result = self.spider.parser.parse_web(asset)
        res = {}
        for r in result["results"]:
            if r["type"] == "calc":
                res = r
                break
        self.assertEqual(self.calc_res, res)

    def test_related_result(self):
        """Test related-search suggestions."""
        asset = self.__get_asset("related")
        result = self.spider.parser.parse_web(asset)
        res = []
        for r in result["results"]:
            if r["type"] == "related":
                res = r
                break
        self.assertEqual(self.related_res, res["results"])

    def test_result_pages(self):
        """Test the reported number of result pages."""
        asset = self.__get_asset("pages")
        result = self.spider.parser.parse_web(asset)
        self.assertEqual(self.pages_res, result["pages"])

    def test_result_pages_single(self):
        """Test the page count when there is only a single page."""
        asset = self.__get_asset("pages-single")
        result = self.spider.parser.parse_web(asset)
        self.assertEqual(self.pages_single_res, result["pages"])

    def test_result_total(self):
        """Test the total result count."""
        asset = self.__get_asset("total")
        result = self.spider.parser.parse_web(asset)
        res = 0
        for r in result["results"]:
            if r["type"] == "total":
                res = r
                break
        self.assertEqual(self.total_res, res["result"])

    def test_invalid_template(self):
        """Test how invalid HTML affects web-search parsing."""
        asset = self.__get_asset("invalid")
        self.assertRaises(self.invalid_res, self.spider.parser.parse_web,
                          asset)

    def test_spider_request(self):
        """Test that the spider can fetch a live results page."""
        result = self.spider.search_web("Python")
        self.assertIsNotNone(result["results"])

    def test_spider_invalid_param(self):
        """Test how invalid parameters affect web search."""
        self.assertRaises(self.spider_invalid_param_res,
                          self.spider.search_web, "")

    def test_spider_unknown_error(self):
        """Test how unknown errors affect web search."""
        self.assertRaises(self.spider_unknown_error_res,
                          self.spider.search_web, 123)

    def test_no_related(self):
        """Test parsing a page that has no related-search results."""
        asset = self.__get_asset("no_related")
        result = self.spider.parser.parse_web(asset)
        res = []
        for r in result["results"]:
            # This branch should never be taken on this asset
            if r["type"] == "related":  # pragma: no cover
                res = r
                break
        self.assertEqual(self.no_related_res, res)

    def test_no_pager(self):
        """Test parsing a results page that has no pager."""
        asset = self.__get_asset("no_pager")
        result = self.spider.parser.parse_web(asset)
        res = result["pages"]
        self.assertEqual(self.no_pager_res, res)
예제 #12
0
from baiduspider import BaiduSpider  # import BaiduSpider
from pprint import pprint  # import pretty-print

# Run a single Baidu web search for the keyword '爬虫' ("crawler")
# and pretty-print the parsed result dictionary.
results = BaiduSpider().search_web('爬虫')
pprint(results)
예제 #13
0
class BaiduSpiderOtherTestCase(TestCase):
    """Live tests for assorted web-search result types (calc, tieba, video, news...)."""

    def setUp(self) -> None:
        """Create a fresh BaiduSpider instance before each test."""
        from baiduspider import BaiduSpider

        self.spider = BaiduSpider()

    def test_calc(self):
        """A calculation query must yield at least one 'calc' result."""
        result = self.spider.search_web("12345679*9")
        flag = False
        for i in result["results"]:
            if i["type"] == "calc":
                # print(i['process'])
                # print(i['result'])
                flag = True
        self.assertTrue(flag)

    def test_tieba(self):
        """Tieba and plain results must carry their expected fields."""
        result = self.spider.search_web("python吧")
        for i in result["results"]:
            if i["type"] == "tieba":
                self.are_in(
                    [
                        "title", "des", "cover", "url", "followers", "hot",
                        "total"
                    ],
                    i["result"],
                )
            elif i["type"] == "result":
                self.are_in(["des", "origin", "title", "url"], i)

    def test_video(self):
        """Video, tieba, and plain results must carry their expected fields."""
        result = self.spider.search_web("视频")
        # print(result)
        for i in result["results"]:
            if i["type"] == "result":
                self.are_in(["des", "origin", "title", "url"], i)
            elif i["type"] == "tieba":
                self.are_in(
                    [
                        "title", "des", "cover", "url", "followers", "hot",
                        "total"
                    ],
                    i["result"],
                )
            elif i["type"] == "video":
                self.are_in(["length", "origin", "title", "url"],
                            i["results"][0])

    def test_news(self):
        """News sub-results must carry their expected fields."""
        result = self.spider.search_web("今日新闻")
        for i in result["results"]:
            if i["type"] == "news":
                self.are_in(["author", "time", "title", "url", "des"],
                            i["results"][0])

    def test_exclude_all(self):
        """With exclude=['all'], only plain results and totals may remain."""
        result = self.spider.search_web("python", exclude=["all"])
        for i in result["results"]:
            self.assertIn(i["type"], ["result", "total"])

    def test_page(self):
        """Smoke-test unusual queries and an out-of-range page number."""
        result = self.spider.search_web("ocaiueno")
        print(result)
        result = self.spider.search_web(
            "774f43c6744b47de98b1661d2344490b3761829a", pn=100)
        print(result)

    def are_in(self, members: list, container: list):
        """Assert that every item in *members* is present in *container*."""
        for i in members:
            self.assertIn(i, container)
예제 #14
0
"""Scrape Baidu ad results for the query '防水补漏' and dump each item to stdout."""
import time
import random
import os
import csv

# FIX: BaiduSpider and pprint were used below but never imported, so this
# script crashed with NameError before doing any work.
from baiduspider import BaiduSpider
from pprint import pprint

# CSV header columns for the output file.
fieldKey = ["title", "des", "origin", "url", "time"]

filename = "reult.csv"  # NOTE(review): likely a typo for "result.csv" -- kept to preserve output path

# FIX: open the CSV through a context manager so the handle is closed (and the
# header flushed) even if the scraping loop raises; the original leaked it.
with open(filename, 'w', encoding='utf-8', newline='') as file:

    writer = csv.DictWriter(file, fieldKey)
    writer.writeheader()

    # Instantiate BaiduSpider
    spider = BaiduSpider()

    # Walk ad-search result pages
    for i in range(0, 100):

        resultDic = spider.search_ads(query='防水补漏', pn=i)
        if len(resultDic["results"]) == 0:
            continue

        for item in resultDic["results"]:

            print(type(item))
            print("===================")
            pprint(item)
            print("===================")
예제 #15
0
from baiduspider import BaiduSpider
from pprint import pprint

# Create a single BaiduSpider and pretty-print the parsed web-search
# results for the literal query 'com.tencent.freestyle'.
spider = BaiduSpider()
results = spider.search_web(query='com.tencent.freestyle')
pprint(results)
예제 #16
0
class WebTestCase(TestCase):
    def __init__(self, methodName):
        """Test Baidu web search.

        This suite exercises BaiduSpider.`search_web`.
        """
        super().__init__(methodName)

    def setUp(self):
        """Prepare a spider instance and all expected web-search fixtures."""
        # Import the package from the current working directory so the local
        # checkout is used; the path append must run before the imports below.
        sys.path.append(os.path.abspath('.'))
        from baiduspider import BaiduSpider
        from baiduspider.errors import ParseError, UnknownError
        self.spider = BaiduSpider()
        # Base URL of the saved HTML test assets, served via jsDelivr.
        self.assets_base_url = 'https://cdn.jsdelivr.net/gh/BaiduSpider/BaiduSpiderTestAssets/web'
        # Expected entry for an ordinary web result.
        self.normal_res = {
            'title': 'Welcome to Python.org',
            'des':
            'The official home of the Python Programming Language... # Python 3: Simple output (with Unicode) >>> print("Hello, I\'m Python!") Hello, I\'m Python!',
            'url':
            'http://www.baidu.com/link?url=yC-vpJc3cGCINc7SrFvV0A5-mBa3lrOseRMxZzZxXmlh1TqtxC8jgrOPHgSJi7_O',
            'time': None,
            'type': 'result',
            'origin': 'www.python.org/'
        }
        # Expected entry inside a video sub-result block.
        self.video_res = {
            'title': 'python在excel中神运用,亮瞎眼的操作哦',
            'url':
            'https://baijiahao.baidu.com/s?id=1659418735845772463&wfr=content',
            'cover':
            'https://vdposter.bdstatic.com/5ecdac23471e6248259e256427ea66c3.jpeg?x-bce-process=image/resize,m_fill,w_242,h_182/format,f_jpg/quality,Q_100',
            'length': '05:41',
            'origin': '好看视频'
        }
        # Expected entry inside a news sub-result block.
        self.news_res = {
            'author': '国际在线',
            'time': '9分钟前',
            'title': '特朗普确诊新冠!',
            'url':
            'http://www.baidu.com/link?url=_APr4uGsSQzeq7MRkeoxLZlS6TfL8np6zzDnQqVuM9_Kwby5rypESvXHhX5ByEBChsusU4ZO_0p4smy0iz4iP0Kh2QsACY9s1_Fa1YACavW',
            'des': None
        }
        # Expected baike (encyclopedia) result with an image cover.
        self.baike_img_res = {
            'title':
            'Python(计算机程序设计语言)_百度百科',
            'des':
            'Python是一种跨平台的计算机程序设计语言。 是一个高层次的结合了解释性、编译性、互动性和面向对象的脚本语言。最初被设计用于编写自动化脚本(shell),随着版本的不断更新和语言...',
            'cover':
            'https://dss0.bdstatic.com/6Ox1bjeh1BF3odCf/it/u=783017482,219941889&fm=74&app=80&f=JPEG&size=f121,90?sec=1880279984&t=b639fbc82a72772a726d11888a54d8f6',
            'cover-type':
            'image',
            'url':
            'http://www.baidu.com/link?url=Clp7kAWYKDauuI0IomD4-yj3EPlzvzhtUsU8eODlD2b6rCmZ0R1mH3RgeuVxJ0QerYWOj1f2cI3gvqJPnDiaNa'
        }
        # Expected baike result with a video cover.
        self.baike_video_res = {
            'title':
            '我(汉语汉字)_百度百科',
            'des':
            '我,汉语常用字,读作wǒ,最早见于甲骨文,其甲骨文本义指奴隶社会里一种用来行刑杀人和肢解牲口的凶器,后由本义衍生出“手持大戉,呐喊示威”等意;但到了战国时代,“我”字本义所代表的凶器被后起的更优良的凶器淘汰,于是“我”字在汉...',
            'cover':
            'http://www.baidu.com/link?url=6VGNfYIuPl2uh-HOGwQnK04K4WL2MICdv6ZpoEIhhgxAUanK2l1aTp_6oC51mpYh8LKEem911tdb4pgp3fNK3UN6GPDqFg-iXcmj9aHzQ4xEodjoO0fsgst1Mf3XAW_DW4idF_QXDhBW_R-vskbcZK',
            'cover-type':
            'video',
            'url':
            'http://www.baidu.com/link?url=2_SPS_eUtRUiJS3eT5pvwHmstP1QBW8YXGzDxc3QRRb0xqWBNkIRbL-S8isFYHztETZv59iF_iDPV5ognLjNna'
        }
        # Expected baike result with no cover at all.
        self.baike_none_res = {
            'title':
            'ASTM A106无缝钢管_百度百科',
            'des':
            'ASTM A106无缝钢管是属于美标的无缝钢管,材质是普通碳钢系列。',
            'cover':
            None,
            'cover-type':
            None,
            'url':
            'http://www.baidu.com/link?url=uLJ0kfXAXVu14FztaB4KMU7N4yN5lJikkRBI3b8LeUGGCn-8UoyHbYjo1jyXpVEB95B3htArzho5yreAGJS0SElyhz1euRHtbIb8hzpLESe_Q3Zqrt-U8RJARsapbJ4WLSxyjusGQK-ft_Xflkboz_'
        }
        # Expected calculator widget result.
        self.calc_res = {
            'process': '20^5+107*13',
            'result': '3 201 391',
            'type': 'calc'
        }
        # Expected "related searches" suggestions.
        self.related_res = [
            'python有什么用', 'python为什么叫爬虫', 'python教程', 'Python官网', 'python爬虫教程',
            'python和java', 'Python代码', 'Python软件', 'Python3'
        ]
        # Expected page counts, totals, and error classes for the other tests.
        self.pages_res = 10
        self.pages_single_res = 1
        self.total_res = 74700000
        self.invalid_res = ParseError
        self.spider_invalid_param_res = ParseError
        self.spider_unknown_error_res = UnknownError

    def __get_asset(self, name):
        # Download the saved results page `test_web_<name>.html` as text.
        return requests.get('{base_url}/test_web_{name}.html'.format(
            base_url=self.assets_base_url, name=name)).text

    def test_normal_result(self):
        """Test ordinary web-search results."""
        asset = self.__get_asset('normal')
        result = self.spider.parser.parse_web(asset)
        self.assertIn(self.normal_res, result['results'])

    def test_video_result(self):
        """Test the video sub-results block."""
        asset = self.__get_asset('video')
        result = self.spider.parser.parse_web(asset)
        res = []
        for r in result['results']:
            if r['type'] == 'video':
                res = r
                break
        self.assertIn(self.video_res, res['results'])

    def test_news_result(self):
        """Test the news sub-results block."""
        asset = self.__get_asset('news')
        result = self.spider.parser.parse_web(asset)
        res = []
        for r in result['results']:
            if r['type'] == 'news':
                res = r
                break
        self.assertIn(self.news_res, res['results'])

    def test_baike_img_result(self):
        """Test baike results whose cover is an image."""
        asset = self.__get_asset('baike-img')
        result = self.spider.parser.parse_web(asset)
        res = {}
        for r in result['results']:
            if r['type'] == 'baike':
                res = r
                break
        self.assertEqual(self.baike_img_res, res['result'])

    def test_baike_video_result(self):
        """Test baike results whose cover is a video."""
        asset = self.__get_asset('baike-video')
        result = self.spider.parser.parse_web(asset)
        res = {}
        for r in result['results']:
            if r['type'] == 'baike':
                res = r
                break
        self.assertEqual(self.baike_video_res, res['result'])

    def test_baike_none_result(self):
        """Test baike results that have no cover."""
        asset = self.__get_asset('baike-none')
        result = self.spider.parser.parse_web(asset)
        res = {}
        for r in result['results']:
            if r['type'] == 'baike':
                res = r
                break
        self.assertEqual(self.baike_none_res, res['result'])

    def test_calc_result(self):
        """Test calculator widget results."""
        asset = self.__get_asset('calc')
        result = self.spider.parser.parse_web(asset)
        res = {}
        for r in result['results']:
            if r['type'] == 'calc':
                res = r
                break
        self.assertEqual(self.calc_res, res)

    def test_related_result(self):
        """Test related-search suggestions."""
        asset = self.__get_asset('related')
        result = self.spider.parser.parse_web(asset)
        res = []
        for r in result['results']:
            if r['type'] == 'related':
                res = r
                break
        self.assertEqual(self.related_res, res['results'])

    def test_result_pages(self):
        """Test the reported number of result pages."""
        asset = self.__get_asset('pages')
        result = self.spider.parser.parse_web(asset)
        self.assertEqual(self.pages_res, result['pages'])

    def test_result_pages_single(self):
        """Test the page count when there is only a single page."""
        asset = self.__get_asset('pages-single')
        result = self.spider.parser.parse_web(asset)
        self.assertEqual(self.pages_single_res, result['pages'])

    def test_result_total(self):
        """Test the total result count."""
        asset = self.__get_asset('total')
        result = self.spider.parser.parse_web(asset)
        res = 0
        for r in result['results']:
            if r['type'] == 'total':
                res = r
                break
        self.assertEqual(self.total_res, res['result'])

    def test_invalid_template(self):
        """Test how invalid HTML affects BaiduSpider."""
        asset = self.__get_asset('invalid')
        self.assertRaises(self.invalid_res, self.spider.parser.parse_web,
                          asset)

    def test_spider_request(self):
        """Test that the spider can fetch a live results page."""
        result = self.spider.search_web('Python')
        self.assertIsNotNone(result['results'])

    def test_spider_invalid_param(self):
        """Test how invalid parameters affect BaiduSpider."""
        self.assertRaises(self.spider_invalid_param_res,
                          self.spider.search_web, '')

    def test_spider_unknown_error(self):
        """Test how unknown errors affect BaiduSpider."""
        self.assertRaises(self.spider_unknown_error_res,
                          self.spider.search_web, 123)