def setUp(self):
    """Build the picture-search fixture: spider, asset base URL, expected item."""
    # Make the local checkout importable before the deferred package imports.
    sys.path.append(os.path.abspath('.'))
    from baiduspider import BaiduSpider
    from baiduspider.errors import ParseError

    self.spider = BaiduSpider()
    self.assets_base_url = 'https://cdn.jsdelivr.net/gh/BaiduSpider/BaiduSpiderTestAssets/pic'
    # One known-good result expected to appear in the parsed output.
    self.normal_res = dict(
        host='www.cwq.com',
        title='python中文社区',
        url='http://img.cwq.com/201611/581c95c35ca62.png',
    )
class PicTestCase(TestCase):
    """Tests for picture search (BaiduSpider.`search_pic`)."""

    def __init__(self, methodName):
        """Tests for picture search.

        This suite exercises BaiduSpider.`search_pic`.
        """
        super().__init__(methodName)

    def setUp(self):
        # Make the local checkout importable before the deferred imports.
        sys.path.append(os.path.abspath('.'))
        from baiduspider import BaiduSpider
        from baiduspider.errors import ParseError

        self.spider = BaiduSpider()
        self.assets_base_url = 'https://cdn.jsdelivr.net/gh/BaiduSpider/BaiduSpiderTestAssets/pic'
        # One known-good result expected to appear in the parsed output.
        self.normal_res = dict(
            host='www.cwq.com',
            title='python中文社区',
            url='http://img.cwq.com/201611/581c95c35ca62.png',
        )

    def __get_asset(self, name):
        """Download the named static HTML fixture used as parser input."""
        asset_url = '{base_url}/test_pic_{name}.html'.format(
            base_url=self.assets_base_url, name=name)
        return requests.get(asset_url).text

    def test_pic_normal(self):
        """An ordinary picture-search page parses into the expected item."""
        parsed = self.spider.parser.parse_pic(self.__get_asset('normal'))
        self.assertIn(self.normal_res, parsed['results'])

    def test_spider_request(self):
        """The spider can fetch a live results page."""
        self.assertIsNotNone(self.spider.search_web('Python')['results'])
def parse(self, response):
    """Parse one Baidu Zhidao search page and schedule follow-up requests.

    Yields a SplashRequest for every zhidao.baidu answer link found on the
    page, plus one request for the next results page while results keep
    coming back.
    """
    # BUG FIX: the original computed `int(response.meta["pn"]) | 1` — a
    # bitwise OR that forces the page number to be odd, so every even page
    # was silently skipped (2 -> 3, 4 -> 5, ...). `or` was intended: fall
    # back to page 1 only when the carried value is falsy/zero.
    pn = int(response.meta["pn"]) or 1
    spider = BaiduSpider()
    result = spider.search_zhidao(query=self.keyword, pn=pn)
    results = result["results"]
    # Renamed the loop variable: the original reused `result` and shadowed
    # the response dict above.
    for item in results:
        # Zhidao links may come back as http; normalise to https.
        url = item["url"].replace("http:", "https:")
        if url.find("zhidao.baidu") >= 0:
            yield SplashRequest(
                url,
                self.parse_zhidao,
                endpoint="execute",
                args={"lua_source": lua_script},
                meta={"origin_url": url},
            )
    # Keep paginating as long as the current page produced any results.
    if len(results) > 0:
        next_pn = pn + 1
        yield SplashRequest(self.index_url, self.parse, meta={"pn": next_pn})
import re from time import time from typing import List, Dict, Union, Iterator import yaml from baiduspider import BaiduSpider, ParseError from safeSearch.error import QueryTooLongException SPIDER = BaiduSpider() LAST_QUERY_TIME = time() # def build_baidu_url(word: str, sites: list = None) -> str: # base_url = "https://www.baidu.com/s?wd={wd}".format(wd=word) # site_filter = " site:(" + " | ".join(sites) + ")" if sites else "" # # return base_url + site_filter # def build_google_url(word: str, sites: list = None) -> str: # base_url = "https://www.google.com/search?q={q}".format(q=word) # site_filter = "+inurl:(+" + "+|+".join(sites) + ")" if sites else "" # # return base_url + site_filter def split_site_filter(word: str, site: list = None) -> List[str]: """Used to recursively split site filter conditions to avoid long query""" last_valid_query = "" for i in range(len(site)): if i == 0:
#!/usr/bin/python
# coding:utf-8
from baiduspider import BaiduSpider  # Baidu search client
from pprint import pprint  # pretty-print
import pandas as pd

# Collect Baidu web-search results for the keyword '博士 自杀' across many
# pages and assemble them into a renamed, ordered DataFrame.
# pprint(BaiduSpider().search_web('博士 自杀', pn=4))
new_result = []
spider = BaiduSpider()  # FIX: one client instance instead of one per request
for i in range(1, 1010):
    # BUG FIX: the original re-ran the identical network search for every
    # single result item (1 + length requests per page, and the repeated
    # responses could even differ between calls). Fetch each page once and
    # reuse the response.
    results = spider.search_web('博士 自杀', pn=i)["results"]
    length = len(results)
    print(length)
    # NOTE(review): range(1, length - 1) drops the first item and the last
    # two — presumably to skip non-result entries such as totals; confirm
    # against the parser's output layout.
    for j in range(1, length - 1):
        new_result.append(results[j])

df = pd.DataFrame(new_result)
order = ['time', 'title', 'des', 'origin', 'url', 'type']
print(df)
df = df[order]
# Map the English column names to their Chinese display names.
columns_map = {
    'time': '时间',
    'title': "标题",
    'des': "描述",
    'origin': "来源",
    'url': "链接",
    'type': "种类"
}
df.rename(columns=columns_map, inplace=True)
df.fillna(' ', inplace=True)
import platform import pprint import requests # 导入BaiduSpider from baiduspider import BaiduSpider from tqdm import tqdm, trange # from data_process_timeout import null_callback, time_out from data_process_utils import (char_unify_convertor, convert_cn_colon_to_en, convert_en_punct_to_cn, del_spaces, replace_1_with_l, replace_l_with_1, rm_pinyin_yinjie) # 实例化BaiduSpider spider = BaiduSpider() # 填入比赛数据的train target_file_name = 'train' file_to_write = f'{target_file_name}_searched_results.pkl' # 路径根据实际情况调整 target_file = f'../official_data/{target_file_name}.csv' search_scraper_memo_path = f'search_scraper_memo_{target_file_name}.pkl' # 多个站点搜索,均记录,分别统计 # 实际上发现:mofangge爬取速度过慢,有时间的话可以做;百度知道上的答案格式不统一,后续提取表达式和答案较为困难 search_site_lists = [ 'zybang.com', # 'mofangge.com',
def setUp(self):
    """Give every test a fresh BaiduSpider instance."""
    from baiduspider import BaiduSpider as _Spider

    self.spider = _Spider()
class SpiderTestCase(TestCase):
    """
    Tests for BaiduSpider (exercise the live Baidu services).
    """

    def setUp(self):
        # Fresh spider per test; imported lazily inside the fixture.
        from baiduspider import BaiduSpider
        self.spider = BaiduSpider()

    def test_search_web(self):
        """Every web result item carries the fields its declared type promises."""
        for pn in range(1, 10):
            python = self.spider.search_web("python", exclude=["tieba"], pn=pn)
            total = False
            # print(python)
            for i in python["results"]:
                # Check each item's shape according to its declared type.
                if i["type"] == "total":
                    total = "total" in python  # the result carries two "total"s
                    self.assertTrue(type(i["result"]) == int)
                elif i["type"] == "related":
                    self.assertEqual(type(i["results"]), list)
                elif i["type"] == "calc":
                    self.assertIn("process", i)
                elif i["type"] == "news":
                    self.are_in(["author", "time", "title", "url", "des"], i["results"])
                elif i["type"] == "video":
                    self.are_in(
                        ["cover", "origin", "length", "url", "title"], i["results"]
                    )
                elif i["type"] == "baike":
                    self.are_in(
                        ["cover", "cover-type", "des", "url", "title"], i["result"]
                    )
                elif i["type"] == "blog":
                    self.are_in(["blogs", "url", "title"], i["result"])
                elif i["type"] == "gitee":
                    self.are_in(
                        [
                            "star",
                            "fork",
                            "watch",
                            "url",
                            "title",
                            "license",
                            "lang",
                            "status",
                        ],
                        i["result"],
                    )
                elif i["type"] == "result":
                    self.are_in(["des", "origin", "title", "url"], i)
                else:
                    # Unknown result type: fail loudly.
                    self.assertTrue(False)
                # print(i['type'])
                # "tieba" was excluded above, so it must never appear.
                self.assertNotEqual(i["type"], "tieba")
            self.assertTrue(total)

    def test_search_pic(self):
        """Picture search returns a total plus well-formed items."""
        python = self.spider.search_pic("python")
        self.assertIn("total", python)
        self.assertEqual(type(python["total"]), int)
        for i in python["results"]:
            self.assertTrue(i["title"])
            self.assertTrue(i["url"])
            self.assertIn("host", i)

    def test_search_video(self):
        """Video search returns a total plus well-formed items."""
        python = self.spider.search_video("python")
        self.assertIn("total", python)
        self.assertEqual(type(python["total"]), int)
        for i in python["results"]:
            self.assertTrue(i["title"])
            self.assertTrue(i["url"])
            self.assertIn("img", i)
            self.assertIn("time", i)

    def test_search_news(self):
        """News search items carry the expected fields."""
        python = self.spider.search_news("python")
        self.assertEqual(type(python["total"]), int)
        for i in python["results"]:
            self.are_in(["author", "des", "date", "title", "url"], i)

    def test_search_wenku(self):
        """Wenku (document) search items carry the expected fields."""
        try:
            python = self.spider.search_wenku("python")
            self.assertEqual(type(python["total"]), int)
            for i in python["results"]:
                self.are_in(
                    ["downloads", "pages", "date", "des", "title", "url", "type"], i
                )
        except UnboundLocalError:
            # NOTE(review): tolerates a known parser failure mode — confirm
            # whether this exception can still occur upstream.
            pass

    def test_search_jingyan(self):
        """Jingyan (experience) search items carry the expected fields."""
        python = self.spider.search_jingyan("python")
        self.assertEqual(type(python["total"]), int)
        for i in python["results"]:
            self.are_in(["title", "url", "des", "date", "category", "votes"], i)

    def test_search_baike(self):
        """Baike (encyclopedia) search items carry the expected fields."""
        python = self.spider.search_baike("python")
        self.assertEqual(type(python["total"]), int)
        for i in python["results"]:
            self.are_in(["title", "des", "date", "url"], i)

    def test_zhidao(self):
        """Zhidao (Q&A) search items carry the expected fields."""
        python = self.spider.search_zhidao("python")
        self.assertEqual(type(python["total"]), int)
        for i in python["results"]:
            self.are_in(["title", "des", "date", "url", "count"], i)

    def are_in(self, members: list, container: list):
        """Assert that every element of *members* is present in *container*."""
        for i in members:
            self.assertIn(i, container)
""" @author: wanghongliang @file: baidu_video.py @time: 2021/1/28 9:34 """ from baiduspider import BaiduSpider from pprint import pprint # 实例化BaiduSpider spider = BaiduSpider() # 搜索网页 # pprint(spider.search_web(query='Python')) # pprint(spider.search_pic(query='person', pn=1)) pprint(spider.search_video(query='car', pn=1))
def setUp(self):
    """Build the web-search fixture: spider instance, asset base URL, and the
    expected parse results each test compares against."""
    # Import packages (make the local checkout importable first).
    sys.path.append(os.path.abspath("."))
    from baiduspider import BaiduSpider
    from baiduspider.errors import ParseError, UnknownError

    self.spider = BaiduSpider()
    # Static HTML fixtures served from the test-assets repo via jsDelivr.
    self.assets_base_url = (
        "https://cdn.jsdelivr.net/gh/BaiduSpider/BaiduSpiderTestAssets@master/web"
    )
    # Expected outputs, one per fixture/test below.
    self.normal_res = {
        "title": "Welcome to Python.org",
        "des": "The official home of the Python Programming Language... # Python 3: Simple output (with Unicode) >>> print(\"Hello, I'm Python!\") Hello, I'm Python!",
        "url": "http://www.baidu.com/link?url=yC-vpJc3cGCINc7SrFvV0A5-mBa3lrOseRMxZzZxXmlh1TqtxC8jgrOPHgSJi7_O",
        "time": None,
        "type": "result",
        "origin": "www.python.org/",
    }
    self.video_res = {
        "title": "python在excel中神运用,亮瞎眼的操作哦",
        "url": "https://baijiahao.baidu.com/s?id=1659418735845772463&wfr=content",
        "cover": "https://vdposter.bdstatic.com/5ecdac23471e6248259e256427ea66c3.jpeg?x-bce-process=image/resize,m_fill,w_242,h_182/format,f_jpg/quality,Q_100",
        "length": "05:41",
        "origin": "好看视频",
    }
    self.news_res = {
        "author": "国际在线",
        "time": "9分钟前",
        "title": "特朗普确诊新冠!",
        "url": "http://www.baidu.com/link?url=_APr4uGsSQzeq7MRkeoxLZlS6TfL8np6zzDnQqVuM9_Kwby5rypESvXHhX5ByEBChsusU4ZO_0p4smy0iz4iP0Kh2QsACY9s1_Fa1YACavW",
        "des": None,
    }
    # Baike (encyclopedia) cards in three cover variants: image, video, none.
    self.baike_img_res = {
        "title": "Python(计算机程序设计语言)_百度百科",
        "des": "Python是一种跨平台的计算机程序设计语言。 是一个高层次的结合了解释性、编译性、互动性和面向对象的脚本语言。最初被设计用于编写自动化脚本(shell),随着版本的不断更新和语言...",
        "cover": "https://dss0.bdstatic.com/6Ox1bjeh1BF3odCf/it/u=783017482,219941889&fm=74&app=80&f=JPEG&size=f121,90?sec=1880279984&t=b639fbc82a72772a726d11888a54d8f6",
        "cover-type": "image",
        "url": "http://www.baidu.com/link?url=Clp7kAWYKDauuI0IomD4-yj3EPlzvzhtUsU8eODlD2b6rCmZ0R1mH3RgeuVxJ0QerYWOj1f2cI3gvqJPnDiaNa",
    }
    self.baike_video_res = {
        "title": "我(汉语汉字)_百度百科",
        "des": "我,汉语常用字,读作wǒ,最早见于甲骨文,其甲骨文本义指奴隶社会里一种用来行刑杀人和肢解牲口的凶器,后由本义衍生出“手持大戉,呐喊示威”等意;但到了战国时代,“我”字本义所代表的凶器被后起的更优良的凶器淘汰,于是“我”字在汉...",
        "cover": "http://www.baidu.com/link?url=6VGNfYIuPl2uh-HOGwQnK04K4WL2MICdv6ZpoEIhhgxAUanK2l1aTp_6oC51mpYh8LKEem911tdb4pgp3fNK3UN6GPDqFg-iXcmj9aHzQ4xEodjoO0fsgst1Mf3XAW_DW4idF_QXDhBW_R-vskbcZK",
        "cover-type": "video",
        "url": "http://www.baidu.com/link?url=2_SPS_eUtRUiJS3eT5pvwHmstP1QBW8YXGzDxc3QRRb0xqWBNkIRbL-S8isFYHztETZv59iF_iDPV5ognLjNna",
    }
    self.baike_none_res = {
        "title": "ASTM A106无缝钢管_百度百科",
        "des": "ASTM A106无缝钢管是属于美标的无缝钢管,材质是普通碳钢系列。",
        "cover": None,
        "cover-type": None,
        "url": "http://www.baidu.com/link?url=uLJ0kfXAXVu14FztaB4KMU7N4yN5lJikkRBI3b8LeUGGCn-8UoyHbYjo1jyXpVEB95B3htArzho5yreAGJS0SElyhz1euRHtbIb8hzpLESe_Q3Zqrt-U8RJARsapbJ4WLSxyjusGQK-ft_Xflkboz_",
    }
    self.calc_res = {
        "process": "20^5+107*13",
        "result": "3 201 391",
        "type": "calc",
    }
    self.related_res = [
        "python有什么用",
        "python为什么叫爬虫",
        "python教程",
        "Python官网",
        "python爬虫教程",
        "python和java",
        "Python代码",
        "Python软件",
        "Python3",
    ]
    # Expected scalar results and exception types.
    self.pages_res = 10
    self.pages_single_res = 1
    self.total_res = 74700000
    self.invalid_res = ParseError
    self.spider_invalid_param_res = ParseError
    self.spider_unknown_error_res = UnknownError
    self.no_related_res = []
    self.no_pager_res = 1
class WebTestCase(TestCase):
    def __init__(self, methodName):
        """Tests for web search.

        This suite exercises BaiduSpider.`search_web`.
        """
        super().__init__(methodName)

    def setUp(self):
        """Build the fixture: spider instance, asset base URL, and the
        expected parse results each test compares against."""
        # Import packages (make the local checkout importable first).
        sys.path.append(os.path.abspath("."))
        from baiduspider import BaiduSpider
        from baiduspider.errors import ParseError, UnknownError

        self.spider = BaiduSpider()
        # Static HTML fixtures served from the test-assets repo via jsDelivr.
        self.assets_base_url = (
            "https://cdn.jsdelivr.net/gh/BaiduSpider/BaiduSpiderTestAssets@master/web"
        )
        # Expected outputs, one per fixture/test below.
        self.normal_res = {
            "title": "Welcome to Python.org",
            "des": "The official home of the Python Programming Language... # Python 3: Simple output (with Unicode) >>> print(\"Hello, I'm Python!\") Hello, I'm Python!",
            "url": "http://www.baidu.com/link?url=yC-vpJc3cGCINc7SrFvV0A5-mBa3lrOseRMxZzZxXmlh1TqtxC8jgrOPHgSJi7_O",
            "time": None,
            "type": "result",
            "origin": "www.python.org/",
        }
        self.video_res = {
            "title": "python在excel中神运用,亮瞎眼的操作哦",
            "url": "https://baijiahao.baidu.com/s?id=1659418735845772463&wfr=content",
            "cover": "https://vdposter.bdstatic.com/5ecdac23471e6248259e256427ea66c3.jpeg?x-bce-process=image/resize,m_fill,w_242,h_182/format,f_jpg/quality,Q_100",
            "length": "05:41",
            "origin": "好看视频",
        }
        self.news_res = {
            "author": "国际在线",
            "time": "9分钟前",
            "title": "特朗普确诊新冠!",
            "url": "http://www.baidu.com/link?url=_APr4uGsSQzeq7MRkeoxLZlS6TfL8np6zzDnQqVuM9_Kwby5rypESvXHhX5ByEBChsusU4ZO_0p4smy0iz4iP0Kh2QsACY9s1_Fa1YACavW",
            "des": None,
        }
        # Baike (encyclopedia) cards in three cover variants: image, video, none.
        self.baike_img_res = {
            "title": "Python(计算机程序设计语言)_百度百科",
            "des": "Python是一种跨平台的计算机程序设计语言。 是一个高层次的结合了解释性、编译性、互动性和面向对象的脚本语言。最初被设计用于编写自动化脚本(shell),随着版本的不断更新和语言...",
            "cover": "https://dss0.bdstatic.com/6Ox1bjeh1BF3odCf/it/u=783017482,219941889&fm=74&app=80&f=JPEG&size=f121,90?sec=1880279984&t=b639fbc82a72772a726d11888a54d8f6",
            "cover-type": "image",
            "url": "http://www.baidu.com/link?url=Clp7kAWYKDauuI0IomD4-yj3EPlzvzhtUsU8eODlD2b6rCmZ0R1mH3RgeuVxJ0QerYWOj1f2cI3gvqJPnDiaNa",
        }
        self.baike_video_res = {
            "title": "我(汉语汉字)_百度百科",
            "des": "我,汉语常用字,读作wǒ,最早见于甲骨文,其甲骨文本义指奴隶社会里一种用来行刑杀人和肢解牲口的凶器,后由本义衍生出“手持大戉,呐喊示威”等意;但到了战国时代,“我”字本义所代表的凶器被后起的更优良的凶器淘汰,于是“我”字在汉...",
            "cover": "http://www.baidu.com/link?url=6VGNfYIuPl2uh-HOGwQnK04K4WL2MICdv6ZpoEIhhgxAUanK2l1aTp_6oC51mpYh8LKEem911tdb4pgp3fNK3UN6GPDqFg-iXcmj9aHzQ4xEodjoO0fsgst1Mf3XAW_DW4idF_QXDhBW_R-vskbcZK",
            "cover-type": "video",
            "url": "http://www.baidu.com/link?url=2_SPS_eUtRUiJS3eT5pvwHmstP1QBW8YXGzDxc3QRRb0xqWBNkIRbL-S8isFYHztETZv59iF_iDPV5ognLjNna",
        }
        self.baike_none_res = {
            "title": "ASTM A106无缝钢管_百度百科",
            "des": "ASTM A106无缝钢管是属于美标的无缝钢管,材质是普通碳钢系列。",
            "cover": None,
            "cover-type": None,
            "url": "http://www.baidu.com/link?url=uLJ0kfXAXVu14FztaB4KMU7N4yN5lJikkRBI3b8LeUGGCn-8UoyHbYjo1jyXpVEB95B3htArzho5yreAGJS0SElyhz1euRHtbIb8hzpLESe_Q3Zqrt-U8RJARsapbJ4WLSxyjusGQK-ft_Xflkboz_",
        }
        self.calc_res = {
            "process": "20^5+107*13",
            "result": "3 201 391",
            "type": "calc",
        }
        self.related_res = [
            "python有什么用",
            "python为什么叫爬虫",
            "python教程",
            "Python官网",
            "python爬虫教程",
            "python和java",
            "Python代码",
            "Python软件",
            "Python3",
        ]
        # Expected scalar results and exception types.
        self.pages_res = 10
        self.pages_single_res = 1
        self.total_res = 74700000
        self.invalid_res = ParseError
        self.spider_invalid_param_res = ParseError
        self.spider_unknown_error_res = UnknownError
        self.no_related_res = []
        self.no_pager_res = 1

    def __get_asset(self, name):
        # Download the named HTML fixture used as parser input.
        return requests.get("{base_url}/test_web_{name}.html".format(
            base_url=self.assets_base_url, name=name)).text

    def test_normal_result(self):
        """Ordinary web-search results parse correctly."""
        asset = self.__get_asset("normal")
        result = self.spider.parser.parse_web(asset)
        self.assertIn(self.normal_res, result["results"])

    def test_video_result(self):
        """Video sub-results parse correctly."""
        asset = self.__get_asset("video")
        result = self.spider.parser.parse_web(asset)
        res = []
        for r in result["results"]:
            if r["type"] == "video":
                res = r
                break
        self.assertIn(self.video_res, res["results"])

    def test_news_result(self):
        """News sub-results parse correctly."""
        asset = self.__get_asset("news")
        result = self.spider.parser.parse_web(asset)
        res = []
        for r in result["results"]:
            if r["type"] == "news":
                res = r
                break
        self.assertIn(self.news_res, res["results"])

    def test_baike_img_result(self):
        """Baike results with an image cover parse correctly."""
        asset = self.__get_asset("baike-img")
        result = self.spider.parser.parse_web(asset)
        res = {}
        for r in result["results"]:
            if r["type"] == "baike":
                res = r
                break
        self.assertEqual(self.baike_img_res, res["result"])

    def test_baike_video_result(self):
        """Baike results with a video cover parse correctly."""
        asset = self.__get_asset("baike-video")
        result = self.spider.parser.parse_web(asset)
        res = {}
        for r in result["results"]:
            if r["type"] == "baike":
                res = r
                break
        self.assertEqual(self.baike_video_res, res["result"])

    def test_baike_none_result(self):
        """Baike results with no cover parse correctly."""
        asset = self.__get_asset("baike-none")
        result = self.spider.parser.parse_web(asset)
        res = {}
        for r in result["results"]:
            if r["type"] == "baike":
                res = r
                break
        self.assertEqual(self.baike_none_res, res["result"])

    def test_calc_result(self):
        """Calculator results parse correctly."""
        asset = self.__get_asset("calc")
        result = self.spider.parser.parse_web(asset)
        res = {}
        for r in result["results"]:
            if r["type"] == "calc":
                res = r
                break
        self.assertEqual(self.calc_res, res)

    def test_related_result(self):
        """Related-search suggestions parse correctly."""
        asset = self.__get_asset("related")
        result = self.spider.parser.parse_web(asset)
        res = []
        for r in result["results"]:
            if r["type"] == "related":
                res = r
                break
        self.assertEqual(self.related_res, res["results"])

    def test_result_pages(self):
        """Page count is extracted from the pager."""
        asset = self.__get_asset("pages")
        result = self.spider.parser.parse_web(asset)
        self.assertEqual(self.pages_res, result["pages"])

    def test_result_pages_single(self):
        """Page count is 1 when there is only a single page."""
        asset = self.__get_asset("pages-single")
        result = self.spider.parser.parse_web(asset)
        self.assertEqual(self.pages_single_res, result["pages"])

    def test_result_total(self):
        """The overall hit count is extracted."""
        asset = self.__get_asset("total")
        result = self.spider.parser.parse_web(asset)
        res = 0
        for r in result["results"]:
            if r["type"] == "total":
                res = r
                break
        self.assertEqual(self.total_res, res["result"])

    def test_invalid_template(self):
        """Invalid HTML makes web-search parsing raise ParseError."""
        asset = self.__get_asset("invalid")
        self.assertRaises(self.invalid_res, self.spider.parser.parse_web, asset)

    def test_spider_request(self):
        """The spider can fetch a live results page."""
        result = self.spider.search_web("Python")
        self.assertIsNotNone(result["results"])

    def test_spider_invalid_param(self):
        """An invalid (empty) query makes web search raise ParseError."""
        self.assertRaises(self.spider_invalid_param_res, self.spider.search_web, "")

    def test_spider_unknown_error(self):
        """A non-string query makes web search raise UnknownError."""
        self.assertRaises(self.spider_unknown_error_res, self.spider.search_web, 123)

    def test_no_related(self):
        """Pages without related suggestions leave the result empty."""
        asset = self.__get_asset("no_related")
        result = self.spider.parser.parse_web(asset)
        res = []
        for r in result["results"]:
            # This branch should never be taken.
            if r["type"] == "related":  # pragma: no cover
                res = r
                break
        self.assertEqual(self.no_related_res, res)

    def test_no_pager(self):
        """Pages without a pager default to a page count of 1."""
        asset = self.__get_asset("no_pager")
        result = self.spider.parser.parse_web(asset)
        res = result["pages"]
        self.assertEqual(self.no_pager_res, res)
from baiduspider import BaiduSpider  # the Baidu search client
from pprint import pprint  # pretty-print

# Fetch and pretty-print Baidu web-search results for the keyword '爬虫'.
spider = BaiduSpider()
pprint(spider.search_web('爬虫'))
class BaiduSpiderOtherTestCase(TestCase):
    """Miscellaneous live-service checks for special web-search result types."""

    def setUp(self) -> None:
        # Fresh spider per test; imported lazily inside the fixture.
        from baiduspider import BaiduSpider
        self.spider = BaiduSpider()

    def test_calc(self):
        """An arithmetic query yields a calculator-type result."""
        result = self.spider.search_web("12345679*9")
        flag = False
        for i in result["results"]:
            if i["type"] == "calc":
                # print(i['process'])
                # print(i['result'])
                flag = True
        self.assertTrue(flag)

    def test_tieba(self):
        """Tieba cards and ordinary results carry their expected fields."""
        result = self.spider.search_web("python吧")
        for i in result["results"]:
            if i["type"] == "tieba":
                self.are_in(
                    [
                        "title",
                        "des",
                        "cover",
                        "url",
                        "followers",
                        "hot",
                        "total"
                    ],
                    i["result"],
                )
            elif i["type"] == "result":
                self.are_in(["des", "origin", "title", "url"], i)

    def test_video(self):
        """Video cards (and any other card types seen) carry their fields."""
        result = self.spider.search_web("视频")
        # print(result)
        for i in result["results"]:
            if i["type"] == "result":
                self.are_in(["des", "origin", "title", "url"], i)
            elif i["type"] == "tieba":
                self.are_in(
                    [
                        "title",
                        "des",
                        "cover",
                        "url",
                        "followers",
                        "hot",
                        "total"
                    ],
                    i["result"],
                )
            elif i["type"] == "video":
                self.are_in(["length", "origin", "title", "url"], i["results"][0])

    def test_news(self):
        """News cards carry their expected fields."""
        result = self.spider.search_web("今日新闻")
        for i in result["results"]:
            if i["type"] == "news":
                self.are_in(["author", "time", "title", "url", "des"], i["results"][0])

    def test_exclude_all(self):
        """exclude=['all'] leaves only plain results and totals."""
        result = self.spider.search_web("python", exclude=["all"])
        for i in result["results"]:
            self.assertIn(i["type"], ["result", "total"])

    def test_page(self):
        """Nonsense queries and out-of-range page numbers do not crash."""
        result = self.spider.search_web("ocaiueno")
        print(result)
        result = self.spider.search_web(
            "774f43c6744b47de98b1661d2344490b3761829a", pn=100)
        print(result)

    def are_in(self, members: list, container: list):
        """Assert that every element of *members* is present in *container*."""
        for i in members:
            self.assertIn(i, container)
import time
import random
import os
import csv

# Column schema for the CSV export.
fieldKey = ["title", "des", "origin", "url", "time"]
filename = "reult.csv"

# FIX: use a context manager so the CSV file is flushed and closed even if a
# search request raises — the original opened the handle and never closed it.
with open(filename, 'w', encoding='utf-8', newline='') as file:
    writer = csv.DictWriter(file, fieldKey)
    writer.writeheader()

    # Instantiate BaiduSpider (imported elsewhere in this file).
    spider = BaiduSpider()
    # Walk the ad-search pages for the query and dump each item.
    # NOTE(review): items are only printed, never written via writer.writerow
    # — confirm whether the CSV rows were meant to be populated.
    for i in range(0, 100):
        resultDic = spider.search_ads(query='防水补漏', pn=i)
        if len(resultDic["results"]) == 0:
            continue
        for item in resultDic["results"]:
            print(type(item))
            print("===================")
            pprint(item)
            print("===================")
from baiduspider import BaiduSpider
from pprint import pprint

# Run a Baidu web search for the package name and pretty-print the response.
pprint(BaiduSpider().search_web(query='com.tencent.freestyle'))
class WebTestCase(TestCase):
    def __init__(self, methodName):
        """Tests for web search.

        This suite exercises BaiduSpider.`search_web`.
        """
        super().__init__(methodName)

    def setUp(self):
        # Import packages (make the local checkout importable first).
        sys.path.append(os.path.abspath('.'))
        from baiduspider import BaiduSpider
        from baiduspider.errors import ParseError, UnknownError

        self.spider = BaiduSpider()
        # Static HTML fixtures served from the test-assets repo via jsDelivr.
        self.assets_base_url = 'https://cdn.jsdelivr.net/gh/BaiduSpider/BaiduSpiderTestAssets/web'
        # Expected outputs, one per fixture/test below.
        self.normal_res = {
            'title': 'Welcome to Python.org',
            'des': 'The official home of the Python Programming Language... # Python 3: Simple output (with Unicode) >>> print("Hello, I\'m Python!") Hello, I\'m Python!',
            'url': 'http://www.baidu.com/link?url=yC-vpJc3cGCINc7SrFvV0A5-mBa3lrOseRMxZzZxXmlh1TqtxC8jgrOPHgSJi7_O',
            'time': None,
            'type': 'result',
            'origin': 'www.python.org/'
        }
        self.video_res = {
            'title': 'python在excel中神运用,亮瞎眼的操作哦',
            'url': 'https://baijiahao.baidu.com/s?id=1659418735845772463&wfr=content',
            'cover': 'https://vdposter.bdstatic.com/5ecdac23471e6248259e256427ea66c3.jpeg?x-bce-process=image/resize,m_fill,w_242,h_182/format,f_jpg/quality,Q_100',
            'length': '05:41',
            'origin': '好看视频'
        }
        self.news_res = {
            'author': '国际在线',
            'time': '9分钟前',
            'title': '特朗普确诊新冠!',
            'url': 'http://www.baidu.com/link?url=_APr4uGsSQzeq7MRkeoxLZlS6TfL8np6zzDnQqVuM9_Kwby5rypESvXHhX5ByEBChsusU4ZO_0p4smy0iz4iP0Kh2QsACY9s1_Fa1YACavW',
            'des': None
        }
        # Baike (encyclopedia) cards in three cover variants: image, video, none.
        self.baike_img_res = {
            'title': 'Python(计算机程序设计语言)_百度百科',
            'des': 'Python是一种跨平台的计算机程序设计语言。 是一个高层次的结合了解释性、编译性、互动性和面向对象的脚本语言。最初被设计用于编写自动化脚本(shell),随着版本的不断更新和语言...',
            'cover': 'https://dss0.bdstatic.com/6Ox1bjeh1BF3odCf/it/u=783017482,219941889&fm=74&app=80&f=JPEG&size=f121,90?sec=1880279984&t=b639fbc82a72772a726d11888a54d8f6',
            'cover-type': 'image',
            'url': 'http://www.baidu.com/link?url=Clp7kAWYKDauuI0IomD4-yj3EPlzvzhtUsU8eODlD2b6rCmZ0R1mH3RgeuVxJ0QerYWOj1f2cI3gvqJPnDiaNa'
        }
        self.baike_video_res = {
            'title': '我(汉语汉字)_百度百科',
            'des': '我,汉语常用字,读作wǒ,最早见于甲骨文,其甲骨文本义指奴隶社会里一种用来行刑杀人和肢解牲口的凶器,后由本义衍生出“手持大戉,呐喊示威”等意;但到了战国时代,“我”字本义所代表的凶器被后起的更优良的凶器淘汰,于是“我”字在汉...',
            'cover': 'http://www.baidu.com/link?url=6VGNfYIuPl2uh-HOGwQnK04K4WL2MICdv6ZpoEIhhgxAUanK2l1aTp_6oC51mpYh8LKEem911tdb4pgp3fNK3UN6GPDqFg-iXcmj9aHzQ4xEodjoO0fsgst1Mf3XAW_DW4idF_QXDhBW_R-vskbcZK',
            'cover-type': 'video',
            'url': 'http://www.baidu.com/link?url=2_SPS_eUtRUiJS3eT5pvwHmstP1QBW8YXGzDxc3QRRb0xqWBNkIRbL-S8isFYHztETZv59iF_iDPV5ognLjNna'
        }
        self.baike_none_res = {
            'title': 'ASTM A106无缝钢管_百度百科',
            'des': 'ASTM A106无缝钢管是属于美标的无缝钢管,材质是普通碳钢系列。',
            'cover': None,
            'cover-type': None,
            'url': 'http://www.baidu.com/link?url=uLJ0kfXAXVu14FztaB4KMU7N4yN5lJikkRBI3b8LeUGGCn-8UoyHbYjo1jyXpVEB95B3htArzho5yreAGJS0SElyhz1euRHtbIb8hzpLESe_Q3Zqrt-U8RJARsapbJ4WLSxyjusGQK-ft_Xflkboz_'
        }
        self.calc_res = {
            'process': '20^5+107*13',
            'result': '3 201 391',
            'type': 'calc'
        }
        self.related_res = [
            'python有什么用',
            'python为什么叫爬虫',
            'python教程',
            'Python官网',
            'python爬虫教程',
            'python和java',
            'Python代码',
            'Python软件',
            'Python3'
        ]
        # Expected scalar results and exception types.
        self.pages_res = 10
        self.pages_single_res = 1
        self.total_res = 74700000
        self.invalid_res = ParseError
        self.spider_invalid_param_res = ParseError
        self.spider_unknown_error_res = UnknownError

    def __get_asset(self, name):
        # Download the named HTML fixture used as parser input.
        return requests.get('{base_url}/test_web_{name}.html'.format(
            base_url=self.assets_base_url, name=name)).text

    def test_normal_result(self):
        """Ordinary web-search results parse correctly."""
        asset = self.__get_asset('normal')
        result = self.spider.parser.parse_web(asset)
        self.assertIn(self.normal_res, result['results'])

    def test_video_result(self):
        """Video sub-results parse correctly."""
        asset = self.__get_asset('video')
        result = self.spider.parser.parse_web(asset)
        res = []
        for r in result['results']:
            if r['type'] == 'video':
                res = r
                break
        self.assertIn(self.video_res, res['results'])

    def test_news_result(self):
        """News sub-results parse correctly."""
        asset = self.__get_asset('news')
        result = self.spider.parser.parse_web(asset)
        res = []
        for r in result['results']:
            if r['type'] == 'news':
                res = r
                break
        self.assertIn(self.news_res, res['results'])

    def test_baike_img_result(self):
        """Baike results with an image cover parse correctly."""
        asset = self.__get_asset('baike-img')
        result = self.spider.parser.parse_web(asset)
        res = {}
        for r in result['results']:
            if r['type'] == 'baike':
                res = r
                break
        self.assertEqual(self.baike_img_res, res['result'])

    def test_baike_video_result(self):
        """Baike results with a video cover parse correctly."""
        asset = self.__get_asset('baike-video')
        result = self.spider.parser.parse_web(asset)
        res = {}
        for r in result['results']:
            if r['type'] == 'baike':
                res = r
                break
        self.assertEqual(self.baike_video_res, res['result'])

    def test_baike_none_result(self):
        """Baike results with no cover parse correctly."""
        asset = self.__get_asset('baike-none')
        result = self.spider.parser.parse_web(asset)
        res = {}
        for r in result['results']:
            if r['type'] == 'baike':
                res = r
                break
        self.assertEqual(self.baike_none_res, res['result'])

    def test_calc_result(self):
        """Calculator results parse correctly."""
        asset = self.__get_asset('calc')
        result = self.spider.parser.parse_web(asset)
        res = {}
        for r in result['results']:
            if r['type'] == 'calc':
                res = r
                break
        self.assertEqual(self.calc_res, res)

    def test_related_result(self):
        """Related-search suggestions parse correctly."""
        asset = self.__get_asset('related')
        result = self.spider.parser.parse_web(asset)
        res = []
        for r in result['results']:
            if r['type'] == 'related':
                res = r
                break
        self.assertEqual(self.related_res, res['results'])

    def test_result_pages(self):
        """Page count is extracted from the pager."""
        asset = self.__get_asset('pages')
        result = self.spider.parser.parse_web(asset)
        self.assertEqual(self.pages_res, result['pages'])

    def test_result_pages_single(self):
        """Page count is 1 when there is only a single page."""
        asset = self.__get_asset('pages-single')
        result = self.spider.parser.parse_web(asset)
        self.assertEqual(self.pages_single_res, result['pages'])

    def test_result_total(self):
        """The overall hit count is extracted."""
        asset = self.__get_asset('total')
        result = self.spider.parser.parse_web(asset)
        res = 0
        for r in result['results']:
            if r['type'] == 'total':
                res = r
                break
        self.assertEqual(self.total_res, res['result'])

    def test_invalid_template(self):
        """Invalid HTML makes BaiduSpider's parser raise ParseError."""
        asset = self.__get_asset('invalid')
        self.assertRaises(self.invalid_res, self.spider.parser.parse_web, asset)

    def test_spider_request(self):
        """The spider can fetch a live results page."""
        result = self.spider.search_web('Python')
        self.assertIsNotNone(result['results'])

    def test_spider_invalid_param(self):
        """An invalid (empty) query makes BaiduSpider raise ParseError."""
        self.assertRaises(self.spider_invalid_param_res, self.spider.search_web, '')

    def test_spider_unknown_error(self):
        """A non-string query makes BaiduSpider raise UnknownError."""
        self.assertRaises(self.spider_unknown_error_res, self.spider.search_web, 123)