def test_spider_async(): """ test asyncio spider with asyncio """ loop = asyncio.get_event_loop() # initial fetcher / parser / saver, you also can rewrite this three class fetcher = spider.FetcherAsync(max_repeat=3, sleep_time=0) parser = spider.ParserAsync(max_deep=2) saver = spider.SaverAsync(save_pipe=open("out_spider_async.txt", "w")) # define url_filter url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns, capacity=10000) # initial web_spider web_spider_async = spider.WebSpiderAsync(fetcher, parser, saver, url_filter=url_filter, loop=loop) # add start url web_spider_async.set_start_url("http://zhushou.360.cn/", keys=("360web", )) # start web_spider web_spider_async.start_work_and_wait_done(fetcher_num=10) return
def test_spider_async(): """ test spider with asyncio """ # 得到Loop loop = asyncio.get_event_loop() # 定义fetcher, parser和saver, 你也可以重写这三个类中的任何一个 fetcher = spider.FetcherAsync(max_repeat=3, sleep_time=0) parser = spider.ParserAsync(max_deep=1) saver = spider.SaverAsync(save_pipe=open("out_spider_async.txt", "w")) # 初始化WebSpiderAsync web_spider_async = spider.WebSpiderAsync(fetcher, parser, saver, url_filter=spider.UrlFilter(), loop=loop) # 添加种子Url web_spider_async.set_start_url("http://zhushou.360.cn/", keys=("360web", )) # 开始抓取任务并等待其结束 web_spider_async.start_work_and_wait_done(fetcher_num=10) return
def test_spider_async(): """ test spider with asyncio """ web_spider_async = spider.WebSpiderAsync(url_filter=spider.UrlFilter()) web_spider_async.set_start_url("http://zhushou.360.cn/") web_spider_async.start_work_and_wait_done() return
def test_spider_async(): """ test spider with asyncio """ # 初始化WebSpiderAsync web_spider_async = spider.WebSpiderAsync(max_repeat=3, sleep_time=0, max_deep=1, save_pipe=open( "out_spider_async.txt", "w"), url_filter=spider.UrlFilter()) # 添加种子Url web_spider_async.set_start_url("http://zhushou.360.cn/") # 开始抓取任务并等待其结束 web_spider_async.start_work_and_wait_done(fetcher_num=10) return
def test_spider_async(): """ test asyncio spider with asyncio """ # 得到Loop loop = asyncio.get_event_loop() # 定义fetcher, parser和saver, 你也可以重写这三个类中的任何一个 fetcher = spider.FetcherAsync(max_repeat=3, sleep_time=0) parser = spider.ParserAsync(max_deep=1) saver = spider.SaverAsync(save_pipe=open("out_spider_async.txt", "w")) # 定义Url过滤, UrlFilter使用BloomFilter, 适合Url数量较多的情况 black_patterns = ( spider.CONFIG_URLPATTERN_FILES, r"binding", r"download", ) white_patterns = ("^http[s]{0,1}://(www\.){0,1}(zhushou\.360)\.(com|cn)", ) url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns, capacity=10000) # 初始化WebSpiderAsync web_spider_async = spider.WebSpiderAsync(fetcher, parser, saver, url_filter=url_filter, loop=loop) # 添加种子Url web_spider_async.set_start_url("http://zhushou.360.cn/", keys=("360web", )) # 开始抓取任务并等待其结束 web_spider_async.start_work_and_wait_done(fetcher_num=10) return