import spider


def test_spider_distributed():
    """ test distributed spider """
    # define fetcher, parser and saver; any of these three classes can be overridden
    fetcher = spider.Fetcher(max_repeat=3, sleep_time=0)
    parser = spider.Parser(max_deep=-1)
    saver = spider.Saver(save_pipe=open("out_spider_distributed.txt", "w"))

    # initialize WebSpiderDist
    web_spider_dist = spider.WebSpiderDist(fetcher, parser, saver, url_filter=spider.UrlFilter(), monitor_sleep_time=5)
    web_spider_dist.init_redis(host="localhost", port=6379, key_wait="spider.wait", key_all="spider.all")

    # add the seed url
    web_spider_dist.set_start_url("http://zhushou.360.cn/", keys=("360web", ))

    # start the crawling job and wait until it finishes
    web_spider_dist.start_work_and_wait_done(fetcher_num=10)
    return
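The distributed variant differs from the single-machine one mainly in the init_redis step. Judging by the key names, key_wait holds the queue of urls waiting to be fetched and key_all the set of urls already seen, so every node pointed at the same Redis server shares one frontier and one deduplication set; a Redis instance must be reachable at the given host and port before start_work_and_wait_done is called.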
def test_spider():
    """ test spider """
    # define fetcher, parser and saver; any of these three classes can be overridden
    fetcher = spider.Fetcher(max_repeat=3, sleep_time=0)
    parser = spider.Parser(max_deep=1)
    saver = spider.Saver(save_pipe=open("out_spider_thread.txt", "w"))

    # define url filtering; UrlFilter uses a set here, suitable when the number of urls is small
    black_patterns = (spider.CONFIG_URLPATTERN_FILES, r"binding", r"download", )
    white_patterns = (r"^http[s]{0,1}://(www\.){0,1}(zhushou\.360)\.(com|cn)", )
    url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns, capacity=1000)

    # initialize WebSpider
    web_spider = spider.WebSpider(fetcher, parser, saver, url_filter=url_filter, monitor_sleep_time=5)

    # add the seed url
    web_spider.set_start_url("http://zhushou.360.cn/", keys=("360web", ))

    # start the crawling job and wait until it finishes
    web_spider.start_work_and_wait_done(fetcher_num=10, is_over=True)
    return
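The comments note that Fetcher, Parser and Saver are all meant to be subclassed. Purely as an illustration, a minimal custom Saver might look like the sketch below; the item_save hook name, its (url, keys, item) signature, and the boolean return convention are assumptions that should be checked against the installed spider version, since they changed across releases.

# a minimal sketch of a custom Saver -- item_save and its signature are
# assumed here and should be verified against the installed spider version
class MySaver(spider.Saver):

    def __init__(self, save_pipe):
        spider.Saver.__init__(self, save_pipe=save_pipe)
        self._pipe = save_pipe  # keep an explicit handle instead of relying on base-class internals

    def item_save(self, url, keys, item):
        # write one tab-separated line per saved item
        self._pipe.write("\t".join([url, str(keys), str(item)]) + "\n")
        return True  # assumed convention: True marks a successful save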
def test_spider():
    """ test spider """
    # initialize fetcher / parser / saver; any of these three classes can be overridden
    fetcher = spider.Fetcher(max_repeat=3, sleep_time=1)
    parser = spider.Parser(max_deep=2)
    saver = spider.Saver(save_pipe=open("out_spider_thread.txt", "w"))

    # define url_filter
    black_patterns = (spider.CONFIG_URLPATTERN_FILES, r"binding", r"download", )
    white_patterns = (r"^http[s]{0,1}://(www\.){0,1}(zhushou\.360)\.(com|cn)", )
    url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns, capacity=None)

    # initialize web_spider
    web_spider = spider.WebSpider(fetcher, parser, saver, url_filter=url_filter, monitor_sleep_time=5)

    # add start url
    web_spider.set_start_url("http://zhushou.360.cn/", keys=("360web", ))

    # start web_spider
    result = web_spider.start_work_and_wait_done(fetcher_num=10, is_over=True)

    # print result of web_spider
    print(result[spider.TPEnum.URL_FETCH_SUCC], result[spider.TPEnum.HTM_PARSE_SUCC], result[spider.TPEnum.ITEM_SAVE_SUCC])
    return
def test_spider_distributed():
    """ test distributed spider """
    # initialize fetcher / parser / saver; any of these three classes can be overridden
    fetcher = spider.Fetcher(max_repeat=3, sleep_time=0)
    parser = spider.Parser(max_deep=-1)
    saver = spider.Saver(save_pipe=open("out_spider_distributed.txt", "w"))

    # define url_filter
    black_patterns = (spider.CONFIG_URLPATTERN_FILES, r"binding", r"download", )
    white_patterns = (r"^http[s]{0,1}://(www\.){0,1}(zhushou\.360)\.(com|cn)", )
    url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns)

    # initialize web_spider
    web_spider_dist = spider.WebSpiderDist(fetcher, parser, saver, url_filter=url_filter, monitor_sleep_time=5)
    web_spider_dist.init_redis(host="localhost", port=6379, key_high_priority="spider.high", key_low_priority="spider.low")

    # start web_spider
    web_spider_dist.start_work_and_wait_done(fetcher_num=10)
    return
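Unlike the other distributed example, this variant never calls set_start_url: with init_redis pointing at shared high- and low-priority queues, the process presumably acts as a pure worker, consuming whatever urls other nodes push to Redis.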
def test_spider_distributed():
    """ test distributed spider """
    # initialize fetcher / parser / saver; any of these three classes can be overridden
    fetcher = spider.Fetcher(max_repeat=3, sleep_time=0)
    parser = spider.Parser(max_deep=-1)
    saver = spider.Saver(save_pipe=open("out_spider_distributed.txt", "w"))

    # define url_filter
    black_patterns = (spider.CONFIG_URLPATTERN_FILES, r"binding", r"download", )
    white_patterns = (r"^http[s]{0,1}://(www\.){0,1}(zhushou\.360)\.(com|cn)", )
    url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns)

    # initialize web_spider
    web_spider_dist = spider.WebSpiderDist(fetcher, parser, saver, url_filter=url_filter, monitor_sleep_time=5)
    web_spider_dist.init_redis(host="localhost", port=6379, key_wait="spider.wait", key_all="spider.all")

    # add start url
    web_spider_dist.set_start_url("http://zhushou.360.cn/", keys=("360web", ))

    # start web_spider
    web_spider_dist.start_work_and_wait_done(fetcher_num=10)
    return
def test_spider():
    """ test spider """
    # initialize fetcher / parser / saver; any of these three classes can be overridden
    fetcher = spider.Fetcher(max_repeat=1, sleep_time=0)
    parser = spider.Parser(max_deep=2)
    saver = spider.Saver(save_pipe=open("out_thread.txt", "w"))

    # define url_filter
    black_patterns = (spider.CONFIG_URLPATTERN_FILES, r"binding", r"download", )
    white_patterns = (r"^http[s]{0,1}://(www\.){0,1}(zhushou\.360)\.(com|cn)", )
    url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns, capacity=None)

    # initialize web_spider
    web_spider = spider.WebSpider(fetcher, parser, saver, proxieser=None, url_filter=url_filter, monitor_sleep_time=5)

    # add start url
    web_spider.set_start_url("http://zhushou.360.cn/", priority=0, keys={"type": "360"}, deep=0)

    # start web_spider
    web_spider.start_work_and_wait_done(fetcher_num=10, is_over=True)
    return
def test_spider():
    """ test spider """
    # define fetcher, parser and saver; any of these three classes can be overridden
    fetcher = spider.Fetcher(normal_max_repeat=3, normal_sleep_time=0, critical_max_repeat=5, critical_sleep_time=5)
    parser = spider.Parser(max_deep=1, max_repeat=2)
    saver = spider.Saver(save_pipe=open("out_spider.txt", "w"))

    # define url filtering; UrlFilter uses a set here, suitable when the number of urls is small
    black_patterns = (spider.CONFIG_URLPATTERN_FILES, r"binding", r"download", )
    white_patterns = (r"^http[s]{0,1}://(www\.){0,1}(wandoujia|(zhushou\.360))\.(com|cn)", )
    url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns, capacity=1000)

    # initialize WebSpider
    web_spider = spider.WebSpider(fetcher, parser, saver, url_filter=url_filter, monitor_sleep_time=5)

    # first crawl the Wandoujia pages; do not stop the monitor when the crawl finishes
    web_spider.set_start_url("http://www.wandoujia.com/apps", ("wandoujia",), priority=0, deep=0, critical=False)
    web_spider.start_work_and_wait_done(fetcher_num=10, is_over=False)

    # then crawl the 360 app store pages; stop the monitor when the crawl finishes
    web_spider.set_start_url("http://zhushou.360.cn/", ("360app",), priority=0, deep=0, critical=False)
    web_spider.start_work_and_wait_done(fetcher_num=10, is_over=True)
    return
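The two-phase pattern above works because is_over=False leaves the monitor alive after the first crawl completes, so set_start_url can seed a second crawl on the same WebSpider instance; only the final call passes is_over=True to shut everything down.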
def test_spider():
    """ test spider """
    # initialize fetcher / parser / saver; any of these three classes can be overridden
    fetcher = spider.Fetcher(max_repeat=3, sleep_time=0)
    parser = spider.Parser(max_deep=2)
    saver = spider.Saver(save_pipe=open("out_spider_thread.txt", "w"))

    # define url_filter
    black_patterns = (spider.CONFIG_URLPATTERN_FILES, r"binding", r"download", )
    white_patterns = (r"^http[s]{0,1}://(www\.){0,1}(zhushou\.360)\.(com|cn)", )
    url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns)

    # initialize web_spider
    web_spider = spider.WebSpider(fetcher, parser, saver, url_filter=url_filter, monitor_sleep_time=5)

    # add start url
    web_spider.set_start_url("http://zhushou.360.cn/", keys=("360web", ))

    # start web_spider
    web_spider.start_work_and_wait_done(fetcher_num=10, is_over=True)
    return
def test_spider(mysql, spider_type):
    """ test spider """
    # define fetcher, parser and saver; any of these three classes can be overridden
    fetcher = spider.Fetcher(normal_max_repeat=3, normal_sleep_time=0, critical_max_repeat=5, critical_sleep_time=5)
    # parser = spider.Parser(max_deep=1, max_repeat=2)
    parser = MyParser(max_deep=1, max_repeat=2)

    # define url filtering
    black_patterns = (spider.CONFIG_URLPATTERN_FILES, r"binding", r"download", )
    white_patterns = (r"^http[s]{0,1}://(www\.){0,1}(wandoujia|(zhushou\.360)|duba_\d)\.(com|cn)", )

    if not mysql:
        saver = spider.Saver(save_pipe=open("out.txt", "w", encoding="utf-8"))
        # UrlFilter backed by a set, suitable when the number of urls is small
        url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns, capacity=None)
    else:
        saver = spider.SaverMysql(host="localhost", user="******", passwd="123456", database="default")
        saver.change_sqlstr("insert into t_test(url, title, getdate) values (%s, %s, %s);")
        # UrlFilter backed by a BloomFilter, suitable for a huge number of urls
        url_filter = spider.UrlFilter(black_patterns=black_patterns, white_patterns=white_patterns, capacity=10000)

    # choose between the thread pool and the process pool implementation
    if spider_type == "thread":
        web_spider = spider.WebSpiderT(fetcher, parser, saver, url_filter=url_filter, monitor_sleep_time=5)
    else:
        web_spider = spider.WebSpiderP(fetcher, parser, saver, url_filter=url_filter, monitor_sleep_time=5)
    parser_num = 1 if spider_type == "thread" else 3

    # first crawl the Wandoujia pages; do not stop the monitor when the crawl finishes
    web_spider.set_start_url("http://www.wandoujia.com/apps", ("wandoujia",), priority=0, deep=0, critical=False)
    web_spider.start_work_and_wait_done(fetcher_num=10, parser_num=parser_num, is_over=False)

    # then crawl the 360 app store pages, exercising the critical parameter; stop the monitor when done
    web_spider.set_start_url("http://zhushou.360.cn/", ("360app",), priority=0, deep=0, critical=False)
    for i in range(5):
        web_spider.set_start_url("https://www.duba_%d.com/" % i, ("critical",), priority=0, deep=0, critical=True)
    web_spider.start_work_and_wait_done(fetcher_num=10, parser_num=parser_num, is_over=True)
    return
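The snippet above references a MyParser that is not defined in it. A hedged reconstruction is sketched below: it extracts the page title and a timestamp so the saved items line up with the t_test(url, title, getdate) insert statement. The htm_parse hook, its signature, and its (code, url_list, save_list) return convention are assumptions about this generation of the library and should be verified against the actual source.

import datetime
import re


# a hypothetical MyParser matching the three-column insert above; the
# htm_parse hook, its signature and its return convention are assumptions
class MyParser(spider.Parser):

    def htm_parse(self, priority, url, keys, deep, critical, parse_repeat, content):
        # content is assumed to carry the fetched page html as its last element
        page_html = content[-1] if isinstance(content, (list, tuple)) else content
        match = re.search(r"<title>(.*?)</title>", page_html, flags=re.IGNORECASE | re.DOTALL)
        title = match.group(1).strip() if match else ""
        save_list = [(url, title, datetime.datetime.now())]
        url_list = []  # link extraction omitted; this sketch only saves the current page
        return 1, url_list, save_list  # assumed convention: positive code marks success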