Example #1
def test_spider_distributed():
    """
    test distributed spider
    """
    # initialize fetcher / parser / saver; you can also override any of these three classes
    fetcher = spider.Fetcher(max_repeat=3, sleep_time=0)
    parser = spider.Parser(max_deep=-1)
    saver = spider.Saver(save_pipe=open("out_spider_distributed.txt", "w"))

    # define url_filter
    url_filter = spider.UrlFilter(black_patterns=black_patterns,
                                  white_patterns=white_patterns)

    # initialize web_spider
    web_spider_dist = spider.WebSpiderDist(fetcher,
                                           parser,
                                           saver,
                                           url_filter=url_filter,
                                           monitor_sleep_time=5)
    web_spider_dist.init_redis(host="localhost",
                               port=6379,
                               key_wait="spider.wait",
                               key_all="spider.all")

    # add start url
    web_spider_dist.set_start_url("http://zhushou.360.cn/", keys=("360web", ))

    # start web_spider
    web_spider_dist.start_work_and_wait_done(fetcher_num=10)
    return
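
All of these snippets assume some module-level setup that the examples themselves omit: the spider package (which appears to be the PSpider library) imported at the top of the file, plus the black_patterns and white_patterns iterables handed to spider.UrlFilter. A minimal sketch of that setup follows; the pattern values are placeholders, not taken from the original code:

import spider

# Hypothetical filter patterns: UrlFilter takes iterables of regex strings,
# presumably dropping URLs that match a black pattern and (when white
# patterns are given) keeping only URLs that match a white pattern.
black_patterns = (r"\.(?:jpg|png|gif|css|js)$", )
white_patterns = (r"^https?://[\w\-.]+\.360\.cn", )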
Example #2
def test_spider_distributed():
    """
    test distributed spider
    """
    # initialize fetcher / parser / saver; you can also override any of these three classes
    fetcher = spider.Fetcher(max_repeat=1, sleep_time=0)
    parser = spider.Parser(max_deep=-1)
    saver = spider.Saver(save_pipe=open("out_spider_distributed.txt", "w"))

    # define url_filter
    url_filter = spider.UrlFilter(black_patterns=black_patterns,
                                  white_patterns=white_patterns)

    # initialize web_spider
    web_spider_dist = spider.WebSpiderDist(fetcher,
                                           parser,
                                           saver,
                                           proxieser=None,
                                           url_filter=url_filter,
                                           monitor_sleep_time=5)
    web_spider_dist.init_redis(host="localhost",
                               port=6379,
                               key_high_priority="spider.high",
                               key_low_priority="spider.low")

    # start web_spider
    web_spider_dist.start_work_and_wait_done(fetcher_num=10)
    return
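
Compared with Example #1, this variant adds a proxieser=None argument and switches the redis layout from a key_wait / key_all pair to two priority queues (key_high_priority / key_low_priority); it also adds no start URL, so it acts as a pure worker that drains whatever is already queued. Because the queue lives in redis, the same worker can run on any number of machines. A sketch of an extra worker process, reusing only the calls shown above (the host name is hypothetical):

def run_extra_worker():
    # identical wiring, pointed at the shared redis instance; no seed URL,
    # since the work queue is fed by the other processes
    fetcher = spider.Fetcher(max_repeat=1, sleep_time=0)
    parser = spider.Parser(max_deep=-1)
    saver = spider.Saver(save_pipe=open("out_worker2.txt", "w"))
    url_filter = spider.UrlFilter(black_patterns=black_patterns,
                                  white_patterns=white_patterns)

    worker = spider.WebSpiderDist(fetcher, parser, saver, proxieser=None,
                                  url_filter=url_filter, monitor_sleep_time=5)
    worker.init_redis(host="redis.example.internal", port=6379,
                      key_high_priority="spider.high",
                      key_low_priority="spider.low")
    worker.start_work_and_wait_done(fetcher_num=10)
    return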
Example #3
def test_spider_distributed():
    """
    test distributed spider
    """
    # initialize fetcher / parser / saver
    fetcher = MyFetcher(max_repeat=1, sleep_time=0)
    parser = MyParser(max_deep=-1)
    saver = spider.Saver(save_pipe=open("out_distributed.txt", "w"))

    # define url_filter
    url_filter = spider.UrlFilter(black_patterns=black_patterns,
                                  white_patterns=white_patterns)

    # initialize web_spider
    web_spider_dist = spider.WebSpiderDist(fetcher,
                                           parser,
                                           saver,
                                           proxieser=None,
                                           url_filter=url_filter,
                                           monitor_sleep_time=5)
    web_spider_dist.init_redis(host="localhost",
                               port=6379,
                               key_high_priority="spider.high",
                               key_low_priority="spider.low")

    # start web_spider
    web_spider_dist.start_working(fetcher_num=10)

    # wait until finished
    web_spider_dist.wait_for_finished(is_over=True)
    return
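
Unlike the earlier examples, this one plugs in custom MyFetcher and MyParser subclasses (their definitions are not shown) and splits startup from joining: start_working() returns immediately, letting the caller do other work before blocking in wait_for_finished(). A rough sketch of what such subclasses might look like follows; the url_fetch and htm_parse hook names follow PSpider's documented extension pattern, but their exact signatures and return formats vary between versions of the library, so treat every detail here as an assumption to check against your installed version:

import requests

class MyFetcher(spider.Fetcher):
    # assumed hook: fetch one URL and return (fetch_result, content),
    # where a positive fetch_result marks success
    def url_fetch(self, url, keys, repeat):
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        return 1, response.text

class MyParser(spider.Parser):
    # assumed hook: parse fetched content into new URLs and save items;
    # url_list is left empty here because its item format is version-specific
    def htm_parse(self, priority, url, keys, deep, content):
        save_list = [(url, len(content))]
        return 1, [], save_list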
Example #4
def test_spider_distributed():
    """
    test distributed spider
    """
    # define fetcher, parser and saver; you can also override any of these three classes
    fetcher = spider.Fetcher(max_repeat=3, sleep_time=0)
    parser = spider.Parser(max_deep=-1)
    saver = spider.Saver(save_pipe=open("out_spider_distributed.txt", "w"))

    # initialize WebSpiderDist
    web_spider_dist = spider.WebSpiderDist(fetcher,
                                           parser,
                                           saver,
                                           monitor_sleep_time=5)
    web_spider_dist.init_redis(host="localhost",
                               port=6379,
                               key_wait="spider.wait",
                               key_all="spider.all")

    # add the seed URL
    web_spider_dist.set_start_url("http://zhushou.360.cn/", keys=("360web", ))

    # start the crawl task and wait for it to finish
    web_spider_dist.start_work_and_wait_done(fetcher_num=1)
    return
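
One practical consequence of keeping the crawl state in redis is that progress can be checked from any machine with access to the server. A sketch using the redis-py client; the key names come from init_redis() above, but whether each key is stored as a list or a set is an internal detail of the library, so the probe below checks the type first:

import redis

r = redis.StrictRedis(host="localhost", port=6379)
for key in ("spider.wait", "spider.all"):
    key_type = r.type(key).decode()  # e.g. "list", "set" or "none"
    if key_type == "list":
        print(key, r.llen(key))
    elif key_type == "set":
        print(key, r.scard(key))
    else:
        print(key, key_type)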