Example 1
import asyncio

import spider  # the PSpider package used by all of the examples below

def test_spider_async():
    """
    test spider with asyncio
    """
    loop = asyncio.get_event_loop()

    # initialize fetcher / parser / saver; you can also rewrite any of these three classes
    fetcher = spider.FetcherAsync(max_repeat=3, sleep_time=0)
    parser = spider.ParserAsync(max_deep=2)
    saver = spider.SaverAsync(save_pipe=open("out_spider_async.txt", "w"))

    # define url_filter (the same black/white patterns as in Example 5)
    black_patterns = (spider.CONFIG_URLPATTERN_FILES, r"binding", r"download")
    white_patterns = (r"^http[s]{0,1}://(www\.){0,1}(zhushou\.360)\.(com|cn)", )
    url_filter = spider.UrlFilter(black_patterns=black_patterns,
                                  white_patterns=white_patterns,
                                  capacity=10000)

    # initialize web_spider
    web_spider_async = spider.WebSpiderAsync(fetcher,
                                             parser,
                                             saver,
                                             url_filter=url_filter,
                                             loop=loop)

    # add start url
    web_spider_async.set_start_url("http://zhushou.360.cn/", keys=("360web", ))

    # start web_spider
    web_spider_async.start_work_and_wait_done(fetcher_num=10)
    return
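
A portability note on the first line of this function: on Python 3.10+, asyncio.get_event_loop() emits a DeprecationWarning when called with no running event loop, and newer interpreters expect the loop to be created explicitly. A forward-compatible variant of that step:

# create and register a fresh event loop instead of relying on the implicit one
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)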
Example 2
def test_spider_async():
    """
    test spider with asyncio
    """
    # get the event loop
    loop = asyncio.get_event_loop()

    # define fetcher, parser and saver; you can also rewrite any of these three classes
    fetcher = spider.FetcherAsync(max_repeat=3, sleep_time=0)
    parser = spider.ParserAsync(max_deep=1)
    saver = spider.SaverAsync(save_pipe=open("out_spider_async.txt", "w"))

    # initialize WebSpiderAsync
    web_spider_async = spider.WebSpiderAsync(fetcher,
                                             parser,
                                             saver,
                                             url_filter=spider.UrlFilter(),
                                             loop=loop)

    # add the seed URL
    web_spider_async.set_start_url("http://zhushou.360.cn/", keys=("360web", ))

    # start the crawl and wait for it to finish
    web_spider_async.start_work_and_wait_done(fetcher_num=10)
    return
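
The comment in this example notes that any of the three worker classes can be rewritten. A minimal sketch of that idea for the saver; the hook name item_save, its signature, and the _save_pipe attribute are assumptions for illustration, not confirmed by these snippets:

class MySaverAsync(spider.SaverAsync):

    async def item_save(self, url, keys, item):
        # assumed override point, called once per parsed item;
        # _save_pipe is assumed to hold the file object given to __init__
        self._save_pipe.write("\t".join([url, str(keys), str(item)]) + "\n")
        return True

An instance of such a subclass would then be passed to WebSpiderAsync in place of the stock spider.SaverAsync.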
Example 3
def test_spider_async():
    """
    test spider with asyncio
    """
    web_spider_async = spider.WebSpiderAsync(url_filter=spider.UrlFilter())
    web_spider_async.set_start_url("http://zhushou.360.cn/")
    web_spider_async.start_work_and_wait_done()
    return
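
Example 3 is the fully default form: no explicit loop, fetcher, parser, saver, or fetcher_num. Like the other variants, it runs as an ordinary script:

if __name__ == "__main__":
    test_spider_async()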
Example 4
def test_spider_async():
    """
    test spider with asyncio
    """
    # initialize WebSpiderAsync
    web_spider_async = spider.WebSpiderAsync(max_repeat=3,
                                             sleep_time=0,
                                             max_deep=1,
                                             save_pipe=open(
                                                 "out_spider_async.txt", "w"),
                                             url_filter=spider.UrlFilter())

    # add the seed URL
    web_spider_async.set_start_url("http://zhushou.360.cn/")

    # start the crawl and wait for it to finish
    web_spider_async.start_work_and_wait_done(fetcher_num=10)
    return
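
Example 4 replaces the three explicit components with flat keyword arguments. Judging from the parameter names in Examples 2 and 5, this form presumably constructs default components internally, making it roughly shorthand for:

# assumed expansion of Example 4's flat keyword form (not confirmed by these snippets)
fetcher = spider.FetcherAsync(max_repeat=3, sleep_time=0)
parser = spider.ParserAsync(max_deep=1)
saver = spider.SaverAsync(save_pipe=open("out_spider_async.txt", "w"))
web_spider_async = spider.WebSpiderAsync(fetcher, parser, saver,
                                         url_filter=spider.UrlFilter())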
Example 5
def test_spider_async():
    """
    test spider with asyncio
    """
    # get the event loop
    loop = asyncio.get_event_loop()

    # define fetcher, parser and saver; you can also rewrite any of these three classes
    fetcher = spider.FetcherAsync(max_repeat=3, sleep_time=0)
    parser = spider.ParserAsync(max_deep=1)
    saver = spider.SaverAsync(save_pipe=open("out_spider_async.txt", "w"))

    # define the URL filter; UrlFilter uses a BloomFilter, which suits crawls with many URLs
    black_patterns = (
        spider.CONFIG_URLPATTERN_FILES,
        r"binding",
        r"download",
    )
    white_patterns = (r"^http[s]{0,1}://(www\.){0,1}(zhushou\.360)\.(com|cn)", )
    url_filter = spider.UrlFilter(black_patterns=black_patterns,
                                  white_patterns=white_patterns,
                                  capacity=10000)

    # initialize WebSpiderAsync
    web_spider_async = spider.WebSpiderAsync(fetcher,
                                             parser,
                                             saver,
                                             url_filter=url_filter,
                                             loop=loop)

    # add the seed URL
    web_spider_async.set_start_url("http://zhushou.360.cn/", keys=("360web", ))

    # start the crawl and wait for it to finish
    web_spider_async.start_work_and_wait_done(fetcher_num=10)
    return
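
The black/white patterns in Example 5 are ordinary regular expressions, so their effect can be checked with the standard re module before a crawl (CONFIG_URLPATTERN_FILES is a library-provided pattern and is omitted here):

import re

white = re.compile(r"^http[s]{0,1}://(www\.){0,1}(zhushou\.360)\.(com|cn)")
black = re.compile(r"binding|download")

print(bool(white.match("http://zhushou.360.cn/detail/index/soft_id/1")))  # True: in-domain
print(bool(white.match("http://example.com/")))                           # False: off-domain
print(bool(black.search("http://zhushou.360.cn/download/xyz")))           # True: blacklisted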