def worker(queue_verification):
    """Crawl proxies from the XiCi source and feed them into the
    verification queue.

    For each crawled proxy, block (sleeping) while the proxy store is at
    capacity or the queue is full, and retry until the proxy is enqueued.

    :param queue_verification: queue receiving proxies to be verified
    :return: does not return under normal operation
    """
    spider = XiCi()
    for proxy in spider.generator():
        enqueued = False
        while not enqueued:
            stored = persister.handler().zcount('index_speed', '-inf', '+inf')
            if stored > config.PROXY_STORE_NUM:
                # Store is at capacity — back off before re-checking.
                time.sleep(config.PROXY_FULL_SLEEP_SEC)
            elif queue_verification.full():
                time.sleep(0.5)
            else:
                queue_verification.put(proxy)
                enqueued = True
def worker(queue_verification):
    """Crawl proxies from the XiCi and NianShao sources, cycling through
    pages 1..15 indefinitely, and push them into the verification queue.

    When the proxy store is over capacity or the queue is full, the worker
    sleeps and the current proxy is skipped (not retried).

    :param queue_verification: queue receiving proxies to be verified
    :return: does not return under normal operation
    """
    spiders = [XiCi(), NianShao()]
    page = 1
    while page < 16:
        for spider in spiders:
            for proxy in spider.generator(page):
                store_count = persister.handler().zcount('index_speed', '-inf', '+inf')
                if store_count > config.PROXY_STORE_NUM:
                    # Store is at capacity — back off; this proxy is dropped.
                    time.sleep(config.PROXY_FULL_SLEEP_SEC)
                elif queue_verification.full():
                    time.sleep(0.5)
                else:
                    queue_verification.put(proxy)
        # Wrap back to the first page after the last one.
        page = 1 if page == 15 else page + 1
def worker(queue_verification):
    """Worker process: crawl proxies from each source and enqueue them for
    verification.

    Crawling of www.nianshao.me was removed; the site appears unmaintained
    and is dead.

    Pages 1..config.PAGE are crawled in a round-robin loop that never
    exits under normal operation.

    :param queue_verification: queue receiving crawled proxies
    :return: does not return under normal operation
    """
    # Better sources are listed first so they are crawled with priority.
    spiders = [XiCi(), Kuai()]
    page = 1
    while page <= config.PAGE:
        for spider in spiders:
            for proxy in spider.generator(page):
                # Fix: previously a full store/queue caused a sleep and the
                # proxy then fell through and was silently discarded. Retry
                # until the proxy can actually be enqueued (matching the
                # back-pressure behavior of the original single-source
                # worker).
                while True:
                    if persister.handler().zcount('index_speed', '-inf', '+inf') > config.PROXY_STORE_NUM:
                        time.sleep(config.PROXY_FULL_SLEEP_SEC)
                    elif queue_verification.full():
                        time.sleep(1)
                    else:
                        queue_verification.put(proxy)
                        break
        # Wrap back to the first page after the last one.
        page = 1 if page == config.PAGE else page + 1