class AirSpider(BaseParser, Thread):
    """Spider that keeps its requests in an in-memory queue (not distributed).

    The spider is itself a thread: ``run`` seeds the queue from
    ``start_requests()``, starts the parser-control workers and then waits
    until every worker is idle and the queue is empty before stopping them.
    """

    # Key/value overrides that are copied onto ``setting`` whenever an
    # instance is created.
    __custom_setting__ = {}

    def __init__(self, thread_count=None):
        """
        Spider based on an in-memory queue; distributed crawling is not supported.

        :param thread_count: number of parser-control workers to start; any
            falsy value falls back to ``setting.SPIDER_THREAD_COUNT``
        """
        super(AirSpider, self).__init__()

        for key, value in self.__class__.__custom_setting__.items():
            setattr(setting, key, value)

        self._thread_count = thread_count or setting.SPIDER_THREAD_COUNT
        self._memory_db = MemoryDB()
        self._parser_controls = []

    def distribute_task(self):
        """Push every request yielded by ``start_requests()`` into the memory queue.

        Requests without a ``parser_name`` are assigned this spider's ``name``.

        :raises ValueError: if ``start_requests()`` yields anything other than
            a ``Request``
        """
        for request in self.start_requests():
            if not isinstance(request, Request):
                raise ValueError("仅支持 yield Request")

            request.parser_name = request.parser_name or self.name
            self._memory_db.add(request)

    def all_thread_is_done(self):
        """Return True only if three consecutive checks, one second apart, find no work.

        :return: False as soon as any parser control still has a task or the
            memory queue is non-empty; True after three clean passes
        """
        # Repeat the check to reduce flukes: the individual stages are not
        # inspected atomically, so a state that looks idle now may be busy by
        # the time the next one is inspected. A single pass is likely to hit
        # such a coincidence.
        for _ in range(3):
            # Check the parser_control state
            for parser_control in self._parser_controls:
                if not parser_control.is_not_task():
                    return False

            # Check the task queue state
            if not self._memory_db.empty():
                return False

            tools.delay_time(1)

        return True

    def run(self):
        """Seed the queue, start the workers, and stop them once all work is done."""
        self.distribute_task()

        for _ in range(self._thread_count):
            parser_control = AirSpiderParserControl(self._memory_db)
            parser_control.add_parser(self)
            parser_control.start()
            self._parser_controls.append(parser_control)

        # all_thread_is_done() returns False immediately while work is still
        # pending, so pause between polls instead of spinning on the CPU and
        # competing with the worker threads.
        while not self.all_thread_is_done():
            tools.delay_time(1)

        # Stop the parser_controls
        for parser_control in self._parser_controls:
            parser_control.stop()

        log.debug("无任务,爬虫结束")
class AirSpider(BaseParser, Thread):
    """Spider that keeps its requests in an in-memory queue (not distributed).

    The spider is itself a thread: ``run`` starts the parser-control workers
    and the item buffer, seeds the queue from ``start_requests()``, and then
    waits until the workers, the queue and the item buffer are all idle
    before shutting everything down.
    """

    # Key/value overrides that are copied onto ``setting`` whenever an
    # instance is created.
    __custom_setting__ = {}

    def __init__(self, thread_count=None):
        """
        Spider based on an in-memory queue; distributed crawling is not supported.

        :param thread_count: number of parser-control workers to start; any
            falsy value falls back to ``setting.SPIDER_THREAD_COUNT``
        """
        super(AirSpider, self).__init__()

        for key, value in self.__class__.__custom_setting__.items():
            setattr(setting, key, value)
        # NOTE(review): presumably reloads the logger so it picks up the
        # overridden settings — confirm against log.reload().
        log.reload()

        self._thread_count = thread_count or setting.SPIDER_THREAD_COUNT
        self._memory_db = MemoryDB()
        self._parser_controls = []
        self._item_buffer = ItemBuffer(redis_key="air_spider")

    def distribute_task(self):
        """Push every request yielded by ``start_requests()`` into the memory queue.

        Requests without a ``parser_name`` are assigned this spider's ``name``.

        :raises ValueError: if ``start_requests()`` yields anything other than
            a ``Request``
        """
        for request in self.start_requests():
            if not isinstance(request, Request):
                raise ValueError("仅支持 yield Request")

            request.parser_name = request.parser_name or self.name
            self._memory_db.add(request)

    def all_thread_is_done(self):
        """Return True only if three consecutive checks, one second apart, find no work.

        :return: False as soon as any parser control still has a task, the
            memory queue is non-empty, or the item buffer still holds items
            or is writing them out; True after three clean passes
        """
        # Repeat the check to reduce flukes: the individual stages are not
        # inspected atomically, so a state that looks idle now may be busy by
        # the time the next one is inspected. A single pass is likely to hit
        # such a coincidence.
        for _ in range(3):
            # Check the parser_control state
            for parser_control in self._parser_controls:
                if not parser_control.is_not_task():
                    return False

            # Check the task queue state
            if not self._memory_db.empty():
                return False

            # Check the item_buffer state
            if (
                self._item_buffer.get_items_count() > 0
                or self._item_buffer.is_adding_to_db()
            ):
                return False

            tools.delay_time(1)

        return True

    def run(self):
        """Run the whole crawl: start workers, seed requests, wait, then clean up.

        ``start_callback()`` is invoked first and ``end_callback()`` last.
        """
        self.start_callback()

        for _ in range(self._thread_count):
            parser_control = AirSpiderParserControl(self._memory_db, self._item_buffer)
            parser_control.add_parser(self)
            parser_control.start()
            self._parser_controls.append(parser_control)

        self._item_buffer.start()

        self.distribute_task()

        # all_thread_is_done() returns False immediately while work is still
        # pending, so pause between polls instead of spinning on the CPU and
        # competing with the worker threads.
        while not self.all_thread_is_done():
            tools.delay_time(1)

        # Stop the parser_controls
        for parser_control in self._parser_controls:
            parser_control.stop()

        # Shut down the item_buffer
        self._item_buffer.stop()

        # Close the webdriver pool, if one was created
        if Request.webdriver_pool:
            Request.webdriver_pool.close()

        log.info("无任务,爬虫结束")

        self.end_callback()