class ArachnadoExecutionEngine(ExecutionEngine):
    """
    Extended ExecutionEngine.
    It sends a signal when engine gets scheduled to stop.
    """
    def __init__(self, *args, **kwargs):
        super(ArachnadoExecutionEngine, self).__init__(*args, **kwargs)
        # CallLaterOnce coalesces rapid schedule() calls into one delayed call,
        # so at most one tick signal is pending at a time.
        self.send_tick = CallLaterOnce(self._send_tick_signal)

    def close_spider(self, spider, reason="cancelled"):
        # If a shutdown is already in progress, return the pending Deferred
        # instead of starting a second shutdown.
        if self.slot.closing:
            return self.slot.closing
        self.crawler.crawling = False
        # Notify listeners before the base class tears the slot down.
        self.signals.send_catch_log(signals.spider_closing)
        return super(ArachnadoExecutionEngine, self).close_spider(spider, reason)

    def pause(self):
        """Pause the execution engine"""
        super(ArachnadoExecutionEngine, self).pause()
        self.signals.send_catch_log(signals.engine_paused)

    def unpause(self):
        """Resume the execution engine"""
        super(ArachnadoExecutionEngine, self).unpause()
        self.signals.send_catch_log(signals.engine_resumed)

    def _next_request(self, spider):
        # Run the normal scheduling step, then request a (throttled) tick signal.
        res = super(ArachnadoExecutionEngine, self)._next_request(spider)
        self.send_tick.schedule(0.1)  # avoid sending the signal too often
        return res

    def _send_tick_signal(self):
        # Deferred variant so handlers may return Deferreds.
        self.signals.send_catch_log_deferred(signals.engine_tick)
class MyselfExecutionEngine(ExecutionEngine):
    """Extended execution engine that emits a signal when a job is stopped."""

    def __init__(self, *args, **kwargs):
        super(MyselfExecutionEngine, self).__init__(*args, **kwargs)
        self.send_tick = CallLaterOnce(self._send_tick_signal)  # TODO

    def close_spider(self, spider, reason='cancelled'):
        """Close the spider and cancel its outstanding requests."""
        # self.slot uses twisted.reactor to schedule the engine's
        # _next_request method — the core scheduling loop.
        if self.slot.closing:
            return self.slot.closing
        self.crawler.crawling = False
        self.signals.send_catch_log(signals.spider_closing)
        return super(MyselfExecutionEngine, self).close_spider(spider, reason)

    def pause(self):
        """Pause the execution engine."""
        super(MyselfExecutionEngine, self).pause()
        self.signals.send_catch_log(signals.engine_paused)

    def unpause(self):
        """Resume a paused job."""
        super(MyselfExecutionEngine, self).unpause()
        self.signals.send_catch_log(signals.engine_resumed)

    def _next_request(self, spider):
        """Run one scheduling step, then request a throttled tick signal."""
        res = super(MyselfExecutionEngine, self)._next_request(spider)
        self.send_tick.schedule(0.1)
        return res

    def _send_tick_signal(self):
        """Broadcast the engine_tick signal (deferred-aware)."""
        self.signals.send_catch_log_deferred(signals.engine_tick)
class ArachnadoExecutionEngine(ExecutionEngine):
    """ExecutionEngine subclass that broadcasts extra lifecycle signals:
    spider_closing, engine_paused, engine_resumed and engine_tick.
    """

    def __init__(self, *args, **kwargs):
        super(ArachnadoExecutionEngine, self).__init__(*args, **kwargs)
        # Debounce tick notifications through a single delayed call.
        self.send_tick = CallLaterOnce(self._send_tick_signal)

    def close_spider(self, spider, reason='cancelled'):
        pending = self.slot.closing
        if pending:
            # A shutdown is already underway; hand back its result.
            return pending
        self.crawler.crawling = False
        # Announce the shutdown before the base class performs it.
        self.signals.send_catch_log(signals.spider_closing)
        return super(ArachnadoExecutionEngine, self).close_spider(spider, reason)

    def pause(self):
        """Pause the engine and broadcast engine_paused."""
        super(ArachnadoExecutionEngine, self).pause()
        self.signals.send_catch_log(signals.engine_paused)

    def unpause(self):
        """Resume the engine and broadcast engine_resumed."""
        super(ArachnadoExecutionEngine, self).unpause()
        self.signals.send_catch_log(signals.engine_resumed)

    def _next_request(self, spider):
        result = super(ArachnadoExecutionEngine, self)._next_request(spider)
        # Throttle: at most one tick signal per 0.1s window.
        self.send_tick.schedule(0.1)
        return result

    def _send_tick_signal(self):
        self.signals.send_catch_log_deferred(signals.engine_tick)
def createSpiderTask(site_info, settings, CHECK_POINT):
    """Create and start a periodic crawl task for one site.

    Args:
        site_info: mapping with at least ``"site_id"`` and ``"SpiderName"``
            (the name of the spider callable to run).
        settings: settings object forwarded to ``select``.
        CHECK_POINT: checkpoint object forwarded to the spider callable.
    """
    results = iter(select(settings, SITE_ID=site_info["site_id"]))
    # SECURITY: eval() on a name coming from site_info executes arbitrary
    # code if that data is ever attacker-controlled. Prefer a whitelist
    # registry lookup, e.g. SPIDERS[site_info["SpiderName"]].
    spider_func = eval(site_info["SpiderName"])
    nextcall = CallLaterOnce(spider_func, site_info, results, CHECK_POINT)
    heartbeat = task.LoopingCall(nextcall.schedule)
    # Fire the first callback shortly after startup.
    nextcall.schedule(delay=0.5)
    # Re-trigger the task every `task_interval` seconds thereafter.
    task_interval = 3
    heartbeat.start(task_interval)
def open_spider(self, spider, start_requests=(), close_if_idle=True):
    assert self.has_capacity(), "No free spider slot when opening %r" % \
        spider.name
    logger.info("Spider opened", extra={'spider': spider})
    # Important: wrap _next_request so the reactor can re-schedule it;
    # CallLaterOnce coalesces multiple schedule() calls into one.
    nextcall = CallLaterOnce(self._next_request, spider)
    scheduler = self.scheduler_cls.from_crawler(
        self.crawler)  # instantiate the scheduler here
    # Post-process the start requests through the spider middlewares.
    start_requests = yield self.scraper.spidermw.process_start_requests(
        start_requests, spider)
    slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
    self.slot = slot
    self.spider = spider
    yield scheduler.open(spider)
    yield self.scraper.open_spider(spider)
    self.crawler.stats.open_spider(spider)
    yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                               spider=spider)
    # Kick off scheduling
    slot.nextcall.schedule()  # run the first scheduling pass immediately
    slot.heartbeat.start(5)   # then re-run nextcall every 5 seconds
def open_spider(self, spider, start_requests=(), close_if_idle=True):
    assert self.has_capacity(), "No free spider slot when opening %r" % \
        spider.name
    logger.info("Spider opened", extra={'spider': spider})
    ## Register the _next_request scheduling callable for cyclic scheduling.
    nextcall = CallLaterOnce(self._next_request, spider)
    ## Instantiate the scheduler class.
    scheduler = self.scheduler_cls.from_crawler(self.crawler)
    ## Run the seed requests through each spider middleware's
    ## process_start_requests method. Multiple middlewares can each override
    ## this hook, so start-request handling stays modular and easy to maintain.
    start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
    ## Wrap everything in a Slot object.
    slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
    self.slot = slot
    self.spider = spider
    ## Call the scheduler's open method.
    yield scheduler.open(spider)
    ## Call the scraper's open_spider method.
    yield self.scraper.open_spider(spider)
    self.crawler.stats.open_spider(spider)
    yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
    ## Start scheduling.
    slot.nextcall.schedule()
    slot.heartbeat.start(5)
def open_spider(self, spider: Spider, start_requests: Iterable = (), close_if_idle: bool = True):
    if self.slot is not None:
        raise RuntimeError(
            f"No free spider slot when opening {spider.name!r}")
    logger.info("Spider opened", extra={'spider': spider})
    # p.15 Create the deferred-call wrapper used by each reactor iteration.
    nextcall = CallLaterOnce(self._next_request)
    # p.16 Create the scheduler instance.
    scheduler = create_instance(self.scheduler_cls, settings=None,
                                crawler=self.crawler)
    # p.17 Run the start requests through the spider middlewares.
    start_requests = yield self.scraper.spidermw.process_start_requests(
        start_requests, spider)
    # p.18 Bundle start requests, nextcall and scheduler into the slot.
    self.slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
    self.spider = spider
    # p.19 Open the scheduler if it supports the optional hook.
    if hasattr(scheduler, "open"):
        yield scheduler.open(spider)
    # p.20 Open the scraper for this spider.
    yield self.scraper.open_spider(spider)
    # p.21 Start stats collection.
    self.crawler.stats.open_spider(spider)
    # p.22 Broadcast spider_opened, then start the scheduling loop.
    yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                               spider=spider)
    self.slot.nextcall.schedule()
    self.slot.heartbeat.start(5)
def open_spider(self, spider, start_requests=(), close_if_idle=True):
    assert self.has_capacity(), "No free spider slot when opening %r" % \
        spider.name
    logger.info("Spider opened", extra={'spider': spider})
    # Register the next_request scheduling callable for cyclic scheduling
    # (driven by twisted's reactor).
    nextcall = CallLaterOnce(self._next_request, spider)
    # Instantiate the scheduler, see scrapy/core/scheduler.py.
    scheduler = self.scheduler_cls.from_crawler(self.crawler)
    # Spider middlewares process the seed requests, see scrapy/core/spidermw.py;
    # start_requests is normally an iterable of Request objects.
    start_requests = yield self.scraper.spidermw.process_start_requests(
        start_requests, spider)
    # Wrap everything in a Slot and store the (possibly wrapped) iterable.
    slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
    self.slot = slot
    self.spider = spider
    # scheduler.open binds the spider, initializes the request queues and
    # enables fingerprint-based dedup filtering, see scrapy/core/scheduler.py.
    yield scheduler.open(spider)
    # See scrapy/core/scraper.py: binds the spider to the pipeline manager
    # (scrapy/pipelines/__init__.py) and runs every pipeline's open_spider.
    yield self.scraper.open_spider(spider)
    # scrapy/statscollectors/__init__.py
    self.crawler.stats.open_spider(spider)
    yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                               spider=spider)
    # Start scheduling: actually runs the _next_request registered above.
    slot.nextcall.schedule()
    slot.heartbeat.start(5)
def open_spider(self, spider, start_requests=None, close_if_idle=True):
    # NOTE(review): older engine API — multiple slots keyed by spider,
    # scheduler built from settings, and no heartbeat LoopingCall.
    assert self.has_capacity(), "No free spider slots when opening %r" % \
        spider.name
    log.msg("Spider opened", spider=spider)
    # Wrap _next_request so the reactor can re-schedule it on demand.
    nextcall = CallLaterOnce(self._next_request, spider)
    scheduler = self.scheduler_cls.from_settings(self.settings)
    # Default to an empty tuple when no start requests were given.
    slot = Slot(start_requests or (), close_if_idle, nextcall, scheduler)
    self.slots[spider] = slot
    yield scheduler.open(spider)
    yield self.scraper.open_spider(spider)
    stats.open_spider(spider)
    yield send_catch_log_deferred(signals.spider_opened, spider=spider)
    # Kick off the first scheduling pass.
    slot.nextcall.schedule()
def open_spider(self, spider: Spider, start_requests: Iterable = (), close_if_idle: bool = True):
    """Open *spider* for crawling: build the slot, scheduler and scraper,
    then start the scheduling loop."""
    if self.slot is not None:
        raise RuntimeError(f"No free spider slot when opening {spider.name!r}")
    logger.info("Spider opened", extra={'spider': spider})
    # Deferred wrapper around the engine's per-iteration scheduling step.
    next_call = CallLaterOnce(self._next_request)
    sched = create_instance(self.scheduler_cls, settings=None,
                            crawler=self.crawler)
    # Let the spider middlewares post-process the seed requests.
    start_requests = yield self.scraper.spidermw.process_start_requests(
        start_requests, spider)
    self.slot = Slot(start_requests, close_if_idle, next_call, sched)
    self.spider = spider
    # The scheduler's open() hook is optional.
    if hasattr(sched, "open"):
        yield sched.open(spider)
    yield self.scraper.open_spider(spider)
    self.crawler.stats.open_spider(spider)
    yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                               spider=spider)
    # First pass now, then a heartbeat every 5 seconds.
    self.slot.nextcall.schedule()
    self.slot.heartbeat.start(5)
def open_spider(self, spider, start_requests=(), close_if_idle=True):
    if not self.has_capacity():
        raise RuntimeError(f"No free spider slot when opening {spider.name!r}")
    logger.info("Spider opened", extra={'spider': spider})
    # Build a CallLaterOnce that will drain requests starting from
    # start_requests; it only creates a deferred call, nothing runs yet.
    nextcall = CallLaterOnce(self._next_request, spider)
    # Instantiate the scheduler.
    scheduler = self.scheduler_cls.from_crawler(self.crawler)
    # Run the start requests through each middleware's process_start_requests.
    start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
    # Create the slot bundling the pieces above.
    slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
    self.slot = slot
    self.spider = spider
    # Initialize the scheduler (creates its queues).
    yield scheduler.open(spider)
    # Open the scraper (item pipelines etc.).
    yield self.scraper.open_spider(spider)
    self.crawler.stats.open_spider(spider)
    # Broadcast the spider_opened signal.
    yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
    # Add the task to the reactor — actually starts _next_request.
    slot.nextcall.schedule()
    # Heartbeat: re-schedule nextcall every 5 seconds.
    slot.heartbeat.start(5)
def open_spider(self, spider, start_requests=(), close_if_idle=True):
    """Prepare *spider* for crawling and kick off the scheduling loop."""
    if not self.has_capacity():
        raise RuntimeError("No free spider slot when opening %r" % spider.name)
    logger.info("Spider opened", extra={'spider': spider})
    # Schedule _next_request through the reactor; CallLaterOnce coalesces
    # repeated schedule() calls into a single pending call.
    next_call = CallLaterOnce(self._next_request, spider)
    sched = self.scheduler_cls.from_crawler(self.crawler)
    # Spider middlewares post-process the seed requests.
    start_requests = yield self.scraper.spidermw.process_start_requests(
        start_requests, spider)
    new_slot = Slot(start_requests, close_if_idle, next_call, sched)
    self.slot = new_slot
    self.spider = spider
    yield sched.open(spider)
    yield self.scraper.open_spider(spider)
    self.crawler.stats.open_spider(spider)
    yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                               spider=spider)
    # First scheduling pass now, then every 5 seconds via the heartbeat.
    new_slot.nextcall.schedule()
    new_slot.heartbeat.start(5)
async def _load_start_requests(self):
    """Feed every start request into the engine, then mark the start-request
    source as exhausted and schedule a closing check.

    Cancellation stops the loop early; any other per-request failure is
    logged and the remaining requests are still loaded.
    """
    count = 0
    for request in self.start_requests:
        count += 1
        try:
            await self.engine.future_in_pool(self.engine.crawl, request, self.spider)
            self._maybe_first_request()
            self.logger.debug(f"load start request {count} {request}")
        except asyncio.CancelledError:
            # Logger.warn() is a deprecated alias; use warning().
            self.logger.warning("load start requests task cancelled")
            break
        except Exception as e:
            self.logger.error(f"load start request fail {request} {e}")
            # Still count the request toward the first-request bookkeeping.
            self._maybe_first_request()
    self.logger.debug(f"load start requests {count} stopped")
    # Signal that no further start requests will arrive.
    self.start_requests = None
    CallLaterOnce(self._maybe_fire_closing).schedule()
def open_spider(self, spider, start_requests=(), close_if_idle=True):
    assert self.has_capacity(), "No free spider slot when opening %r" % \
        spider.name
    logger.info("Spider opened", extra={'spider': spider})
    # Deferred wrapper around the engine's scheduling step.
    nextcall = CallLaterOnce(self._next_request, spider)
    scheduler = self.scheduler_cls.from_crawler(self.crawler)
    # Spider middlewares post-process the seed requests.
    start_requests = yield self.scraper.spidermw.process_start_requests(
        start_requests, spider)
    slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
    self.slot = slot
    self.spider = spider
    yield scheduler.open(spider)
    yield self.scraper.open_spider(spider)
    self.crawler.stats.open_spider(spider)
    yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                               spider=spider)
    # Start crawling pages -- comment by jigc 2015-7-29
    slot.nextcall.schedule()
def open_spider(self, spider, start_requests=(), close_if_idle=True):
    assert self.has_capacity(), "No free spider slot when opening %r" % \
        spider.name
    logger.info("Spider opened", extra={'spider': spider})
    # Register the _next_request scheduling callable for cyclic scheduling.
    nextcall = CallLaterOnce(self._next_request, spider)
    # Initialize the scheduler.
    scheduler = self.scheduler_cls.from_crawler(self.crawler)
    # Invoke the spider middlewares to process the seed requests.
    start_requests = yield self.scraper.spidermw.process_start_requests(
        start_requests, spider)
    slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
    self.slot = slot
    self.spider = spider
    yield scheduler.open(spider)
    yield self.scraper.open_spider(spider)
    self.crawler.stats.open_spider(spider)
    yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                               spider=spider)
    # Start scheduling: first pass now, then every 5 seconds.
    slot.nextcall.schedule()
    slot.heartbeat.start(5)
def open_spider(self, spider, start_requests=(), close_if_idle=True):
    assert self.has_capacity(), "No free spider slot when opening %r" % \
        spider.name
    logger.info("Spider opened", extra={'spider': spider})
    # CallLaterOnce: call the wrapped function once, later.
    nextcall = CallLaterOnce(self._next_request, spider)
    scheduler = self.scheduler_cls.from_crawler(self.crawler)
    start_requests = yield self.scraper.spidermw.process_start_requests(
        start_requests, spider)
    # The slot encapsulates the engine's request-processing pipeline.
    slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
    self.slot = slot
    self.spider = spider
    yield scheduler.open(spider)
    yield self.scraper.open_spider(spider)
    self.crawler.stats.open_spider(spider)
    yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                               spider=spider)
    # Start the request-processing loop.
    slot.nextcall.schedule()  # programmatic, one-off invocation
    slot.heartbeat.start(5)   # periodic invocation on a 5-second heartbeat
def open_spider(self, spider, start_requests=(), close_if_idle=True):
    assert self.has_capacity(), "No free spider slot when opening %r" % \
        spider.name
    logger.info("Spider opened", extra={'spider': spider})
    nextcall = CallLaterOnce(self._next_request, spider)
    scheduler = self.scheduler_cls.from_crawler(
        self.crawler
    )  # Instantiates the scheduler. Per the original annotator this sets up
       # the dupefilter plus its queues (a priority queue and FIFO queues,
       # memory- and disk/pickle-backed) — verify against scheduler source.
    start_requests = yield self.scraper.spidermw.process_start_requests(
        start_requests, spider)  # Note: the spider middlewares'
        # process_start_requests hook runs first, before any scheduling.
    slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
    self.slot = slot
    self.spider = spider
    yield scheduler.open(spider)  # opens the memory/priority queues and the dupefilter
    yield self.scraper.open_spider(spider)  # presumably little work here — verify
    self.crawler.stats.open_spider(spider)  # presumably a no-op stats hook — verify
    yield self.signals.send_catch_log_deferred(
        signals.spider_opened,
        spider=spider)  # broadcasts spider_opened; many extensions/middlewares
        # hook this signal (logging setup and friends).
    slot.nextcall.schedule()  # schedules one _next_request run via
    # reactor.callLater(delay, self); it only queues the call — nothing
    # executes until the reactor is running.
    slot.heartbeat.start(5)
def open_spider(self, spider, start_requests=(), close_if_idle=True):
    assert self.has_capacity(), "No free spider slot when opening %r" % \
        spider.name
    logger.info("Spider opened", extra={'spider': spider})
    nextcall = CallLaterOnce(self._next_request, spider)  # register _next_request for cyclic scheduling
    scheduler = self.scheduler_cls.from_crawler(
        self.crawler)  # initialize the scheduler
    start_requests = yield self.scraper.spidermw.process_start_requests(
        start_requests, spider)  # spider middlewares process the seed requests
    slot = Slot(start_requests, close_if_idle, nextcall,
                scheduler)  # wrap everything in a Slot object
    self.slot = slot
    self.spider = spider
    yield scheduler.open(
        spider)  # scheduler.open: per the original annotator it builds a
        # priority queue and returns None — verify against scheduler source
    yield self.scraper.open_spider(
        spider)  # mainly calls every item pipeline's open_spider method
    self.crawler.stats.open_spider(spider)  # stats-collector hook; returns None
    yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                               spider=spider)
    slot.nextcall.schedule()  # start scheduling
    slot.heartbeat.start(5)   # call CallLaterOnce.schedule every 5 seconds
def open(self, spider):
    # Run the base scheduler's open logic first.
    super(RotateScheduler, self).open(spider)
    # Deferred wrapper used to request more work exactly once at a time.
    self.nextcall = CallLaterOnce(self.more_request)
    # Condition variable guarding access from other threads.
    self.locker = threading.Condition()
"""
def __init__(self, *args, **kwargs):
    super(ArachnadoExecutionEngine, self).__init__(*args, **kwargs)
    # CallLaterOnce coalesces rapid schedule() calls into one delayed
    # invocation of _send_tick_signal.
    self.send_tick = CallLaterOnce(self._send_tick_signal)