Example #1
class ArachnadoExecutionEngine(ExecutionEngine):
    """
    Extended ExecutionEngine.
    It sends a signal when engine gets scheduled to stop.
    """

    def __init__(self, *args, **kwargs):
        super(ArachnadoExecutionEngine, self).__init__(*args, **kwargs)
        self.send_tick = CallLaterOnce(self._send_tick_signal)

    def close_spider(self, spider, reason="cancelled"):
        if self.slot.closing:
            return self.slot.closing
        self.crawler.crawling = False
        self.signals.send_catch_log(signals.spider_closing)
        return super(ArachnadoExecutionEngine, self).close_spider(spider, reason)

    def pause(self):
        """Pause the execution engine"""
        super(ArachnadoExecutionEngine, self).pause()
        self.signals.send_catch_log(signals.engine_paused)

    def unpause(self):
        """Resume the execution engine"""
        super(ArachnadoExecutionEngine, self).unpause()
        self.signals.send_catch_log(signals.engine_resumed)

    def _next_request(self, spider):
        res = super(ArachnadoExecutionEngine, self)._next_request(spider)
        self.send_tick.schedule(0.1)  # avoid sending the signal too often
        return res

    def _send_tick_signal(self):
        self.signals.send_catch_log_deferred(signals.engine_tick)
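
Every example on this page is built around CallLaterOnce from scrapy.utils.reactor. For reference, here is a sketch of that helper, paraphrased from Scrapy's source (details may differ slightly between Scrapy versions): however many times schedule() is called within one reactor turn, it keeps at most one pending reactor.callLater, so the wrapped function runs only once per turn.

from twisted.internet import reactor

class CallLaterOnce:
    """Schedule a function to be called in the next reactor loop, but only
    schedule it once, even if schedule() is called several times before
    the function runs."""

    def __init__(self, func, *a, **kw):
        self._func = func
        self._a = a
        self._kw = kw
        self._call = None           # the pending IDelayedCall, if any

    def schedule(self, delay=0):
        if self._call is None:      # repeat calls are no-ops while one is pending
            self._call = reactor.callLater(delay, self)

    def cancel(self):
        if self._call:
            self._call.cancel()

    def __call__(self):
        self._call = None           # allow the next schedule() to go through
        return self._func(*self._a, **self._kw)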
Example #2
class MyselfExecutionEngine(ExecutionEngine):
    """扩写执行引擎 任务停止时发送信号"""
    def __init__(self, *args, **kwargs):
        super(MyselfExecutionEngine, self).__init__(*args, **kwargs)
        self.send_tick = CallLaterOnce(self._send_tick_signal)

    # TODO
    def close_spider(self, spider, reason='cancelled'):
        """Close the spider and clear out unfinished requests"""
        # self.slot uses twisted.reactor to schedule the engine's _next_request method, the core loop
        if self.slot.closing:
            return self.slot.closing
        self.crawler.crawling = False
        self.signals.send_catch_log(signals.spider_closing)
        return super(MyselfExecutionEngine, self).close_spider(spider, reason)

    def pause(self):
        """暂停执行引擎"""
        super(MyselfExecutionEngine, self).pause()
        self.signals.send_catch_log(signals.engine_paused)

    def unpause(self):
        """继续执行暂停任务"""
        super(MyselfExecutionEngine, self).unpause()
        self.signals.send_catch_log(signals.engine_resumed)

    def _next_request(self, spider):
        """任务调度"""
        res = super(MyselfExecutionEngine, self)._next_request(spider)
        self.send_tick.schedule(0.1)
        return res

    def _send_tick_signal(self):
        """发送信号"""
        self.signals.send_catch_log_deferred(signals.engine_tick)
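
spider_closing, engine_paused, engine_resumed, and engine_tick above are custom signals, not part of scrapy.signals. As a minimal sketch (assuming the Scrapy convention of plain sentinel objects; the actual module these examples import may differ), such a signals module needs nothing more than:

# signals.py -- hypothetical module mirroring the usage above
spider_closing = object()  # sent before delegating to ExecutionEngine.close_spider
engine_paused = object()   # sent from pause()
engine_resumed = object()  # sent from unpause()
engine_tick = object()     # sent, throttled, from _send_tick_signal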
Example #3
class ArachnadoExecutionEngine(ExecutionEngine):
    """
    Extended ExecutionEngine.
    It sends a signal when engine gets scheduled to stop.
    """
    def __init__(self, *args, **kwargs):
        super(ArachnadoExecutionEngine, self).__init__(*args, **kwargs)
        self.send_tick = CallLaterOnce(self._send_tick_signal)

    def close_spider(self, spider, reason='cancelled'):
        if self.slot.closing:
            return self.slot.closing
        self.crawler.crawling = False
        self.signals.send_catch_log(signals.spider_closing)
        return super(ArachnadoExecutionEngine,
                     self).close_spider(spider, reason)

    def pause(self):
        """Pause the execution engine"""
        super(ArachnadoExecutionEngine, self).pause()
        self.signals.send_catch_log(signals.engine_paused)

    def unpause(self):
        """Resume the execution engine"""
        super(ArachnadoExecutionEngine, self).unpause()
        self.signals.send_catch_log(signals.engine_resumed)

    def _next_request(self, spider):
        res = super(ArachnadoExecutionEngine, self)._next_request(spider)
        self.send_tick.schedule(0.1)  # avoid sending the signal too often
        return res

    def _send_tick_signal(self):
        self.signals.send_catch_log_deferred(signals.engine_tick)
Example #4
def createSpiderTask(site_info, settings, CHECK_POINT):
    results = iter(select(settings, SITE_ID=site_info["site_id"]))
    nextcall = CallLaterOnce(eval(site_info["SpiderName"]), site_info, results,
                             CHECK_POINT)
    heartbeat = task.LoopingCall(nextcall.schedule)
    # TODO: start the callback after `delay` seconds
    nextcall.schedule(delay=0.5)
    TaskTimer = 3
    # TODO: fire the task every TaskTimer seconds
    heartbeat.start(TaskTimer)
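
The pattern above, a CallLaterOnce driven by a task.LoopingCall heartbeat, is the same one Scrapy's engine uses internally. A small standalone demo (not taken from any of these projects) of why the pairing works: the heartbeat keeps nudging schedule(), but no matter how many times schedule() is called within one reactor turn, the wrapped function fires only once.

from twisted.internet import reactor, task
from scrapy.utils.reactor import CallLaterOnce

def tick():
    print("tick fired once per reactor turn")

nextcall = CallLaterOnce(tick)
heartbeat = task.LoopingCall(nextcall.schedule)

# Three schedule() calls in the same turn coalesce into a single tick().
nextcall.schedule()
nextcall.schedule()
nextcall.schedule()

heartbeat.start(3)                   # nudge the loop every 3 seconds
reactor.callLater(10, reactor.stop)  # run the demo for 10 seconds
reactor.run()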
Example #5
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        logger.info("Spider opened", extra={'spider': spider})

        ### This part is important!
        nextcall = CallLaterOnce(self._next_request, spider)

        scheduler = self.scheduler_cls.from_crawler(
            self.crawler)  # the scheduler is initialized here

        # post-processing of start_requests
        start_requests = yield self.scraper.spidermw.process_start_requests(
            start_requests, spider)
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider

        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                                   spider=spider)

        # kick off the scheduling loop
        slot.nextcall.schedule()  # schedule a run immediately
        slot.heartbeat.start(5)  # run nextcall every 5 seconds
Example #6
 def open_spider(self, spider, start_requests=(), close_if_idle=True):
     assert self.has_capacity(), "No free spider slot when opening %r" % \
         spider.name
     logger.info("Spider opened", extra={'spider': spider})
     ## register the _next_request scheduling method for repeated, loop-style scheduling
     nextcall = CallLaterOnce(self._next_request, spider)
     ## instantiate the scheduler class
     scheduler = self.scheduler_cls.from_crawler(self.crawler)
     ## call each spider middleware's process_start_requests method to handle the seed requests;
     ## you can define several spider middlewares, each overriding this method, and before scheduling
     ## the engine calls each one in turn, keeping the processing independent and easy to maintain
     start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
     ## wrap everything in a Slot object
     slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
     self.slot = slot
     self.spider = spider
     ## call the scheduler's open method
     yield scheduler.open(spider)
     ## call the scraper's open_spider method
     yield self.scraper.open_spider(spider)
     self.crawler.stats.open_spider(spider)
     yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
     ## kick off scheduling
     slot.nextcall.schedule()
     slot.heartbeat.start(5)
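
For context, the Slot object constructed in these open_spider examples is what ties CallLaterOnce to the heartbeat. A sketch paraphrased from scrapy/core/engine.py (the exact field set varies across Scrapy versions):

from twisted.internet import task

class Slot:
    def __init__(self, start_requests, close_if_idle, nextcall, scheduler):
        self.closing = None                          # Deferred, set while the slot is closing
        self.inprogress = set()                      # requests currently in flight
        self.start_requests = iter(start_requests)   # seed requests
        self.close_if_idle = close_if_idle
        self.nextcall = nextcall                     # CallLaterOnce(engine._next_request)
        self.scheduler = scheduler
        self.heartbeat = task.LoopingCall(nextcall.schedule)  # started via slot.heartbeat.start(5)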
Example #7
 def open_spider(self,
                 spider: Spider,
                 start_requests: Iterable = (),
                 close_if_idle: bool = True):
     if self.slot is not None:
         raise RuntimeError(
             f"No free spider slot when opening {spider.name!r}")
     logger.info("Spider opened", extra={'spider': spider})
     # p.15 create the delayed-call instance, ready for the next reactor iteration
     nextcall = CallLaterOnce(self._next_request)
     # p.16 create the scheduler instance
     scheduler = create_instance(self.scheduler_cls,
                                 settings=None,
                                 crawler=self.crawler)
     # p.17 hook in the spider middlewares and process the start requests
     start_requests = yield self.scraper.spidermw.process_start_requests(
         start_requests, spider)
     # p.18 wrap the start requests, the delayed-call instance, and the scheduler
     self.slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
     self.spider = spider
     # p.19
     if hasattr(scheduler, "open"):
         yield scheduler.open(spider)
     # p.20
     yield self.scraper.open_spider(spider)
     # p.21 start stats collection
     self.crawler.stats.open_spider(spider)
     # p.22
     yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                                spider=spider)
     self.slot.nextcall.schedule()
     self.slot.heartbeat.start(5)
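
Examples #7 and #10 build the scheduler with create_instance instead of calling from_crawler directly. A simplified sketch of that helper, paraphrased from scrapy.utils.misc (the real function also raises TypeError when instantiation returns None):

def create_instance(objcls, settings, crawler, *args, **kwargs):
    # Prefer the crawler-aware constructor, fall back to the settings-aware
    # one, and finally to a plain constructor call.
    if settings is None:
        if crawler is None:
            raise ValueError("Specify at least one of settings and crawler.")
        settings = crawler.settings
    if crawler and hasattr(objcls, 'from_crawler'):
        return objcls.from_crawler(crawler, *args, **kwargs)
    if hasattr(objcls, 'from_settings'):
        return objcls.from_settings(settings, *args, **kwargs)
    return objcls(*args, **kwargs)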
Example #8
 def open_spider(self, spider, start_requests=(), close_if_idle=True):
     assert self.has_capacity(), "No free spider slot when opening %r" % \
         spider.name
     logger.info("Spider opened", extra={'spider': spider})
     # register the _next_request scheduling method for repeated scheduling (via the twisted reactor)
     nextcall = CallLaterOnce(self._next_request, spider)
     # instantiate the scheduler, see scrapy/core/scheduler.py
     scheduler = self.scheduler_cls.from_crawler(self.crawler)
     # spider middlewares process the seed Requests, see scrapy/core/spidermw.py; start_requests is normally an iterable of Requests
     start_requests = yield self.scraper.spidermw.process_start_requests(
         start_requests, spider)
     # wrap the Slot object and hand it the returned start_requests iterable
     slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
     self.slot = slot
     self.spider = spider
     # call the scheduler's open to bind the spider, initialize the task queues, and enable fingerprint filtering, see scrapy/core/scheduler.py
     yield scheduler.open(spider)
     # see scrapy/core/scraper.py: instantiates or binds the spider to the pipeline manager class, see scrapy/pipelines/__init__.py
     # batch-dispatches all the open_spider methods
     yield self.scraper.open_spider(spider)
     # scrapy/statscollectors/__init__.py
     self.crawler.stats.open_spider(spider)
     yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                                spider=spider)
     # start scheduling; this actually runs the _next_request method registered above
     slot.nextcall.schedule()
     slot.heartbeat.start(5)
Example #9
 def open_spider(self, spider, start_requests=None, close_if_idle=True):
     assert self.has_capacity(), "No free spider slots when opening %r" % \
         spider.name
     log.msg("Spider opened", spider=spider)
     nextcall = CallLaterOnce(self._next_request, spider)
     scheduler = self.scheduler_cls.from_settings(self.settings)
     slot = Slot(start_requests or (), close_if_idle, nextcall, scheduler)
     self.slots[spider] = slot
     yield scheduler.open(spider)
     yield self.scraper.open_spider(spider)
     stats.open_spider(spider)
     yield send_catch_log_deferred(signals.spider_opened, spider=spider)
     slot.nextcall.schedule()
Example #10
 def open_spider(self, spider: Spider, start_requests: Iterable = (), close_if_idle: bool = True):
     if self.slot is not None:
         raise RuntimeError(f"No free spider slot when opening {spider.name!r}")
     logger.info("Spider opened", extra={'spider': spider})
     nextcall = CallLaterOnce(self._next_request)
     scheduler = create_instance(self.scheduler_cls, settings=None, crawler=self.crawler)
     start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
     self.slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
     self.spider = spider
     if hasattr(scheduler, "open"):
         yield scheduler.open(spider)
     yield self.scraper.open_spider(spider)
     self.crawler.stats.open_spider(spider)
     yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
     self.slot.nextcall.schedule()
     self.slot.heartbeat.start(5)
Example #11
 def open_spider(self, spider, start_requests=(), close_if_idle=True):
     if not self.has_capacity():
         raise RuntimeError(f"No free spider slot when opening {spider.name!r}")
     logger.info("Spider opened", extra={'spider': spider})
     nextcall = CallLaterOnce(self._next_request, spider)  # creates a CallLaterOnce object whose target emits requests from start_requests; it merely sets up a deferred call
     scheduler = self.scheduler_cls.from_crawler(self.crawler)  # instantiate the scheduler
     start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)  # call the middlewares' process_start_requests to handle the start requests
     slot = Slot(start_requests, close_if_idle, nextcall, scheduler)  # create the corresponding slot
     self.slot = slot
     self.spider = spider
     yield scheduler.open(spider)  # initialize the scheduler and create its queues
     yield self.scraper.open_spider(spider)  # open the scraper (item pipelines and friends)
     self.crawler.stats.open_spider(spider)
     yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)  # emit the signal
     slot.nextcall.schedule()  # add a task to the reactor; this actually starts _next_request
     slot.heartbeat.start(5)  # start the 5-second heartbeat
Example #12
 def open_spider(self, spider, start_requests=(), close_if_idle=True):
     if not self.has_capacity():
         raise RuntimeError("No free spider slot when opening %r" % spider.name)
     logger.info("Spider opened", extra={'spider': spider})
     nextcall = CallLaterOnce(self._next_request, spider)
     scheduler = self.scheduler_cls.from_crawler(self.crawler)
     start_requests = yield self.scraper.spidermw.process_start_requests(start_requests, spider)
     slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
     self.slot = slot
     self.spider = spider
     yield scheduler.open(spider)
     yield self.scraper.open_spider(spider)
     self.crawler.stats.open_spider(spider)
     yield self.signals.send_catch_log_deferred(signals.spider_opened, spider=spider)
     slot.nextcall.schedule()
     slot.heartbeat.start(5)
Example #13
 async def _load_start_requests(self):
     count = 0
     for request in self.start_requests:
         count += 1
         try:
             await self.engine.future_in_pool(self.engine.crawl, request,
                                              self.spider)
             self._maybe_first_request()
             self.logger.debug(f"load start request {count} {request}")
         except asyncio.CancelledError:
             self.logger.warn("load start requests task cancelled")
             break
         except Exception as e:
             self.logger.error(f"load start request fail {request} {e}")
     self._maybe_first_request()
     self.logger.debug(f"load start requests {count} stopped")
     self.start_requests = None
     CallLaterOnce(self._maybe_fire_closing).schedule()
Example #14
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
            spider.name
        logger.info("Spider opened", extra={'spider': spider})
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        start_requests = yield self.scraper.spidermw.process_start_requests(
            start_requests, spider)
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                                   spider=spider)

        # start crawling page data     -- comment by jigc 2015-7-29
        slot.nextcall.schedule()
Example #15
 def open_spider(self, spider, start_requests=(), close_if_idle=True):
     assert self.has_capacity(), "No free spider slot when opening %r" % \
         spider.name
     logger.info("Spider opened", extra={'spider': spider})
     # register the _next_request scheduling method for repeated scheduling
     nextcall = CallLaterOnce(self._next_request, spider)
     # initialize the scheduler
     scheduler = self.scheduler_cls.from_crawler(self.crawler)
     # call the spider middlewares to process the seed requests
     start_requests = yield self.scraper.spidermw.process_start_requests(
         start_requests, spider)
     slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
     self.slot = slot
     self.spider = spider
     yield scheduler.open(spider)
     yield self.scraper.open_spider(spider)
     self.crawler.stats.open_spider(spider)
     yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                                spider=spider)
     slot.nextcall.schedule()
     slot.heartbeat.start(5)
Example #16
    def open_spider(self, spider, start_requests=(), close_if_idle=True):
        assert self.has_capacity(), "No free spider slot when opening %r" % \
                                    spider.name
        logger.info("Spider opened", extra={'spider': spider})
        # CallLaterOnce: call once, a bit later
        nextcall = CallLaterOnce(self._next_request, spider)
        scheduler = self.scheduler_cls.from_crawler(self.crawler)
        start_requests = yield self.scraper.spidermw.process_start_requests(
            start_requests, spider)
        # the Slot wraps the engine's request-processing flow
        slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
        self.slot = slot
        self.spider = spider
        yield scheduler.open(spider)
        yield self.scraper.open_spider(spider)
        self.crawler.stats.open_spider(spider)
        yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                                   spider=spider)

        # start the request-processing flow
        slot.nextcall.schedule()  # programmatic, manual invocation
        slot.heartbeat.start(5)  # heartbeat invocation on a 5-second period
Example #17
 def open_spider(self, spider, start_requests=(), close_if_idle=True):
     assert self.has_capacity(), "No free spider slot when opening %r" % \
         spider.name
     logger.info("Spider opened", extra={'spider': spider})
     nextcall = CallLaterOnce(self._next_request, spider)
     scheduler = self.scheduler_cls.from_crawler(
         self.crawler
     )  # instantiate the scheduler: this builds the dupefilter and three queues -- a priority queue plus two FIFO queues, one held directly in memory and one persisted via pickle
     start_requests = yield self.scraper.spidermw.process_start_requests(
         start_requests, spider)  # the very first thing executed is the spider middlewares' process_start_requests
     slot = Slot(start_requests, close_if_idle, nextcall, scheduler)
     self.slot = slot
     self.spider = spider
     yield scheduler.open(spider)  # open the in-memory FIFO and priority queues, and enable the dupefilter
     yield self.scraper.open_spider(spider)  # seemingly does very little
     self.crawler.stats.open_spider(spider)  # a pass; also does very little
     yield self.signals.send_catch_log_deferred(
         signals.spider_opened,
         spider=spider)  # does a lot: initializes logging, and the middlewares all seem to implement this handler?
     slot.nextcall.schedule()  # execute self._next_request once
     # this branch only runs once, during initialization, and the inner logic does not execute right away:
     # schedule() uses reactor.callLater(delay, self), so nothing fires until the reactor actually runs
     slot.heartbeat.start(5)
Example #18
 def open_spider(self, spider, start_requests=(), close_if_idle=True):
     assert self.has_capacity(), "No free spider slot when opening %r" % \
         spider.name
     logger.info("Spider opened", extra={'spider': spider})
     nextcall = CallLaterOnce(self._next_request,
                              spider)  # register the _next_request scheduling method for repeated scheduling
     scheduler = self.scheduler_cls.from_crawler(
         self.crawler)  # initialize the scheduler
     start_requests = yield self.scraper.spidermw.process_start_requests(
         start_requests, spider)  # call the spider middlewares to process the seed requests
     slot = Slot(start_requests, close_if_idle, nextcall,
                 scheduler)  # wrap the Slot object
     self.slot = slot
     self.spider = spider
     yield scheduler.open(
         spider)  # call the scheduler's open; it instantiates a priority queue, does nothing else, and returns None
     yield self.scraper.open_spider(
         spider)  # mainly calls every item pipeline's open_spider method
     self.crawler.stats.open_spider(spider)  # returns None; not sure what this is for
     yield self.signals.send_catch_log_deferred(signals.spider_opened,
                                                spider=spider)
     slot.nextcall.schedule()  # kick off scheduling
     slot.heartbeat.start(5)  # call CallLaterOnce.schedule every 5 seconds
Example #19
 def open(self, spider):
     super(RotateScheduler, self).open(spider)
     self.nextcall = CallLaterOnce(self.more_request)
     self.locker = threading.Condition()
Example #21
 def __init__(self, *args, **kwargs):
     super(ArachnadoExecutionEngine, self).__init__(*args, **kwargs)
     self.send_tick = CallLaterOnce(self._send_tick_signal)