Example #1
 def __init__(self, crawler, spider_closed_callback):
     ## Store the crawler instance on the execution engine
     self.crawler = crawler
     ## Also keep the crawler's settings on the engine
     self.settings = crawler.settings
     ## Signal manager
     self.signals = crawler.signals
     ## Log formatter
     self.logformatter = crawler.logformatter
     self.slot = None
     self.spider = None
     ## Whether the engine is running
     self.running = False
     ## Whether execution is paused
     self.paused = False
     ## Load the scheduler class from the settings
     self.scheduler_cls = load_object(self.settings['SCHEDULER'])
     ## Load the downloader class from the settings
     downloader_cls = load_object(self.settings['DOWNLOADER'])
     ## Instantiate the downloader
     self.downloader = downloader_cls(crawler)
     ## Instantiate the Scraper, the bridge between the engine, the Spider and the item Pipeline
     self.scraper = Scraper(crawler)
     ## Callback invoked when the spider is closed
     self._spider_closed_callback = spider_closed_callback
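All of these examples rely on load_object from scrapy.utils.misc to turn the dotted path stored in the settings into a class. A minimal, self-contained equivalent (a sketch for illustration, not Scrapy's exact implementation) looks like this:

 from importlib import import_module

 def load_object(path):
     # Split "scrapy.core.scheduler.Scheduler" into module path and attribute
     # name, import the module, and return the attribute. Note this returns
     # the class itself, not an instance.
     module_path, _, name = path.rpartition('.')
     module = import_module(module_path)
     return getattr(module, name)

So load_object(self.settings['SCHEDULER']) yields the scheduler class; instantiation happens later (for the scheduler, in open_spider).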
Example #2
 def test_spider_output_handling(self):
     spider = self.MySpider()
     scraper = Scraper(Crawler(spider))
     scraper.open_spider(spider)
     scraper._process_spidermw_output(RssItem(), None, None, None)
     scraper._process_spidermw_output(ExtendableItem(), None, None, None)
     scraper._process_spidermw_output(RssedItem(), None, None, None)
     scraper.close_spider(spider)
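The test feeds three different item classes through _process_spidermw_output and expects all of them to be accepted. The dispatch rule it exercises can be sketched standalone like this (condensed from Scrapy's scraper.py; the exact type checks vary across Scrapy versions, so treat this as an approximation):

 from scrapy import Request

 def classify_spider_output(output):
     # Requests coming out of the spider middleware are handed back to the
     # engine to be scheduled; None is ignored; anything else (RssItem,
     # ExtendableItem, RssedItem above) is treated as an item and sent
     # through the item pipeline.
     if isinstance(output, Request):
         return "schedule"
     if output is None:
         return "ignore"
     return "item_pipeline"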
Example #3
 def __init__(self, settings, spider_closed_callback):
     self.settings = settings
     self.closing = {}  # dict (spider -> reason) of spiders being closed
     self.closing_dfds = {}  # dict (spider -> deferred) of spiders being closed
     self.running = False
     self.paused = False
     self._next_request_calls = {}
     self.scheduler = load_object(settings['SCHEDULER'])()
     self.downloader = Downloader()
     self.scraper = Scraper(self, self.settings)
     self._spider_closed_callback = spider_closed_callback
Example #4
 def __init__(self, crawler, spider_closed_callback):
     self.settings = crawler.settings
     self.slots = {}
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(self.settings['SCHEDULER'])
     self.downloader = Downloader(crawler)
     self.scraper = Scraper(crawler)
     self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS', 1)
     if self._concurrent_spiders != 1:
         warnings.warn("CONCURRENT_SPIDERS settings is deprecated, use " \
             "Scrapyd max_proc config instead", ScrapyDeprecationWarning)
     self._spider_closed_callback = spider_closed_callback
Example #5
 def __init__(self, crawler, spider_closed_callback):
     self.crawler = crawler
     self.settings = crawler.settings
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
     self.slot = None
     self.spider = None
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(self.settings['SCHEDULER'])
     downloader_cls = load_object(self.settings['DOWNLOADER'])
     self.downloader = downloader_cls(crawler)
     self.scraper = Scraper(crawler)
     self._spider_closed_callback = spider_closed_callback
Example #6
 def __init__(self, crawler, spider_closed_callback: Callable) -> None:
     self.crawler = crawler
     self.settings = crawler.settings
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
     self.slot: Optional[Slot] = None
     self.spider: Optional[Spider] = None
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(crawler.settings["SCHEDULER"])
     downloader_cls = load_object(self.settings['DOWNLOADER'])
     self.downloader = downloader_cls(crawler)
     self.scraper = Scraper(crawler)
     self._spider_closed_callback = spider_closed_callback
Example #7
 def __init__(self, crawler, spider_closed_callback):
     self.crawler = crawler
     self.settings = crawler.settings
     self.signals = crawler.signals  # use the crawler's signal manager
     self.logformatter = crawler.logformatter
     self.slot = None
     self.spider = None
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(
         self.settings['SCHEDULER'])  # load the scheduler class named in the settings
     downloader_cls = load_object(
         self.settings['DOWNLOADER'])  # load the downloader class named in the settings
     self.downloader = downloader_cls(crawler)
     self.scraper = Scraper(crawler)  # create the scraper
     self._spider_closed_callback = spider_closed_callback
Example #8
 def __init__(self, crawler, spider_closed_callback):
     self.crawler = crawler
     self.settings = crawler.settings
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
     self.slot = None
     self.spider = None
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(
         self.settings['SCHEDULER']
     )  # look up the Scheduler class named in the settings
     downloader_cls = load_object(
         self.settings['DOWNLOADER'])  # likewise, look up the Downloader class
     self.downloader = downloader_cls(crawler)  # instantiate the Downloader
     self.scraper = Scraper(crawler)  # instantiate the Scraper, the bridge between engine and spider
     self._spider_closed_callback = spider_closed_callback
Example #9
 def __init__(self, crawler, spider_closed_callback):
     self.crawler = crawler
     self.settings = crawler.settings
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
     self.slot = None
     self.spider = None
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(self.settings['SCHEDULER'])
     self.scraper = Scraper(crawler)
     self._spider_closed_callback = spider_closed_callback
     # The downloader is shared across engine instances via a class-level
     # attribute, created only by the first engine.
     if self.downloader is None:
         downloader_cls = load_object(self.settings['DOWNLOADER'])
         MyExecutionEngine.downloader = downloader_cls(crawler)
     self.downloader = MyExecutionEngine.downloader
     # Wrap close() so the shared downloader is only really closed when the
     # last engine shuts down.
     self.downloader.close = CloseOnlyLastTime(self.downloader.close)
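CloseOnlyLastTime is not shown in this example. Judging by the name and the shared class-level downloader, it presumably defers the real close() until the last engine sharing the downloader shuts down. One plausible sketch (an assumption, not the original class):

 from twisted.internet import defer

 class CloseOnlyLastTime:
     # Assumed behaviour: count the engines sharing the downloader and only
     # invoke the real close() on the final call; earlier calls return an
     # already-fired Deferred so each engine's shutdown sequence still works.
     instances = 0

     def __init__(self, close):
         # Unwrap if a previous engine already wrapped this close().
         if isinstance(close, CloseOnlyLastTime):
             close = close.close
         self.close = close
         CloseOnlyLastTime.instances += 1

     def __call__(self):
         CloseOnlyLastTime.instances -= 1
         if CloseOnlyLastTime.instances == 0:
             return self.close()
         return defer.succeed(None)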
Example #10
 def __init__(self, crawler, spider_closed_callback):
     self.crawler = crawler
     self.settings = crawler.settings  # settings
     self.signals = crawler.signals  # signal manager
     self.logformatter = crawler.logformatter  # log formatter
     self.slot = None
     self.spider = None
     self.running = False
     self.paused = False
     # Load the scheduler class (not instantiated here; it is instantiated in open_spider)
     self.scheduler_cls = load_object(self.settings['SCHEDULER'])
     # Load the downloader class and instantiate it; see scrapy/core/downloader/__init__.py
     downloader_cls = load_object(self.settings['DOWNLOADER'])
     self.downloader = downloader_cls(crawler)
     # Instantiate the Scraper: the bridge between engine and spider; see scrapy/core/scraper.py
     self.scraper = Scraper(crawler)
     self._spider_closed_callback = spider_closed_callback
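As the comment notes, the scheduler class is stored but not instantiated in __init__; the instance is created later in open_spider, typically through the class's from_crawler() factory. A short illustration of that two-step pattern (get_crawler is Scrapy's test helper for building a Crawler; exact behaviour varies by Scrapy version):

 from scrapy.utils.misc import load_object
 from scrapy.utils.test import get_crawler

 crawler = get_crawler()
 scheduler_cls = load_object(crawler.settings['SCHEDULER'])  # class only, as in __init__
 scheduler = scheduler_cls.from_crawler(crawler)             # instance, as in open_spider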
Example #11
 def __init__(self, crawler, spider_closed_callback):
     self.locker = threading.Condition()
     self.crawler = crawler
     self.settings = crawler.settings
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
     self.slot = None
     self.spider = None
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(self.settings['SCHEDULER'])
     downloader_cls = load_object(self.settings['DOWNLOADER'])
     self.downloader = downloader_cls(crawler)
     self.scraper = Scraper(crawler)
     self._concurrent_spiders = self.settings.getint('CONCURRENT_SPIDERS', 1)
     if self._concurrent_spiders != 1:
         warnings.warn("CONCURRENT_SPIDERS settings is deprecated, use " \
             "Scrapyd max_proc config instead", ScrapyDeprecationWarning)
     self._spider_closed_callback = spider_closed_callback
Example #12
    def __init__(self, crawler, spider_closed_callback):
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot = None
        self.spider = None
        self.running = False
        self.paused = False
        self.scheduler_cls = load_object(self.settings['SCHEDULER'])
        # The scheduler class is loaded here;
        # it defaults to 'scrapy.core.scheduler.Scheduler'.

        downloader_cls = load_object(self.settings['DOWNLOADER'])
        # The downloader class is loaded here;
        # it defaults to 'scrapy.core.downloader.Downloader'.
        self.downloader = downloader_cls(crawler)
        # Instantiate the downloader.

        self.scraper = Scraper(crawler)
        self._spider_closed_callback = spider_closed_callback
Example #13
 def __init__(self, crawler, spider_closed_callback):
     self.crawler = crawler
     self.settings = crawler.settings
     self.signals = crawler.signals
     self.logformatter = crawler.logformatter
     self.slot = None
     self.spider = None
     self.running = False
     self.paused = False
     self.scheduler_cls = load_object(
         self.settings['SCHEDULER']
     )  # SCHEDULER = 'scrapy.core.scheduler.Scheduler'; only loads the class, nothing else
     downloader_cls = load_object(
         self.settings['DOWNLOADER']
     )  # DOWNLOADER = 'scrapy.core.downloader.Downloader'
     # The downloader instantiates the download handlers and the downloader-
     # middleware process_* logic, so the concrete download functionality and
     # middleware functionality are already in place.
     self.downloader = downloader_cls(crawler)
     # The Scraper holds the spider middleware and the item pipeline objects,
     # so data processing and storage are already wired up.
     self.scraper = Scraper(crawler)
     # This callback is what lets the crawl stop: it is the anonymous function
     # lambda _: self.stop() passed in by the Crawler, which in the end calls
     # the engine's own stop() via self.engine.stop.
     self._spider_closed_callback = spider_closed_callback
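The callback chain described in that last comment can be seen end to end with a small runnable script: CrawlerProcess builds a Crawler, and the Crawler constructs the engine roughly as ExecutionEngine(self, lambda _: self.stop()), so the spider-closed callback ultimately stops the crawl (a sketch; the exact wiring depends on the Scrapy version):

 from scrapy import Spider
 from scrapy.crawler import CrawlerProcess

 class DemoSpider(Spider):
     name = 'demo'
     start_urls = ['https://example.com']

     def parse(self, response):
         # Yield nothing: once the engine goes idle, the spider is closed
         # and the spider_closed_callback (lambda _: self.stop()) fires.
         return

 process = CrawlerProcess(settings={'LOG_LEVEL': 'ERROR'})
 process.crawl(DemoSpider)
 process.start()  # blocks until the engine stops via the callback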
Example #14
    def __init__(self, crawler, spider_closed_callback: Callable) -> None:
        self.crawler = crawler
        self.settings = crawler.settings
        self.signals = crawler.signals
        self.logformatter = crawler.logformatter
        self.slot: Optional[Slot] = None
        self.spider: Optional[Spider] = None
        self.running = False
        # Whether the engine is paused
        self.paused = False
        self.scheduler_cls = self._get_scheduler_class(crawler.settings)

        # Load and instantiate the downloader
        downloader_cls = load_object(self.settings['DOWNLOADER'])
        self.downloader = downloader_cls(crawler)

        # Instantiate the Scraper
        self.scraper = Scraper(crawler)

        # Spider-closed callback passed in from outside
        self._spider_closed_callback = spider_closed_callback
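This variant hides the lookup behind a helper, _get_scheduler_class, which is not shown here. In Scrapy 2.x the engine's helper of that name also validates the loaded class against the BaseScheduler interface; a sketch modeled on that behaviour (an approximation, not the exact source):

 from scrapy.core.scheduler import BaseScheduler
 from scrapy.settings import Settings
 from scrapy.utils.misc import load_object

 def get_scheduler_class(settings: Settings) -> type:
     scheduler_cls = load_object(settings['SCHEDULER'])
     if not issubclass(scheduler_cls, BaseScheduler):
         # Reject classes that do not implement the scheduler interface.
         raise TypeError(
             f"The provided scheduler class ({settings['SCHEDULER']})"
             " does not fully implement the scheduler interface"
         )
     return scheduler_cls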