def __init__(self, settings=None, install_root_handler=True):
    super(CrawlerProcess, self).__init__(settings)
    # Install the shutdown handlers
    install_shutdown_handlers(self._signal_shutdown)
    # Configure logging
    configure_logging(self.settings, install_root_handler)
    log_scrapy_info(self.settings)
def __init__(self, settings=None, install_root_handler=True):
    ## Initialize the parent class
    super(CrawlerProcess, self).__init__(settings)
    ## Handle the shutdown signal
    install_shutdown_handlers(self._signal_shutdown)
    ## Configure Scrapy's default logging service
    configure_logging(self.settings, install_root_handler)
    ## Log Scrapy startup info (status, version, ...)
    log_scrapy_info(self.settings)
def __init__(self, settings=None, install_root_handler=True):
    logging.info("CrawlerProcess __init__")
    super().__init__(settings)
    logging.info("super().__init__(settings)")
    install_shutdown_handlers(self._signal_shutdown)
    logging.info("install_shutdown_handlers(self._signal_shutdown)")
    configure_logging(self.settings, install_root_handler)
    logging.info("configure_logging(self.settings, install_root_handler)")
    log_scrapy_info(self.settings)
    logging.info("log_scrapy_info(self.settings)")
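The first three variants all preserve the stock CrawlerProcess constructor, so they are driven the same way as upstream Scrapy. A minimal usage sketch against the standard API (MySpider is a placeholder spider written for illustration):

import scrapy
from scrapy.crawler import CrawlerProcess

class MySpider(scrapy.Spider):
    # Placeholder spider, only here to make the sketch runnable
    name = "my_spider"
    start_urls = ["https://example.com"]

    def parse(self, response):
        yield {"title": response.css("title::text").get()}

process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
process.crawl(MySpider)
process.start()  # blocks and runs the Twisted reactor until all crawls finish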
def __init__(self, in_thread, settings=None, install_root_handler=True):
    # When running inside a thread, call the grandparent (CrawlerRunner)
    # init directly, skipping CrawlerProcess.__init__, which installs
    # shutdown handlers (signal handlers can only be set in the main thread).
    if in_thread:
        super(CrawlerProcess, self).__init__(settings)
        configure_logging(self.settings, install_root_handler)
        log_scrapy_info(self.settings)
    else:
        super(PyScraperCrawlerProcess, self).__init__(settings, install_root_handler)
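The in_thread branch exists because Python only permits installing signal handlers from the main thread; jumping straight to CrawlerRunner.__init__ skips the install_shutdown_handlers call. A hedged sketch of how such a class might be driven from a worker thread (PyScraperCrawlerProcess is the project's own subclass above; the helper and thread wiring are illustrative, and install_signal_handlers=False is only available in newer Scrapy releases):

import threading

def crawl_in_background(spider_cls):
    # in_thread=True means no signal handlers are installed, so
    # construction is safe outside the main thread.
    process = PyScraperCrawlerProcess(in_thread=True)
    process.crawl(spider_cls)
    # Also keep the reactor from touching signal handlers (Scrapy >= 2.6).
    process.start(install_signal_handlers=False)

t = threading.Thread(target=crawl_in_background, args=(MySpider,))
t.start()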
def _run_spider_in_reactor(spider_cls, capture_items=True, return_crawler=False,
                           settings=None, **kwargs):
    """Runs the given spider inside the Twisted reactor.

    Parameters
    ----------
    spider_cls : scrapy.Spider
        Spider to run.
    capture_items : bool (default: True)
        If enabled, the scraped items are captured and returned.
    return_crawler : bool (default: False)
        If enabled, the crawler instance is returned. If ``capture_items``
        is enabled, the scraped items are collected in ``crawler.items``.
    settings : dict, optional
        Custom crawler settings.

    Returns
    -------
    out : crochet.EventualResult
        If ``capture_items`` is ``True``, returns the scraped items.
        If ``return_crawler`` is ``True``, returns the crawler instance.
    """
    settings = settings or {}
    crawler_settings = get_project_settings().copy()
    crawler_settings.setdict(default_settings)
    crawler_settings.setdict(settings)
    log_scrapy_info(crawler_settings)
    crawler = Crawler(spider_cls, crawler_settings)
    d = crawler.crawl(**kwargs)
    if capture_items:
        crawler.items = _OutputItems()
        crawler.signals.connect(crawler.items.append, signal=signals.item_scraped)
        d.addCallback(lambda _: crawler.items)
    if return_crawler:
        d.addCallback(lambda _: crawler)
    return d
def _run_spider_in_reactor(spider_cls, capture_items=True, return_crawler=False,
                           settings=None):
    """Runs the given spider inside the Twisted reactor.

    Parameters
    ----------
    spider_cls : scrapy.Spider
        Spider to run.
    capture_items : bool (default: True)
        If enabled, the scraped items are captured and returned.
    return_crawler : bool (default: False)
        If enabled, the crawler instance is returned. If ``capture_items``
        is enabled, the scraped items are collected in ``crawler.items``.
    settings : dict, optional
        Custom crawler settings.

    Returns
    -------
    out : crochet.EventualResult
        If ``capture_items`` is ``True``, returns the scraped items.
        If ``return_crawler`` is ``True``, returns the crawler instance.
    """
    settings = settings or {}
    crawler_settings = get_project_settings().copy()
    crawler_settings.setdict(settings)
    log_scrapy_info(crawler_settings)
    crawler = Crawler(spider_cls, crawler_settings)
    d = crawler.crawl()
    if capture_items:
        crawler.items = _OutputItems()
        crawler.signals.connect(crawler.items.append, signal=signals.item_scraped)
        d.addCallback(lambda _: crawler.items)
    if return_crawler:
        d.addCallback(lambda _: crawler)
    return d
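Both versions return a Deferred, and the docstring's crochet.EventualResult return type suggests the function is meant to be wrapped with crochet so it can be called from ordinary blocking code. A minimal sketch of that wiring, assuming the MySpider placeholder from the earlier sketch; crochet.setup(), the run_in_reactor decorator, and EventualResult.wait() are standard crochet API:

import crochet
crochet.setup()  # start the Twisted reactor in a background thread

@crochet.run_in_reactor
def scrape(spider_cls, **kwargs):
    # Runs in the reactor thread; the Deferred is wrapped for the caller.
    return _run_spider_in_reactor(spider_cls, capture_items=True, **kwargs)

result = scrape(MySpider)          # returns a crochet.EventualResult
items = result.wait(timeout=120)   # block until the crawl finishes or times out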
def __init__(self, settings=None, install_root_handler=True):
    super().__init__(settings)
    configure_logging(self.settings, install_root_handler)
    log_scrapy_info(self.settings)
    self._initialized_reactor = False
def __init__(self, settings=None):
    super(CrawlerProcess, self).__init__(settings)
    install_shutdown_handlers(self._signal_shutdown)
    configure_logging(self.settings)
    log_scrapy_info(self.settings)
def __init__(self, settings=None, install_root_handler=False):
    # Call CrawlerRunner.__init__ explicitly instead of via super()
    crawler.CrawlerRunner.__init__(self, settings)
    install_shutdown_handlers(self._signal_shutdown)
    configure_logging(self.settings, install_root_handler=install_root_handler)
    log_scrapy_info(self.settings)
def __init__(self, settings=None, install_root_handler=True):
    super().__init__(settings)
    install_shutdown_handlers(self._signal_shutdown)
    configure_logging(self.settings, install_root_handler)
    log_scrapy_info(self.settings)
def __init__(self, settings=None):
    super(CustomCrawler, self).__init__(settings)
    configure_logging(self.settings)
    log_scrapy_info(self.settings)
    pdb.set_trace()  # drop into the debugger right after initialization
def __init__(self, settings=None, install_root_handler=True):
    print("sssssssssssssssssssssssssssssssssssssssssss")  # leftover debug print
    super(CrawlerProcess, self).__init__(settings)
    install_shutdown_handlers(self._signal_shutdown)
    configure_logging(self.settings, install_root_handler)
    log_scrapy_info(self.settings)
def __init__(self, settings=None, install_root_handler=True):
    super(CrawlerProcess, self).__init__(settings)
    # Watch for keyboard/OS signals: handlers registered via signal.signal()
    # take control of the shutdown sequence
    install_shutdown_handlers(self._signal_shutdown)
    configure_logging(self.settings, install_root_handler)
    log_scrapy_info(self.settings)
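For reference, install_shutdown_handlers boils down to registering a callback for SIGINT/SIGTERM. A minimal sketch of the two-stage pattern CrawlerProcess uses (the first signal triggers a graceful stop, a second one forces shutdown); this illustrates the idea only and is not Scrapy's exact implementation:

import signal

def install_shutdown_handlers(handler):
    # Route Ctrl-C (SIGINT) and kill (SIGTERM) to the given handler.
    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)

def _signal_shutdown(signum, frame):
    # First signal: swap in the hard-kill handler, then stop gracefully.
    install_shutdown_handlers(_signal_kill)
    print("Received signal %s, shutting down gracefully. Send again to force." % signum)

def _signal_kill(signum, frame):
    # Second signal: give up on graceful shutdown and exit immediately.
    print("Received signal %s twice, forcing unclean shutdown." % signum)
    raise SystemExit(1)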