Example #1
 def __init__(self, settings=None, install_root_handler=True):
     super(CrawlerProcess, self).__init__(settings)
     # Install the shutdown signal handlers
     install_shutdown_handlers(self._signal_shutdown)
     # Configure logging
     configure_logging(self.settings, install_root_handler)
     log_scrapy_info(self.settings)
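For context, this constructor is normally exercised as in the following minimal sketch; MySpider is a placeholder spider class, not part of the examples on this page.

from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
process.crawl(MySpider)  # schedule the (hypothetical) spider
process.start()          # start the reactor; blocks until crawling finishes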
Example #2
 def __init__(self, settings=None, install_root_handler=True):
     ## Initialize the parent class
     super(CrawlerProcess, self).__init__(settings)
     ## Handle the shutdown signal
     install_shutdown_handlers(self._signal_shutdown)
     ## Configure Scrapy's default logging service
     configure_logging(self.settings, install_root_handler)
     ## Log Scrapy runtime info (startup state, version, ...)
     log_scrapy_info(self.settings)
Example #3
 def __init__(self, settings=None, install_root_handler=True):
     logging.info("CrawlerProcess __init__")
     super().__init__(settings)
     logging.info("super().__init__(settings)")
     install_shutdown_handlers(self._signal_shutdown)
     logging.info("install_shutdown_handlers(self._signal_shutdown)")
     configure_logging(self.settings, install_root_handler)
     logging.info("configure_logging(self.settings, install_root_handler)")
     log_scrapy_info(self.settings)
     logging.info("log_scrapy_info(self.settings)")
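Note that the bare logging.info(...) calls in this variant go through the root logger, which only emits WARNING and above by default, so the trace output presumably relies on setup like the following; this is an assumption, not part of the original snippet.

import logging

# Raise the root logger threshold so the logging.info(...) trace lines show up
logging.basicConfig(level=logging.INFO)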
Example #4
 def __init__(self, in_thread, settings=None, install_root_handler=True):
     # Call the grandparent's __init__ directly to avoid running
     # CrawlerProcess.__init__, which installs the shutdown handlers
     if in_thread:
         super(CrawlerProcess, self).__init__(settings)
         configure_logging(self.settings, install_root_handler)
         log_scrapy_info(self.settings)
     else:
         super(PyScraperCrawlerProcess,
               self).__init__(settings, install_root_handler)
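The super(CrawlerProcess, self) call above starts the method lookup after CrawlerProcess in the MRO, so CrawlerRunner's __init__ runs while CrawlerProcess's (and its shutdown-handler installation) is skipped. A self-contained toy illustration of the same pattern, with hypothetical class names:

class Runner:
    def __init__(self):
        print("Runner.__init__")

class Process(Runner):
    def __init__(self):
        print("Process.__init__")  # installs signal handlers in the real code
        super().__init__()

class ThreadSafeProcess(Process):
    def __init__(self):
        # Start the lookup after Process: only Runner.__init__ executes
        super(Process, self).__init__()

ThreadSafeProcess()  # prints "Runner.__init__" only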
Example #5
def _run_spider_in_reactor(spider_cls,
                           capture_items=True,
                           return_crawler=False,
                           settings=None,
                           **kwargs):
    """Runs given spider inside the twisted reactdor.

    Parameters
    ----------
    spider_cls : scrapy.Spider
        Spider to run.
    capture_items : bool (default: True)
        If enabled, the scraped items are captured and returned.
    return_crawler : bool (default: False)
        If enabled, the crawler instance is returned. If ``capture_items`` is
        enabled, the scraped items are collected in ``crawler.items``.
    settings : dict, optional
        Custom crawler settings.
    **kwargs
        Additional keyword arguments forwarded to ``crawler.crawl()``.

    Returns
    -------
    out : crochet.EventualResult
        If ``capture_items`` is ``True``, returns scraped items. If
        ``return_crawler`` is ``True``, returns the crawler instance.

    """
    settings = settings or {}
    crawler_settings = get_project_settings().copy()
    crawler_settings.setdict(default_settings)
    crawler_settings.setdict(settings)
    log_scrapy_info(crawler_settings)
    crawler = Crawler(spider_cls, crawler_settings)
    d = crawler.crawl(**kwargs)
    if capture_items:
        crawler.items = _OutputItems()
        crawler.signals.connect(crawler.items.append,
                                signal=signals.item_scraped)
        d.addCallback(lambda _: crawler.items)
    if return_crawler:
        d.addCallback(lambda _: crawler)
    return d
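Because this helper returns a Deferred and must run in the reactor thread, callers presumably wrap it with crochet, roughly as below (MySpider and the timeout are placeholders); the docstring's crochet.EventualResult return type suggests exactly this pattern.

import crochet

crochet.setup()  # run the Twisted reactor in a background thread

@crochet.run_in_reactor
def scrape(spider_cls, **kwargs):
    return _run_spider_in_reactor(spider_cls, **kwargs)

items = scrape(MySpider).wait(timeout=60.0)  # EventualResult.wait blocks the caller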
Example #6
def _run_spider_in_reactor(spider_cls, capture_items=True, return_crawler=False,
                           settings=None):
    """Runs given spider inside the twisted reactdor.

    Parameters
    ----------
    spider_cls : scrapy.Spider
        Spider to run.
    capture_items : bool (default: True)
        If enabled, the scraped items are captured and returned.
    return_crawler : bool (default: False)
        If enabled, the crawler instance is returned. If ``capture_items`` is
        enabled, the scraped items are collected in ``crawler.items``.
    settings : dict, optional
        Custom crawler settings.

    Returns
    -------
    out : crochet.EventualResult
        If ``capture_items`` is ``True``, returns the scraped items. If
        ``return_crawler`` is ``True``, returns the crawler instance.

    """
    settings = settings or {}
    crawler_settings = get_project_settings().copy()
    crawler_settings.setdict(settings)
    log_scrapy_info(crawler_settings)
    crawler = Crawler(spider_cls, crawler_settings)
    d = crawler.crawl()
    if capture_items:
        crawler.items = _OutputItems()
        crawler.signals.connect(crawler.items.append, signal=signals.item_scraped)
        d.addCallback(lambda _: crawler.items)
    if return_crawler:
        d.addCallback(lambda _: crawler)
    return d
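Both variants assume an _OutputItems collector whose append method can be connected to the item_scraped signal. Scrapy dispatches that signal with keyword arguments (item, response, spider), so a minimal sketch of a compatible collector (the real class is project-specific) might look like:

class _OutputItems(list):
    # Sketch only: keep the scraped item, ignore the other signal kwargs
    def append(self, item, **kwargs):
        super().append(item)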
Example #7
 def __init__(self, settings=None, install_root_handler=True):
     super().__init__(settings)
     configure_logging(self.settings, install_root_handler)
     log_scrapy_info(self.settings)
     self._initialized_reactor = False
Example #8
 def __init__(self, settings=None):
     super(CrawlerProcess, self).__init__(settings)
     install_shutdown_handlers(self._signal_shutdown)
     configure_logging(self.settings)
     log_scrapy_info(self.settings)
Example #9
 def __init__(self, settings=None, install_root_handler=False):
     crawler.CrawlerRunner.__init__(self, settings)
     install_shutdown_handlers(self._signal_shutdown)
     configure_logging(self.settings, install_root_handler=install_root_handler)
     log_scrapy_info(self.settings)
Example #10
 def __init__(self, settings=None, install_root_handler=True):
     super().__init__(settings)
     install_shutdown_handlers(self._signal_shutdown)
     configure_logging(self.settings, install_root_handler)
     log_scrapy_info(self.settings)
Example #11
 def __init__(self, settings=None):
     super(CustomCrawler, self).__init__(settings)
     configure_logging(self.settings)
     log_scrapy_info(self.settings)
     pdb.set_trace()  # pause execution and drop into the interactive debugger
Example #12
 def __init__(self, settings=None, install_root_handler=True):
     print("sssssssssssssssssssssssssssssssssssssssssss")
     super(CrawlerProcess, self).__init__(settings)
     install_shutdown_handlers(self._signal_shutdown)
     configure_logging(self.settings, install_root_handler)
     log_scrapy_info(self.settings)
Example #13
 def __init__(self, settings=None, install_root_handler=True):
     super(CrawlerProcess, self).__init__(settings)
     install_shutdown_handlers(
         self._signal_shutdown)  # Watch for keyboard interrupts and dispatch the corresponding signal handlers
     configure_logging(self.settings, install_root_handler)
     log_scrapy_info(self.settings)
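As the comment suggests, install_shutdown_handlers wires OS-level signals to a handler on the process. A rough sketch of what such a helper typically does; this is an illustration, not Scrapy's actual implementation (which also coordinates with the Twisted reactor):

import signal

def install_shutdown_handlers(handler):
    # Route Ctrl-C (SIGINT) and termination requests (SIGTERM) to the handler
    signal.signal(signal.SIGINT, handler)
    signal.signal(signal.SIGTERM, handler)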