def __init__(self, spidercls, settings=None, init_reactor: bool = False):
    """Initialize the crawler for *spidercls*.

    :param spidercls: a :class:`Spider` subclass (a class, never an instance).
    :param settings: a ``Settings`` object or a plain ``dict`` (or ``None``);
        dicts/``None`` are wrapped in a fresh ``Settings``.
    :param init_reactor: when True, install/verify the Twisted reactor here
        (after spider settings are merged, before anything imports
        ``twisted.internet.reactor``).
    :raises ValueError: if *spidercls* is a Spider instance rather than a class.
    """
    if isinstance(spidercls, Spider):
        raise ValueError(
            'The spidercls argument must be a class, not an object')
    if isinstance(settings, dict) or settings is None:
        settings = Settings(settings)
    self.spidercls = spidercls
    # Work on a copy so the caller's Settings object is not mutated, then let
    # the spider class merge its custom_settings into it.
    self.settings = settings.copy()
    self.spidercls.update_settings(self.settings)
    self.signals = SignalManager(self)
    self.stats = load_object(self.settings['STATS_CLASS'])(self)
    # Count log records per level via a handler on the root logger.
    handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
    logging.root.addHandler(handler)
    d = dict(overridden_settings(self.settings))
    logger.info("Overridden settings:\n%(settings)s",
                {'settings': pprint.pformat(d)})
    if get_scrapy_root_handler() is not None:
        # scrapy root handler already installed: update it with new settings
        install_scrapy_root_handler(self.settings)
    # lambda is assigned to Crawler attribute because this way it is not
    # garbage collected after leaving __init__ scope
    self.__remove_handler = lambda: logging.root.removeHandler(handler)
    self.signals.connect(self.__remove_handler, signals.engine_stopped)
    lf_cls = load_object(self.settings['LOG_FORMATTER'])
    self.logformatter = lf_cls.from_crawler(self)
    self.request_fingerprinter = create_instance(
        load_object(self.settings['REQUEST_FINGERPRINTER_CLASS']),
        settings=self.settings,
        crawler=self,
    )
    reactor_class = self.settings.get("TWISTED_REACTOR")
    if init_reactor:
        # this needs to be done after the spider settings are merged,
        # but before something imports twisted.internet.reactor
        if reactor_class:
            # Pass the configured asyncio event loop alongside the reactor.
            install_reactor(reactor_class, self.settings["ASYNCIO_EVENT_LOOP"])
        else:
            # Importing the reactor installs Twisted's default one.
            from twisted.internet import reactor  # noqa: F401
        log_reactor_info()
    if reactor_class:
        # Even when we did not install it ourselves, make sure the installed
        # reactor matches the TWISTED_REACTOR setting.
        verify_installed_reactor(reactor_class)
    self.extensions = ExtensionManager.from_crawler(self)
    # Freeze settings only after extensions had a chance to read them.
    self.settings.freeze()
    self.crawling = False
    self.spider = None
    self.engine = None
def _handle_twisted_reactor(self):
    """Install the Twisted reactor named by the TWISTED_REACTOR setting,
    then delegate to the parent handler.

    Fix: also forward the ``ASYNCIO_EVENT_LOOP`` setting, as the other
    ``_handle_twisted_reactor`` implementation in this file and
    ``Crawler.__init__`` both do — otherwise a user-configured asyncio
    event loop is silently ignored when the asyncio reactor is installed
    through this path.
    """
    if self.settings.get("TWISTED_REACTOR"):
        install_reactor(self.settings["TWISTED_REACTOR"],
                        self.settings["ASYNCIO_EVENT_LOOP"])
    super()._handle_twisted_reactor()
def main():
    """Run the Scrapy command line for the project named in SCRAPY_PROJECT,
    on the asyncio-based Twisted reactor."""
    # The reactor must be installed before anything imports
    # twisted.internet.reactor, so do it first.
    install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
    with project_environment(os.environ['SCRAPY_PROJECT']):
        from scrapy.cmdline import execute
        execute()
def pytest_configure(config):
    """Pytest hook: install the asyncio Twisted reactor when the test run
    was started with ``--reactor asyncio``."""
    selected_reactor = config.getoption("--reactor")
    if selected_reactor != "asyncio":
        return
    install_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
# NOTE(review): the next three statements look like the tail of a request
# handler (the ``run_crawler`` referenced in ``routes`` below) whose ``def``
# line falls outside this chunk -- indentation restored on that assumption.
    # Schedule the crawl as a Starlette background task and respond right
    # away with 202 Accepted, so the HTTP response does not wait for it.
    task = BackgroundTask(crawl, query=query)
    message = {'status': f'Crawler was scheduled with {query=}'}
    return JSONResponse(message, status_code=HTTP_202_ACCEPTED, background=task)


# first test with async tasks
# async def crawl(query):
#     await asyncio.sleep(5)
#     print('Crawler finished!')
#     raise error.ReactorNotRestartable()

# async def crawl(query):
#     process = CrawlerProcess()
#     process.crawl(QuotesSpider)
#     process.start()
#     print('Crawler finished!')

# Install the asyncio-based Twisted reactor up front, before CrawlerProcess
# (or anything else) imports twisted.internet.reactor.
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')


async def crawl(query):
    """Run the QuotesSpider crawl; executed as the background task above.

    ``stop_after_crawl=False`` leaves the reactor running after the crawl,
    since (per the commented-out experiment above) a stopped Twisted reactor
    cannot be restarted for later requests.
    """
    process = CrawlerProcess()
    process.crawl(QuotesSpider)
    process.start(stop_after_crawl=False)
    print(f'Crawler {query=} finished!')


# Single POST endpoint that schedules a crawl.
routes = [
    Route('/crawl', endpoint=run_crawler, methods=['POST'])
]

app = Starlette(routes=routes)
def test_install_asyncio_reactor(self):
    # Installing the asyncio reactor when it is already the installed one
    # must be a no-op rather than an error.
    install_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
def _handle_twisted_reactor(self):
    """Install the reactor named by TWISTED_REACTOR (together with the
    configured ASYNCIO_EVENT_LOOP), then hand off to the parent handler."""
    settings = self.settings
    if settings.get("TWISTED_REACTOR"):
        install_reactor(settings["TWISTED_REACTOR"],
                        settings["ASYNCIO_EVENT_LOOP"])
    super()._handle_twisted_reactor()