Beispiel #1
0
    def __init__(self, spidercls, settings=None, init_reactor: bool = False):
        """Set up a crawler for ``spidercls`` with its own copy of the settings.

        Args:
            spidercls: The spider class (not an instance) this crawler is
                bound to. Its ``update_settings`` hook is called with the
                crawler's settings copy.
            settings: A settings object, a plain ``dict``, or ``None``. A
                ``dict`` or ``None`` is wrapped in ``Settings``; the crawler
                then works on the result of ``settings.copy()``.
            init_reactor: When true, install the reactor named by the
                ``TWISTED_REACTOR`` setting (or import the default one) during
                construction and log reactor information.

        Raises:
            ValueError: If ``spidercls`` is a ``Spider`` instance rather than
                a class.
        """
        # Reject spider instances up front: only the class itself is accepted.
        if isinstance(spidercls, Spider):
            raise ValueError(
                'The spidercls argument must be a class, not an object')

        # Normalize a dict / None into a Settings object.
        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        # Work on a copy so the spider class can merge its own settings into it.
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)

        # STATS_CLASS names the stats class; it is loaded and instantiated
        # with this crawler as its only argument.
        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        # Attach a LogCounterHandler to the root logger, using LOG_LEVEL as
        # the handler level.
        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)

        # Log whatever overridden_settings() reports for the merged settings.
        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings:\n%(settings)s",
                    {'settings': pprint.pformat(d)})

        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        # Detach the counting handler again when engine_stopped is signalled.
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        # LOG_FORMATTER names the formatter class; build it via from_crawler().
        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)

        # REQUEST_FINGERPRINTER_CLASS names the fingerprinter class; it is
        # built through create_instance() with both settings and crawler.
        self.request_fingerprinter = create_instance(
            load_object(self.settings['REQUEST_FINGERPRINTER_CLASS']),
            settings=self.settings,
            crawler=self,
        )

        reactor_class = self.settings.get("TWISTED_REACTOR")
        if init_reactor:
            # this needs to be done after the spider settings are merged,
            # but before something imports twisted.internet.reactor
            if reactor_class:
                install_reactor(reactor_class,
                                self.settings["ASYNCIO_EVENT_LOOP"])
            else:
                # Importing the module installs Twisted's default reactor;
                # the imported name itself is unused (hence the noqa).
                from twisted.internet import reactor  # noqa: F401
            log_reactor_info()
        # Runs whenever TWISTED_REACTOR is set, independent of init_reactor.
        # NOTE(review): presumably fails if a different reactor is already
        # installed -- confirm against verify_installed_reactor().
        if reactor_class:
            verify_installed_reactor(reactor_class)

        self.extensions = ExtensionManager.from_crawler(self)

        # NOTE(review): freeze() presumably makes the settings read-only from
        # here on -- confirm against the Settings implementation.
        self.settings.freeze()
        # Initial runtime state: not crawling, no spider or engine assigned.
        self.crawling = False
        self.spider = None
        self.engine = None
Beispiel #2
0
 def _handle_twisted_reactor(self):
     """Install the reactor named by ``TWISTED_REACTOR``, then defer to the parent.

     Nothing is installed when the setting is unset or falsy; the parent
     class's handler runs in either case.
     """
     cfg = self.settings
     if cfg.get("TWISTED_REACTOR"):
         reactor_path = cfg["TWISTED_REACTOR"]
         install_reactor(reactor_path)
     super()._handle_twisted_reactor()
Beispiel #3
0
def main():
    """Run Scrapy's command line inside the ``SCRAPY_PROJECT`` environment.

    The asyncio-based Twisted reactor is installed first. The project name is
    then read from the ``SCRAPY_PROJECT`` environment variable; a missing
    variable raises ``KeyError``.
    """
    reactor_path = 'twisted.internet.asyncioreactor.AsyncioSelectorReactor'
    install_reactor(reactor_path)
    with project_environment(os.environ['SCRAPY_PROJECT']):
        # Import only once the project environment is active.
        from scrapy.cmdline import execute
        execute()
Beispiel #4
0
def pytest_configure(config):
    """Install the asyncio Twisted reactor when ``--reactor`` is ``asyncio``.

    Any other value of the ``--reactor`` option leaves the reactor untouched.
    """
    selected_reactor = config.getoption("--reactor")
    if selected_reactor != "asyncio":
        return
    install_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
Beispiel #5
0
    task = BackgroundTask(crawl, query=query)
    message = {'status': f'Crawler was scheduled with {query=}'}
    return JSONResponse(message, status_code=HTTP_202_ACCEPTED, background=task)


# first test with async tasks
# async def crawl(query):
#    await asyncio.sleep(5)
#    print('Crawler finished!')


# raise error.ReactorNotRestartable()
# async def crawl(query):
#    process = CrawlerProcess()
#    process.crawl(QuotesSpider)
#    process.start()
#    print('Crawler finished!')
# Install the asyncio-based Twisted reactor at import time, before crawl()
# below creates a CrawlerProcess.
# NOTE(review): presumably meant to avoid the ReactorNotRestartable error
# mentioned in the commented-out attempt above -- confirm.
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')

async def crawl(query):
    """Run ``QuotesSpider`` in a fresh ``CrawlerProcess`` and report completion.

    ``query`` is only used in the final message; it is not passed on to the
    spider.
    """
    crawler_process = CrawlerProcess()
    crawler_process.crawl(QuotesSpider)
    # NOTE(review): start() presumably runs the reactor and blocks this
    # coroutine; with stop_after_crawl=False the print below may not be
    # reached until the reactor is stopped elsewhere -- confirm.
    crawler_process.start(stop_after_crawl=False)
    print(f'Crawler {query=} finished!')

# Single POST endpoint: requests to /crawl are handled by run_crawler.
routes = [Route('/crawl', endpoint=run_crawler, methods=['POST'])]

# Starlette application built from the route table above.
app = Starlette(routes=routes)
 def test_install_asyncio_reactor(self):
     # Installing the asyncio reactor here is expected to be a no-op; the
     # test passes as long as the call does not raise.
     reactor_path = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
     install_reactor(reactor_path)
Beispiel #7
0
 def _handle_twisted_reactor(self):
     """Install the configured reactor and event loop, then defer to the parent.

     When ``TWISTED_REACTOR`` is set, it is installed together with the value
     of ``ASYNCIO_EVENT_LOOP``. The parent class's handler runs in either case.
     """
     cfg = self.settings
     if cfg.get("TWISTED_REACTOR"):
         reactor_path = cfg["TWISTED_REACTOR"]
         event_loop_path = cfg["ASYNCIO_EVENT_LOOP"]
         install_reactor(reactor_path, event_loop_path)
     super()._handle_twisted_reactor()