Example 1
    def __init__(self, spidercls, settings=None, init_reactor: bool = False):
        if isinstance(spidercls, Spider):
            raise ValueError(
                'The spidercls argument must be a class, not an object')

        if isinstance(settings, dict) or settings is None:
            settings = Settings(settings)

        self.spidercls = spidercls
        self.settings = settings.copy()
        self.spidercls.update_settings(self.settings)

        self.signals = SignalManager(self)

        self.stats = load_object(self.settings['STATS_CLASS'])(self)

        handler = LogCounterHandler(self, level=self.settings.get('LOG_LEVEL'))
        logging.root.addHandler(handler)

        d = dict(overridden_settings(self.settings))
        logger.info("Overridden settings:\n%(settings)s",
                    {'settings': pprint.pformat(d)})

        if get_scrapy_root_handler() is not None:
            # scrapy root handler already installed: update it with new settings
            install_scrapy_root_handler(self.settings)
        # lambda is assigned to Crawler attribute because this way it is not
        # garbage collected after leaving __init__ scope
        self.__remove_handler = lambda: logging.root.removeHandler(handler)
        self.signals.connect(self.__remove_handler, signals.engine_stopped)

        lf_cls = load_object(self.settings['LOG_FORMATTER'])
        self.logformatter = lf_cls.from_crawler(self)

        self.request_fingerprinter = create_instance(
            load_object(self.settings['REQUEST_FINGERPRINTER_CLASS']),
            settings=self.settings,
            crawler=self,
        )

        reactor_class = self.settings.get("TWISTED_REACTOR")
        if init_reactor:
            # this needs to be done after the spider settings are merged,
            # but before something imports twisted.internet.reactor
            if reactor_class:
                install_reactor(reactor_class,
                                self.settings["ASYNCIO_EVENT_LOOP"])
            else:
                from twisted.internet import reactor  # noqa: F401
            log_reactor_info()
        if reactor_class:
            verify_installed_reactor(reactor_class)

        self.extensions = ExtensionManager.from_crawler(self)

        self.settings.freeze()
        self.crawling = False
        self.spider = None
        self.engine = None
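The TWISTED_REACTOR setting drives the install/verify branch above. A minimal sketch of selecting the asyncio reactor from user code (CrawlerProcess passes init_reactor=True when it creates its first Crawler, so the constructor above installs the reactor before anything imports twisted.internet.reactor):

    from scrapy.crawler import CrawlerProcess

    process = CrawlerProcess(settings={
        "TWISTED_REACTOR":
            "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    })
    # process.crawl(MySpider)  # MySpider is a placeholder spider class
    # process.start()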
Example 2

    def __init__(self, settings: Settings) -> None:
        # Verify that AsyncioSelectorReactor is installed;
        # raises an exception otherwise
        verify_installed_reactor(
            "twisted.internet.asyncioreactor.AsyncioSelectorReactor")
        # Whether to start the browser in headless mode
        headless = settings.get('PYPPETEER_HEADLESS', True)
        # Block on the event loop so the pyppeteer browser exists
        # before any request is processed
        loop = asyncio.get_event_loop()
        loop.run_until_complete(self._instantiate_browser(headless))
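The _instantiate_browser coroutine is referenced but not shown; a plausible sketch of it, assuming pyppeteer and a _browser attribute:

    async def _instantiate_browser(self, headless: bool) -> None:
        # Hypothetical helper: launch one shared pyppeteer browser
        self._browser = await pyppeteer.launch(headless=headless)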
Example 3
    def __init__(self, settings: Settings):
        verify_installed_reactor(
            "twisted.internet.asyncioreactor.AsyncioSelectorReactor")
        self._browser: Optional[Browser] = None
        self._browser_is_launching: Optional[bool] = False
        self._launch_options = settings.getdict(
            'PYPPETEER_LAUNCH_OPTIONS') or {}
        # getint() returns 0 for unset settings, so `or None` maps a
        # missing value to None
        self._navigation_timeout: Optional[int] = settings.getint(
            "PYPPETEER_NAVIGATION_TIMEOUT") or None
        self._concurrent_requests: Optional[int] = settings.getint(
            'CONCURRENT_REQUESTS') or None
Example 4
    def __init__(self, crawler: Crawler) -> None:
        settings = crawler.settings
        super().__init__(settings=settings, crawler=crawler)
        verify_installed_reactor(
            "twisted.internet.asyncioreactor.AsyncioSelectorReactor")
        crawler.signals.connect(self._engine_started, signals.engine_started)
        self.stats = crawler.stats

        # read settings
        self.launch_options = settings.getdict(
            "PLAYWRIGHT_LAUNCH_OPTIONS") or {}
        self.context_args = settings.getdict("PLAYWRIGHT_CONTEXT_ARGS") or {}
        self.default_navigation_timeout = (
            settings.getint("PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT") or None)
        if settings.get("PLAYWRIGHT_BROWSER_TYPE"):
            self.browser_type = settings["PLAYWRIGHT_BROWSER_TYPE"]
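Download handlers like this one are enabled through Scrapy's DOWNLOAD_HANDLERS setting together with the matching reactor; a sketch, assuming the handler above is scrapy-playwright's:

    # settings.py
    DOWNLOAD_HANDLERS = {
        "http": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
        "https": "scrapy_playwright.handler.ScrapyPlaywrightDownloadHandler",
    }
    TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"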
Example 5
    def __init__(self, crawler: Crawler) -> None:
        super().__init__(settings=crawler.settings, crawler=crawler)
        verify_installed_reactor(
            "twisted.internet.asyncioreactor.AsyncioSelectorReactor")
        crawler.signals.connect(self._launch_browser_signal_handler,
                                signals.engine_started)
        self.stats = crawler.stats
        self.navigation_timeout: Optional[int] = None
        if crawler.settings.get("PYPPETEER_NAVIGATION_TIMEOUT"):
            self.navigation_timeout = crawler.settings.getint(
                "PYPPETEER_NAVIGATION_TIMEOUT")
        self.browser: Optional[pyppeteer.browser.Browser] = None
        self.launch_options: dict = crawler.settings.getdict(
            "PYPPETEER_LAUNCH_OPTIONS") or {}
        # Fall back to the Chromium bundled with pyppeteer when no
        # executable is configured explicitly
        if ("executablePath" not in self.launch_options
                and Path(pyppeteer.executablePath()).is_file()):
            self.launch_options["executablePath"] = pyppeteer.executablePath()
        logger.info("Browser launch options: %s", self.launch_options)
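The _launch_browser_signal_handler connected to engine_started above is not shown; a minimal sketch of what it presumably does:

    async def _launch_browser_signal_handler(self) -> None:
        # Hypothetical handler: launch the shared browser once the
        # Scrapy engine has started
        self.browser = await pyppeteer.launch(**self.launch_options)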
Example 6
    def __init__(self, crawler: Crawler) -> None:
        super().__init__(settings=crawler.settings, crawler=crawler)
        verify_installed_reactor(
            "twisted.internet.asyncioreactor.AsyncioSelectorReactor")
        crawler.signals.connect(self._engine_started, signals.engine_started)
        self.stats = crawler.stats

        self.browser_type: str = crawler.settings.get(
            "PLAYWRIGHT_BROWSER_TYPE") or "chromium"
        self.launch_options: dict = crawler.settings.getdict(
            "PLAYWRIGHT_LAUNCH_OPTIONS") or {}

        self.default_navigation_timeout: Optional[float] = None
        if "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT" in crawler.settings:
            with suppress(TypeError, ValueError):
                self.default_navigation_timeout = float(
                    crawler.settings.get(
                        "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT"))

        if crawler.settings.get("PLAYWRIGHT_PROCESS_REQUEST_HEADERS"):
            self.process_request_headers = load_object(
                crawler.settings["PLAYWRIGHT_PROCESS_REQUEST_HEADERS"])
        else:
            self.process_request_headers = use_scrapy_headers

        default_context_kwargs: dict = {}
        if "PLAYWRIGHT_CONTEXT_ARGS" in crawler.settings:
            default_context_kwargs = crawler.settings.getdict(
                "PLAYWRIGHT_CONTEXT_ARGS")
            warnings.warn(
                "The PLAYWRIGHT_CONTEXT_ARGS setting is deprecated, please use"
                " PLAYWRIGHT_CONTEXTS instead. Keyword arguments defined in"
                " PLAYWRIGHT_CONTEXT_ARGS will be used when creating the 'default' context",
                category=DeprecationWarning,
                stacklevel=2,
            )
        self.context_kwargs: defaultdict = defaultdict(dict)
        for name, kwargs in (crawler.settings.getdict("PLAYWRIGHT_CONTEXTS")
                             or {}).items():
            if name == "default":
                self.context_kwargs[name] = default_context_kwargs
            self.context_kwargs[name].update(kwargs)
        if "default" not in self.context_kwargs and default_context_kwargs:
            self.context_kwargs["default"] = default_context_kwargs
Example 7
    def _handle_twisted_reactor(self):
        if self.settings.get("TWISTED_REACTOR"):
            verify_installed_reactor(self.settings["TWISTED_REACTOR"])
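Every example above assumes the asyncio reactor is in place before the crawl starts. Outside of a Crawler, the same pair of helpers from scrapy.utils.reactor can be used directly; a minimal sketch:

    from scrapy.utils.reactor import install_reactor, verify_installed_reactor

    install_reactor("twisted.internet.asyncioreactor.AsyncioSelectorReactor")
    # Raises an exception if a different reactor got installed first
    verify_installed_reactor(
        "twisted.internet.asyncioreactor.AsyncioSelectorReactor")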