Exemple #1
0
    async def _download_request_page(self, request: Request, spider: Spider,
                                     page: Page) -> Response:
        """Render ``request`` in ``page`` and wrap the result in a response.

        The page is navigated to ``request.url``; every ``PageCoroutine``
        found under ``request.meta["pyppeteer_page_coroutines"]`` (a sequence,
        or the values of a dict) is then awaited in order, and the final page
        content becomes the response body.

        When the request callback has a parameter annotated with
        ``pyppeteer.page.Page`` the page is handed over through
        ``request.cb_kwargs`` and left open; otherwise it is closed here.

        Args:
            request: The request being downloaded. Its ``meta`` receives
                ``download_latency`` and its ``cb_kwargs`` may receive the
                page.
            spider: The running spider; ``spider.parse`` is the callback
                inspected when the request has none.
            page: The pyppeteer page used for rendering.

        Returns:
            A response built from the final page URL, the status and headers
            of the navigation response and the UTF-8 encoded page content,
            flagged with ``"pyppeteer"``.

        Raises:
            Exception: Any error raised while driving the page is re-raised
                unchanged, after the page has been closed.
        """
        self.stats.inc_value("pyppeteer/page_count")
        try:
            if self.navigation_timeout is not None:
                page.setDefaultNavigationTimeout(self.navigation_timeout)
            await page.setRequestInterception(True)
            page.on(
                "request",
                partial(_request_handler, scrapy_request=request,
                        stats=self.stats))
            page.on("response", partial(_response_handler, stats=self.stats))

            start_time = time()
            response = await page.goto(request.url)

            page_coroutines = request.meta.get("pyppeteer_page_coroutines") or ()
            if isinstance(page_coroutines, dict):
                page_coroutines = page_coroutines.values()
            for pc in page_coroutines:
                # Entries that are not PageCoroutine instances are ignored.
                if not isinstance(pc, PageCoroutine):
                    continue
                method = getattr(page, pc.method)

                # Fall back to the configured timeout only when the caller
                # supplied none. The check is "is None" rather than
                # truthiness so an explicit ``timeout=0`` is passed through
                # untouched instead of being replaced by the default.
                if (self.page_coroutine_timeout is not None
                        and pc.kwargs.get("timeout") is None):
                    pc.kwargs["timeout"] = self.page_coroutine_timeout

                if isinstance(pc, NavigationPageCoroutine):
                    # Start waiting for the navigation together with the
                    # action that triggers it.
                    await asyncio.gather(page.waitForNavigation(),
                                         method(*pc.args, **pc.kwargs))
                else:
                    pc.result = await method(*pc.args, **pc.kwargs)

            body = (await page.content()).encode("utf8")
            request.meta["download_latency"] = time() - start_time

            # Hand the page to the callback if it asks for one through a
            # ``pyppeteer.page.Page`` annotation; otherwise close it now.
            callback = request.callback or spider.parse
            annotations = getattr(callback, "__annotations__", {})
            for key, value in annotations.items():
                if value is pyppeteer.page.Page:
                    request.cb_kwargs[key] = page
                    self.stats.inc_value(
                        "pyppeteer/page_count/injected_callback")
                    break
            else:
                await page.close()
                self.stats.inc_value("pyppeteer/page_count/closed")

            headers = Headers(response.headers)
            # The body was re-encoded above, so the encoding advertised by
            # the server is dropped from the headers.
            headers.pop("Content-Encoding", None)
            respcls = responsetypes.from_args(headers=headers,
                                              url=page.url,
                                              body=body)
            return respcls(
                url=page.url,
                status=response.status,
                headers=headers,
                body=body,
                request=request,
                flags=["pyppeteer"],
            )
        except Exception:
            # On failure no response is returned from here, so a page that is
            # still open would otherwise be left open: close it before
            # propagating the error.
            if not page.isClosed():
                await page.close()
                self.stats.inc_value("pyppeteer/page_count/closed")
            raise
Exemple #2
0
    async def __add_page_settings(self, page: Page) -> None:
        """Apply this instance's custom settings to ``page``.

        Depending on the instance attributes this sets the default navigation
        timeout, blocks URLs, disables the cache, registers JavaScript to be
        evaluated on every new document (user scripts plus the bundled
        ``automation_detection`` and, in headless mode, ``headless_detection``
        scripts) and aborts requests whose resource type is listed in
        ``self.request_abort_types``. All asynchronous settings are awaited
        together before returning.

        Args:
            page: The pyppeteer page to configure.
        """
        # Change the default maximum navigation timeout.
        if self.default_nav_timeout:
            page.setDefaultNavigationTimeout(self.default_nav_timeout)
        tasks = []
        # Blocks URLs from loading.
        if self.blocked_urls:
            self.logger.info(f"Adding {len(self.blocked_urls)} blocked urls")
            tasks.append(
                page._client.send('Network.setBlockedURLs', {
                    'urls': self.blocked_urls,
                }))
        # Disable cache for each request.
        if self.disable_cache:
            self.logger.info("Setting cache disabled.")
            tasks.append(page.setCacheEnabled(False))
        # Add JavaScript function(s) that will be invoked whenever the page is navigated.
        if self.js_injection_scripts:
            self.logger.info(
                f"Adding {len(self.js_injection_scripts)} JavaScript injection scripts"
            )
            for script in self.js_injection_scripts:
                tasks.append(page.evaluateOnNewDocument(script))
        package_dir = Path(__file__).parent
        # Add JavaScript functions to prevent automation detection.
        for f in package_dir.joinpath('automation_detection').glob("*.js"):
            self.logger.info(
                f"(page {page}) Adding automation detection prevention script: {f.name}"
            )
            # Decode explicitly as UTF-8 so the script text does not depend
            # on the platform's locale encoding.
            tasks.append(
                page.evaluateOnNewDocument(f.read_text(encoding="utf-8")))
        # Add JavaScript functions to prevent detection of headless mode.
        if self.headless:
            for f in package_dir.joinpath('headless_detection').glob("*.js"):
                self.logger.info(
                    f"(page {page}) Adding headless detection prevention script: {f.name}"
                )
                tasks.append(
                    page.evaluateOnNewDocument(f.read_text(encoding="utf-8")))
        # Intercept all requests and only allow requests for types not in self.request_abort_types.
        if self.request_abort_types:
            self.logger.info(
                f"Setting request interception for {self.request_abort_types}")
            tasks.append(page.setRequestInterception(True))

            async def block_type(request):
                if request.resourceType in self.request_abort_types:
                    await request.abort()
                else:
                    await request.continue_()

            # The event loop only keeps weak references to tasks, so hold a
            # strong reference to each handler task until it has finished.
            running = set()

            def schedule_block_type(request):
                task = asyncio.create_task(block_type(request))
                running.add(task)
                task.add_done_callback(running.discard)
                return task

            page.on('request', schedule_block_type)
        await asyncio.gather(*tasks)
Exemple #3
0
    async def _add_page_settings(self, page: Page) -> None:
        """Apply the launch options of the page's browser to ``page``.

        The options are read from
        ``self.browsers[page.browser]['launch_options']``. Recognised keys:
        ``defaultNavigationTimeout``, ``blockedURLs``, ``setCacheEnabled``,
        ``evaluateOnNewDocument`` (iterable of scripts), ``requestAbortTypes``
        (resource types to abort) and ``blockRedirects`` (abort navigation
        requests that have a redirect chain). ``self.set_stealth(page)`` is
        always awaited along with the other settings.

        Args:
            page: The pyppeteer page to configure.
        """
        # launch options for this page.
        launch_options = self.browsers[page.browser]['launch_options']
        # set the default maximum navigation time.
        if 'defaultNavigationTimeout' in launch_options:
            page.setDefaultNavigationTimeout(
                launch_options['defaultNavigationTimeout'])
        tasks = [self.set_stealth(page)]
        # blocks URLs from loading.
        if 'blockedURLs' in launch_options:
            tasks.append(
                self.set_blocked_urls(page, launch_options['blockedURLs']))
        # disable cache for each request.
        if 'setCacheEnabled' in launch_options:
            tasks.append(
                page.setCacheEnabled(launch_options['setCacheEnabled']))
        # add JavaScript function(s) that will be invoked whenever the page is navigated.
        for script in launch_options.get('evaluateOnNewDocument', []):
            tasks.append(page.evaluateOnNewDocument(script))
        # intercept all requests; abort those whose type is in
        # request_abort_types and, if requested, redirected navigations.
        request_abort_types = launch_options.get('requestAbortTypes') or ()
        # Interception is needed for either feature. Previously the handler
        # was only installed when 'requestAbortTypes' was set, so
        # 'blockRedirects' on its own had no effect.
        if request_abort_types or launch_options.get('blockRedirects', False):
            # enable request interception.
            tasks.append(page.setRequestInterception(True))

            async def block_type(request: Request):
                # condition(s) where requests should be aborted.
                if request.resourceType in request_abort_types:
                    await request.abort()
                elif launch_options.get(
                        'blockRedirects',
                        False) and request.isNavigationRequest() and len(
                            request.redirectChain):
                    await request.abort()
                else:
                    await request.continue_()

            # The event loop only keeps weak references to tasks, so hold a
            # strong reference to each handler task until it has finished.
            running = set()

            def schedule_block_type(request):
                task = asyncio.create_task(block_type(request))
                running.add(task)
                task.add_done_callback(running.discard)
                return task

            page.on('request', schedule_block_type)
        await asyncio.gather(*tasks)
Exemple #4
0
    async def _add_page_settings(self, page: Page) -> None:
        """Apply the launch options of the page's browser to ``page``.

        The bundled ``stealth.min.js`` is always registered to be evaluated
        on every new document. The remaining settings come from
        ``self.browsers[page.browser]['launch_options']``:
        ``defaultNavigationTimeout``, ``blockedURLs``, ``setCacheEnabled``,
        ``evaluateOnNewDocument`` (iterable of scripts) and
        ``requestAbortTypes`` (resource types whose requests are aborted).

        Args:
            page: The pyppeteer page to configure.
        """
        # launch options for this page.
        launch_options = self.browsers[page.browser]['launch_options']
        # set the default maximum navigation time.
        if 'defaultNavigationTimeout' in launch_options:
            page.setDefaultNavigationTimeout(
                launch_options['defaultNavigationTimeout'])
        # blocks URLs from loading.
        if 'blockedURLs' in launch_options:
            await page._client.send('Network.setBlockedURLs',
                                    {'urls': launch_options['blockedURLs']})
        # add JavaScript functions to prevent automation detection.
        # The coroutines below are only created after the await above, so a
        # failure there cannot leave an already-created coroutine unawaited.
        # The script is decoded explicitly as UTF-8 so its text does not
        # depend on the platform's locale encoding.
        stealth_source = Path(__file__).parent.joinpath(
            'stealth.min.js').read_text(encoding='utf-8')
        tasks = [page.evaluateOnNewDocument(f"() => {{{stealth_source}}}")]
        # disable cache for each request.
        if 'setCacheEnabled' in launch_options:
            tasks.append(
                page.setCacheEnabled(launch_options['setCacheEnabled']))
        # add JavaScript function(s) that will be invoked whenever the page is navigated.
        for script in launch_options.get('evaluateOnNewDocument', []):
            tasks.append(page.evaluateOnNewDocument(script))
        # intercept all requests and only allow requests for types not in request_abort_types.
        request_abort_types = launch_options.get('requestAbortTypes')
        if request_abort_types:
            tasks.append(page.setRequestInterception(True))

            async def block_type(request):
                if request.resourceType in request_abort_types:
                    await request.abort()
                else:
                    await request.continue_()

            # The event loop only keeps weak references to tasks, so hold a
            # strong reference to each handler task until it has finished.
            running = set()

            def schedule_block_type(request):
                task = asyncio.create_task(block_type(request))
                running.add(task)
                task.add_done_callback(running.discard)
                return task

            page.on('request', schedule_block_type)
        await asyncio.gather(*tasks)