Example #1
0
    async def _download_request_page(self, request: Request, spider: Spider,
                                     page: Page) -> Response:
        """Navigate ``page`` to ``request.url`` and build a Scrapy response.

        Registers request/response handlers on the page, runs every
        ``PageCoroutine`` found under
        ``request.meta["pyppeteer_page_coroutines"]`` and wraps the page
        content in a response object.

        Args:
            request: The request being downloaded. Its ``meta`` receives a
                ``download_latency`` entry and its ``cb_kwargs`` may receive
                the page (see ``page`` below).
            spider: Provides the fallback ``parse`` callback when the request
                has no callback of its own.
            page: The page used for the navigation. It is closed before
                returning unless the callback annotates one of its
                parameters with ``pyppeteer.page.Page``, in which case the
                page is passed to the callback through ``cb_kwargs``.

        Returns:
            A response whose class is chosen by ``responsetypes.from_args``,
            flagged with ``"pyppeteer"``.
        """
        # NOTE(review): the page is only closed on the success path below; if
        # goto() or a page coroutine raises, cleanup is left to the caller --
        # confirm the caller handles it.
        self.stats.inc_value("pyppeteer/page_count")
        if self.navigation_timeout is not None:
            page.setDefaultNavigationTimeout(self.navigation_timeout)
        await page.setRequestInterception(True)
        page.on(
            "request",
            partial(_request_handler, scrapy_request=request,
                    stats=self.stats))
        page.on("response", partial(_response_handler, stats=self.stats))

        start_time = time()
        # goto() resolves to None (instead of a response object) when
        # navigating to about:blank or to the same URL with a different hash.
        response = await page.goto(request.url)

        page_coroutines = request.meta.get("pyppeteer_page_coroutines") or ()
        if isinstance(page_coroutines, dict):
            page_coroutines = page_coroutines.values()
        for pc in page_coroutines:
            if not isinstance(pc, PageCoroutine):
                continue
            method = getattr(page, pc.method)

            # Apply the handler-wide timeout only when the coroutine did not
            # specify one. An explicit ``timeout=0`` is kept as given rather
            # than being mistaken for "unset".
            if (self.page_coroutine_timeout is not None
                    and pc.kwargs.get("timeout") is None):
                pc.kwargs["timeout"] = self.page_coroutine_timeout

            if isinstance(pc, NavigationPageCoroutine):
                # Start waiting for the navigation together with the action
                # that triggers it.
                await asyncio.gather(page.waitForNavigation(),
                                     method(*pc.args, **pc.kwargs))
            else:
                pc.result = await method(*pc.args, **pc.kwargs)

        body = (await page.content()).encode("utf8")
        request.meta["download_latency"] = time() - start_time

        # Hand the page to the callback if it asks for one through a type
        # annotation; otherwise close it here.
        callback = request.callback or spider.parse
        annotations = getattr(callback, "__annotations__", {})
        for key, value in annotations.items():
            if value is pyppeteer.page.Page:
                request.cb_kwargs[key] = page
                self.stats.inc_value("pyppeteer/page_count/injected_callback")
                break
        else:
            await page.close()
            self.stats.inc_value("pyppeteer/page_count/closed")

        # Without a main-resource response there are no headers or status to
        # report: fall back to empty headers and a 200 status.
        if response is not None:
            headers = Headers(response.headers)
            status = response.status
        else:
            headers = Headers()
            status = 200
        # The body was obtained from page.content(), so any Content-Encoding
        # reported by the server is dropped.
        headers.pop("Content-Encoding", None)
        respcls = responsetypes.from_args(headers=headers,
                                          url=page.url,
                                          body=body)
        return respcls(
            url=page.url,
            status=status,
            headers=headers,
            body=body,
            request=request,
            flags=["pyppeteer"],
        )
Example #2
0
    async def _download_request_with_page(self, request: Request,
                                          spider: Spider,
                                          page: Page) -> Response:
        """Navigate ``page`` to ``request.url`` and build a Scrapy response.

        Runs every ``PageCoroutine`` found under
        ``request.meta["playwright_page_coroutines"]`` (waiting for the load
        state after each one) and wraps the page content in a response
        object.

        Args:
            request: The request being downloaded. Its ``meta`` receives a
                ``download_latency`` entry and, when
                ``meta["playwright_include_page"]`` is truthy, the page under
                ``meta["playwright_page"]``.
            spider: Not used by this method.
            page: The page used for the navigation. It is closed before
                returning unless ``playwright_include_page`` is set.

        Returns:
            A response whose class is chosen by ``responsetypes.from_args``,
            flagged with ``"playwright"``.
        """
        # NOTE(review): the page is only closed on the success path below; if
        # goto() or a page coroutine raises, cleanup is left to the caller --
        # confirm the caller handles it.
        start_time = time()
        # goto() resolves to None (instead of a response object) when
        # navigating to about:blank or to the same URL with a different hash.
        response = await page.goto(request.url)

        page_coroutines = request.meta.get("playwright_page_coroutines") or ()
        if isinstance(page_coroutines, dict):
            page_coroutines = page_coroutines.values()
        for pc in page_coroutines:
            if not isinstance(pc, PageCoroutine):
                continue
            method = getattr(page, pc.method)
            pc.result = await method(*pc.args, **pc.kwargs)
            # Wait for the load state again after each coroutine before
            # running the next one or reading the content.
            await page.wait_for_load_state(
                timeout=self.default_navigation_timeout)

        body = (await page.content()).encode("utf8")
        request.meta["download_latency"] = time() - start_time

        if request.meta.get("playwright_include_page"):
            request.meta["playwright_page"] = page
        else:
            await page.close()
            self.stats.inc_value("playwright/page_count/closed")

        # Without a main-resource response there are no headers or status to
        # report: fall back to empty headers and a 200 status.
        if response is not None:
            headers = Headers(response.headers)
            status = response.status
        else:
            headers = Headers()
            status = 200
        # The body was obtained from page.content(), so any Content-Encoding
        # reported by the server is dropped.
        headers.pop("Content-Encoding", None)
        respcls = responsetypes.from_args(headers=headers,
                                          url=page.url,
                                          body=body)
        return respcls(
            url=page.url,
            status=status,
            headers=headers,
            body=body,
            request=request,
            flags=["playwright"],
        )