def on_request_finished(self, page: Page, request: Request):
        """Schedule a task that fills in size fields of the request's HAR entry.

        The entry is looked up in ``self._entries`` under
        ``request.__hash__()``; when none is recorded nothing happens.
        Otherwise a coroutine is scheduled on ``self._loop`` and the resulting
        task is appended to ``self._tasks``. ``page`` is not used here.
        """
        entry = self._entries.get(request.__hash__())
        if entry is None:
            return

        async def fill_in_sizes():
            response = await request.response()
            if response is None:
                # No response to measure; leave the entry as it is.
                return

            response_headers = await response.all_headers()
            request_headers = await request.all_headers()

            # TODO: the HTTP version, transfer size and body size are not
            # determined yet; the fallback version and -1 are stored instead.
            http_version = FALLBACK_HTTP_VERSION
            response_headers_size = calculate_response_headers_size(
                http_version,
                response.status,
                response.status_text,
                response_headers,
            )

            entry.request.http_version = http_version
            har_response = entry.response
            har_response.body_size = -1
            har_response.headers_size = response_headers_size
            har_response._transfer_size = -1
            entry.request.headers_size = calculate_request_headers_size(
                request.method,
                request.url,
                http_version,
                request_headers,
            )

        task = self._loop.create_task(fill_in_sizes())
        self._tasks.append(task)
Exemple #2
0
 def request_handler(route: Route, pw_request: PwRequest) -> None:
     """
     Override request headers, method and body.

     The request whose URL equals the Scrapy request's URL is continued with
     the Scrapy method, headers and (if any) body. Every other request keeps
     a copy of its own headers, with the Scrapy user agent applied when one
     is set. Request counters are incremented afterwards.
     """
     if pw_request.url != scrapy_request.url:
         forwarded_headers = pw_request.headers.copy()
         overrides = {"headers": forwarded_headers}
         # override user agent, for consistency with other requests
         if scrapy_request.headers.get("user-agent"):
             forwarded_headers["user-agent"] = scrapy_request.headers[
                 "user-agent"].decode("utf-8")
     else:
         method = scrapy_request.method
         scrapy_header_map = {}
         for key, value in scrapy_request.headers.items():
             # only the first value of each header is forwarded
             name = key.decode("utf-8").lower()
             scrapy_header_map[name] = value[0].decode("utf-8")
         overrides = {"method": method, "headers": scrapy_header_map}
         if scrapy_request.body:
             overrides["post_data"] = scrapy_request.body.decode(
                 scrapy_request.encoding)
         # otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
         if browser_type == "firefox":
             scrapy_header_map["host"] = urlparse(pw_request.url).netloc

     # NOTE(review): the task is not referenced anywhere after creation --
     # confirm that it cannot be collected before it completes.
     asyncio.create_task(route.continue_(**overrides))

     # increment stats
     stats.inc_value("playwright/request_method_count/{}".format(
         pw_request.method))
     stats.inc_value("playwright/request_count")
     if not pw_request.is_navigation_request():
         return
     stats.inc_value("playwright/request_count/navigation")
Exemple #3
0
 def _increment_request_stats(self, request: PlaywrightRequest) -> None:
     stats_prefix = "playwright/request_count"
     self.stats.inc_value(stats_prefix)
     self.stats.inc_value(
         f"{stats_prefix}/resource_type/{request.resource_type}")
     self.stats.inc_value(f"{stats_prefix}/method/{request.method}")
     if request.is_navigation_request():
         self.stats.inc_value(f"{stats_prefix}/navigation")
    def on_request(self, page: Page, request: Request) -> None:
        """Create and register a HAR entry for a newly issued request.

        Nothing is recorded when ``page`` has no entry in
        ``self._page_entries``. Otherwise an entry with placeholder response
        data is built, a task updating the content MIME type is scheduled,
        the redirect URL of the redirecting request's entry (if any) is set,
        and the new entry is stored in the log and in ``self._entries`` under
        ``request.__hash__()``.
        """
        page_entry = self._page_entries.get(page)
        if page_entry is None:
            return

        parsed_url = urlparse(request.url)

        page_ref = page_entry.id
        started_at = datetime.now(timezone.utc)

        har_request = dataclasses.har.Request(
            method=request.method,
            url=request.url,
            http_version=FALLBACK_HTTP_VERSION,
            cookies=[],
            headers=[],
            query_string=query_to_query_params(parsed_url.query),
            post_data=post_data_for_har(request),
            headers_size=-1,
            body_size=calculate_request_body_size(request) or 0,
        )

        # Response fields are placeholders (-1, "" and "x-unknown") at this point.
        placeholder_content = dataclasses.har.Content(
            size=-1,
            mime_type="x-unknown",
        )
        har_response = dataclasses.har.Response(
            status=-1,
            status_text="",
            http_version=FALLBACK_HTTP_VERSION,
            cookies=[],
            headers=[],
            content=placeholder_content,
            headers_size=-1,
            body_size=-1,
            redirect_url="",
            _transfer_size=-1,
        )

        har_entry = dataclasses.har.Entry(
            pageref=page_ref,
            started_date_time=started_at,
            time=-1,
            request=har_request,
            response=har_response,
            cache=dataclasses.har.Cache(before_request=None,
                                        after_request=None),
            timings=dataclasses.har.Timings(send=-1, wait=-1, receive=-1),
        )

        async def apply_content_type():
            # Use the request's "content-type" header value when it is
            # truthy; otherwise the current MIME type is written back.
            declared = await request.header_value("content-type")
            content = har_entry.response.content
            content.mime_type = declared or content.mime_type

        mime_task = self._loop.create_task(apply_content_type())
        self._tasks.append(mime_task)

        # Record this request's URL on the entry of the request it was
        # redirected from, when that entry is known.
        previous_request = request.redirected_from
        previous_entry = (
            None if previous_request is None
            else self._entries.get(previous_request.__hash__()))
        if previous_entry is not None:
            previous_entry.response.redirect_url = request.url

        self._log.entries.append(har_entry)
        self._entries[request.__hash__()] = har_entry
 async def handle_request(route: Route, request: Request):
     """Assert the intercepted request's properties, then fulfill it.

     Checks the route/request pairing, the request's URL, headers, method,
     body, navigation flag, resource type, frame and repr, and finally
     fulfills the route with the body "Text".
     """
     # The route must wrap exactly the request handed to this handler.
     assert route.request == request
     assert repr(route) == f"<Route request={route.request}>"

     # Basic request metadata.
     assert "empty.html" in request.url
     assert request.headers["user-agent"]
     assert request.method == "GET"
     assert request.post_data is None
     assert request.is_navigation_request()
     assert request.resource_type == "document"

     # The request is issued from the page's main frame.
     assert request.frame == page.main_frame
     assert request.frame.url == "about:blank"

     expected_request_repr = (
         f"<Request url={request.url!r} method={request.method!r}>")
     assert repr(request) == expected_request_repr

     await route.fulfill(body="Text")
Exemple #6
0
async def use_scrapy_headers(
    browser_type: str,
    playwright_request: PlaywrightRequest,
    scrapy_headers: Headers,
) -> dict:
    """Scrapy headers take precedence over Playwright headers for navigation requests.
    For non-navigation requests, only User-Agent is taken from the Scrapy headers.

    Args:
        browser_type: Browser name; "firefox" gets an explicit ``host`` header
            on navigation requests.
        playwright_request: The intercepted Playwright request. Its headers
            are read once and never modified.
        scrapy_headers: Headers of the Scrapy request; not modified.

    Returns:
        A new dict with the headers to send: the Scrapy headers for
        navigation requests, otherwise a copy of the Playwright headers with
        the user agent replaced when a non-empty one is available.
    """
    headers = scrapy_headers.to_unicode_dict()

    # Take one private copy of the Playwright headers. Writing into
    # ``playwright_request.headers`` and reading the property again only
    # works if it returns the same mutable object on every access, and it
    # would alter the request's own headers as a side effect.
    playwright_headers = dict(playwright_request.headers)

    # Scrapy's user agent has priority over Playwright's
    headers.setdefault("user-agent", playwright_headers.get("user-agent"))

    if playwright_request.is_navigation_request():
        if browser_type == "firefox":
            # otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
            headers["host"] = urlparse(playwright_request.url).netloc
        return headers

    # override user agent, for consistency with other requests
    if headers.get("user-agent"):
        playwright_headers["user-agent"] = headers["user-agent"]
    return playwright_headers
Exemple #7
0
        async def _request_handler(
                route: Route, playwright_request: PlaywrightRequest) -> None:
            """Continue the route with processed headers and, for navigation
            requests, with the overridden method and body."""
            sent_headers = await self.process_request_headers(
                self.browser_type, playwright_request, scrapy_headers)

            # the request that reaches the callback should contain the headers that were sent
            scrapy_headers.clear()
            scrapy_headers.update(sent_headers)

            # Method and body are overridden for navigation requests only.
            navigation_overrides: dict = {}
            if playwright_request.is_navigation_request():
                navigation_overrides["method"] = method
                if body is not None:
                    navigation_overrides["post_data"] = body.decode(encoding)

            await route.continue_(headers=sent_headers, **navigation_overrides)