def on_request_finished(self, page: Page, request: Request):
    """Schedule async finalization of the HAR entry for a finished request."""
    entry = self._entries.get(request.__hash__())
    if entry is None:
        return

    async def finalize() -> None:
        # The response may be unavailable (e.g. the request failed mid-flight).
        response = await request.response()
        if response is None:
            return
        resp_headers = await response.all_headers()
        req_headers = await request.all_headers()
        # TODO: derive the real protocol version instead of the fallback.
        version = FALLBACK_HTTP_VERSION
        entry.request.http_version = version
        entry.request.headers_size = calculate_request_headers_size(
            request.method, request.url, version, req_headers)
        entry.response.headers_size = calculate_response_headers_size(
            version, response.status, response.status_text, resp_headers)
        # Body/transfer sizes are not computed here; keep the -1 sentinels.
        entry.response.body_size = -1
        entry.response._transfer_size = -1

    self._tasks.append(self._loop.create_task(finalize()))
def request_handler(route: Route, pw_request: PwRequest) -> None:
    """ Override request headers, method and body """
    if pw_request.url == scrapy_request.url:
        # This is the Scrapy request itself: replay its method, headers and
        # (when present) body through Playwright.
        header_map = {
            name.decode("utf-8").lower(): values[0].decode("utf-8")
            for name, values in scrapy_request.headers.items()
        }
        overrides = {"method": scrapy_request.method, "headers": header_map}
        if scrapy_request.body:
            overrides["post_data"] = scrapy_request.body.decode(
                scrapy_request.encoding)
        # otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
        if browser_type == "firefox":
            header_map["host"] = urlparse(pw_request.url).netloc
    else:
        # Subresource request: keep Playwright's own headers.
        overrides = {"headers": pw_request.headers.copy()}
        # override user agent, for consistency with other requests
        scrapy_ua = scrapy_request.headers.get("user-agent")
        if scrapy_ua:
            overrides["headers"]["user-agent"] = scrapy_ua.decode("utf-8")
    # NOTE(review): the task reference is not retained, so it could be
    # garbage-collected before completion — confirm lifetime upstream.
    asyncio.create_task(route.continue_(**overrides))
    # increment stats
    stats.inc_value(f"playwright/request_method_count/{pw_request.method}")
    stats.inc_value("playwright/request_count")
    if pw_request.is_navigation_request():
        stats.inc_value("playwright/request_count/navigation")
def _increment_request_stats(self, request: PlaywrightRequest) -> None: stats_prefix = "playwright/request_count" self.stats.inc_value(stats_prefix) self.stats.inc_value( f"{stats_prefix}/resource_type/{request.resource_type}") self.stats.inc_value(f"{stats_prefix}/method/{request.method}") if request.is_navigation_request(): self.stats.inc_value(f"{stats_prefix}/navigation")
def on_request(self, page: Page, request: Request) -> None:
    # Start a HAR entry for a request issued by a page we are tracking.
    page_entry = self._page_entries.get(page)
    if page_entry is None:
        # Not a tracked page; ignore the request.
        return
    parsed_url = urlparse(request.url)
    # Sentinel values (-1, "", "x-unknown", FALLBACK_HTTP_VERSION) mark fields
    # that are only known once the response is available; presumably they are
    # filled in by the response/finished handlers — confirm against callers.
    har_entry = dataclasses.har.Entry(
        pageref=page_entry.id,
        started_date_time=datetime.now(timezone.utc),
        time=-1,
        request=dataclasses.har.Request(
            method=request.method,
            url=request.url,
            http_version=FALLBACK_HTTP_VERSION,
            cookies=[],
            headers=[],
            query_string=query_to_query_params(parsed_url.query),
            post_data=post_data_for_har(request),
            headers_size=-1,
            # Treat an unknown/None body size as 0 for the request side.
            body_size=calculate_request_body_size(request) or 0,
        ),
        response=dataclasses.har.Response(
            status=-1,
            status_text="",
            http_version=FALLBACK_HTTP_VERSION,
            cookies=[],
            headers=[],
            content=dataclasses.har.Content(
                size=-1,
                mime_type="x-unknown",
            ),
            headers_size=-1,
            body_size=-1,
            redirect_url="",
            _transfer_size=-1,
        ),
        cache=dataclasses.har.Cache(before_request=None, after_request=None),
        timings=dataclasses.har.Timings(send=-1, wait=-1, receive=-1),
    )

    async def update_mime_type_task():
        # Header access is async; refine the placeholder mime type in a task,
        # keeping "x-unknown" when there is no content-type header.
        har_entry.response.content.mime_type = (
            await request.header_value("content-type")
            or har_entry.response.content.mime_type)

    self._tasks.append(self._loop.create_task(update_mime_type_task()))
    # If this request was produced by a redirect, record our URL as the
    # redirect destination on the entry of the request it came from.
    redirected_from_request = request.redirected_from
    if redirected_from_request is not None:
        from_entry = self._entries.get(redirected_from_request.__hash__())
        if from_entry is not None:
            from_entry.response.redirect_url = request.url
    self._log.entries.append(har_entry)
    # Index by request hash so later events can find and update this entry.
    self._entries[request.__hash__()] = har_entry
async def handle_request(route: Route, request: Request):
    """Verify the intercepted request's attributes, then serve a stub body."""
    # Route <-> request linkage and repr.
    assert route.request == request
    assert repr(route) == f"<Route request={route.request}>"
    # Request attributes.
    assert "empty.html" in request.url
    assert request.headers["user-agent"]
    assert request.method == "GET"
    assert request.post_data is None
    assert request.is_navigation_request()
    assert request.resource_type == "document"
    # Frame association.
    assert request.frame == page.main_frame
    assert request.frame.url == "about:blank"
    expected_repr = f"<Request url={request.url!r} method={request.method!r}>"
    assert repr(request) == expected_repr
    await route.fulfill(body="Text")
async def use_scrapy_headers(
    browser_type: str,
    playwright_request: PlaywrightRequest,
    scrapy_headers: Headers,
) -> dict:
    """Scrapy headers take precedence over Playwright headers for navigation requests.
    For non-navigation requests, only User-Agent is taken from the Scrapy headers.

    Args:
        browser_type: Playwright browser name ("chromium", "firefox", ...).
        playwright_request: The request intercepted by Playwright.
        scrapy_headers: Headers from the originating Scrapy request.

    Returns:
        The header mapping to apply to the outgoing request.
    """
    headers = scrapy_headers.to_unicode_dict()
    # Scrapy's user agent has priority over Playwright's. Only fall back to
    # Playwright's value when it exists: unconditionally calling setdefault
    # would insert an invalid None header when neither side has a user agent.
    playwright_user_agent = playwright_request.headers.get("user-agent")
    if playwright_user_agent is not None:
        headers.setdefault("user-agent", playwright_user_agent)
    if playwright_request.is_navigation_request():
        if browser_type == "firefox":
            # otherwise this fails with playwright.helper.Error: NS_ERROR_NET_RESET
            headers["host"] = urlparse(playwright_request.url).netloc
        return headers
    else:
        # override user agent, for consistency with other requests
        if headers.get("user-agent"):
            playwright_request.headers["user-agent"] = headers["user-agent"]
        return playwright_request.headers
async def _request_handler(
        route: Route, playwright_request: PlaywrightRequest) -> None:
    """Override request headers, method and body."""
    sent_headers = await self.process_request_headers(
        self.browser_type, playwright_request, scrapy_headers)
    # the request that reaches the callback should contain the headers that were sent
    scrapy_headers.clear()
    scrapy_headers.update(sent_headers)
    overrides: dict = {"headers": sent_headers}
    if playwright_request.is_navigation_request():
        # Navigation requests additionally replay Scrapy's method and body.
        overrides["method"] = method
        if body is not None:
            overrides["post_data"] = body.decode(encoding)
    await route.continue_(**overrides)