async def test_timeout_value(self):
    """Check ``default_navigation_timeout`` against the configured setting.

    A handler is built five times: once without the
    ``PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT`` setting, once with it set to
    ``None``, and once for each of ``0``, ``123`` and ``0.5``.
    """
    timeout_key = "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT"

    # Setting absent: the attribute is expected to be None.
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        assert handler.default_navigation_timeout is None

    # Setting explicitly None: same expectation.
    explicit_none = {"PLAYWRIGHT_BROWSER_TYPE": self.browser_type, timeout_key: None}
    async with make_handler(explicit_none) as handler:
        assert handler.default_navigation_timeout is None

    # Numeric values (including zero and a float) are expected back unchanged.
    for configured in (0, 123, 0.5):
        numeric_settings = {
            "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
            timeout_key: configured,
        }
        async with make_handler(numeric_settings) as handler:
            assert handler.default_navigation_timeout == configured
async def test_use_custom_headers(self):
    """Use a custom coroutine as the request-header processing function.

    ``PLAYWRIGHT_PROCESS_REQUEST_HEADERS`` points at a local coroutine that
    always returns ``{"foo": "bar"}``. The JSON found in the response's
    ``<pre>`` element (presumably the headers the server received) must
    contain ``foo`` and must carry neither the context's user agent, the
    request's user agent, nor the extra ``Asdf`` header.
    """

    async def important_headers(*args, **kwargs) -> dict:
        return {"foo": "bar"}

    default_context = {"user_agent": self.browser_type}
    settings_dict = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXTS": {"default": default_context},
        "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": important_headers,
    }
    async with make_handler(settings_dict) as handler:
        with MockServer() as server:
            request = Request(
                url=server.urljoin("/headers"),
                meta={"playwright": True},
                headers={"User-Agent": "foobar", "Asdf": "qwerty"},
            )
            response = await handler._download_request(request, Spider("foo"))
            parsed = json.loads(response.css("pre::text").get())
            # Lower-case the names so the lookups below are case-insensitive.
            seen = {name.lower(): content for name, content in parsed.items()}
            assert seen["foo"] == "bar"
            assert seen.get("user-agent") not in (self.browser_type, "foobar")
            assert "asdf" not in seen
async def test_use_playwright_headers(self):
    """Ignore the headers set on the Scrapy request.

    With ``PLAYWRIGHT_PROCESS_REQUEST_HEADERS`` set to the import path of
    ``use_playwright_headers``, the user agent reported back must be the one
    configured on the default context, and the request's ``Asdf`` header
    must be absent.
    """
    header_processor = "scrapy_playwright.headers.use_playwright_headers"
    settings_dict = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
        "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": header_processor,
    }
    async with make_handler(settings_dict) as handler:
        with MockServer() as server:
            request = Request(
                url=server.urljoin("/headers"),
                meta={"playwright": True},
                headers={"User-Agent": "foobar", "Asdf": "qwerty"},
            )
            response = await handler._download_request(request, Spider("foo"))
            parsed = json.loads(response.css("pre::text").get())
            # Lower-case the names so the lookups below are case-insensitive.
            seen = {name.lower(): content for name, content in parsed.items()}
            assert seen["user-agent"] == self.browser_type
            assert "asdf" not in seen
async def test_contexts_dynamic(self):
    """Request a context by name together with keyword arguments for it.

    The settings declare no contexts; the request meta names the context
    ``"new"`` and passes ``playwright_context_kwargs`` carrying one cookie.
    That cookie is then expected in the storage state of the page's context.
    """
    cookie_spec = {
        "url": "https://example.org",
        "name": "asdf",
        "value": "qwerty",
    }
    request_meta = {
        "playwright": True,
        "playwright_include_page": True,
        "playwright_context": "new",
        "playwright_context_kwargs": {
            "storage_state": {"cookies": [cookie_spec]},
        },
    }
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with StaticMockServer() as server:
            request = Request(server.urljoin("/index.html"), meta=request_meta)
            response = await handler._download_request(request, Spider("foo"))

            page = response.meta["playwright_page"]
            state = await page.context.storage_state()
            # Release the page before asserting on the captured state.
            await page.close()

            stored = state["cookies"][0]
            assert stored["name"] == "asdf"
            assert stored["value"] == "qwerty"
            assert stored["domain"] == "example.org"
async def test_context_kwargs(self):
    """Check that keyword arguments for the default context take effect.

    The default context is configured with ``java_script_enabled: False``.
    Waiting for ``div.quote`` on ``/scroll.html`` with ``timeout=1000`` is
    then expected to raise ``TimeoutError`` (presumably because that page
    builds its quotes with JavaScript -- confirm against the fixture).
    """
    context_kwargs = {"java_script_enabled": False}
    settings_dict = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXTS": {"default": context_kwargs},
    }
    async with make_handler(settings_dict) as handler:
        with StaticMockServer() as server:
            wait_for_quote = PageCoro("wait_for_selector", selector="div.quote", timeout=1000)
            request = Request(
                url=server.urljoin("/scroll.html"),
                meta={
                    "playwright": True,
                    "playwright_page_coroutines": [wait_for_quote],
                },
            )
            with pytest.raises(TimeoutError):
                await handler._download_request(request, Spider("foo"))
async def test_contexts_startup(self):
    """Use a context that is declared in the ``PLAYWRIGHT_CONTEXTS`` setting.

    The setting defines a context called ``"first"`` whose storage state
    holds one cookie. A request that selects that context by name must get a
    page whose context reports the same cookie.
    """
    cookie_spec = {
        "url": "https://example.org",
        "name": "foo",
        "value": "bar",
    }
    settings = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXTS": {
            "first": {"storage_state": {"cookies": [cookie_spec]}},
        },
    }
    request_meta = {
        "playwright": True,
        "playwright_include_page": True,
        "playwright_context": "first",
    }
    async with make_handler(settings) as handler:
        with StaticMockServer() as server:
            request = Request(server.urljoin("/index.html"), meta=request_meta)
            response = await handler._download_request(request, Spider("foo"))

            page = response.meta["playwright_page"]
            state = await page.context.storage_state()
            # Close the context first and the page second, then assert.
            await page.context.close()
            await page.close()

            stored = state["cookies"][0]
            assert stored["name"] == "foo"
            assert stored["value"] == "bar"
            assert stored["domain"] == "example.org"
async def test_timeout(self):
    """Expect ``TimeoutError`` when navigation exceeds the configured timeout.

    The handler runs with ``PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT`` set to
    1000 and requests ``/delay/2`` (presumably an endpoint that answers
    after two seconds -- confirm against ``MockServer``).
    """
    settings_dict = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 1000,
    }
    async with make_handler(settings_dict) as handler:
        with MockServer() as server:
            slow_url = server.urljoin("/delay/2")
            request = Request(slow_url, meta={"playwright": True})
            with pytest.raises(TimeoutError):
                await handler._download_request(request, Spider("foo"))
async def test_post_request(self):
    """Send a form request through the handler and inspect the response.

    The response must reference the original request, keep its URL, have
    status 200, carry the ``"playwright"`` flag and contain the submitted
    form data in its text.
    """
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with MockServer() as server:
            request = FormRequest(
                server.urljoin("/delay/2"),
                meta={"playwright": True},
                formdata={"foo": "bar"},
            )
            response = await handler._download_request(request, Spider("foo"))

            assert response.request is request
            assert response.url == request.url
            assert response.status == 200
            assert "playwright" in response.flags
            assert "Request body: foo=bar" in response.text
async def test_basic_response(self):
    """Download a static page and check the response and the returned page.

    The request asks for the Playwright page to be included in the response
    meta. The response must be an ``HtmlResponse`` bound to the original
    request, with the request's URL, status 200, the ``"playwright"`` flag
    and the two expected link texts. The page object must be a
    ``PlaywrightPage`` whose URL equals the response URL.

    The page is closed in a ``finally`` clause so that a failing assertion
    does not leave it open.
    """
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with StaticMockServer() as server:
            meta = {"playwright": True, "playwright_include_page": True}
            req = Request(server.urljoin("/index.html"), meta=meta)
            resp = await handler._download_request(req, Spider("foo"))

            # ``get`` keeps a missing page from masking the assertions below;
            # the isinstance check still fails for ``None``.
            page = resp.meta.get("playwright_page")
            try:
                assert isinstance(resp, HtmlResponse)
                assert resp.request is req
                assert resp.url == req.url
                assert resp.status == 200
                assert "playwright" in resp.flags
                assert resp.css("a::text").getall() == ["Lorem Ipsum", "Infinite Scroll"]
                assert isinstance(page, PlaywrightPage)
                assert page.url == resp.url
            finally:
                # Always release the page, even when an assertion failed.
                if page is not None:
                    await page.close()
async def test_event_handler_dialog_str(self):
    """Register a page event handler given as the name of a spider method.

    The ``dialog`` event is mapped to the string ``"handle_dialog"`` and the
    page evaluates ``alert('foobar');``. Afterwards the spider's
    ``dialog_message`` attribute must equal ``"foobar"``.
    """
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with StaticMockServer() as server:
            spider = DialogSpider()
            request_meta = {
                "playwright": True,
                "playwright_page_coroutines": [
                    PageCoro("evaluate", "alert('foobar');"),
                ],
                "playwright_page_event_handlers": {"dialog": "handle_dialog"},
            }
            request = Request(url=server.urljoin("/index.html"), meta=request_meta)
            await handler._download_request(request, spider)

            assert spider.dialog_message == "foobar"
async def test_deprecated_setting(self):
    """Check the deprecated ``PLAYWRIGHT_CONTEXT_ARGS`` setting.

    Building the handler with ``PLAYWRIGHT_CONTEXT_ARGS`` must emit a
    ``DeprecationWarning`` with the exact expected message. A request made
    without naming a context must then get a page whose context storage
    state contains the cookie given in the deprecated setting.

    Warnings are recorded with the ``"always"`` filter so the check does not
    depend on the ambient warning filters, and the expected warning is
    searched for among all recorded ones instead of being assumed to come
    first.
    """
    expected_message = (
        "The PLAYWRIGHT_CONTEXT_ARGS setting is deprecated, please use"
        " PLAYWRIGHT_CONTEXTS instead. Keyword arguments defined in"
        " PLAYWRIGHT_CONTEXT_ARGS will be used when creating the 'default' context"
    )
    settings = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXT_ARGS": {
            "storage_state": {
                "cookies": [
                    {
                        "url": "https://example.org",
                        "name": "asdf",
                        "value": "qwerty",
                    },
                ],
            },
        },
    }
    with warnings.catch_warnings(record=True) as warning_list:
        # Record every warning regardless of previously installed filters.
        warnings.simplefilter("always")
        async with make_handler(settings) as handler:
            # Unrelated warnings may be recorded too; look for ours.
            assert any(
                recorded.category is DeprecationWarning
                and str(recorded.message) == expected_message
                for recorded in warning_list
            )

            with StaticMockServer() as server:
                meta = {
                    "playwright": True,
                    "playwright_include_page": True,
                }
                req = Request(server.urljoin("/index.html"), meta=meta)
                resp = await handler._download_request(req, Spider("foo"))

                page = resp.meta["playwright_page"]
                storage_state = await page.context.storage_state()
                # Release the page before asserting on the captured state.
                await page.close()

                cookie = storage_state["cookies"][0]
                assert cookie["name"] == "asdf"
                assert cookie["value"] == "qwerty"
                assert cookie["domain"] == "example.org"
async def test_user_agent(self):
    """Check which user agent is reported for requests with and without one.

    The default context has ``user_agent`` set to the browser type and the
    ``USER_AGENT`` setting is ``None``. A request without a ``User-Agent``
    header must report the context's user agent; a request with one must
    report that header's value.
    """
    settings_dict = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
        "USER_AGENT": None,
    }
    async with make_handler(settings_dict) as handler:
        with MockServer() as server:

            async def reported_headers(**request_kwargs):
                # Download /headers and return the JSON from its <pre>
                # element with lower-cased keys.
                request = Request(
                    url=server.urljoin("/headers"),
                    meta={"playwright": True},
                    **request_kwargs,
                )
                response = await handler._download_request(request, Spider("foo"))
                parsed = json.loads(response.css("pre::text").get())
                return {name.lower(): content for name, content in parsed.items()}

            # if Scrapy's user agent is None, use the one from the Browser
            without_agent = await reported_headers()
            assert without_agent["user-agent"] == self.browser_type

            # if Scrapy's user agent is set to some value, use it
            with_agent = await reported_headers(headers={"User-Agent": "foobar"})
            assert with_agent["user-agent"] == "foobar"
async def test_page_coroutine_screenshot(self):
    """Take a screenshot through a page coroutine and compare both outputs.

    The ``screenshot`` coroutine is given the path of a temporary file. The
    bytes in that file must equal the coroutine's ``result`` and the file's
    detected mimetype must be ``image/png``.
    """
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with NamedTemporaryFile(mode="w+b") as png_file, StaticMockServer() as server:
            screenshot = PageCoro("screenshot", path=png_file.name, type="png")
            request = Request(
                url=server.urljoin("/index.html"),
                meta={
                    "playwright": True,
                    "playwright_page_coroutines": {"png": screenshot},
                },
            )
            await handler._download_request(request, Spider("foo"))

            # Rewind before reading what was written to the path.
            png_file.file.seek(0)
            on_disk = png_file.file.read()
            returned = request.meta["playwright_page_coroutines"]["png"].result
            assert on_disk == returned
            assert get_mimetype(png_file) == "image/png"
async def test_event_handler_dialog_missing(self, caplog):
    """Name an event handler method that the spider does not have.

    The ``dialog`` event is mapped to ``"missing_method"``. The download
    must still complete, a WARNING with the expected text must be logged
    under ``scrapy-playwright``, and the spider must have no
    ``dialog_message`` set.
    """
    expected_record = (
        "scrapy-playwright",
        logging.WARNING,
        "Spider 'dialog' does not have a 'missing_method' attribute,"
        " ignoring handler for event 'dialog'",
    )
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with StaticMockServer() as server:
            spider = DialogSpider()
            request_meta = {
                "playwright": True,
                "playwright_page_event_handlers": {"dialog": "missing_method"},
            }
            request = Request(url=server.urljoin("/index.html"), meta=request_meta)
            await handler._download_request(request, spider)

    assert expected_record in caplog.record_tuples
    assert getattr(spider, "dialog_message", None) is None
async def test_page_coroutine_navigation(self):
    """Click a link through a page coroutine and check the resulting page.

    The request loads ``/index.html`` and clicks ``a.lorem_ipsum``. The
    response must still reference the original request but report the URL,
    title and first paragraph of ``/lorem_ipsum.html``.
    """
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with StaticMockServer() as server:
            click_link = PageCoro("click", "a.lorem_ipsum")
            request = Request(
                url=server.urljoin("/index.html"),
                meta={
                    "playwright": True,
                    "playwright_page_coroutines": [click_link],
                },
            )
            response = await handler._download_request(request, Spider("foo"))

            assert isinstance(response, HtmlResponse)
            assert response.request is request
            assert response.url == server.urljoin("/lorem_ipsum.html")
            assert response.status == 200
            assert "playwright" in response.flags
            assert response.css("title::text").get() == "Lorem Ipsum"
            first_paragraph = response.css("p::text").get()
            assert first_paragraph == "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
async def test_page_coroutine_pdf(self):
    """Generate a PDF through a page coroutine and compare both outputs.

    The test is skipped unless the browser type is ``chromium``. The ``pdf``
    coroutine is given the path of a temporary file. The bytes in that file
    must equal the coroutine's ``result`` and the file's detected mimetype
    must be ``application/pdf``.
    """
    if self.browser_type != "chromium":
        pytest.skip("PDF generation is supported only in Chromium")

    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with NamedTemporaryFile(mode="w+b") as pdf_file, StaticMockServer() as server:
            make_pdf = PageCoro("pdf", path=pdf_file.name)
            request = Request(
                url=server.urljoin("/index.html"),
                meta={
                    "playwright": True,
                    "playwright_page_coroutines": {"pdf": make_pdf},
                },
            )
            await handler._download_request(request, Spider("foo"))

            # Rewind before reading what was written to the path.
            pdf_file.file.seek(0)
            on_disk = pdf_file.file.read()
            returned = request.meta["playwright_page_coroutines"]["pdf"].result
            assert on_disk == returned
            assert get_mimetype(pdf_file) == "application/pdf"
async def test_page_coroutine_infinite_scroll(self):
    """Scroll a page twice through page coroutines and count the quotes.

    The coroutines wait for ``div.quote``, scroll by the document height,
    wait for the 11th child, scroll again and wait for the 21st child. The
    final response must contain 30 ``div.quote`` elements.
    """
    scroll_script = "window.scrollBy(0, document.body.scrollHeight)"
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with StaticMockServer() as server:
            # Each step is its own PageCoro instance, executed in list order.
            steps = [
                PageCoro("wait_for_selector", selector="div.quote"),
                PageCoro("evaluate", scroll_script),
                PageCoro("wait_for_selector", selector="div.quote:nth-child(11)"),
                PageCoro("evaluate", scroll_script),
                PageCoro("wait_for_selector", selector="div.quote:nth-child(21)"),
            ]
            request = Request(
                url=server.urljoin("/scroll.html"),
                headers={"User-Agent": "scrapy-playwright"},
                meta={
                    "playwright": True,
                    "playwright_page_coroutines": steps,
                },
            )
            response = await handler._download_request(request, Spider("foo"))

            assert isinstance(response, HtmlResponse)
            assert response.request is request
            assert response.url == server.urljoin("/scroll.html")
            assert response.status == 200
            assert "playwright" in response.flags
            assert len(response.css("div.quote")) == 30