async def test_contexts_startup(self):
    settings = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXTS": {
            "first": {
                "storage_state": {
                    "cookies": [
                        {
                            "url": "https://example.org",
                            "name": "foo",
                            "value": "bar",
                        },
                    ],
                },
            },
        },
    }
    async with make_handler(settings) as handler:
        with StaticMockServer() as server:
            meta = {
                "playwright": True,
                "playwright_include_page": True,
                "playwright_context": "first",
            }
            req = Request(server.urljoin("/index.html"), meta=meta)
            resp = await handler._download_request(req, Spider("foo"))

            page = resp.meta["playwright_page"]
            storage_state = await page.context.storage_state()
            await page.context.close()
            await page.close()

            cookie = storage_state["cookies"][0]
            assert cookie["name"] == "foo"
            assert cookie["value"] == "bar"
            assert cookie["domain"] == "example.org"
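# Several tests in this section call a ``make_handler`` helper that is not
# defined here. A minimal sketch of what those tests assume, built only from
# calls that appear elsewhere in this file (``_launch_browser`` and
# ``handler.browser.close()``); the real helper may differ:
from contextlib import asynccontextmanager


@asynccontextmanager
async def make_handler(settings_dict):
    """Yield a launched download handler and close its browser on exit."""
    handler = ScrapyPlaywrightDownloadHandler(get_crawler(settings_dict=settings_dict))
    await handler._launch_browser()
    try:
        yield handler
    finally:
        await handler.browser.close()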
async def test_page_coroutine_pdf(self):
    if self.browser_type != "chromium":
        pytest.skip("PDF generation is supported only in Chromium")
    pdf_file = NamedTemporaryFile(mode="w+b")
    handler = ScrapyPlaywrightDownloadHandler(
        get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
    )
    await handler._launch_browser()
    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/index.html"),
            meta={
                "playwright": True,
                "playwright_page_coroutines": {
                    "pdf": PageCoro("pdf", path=pdf_file.name),
                },
            },
        )
        await handler._download_request(req, Spider("foo"))

        assert get_mimetype(pdf_file) == "application/pdf"

        pdf_file.file.seek(0)
        assert pdf_file.file.read() == req.meta["playwright_page_coroutines"]["pdf"].result

        pdf_file.close()

    await handler.browser.close()
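# ``get_mimetype`` is used above (and in later screenshot tests) without being
# defined at module level in this section. A module-level version mirroring
# the local helper in test_page_coroutine_screenshot_pdf below; it shells out
# to file(1), so a Unix-like environment is assumed:
import subprocess


def get_mimetype(file):
    """Return the MIME type of a file object, as reported by the ``file`` utility."""
    return subprocess.run(
        ["file", "--mime-type", "--brief", file.name],
        stdout=subprocess.PIPE,
        universal_newlines=True,
    ).stdout.strip()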
async def test_page_coroutine_navigation(self):
    handler = ScrapyPlaywrightDownloadHandler(
        get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
    )
    await handler._launch_browser()
    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/index.html"),
            meta={
                "playwright": True,
                "playwright_page_coroutines": [PageCoro("click", "a.lorem_ipsum")],
            },
        )
        resp = await handler._download_request(req, Spider("foo"))

        assert isinstance(resp, HtmlResponse)
        assert resp.request is req
        assert resp.url == server.urljoin("/lorem_ipsum.html")
        assert resp.status == 200
        assert "playwright" in resp.flags
        assert resp.css("title::text").get() == "Lorem Ipsum"
        text = resp.css("p::text").get()
        assert text == "Lorem ipsum dolor sit amet, consectetur adipiscing elit."

    await handler.browser.close()
async def test_page_coroutine_infinite_scroll(self):
    handler = ScrapyPlaywrightDownloadHandler(
        get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
    )
    await handler._launch_browser()
    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/scroll.html"),
            headers={"User-Agent": "scrapy-playwright"},
            meta={
                "playwright": True,
                "playwright_page_coroutines": [
                    PageCoro("wait_for_selector", selector="div.quote"),
                    PageCoro("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
                    PageCoro("wait_for_selector", selector="div.quote:nth-child(11)"),
                    PageCoro("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
                    PageCoro("wait_for_selector", selector="div.quote:nth-child(21)"),
                ],
            },
        )
        resp = await handler._download_request(req, Spider("foo"))

        assert isinstance(resp, HtmlResponse)
        assert resp.request is req
        assert resp.url == server.urljoin("/scroll.html")
        assert resp.status == 200
        assert "playwright" in resp.flags
        assert len(resp.css("div.quote")) == 30

    await handler.browser.close()
async def test_page_coroutine_screenshot(self):
    png_file = NamedTemporaryFile(mode="w+b")
    handler = ScrapyPlaywrightDownloadHandler(
        get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
    )
    await handler._launch_browser()
    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/index.html"),
            meta={
                "playwright": True,
                "playwright_page_coroutines": {
                    "png": PageCoro("screenshot", path=png_file.name, type="png"),
                },
            },
        )
        await handler._download_request(req, Spider("foo"))

        assert get_mimetype(png_file) == "image/png"

        png_file.file.seek(0)
        assert png_file.file.read() == req.meta["playwright_page_coroutines"]["png"].result

        png_file.close()

    await handler.browser.close()
async def test_page_coroutine_infinite_scroll():
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()
    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/scroll.html"),
            meta={
                "pyppeteer": True,
                "pyppeteer_page_coroutines": [
                    PageCoroutine("waitForSelector", "div.quote"),  # first 10 quotes
                    PageCoroutine("evaluate", "window.scrollBy(0, 2000)"),
                    PageCoroutine("waitForSelector", "div.quote:nth-child(11)"),  # 2nd request
                    PageCoroutine("evaluate", "window.scrollBy(0, 2000)"),
                    PageCoroutine("waitForSelector", "div.quote:nth-child(21)"),  # 3rd request
                ],
            },
        )
        resp = await handler._download_request(req, Spider("foo"))

        assert isinstance(resp, HtmlResponse)
        assert resp.request is req
        assert resp.url == server.urljoin("/scroll.html")
        assert resp.status == 200
        assert "pyppeteer" in resp.flags
        assert len(resp.css("div.quote")) == 30

    await handler.browser.close()
async def test_page_coroutine_navigation():
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()
    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/index.html"),
            meta={
                "pyppeteer": True,
                "pyppeteer_page_coroutines": [NavigationPageCoroutine("click", "a.lorem_ipsum")],
            },
        )
        resp = await handler._download_request(req, Spider("foo"))

        assert isinstance(resp, HtmlResponse)
        assert resp.request is req
        assert resp.url == server.urljoin("/lorem_ipsum.html")
        assert resp.status == 200
        assert "pyppeteer" in resp.flags
        assert resp.css("title::text").get() == "Lorem Ipsum"
        text = resp.css("p::text").get()
        assert text == "Lorem ipsum dolor sit amet, consectetur adipiscing elit."

    await handler.browser.close()
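# The pyppeteer tests pass PageCoroutine and NavigationPageCoroutine objects
# that are not defined in this section. A minimal sketch of the containers
# these tests assume: each names a page method to call and stores its return
# value in ``result`` (which the ``.result`` assertions in this file rely on),
# while the Navigation variant signals the handler to also wait for a
# navigation event. The exact implementation in scrapy-pyppeteer may differ:


class PageCoroutine:
    """A method to invoke on a pyppeteer page, plus a slot for its result."""

    def __init__(self, method, *args, **kwargs):
        self.method = method
        self.args = args
        self.kwargs = kwargs
        self.result = None


class NavigationPageCoroutine(PageCoroutine):
    """Marks a coroutine that triggers a navigation, so the download handler
    should also await the resulting navigation before building a response."""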
async def test_context_kwargs(self):
    settings_dict = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXTS": {
            "default": {"java_script_enabled": False},
        },
    }
    async with make_handler(settings_dict) as handler:
        with StaticMockServer() as server:
            req = Request(
                url=server.urljoin("/scroll.html"),
                meta={
                    "playwright": True,
                    "playwright_page_coroutines": [
                        PageCoro("wait_for_selector", selector="div.quote", timeout=1000),
                    ],
                },
            )
            with pytest.raises(TimeoutError):
                await handler._download_request(req, Spider("foo"))
async def test_contexts_dynamic(self):
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with StaticMockServer() as server:
            meta = {
                "playwright": True,
                "playwright_include_page": True,
                "playwright_context": "new",
                "playwright_context_kwargs": {
                    "storage_state": {
                        "cookies": [
                            {
                                "url": "https://example.org",
                                "name": "asdf",
                                "value": "qwerty",
                            },
                        ],
                    },
                },
            }
            req = Request(server.urljoin("/index.html"), meta=meta)
            resp = await handler._download_request(req, Spider("foo"))

            page = resp.meta["playwright_page"]
            storage_state = await page.context.storage_state()
            await page.close()

            cookie = storage_state["cookies"][0]
            assert cookie["name"] == "asdf"
            assert cookie["value"] == "qwerty"
            assert cookie["domain"] == "example.org"
async def test_default_page_coroutine_timeout():
    crawler = get_crawler(settings_dict={"PYPPETEER_PAGE_COROUTINE_TIMEOUT": 1000})
    handler = ScrapyPyppeteerDownloadHandler(crawler)
    await handler._launch_browser()
    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/index.html"),
            meta={
                "pyppeteer": True,
                "pyppeteer_page_coroutines": [
                    NavigationPageCoroutine("waitForXPath", '//*[@id="test"]/test')
                ],
            },
        )
        with pytest.raises(pyppeteer.errors.TimeoutError):
            start = time()
            await handler._download_request(req, Spider("foo"))
        elapsed = time() - start
        assert 1 < elapsed < 2  # 1000 ms of tolerance

    await handler.browser.close()
class MixedRequestsTestCase(TestCase):
    @defer.inlineCallbacks
    def setUp(self):
        self.server = StaticMockServer()
        self.server.__enter__()
        self.handler = ScrapyPyppeteerDownloadHandler.from_crawler(get_crawler())
        yield self.handler._engine_started_handler()

    @defer.inlineCallbacks
    def tearDown(self):
        self.server.__exit__(None, None, None)
        yield self.handler.close()

    def test_regular_request(self):
        def _test(response):
            self.assertIsInstance(response, Response)
            self.assertEqual(response.css("a::text").getall(), ["Lorem Ipsum", "Infinite Scroll"])
            self.assertEqual(response.url, request.url)
            self.assertEqual(response.status, 200)
            self.assertNotIn("pyppeteer", response.flags)

        request = Request(self.server.urljoin("/index.html"))
        return self.handler.download_request(request, Spider("foo")).addCallback(_test)

    def test_pyppeteer_request(self):
        def _test(response):
            self.assertIsInstance(response, Response)
            self.assertEqual(response.css("a::text").getall(), ["Lorem Ipsum", "Infinite Scroll"])
            self.assertEqual(response.url, request.url)
            self.assertEqual(response.status, 200)
            self.assertIn("pyppeteer", response.flags)

        request = Request(self.server.urljoin("/index.html"), meta={"pyppeteer": True})
        return self.handler.download_request(request, Spider("foo")).addCallback(_test)
async def test_basic_response():
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()
    with StaticMockServer() as server:
        req = Request(server.urljoin("/index.html"), meta={"pyppeteer": True})
        resp = await handler._download_request(req, Spider("foo"))

        assert isinstance(resp, HtmlResponse)
        assert resp.request is req
        assert resp.url == req.url
        assert resp.status == 200
        assert "pyppeteer" in resp.flags
        assert resp.css("a::text").getall() == ["Lorem Ipsum", "Infinite Scroll"]

    await handler.browser.close()
async def test_page_coroutine_screenshot_pdf():
    def get_mimetype(file):
        return subprocess.run(
            ["file", "--mime-type", "--brief", file.name],
            stdout=subprocess.PIPE,
            universal_newlines=True,
        ).stdout.strip()

    png_file = NamedTemporaryFile(mode="w+b")
    pdf_file = NamedTemporaryFile(mode="w+b")
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()
    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/index.html"),
            meta={
                "pyppeteer": True,
                "pyppeteer_page_coroutines": {
                    "png": PageCoroutine("screenshot", options={"path": png_file.name, "type": "png"}),
                    "pdf": PageCoroutine("pdf", options={"path": pdf_file.name}),
                },
            },
        )
        await handler._download_request(req, Spider("foo"))

        assert get_mimetype(png_file) == "image/png"
        assert get_mimetype(pdf_file) == "application/pdf"

        png_file.file.seek(0)
        assert png_file.file.read() == req.meta["pyppeteer_page_coroutines"]["png"].result
        pdf_file.file.seek(0)
        assert pdf_file.file.read() == req.meta["pyppeteer_page_coroutines"]["pdf"].result

        png_file.close()
        pdf_file.close()

    await handler.browser.close()
async def test_page_coroutine_timeout():
    crawler = get_crawler(settings_dict={"PYPPETEER_NAVIGATION_TIMEOUT": 1000})
    handler = ScrapyPyppeteerDownloadHandler(crawler)
    await handler._launch_browser()
    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/index.html"),
            meta={
                "pyppeteer": True,
                "pyppeteer_page_coroutines": [NavigationPageCoroutine("click", selector="h1")],
            },
        )
        with pytest.raises(pyppeteer.errors.TimeoutError):
            await handler._download_request(req, Spider("foo"))

    await handler.browser.close()
async def test_page_to_callback():
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()

    async def callback(self, response, page: pyppeteer.page.Page):
        pass

    with StaticMockServer() as server:
        req = Request(server.urljoin("/index.html"), callback, meta={"pyppeteer": True})
        resp = await handler._download_request(req, Spider("foo"))

        page = resp.request.cb_kwargs["page"]
        assert isinstance(page, pyppeteer.page.Page)
        assert (await page.title()) == "Awesome site"

        await page.close()

    await handler.browser.close()
class MixedRequestsTestCase(TestCase):
    """
    This test case ensures that the handler's 'download_request' method works
    as expected, and that non-playwright requests are processed correctly.
    The rest of the tests call '_download_request' directly, which is a
    coroutine ('download_request' returns a Deferred). This stopped working
    under py37 with playwright==1.10, but I couldn't find anything in the
    release notes to explain the change. Also, playwright doesn't currently
    define a __version__ attribute, which would make it possible to enable
    the test for playwright<=1.9.
    """

    @defer.inlineCallbacks
    def setUp(self):
        self.server = StaticMockServer()
        self.server.__enter__()
        self.handler = ScrapyPlaywrightDownloadHandler.from_crawler(get_crawler())
        yield self.handler._engine_started()

    @defer.inlineCallbacks
    def tearDown(self):
        self.server.__exit__(None, None, None)
        yield self.handler.close()

    def test_regular_request(self):
        def _test(response):
            self.assertIsInstance(response, Response)
            self.assertEqual(response.css("a::text").getall(), ["Lorem Ipsum", "Infinite Scroll"])
            self.assertEqual(response.url, request.url)
            self.assertEqual(response.status, 200)
            self.assertNotIn("playwright", response.flags)

        request = Request(self.server.urljoin("/index.html"))
        return self.handler.download_request(request, Spider("foo")).addCallback(_test)

    def test_playwright_request(self):
        def _test(response):
            self.assertIsInstance(response, Response)
            self.assertEqual(response.css("a::text").getall(), ["Lorem Ipsum", "Infinite Scroll"])
            self.assertEqual(response.url, request.url)
            self.assertEqual(response.status, 200)
            self.assertIn("playwright", response.flags)

        request = Request(self.server.urljoin("/index.html"), meta={"playwright": True})
        return self.handler.download_request(request, Spider("foo")).addCallback(_test)
async def test_basic_response(self):
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with StaticMockServer() as server:
            meta = {"playwright": True, "playwright_include_page": True}
            req = Request(server.urljoin("/index.html"), meta=meta)
            resp = await handler._download_request(req, Spider("foo"))

            assert isinstance(resp, HtmlResponse)
            assert resp.request is req
            assert resp.url == req.url
            assert resp.status == 200
            assert "playwright" in resp.flags
            assert resp.css("a::text").getall() == ["Lorem Ipsum", "Infinite Scroll"]

            assert isinstance(resp.meta["playwright_page"], PlaywrightPage)
            assert resp.meta["playwright_page"].url == resp.url

            await resp.meta["playwright_page"].close()
async def test_event_handler_dialog_str(self):
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with StaticMockServer() as server:
            spider = DialogSpider()
            req = Request(
                url=server.urljoin("/index.html"),
                meta={
                    "playwright": True,
                    "playwright_page_coroutines": [
                        PageCoro("evaluate", "alert('foobar');"),
                    ],
                    "playwright_page_event_handlers": {
                        "dialog": "handle_dialog",
                    },
                },
            )
            await handler._download_request(req, spider)

            assert spider.dialog_message == "foobar"
async def test_deprecated_setting(self):
    settings = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXT_ARGS": {
            "storage_state": {
                "cookies": [
                    {
                        "url": "https://example.org",
                        "name": "asdf",
                        "value": "qwerty",
                    },
                ],
            },
        },
    }
    with warnings.catch_warnings(record=True) as warning_list:
        async with make_handler(settings) as handler:
            assert warning_list[0].category is DeprecationWarning
            assert str(warning_list[0].message) == (
                "The PLAYWRIGHT_CONTEXT_ARGS setting is deprecated, please use"
                " PLAYWRIGHT_CONTEXTS instead. Keyword arguments defined in"
                " PLAYWRIGHT_CONTEXT_ARGS will be used when creating the 'default' context"
            )

            with StaticMockServer() as server:
                meta = {
                    "playwright": True,
                    "playwright_include_page": True,
                }
                req = Request(server.urljoin("/index.html"), meta=meta)
                resp = await handler._download_request(req, Spider("foo"))

                page = resp.meta["playwright_page"]
                storage_state = await page.context.storage_state()
                await page.close()

                cookie = storage_state["cookies"][0]
                assert cookie["name"] == "asdf"
                assert cookie["value"] == "qwerty"
                assert cookie["domain"] == "example.org"
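# For reference, the deprecated PLAYWRIGHT_CONTEXT_ARGS setting above is
# equivalent to defining the same keyword arguments under the 'default'
# context of PLAYWRIGHT_CONTEXTS, as the deprecation message states. The
# EQUIVALENT_SETTINGS name and the hardcoded browser type are illustrative:
EQUIVALENT_SETTINGS = {
    "PLAYWRIGHT_BROWSER_TYPE": "chromium",  # stand-in for self.browser_type
    "PLAYWRIGHT_CONTEXTS": {
        "default": {
            "storage_state": {
                "cookies": [
                    {"url": "https://example.org", "name": "asdf", "value": "qwerty"},
                ],
            },
        },
    },
}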
async def test_event_handler_dialog_missing(self, caplog):
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with StaticMockServer() as server:
            spider = DialogSpider()
            req = Request(
                url=server.urljoin("/index.html"),
                meta={
                    "playwright": True,
                    "playwright_page_event_handlers": {
                        "dialog": "missing_method",
                    },
                },
            )
            await handler._download_request(req, spider)

    assert (
        "scrapy-playwright",
        logging.WARNING,
        "Spider 'dialog' does not have a 'missing_method' attribute,"
        " ignoring handler for event 'dialog'",
    ) in caplog.record_tuples
    assert getattr(spider, "dialog_message", None) is None
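# The two dialog tests above use a DialogSpider helper that is not shown in
# this section. A minimal sketch, assuming the spider is named "dialog" (as
# the log message in the previous test implies) and that its handler records
# the dialog message; dismissing the dialog is also an assumption:


class DialogSpider(Spider):
    name = "dialog"

    async def handle_dialog(self, dialog):
        # Record the message so tests can assert on it, then close the dialog
        self.dialog_message = dialog.message
        await dialog.dismiss()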
async def test_page_coroutine_screenshot(self):
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with NamedTemporaryFile(mode="w+b") as png_file:
            with StaticMockServer() as server:
                req = Request(
                    url=server.urljoin("/index.html"),
                    meta={
                        "playwright": True,
                        "playwright_page_coroutines": {
                            "png": PageCoro("screenshot", path=png_file.name, type="png"),
                        },
                    },
                )
                await handler._download_request(req, Spider("foo"))

                png_file.file.seek(0)
                assert png_file.file.read() == req.meta["playwright_page_coroutines"]["png"].result
                assert get_mimetype(png_file) == "image/png"
@defer.inlineCallbacks
def setUp(self):
    self.server = StaticMockServer()
    self.server.__enter__()
    self.handler = ScrapyPyppeteerDownloadHandler.from_crawler(get_crawler())
    yield self.handler._launch_browser_signal_handler()