async def test_page_coroutine_infinite_scroll():
    """Two scroll coroutines on the infinite-scroll page should load 30 quotes."""
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()
    with StaticMockServer() as server:
        coroutines = [
            # first 10 quotes
            PageCoroutine("waitForSelector", "div.quote"),
            PageCoroutine("evaluate", "window.scrollBy(0, 2000)"),
            # 2nd request
            PageCoroutine("waitForSelector", "div.quote:nth-child(11)"),
            PageCoroutine("evaluate", "window.scrollBy(0, 2000)"),
            # 3rd request
            PageCoroutine("waitForSelector", "div.quote:nth-child(21)"),
        ]
        request = Request(
            url=server.urljoin("/scroll.html"),
            meta={"pyppeteer": True, "pyppeteer_page_coroutines": coroutines},
        )
        response = await handler._download_request(request, Spider("foo"))
        assert isinstance(response, HtmlResponse)
        assert response.request is request
        assert response.url == server.urljoin("/scroll.html")
        assert response.status == 200
        assert "pyppeteer" in response.flags
        # 10 initial quotes + 10 per scroll-triggered request
        assert len(response.css("div.quote")) == 30
    await handler.browser.close()
async def test_default_page_coroutine_timeout():
    """A non-navigation wait should abort after PYPPETEER_PAGE_COROUTINE_TIMEOUT ms.

    Bug fix: the original measured `elapsed` and asserted on it *inside* the
    `pytest.raises` block, after the awaited call that raises TimeoutError —
    those lines were unreachable dead code. The timer now starts before the
    context manager and the bound is checked after it, so the assertion runs.
    """
    crawler = get_crawler(settings_dict={"PYPPETEER_PAGE_COROUTINE_TIMEOUT": 1000})
    handler = ScrapyPyppeteerDownloadHandler(crawler)
    await handler._launch_browser()
    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/index.html"),
            meta={
                "pyppeteer": True,
                "pyppeteer_page_coroutines": [
                    # XPath that matches nothing, so the wait can only time out
                    NavigationPageCoroutine("waitForXPath", '//*[@id="test"]/test')
                ],
            },
        )
        start = time()
        with pytest.raises(pyppeteer.errors.TimeoutError):
            await handler._download_request(req, Spider("foo"))
        elapsed = time() - start
        assert 1 < elapsed < 2  # 1000 ms of tolerance
    await handler.browser.close()
async def test_page_coroutine_navigation():
    """Clicking a link via a NavigationPageCoroutine should follow the navigation."""
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()
    with StaticMockServer() as server:
        request = Request(
            url=server.urljoin("/index.html"),
            meta={
                "pyppeteer": True,
                "pyppeteer_page_coroutines": [NavigationPageCoroutine("click", "a.lorem_ipsum")],
            },
        )
        response = await handler._download_request(request, Spider("foo"))
        assert isinstance(response, HtmlResponse)
        assert response.request is request
        # the response reflects the page reached after the click, not the start URL
        assert response.url == server.urljoin("/lorem_ipsum.html")
        assert response.status == 200
        assert "pyppeteer" in response.flags
        assert response.css("title::text").get() == "Lorem Ipsum"
        text = response.css("p::text").get()
        assert text == "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
    await handler.browser.close()
async def test_basic_response():
    """A plain pyppeteer-enabled GET should come back as a flagged HtmlResponse."""
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()
    with StaticMockServer() as server:
        request = Request(server.urljoin("/index.html"), meta={"pyppeteer": True})
        response = await handler._download_request(request, Spider("foo"))
        assert isinstance(response, HtmlResponse)
        assert response.request is request
        assert response.url == request.url
        assert response.status == 200
        assert "pyppeteer" in response.flags
        assert response.css("a::text").getall() == ["Lorem Ipsum", "Infinite Scroll"]
    await handler.browser.close()
async def test_post_request():
    """A FormRequest routed through pyppeteer should POST its form data."""
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()
    with PostMockServer() as server:
        request = FormRequest(
            server.urljoin("/"), meta={"pyppeteer": True}, formdata={"foo": "bar"}
        )
        response = await handler._download_request(request, Spider("foo"))
        assert response.request is request
        assert response.url == request.url
        assert response.status == 200
        assert "pyppeteer" in response.flags
        # the mock server echoes the request body back in the page text
        assert "Request body: foo=bar" in response.text
    await handler.browser.close()
async def test_page_coroutine_screenshot_pdf():
    """The screenshot/pdf coroutines should write valid files and expose the bytes.

    Improvement: the temp files are now opened with `with` context managers, so
    they are closed and removed even when an assertion fails or the download
    raises — the original only closed them on the success path.
    """

    def get_mimetype(file):
        # `file --mime-type` inspects the content, not the filename extension
        return subprocess.run(
            ["file", "--mime-type", "--brief", file.name],
            stdout=subprocess.PIPE,
            universal_newlines=True,
        ).stdout.strip()

    with NamedTemporaryFile(mode="w+b") as png_file, NamedTemporaryFile(mode="w+b") as pdf_file:
        handler = ScrapyPyppeteerDownloadHandler(get_crawler())
        await handler._launch_browser()
        with StaticMockServer() as server:
            req = Request(
                url=server.urljoin("/index.html"),
                meta={
                    "pyppeteer": True,
                    "pyppeteer_page_coroutines": {
                        "png": PageCoroutine(
                            "screenshot", options={"path": png_file.name, "type": "png"}
                        ),
                        "pdf": PageCoroutine("pdf", options={"path": pdf_file.name}),
                    },
                },
            )
            await handler._download_request(req, Spider("foo"))

            assert get_mimetype(png_file) == "image/png"
            assert get_mimetype(pdf_file) == "application/pdf"

            # each coroutine's return value is stored on its .result attribute
            # and must match what was written to disk
            png_file.file.seek(0)
            assert png_file.file.read() == req.meta["pyppeteer_page_coroutines"]["png"].result
            pdf_file.file.seek(0)
            assert pdf_file.file.read() == req.meta["pyppeteer_page_coroutines"]["pdf"].result

        await handler.browser.close()
async def test_page_to_callback():
    """A callback annotated with a Page parameter receives the live page object."""
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()

    async def callback(self, response, page: pyppeteer.page.Page):
        pass

    with StaticMockServer() as server:
        request = Request(server.urljoin("/index.html"), callback, meta={"pyppeteer": True})
        response = await handler._download_request(request, Spider("foo"))
        # the handler injects the still-open page into the callback kwargs
        page = response.request.cb_kwargs["page"]
        assert isinstance(page, pyppeteer.page.Page)
        assert (await page.title()) == "Awesome site"
        await page.close()
    await handler.browser.close()
async def test_page_coroutine_timeout():
    """A click that triggers no navigation should hit PYPPETEER_NAVIGATION_TIMEOUT."""
    crawler = get_crawler(settings_dict={"PYPPETEER_NAVIGATION_TIMEOUT": 1000})
    handler = ScrapyPyppeteerDownloadHandler(crawler)
    await handler._launch_browser()
    with StaticMockServer() as server:
        request = Request(
            url=server.urljoin("/index.html"),
            meta={
                "pyppeteer": True,
                # clicking the heading never navigates, so the wait must time out
                "pyppeteer_page_coroutines": [NavigationPageCoroutine("click", selector="h1")],
            },
        )
        with pytest.raises(pyppeteer.errors.TimeoutError):
            await handler._download_request(request, Spider("foo"))
    await handler.browser.close()
def setUp(self):
    # Start the static site server for the whole test case; the matching
    # tearDown (not visible in this chunk) is expected to call __exit__.
    self.server = StaticMockServer()
    self.server.__enter__()
    self.handler = ScrapyPyppeteerDownloadHandler.from_crawler(get_crawler())
    # NOTE(review): generator-style setUp — presumably driven by twisted's
    # deferred machinery in the test base class; the yielded value appears to
    # be the handler's engine-started hook that launches the browser. Confirm
    # against the enclosing class before relying on this.
    yield self.handler._engine_started_handler()
def setUp(self):
    # Start the static site server for the whole test case; the matching
    # tearDown (not visible in this chunk) is expected to call __exit__.
    self.server = StaticMockServer()
    self.server.__enter__()
    self.handler = ScrapyPyppeteerDownloadHandler.from_crawler(get_crawler())
    # NOTE(review): generator-style setUp — presumably driven by twisted's
    # deferred machinery in the test base class. This variant yields
    # _launch_browser_signal_handler() while the sibling setUp (elsewhere in
    # this file) yields _engine_started_handler(); verify which hook the
    # handler actually exposes in this version of the project.
    yield self.handler._launch_browser_signal_handler()