async def test_contexts_startup(self):
        """Cookies declared in PLAYWRIGHT_CONTEXTS at startup must be present
        in that context's storage state when a request uses the context.
        """
        settings = {
            "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
            "PLAYWRIGHT_CONTEXTS": {
                "first": {
                    "storage_state": {
                        "cookies": [
                            {
                                "url": "https://example.org",
                                "name": "foo",
                                "value": "bar",
                            },
                        ],
                    },
                },
            },
        }
        async with make_handler(settings) as handler:
            with StaticMockServer() as server:
                meta = {
                    "playwright": True,
                    "playwright_include_page": True,
                    "playwright_context": "first",
                }
                req = Request(server.urljoin("/index.html"), meta=meta)
                resp = await handler._download_request(req, Spider("foo"))

            page = resp.meta["playwright_page"]
            storage_state = await page.context.storage_state()
            # Close the page before its context: closing the context first
            # leaves the page handle dangling.
            await page.close()
            await page.context.close()
            cookie = storage_state["cookies"][0]
            assert cookie["name"] == "foo"
            assert cookie["value"] == "bar"
            assert cookie["domain"] == "example.org"
Example #2
0
    async def test_page_coroutine_pdf(self):
        """The 'pdf' page coroutine must write a PDF file and expose the raw
        bytes as the coroutine result (Chromium only).
        """
        if self.browser_type != "chromium":
            pytest.skip("PDF generation is supported only in Chromium")

        handler = ScrapyPlaywrightDownloadHandler(
            get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
        )
        await handler._launch_browser()

        # Context-manage the temp file so it is removed even if an assertion fails.
        with NamedTemporaryFile(mode="w+b") as pdf_file:
            with StaticMockServer() as server:
                req = Request(
                    url=server.urljoin("/index.html"),
                    meta={
                        "playwright": True,
                        "playwright_page_coroutines": {
                            "pdf": PageCoro("pdf", path=pdf_file.name),
                        },
                    },
                )
                await handler._download_request(req, Spider("foo"))

                assert get_mimetype(pdf_file) == "application/pdf"

                pdf_file.file.seek(0)
                assert pdf_file.file.read() == req.meta["playwright_page_coroutines"]["pdf"].result

        await handler.browser.close()
Example #3
0
    async def test_page_coroutine_navigation(self):
        """A 'click' page coroutine that triggers navigation should yield a
        response for the target page, not the originally requested one.
        """
        crawler = get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
        handler = ScrapyPlaywrightDownloadHandler(crawler)
        await handler._launch_browser()

        with StaticMockServer() as server:
            request = Request(
                url=server.urljoin("/index.html"),
                meta={
                    "playwright": True,
                    "playwright_page_coroutines": [PageCoro("click", "a.lorem_ipsum")],
                },
            )
            response = await handler._download_request(request, Spider("foo"))

        assert isinstance(response, HtmlResponse)
        assert response.request is request
        assert response.url == server.urljoin("/lorem_ipsum.html")
        assert response.status == 200
        assert "playwright" in response.flags
        assert response.css("title::text").get() == "Lorem Ipsum"
        assert response.css("p::text").get() == (
            "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
        )

        await handler.browser.close()
Example #4
0
    async def test_page_coroutine_infinite_scroll(self):
        """Two scrolls should trigger two extra content loads: 30 quotes total."""
        handler = ScrapyPlaywrightDownloadHandler(
            get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
        )
        await handler._launch_browser()

        coroutines = [
            PageCoro("wait_for_selector", selector="div.quote"),
            PageCoro("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
            PageCoro("wait_for_selector", selector="div.quote:nth-child(11)"),
            PageCoro("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
            PageCoro("wait_for_selector", selector="div.quote:nth-child(21)"),
        ]
        with StaticMockServer() as server:
            request = Request(
                url=server.urljoin("/scroll.html"),
                headers={"User-Agent": "scrapy-playwright"},
                meta={"playwright": True, "playwright_page_coroutines": coroutines},
            )
            response = await handler._download_request(request, Spider("foo"))

        assert isinstance(response, HtmlResponse)
        assert response.request is request
        assert response.url == server.urljoin("/scroll.html")
        assert response.status == 200
        assert "playwright" in response.flags
        assert len(response.css("div.quote")) == 30

        await handler.browser.close()
Example #5
0
    async def test_page_coroutine_screenshot(self):
        """The 'screenshot' page coroutine must write a PNG file and expose
        the image bytes as the coroutine result.
        """
        handler = ScrapyPlaywrightDownloadHandler(
            get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
        )
        await handler._launch_browser()

        # Context-manage the temp file so it is removed even if an assertion fails.
        with NamedTemporaryFile(mode="w+b") as png_file:
            with StaticMockServer() as server:
                req = Request(
                    url=server.urljoin("/index.html"),
                    meta={
                        "playwright": True,
                        "playwright_page_coroutines": {
                            "png": PageCoro("screenshot", path=png_file.name, type="png"),
                        },
                    },
                )
                await handler._download_request(req, Spider("foo"))

                assert get_mimetype(png_file) == "image/png"

                png_file.file.seek(0)
                assert png_file.file.read() == req.meta["playwright_page_coroutines"]["png"].result

        await handler.browser.close()
async def test_page_coroutine_infinite_scroll():
    """Each scroll loads ten more quotes; after two scrolls there are 30."""
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()

    with StaticMockServer() as server:
        request = Request(
            url=server.urljoin("/scroll.html"),
            meta={
                "pyppeteer": True,
                "pyppeteer_page_coroutines": [
                    PageCoroutine("waitForSelector", "div.quote"),  # first 10 quotes
                    PageCoroutine("evaluate", "window.scrollBy(0, 2000)"),
                    PageCoroutine("waitForSelector", "div.quote:nth-child(11)"),  # 2nd request
                    PageCoroutine("evaluate", "window.scrollBy(0, 2000)"),
                    PageCoroutine("waitForSelector", "div.quote:nth-child(21)"),  # 3rd request
                ],
            },
        )
        response = await handler._download_request(request, Spider("foo"))

    assert isinstance(response, HtmlResponse)
    assert response.request is request
    assert response.url == server.urljoin("/scroll.html")
    assert response.status == 200
    assert "pyppeteer" in response.flags
    assert len(response.css("div.quote")) == 30

    await handler.browser.close()
async def test_page_coroutine_navigation():
    """A navigation coroutine ('click') must yield the target page's response."""
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()

    with StaticMockServer() as server:
        request = Request(
            url=server.urljoin("/index.html"),
            meta={
                "pyppeteer": True,
                "pyppeteer_page_coroutines": [NavigationPageCoroutine("click", "a.lorem_ipsum")],
            },
        )
        response = await handler._download_request(request, Spider("foo"))

    assert isinstance(response, HtmlResponse)
    assert response.request is request
    assert response.url == server.urljoin("/lorem_ipsum.html")
    assert response.status == 200
    assert "pyppeteer" in response.flags
    assert response.css("title::text").get() == "Lorem Ipsum"
    assert response.css("p::text").get() == (
        "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
    )

    await handler.browser.close()
Example #8
0
 async def test_context_kwargs(self):
     """With java_script_enabled=False in the context kwargs, the scroll page
     never renders its quotes, so waiting for the selector times out."""
     settings_dict = {
         "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
         "PLAYWRIGHT_CONTEXTS": {
             "default": {
                 "java_script_enabled": False
             },
         },
     }
     async with make_handler(settings_dict) as handler:
         with StaticMockServer() as server:
             req = Request(
                 url=server.urljoin("/scroll.html"),
                 meta={
                     "playwright":
                     True,
                     "playwright_page_coroutines": [
                         PageCoro("wait_for_selector",
                                  selector="div.quote",
                                  timeout=1000),
                     ],
                 },
             )
             # NOTE(review): presumably playwright's TimeoutError (imported at
             # file top), not the builtin — confirm against the imports.
             with pytest.raises(TimeoutError):
                 await handler._download_request(req, Spider("foo"))
    async def test_contexts_dynamic(self):
        """A context created on the fly through 'playwright_context_kwargs'
        must honor the provided storage_state cookies."""
        async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
            with StaticMockServer() as server:
                meta = {
                    "playwright": True,
                    "playwright_include_page": True,
                    "playwright_context": "new",
                    "playwright_context_kwargs": {
                        "storage_state": {
                            "cookies": [
                                {
                                    "url": "https://example.org",
                                    "name": "asdf",
                                    "value": "qwerty",
                                },
                            ],
                        },
                    },
                }
                request = Request(server.urljoin("/index.html"), meta=meta)
                response = await handler._download_request(request, Spider("foo"))

            page = response.meta["playwright_page"]
            storage_state = await page.context.storage_state()
            await page.close()
            cookie = storage_state["cookies"][0]
            assert cookie["name"] == "asdf"
            assert cookie["value"] == "qwerty"
            assert cookie["domain"] == "example.org"
async def test_default_page_coroutine_timeout():
    """PYPPETEER_PAGE_COROUTINE_TIMEOUT (1000 ms) must abort a never-matching
    waitForXPath in roughly one second."""
    crawler = get_crawler(
        settings_dict={"PYPPETEER_PAGE_COROUTINE_TIMEOUT": 1000})
    handler = ScrapyPyppeteerDownloadHandler(crawler)
    await handler._launch_browser()

    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/index.html"),
            meta={
                "pyppeteer": True,
                "pyppeteer_page_coroutines": [
                    NavigationPageCoroutine("waitForXPath",
                                            '//*[@id="test"]/test')
                ],
            },
        )
        # Start the clock before entering the raises block so only the
        # raising call is inside it.
        start = time()
        with pytest.raises(pyppeteer.errors.TimeoutError):
            await handler._download_request(req, Spider("foo"))
        elapsed = time() - start
        assert 1 < elapsed < 2  # 1000 ms of tolerance

    await handler.browser.close()
class MixedRequestsTestCase(TestCase):
    """Exercise 'download_request' (Deferred API) for both plain and
    pyppeteer-flagged requests against the static mock server."""

    @defer.inlineCallbacks
    def setUp(self):
        self.server = StaticMockServer()
        self.server.__enter__()
        self.handler = ScrapyPyppeteerDownloadHandler.from_crawler(get_crawler())
        yield self.handler._engine_started_handler()

    @defer.inlineCallbacks
    def tearDown(self):
        self.server.__exit__(None, None, None)
        yield self.handler.close()

    def test_regular_request(self):
        request = Request(self.server.urljoin("/index.html"))

        def check(response):
            self.assertIsInstance(response, Response)
            self.assertEqual(
                response.css("a::text").getall(), ["Lorem Ipsum", "Infinite Scroll"]
            )
            self.assertEqual(response.url, request.url)
            self.assertEqual(response.status, 200)
            self.assertNotIn("pyppeteer", response.flags)

        return self.handler.download_request(request, Spider("foo")).addCallback(check)

    def test_pyppeteer_request(self):
        request = Request(self.server.urljoin("/index.html"), meta={"pyppeteer": True})

        def check(response):
            self.assertIsInstance(response, Response)
            self.assertEqual(
                response.css("a::text").getall(), ["Lorem Ipsum", "Infinite Scroll"]
            )
            self.assertEqual(response.url, request.url)
            self.assertEqual(response.status, 200)
            self.assertIn("pyppeteer", response.flags)

        return self.handler.download_request(request, Spider("foo")).addCallback(check)
async def test_basic_response():
    """A minimal pyppeteer request yields a flagged HtmlResponse."""
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()

    with StaticMockServer() as server:
        request = Request(server.urljoin("/index.html"), meta={"pyppeteer": True})
        response = await handler._download_request(request, Spider("foo"))

    assert isinstance(response, HtmlResponse)
    assert response.request is request
    assert response.url == request.url
    assert response.status == 200
    assert "pyppeteer" in response.flags
    assert response.css("a::text").getall() == ["Lorem Ipsum", "Infinite Scroll"]

    await handler.browser.close()
async def test_page_coroutine_screenshot_pdf():
    """'screenshot' and 'pdf' page coroutines must write files with the
    expected MIME types and expose the raw bytes as their results."""
    def get_mimetype(file):
        # 'file --mime-type --brief' prints just the MIME type string.
        return subprocess.run(
            ["file", "--mime-type", "--brief", file.name],
            stdout=subprocess.PIPE,
            universal_newlines=True,
        ).stdout.strip()

    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()

    # Context-manage the temp files so they are removed even if an assertion fails.
    with NamedTemporaryFile(mode="w+b") as png_file, NamedTemporaryFile(mode="w+b") as pdf_file:
        with StaticMockServer() as server:
            req = Request(
                url=server.urljoin("/index.html"),
                meta={
                    "pyppeteer": True,
                    "pyppeteer_page_coroutines": {
                        "png": PageCoroutine(
                            "screenshot", options={"path": png_file.name, "type": "png"}
                        ),
                        "pdf": PageCoroutine("pdf", options={"path": pdf_file.name}),
                    },
                },
            )
            await handler._download_request(req, Spider("foo"))

            assert get_mimetype(png_file) == "image/png"
            assert get_mimetype(pdf_file) == "application/pdf"

            png_file.file.seek(0)
            assert png_file.file.read() == req.meta["pyppeteer_page_coroutines"]["png"].result
            pdf_file.file.seek(0)
            assert pdf_file.file.read() == req.meta["pyppeteer_page_coroutines"]["pdf"].result

    await handler.browser.close()
async def test_page_coroutine_timeout():
    """With PYPPETEER_NAVIGATION_TIMEOUT set, a navigation coroutine that
    never actually navigates (clicking an <h1>) must raise TimeoutError."""
    crawler = get_crawler(settings_dict={"PYPPETEER_NAVIGATION_TIMEOUT": 1000})
    handler = ScrapyPyppeteerDownloadHandler(crawler)
    await handler._launch_browser()

    with StaticMockServer() as server:
        request = Request(
            url=server.urljoin("/index.html"),
            meta={
                "pyppeteer": True,
                "pyppeteer_page_coroutines": [NavigationPageCoroutine("click", selector="h1")],
            },
        )
        with pytest.raises(pyppeteer.errors.TimeoutError):
            await handler._download_request(request, Spider("foo"))

    await handler.browser.close()
async def test_page_to_callback():
    """With 'pyppeteer': True, the page object is injected into the request's
    cb_kwargs under the name of the callback's annotated parameter."""
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()

    # Standalone callback: no 'self' parameter — the original stray 'self'
    # would shift arguments if scrapy ever invoked the callback.
    async def callback(response, page: pyppeteer.page.Page):
        pass

    with StaticMockServer() as server:
        req = Request(server.urljoin("/index.html"),
                      callback,
                      meta={"pyppeteer": True})
        resp = await handler._download_request(req, Spider("foo"))

    page = resp.request.cb_kwargs["page"]
    assert isinstance(page, pyppeteer.page.Page)
    assert (await page.title()) == "Awesome site"

    await page.close()
    await handler.browser.close()
class MixedRequestsTestCase(TestCase):
    """
    This test case ensures the handler's 'download_request' method works as expected, and
    non-playwright requests are processed correctly. The rest of the tests directly call
    '_download_request', which is a coroutine ('download_request' returns a Deferred).
    This stopped working under py37 with playwright==1.10, but I couldn't find anything in the
    release notes to explain the change. Also, playwright doesn't currently define a __version__
    attribute, which would allow to enable the test for playwright<=1.9
    """

    @defer.inlineCallbacks
    def setUp(self):
        self.server = StaticMockServer()
        self.server.__enter__()
        self.handler = ScrapyPlaywrightDownloadHandler.from_crawler(get_crawler())
        yield self.handler._engine_started()

    @defer.inlineCallbacks
    def tearDown(self):
        self.server.__exit__(None, None, None)
        yield self.handler.close()

    def test_regular_request(self):
        request = Request(self.server.urljoin("/index.html"))

        def check(response):
            self.assertIsInstance(response, Response)
            self.assertEqual(
                response.css("a::text").getall(), ["Lorem Ipsum", "Infinite Scroll"]
            )
            self.assertEqual(response.url, request.url)
            self.assertEqual(response.status, 200)
            self.assertNotIn("playwright", response.flags)

        return self.handler.download_request(request, Spider("foo")).addCallback(check)

    def test_playwright_request(self):
        request = Request(self.server.urljoin("/index.html"), meta={"playwright": True})

        def check(response):
            self.assertIsInstance(response, Response)
            self.assertEqual(
                response.css("a::text").getall(), ["Lorem Ipsum", "Infinite Scroll"]
            )
            self.assertEqual(response.url, request.url)
            self.assertEqual(response.status, 200)
            self.assertIn("playwright", response.flags)

        return self.handler.download_request(request, Spider("foo")).addCallback(check)
Example #17
0
    async def test_basic_response(self):
        """A plain playwright request returns an HtmlResponse and, with
        'playwright_include_page', exposes the page object in response.meta."""
        async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
            with StaticMockServer() as server:
                meta = {"playwright": True, "playwright_include_page": True}
                request = Request(server.urljoin("/index.html"), meta=meta)
                response = await handler._download_request(request, Spider("foo"))

            assert isinstance(response, HtmlResponse)
            assert response.request is request
            assert response.url == request.url
            assert response.status == 200
            assert "playwright" in response.flags
            assert response.css("a::text").getall() == ["Lorem Ipsum", "Infinite Scroll"]
            page = response.meta["playwright_page"]
            assert isinstance(page, PlaywrightPage)
            assert page.url == response.url

            await page.close()
Example #18
0
    async def test_event_handler_dialog_str(self):
        """An event handler given as a string resolves to the spider method
        of that name and receives the dialog triggered by alert()."""
        async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
            with StaticMockServer() as server:
                spider = DialogSpider()
                request = Request(
                    url=server.urljoin("/index.html"),
                    meta={
                        "playwright": True,
                        "playwright_page_coroutines": [
                            PageCoro("evaluate", "alert('foobar');"),
                        ],
                        "playwright_page_event_handlers": {"dialog": "handle_dialog"},
                    },
                )
                await handler._download_request(request, spider)

            assert spider.dialog_message == "foobar"
    async def test_deprecated_setting(self):
        """PLAYWRIGHT_CONTEXT_ARGS must emit a DeprecationWarning but still
        be applied to the 'default' context."""
        settings = {
            "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
            "PLAYWRIGHT_CONTEXT_ARGS": {
                "storage_state": {
                    "cookies": [
                        {
                            "url": "https://example.org",
                            "name": "asdf",
                            "value": "qwerty",
                        },
                    ],
                },
            },
        }
        expected_message = (
            "The PLAYWRIGHT_CONTEXT_ARGS setting is deprecated, please use"
            " PLAYWRIGHT_CONTEXTS instead. Keyword arguments defined in"
            " PLAYWRIGHT_CONTEXT_ARGS will be used when creating the 'default' context"
        )
        with warnings.catch_warnings(record=True) as warning_list:
            async with make_handler(settings) as handler:
                assert warning_list[0].category is DeprecationWarning
                assert str(warning_list[0].message) == expected_message

                with StaticMockServer() as server:
                    meta = {"playwright": True, "playwright_include_page": True}
                    request = Request(server.urljoin("/index.html"), meta=meta)
                    response = await handler._download_request(request, Spider("foo"))

                page = response.meta["playwright_page"]
                storage_state = await page.context.storage_state()
                await page.close()
                cookie = storage_state["cookies"][0]
                assert cookie["name"] == "asdf"
                assert cookie["value"] == "qwerty"
                assert cookie["domain"] == "example.org"
Example #20
0
    async def test_event_handler_dialog_missing(self, caplog):
        """An event handler name that is not a spider attribute is ignored
        and a warning is logged."""
        async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
            with StaticMockServer() as server:
                spider = DialogSpider()
                request = Request(
                    url=server.urljoin("/index.html"),
                    meta={
                        "playwright": True,
                        "playwright_page_event_handlers": {"dialog": "missing_method"},
                    },
                )
                await handler._download_request(request, spider)

        expected_record = (
            "scrapy-playwright",
            logging.WARNING,
            "Spider 'dialog' does not have a 'missing_method' attribute,"
            " ignoring handler for event 'dialog'",
        )
        assert expected_record in caplog.record_tuples
        assert getattr(spider, "dialog_message", None) is None
Example #21
0
    async def test_page_coroutine_screenshot(self):
        """The 'screenshot' coroutine writes a PNG file and stores the image
        bytes as its result."""
        async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
            with NamedTemporaryFile(mode="w+b") as png_file:
                with StaticMockServer() as server:
                    request = Request(
                        url=server.urljoin("/index.html"),
                        meta={
                            "playwright": True,
                            "playwright_page_coroutines": {
                                "png": PageCoro("screenshot", path=png_file.name, type="png"),
                            },
                        },
                    )
                    await handler._download_request(request, Spider("foo"))

                png_file.file.seek(0)
                expected = request.meta["playwright_page_coroutines"]["png"].result
                assert png_file.file.read() == expected
                assert get_mimetype(png_file) == "image/png"
Example #22
0
 def setUp(self):
     # Start the static file server and launch the browser via the handler's
     # signal hook before each test.
     # NOTE(review): this generator uses 'yield' but no @defer.inlineCallbacks
     # decorator is visible here — confirm it is applied where this is used.
     self.server = StaticMockServer()
     self.server.__enter__()
     self.handler = ScrapyPyppeteerDownloadHandler.from_crawler(
         get_crawler())
     yield self.handler._launch_browser_signal_handler()
 def setUp(self):
     # Start the static file server and run the engine-started hook before
     # each test.
     # NOTE(review): this generator uses 'yield' but no @defer.inlineCallbacks
     # decorator is visible here — confirm it is applied where this is used.
     self.server = StaticMockServer()
     self.server.__enter__()
     self.handler = ScrapyPyppeteerDownloadHandler.from_crawler(get_crawler())
     yield self.handler._engine_started_handler()