Example #1
    async def test_page_coroutine_navigation(self):
        handler = ScrapyPlaywrightDownloadHandler(
            get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
        )
        await handler._launch_browser()

        with StaticMockServer() as server:
            req = Request(
                url=server.urljoin("/index.html"),
                meta={
                    "playwright": True,
                    "playwright_page_coroutines": [PageCoro("click", "a.lorem_ipsum")],
                },
            )
            resp = await handler._download_request(req, Spider("foo"))

        assert isinstance(resp, HtmlResponse)
        assert resp.request is req
        assert resp.url == server.urljoin("/lorem_ipsum.html")
        assert resp.status == 200
        assert "playwright" in resp.flags
        assert resp.css("title::text").get() == "Lorem Ipsum"
        text = resp.css("p::text").get()
        assert text == "Lorem ipsum dolor sit amet, consectetur adipiscing elit."

        await handler.browser.close()
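The same coroutine-driven navigation can be expressed from a spider instead of calling the handler directly. A minimal sketch, assuming PageCoro is the page-coroutine helper imported under that alias as in the test, and that the download handler is wired up via settings (see the settings sketch after Example #30); the spider name and URL are placeholders:

import scrapy

class LoremSpider(scrapy.Spider):
    name = "lorem"

    def start_requests(self):
        # Same meta keys as in the test: the handler renders the page and
        # applies each page coroutine in order before building the response.
        yield scrapy.Request(
            "https://example.org/index.html",  # placeholder URL
            meta={
                "playwright": True,
                "playwright_page_coroutines": [PageCoro("click", "a.lorem_ipsum")],
            },
        )

    def parse(self, response):
        yield {"title": response.css("title::text").get()}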
Example #2
    async def test_page_coroutine_screenshot(self):
        png_file = NamedTemporaryFile(mode="w+b")
        handler = ScrapyPlaywrightDownloadHandler(
            get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
        )
        await handler._launch_browser()

        with StaticMockServer() as server:
            req = Request(
                url=server.urljoin("/index.html"),
                meta={
                    "playwright": True,
                    "playwright_page_coroutines": {
                        "png": PageCoro("screenshot", path=png_file.name, type="png"),
                    },
                },
            )
            await handler._download_request(req, Spider("foo"))

            assert get_mimetype(png_file) == "image/png"

            png_file.file.seek(0)
            assert png_file.file.read() == req.meta["playwright_page_coroutines"]["png"].result

            png_file.close()

        await handler.browser.close()
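Note the design this test relies on: besides writing the image to path, the handler keeps the screenshot coroutine's return value (the raw PNG bytes) on the coroutine object's result attribute, which is what the final comparison against the file contents checks.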
Example #3
    async def test_page_coroutine_pdf(self):
        if self.browser_type != "chromium":
            pytest.skip("PDF generation is supported only in Chromium")

        pdf_file = NamedTemporaryFile(mode="w+b")
        handler = ScrapyPlaywrightDownloadHandler(
            get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
        )
        await handler._launch_browser()

        with StaticMockServer() as server:
            req = Request(
                url=server.urljoin("/index.html"),
                meta={
                    "playwright": True,
                    "playwright_page_coroutines": {
                        "pdf": PageCoro("pdf", path=pdf_file.name),
                    },
                },
            )
            await handler._download_request(req, Spider("foo"))

            assert get_mimetype(pdf_file) == "application/pdf"

            pdf_file.file.seek(0)
            assert pdf_file.file.read() == req.meta["playwright_page_coroutines"]["pdf"].result

            pdf_file.close()

        await handler.browser.close()
Example #4
def test_hs_middlewares_retry(hs_downloader_middleware, hs_spider_middleware):
    spider = Spider('test')
    url = 'http://resp-url'
    request_0 = Request(url)
    response_0 = Response(url)

    hs_downloader_middleware.process_request(request_0, spider)

    assert HS_REQUEST_ID_KEY not in request_0.meta
    assert HS_PARENT_ID_KEY not in request_0.meta
    assert len(hs_spider_middleware._seen_requests) == 0
    assert len(hs_downloader_middleware._seen_requests) == 0

    hs_downloader_middleware.process_response(request_0, response_0, spider)

    assert request_0.meta[HS_REQUEST_ID_KEY] == 0
    assert request_0.meta[HS_PARENT_ID_KEY] is None
    assert hs_spider_middleware._seen_requests[request_0] == 0

    request_1 = request_0.copy()
    response_1 = Response(url)
    assert request_1.meta[HS_REQUEST_ID_KEY] == 0
    assert request_1.meta[HS_PARENT_ID_KEY] is None

    hs_downloader_middleware.process_request(request_1, spider)

    assert HS_REQUEST_ID_KEY not in request_1.meta
    assert request_1.meta[HS_PARENT_ID_KEY] == 0

    hs_downloader_middleware.process_response(request_1, response_1, spider)

    assert request_1.meta[HS_REQUEST_ID_KEY] == 1
    assert request_1.meta[HS_PARENT_ID_KEY] == 0
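The retry test pins down the parent/child bookkeeping: a request only receives its HS_REQUEST_ID_KEY once a response has been processed, and for a retried copy, process_request turns the inherited request id into HS_PARENT_ID_KEY and strips the stale HS_REQUEST_ID_KEY, so the next process_response can assign a fresh id (here 1).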
Example #5
    async def test_page_coroutine_infinite_scroll(self):
        handler = ScrapyPlaywrightDownloadHandler(
            get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
        )
        await handler._launch_browser()

        with StaticMockServer() as server:
            req = Request(
                url=server.urljoin("/scroll.html"),
                headers={"User-Agent": "scrapy-playwright"},
                meta={
                    "playwright": True,
                    "playwright_page_coroutines": [
                        PageCoro("wait_for_selector", selector="div.quote"),
                        PageCoro("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
                        PageCoro("wait_for_selector", selector="div.quote:nth-child(11)"),
                        PageCoro("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
                        PageCoro("wait_for_selector", selector="div.quote:nth-child(21)"),
                    ],
                },
            )
            resp = await handler._download_request(req, Spider("foo"))

        assert isinstance(resp, HtmlResponse)
        assert resp.request is req
        assert resp.url == server.urljoin("/scroll.html")
        assert resp.status == 200
        assert "playwright" in resp.flags
        assert len(resp.css("div.quote")) == 30

        await handler.browser.close()
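Since playwright_page_coroutines is a plain list, the scroll/wait pairs above can be generated rather than spelled out. A hedged sketch reusing the test's exact calls (the helper name and the pages parameter are illustrative; the nth-child offsets assume the fixture's ten quotes per page):

def scroll_coroutines(pages):
    # One wait for the initial content, then a scroll + wait pair per extra
    # page; div.quote:nth-child(10 * page + 1) marks the next batch's arrival.
    coros = [PageCoro("wait_for_selector", selector="div.quote")]
    for page in range(1, pages):
        coros.append(PageCoro("evaluate", "window.scrollBy(0, document.body.scrollHeight)"))
        coros.append(PageCoro("wait_for_selector", selector="div.quote:nth-child(%d)" % (10 * page + 1)))
    return coros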
Example #6
 def setUp(self):
     self.spider = Spider('foo')
     self.settings = Settings()
     self.settings.setmodule(default_settings)
     self.settings.setdict(self.local_settings)
     self.storage = MongoStorage(self.settings)
     self.storage.open_spider(self.spider)
Example #7
    async def test_contexts_dynamic(self):
        async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:

            with StaticMockServer() as server:
                meta = {
                    "playwright": True,
                    "playwright_include_page": True,
                    "playwright_context": "new",
                    "playwright_context_kwargs": {
                        "storage_state": {
                            "cookies": [
                                {
                                    "url": "https://example.org",
                                    "name": "asdf",
                                    "value": "qwerty",
                                },
                            ],
                        },
                    },
                }
                req = Request(server.urljoin("/index.html"), meta=meta)
                resp = await handler._download_request(req, Spider("foo"))

            page = resp.meta["playwright_page"]
            storage_state = await page.context.storage_state()
            await page.close()
            cookie = storage_state["cookies"][0]
            assert cookie["name"] == "asdf"
            assert cookie["value"] == "qwerty"
            assert cookie["domain"] == "example.org"
Example #8
    async def test_contexts_startup(self):
        settings = {
            "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
            "PLAYWRIGHT_CONTEXTS": {
                "first": {
                    "storage_state": {
                        "cookies": [
                            {
                                "url": "https://example.org",
                                "name": "foo",
                                "value": "bar",
                            },
                        ],
                    },
                },
            },
        }
        async with make_handler(settings) as handler:
            with StaticMockServer() as server:
                meta = {
                    "playwright": True,
                    "playwright_include_page": True,
                    "playwright_context": "first",
                }
                req = Request(server.urljoin("/index.html"), meta=meta)
                resp = await handler._download_request(req, Spider("foo"))

            page = resp.meta["playwright_page"]
            storage_state = await page.context.storage_state()
            await page.context.close()
            await page.close()
            cookie = storage_state["cookies"][0]
            assert cookie["name"] == "foo"
            assert cookie["value"] == "bar"
            assert cookie["domain"] == "example.org"
Example #9
def test_log_formatter_scrapy_1():
    middleware = get_test_middleware()
    logformatter = CrawleraFetchLogFormatter()
    formatter = Formatter()
    spider = Spider("foo")

    for case in deepcopy(test_requests):
        original = case["original"]
        response = Response(original.url)
        processed = middleware.process_request(original, spider)

        crawlera_meta = original.meta.get("crawlera_fetch") or {}
        if crawlera_meta.get("skip"):
            assert processed is None
            continue

        # crawled
        result = logformatter.crawled(processed, response, spider)
        assert result["args"]["request"] == str(original)
        record = LogRecord(name="logger",
                           pathname="n/a",
                           lineno=1,
                           exc_info=None,
                           **result)
        logstr = formatter.format(record)
        expected = "Crawled (200) {request} ['original url: {url}'] (referer: None)".format(
            request=original, url=original.url)
        assert logstr == expected
Example #10
async def test_default_page_coroutine_timeout():
    crawler = get_crawler(settings_dict={"PYPPETEER_PAGE_COROUTINE_TIMEOUT": 1000})
    handler = ScrapyPyppeteerDownloadHandler(crawler)
    await handler._launch_browser()

    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/index.html"),
            meta={
                "pyppeteer":
                True,
                "pyppeteer_page_coroutines": [
                    NavigationPageCoroutine("waitForXPath",
                                            '//*[@id="test"]/test')
                ],
            },
        )
        start = time()
        with pytest.raises(pyppeteer.errors.TimeoutError):
            await handler._download_request(req, Spider("foo"))
        elapsed = time() - start
        assert 1 < elapsed < 2  # timeout is 1000 ms; allow up to 1 s of overhead

    await handler.browser.close()
Example #11
def test_hs_ext_item_scraped(hs_ext):
    hs_ext._write_item = mock.Mock()
    item = Item()
    spider = Spider('test')
    hs_ext.item_scraped(item, spider)
    assert hs_ext._write_item.call_count == 1
    assert hs_ext._write_item.call_args[0] == ({'_type': 'Item'}, )
Example #12
    async def test_use_custom_headers(self):
        """Custom header processing function"""
        async def important_headers(*args, **kwargs) -> dict:
            return {"foo": "bar"}

        settings_dict = {
            "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
            "PLAYWRIGHT_CONTEXTS": {
                "default": {
                    "user_agent": self.browser_type
                }
            },
            "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": important_headers,
        }
        async with make_handler(settings_dict) as handler:
            with MockServer() as server:
                req = Request(
                    url=server.urljoin("/headers"),
                    meta={"playwright": True},
                    headers={
                        "User-Agent": "foobar",
                        "Asdf": "qwerty"
                    },
                )
                resp = await handler._download_request(req, Spider("foo"))
                headers = json.loads(resp.css("pre::text").get())
                headers = {
                    key.lower(): value
                    for key, value in headers.items()
                }
                assert headers["foo"] == "bar"
                assert headers.get("user-agent") not in (self.browser_type,
                                                         "foobar")
                assert "asdf" not in headers
Example #13
    def test_middleware(self):
        m = PageActionsMiddleware()
        spider = Spider('test_spider')

        req = mkreq()
        spider.page_actions = [{"type": "click", "selector": "#showmore"}]
        m.process_request(req, spider)
        # Page actions enabled
        self.assertEqual(req.meta['splash']['endpoint'], 'execute')

        req = mkreq()
        spider.page_actions = []
        m.process_request(req, spider)
        # Page actions disabled (empty list)
        self.assertEqual(req.meta['splash']['endpoint'], 'render.html')

        req = mkreq()
        spider.page_actions = [{
            "type": "click",
            "selector": "#showmore",
            "reject": "test\\.com"
        }]
        m.process_request(req, spider)
        # Page actions disabled (URL matches the "reject" pattern)
        self.assertEqual(req.meta['splash']['endpoint'], 'render.html')
Example #14
 async def test_context_kwargs(self):
     settings_dict = {
         "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
         "PLAYWRIGHT_CONTEXTS": {
             "default": {
                 "java_script_enabled": False
             },
         },
     }
     async with make_handler(settings_dict) as handler:
         with StaticMockServer() as server:
             req = Request(
                 url=server.urljoin("/scroll.html"),
                 meta={
                     "playwright":
                     True,
                     "playwright_page_coroutines": [
                         PageCoro("wait_for_selector",
                                  selector="div.quote",
                                  timeout=1000),
                     ],
                 },
             )
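             # JS is disabled in this context, so scroll.html never renders
             # its quotes and the wait_for_selector coroutine below times out.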
             with pytest.raises(TimeoutError):
                 await handler._download_request(req, Spider("foo"))
Example #15
 async def test_use_playwright_headers(self):
     """Ignore Scrapy headers"""
     settings_dict = {
         "PLAYWRIGHT_BROWSER_TYPE":
         self.browser_type,
         "PLAYWRIGHT_CONTEXTS": {
             "default": {
                 "user_agent": self.browser_type
             }
         },
         "PLAYWRIGHT_PROCESS_REQUEST_HEADERS":
         "scrapy_playwright.headers.use_playwright_headers",  # noqa: E501
     }
     async with make_handler(settings_dict) as handler:
         with MockServer() as server:
             req = Request(
                 url=server.urljoin("/headers"),
                 meta={"playwright": True},
                 headers={
                     "User-Agent": "foobar",
                     "Asdf": "qwerty"
                 },
             )
             resp = await handler._download_request(req, Spider("foo"))
             headers = json.loads(resp.css("pre::text").get())
             headers = {
                 key.lower(): value
                 for key, value in headers.items()
             }
             assert headers["user-agent"] == self.browser_type
             assert "asdf" not in headers
Example #16
    def run_test(self, **kwargs):
        dt = TestData(**kwargs)
        settings = {
            "SPIDERMON_ENABLED": True,
            "SPIDERMON_SPIDER_OPEN_EXPRESSION_MONITORS": [
                {"tests": [{"expression": dt.expression}]}
            ],
        }
        settings.update(dt.settings)
        crawler = get_crawler(settings_dict=settings)
        crawler.stats.get_stats = lambda _: dt.stats
        spidermon = Spidermon.from_crawler(crawler)
        spider = Spider(name=self.spider_name)

        # mocking, to see test results via raising AssertionError exception
        # with failures and errors as results
        spidermon._run_suites = partial(_test_run_suites, spidermon)

        try:
            spidermon.spider_opened(spider)
        except AssertionError as e:
            failures, errors = e.args[0]
            for f in failures:
                _, trace = f
                raise AssertionError(trace)
            for err in errors:
                _, trace = err
                if dt.expected_error and dt.expected_error in trace:
                    dt.expected_error = None
                else:
                    raise AssertionError(trace)
            if dt.expected_error:
                raise AssertionError(
                    f"Expected error <{dt.expected_error}> was not raised"
                )
Example #17
 def setUp(self):
     self.spider = Spider("foo")
     self.settings = Settings()
     self.settings.setmodule(default_settings)
     self.settings.setdict(self.local_settings)
     self.storage = RedisStorage(self.settings)
     self.storage.open_spider(self.spider)
Example #18
def test_process_request_single_download_slot():
    middleware = get_test_middleware(
        settings={
            "CRAWLERA_FETCH_DOWNLOAD_SLOT_POLICY": DownloadSlotPolicy.Single
        })

    for case in deepcopy(test_requests):
        original = case["original"]
        expected = case["expected"]
        if expected:
            expected.meta["download_slot"] = "__crawlera_fetch__"

        with shub_jobkey_env_variable():
            processed = middleware.process_request(original, Spider("foo"))

        crawlera_meta = original.meta.get("crawlera_fetch")
        if crawlera_meta.get("skip"):
            assert processed is None
        else:
            assert type(processed) is type(expected)
            assert processed.url == expected.url
            assert processed.method == expected.method
            assert processed.headers == expected.headers
            assert processed.meta == expected.meta
            processed_text = processed.body.decode(processed.encoding)
            expected_text = expected.body.decode(expected.encoding)
            assert json.loads(processed_text) == json.loads(expected_text)
Example #19
async def test_page_coroutine_infinite_scroll():
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()

    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/scroll.html"),
            meta={
                "pyppeteer":
                True,
                "pyppeteer_page_coroutines": [
                    PageCoroutine("waitForSelector",
                                  "div.quote"),  # first 10 quotes
                    PageCoroutine("evaluate", "window.scrollBy(0, 2000)"),
                    PageCoroutine("waitForSelector",
                                  "div.quote:nth-child(11)"),  # 2nd request
                    PageCoroutine("evaluate", "window.scrollBy(0, 2000)"),
                    PageCoroutine("waitForSelector",
                                  "div.quote:nth-child(21)"),  # 3rd request
                ],
            },
        )
        resp = await handler._download_request(req, Spider("foo"))

    assert isinstance(resp, HtmlResponse)
    assert resp.request is req
    assert resp.url == server.urljoin("/scroll.html")
    assert resp.status == 200
    assert "pyppeteer" in resp.flags
    assert len(resp.css("div.quote")) == 30

    await handler.browser.close()
Example #20
async def test_page_coroutine_navigation():
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()

    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/index.html"),
            meta={
                "pyppeteer":
                True,
                "pyppeteer_page_coroutines":
                [NavigationPageCoroutine("click", "a.lorem_ipsum")],
            },
        )
        resp = await handler._download_request(req, Spider("foo"))

    assert isinstance(resp, HtmlResponse)
    assert resp.request is req
    assert resp.url == server.urljoin("/lorem_ipsum.html")
    assert resp.status == 200
    assert "pyppeteer" in resp.flags
    assert resp.css("title::text").get() == "Lorem Ipsum"
    text = resp.css("p::text").get()
    assert text == "Lorem ipsum dolor sit amet, consectetur adipiscing elit."

    await handler.browser.close()
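Note the split between the two pyppeteer helpers in Examples #19 and #20: plain PageCoroutine covers in-page actions such as evaluate and waitForSelector, while NavigationPageCoroutine wraps actions like click that are expected to trigger a navigation the handler must wait for before capturing the final page content.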
Example #21
 def setUp(self) -> None:
     self.settings = Settings()
     self.settings.setmodule(module=default_settings)
     self.settings.setdict(self.mongo_settings)
     self.spider = Spider(name="TestMongoPipeline")
     self.pipe = MongoPipeline.from_settings(settings=self.settings)
     yield self.pipe.open_spider(spider=None)
Example #22
 def _crawler(extended_settings=None):
     settings = {
         "EXTENSIONS": {"spider_feeder.loaders.StartUrlsLoader": 500},
     }
     settings.update(extended_settings or {})
     crawler = Crawler(Spider, settings=settings)
     crawler.spider = Spider("dummy")
     return crawler
Example #23
def test_process_request_scrapy_1():
    from tests.utils import get_test_middleware

    middleware = get_test_middleware()
    request = Request("https://example.org")
    with shub_jobkey_env_variable():
        processed = middleware.process_request(request, Spider("foo"))
    assert processed.flags == ["original url: https://example.org"]
Example #24
def test_stats(mocked_time):
    middleware = get_test_middleware()
    spider = Spider("foo")

    count = 100
    nums = list(range(count))
    random.shuffle(nums)
    status_list = [random.randint(1, 15) for _ in range(count)]
    method_list = [
        random.choice(["GET", "POST", "PUT", "DELETE", "HEAD"])
        for _ in range(count)
    ]

    # expected values
    latencies = [2**n - n for n in nums]
    total_latency = sum(latencies)
    avg_latency = total_latency / count
    max_latency = max(latencies)

    for n, status, method in zip(nums, status_list, method_list):
        request = Request("https://example.org", method=method)
        mocked_time.return_value = n  # start_ts
        processed_request = middleware.process_request(request, spider)

        response = TextResponse(
            url="https://example.org",
            request=processed_request,
            body=json.dumps({
                "headers": {},
                "original_status": status,
                "body": "",
                "url": "http://"
            }).encode("utf-8"),
        )

        mocked_time.return_value = 2**n  # end_ts
        middleware.process_response(processed_request, response, spider)

    middleware.spider_closed(spider, "finished")

    assert middleware.stats.get_value("crawlera_fetch/request_count") == count
    assert middleware.stats.get_value("crawlera_fetch/response_count") == count
    assert middleware.stats.get_value(
        "crawlera_fetch/total_latency") == total_latency
    assert middleware.stats.get_value(
        "crawlera_fetch/avg_latency") == avg_latency
    assert middleware.stats.get_value(
        "crawlera_fetch/max_latency") == max_latency
    for status in set(status_list):
        sc = middleware.stats.get_value(
            "crawlera_fetch/response_status_count/{}".format(status))
        assert sc == status_list.count(status)
    for method in set(method_list):
        mc = middleware.stats.get_value(
            "crawlera_fetch/request_method_count/{}".format(method))
        assert mc == method_list.count(method)
Example #25
 def setUp(self):
     self.persist = False
     self.key_prefix = 'scrapy_redis:tests:'
     self.queue_key = self.key_prefix + '%(spider)s:requests'
     self.dupefilter_key = self.key_prefix + '%(spider)s:dupefilter'
     self.idle_before_close = 0
     self.scheduler = Scheduler(self.server, self.persist, self.queue_key,
                                SpiderQueue, self.dupefilter_key,
                                self.idle_before_close)
     self.spider = Spider('myspider')
Example #26
    def test_pyppeteer_request(self):
        def _test(response):
            self.assertIsInstance(response, Response)
            self.assertEqual(response.css("a::text").getall(), ["Lorem Ipsum", "Infinite Scroll"])
            self.assertEqual(response.url, request.url)
            self.assertEqual(response.status, 200)
            self.assertIn("pyppeteer", response.flags)

        request = Request(self.server.urljoin("/index.html"), meta={"pyppeteer": True})
        return self.handler.download_request(request, Spider("foo")).addCallback(_test)
Example #27
 def test_callback_not_available(self):
     """Callback method is not available in the spider passed to from_dict"""
     spider = TestSpiderDelegation()
     r = Request("http://www.example.com",
                 callback=spider.delegated_callback)
     d = r.to_dict(spider=spider)
     self.assertRaises(ValueError,
                       request_from_dict,
                       d,
                       spider=Spider("foo"))
Example #28
 def _make_data(settings=None):
     crawler = Crawler(Spider, settings=settings)
     spider = Spider("dummy")
     return {
         "stats": crawler.stats.get_stats(),
         "crawler": crawler,
         "spider": spider,
         "runner": SpiderMonitorRunner(spider=spider),
         "job": None,
     }
Example #29
 def _crawler(extended_settings=None):
     settings = {
         "SPIDERMON_ENABLED": True,
         "EXTENSIONS": {
             "spidermon.contrib.scrapy.extensions.Spidermon": 500
         },
     }
     settings.update(extended_settings or {})
     crawler = Crawler(Spider, settings=settings)
     crawler.spider = Spider("dummy")
     return crawler
Example #30
 async def test_timeout(self):
     settings_dict = {
         "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
         "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 1000,
     }
     async with make_handler(settings_dict) as handler:
         with MockServer() as server:
             req = Request(server.urljoin("/delay/2"),
                           meta={"playwright": True})
             with pytest.raises(TimeoutError):
                 await handler._download_request(req, Spider("foo"))