async def test_page_coroutine_navigation(self):
    handler = ScrapyPlaywrightDownloadHandler(
        get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
    )
    await handler._launch_browser()

    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/index.html"),
            meta={
                "playwright": True,
                "playwright_page_coroutines": [PageCoro("click", "a.lorem_ipsum")],
            },
        )
        resp = await handler._download_request(req, Spider("foo"))

        assert isinstance(resp, HtmlResponse)
        assert resp.request is req
        assert resp.url == server.urljoin("/lorem_ipsum.html")
        assert resp.status == 200
        assert "playwright" in resp.flags
        assert resp.css("title::text").get() == "Lorem Ipsum"
        text = resp.css("p::text").get()
        assert text == "Lorem ipsum dolor sit amet, consectetur adipiscing elit."

    await handler.browser.close()
async def test_page_coroutine_screenshot(self):
    png_file = NamedTemporaryFile(mode="w+b")
    handler = ScrapyPlaywrightDownloadHandler(
        get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
    )
    await handler._launch_browser()

    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/index.html"),
            meta={
                "playwright": True,
                "playwright_page_coroutines": {
                    "png": PageCoro("screenshot", path=png_file.name, type="png"),
                },
            },
        )
        await handler._download_request(req, Spider("foo"))

        assert get_mimetype(png_file) == "image/png"
        png_file.file.seek(0)
        assert png_file.file.read() == req.meta["playwright_page_coroutines"]["png"].result
        png_file.close()

    await handler.browser.close()
async def test_page_coroutine_pdf(self):
    if self.browser_type != "chromium":
        pytest.skip("PDF generation is supported only in Chromium")

    pdf_file = NamedTemporaryFile(mode="w+b")
    handler = ScrapyPlaywrightDownloadHandler(
        get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
    )
    await handler._launch_browser()

    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/index.html"),
            meta={
                "playwright": True,
                "playwright_page_coroutines": {
                    "pdf": PageCoro("pdf", path=pdf_file.name),
                },
            },
        )
        await handler._download_request(req, Spider("foo"))

        assert get_mimetype(pdf_file) == "application/pdf"
        pdf_file.file.seek(0)
        assert pdf_file.file.read() == req.meta["playwright_page_coroutines"]["pdf"].result
        pdf_file.close()

    await handler.browser.close()
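# Note: `get_mimetype`, used by the screenshot and PDF tests above, is assumed
# to be a small test helper. A minimal sketch, assuming it shells out to the
# `file` utility (the real helper may be implemented differently):
import subprocess

def get_mimetype(file):
    """Return the MIME type of an open file object, e.g. "image/png"."""
    return subprocess.run(
        ["file", "--mime-type", "--brief", file.name],
        stdout=subprocess.PIPE,
        universal_newlines=True,
    ).stdout.strip()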
def test_hs_middlewares_retry(hs_downloader_middleware, hs_spider_middleware):
    spider = Spider('test')
    url = 'http://resp-url'
    request_0 = Request(url)
    response_0 = Response(url)

    hs_downloader_middleware.process_request(request_0, spider)
    assert HS_REQUEST_ID_KEY not in request_0.meta
    assert HS_PARENT_ID_KEY not in request_0.meta
    assert len(hs_spider_middleware._seen_requests) == 0
    assert len(hs_downloader_middleware._seen_requests) == 0

    hs_downloader_middleware.process_response(request_0, response_0, spider)
    assert request_0.meta[HS_REQUEST_ID_KEY] == 0
    assert request_0.meta[HS_PARENT_ID_KEY] is None
    assert hs_spider_middleware._seen_requests[request_0] == 0

    request_1 = request_0.copy()
    response_1 = Response(url)
    assert request_1.meta[HS_REQUEST_ID_KEY] == 0
    assert request_1.meta[HS_PARENT_ID_KEY] is None

    hs_downloader_middleware.process_request(request_1, spider)
    assert HS_REQUEST_ID_KEY not in request_1.meta
    assert request_1.meta[HS_PARENT_ID_KEY] == 0

    hs_downloader_middleware.process_response(request_1, response_1, spider)
    assert request_1.meta[HS_REQUEST_ID_KEY] == 1
    assert request_1.meta[HS_PARENT_ID_KEY] == 0
async def test_page_coroutine_infinite_scroll(self):
    handler = ScrapyPlaywrightDownloadHandler(
        get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
    )
    await handler._launch_browser()

    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/scroll.html"),
            headers={"User-Agent": "scrapy-playwright"},
            meta={
                "playwright": True,
                "playwright_page_coroutines": [
                    PageCoro("wait_for_selector", selector="div.quote"),
                    PageCoro("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
                    PageCoro("wait_for_selector", selector="div.quote:nth-child(11)"),
                    PageCoro("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
                    PageCoro("wait_for_selector", selector="div.quote:nth-child(21)"),
                ],
            },
        )
        resp = await handler._download_request(req, Spider("foo"))

        assert isinstance(resp, HtmlResponse)
        assert resp.request is req
        assert resp.url == server.urljoin("/scroll.html")
        assert resp.status == 200
        assert "playwright" in resp.flags
        assert len(resp.css("div.quote")) == 30

    await handler.browser.close()
def setUp(self):
    self.spider = Spider('foo')
    self.settings = Settings()
    self.settings.setmodule(default_settings)
    self.settings.setdict(self.local_settings)
    self.storage = MongoStorage(self.settings)
    self.storage.open_spider(self.spider)
async def test_contexts_dynamic(self):
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with StaticMockServer() as server:
            meta = {
                "playwright": True,
                "playwright_include_page": True,
                "playwright_context": "new",
                "playwright_context_kwargs": {
                    "storage_state": {
                        "cookies": [
                            {
                                "url": "https://example.org",
                                "name": "asdf",
                                "value": "qwerty",
                            },
                        ],
                    },
                },
            }
            req = Request(server.urljoin("/index.html"), meta=meta)
            resp = await handler._download_request(req, Spider("foo"))

            page = resp.meta["playwright_page"]
            storage_state = await page.context.storage_state()
            await page.close()
            cookie = storage_state["cookies"][0]
            assert cookie["name"] == "asdf"
            assert cookie["value"] == "qwerty"
            assert cookie["domain"] == "example.org"
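# Note: `make_handler`, used above and in several tests below, is assumed to be
# a test helper that builds a download handler from a settings dict and tears
# down the browser on exit. A minimal sketch under that assumption (not
# necessarily the actual implementation):
from contextlib import asynccontextmanager

@asynccontextmanager
async def make_handler(settings_dict: dict):
    handler = ScrapyPlaywrightDownloadHandler(get_crawler(settings_dict=settings_dict))
    await handler._launch_browser()
    try:
        yield handler
    finally:
        await handler.browser.close()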
async def test_contexts_startup(self):
    settings = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXTS": {
            "first": {
                "storage_state": {
                    "cookies": [
                        {
                            "url": "https://example.org",
                            "name": "foo",
                            "value": "bar",
                        },
                    ],
                },
            },
        },
    }
    async with make_handler(settings) as handler:
        with StaticMockServer() as server:
            meta = {
                "playwright": True,
                "playwright_include_page": True,
                "playwright_context": "first",
            }
            req = Request(server.urljoin("/index.html"), meta=meta)
            resp = await handler._download_request(req, Spider("foo"))

            page = resp.meta["playwright_page"]
            storage_state = await page.context.storage_state()
            await page.context.close()
            await page.close()
            cookie = storage_state["cookies"][0]
            assert cookie["name"] == "foo"
            assert cookie["value"] == "bar"
            assert cookie["domain"] == "example.org"
def test_log_formatter_scrapy_1():
    middleware = get_test_middleware()
    logformatter = CrawleraFetchLogFormatter()
    formatter = Formatter()
    spider = Spider("foo")

    for case in deepcopy(test_requests):
        original = case["original"]
        response = Response(original.url)

        processed = middleware.process_request(original, spider)
        crawlera_meta = original.meta.get("crawlera_fetch") or {}
        if crawlera_meta.get("skip"):
            assert processed is None
            continue

        # crawled
        result = logformatter.crawled(processed, response, spider)
        assert result["args"]["request"] == str(original)

        record = LogRecord(name="logger", pathname="n/a", lineno=1, exc_info=None, **result)
        logstr = formatter.format(record)
        expected = "Crawled (200) {request} ['original url: {url}'] (referer: None)".format(
            request=original, url=original.url
        )
        assert logstr == expected
async def test_default_page_coroutine_timeout():
    crawler = get_crawler(settings_dict={"PYPPETEER_PAGE_COROUTINE_TIMEOUT": 1000})
    handler = ScrapyPyppeteerDownloadHandler(crawler)
    await handler._launch_browser()

    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/index.html"),
            meta={
                "pyppeteer": True,
                "pyppeteer_page_coroutines": [
                    NavigationPageCoroutine("waitForXPath", '//*[@id="test"]/test')
                ],
            },
        )
        with pytest.raises(pyppeteer.errors.TimeoutError):
            start = time()
            await handler._download_request(req, Spider("foo"))
        # the timing check must sit outside the raises block, since the
        # download call above raises before any following statement runs
        elapsed = time() - start
        assert 1 < elapsed < 2  # 1000 ms of tolerance

    await handler.browser.close()
def test_hs_ext_item_scraped(hs_ext):
    hs_ext._write_item = mock.Mock()
    item = Item()
    spider = Spider('test')
    hs_ext.item_scraped(item, spider)
    assert hs_ext._write_item.call_count == 1
    assert hs_ext._write_item.call_args[0] == ({'_type': 'Item'},)
async def test_use_custom_headers(self):
    """Custom header processing function"""

    async def important_headers(*args, **kwargs) -> dict:
        return {"foo": "bar"}

    settings_dict = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
        "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": important_headers,
    }
    async with make_handler(settings_dict) as handler:
        with MockServer() as server:
            req = Request(
                url=server.urljoin("/headers"),
                meta={"playwright": True},
                headers={"User-Agent": "foobar", "Asdf": "qwerty"},
            )
            resp = await handler._download_request(req, Spider("foo"))

            headers = json.loads(resp.css("pre::text").get())
            headers = {key.lower(): value for key, value in headers.items()}
            assert headers["foo"] == "bar"
            assert headers.get("user-agent") not in (self.browser_type, "foobar")
            assert "asdf" not in headers
def test_middleware(self):
    m = PageActionsMiddleware()
    spider = Spider('test_spider')

    # Page actions enabled
    req = mkreq()
    spider.page_actions = [{"type": "click", "selector": "#showmore"}]
    m.process_request(req, spider)
    self.assertEqual(req.meta['splash']['endpoint'], 'execute')

    # Page actions disabled: empty action list
    req = mkreq()
    spider.page_actions = []
    m.process_request(req, spider)
    self.assertEqual(req.meta['splash']['endpoint'], 'render.html')

    # Page actions disabled: the request URL matches the reject pattern
    req = mkreq()
    spider.page_actions = [{
        "type": "click",
        "selector": "#showmore",
        "reject": "test\\.com",
    }]
    m.process_request(req, spider)
    self.assertEqual(req.meta['splash']['endpoint'], 'render.html')
async def test_context_kwargs(self):
    settings_dict = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXTS": {
            "default": {"java_script_enabled": False},
        },
    }
    async with make_handler(settings_dict) as handler:
        with StaticMockServer() as server:
            req = Request(
                url=server.urljoin("/scroll.html"),
                meta={
                    "playwright": True,
                    "playwright_page_coroutines": [
                        PageCoro("wait_for_selector", selector="div.quote", timeout=1000),
                    ],
                },
            )
            with pytest.raises(TimeoutError):
                await handler._download_request(req, Spider("foo"))
async def test_use_playwright_headers(self):
    """Ignore Scrapy headers"""
    settings_dict = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXTS": {"default": {"user_agent": self.browser_type}},
        "PLAYWRIGHT_PROCESS_REQUEST_HEADERS": "scrapy_playwright.headers.use_playwright_headers",  # noqa: E501
    }
    async with make_handler(settings_dict) as handler:
        with MockServer() as server:
            req = Request(
                url=server.urljoin("/headers"),
                meta={"playwright": True},
                headers={"User-Agent": "foobar", "Asdf": "qwerty"},
            )
            resp = await handler._download_request(req, Spider("foo"))

            headers = json.loads(resp.css("pre::text").get())
            headers = {key.lower(): value for key, value in headers.items()}
            assert headers["user-agent"] == self.browser_type
            assert "asdf" not in headers
def run_test(self, **kwargs):
    dt = TestData(**kwargs)
    settings = {
        "SPIDERMON_ENABLED": True,
        "SPIDERMON_SPIDER_OPEN_EXPRESSION_MONITORS": [
            {"tests": [{"expression": dt.expression}]}
        ],
    }
    settings.update(dt.settings)
    crawler = get_crawler(settings_dict=settings)
    crawler.stats.get_stats = lambda _: dt.stats
    spidermon = Spidermon.from_crawler(crawler)
    spider = Spider(name=self.spider_name)

    # mock _run_suites so that suite results are surfaced by raising
    # AssertionError with (failures, errors) as its argument
    spidermon._run_suites = partial(_test_run_suites, spidermon)

    try:
        spidermon.spider_opened(spider)
    except AssertionError as e:
        failures, errors = e.args[0]
        for f in failures:
            _, trace = f
            raise AssertionError(trace)
        for e in errors:
            _, trace = e
            if dt.expected_error and dt.expected_error in trace:
                dt.expected_error = None
            else:
                raise AssertionError(trace)

    if dt.expected_error:
        raise AssertionError(
            f"Expected error <{dt.expected_error}> was not raised"
        )
def setUp(self):
    self.spider = Spider("foo")
    self.settings = Settings()
    self.settings.setmodule(default_settings)
    self.settings.setdict(self.local_settings)
    self.storage = RedisStorage(self.settings)
    self.storage.open_spider(self.spider)
def test_process_request_single_download_slot():
    middleware = get_test_middleware(
        settings={"CRAWLERA_FETCH_DOWNLOAD_SLOT_POLICY": DownloadSlotPolicy.Single}
    )

    for case in deepcopy(test_requests):
        original = case["original"]
        expected = case["expected"]
        if expected:
            expected.meta["download_slot"] = "__crawlera_fetch__"

        with shub_jobkey_env_variable():
            processed = middleware.process_request(original, Spider("foo"))

        crawlera_meta = original.meta.get("crawlera_fetch")
        if crawlera_meta.get("skip"):
            assert processed is None
        else:
            assert type(processed) is type(expected)
            assert processed.url == expected.url
            assert processed.method == expected.method
            assert processed.headers == expected.headers
            assert processed.meta == expected.meta
            processed_text = processed.body.decode(processed.encoding)
            expected_text = expected.body.decode(expected.encoding)
            assert json.loads(processed_text) == json.loads(expected_text)
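# Note: `shub_jobkey_env_variable`, used above and in test_process_request_scrapy_1
# below, is assumed to be a context manager that simulates running inside a
# Scrapy Cloud job by setting the SHUB_JOBKEY environment variable. A minimal
# sketch (the key value "1/2/3" is hypothetical):
import os
from contextlib import contextmanager

@contextmanager
def shub_jobkey_env_variable():
    os.environ["SHUB_JOBKEY"] = "1/2/3"
    try:
        yield
    finally:
        os.environ.pop("SHUB_JOBKEY", None)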
async def test_page_coroutine_infinite_scroll():
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()

    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/scroll.html"),
            meta={
                "pyppeteer": True,
                "pyppeteer_page_coroutines": [
                    PageCoroutine("waitForSelector", "div.quote"),  # first 10 quotes
                    PageCoroutine("evaluate", "window.scrollBy(0, 2000)"),
                    PageCoroutine("waitForSelector", "div.quote:nth-child(11)"),  # 2nd request
                    PageCoroutine("evaluate", "window.scrollBy(0, 2000)"),
                    PageCoroutine("waitForSelector", "div.quote:nth-child(21)"),  # 3rd request
                ],
            },
        )
        resp = await handler._download_request(req, Spider("foo"))

        assert isinstance(resp, HtmlResponse)
        assert resp.request is req
        assert resp.url == server.urljoin("/scroll.html")
        assert resp.status == 200
        assert "pyppeteer" in resp.flags
        assert len(resp.css("div.quote")) == 30

    await handler.browser.close()
async def test_page_coroutine_navigation():
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()

    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/index.html"),
            meta={
                "pyppeteer": True,
                "pyppeteer_page_coroutines": [NavigationPageCoroutine("click", "a.lorem_ipsum")],
            },
        )
        resp = await handler._download_request(req, Spider("foo"))

        assert isinstance(resp, HtmlResponse)
        assert resp.request is req
        assert resp.url == server.urljoin("/lorem_ipsum.html")
        assert resp.status == 200
        assert "pyppeteer" in resp.flags
        assert resp.css("title::text").get() == "Lorem Ipsum"
        text = resp.css("p::text").get()
        assert text == "Lorem ipsum dolor sit amet, consectetur adipiscing elit."

    await handler.browser.close()
def setUp(self) -> None:
    self.settings = Settings()
    self.settings.setmodule(module=default_settings)
    self.settings.setdict(self.mongo_settings)
    self.spider = Spider(name="TestMongoPipeline")
    self.pipe = MongoPipeline.from_settings(settings=self.settings)
    yield self.pipe.open_spider(spider=None)
def _crawler(extended_settings={}):
    settings = {
        "EXTENSIONS": {"spider_feeder.loaders.StartUrlsLoader": 500},
    }
    settings.update(extended_settings)
    crawler = Crawler(Spider, settings=settings)
    crawler.spider = Spider("dummy")
    return crawler
def test_process_request_scrapy_1():
    from tests.utils import get_test_middleware

    middleware = get_test_middleware()
    request = Request("https://example.org")
    with shub_jobkey_env_variable():
        processed = middleware.process_request(request, Spider("foo"))

    assert processed.flags == ["original url: https://example.org"]
def test_stats(mocked_time):
    middleware = get_test_middleware()
    spider = Spider("foo")

    count = 100
    nums = list(range(count))
    random.shuffle(nums)
    status_list = [random.randint(1, 15) for _ in range(count)]
    method_list = [
        random.choice(["GET", "POST", "PUT", "DELETE", "HEAD"]) for _ in range(count)
    ]

    # expected values
    latencies = [2**n - n for n in nums]
    total_latency = sum(latencies)
    avg_latency = total_latency / count
    max_latency = max(latencies)

    for n, status, method in zip(nums, status_list, method_list):
        request = Request("https://example.org", method=method)
        mocked_time.return_value = n  # start_ts
        processed_request = middleware.process_request(request, spider)
        response = TextResponse(
            url="https://example.org",
            request=processed_request,
            body=json.dumps(
                {"headers": {}, "original_status": status, "body": "", "url": "http://"}
            ).encode("utf-8"),
        )
        mocked_time.return_value = 2**n  # end_ts
        middleware.process_response(processed_request, response, spider)

    middleware.spider_closed(spider, "finished")

    assert middleware.stats.get_value("crawlera_fetch/request_count") == count
    assert middleware.stats.get_value("crawlera_fetch/response_count") == count
    assert middleware.stats.get_value("crawlera_fetch/total_latency") == total_latency
    assert middleware.stats.get_value("crawlera_fetch/avg_latency") == avg_latency
    assert middleware.stats.get_value("crawlera_fetch/max_latency") == max_latency

    for status in set(status_list):
        sc = middleware.stats.get_value(
            "crawlera_fetch/response_status_count/{}".format(status)
        )
        assert sc == status_list.count(status)

    for method in set(method_list):
        mc = middleware.stats.get_value(
            "crawlera_fetch/request_method_count/{}".format(method)
        )
        assert mc == method_list.count(method)
def setUp(self):
    self.persist = False
    self.key_prefix = 'scrapy_redis:tests:'
    self.queue_key = self.key_prefix + '%(spider)s:requests'
    self.dupefilter_key = self.key_prefix + '%(spider)s:dupefilter'
    self.idle_before_close = 0
    self.scheduler = Scheduler(self.server, self.persist, self.queue_key,
                               SpiderQueue, self.dupefilter_key,
                               self.idle_before_close)
    self.spider = Spider('myspider')
def test_pyppeteer_request(self):
    def _test(response):
        self.assertIsInstance(response, Response)
        self.assertEqual(
            response.css("a::text").getall(), ["Lorem Ipsum", "Infinite Scroll"]
        )
        self.assertEqual(response.url, request.url)
        self.assertEqual(response.status, 200)
        self.assertIn("pyppeteer", response.flags)

    request = Request(self.server.urljoin("/index.html"), meta={"pyppeteer": True})
    return self.handler.download_request(request, Spider("foo")).addCallback(_test)
def test_callback_not_available(self):
    """Callback method is not available in the spider passed to from_dict"""
    spider = TestSpiderDelegation()
    r = Request("http://www.example.com", callback=spider.delegated_callback)
    d = r.to_dict(spider=spider)
    self.assertRaises(ValueError, request_from_dict, d, spider=Spider("foo"))
def _make_data(settings=None):
    crawler = Crawler(Spider, settings=settings)
    spider = Spider("dummy")
    return {
        "stats": crawler.stats.get_stats(),
        "crawler": crawler,
        "spider": spider,
        "runner": SpiderMonitorRunner(spider=spider),
        "job": None,
    }
def _crawler(extended_settings={}):
    settings = {
        "SPIDERMON_ENABLED": True,
        "EXTENSIONS": {"spidermon.contrib.scrapy.extensions.Spidermon": 500},
    }
    settings.update(extended_settings)
    crawler = Crawler(Spider, settings=settings)
    crawler.spider = Spider("dummy")
    return crawler
async def test_timeout(self):
    settings_dict = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 1000,
    }
    async with make_handler(settings_dict) as handler:
        with MockServer() as server:
            req = Request(server.urljoin("/delay/2"), meta={"playwright": True})
            with pytest.raises(TimeoutError):
                await handler._download_request(req, Spider("foo"))