def test_middleware(self):
    m = PageActionsMiddleware()
    spider = Spider('test_spider')

    req = mkreq()
    spider.page_actions = [{"type": "click", "selector": "#showmore"}]
    m.process_request(req, spider)
    self.assertEqual(req.meta['splash']['endpoint'], 'execute')  # Page actions enabled

    req = mkreq()
    spider.page_actions = []
    m.process_request(req, spider)
    self.assertEqual(req.meta['splash']['endpoint'], 'render.html')  # Page actions disabled

    req = mkreq()
    spider.page_actions = [{"type": "click", "selector": "#showmore", "reject": "test\\.com"}]
    m.process_request(req, spider)
    self.assertEqual(req.meta['splash']['endpoint'], 'render.html')  # Page actions disabled
def __init__(self, *args, **kwargs):
    """Spider initialization."""
    Spider.__init__(self, *args, **kwargs)
    self.requests = []
    self.responses = []
def __init__(self, op, **kwargs):
    self.op = op
    self.reach_limit = False
    self.last_feed_updated_time = None
    self.make_sure_path_exists(self.get_output_dir_path())
    # TODO: why doesn't logging work in __init__?
    # self.log('Initializing spider...')
    Spider.__init__(self, self.name, **kwargs)
def __init__(self, txt_path=None, *args, **kwargs):
    Spider.__init__(self, *args, **kwargs)
    if not txt_path:
        txt_path = "%s%s%s" % (os.curdir, os.sep, self.name)
    if not os.path.exists(txt_path):
        os.mkdir(txt_path)
    self.txt_path = txt_path
def __init__(self, city_name, city_id, api, *args, **kwargs):
    self.api_key = str(api)
    self.city_id = city_id
    self.city_name = city_name
    self.base_url += city_id
    self.averages = {}
    self.top10_restaurants = {}
    self.db_manager = DBManager(self)
    Spider.__init__(self, *args, **kwargs)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
def __init__(self, url):
    """Set up the spider to start scraping from the given URL.

    URLs should be the first page of "Savvy Buys" for a supermarket and
    should be read from the app.cfg file. For multiple supermarkets, use
    multiple spiders.

    Keyword arguments:
    url -- a single URL to start from.
    """
    Spider.__init__(self)
    self.start_urls = [url]
def close(spider, reason):
    if reason == 'finished':
        spider.get_connector().log(spider.name, spider.ACTION_FINISHED)
    else:
        spider.get_connector().log(spider.name, spider.ACTION_UNEXPECTED_END, reason)
    return Spider.close(spider, reason)
def setUp(self):
    self.spider = Spider('default')
    self.mocked_hsref = mock.Mock()
    self.patch = mock.patch('sh_scrapy.hsref.hsref', self.mocked_hsref)
    self.crawler_mock = mock.Mock()
    self.crawler_mock.settings = Settings({
        'PAGE_STORAGE_ENABLED': True,
        'PAGE_STORAGE_MODE': 'VERSIONED_CACHE',
        'PAGE_STORAGE_LIMIT': 10,
        'PAGE_STORAGE_ON_ERROR_LIMIT': 5,
    })
    self.mocked_hsref.project.collections.url = '/test/url'
    self.patch.start()
    self.instance = PageStorageMiddleware.from_crawler(self.crawler_mock)
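# Added sketch (not in the original snippet): the mock.patch started in setUp above
# is assumed to need a matching stop so it does not leak into other tests; registering
# it via self.addCleanup(self.patch.stop) in setUp would work equally well.
def tearDown(self):
    self.patch.stop()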
def make_queue(redis_server, cls: type, slots=None, skip_cache=True,
               settings=None, hints=None) -> BaseRequestQueue:
    global logging_configured
    if not logging_configured:
        configure_logging(settings=settings)
        logging_configured = True
    crawler = Crawler(Spider, settings=settings)
    if slots is None:
        slots = {}
    spider = Spider.from_crawler(crawler, 'test_dd_spider')
    if hints:
        spider.hint_urls = hints
    return cls(server=redis_server, spider=spider, key=SCHEDULER_QUEUE_KEY,
               slots_mock=slots, skip_cache=skip_cache)
def test_pyppeteer_request(self):
    def _test(response):
        self.assertIsInstance(response, Response)
        self.assertEqual(
            response.css("a::text").getall(),
            ["Lorem Ipsum", "Infinite Scroll"])
        self.assertEqual(response.url, request.url)
        self.assertEqual(response.status, 200)
        self.assertIn("pyppeteer", response.flags)

    request = Request(self.server.urljoin("/index.html"), meta={"pyppeteer": True})
    return self.handler.download_request(request, Spider("foo")).addCallback(_test)
def test_hs_mware_process_spider_output_filter_request(hs_mware):
    response = Response('http://resp-url')
    # provide a response and a new request in result
    child_response = Response('http://resp-url-child')
    child_response.request = Request('http://resp-url-child-req')
    child_request = Request('http://req-url-child')
    hs_mware._seen = WeakKeyDictionary({response: 'riq'})
    result = list(hs_mware.process_spider_output(
        response, [child_response, child_request], Spider('test')))
    assert len(result) == 2
    # make sure that we update hsparent meta only for requests
    assert result[0].meta.get('_hsparent') is None
    assert result[1].meta['_hsparent'] == 'riq'
def start_requests(self):
    self.log('start request...')
    self.log('spider name: %s, allowed_domains: %s, op: %s' %
             (self.name, self.allowed_domains, self.op))
    self.set_pipeline_class()  # doesn't work currently.
    if self.is_content_op(self):
        self.start_urls = self.get_content_start_urls()
    elif self.is_feed_op(self):
        self.last_feed_updated_time = self.get_last_feed_updated_time()
        self.start_urls = self.get_feed_start_urls()
    else:
        self.log('*' * 60, log.ERROR)
        self.log('*** Value of "op" parameter is not supported: %s ' % self.op, log.ERROR)
        self.log('*' * 60, log.ERROR)
    self.log('start_urls: %s' % self.start_urls)
    return Spider.start_requests(self)
def open(self, spider: Spider) -> None:
    self.spider = spider
    try:
        self.queue = load_object(self.queue_cls)(
            server=self.server,
            spider=spider,
            key=self.queue_key % {'spider': spider.name},
            serializer=self.serializer,
        )
    except TypeError as e:
        raise ValueError("Failed to instantiate queue class '%s': %s"
                         % (self.queue_cls, e))
    self.df = load_object(self.dupefilter_cls).from_spider(spider)
    if self.flush_on_start:
        self.flush()
    # notice if there are requests already in the queue to resume the crawl
    if len(self.queue):
        spider.log("Resuming crawl (%d requests scheduled)" % len(self.queue))
def test_hs_mware_process_spider_input(hs_mware):
    response = Response('http://resp-url')
    response.request = Request('http://req-url')
    hs_mware.process_spider_input(response, Spider('test'))
    assert hs_mware.pipe_writer.write_request.call_count == 1
    args = hs_mware.pipe_writer.write_request.call_args[1]
    assert args == {
        'duration': 0,
        'fp': request_fingerprint(response.request),
        'method': 'GET',
        'parent': None,
        'rs': 0,
        'status': 200,
        'url': 'http://resp-url',
    }
    assert hs_mware._seen == WeakKeyDictionary({response: 0})
async def test_basic_response():
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()

    with StaticMockServer() as server:
        req = Request(server.urljoin("/index.html"), meta={"pyppeteer": True})
        resp = await handler._download_request(req, Spider("foo"))

    assert isinstance(resp, HtmlResponse)
    assert resp.request is req
    assert resp.url == req.url
    assert resp.status == 200
    assert "pyppeteer" in resp.flags
    assert resp.css("a::text").getall() == ["Lorem Ipsum", "Infinite Scroll"]

    await handler.browser.close()
async def test_page_coroutine_screenshot_pdf():
    def get_mimetype(file):
        return subprocess.run(
            ["file", "--mime-type", "--brief", file.name],
            stdout=subprocess.PIPE,
            universal_newlines=True,
        ).stdout.strip()

    png_file = NamedTemporaryFile(mode="w+b")
    pdf_file = NamedTemporaryFile(mode="w+b")
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()

    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/index.html"),
            meta={
                "pyppeteer": True,
                "pyppeteer_page_coroutines": {
                    "png": PageCoroutine("screenshot", options={"path": png_file.name, "type": "png"}),
                    "pdf": PageCoroutine("pdf", options={"path": pdf_file.name}),
                },
            },
        )
        await handler._download_request(req, Spider("foo"))

    assert get_mimetype(png_file) == "image/png"
    assert get_mimetype(pdf_file) == "application/pdf"

    png_file.file.seek(0)
    assert png_file.file.read() == req.meta["pyppeteer_page_coroutines"]["png"].result
    pdf_file.file.seek(0)
    assert pdf_file.file.read() == req.meta["pyppeteer_page_coroutines"]["pdf"].result

    png_file.close()
    pdf_file.close()
    await handler.browser.close()
async def test_timeout(self):
    handler = ScrapyPlaywrightDownloadHandler(
        get_crawler(
            settings_dict={
                "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
                "PLAYWRIGHT_DEFAULT_NAVIGATION_TIMEOUT": 1000,
            }
        )
    )
    await handler._launch_browser()

    with MockServer() as server:
        req = Request(server.urljoin("/index.html"), meta={"playwright": True})
        with pytest.raises(TimeoutError):
            await handler._download_request(req, Spider("foo"))

    await handler.browser.close()
async def test_post_request():
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()

    with PostMockServer() as server:
        req = FormRequest(server.urljoin("/"), meta={"pyppeteer": True}, formdata={"foo": "bar"})
        resp = await handler._download_request(req, Spider("foo"))

    assert resp.request is req
    assert resp.url == req.url
    assert resp.status == 200
    assert "pyppeteer" in resp.flags
    assert "Request body: foo=bar" in resp.text

    await handler.browser.close()
def test_hs_ext_attrs_item_scraped(hs_ext):
    try:
        import attr
        import itemadapter
    except ImportError:
        pytest.skip('attrs not installed')
        return

    @attr.s
    class AttrsItem(object):
        pass

    hs_ext._write_item = mock.Mock()
    item = AttrsItem()
    spider = Spider('test')
    hs_ext.item_scraped(item, spider)
    assert hs_ext._write_item.call_count == 1
    assert hs_ext._write_item.call_args[0] == ({'_type': 'AttrsItem'},)
async def test_post_request(self):
    handler = ScrapyPlaywrightDownloadHandler(
        get_crawler(settings_dict={"PLAYWRIGHT_BROWSER_TYPE": self.browser_type})
    )
    await handler._launch_browser()

    with MockServer() as server:
        req = FormRequest(
            server.urljoin("/"), meta={"playwright": True}, formdata={"foo": "bar"}
        )
        resp = await handler._download_request(req, Spider("foo"))

    assert resp.request is req
    assert resp.url == req.url
    assert resp.status == 200
    assert "playwright" in resp.flags
    assert "Request body: foo=bar" in resp.text

    await handler.browser.close()
async def test_page_coroutine_timeout():
    crawler = get_crawler(settings_dict={"PYPPETEER_NAVIGATION_TIMEOUT": 1000})
    handler = ScrapyPyppeteerDownloadHandler(crawler)
    await handler._launch_browser()

    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/index.html"),
            meta={
                "pyppeteer": True,
                "pyppeteer_page_coroutines": [NavigationPageCoroutine("click", selector="h1")],
            },
        )
        with pytest.raises(pyppeteer.errors.TimeoutError):
            await handler._download_request(req, Spider("foo"))

    await handler.browser.close()
async def test_page_to_callback():
    handler = ScrapyPyppeteerDownloadHandler(get_crawler())
    await handler._launch_browser()

    async def callback(self, response, page: pyppeteer.page.Page):
        pass

    with StaticMockServer() as server:
        req = Request(server.urljoin("/index.html"), callback, meta={"pyppeteer": True})
        resp = await handler._download_request(req, Spider("foo"))

    page = resp.request.cb_kwargs["page"]
    assert isinstance(page, pyppeteer.page.Page)
    assert (await page.title()) == "Awesome site"

    await page.close()
    await handler.browser.close()
def test_process_request_default_args():
    middleware = get_test_middleware(settings={
        "CRAWLERA_FETCH_DEFAULT_ARGS": {"foo": "bar", "answer": "42"}
    })

    for case in deepcopy(test_requests):
        original = case["original"]
        processed = middleware.process_request(original, Spider("foo"))

        crawlera_meta = original.meta.get("crawlera_fetch")
        if crawlera_meta.get("skip"):
            assert processed is None
        else:
            processed_text = processed.body.decode(processed.encoding)
            processed_json = json.loads(processed_text)
            assert processed_json["foo"] == "bar"
            assert processed_json["answer"] == "42"
async def test_basic_response(self):
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with StaticMockServer() as server:
            meta = {"playwright": True, "playwright_include_page": True}
            req = Request(server.urljoin("/index.html"), meta=meta)
            resp = await handler._download_request(req, Spider("foo"))

        assert isinstance(resp, HtmlResponse)
        assert resp.request is req
        assert resp.url == req.url
        assert resp.status == 200
        assert "playwright" in resp.flags
        assert resp.css("a::text").getall() == ["Lorem Ipsum", "Infinite Scroll"]
        assert isinstance(resp.meta["playwright_page"], PlaywrightPage)
        assert resp.meta["playwright_page"].url == resp.url

        await resp.meta["playwright_page"].close()
def test_process_response_skip():
    response = TextResponse(
        url="https://example.org",
        status=200,
        headers={
            "Content-Encoding": "gzip",
            "Transfer-Encoding": "chunked",
            "Date": "Fri, 24 Apr 2020 18:06:42 GMT",
        },
        request=Request(
            url="https://example.org",
            meta={"crawlera_fetch": {"skip": True}},
        ),
        body=b"""<html></html>""",
    )

    middleware = get_test_middleware()
    processed = middleware.process_response(response.request, response, Spider("foo"))
    assert response is processed
def open_spider(self, spider: scrapy.Spider):
    """Creates and initializes the output folders used to store the comment items."""
    try:
        os.mkdir('data')
        spider.log(' Directory data/ created', level=logging.INFO)
    except FileExistsError:
        spider.log(' Directory data/ already exists', level=logging.INFO)
    os.mkdir('data/{}-{}'.format(spider.name, self.key))
    spider.log(' Directory data/{}-{} created'.format(spider.name, self.key),
               level=logging.INFO)
    filename = 'data/{0}-{1}/part-{2:05d}.jl'.format(spider.name, self.key, self.file_index)
    self.file = open(filename, 'a')
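# Added sketch (not part of the original pipeline): a close_spider hook is assumed
# here so the part file opened in open_spider above is flushed and closed when the
# spider finishes; Scrapy calls this method on item pipelines automatically.
def close_spider(self, spider: scrapy.Spider):
    self.file.close()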
def test_process_request():
    middleware = get_test_middleware()

    for case in deepcopy(test_requests):
        original = case["original"]
        expected = case["expected"]

        with shub_jobkey_env_variable():
            processed = middleware.process_request(original, Spider("foo"))

        crawlera_meta = original.meta.get("crawlera_fetch")
        if crawlera_meta.get("skip"):
            assert processed is None
        else:
            assert type(processed) is type(expected)
            assert processed.url == expected.url
            assert processed.method == expected.method
            assert processed.headers == expected.headers
            assert processed.meta == expected.meta
            processed_text = processed.body.decode(processed.encoding)
            expected_text = expected.body.decode(expected.encoding)
            assert json.loads(processed_text) == json.loads(expected_text)
async def test_deprecated_setting(self):
    settings = {
        "PLAYWRIGHT_BROWSER_TYPE": self.browser_type,
        "PLAYWRIGHT_CONTEXT_ARGS": {
            "storage_state": {
                "cookies": [
                    {
                        "url": "https://example.org",
                        "name": "asdf",
                        "value": "qwerty",
                    },
                ],
            },
        },
    }
    with warnings.catch_warnings(record=True) as warning_list:
        async with make_handler(settings) as handler:
            assert warning_list[0].category is DeprecationWarning
            assert str(warning_list[0].message) == (
                "The PLAYWRIGHT_CONTEXT_ARGS setting is deprecated, please use"
                " PLAYWRIGHT_CONTEXTS instead. Keyword arguments defined in"
                " PLAYWRIGHT_CONTEXT_ARGS will be used when creating the 'default' context"
            )

            with StaticMockServer() as server:
                meta = {
                    "playwright": True,
                    "playwright_include_page": True,
                }
                req = Request(server.urljoin("/index.html"), meta=meta)
                resp = await handler._download_request(req, Spider("foo"))

            page = resp.meta["playwright_page"]
            storage_state = await page.context.storage_state()
            await page.close()
            cookie = storage_state["cookies"][0]
            assert cookie["name"] == "asdf"
            assert cookie["value"] == "qwerty"
            assert cookie["domain"] == "example.org"
async def test_default_page_coroutine_timeout():
    crawler = get_crawler(settings_dict={"PYPPETEER_PAGE_COROUTINE_TIMEOUT": 1000})
    handler = ScrapyPyppeteerDownloadHandler(crawler)
    await handler._launch_browser()

    with StaticMockServer() as server:
        req = Request(
            url=server.urljoin("/index.html"),
            meta={
                "pyppeteer": True,
                "pyppeteer_page_coroutines": [
                    NavigationPageCoroutine("waitForXPath", '//*[@id="test"]/test')
                ],
            },
        )
        with pytest.raises(pyppeteer.errors.TimeoutError):
            await handler._download_request(req, Spider("foo"))

    await handler.browser.close()
def run_test(self, **kwargs):
    dt = TestData(**kwargs)
    settings = {
        'SPIDERMON_ENABLED': True,
        'SPIDERMON_SPIDER_OPEN_EXPRESSION_MONITORS': [{
            'tests': [{
                'expression': dt.expression,
            }]
        }]
    }
    settings.update(dt.settings)
    crawler = get_crawler(settings_dict=settings)
    crawler.stats.get_stats = lambda _: dt.stats
    spidermon = Spidermon.from_crawler(crawler)
    spider = Spider(name=self.spider_name)

    # mocking, to see test results via raising AssertionError exception
    # with failures and errors as results
    spidermon._run_suites = partial(_test_run_suites, spidermon)

    try:
        spidermon.spider_opened(spider)
    except AssertionError as e:
        failures, errors = e.args[0]
        for f in failures:
            _, trace = f
            raise AssertionError(trace)
        for e in errors:
            _, trace = e
            if dt.expected_error and dt.expected_error in trace:
                dt.expected_error = None
            else:
                raise AssertionError(trace)

    if dt.expected_error:
        raise AssertionError(
            'Expected error <{}> was not raised'.format(dt.expected_error))
def test_save_response_with_trim(self):
    self.instance._writer.maxitemsize = 26
    self.instance.hsref.job.key = '123/45/67'
    resp = TextResponse(
        'http://resp',
        request=Request('http://req'),
        encoding='cp1251',
        body='\r\n\r\n<html><body></body></html>\r\n \0\0\0\0\0')
    with mock.patch.object(Spider, 'logger') as log:
        spider = Spider('default')
        self.instance.save_response(resp, self.spider)
        log.warning.assert_called_with(
            "Page not saved, body too large: <http://resp>")
    self.instance.trim_html = True
    self.instance.save_response(resp, spider)
    self.instance._writer.write.assert_called_with({
        u'body': u'<html><body></body></html>',
        u'_encoding': u'cp1251',
        u'_type': u'_pageitem',
        u'_key': u'9b4bed7e56103ddf63455ed39145f61f53b3c702',
        u'url': u'http://resp',
        '_jobid': '123/45/67',
    })
async def test_page_coroutine_navigation(self):
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with StaticMockServer() as server:
            req = Request(
                url=server.urljoin("/index.html"),
                meta={
                    "playwright": True,
                    "playwright_page_coroutines": [PageCoro("click", "a.lorem_ipsum")],
                },
            )
            resp = await handler._download_request(req, Spider("foo"))

        assert isinstance(resp, HtmlResponse)
        assert resp.request is req
        assert resp.url == server.urljoin("/lorem_ipsum.html")
        assert resp.status == 200
        assert "playwright" in resp.flags
        assert resp.css("title::text").get() == "Lorem Ipsum"
        text = resp.css("p::text").get()
        assert text == "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
async def test_page_coroutine_screenshot(self):
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with NamedTemporaryFile(mode="w+b") as png_file:
            with StaticMockServer() as server:
                req = Request(
                    url=server.urljoin("/index.html"),
                    meta={
                        "playwright": True,
                        "playwright_page_coroutines": {
                            "png": PageCoro("screenshot", path=png_file.name, type="png"),
                        },
                    },
                )
                await handler._download_request(req, Spider("foo"))

            png_file.file.seek(0)
            assert png_file.file.read() == req.meta["playwright_page_coroutines"]["png"].result
            assert get_mimetype(png_file) == "image/png"
async def test_page_coroutine_pdf(self):
    if self.browser_type != "chromium":
        pytest.skip("PDF generation is supported only in Chromium")

    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with NamedTemporaryFile(mode="w+b") as pdf_file:
            with StaticMockServer() as server:
                req = Request(
                    url=server.urljoin("/index.html"),
                    meta={
                        "playwright": True,
                        "playwright_page_coroutines": {
                            "pdf": PageCoro("pdf", path=pdf_file.name),
                        },
                    },
                )
                await handler._download_request(req, Spider("foo"))

            pdf_file.file.seek(0)
            assert pdf_file.file.read() == req.meta["playwright_page_coroutines"]["pdf"].result
            assert get_mimetype(pdf_file) == "application/pdf"
def process_response(self, request: Request, response: Response, spider: Spider):
    if 400 <= response.status < 500:
        meta = request.meta
        if meta.get('oauth_enabled'):
            query = urlparse(response.url).query
            query_params = dict(i.split('=') for i in query.split('&'))
            if response.status == 401:
                invalid_token_id = query_params.get('oauth_token')
                self.invalid_tokens_id.add(invalid_token_id)
                logger.info('one token invalid: {}'.format(invalid_token_id))
            query_info = {
                k: v for k, v in query_params.items()
                if not k.startswith('oauth')
            }
            r = spider.partial_form_request(formdata=query_info, meta=meta, dont_filter=True)
            return self._set_oauth(r)
    return response
def test_process_response():
    middleware = get_test_middleware()

    for case in test_responses:
        original = case["original"]
        expected = case["expected"]

        processed = middleware.process_response(original.request, original, Spider("foo"))
        assert type(processed) is type(expected)
        assert processed.url == expected.url
        assert processed.status == expected.status
        assert processed.headers == expected.headers
        assert processed.body == expected.body

        crawlera_meta = processed.meta.get("crawlera_fetch") or {}
        if crawlera_meta.get("upstream_response"):
            assert crawlera_meta["upstream_response"]["body"] == json.loads(original.text)
            assert crawlera_meta["upstream_response"]["headers"] == original.headers
            assert crawlera_meta["upstream_response"]["status"] == original.status
async def test_page_coroutine_infinite_scroll(self):
    async with make_handler({"PLAYWRIGHT_BROWSER_TYPE": self.browser_type}) as handler:
        with StaticMockServer() as server:
            req = Request(
                url=server.urljoin("/scroll.html"),
                headers={"User-Agent": "scrapy-playwright"},
                meta={
                    "playwright": True,
                    "playwright_page_coroutines": [
                        PageCoro("wait_for_selector", selector="div.quote"),
                        PageCoro("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
                        PageCoro("wait_for_selector", selector="div.quote:nth-child(11)"),
                        PageCoro("evaluate", "window.scrollBy(0, document.body.scrollHeight)"),
                        PageCoro("wait_for_selector", selector="div.quote:nth-child(21)"),
                    ],
                },
            )
            resp = await handler._download_request(req, Spider("foo"))

        assert isinstance(resp, HtmlResponse)
        assert resp.request is req
        assert resp.url == server.urljoin("/scroll.html")
        assert resp.status == 200
        assert "playwright" in resp.flags
        assert len(resp.css("div.quote")) == 30
def __del__(self):
    self.driver.close()
    Spider.__del__(self)
def __init__(self):
    Spider.__init__(self)
    self.driver = webdriver.PhantomJS()
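# Added sketch (not in the original snippets): relying on __del__ to shut down the
# webdriver, as in the two snippets above, is fragile because garbage-collection
# timing is not guaranteed and scrapy.Spider defines no __del__ to chain to. A common
# alternative is to quit the driver from Scrapy's spider_closed signal. The class
# name SeleniumSpider is hypothetical.
from scrapy import Spider, signals
from selenium import webdriver


class SeleniumSpider(Spider):
    name = "selenium_spider"

    def __init__(self, *args, **kwargs):
        Spider.__init__(self, *args, **kwargs)
        self.driver = webdriver.PhantomJS()

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super().from_crawler(crawler, *args, **kwargs)
        # Run spider_closed when the crawl ends, regardless of how it ends.
        crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
        return spider

    def spider_closed(self, spider):
        self.driver.quit()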