def test_post_request():
    """POST requests keep their method and body in the JSON sent to Splash.

    Exercised with both an empty and a non-empty request body.
    """
    middleware = _get_mw()
    for payload in (b'', b'foo=bar'):
        # A fresh meta dict is built for every request so that nothing is
        # shared between the two iterations.
        original = scrapy.Request(
            "http://example.com",
            method="POST",
            body=payload,
            meta={'splash': {'endpoint': 'render.html'}},
        )
        processed = middleware.process_request(original, None)

        sent_args = json.loads(to_native_str(processed.body))
        expected_args = {
            'url': 'http://example.com',
            'http_method': 'POST',
            'body': to_native_str(payload),
        }
        assert sent_args == expected_args
def __init__(self, url=None, callback=None, method='GET',
             endpoint='render', args=None, splash_url=None,
             slot_policy=SlotPolicy.PER_DOMAIN, splash_headers=None,
             dont_process_response=False, dont_send_headers=False,
             magic_response=True, session_id='default',
             http_status_from_error_code=True, cache_args=None,
             meta=None, **kwargs):
    """Build a request whose ``meta['splash']`` carries the Splash options.

    The keyword arguments are translated into entries of
    ``meta['splash']``. Entries written with ``setdefault`` (``endpoint``,
    ``slot_policy``, ``magic_response``, ``session_id``) do not override
    values already present in the supplied ``meta``; the remaining ones are
    assigned unconditionally when the corresponding argument is set.

    ``meta`` is deep-copied first, so the caller's dict is left untouched.
    A missing ``url`` becomes ``'about:blank'``.

    NOTE(review): the default ``endpoint`` here is ``'render'``; Splash's
    HTTP API names its rendering endpoints ``'render.html'``,
    ``'render.png'`` and so on -- confirm this default is intentional.
    """
    url = to_native_str('about:blank' if url is None else url)

    meta = copy.deepcopy(meta) or {}
    options = meta.setdefault('splash', {})
    options.setdefault('endpoint', endpoint)
    options.setdefault('slot_policy', slot_policy)

    if splash_url is not None:
        options['splash_url'] = splash_url
    if splash_headers is not None:
        options['splash_headers'] = splash_headers

    if not dont_process_response:
        options.setdefault('magic_response', magic_response)
    else:
        options['dont_process_response'] = True

    if dont_send_headers:
        options['dont_send_headers'] = True
    if http_status_from_error_code:
        options['http_status_from_error_code'] = True
    if cache_args is not None:
        options['cache_args'] = cache_args

    # A session id is only recorded for the 'execute' endpoint.
    if (session_id is not None
            and options['endpoint'].strip('/') == 'execute'):
        options.setdefault('session_id', session_id)

    # The URL goes into the Splash args so that its #fragment is preserved.
    # Precedence, lowest to highest: url, ``args``, meta['splash']['args'].
    merged_args = {'url': url}
    for extra in (args or {}, options.get('args', {})):
        merged_args.update(extra)
    options['args'] = merged_args

    # Not strictly required, but it strengthens Splash requests against
    # AjaxCrawlMiddleware.
    meta['ajax_crawlable'] = True

    super(SplashRequest, self).__init__(
        url, callback, method, meta=meta, **kwargs)
def test_url_with_fragment():
    """A ``url`` given in the Splash args keeps its ``#fragment`` in the body."""
    middleware = _get_mw()
    url = "http://example.com#id1"

    splash_options = {'args': {'url': url}}
    original = scrapy.Request("http://example.com",
                              meta={'splash': splash_options})
    processed = middleware.process_request(original, None)

    sent_args = json.loads(to_native_str(processed.body))
    assert sent_args == {'url': url}
def __init__(self, url=None, callback=None, method='GET',
             endpoint='render.html', args=None, splash_url=None,
             slot_policy=SlotPolicy.PER_DOMAIN, splash_headers=None,
             dont_process_response=False, dont_send_headers=False,
             magic_response=True, session_id='default',
             http_status_from_error_code=True, cache_args=None,
             meta=None, **kwargs):
    """Build a request whose ``meta['splash']`` carries the Splash options.

    The caller's ``meta`` is deep-copied before anything is written to it,
    so a meta dict (and its nested ``'splash'`` dict) can safely be reused
    for several requests without options leaking from one to the next.

    :param url: target URL; ``None`` is replaced with ``'about:blank'``.
    :param callback: passed through to the parent constructor.
    :param method: passed through to the parent constructor.
    :param endpoint: stored as ``meta['splash']['endpoint']`` unless the
        supplied meta already defines one.
    :param args: extra Splash arguments; overridden by any
        ``meta['splash']['args']`` entries with the same key.
    :param splash_url: stored as ``splash_url`` when not ``None``.
    :param slot_policy: stored as ``slot_policy`` unless already in meta.
    :param splash_headers: stored as ``splash_headers`` when not ``None``.
    :param dont_process_response: when true, sets ``dont_process_response``
        and skips the ``magic_response`` default.
    :param dont_send_headers: when true, sets ``dont_send_headers``.
    :param magic_response: default for ``magic_response`` (only applied
        when ``dont_process_response`` is false).
    :param session_id: default ``session_id``; only recorded when the
        endpoint is ``execute``.
    :param http_status_from_error_code: when true, sets the option of the
        same name.
    :param cache_args: stored as ``cache_args`` when not ``None``.
    :param meta: request meta; copied, never modified in place.
    :param kwargs: forwarded to the parent constructor.
    """
    # Function-scope import keeps this block self-contained.
    import copy

    if url is None:
        url = 'about:blank'
    url = to_native_str(url)

    # Work on a private deep copy: the code below mutates both ``meta`` and
    # the nested ``meta['splash']`` dict, which previously modified the
    # dict object owned by the caller.
    meta = copy.deepcopy(meta) if meta else {}
    splash_meta = meta.setdefault('splash', {})
    splash_meta.setdefault('endpoint', endpoint)
    splash_meta.setdefault('slot_policy', slot_policy)

    if splash_url is not None:
        splash_meta['splash_url'] = splash_url
    if splash_headers is not None:
        splash_meta['splash_headers'] = splash_headers

    if dont_process_response:
        splash_meta['dont_process_response'] = True
    else:
        splash_meta.setdefault('magic_response', magic_response)

    if dont_send_headers:
        splash_meta['dont_send_headers'] = True
    if http_status_from_error_code:
        splash_meta['http_status_from_error_code'] = True
    if cache_args is not None:
        splash_meta['cache_args'] = cache_args

    # A session id is only recorded for the 'execute' endpoint.
    if (session_id is not None
            and splash_meta['endpoint'].strip('/') == 'execute'):
        splash_meta.setdefault('session_id', session_id)

    # Put the URL into the args in order to preserve its #fragment.
    # Precedence, lowest to highest: url, ``args``, meta['splash']['args'].
    _args = {'url': url}
    _args.update(args or {})
    _args.update(splash_meta.get('args', {}))
    splash_meta['args'] = _args

    # This is not strictly required, but it strengthens Splash
    # requests against AjaxCrawlMiddleware.
    meta['ajax_crawlable'] = True

    super(SplashRequest, self).__init__(url, callback, method, meta=meta,
                                        **kwargs)
def test_float_wait_arg():
    """A float ``wait`` argument reaches the JSON body as the same float."""
    middleware = _get_mw()

    splash_options = {
        'endpoint': 'render.html',
        'args': {'wait': 0.5},
    }
    original = scrapy.Request("http://example.com",
                              meta={'splash': splash_options})
    processed = middleware.process_request(original, None)

    sent_args = json.loads(to_native_str(processed.body))
    assert sent_args == {'url': original.url, 'wait': 0.5}
def test_override_splash_url():
    """A per-request ``splash_url`` decides where the request is sent.

    The processed request must target that URL joined with the endpoint,
    while the JSON body still names the original URL.
    """
    middleware = _get_mw()

    splash_options = {
        'endpoint': 'render.png',
        'splash_url': 'http://splash.example.com',
    }
    original = scrapy.Request("http://example.com",
                              meta={'splash': splash_options})
    processed = middleware.process_request(original, None)

    assert processed.url == 'http://splash.example.com/render.png'
    sent_args = json.loads(to_native_str(processed.body))
    assert sent_args == {'url': original.url}
def test_splash_request_no_url():
    """A SplashRequest built without a URL is sent with ``about:blank``."""
    middleware = _get_mw()
    lua_source = "function main(splash) return {result='ok'} end"

    splash_options = {
        'args': {'lua_source': lua_source},
        'endpoint': 'execute',
    }
    original = SplashRequest(meta={'splash': splash_options})
    processed = middleware.process_request(original, None)

    assert processed.url == 'http://127.0.0.1:8050/execute'
    sent_args = json.loads(to_native_str(processed.body))
    expected_args = {
        'url': 'about:blank',
        'lua_source': lua_source
    }
    assert sent_args == expected_args
def test_splash_request():
    """Round-trip a default SplashRequest through both middlewares.

    Covers three stages: the outgoing request is rewritten into a POST to
    the Splash endpoint, the response comes back as a SplashTextResponse
    exposing the original URL, and ``.replace()`` keeps type and fields.
    """
    splash_mw = _get_mw()
    cookies_mw = _get_cookie_mw()
    splash_endpoint = "http://127.0.0.1:8050/render.html"

    original = SplashRequest("http://example.com?foo=bar&url=1&wait=100")
    assert repr(original) == "<GET http://example.com?foo=bar&url=1&wait=100>"

    # Request side: each middleware may return None, in which case the
    # request it was given is carried forward.
    outgoing = cookies_mw.process_request(original, None) or original
    outgoing = splash_mw.process_request(outgoing, None) or outgoing
    assert outgoing is not None
    assert outgoing is not original
    assert outgoing.url == splash_endpoint
    assert outgoing.headers == {b'Content-Type': [b'application/json']}
    assert outgoing.method == 'POST'
    assert isinstance(outgoing, SplashRequest)
    outgoing_repr = repr(outgoing)
    assert outgoing_repr == "<GET http://example.com?foo=bar&url=1&wait=100 via http://127.0.0.1:8050/render.html>"

    expected_body = {'url': original.url}
    assert json.loads(to_native_str(outgoing.body)) == expected_body

    # Response side. Scrapy doesn't pass the request to the constructor,
    # so no ``request=`` argument is given here.
    raw_response = TextResponse(
        splash_endpoint,
        headers={b'Content-Type': b'text/html'},
        body=b"<html><body>Hello</body></html>",
    )
    processed = splash_mw.process_response(outgoing, raw_response, None)
    processed = cookies_mw.process_response(outgoing, processed, None)
    assert isinstance(processed, scrapy_splash.SplashTextResponse)
    assert processed is not raw_response
    assert processed.real_url == outgoing.url
    assert processed.url == original.url
    assert processed.body == b"<html><body>Hello</body></html>"
    assert processed.css("body").extract_first() == "<body>Hello</body>"
    assert processed.headers == {b'Content-Type': [b'text/html']}

    # .replace() must keep the response class and every untouched field.
    replaced = processed.replace(status=404)
    assert replaced.status == 404
    assert isinstance(replaced, scrapy_splash.SplashTextResponse)
    for field in ('url', 'real_url', 'headers', 'body'):
        assert getattr(replaced, field) == getattr(processed, field)
def test_splash_request():
    """Check request rewriting and response wrapping for a SplashRequest.

    A default SplashRequest goes through the cookie and Splash middlewares;
    the resulting request, the post-processed response and a ``.replace()``
    copy of that response are each verified.
    """
    mw = _get_mw()
    cookie_mw = _get_cookie_mw()
    render_url = "http://127.0.0.1:8050/render.html"
    html = b"<html><body>Hello</body></html>"

    request = SplashRequest("http://example.com?foo=bar&url=1&wait=100")
    assert repr(request) == "<GET http://example.com?foo=bar&url=1&wait=100>"

    # --- request preprocessing ---
    # A middleware returning None means "unchanged", so fall back to the
    # request that was passed in.
    sent = cookie_mw.process_request(request, None) or request
    sent = mw.process_request(sent, None) or sent
    assert sent is not None
    assert sent is not request
    assert sent.url == render_url
    assert sent.headers == {b'Content-Type': [b'application/json']}
    assert sent.method == 'POST'
    assert isinstance(sent, SplashRequest)
    assert repr(sent) == (
        "<GET http://example.com?foo=bar&url=1&wait=100 via http://127.0.0.1:8050/render.html>"
    )

    expected_body = {'url': request.url}
    decoded_body = json.loads(to_native_str(sent.body))
    assert decoded_body == expected_body

    # --- response post-processing ---
    # Scrapy doesn't pass the request to the response constructor, hence
    # no ``request=`` keyword below.
    response = TextResponse(render_url,
                            headers={b'Content-Type': b'text/html'},
                            body=html)
    wrapped = mw.process_response(sent, response, None)
    wrapped = cookie_mw.process_response(sent, wrapped, None)
    assert isinstance(wrapped, scrapy_splash.SplashTextResponse)
    assert wrapped is not response
    assert wrapped.real_url == sent.url
    assert wrapped.url == request.url
    assert wrapped.body == html
    assert wrapped.css("body").extract_first() == "<body>Hello</body>"
    assert wrapped.headers == {b'Content-Type': [b'text/html']}

    # --- .replace method ---
    not_found = wrapped.replace(status=404)
    assert not_found.status == 404
    assert isinstance(not_found, scrapy_splash.SplashTextResponse)
    for name in ('url', 'real_url', 'headers', 'body'):
        assert getattr(not_found, name) == getattr(wrapped, name)
def test_splash_request_url_with_fragment():
    """A SplashRequest URL keeps its ``#fragment`` in the body sent to Splash."""
    middleware = _get_mw()
    url = "http://example.com#id1"

    original = SplashRequest(url)
    processed = middleware.process_request(original, None)

    sent_args = json.loads(to_native_str(processed.body))
    assert sent_args == {'url': url}