def test_process_response_disabled():
    """When CRAWLERA_FETCH_ENABLED is False, responses pass through untouched."""
    mw = get_test_middleware(settings={"CRAWLERA_FETCH_ENABLED": False})
    for case in test_responses:
        resp = case["original"]
        # The very same response object must be returned (identity, not equality).
        assert mw.process_response(resp.request, resp, foo_spider) is resp
def test_log_formatter_scrapy_1():
    """Crawled-line formatting under Scrapy 1.x: the original URL shows up as a
    request flag appended to the request repr."""
    middleware = get_test_middleware()
    logformatter = CrawleraFetchLogFormatter()
    formatter = Formatter()

    for case in get_test_requests():
        original = case["original"]
        response = Response(original.url)
        processed = middleware.process_request(original, foo_spider)

        # Skipped requests are not rewritten by the middleware.
        meta = original.meta.get("crawlera_fetch") or {}
        if meta.get("skip"):
            assert processed is None
            continue

        # crawled
        result = logformatter.crawled(processed, response, foo_spider)
        assert result["args"]["request"] == str(original)
        record = LogRecord(
            name="logger", pathname="n/a", lineno=1, exc_info=None, **result
        )
        expected = "Crawled (200) {request} ['original url: {url}'] (referer: None)".format(
            request=original, url=original.url
        )
        assert formatter.format(record) == expected
def test_process_request_disabled():
    """When CRAWLERA_FETCH_ENABLED is False, process_request leaves requests alone."""
    mw = get_test_middleware(settings={"CRAWLERA_FETCH_ENABLED": False})
    for case in get_test_requests():
        with shub_jobkey_env_variable():
            # None means "continue processing this request unchanged".
            assert mw.process_request(case["original"], foo_spider) is None
def test_process_request_single_download_slot():
    """With DownloadSlotPolicy.Single every processed request must be pinned to
    the shared "__crawlera_fetch__" download slot."""
    middleware = get_test_middleware(
        settings={"CRAWLERA_FETCH_DOWNLOAD_SLOT_POLICY": DownloadSlotPolicy.Single}
    )
    for case in get_test_requests():
        original = case["original"]
        expected = case["expected"]
        if expected:
            expected.meta["download_slot"] = "__crawlera_fetch__"
        with shub_jobkey_env_variable():
            processed = middleware.process_request(original, foo_spider)
        # "or {}" guards against the meta key being absent, so .get("skip")
        # cannot raise AttributeError (same pattern as the other tests here).
        crawlera_meta = original.meta.get("crawlera_fetch") or {}
        if crawlera_meta.get("skip"):
            assert processed is None
        else:
            assert type(processed) is type(expected)
            assert processed.url == expected.url
            assert processed.method == expected.method
            assert processed.headers == expected.headers
            assert processed.meta == expected.meta
            # Bodies are JSON payloads; compare structurally, not byte-wise.
            processed_text = processed.body.decode(processed.encoding)
            expected_text = expected.body.decode(expected.encoding)
            assert json.loads(processed_text) == json.loads(expected_text)
def test_process_request_scrapy_1():
    """Under Scrapy 1.x the middleware records the original URL in the
    request's flags list.

    The redundant function-local ``from tests.utils import get_test_middleware``
    was removed: the name is already available at module scope (every other
    test in this file uses it without a local import).
    """
    middleware = get_test_middleware()
    request = Request("https://example.org")
    with shub_jobkey_env_variable():
        processed = middleware.process_request(request, foo_spider)
        assert processed.flags == ["original url: https://example.org"]
def test_stats(mocked_time):
    """Exercise the middleware's latency, status-count and method-count stats
    bookkeeping over a batch of randomized requests."""
    middleware = get_test_middleware()
    spider = Spider("foo")

    sample_size = 100
    offsets = list(range(sample_size))
    random.shuffle(offsets)
    statuses = [random.randint(1, 15) for _ in range(sample_size)]
    methods = [
        random.choice(["GET", "POST", "PUT", "DELETE", "HEAD"])
        for _ in range(sample_size)
    ]

    # Expected aggregates: request n starts at t=n and finishes at t=2**n,
    # so its latency is 2**n - n.
    latencies = [2 ** n - n for n in offsets]
    total_latency = sum(latencies)
    avg_latency = total_latency / sample_size
    max_latency = max(latencies)

    for n, status, method in zip(offsets, statuses, methods):
        request = Request("https://example.org", method=method)
        mocked_time.return_value = n  # start_ts
        processed_request = middleware.process_request(request, spider)
        payload = {"headers": {}, "original_status": status, "body": "", "url": "http://"}
        response = TextResponse(
            url="https://example.org",
            request=processed_request,
            body=json.dumps(payload).encode("utf-8"),
        )
        mocked_time.return_value = 2 ** n  # end_ts
        middleware.process_response(processed_request, response, spider)

    middleware.spider_closed(spider, "finished")
    stats = middleware.stats

    assert stats.get_value("crawlera_fetch/request_count") == sample_size
    assert stats.get_value("crawlera_fetch/response_count") == sample_size
    assert stats.get_value("crawlera_fetch/total_latency") == total_latency
    assert stats.get_value("crawlera_fetch/avg_latency") == avg_latency
    assert stats.get_value("crawlera_fetch/max_latency") == max_latency

    for status in set(statuses):
        key = "crawlera_fetch/response_status_count/{}".format(status)
        assert stats.get_value(key) == statuses.count(status)

    for method in set(methods):
        key = "crawlera_fetch/request_method_count/{}".format(method)
        assert stats.get_value(key) == methods.count(method)
def test_log_formatter_scrapy_2():
    """Crawled / spider_error / download_error log lines under Scrapy 2.x all
    report the ORIGINAL request, not the rewritten API request."""
    middleware = get_test_middleware()
    logformatter = CrawleraFetchLogFormatter()
    formatter = Formatter()
    spider = Spider("foo")

    def render(result, lineno):
        # Turn a logformatter result dict into the final formatted string.
        record = LogRecord(
            name="logger", pathname="n/a", lineno=lineno, exc_info=None, **result
        )
        return formatter.format(record)

    for case in deepcopy(test_requests):
        original = case["original"]
        response = Response(original.url)
        processed = middleware.process_request(original, spider)

        meta = original.meta.get("crawlera_fetch") or {}
        if meta.get("skip"):
            assert processed is None
            continue

        # crawled
        result = logformatter.crawled(processed, response, spider)
        assert result["args"]["request"] == str(original)
        assert render(result, 1) == "Crawled (200) %s (referer: None)" % str(original)

        # spider_error
        result = logformatter.spider_error(
            Failure(Exception("exc")), processed, response, spider
        )
        assert result["args"]["request"] == str(original)
        assert render(result, 1) == "Spider error processing %s (referer: None)" % str(original)

        # download_error
        result = logformatter.download_error(
            Failure(Exception("exc")), processed, spider, "foo"
        )
        assert result["args"]["request"] == str(original)
        assert render(result, 2) == "Error downloading %s: foo" % str(original)
def test_process_request_default_args():
    """CRAWLERA_FETCH_DEFAULT_ARGS entries must be merged into the JSON body of
    every processed (non-skipped) request."""
    middleware = get_test_middleware(
        settings={"CRAWLERA_FETCH_DEFAULT_ARGS": {"foo": "bar", "answer": "42"}}
    )
    for case in get_test_requests():
        original = case["original"]
        processed = middleware.process_request(original, foo_spider)
        # "or {}" guards against the meta key being absent, so .get("skip")
        # cannot raise AttributeError (same pattern as the other tests here).
        crawlera_meta = original.meta.get("crawlera_fetch") or {}
        if crawlera_meta.get("skip"):
            assert processed is None
        else:
            payload = json.loads(processed.body.decode(processed.encoding))
            assert payload["foo"] == "bar"
            assert payload["answer"] == "42"
def test_process_response_skip():
    """A response whose request carries crawlera_fetch skip=True must be
    returned unmodified (identity-preserved)."""
    request = Request(
        url="https://example.org",
        meta={"crawlera_fetch": {"skip": True}},
    )
    response = TextResponse(
        url="https://example.org",
        status=200,
        headers={
            "Content-Encoding": "gzip",
            "Transfer-Encoding": "chunked",
            "Date": "Fri, 24 Apr 2020 18:06:42 GMT",
        },
        request=request,
        body=b"""<html></html>""",
    )
    middleware = get_test_middleware()
    assert middleware.process_response(response.request, response, Spider("foo")) is response
def test_process_request():
    """Each fixture request must be rewritten into the expected Fetch API
    request (URL, method, headers, meta and JSON body all match)."""
    middleware = get_test_middleware()
    for case in get_test_requests():
        original = case["original"]
        expected = case["expected"]
        with shub_jobkey_env_variable():
            processed = middleware.process_request(original, foo_spider)
        # "or {}" guards against the meta key being absent, so .get("skip")
        # cannot raise AttributeError (same pattern as the other tests here).
        crawlera_meta = original.meta.get("crawlera_fetch") or {}
        if crawlera_meta.get("skip"):
            assert processed is None
        else:
            assert type(processed) is type(expected)
            assert processed.url == expected.url
            assert processed.method == expected.method
            assert processed.headers == expected.headers
            assert processed.meta == expected.meta
            # Bodies are JSON payloads; compare structurally, not byte-wise.
            processed_text = processed.body.decode(processed.encoding)
            expected_text = expected.body.decode(expected.encoding)
            assert json.loads(processed_text) == json.loads(expected_text)
def test_process_response():
    """Decoded responses must match the fixtures, and the raw upstream payload
    (body/headers/status) must be exposed via the crawlera_fetch meta key."""
    middleware = get_test_middleware()
    for case in test_responses:
        original, expected = case["original"], case["expected"]
        processed = middleware.process_response(original.request, original, foo_spider)

        assert type(processed) is type(expected)
        assert processed.url == expected.url
        assert processed.status == expected.status
        assert processed.headers == expected.headers
        assert processed.body == expected.body

        meta = processed.meta.get("crawlera_fetch") or {}
        upstream = meta.get("upstream_response")
        if upstream:
            # The upstream_response mirrors the raw API response exactly.
            assert upstream["body"] == json.loads(original.text)
            assert upstream["headers"] == original.headers
            assert upstream["status"] == original.status
def test_process_response_error():
    """Fetch API errors must either raise CrawleraFetchException (when
    CRAWLERA_FETCH_RAISE_ON_ERROR is True) or be logged as warnings while the
    response passes through unchanged (when False); error stats are counted
    either way.
    """
    # NOTE(review): a later function in this module is also named
    # "test_process_response_error" — that definition shadows this one at
    # import time, so pytest never collects or runs this version. One of the
    # two should be renamed; confirm which Scrapy version each targets first.
    response_list = [
        # Case 1: error signalled via the X-Crawlera-Error response header.
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {"start_ts": mocked_time()},
                        "original_request": request_to_dict(
                            Request("https://example.org"),
                            spider=foo_spider,
                        ),
                    }
                },
            ),
            headers={
                "X-Crawlera-Error": "bad_proxy_auth",
                "Proxy-Authenticate": 'Basic realm="Crawlera"',
                "Content-Length": "0",
                "Date": "Mon, 04 May 2020 13:06:15 GMT",
                "Proxy-Connection": "close",
                "Connection": "close",
            },
        ),
        # Case 2: malformed JSON body -> JSONDecodeError while decoding.
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {"start_ts": mocked_time()},
                        "original_request": request_to_dict(
                            Request("https://example.org"),
                            spider=foo_spider,
                        ),
                    }
                },
            ),
            body=b'{"Bad": "JSON',
        ),
        # Case 3: well-formed payload reporting crawlera_error "serverbusy".
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {"start_ts": mocked_time()},
                        "original_request": request_to_dict(
                            Request("https://example.org"),
                            spider=foo_spider,
                        ),
                    }
                },
            ),
            body=json.dumps({
                "url": "https://example.org",
                "original_status": 503,
                "headers": {},
                "crawlera_status": "fail",
                "crawlera_error": "serverbusy",
                "body_encoding": "plain",
                "body": "Server busy: too many outstanding requests",
            }),
            encoding="utf8",
        ),
    ]

    # Raising mode: every error case must raise CrawleraFetchException.
    middleware_raise = get_test_middleware(
        settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": True})
    for response in response_list:
        with pytest.raises(CrawleraFetchException):
            middleware_raise.process_response(response.request, response,
                                              foo_spider)

    assert middleware_raise.stats.get_value(
        "crawlera_fetch/response_error") == 3
    assert middleware_raise.stats.get_value(
        "crawlera_fetch/response_error/bad_proxy_auth") == 1
    assert middleware_raise.stats.get_value(
        "crawlera_fetch/response_error/JSONDecodeError") == 1
    assert middleware_raise.stats.get_value(
        "crawlera_fetch/response_error/serverbusy") == 1

    # Logging mode: errors become WARNING records and the response is
    # returned unchanged (identity-preserved).
    middleware_log = get_test_middleware(
        settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": False})
    with LogCapture() as logs:
        for response in response_list:
            processed = middleware_log.process_response(
                response.request, response, foo_spider)
            assert response is processed

    logs.check_present(
        (
            "crawlera-fetch-middleware",
            "WARNING",
            "Error downloading <GET https://example.org> (status: 200, X-Crawlera-Error header: bad_proxy_auth)",  # noqa: E501
        ),
        (
            "crawlera-fetch-middleware",
            "WARNING",
            "Error decoding <GET https://example.org> (status: 200, message: Unterminated string starting at, lineno: 1, colno: 9)",  # noqa: E501
        ),
        (
            "crawlera-fetch-middleware",
            "WARNING",
            "Error downloading <GET https://example.org> (Original status: 503, Fetch API error message: Server busy: too many outstanding requests, Request ID: unknown)",  # noqa: E501
        ),
    )
    assert middleware_log.stats.get_value("crawlera_fetch/response_error") == 3
    assert middleware_log.stats.get_value(
        "crawlera_fetch/response_error/bad_proxy_auth") == 1
    assert middleware_log.stats.get_value(
        "crawlera_fetch/response_error/JSONDecodeError") == 1
    assert middleware_log.stats.get_value(
        "crawlera_fetch/response_error/serverbusy") == 1
def test_process_response_error():
    """Fetch API errors must either raise CrawleraFetchException (when
    CRAWLERA_FETCH_RAISE_ON_ERROR is True) or be logged as ERROR records while
    the response passes through unchanged (when False).
    """
    # NOTE(review): this function has the same name as an earlier one in this
    # module; being defined later, THIS definition shadows the earlier one, so
    # pytest only runs this version. The two differ (2 vs 3 error cases,
    # ERROR vs WARNING log level, dict vs request_to_dict original_request) —
    # presumably they target different middleware/Scrapy versions; one should
    # be renamed so both are collected. Confirm before changing.
    response_list = [
        # Case 1: error signalled via the X-Crawlera-Error response header.
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {"start_ts": mocked_time()},
                        "original_request": {
                            "url": "https://example.org",
                            "method": "GET"
                        },
                    }
                },
            ),
            headers={
                "X-Crawlera-Error": "bad_proxy_auth",
                "Proxy-Authenticate": 'Basic realm="Crawlera"',
                "Content-Length": "0",
                "Date": "Mon, 04 May 2020 13:06:15 GMT",
                "Proxy-Connection": "close",
                "Connection": "close",
            },
        ),
        # Case 2: malformed JSON body -> JSONDecodeError while decoding.
        TextResponse(
            url="https://crawlera.com/fake/api/endpoint",
            request=Request(
                url="https://crawlera.com/fake/api/endpoint",
                meta={
                    "crawlera_fetch": {
                        "timing": {"start_ts": mocked_time()},
                        "original_request": {
                            "url": "https://example.org",
                            "method": "GET"
                        },
                    }
                },
            ),
            body=b'{"Bad": "JSON',
        ),
    ]

    # Raising mode: every error case must raise CrawleraFetchException.
    middleware_raise = get_test_middleware(
        settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": True})
    for response in response_list:
        with pytest.raises(CrawleraFetchException):
            middleware_raise.process_response(response.request, response,
                                              Spider("foo"))

    assert middleware_raise.stats.get_value(
        "crawlera_fetch/response_error") == 2
    assert middleware_raise.stats.get_value(
        "crawlera_fetch/response_error/bad_proxy_auth") == 1
    assert middleware_raise.stats.get_value(
        "crawlera_fetch/response_error/JSONDecodeError") == 1

    # Logging mode: errors become ERROR records and the response is
    # returned unchanged (identity-preserved).
    middleware_log = get_test_middleware(
        settings={"CRAWLERA_FETCH_RAISE_ON_ERROR": False})
    with LogCapture() as logs:
        for response in response_list:
            processed = middleware_log.process_response(
                response.request, response, Spider("foo"))
            assert response is processed

    logs.check_present(
        (
            "crawlera-fetch-middleware",
            "ERROR",
            "Error downloading <GET https://example.org> (status: 200, X-Crawlera-Error header: bad_proxy_auth)",  # noqa: E501
        ),
        (
            "crawlera-fetch-middleware",
            "ERROR",
            "Error decoding <GET https://example.org> (status: 200, message: Unterminated string starting at, lineno: 1, colno: 9)",  # noqa: E501
        ),
    )
    assert middleware_log.stats.get_value("crawlera_fetch/response_error") == 2
    assert middleware_log.stats.get_value(
        "crawlera_fetch/response_error/bad_proxy_auth") == 1
    assert middleware_log.stats.get_value(
        "crawlera_fetch/response_error/JSONDecodeError") == 1