def test_log_formatter_scrapy_1():
    """The custom log formatter should render 'crawled' messages showing the
    original URL alongside the (rewritten) request."""
    mw = get_test_middleware()
    log_fmt = CrawleraFetchLogFormatter()
    record_formatter = Formatter()
    for case in get_test_requests():
        request = case["original"]
        fake_response = Response(request.url)
        converted = mw.process_request(request, foo_spider)
        meta = request.meta.get("crawlera_fetch") or {}
        # Skipped requests pass through the middleware untouched.
        if meta.get("skip"):
            assert converted is None
            continue
        # crawled
        crawled_result = log_fmt.crawled(converted, fake_response, foo_spider)
        assert crawled_result["args"]["request"] == str(request)
        rec = LogRecord(name="logger", pathname="n/a", lineno=1, exc_info=None, **crawled_result)
        rendered = record_formatter.format(rec)
        expected = "Crawled (200) {request} ['original url: {url}'] (referer: None)".format(
            request=request, url=request.url)
        assert rendered == expected
def test_process_request_single_download_slot():
    """With the Single download-slot policy, every processed request must be
    assigned the shared ``__crawlera_fetch__`` download slot, while the rest
    of the request (url, method, headers, meta, JSON body) matches the
    expected fixture.
    """
    middleware = get_test_middleware(
        settings={
            "CRAWLERA_FETCH_DOWNLOAD_SLOT_POLICY": DownloadSlotPolicy.Single
        })
    for case in get_test_requests():
        original = case["original"]
        expected = case["expected"]
        if expected:
            expected.meta["download_slot"] = "__crawlera_fetch__"
        with shub_jobkey_env_variable():
            processed = middleware.process_request(original, foo_spider)
        # Fall back to an empty dict so the skip check cannot raise
        # AttributeError when the "crawlera_fetch" meta key is absent
        # (consistent with the log-formatter tests).
        crawlera_meta = original.meta.get("crawlera_fetch") or {}
        if crawlera_meta.get("skip"):
            assert processed is None
        else:
            assert type(processed) is type(expected)
            assert processed.url == expected.url
            assert processed.method == expected.method
            assert processed.headers == expected.headers
            assert processed.meta == expected.meta
            # Bodies are JSON; compare parsed objects so key order is irrelevant.
            processed_text = processed.body.decode(processed.encoding)
            expected_text = expected.body.decode(expected.encoding)
            assert json.loads(processed_text) == json.loads(expected_text)
def test_process_request_disabled():
    """When CRAWLERA_FETCH_ENABLED is False the middleware must be a no-op,
    returning None for every request."""
    mw = get_test_middleware(settings={"CRAWLERA_FETCH_ENABLED": False})
    for case in get_test_requests():
        req = case["original"]
        with shub_jobkey_env_variable():
            result = mw.process_request(req, foo_spider)
        assert result is None
def test_log_formatter_scrapy_2():
    """Exercise the log formatter's crawled, spider_error and download_error
    messages; each must reference the original (pre-rewrite) request."""
    mw = get_test_middleware()
    log_fmt = CrawleraFetchLogFormatter()
    record_formatter = Formatter()
    for case in get_test_requests():
        request = case["original"]
        fake_response = Response(request.url)
        converted = mw.process_request(request, foo_spider)
        meta = request.meta.get("crawlera_fetch") or {}
        # Skipped requests pass through the middleware untouched.
        if meta.get("skip"):
            assert converted is None
            continue
        # crawled
        crawled_result = log_fmt.crawled(converted, fake_response, foo_spider)
        assert crawled_result["args"]["request"] == str(request)
        rec = LogRecord(name="logger", pathname="n/a", lineno=1, exc_info=None, **crawled_result)
        assert record_formatter.format(rec) == "Crawled (200) %s (referer: None)" % str(request)
        # spider_error
        spider_err_result = log_fmt.spider_error(
            Failure(Exception("exc")), converted, fake_response, foo_spider)
        assert spider_err_result["args"]["request"] == str(request)
        rec = LogRecord(name="logger", pathname="n/a", lineno=1, exc_info=None, **spider_err_result)
        assert record_formatter.format(rec) == "Spider error processing %s (referer: None)" % str(
            request)
        # download_error
        download_err_result = log_fmt.download_error(
            Failure(Exception("exc")), converted, foo_spider, "error")
        assert download_err_result["args"]["request"] == str(request)
        rec = LogRecord(name="logger", pathname="n/a", lineno=2, exc_info=None, **download_err_result)
        assert record_formatter.format(rec) == "Error downloading %s: error" % str(request)
def test_process_request_default_args():
    """Values from CRAWLERA_FETCH_DEFAULT_ARGS must be merged into the JSON
    body of every non-skipped processed request."""
    middleware = get_test_middleware(
        settings={"CRAWLERA_FETCH_DEFAULT_ARGS": {"foo": "bar", "answer": "42"}}
    )
    for case in get_test_requests():
        original = case["original"]
        processed = middleware.process_request(original, foo_spider)
        # Fall back to an empty dict so the skip check cannot raise
        # AttributeError when the "crawlera_fetch" meta key is absent
        # (consistent with the log-formatter tests).
        crawlera_meta = original.meta.get("crawlera_fetch") or {}
        if crawlera_meta.get("skip"):
            assert processed is None
        else:
            processed_text = processed.body.decode(processed.encoding)
            processed_json = json.loads(processed_text)
            assert processed_json["foo"] == "bar"
            assert processed_json["answer"] == "42"
def test_process_request():
    """Processed requests must match the expected fixture in type, url,
    method, headers, meta and (JSON-parsed) body; skipped requests must pass
    through the middleware untouched.
    """
    middleware = get_test_middleware()
    for case in get_test_requests():
        original = case["original"]
        expected = case["expected"]
        with shub_jobkey_env_variable():
            processed = middleware.process_request(original, foo_spider)
        # Fall back to an empty dict so the skip check cannot raise
        # AttributeError when the "crawlera_fetch" meta key is absent
        # (consistent with the log-formatter tests).
        crawlera_meta = original.meta.get("crawlera_fetch") or {}
        if crawlera_meta.get("skip"):
            assert processed is None
        else:
            assert type(processed) is type(expected)
            assert processed.url == expected.url
            assert processed.method == expected.method
            assert processed.headers == expected.headers
            assert processed.meta == expected.meta
            # Bodies are JSON; compare parsed objects so key order is irrelevant.
            processed_text = processed.body.decode(processed.encoding)
            expected_text = expected.body.decode(expected.encoding)
            assert json.loads(processed_text) == json.loads(expected_text)