def test_random_user_agent(self):
    mw = UserAgentMiddleware.from_crawler(Crawler(random_user_agent=True))
    req = HttpRequest('http://example.com', headers={})
    req2 = HttpRequest('http://example.com', headers={})
    mw.handle_request(req)
    mw.handle_request(req2)
    assert 'User-Agent' in req.headers
    assert req.headers.get('User-Agent') != req2.headers.get('User-Agent')
    assert 'Chrome' in req.headers.get('User-Agent')

def test_copy_http_request():
    req = HttpRequest('http://example.com/', params={'key': 'value'}, meta={'depth': 0})
    copy_req = req.copy()
    assert copy_req.url == req.url
    params = get_params_in_url(copy_req.url)
    assert 'key' in params and params['key'] == ['value']
    assert 'depth' in copy_req.meta and copy_req.meta['depth'] == 0
    req.meta['depth'] = 1
    assert req.meta['depth'] == 1 and copy_req.meta['depth'] == 0

async def test_allow_redirects():
    downloader = Downloader()
    resp = await downloader.fetch(
        HttpRequest(make_url('http://httpbin.org/redirect-to', params={'url': 'http://python.org'})))
    assert resp.status // 100 == 2 and 'python.org' in resp.url
    with pytest.raises(HttpError) as e:
        await downloader.fetch(
            HttpRequest(make_url('http://httpbin.org/redirect-to', params={'url': 'http://python.org'}),
                        allow_redirects=False))
    assert e.value.response.status // 100 == 3

def test_gen_user_agent(self):
    mw = UserAgentMiddleware.from_crawler(Crawler(user_agent=':desktop,chrome'))
    req = HttpRequest('http://example.com', headers={})
    mw.handle_request(req)
    assert 'Chrome' in req.headers.get('User-Agent')
    mw2 = UserAgentMiddleware.from_crawler(Crawler(user_agent=':mobile,chrome'))
    req2 = HttpRequest('http://example.com', headers={})
    mw2.handle_request(req2)
    assert 'CriOS' in req2.headers.get('User-Agent') and 'Mobile' in req2.headers.get('User-Agent')

async def test_lifo_queue():
    q = LifoQueue()
    with pytest.raises(asyncio.TimeoutError):
        with async_timeout.timeout(0.1):
            await q.pop()
    obj_list = [HttpRequest('1'), HttpRequest('2'), HttpRequest('3')]
    for o in obj_list:
        await q.push(o)
    for i in range(len(obj_list)):
        assert await q.pop() == obj_list[len(obj_list) - i - 1]
    with pytest.raises(asyncio.TimeoutError):
        with async_timeout.timeout(0.1):
            await q.pop()

def test_clear(self):
    f = HashDupeFilter()
    r_get = HttpRequest("http://example.com")
    assert f.is_duplicated(r_get) is False
    assert f.is_duplicated(r_get) is True
    f.clear()
    assert f.is_duplicated(r_get) is False

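# A minimal sketch of the duplicate-filter contract exercised above: the
# first sighting of a request returns False, repeats return True, and
# clear() forgets everything. Illustrative only, with an assumed
# fingerprint; it is not xpaw's actual HashDupeFilter.
import hashlib

class SketchHashDupeFilter:
    def __init__(self):
        self._seen = set()

    def is_duplicated(self, request):
        if getattr(request, 'dont_filter', False):
            return False  # dont_filter requests are never treated as duplicates
        fp = hashlib.sha1('{} {}'.format(request.method, request.url).encode()).hexdigest()
        if fp in self._seen:
            return True
        self._seen.add(fp)
        return False

    def clear(self):
        self._seen.clear()
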
async def post_bytes():
    bytes_data = 'bytes data: 字节数据'
    resp = await downloader.fetch(
        HttpRequest('http://httpbin.org/post', 'POST', body=bytes_data.encode(),
                    headers={'Content-Type': 'text/plain'}))
    assert resp.status == 200
    body = json.loads(resp.text)['data']
    assert body == bytes_data

async def post_str():
    str_data = 'str data: 字符串数据'
    resp = await downloader.fetch(
        HttpRequest('http://httpbin.org/post', 'POST', body=str_data,
                    headers={'Content-Type': 'text/plain'}))
    assert resp.status == 200
    body = json.loads(resp.text)['data']
    assert body == str_data

def test_handle_request(self):
    default_headers = {"User-Agent": "xpaw", "Connection": "keep-alive"}
    mw = DefaultHeadersMiddleware.from_crawler(Crawler(default_headers=default_headers))
    req = HttpRequest("http://example.com", headers={"Connection": "close"})
    mw.handle_request(req)
    assert req.headers == {"User-Agent": "xpaw", "Connection": "close"}

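# What the assertion above implies, sketched: defaults are merged with
# setdefault semantics, so a header already set on the request
# (Connection: close) beats the configured default. Assumed behavior for
# illustration, not DefaultHeadersMiddleware's actual code.
class SketchDefaultHeadersMiddleware:
    def __init__(self, default_headers=None):
        self._default_headers = default_headers or {}

    def handle_request(self, request):
        for name, value in self._default_headers.items():
            request.headers.setdefault(name, value)
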
async def test_headers():
    downloader = Downloader()
    headers = {'User-Agent': 'xpaw'}
    resp = await downloader.fetch(HttpRequest("http://httpbin.org/get", headers=headers))
    assert resp.status == 200
    data = json.loads(resp.text)['headers']
    assert 'User-Agent' in data and data['User-Agent'] == 'xpaw'

def test_handle_error(self):
    mw = RetryMiddleware.from_crawler(Crawler())
    req = HttpRequest("http://example.com")
    err = ValueError()
    assert mw.handle_error(req, err) is None
    for err in [ClientError()]:
        retry_req = mw.handle_error(req, err)
        assert isinstance(retry_req, HttpRequest) and str(retry_req.url) == str(req.url)

def test_random_user_agent2(self):
    mw = UserAgentMiddleware.from_crawler(Crawler(user_agent=':mobile', random_user_agent=True))
    for i in range(30):
        req = HttpRequest('http://example.com', headers={})
        mw.handle_request(req)
        assert 'User-Agent' in req.headers
        assert 'CriOS' in req.headers.get('User-Agent') and 'Mobile' in req.headers.get('User-Agent')

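# A rough sketch of what the user-agent tests imply: a ':device,browser'
# spec selects a UA family (desktop Chrome vs. mobile CriOS), and
# random_user_agent=True draws a fresh string per request. The UA strings
# below are placeholders, not whatever the middleware actually ships.
import random

SKETCH_CHROME_DESKTOP = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/10{}.0.0.0 Safari/537.36'.format(i)
    for i in range(10)
]
SKETCH_CHROME_MOBILE = [
    'Mozilla/5.0 (iPhone; CPU iPhone OS 15_0 like Mac OS X) AppleWebKit/605.1.15 '
    '(KHTML, like Gecko) CriOS/10{}.0.0.0 Mobile/15E148 Safari/604.1'.format(i)
    for i in range(10)
]

class SketchUserAgentMiddleware:
    def __init__(self, device='desktop', random_user_agent=False):
        self._pool = SKETCH_CHROME_MOBILE if device == 'mobile' else SKETCH_CHROME_DESKTOP
        self._random = random_user_agent

    def handle_request(self, request):
        ua = random.choice(self._pool) if self._random else self._pool[0]
        request.headers['User-Agent'] = ua
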
def test_http_request_to_dict():
    headers = HttpHeaders()
    headers.add('Set-Cookie', 'a=b')
    headers.add('Set-Cookie', 'c=d')
    req = HttpRequest('http://example.com/', 'POST', body=b'body', headers=headers)
    d = req.to_dict()
    assert d['url'] == 'http://example.com/'
    assert d['method'] == 'POST'
    assert d['body'] == b'body'
    assert d['headers'] == headers
    req2 = HttpRequest.from_dict(d)
    assert req.url == req2.url
    assert req.method == req2.method
    assert req.body == req2.body
    assert req.headers == req2.headers

def test_proxy_dict(self):
    proxy_dict = {'http': '127.0.0.1:3128', 'https': '127.0.0.1:3129'}
    mw = ProxyMiddleware.from_crawler(Crawler(proxy=proxy_dict))
    req_list = [HttpRequest(u) for u in ('http://example.com', 'https://example.com')]
    res = ['127.0.0.1:3128', '127.0.0.1:3129']
    for req, expected in zip(req_list, res):
        mw.handle_request(req)
        assert req.proxy == expected

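# Sketch of the proxy selection the dict and string configurations rely
# on: a dict maps the request's URL scheme to a proxy, while a plain
# string applies to every request. Assumed logic, not ProxyMiddleware's
# actual implementation.
from urllib.parse import urlparse

class SketchProxyMiddleware:
    def __init__(self, proxy):
        self._proxy = proxy

    def handle_request(self, request):
        if isinstance(self._proxy, dict):
            selected = self._proxy.get(urlparse(request.url).scheme)
        else:
            selected = self._proxy
        if selected:
            request.proxy = selected
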
def test_retry(self):
    max_retry_times = 2
    mw = RetryMiddleware.from_crawler(
        Crawler(max_retry_times=max_retry_times, retry_http_status=(500,)))
    req = HttpRequest("http://example.com")
    for i in range(max_retry_times):
        retry_req = mw.retry(req, "")
        assert isinstance(retry_req, HttpRequest) and str(retry_req.url) == str(req.url)
        req = retry_req
    assert mw.retry(req, "") is None

async def test_priority_queue():
    item1_1 = HttpRequest('1_1', priority=1)
    item1_2 = HttpRequest('1_2', priority=1)
    item2_1 = HttpRequest('2_1', priority=2)
    item2_2 = HttpRequest('2_2', priority=2)
    item3_1 = HttpRequest('3_1', priority=3)
    item3_2 = HttpRequest('3_2', priority=3)
    q = PriorityQueue()
    with pytest.raises(asyncio.TimeoutError):
        with async_timeout.timeout(0.1):
            await q.pop()
    await q.push(item2_1)
    await q.push(item1_1)
    await q.push(item3_1)
    await q.push(item1_2)
    await q.push(item2_2)
    await q.push(item3_2)
    assert await q.pop() is item3_1
    assert await q.pop() is item3_2
    assert await q.pop() is item2_1
    assert await q.pop() is item2_2
    assert await q.pop() is item1_1
    assert await q.pop() is item1_2
    with pytest.raises(asyncio.TimeoutError):
        with async_timeout.timeout(0.1):
            await q.pop()

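# A sketch of a queue satisfying the ordering asserted above: higher
# priority pops first, FIFO within a priority level, and pop() blocks
# while the queue is empty (hence the timeouts). Illustrative; xpaw's
# PriorityQueue may be implemented differently.
import asyncio
import heapq
import itertools

class SketchPriorityQueue:
    def __init__(self):
        self._heap = []
        self._counter = itertools.count()  # tie-breaker keeps FIFO order
        self._not_empty = asyncio.Event()

    async def push(self, request):
        # Negate priority so the largest priority is popped first.
        heapq.heappush(self._heap, (-request.priority, next(self._counter), request))
        self._not_empty.set()

    async def pop(self):
        while not self._heap:
            self._not_empty.clear()
            await self._not_empty.wait()
        return heapq.heappop(self._heap)[2]
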
def test_make_requests():
    requests = [
        None,
        'http://unknonw',
        'http://python.org/',
        HttpRequest('http://python.org'),
        'http://httpbin.org/status/404',
    ]
    results = make_requests(requests, log_level='DEBUG')
    assert len(results) == len(requests)
    assert results[0] is None
    assert isinstance(results[1], ClientError)
    assert isinstance(results[2], HttpResponse) and results[2].status == 200
    assert isinstance(results[3], HttpResponse) and results[3].status == 200
    assert isinstance(results[4], HttpError) and results[4].response.status == 404

def test_handle_spider_output(self):
    class R:
        def __init__(self, depth=None):
            self.meta = {}
            if depth is not None:
                self.meta['depth'] = depth

    mw = DepthMiddleware.from_crawler(Crawler(max_depth=1))
    req = HttpRequest("http://python.org/", "GET")
    item = Item()
    res = [i for i in mw.handle_spider_output(R(), [req, item])]
    assert res == [req, item] and req.meta['depth'] == 1
    res = [i for i in mw.handle_spider_output(R(0), [req, item])]
    assert res == [req, item] and req.meta['depth'] == 1
    res = [i for i in mw.handle_spider_output(R(1), [req, item])]
    assert res == [item] and req.meta['depth'] == 2

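# Sketch of the depth bookkeeping asserted above: a yielded request gets
# depth = parent depth + 1 and is dropped once it exceeds max_depth, while
# items always pass through. Assumed logic (reusing the module's
# HttpRequest), not DepthMiddleware's actual code.
class SketchDepthMiddleware:
    def __init__(self, max_depth=None):
        self._max_depth = max_depth

    def handle_spider_output(self, response, result):
        depth = response.meta.get('depth', 0)
        for obj in result:
            if isinstance(obj, HttpRequest):
                obj.meta['depth'] = depth + 1
                if self._max_depth is not None and obj.meta['depth'] > self._max_depth:
                    continue  # beyond the limit: drop the request, keep items flowing
            yield obj
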
def start_requests(self):
    yield HttpRequest("http://unknown/", errback=self.error_back)
    yield HttpRequest("http://unknown/", dont_filter=True, errback=self.async_error_back)
    yield HttpRequest("http://{}/error".format(self.server_address), errback=self.handle_request_error)
    yield HttpRequest("http://{}/".format(self.server_address), dont_filter=True)
    yield HttpRequest("http://{}/".format(self.server_address), dont_filter=True, callback=self.generator_parse)
    yield HttpRequest("http://{}/".format(self.server_address), dont_filter=True, callback=self.func_prase)
    yield HttpRequest("http://{}/".format(self.server_address), dont_filter=True, callback=self.async_parse)
    yield HttpRequest("http://{}/".format(self.server_address), dont_filter=True, callback=self.return_list_parse)
    yield HttpRequest("http://{}/".format(self.server_address), dont_filter=True, callback=self.return_none_parse)

def test_handle_response(self):
    mw = RetryMiddleware.from_crawler(
        Crawler(retry_http_status=(500,), max_retry_times=3))
    req = HttpRequest("http://example.com")
    resp = HttpResponse("http://example.com", 502)
    assert mw.handle_response(req, resp) is None
    req2 = HttpRequest("http://example.com")
    resp2 = HttpResponse("http://example.com", 500)
    retry_req2 = mw.handle_response(req2, resp2)
    assert retry_req2.meta['retry_times'] == 1
    assert str(retry_req2.url) == str(req2.url)
    req3 = HttpRequest("http://example.com")
    resp3 = HttpResponse("http://example.com", 500)
    req3.meta['retry_times'] = 2
    retry_req3 = mw.handle_response(req3, resp3)
    assert retry_req3.meta['retry_times'] == 3
    assert str(retry_req3.url) == str(req3.url)
    req4 = HttpRequest("http://example.com")
    req4.meta['retry_times'] = 3
    resp4 = HttpResponse("http://example.com", 500)
    assert mw.handle_response(req4, resp4) is None

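# The retry bookkeeping these tests exercise, sketched: a status listed in
# retry_http_status yields a retried copy of the request until
# meta['retry_times'] reaches max_retry_times, after which None is
# returned. Assumed shape, not RetryMiddleware's actual implementation.
class SketchRetryMiddleware:
    def __init__(self, max_retry_times, retry_http_status):
        self._max_retry_times = max_retry_times
        self._retry_http_status = retry_http_status

    def handle_response(self, request, response):
        if response.status in self._retry_http_status:
            return self.retry(request, 'http status {}'.format(response.status))

    def retry(self, request, reason):
        retry_times = request.meta.get('retry_times', 0)
        if retry_times >= self._max_retry_times:
            return None  # retries exhausted; let the request fail
        retry_req = request.copy()
        retry_req.meta['retry_times'] = retry_times + 1
        return retry_req
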
async def test_downloader_middleware_manager_handlers():
    data = {}
    crawler = Crawler(extensions=[lambda d=data: FooDownloadermw(d),
                                  DummyDownloadermw,
                                  FooAsyncDownloaderMw],
                      default_extensions=None, data=data)
    downloadermw = ExtensionManager.from_crawler(crawler)
    request_obj = HttpRequest(None)
    response_obj = HttpResponse(None, None)
    error_obj = object()
    await crawler.event_bus.send(events.crawler_start)
    await downloadermw.handle_request(request_obj)
    await downloadermw.handle_response(request_obj, response_obj)
    await downloadermw.handle_error(request_obj, error_obj)
    await crawler.event_bus.send(events.crawler_shutdown)
    assert 'open' in data and 'close' in data
    assert data['handle_request'] is request_obj
    assert data['handle_response'][0] is request_obj and data['handle_response'][1] is response_obj
    assert data['handle_error'][0] is request_obj and data['handle_error'][1] is error_obj
    assert data['async_handle_request'] is request_obj
    assert data['async_handle_response'][0] is request_obj and data['async_handle_response'][1] is response_obj
    assert data['async_handle_error'][0] is request_obj and data['async_handle_error'][1] is error_obj

def start_requests(self):
    yield HttpRequest('http://python.org/', callback=self.parse_response, meta={'key': 'value'})

def test_proxy_str(self):
    proxy = '127.0.0.1:3128'
    mw = ProxyMiddleware.from_crawler(Crawler(proxy=proxy))
    req = HttpRequest("http://example.com")
    mw.handle_request(req)
    assert req.proxy == proxy

async def query_params():
    url = "http://httpbin.org/anything?key=value&none="
    resp = await downloader.fetch(HttpRequest(url))
    assert json.loads(resp.text)['args'] == {'key': 'value', 'none': ''}

async def dict_params():
    resp = await downloader.fetch(
        HttpRequest(make_url("http://httpbin.org/get", params={'key': 'value', 'none': ''})))
    assert json.loads(resp.text)['args'] == {'key': 'value', 'none': ''}

async def list_params():
    resp = await downloader.fetch(
        HttpRequest(make_url("http://httpbin.org/get", params=[('list', '1'), ('list', '2')])))
    assert json.loads(resp.text)['args'] == {'list': ['1', '2']}

def test_replace_http_request():
    req = HttpRequest('http://example.com/', 'POST', body=b'body1')
    new_req = req.replace(url='https://example.com/', body=b'body2')
    assert new_req.url == 'https://example.com/'
    assert new_req.body == b'body2'
    assert new_req.method == 'POST'

def test_handle_start_requests(self):
    mw = DepthMiddleware.from_crawler(Crawler())
    req = HttpRequest("http://python.org/", "GET")
    res = [i for i in mw.handle_start_requests([req])]
    for r in res:
        assert r.meta.get('depth') == 0

def run_any_dupe_filter(f):
    r_get = HttpRequest("http://example.com")
    r_get_port_80 = HttpRequest("http://example.com:80")
    r_get_port_81 = HttpRequest("http://example.com:81")
    r_get_dont_filter = HttpRequest("http://example.com", dont_filter=True)
    r_get_dir = HttpRequest("http://example.com/")
    r_get_post = HttpRequest("http://example.com/post")
    r_post = HttpRequest("http://example.com/post", "POST")
    r_post_dir = HttpRequest("http://example.com/post/", "POST")
    r_post_data = HttpRequest("http://example.com/post", "POST", body=b'data')
    r_get_param = HttpRequest(make_url("http://example.com/get", params={'k1': 'v1'}))
    r_get_query = HttpRequest("http://example.com/get?k1=v1")
    r_get_param_2 = HttpRequest(make_url("http://example.com/get", params={'k1': 'v1', 'k2': 'v2'}))
    r_get_query_2 = HttpRequest("http://example.com/get?k2=v2&k1=v1")
    r_get_query_param = HttpRequest(make_url("http://example.com/get?k1=v1", params={'k2': 'v2'}))
    assert f.is_duplicated(r_get) is False
    assert f.is_duplicated(r_get_port_80) is True
    assert f.is_duplicated(r_get_port_81) is False
    assert f.is_duplicated(r_get) is True
    assert f.is_duplicated(r_get_dont_filter) is False
    assert f.is_duplicated(r_get_dir) is False
    assert f.is_duplicated(r_get_post) is False
    assert f.is_duplicated(r_post) is False
    assert f.is_duplicated(r_post_dir) is False
    assert f.is_duplicated(r_post_data) is False
    assert f.is_duplicated(r_get_param) is False
    assert f.is_duplicated(r_get_query) is True
    assert f.is_duplicated(r_get_param_2) is False
    assert f.is_duplicated(r_get_query_2) is True
    assert f.is_duplicated(r_get_query_param) is True

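# A hypothetical fingerprint reproducing the equivalences asserted above:
# default ports are stripped and query parameters sorted, while path,
# trailing slash, method and body all distinguish requests (dont_filter is
# assumed to be checked separately, as in the dupe-filter sketch further
# up). Not the library's actual canonicalization.
import hashlib
from urllib.parse import parse_qsl, urlencode, urlsplit

def sketch_request_fingerprint(request):
    parts = urlsplit(request.url)
    netloc = parts.netloc
    for scheme, port in (('http', ':80'), ('https', ':443')):
        if parts.scheme == scheme and netloc.endswith(port):
            netloc = netloc[:-len(port)]  # drop the scheme's default port
    query = urlencode(sorted(parse_qsl(parts.query)))  # order-insensitive query
    canonical = '{}://{}{}?{}'.format(parts.scheme, netloc, parts.path, query)
    body = request.body or b''
    return hashlib.sha1('{} {}'.format(request.method, canonical).encode() + body).hexdigest()
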
def start_requests(self):
    yield HttpRequest('http://python.org/')