Example #1
0
 def test_random_user_agent(self):
     """Random UA mode injects a Chrome desktop User-Agent that varies
     between consecutive requests."""
     middleware = UserAgentMiddleware.from_crawler(
         Crawler(random_user_agent=True))
     first = HttpRequest('http://example.com', headers={})
     second = HttpRequest('http://example.com', headers={})
     for request in (first, second):
         middleware.handle_request(request)
     assert 'User-Agent' in first.headers
     # Two draws from the random pool should (almost surely) differ.
     assert first.headers.get('User-Agent') != second.headers.get('User-Agent')
     assert 'Chrome' in first.headers.get('User-Agent')
Example #2
0
def test_copy_http_request():
    """copy() duplicates url/params and deep-copies meta, so mutating the
    original's meta afterwards does not leak into the copy."""
    original = HttpRequest('http://example.com/',
                           params={'key': 'value'},
                           meta={'depth': 0})
    clone = original.copy()
    assert clone.url == original.url
    query = get_params_in_url(clone.url)
    assert query.get('key') == ['value']
    assert clone.meta.get('depth') == 0
    # Mutate the original; the clone must keep its own meta.
    original.meta['depth'] = 1
    assert original.meta['depth'] == 1
    assert clone.meta['depth'] == 0
Example #3
0
async def test_allow_redirects():
    """Redirects are followed by default; with allow_redirects=False the
    3xx response surfaces as an HttpError instead."""
    downloader = Downloader()
    redirect_url = make_url('http://httpbin.org/redirect-to',
                            params={'url': 'http://python.org'})

    response = await downloader.fetch(HttpRequest(redirect_url))
    assert response.status // 100 == 2
    assert 'python.org' in response.url

    # Same URL, redirects disabled: the 3xx is raised as an error.
    with pytest.raises(HttpError) as excinfo:
        await downloader.fetch(
            HttpRequest(redirect_url, allow_redirects=False))
    assert excinfo.value.response.status // 100 == 3
Example #4
0
    def test_gen_user_agent(self):
        """':desktop,chrome' yields a desktop Chrome UA; ':mobile,chrome'
        yields Chrome-on-iOS ('CriOS' + 'Mobile')."""
        desktop_mw = UserAgentMiddleware.from_crawler(
            Crawler(user_agent=':desktop,chrome'))
        desktop_req = HttpRequest('http://example.com', headers={})
        desktop_mw.handle_request(desktop_req)
        assert 'Chrome' in desktop_req.headers.get('User-Agent')

        mobile_mw = UserAgentMiddleware.from_crawler(
            Crawler(user_agent=':mobile,chrome'))
        mobile_req = HttpRequest('http://example.com', headers={})
        mobile_mw.handle_request(mobile_req)
        mobile_ua = mobile_req.headers.get('User-Agent')
        assert 'CriOS' in mobile_ua
        assert 'Mobile' in mobile_ua
Example #5
0
async def test_lifo_queue():
    """LifoQueue pops in reverse insertion order and blocks when empty."""
    queue = LifoQueue()
    # Popping an empty queue blocks, hence the timeout.
    with pytest.raises(asyncio.TimeoutError):
        with async_timeout.timeout(0.1):
            await queue.pop()
    items = [HttpRequest('1'), HttpRequest('2'), HttpRequest('3')]
    for item in items:
        await queue.push(item)
    # LIFO: last pushed comes out first.
    for expected in reversed(items):
        assert await queue.pop() == expected
    # Drained: pop blocks again.
    with pytest.raises(asyncio.TimeoutError):
        with async_timeout.timeout(0.1):
            await queue.pop()
Example #6
0
 def test_clear(self):
     """clear() resets the filter so previously seen requests are fresh."""
     dupe_filter = HashDupeFilter()
     request = HttpRequest("http://example.com")
     assert dupe_filter.is_duplicated(request) is False
     assert dupe_filter.is_duplicated(request) is True
     dupe_filter.clear()
     # After clearing, the same request is no longer a duplicate.
     assert dupe_filter.is_duplicated(request) is False
Example #7
0
 async def post_bytes():
     """A bytes POST body (UTF-8 encoded text) round-trips unchanged."""
     payload = 'bytes data: 字节数据'
     response = await downloader.fetch(
         HttpRequest('http://httpbin.org/post', 'POST',
                     body=payload.encode(),
                     headers={'Content-Type': 'text/plain'}))
     assert response.status == 200
     # httpbin echoes the raw body under 'data'.
     assert json.loads(response.text)['data'] == payload
Example #8
0
 async def post_str():
     """A str POST body round-trips unchanged."""
     payload = 'str data: 字符串数据'
     response = await downloader.fetch(
         HttpRequest('http://httpbin.org/post', 'POST',
                     body=payload,
                     headers={'Content-Type': 'text/plain'}))
     assert response.status == 200
     # httpbin echoes the raw body under 'data'.
     assert json.loads(response.text)['data'] == payload
Example #9
0
 def test_handle_request(self):
     """Default headers fill in missing entries but never override a
     header the request already carries."""
     defaults = {"User-Agent": "xpaw", "Connection": "keep-alive"}
     middleware = DefaultHeadersMiddleware.from_crawler(
         Crawler(default_headers=defaults))
     request = HttpRequest("http://example.com",
                           headers={"Connection": "close"})
     middleware.handle_request(request)
     # 'Connection: close' wins over the default 'keep-alive'.
     assert request.headers == {"User-Agent": "xpaw", "Connection": "close"}
Example #10
0
async def test_headers():
    """Custom request headers are transmitted to the server verbatim."""
    downloader = Downloader()
    response = await downloader.fetch(
        HttpRequest("http://httpbin.org/get",
                    headers={'User-Agent': 'xpaw'}))
    assert response.status == 200
    # httpbin echoes received headers under 'headers'.
    echoed = json.loads(response.text)['headers']
    assert echoed.get('User-Agent') == 'xpaw'
Example #11
0
 def test_handle_error(self):
     """Only network-level ClientError produces a retry request; other
     error types are ignored (None)."""
     middleware = RetryMiddleware.from_crawler(Crawler())
     request = HttpRequest("http://example.com")
     assert middleware.handle_error(request, ValueError()) is None
     for network_error in [ClientError()]:
         retried = middleware.handle_error(request, network_error)
         assert isinstance(retried, HttpRequest)
         assert str(retried.url) == str(request.url)
Example #12
0
 def test_random_user_agent2(self):
     """With ':mobile' + random mode, every generated UA is mobile
     Chrome-on-iOS, across many draws."""
     middleware = UserAgentMiddleware.from_crawler(
         Crawler(user_agent=':mobile', random_user_agent=True))
     for _ in range(30):
         request = HttpRequest('http://example.com', headers={})
         middleware.handle_request(request)
         assert 'User-Agent' in request.headers
         agent = request.headers.get('User-Agent')
         assert 'CriOS' in agent and 'Mobile' in agent
Example #13
0
def test_http_request_to_dict():
    """to_dict() captures url/method/body/headers and from_dict() rebuilds
    an equivalent request, multi-valued headers included."""
    headers = HttpHeaders()
    headers.add('Set-Cookie', 'a=b')
    headers.add('Set-Cookie', 'c=d')
    request = HttpRequest('http://example.com/',
                          'POST',
                          body=b'body',
                          headers=headers)
    state = request.to_dict()
    assert state['url'] == 'http://example.com/'
    assert state['method'] == 'POST'
    assert state['body'] == b'body'
    assert state['headers'] == headers

    # Round-trip: every serialized attribute is restored.
    restored = HttpRequest.from_dict(state)
    for attr in ('url', 'method', 'body', 'headers'):
        assert getattr(request, attr) == getattr(restored, attr)
Example #14
0
 def test_proxy_dict(self):
     """A scheme-keyed proxy dict routes http/https requests to the
     matching proxy address."""
     middleware = ProxyMiddleware.from_crawler(Crawler(
         proxy={'http': '127.0.0.1:3128', 'https': '127.0.0.1:3129'}))
     expectations = [('http://example.com', '127.0.0.1:3128'),
                     ('https://example.com', '127.0.0.1:3129')]
     for url, expected_proxy in expectations:
         request = HttpRequest(url)
         middleware.handle_request(request)
         assert request.proxy == expected_proxy
Example #15
0
 def test_retry(self):
     """retry() returns a clone of the request until max_retry_times is
     exhausted, then returns None."""
     max_retry_times = 2
     middleware = RetryMiddleware.from_crawler(
         Crawler(max_retry_times=max_retry_times,
                 retry_http_status=(500, )))
     request = HttpRequest("http://example.com")
     for _ in range(max_retry_times):
         retried = middleware.retry(request, "")
         assert isinstance(retried, HttpRequest)
         assert str(retried.url) == str(request.url)
         request = retried
     # Retry budget spent: no further retry request is produced.
     assert middleware.retry(request, "") is None
Example #16
0
async def test_priority_queue():
    """Highest priority pops first; equal priorities pop FIFO; an empty
    queue blocks on pop."""
    # 'P_K' names requests with priority P, insertion rank K.
    requests = {name: HttpRequest(name, priority=int(name[0]))
                for name in ('1_1', '1_2', '2_1', '2_2', '3_1', '3_2')}
    queue = PriorityQueue()
    with pytest.raises(asyncio.TimeoutError):
        with async_timeout.timeout(0.1):
            await queue.pop()
    # Push interleaved across priorities.
    for name in ('2_1', '1_1', '3_1', '1_2', '2_2', '3_2'):
        await queue.push(requests[name])
    # Pop order: priority descending, FIFO within a priority.
    for name in ('3_1', '3_2', '2_1', '2_2', '1_1', '1_2'):
        assert await queue.pop() is requests[name]
    # Drained: pop blocks again.
    with pytest.raises(asyncio.TimeoutError):
        with async_timeout.timeout(0.1):
            await queue.pop()
Example #17
0
def test_make_requests():
    """make_requests maps each input to None, an error, or a response,
    preserving positions."""
    requests = [
        None, 'http://unknonw', 'http://python.org/',
        HttpRequest('http://python.org'), 'http://httpbin.org/status/404'
    ]
    results = make_requests(requests, log_level='DEBUG')
    assert len(results) == len(requests)
    assert results[0] is None                    # None passes through
    assert isinstance(results[1], ClientError)   # unresolvable host
    for ok in results[2:4]:                      # URL string and HttpRequest
        assert isinstance(ok, HttpResponse)
        assert ok.status == 200
    assert isinstance(results[4], HttpError)     # 4xx wrapped as HttpError
    assert results[4].response.status == 404
Example #18
0
    def test_handle_spider_output(self):
        """Requests beyond max_depth are filtered out while items pass;
        each forwarded request's depth becomes the response depth + 1."""
        class StubResponse:
            # Minimal response stand-in: only carries meta (with optional depth).
            def __init__(self, depth=None):
                self.meta = {}
                if depth is not None:
                    self.meta['depth'] = depth

        middleware = DepthMiddleware.from_crawler(Crawler(max_depth=1))
        request = HttpRequest("http://python.org/", "GET")
        item = Item()
        # No depth on the response: treated as depth 0, request kept.
        assert list(middleware.handle_spider_output(
            StubResponse(), [request, item])) == [request, item]
        assert request.meta['depth'] == 1
        # Explicit depth 0 behaves the same.
        assert list(middleware.handle_spider_output(
            StubResponse(0), [request, item])) == [request, item]
        assert request.meta['depth'] == 1
        # At max_depth the request is dropped; only the item survives.
        assert list(middleware.handle_spider_output(
            StubResponse(1), [request, item])) == [item]
        assert request.meta['depth'] == 2
Example #19
0
 def start_requests(self):
     """Seed requests covering error callbacks plus one request per
     parse-callback flavour (generator, function, coroutine, list, None)."""
     yield HttpRequest("http://unknown/", errback=self.error_back)
     yield HttpRequest("http://unknown/",
                       dont_filter=True,
                       errback=self.async_error_back)
     yield HttpRequest("http://{}/error".format(self.server_address),
                       errback=self.handle_request_error)
     home = "http://{}/".format(self.server_address)
     yield HttpRequest(home, dont_filter=True)
     # NOTE(review): 'func_prase' matches the spider's method name as-is.
     for callback in (self.generator_parse, self.func_prase,
                      self.async_parse, self.return_list_parse,
                      self.return_none_parse):
         yield HttpRequest(home, dont_filter=True, callback=callback)
Example #20
0
 def test_handle_reponse(self):
     """Responses with a status in retry_http_status are retried up to
     max_retry_times; other statuses and exhausted requests yield None.

     NOTE(review): method name has a typo ('reponse'); kept unchanged so
     test discovery and any references are unaffected.
     """
     middleware = RetryMiddleware.from_crawler(
         Crawler(retry_http_status=(500, ), max_retry_times=3))
     # 502 is not configured as retryable.
     assert middleware.handle_response(
         HttpRequest("http://example.com"),
         HttpResponse("http://example.com", 502)) is None
     # First 500: retried with retry_times bumped to 1.
     fresh = HttpRequest("http://example.com")
     retried = middleware.handle_response(
         fresh, HttpResponse("http://example.com", 500))
     assert retried.meta['retry_times'] == 1
     assert str(retried.url) == str(fresh.url)
     # Two retries already used: one final (third) retry is allowed.
     seasoned = HttpRequest("http://example.com")
     seasoned.meta['retry_times'] = 2
     final = middleware.handle_response(
         seasoned, HttpResponse("http://example.com", 500))
     assert final.meta['retry_times'] == 3
     assert str(final.url) == str(seasoned.url)
     # Budget exhausted: no further retry.
     spent = HttpRequest("http://example.com")
     spent.meta['retry_times'] = 3
     assert middleware.handle_response(
         spent, HttpResponse("http://example.com", 500)) is None
Example #21
0
async def test_downloader_middleware_manager_handlers():
    """Verify ExtensionManager dispatches every downloader hook (request,
    response, error) and both lifecycle events to registered middlewares,
    sync and async alike.

    NOTE(review): the Foo*/Dummy middlewares are presumed to record each
    call into the shared ``data`` dict — confirm against their fixtures.
    """
    data = {}
    # One middleware per flavour: closure-built sync, plain class, async.
    crawler = Crawler(extensions=[lambda d=data: FooDownloadermw(d),
                                  DummyDownloadermw,
                                  FooAsyncDownloaderMw],
                      default_extensions=None,
                      data=data)
    downloadermw = ExtensionManager.from_crawler(crawler)
    request_obj = HttpRequest(None)
    response_obj = HttpResponse(None, None)
    error_obj = object()
    # Drive the full lifecycle: start event, the three hooks, shutdown.
    await crawler.event_bus.send(events.crawler_start)
    await downloadermw.handle_request(request_obj)
    await downloadermw.handle_response(request_obj, response_obj)
    await downloadermw.handle_error(request_obj, error_obj)
    await crawler.event_bus.send(events.crawler_shutdown)
    # Sync middleware saw both lifecycle events and each hook's exact objects.
    assert 'open' in data and 'close' in data
    assert data['handle_request'] is request_obj
    assert data['handle_response'][0] is request_obj and data['handle_response'][1] is response_obj
    assert data['handle_error'][0] is request_obj and data['handle_error'][1] is error_obj
    # Async middleware hooks received the same objects.
    assert data['async_handle_request'] is request_obj
    assert data['async_handle_response'][0] is request_obj and data['async_handle_response'][1] is response_obj
    assert data['async_handle_error'][0] is request_obj and data['async_handle_error'][1] is error_obj
Example #22
0
 def start_requests(self):
     """Emit the single seed request, carrying meta for the callback."""
     seed = HttpRequest('http://python.org/',
                        callback=self.parse_response,
                        meta={'key': 'value'})
     yield seed
Example #23
0
 def test_proxy_str(self):
     """A single proxy string is applied to every outgoing request."""
     address = '127.0.0.1:3128'
     middleware = ProxyMiddleware.from_crawler(Crawler(proxy=address))
     request = HttpRequest("http://example.com")
     middleware.handle_request(request)
     assert request.proxy == address
Example #24
0
 async def query_params():
     """Query parameters embedded in the URL reach the server as-is,
     including an empty value."""
     response = await downloader.fetch(
         HttpRequest("http://httpbin.org/anything?key=value&none="))
     assert json.loads(response.text)['args'] == {'key': 'value', 'none': ''}
Example #25
0
 async def dict_params():
     """Params given as a dict are encoded into the request URL."""
     request = HttpRequest(make_url("http://httpbin.org/get",
                                    params={'key': 'value', 'none': ''}))
     response = await downloader.fetch(request)
     assert json.loads(response.text)['args'] == {'key': 'value', 'none': ''}
Example #26
0
 async def list_params():
     """Repeated keys supplied as tuple pairs arrive as a list server-side."""
     url = make_url("http://httpbin.org/get",
                    params=[('list', '1'), ('list', '2')])
     response = await downloader.fetch(HttpRequest(url))
     assert json.loads(response.text)['args'] == {'list': ['1', '2']}
Example #27
0
def test_replace_http_request():
    """replace() overrides only the given fields and keeps the rest."""
    original = HttpRequest('http://example.com/', 'POST', body=b'body1')
    replaced = original.replace(url='https://example.com/', body=b'body2')
    # url and body overridden; method carried over from the original.
    assert replaced.url == 'https://example.com/'
    assert replaced.body == b'body2'
    assert replaced.method == 'POST'
Example #28
0
 def test_handle_start_requests(self):
     """Start requests are seeded with depth 0 in their meta."""
     middleware = DepthMiddleware.from_crawler(Crawler())
     request = HttpRequest("http://python.org/", "GET")
     for seeded in middleware.handle_start_requests([request]):
         assert seeded.meta.get('depth') == 0
Example #29
0
def run_any_dupe_filter(f):
    """Exercise a dupe filter's request-canonicalization rules.

    Checks, in order (the filter is stateful, so assert order matters):
    default-port equivalence, dont_filter bypass, path/method/body
    sensitivity, and query-parameter ordering/merging equivalence.

    :param f: a fresh dupe filter exposing ``is_duplicated(request) -> bool``.
    """
    r_get = HttpRequest("http://example.com")
    # Port 80 is the http default, so this names the same resource as r_get.
    r_get_port_80 = HttpRequest("http://example.com:80")
    r_get_port_81 = HttpRequest("http://example.com:81")
    r_get_dont_filter = HttpRequest("http://example.com", dont_filter=True)
    r_get_dir = HttpRequest("http://example.com/")
    r_get_post = HttpRequest("http://example.com/post")
    r_post = HttpRequest("http://example.com/post", "POST")
    r_post_dir = HttpRequest("http://example.com/post/", "POST")
    r_post_data = HttpRequest("http://example.com/post", "POST", body=b'data')
    # Equivalent query strings built via params= vs. embedded in the URL.
    r_get_param = HttpRequest(
        make_url("http://example.com/get", params={'k1': 'v1'}))
    r_get_query = HttpRequest("http://example.com/get?k1=v1")
    r_get_param_2 = HttpRequest(
        make_url("http://example.com/get", params={
            'k1': 'v1',
            'k2': 'v2'
        }))
    # Same two params in the opposite order.
    r_get_query_2 = HttpRequest("http://example.com/get?k2=v2&k1=v1")
    # Params merged into an existing query string.
    r_get_query_param = HttpRequest(
        make_url("http://example.com/get?k1=v1", params={'k2': 'v2'}))
    assert f.is_duplicated(r_get) is False
    assert f.is_duplicated(r_get_port_80) is True   # default port ignored
    assert f.is_duplicated(r_get_port_81) is False  # explicit non-default port
    assert f.is_duplicated(r_get) is True           # already seen above
    assert f.is_duplicated(r_get_dont_filter) is False  # bypasses the filter
    assert f.is_duplicated(r_get_dir) is False
    assert f.is_duplicated(r_get_post) is False
    assert f.is_duplicated(r_post) is False         # method distinguishes
    assert f.is_duplicated(r_post_dir) is False
    assert f.is_duplicated(r_post_data) is False    # body distinguishes
    assert f.is_duplicated(r_get_param) is False
    assert f.is_duplicated(r_get_query) is True     # params == query string
    assert f.is_duplicated(r_get_param_2) is False
    assert f.is_duplicated(r_get_query_2) is True   # param order irrelevant
    assert f.is_duplicated(r_get_query_param) is True
Example #30
0
 def start_requests(self):
     """Yield the single seed URL for this spider."""
     seed_url = 'http://python.org/'
     yield HttpRequest(seed_url)