def test_request_ua():
    headers = {"User-Agent": "Python3.5"}
    request = Request('http://www.httpbin.org/get',
                      method='GET',
                      res_type='json',
                      headers=headers)
    result = asyncio.get_event_loop().run_until_complete(request.fetch())
    assert result.html['headers']['User-Agent'] == "Python3.5"
def test_method_error_request():
    try:
        request = Request("https://httpbin.org/", method="PUT")
        response = asyncio.get_event_loop().run_until_complete(request.fetch())
        # `await` is a syntax error inside a plain function, so drive the
        # text() coroutine through the event loop instead
        assert asyncio.get_event_loop().run_until_complete(response.text()) == ""
    except Exception as e:
        assert isinstance(e, InvalidRequestMethod)
async def _localImage_or_webImage_parse(self, request: Request, spider_ins):
    '''
    Process image data (local image or web image) during the middleware
    request stage.
    :param request: Request
    '''
    _raw_url = request.url
    if _raw_url.startswith('https'):
        logger.error('Baidu-ocr does not support remote https image links, '
                     'check your start_urls')
        request.retry_times = 0
        raise ImageTypeError
    elif _raw_url.startswith('http'):
        self._service_payload.pop('image', None)
        self._service_payload.update(url=_raw_url)
    else:
        # The last three characters cover 'jpg', 'png', 'bmp' and the
        # 'peg' tail of 'jpeg'
        if _raw_url[-3:] not in ['jpg', 'png', 'bmp', 'peg']:
            logger.error('Baidu does not support this type of picture, '
                         'it must be `jpg`, `png`, `bmp` or `jpeg`')
            request.retry_times = 0
            request._ok = False
            raise ImageTypeError
        else:
            image, _path = await self.get_ocr_image(
                _raw_url,
                request,
                spider_ins,
                hook=self.get_ocr_image_hook,
                region=spider_ins.ocr_options['region'])
            self._service_payload.pop('url', None)
            self._service_payload.update(image=image)
def test_method_error_request():
    try:
        request = Request('https://httpbin.org/', method='PUT')
        response = asyncio.get_event_loop().run_until_complete(request.fetch())
        assert response.html == ''
    except Exception as e:
        assert isinstance(e, InvalidRequestMethod)
def test_request_params():
    params = {"name": "ruia"}
    request = Request('http://www.httpbin.org/get',
                      method='GET',
                      res_type='json',
                      params=params)
    result = asyncio.get_event_loop().run_until_complete(request.fetch())
    assert result.html['args']['name'] == "ruia"
def test_delay_false():
    request_config = {"DELAY": 10}
    request = Request("https://httpbin.org/", request_config=request_config)
    # Start a timer to time the request
    timer = time.time()
    response = asyncio.get_event_loop().run_until_complete(request.fetch(delay=False))
    # Ensure the delay option was ignored (time taken is less than 10s)
    assert time.time() - timer < 10
async def sec_request():
    form_data, must_cookies, history = await request_example()
    headers = {
        'User-Agent': 'Mozilla/5.0',
    }
    request = Request(url='http://portal.neaea.gov.et/Student/StudentDetailsx',
                      method='POST',
                      headers=headers,
                      metadata=form_data,
                      cookies=must_cookies)
    print(request)
    # Await the fetch so callers receive a Response rather than a bare coroutine
    return await request.fetch()
def test_retry_delay():
    # Use an unreachable URL to trigger retries, with a 1s delay between retries
    request_config = {"RETRIES": 2, "RETRY_DELAY": 1}
    request = Request("http://127.0.0.1:5999/", request_config=request_config)
    # Start a timer to time the retries
    timer = time.time()
    _, response = asyncio.get_event_loop().run_until_complete(
        request.fetch_callback(sem=sem))
    # Ensure that for 2 retries the time taken is > 2s (1s between each retry)
    assert time.time() - timer > 2
async def request_example():
    url = 'http://portal.neaea.gov.et/Home/Student'
    params = {
        'name': 'ruia',
    }
    headers = {
        'User-Agent': 'Mozilla/5.0',
    }
    request = Request(url=url, method='GET', params=params, headers=headers)
    must_cookies = {}
    must_cookies_names = ['__RequestVerificationToken']
    response = await request.fetch()
    for cookie_name in must_cookies_names:
        must_cookies[cookie_name] = response.cookies.get(cookie_name)
    history = response.history
    text = await response.text()
    html = Selector(text=text)
    csrf_token = html.xpath("/html/body/div[2]/div/form/input/@value").get()
    form_data = {
        '__RequestVerificationToken': csrf_token,
        'admissionNumber': None  # to be set by the caller
    }
    return form_data, must_cookies, history
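# The two coroutines above (request_example and sec_request) form a two-step
# flow: the GET harvests the CSRF token and verification cookie, and the POST
# replays them. A minimal driver sketch, assuming both coroutines live in this
# module; note that form_data['admissionNumber'] is still None at this point,
# as flagged in request_example:
async def run_flow_example():
    response = await sec_request()
    print(await response.text())

# Uncomment to run the flow:
# asyncio.get_event_loop().run_until_complete(run_flow_example())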
async def async_fetch(
    self,
    url_or_request: Union[Request, str],
    response: Response = None,
):
    """
    Fetch target URL
    :param url_or_request: a ready-made Request instance or a plain URL string
    :param response: an existing Response to reuse; fetched when None
    :return:
    """
    async with aiohttp.ClientSession() as session:
        if isinstance(url_or_request, Request):
            request: Request = url_or_request
            request.request_session = session
        else:
            request: Request = Request(url=url_or_request, request_session=session)
        if response is None:
            response: Response = await request.fetch()
        # Process the response: cache the raw body and its parsed tree,
        # then expose both to the interactive namespace
        response.html = await response.text()
        response.etree = response.html_etree(response.html)
        self.refresh_user_ns(request, response)
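# For quick experiments outside this shell helper, the same fetch round trip
# can be reproduced with ruia's primitives directly. A minimal sketch; it
# assumes the top-level `ruia` package exports Request, as the other snippets
# in this section suggest:
import asyncio
from ruia import Request

async def fetch_demo():
    request = Request(url="https://httpbin.org/get")
    response = await request.fetch()
    print(await response.text())  # the raw body the shell stores on response.html

asyncio.get_event_loop().run_until_complete(fetch_demo())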
async def parse(self, response):
    for index, url in enumerate(self.start_urls):
        yield Request(url,
                      method='POST',
                      data=self.body,
                      callback=self.parse_item,
                      metadata={'index': index})
async def parse(self, response):
    yield Request(
        url=response.url,
        callback=self.parse_item,
        headers=self.headers,
        request_config=self.request_config,
        **self.kwargs
    )
async def make_post_request(sem, callback):
    headers = {'Content-Type': 'application/json'}
    request = Request('https://httpbin.org/post',
                      method='POST',
                      headers=headers,
                      data=params,
                      callback=callback)
    return await request.fetch_callback(sem)
async def parse(self, res):
    pages = ['http://www.httpbin.org/get', 'http://www.httpbin.org/get']
    for index, page in enumerate(pages):
        yield Request(
            page,
            callback=self.parse_item,
            metadata={'index': index}
        )
async def parse(self, response):
    self.mongo_db = MotorBase().get_db('ruia_test')
    urls = ['https://news.ycombinator.com/news?p=1',
            'https://news.ycombinator.com/news?p=2']
    for index, url in enumerate(urls):
        yield Request(
            url,
            callback=self.parse_item,
            metadata={'index': index}
        )
async def parse(self, res):
    items = await alist(ArchivesItem.get_items(html=res.html))
    self.mongo_db = MotorBase(loop=self.loop).get_db()
    for item in items:
        # Sleep for a random interval between requests
        self.request_config['DELAY'] = random.randint(5, 10)
        yield Request(item.href,
                      callback=self.parse_item,
                      request_config=self.request_config)
async def request_example():
    url = "https://httpbin.org/get"
    params = {"name": "ruia"}
    headers = {"User-Agent": "Python3.6"}
    request = Request(
        url=url, method="GET", res_type="json", params=params, headers=headers
    )
    response = await request.fetch()
    assert response.html["args"]["name"] == "ruia"
    assert response.html["headers"]["User-Agent"] == "Python3.6"
async def parse(self, response):
    self.mongo_db = MotorBase().get_db('hacknews')
    urls = ['https://baijiahao.baidu.com/s?id=1553475025395018',
            'https://baijiahao.baidu.com/s?id=1570895803249513']
    for index, url in enumerate(urls):
        yield Request(
            url,
            callback=self.parse_item,
            metadata={'index': index}
        )
async def make_post_request(sem, callback):
    headers = {"Content-Type": "application/json"}
    request = Request(
        "https://httpbin.org/post",
        method="POST",
        headers=headers,
        data=params,
        callback=callback,
    )
    return await request.fetch_callback(sem)
async def parse(self, response):
    self.mongo_db = MotorBase().get_db("ruia_test")
    urls = [
        "https://news.ycombinator.com/news?p=1",
        "https://news.ycombinator.com/news?p=2",
    ]
    for index, url in enumerate(urls):
        yield Request(url, callback=self.parse_item, metadata={"index": index})
async def parse(self, res):
    try:
        self.mongo_db = MotorBase(loop=self.loop).get_db()
    except Exception as e:
        self.logger.exception(e)
    async for item in ArchivesItem.get_items(html=await res.text()):
        yield Request(
            item.href,
            callback=self.parse_item,
            request_config=self.request_config,
        )
async def parse(self, res):
    etree = res.html_etree
    pages = ['?start=0&filter='] + [
        i.get('href') for i in etree.cssselect('.paginator>a')
    ]
    for index, page in enumerate(pages):
        url = self.start_urls[0] + page
        yield Request(url,
                      callback=self.parse_item,
                      metadata={'index': index},
                      request_config=self.request_config,
                      **self.kwargs)
async def timeout_request(sem):
    request_config = {"RETRIES": 1, "DELAY": 1, "TIMEOUT": 0.1}
    request = Request(
        "https://httpbin.org/get",
        method="GET",
        metadata={"hello": "ruia"},
        encoding="utf-8",
        request_config=request_config,
        params=params,
        callback=hi,
    )
    return await request.fetch_callback(sem)
def test_request_config():
    assert str(Request("https://httpbin.org/")) == "<GET https://httpbin.org/>"
    _, response = asyncio.get_event_loop().run_until_complete(
        make_get_request(sem=sem, callback=hello))
    # assert response.callback_result == "hello ruia"
    assert response.metadata == {"hello": "ruia"}
    json_result = asyncio.get_event_loop().run_until_complete(response.json())
    assert json_result["args"]["name"] == "ruia"
    _, response = asyncio.get_event_loop().run_until_complete(
        make_post_request(sem=sem, callback=None))
    json_result = asyncio.get_event_loop().run_until_complete(response.json())
    assert json_result["data"] == "name=ruia"
def test_request_config():
    assert str(Request('https://httpbin.org/')) == '<GET https://httpbin.org/>'
    _, response = asyncio.get_event_loop().run_until_complete(
        make_get_request(sem=sem, callback=hello))
    assert response.callback_result == 'hello ruia'
    assert response.metadata == {'hello': 'ruia'}
    json_result = asyncio.get_event_loop().run_until_complete(response.json())
    assert json_result['args']['name'] == "ruia"
    _, response = asyncio.get_event_loop().run_until_complete(
        make_post_request(sem=sem, callback=None))
    json_result = asyncio.get_event_loop().run_until_complete(response.json())
    assert json_result['data'] == "name=ruia"
async def parse(self, res):
    etree = res.html_etree
    urls = [
        i.get('href') for i in etree.cssselect('.content_list .dd_bt a')
    ]
    for index, url in enumerate(urls):
        url = 'http:' + url
        yield Request(
            url,
            callback=self.parse_item,
            metadata={'index': index},
            request_config=self.request_config,
        )
async def parse_item(self, res):
    async for item in ArticleListItem.get_items(html=await res.text()):
        # Skip links that have already been crawled
        is_exist = (await self.mongo_db.source_docs.find_one(
            {"url": item.href}) or {})
        if not is_exist.get("html"):
            yield Request(
                item.href,
                callback=self.save,
                metadata={"title": item.title},
                request_config=self.request_config,
            )
async def parse_item(self, res):
    items = await alist(ArticleListItem.get_items(html=res.html))
    for item in items:
        # Skip links that have already been crawled
        is_exist = await self.mongo_db.source_docs.find_one(
            {'url': item.href})
        if not is_exist:
            # Sleep for a random interval between requests
            self.request_config['DELAY'] = random.randint(5, 10)
            yield Request(item.href,
                          callback=self.save,
                          metadata={'title': item.title},
                          request_config=self.request_config)
async def timeout_request(sem):
    request_config = {
        'RETRIES': 1,
        'DELAY': 1,
        'TIMEOUT': 0.1,
    }
    request = Request('https://httpbin.org/get',
                      method='GET',
                      metadata={'hello': 'ruia'},
                      encoding='utf-8',
                      request_config=request_config,
                      params=params,
                      callback=hi)
    return await request.fetch_callback(sem)
async def parse_item(self, res):
    items = await ArticleListItem.get_items(html=res.html)
    for item in items:
        # Skip links that have already been crawled
        is_exist = await self.mongo_db.source_docs.find_one(
            {'url': item.href}) or {}
        if not is_exist.get('html'):
            yield Request(
                item.href,
                callback=self.save,
                metadata={'title': item.title},
                request_config=self.request_config,
            )