async def _retry(self, error_msg):
    """Re-issue the request while attempts remain, else return a bare Response.

    When ``self.retry_times`` is exhausted, a ``Response`` carrying only the
    url, method and metadata (empty cookies/history, ``headers=None``) is
    returned. Otherwise the method optionally sleeps for ``RETRY_DELAY``
    seconds, logs the attempt together with ``error_msg``, decrements the
    remaining-attempt counter and fetches again with ``delay=False``.

    If ``RETRY_FUNC`` is configured as a coroutine function it is awaited with
    a weak proxy of this request; when it returns a ``Request`` instance, that
    instance is fetched instead of ``self``.
    """
    if self.retry_times <= 0:
        # No attempts left: hand back a Response without body or status.
        return Response(
            url=self.url,
            method=self.method,
            metadata=self.metadata,
            cookies={},
            history=(),
            headers=None,
        )

    # Sleep to give the server a chance to process/cache the prior request.
    pause = self.request_config.get("RETRY_DELAY", 0)
    if pause > 0:
        await asyncio.sleep(pause)

    # 1-based number of the retry about to be made.
    retry_times = self.request_config.get("RETRIES", 3) - self.retry_times + 1
    self.logger.error(
        f"<Retry url: {self.url}>, Retry times: {retry_times}, Retry message: {error_msg}>"
    )
    self.retry_times -= 1

    hook = self.request_config.get("RETRY_FUNC")
    if hook and iscoroutinefunction(hook):
        # The hook receives a weak proxy of this request.
        replacement = await hook(weakref.proxy(self))
        if isinstance(replacement, Request):
            return await replacement.fetch(delay=False)

    return await self.fetch(delay=False)
async def _retry(self, error_msg):
    """Re-issue the request while attempts remain, else return a bare Response.

    With no attempts left (``self.retry_times`` not positive) a ``Response``
    holding only url, method and metadata (empty cookies/history,
    ``headers=None``) is returned. Otherwise the attempt is logged with
    ``error_msg``, the remaining-attempt counter is decremented and the
    request is fetched again.

    A ``RETRY_FUNC`` coroutine function, when configured, is awaited with a
    weak proxy of this request; if it returns a ``Request`` instance, that
    instance is fetched instead of ``self``.
    """
    if self.retry_times <= 0:
        # Attempts exhausted: return a Response without body or status.
        return Response(
            url=self.url,
            method=self.method,
            metadata=self.metadata,
            cookies={},
            history=(),
            headers=None,
        )

    # 1-based number of the retry about to be made.
    retry_times = self.request_config.get("RETRIES", 3) - self.retry_times + 1
    self.logger.error(
        f"<Retry url: {self.url}>, Retry times: {retry_times}, Retry message: {error_msg}>"
    )
    self.retry_times -= 1

    hook = self.request_config.get("RETRY_FUNC")
    if hook and iscoroutinefunction(hook):
        replacement = await hook(weakref.proxy(self))
        if isinstance(replacement, Request):
            return await replacement.fetch()

    return await self.fetch()
async def fetch(self) -> Response:
    """Perform the prepared request and wrap the outcome in a ``Response``.

    Sleeps for ``DELAY`` seconds first when that setting is positive. The
    request in ``self.current_request_func`` is then awaited under a
    ``TIMEOUT``-second limit (default 10) and the body is read as bytes, JSON
    or text depending on ``self.res_type``.

    Only status codes 200 and 201 are accepted; any other status, and any
    exception raised while requesting or reading, is logged and leaves the
    body as ``None``. While ``self.retry_times`` is positive and no body was
    obtained, the request is retried: a ``RETRY_FUNC`` coroutine function, if
    configured, is awaited with this request and, when it returns a
    ``Request``, that instance is fetched instead of ``self``.

    Returns:
        A ``Response`` built from whatever was collected; after a failed final
        attempt it carries ``html=None`` and the default headers, history,
        cookies and status set at the top of this method (or the status that
        was received before the failure).
    """
    res_headers, res_history = {}, ()
    res_status = 0
    res_data, res_cookies = None, None

    if self.request_config.get('DELAY', 0) > 0:
        await asyncio.sleep(self.request_config['DELAY'])

    try:
        timeout = self.request_config.get('TIMEOUT', 10)
        async with async_timeout.timeout(timeout):
            async with self.current_request_func as resp:
                res_status = resp.status
                # Explicit check instead of `assert`: assertions are removed
                # under `python -O`, which would silently accept bad statuses.
                if res_status not in (200, 201):
                    raise ValueError(f"Unexpected response status: {res_status}")
                if self.res_type == 'bytes':
                    res_data = await resp.read()
                elif self.res_type == 'json':
                    res_data = await resp.json()
                else:
                    res_data = await resp.text()
                res_cookies, res_headers, res_history = resp.cookies, resp.headers, resp.history
    except Exception as e:
        # Deliberately broad: any failure is logged and handled by the retry
        # logic below rather than propagated to the caller.
        self.logger.error(f"<Error: {self.url} {res_status} {str(e)}>")
        self.logger.exception(e)

    if self.retry_times > 0 and res_data is None:
        # 1-based number of the retry about to be made.
        retry_times = self.request_config.get('RETRIES', 3) - self.retry_times + 1
        self.logger.info(
            f'<Retry url: {self.url}>, Retry times: {retry_times}')
        self.retry_times -= 1
        retry_func = self.request_config.get('RETRY_FUNC')
        if retry_func and iscoroutinefunction(retry_func):
            request_ins = await retry_func(self)
            if isinstance(request_ins, Request):
                return await request_ins.fetch()
        return await self.fetch()

    await self.close()

    response = Response(url=self.url,
                        html=res_data,
                        metadata=self.metadata,
                        res_type=self.res_type,
                        cookies=res_cookies,
                        headers=res_headers,
                        history=res_history,
                        status=res_status)
    return response
async def fetch(self) -> Response:
    """Send the request and return a ``Response``, retrying on failure.

    Sleeps for ``DELAY`` seconds first when that setting is positive. The
    request is made through ``self._make_request()`` under a ``TIMEOUT``-second
    limit (default 10). The body is decoded as text with ``self.encoding``,
    falling back to the raw bytes on ``UnicodeDecodeError``.

    A ``VALID`` coroutine function, when configured, is awaited with the
    response and its result replaces it. A response whose ``ok`` is falsy, a
    timeout, or any other exception is delegated to ``self._retry`` with a
    matching ``error_msg``. ``self._close_request()`` is always awaited before
    returning.
    """
    pause = self.request_config.get("DELAY", 0)
    if pause > 0:
        await asyncio.sleep(pause)

    timeout = self.request_config.get("TIMEOUT", 10)
    try:
        async with async_timeout.timeout(timeout):
            resp = await self._make_request()

        try:
            body = await resp.text(encoding=self.encoding)
        except UnicodeDecodeError:
            # Body is not decodable with the chosen encoding: keep raw bytes.
            body = await resp.read()

        response = Response(
            url=self.url,
            method=self.method,
            encoding=resp.get_encoding(),
            html=body,
            metadata=self.metadata,
            cookies=resp.cookies,
            headers=resp.headers,
            history=resp.history,
            status=resp.status,
            aws_json=resp.json,
            aws_text=resp.text,
            aws_read=resp.read,
        )

        # Retry middleware: an optional validator may replace the response.
        validator = self.request_config.get("VALID")
        if validator and iscoroutinefunction(validator):
            response = await validator(response)

        if not response.ok:
            return await self._retry(
                error_msg=f"Request url failed with status {response.status}!"
            )
        return response
    except asyncio.TimeoutError:
        return await self._retry(error_msg="timeout")
    except Exception as e:
        return await self._retry(error_msg=e)
    finally:
        # Close client session
        await self._close_request()
async def _retry(self):
    """Re-issue the request while attempts remain, else return a bare Response.

    With no attempts left (``self.retry_times`` not positive) a ``Response``
    holding only url, method and metadata (empty cookies and history) is
    returned. Otherwise the attempt is logged, the remaining-attempt counter
    is decremented and the request is fetched again.

    A ``RETRY_FUNC`` coroutine function, when configured, is awaited with this
    request; if it returns a ``Request`` instance, that instance is fetched
    instead of ``self``.
    """
    if self.retry_times <= 0:
        # Attempts exhausted: return a Response without body or status.
        return Response(
            url=self.url,
            method=self.method,
            metadata=self.metadata,
            cookies={},
            history=(),
        )

    # 1-based number of the retry about to be made.
    retry_times = self.request_config.get('RETRIES', 3) - self.retry_times + 1
    self.logger.info(f'<Retry url: {self.url}>, Retry times: {retry_times}')
    self.retry_times -= 1

    hook = self.request_config.get('RETRY_FUNC')
    if hook and iscoroutinefunction(hook):
        replacement = await hook(self)
        if isinstance(replacement, Request):
            return await replacement.fetch()

    return await self.fetch()
async def fetch(self) -> Response:
    """Send the request and return a ``Response``, retrying on failure.

    Sleeps for ``DELAY`` seconds first when that setting is positive. The
    request is made through ``self._make_request()`` under a ``TIMEOUT``-second
    limit (default 10) and the body is decoded as text with ``self.encoding``.

    A ``VALID`` coroutine function, when configured, is awaited with the
    response and its result replaces it. A response whose ``ok`` is falsy, or
    a timeout, is delegated to ``self._retry()``; other exceptions propagate.
    ``self._close_request_session()`` is always awaited before leaving.
    """
    pause = self.request_config.get('DELAY', 0)
    if pause > 0:
        await asyncio.sleep(pause)

    timeout = self.request_config.get('TIMEOUT', 10)
    try:
        async with async_timeout.timeout(timeout):
            resp = await self._make_request()

        body = await resp.text(encoding=self.encoding)
        response = Response(
            url=self.url,
            method=self.method,
            encoding=resp.get_encoding(),
            html=body,
            metadata=self.metadata,
            cookies=resp.cookies,
            headers=resp.headers,
            history=resp.history,
            status=resp.status,
            aws_json=resp.json,
            aws_text=resp.text,
            aws_read=resp.read,
        )

        # Retry middleware: an optional validator may replace the response.
        validator = self.request_config.get('VALID')
        if validator and iscoroutinefunction(validator):
            response = await validator(response)

        if not response.ok:
            return await self._retry()
        return response
    except asyncio.TimeoutError:
        # Retry for timeout
        return await self._retry()
    finally:
        # Close client session
        await self._close_request_session()
async def fetch(self) -> Response:
    """Fetch ``self.url`` via pyppeteer or aiohttp and wrap it in a ``Response``.

    Sleeps for ``DELAY`` seconds first when that setting is positive. When
    ``self.load_js`` is true the page is loaded in a pyppeteer browser, which
    is launched on first use and stored on ``self.browser``. Otherwise
    ``self.current_request_func`` is awaited under a ``TIMEOUT``-second limit
    (default 10), only status codes 200 and 201 are accepted, and the body is
    read as bytes, JSON or text depending on ``self.res_type``.

    Any exception is logged and the collected values are reset so the attempt
    counts as having produced no data. While ``self.retry_times`` is positive
    and no data was obtained, the request is retried: a ``RETRY_FUNC``
    coroutine function, if configured, is awaited with this request and, when
    it returns a ``Request``, that instance is fetched instead of ``self``.

    Returns:
        A ``Response`` built from the collected values; after a failed final
        attempt it carries ``html=None``, empty headers/history, ``None``
        cookies and status 0.
    """
    if self.request_config.get('DELAY', 0) > 0:
        await asyncio.sleep(self.request_config['DELAY'])

    # Bound before the try block so the error log below can report the status
    # that was actually received rather than an already-reset value.
    res_status = 0
    try:
        timeout = self.request_config.get('TIMEOUT', 10)

        if self.load_js:
            # Here the request is sent by pyppeteer instead of aiohttp.
            if not hasattr(self, "browser"):
                self.pyppeteer_args.extend(['--no-sandbox'])
                self.browser = await pyppeteer.launch(
                    headless=True,
                    args=self.pyppeteer_args,
                    options=self.pyppeteer_launch_options
                )
            page = await self.browser.newPage()  # open a new page
            # Page timeout setting (TIMEOUT scaled by 1000).
            self.pyppeteer_page_options.update({'timeout': int(timeout * 1000)})
            res = await page.goto(self.url, options=self.pyppeteer_page_options)
            data = await page.content()
            res_cookies = await page.cookies()
            res_headers = res.headers
            res_history = None
            res_status = res.status
        else:
            async with async_timeout.timeout(timeout):
                async with self.current_request_func as resp:
                    res_status = resp.status
                    # Explicit check instead of `assert`: assertions are
                    # removed under `python -O`, which would silently accept
                    # bad statuses.
                    if res_status not in (200, 201):
                        raise ValueError(f"Unexpected response status: {res_status}")
                    if self.res_type == 'bytes':
                        data = await resp.read()
                    elif self.res_type == 'json':
                        data = await resp.json()
                    else:
                        data = await resp.text()
                    res_cookies, res_headers, res_history = resp.cookies, resp.headers, resp.history
    except Exception as e:
        # Log first so the message shows the real status, then reset every
        # collected value so the attempt is treated as a failure.
        self.logger.error(f"<Error: {self.url} {res_status} {str(e)}>")
        res_headers = {}
        res_history = ()
        res_status = 0
        data, res_cookies = None, None

    if self.retry_times > 0 and data is None:
        # 1-based number of the retry about to be made.
        retry_times = self.request_config.get('RETRIES', 3) - self.retry_times + 1
        self.logger.info(f'<Retry url: {self.url}>, Retry times: {retry_times}')
        self.retry_times -= 1
        retry_func = self.request_config.get('RETRY_FUNC')
        if retry_func and iscoroutinefunction(retry_func):
            request_ins = await retry_func(self)
            if isinstance(request_ins, Request):
                return await request_ins.fetch()
        return await self.fetch()

    await self.close()

    response = Response(url=self.url,
                        html=data,
                        metadata=self.metadata,
                        res_type=self.res_type,
                        cookies=res_cookies,
                        headers=res_headers,
                        history=res_history,
                        status=res_status)
    return response
async def fetch(self) -> PyppeteerResponse:
    """Fetch ``self.url`` via pyppeteer or aiohttp, retrying on failure.

    Sleeps for ``DELAY`` seconds first when that setting is positive. When
    ``self.load_js`` is true the page is loaded in a pyppeteer browser
    (launched on first use and stored on ``self.browser``) and a
    ``PyppeteerResponse`` carrying the page and browser is built. Otherwise the
    request goes through ``self._make_request()`` under a ``TIMEOUT``-second
    limit (default 10) and a plain ``Response`` is built.

    A response whose ``ok`` is falsy, or an ``asyncio.TimeoutError``, is
    delegated to ``self._retry``; other exceptions propagate. On the way out
    ``self._close_request()`` is always awaited, and the browser is closed
    when ``self.close_pyppeteer_browser`` is set and a browser was launched.
    """
    if self.request_config.get("DELAY", 0) > 0:
        await asyncio.sleep(self.request_config["DELAY"])

    timeout = self.request_config.get("TIMEOUT", 10)
    try:
        if self.load_js:
            if not hasattr(self, "browser"):
                self.pyppeteer_args.extend(["--no-sandbox"])
                self.browser = await pyppeteer.launch(
                    headless=True,
                    args=self.pyppeteer_args,
                    options=self.pyppeteer_launch_options,
                )
            page = await self.browser.newPage()
            # Page timeout setting (TIMEOUT scaled by 1000).
            self.pyppeteer_page_options.update(
                {"timeout": int(timeout * 1000)})
            resp = await page.goto(self.url,
                                   options=self.pyppeteer_page_options)
            await page.setViewport(self.pyppeteer_viewport)
            resp_data = await page.content()
            response = PyppeteerResponse(
                url=self.url,
                method=self.method,
                encoding=self.encoding,
                html=resp_data,
                page=page,
                browser=self.browser,
                metadata=self.metadata,
                cookies=await page.cookies(),
                headers=resp.headers,
                history=(),
                status=resp.status,
                aws_json=resp.json,
                aws_text=resp.text,
                aws_read=resp.buffer,
            )
        else:
            async with async_timeout.timeout(timeout):
                resp = await self._make_request()
            resp_data = await resp.text(encoding=self.encoding)
            response = Response(
                url=self.url,
                method=self.method,
                encoding=resp.get_encoding(),
                html=resp_data,
                metadata=self.metadata,
                cookies=resp.cookies,
                headers=resp.headers,
                history=resp.history,
                status=resp.status,
                aws_json=resp.json,
                aws_text=resp.text,
                aws_read=resp.read,
            )

        if not response.ok:
            return await self._retry(
                error_msg=f"Request url failed with status {response.status}!")
        return response
    except asyncio.TimeoutError:
        # Retry for timeout
        return await self._retry("timeout")
    finally:
        # Close client session
        await self._close_request()
        # The browser attribute only exists once a JS-enabled fetch launched
        # it; without this guard a non-JS fetch would raise AttributeError
        # here and mask the real result or exception.
        if self.close_pyppeteer_browser and hasattr(self, "browser"):
            await self.browser.close()