Esempio n. 1
0
    async def _retry(self, error_msg):
        """Re-issue the request while retries remain, otherwise give up.

        While ``self.retry_times`` is positive this optionally sleeps for
        ``RETRY_DELAY`` seconds, logs the attempt, decrements the counter and
        fetches again with ``delay=False``. If the ``RETRY_FUNC`` config entry
        is a coroutine function and returns a ``Request``, that request is
        fetched instead of ``self``.

        :param error_msg: reason for the retry; only used in the log line.
        :return: the result of the follow-up ``fetch``, or a ``Response``
            with empty cookies/history and no headers once retries run out.
        """
        # No retries left: hand back a placeholder response for this request.
        if self.retry_times <= 0:
            return Response(
                url=self.url,
                method=self.method,
                metadata=self.metadata,
                cookies={},
                history=(),
                headers=None,
            )

        config = self.request_config

        # Sleep to give server a chance to process/cache prior request
        retry_delay = config.get("RETRY_DELAY", 0)
        if retry_delay > 0:
            await asyncio.sleep(retry_delay)

        # 1-based attempt number, derived from the configured total (default 3)
        # and the retries still remaining.
        retry_times = config.get("RETRIES", 3) - self.retry_times + 1
        self.logger.error(
            f"<Retry url: {self.url}>, Retry times: {retry_times}, Retry message: {error_msg}>"
        )
        self.retry_times -= 1

        retry_func = config.get("RETRY_FUNC")
        if retry_func and iscoroutinefunction(retry_func):
            # The hook receives a weak proxy of this request.
            replacement = await retry_func(weakref.proxy(self))
            if isinstance(replacement, Request):
                return await replacement.fetch(delay=False)
        return await self.fetch(delay=False)
Esempio n. 2
0
    async def _retry(self, error_msg):
        """Fetch again if retries remain, else return a placeholder response.

        Each retry logs the attempt number together with ``error_msg`` and
        decrements ``self.retry_times``. When the ``RETRY_FUNC`` config entry
        is a coroutine function, it is awaited with a weak proxy of this
        request; if it returns a ``Request``, that instance is fetched instead
        of ``self``.

        :param error_msg: reason for the retry; only used in the log line.
        :return: the result of the follow-up ``fetch``, or a ``Response``
            with empty cookies/history and no headers once retries run out.
        """
        if self.retry_times <= 0:
            # Retries exhausted: nothing was received, so build the response
            # from the request's own attributes.
            return Response(
                url=self.url,
                method=self.method,
                metadata=self.metadata,
                cookies={},
                history=(),
                headers=None,
            )

        configured_retries = self.request_config.get("RETRIES", 3)
        retry_times = configured_retries - self.retry_times + 1
        self.logger.error(
            f"<Retry url: {self.url}>, Retry times: {retry_times}, Retry message: {error_msg}>"
        )
        self.retry_times -= 1

        # Default to retrying this request; the hook may substitute another.
        target = self
        retry_func = self.request_config.get("RETRY_FUNC")
        if retry_func and iscoroutinefunction(retry_func):
            candidate = await retry_func(weakref.proxy(self))
            if isinstance(candidate, Request):
                target = candidate
        return await target.fetch()
Esempio n. 3
0
    async def fetch(self) -> Response:
        """Send the request and wrap the outcome in a ``Response``.

        Optionally sleeps for the configured ``DELAY``, then awaits
        ``self.current_request_func`` under a ``TIMEOUT`` (default 10) and
        reads the body as bytes, JSON or text according to ``self.res_type``.
        Any exception, including a status other than 200/201, is logged and
        leaves the body as ``None``.

        While ``self.retry_times`` is positive and no body was obtained, the
        request is retried: through the ``Request`` returned by the
        ``RETRY_FUNC`` coroutine if there is one, otherwise through ``self``.

        :return: a ``Response`` holding whatever was collected; after a
            failure ``html`` is ``None``.
        """
        res_headers, res_history = {}, ()
        res_status = 0
        res_data, res_cookies = None, None
        if self.request_config.get('DELAY', 0) > 0:
            await asyncio.sleep(self.request_config['DELAY'])
        try:
            timeout = self.request_config.get('TIMEOUT', 10)

            async with async_timeout.timeout(timeout):
                async with self.current_request_func as resp:
                    res_status = resp.status
                    # Explicit check rather than ``assert``: asserts are
                    # stripped under ``python -O``, which would let error
                    # responses through, and they log an empty message.
                    if res_status not in (200, 201):
                        raise ValueError(
                            f"unexpected status code {res_status}")
                    if self.res_type == 'bytes':
                        res_data = await resp.read()
                    elif self.res_type == 'json':
                        res_data = await resp.json()
                    else:
                        res_data = await resp.text()
                    res_cookies, res_headers, res_history = resp.cookies, resp.headers, resp.history
        except Exception as e:
            # Best-effort by design: failures are logged and handled by the
            # retry logic below instead of propagating to the caller.
            self.logger.error(f"<Error: {self.url} {res_status} {str(e)}>")
            self.logger.exception(e)

        if self.retry_times > 0 and res_data is None:
            # 1-based attempt number, derived from the configured total
            # (default 3) and the retries still remaining.
            retry_times = self.request_config.get('RETRIES',
                                                  3) - self.retry_times + 1
            self.logger.info(
                f'<Retry url: {self.url}>, Retry times: {retry_times}')
            self.retry_times -= 1
            retry_func = self.request_config.get('RETRY_FUNC')
            if retry_func and iscoroutinefunction(retry_func):
                request_ins = await retry_func(self)
                if isinstance(request_ins, Request):
                    return await request_ins.fetch()
            return await self.fetch()

        await self.close()

        response = Response(url=self.url,
                            html=res_data,
                            metadata=self.metadata,
                            res_type=self.res_type,
                            cookies=res_cookies,
                            headers=res_headers,
                            history=res_history,
                            status=res_status)
        return response
Esempio n. 4
0
    async def fetch(self) -> Response:
        """Perform the request and return a ``Response``, retrying on failure.

        Sleeps for the configured ``DELAY`` if it is positive, issues the
        request via ``self._make_request()`` under a ``TIMEOUT`` (default 10)
        and decodes the body as text, falling back to raw bytes on
        ``UnicodeDecodeError``. If the ``VALID`` config entry is a coroutine
        function, the response is passed through it.

        A response that is not ``ok``, a timeout, or any other exception is
        delegated to ``self._retry``. ``self._close_request()`` always runs on
        the way out.

        :return: the (possibly validated) ``Response``, or whatever
            ``self._retry`` returns.
        """
        delay = self.request_config.get("DELAY", 0)
        if delay > 0:
            await asyncio.sleep(delay)

        timeout = self.request_config.get("TIMEOUT", 10)
        try:
            # Only issuing the request is bounded by the timeout; the body is
            # read afterwards.
            async with async_timeout.timeout(timeout):
                resp = await self._make_request()

            try:
                body = await resp.text(encoding=self.encoding)
            except UnicodeDecodeError:
                # Not decodable with the requested encoding: keep raw bytes.
                body = await resp.read()

            response = Response(
                url=self.url,
                method=self.method,
                encoding=resp.get_encoding(),
                html=body,
                metadata=self.metadata,
                cookies=resp.cookies,
                headers=resp.headers,
                history=resp.history,
                status=resp.status,
                aws_json=resp.json,
                aws_text=resp.text,
                aws_read=resp.read,
            )

            # Retry middleware: an optional coroutine may replace the response.
            validator = self.request_config.get("VALID")
            if validator and iscoroutinefunction(validator):
                response = await validator(response)

            if not response.ok:
                return await self._retry(
                    error_msg=
                    f"Request url failed with status {response.status}!")
            return response
        except asyncio.TimeoutError:
            return await self._retry(error_msg="timeout")
        except Exception as exc:
            return await self._retry(error_msg=exc)
        finally:
            # Close client session
            await self._close_request()
Esempio n. 5
0
    async def _retry(self):
        """Fetch again if retries remain, else return a placeholder response.

        Each retry logs the attempt number and decrements
        ``self.retry_times``. When the ``RETRY_FUNC`` config entry is a
        coroutine function, it is awaited with this request; if it returns a
        ``Request``, that instance is fetched instead of ``self``.

        :return: the result of the follow-up ``fetch``, or a ``Response``
            with empty cookies and history once retries run out.
        """
        if self.retry_times <= 0:
            # Retries exhausted: build the response from the request itself.
            return Response(url=self.url,
                            method=self.method,
                            metadata=self.metadata,
                            cookies={},
                            history=())

        configured_retries = self.request_config.get('RETRIES', 3)
        retry_times = configured_retries - self.retry_times + 1
        self.logger.info(
            f'<Retry url: {self.url}>, Retry times: {retry_times}')
        self.retry_times -= 1

        # Default to retrying this request; the hook may substitute another.
        target = self
        retry_func = self.request_config.get('RETRY_FUNC')
        if retry_func and iscoroutinefunction(retry_func):
            candidate = await retry_func(self)
            if isinstance(candidate, Request):
                target = candidate
        return await target.fetch()
Esempio n. 6
0
    async def fetch(self) -> Response:
        """Perform the request and return a ``Response``, retrying on failure.

        Sleeps for the configured ``DELAY`` if it is positive, issues the
        request via ``self._make_request()`` under a ``TIMEOUT`` (default 10)
        and decodes the body as text. If the ``VALID`` config entry is a
        coroutine function, the response is passed through it.

        A response that is not ``ok`` or an ``asyncio.TimeoutError`` is
        delegated to ``self._retry``; other exceptions propagate.
        ``self._close_request_session()`` always runs on the way out.

        :return: the (possibly validated) ``Response``, or whatever
            ``self._retry`` returns.
        """
        delay = self.request_config.get('DELAY', 0)
        if delay > 0:
            await asyncio.sleep(delay)

        timeout = self.request_config.get('TIMEOUT', 10)
        try:
            # Only issuing the request is bounded by the timeout; the body is
            # read afterwards.
            async with async_timeout.timeout(timeout):
                resp = await self._make_request()
            body = await resp.text(encoding=self.encoding)

            response = Response(
                url=self.url,
                method=self.method,
                encoding=resp.get_encoding(),
                html=body,
                metadata=self.metadata,
                cookies=resp.cookies,
                headers=resp.headers,
                history=resp.history,
                status=resp.status,
                aws_json=resp.json,
                aws_text=resp.text,
                aws_read=resp.read,
            )

            # Retry middleware: an optional coroutine may replace the response.
            validator = self.request_config.get('VALID')
            if validator and iscoroutinefunction(validator):
                response = await validator(response)

            if not response.ok:
                return await self._retry()
            return response
        except asyncio.TimeoutError:
            # Retry for timeout
            return await self._retry()
        finally:
            # Close client session
            await self._close_request_session()
Esempio n. 7
0
    async def fetch(self) -> Response:
        """Send the request and wrap the outcome in a ``Response``.

        Optionally sleeps for the configured ``DELAY``. With ``self.load_js``
        set, the page is loaded through a pyppeteer browser (launched on first
        use and stored on ``self.browser``); otherwise
        ``self.current_request_func`` is awaited under a ``TIMEOUT`` (default
        10) and the body is read as bytes, JSON or text according to
        ``self.res_type``.

        Any exception, including a status other than 200/201 on the
        non-browser path, is logged and resets all result fields. While
        ``self.retry_times`` is positive and no body was obtained, the request
        is retried: through the ``Request`` returned by the ``RETRY_FUNC``
        coroutine if there is one, otherwise through ``self``.

        :return: a ``Response`` holding whatever was collected; after a
            failure ``html`` is ``None`` and ``status`` is 0.
        """
        if self.request_config.get('DELAY', 0) > 0:
            await asyncio.sleep(self.request_config['DELAY'])
        try:
            timeout = self.request_config.get('TIMEOUT', 10)

            if self.load_js:  # the request is sent by pyppeteer here, not aiohttp
                if not hasattr(self, "browser"):
                    self.pyppeteer_args.extend(['--no-sandbox'])
                    self.browser = await pyppeteer.launch(
                        headless=True,
                        args=self.pyppeteer_args,
                        options=self.pyppeteer_launch_options
                    )
                page = await self.browser.newPage()  # open a new page
                # Page timeout: TIMEOUT is multiplied by 1000 for pyppeteer.
                self.pyppeteer_page_options.update({'timeout': int(timeout * 1000)})
                res = await page.goto(self.url, options=self.pyppeteer_page_options)
                data = await page.content()
                res_cookies = await page.cookies()
                res_headers = res.headers
                res_history = None
                res_status = res.status
            else:
                async with async_timeout.timeout(timeout):
                    async with self.current_request_func as resp:
                        res_status = resp.status
                        # Explicit check rather than ``assert``: asserts are
                        # stripped under ``python -O``, which would let error
                        # responses through. The message carries the real
                        # status, since ``res_status`` is reset before the
                        # error is logged below.
                        if res_status not in (200, 201):
                            raise ValueError(
                                f"unexpected status code {res_status}")
                        if self.res_type == 'bytes':
                            data = await resp.read()
                        elif self.res_type == 'json':
                            data = await resp.json()
                        else:
                            data = await resp.text()
                        res_cookies, res_headers, res_history = resp.cookies, resp.headers, resp.history
        except Exception as e:
            # Best-effort by design: reset every result field so the code
            # below sees a consistent "nothing received" state.
            res_headers = {}
            res_history = ()
            res_status = 0
            data, res_cookies = None, None
            self.logger.error(f"<Error: {self.url} {res_status} {str(e)}>")

        if self.retry_times > 0 and data is None:
            # 1-based attempt number, derived from the configured total
            # (default 3) and the retries still remaining.
            retry_times = self.request_config.get('RETRIES', 3) - self.retry_times + 1
            self.logger.info(f'<Retry url: {self.url}>, Retry times: {retry_times}')
            self.retry_times -= 1
            retry_func = self.request_config.get('RETRY_FUNC')
            if retry_func and iscoroutinefunction(retry_func):
                request_ins = await retry_func(self)
                if isinstance(request_ins, Request):
                    return await request_ins.fetch()
            return await self.fetch()

        await self.close()

        response = Response(url=self.url,
                            html=data,
                            metadata=self.metadata,
                            res_type=self.res_type,
                            cookies=res_cookies,
                            headers=res_headers,
                            history=res_history,
                            status=res_status)
        return response
Esempio n. 8
0
    async def fetch(self) -> PyppeteerResponse:
        """Fetch the page via pyppeteer or a plain request, retrying on failure.

        Sleeps for the configured ``DELAY`` if it is positive. With
        ``self.load_js`` set, the URL is opened in a pyppeteer browser
        (launched on first use and stored on ``self.browser``) and a
        ``PyppeteerResponse`` carrying the page and browser is built.
        Otherwise the request goes through ``self._make_request()`` under a
        ``TIMEOUT`` (default 10) and a ``Response`` is built.

        A response that is not ``ok`` or an ``asyncio.TimeoutError`` is
        delegated to ``self._retry``; other exceptions propagate. On the way
        out the client session is closed, and the browser too when
        ``self.close_pyppeteer_browser`` is set and a browser exists.

        :return: the built response object, or whatever ``self._retry``
            returns.
        """
        if self.request_config.get("DELAY", 0) > 0:
            await asyncio.sleep(self.request_config["DELAY"])

        timeout = self.request_config.get("TIMEOUT", 10)
        try:
            if self.load_js:
                if not hasattr(self, "browser"):
                    self.pyppeteer_args.extend(["--no-sandbox"])
                    self.browser = await pyppeteer.launch(
                        headless=True,
                        args=self.pyppeteer_args,
                        options=self.pyppeteer_launch_options,
                    )
                page = await self.browser.newPage()
                # TIMEOUT is multiplied by 1000 for the pyppeteer page option.
                self.pyppeteer_page_options.update(
                    {"timeout": int(timeout * 1000)})

                resp = await page.goto(self.url,
                                       options=self.pyppeteer_page_options)
                await page.setViewport(self.pyppeteer_viewport)

                resp_data = await page.content()
                response = PyppeteerResponse(
                    url=self.url,
                    method=self.method,
                    encoding=self.encoding,
                    html=resp_data,
                    page=page,
                    browser=self.browser,
                    metadata=self.metadata,
                    cookies=await page.cookies(),
                    headers=resp.headers,
                    history=(),
                    status=resp.status,
                    aws_json=resp.json,
                    aws_text=resp.text,
                    aws_read=resp.buffer,
                )
            else:
                # Only issuing the request is bounded by the timeout; the
                # body is read afterwards.
                async with async_timeout.timeout(timeout):
                    resp = await self._make_request()
                resp_data = await resp.text(encoding=self.encoding)
                response = Response(
                    url=self.url,
                    method=self.method,
                    encoding=resp.get_encoding(),
                    html=resp_data,
                    metadata=self.metadata,
                    cookies=resp.cookies,
                    headers=resp.headers,
                    history=resp.history,
                    status=resp.status,
                    aws_json=resp.json,
                    aws_text=resp.text,
                    aws_read=resp.read,
                )
            if not response.ok:
                return await self._retry(
                    error_msg=
                    f"Request url failed with status {response.status}!")
            return response
        except asyncio.TimeoutError:
            # Retry for timeout
            return await self._retry("timeout")
        finally:
            # Close client session
            await self._close_request()
            if self.close_pyppeteer_browser:
                # ``self.browser`` only exists once a load_js fetch has
                # launched it. Without this guard a plain request (or a
                # failed launch) would raise AttributeError here and mask
                # the real result or exception.
                browser = getattr(self, "browser", None)
                if browser is not None:
                    await browser.close()