Beispiel #1
0
class APIClient(BaseAPIClient):
    """API client that sends its HTTP traffic through one shared session.

    Class attributes:
        verify: forwarded as the ``verify`` argument of ``Session.send``.
        base_url: string prepended to every request URL. It is ``None``
            here, so it must be set to a string before ``make_request``
            is used.
    """
    verify = True
    base_url = None

    def __init__(self, *args, **kwargs):
        """Create the session, fetch the spec, then initialise the base class.

        ``self.session`` is assigned before ``self.call(SpecEndpoint())``
        runs, and the value returned by that call is handed to the base
        class as the ``spec`` keyword argument.
        """
        self.session = Session()
        loaded_spec = self.call(SpecEndpoint())
        super(APIClient, self).__init__(*args, spec=loaded_spec, **kwargs)

    def make_request(self, endpoint, request):
        """Prefix the request URL with ``base_url`` and send the request.

        ``request.url`` is rewritten in place. ``endpoint`` is accepted but
        not used by this implementation.

        Returns whatever ``Session.send`` returns.
        """
        request.url = self.base_url + request.url
        prepared = self.session.prepare_request(request)
        # Fixed transport options for every call made by this client.
        send_options = dict(
            stream=False,
            timeout=None,
            verify=self.verify,
            cert=None,
            proxies={},
            allow_redirects=True,
        )
        return self.session.send(prepared, **send_options)
Beispiel #2
0
class MailChimp:
    """A class that helps interact with the Mailchimp API (version 3.0)

    Parameters
    ----------

        datacenter: for example us14, us15...

        api_key: your api key on Mailchimp

        **headers: additional headers to add to the request; they are
        merged over the default headers and win on conflicts
    """
    def __init__(self, datacenter, api_key, **headers):
        # Base path to access the API
        self.api = f'https://{datacenter}.api.mailchimp.com/3.0/'
        # Create the session that will be used
        # to send a prepared request
        self.session = Session()
        # Create the headers that will be used
        # in relationship with the request
        base_headers = {
            'Authorization': f'auth {api_key}',
            'Cache-Control': 'no-cache'
        }
        self.api_key = ApiKey(api_key)

        # Caller-supplied headers extend (and may override) the defaults.
        # With no extra headers this is simply a copy of the defaults.
        self.headers = {**base_headers, **headers}

    @staticmethod
    def _with_merge_fields(data):
        """Return a copy of ``data`` that is guaranteed to carry 'merge_fields'

        The API returns an error if 'merge_fields' is not present in the
        request body, so an empty mapping is added when the key is missing.
        ``None`` is treated as an empty body. The caller's dictionary is
        never modified.
        """
        payload = {} if data is None else dict(data)
        payload.setdefault('merge_fields', {})
        return payload

    def prepare_request(self, path, method='GET', data: dict = None):
        """Build a prepared request for the given API path

        Parameters
        ----------

            path: is the path to append to the base API url

            method: HTTP method of the request

            data: JSON body of the request; 'merge_fields' is added
            when it is missing
        """
        request = Request(method=method,
                          url=self.build_url(path),
                          headers=self.headers,
                          json=self._with_merge_fields(data))
        return self.session.prepare_request(request)

    def get(self, path):
        """Base definition that will be used to send all GET requests
        to the Mailchimp API

        Returns the decoded JSON body on a 200 response, and None when
        sending fails or any other status code is received.

        Parameters
        ----------

            path: is the path to append to the base API url
        """
        request = Request('GET', self.build_url(path), auth=self.api_key)
        prepared_request = self.session.prepare_request(request)

        try:
            response = self.session.send(prepared_request)
        except Exception:
            # Best effort: a failed send is reported as "no result".
            # KeyboardInterrupt / SystemExit are deliberately not caught.
            return None

        if response.status_code == 200:
            return response.json()
        return None

    def post(self, path, data: dict):
        """Base definition that will be used to send all POST requests
        to the Mailchimp API

        Returns the decoded JSON body on a 200 response and None for any
        other status code. Errors raised while sending propagate to the
        caller.

        Parameters
        ----------

            path: is the path to append to the base API url

            data: JSON body of the request; 'merge_fields' is added
            when it is missing (the passed dictionary is left untouched)
        """
        request = Request('POST',
                          self.build_url(path),
                          headers=self.headers,
                          json=self._with_merge_fields(data))
        prepared_request = self.session.prepare_request(request)

        response = self.session.send(prepared_request)
        # The raw response body is echoed to stdout on every POST.
        print(response.text)
        if response.status_code == 200:
            return response.json()
        return None

    def build_url(self, path):
        """Construct the url that will be used to send request to the API
        """
        return urljoin(self.api, path)

    def create_key(self, n=10):
        """Create a secret key: a random hex string built from n bytes
        """
        return secrets.token_hex(n)
class BaseClient(object):
    """Abstract HTTP client that wraps a single ``Session``.

    Direct instantiation raises ``NotImplementedError``; subclasses may
    override ``default_headers`` and ``default_client_timeout``.
    """

    default_headers = {}
    default_client_timeout = 10

    def __init__(self, *args, **kwargs):
        """Create the session and apply the class-level defaults.

        The only keyword read here is ``timeout``; when absent the class
        attribute ``default_client_timeout`` is used.
        """
        # `__class__` is the class this method is defined in, so this only
        # triggers when BaseClient itself (not a subclass) is instantiated.
        if self.__class__ is __class__:
            raise NotImplementedError
        cls = self.__class__
        self._timeout = kwargs.get("timeout", cls.default_client_timeout)
        self._session = Session()
        self._session.headers.update(cls.default_headers)

    @property
    def user_agent(self):
        """The session's ``User-Agent`` header, or ``None`` when unset."""
        return self._session.headers.get('User-Agent')

    def _request(self, method, url,
            params=None, data=None, headers=None, cookies=None, files=None,
            auth=None, timeout=None, allow_redirects=True, proxies=None,
            hooks=None, stream=None, verify=None, cert=None, json=None):
        """Build, prepare and send one request through the session.

        Extended from requests/sessions.py so that the prepared request
        carries a ``_client`` attribute pointing back at this client, and
        so that the client's default timeout applies when none is given.

        Returns the response produced by ``Session.send``.
        """
        session = self._session

        prepared = session.prepare_request(Request(
            method=method.upper(),
            url=url,
            headers=headers,
            files=files,
            data=data or {},
            json=json,
            params=params or {},
            auth=auth,
            cookies=cookies,
            hooks=hooks,
        ))
        # Keep a reference to the client on the prepared request.
        prepared._client = self

        env_settings = session.merge_environment_settings(
            prepared.url, proxies or {}, stream, verify, cert
        )

        # Fall back to the client's default timeout when none was given.
        send_options = dict(
            timeout=timeout or self._timeout,
            allow_redirects=allow_redirects,
        )
        send_options.update(env_settings)

        return session.send(prepared, **send_options)

    def _get(self, url, params=None, **kwargs):
        """Send a GET request; see ``_request`` for the accepted keywords."""
        return self._request('GET', url,  params=params, **kwargs)

    def _post(self, url, data=None, json=None, **kwargs):
        """Send a POST request; see ``_request`` for the accepted keywords."""
        return self._request('POST', url, data=data, json=json, **kwargs)

    def set_user_agent(self, user_agent):
        """Set the session's ``User-Agent`` header."""
        self._session.headers["User-Agent"] = user_agent

    def persist_cookies(self, r):
        """
        From requests/sessions.py, Session.send()

        Session.send() first runs dispatch_hook and only afterwards calls
        extract_cookies_to_jar.

        In this project, a request whose response is considered abnormal
        has its error raised during hook validation, so the processing
        that follows inside send() never runs. The errors are typically
        SystemException / TipsException. What the client treats as an
        error is not a bad request from the server's point of view, and
        the server may still ask for Set-Cookie at the end of that request.
        Because send() aborts when dispatch_hook raises, the later
        extract_cookies_to_jar call is skipped and the cookies are not
        updated. The next request to the server then runs into an expired
        session.

        In that situation the cookies have to be updated manually after
        catching the error, so that the session is kept alive.

        """
        jar = self._session.cookies

        # If the hooks create history then we want those cookies too
        for past in (r.history or ()):
            extract_cookies_to_jar(jar, past.request, past.raw)

        extract_cookies_to_jar(jar, r.request, r.raw)

    def clear_cookies(self):
        """Remove every cookie from the session's cookie jar."""
        self._session.cookies.clear()
Beispiel #4
0
class Spider:
    """HTTP fetcher built around a ``requests`` session.

    Requests pass through the request middlewares, are answered from the
    cache when the cache mode allows it, are retried on failure, and the
    result is passed through the response middlewares.
    """
    # TODO: reload the cache according to the differing `header` of each request
    # TODO: add a `data` field that stores the `post` payload
    # TODO: add a `header` field that stores the headers

    # Always answer from the cache when an entry exists, even if it expired
    FORCE_CACHE = 2
    # Answer from the cache when a non-expired entry exists
    ENABLE_CACHE = 1
    # Do not use the cache
    DISABLE_CACHE = 0

    cache: CacheBase
    session: Session
    headers_generator: T_Headers_Generator
    sleeper: T_Sleeper
    request_middlewares: List[RequestMiddlewareBase]
    response_middlewares: List[ResponseMiddlewareBase]
    proxy_pool: Optional[ProxyPoolBase]
    __encoding: str

    cache_mode: Literal[0, 1, 2]
    timeout: Tuple[int, int]
    alive_time: int
    retry: int

    def __init__(
        self,
        cache: Optional[CacheBase] = None,
        session: Optional[Session] = None,
        headers_generator: Optional[T_Headers_Generator] = None,
        sleeper: Optional[T_Sleeper] = None,
        request_middlewares: Optional[List[RequestMiddlewareBase]] = None,
        response_middlewares: Optional[List[ResponseMiddlewareBase]] = None,
        proxy_pool: Optional[ProxyPoolBase] = None,
        __encoding: Optional[str] = None,
        cache_mode: Optional[Literal[0, 1, 2]] = None,
        timeout: Optional[Tuple[int, int]] = None,
        alive_time: Optional[int] = None,
        retry: Optional[int] = None,
    ):
        """Set up the spider; every omitted argument gets a default.

        Defaults: ``NoCache()``, a fresh ``Session()``, ``NoSleeper()``,
        empty middleware lists, ``get_random_header``, cache disabled,
        timeout ``(5, 5)``, alive time 3 and 3 tries.
        """
        self.cache = cache or NoCache()
        # A supplied session must be stored too; otherwise `self.session`
        # would be missing when `update_headers()` runs below.
        self.session = session or Session()
        self.sleeper = sleeper or NoSleeper()
        self.request_middlewares = request_middlewares or []
        self.response_middlewares = response_middlewares or []
        self.proxy_pool = proxy_pool
        self.__encoding = __encoding
        # todo
        self.headers_generator = headers_generator or get_random_header

        self.cache_mode = cache_mode or Spider.DISABLE_CACHE
        self.timeout = timeout or (5, 5)
        self.alive_time = alive_time or 3
        self.retry = retry or 3

        self.update_headers()

    @classmethod
    def get_cache_spider(cls) -> 'Spider':
        """Return a spider backed by ``SqliteCache`` with caching enabled."""
        spider = cls(SqliteCache())
        spider.cache_mode = Spider.ENABLE_CACHE
        return spider

    def set_sleeper(self, sleeper: T_Sleeper):
        """Install the callable that is invoked after each sent request.

        :raises RuntimeError: when ``sleeper`` is not callable
        """
        if not callable(sleeper):
            raise RuntimeError('参数必须是Callable')
        self.sleeper = sleeper

    def set_random_sleeper(self, a=5, b=10):
        """Install a ``RandomTimeSleeper(a, b)`` as the sleeper."""
        self.set_sleeper(RandomTimeSleeper(a, b))

    def set_sep_sleeper(self, sep_time=10):
        """Install a ``SepTimeSleeper(sep_time)`` as the sleeper."""
        self.set_sleeper(SepTimeSleeper(sep_time))

    @property
    def encoding(self):
        """Encoding forced onto every response; falsy means leave it alone."""
        return self.__encoding

    @encoding.setter
    def encoding(self, encoding):
        self.__encoding = encoding

    def set_cookie(self, cookie: str):
        """Replace the session cookies with those of a cookie string.

        :param cookie: a `cookie` string of ``name=value`` pairs separated
            by ``;`` (the format of a ``Cookie`` request header)
        """
        # `cookiejar_from_dict` maps cookie *names* to values, so the string
        # has to be split into its pairs; storing it whole would create a
        # single cookie literally named "Cookie".
        pairs = {}
        for chunk in cookie.split(';'):
            name, sep, value = chunk.partition('=')
            name = name.strip()
            if sep and name:
                pairs[name] = value.strip()
        self.session.cookies = requests.sessions.cookiejar_from_dict(
            pairs, cookiejar=None, overwrite=True)

    def add_request_middlewares(
        self, request_middlewares: Union[RequestMiddlewareBase,
                                         List[RequestMiddlewareBase]]):
        """Register one request middleware, or a list of them."""
        if isinstance(request_middlewares, list):
            self.request_middlewares.extend(request_middlewares)
        else:
            self.request_middlewares.append(request_middlewares)

    def add_response_middlewares(
        self, response_middlewares: Union[ResponseMiddlewareBase,
                                          List[ResponseMiddlewareBase]]):
        """Register one response middleware, or a list of them."""
        if isinstance(response_middlewares, list):
            self.response_middlewares.extend(response_middlewares)
        else:
            self.response_middlewares.append(response_middlewares)

    @staticmethod
    def _remove_middleware(registry, target):
        """Drop ``target`` from ``registry`` by index/slice or by value."""
        if isinstance(target, (int, slice)):
            del registry[target]
        else:
            registry.remove(target)

    def remove_request_middlewares(self, request_middlewares):
        """Unregister a request middleware.

        :param request_middlewares: an index (or slice) into the registered
            list, or the middleware object itself
        """
        self._remove_middleware(self.request_middlewares, request_middlewares)

    def remove_response_middlewares(self, response_middlewares):
        """Unregister a response middleware.

        :param response_middlewares: an index (or slice) into the registered
            list, or the middleware object itself
        """
        self._remove_middleware(self.response_middlewares,
                                response_middlewares)

    def get_response(self,
                     req: PreparedRequest,
                     timeout=None,
                     allow_redirects=True,
                     cache_mode=None,
                     alive_time=None,
                     retry=None):
        """Low-level API: answer a prepared request from cache or network.

        :param req: the prepared request to send
        :param timeout: timeout passed to ``Session.send``; defaults to
            ``self.timeout``
        :param allow_redirects: passed to ``Session.send``
        :param cache_mode: one of ``DISABLE_CACHE`` / ``ENABLE_CACHE`` /
            ``FORCE_CACHE``; ``None`` means use ``self.cache_mode``
        :param alive_time: lifetime handed to the cache when storing;
            defaults to ``self.alive_time``
        :param retry: total number of tries; defaults to ``self.retry``

        :return: the cached response, or the wrapped ``Response`` of the
            request. A response with a bad status code is returned (not
            raised) once all tries are used up.
        :raises requests.RequestException: when the last try fails at the
            transport level (this includes ``requests.Timeout``)
        """
        # `is None` rather than `or`: DISABLE_CACHE is 0 and must be able
        # to override an instance default that has caching enabled.
        if cache_mode is None:
            cache_mode = self.cache_mode
        alive_time = alive_time or self.alive_time
        timeout = timeout or self.timeout
        retry = retry or self.retry

        cache_signal = req.url

        if cache_mode == Spider.DISABLE_CACHE:
            # Caching is disabled for this url, so drop any stored entry
            self.cache.clear_cache(cache_signal)

        elif cache_mode in (Spider.FORCE_CACHE, Spider.ENABLE_CACHE):
            # With FORCE_CACHE the entry is used whether or not it expired
            force = cache_mode == Spider.FORCE_CACHE
            if self.cache.is_cached(cache_signal, ignore_date=force):
                logger.debug('从缓存: {} <- {}',
                             limit_text(cache_signal, 100), self.cache)
                response = self.cache.from_cache(cache_signal, force=force)
                if self.encoding:
                    response.encoding = self.encoding
                return response

        # Always try at least once; a non-positive count must not be able
        # to skip the request or loop forever.
        attempts = max(1, retry)
        for attempt in range(1, attempts + 1):
            last_attempt = attempt == attempts
            try:
                response = self.session.send(req,
                                             timeout=timeout,
                                             allow_redirects=allow_redirects)
                if self.encoding:
                    response.encoding = self.encoding
                response = Response(response)

                # Pause between requests
                self.sleeper()

                if len(response.response.history) >= 1:
                    history = [each.url for each in response.response.history]
                    history.append(response.url)
                    logger.debug(
                        '页面重定向: {}',
                        '->'.join(['[{}]'.format(i) for i in history]))
                if not response.response.ok:
                    raise HTTPBadCodeError('坏HTTP响应', response)
                if cache_mode == Spider.ENABLE_CACHE:
                    # Store under the same key the lookup above uses, so a
                    # redirected page is still found on the next request.
                    self.cache.cache(cache_signal, response, alive_time)
                return response

            except requests.Timeout:
                if last_attempt:
                    raise
                logger.debug('超时,重试---{}'.format(str(attempt)))
            except requests.RequestException:
                if last_attempt:
                    logger.error('取消重试---{}'.format(str(attempt)))
                    raise
                logger.error('HTTP报错---{}'.format(str(attempt)))
                # TODO: save failed `url`s to a separate `log` file
            except HTTPBadCodeError as e:
                bad_response = e.args[1]
                if last_attempt:
                    logger.info('坏HTTP响应, 取消重试({})'.format(
                        bad_response.status_code))
                    return bad_response
                logger.debug('坏HTTP响应({})---{}', bad_response.status_code,
                             attempt)

    def lunch(self,
              method,
              url,
              *,
              cache_mode=None,
              alive_time=None,
              timeout=None,
              retry=None,
              headers=None,
              data=None,
              params=None,
              cookies=None,
              hooks=None):
        """High-level API: build, send and post-process one request.

        The request is prepared by the session, passed through every
        request middleware, answered by ``get_response`` and the result is
        passed through every response middleware.

        :param method: HTTP method name (upper-cased before use)
        :param url: target url
        :return: whatever the last response middleware returns, or the
            result of ``get_response`` when no middleware is registered
        """
        req = requests.Request(
            method=method.upper(),
            url=url,
            headers=headers,
            files=None,
            data=data or {},
            json=None,
            params=params or {},
            auth=None,
            cookies=cookies,
            hooks=hooks,
        )
        req = self.session.prepare_request(req)
        for request_middleware in self.request_middlewares:
            req = request_middleware(self, req)

        resp = self.get_response(req,
                                 cache_mode=cache_mode,
                                 alive_time=alive_time,
                                 timeout=timeout,
                                 retry=retry)

        for response_middleware in self.response_middlewares:
            resp = response_middleware(self, resp)
        return resp

    def get(self, *args,
            **kwargs) -> Union[Response, requests.Response, object]:
        """Fetch a page with a GET request.

        :param args: positional arguments of ``lunch`` after the method,
            i.e. the `url`
        :param kwargs: keyword arguments of ``lunch`` (``cache_mode``,
            ``alive_time``, ``timeout``, ``retry``, ``headers``, ``data``,
            ``params``, ``cookies``, ``hooks``)

        :return: Union[Response, requests.Response, object]
        """
        resp = self.lunch('get', *args, **kwargs)
        return resp

    def post(self, *args, **kwargs) -> Response:
        """Send a POST request.

        :param args: positional arguments of ``lunch`` after the method,
            i.e. the `url`
        :param kwargs: keyword arguments of ``lunch`` (``cache_mode``,
            ``alive_time``, ``timeout``, ``retry``, ``headers``, ``data``,
            ``params``, ``cookies``, ``hooks``)

        :return: Union[Response, requests.Response, object]
        """
        resp = self.lunch('post', *args, **kwargs)
        return resp

    def update_headers(self):
        """Call `self.headers_generator` and merge the result into the
        session headers"""
        self.session.headers.update(self.headers_generator())

    def close(self):
        """Close the underlying session."""
        self.session.close()
class RequestHandler(object):
    '''
    Is a general request handler that serves all specific implementations by simply changing the resource dir and the additional options.
    Every new implementations should 'register' in HandlerTemplate
    '''
    def __init__(self, connectionDetails):
        '''
        Store the connection details and create the session.

        Raises RequestException when connectionDetails is not a
        ConnectionDetails instance.
        '''
        if not isinstance(connectionDetails, ConnectionDetails):
            raise RequestException(
                "Connection detail is not actually derived from Connectiondetails."
            )
        self.__connectionDetails = connectionDetails
        # When True, prepared requests and responses are pretty-printed.
        self.debug = False
        self.session = Session()

    def handlerId(self):
        '''
        Placeholder that specializations must override; always raises
        RequestException naming the offending class.
        '''
        # __name__ is required here: a class object cannot be concatenated
        # to a str, which would hide the intended error behind a TypeError.
        raise RequestException(
            "handlerId() is not overridden in specialization: " +
            self.__class__.__name__ +
            ", please implement it as a static method.")

    @staticmethod
    def _mergedOrNone(defaults, overrides):
        '''
        Return defaults updated with overrides (overrides may be None),
        or None when the merged result is empty.
        '''
        merged = {}
        merged.update(defaults)
        if overrides is not None:
            merged.update(overrides)
        return merged or None

    @staticmethod
    def _withResponseBody(exc):
        '''
        Append the response body to exc.args when exc carries a response,
        and return exc. Exceptions without a response are left unchanged.
        '''
        response = getattr(exc, "response", None)
        # Compare against None explicitly: the truth value of a response
        # object may depend on its status.
        if response is not None:
            exc.args = exc.args + (response.content, )
        return exc

    @staticmethod
    def _rejectDoubleBody(bodydict, rawbody):
        '''
        Raise RequestException when both body forms are given.
        '''
        if (bodydict is not None) and (rawbody is not None):
            raise RequestException(
                "Not allowed to specify dictionary and raw body at the same time."
            )

    def _requireConnectionDetails(self):
        '''
        Return the connection details, raising RequestException if unset.
        '''
        if self.__connectionDetails is None:
            raise RequestException("Connection detail is None.")
        return self.__connectionDetails

    def _resourceUrl(self, resource):
        '''
        Return the connection's base URL followed by the formatted resource.
        '''
        return (self.__connectionDetails.formURL() +
                self._formatResource(resource))

    def _send(self, request):
        '''
        Prepare and send request, raising for HTTP error statuses.

        A requests.RequestException is re-raised with the response body
        appended to its args when a response is available.
        '''
        try:
            pr = self.session.prepare_request(request)
            if self.debug: pprint(vars(pr), indent=2, depth=100)
            r = self.session.send(pr)
            if self.debug: pprint(vars(r), indent=2, depth=100)
            r.raise_for_status()
        except requests.RequestException as e:
            self._withResponseBody(e)
            raise
        return r

    def _sendWithBody(self, method, resource, headerdict, paramdict,
                      bodydict, rawbody):
        '''
        Shared implementation of the body-carrying verbs (POST, PUT).
        '''
        self._rejectDoubleBody(bodydict, rawbody)
        details = self._requireConnectionDetails()
        h = self._mergedOrNone(details.getHeader(), headerdict)
        b = self._mergedOrNone(details.getBody(), bodydict)
        p = self._mergedOrNone(details.getParam(), paramdict)
        if rawbody is not None:
            # ignoring template-default body parameters!!!
            b = None
        return self._send(
            Request(method,
                    self._resourceUrl(resource),
                    headers=h,
                    params=p,
                    data=rawbody,
                    json=b,
                    auth=details.getAuth()))

    def _sendWithoutBody(self, method, resource, headerdict, paramdict):
        '''
        Shared implementation of the body-less verbs (GET, DELETE).
        '''
        details = self._requireConnectionDetails()
        h = self._mergedOrNone(details.getHeader(), headerdict)
        p = self._mergedOrNone(details.getParam(), paramdict)
        return self._send(
            Request(method,
                    self._resourceUrl(resource),
                    params=p,
                    headers=h,
                    auth=details.getAuth()))

    def _raw(self,
             mode,
             url,
             headerdict=None,
             paramdict=None,
             bodydict=None,
             rawbody=None):
        '''
        Send a request to an explicit url using only the given arguments
        plus the connection's auth; no connection defaults are merged in.
        bodydict is sent as JSON, rawbody as raw data; giving both raises
        RequestException.
        '''
        self._rejectDoubleBody(bodydict, rawbody)
        return self._send(
            Request(mode,
                    url,
                    headers=headerdict,
                    params=paramdict,
                    data=rawbody,
                    json=bodydict,
                    auth=self.__connectionDetails.getAuth()))

    def _post(self,
              resource,
              headerdict=None,
              paramdict=None,
              bodydict=None,
              rawbody=None):
        '''
        POST to resource. The connection's default headers, body and
        params are merged with the given dictionaries (given values win).
        With rawbody the default body is ignored and rawbody is sent as is.
        '''
        return self._sendWithBody("POST", resource, headerdict, paramdict,
                                  bodydict, rawbody)

    def _put(self,
             resource,
             headerdict=None,
             paramdict=None,
             bodydict=None,
             rawbody=None):
        '''
        PUT to resource; arguments are handled exactly as in _post.
        '''
        return self._sendWithBody("PUT", resource, headerdict, paramdict,
                                  bodydict, rawbody)

    def _get(self, resource, headerdict=None, paramdict=None):
        '''
        GET resource with the connection's default headers and params
        merged with the given dictionaries (given values win).
        '''
        # attaching body is highly discouraged, therefore not supported
        return self._sendWithoutBody("GET", resource, headerdict, paramdict)

    def _del(self, resource, headerdict=None, paramdict=None):
        '''
        DELETE resource; arguments are handled exactly as in _get.
        '''
        # attaching body is highly discouraged, therefore not supported
        return self._sendWithoutBody("DELETE", resource, headerdict,
                                     paramdict)

    def getConnectionDetails(self):
        '''
        Return the connection details given at construction.
        '''
        return self.__connectionDetails

    def _formatResource(self, *args):
        '''
        Join the given path parts with "/" into a resource path that starts
        with "/". Backslashes become slashes and runs of two or three
        slashes are collapsed. Returns "" when no part is given.
        '''
        r = "".join("/" + part for part in args)
        return r.replace("\\", "/").replace("///", "/").replace("//", "/")