class APIClient(BaseAPIClient):
    """API client that sends its requests through one shared ``Session``.

    Class attributes:
        verify: forwarded unchanged as the ``verify`` argument of
            ``Session.send``.
        base_url: string prepended to every request URL. It is ``None``
            here; presumably subclasses are expected to override it
            -- TODO confirm.
    """

    verify = True
    base_url = None

    def __init__(self, *args, **kwargs):
        """Create the session, load the spec and initialise the base client.

        The session is created before ``self.call`` runs; presumably
        ``call`` ends up in ``make_request``, which needs it -- TODO confirm.
        All positional and keyword arguments are forwarded to the parent
        initialiser together with the loaded ``spec``.
        """
        self.session = Session()
        loaded_spec = self.call(SpecEndpoint())
        super().__init__(*args, spec=loaded_spec, **kwargs)

    def make_request(self, endpoint, request):
        """Send ``request`` against ``base_url`` and return the response.

        ``request.url`` is rewritten in place to ``base_url + request.url``
        before the request is prepared. ``endpoint`` is accepted but not
        used by this implementation.
        """
        request.url = self.base_url + request.url
        ready = self.session.prepare_request(request)
        # Every transport option is pinned explicitly; note that no
        # timeout is applied.
        send_options = {
            'stream': False,
            'timeout': None,
            'verify': self.verify,
            'cert': None,
            'proxies': {},
            'allow_redirects': True,
        }
        return self.session.send(ready, **send_options)
class MailChimp:
    """A class that helps interact with the Mailchimp API

    Parameters
    ----------

        datacenter: for example us14, us15...
        api_key: your api key on Mailchimp
        **headers: additional headers to add to the request; on a name
            clash they override the default headers
    """

    def __init__(self, datacenter, api_key, **headers):
        # Base path to access the API
        self.api = f'https://{datacenter}.api.mailchimp.com/3.0/'
        # Create the session that will be used
        # to send a prepared request
        self.session = Session()
        self.api_key = ApiKey(api_key)
        # Headers that will be used in relationship with the request.
        # The caller-supplied ones are merged in (they used to be dropped).
        self.headers = self._build_headers(api_key, headers)

    @staticmethod
    def _build_headers(api_key, extra_headers=None):
        """Return the default headers overlaid with ``extra_headers``."""
        base_headers = {
            'Authorization': f'auth {api_key}',
            'Cache-Control': 'no-cache'
        }
        return {**base_headers, **(extra_headers or {})}

    @staticmethod
    def _with_merge_fields(data=None):
        """Return a copy of ``data`` that is guaranteed to hold 'merge_fields'.

        For whatever reason, the API returns an error if 'merge_fields'
        is not present in the request body, so an empty mapping is added
        when it is missing. ``None`` is treated as an empty body and the
        caller's dictionary is never modified.
        """
        payload = dict(data) if data else {}
        payload.setdefault('merge_fields', {})
        return payload

    def prepare_request(self, path, method='GET', data: dict = None):
        """Build a prepared request for ``path`` without sending it

        Parameters
        ----------

            path: is the path to append to the base API url
            method: HTTP verb to use
            data: JSON body; 'merge_fields' is added when missing

        Returns
        -------

            The value returned by the session's ``prepare_request``
        """
        request = Request(
            method=method,
            url=self.build_url(path),
            headers=self.headers,
            json=self._with_merge_fields(data)
        )
        return self.session.prepare_request(request)

    def get(self, path):
        """Base definition that will be used to send all
        GET requests to the Mailchimp API

        Parameters
        ----------

            path: is the path to append to the base API url

        Returns
        -------

            The decoded JSON body when the status code is 200, otherwise
            None. None is also returned when sending the request fails.
        """
        request = Request('GET', self.build_url(path), auth=self.api_key)
        prepared_request = self.session.prepare_request(request)
        try:
            response = self.session.send(prepared_request)
        except Exception:
            # Best effort: a failed send is reported as "no result".
            # Narrowed from a bare ``except`` so that KeyboardInterrupt
            # and SystemExit are no longer swallowed.
            return None
        if response.status_code == 200:
            return response.json()
        return None

    def post(self, path, data: dict):
        """Base definition that will be used to send all
        POST requests to the Mailchimp API

        Parameters
        ----------

            path: is the path to append to the base API url
            data: JSON body; 'merge_fields' is added when missing

        Returns
        -------

            The decoded JSON body when the status code is 200, otherwise
            None. Errors raised while sending propagate to the caller.
        """
        request = Request(
            'POST',
            self.build_url(path),
            headers=self.headers,
            json=self._with_merge_fields(data)
        )
        prepared_request = self.session.prepare_request(request)
        response = self.session.send(prepared_request)
        # The raw body is echoed to stdout (existing behavior)
        print(response.text)
        if response.status_code == 200:
            return response.json()
        return None

    def build_url(self, path):
        """Construct the url that will be used
        to send request to the API

        ``path`` is resolved against the base url with ``urljoin``, so it
        should be relative (no leading slash) to stay below the base path.
        """
        return urljoin(self.api, path)

    def create_key(self, n=10):
        """Create a secret key made of ``n`` random bytes,
        returned as a hexadecimal string
        """
        return secrets.token_hex(n)
class BaseClient(object):
    """Base class for HTTP clients that share a single ``Session``.

    It cannot be instantiated directly: only subclasses may be created.

    Class attributes:
        default_headers: headers copied into the session at construction.
        default_client_timeout: timeout used when neither the constructor
            nor an individual request supplies one.
    """

    default_headers = {}
    default_client_timeout = 10

    def __init__(self, *args, **kwargs):
        """Set up the session; accepts an optional ``timeout`` keyword."""
        cls = type(self)
        if cls is __class__:
            # Direct instantiation of the base class is refused.
            raise NotImplementedError

        self._timeout = kwargs.get("timeout", cls.default_client_timeout)
        self._session = Session()
        self._session.headers.update(cls.default_headers)

    @property
    def user_agent(self):
        """Current ``User-Agent`` session header, or ``None`` if unset."""
        return self._session.headers.get('User-Agent')

    def _request(self, method, url,
                 params=None, data=None, headers=None, cookies=None,
                 files=None, auth=None, timeout=None, allow_redirects=True,
                 proxies=None, hooks=None, stream=None, verify=None,
                 cert=None, json=None):
        """Build, prepare and send one request through the shared session.

        Extended from requests/sessions.py so that the prepared request
        carries a ``_client`` attribute pointing back at this client.
        Falsy ``timeout`` values fall back to the client-wide timeout.
        """
        request = Request(
            method=method.upper(),
            url=url,
            headers=headers,
            files=files,
            data=data or {},
            json=json,
            params=params or {},
            auth=auth,
            cookies=cookies,
            hooks=hooks,
        )
        prepared = self._session.prepare_request(request)
        prepared._client = self  # hold the reference to client

        environment = self._session.merge_environment_settings(
            prepared.url, proxies or {}, stream, verify, cert
        )

        # Explicit options first, then whatever the environment merge
        # produced on top of them.
        send_kwargs = {
            'timeout': timeout or self._timeout,  # set default timeout
            'allow_redirects': allow_redirects,
        }
        send_kwargs.update(environment)
        return self._session.send(prepared, **send_kwargs)

    def _get(self, url, params=None, **kwargs):
        """Send a GET request; see ``_request`` for the keyword options."""
        return self._request('GET', url, params=params, **kwargs)

    def _post(self, url, data=None, json=None, **kwargs):
        """Send a POST request; see ``_request`` for the keyword options."""
        return self._request('POST', url, data=data, json=json, **kwargs)

    def set_user_agent(self, user_agent):
        """Replace the session-wide ``User-Agent`` header."""
        self._session.headers["User-Agent"] = user_agent

    def persist_cookies(self, r):
        """Copy the cookies of response ``r`` (and its history) into the session.

        From requests/sessions.py, Session.send()

        Session.send() first runs dispatch_hook and only afterwards
        extract_cookies_to_jar.

        In this project, for requests whose response is abnormal, the hooks
        raise an error during validation, so the rest of send() is skipped.

        The errors met are usually SystemException / TipsException. What the
        client regards as an error is not a bad request from the server's
        point of view, so the server may still ask for Set-Cookie at the end
        of that request.

        Because send() aborts inside dispatch_hook, extract_cookies_to_jar is
        never called and the cookies are not updated; the next request to the
        server then runs into an expired session.

        In that situation the cookies have to be updated manually after the
        error is caught, to make sure the session is kept alive.
        """
        jar = self._session.cookies
        # If the hooks create history then we want those cookies too
        for earlier in r.history or ():
            extract_cookies_to_jar(jar, earlier.request, earlier.raw)
        extract_cookies_to_jar(jar, r.request, r.raw)

    def clear_cookies(self):
        """Remove every cookie stored in the session."""
        self._session.cookies.clear()
class Spider:
    """HTTP fetcher on top of a ``requests`` session.

    Adds an optional response cache, a pause between requests (the
    "sleeper"), request/response middlewares and retries to plain
    session usage.
    """
    # TODO: reload the cache according to the `header` of each request
    # TODO: add a `data` field that stores the `post` payload
    # TODO: add a `header` field that stores the headers

    # Always load from the cache when an entry exists, even an expired one
    FORCE_CACHE = 2
    # Allow the cache to be used
    ENABLE_CACHE = 1
    # Do not use the cache
    DISABLE_CACHE = 0

    cache: CacheBase
    session: Session
    headers_generator: T_Headers_Generator
    sleeper: T_Sleeper
    request_middlewares: List[RequestMiddlewareBase]
    response_middlewares: List[ResponseMiddlewareBase]
    proxy_pool: Optional[ProxyPoolBase]
    __encoding: str
    cache_mode: Literal[0, 1, 2]
    timeout: Tuple[int, int]
    alive_time: int
    retry: int

    def __init__(
        self,
        cache: Optional[CacheBase] = None,
        session: Optional[Session] = None,
        headers_generator: Optional[T_Headers_Generator] = None,
        sleeper: Optional[T_Sleeper] = None,
        request_middlewares: Optional[List[RequestMiddlewareBase]] = None,
        response_middlewares: Optional[List[ResponseMiddlewareBase]] = None,
        proxy_pool: Optional[ProxyPoolBase] = None,
        __encoding: Optional[str] = None,
        cache_mode: Optional[Literal[0, 1, 2]] = None,
        timeout: Optional[Tuple[int, int]] = None,
        alive_time: Optional[int] = None,
        retry: Optional[int] = None,
    ):
        """Create a spider; every argument is optional.

        Missing collaborators are replaced by defaults: ``NoCache()``,
        a new ``Session()``, ``NoSleeper()``, empty middleware lists and
        ``get_random_header``. Falsy numeric options fall back to
        ``DISABLE_CACHE``, a ``(5, 5)`` timeout, an alive time of 3 and
        3 attempts.

        NOTE: the ``__encoding`` parameter sits inside a class body and
        is therefore subject to private-name mangling; callers passing
        it by keyword have to spell it ``_Spider__encoding``.
        """
        self.cache = cache or NoCache()
        # A caller-supplied session used to be ignored, leaving
        # ``self.session`` unset and breaking ``update_headers`` below.
        self.session = session if session is not None else Session()
        self.sleeper = sleeper or NoSleeper()
        self.request_middlewares = request_middlewares or []
        self.response_middlewares = response_middlewares or []
        self.proxy_pool = proxy_pool
        self.__encoding = __encoding
        # TODO (carried over from the original, no further detail given)
        self.headers_generator = headers_generator or get_random_header
        self.cache_mode = cache_mode or Spider.DISABLE_CACHE
        self.timeout = timeout or (5, 5)
        self.alive_time = alive_time or 3
        self.retry = retry or 3

        self.update_headers()

    @classmethod
    def get_cache_spider(cls) -> 'Spider':
        """Return a spider backed by ``SqliteCache`` with caching enabled."""
        spider = cls(SqliteCache())
        spider.cache_mode = Spider.ENABLE_CACHE
        return spider

    def set_sleeper(self, sleeper: T_Sleeper):
        """Install ``sleeper``, the callable run after each request.

        :raises RuntimeError: when ``sleeper`` is not callable
        """
        if not callable(sleeper):
            raise RuntimeError('参数必须是Callable')
        self.sleeper = sleeper

    def set_random_sleeper(self, a=5, b=10):
        """Install a ``RandomTimeSleeper(a, b)`` as the sleeper."""
        self.set_sleeper(RandomTimeSleeper(a, b))

    def set_sep_sleeper(self, sep_time=10):
        """Install a ``SepTimeSleeper(sep_time)`` as the sleeper."""
        self.set_sleeper(SepTimeSleeper(sep_time))

    @property
    def encoding(self):
        """Encoding forced onto every response, or a falsy value for none."""
        return self.__encoding

    @encoding.setter
    def encoding(self, encoding):
        self.__encoding = encoding

    @staticmethod
    def _parse_cookie_string(cookie: str) -> dict:
        """Split a ``name=value; name2=value2`` string into a dict.

        Chunks without a name or without ``=`` are skipped; on duplicate
        names the last value wins.
        """
        pairs = {}
        for chunk in cookie.split(';'):
            name, separator, value = chunk.strip().partition('=')
            if name and separator:
                pairs[name.strip()] = value.strip()
        return pairs

    def set_cookie(self, cookie: str):
        """Replace the session cookies with the ones in a cookie string

        :param cookie: a `cookie` string such as ``"a=1; b=2"``
        """
        # The whole string used to be stored as the value of one cookie
        # literally named "Cookie"; it is now parsed into its pairs.
        self.session.cookies = requests.sessions.cookiejar_from_dict(
            self._parse_cookie_string(cookie), cookiejar=None, overwrite=True)

    def add_request_middlewares(
        self, request_middlewares: Union[RequestMiddlewareBase,
                                         List[RequestMiddlewareBase]]):
        """Register one request middleware, or a list of them."""
        if isinstance(request_middlewares, list):
            self.request_middlewares.extend(request_middlewares)
        else:
            self.request_middlewares.append(request_middlewares)

    def add_response_middlewares(
        self, response_middlewares: Union[ResponseMiddlewareBase,
                                          List[ResponseMiddlewareBase]]):
        """Register one response middleware, or a list of them."""
        if isinstance(response_middlewares, list):
            self.response_middlewares.extend(response_middlewares)
        else:
            self.response_middlewares.append(response_middlewares)

    def remove_request_middlewares(self, request_middlewares):
        """Delete request middleware(s) by list index (or slice)."""
        del self.request_middlewares[request_middlewares]

    def remove_response_middlewares(self, response_middlewares):
        """Delete response middleware(s) by list index (or slice)."""
        del self.response_middlewares[response_middlewares]

    def get_response(self,
                     req: PreparedRequest,
                     timeout=None,
                     allow_redirects=True,
                     cache_mode=None,
                     alive_time=None,
                     retry=None):
        """Low-level API: answer ``req`` from the cache or send it with retries

        :param req: the prepared request to send
        :param timeout: timeout for the send; falsy means ``self.timeout``
        :param allow_redirects: forwarded to ``Session.send``
        :param cache_mode: one of the ``*_CACHE`` constants; ``None``
            means ``self.cache_mode``
        :param alive_time: lifetime handed to the cache when storing;
            falsy means ``self.alive_time``
        :param retry: number of attempts; falsy means ``self.retry``
        :return: the cached or freshly fetched response. When the last
            attempt still gets a bad HTTP status, that response is
            returned instead of raising.
        :raises requests.RequestException: when the last attempt fails
            with a timeout or another request error
        """
        # ``DISABLE_CACHE`` is 0, so a truthiness test would silently
        # replace an explicit "disable" with the spider-wide default.
        if cache_mode is None:
            cache_mode = self.cache_mode
        alive_time = alive_time or self.alive_time
        timeout = timeout or self.timeout
        retry = retry or self.retry
        cache_signal = req.url

        if cache_mode == Spider.DISABLE_CACHE:
            # Caching is disabled for this url: drop it from the cache
            self.cache.clear_cache(cache_signal)
        elif cache_mode in (Spider.FORCE_CACHE, Spider.ENABLE_CACHE):
            # In forced mode an entry is used even when it has expired
            forced = cache_mode == Spider.FORCE_CACHE
            if self.cache.is_cached(cache_signal, ignore_date=forced):
                logger.debug('从缓存: {} <- {}', limit_text(cache_signal, 100),
                             self.cache)
                response = self.cache.from_cache(cache_signal, force=forced)
                if self.encoding:
                    response.encoding = self.encoding
                return response

        for attempt in range(1, retry + 1):
            last_attempt = attempt == retry
            try:
                raw_response = self.session.send(
                    req, timeout=timeout, allow_redirects=allow_redirects)
                if self.encoding:
                    raw_response.encoding = self.encoding
                response = Response(raw_response)
                # Pause between requests
                self.sleeper()
                redirects = response.response.history
                if len(redirects) >= 1:
                    history = [each.url for each in redirects]
                    history.append(response.url)
                    logger.debug(
                        '页面重定向: {}',
                        '->'.join(['[{}]'.format(i) for i in history]))
                if not response.response.ok:
                    raise HTTPBadCodeError('坏HTTP响应', response)
                if cache_mode == Spider.ENABLE_CACHE:
                    # NOTE(review): stored under ``response.url`` while the
                    # lookup above uses ``req.url``; these may differ after
                    # a redirect -- confirm this is intended.
                    self.cache.cache(response.url, response, alive_time)
                return response
            except requests.Timeout:
                if last_attempt:
                    raise
                logger.debug('超时,重试---{}'.format(str(attempt)))
            except requests.RequestException:
                if last_attempt:
                    logger.error('取消重试---{}'.format(str(attempt)))
                    raise
                logger.error('HTTP报错---{}'.format(str(attempt)))
                # TODO: save the failed `url` to a separate `log` file
            except HTTPBadCodeError as e:
                bad_response = e.args[1]
                if last_attempt:
                    logger.info('坏HTTP响应, 取消重试({})'.format(
                        bad_response.status_code))
                    return bad_response
                logger.debug('坏HTTP响应({})---{}', bad_response.status_code,
                             attempt)

    def lunch(self,
              method,
              url,
              *,
              cache_mode=None,
              alive_time=None,
              timeout=None,
              retry=None,
              headers=None,
              data=None,
              params=None,
              cookies=None,
              hooks=None):
        """High-level API: build the request, run the middlewares, fetch

        The request is prepared by the session, passed through every
        request middleware, handed to :meth:`get_response`, and the
        result is passed through every response middleware.
        """
        req = requests.Request(
            method=method.upper(),
            url=url,
            headers=headers,
            files=None,
            data=data or {},
            json=None,
            params=params or {},
            auth=None,
            cookies=cookies,
            hooks=hooks,
        )
        req = self.session.prepare_request(req)
        for request_middleware in self.request_middlewares:
            req = request_middleware(self, req)
        resp = self.get_response(req,
                                 cache_mode=cache_mode,
                                 alive_time=alive_time,
                                 timeout=timeout,
                                 retry=retry)
        for response_middleware in self.response_middlewares:
            resp = response_middleware(self, resp)
        return resp

    def get(self, *args, **kwargs) -> [Response, requests.Response, object]:
        """Fetch a page with GET

        :param args: positional arguments for :meth:`lunch` (the url)
        :param kwargs: keyword options accepted by :meth:`lunch`
        :return: whatever :meth:`lunch` returns
        """
        return self.lunch('get', *args, **kwargs)

    def post(self, *args, **kwargs) -> Response:
        """Fetch a page with POST

        :param args: positional arguments for :meth:`lunch` (the url)
        :param kwargs: keyword options accepted by :meth:`lunch`
        :return: whatever :meth:`lunch` returns
        """
        return self.lunch('post', *args, **kwargs)

    def update_headers(self):
        """Call `self.headers_generator` to refresh the session headers"""
        self.session.headers.update(self.headers_generator())

    def close(self):
        """Close the underlying session."""
        self.session.close()
class RequestHandler(object):
    '''
    Is a general request handler that serves all specific implementations by
    simply changing the resource dir and the additional options.
    Every new implementation should 'register' in HandlerTemplate
    '''

    def __init__(self, connectionDetails):
        '''
        Store the connection details and open a session.

        Raises RequestException when connectionDetails is not a
        ConnectionDetails instance.
        '''
        if not isinstance(connectionDetails, ConnectionDetails):
            raise RequestException(
                "Connection detail is not actually derived from Connectiondetails."
            )
        self.__connectionDetails = connectionDetails
        # When True, the prepared request and the response are
        # pretty-printed around every send.
        self.debug = False
        self.session = Session()

    def handlerId(self):
        '''
        Placeholder that specializations must override; always raises
        RequestException naming the offending class.
        '''
        # The class object itself used to be concatenated here, which
        # raised TypeError instead of the intended message.
        raise RequestException(
            "handlerId() is not overridden in specialization: " +
            self.__class__.__name__ +
            ", please implement it as a static method.")

    @staticmethod
    def __withResponseBody(error):
        '''
        Append the response body to error.args when the error carries a
        response; errors without one are left untouched.
        '''
        response = getattr(error, "response", None)
        if response is not None:
            error.args = error.args + (response.content, )
        return error

    @staticmethod
    def __mergedOrNone(defaults, overrides):
        '''
        Return defaults overlaid with overrides (which may be None) as a
        new dict, or None when the result is empty.
        '''
        merged = {}
        merged.update(defaults)
        if overrides is not None:
            merged.update(overrides)
        return merged or None

    def __dispatch(self, request):
        '''
        Prepare and send request, returning the response.

        Raises requests.RequestException on transport errors and on
        error status codes; the response body is appended to the
        exception arguments whenever a response is available.
        '''
        try:
            pr = self.session.prepare_request(request)
            if self.debug:
                pprint(vars(pr), indent=2, depth=100)
            r = self.session.send(pr)
            if self.debug:
                pprint(vars(r), indent=2, depth=100)
            r.raise_for_status()
        except requests.RequestException as e:
            self.__withResponseBody(e)
            raise
        return r

    def __sendToResource(self,
                         method,
                         resource,
                         headerdict,
                         paramdict,
                         hasBody=False,
                         bodydict=None,
                         rawbody=None):
        '''
        Shared implementation of the verb helpers: merge the
        connection-level defaults with the per-call values, build the
        request for resource and dispatch it.
        '''
        if hasBody and (bodydict is not None) and (rawbody is not None):
            raise RequestException(
                "Not allowed to specify dictionary and raw body at the same time."
            )
        details = self.__connectionDetails
        if details is None:
            raise RequestException("Connection detail is None.")

        h = self.__mergedOrNone(details.getHeader(), headerdict)
        b = self.__mergedOrNone(details.getBody(),
                                bodydict) if hasBody else None
        p = self.__mergedOrNone(details.getParam(), paramdict)

        payload = {}
        if hasBody:
            if rawbody is not None:
                # ignoring template-default body parameters!!!
                payload["data"] = rawbody
            else:
                payload["json"] = b

        request = Request(method,
                          details.formURL() + self._formatResource(resource),
                          headers=h,
                          params=p,
                          auth=details.getAuth(),
                          **payload)
        return self.__dispatch(request)

    def _raw(self,
             mode,
             url,
             headerdict=None,
             paramdict=None,
             bodydict=None,
             rawbody=None):
        '''
        Send a request with verb mode to the absolute url, without
        applying any connection-level default header, body or parameter.
        bodydict and rawbody are mutually exclusive.
        '''
        if (bodydict is not None) and (rawbody is not None):
            raise RequestException(
                "Not allowed to specify dictionary and raw body at the same time."
            )
        return self.__dispatch(
            Request(mode,
                    url,
                    headers=headerdict,
                    params=paramdict,
                    data=rawbody,
                    json=bodydict,
                    auth=self.__connectionDetails.getAuth()))

    def _post(self,
              resource,
              headerdict=None,
              paramdict=None,
              bodydict=None,
              rawbody=None):
        '''
        POST to resource. bodydict is merged over the default body and
        sent as JSON; rawbody is sent as-is and bypasses the default
        body. The two are mutually exclusive.
        '''
        return self.__sendToResource("POST",
                                     resource,
                                     headerdict,
                                     paramdict,
                                     hasBody=True,
                                     bodydict=bodydict,
                                     rawbody=rawbody)

    def _put(self,
             resource,
             headerdict=None,
             paramdict=None,
             bodydict=None,
             rawbody=None):
        '''
        PUT to resource; body handling is the same as in _post.
        '''
        return self.__sendToResource("PUT",
                                     resource,
                                     headerdict,
                                     paramdict,
                                     hasBody=True,
                                     bodydict=bodydict,
                                     rawbody=rawbody)

    def _get(self, resource, headerdict=None, paramdict=None):
        '''
        GET resource.
        '''
        # attaching body is highly discouraged, therefore not supported
        return self.__sendToResource("GET", resource, headerdict, paramdict)

    def _del(self, resource, headerdict=None, paramdict=None):
        '''
        DELETE resource.
        '''
        # attaching body is highly discouraged, therefore not supported
        return self.__sendToResource("DELETE", resource, headerdict,
                                     paramdict)

    def getConnectionDetails(self):
        '''
        Return the connection details given at construction.
        '''
        return self.__connectionDetails

    def _formatResource(self, *args):
        '''
        Join the given string parts into a path starting with "/".
        Backslashes become slashes and doubled or tripled slashes are
        collapsed. Without parts the result is the empty string.
        '''
        joined = "".join("/" + part for part in args)
        return joined.replace("\\", "/").replace("///", "/").replace("//", "/")