def __init__(self, max_redirects=3, max_retries=3, retry_delay=0,
             cookiejar=None, headers=None, **kwargs):
    """
    Configure retry/redirect limits, default headers and the client pool.

    max_redirects and max_retries are coerced with int(); retry_delay and
    cookiejar are stored unchanged. ``headers``, when truthy, is merged on
    top of a copy of HTTPClient.DEFAULT_HEADERS. All remaining keyword
    arguments are passed straight to HTTPClientPool.
    """
    # Limits are normalised to integers, the delay is kept as given
    self.max_redirects = int(max_redirects)
    self.max_retries = int(max_retries)
    self.retry_delay = retry_delay

    # Work on a copy so the shared defaults are never modified
    self.default_headers = HTTPClient.DEFAULT_HEADERS.copy()
    if headers:
        self.default_headers.update(headers)

    self.cookiejar = cookiejar
    self.clientpool = HTTPClientPool(**kwargs)
class UserAgent(object): response_type = CompatResponse request_type = CompatRequest valid_response_codes = set([200, 206, 301, 302, 303, 307]) def __init__(self, max_redirects=3, max_retries=3, retry_delay=0, cookiejar=None, headers=None, **kwargs): self.max_redirects = int(max_redirects) self.max_retries = int(max_retries) self.retry_delay = retry_delay self.default_headers = HTTPClient.DEFAULT_HEADERS.copy() if headers: self.default_headers.update(headers) self.cookiejar = cookiejar self.clientpool = HTTPClientPool(**kwargs) def _make_request(self, url, method='GET', headers=None, payload=None): req_headers = self.default_headers.copy() if headers: req_headers.update(headers) if payload: # Adjust headers depending on payload content content_type = req_headers.get('content-type', None) if not content_type and isinstance(payload, dict): req_headers['content-type'] = "application/x-www-form-urlencoded; charset=utf-8" payload = urlencode(payload) req_headers['content-length'] = len(payload) elif not content_type: req_headers['content-type'] = 'application/octet-stream' payload = payload if isinstance(payload, basestring) else str(payload) req_headers['content-length'] = len(payload) elif content_type.startswith("multipart/form-data"): # See restkit for some example implementation # TODO: Implement it raise NotImplementedError else: payload = payload if isinstance(payload, basestring) else str(payload) req_headers['content-length'] = len(payload) return CompatRequest(url, method=method, headers=req_headers, payload=payload) def _urlopen(self, request): client = self.clientpool.get_client(request.url_split) resp = client.request(request.method, request.url_split.request_uri, body=request.payload, headers=request.headers) return CompatResponse(resp, request=request, sent_request=resp._sent_request) def _verify_status(self, status_code, url=None): """ Hook for subclassing """ if status_code not in self.valid_response_codes: raise BadStatusCode(url, code=status_code) def 
_handle_error(self, e, url=None): """ Hook for subclassing. Raise the error to interrupt further retrying, return it to continue retries and save the error, when retries exceed the limit. Temporary errors should be swallowed here for automatic retries. """ if isinstance(e, (socket.timeout, gevent.Timeout)): return e elif isinstance(e, (socket.error, gevent.dns.DNSError)) and \ e.errno in set([errno.ETIMEDOUT, errno.ENOLINK, errno.ENOENT, errno.EPIPE]): return e elif isinstance(e, ssl.SSLError) and 'read operation timed out' in str(e): return e elif isinstance(e, EmptyResponse): return e raise e, None, sys.exc_info()[2] def _handle_retries_exceeded(self, url, last_error=None): """ Hook for subclassing """ raise RetriesExceeded(url, self.max_retries, original=last_error) def urlopen(self, url, method='GET', response_codes=valid_response_codes, headers=None, payload=None, to_string=False, debug_stream=None, **kwargs): """ Open an URL, do retries and redirects and verify the status code """ # POST or GET parameters can be passed in **kwargs if kwargs: if not payload: payload = kwargs elif isinstance(payload, dict): payload.update(kwargs) req = self._make_request(url, method=method, headers=headers, payload=payload) for retry in xrange(self.max_retries): if retry > 0 and self.retry_delay: # Don't wait the first time and skip if no delay specified gevent.sleep(self.retry_delay) for _ in xrange(self.max_redirects): if self.cookiejar is not None: # Check against None to avoid issues with empty cookiejars self.cookiejar.add_cookie_header(req) try: resp = self._urlopen(req) except gevent.GreenletExit: raise except BaseException as e: e.request = req e = self._handle_error(e, url=req.url) break # Continue with next retry # We received a response if debug_stream is not None: debug_stream.write(self._conversation_str(url, resp) + '\n\n') try: self._verify_status(resp.status_code, url=req.url) except Exception as e: # Basic transmission successful, but not the wished result # 
Let's collect some debug info e.response = resp e.request = req e.http_log = self._conversation_str(url, resp) resp.release() e = self._handle_error(e, url=req.url) break # Continue with next retry if self.cookiejar is not None: # Check against None to avoid issues with empty cookiejars self.cookiejar.extract_cookies(resp, req) redirection = resp.headers.get('location') if resp.status_code in set([301, 302, 303, 307]) and redirection: resp.release() req.set_url(req.url_split.redirect(redirection)) req.method = 'GET' if resp.status_code in set([302, 303]) else req.method for item in ('content-length', 'content-type', 'content-encoding', 'cookie', 'cookie2'): req.headers.discard(item) req.payload = None continue if not to_string: return resp else: # to_string added as parameter, to handle empty response # bodies as error and continue retries automatically try: ret = resp.content except Exception as e: e = self._handle_error(e, url=url) break else: if not ret: e = EmptyResponse(url, "Empty response body received") e = self._handle_error(e, url=url) break else: return ret else: e = RetriesExceeded(url, "Redirection limit reached (%s)" % self.max_redirects) e = self._handle_error(e, url=url) else: return self._handle_retries_exceeded(url, last_error=e) @classmethod def _conversation_str(cls, url, resp): header_str = '\n'.join('%s: %s' % item for item in resp.headers.pretty_items()) ret = 'REQUEST: ' + url + '\n' + resp._sent_request + '\n\n' ret += 'RESPONSE: ' + resp._response.version + ' ' + \ str(resp.status_code) + '\n' + \ header_str + '\n\n' + resp.content return ret def download(self, url, fpath, chunk_size=16 * 1024, resume=False, **kwargs): kwargs.pop('to_string', None) headers = kwargs.pop('headers', {}) headers['Connection'] = 'Keep-Alive' if resume and os.path.isfile(fpath): offset = os.path.getsize(fpath) else: offset = 0 for _ in xrange(self.max_retries): if offset: headers['Range'] = 'bytes=%d-' % offset resp = self.urlopen(url, headers=headers, **kwargs) 
cr = resp.headers.get('Content-Range') if resp.status_code != 206 or not cr or not cr.startswith('bytes') or \ not cr.split(None, 1)[1].startswith(str(offset)): resp.release() offset = 0 if not offset: headers.pop('Range', None) resp = self.urlopen(url, headers=headers, **kwargs) with open(fpath, 'ab' if offset else 'wb') as f: if offset: f.seek(offset, os.SEEK_SET) try: data = resp.read(chunk_size) with resp: while data: f.write(data) data = resp.read(chunk_size) except BaseException as e: self._handle_error(e, url=url) if resp.headers.get('accept-ranges') == 'bytes': # Only if this header is set, we can fall back to partial download offset = f.tell() continue # All done, break outer loop break else: self._handle_retries_exceeded(url, last_error=e) return resp def close(self): self.clientpool.close()
class UserAgent(object): response_type = CompatResponse request_type = CompatRequest valid_response_codes = set([200, 206, 301, 302, 303, 307]) def __init__(self, max_redirects=3, max_retries=3, retry_delay=0, cookiejar=None, headers=None, **kwargs): self.max_redirects = int(max_redirects) self.max_retries = int(max_retries) self.retry_delay = retry_delay self.default_headers = HTTPClient.DEFAULT_HEADERS.copy() if headers: self.default_headers.update(headers) self.cookiejar = cookiejar self.clientpool = HTTPClientPool(**kwargs) def _make_request(self, url, method="GET", headers=None, payload=None): req_headers = self.default_headers.copy() if headers: req_headers.update(headers) if payload: # Adjust headers depending on payload content content_type = req_headers.get("content-type", None) if not content_type and isinstance(payload, dict): req_headers["content-type"] = "application/x-www-form-urlencoded; charset=utf-8" payload = urlencode(payload) req_headers["content-length"] = len(payload) elif not content_type: req_headers["content-type"] = "application/octet-stream" payload = payload if isinstance(payload, basestring) else str(payload) req_headers["content-length"] = len(payload) elif content_type.startswith("multipart/form-data"): # See restkit for some example implementation # TODO: Implement it raise NotImplementedError else: payload = payload if isinstance(payload, basestring) else str(payload) req_headers["content-length"] = len(payload) return CompatRequest(url, method=method, headers=req_headers, payload=payload) def _urlopen(self, request): client = self.clientpool.get_client(request.url_split) resp = client.request( request.method, request.url_split.request_uri, body=request.payload, headers=request.headers ) return CompatResponse(resp, request=request, sent_request=resp._sent_request) def _verify_status(self, status_code, url=None): """ Hook for subclassing """ if status_code not in self.valid_response_codes: raise BadStatusCode(url, code=status_code) 
def _handle_error(self, e, url=None): """ Hook for subclassing. Raise the error to interrupt further retrying, return it to continue retries and save the error, when retries exceed the limit. Temporary errors should be swallowed here for automatic retries. """ if isinstance(e, (socket.timeout, gevent.Timeout)): return e elif isinstance(e, (socket.error, gevent.dns.DNSError)) and e.errno in set( [errno.ETIMEDOUT, errno.ENOLINK, errno.ENOENT, errno.EPIPE] ): return e elif isinstance(e, ssl.SSLError) and "read operation timed out" in str(e): return e elif isinstance(e, EmptyResponse): return e raise e, None, sys.exc_info()[2] def _handle_retries_exceeded(self, url, last_error=None): """ Hook for subclassing """ raise RetriesExceeded(url, self.max_retries, original=last_error) def urlopen( self, url, method="GET", response_codes=valid_response_codes, headers=None, payload=None, to_string=False, debug_stream=None, **kwargs ): """ Open an URL, do retries and redirects and verify the status code """ # POST or GET parameters can be passed in **kwargs if kwargs: if not payload: payload = kwargs elif isinstance(payload, dict): payload.update(kwargs) req = self._make_request(url, method=method, headers=headers, payload=payload) for retry in xrange(self.max_retries): if retry > 0 and self.retry_delay: # Don't wait the first time and skip if no delay specified gevent.sleep(self.retry_delay) for _ in xrange(self.max_redirects): if self.cookiejar is not None: # Check against None to avoid issues with empty cookiejars self.cookiejar.add_cookie_header(req) try: resp = self._urlopen(req) except gevent.GreenletExit: raise except BaseException as e: e.request = req e = self._handle_error(e, url=req.url) break # Continue with next retry # We received a response if debug_stream is not None: debug_stream.write(self._conversation_str(url, resp) + "\n\n") try: self._verify_status(resp.status_code, url=req.url) except Exception as e: # Basic transmission successful, but not the wished 
result # Let's collect some debug info e.response = resp e.request = req e.http_log = self._conversation_str(url, resp) e = self._handle_error(e, url=req.url) break # Continue with next retry if self.cookiejar is not None: # Check against None to avoid issues with empty cookiejars self.cookiejar.extract_cookies(resp, req) redirection = resp.headers.get("location") if resp.status_code in set([301, 302, 303, 307]) and redirection: resp._response.release() req.set_url(req.url_split.redirect(redirection)) req.method = "GET" if resp.status_code in set([302, 303]) else req.method for item in ("content-length", "content-type", "content-encoding", "cookie", "cookie2"): req.headers.discard(item) req.payload = None continue if not to_string: return resp else: # to_string added as parameter, to handle empty response # bodies as error and continue retries automatically try: ret = resp.content except Exception as e: e = self._handle_error(e, url=url) break else: if not ret: e = EmptyResponse(url, "Empty response body received") e = self._handle_error(e, url=url) break else: return ret else: e = RetriesExceeded(url, "Redirection limit reached (%s)" % self.max_redirects) e = self._handle_error(e, url=url) else: return self._handle_retries_exceeded(url, last_error=e) @classmethod def _conversation_str(cls, url, resp): header_str = "\n".join("%s: %s" % item for item in resp.headers.pretty_items()) ret = "REQUEST: " + url + "\n" + resp._sent_request + "\n\n" ret += ( "RESPONSE: " + resp._response.version + " " + str(resp.status_code) + "\n" + header_str + "\n\n" + resp.content ) return ret def download(self, url, fpath, chunk_size=16 * 1024, resume=False, **kwargs): kwargs.pop("to_string", None) headers = kwargs.pop("headers", {}) headers["Connection"] = "Keep-Alive" if resume and os.path.isfile(fpath): offset = os.path.getsize(fpath) else: offset = 0 for _ in xrange(self.max_retries): if offset: headers["Range"] = "bytes=%d-" % offset resp = self.urlopen(url, headers=headers, 
**kwargs) cr = resp.headers.get("Content-Range") if ( resp.status_code != 206 or not cr or not cr.startswith("bytes") or not cr.split(None, 1)[1].startswith(str(offset)) ): resp._response.release() offset = 0 if not offset: headers.pop("Range", None) resp = self.urlopen(url, headers=headers, **kwargs) with open(fpath, "ab" if offset else "wb") as f: if offset: f.seek(offset, os.SEEK_SET) try: data = resp.read(chunk_size) while data: f.write(data) data = resp.read(chunk_size) except BaseException as e: self._handle_error(e, url=url) if resp.headers.get("accept-ranges") == "bytes": # Only if this header is set, we can fall back to partial download offset = f.tell() continue # All done, break outer loop break else: self._handle_retries_exceeded(url, last_error=e) return resp def close(self): self.clientpool.close()