def fetch(self, request, **kwargs):
    """Executes an HTTPRequest, returning an HTTPResponse.

    If an error occurs during the fetch, we raise an HTTPError.
    """
    if not isinstance(request, HTTPRequest):
        request = HTTPRequest(url=request, **kwargs)
    buffer = cStringIO.StringIO()
    headers = httputil.HTTPHeaders()
    try:
        _curl_setup_request(self._curl, request, buffer, headers)
        self._curl.perform()
        code = self._curl.getinfo(pycurl.HTTP_CODE)
        effective_url = self._curl.getinfo(pycurl.EFFECTIVE_URL)
        buffer.seek(0)
        response = HTTPResponse(
            request=request, code=code, headers=headers,
            buffer=buffer, effective_url=effective_url)
        if code < 200 or code >= 300:
            raise HTTPError(code, response=response)
        return response
    except pycurl.error, e:
        buffer.close()
        raise CurlError(*e)
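As a point of reference, the blocking client can be driven roughly as in the sketch below. It assumes this fetch() is a method of the synchronous HTTPClient class (not shown in this excerpt) and uses the flat-module import style seen elsewhere in these excerpts; response.body and response.error match the attributes used in the AsyncHTTPClient docstring further down.

# Minimal blocking-usage sketch; assumes fetch() above belongs to the
# synchronous HTTPClient class, which is not shown in this excerpt.
import httpclient

http_client = httpclient.HTTPClient()
try:
    response = http_client.fetch("http://www.google.com/")
    print response.body
except httpclient.HTTPError, e:
    print "Error:", e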
def _process_queue(self):
    while True:
        started = 0
        while self._free_list and self._requests:
            started += 1
            curl = self._free_list.pop()
            (request, callback) = self._requests.popleft()
            curl.info = {
                "headers": httputil.HTTPHeaders(),
                "buffer": cStringIO.StringIO(),
                "request": request,
                "callback": callback,
                "start_time": time.time(),
            }
            # Disable IPv6 to mitigate the effects of this bug
            # on curl versions <= 7.21.0
            # http://sourceforge.net/tracker/?func=detail&aid=3017819&group_id=976&atid=100976
            if pycurl.version_info()[2] <= 0x71500:  # 7.21.0
                curl.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V4)
            _curl_setup_request(curl, request, curl.info["buffer"],
                                curl.info["headers"])
            self._multi.add_handle(curl)
        if not started:
            break
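The 0x71500 constant above is the libcurl version as pycurl.version_info()[2] reports it: one byte per component, so 7.21.0 packs to 0x071500. A quick check, assuming the standard LIBCURL_VERSION_NUM packing:

# libcurl reports its version number as 0xXXYYZZ (major, minor, patch),
# so the <= 0x71500 test covers every release up to and including 7.21.0.
def pack_curl_version(major, minor, patch):
    return (major << 16) | (minor << 8) | patch

assert pack_curl_version(7, 21, 0) == 0x71500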
def __init__(self, method, uri, version="HTTP/1.0", headers=None,
             body=None, remote_ip=None, protocol=None, host=None,
             files=None, connection=None):
    self.method = method
    self.uri = uri
    self.version = version
    self.headers = headers or httputil.HTTPHeaders()
    self.body = body or ""
    if connection and connection.xheaders:
        # Squid uses X-Forwarded-For, others use X-Real-Ip
        self.remote_ip = self.headers.get(
            "X-Real-Ip", self.headers.get("X-Forwarded-For", remote_ip))
        self.protocol = self.headers.get("X-Scheme", protocol) or "http"
    else:
        self.remote_ip = remote_ip
        self.protocol = protocol or "http"
    self.host = host or self.headers.get("Host") or "127.0.0.1"
    self.files = files or {}
    self.connection = connection
    self._start_time = time.time()
    self._finish_time = None

    scheme, netloc, path, query, fragment = urlparse.urlsplit(uri)
    self.path = path
    self.query = query
    arguments = cgi.parse_qs(query)
    self.arguments = {}
    for name, values in arguments.iteritems():
        values = [v for v in values if v]
        if values:
            self.arguments[name] = values
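To illustrate the X-Real-Ip / X-Scheme handling and the argument filtering above, a small sketch; it assumes this __init__ belongs to the server-side httpserver.HTTPRequest, and the FakeConnection stub is purely hypothetical (only an xheaders attribute is consulted here).

# Sketch of the xheaders path; FakeConnection is a hypothetical stub and
# the flat-module imports mirror the style used in these excerpts.
import httpserver
import httputil

class FakeConnection(object):
    xheaders = True

headers = httputil.HTTPHeaders()
headers["X-Real-Ip"] = "10.1.2.3"
headers["X-Scheme"] = "https"
request = httpserver.HTTPRequest("GET", "/entry?id=7&empty=",
                                 headers=headers, remote_ip="127.0.0.1",
                                 connection=FakeConnection())
assert request.remote_ip == "10.1.2.3"     # taken from X-Real-Ip
assert request.protocol == "https"         # taken from X-Scheme
assert request.arguments == {"id": ["7"]}  # blank values are dropped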
def __init__(self, url, method="GET", headers=None, body=None,
             auth_username=None, auth_password=None,
             connect_timeout=20.0, request_timeout=20.0,
             if_modified_since=None, follow_redirects=True,
             max_redirects=5, user_agent=None, use_gzip=True,
             network_interface=None, streaming_callback=None,
             header_callback=None, prepare_curl_callback=None,
             allow_nonstandard_methods=False):
    if headers is None:
        headers = httputil.HTTPHeaders()
    if if_modified_since:
        timestamp = calendar.timegm(if_modified_since.utctimetuple())
        headers["If-Modified-Since"] = email.utils.formatdate(
            timestamp, localtime=False, usegmt=True)
    if "Pragma" not in headers:
        headers["Pragma"] = ""
    self.url = _utf8(url)
    self.method = method
    self.headers = headers
    self.body = body
    self.auth_username = _utf8(auth_username)
    self.auth_password = _utf8(auth_password)
    self.connect_timeout = connect_timeout
    self.request_timeout = request_timeout
    self.follow_redirects = follow_redirects
    self.max_redirects = max_redirects
    self.user_agent = user_agent
    self.use_gzip = use_gzip
    self.network_interface = network_interface
    self.streaming_callback = streaming_callback
    self.header_callback = header_callback
    self.prepare_curl_callback = prepare_curl_callback
    self.allow_nonstandard_methods = allow_nonstandard_methods
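A short construction sketch for the client-side HTTPRequest above, showing how if_modified_since turns into an If-Modified-Since header; the URL, date, and flat httpclient import are illustrative assumptions.

# Sketch of constructing a client-side HTTPRequest with a conditional GET.
import datetime
import httpclient

request = httpclient.HTTPRequest(
    url="http://www.example.com/feed",
    method="GET",
    if_modified_since=datetime.datetime(2010, 8, 1, 12, 0, 0),
    request_timeout=10.0,
    user_agent="my-crawler/0.1")
print request.headers["If-Modified-Since"]
# e.g. "Sun, 01 Aug 2010 12:00:00 GMT"
print request.headers["Pragma"]  # defaults to the empty string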
def __init__(self, environ):
    """Parses the given WSGI environ to construct the request."""
    self.method = environ["REQUEST_METHOD"]
    self.path = urllib.quote(environ.get("SCRIPT_NAME", ""))
    self.path += urllib.quote(environ.get("PATH_INFO", ""))
    self.uri = self.path
    self.arguments = {}
    self.query = environ.get("QUERY_STRING", "")
    if self.query:
        self.uri += "?" + self.query
        arguments = cgi.parse_qs(self.query)
        for name, values in arguments.iteritems():
            values = [v for v in values if v]
            if values:
                self.arguments[name] = values
    self.version = "HTTP/1.1"
    self.headers = httputil.HTTPHeaders()
    if environ.get("CONTENT_TYPE"):
        self.headers["Content-Type"] = environ["CONTENT_TYPE"]
    if environ.get("CONTENT_LENGTH"):
        self.headers["Content-Length"] = int(environ["CONTENT_LENGTH"])
    for key in environ:
        if key.startswith("HTTP_"):
            self.headers[key[5:].replace("_", "-")] = environ[key]
    if self.headers.get("Content-Length"):
        self.body = environ["wsgi.input"].read()
    else:
        self.body = ""
    self.protocol = environ["wsgi.url_scheme"]
    self.remote_ip = environ.get("REMOTE_ADDR", "")
    if environ.get("HTTP_HOST"):
        self.host = environ["HTTP_HOST"]
    else:
        self.host = environ["SERVER_NAME"]

    # Parse request body
    self.files = {}
    content_type = self.headers.get("Content-Type", "")
    if content_type.startswith("application/x-www-form-urlencoded"):
        for name, values in cgi.parse_qs(self.body).iteritems():
            self.arguments.setdefault(name, []).extend(values)
    elif content_type.startswith("multipart/form-data"):
        if 'boundary=' in content_type:
            boundary = content_type.split('boundary=', 1)[1]
            if boundary:
                self._parse_mime_body(boundary)
        else:
            logging.warning("Invalid multipart/form-data")

    self._start_time = time.time()
    self._finish_time = None
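A sketch of feeding this constructor a hand-rolled WSGI environ; it assumes the __init__ above belongs to the HTTPRequest class in a flat wsgi module, and the environ dict is only a minimal example.

# Sketch of building a request from a minimal WSGI environ.
import cStringIO
import wsgi

environ = {
    "REQUEST_METHOD": "POST",
    "SCRIPT_NAME": "",
    "PATH_INFO": "/login",
    "QUERY_STRING": "next=%2Fhome",
    "CONTENT_TYPE": "application/x-www-form-urlencoded",
    "CONTENT_LENGTH": "23",
    "HTTP_HOST": "example.com",
    "REMOTE_ADDR": "127.0.0.1",
    "wsgi.url_scheme": "http",
    "wsgi.input": cStringIO.StringIO("username=bob&password=x"),
}
request = wsgi.HTTPRequest(environ)
print request.uri                    # /login?next=%2Fhome
print request.arguments["username"]  # ['bob'] (parsed from the body)
print request.arguments["next"]      # ['/home'] (parsed from the query string)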
class AsyncHTTPClient(object):
    """A non-blocking HTTP client backed by pycurl.

    Example usage:

        import ioloop

        def handle_request(response):
            if response.error:
                print "Error:", response.error
            else:
                print response.body
            ioloop.IOLoop.instance().stop()

        http_client = httpclient.AsyncHTTPClient()
        http_client.fetch("http://www.google.com/", handle_request)
        ioloop.IOLoop.instance().start()

    fetch() can take a string URL or an HTTPRequest instance, which offers
    more options, like executing POST/PUT/DELETE requests.

    The keyword argument max_clients to the AsyncHTTPClient constructor
    determines the maximum number of simultaneous fetch() operations that
    can execute in parallel on each IOLoop.
    """
    _ASYNC_CLIENTS = weakref.WeakKeyDictionary()

    def __new__(cls, io_loop=None, max_clients=10,
                max_simultaneous_connections=None):
        # There is one client per IOLoop since they share curl instances
        io_loop = io_loop or ioloop.IOLoop.instance()
        if io_loop in cls._ASYNC_CLIENTS:
            return cls._ASYNC_CLIENTS[io_loop]
        else:
            instance = super(AsyncHTTPClient, cls).__new__(cls)
            instance.io_loop = io_loop
            instance._multi = pycurl.CurlMulti()
            instance._curls = [_curl_create(max_simultaneous_connections)
                               for i in xrange(max_clients)]
            instance._free_list = instance._curls[:]
            instance._requests = collections.deque()
            instance._fds = {}
            instance._events = {}
            instance._added_perform_callback = False
            instance._timeout = None
            instance._closed = False
            cls._ASYNC_CLIENTS[io_loop] = instance
            return instance

    def close(self):
        """Destroys this http client, freeing any file descriptors used.

        Not needed in normal use, but may be helpful in unittests that
        create and destroy http clients.  No other methods may be called
        on the AsyncHTTPClient after close().
        """
        del AsyncHTTPClient._ASYNC_CLIENTS[self.io_loop]
        for curl in self._curls:
            curl.close()
        self._multi.close()
        self._closed = True

    def fetch(self, request, callback, **kwargs):
        """Executes an HTTPRequest, calling callback with an HTTPResponse.

        If an error occurs during the fetch, the HTTPResponse given to the
        callback has a non-None error attribute that contains the exception
        encountered during the request. You can call response.reraise() to
        throw the exception (if any) in the callback.
        """
        if not isinstance(request, HTTPRequest):
            request = HTTPRequest(url=request, **kwargs)
        self._requests.append((request, callback))
        self._add_perform_callback()

    def _add_perform_callback(self):
        if not self._added_perform_callback:
            self.io_loop.add_callback(self._perform)
            self._added_perform_callback = True

    def _handle_events(self, fd, events):
        self._events[fd] = events
        self._add_perform_callback()

    def _handle_timeout(self):
        self._timeout = None
        self._perform()
    def _perform(self):
        self._added_perform_callback = False

        if self._closed:
            return

        while True:
            while True:
                ret, num_handles = self._multi.perform()
                if ret != pycurl.E_CALL_MULTI_PERFORM:
                    break

            # Update the set of active file descriptors.  It is important
            # that this happen immediately after perform() because
            # fds that have been removed from fdset are free to be reused
            # in user callbacks.
            fds = {}
            (readable, writable, exceptable) = self._multi.fdset()
            for fd in readable:
                fds[fd] = fds.get(fd, 0) | 0x1 | 0x2
            for fd in writable:
                fds[fd] = fds.get(fd, 0) | 0x4
            for fd in exceptable:
                fds[fd] = fds.get(fd, 0) | 0x8 | 0x10

            if fds and max(fds.iterkeys()) > 900:
                # Libcurl has a bug in which it behaves unpredictably with
                # file descriptors greater than 1024.  (This is because
                # even though it uses poll() instead of select(), it still
                # uses FD_SET internally.)  Since curl opens its own file
                # descriptors we can't catch this problem when it happens,
                # and the best we can do is detect that it's about to
                # happen.  Exiting is a lousy way to handle this error,
                # but there's not much we can do at this point.  Exiting
                # (and getting restarted by whatever monitoring process
                # is handling crashed tornado processes) will at least
                # get things working again and hopefully bring the issue
                # to someone's attention.
                # If you run into this issue, you either have a file
                # descriptor leak or need to run more tornado processes
                # (so that none of them are handling more than 1000
                # simultaneous connections).
                print >> sys.stderr, "ERROR: File descriptor too high for libcurl. Exiting."
                logging.error("File descriptor too high for libcurl. Exiting.")
                sys.exit(1)

            for fd in self._fds:
                if fd not in fds:
                    try:
                        self.io_loop.remove_handler(fd)
                    except (OSError, IOError) as e:
                        if e[0] != errno.ENOENT:
                            raise

            for fd, events in fds.iteritems():
                old_events = self._fds.get(fd, None)
                if old_events is None:
                    self.io_loop.add_handler(fd, self._handle_events, events)
                elif old_events != events:
                    try:
                        self.io_loop.update_handler(fd, events)
                    except (OSError, IOError) as e:
                        if e[0] == errno.ENOENT:
                            self.io_loop.add_handler(fd, self._handle_events,
                                                     events)
                        else:
                            raise
            self._fds = fds

            # Handle completed fetches
            completed = 0
            while True:
                num_q, ok_list, err_list = self._multi.info_read()
                for curl in ok_list:
                    self._finish(curl)
                    completed += 1
                for curl, errnum, errmsg in err_list:
                    self._finish(curl, errnum, errmsg)
                    completed += 1
                if num_q == 0:
                    break

            # Start fetching new URLs
            started = 0
            while self._free_list and self._requests:
                started += 1
                curl = self._free_list.pop()
                (request, callback) = self._requests.popleft()
                curl.info = {
                    "headers": httputil.HTTPHeaders(),
                    "buffer": cStringIO.StringIO(),
                    "request": request,
                    "callback": callback,
                    "start_time": time.time(),
                }
                _curl_setup_request(curl, request, curl.info["buffer"],
                                    curl.info["headers"])
                self._multi.add_handle(curl)

            if not started and not completed:
                break

        if self._timeout is not None:
            self.io_loop.remove_timeout(self._timeout)
            self._timeout = None

        if num_handles:
            self._timeout = self.io_loop.add_timeout(
                time.time() + 0.2, self._handle_timeout)
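The hex masks used when building fds in _perform appear to correspond to the epoll event bits (which Tornado's IOLoop uses for its READ/WRITE/ERROR constants): readable descriptors get EPOLLIN|EPOLLPRI, writable ones EPOLLOUT, exceptional ones EPOLLERR|EPOLLHUP. This reading is an inference from the constants; the sketch below only checks it against the standard Linux values in the select module.

# The fd bitmasks in _perform look like epoll event bits; on Linux the
# select module exposes the same values, so the correspondence can be
# verified directly (this is an inference, not taken from the source).
import select

assert select.EPOLLIN == 0x1
assert select.EPOLLPRI == 0x2
assert select.EPOLLOUT == 0x4
assert select.EPOLLERR == 0x8
assert select.EPOLLHUP == 0x10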