def test_get_returns_none_if_expired(self):
    cache = Cache(expiration_in_seconds=0.1)
    cache.put('http://www.google.com', 'response')

    time.sleep(0.5)

    expect(cache.get('http://www.google.com')).to_be_null()
    expect(cache.responses).not_to_include('http://www.google.com')
class Octopus(object):
    def __init__(
            self, concurrency=10, auto_start=False, cache=False,
            expiration_in_seconds=30, request_timeout_in_seconds=5,
            limiter=None):
        self.concurrency = concurrency
        self.auto_start = auto_start
        self.cache = cache
        self.response_cache = Cache(expiration_in_seconds=expiration_in_seconds)
        self.request_timeout_in_seconds = request_timeout_in_seconds
        self.url_queue = OctopusQueue()
        self.limiter = limiter

        if auto_start:
            self.start()

    def from_requests_response(self, url, response):
        content = None
        if hasattr(response, 'content'):
            content = response.content

        return Response(
            url=url,
            status_code=response.status_code,
            headers=dict(response.headers.items()),
            cookies=dict(response.cookies.items()),
            text=response.text,
            effective_url=response.url,
            content=content,
            error=response.text if response.status_code > 399 else None,
            # total_seconds is a method on timedelta; it must be called,
            # otherwise request_time would hold a bound method, not a float.
            request_time=response.elapsed.total_seconds() if response.elapsed else 0
        )

    def enqueue(self, url, handler, method='GET', **kw):
        if self.cache:
            response = self.response_cache.get(url)
            if response is not None:
                handler(url, response)
                return

        self.url_queue.put_nowait((url, handler, method, kw))

    @property
    def queue_size(self):
        return self.url_queue.qsize()

    @property
    def is_empty(self):
        return self.url_queue.empty()

    def start(self):
        for i in range(self.concurrency):
            t = Thread(target=self.do_work)
            t.daemon = True
            t.start()

    def do_work(self):
        while True:
            url, handler, method, kwargs = self.url_queue.get()

            response = None
            if self.cache:
                response = self.response_cache.get(url)

            if response is None:
                if self.limiter and not self.limiter.acquire(url):
                    # Could not get a slot for this url: put it back on the
                    # queue and retry after a short sleep.
                    logging.info('Could not acquire limit for url "%s".' % url)
                    self.url_queue.task_done()
                    self.url_queue.put_nowait((url, handler, method, kwargs))
                    self.limiter.publish_lock_miss(url)
                    time.sleep(0.1)
                    continue

                try:
                    response = requests.request(
                        method, url,
                        timeout=self.request_timeout_in_seconds, **kwargs)
                except requests.exceptions.Timeout:
                    err = sys.exc_info()[1]
                    response = ResponseError(
                        url=url,
                        status_code=500,
                        text=str(err),
                        error=err,
                        elapsed=timedelta(seconds=self.request_timeout_in_seconds)
                    )
                except Exception:
                    err = sys.exc_info()[1]
                    response = ResponseError(
                        url=url,
                        status_code=599,
                        text=str(err),
                        error=err
                    )
                finally:
                    if self.limiter:
                        self.limiter.release(url)

                original_response = response
                response = self.from_requests_response(url, response)
                original_response.close()

                if self.cache:
                    self.response_cache.put(url, response)

            handler(url, response)

            self.url_queue.task_done()

    def wait(self, timeout=10):
        if timeout > 0:
            self.url_queue.join_with_timeout(timeout=timeout)
        else:
            self.url_queue.join()
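# Usage sketch for the threaded Octopus above (illustrative only; the url and
# handler names below are placeholders, not part of the library):
def handle_url_response(url, response):
    # Handlers receive the requested url and the normalized Response object.
    print(url, response.status_code)


otto = Octopus(concurrency=4, auto_start=True, cache=True)
otto.enqueue('http://www.google.com', handle_url_response)
otto.wait(timeout=10)  # blocks until the queue drains or the timeout expires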
class TornadoOctopus(object):
    def __init__(
            self, concurrency=10, auto_start=False, cache=False,
            expiration_in_seconds=30, request_timeout_in_seconds=10,
            connect_timeout_in_seconds=5, ignore_pycurl=False,
            limiter=None, allow_connection_reuse=True):
        self.concurrency = concurrency
        self.auto_start = auto_start
        self.last_timeout = None

        self.cache = cache
        self.response_cache = Cache(expiration_in_seconds=expiration_in_seconds)
        self.request_timeout_in_seconds = request_timeout_in_seconds
        self.connect_timeout_in_seconds = connect_timeout_in_seconds

        self.ignore_pycurl = ignore_pycurl

        self.running_urls = 0
        self.url_queue = []

        if PYCURL_AVAILABLE and not self.ignore_pycurl:
            logging.debug(
                'pycurl is available, thus Octopus will be using it instead '
                'of tornado\'s simple http client.')
            AsyncHTTPClient.configure('tornado.curl_httpclient.CurlAsyncHTTPClient')
            self.allow_connection_reuse = allow_connection_reuse
        else:
            self.allow_connection_reuse = True

        if auto_start:
            logging.debug('Auto starting...')
            self.start()

        self.limiter = limiter

    @property
    def queue_size(self):
        return len(self.url_queue)

    @property
    def is_empty(self):
        return self.queue_size == 0

    def start(self):
        logging.debug('Creating IOLoop and http_client.')
        self.ioloop = IOLoop()
        self.http_client = AsyncHTTPClient(io_loop=self.ioloop)

    @classmethod
    def from_tornado_response(cls, url, response):
        cookies = response.request.headers.get('Cookie', '')
        if cookies:
            # Split on '=' at most once and strip the '; ' separators, so
            # names keep no leading spaces and values may contain '='.
            cookies = dict(
                cookie.strip().split('=', 1)
                for cookie in cookies.split(';'))

        return Response(
            url=url,
            status_code=response.code,
            headers=dict(response.headers.items()),
            cookies=cookies,
            text=response.body,
            effective_url=response.effective_url,
            error=str(response.error) if response.error else None,
            request_time=response.request_time
        )

    def enqueue(self, url, handler, method='GET', **kw):
        logging.debug('Enqueueing %s...' % url)

        if self.cache:
            response = self.response_cache.get(url)

            if response is not None:
                logging.debug('Cache hit on %s.' % url)
                handler(url, response)
                return

        if self.running_urls < self.concurrency:
            logging.debug('Queue has space available for fetching %s.' % url)
            self.get_next_url(url, handler, method, **kw)
        else:
            logging.debug('Queue is full. Enqueueing %s for future fetch.' % url)
            self.url_queue.append((url, handler, method, kw))

    def fetch(self, url, handler, method, **kw):
        self.running_urls += 1

        if self.cache:
            response = self.response_cache.get(url)

            if response is not None:
                logging.debug('Cache hit on %s.' % url)
                self.running_urls -= 1
                handler(url, response)
                return

        logging.info('Fetching %s...' % url)

        request = HTTPRequest(
            url=url,
            method=method,
            connect_timeout=self.connect_timeout_in_seconds,
            request_timeout=self.request_timeout_in_seconds,
            prepare_curl_callback=self.handle_curl_callback,
            **kw
        )

        self.http_client.fetch(request, self.handle_request(url, handler))

    def handle_curl_callback(self, curl):
        if not self.allow_connection_reuse:
            curl.setopt(pycurl.FRESH_CONNECT, 1)

    def get_next_url(self, request_url=None, handler=None, method=None, **kw):
        if request_url is None:
            if not self.url_queue:
                return

            request_url, handler, method, kw = self.url_queue.pop()

        self.fetch_next_url(request_url, handler, method, **kw)

    def fetch_next_url(self, request_url, handler, method, **kw):
        if self.limiter and not self.limiter.acquire(request_url):
            # Could not get a slot: re-enqueue the url and schedule a retry
            # on the IOLoop after the limiter's miss timeout.
            logging.info('Could not acquire limit for url "%s".' % request_url)
            self.url_queue.append((request_url, handler, method, kw))
            deadline = timedelta(
                seconds=self.limiter.limiter_miss_timeout_ms / 1000.0)
            self.ioloop.add_timeout(deadline, self.get_next_url)
            self.limiter.publish_lock_miss(request_url)
            return False

        logging.debug('Queue has space available for fetching %s.' % request_url)
        self.fetch(request_url, handler, method, **kw)
        return True

    def handle_request(self, url, callback):
        # Returns a closure bound to this url/callback pair, which tornado
        # invokes when the fetch completes.
        def handle(response):
            logging.debug('Handler called for url %s...' % url)
            self.running_urls -= 1

            response = self.from_tornado_response(url, response)
            logging.info('Got response(%s) from %s.' % (response.status_code, url))

            if self.cache and response and response.status_code < 399:
                logging.debug('Putting %s into cache.' % url)
                self.response_cache.put(url, response)

            if self.limiter:
                self.limiter.release(url)

            try:
                callback(url, response)
            except Exception:
                logging.exception('Error calling callback for %s.' % url)

            if self.running_urls < self.concurrency and self.url_queue:
                self.get_next_url()

            logging.debug(
                'Getting %d urls and still have %d more urls to get...'
                % (self.running_urls, self.remaining_requests))

            if self.running_urls < 1 and self.remaining_requests == 0:
                logging.debug('Nothing else to get. Stopping Octopus...')
                self.stop()

        return handle

    def handle_wait_timeout(self, signal_number, frames):
        logging.debug('Timeout waiting for IOLoop to finish. Stopping IOLoop manually.')
        self.stop(force=True)

    def wait(self, timeout=10):
        self.last_timeout = timeout

        if not self.url_queue and not self.running_urls:
            logging.debug('No urls to wait for. Returning immediately.')
            return

        if timeout:
            logging.debug('Waiting for urls to be retrieved for %s seconds.' % timeout)
            self.ioloop.set_blocking_signal_threshold(timeout, self.handle_wait_timeout)
        else:
            logging.debug('Waiting for urls to be retrieved.')

        logging.info('Starting IOLoop with %d URLs still left to process.'
                     % self.remaining_requests)
        self.ioloop.start()

    @property
    def remaining_requests(self):
        return len(self.url_queue)

    def stop(self, force=False):
        logging.info('Stopping IOLoop with %d URLs still left to process.'
                     % self.remaining_requests)
        self.ioloop.stop()
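# Usage sketch for TornadoOctopus (illustrative only): the public surface
# mirrors the threaded version, but all fetches run on a single IOLoop, so
# wait() drives the loop instead of joining worker threads.
def handle_url_response(url, response):
    print(url, response.status_code)


otto = TornadoOctopus(concurrency=4, auto_start=True, cache=True)
otto.enqueue('http://www.google.com', handle_url_response)
otto.wait(timeout=10)  # runs the IOLoop until every request finishes or times out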
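# Minimal in-process limiter sketch, assuming only the interface both classes
# above rely on: acquire(url), release(url), publish_lock_miss(url) and
# limiter_miss_timeout_ms. A real limiter (e.g. one shared via redis) will
# differ, and a threaded deployment would also need a lock around the counters.
class InMemoryLimiter(object):
    # TornadoOctopus waits this long before retrying after a lock miss.
    limiter_miss_timeout_ms = 500

    def __init__(self, max_concurrent_per_url=2):
        self.max_concurrent_per_url = max_concurrent_per_url
        self.active = {}

    def acquire(self, url):
        # Returning False makes Octopus re-enqueue the request and retry later.
        if self.active.get(url, 0) >= self.max_concurrent_per_url:
            return False
        self.active[url] = self.active.get(url, 0) + 1
        return True

    def release(self, url):
        self.active[url] = max(self.active.get(url, 0) - 1, 0)

    def publish_lock_miss(self, url):
        # Hook for reporting misses; a no-op in this sketch.
        pass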
def test_can_get_after_put(self):
    cache = Cache(expiration_in_seconds=10)
    cache.put('http://www.google.com', 'response')

    expect(cache.get('http://www.google.com')).to_equal('response')
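# Minimal sketch of the Cache the tests in this section exercise, assuming the
# interface they imply (put/get plus a `responses` mapping that drops expired
# entries); the real implementation may differ.
import time


class Cache(object):
    def __init__(self, expiration_in_seconds=30):
        self.expiration_in_seconds = expiration_in_seconds
        self.responses = {}  # url -> (response, time it was stored)

    def put(self, url, response):
        self.responses[url] = (response, time.time())

    def get(self, url):
        if url not in self.responses:
            return None
        response, stored_at = self.responses[url]
        if time.time() - stored_at > self.expiration_in_seconds:
            # Expired entries are evicted so `responses` no longer includes
            # the url, which is what the expiry test asserts.
            del self.responses[url]
            return None
        return response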