Example #1
    def __init__(self, rq=None):
        """
        Create an Cassandra based HTTP cache.

        **Arguments:**
         * *cassandra_client* -- Cassandra client object.

        **Keyword arguments:**
         * *rq* -- Request Queuer object. (Default ``None``)

        """
        if rq is None:
            self.rq = RequestQueuer()
        else:
            self.rq = rq
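The constructor above supports dependency injection: pass an existing RequestQueuer so several components share one set of per-host throttles, or omit rq to get a private queue. A minimal usage sketch, assuming hypothetical module paths for the imports:

# Hypothetical import paths; adjust to wherever RequestQueuer and
# PageGetter live in your checkout.
from requestqueuer import RequestQueuer
from pagegetter import PageGetter

pg_default = PageGetter()       # builds its own RequestQueuer
rq = RequestQueuer(max_requests_per_host_per_second=2)
pg_shared = PageGetter(rq=rq)   # shares an externally throttled queue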
Example #2
    def __init__(self, config, pg=None):
        # Resource Mappings
        self.service_mapping = config["service_mapping"]
        self.service_args_mapping = config["service_args_mapping"]
        self.inverted_args_mapping = dict([(s[0], invert(s[1]))
            for s in self.service_args_mapping.items()])
        # configure stats
        self.stats = config.get('stats', stats.stats)
        stats.stats = self.stats
        # Request Queuer
        self.rq = RequestQueuer(
            max_simultaneous_requests=config["max_simultaneous_requests"],
            max_requests_per_host_per_second=config[
                "max_requests_per_host_per_second"],
            max_simultaneous_requests_per_host=config[
                "max_simultaneous_requests_per_host"])
        self.rq.setHostMaxRequestsPerSecond("127.0.0.1", 0)
        self.rq.setHostMaxSimultaneousRequests("127.0.0.1", 0)
        if pg is None:
            self.pg = PageGetter(rq=self.rq)
        else:
            self.pg = pg
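This __init__ reads every throttle limit and mapping straight out of a config dict. A sketch of the minimum keys it touches; the concrete values here are illustrative assumptions, not project defaults (the class name BaseServer is taken from Example #8):

# Illustrative config covering the keys this constructor reads.
config = {
    "service_mapping": {},
    "service_args_mapping": {},
    "max_simultaneous_requests": 100,
    "max_requests_per_host_per_second": 2,
    "max_simultaneous_requests_per_host": 5,
    # "stats" is optional; config.get() falls back to the module-level
    # stats.stats object.
}
server = BaseServer(config)

Note that both localhost limits are then set to 0, which in this codebase appears to lift throttling for requests to 127.0.0.1.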
Example #3
    def __init__(self,
        cassandra_client,
        redis_client,
        disable_negative_cache=False,
        time_offset=0,
        rq=None):
        """
        Create an Cassandra based HTTP cache.

        **Arguments:**
            * *cassandra_client* -- Cassandra client object.
        **Keyword arguments:**
         * *rq* -- Request Queuer object. (Default ``None``)

        """
        self.cassandra_client = cassandra_client
        self.redis_client = redis_client
        self.disable_negative_cache = disable_negative_cache
        self.time_offset = time_offset
        if rq is None:
            self.rq = RequestQueuer()
        else:
            self.rq = rq
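This variant wires storage clients into the cache (the later examples show this __init__ belongs to a PageGetter class). A construction sketch with stand-in clients; any objects exposing the interfaces the class actually calls (for example redis_client.get and redis_client.delete) would do:

# Stand-in clients for illustration only; real code would pass a
# Twisted-friendly Cassandra client and Redis client here.
cassandra_client = object()
redis_client = object()
pg = PageGetter(
    cassandra_client,
    redis_client,
    disable_negative_cache=True,  # skip negative-cache lookups entirely
    time_offset=0,
    rq=RequestQueuer())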
Example #4
    def setUp(self):
        self.deferred = Deferred()
        self.mini_web_server = MiniWebServer()
        self.rq = RequestQueuer(
            max_requests_per_host_per_second=3,
            max_simultaneous_requests_per_host=5)
Example #5
class RequestQueuerTestCase(unittest.TestCase):
    
    def setUp(self):
        self.deferred = Deferred()
        self.mini_web_server = MiniWebServer()
        self.rq = RequestQueuer(
            max_requests_per_host_per_second=3,
            max_simultaneous_requests_per_host=5)
        
    def tearDown(self):
        return self.mini_web_server.shutdown()

    def testRequestQueuerOnSuccess(self):  
        d = self.rq.getPage("http://127.0.0.1:8080/helloworld", timeout=5)
        return d

    def testRequestQueuerOnFailure(self): 
        d = self.rq.getPage("http://0.0.0.0:99", timeout=5)
        d.addErrback(self._getPageErrback)  
        return d      
    
    def testHostMaxRequestsPerSecond(self):
        self.failUnlessEqual(
            self.rq.getHostMaxRequestsPerSecond("example.com"), 3)
        self.rq.setHostMaxRequestsPerSecond("example2.com", 7)
        self.failUnlessEqual(
            self.rq.getHostMaxRequestsPerSecond("example2.com"), 7)
            
    def testHostMaxSimultaneousRequests(self):
        self.failUnlessEqual(
            self.rq.getHostMaxSimultaneousRequests("example.com"), 5)
        self.rq.setHostMaxSimultaneousRequests("example2.com", 11)
        self.failUnlessEqual(
            self.rq.getHostMaxSimultaneousRequests("example2.com"),
            11)
            
    def testActive(self):
        self.failUnlessEqual(isinstance(self.rq.getActive(), int), True)
            
    def testPending(self):
        self.failUnlessEqual(isinstance(self.rq.getPending(), int), True)

    def testActiveRequestsByHost(self):
        self.failUnlessEqual(isinstance(self.rq.getActiveRequestsByHost(), dict), True)

    def testPendingRequestsByHost(self):
        self.failUnlessEqual(isinstance(self.rq.getPendingRequestsByHost(), dict), True)

    def _getPageErrback(self, error):
        return True
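These cases rely on twisted.trial: a test method may return a Deferred, and the test passes only if that Deferred fires without error. A sketch of an extra assertion-style test under the same setUp; the "response" key is taken from how getPage results are used elsewhere in these examples:

    def testHelloWorldBody(self):
        # trial waits on the returned Deferred; the callback's assertion
        # failure would errback it and fail the test.
        d = self.rq.getPage("http://127.0.0.1:8080/helloworld", timeout=5)
        d.addCallback(lambda data: self.failUnless("response" in data))
        return d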
Example #6
class PageGetter(object):

    def __init__(self,
        cassandra_client,
        redis_client,
        disable_negative_cache=False,
        time_offset=0,
        rq=None):
        """
        Create an Cassandra based HTTP cache.

        **Arguments:**
            * *cassandra_client* -- Cassandra client object.
        **Keyword arguments:**
         * *rq* -- Request Queuer object. (Default ``None``)

        """
        self.cassandra_client = cassandra_client
        self.redis_client = redis_client
        self.disable_negative_cache = disable_negative_cache
        self.time_offset = time_offset
        if rq is None:
            self.rq = RequestQueuer()
        else:
            self.rq = rq
    
    @inlineCallbacks
    def getPage(self,
            url,
            method='GET',
            postdata=None,
            headers=None,
            agent="HiiSpider",
            timeout=5,
            cookies=None,
            follow_redirect=1,
            prioritize=False,
            hash_url=None,
            cache=0,
            content_sha1=None,
            confirm_cache_write=False,
            check_only_tld=False,
            disable_negative_cache=False):
        """
        Make a cached HTTP Request.

        **Arguments:**
         * *url* -- URL for the request.

        **Keyword arguments:**
         * *method* -- HTTP request method. (Default ``'GET'``)
         * *postdata* -- Dictionary of strings to post with the request.
           (Default ``None``)
         * *headers* -- Dictionary of strings to send as request headers.
           (Default ``None``)
         * *agent* -- User agent to send with request. (Default
           ``'HiiSpider'``)
         * *timeout* -- Request timeout, in seconds. (Default ``5``)
         * *cookies* -- Dictionary of strings to send as request cookies.
           (Default ``None``)
         * *follow_redirect* -- Boolean switch to follow HTTP redirects.
           (Default ``1``)
         * *prioritize* -- Move this request to the front of the request
           queue. (Default ``False``)
         * *hash_url* -- URL string used to indicate a common resource.
           Example: "http://digg.com" and "http://www.digg.com" could both
           use hash_url, "http://digg.com" (Default ``None``)
         * *cache* -- Cache mode. ``1``, immediately return contents of
           cache if available. ``0``, check resource, return cache if not
           stale. ``-1``, ignore cache. (Default ``0``)
         * *content_sha1* -- SHA-1 hash of content. If this matches the
           hash of data returned by the resource, raises a
           StaleContentException.
         * *confirm_cache_write* -- Wait to confirm cache write before
           returning. (Default ``False``)
         * *check_only_tld* -- For the negative cache, check only the
           top-level domain name. (Default ``False``)
         * *disable_negative_cache* -- Disable the negative cache for this
           request. (Default ``False``)
        """
        start = time.time()
        request_kwargs = {
            "method":method.upper(),
            "postdata":postdata,
            "headers":headers,
            "agent":agent,
            "timeout":timeout,
            "cookies":cookies,
            "follow_redirect":follow_redirect,
            "prioritize":prioritize}
        cache = int(cache)
        if cache not in [-1,0,1]:
            raise Exception("Unknown caching mode.")
        if not isinstance(url, str):
            url = convertToUTF8(url)
        if hash_url is not None and not isinstance(hash_url, str):
            hash_url = convertToUTF8(hash_url)
        # check negative cache
        host = _parse(url)[1]
        # If check_only_tld is true, parse the URL down to the top-level
        # domain.
        if check_only_tld:
            host_split = host.split('.', host.count('.')-1)
            host = host_split[len(host_split)-1]
        # Create request_hash to serve as a cache key from
        # either the URL or user-provided hash_url.
        hash_items = [hash_url or url, agent]
        if postdata:
            hash_items.append(repr(postdata))
        if headers and 'Authorization' in headers:
            items = headers['Authorization'].split(',')
            oauth_headers = [item for item in items
                if item.find('oauth_consumer_key') > -1 or
                item.find('oauth_token') > -1 or
                item.find('oauth_token_secret') > -1]
            if oauth_headers:
                hash_items.append(repr(oauth_headers))
        if cookies:
            hash_items.append(repr(cookies))
        request_hash = sha1(json.dumps(hash_items)).hexdigest()
#        if not disable_negative_cache and not self.disable_negative_cache:
#            yield self.checkNegativeCache(
#                    'negative_cache:%s' % host,
#                    'negative_req_cache:%s' % request_hash)
#        if request_kwargs["method"] != "GET":
#            data = yield self.rq.getPage(url, **request_kwargs)
#        else:
#            data = yield self._getPage(
#                    url, 
#                    request_hash, 
#                    request_kwargs, 
#                    cache, 
#                    content_sha1, 
#                    confirm_cache_write, 
#                    host)
#        logger.info("Got %s after %s" % (host, time.time() - start))
#        # Check for stale contents
        data = yield self.rq.getPage(url, **request_kwargs)
        if "content-sha1" not in data:
            data["content-sha1"] = sha1(data["response"]).hexdigest()
        if content_sha1 == data["content-sha1"]:
            logger.debug("Raising StaleContentException (4) on %s" % request_hash)
            raise StaleContentException(content_sha1)
        returnValue(data)

    @inlineCallbacks
    def checkNegativeCache(self, negative_cache_host_key, negative_req_cache_key):
        raw_negative_cache_host = yield self.redis_client.get(negative_cache_host_key)
        if raw_negative_cache_host:
            try:
                negative_cache_host = pickle.loads(str(decompress(raw_negative_cache_host)))
                if negative_cache_host['timeout'] > time.time():
                    # we get quite a lot of these, ~500/sec on occasions
                    stats.stats.increment('pg.negcache.hit', 0.1)
                    raise NegativeHostCacheException(str(negative_cache_host['error']))
            except NegativeHostCacheException:
                raise
            except Exception, e:
                logger.error('Removing host %s from the negative cache: %s'
                    % (negative_cache_host_key, e))
                stats.stats.increment('pg.negcache.flush')
                self.redis_client.delete(negative_cache_host_key)
        raw_negative_req_cache_item = yield self.redis_client.get(negative_req_cache_key)
        if raw_negative_req_cache_item:
            try:
                negative_req_cache_item = pickle.loads(str(decompress(raw_negative_req_cache_item)))
                if negative_req_cache_item['timeout'] > time.time():
                    stats.stats.increment('pg.negreqcache.hit')
                    raise NegativeReqCacheException(str(negative_req_cache_item['error']))
            except (NegativeHostCacheException, NegativeReqCacheException):
                raise
            except Exception, e:
                logger.error('Removing item %s from the negative cache: %s' % (negative_req_cache_key, e))
                stats.stats.increment('pg.negreqcache.flush', 0.5)
                self.redis_client.delete(negative_req_cache_key)
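The cache argument to getPage above selects between three modes, and the request hash built from the URL (or hash_url), agent, post data, OAuth headers, and cookies is what keys the cache. A hedged usage sketch from inside another inlineCallbacks function; "pg" stands for a PageGetter built as in the constructor example:

from twisted.internet.defer import inlineCallbacks

@inlineCallbacks
def fetch(pg):
    # cache=1: return cached contents immediately if available;
    # cache=0 (default): check the resource, serve cache if not stale;
    # cache=-1: ignore the cache entirely.
    data = yield pg.getPage("http://www.digg.com", cache=1,
        hash_url="http://digg.com")
    # getPage resolves to a dict carrying the body and its SHA-1 hash.
    print data["content-sha1"]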
Example #7
class PageGetter(object):

    def __init__(self,
        cassandra_client,
        redis_client,
        disable_negative_cache=False,
        time_offset=0,
        rq=None):
        """
        Create an Cassandra based HTTP cache.

        **Arguments:**
            * *cassandra_client* -- Cassandra client object.
        **Keyword arguments:**
         * *rq* -- Request Queuer object. (Default ``None``)

        """
        self.cassandra_client = cassandra_client
        self.redis_client = redis_client
        self.disable_negative_cache = disable_negative_cache
        self.time_offset = time_offset
        if rq is None:
            self.rq = RequestQueuer()
        else:
            self.rq = rq

    @inlineCallbacks
    def getPage(self,
            url,
            method='GET',
            postdata=None,
            headers=None,
            agent="HiiSpider",
            timeout=5,
            cookies=None,
            follow_redirect=1,
            prioritize=False,
            hash_url=None,
            cache=0,
            content_sha1=None,
            confirm_cache_write=False,
            check_only_tld=False,
            disable_negative_cache=False):
        """
        Make a cached HTTP Request.

        **Arguments:**
         * *url* -- URL for the request.

        **Keyword arguments:**
         * *method* -- HTTP request method. (Default ``'GET'``)
         * *postdata* -- Dictionary of strings to post with the request.
           (Default ``None``)
         * *headers* -- Dictionary of strings to send as request headers.
           (Default ``None``)
         * *agent* -- User agent to send with request. (Default
           ``'HiiSpider'``)
         * *timeout* -- Request timeout, in seconds. (Default ``5``)
         * *cookies* -- Dictionary of strings to send as request cookies.
           (Default ``None``)
         * *follow_redirect* -- Boolean switch to follow HTTP redirects.
           (Default ``1``)
         * *prioritize* -- Move this request to the front of the request
           queue. (Default ``False``)
         * *hash_url* -- URL string used to indicate a common resource.
           Example: "http://digg.com" and "http://www.digg.com" could both
           use hash_url, "http://digg.com" (Default ``None``)
         * *cache* -- Cache mode. ``1``, immediately return contents of
           cache if available. ``0``, check resource, return cache if not
           stale. ``-1``, ignore cache. (Default ``0``)
         * *content_sha1* -- SHA-1 hash of content. If this matches the
           hash of data returned by the resource, raises a
           StaleContentException.
         * *confirm_cache_write* -- Wait to confirm cache write before
           returning. (Default ``False``)
         * *check_only_tld* -- For the negative cache, check only the
           top-level domain name. (Default ``False``)
         * *disable_negative_cache* -- Disable the negative cache for this
           request. (Default ``False``)
        """
        stats.stats.increment("pg.getpage", 0.05)
        start = time.time()
        request_kwargs = {
            "method":method.upper(),
            "postdata":postdata,
            "headers":headers,
            "agent":agent,
            "timeout":timeout,
            "cookies":cookies,
            "follow_redirect":follow_redirect,
            "prioritize":prioritize}
        cache = int(cache)
        if cache not in [-1,0,1]:
            raise Exception("Unknown caching mode.")
        if not isinstance(url, str):
            url = convertToUTF8(url)
        if hash_url is not None and not isinstance(hash_url, str):
            hash_url = convertToUTF8(hash_url)
        # check negative cache
        host = _parse(url)[1]
        # If check_only_tld is true, parse the URL down to the top-level
        # domain.
        if check_only_tld:
            host_split = host.split('.', host.count('.')-1)
            host = host_split[len(host_split)-1]
        # Create request_hash to serve as a cache key from
        # either the URL or user-provided hash_url.
        hash_items = [hash_url or url, agent]
        if postdata:
            hash_items.append(repr(postdata))
        if headers and 'Authorization' in headers:
            items = headers['Authorization'].split(',')
            oauth_headers = [item for item in items
                if item.find('oauth_consumer_key') > -1 or
                item.find('oauth_token') > -1 or
                item.find('oauth_token_secret') > -1]
            if oauth_headers:
                hash_items.append(repr(oauth_headers))
        if cookies:
            hash_items.append(repr(cookies))
        request_hash = sha1(json.dumps(hash_items)).hexdigest()
        data = yield self.rq.getPage(url, **request_kwargs)
        if "content-sha1" not in data:
            data["content-sha1"] = sha1(data["response"]).hexdigest()
        if content_sha1 == data["content-sha1"]:
            stats.stats.increment('pg.stalecontent')
            raise StaleContentException(content_sha1)
        returnValue(data)
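Passing a previously seen content_sha1 turns an unchanged resource into a StaleContentException, which callers can treat as a not-modified signal. A sketch; the import path for StaleContentException is an assumption:

from twisted.internet.defer import inlineCallbacks, returnValue
from pagegetter import StaleContentException  # assumed location

@inlineCallbacks
def fetch_if_changed(pg, url, last_sha1):
    try:
        data = yield pg.getPage(url, content_sha1=last_sha1)
    except StaleContentException:
        returnValue(None)  # content unchanged since the last fetch
    returnValue(data)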
Example #8
class BaseServer(object):

    exposed_functions = []
    exposed_function_resources = {}
    logging_handler = None
    shutdown_trigger_id = None
    uuid = uuid4().hex
    start_time = time.time()
    active_jobs = {}
    reserved_arguments = [
        "reservation_function_name",
        "reservation_created",
        "reservation_next_request",
        "reservation_error"]
    functions = {}
    delta_functions = {}
    categories = {}
    fast_cache = {}
    function_resource = None

    def __init__(self, config, pg=None):
        # Resource Mappings
        self.service_mapping = config["service_mapping"]
        self.service_args_mapping = config["service_args_mapping"]
        self.inverted_args_mapping = dict([(s[0], invert(s[1]))
            for s in self.service_args_mapping.items()])
        # configure stats
        self.stats = config.get('stats', stats.stats)
        stats.stats = self.stats
        # Request Queuer
        self.rq = RequestQueuer(
            max_simultaneous_requests=config["max_simultaneous_requests"],
            max_requests_per_host_per_second=config[
                "max_requests_per_host_per_second"],
            max_simultaneous_requests_per_host=config[
                "max_simultaneous_requests_per_host"])
        self.rq.setHostMaxRequestsPerSecond("127.0.0.1", 0)
        self.rq.setHostMaxSimultaneousRequests("127.0.0.1", 0)
        if pg is None:
            self.pg = PageGetter(rq=self.rq)
        else:
            self.pg = pg


    def start(self):
        start_deferred = Deferred()
        reactor.callWhenRunning(self._baseStart, start_deferred)
        return start_deferred

    def _baseStart(self, start_deferred):
        logger.debug("Starting Base components.")
        self.shutdown_trigger_id = reactor.addSystemEventTrigger(
            'before',
            'shutdown',
            self.shutdown)
        start_deferred.callback(True)

    @inlineCallbacks
    def shutdown(self):
        while self.rq.getPending() > 0 or self.rq.getActive() > 0:
            logger.debug("%s requests active, %s requests pending." % (
                self.rq.getActive(),
                self.rq.getPending()
            ))
            shutdown_deferred = Deferred()
            # Call the Deferred after a second to continue the loop.
            reactor.callLater(1, shutdown_deferred.callback, None)
            yield shutdown_deferred
        self.shutdown_trigger_id = None
        logger.critical("Server shut down.")
        logger.removeHandler(self.logging_handler)
        returnValue(True)

    def getManholeFactory(self, namespace, **passwords):
        realm = manhole_ssh.TerminalRealm()

        def getManhole(_):
            return manhole.Manhole(namespace)

        realm.chainedProtocolFactory.protocolFactory = getManhole
        p = portal.Portal(realm)
        p.registerChecker(
            checkers.InMemoryUsernamePasswordDatabaseDontUse(**passwords))
        f = manhole_ssh.ConchFactory(p)
        return f

    def delta(self, func, handler):
        self.delta_functions[id(func)] = handler

    def expose(self, *args, **kwargs):
        return self.makeCallable(expose=True, *args, **kwargs)

    @inlineCallbacks
    def executeJob(self, job):
        dotted_function = '.'.join(job.function_name.split('/'))
        timer = 'job.%s.duration' % (dotted_function)
        self.stats.timer.start(timer, 0.5)
        self.stats.timer.start('job.time', 0.1)
        if not job.mapped:
            job = self.mapJob(job)
        f = self.functions[job.function_name]
        if job.uuid is not None:
            self.active_jobs[job.uuid] = True
        if f["get_job_uuid"]:
            job.kwargs["job_uuid"] = job.uuid
        if f["check_fast_cache"]:
            job.kwargs["fast_cache"] = job.fast_cache
        try:
            data = yield self.executeFunction(job.function_name, **job.kwargs)
        except NegativeCacheException:
            self.stats.timer.stop(timer)
            self.stats.timer.stop('job.time')
            raise
        except QueueTimeoutException:
            self.stats.timer.stop(timer)
            self.stats.timer.stop('job.time')
            raise
        except Exception, e:
            self.stats.increment('job.%s.failure' % dotted_function)
            self.stats.timer.stop(timer)
            self.stats.timer.stop('job.time')
            raise
        finally:
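start and shutdown frame the server's lifecycle: start defers setup until the reactor is running, and shutdown loops once per second until the RequestQueuer has drained before letting the process exit. A minimal driving sketch, reusing the config dict from the earlier example:

from twisted.internet import reactor

server = BaseServer(config)
server.start()  # returned Deferred fires once _baseStart has run
# shutdown() is registered as a 'before shutdown' system event trigger,
# so stopping the reactor drains pending requests first.
reactor.run()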
Example #9
class PageGetter:

    negative_cache = {}

    def __init__(self, rq=None):
        """
        Create an Cassandra based HTTP cache.

        **Arguments:**
         * *cassandra_client* -- Cassandra client object.

        **Keyword arguments:**
         * *rq* -- Request Queuer object. (Default ``None``)

        """
        if rq is None:
            self.rq = RequestQueuer()
        else:
            self.rq = rq


    def getPage(self,
            url,
            method='GET',
            postdata=None,
            headers=None,
            agent="HiiSpider",
            timeout=60,
            cookies=None,
            follow_redirect=1,
            prioritize=False,
            hash_url=None,
            cache=0,
            content_sha1=None,
            confirm_cache_write=False,
            check_only_tld=False,
            disable_negative_cache=False,
            ):
        """
        Make a cached HTTP Request.

        **Arguments:**
         * *url* -- URL for the request.

        **Keyword arguments:**
         * *method* -- HTTP request method. (Default ``'GET'``)
         * *postdata* -- Dictionary of strings to post with the request.
           (Default ``None``)
         * *headers* -- Dictionary of strings to send as request headers.
           (Default ``None``)
         * *agent* -- User agent to send with request. (Default
           ``'HiiSpider'``)
         * *timeout* -- Request timeout, in seconds. (Default ``60``)
         * *cookies* -- Dictionary of strings to send as request cookies.
           (Default ``None``).
         * *follow_redirect* -- Boolean switch to follow HTTP redirects.
           (Default ``1``)
         * *prioritize* -- Move this request to the front of the request
           queue. (Default ``False``)
         * *hash_url* -- URL string used to indicate a common resource.
           Example: "http://digg.com" and "http://www.digg.com" could both
           use hash_url, "http://digg.com" (Default ``None``)
         * *cache* -- Cache mode. ``1``, immediately return contents of
           cache if available. ``0``, check resource, return cache if not
           stale. ``-1``, ignore cache. (Default ``0``)
         * *content_sha1* -- SHA-1 hash of content. If this matches the
           hash of data returned by the resource, raises a
           StaleContentException.
         * *confirm_cache_write* -- Wait to confirm cache write before
           returning. (Default ``False``)
         * *check_only_tld* -- For the negative cache, check only the
           top-level domain name. (Default ``False``)
         * *disable_negative_cache* -- Disable the negative cache for this
           request. (Default ``False``)
        """
        request_kwargs = {
            "method":method.upper(),
            "postdata":postdata,
            "headers":headers,
            "agent":agent,
            "timeout":timeout,
            "cookies":cookies,
            "follow_redirect":follow_redirect,
            "prioritize":prioritize}
        cache = int(cache)
        cache = 0  # note: caching is unconditionally disabled here
        if cache not in [-1, 0, 1]:
            raise Exception("Unknown caching mode.")
        if not isinstance(url, str):
            url = convertToUTF8(url)
        if hash_url is not None and not isinstance(hash_url, str):
            hash_url = convertToUTF8(hash_url)
        # Check negative cache.
        host = _parse(url)[1]
        # If check_only_tld is true, parse the URL down to the top-level
        # domain.
        if check_only_tld:
            host_split = host.split('.', host.count('.')-1)
            host = host_split[len(host_split)-1]
        if host in self.negative_cache:
            if not self.negative_cache[host]['timeout'] < time.time():
                logger.error('Found %s in negative cache, raising last '
                    'known exception' % host)
                return self.negative_cache[host]['error'].raiseException()
        # Create request_hash to serve as a cache key from
        # either the URL or user-provided hash_url.
        if hash_url is None:
            request_hash = hashlib.sha1(json.dumps([
                url,
                agent])).hexdigest()
        else:
            request_hash = hashlib.sha1(json.dumps([
                hash_url,
                agent])).hexdigest()

        d = self.rq.getPage(url, **request_kwargs)
        d.addCallback(self._checkForStaleContent, content_sha1, request_hash, host)
        d.addErrback(self._getPageErrback, host)
        return d

    def _checkForStaleContent(self, data, content_sha1, request_hash, host):
        if host in self.negative_cache:
            logger.error('Removing %s from negative cache' % host)
            del self.negative_cache[host]
        if "content-sha1" not in data:
            data["content-sha1"] = hashlib.sha1(data["response"]).hexdigest()
        if content_sha1 == data["content-sha1"]:
            logger.debug("Raising StaleContentException (4) on %s" % request_hash)
            raise StaleContentException(content_sha1)
        else:
            return data

    def _getPageErrback(self, error, host):
        try:
            status = int(error.value.status)
        except Exception:
            status = 500
        if status >= 500:
            if host not in self.negative_cache:
                logger.error('Adding %s to negative cache' % host)
                self.negative_cache[host] = {
                    'timeout': time.time() + 300,
                    'retries': 1,
                    'error': error
                }
            else:
                if self.negative_cache[host]['retries'] <= 5:
                    self.negative_cache[host]['timeout'] = time.time() + 600
                else:
                    self.negative_cache[host]['timeout'] = time.time() + 3600
                self.negative_cache[host]['retries'] += 1
                self.negative_cache[host]['error'] = error
                logger.error('Updating negative cache for host %s which has '
                    'failed %d times'
                    % (host, self.negative_cache[host]['retries']))
        error.raiseException()
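The class-level negative_cache dict acts as a simple per-host circuit breaker: a 5xx failure parks the host for 300 seconds, repeated failures extend that to 600 and then 3600 seconds, and the stored Failure is re-raised for every request to that host until the timeout lapses or a request succeeds and clears the entry. A sketch of what a caller sees; the URL is a hypothetical placeholder:

from twisted.internet.defer import maybeDeferred

# Negative-cache replays are raised synchronously inside getPage, so
# maybeDeferred routes both them and ordinary request failures to the
# errback.
d = maybeDeferred(pg.getPage, "http://flaky.example.com/feed")

def on_error(failure):
    print "request failed:", failure.getErrorMessage()

d.addErrback(on_error)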