Exemple #1
0
    def getPage(self,
                url,
                last_modified=None,
                etag=None,
                method='GET',
                postdata=None,
                headers=None,
                agent="RequestQueuer",
                timeout=60,
                cookies=None,
                follow_redirect=True,
                prioritize=False,
                queue_timeout=5
                ):
        """
        Queue an HTTP request and return a Deferred for its result.

        The request is described by a dictionary that is appended to (or,
        when prioritized, inserted at the front of) the per-host list in
        ``self.pending_reqs``; ``self._checkActive()`` is then called.

        **Arguments:**
         * *url* -- URL for the request.

        **Keyword arguments:**
         * *last_modified* -- Last modified date string to send as a request
           header. A list or tuple is accepted; only its first element is
           used. (Default ``None``)
         * *etag* -- Etag string to send as a request header. (Default
           ``None``)
         * *method* -- HTTP request method. (Default ``'GET'``)
         * *postdata* -- Dictionary of strings to post with the request, or
           an already encoded body. (Default ``None``)
         * *headers* -- Dictionary of strings to send as request headers.
           The dictionary passed in is copied, not modified.
           (Default ``None``)
         * *agent* -- User agent to send with request. (Default
           ``'RequestQueuer'``)
         * *timeout* -- Request timeout, in seconds. (Default ``60``)
         * *cookies* -- Dictionary of strings to send as request cookies.
           (Default ``None``).
         * *follow_redirect* -- Boolean switch to follow HTTP redirects.
           (Default ``True``)
         * *prioritize* -- Move this request to the front of the request
           queue. (Default ``False``)
         * *queue_timeout* -- Number of seconds to hold a request in the local
           queue before failure. (Default ``5``)

        **Returns:** the Deferred stored with the queued request. When the
        queue estimate for the host exceeds *queue_timeout* the Deferred is
        errbacked with ``QueueTimeoutException`` and the request is not
        queued.
        """
        # Work on a private copy so that the headers added below never leak
        # into a dictionary the caller may reuse for later requests.
        headers = {} if headers is None else dict(headers)
        if postdata is not None:
            if isinstance(postdata, dict):
                # Encode from a list of pairs so the caller's dictionary is
                # left untouched.
                postdata = urllib.urlencode(
                    [(key, convertToUTF8(postdata[key])) for key in postdata])
            else:
                # The converted value has to be kept; calling the helper
                # without assigning its result has no effect on postdata.
                postdata = convertToUTF8(postdata)
        if method.lower() == "post":
            headers["content-type"] = "application/x-www-form-urlencoded"
        if last_modified is not None:
            if isinstance(last_modified, (list, tuple)):
                last_modified = last_modified[0]
            time_tuple = dateutil.parser.parse(last_modified).timetuple()
            # %H:%M:%S is the portable spelling of the platform-specific %T.
            # NOTE(review): %z renders a numeric UTC offset, whereas HTTP
            # dates normally end in "GMT" -- confirm servers accept this.
            time_string = time.strftime("%a, %d %b %Y %H:%M:%S %z", time_tuple)
            headers['If-Modified-Since'] = time_string
        if etag is not None:
            headers["If-None-Match"] = etag
        req = {
            "url":convertToUTF8(url),
            "method":method,
            "postdata":postdata,
            "headers":headers,
            "agent":agent,
            "timeout":timeout,
            "cookies":cookies,
            "follow_redirect":follow_redirect,
            "deferred":Deferred(),
            "start":time.time()}
        host = _parse(req["url"])[1]
        if host in self.max_reqs_per_hosts_per_sec and host in self.pending_reqs:
            # NOTE(review): the estimate multiplies the queue length by the
            # per-host setting; verify the unit of that setting matches the
            # "seconds" meaning of queue_timeout.
            queue_estimate = (len(self.pending_reqs[host]) *
                              self.max_reqs_per_hosts_per_sec[host])
            if queue_estimate > queue_timeout:
                req["deferred"].errback(QueueTimeoutException())
                return req["deferred"]
        req["host"] = host
        if host not in self.pending_reqs:
            self.pending_reqs[host] = []
        if prioritize:
            self.pending_reqs[host].insert(0, req)
        else:
            self.pending_reqs[host].append(req)
        self._checkActive()
        return req["deferred"]
    def getPage(self,
            url,
            method='GET',
            postdata=None,
            headers=None,
            agent="HiiSpider",
            timeout=5,
            cookies=None,
            follow_redirect=1,
            prioritize=False,
            hash_url=None,
            cache=0,
            content_sha1=None,
            confirm_cache_write=False,
            check_only_tld=False,
            disable_negative_cache=False):
        """
        Make an HTTP Request through ``self.rq`` and check for stale content.

        This is a generator that yields the Deferred from
        ``self.rq.getPage`` and hands back its result with ``returnValue``
        (presumably wrapped by ``inlineCallbacks`` outside this view --
        TODO confirm).

        NOTE: the cache and negative-cache lookups are bypassed in this
        version; every request goes straight to ``self.rq.getPage``. The
        *cache* value is still validated, but *cache*,
        *confirm_cache_write*, *check_only_tld* and
        *disable_negative_cache* do not otherwise influence the request.

        **Arguments:**
         * *url* -- URL for the request.

        **Keyword arguments:**
         * *method* -- HTTP request method, upper-cased before use.
           (Default ``'GET'``)
         * *postdata* -- Dictionary of strings to post with the request.
           (Default ``None``)
         * *headers* -- Dictionary of strings to send as request headers.
           (Default ``None``)
         * *agent* -- User agent to send with request. (Default
           ``'HiiSpider'``)
         * *timeout* -- Request timeout, in seconds. (Default ``5``)
         * *cookies* -- Dictionary of strings to send as request cookies.
           (Default ``None``).
         * *follow_redirect* -- Switch to follow HTTP redirects.
           (Default ``1``)
         * *prioritize* -- Move this request to the front of the request
           queue. (Default ``False``)
         * *hash_url* -- URL string used to indicate a common resource.
           Example: "http://digg.com" and "http://www.digg.com" could both
           use hash_url, "http://digg.com" (Default ``None``)
         * *cache* -- Cache mode. ``1``, immediately return contents of
           cache if available. ``0``, check resource, return cache if not
           stale. ``-1``, ignore cache. (Default ``0``)
         * *content_sha1* -- SHA-1 hash of content. If this matches the
           hash of data returned by the resource, raises a
           StaleContentException. (Default ``None``)
         * *confirm_cache_write* -- Wait to confirm cache write before
           returning. (Default ``False``)
         * *check_only_tld* -- for negative cache, check only the top level
           domain name. (Default ``False``)
         * *disable_negative_cache* -- disable negative cache for this
           request. (Default ``False``)

        **Raises:**
         * ``Exception`` -- *cache* is not one of ``-1``, ``0``, ``1``.
         * ``StaleContentException`` -- the content hash of the response
           equals *content_sha1*.
        """
        request_kwargs = {
            "method":method.upper(),
            "postdata":postdata,
            "headers":headers,
            "agent":agent,
            "timeout":timeout,
            "cookies":cookies,
            "follow_redirect":follow_redirect,
            "prioritize":prioritize}
        cache = int(cache)
        if cache not in [-1,0,1]:
            raise Exception("Unknown caching mode.")
        if not isinstance(url, str):
            url = convertToUTF8(url)
        if hash_url is not None and not isinstance(hash_url, str):
            hash_url = convertToUTF8(hash_url)
        # The host is the key the negative cache would be checked with. The
        # lookup is disabled, so the value is currently unused; the URL is
        # still parsed here exactly as before.
        host = _parse(url)[1]
        # if check_only_tld is true then parse the url down to the top level domain
        if check_only_tld:
            # Limiting the number of splits leaves the last two labels
            # joined in the final element.
            host_split = host.split('.', host.count('.')-1)
            host = host_split[-1]
        # Create request_hash to serve as a cache key from
        # either the URL or user-provided hash_url.
        hash_items = [hash_url or url, agent]
        if postdata:
            hash_items.append(repr(postdata))
        if headers and 'Authorization' in headers:
            # Only the OAuth identity fields take part in the key.
            items = headers['Authorization'].split(',')
            oauth_headers = [item for item in items
                if item.find('oauth_consumer_key') > -1 or
                item.find('oauth_token') > -1 or
                item.find('oauth_token_secret') > -1]
            if oauth_headers:
                hash_items.append(repr(oauth_headers))
        if cookies:
            hash_items.append(repr(cookies))
        request_hash = sha1(json.dumps(hash_items)).hexdigest()
        # NOTE: the negative-cache check (self.checkNegativeCache) and the
        # cached fetch (self._getPage) are bypassed; the request is always
        # sent directly.
        data = yield self.rq.getPage(url, **request_kwargs)
        # Check for stale contents
        if "content-sha1" not in data:
            data["content-sha1"] = sha1(data["response"]).hexdigest()
        if content_sha1 == data["content-sha1"]:
            logger.debug("Raising StaleContentException (4) on %s", request_hash)
            raise StaleContentException(content_sha1)
        returnValue(data)
Exemple #3
0
    def getPage(self,
            url,
            method='GET',
            postdata=None,
            headers=None,
            agent="HiiSpider",
            timeout=60,
            cookies=None,
            follow_redirect=1,
            prioritize=False,
            hash_url=None,
            cache=0,
            content_sha1=None,
            confirm_cache_write=False,
            check_only_tld=False,
            disable_negative_cache=False,
            ):
        """
        Make an HTTP Request through ``self.rq``, guarded by the negative
        cache.

        Unless *disable_negative_cache* is set, a host found in
        ``self.negitive_cache`` with an unexpired entry causes the stored
        error to be re-raised instead of sending the request. Otherwise the
        request is passed to ``self.rq.getPage`` and the resulting Deferred
        gets ``self._checkForStaleContent`` as callback and
        ``self._getPageErrback`` as errback.

        In this method *cache* is only validated and *confirm_cache_write*
        is accepted but not used.

        **Arguments:**
         * *url* -- URL for the request.

        **Keyword arguments:**
         * *method* -- HTTP request method, upper-cased before use.
           (Default ``'GET'``)
         * *postdata* -- Dictionary of strings to post with the request.
           (Default ``None``)
         * *headers* -- Dictionary of strings to send as request headers.
           (Default ``None``)
         * *agent* -- User agent to send with request. (Default
           ``'HiiSpider'``)
         * *timeout* -- Request timeout, in seconds. (Default ``60``)
         * *cookies* -- Dictionary of strings to send as request cookies.
           (Default ``None``).
         * *follow_redirect* -- Switch to follow HTTP redirects.
           (Default ``1``)
         * *prioritize* -- Move this request to the front of the request
           queue. (Default ``False``)
         * *hash_url* -- URL string used to indicate a common resource.
           Example: "http://digg.com" and "http://www.digg.com" could both
           use hash_url, "http://digg.com" (Default ``None``)
         * *cache* -- Cache mode. ``1``, immediately return contents of
           cache if available. ``0``, check resource, return cache if not
           stale. ``-1``, ignore cache. (Default ``0``)
         * *content_sha1* -- SHA-1 hash of content, forwarded to
           ``self._checkForStaleContent``. (Default ``None``)
         * *confirm_cache_write* -- Wait to confirm cache write before
           returning. (Default ``False``)
         * *check_only_tld* -- For the negative cache, reduce the host to
           its last two dot-separated labels before the lookup.
           (Default ``False``)
         * *disable_negative_cache* -- Skip the negative cache lookup for
           this request. (Default ``False``)

        **Returns:** the Deferred from ``self.rq.getPage`` with the
        callback and errback described above attached.

        **Raises:**
         * ``Exception`` -- *cache* is not one of ``-1``, ``0``, ``1``.
         * The stored error of an unexpired negative cache entry, raised
           synchronously via ``raiseException()``.
        """
        request_kwargs = {
            "method":method.upper(),
            "postdata":postdata,
            "headers":headers,
            "agent":agent,
            "timeout":timeout,
            "cookies":cookies,
            "follow_redirect":follow_redirect,
            "prioritize":prioritize}
        # Validate the caller's value; it must not be overwritten before the
        # check, or the check can never fail.
        cache = int(cache)
        if cache not in [-1,0,1]:
            raise Exception("Unknown caching mode.")
        if not isinstance(url, str):
            url = convertToUTF8(url)
        if hash_url is not None and not isinstance(hash_url, str):
            hash_url = convertToUTF8(hash_url)
        # check negative cache
        host = _parse(url)[1]
        # if check_only_tld is true then parse the url down to the top level domain
        if check_only_tld:
            # Limiting the number of splits leaves the last two labels
            # joined in the final element.
            host_split = host.split('.', host.count('.')-1)
            host = host_split[-1]
        if not disable_negative_cache and host in self.negitive_cache:
            negative_entry = self.negitive_cache[host]
            # Entries whose timeout lies in the past are ignored.
            if negative_entry['timeout'] >= time.time():
                logger.error('Found %s in negitive cache, raising last known exception' % host)
                # NOTE(review): this raises synchronously although the
                # normal path returns a Deferred -- confirm callers expect it.
                return negative_entry['error'].raiseException()
        # Create request_hash to serve as a cache key from
        # either the URL or user-provided hash_url.
        hash_source = url if hash_url is None else hash_url
        request_hash = hashlib.sha1(json.dumps([
            hash_source,
            agent])).hexdigest()

        d = self.rq.getPage(url, **request_kwargs)
        d.addCallback(self._checkForStaleContent, content_sha1, request_hash, host)
        d.addErrback(self._getPageErrback, host)
        return d