def getPage(self,
        url,
        last_modified=None,
        etag=None,
        method='GET',
        postdata=None,
        headers=None,
        agent="RequestQueuer",
        timeout=60,
        cookies=None,
        follow_redirect=True,
        prioritize=False,
        queue_timeout=5):
    """
    Make an HTTP Request.

    **Arguments:**
     * *url* -- URL for the request.

    **Keyword arguments:**
     * *last_modified* -- Last modified date string to send as a request
       header. (Default ``None``)
     * *etag* -- Etag string to send as a request header. (Default ``None``)
     * *method* -- HTTP request method. (Default ``'GET'``)
     * *postdata* -- Dictionary of strings to post with the request.
       (Default ``None``)
     * *headers* -- Dictionary of strings to send as request headers.
       (Default ``None``)
     * *agent* -- User agent to send with request.
       (Default ``'RequestQueuer'``)
     * *timeout* -- Request timeout, in seconds. (Default ``60``)
     * *cookies* -- Dictionary of strings to send as request cookies.
       (Default ``None``)
     * *follow_redirect* -- Boolean switch to follow HTTP redirects.
       (Default ``True``)
     * *prioritize* -- Move this request to the front of the request queue.
       (Default ``False``)
     * *queue_timeout* -- Number of seconds to hold a request in the local
       queue before failure. (Default ``5``)
    """
    if headers is None:
        headers = {}
    if postdata is not None:
        if isinstance(postdata, dict):
            for key in postdata:
                postdata[key] = convertToUTF8(postdata[key])
            postdata = urllib.urlencode(postdata)
        else:
            postdata = convertToUTF8(postdata)
    if method.lower() == "post":
        headers["content-type"] = "application/x-www-form-urlencoded"
    if last_modified is not None:
        if isinstance(last_modified, (list, tuple)):
            last_modified = last_modified[0]
        time_tuple = dateutil.parser.parse(last_modified).timetuple()
        time_string = time.strftime("%a, %d %b %Y %H:%M:%S %z", time_tuple)
        headers['If-Modified-Since'] = time_string
    if etag is not None:
        headers["If-None-Match"] = etag
    req = {
        "url": convertToUTF8(url),
        "method": method,
        "postdata": postdata,
        "headers": headers,
        "agent": agent,
        "timeout": timeout,
        "cookies": cookies,
        "follow_redirect": follow_redirect,
        "deferred": Deferred(),
        "start": time.time()}
    host = _parse(req["url"])[1]
    # Fail fast if the per-host queue is unlikely to be serviced within
    # queue_timeout seconds.
    if host in self.max_reqs_per_hosts_per_sec and host in self.pending_reqs:
        if len(self.pending_reqs[host]) * self.max_reqs_per_hosts_per_sec[host] > queue_timeout:
            req["deferred"].errback(QueueTimeoutException())
            return req["deferred"]
    req["host"] = host
    if host not in self.pending_reqs:
        self.pending_reqs[host] = []
    if prioritize:
        self.pending_reqs[host].insert(0, req)
    else:
        self.pending_reqs[host].append(req)
    self._checkActive()
    return req["deferred"]
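# Usage sketch (illustrative only, not part of the original module): queue a
# prioritized GET through the request queuer and attach callbacks to the
# Deferred that getPage() returns. The `rq` instance, URL, and ETag below are
# assumptions; the result dict carries the body under "response", matching
# how the cached getPage further down consumes it.
def _example_fetch(rq):
    d = rq.getPage("http://example.com/feed.xml", etag='"abc123"',
        timeout=30, prioritize=True)
    d.addCallback(lambda data: len(data["response"]))
    d.addErrback(lambda failure: failure.getErrorMessage())
    return d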
@inlineCallbacks
def getPage(self,
        url,
        method='GET',
        postdata=None,
        headers=None,
        agent="HiiSpider",
        timeout=5,
        cookies=None,
        follow_redirect=True,
        prioritize=False,
        hash_url=None,
        cache=0,
        content_sha1=None,
        confirm_cache_write=False,
        check_only_tld=False,
        disable_negative_cache=False):
    """
    Make a cached HTTP Request.

    **Arguments:**
     * *url* -- URL for the request.

    **Keyword arguments:**
     * *method* -- HTTP request method. (Default ``'GET'``)
     * *postdata* -- Dictionary of strings to post with the request.
       (Default ``None``)
     * *headers* -- Dictionary of strings to send as request headers.
       (Default ``None``)
     * *agent* -- User agent to send with request. (Default ``'HiiSpider'``)
     * *timeout* -- Request timeout, in seconds. (Default ``5``)
     * *cookies* -- Dictionary of strings to send as request cookies.
       (Default ``None``)
     * *follow_redirect* -- Boolean switch to follow HTTP redirects.
       (Default ``True``)
     * *prioritize* -- Move this request to the front of the request queue.
       (Default ``False``)
     * *hash_url* -- URL string used to indicate a common resource.
       Example: "http://digg.com" and "http://www.digg.com" could both use
       the hash_url "http://digg.com". (Default ``None``)
     * *cache* -- Cache mode. ``1``, immediately return contents of cache
       if available. ``0``, check resource, return cache if not stale.
       ``-1``, ignore cache. (Default ``0``)
     * *content_sha1* -- SHA-1 hash of content. If this matches the hash of
       data returned by the resource, raises a StaleContentException.
       (Default ``None``)
     * *confirm_cache_write* -- Wait to confirm cache write before
       returning. (Default ``False``)
     * *check_only_tld* -- For the negative cache, check only the top-level
       domain name. (Default ``False``)
     * *disable_negative_cache* -- Disable the negative cache for this
       request. (Default ``False``)
    """
    start = time.time()
    request_kwargs = {
        "method": method.upper(),
        "postdata": postdata,
        "headers": headers,
        "agent": agent,
        "timeout": timeout,
        "cookies": cookies,
        "follow_redirect": follow_redirect,
        "prioritize": prioritize}
    cache = int(cache)
    if cache not in [-1, 0, 1]:
        raise Exception("Unknown caching mode.")
    if not isinstance(url, str):
        url = convertToUTF8(url)
    if hash_url is not None and not isinstance(hash_url, str):
        hash_url = convertToUTF8(hash_url)
    # Check the negative cache.
    host = _parse(url)[1]
    # If check_only_tld is True, reduce the host to its top-level domain
    # (e.g. "www.digg.com" -> "digg.com").
    if check_only_tld:
        host = host.split('.', host.count('.') - 1)[-1]
    # Create request_hash to serve as a cache key from
    # either the URL or user-provided hash_url.
    hash_items = [hash_url or url, agent]
    if postdata:
        hash_items.append(repr(postdata))
    if headers and 'Authorization' in headers:
        items = headers['Authorization'].split(',')
        oauth_headers = [item for item in items
            if item.find('oauth_consumer_key') > -1
            or item.find('oauth_token') > -1
            or item.find('oauth_token_secret') > -1]
        if oauth_headers:
            hash_items.append(repr(oauth_headers))
    if cookies:
        hash_items.append(repr(cookies))
    request_hash = sha1(json.dumps(hash_items)).hexdigest()
    # if not disable_negative_cache and not self.disable_negative_cache:
    #     yield self.checkNegativeCache(
    #         'negative_cache:%s' % host,
    #         'negative_req_cache:%s' % request_hash)
    # if request_kwargs["method"] != "GET":
    #     data = yield self.rq.getPage(url, **request_kwargs)
    # else:
    #     data = yield self._getPage(
    #         url,
    #         request_hash,
    #         request_kwargs,
    #         cache,
    #         content_sha1,
    #         confirm_cache_write,
    #         host)
    # logger.info("Got %s after %s" % (host, time.time() - start))
    #
    # Check for stale contents.
    data = yield self.rq.getPage(url, **request_kwargs)
    if "content-sha1" not in data:
        data["content-sha1"] = sha1(data["response"]).hexdigest()
    if content_sha1 == data["content-sha1"]:
        logger.debug("Raising StaleContentException (4) on %s" % request_hash)
        raise StaleContentException(content_sha1)
    returnValue(data)
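# Usage sketch (illustrative only): call the cached getPage from another
# inlineCallbacks coroutine, assuming inlineCallbacks and returnValue are
# imported at module top as the method above requires. The pagegetter
# instance, URL, and hash_url are assumptions; StaleContentException signals
# that the content is unchanged.
@inlineCallbacks
def _example_cached_fetch(pagegetter):
    try:
        data = yield pagegetter.getPage(
            "http://www.digg.com/",
            hash_url="http://digg.com/",
            cache=0,
            timeout=30)
    except StaleContentException:
        returnValue(None)  # Nothing new since the last fetch.
    returnValue(data["response"])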
def getPage(self,
        url,
        method='GET',
        postdata=None,
        headers=None,
        agent="HiiSpider",
        timeout=60,
        cookies=None,
        follow_redirect=True,
        prioritize=False,
        hash_url=None,
        cache=0,
        content_sha1=None,
        confirm_cache_write=False,
        check_only_tld=False,
        disable_negative_cache=False):
    """
    Make a cached HTTP Request.

    **Arguments:**
     * *url* -- URL for the request.

    **Keyword arguments:**
     * *method* -- HTTP request method. (Default ``'GET'``)
     * *postdata* -- Dictionary of strings to post with the request.
       (Default ``None``)
     * *headers* -- Dictionary of strings to send as request headers.
       (Default ``None``)
     * *agent* -- User agent to send with request. (Default ``'HiiSpider'``)
     * *timeout* -- Request timeout, in seconds. (Default ``60``)
     * *cookies* -- Dictionary of strings to send as request cookies.
       (Default ``None``)
     * *follow_redirect* -- Boolean switch to follow HTTP redirects.
       (Default ``True``)
     * *prioritize* -- Move this request to the front of the request queue.
       (Default ``False``)
     * *hash_url* -- URL string used to indicate a common resource.
       Example: "http://digg.com" and "http://www.digg.com" could both use
       the hash_url "http://digg.com". (Default ``None``)
     * *cache* -- Cache mode. ``1``, immediately return contents of cache
       if available. ``0``, check resource, return cache if not stale.
       ``-1``, ignore cache. (Default ``0``)
     * *content_sha1* -- SHA-1 hash of content. If this matches the hash of
       data returned by the resource, raises a StaleContentException.
       (Default ``None``)
     * *confirm_cache_write* -- Wait to confirm cache write before
       returning. (Default ``False``)
     * *check_only_tld* -- For the negative cache, check only the top-level
       domain name. (Default ``False``)
     * *disable_negative_cache* -- Disable the negative cache for this
       request. (Default ``False``)
    """
    request_kwargs = {
        "method": method.upper(),
        "postdata": postdata,
        "headers": headers,
        "agent": agent,
        "timeout": timeout,
        "cookies": cookies,
        "follow_redirect": follow_redirect,
        "prioritize": prioritize}
    cache = int(cache)
    # NOTE: caching is currently forced off regardless of the cache argument.
    cache = 0
    if cache not in [-1, 0, 1]:
        raise Exception("Unknown caching mode.")
    if not isinstance(url, str):
        url = convertToUTF8(url)
    if hash_url is not None and not isinstance(hash_url, str):
        hash_url = convertToUTF8(hash_url)
    # Check the negative cache.
    host = _parse(url)[1]
    # If check_only_tld is True, reduce the host to its top-level domain
    # (e.g. "www.digg.com" -> "digg.com").
    if check_only_tld:
        host = host.split('.', host.count('.') - 1)[-1]
    if host in self.negitive_cache:
        if self.negitive_cache[host]['timeout'] >= time.time():
            logger.error('Found %s in negative cache, raising last known exception' % host)
            return self.negitive_cache[host]['error'].raiseException()
    # Create request_hash to serve as a cache key from
    # either the URL or user-provided hash_url.
    if hash_url is None:
        request_hash = hashlib.sha1(json.dumps([url, agent])).hexdigest()
    else:
        request_hash = hashlib.sha1(json.dumps([hash_url, agent])).hexdigest()
    d = self.rq.getPage(url, **request_kwargs)
    d.addCallback(self._checkForStaleContent, content_sha1, request_hash, host)
    d.addErrback(self._getPageErrback, host)
    return d
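# Usage sketch (illustrative only): the Deferred-returning variant is driven
# with callbacks instead of yield. It assumes _checkForStaleContent raises
# StaleContentException when content_sha1 matches the fetched body, mirroring
# the inline check in the coroutine version above; the instance, URL, and
# previous_sha1 value are hypothetical.
def _example_conditional_fetch(pagegetter, previous_sha1):
    d = pagegetter.getPage("http://example.com/resource",
        content_sha1=previous_sha1, timeout=30)
    def _on_stale(failure):
        # Treat "content unchanged" as a non-error result.
        failure.trap(StaleContentException)
        return None
    d.addErrback(_on_stale)
    return d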