def setopt(name, value):
    option_name = 'CURLOPT_%s' % name.upper()
    if name.islower() and hasattr(const, option_name):
        option_value = getattr(const, option_name)
        if name in self._CURLOPT_SLIST:
            value = lib.list2pointer_slist(value)
            if name in self._slist:
                lib.curl_slist_free_all(self._slist[name])
                del self._slist[name]
            else:
                self._slist[name] = value
        elif hasattr(prototype, name):
            if callable(value):
                value = getattr(prototype, name)(value)
        elif name == 'postfields' and isinstance(value, dict):
            value = urllib.urlencode(value)
        elif name == 'share':
            value = value._handle
        elif name == 'url' and value:
            value = iri2uri(value)

        if isinstance(value, unicode):
            value = value.encode('utf-8')

        # setopt
        lib.curl_easy_setopt(self._handle, option_value, value)
        #print option_name, value
        self._buff[option_name] = value
    else:
        raise ValueError('invalid option name "%s"' % name)
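
# A minimal, self-contained sketch (not part of the wrapper above) of the option-name
# dispatch that setopt() relies on: a lowercase name such as 'url' is mapped to the
# CURLOPT_URL attribute of a constants namespace, and unknown names raise ValueError.
# `FakeConst` and its values are hypothetical stand-ins for the real `const` module.
class FakeConst(object):
    CURLOPT_URL = 10002
    CURLOPT_TIMEOUT = 13

def resolve_option(name, const=FakeConst):
    option_name = 'CURLOPT_%s' % name.upper()
    if name.islower() and hasattr(const, option_name):
        return getattr(const, option_name)
    raise ValueError('invalid option name "%s"' % name)

assert resolve_option('url') == FakeConst.CURLOPT_URL
assert resolve_option('timeout') == FakeConst.CURLOPT_TIMEOUT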
def request(self, uri, method="GET", body=None, headers=None, redirections=DEFAULT_MAX_REDIRECTS):
    """ Performs a single HTTP request.

    The 'uri' is the URI of the HTTP resource and can begin with either
    'http' or 'https'. The value of 'uri' must be an absolute URI.

    The 'method' is the HTTP method to perform, such as GET, POST, DELETE,
    etc. There is no restriction on the methods allowed.

    The 'body' is the entity body to be sent with the request. It is a
    string object.

    Any extra headers that are to be sent with the request should be
    provided in the 'headers' dictionary.

    The maximum number of redirects to follow before raising an exception
    is 'redirections'. The default is 5.

    The return value is a tuple of (response, content), the first being an
    instance of the 'Response' class, the second being a string that
    contains the response entity body.
    """
    if headers is None:
        headers = {}
    else:
        headers = _normalize_headers(headers)

    if not headers.has_key('user-agent'):
        headers['user-agent'] = "Python-httplib2/%s" % __version__

    uri = iri2uri(uri)

    (scheme, authority, request_uri, defrag_uri) = urlnorm(uri)

    if not self.connections.has_key(scheme + ":" + authority):
        connection_type = (scheme == 'https') and httplib.HTTPSConnection or httplib.HTTPConnection
        conn = self.connections[scheme + ":" + authority] = connection_type(authority)
        conn.set_debuglevel(debuglevel)
    else:
        conn = self.connections[scheme + ":" + authority]

    if method in ["GET", "HEAD"] and 'range' not in headers:
        headers['accept-encoding'] = 'compress, gzip'

    info = email.Message.Message()
    cached_value = None
    if self.cache:
        cachekey = defrag_uri
        cached_value = self.cache.get(cachekey)
        if cached_value:
            try:
                info = email.message_from_string(cached_value)
                content = cached_value.split('\r\n\r\n', 1)[1]
            except Exception, e:
                self.cache.delete(cachekey)
                cachekey = None
                cached_value = None
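
# Hedged usage sketch for the request() API documented in the docstring above. It assumes
# the surrounding class is httplib2's Http (importable as httplib2.Http), that the package
# is installed, and that http://example.org/ is reachable; none of that is guaranteed by
# the snippet itself.
import httplib2

h = httplib2.Http()                     # optionally Http(".cache") to enable caching
response, content = h.request("http://example.org/", "GET")
print(response.status)                  # e.g. 200
print(response['content-type'])         # the response object behaves like a dict of headers
print(len(content))                     # entity body as a string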
def request(self, uri, method="GET", body=None, headers=None, redirections=DEFAULT_MAX_REDIRECTS, connection_type=None):
    """ Performs a single HTTP request.

    The 'uri' is the URI of the HTTP resource and can begin with either
    'http' or 'https'. The value of 'uri' must be an absolute URI.

    The 'method' is the HTTP method to perform, such as GET, POST, DELETE,
    etc. There is no restriction on the methods allowed.

    The 'body' is the entity body to be sent with the request. It is a
    string object.

    Any extra headers that are to be sent with the request should be
    provided in the 'headers' dictionary.

    The maximum number of redirects to follow before raising an exception
    is 'redirections'. The default is 5.

    The return value is a tuple of (response, content), the first being an
    instance of the 'Response' class, the second being a string that
    contains the response entity body.
    """
    try:
        if headers is None:
            headers = {}
        else:
            headers = _normalize_headers(headers)

        if not headers.has_key('user-agent'):
            headers['user-agent'] = "Python-httplib2/%s" % __version__

        uri = iri2uri(uri)

        (scheme, authority, request_uri, defrag_uri) = urlnorm(uri)

        conn_key = scheme + ":" + authority
        if conn_key in self.connections:
            conn = self.connections[conn_key]
        else:
            if not connection_type:
                connection_type = (scheme == 'https') and HTTPSConnectionWithTimeout or HTTPConnectionWithTimeout
            certs = list(self.certificates.iter(authority))
            if scheme == 'https' and certs:
                conn = self.connections[conn_key] = connection_type(authority,
                        key_file=certs[0][0], cert_file=certs[0][1],
                        timeout=self.timeout, proxy_info=self.proxy_info)
            else:
                conn = self.connections[conn_key] = connection_type(authority,
                        timeout=self.timeout, proxy_info=self.proxy_info)
            conn.set_debuglevel(debuglevel)

        if method in ["GET", "HEAD"] and 'range' not in headers:
            headers['accept-encoding'] = 'compress, gzip'

        info = email.Message.Message()
        cached_value = None
        if self.cache:
            cachekey = defrag_uri
            cached_value = self.cache.get(cachekey)
            if cached_value:
                info = email.message_from_string(cached_value)
                try:
                    content = cached_value.split('\r\n\r\n', 1)[1]
                except IndexError:
                    self.cache.delete(cachekey)
                    cachekey = None
                    cached_value = None
        else:
            cachekey = None

        if method in ["PUT"] and self.cache and info.has_key('etag') and not self.ignore_etag and 'if-match' not in headers:
            # http://www.w3.org/1999/04/Editing/
            headers['if-match'] = info['etag']

        if method not in ["GET", "HEAD"] and self.cache and cachekey:
            # RFC 2616 Section 13.10
            self.cache.delete(cachekey)

        if cached_value and method in ["GET", "HEAD"] and self.cache and 'range' not in headers:
            if info.has_key('-x-permanent-redirect-url'):
                # Should cached permanent redirects be counted in our redirection count? For now, yes.
                (response, new_content) = self.request(info['-x-permanent-redirect-url'], "GET",
                        headers=headers, redirections=redirections - 1)
                response.previous = Response(info)
                response.previous.fromcache = True
            else:
                # Determine our course of action:
                #   Is the cached entry fresh or stale?
                #   Has the client requested a non-cached response?
                #
                # There seem to be three possible answers:
                # 1. [FRESH] Return the cache entry w/o doing a GET
                # 2. [STALE] Do the GET (but add in cache validators if available)
                # 3. [TRANSPARENT] Do a GET w/o any cache validators (Cache-Control: no-cache) on the request
                entry_disposition = _entry_disposition(info, headers)

                if entry_disposition == "FRESH":
                    if not cached_value:
                        info['status'] = '504'
                        content = ""
                    response = Response(info)
                    if cached_value:
                        response.fromcache = True
                    return (response, content)

                if entry_disposition == "STALE":
                    if info.has_key('etag') and not self.ignore_etag and not 'if-none-match' in headers:
                        headers['if-none-match'] = info['etag']
                    if info.has_key('last-modified') and not 'last-modified' in headers:
                        headers['if-modified-since'] = info['last-modified']
                elif entry_disposition == "TRANSPARENT":
                    pass

                (response, new_content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)

            if response.status == 304 and method == "GET":
                # Rewrite the cache entry with the new end-to-end headers.
                # Take all headers that are in response and overwrite their values in info,
                # unless they are hop-by-hop, or are listed in the connection header.
                for key in _get_end2end_headers(response):
                    info[key] = response[key]
                merged_response = Response(info)
                if hasattr(response, "_stale_digest"):
                    merged_response._stale_digest = response._stale_digest
                _updateCache(headers, merged_response, content, self.cache, cachekey)
                response = merged_response
                response.status = 200
                response.fromcache = True
            elif response.status == 200:
                content = new_content
            else:
                self.cache.delete(cachekey)
                content = new_content
        else:
            (response, content) = self._request(conn, authority, uri, request_uri, method, body, headers, redirections, cachekey)
    except Exception, e:
        if self.force_exception_to_status_code:
            if isinstance(e, HttpLib2ErrorWithResponse):
                response = e.response
                content = e.content
                response.status = 500
                response.reason = str(e)
            elif isinstance(e, socket.timeout):
                content = "Request Timeout"
                response = Response({
                    "content-type": "text/plain",
                    "status": "408",
                    "content-length": len(content)
                })
                response.reason = "Request Timeout"
            else:
                content = str(e)
                response = Response({
                    "content-type": "text/plain",
                    "status": "400",
                    "content-length": len(content)
                })
                response.reason = "Bad Request"
        else:
            raise

    return (response, content)
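
# Hedged illustration of the caching behaviour implemented above (assumptions: the class is
# httplib2-style Http, a ".cache" directory is writable, and the server behind the example
# URL emits ETag/Cache-Control headers).  A repeat GET is either answered from the cache
# ("FRESH") or revalidated with If-None-Match ("STALE"); a 304 reply is merged with the
# cached entry and surfaced to the caller as a 200 with fromcache set.
import httplib2

h = httplib2.Http(".cache")
response, content = h.request("http://example.org/resource")
response2, content2 = h.request("http://example.org/resource")
print(response2.status)        # 200 even if the wire answer was a 304 revalidation
print(response2.fromcache)     # True when served or revalidated from the cache

# The "TRANSPARENT" case can be forced by the caller with a no-cache directive:
response3, content3 = h.request("http://example.org/resource",
                                headers={'cache-control': 'no-cache'})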
def getpage(self, pgreq, addlHeaders=None, returnMultiple=False, callBack=None, postData=None, soup=False):
    # pgreq = fixurl(pgreq)
    # print pgreq
    # print type(pgreq)
    originalString = pgreq

    log = self.log

    pgctnt = "Failed"
    pghandle = None

    loopctr = 0

    # Encode Unicode URLs properly
    pgreq = iri2uri.iri2uri(pgreq)

    try:
        # TODO: make this more sensible
        if addlHeaders != None and postData != None:
            log.info("Making a post-request with additional headers!")
            pgreq = urllib.request.Request(pgreq, headers=addlHeaders, data=urllib.parse.urlencode(postData).encode("utf-8"))
        elif addlHeaders != None:
            pgreq = urllib.request.Request(pgreq, headers=addlHeaders)
        elif postData != None:
            log.info("Making a post request!")
            pgreq = urllib.request.Request(pgreq, data=urllib.parse.urlencode(postData).encode("utf-8"))
        else:
            pgreq = urllib.request.Request(pgreq)
    except:
        log.critical("Invalid header or url")
        raise

    errored = False
    lastErr = ""

    delay = 1.5

    if not self.testMode:
        while 1:
            loopctr = loopctr + 1

            if loopctr > self.errorOutCount:
                log.error("Failed to retrieve Website : %s at %s All Attempts Exhausted", pgreq.get_full_url(), time.ctime(time.time()))
                pgctnt = "Failed"
                try:
                    print("Critical Failure to retrieve page! %s at %s, attempt %s" % (pgreq.get_full_url(), time.ctime(time.time()), loopctr))
                    print("Error:", lastErr)
                    print("Exiting")
                except:
                    print("And the URL could not be printed due to an encoding error")
                break

            #print "execution", loopctr
            try:
                # print("request type = ", type(pgreq))
                pghandle = self.opener.open(pgreq)      # Get Webpage

            except urllib.error.HTTPError as e:         # Lotta logging
                log.warning("Error opening page: %s at %s On Attempt %s.", pgreq.get_full_url(), time.ctime(time.time()), loopctr)
                log.warning("Error Code: %s", e)

                #traceback.print_exc()
                lastErr = e
                try:
                    log.warning("Error opening page: %s at %s On Attempt %s.", pgreq.get_full_url(), time.ctime(time.time()), loopctr)
                    log.warning("Error: %s, Original URL: %s", e, originalString)
                    errored = True
                except:
                    log.warning("And the URL could not be printed due to an encoding error")

                if e.code == 404:
                    #print "Unrecoverable - Page not found. Breaking"
                    log.critical("Unrecoverable - Page not found. Breaking")
                    break

                time.sleep(delay)

            except UnicodeEncodeError:
                log.critical("Unrecoverable Unicode issue retrieving page - %s", originalString)
                break

            except Exception:
                errored = True
                #traceback.print_exc()
                lastErr = sys.exc_info()
                log.warning("Retrieval failed. Traceback:")
                log.warning(lastErr)
                log.warning(traceback.format_exc())

                log.warning("Error Retrieving Page! - Trying again - Waiting %s seconds", delay)

                try:
                    print("Error on page - %s" % originalString)
                except:
                    print("And the URL could not be printed due to an encoding error")

                time.sleep(delay)

                continue

            if pghandle != None:
                try:
                    log.info("Request for URL: %s succeeded at %s On Attempt %s. Receiving...", pgreq.get_full_url(), time.ctime(time.time()), loopctr)

                    if callBack:
                        pgctnt = self.chunkRead(pghandle, 2 ** 17, reportHook=callBack)
                    else:
                        pgctnt = pghandle.read()

                    if pgctnt != None:
                        log.info("URL fully retrieved.")

                        preDecompSize = len(pgctnt) / 1000.0

                        encoded = pghandle.headers.get('Content-Encoding')
                        #preLen = len(pgctnt)
                        if encoded == 'deflate':
                            compType = "deflate"
                            pgctnt = zlib.decompress(pgctnt, -zlib.MAX_WBITS)

                        elif encoded == 'gzip':
                            compType = "gzip"
                            buf = io.BytesIO(pgctnt)
                            f = gzip.GzipFile(fileobj=buf)
                            pgctnt = f.read()

                        elif encoded == "sdch":
                            raise ValueError("Wait, someone other than google actually supports SDCH compression?")

                        else:
                            compType = "none"

                        decompSize = len(pgctnt) / 1000.0

                        # self.log.info("Page content type = %s", type(pgctnt))

                        cType = pghandle.headers.get("Content-Type")

                        self.log.info("Compression type = %s. Content Size compressed = %0.3fK. Decompressed = %0.3fK. File type: %s.", compType, preDecompSize, decompSize, cType)

                        if "text/html" in cType:        # If this is a html/text page, we want to decode it using the local encoding

                            if (";" in cType) and ("=" in cType):
                                # The server is reporting an encoding. Now we use it to decode the page content.
                                dummy_docType, charset = cType.split(";")
                                charset = charset.split("=")[-1]

                            else:
                                # The server is not reporting an encoding in the headers.
                                # This *should* probably be done using a parser.
                                # However, it seems to be grossly overkill to shove the whole page (which can be quite large)
                                # through a parser just to pull out a tag that should be right near the page beginning anyways.
                                # As such, it's a regular expression for the moment.

                                # Regex is of bytes type, since we can't convert a string to unicode until we know the encoding
                                # the bytes string is using, and we need the regex to get that encoding.
                                coding = re.search(b"charset=[\'\"]?([a-zA-Z0-9\-]*)[\'\"]?", pgctnt, flags=re.IGNORECASE)

                                cType = b""
                                if coding:
                                    cType = coding.group(1)

                                if (b";" in cType) and (b"=" in cType):
                                    # The server is reporting an encoding. Now we use it to decode the page content.
                                    dummy_docType, charset = cType.split(b";")
                                    charset = charset.split(b"=")[-1]
                                else:
                                    charset = "iso-8859-1"

                            try:
                                pgctnt = str(pgctnt, charset)
                            except UnicodeDecodeError:
                                self.log.error("Encoding Error! Stripping invalid chars.")
                                pgctnt = pgctnt.decode('utf-8', errors='ignore')

                            if soup:
                                pgctnt = bs4.BeautifulSoup(pgctnt)

                        elif "text/plain" in cType or "text/xml" in cType:
                            pgctnt = bs4.UnicodeDammit(pgctnt).unicode_markup

                        elif "text" in cType:
                            self.log.critical("Unknown content type!")
                            self.log.critical(cType)
                            print("Unknown content type!")
                            print(cType)

                    break

                except:
                    print("pghandle = ", pghandle)

                    traceback.print_exc()
                    log.error(sys.exc_info())
                    log.error("Error Retrieving Page! - Transfer failed. Waiting %s seconds before retrying", delay)

                    try:
                        print("Critical Failure to retrieve page! %s at %s" % (pgreq.get_full_url(), time.ctime(time.time())))
                        print("Exiting")
                    except:
                        print("And the URL could not be printed due to an encoding error")
                    print()

                    log.error(pghandle)
                    time.sleep(delay)

    if errored and pghandle != None:
        print("Later attempt succeeded %s" % pgreq.get_full_url())
        #print len(pgctnt)
    elif errored and pghandle == None:
        raise urllib.error.URLError("Failed to retrieve page!")

    if returnMultiple:
        if self.testMode:
            raise ValueError("testing mode does not support multiple return values yet!")
        return pgctnt, pghandle
    else:
        if self.testMode:
            return self.testMode
        else:
            return pgctnt
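
# Self-contained sketch (not part of the class above) of the decompression and charset
# handling that getpage() performs on a raw response body.  The function and variable names
# here are local to this example; the real method additionally handles retries, logging,
# chunked reads, and BeautifulSoup parsing.
import gzip
import io
import re
import zlib

def decode_body(raw, content_encoding, content_type):
    # Undo transfer compression first.
    if content_encoding == 'deflate':
        raw = zlib.decompress(raw, -zlib.MAX_WBITS)
    elif content_encoding == 'gzip':
        raw = gzip.GzipFile(fileobj=io.BytesIO(raw)).read()

    # Take the charset from the Content-Type header if present; otherwise sniff it
    # out of a charset=... declaration in the body, falling back to iso-8859-1.
    charset = "iso-8859-1"
    if ";" in content_type and "=" in content_type:
        charset = content_type.split(";")[-1].split("=")[-1].strip()
    else:
        match = re.search(br"charset=['\"]?([a-zA-Z0-9\-]*)['\"]?", raw, flags=re.IGNORECASE)
        if match:
            charset = match.group(1).decode('ascii')
    try:
        return raw.decode(charset)
    except (UnicodeDecodeError, LookupError):
        return raw.decode('utf-8', errors='ignore')

# Example: a gzip-compressed UTF-8 HTML snippet round-trips back to text.
body = gzip.compress('<html><p>caf\u00e9</p></html>'.encode('utf-8'))
print(decode_body(body, 'gzip', 'text/html; charset=utf-8'))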