def _mech_open(self, url, data=None, update_history=True, visit=None,
               timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
    """Build a request for url and open it through UserAgentBase.open().

    url is either a string or an object exposing get_full_url (used as-is).
    A string URL without a scheme is a relative reference and is resolved
    against the URL of the response currently being viewed.

    data, visit and timeout are handed to self._request(); when the
    resulting request's visit attribute is None it is treated as True, and
    a visiting request is registered via self._visit_request() together
    with update_history.

    Raises BrowserStateError for a relative reference when there is no
    current response.  An HTTPError whose fp is None is re-raised as-is.
    """
    try:
        url.get_full_url
    except AttributeError:
        # string URL -- convert to absolute URL if required
        scheme, authority = _rfc3986.urlsplit(url)[:2]
        if scheme is None:
            # relative URL
            if self._response is None:
                raise BrowserStateError(
                    "can't fetch relative reference: "
                    "not viewing any document")
            url = _rfc3986.urljoin(self._response.geturl(), url)

    request = self._request(url, data, visit, timeout)
    visit = request.visit
    if visit is None:
        visit = True

    if visit:
        self._visit_request(request, update_history)

    success = True
    try:
        response = UserAgentBase.open(self, request, data)
    except urllib2.HTTPError as error:
        # "as" form works on Python 2.6+ and 3; the comma form is 2-only.
        success = False
        if error.fp is None:  # not a response
            raise
        # An HTTPError that carries a body doubles as the response object.
        response = error
    # NOTE(review): the visible definition stops here -- `response` and
    # `success` are computed but never used or returned.  Confirm whether
    # the remainder of this function was truncated.
def _mech_open(self, url, data=None, update_history=True, visit=None,
               timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
    """Build a request for url and open it through UserAgentBase.open().

    url is either a string or an object exposing get_full_url (used as-is).
    A string URL without a scheme is a relative reference and is resolved
    against the URL of the response currently being viewed.

    data, visit and timeout are handed to self._request(); when the
    resulting request's visit attribute is None it is treated as True, and
    a visiting request is registered via self._visit_request() together
    with update_history.

    Raises BrowserStateError for a relative reference when there is no
    current response.  An HTTPError whose fp is None is re-raised as-is.
    """
    try:
        url.get_full_url
    except AttributeError:
        # string URL -- convert to absolute URL if required
        scheme, authority = _rfc3986.urlsplit(url)[:2]
        if scheme is None:
            # relative URL
            if self._response is None:
                raise BrowserStateError(
                    "can't fetch relative reference: "
                    "not viewing any document")
            url = _rfc3986.urljoin(self._response.geturl(), url)

    request = self._request(url, data, visit, timeout)
    visit = request.visit
    if visit is None:
        visit = True

    if visit:
        self._visit_request(request, update_history)

    success = True
    try:
        response = UserAgentBase.open(self, request, data)
    except urllib2.HTTPError as error:
        # "as" form works on Python 2.6+ and 3; the comma form is 2-only.
        success = False
        if error.fp is None:  # not a response
            raise
        # An HTTPError that carries a body doubles as the response object.
        response = error
    # NOTE(review): the visible definition stops here -- `response` and
    # `success` are computed but never used or returned.  Confirm whether
    # the remainder of this function was truncated.
def get_selector(self):
    """Return the URL held in self.__r_host, normalised for use in a request.

    An empty path becomes "/" (RFC 2616, section 3.2.2) and the fragment is
    dropped (RFC 3986, section 3.5); scheme, authority and query are passed
    through unchanged.
    """
    scheme, authority, path, query, _fragment = _rfc3986.urlsplit(
        self.__r_host)
    normalised_path = "/" if path == "" else path
    # None in the fragment slot: the fragment is never part of the result.
    return _rfc3986.urlunsplit(
        [scheme, authority, normalised_path, query, None])
def _mech_open(self, url, data=None, update_history=True, visit=None,
               timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT):
    """Open url and return the resulting response.

    url is either a string or an object exposing get_full_url (used as-is).
    A string without a scheme is resolved against the URL of the current
    response.  data, visit and timeout are forwarded to self._request();
    a request whose visit attribute is None counts as a visiting request.

    For a visiting request the response is installed with
    self._set_response() and a copy of self._response is handed back;
    otherwise a non-None response goes through
    _response.upgrade_response().

    Raises BrowserStateError for a relative reference when no response is
    being viewed.  An HTTPError without a body (fp is None) propagates
    immediately; one with a body is processed like a response and then
    raised.
    """
    try:
        url.get_full_url
    except AttributeError:
        # Plain string: make it absolute when it has no scheme.
        if _rfc3986.urlsplit(url)[0] is None:
            if self._response is None:
                raise BrowserStateError("can't fetch relative reference: "
                                        "not viewing any document")
            url = _rfc3986.urljoin(self._response.geturl(), url)

    request = self._request(url, data, visit, timeout)
    should_visit = request.visit
    if should_visit is None:
        should_visit = True
    if should_visit:
        self._visit_request(request, update_history)

    failed = False
    try:
        response = UserAgentBase.open(self, request, data)
    except urllib2.HTTPError as http_error:
        if http_error.fp is None:  # not a response
            raise
        failed = True
        response = http_error
    # Deliberately no handler for IOError / socket.error / OSError, even
    # though urllib2 can raise all of them (test_urllib2.py has examples of
    # socket.gaierror and OSError, and FTPHandler raises IOError).  This is
    # a subclass of OpenerDirector, so normalising them here would break
    # old code; even in Python core such a fix would need a
    # backwards-compatibility hack.

    if should_visit:
        self._set_response(response, False)
        response = copy.copy(self._response)
    elif response is not None:
        response = _response.upgrade_response(response)

    if failed:
        raise response
    return response
def _add_referer_header(self, request, origin_request=True):
    """Attach a Referer header to request when appropriate; return request.

    The header is derived from self.request (its full URL minus the
    fragment) and is added as an unredirected header.  Nothing is added
    when:

    - there is no self.request,
    - request is not http or https,
    - origin_request is false and self.request carries no Referer header,
    - self._handle_referer is false,
    - self.request is not http or https, or
    - self.request is https while request is not.
    """
    if self.request is None:
        return request

    web_schemes = ("http", "https")
    target_scheme = request.get_type()
    source_scheme = self.request.get_type()

    if target_scheme not in web_schemes:
        return request
    if not (origin_request or self.request.has_header("Referer")):
        return request
    if not self._handle_referer or source_scheme not in web_schemes:
        return request
    if source_scheme == "https" and target_scheme != "https":
        # No Referer when going from an https page to a non-https request.
        return request

    # strip URL fragment (RFC 2616 14.36)
    pieces = _rfc3986.urlsplit(self.request.get_full_url())
    referer = _rfc3986.urlunsplit(pieces[:-1] + (None,))
    request.add_unredirected_header("Referer", referer)
    return request
def _add_referer_header(self, request, origin_request=True):
    """Return request, with a Referer header added if one should be sent.

    The Referer value is the full URL of self.request with its fragment
    removed, added via request.add_unredirected_header().  It is only added
    when self.request exists, both requests are http or https,
    self._handle_referer is true, the transition is not from https to a
    non-https URL, and -- when origin_request is false -- self.request
    itself has a Referer header.
    """
    if self.request is None:
        return request

    new_scheme = request.get_type()
    old_scheme = self.request.get_type()
    if new_scheme not in ["http", "https"]:
        return request

    if not origin_request:
        if not self.request.has_header("Referer"):
            return request

    downgrade = old_scheme == "https" and new_scheme != "https"
    wanted = (self._handle_referer and
              old_scheme in ["http", "https"] and
              not downgrade)
    if wanted:
        # strip URL fragment (RFC 2616 14.36)
        components = _rfc3986.urlsplit(self.request.get_full_url())
        components = components[:-1] + (None,)
        request.add_unredirected_header(
            "Referer", _rfc3986.urlunsplit(components))
    return request
def is_html(ct_headers, url, allow_xhtml=False):
    """Return True if the response looks like HTML.

    ct_headers: Sequence of Content-Type headers
    url: Response URL
    allow_xhtml: also accept XHTML/XML content types and the .xhtml
        extension

    With at least one Content-Type header, the first value of the first
    header decides.  With none, the decision is a guess based on the file
    extension of the URL path.
    """
    if ct_headers:
        # use first header
        content_type = split_header_words(ct_headers)[0][0][0]
        accepted_types = ["text/html"]
        if allow_xhtml:
            accepted_types.extend([
                "text/xhtml",
                "text/xml",
                "application/xml",
                "application/xhtml+xml",
            ])
        return content_type in accepted_types

    # no header available: guess from the extension of the URL path
    url_path = _rfc3986.urlsplit(url)[2]
    extension = os.path.splitext(url_path)[1]
    accepted_exts = [".htm", ".html"]
    if allow_xhtml:
        accepted_exts.append(".xhtml")
    return extension in accepted_exts
def is_html(ct_headers, url, allow_xhtml=False):
    """Decide whether a response should be treated as HTML.

    ct_headers: Sequence of Content-Type headers
    url: Response URL
    allow_xhtml: when true, XHTML/XML content types and the .xhtml
        extension count as HTML too

    The first Content-Type header wins when any are present; otherwise the
    extension of the URL path is used as a guess.
    """
    if not ct_headers:
        # guess
        path = _rfc3986.urlsplit(url)[2]
        _root, ext = os.path.splitext(path)
        if allow_xhtml:
            candidates = [".htm", ".html", ".xhtml"]
        else:
            candidates = [".htm", ".html"]
        return ext in candidates

    # use first header
    first_header = split_header_words(ct_headers)[0]
    ct = first_header[0][0]
    if allow_xhtml:
        candidates = [
            "text/html",
            "text/xhtml",
            "text/xml",
            "application/xml",
            "application/xhtml+xml",
        ]
    else:
        candidates = ["text/html"]
    return ct in candidates
def retrieve(self, fullurl, filename=None, reporthook=None, data=None,
             timeout=_sockettimeout._GLOBAL_DEFAULT_TIMEOUT,
             open=open_file):
    """Returns (filename, headers).

    For remote objects, the default filename will refer to a temporary
    file.  Temporary files are removed when the OpenerDirector.close()
    method is called.

    For file: URLs, at present the returned filename is None.  This may
    change in future.

    If the actual number of bytes read is less than indicated by the
    Content-Length header, raises ContentTooShortError (a URLError
    subclass).  The exception's .result attribute contains the (filename,
    headers) that would have been returned.

    fullurl, data and timeout are passed to self._request().  reporthook,
    if given, is called as reporthook(blocknum, blocksize, size) once
    before the first read and once after every block written; size is -1
    when there is no Content-Length header.  open is the callable used to
    open the destination when filename is given.
    """
    req = self._request(fullurl, data, False, timeout)
    scheme = req.get_type()
    fp = self.open(req)
    try:
        headers = fp.info()
        if filename is None and scheme == 'file':
            # XXX req.get_selector() seems broken here, return None,
            # pending sanity :-/
            return None, headers
        if filename:
            tfp = open(filename, 'wb')
        else:
            path = _rfc3986.urlsplit(req.get_full_url())[2]
            suffix = os.path.splitext(path)[1]
            fd, filename = tempfile.mkstemp(suffix)
            self._tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        try:
            result = filename, headers
            bs = self.BLOCK_SIZE
            size = -1
            read = 0
            blocknum = 0
            # Parse Content-Length regardless of reporthook; otherwise the
            # short-read check below is skipped whenever no hook is given.
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, bs, size)
            while 1:
                block = fp.read(bs)
                # Emptiness test works for both str and bytes at EOF.
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
        finally:
            tfp.close()
    finally:
        fp.close()

    # raise exception if actual size does not match content-length header
    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: "
            "got only %i out of %i bytes" % (read, size),
            result
        )

    return result
def is_html_file_extension(url, allow_xhtml):
    """Return True if the path of url ends in an HTML file extension.

    ".htm" and ".html" always qualify; ".xhtml" qualifies only when
    allow_xhtml is true.  The comparison is an exact match on the
    extension of the URL's path component.
    """
    url_path = _rfc3986.urlsplit(url)[2]
    _root, extension = os.path.splitext(url_path)
    if allow_xhtml:
        recognised = [".htm", ".html", ".xhtml"]
    else:
        recognised = [".htm", ".html"]
    return extension in recognised
def retrieve(self, fullurl, filename=None, reporthook=None, data=None):
    """Returns (filename, headers).

    For remote objects, the default filename will refer to a temporary
    file.  Temporary files are removed when the OpenerDirector.close()
    method is called.

    For file: URLs, at present the returned filename is None.  This may
    change in future.

    If the actual number of bytes read is less than indicated by the
    Content-Length header, raises ContentTooShortError (a URLError
    subclass).  The exception's .result attribute contains the (filename,
    headers) that would have been returned.

    fullurl and data are passed to self._request().  reporthook, if given,
    is called as reporthook(blocknum, blocksize, size) once before the
    first read and once after every block written; size is -1 when there
    is no Content-Length header.

    The response and the destination file are closed even if the transfer
    fails part-way.
    """
    req = self._request(fullurl, data, False)
    scheme = req.get_type()
    fp = self.open(req)
    try:
        headers = fp.info()
        if filename is None and scheme == 'file':
            # XXX req.get_selector() seems broken here, return None,
            # pending sanity :-/
            return None, headers
        if filename:
            tfp = open(filename, 'wb')
        else:
            # Ask the request for its URL: fullurl may be a request object
            # rather than a string.
            path = _rfc3986.urlsplit(req.get_full_url())[2]
            suffix = os.path.splitext(path)[1]
            fd, filename = tempfile.mkstemp(suffix)
            self._tempfiles.append(filename)
            tfp = os.fdopen(fd, 'wb')
        try:
            result = filename, headers
            bs = self.BLOCK_SIZE
            size = -1
            read = 0
            blocknum = 0
            # Parse Content-Length regardless of reporthook; otherwise the
            # short-read check below is skipped whenever no hook is given.
            if "content-length" in headers:
                size = int(headers["Content-Length"])
            if reporthook:
                reporthook(blocknum, bs, size)
            while 1:
                block = fp.read(bs)
                # Emptiness test works for both str and bytes at EOF.
                if not block:
                    break
                read += len(block)
                tfp.write(block)
                blocknum += 1
                if reporthook:
                    reporthook(blocknum, bs, size)
        finally:
            tfp.close()
    finally:
        fp.close()

    # raise exception if actual size does not match content-length header
    if size >= 0 and read < size:
        raise ContentTooShortError(
            "retrieval incomplete: "
            "got only %i out of %i bytes" % (read, size),
            result
        )

    return result