def whereIsYadis(resp):
    """Work out where the Yadis document for an HTTP response lives.

    The answer may be the URL that was just retrieved, a different URL
    advertised by the response, or a falsy value when nothing is found.

    [non-blocking]

    @param resp: the response to inspect; its `headers`, `final_url` and
        `body` attributes are read.
    @returns: str or None
    """
    declared_type = resp.headers.get('content-type')

    # The spec requires an exact content-type match; parameters after the
    # first ';' are ignored for the comparison. Anything else means we
    # have to look for an indirection.
    if declared_type:
        media_type = declared_type.split(';', 1)[0].lower()
        if media_type == YADIS_CONTENT_TYPE:
            return resp.final_url

    # First indirection: the Yadis location response header.
    header_location = resp.headers.get(YADIS_HEADER_NAME.lower())
    if header_location:
        return header_location

    # Second indirection: treat the body as HTML and look for the meta tag.
    #
    # XXX: do we want to do something with content-type, like
    # have a whitelist or a blacklist (for detecting that it's
    # HTML)?
    try:
        return findHTMLMeta(StringIO(resp.body))
    except MetaNotFound:
        # Nothing in the body either; hand back the (falsy) header value.
        return header_location
def whereIsYadis(resp):
    """Given a HTTPResponse, return the location of the Yadis document.

    May be the URL just retrieved, another URL, or a falsy value (None)
    if no location can be found.

    [non-blocking]

    @param resp: the response to inspect; its `headers`, `final_url` and
        `body` attributes are read.
    @returns: str or None
    """
    # Attempt to find out where to go to discover the document
    # or if we already have it
    content_type = resp.headers.get('content-type')

    # According to the spec, the content-type header must be an exact
    # match, or else we have to look for an indirection.
    if content_type and content_type.split(';', 1)[0].lower() == YADIS_CONTENT_TYPE:
        return resp.final_url

    # Try the header
    yadis_loc = resp.headers.get(YADIS_HEADER_NAME.lower())
    if yadis_loc:
        return yadis_loc

    # Parse as HTML if the header is missing.
    #
    # XXX: do we want to do something with content-type, like
    # have a whitelist or a blacklist (for detecting that it's
    # HTML)?

    # Pick the body encoding from the last content-type parameter. The
    # parameter name is matched case-insensitively and surrounding quotes
    # are removed, so `Charset="utf-8"` works as well as `charset=utf-8`.
    encoding = 'UTF-8'
    params = (content_type or '').rsplit(';', 1)
    if len(params) == 2:
        name, sep, value = params[1].partition('=')
        value = value.strip().strip('"\'')
        if sep and name.strip().lower() == 'charset' and value:
            encoding = value

    try:
        content = resp.body.decode(encoding)
    except (UnicodeError, LookupError):
        # UnicodeError: the body is not valid in the declared encoding.
        # LookupError: the declared charset is not a known codec.
        # Either way keep the raw body; the location may still be found
        # before the undecodable part, and errors are handled below.
        # NOTE(review): assumes StringIO accepts the undecoded body - confirm.
        content = resp.body

    try:
        yadis_loc = findHTMLMeta(StringIO(content))
    except (MetaNotFound, UnicodeError):
        # UnicodeError: the body could not be decoded and the XRDS
        # location was not found before the trouble occurred.
        pass
    return yadis_loc
def whereIsYadis(resp):
    """Given a HTTPResponse, return the location of the Yadis document.

    May be the URL just retrieved, another URL, or a falsy value (None)
    if no location can be found.

    [non-blocking]

    @param resp: the response to inspect; its `headers`, `final_url` and
        `body` attributes are read.
    @returns: str or None
    """
    # Attempt to find out where to go to discover the document
    # or if we already have it
    content_type = resp.headers.get('content-type')

    # According to the spec, the content-type header must be an exact
    # match, or else we have to look for an indirection.
    if (content_type and
            content_type.split(';', 1)[0].lower() == YADIS_CONTENT_TYPE):
        return resp.final_url
    else:
        # Try the header
        yadis_loc = resp.headers.get(YADIS_HEADER_NAME.lower())

        if not yadis_loc:
            # Parse as HTML if the header is missing.
            #
            # XXX: do we want to do something with content-type, like
            # have a whitelist or a blacklist (for detecting that it's
            # HTML)?

            # Decode the body using the charset declared in the last
            # content-type parameter, if there is one. The parameter name
            # is case-insensitive and its value may be quoted.
            encoding = 'UTF-8'
            pieces = (content_type or '').rsplit(';', 1)
            if len(pieces) == 2:
                key, found, charset = pieces[1].partition('=')
                charset = charset.strip().strip('"\'')
                if found and key.strip().lower() == 'charset' and charset:
                    encoding = charset

            try:
                content = resp.body.decode(encoding)
            except (UnicodeError, LookupError):
                # LookupError is raised for a charset Python has no codec
                # for; treat it like undecodable content instead of
                # letting a bad header abort discovery.
                # Keep the encoded version in case the yadis location can
                # be found before the problematic part of the body.
                # NOTE(review): assumes StringIO accepts the raw body - confirm.
                content = resp.body

            try:
                yadis_loc = findHTMLMeta(StringIO(content))
            except (MetaNotFound, UnicodeError):
                # UnicodeError: the body could not be decoded and the
                # XRDS location was not found before trouble occurred.
                pass

        return yadis_loc
def handle_starttag(self, tag, attrs):
    """Track the document phase on each start tag and report the Yadis meta tag.

    Moves `self.phase` between TOP, HTML and HEAD as `html` and `head`
    tags are seen. While in the HEAD phase, a `meta` tag whose
    `http-equiv` equals the Yadis header name (case-insensitively) ends
    the parse by raising ParseDone with the tag's `content` value.

    @param tag: name of the start tag.
    @param attrs: sequence of (name, value) pairs for the tag; a value
        may be None when the attribute was written without a value.
    @raises ParseDone: when the Yadis meta tag is found in the head.
    """
    # if we ever see a start body tag, bail out right away, since
    # we want to prevent the meta tag from appearing in the body
    # [2]
    if tag == 'body':
        self._terminate()

    if self.phase == self.TOP:
        # At the top level, allow a html tag or a head tag to move
        # to the head or html phase
        if tag == 'head':
            # [3]
            self.phase = self.HEAD
        elif tag == 'html':
            # [4]
            self.phase = self.HTML

    elif self.phase == self.HTML:
        # if we are in the html tag, allow a head tag to move to
        # the HEAD phase. If we get another html tag, then bail
        # out
        if tag == 'head':
            # [3]
            self.phase = self.HEAD
        elif tag == 'html':
            # [5]
            self._terminate()

    elif self.phase == self.HEAD:
        # If we are in the head phase, look for the appropriate
        # meta tag. If we get a head or html tag, bail out.
        if tag == 'meta':
            attrs_d = dict(attrs)
            # A valueless attribute (<meta http-equiv>) is stored as None,
            # so test for a value before lowercasing it.
            http_equiv = attrs_d.get('http-equiv')
            if http_equiv and http_equiv.lower() == YADIS_HEADER_NAME.lower():
                # NOTE(review): `content` may be missing (None) here;
                # confirm substituteEntities copes with that.
                raw_attr = attrs_d.get('content')
                yadis_loc = substituteEntities(raw_attr)
                # [6]
                self.phase = self.FOUND
                raise ParseDone(yadis_loc)

        elif tag in ['head', 'html']:
            # [5], [7]
            self._terminate()
def handle_starttag(self, tag, attrs):
    """Update the parse phase for a start tag and detect the Yadis meta tag.

    `html` and `head` tags move `self.phase` from TOP through HTML to
    HEAD. Inside the head, a `meta` tag whose `http-equiv` matches the
    Yadis header name (ignoring case) stops the parse by raising
    ParseDone carrying the tag's `content` value.

    @param tag: name of the start tag.
    @param attrs: sequence of (name, value) pairs for the tag; a value
        may be None when the attribute was written without a value.
    @raises ParseDone: when the Yadis meta tag is found in the head.
    """
    # if we ever see a start body tag, bail out right away, since
    # we want to prevent the meta tag from appearing in the body
    # [2]
    if tag == 'body':
        self._terminate()

    if self.phase == self.TOP:
        # At the top level, allow a html tag or a head tag to move
        # to the head or html phase
        if tag == 'head':
            # [3]
            self.phase = self.HEAD
        elif tag == 'html':
            # [4]
            self.phase = self.HTML

    elif self.phase == self.HTML:
        # if we are in the html tag, allow a head tag to move to
        # the HEAD phase. If we get another html tag, then bail
        # out
        if tag == 'head':
            # [3]
            self.phase = self.HEAD
        elif tag == 'html':
            # [5]
            self._terminate()

    elif self.phase == self.HEAD:
        # If we are in the head phase, look for the appropriate
        # meta tag. If we get a head or html tag, bail out.
        if tag == 'meta':
            attrs_d = dict(attrs)
            # dict.get's default only covers a missing key; an attribute
            # present without a value maps to None, which has no .lower().
            http_equiv = attrs_d.get('http-equiv')
            if http_equiv and http_equiv.lower() == YADIS_HEADER_NAME.lower():
                # NOTE(review): `content` may be absent (None) here;
                # verify substituteEntities handles that case.
                raw_attr = attrs_d.get('content')
                yadis_loc = substituteEntities(raw_attr)
                # [6]
                self.phase = self.FOUND
                raise ParseDone(yadis_loc)

        elif tag in ['head', 'html']:
            # [5], [7]
            self._terminate()
def findHTMLMeta(stream):
    """Find the Yadis location advertised by a meta http-equiv tag.

    Only meta tags directly under /html/head are considered, and the
    http-equiv value is compared case-insensitively to the Yadis header
    name.

    @param stream: Source of the html text
    @type stream: Readable text I/O file object

    @return: Value of the `content` attribute of the first matching tag,
        i.e. the URI from which to fetch the XRDS document
    @rtype: six.text_type

    @raises MetaNotFound: if the page cannot be parsed, has no matching
        meta tag, or the tag has no `content` attribute.
    """
    html_parser = etree.HTMLParser()
    try:
        document = etree.parse(stream, html_parser)
    except (ValueError, etree.XMLSyntaxError):
        raise MetaNotFound("Couldn't parse HTML page.")

    # Invalid input may produce a tree with no root element at all.
    if document.getroot() is None:
        raise MetaNotFound("Couldn't parse HTML page.")

    # Register a local `lower-case` XPath function so the http-equiv
    # comparison can ignore case.
    extensions = {(None, 'lower-case'): xpath_lower_case}
    evaluate = etree.XPathEvaluator(document, extensions=extensions)

    query = '/html/head/meta[lower-case(@http-equiv)="{}"]'.format(
        YADIS_HEADER_NAME.lower())
    matches = evaluate(query)
    if not matches:
        raise MetaNotFound('Yadis meta tag not found.')

    # Only the first matching tag is consulted.
    location = matches[0].get('content')
    if location is None:
        raise MetaNotFound('Attribute "content" missing in yadis meta tag.')
    return location
def whereIsYadis(resp):
    """Given a HTTPResponse, return the location of the Yadis document.

    May be the URL just retrieved, another URL, or a falsy value (None)
    if no suitable URL can be found.

    [non-blocking]

    @param resp: the response to inspect; its `headers`, `final_url` and
        `body` attributes are read.
    @returns: str or None
    """
    # Attempt to find out where to go to discover the document
    # or if we already have it
    content_type = resp.headers.get('content-type')

    # According to the spec, the content-type header must be an exact
    # match, or else we have to look for an indirection.
    if (content_type and
            content_type.split(';', 1)[0].lower() == YADIS_CONTENT_TYPE):
        return resp.final_url

    # Try the header
    yadis_loc = resp.headers.get(YADIS_HEADER_NAME.lower())
    if yadis_loc:
        return yadis_loc

    # Parse as HTML if the header is missing.
    #
    # XXX: do we want to do something with content-type, like
    # have a whitelist or a blacklist (for detecting that it's
    # HTML)?

    # Detect the body encoding from the last content-type parameter. The
    # parameter name is case-insensitive and the value may be quoted.
    encoding = 'utf-8'
    params = (content_type or '').rsplit(';', 1)
    if len(params) == 2:
        name, sep, value = params[1].partition('=')
        value = value.strip().strip('"\'')
        if sep and name.strip().lower() == 'charset' and value:
            encoding = value

    if isinstance(resp.body, bytes):
        try:
            content = resp.body.decode(encoding)
        except (UnicodeError, LookupError):
            # The detected encoding has failed, either because the body
            # is not valid in it (UnicodeError) or because Python has no
            # such codec (LookupError). Try with UTF-8 (even if we had
            # already defaulted to UTF-8, it's not that expensive).
            try:
                content = resp.body.decode('utf-8')
            except UnicodeError:
                # The content cannot be decoded cleanly, so resort to
                # replacing undecodable chars. This *will* result in
                # broken content but there isn't anything else that can
                # be done.
                try:
                    content = resp.body.decode(encoding, 'replace')
                except LookupError:
                    # Unknown codec: the lossy decode has to use UTF-8.
                    content = resp.body.decode('utf-8', 'replace')
    else:
        # Body is already text; nothing to decode.
        content = resp.body

    try:
        yadis_loc = findHTMLMeta(StringIO(content))
    except (MetaNotFound, UnicodeError):
        # UnicodeError: Response body could not be encoded and xrds
        # location could not be found before troubles occur.
        pass
    return yadis_loc
def findHTMLMeta(stream):
    """Return the XRDS location named by the page's Yadis meta tag.

    The page is parsed as HTML and searched for a meta element directly
    under /html/head whose http-equiv attribute equals the Yadis header
    name, ignoring case.

    @param stream: Source of the html text
    @type stream: Readable text I/O file object

    @return: The `content` attribute of the first matching meta tag - the
        URI from which to fetch the XRDS document
    @rtype: six.text_type

    @raises MetaNotFound: when parsing fails, no matching meta tag exists,
        or the matching tag lacks a `content` attribute.
    """
    parser = etree.HTMLParser()
    try:
        tree = etree.parse(stream, parser)
    except (ValueError, etree.XMLSyntaxError):
        raise MetaNotFound("Couldn't parse HTML page.")

    root = tree.getroot()
    if root is None:
        # Invalid input can yield a tree without any content.
        raise MetaNotFound("Couldn't parse HTML page.")

    # The evaluator gets a local `lower-case` function so that the
    # header-name comparison in the expression is case-insensitive.
    run_xpath = etree.XPathEvaluator(
        tree, extensions={(None, 'lower-case'): xpath_lower_case})
    wanted = YADIS_HEADER_NAME.lower()
    meta_tags = run_xpath(
        '/html/head/meta[lower-case(@http-equiv)="{}"]'.format(wanted))
    if not meta_tags:
        raise MetaNotFound('Yadis meta tag not found.')

    first_tag = meta_tags[0]
    xrds_url = first_tag.get('content')
    if xrds_url is None:
        raise MetaNotFound('Attribute "content" missing in yadis meta tag.')
    return xrds_url
def discover(uri):
    """Run Yadis discovery against an identity URI.

    The URI is fetched with the Yadis Accept header. If the response is
    itself the Yadis document, it is used directly; otherwise the
    location is taken from the Yadis response header or from a meta tag
    in the HTML body, and that location is fetched in turn.

    @param uri: The identity URI as a well-formed http or https
        URI. The well-formedness and the protocol are not checked, but
        the results of this function are undefined if those properties
        do not hold.

    @return: DiscoveryResult object

    @raises DiscoveryFailure: when either fetch answers with a status
        other than 200.
    @raises Exception: Any exception that can be raised by fetching a URL
        with the given fetcher.
    """
    result = DiscoveryResult(uri)

    resp = fetchers.fetch(uri, headers={'Accept': YADIS_ACCEPT_HEADER})
    if resp.status != 200:
        message = ('HTTP Response status from identity URL host is not 200. '
                   'Got status %r' % (resp.status,))
        raise DiscoveryFailure(message, resp)

    # Note the URL after following redirects
    result.normalized_uri = resp.final_url

    # Attempt to find out where to go to discover the document
    # or if we already have it
    result.content_type = resp.headers.get('content-type')

    # According to the spec, the content-type header must be an exact
    # match, or else we have to look for an indirection.
    if (result.content_type and
            result.content_type.split(';', 1)[0].lower() == YADIS_CONTENT_TYPE):
        # The identity URL served the Yadis document itself.
        result.xrds_uri = result.normalized_uri
        result.response_text = resp.body
        return result

    # Try the header
    location = resp.headers.get(YADIS_HEADER_NAME.lower())
    if not location:
        # Parse as HTML if the header is missing.
        #
        # XXX: do we want to do something with content-type, like
        # have a whitelist or a blacklist (for detecting that it's
        # HTML)?
        try:
            location = findHTMLMeta(StringIO(resp.body))
        except MetaNotFound:
            pass

    if not location:
        # No Yadis location was found. Return the content that was
        # scanned so that the caller can try to treat it as an XRDS if
        # it wishes.
        result.response_text = resp.body
        return result

    result.xrds_uri = location
    resp = fetchers.fetch(location)
    if resp.status != 200:
        failure = DiscoveryFailure(
            'HTTP Response status from Yadis host is not 200. '
            'Got status %r' % (resp.status,), resp)
        failure.identity_url = result.normalized_uri
        raise failure

    # Report the second response's content type and body.
    result.content_type = resp.headers.get('content-type')
    result.response_text = resp.body
    return result