def whereIsYadis(resp):
    """Work out where the Yadis document for an HTTP response lives.

    The answer may be the URL that was just fetched, a different URL
    advertised by the response, or None when nothing points to one.

    [non-blocking]

    @returns: str or None
    """
    headers = resp.headers
    content_type = headers.get('content-type')

    # The spec demands an exact media-type match (anything after the
    # first ';' is ignored); otherwise an indirection has to be found.
    if content_type:
        media_type = content_type.split(';', 1)[0].lower()
        if media_type == YADIS_CONTENT_TYPE:
            return resp.final_url

    # First candidate for an indirection: the Yadis location header.
    location = headers.get(YADIS_HEADER_NAME.lower())
    if location:
        return location

    # No usable header, so scan the body as HTML for a matching meta tag.
    #
    # XXX: do we want to do something with content-type, like
    # have a whitelist or a blacklist (for detecting that it's
    # HTML)?
    try:
        return findHTMLMeta(StringIO(resp.body))
    except MetaNotFound:
        # Hand back whatever (falsy) value the header lookup produced.
        return location
def whereIsYadis(resp):
    """Given a HTTPResponse, return the location of the Yadis document.

    May be the URL just retrieved, another URL, or None if no location
    can be found.

    [non-blocking]

    @param resp: response object exposing `headers` (a mapping with
        lower-case keys), `final_url` and `body`.

    @returns: str or None
    """
    # Attempt to find out where to go to discover the document
    # or if we already have it
    content_type = resp.headers.get('content-type')

    # According to the spec, the content-type header must be an exact
    # match, or else we have to look for an indirection.
    if content_type and content_type.split(';', 1)[0].lower() == YADIS_CONTENT_TYPE:
        return resp.final_url

    # Try the header
    yadis_loc = resp.headers.get(YADIS_HEADER_NAME.lower())
    if yadis_loc:
        return yadis_loc

    # Parse as HTML if the header is missing.
    #
    # XXX: do we want to do something with content-type, like
    # have a whitelist or a blacklist (for detecting that it's
    # HTML)?

    # Pick the codec for the body from the last Content-Type parameter,
    # defaulting to UTF-8 when no charset is declared.
    encoding = 'UTF-8'
    parts = (content_type or '').rsplit(';', 1)
    # Parameter names are case-insensitive, so compare in lower case.
    if len(parts) == 2 and parts[1].strip().lower().startswith('charset='):
        # The value may legally be a quoted string: charset="utf-8".
        encoding = parts[1].split('=', 1)[1].strip().strip('"')

    try:
        content = resp.body.decode(encoding)
    except (UnicodeError, LookupError):
        # UnicodeError: the body is not valid in the declared encoding.
        # LookupError: the server declared a codec Python does not know
        # (or an empty one); the header is remote input and must not be
        # able to crash discovery.
        # Keep the encoded version in case the Yadis location can be found
        # before the undecodable part; later errors are caught below.
        content = resp.body

    try:
        return findHTMLMeta(StringIO(content))
    except (MetaNotFound, UnicodeError):
        # UnicodeError: the response body could not be handled and the
        # XRDS location was not found before the trouble occurred.
        return yadis_loc
def whereIsYadis(resp):
    """Given a HTTPResponse, return the location of the Yadis document.

    May be the URL just retrieved, another URL, or None if no location
    can be found.

    [non-blocking]

    @param resp: response object exposing `headers` (a mapping with
        lower-case keys), `final_url` and `body`.

    @returns: str or None
    """
    # Attempt to find out where to go to discover the document
    # or if we already have it
    content_type = resp.headers.get('content-type')

    # According to the spec, the content-type header must be an exact
    # match, or else we have to look for an indirection.
    if (content_type and
            content_type.split(';', 1)[0].lower() == YADIS_CONTENT_TYPE):
        return resp.final_url
    else:
        # Try the header
        yadis_loc = resp.headers.get(YADIS_HEADER_NAME.lower())

        if not yadis_loc:
            # Parse as HTML if the header is missing.
            #
            # XXX: do we want to do something with content-type, like
            # have a whitelist or a blacklist (for detecting that it's
            # HTML)?

            # Decode the body using the charset declared in the last
            # Content-Type parameter, or UTF-8 when none is declared.
            content_type = content_type or ''
            pieces = content_type.rsplit(';', 1)
            # Parameter names are case-insensitive; the value may be a
            # quoted string such as charset="utf-8".
            if (len(pieces) == 2 and
                    pieces[1].strip().lower().startswith('charset=')):
                encoding = pieces[1].split('=', 1)[1].strip().strip('"')
            else:
                encoding = 'UTF-8'

            try:
                content = resp.body.decode(encoding)
            except (UnicodeError, LookupError):
                # LookupError is raised for a charset name Python has no
                # codec for; the header comes from the remote server, so
                # it is treated like any other decoding failure instead
                # of aborting discovery.
                # Keep the encoded version in case the Yadis location can
                # be found before the undecodable part. Possible errors
                # are caught below.
                content = resp.body

            try:
                yadis_loc = findHTMLMeta(StringIO(content))
            except (MetaNotFound, UnicodeError):
                # UnicodeError: the response body could not be handled and
                # the XRDS location was not found before the trouble
                # occurred.
                pass

        return yadis_loc
def test_multiple_headers(self):
    # Two X-XRDS-Location meta tags are present; the content of the
    # first one is the expected result.
    markup = (
        '<html><head>'
        '<meta http-equiv="X-XRDS-Location" content="found">'
        '<meta http-equiv="X-XRDS-Location" content="not-found">'
    )
    location = findHTMLMeta(StringIO(markup))
    self.assertEqual(location, 'found')
def test_javascript_in_head(self):
    # The meta tag follows a script whose source text contains "<body>";
    # the location is still expected to be found.
    markup = (
        '<html><head><script type="text/javascript">document.write("<body>");</script>'
        '<META http-equiv="X-XRDS-Location" content="found">'
    )
    location = findHTMLMeta(StringIO(markup))
    self.assertEqual(location, 'found')
def test_missing_html_tag(self):
    # The document starts directly at <head>, with no enclosing <html>.
    markup = '<head><meta http-equiv="X-XRDS-Location" content="found">'
    location = findHTMLMeta(StringIO(markup))
    self.assertEqual(location, 'found')
def test_top_level_bogus(self):
    # A stray closing tag precedes <html>; the meta tag must still be
    # located.
    markup = '</porky><html><head><meta http-equiv="X-XRDS-Location" content="found">'
    location = findHTMLMeta(StringIO(markup))
    self.assertEqual(location, 'found')
def test_case_insensitive_header_name(self):
    # The http-equiv value is written entirely in lower case.
    markup = '<html><head><meta http-equiv="x-xrds-location" content="found"></head></html>'
    location = findHTMLMeta(StringIO(markup))
    self.assertEqual(location, 'found')
def test_missing_html_tag(self):
    # No <html> element wraps the <head> in this fixture.
    source = StringIO(
        '<head><meta http-equiv="X-XRDS-Location" content="found">'
    )
    found = findHTMLMeta(source)
    self.assertEqual(found, 'found')
def test_empty_string(self):
    # An empty content attribute is expected to come back as an empty
    # string.
    markup = '<head><meta http-equiv="X-XRDS-Location" content="">'
    location = findHTMLMeta(StringIO(markup))
    self.assertEqual(location, '')
def test_case_insensitive_header_name(self):
    # Lower-case spelling of the X-XRDS-Location http-equiv value.
    source = StringIO(
        '<html><head><meta http-equiv="x-xrds-location" content="found"></head></html>'
    )
    found = findHTMLMeta(source)
    self.assertEqual(found, 'found')
def test_decimal_entity(self):
    # "&#102;" is the decimal character reference for "f". The fixture
    # must contain a reference, otherwise the assertion passes without
    # exercising any entity decoding.
    buff = StringIO('<head><meta http-equiv="X-XRDS-Location" content="&#102;ound">')
    self.assertEqual(findHTMLMeta(buff), 'found')
def test_standard_entity(self):
    # The fixture carries the named entity "&amp;" so that the expected
    # "&" can only be produced by decoding it; a bare "&" in the markup
    # would make the check vacuous.
    buff = StringIO('<head><meta http-equiv="X-XRDS-Location" content="&amp;">')
    self.assertEqual(findHTMLMeta(buff), '&')
def test_multiple_headers(self):
    # When the head holds two X-XRDS-Location meta tags, the first
    # one's content is the value asserted here.
    source = StringIO(
        '<html><head>'
        '<meta http-equiv="X-XRDS-Location" content="found">'
        '<meta http-equiv="X-XRDS-Location" content="not-found">'
    )
    found = findHTMLMeta(source)
    self.assertEqual(found, 'found')
def test_javascript_in_head(self):
    # Script text containing "<body>" sits before the meta tag.
    source = StringIO(
        '<html><head><script type="text/javascript">document.write("<body>");</script>'
        '<META http-equiv="X-XRDS-Location" content="found">'
    )
    found = findHTMLMeta(source)
    self.assertEqual(found, 'found')
def test_standard_entity(self):
    # Use the named entity "&amp;" in the fixture: with a bare "&" the
    # assertion would pass even if entities were never decoded.
    buff = StringIO(
        '<head><meta http-equiv="X-XRDS-Location" content="&amp;">')
    self.assertEqual(findHTMLMeta(buff), '&')
def test_xhtml(self):
    # The meta element is written in self-closing XHTML form (" />").
    markup = '<html><head><meta http-equiv="X-XRDS-Location" content="found" /></head></html>'
    location = findHTMLMeta(StringIO(markup))
    self.assertEqual(location, 'found')
def test_decimal_entity(self):
    # Spell the leading "f" as the decimal character reference "&#102;"
    # so the expected 'found' is only reached by decoding the reference.
    buff = StringIO(
        '<head><meta http-equiv="X-XRDS-Location" content="&#102;ound">')
    self.assertEqual(findHTMLMeta(buff), 'found')
def test_xhtml(self):
    # Self-closing meta element inside a fully closed document.
    source = StringIO(
        '<html><head><meta http-equiv="X-XRDS-Location" content="found" /></head></html>'
    )
    found = findHTMLMeta(source)
    self.assertEqual(found, 'found')
def whereIsYadis(resp):
    """Given a HTTPResponse, return the location of the Yadis document.

    May be the URL just retrieved, another URL, or None if no suitable URL can
    be found.

    [non-blocking]

    @param resp: response object exposing `headers` (a mapping with
        lower-case keys), `final_url` and `body` (bytes or str).

    @returns: str or None
    """
    # Attempt to find out where to go to discover the document
    # or if we already have it
    content_type = resp.headers.get('content-type')

    # According to the spec, the content-type header must be an exact
    # match, or else we have to look for an indirection.
    if (content_type and
            content_type.split(';', 1)[0].lower() == YADIS_CONTENT_TYPE):
        return resp.final_url
    else:
        # Try the header
        yadis_loc = resp.headers.get(YADIS_HEADER_NAME.lower())

        if not yadis_loc:
            # Parse as HTML if the header is missing.
            #
            # XXX: do we want to do something with content-type, like
            # have a whitelist or a blacklist (for detecting that it's
            # HTML)?

            # Decode the body using the charset declared in the last
            # Content-Type parameter, or UTF-8 when none is declared.
            content_type = content_type or ''
            pieces = content_type.rsplit(';', 1)
            # Parameter names are case-insensitive; the value may be a
            # quoted string such as charset="utf-8".
            if (len(pieces) == 2 and
                    pieces[1].strip().lower().startswith('charset=')):
                encoding = pieces[1].split('=', 1)[1].strip().strip('"')
            else:
                encoding = 'utf-8'

            if isinstance(resp.body, bytes):
                try:
                    content = resp.body.decode(encoding)
                except (UnicodeError, LookupError):
                    # The declared encoding has failed, either because the
                    # bytes are invalid in it (UnicodeError) or because
                    # Python has no codec of that name (LookupError).
                    # Try UTF-8 (even if we had already defaulted to
                    # UTF-8, it's not that expensive an operation).
                    try:
                        content = resp.body.decode('utf-8')
                    except UnicodeError:
                        # At this point the content cannot be decoded to a
                        # str using the declared encoding or UTF-8, so we
                        # have to resort to replacing undecodable chars.
                        # This *will* result in broken content but there
                        # isn't anything else that can be done.
                        try:
                            content = resp.body.decode(encoding, 'replace')
                        except (UnicodeError, LookupError):
                            # The declared codec is unusable even in
                            # lenient mode; UTF-8 with replacement
                            # always produces a str.
                            content = resp.body.decode('utf-8', 'replace')
            else:
                content = resp.body

            try:
                yadis_loc = findHTMLMeta(StringIO(content))
            except (MetaNotFound, UnicodeError):
                # UnicodeError: Response body could not be encoded and xrds
                # location could not be found before troubles occur.
                pass

        return yadis_loc
def test_top_level_bogus(self):
    # The fixture opens with an unmatched closing tag before <html>.
    source = StringIO(
        '</porky><html><head><meta http-equiv="X-XRDS-Location" content="found">'
    )
    found = findHTMLMeta(source)
    self.assertEqual(found, 'found')
def discover(uri):
    """Run Yadis discovery for an identity URI.

    @param uri: The identity URI as a well-formed http or https
        URI. The well-formedness and the protocol are not checked, but
        the results of this function are undefined if those properties
        do not hold.

    @return: DiscoveryResult object

    @raises Exception: Any exception that can be raised by fetching a URL with
        the given fetcher.
    @raises DiscoveryFailure: When the identity URL, or the Yadis location it
        points to, answers with an HTTP status other than 200.
    """
    result = DiscoveryResult(uri)

    resp = fetchers.fetch(uri, headers={'Accept': YADIS_ACCEPT_HEADER})
    if resp.status != 200:
        raise DiscoveryFailure(
            'HTTP Response status from identity URL host is not 200. '
            'Got status %r' % (resp.status,), resp)

    # Remember where the redirects ended up.
    final_url = resp.final_url
    result.normalized_uri = final_url

    content_type = resp.headers.get('content-type')
    result.content_type = content_type

    # The spec requires an exact media-type match; otherwise an
    # indirection has to be looked for.
    already_xrds = bool(content_type) and (
        content_type.split(';', 1)[0].lower() == YADIS_CONTENT_TYPE)

    if already_xrds:
        result.xrds_uri = final_url
    else:
        # Header first, then fall back to scanning the body as HTML.
        #
        # XXX: do we want to do something with content-type, like
        # have a whitelist or a blacklist (for detecting that it's
        # HTML)?
        location = resp.headers.get(YADIS_HEADER_NAME.lower())
        if not location:
            try:
                location = findHTMLMeta(StringIO(resp.body))
            except MetaNotFound:
                pass

        # Without a location the body fetched above is returned as-is, so
        # the caller can still try to treat it as an XRDS document.
        if location:
            result.xrds_uri = location
            resp = fetchers.fetch(location)
            if resp.status != 200:
                failure = DiscoveryFailure(
                    'HTTP Response status from Yadis host is not 200. '
                    'Got status %r' % (resp.status,), resp)
                failure.identity_url = final_url
                raise failure
            result.content_type = resp.headers.get('content-type')

    result.response_text = resp.body
    return result