def canon_username(username, allow_reserved=True):
    '''
    Canonicalize and validate a user name.

    The name is converted to UTF-8, passed through demoronize(),
    lower-cased, NFKC-normalized and stripped; runs of whitespace are
    then replaced by the '******' separator and the username_premap
    substitutions are applied. A name that begins with 'xn--' and
    matches the applicable user-name pattern is IDNA-decoded before
    validation. The validated name is returned IDNA-encoded.

    With allow_reserved true the minimum encoded length is two
    characters; otherwise it is three and the encoded name must also
    match _valid_user_name_re.

    Raises ValueError for an empty name, a forbidden leading
    character, or a length/pattern violation, and DisplayedValueError
    when the name contains characters that are not permitted or cannot
    be IDNA-encoded.
    '''
    username = to_utf8(username)
    username = demoronize(username)
    username = username.decode('utf-8')
    username = username.lower()
    username = unicodedata.normalize('NFKC', username)
    username = username.strip()
    if not username:
        raise ValueError("user name must be non-empty")
    # NOTE(review): canon_person joins words with '-'; confirm that the
    # '******' separator here is intentional.
    username = '******'.join(username.split())
    username = username_premap_re.sub(lambda m: username_premap[m.group(0)],
                                      username)
    # An already IDNA-encoded name ('xn--' prefix) is decoded so that the
    # checks below see the Unicode form.
    # NOTE(review): the pairing of this elif with the outer
    # "if allow_reserved" was reconstructed from whitespace-collapsed
    # source -- confirm against the original file.
    if allow_reserved:
        if _valid_user_name_re.match(
                to_utf8(username)) and username[:len('xn--')] == 'xn--':
            username = to_utf8(username).decode('idna')
    elif _restrictive_valid_user_name_re.match(
            to_utf8(username)) and username[:len('xn--')] == 'xn--':
        username = to_utf8(username).decode('idna')
    if _username_nonstarting_utf8_re.match(username.encode('utf-8')):
        raise ValueError(
            "user name begins with a character not permitted in that position")
    if _invalid_username_utf8_re.search(username.encode('utf-8')):
        raise DisplayedValueError(
            "user name contains characters that are not permitted (reason code: XID-in)",
            "user name contains characters that are not permitted")
    if not _valid_input_username_utf8_re.match(username.encode('utf-8')):
        # NOTE(review): "FOO FOO FOO " in the first message looks like
        # leftover debugging text.
        raise DisplayedValueError(
            "user name contains characters that are not permitted (reason code: IDN-in) FOO FOO FOO ",
            "user name contains characters that are not permitted")
    # From here on username holds the IDNA-encoded form.
    try:
        username = username.encode('idna')
    except:
        raise DisplayedValueError(
            "user name contains characters that are not permitted (reason code: IDN)",
            "user name contains characters that are not permitted")
    # Round-trip through IDNA and validate the decoded result as well.
    if not _valid_output_username_utf8_re.match(
            username.decode('idna').encode('utf-8')):
        raise DisplayedValueError(
            "user name contains characters that are not permitted (reason code: IDN-out)",
            "user name contains characters that are not permitted")
    # Length limits are applied to the IDNA-encoded name.
    if allow_reserved and len(username) < 2:
        raise ValueError(
            "user name is too short (must contain at least two characters)")
    elif not allow_reserved and len(username) < 3:
        raise ValueError(
            "user name is too short (must contain at least three characters)")
    if len(username) > 63:
        raise ValueError(
            "user name is too long (must contain at most sixty-three characters)"
        )
    if not allow_reserved and not _valid_user_name_re.match(username):
        raise ValueError(
            "user name must start with a letter or digit, end with a letter or digit, and contain only letters, digits and hyphens"
        )
    # NOTE(review): an earlier comment here said allow_reserved is
    # ignored; the checks above do consult it.
    return username
def embed_text(text, xml_lang=None, xhtml_dir='ltr'):
    '''
    Wrap a span of plain text so it can be embedded in other plain text.

    Every non-empty line is surrounded by [RFC2482] language tag
    characters for xml_lang (skipped when xml_lang is empty or None)
    and by a Unicode bidirectional embedding for xhtml_dir ("ltr", the
    default, or "rtl"). Lines are re-joined with CR LF.

    None is returned as None and an empty string is returned through
    to_utf8(); otherwise the result is a UTF-8 string.

    References

    [RFC2482] Whistler, K. and Adams, G., "Language Tagging in Unicode
    Plain Text", RFC 2482, January 1999.
    '''
    if text is None:
        return None
    if not text:
        return to_utf8(text)
    text = to_unicode(text)

    def _tag_line(line):
        # Surround a non-empty line with language tag characters.
        line = to_unicode(line)
        if not (line and xml_lang):
            return line
        lang = to_unicode(_canon_lang(xml_lang))
        # Distance from an ASCII character to its tag-block counterpart.
        offset = uniord(u'\N{TAG LATIN CAPITAL LETTER A}') - uniord(u'A')
        tag_chars = []
        for ch in lang:
            tag_chars.append(unichr(offset + uniord(ch)))
        return (u'\N{LANGUAGE TAG}' + u''.join(tag_chars) + line +
                u'\N{LANGUAGE TAG}\N{CANCEL TAG}')

    def _splitlines(text):
        # Treat CR LF, bare CR and bare LF all as line breaks.
        return text.replace(u'\r\n', u'\n').replace('\r', u'\n').split('\n')

    embeddings = {
        'ltr': u'\N{LEFT-TO-RIGHT EMBEDDING}',
        'rtl': u'\N{RIGHT-TO-LEFT EMBEDDING}'
    }
    wrapped = []
    for line in _splitlines(to_unicode(text)):
        tagged = _tag_line(line)
        if tagged:
            # The direction is only looked up for non-empty lines.
            wrapped.append(
                to_utf8(u'%s%s\N{POP DIRECTIONAL FORMATTING}' % (
                    embeddings[xhtml_dir],
                    tagged,
                )))
        else:
            wrapped.append(to_utf8(line))
    return '\r\n'.join(wrapped)
def canon_lang(lang):
    '''
    Return an RFC 3066 language value in normalized form.

    The first subtag (ISO 639 language code) is lower-cased; a second
    subtag of two or three characters (ISO 3166 country code) is
    upper-cased; every other subtag (IANA-assigned or private) is
    lower-cased:

    en
    en-US
    en-scouse
    en-US-tx
    sgn-US-ma
    i-tsolyani
    x-37334

    Raises ValueError if the normalized value does not match
    _valid_language_code_re. The result is UTF-8 encoded.
    '''
    text = to_utf8(lang).decode('UTF-8').lower()
    # Locale names sometimes use underscores where RFC 3066 uses hyphens.
    subtags = text.replace('_', '-').split('-')
    if len(subtags) > 1 and len(subtags[1]) in (2, 3):
        subtags[1] = subtags[1].upper()
    normalized = '-'.join(subtags)
    if not _valid_language_code_re.match(normalized):
        raise ValueError(
            "RFC 3066 language value must be in normalized form; language codes from ISO 639 are lower case, while country codes from ISO 3166 are upper case; IANA-assigned and private subtags are lower case (ASCII only)"
        )
    return normalized.encode('UTF-8')
def js_escape(s):
    '''
    Escape a UTF-8 or Unicode string for use inside a JavaScript or
    ECMAScript string literal, potentially for embedding in SGML or
    XML PCDATA or CDATA.

    Every match of _js_escape_re is rewritten according to the named
    group that matched; anything else that matches is written as a
    \\xNN or \\uNNNN escape, with characters above U+FFFF written as a
    UTF-16 surrogate pair. The result is returned as a UTF-8 string.

    Raises AssertionError for a code point above U+10FFFF.
    '''
    def _js_escape_char(match):
        ch = match.group(0)
        groups = match.groupdict()
        if groups['pyquote']:
            return ch.encode('unicode-escape')
        if groups['backslash']:
            return r'\%s' % ch
        if groups['empty']:
            return r'\x2f>'
        if groups['close']:
            return r'<\x2f'
        if groups['ccdata']:
            return r'\x5d]>'
        if groups['entityref']:
            return r'\x26'
        och = uniord(ch)
        if och > 0x10ffff:
            # Raised explicitly (rather than via "assert") so the check
            # survives python -O and carries its message.
            raise AssertionError(
                "Codepoints outside the UTF-16 range are not supported.")
        if och > 0xffff:
            # do UTF-16 encoding for chars outside the BMP
            och -= 0x10000
            return r'\u%04.4x\u%04.4x' % ((och >> 10) | 0xd800,
                                          (och & 0x3ff) | 0xdc00)
        if och > 0xff:
            return r'\u%04.04x' % och
        return r'\x%02.2x' % och

    return to_utf8(_js_escape_re.sub(_js_escape_char, to_unicode(s)))
def canon_email(address):
    '''
    Canonicalize and validate an email address.

    The bare address is extracted with parseaddr(), trailing dots are
    stripped from the host name, the host name is IDNA-encoded via
    encode_idna() and lower-cased, and the result is checked against
    _valid_email_re. The local part is left unchanged.

    Returns the canonical "localpart@hostname" string.

    Raises ValueError if the address is empty, does not contain
    exactly one '@', has an empty local part or host name, has a host
    name without a '.', or does not match the permitted pattern.
    '''
    address = to_utf8(address)
    address = parseaddr(address)[1]
    if not address:
        raise ValueError("email address must be non-empty")
    if '@' not in address:
        raise ValueError(
            "email address does not contain the required '@' character")
    # More than one '@' used to fail the tuple unpacking below with an
    # unrelated "too many values to unpack" ValueError; reject it
    # explicitly so the caller gets a meaningful message.
    if address.count('@') != 1:
        raise ValueError(
            "email address must contain exactly one '@' character")
    localpart, hostname = address.split('@')
    hostname = hostname.rstrip('.')
    if not hostname:
        raise ValueError("email address hostname must be non-empty")
    if '.' not in hostname:
        raise ValueError(
            "email address hostname must contain the '.' character and a domain name"
        )
    if not localpart:
        raise ValueError("email address local part must be non-empty")
    # A check rejecting '+' in the local part used to live here and is
    # disabled.
    hostname = encode_idna(hostname)
    hostname = hostname.decode('utf-8').lower().encode('utf-8')
    address = '%s@%s' % (localpart, hostname)
    if not _valid_email_re.match(address):
        raise ValueError("email address does not match the permitted pattern")
    return address
def js_escape(s):
    '''
    Escape a UTF-8 or Unicode string for use inside a JavaScript or
    ECMAScript string literal, potentially for embedding in SGML or
    XML PCDATA or CDATA.

    Each match of _js_escape_re is replaced according to which named
    group matched; any other matched character becomes a \\xNN or
    \\uNNNN escape (a UTF-16 surrogate pair above U+FFFF). The result
    is returned as a UTF-8 string.

    Raises AssertionError for a code point above U+10FFFF.
    '''
    # Named groups with a fixed replacement, in the order they are tested.
    fixed_replacements = (
        ('empty', r'\x2f>'),
        ('close', r'<\x2f'),
        ('ccdata', r'\x5d]>'),
        ('entityref', r'\x26'),
    )

    def _js_escape_char(match):
        ch = match.group(0)
        groups = match.groupdict()
        if groups['pyquote']:
            return ch.encode('unicode-escape')
        if groups['backslash']:
            return r'\%s' % ch
        for group_name, replacement in fixed_replacements:
            if groups[group_name]:
                return replacement
        och = uniord(ch)
        if och > 0x10ffff:
            # An "assert" here would vanish under python -O and let a
            # bogus surrogate pair through; raise unconditionally.
            raise AssertionError(
                "Codepoints outside the UTF-16 range are not supported.")
        if och > 0xffff:
            # do UTF-16 encoding for chars outside the BMP
            high = ((och - 0x10000) >> 10) | 0xd800
            low = ((och - 0x10000) & 0x3ff) | 0xdc00
            return r'\u%04.4x\u%04.4x' % (high, low)
        if och > 0xff:
            return r'\u%04.04x' % och
        return r'\x%02.2x' % och

    return to_utf8(_js_escape_re.sub(_js_escape_char, to_unicode(s)))
def embed_text(text, xml_lang = None, xhtml_dir = 'ltr'):
    '''
    Prepare plain text for inclusion in arbitrary other plain text.

    Each non-empty line is wrapped in [RFC2482] language tags for
    xml_lang (when xml_lang is neither empty nor None) and in a
    Unicode bidirectional embedding of direction xhtml_dir ("ltr" by
    default; "rtl" is also allowed). The lines are joined with CR LF
    and returned as a UTF-8 string.

    None is returned unchanged; an empty string is returned via
    to_utf8().

    References

    [RFC2482] Whistler, K. and Adams, G., "Language Tagging in Unicode
    Plain Text", RFC 2482, January 1999.
    '''
    if text is None:
        return None
    if not text:
        return to_utf8(text)
    text = to_unicode(text)

    def _tag_line(line):
        # Add the language tag prefix and cancel suffix to one line.
        line = to_unicode(line)
        if line and xml_lang:
            lang = to_unicode(_canon_lang(xml_lang))
            tag_base = uniord(u'\N{TAG LATIN CAPITAL LETTER A}')
            ascii_base = uniord(u'A')
            tag = u''.join(
                [unichr(tag_base - ascii_base + uniord(ch)) for ch in lang])
            line = u''.join([
                u'\N{LANGUAGE TAG}', tag, line,
                u'\N{LANGUAGE TAG}\N{CANCEL TAG}'
            ])
        return line

    def _splitlines(text):
        # Normalize CR LF and bare CR to LF, then split.
        unix_text = u'\n'.join(text.split(u'\r\n'))
        unix_text = u'\n'.join(unix_text.split('\r'))
        return unix_text.split('\n')

    def _embed(line):
        # Empty lines pass through without tags or embedding characters.
        tagged = _tag_line(line)
        if not tagged:
            return to_utf8(line)
        opener = {
            'ltr': u'\N{LEFT-TO-RIGHT EMBEDDING}',
            'rtl': u'\N{RIGHT-TO-LEFT EMBEDDING}'
        }[xhtml_dir]
        return to_utf8(u'%s%s\N{POP DIRECTIONAL FORMATTING}' % (opener,
                                                                tagged))

    return '\r\n'.join([_embed(line) for line in _splitlines(to_unicode(text))])
def fix_xmlutf8(s):
    '''
    Replace control characters (C0 and C1) and invalid UTF-8 sequences
    with the UTF-8 encoding of the Unicode replacement character.

    The input is scanned with _xmlutf8_re: text captured by group 1 is
    kept; any other match is replaced by one replacement character per
    unit of its length.
    '''
    replacement = u'\ufffd'.encode('utf-8')
    pieces = []
    for found in _xmlutf8_re.finditer(to_utf8(s)):
        kept = found.group(1)
        if kept:
            pieces.append(kept)
        else:
            pieces.append(replacement * len(found.group(0)))
    return ''.join(pieces)
def fix_xmlutf8(s):
    '''
    Turn control characters (C0 and C1) and invalid UTF-8 sequences
    into the UTF-8 encoding of the Unicode replacement character.

    Matches of _xmlutf8_re with a non-empty group 1 are copied through;
    every other match yields len(match) replacement characters.
    '''
    bad = u'\ufffd'.encode('utf-8')
    return ''.join([
        m.group(1) or bad * len(m.group(0))
        for m in _xmlutf8_re.finditer(to_utf8(s))
    ])
def canon_password(username, password, allow_weak):
    '''
    Canonicalize a password and, unless allow_weak is true, reject
    passwords that are too short or too similar to the user name.

    The password is converted to UTF-8, passed through demoronize(),
    decoded and run through saslprep(); the value returned by
    saslprep() is what this function returns.

    N.B. allow_weak = True should be used for lookup but not storage
    as it allows unassigned codepoints.

    Raises ValueError when allow_weak is false and the password has
    fewer than six characters or is too similar to the user name.
    '''
    username = to_utf8(username)
    password = to_utf8(password)
    password = demoronize(password)
    password = password.decode('utf-8')
    password = saslprep(password, allow_unassigned=allow_weak)
    # NOTE(review): the extent of this "if" body was reconstructed from
    # whitespace-collapsed source; confirm that the similarity checks
    # belong inside it.
    if not allow_weak:
        if len(password) < 6:
            # FIXME: This error message is wrong -- there is no actual maximum length.
            raise ValueError(
                'Please enter a password of between 6 and 20 characters')
        # Compare the password and the user name in user-name canonical
        # form where possible, falling back to the raw value.
        try:
            cpassword = canon_username(password,
                                       allow_reserved=True).decode('idna')
        except:
            # NOTE(review): password has already been decoded above;
            # confirm that decoding it again here is intended.
            cpassword = password.decode('utf-8')
        try:
            username = canon_username(username,
                                      allow_reserved=True).decode('idna')
        except:
            try:
                username = username.decode('idna')
            except:
                username = username.decode('utf-8')
        # import here because this is a big, slow module
        from BTL.canonical.identifier import confuse
        # Sorted sets of the distinct items produced by confuse() for each
        # string; equal sets mean the two are considered too similar.
        password_letters = list(set([ch for ch in confuse(cpassword)]))
        password_letters.sort()
        username_letters = list(set([ch for ch in confuse(username)]))
        username_letters.sort()
        if cpassword in username or u''.join(password_letters) == u''.join(
                username_letters):
            raise ValueError('password is too similar to user name')
        # TODO: password re-use prevention (password history)
        # TODO: complexity checks (dictionary?)
        # TODO: lockout (temporary and permanent) after failed login attempts
    return password
def canon_idn(idn):
    '''
    Canonicalize an internationalized domain name.

    The name is escaped with escape_non_uri(). Unless it is enclosed
    in square brackets, it is then URI-unquoted, IDNA-decoded and
    re-quoted on a best-effort basis (any failure leaves it as it
    was). The result is passed through iri_unquote().
    '''
    name = escape_non_uri(to_utf8(idn))
    bracketed = name.startswith('[') and name.endswith(']')
    if not bracketed:
        try:
            name = urllib.quote(decode_idna(urllib.unquote(name)))
        except:
            # Best effort: keep the escaped name if decoding fails.
            pass
    return iri_unquote(name)
def canon_person(name):
    '''
    Canonicalize a person's name.

    The name is converted to UTF-8, passed through demoronize(),
    lower-cased, NFKC-normalized and stripped; runs of whitespace are
    replaced by single hyphens and the username_premap substitutions
    are applied.

    Raises ValueError if nothing is left after stripping.
    '''
    text = demoronize(to_utf8(name)).decode('utf-8').lower()
    text = unicodedata.normalize('NFKC', text).strip()
    if not text:
        raise ValueError("name must be non-empty")
    hyphenated = '-'.join(text.split())

    def _premap(found):
        # Look up the replacement for one premapped sequence.
        return username_premap[found.group(0)]

    return username_premap_re.sub(_premap, hyphenated)
def embed_xhtml(xhtml, xml_lang = None, xhtml_dir = 'ltr'):
    '''
    Prepare a span of XHTML text for embedding in arbitrary other
    XHTML text.

    The whole fragment is wrapped in a single <span> element carrying
    a dir attribute set to xhtml_dir (default is "ltr" for
    left-to-right; "rtl" for right-to-left is also allowed) and, if
    xml_lang is neither empty nor None, an xml:lang attribute holding
    the canonical form of xml_lang.

    An empty string or None is simply returned. The result is returned
    as a UTF-8 string.

    Raises AssertionError if xhtml_dir is neither "ltr" nor "rtl".
    '''
    if xhtml is None:
        return None
    if not xhtml:
        return to_utf8(xhtml)
    xhtml = to_utf8(xhtml)
    # Raised explicitly rather than with "assert": the value is
    # interpolated into an attribute, so the check must not disappear
    # under python -O.
    if xhtml_dir not in ('ltr', 'rtl'):
        raise AssertionError('xhtml_dir must be "ltr" or "rtl"')
    if xml_lang:
        lang_attr = ' xml:lang="%s"' % _canon_lang(xml_lang)
    else:
        lang_attr = ''
    return to_utf8('<span dir="%s"%s>%s</span>' % (
        to_utf8(xhtml_dir),
        lang_attr,
        xhtml))
def embed_xhtml(xhtml, xml_lang=None, xhtml_dir='ltr'):
    '''
    Prepare a span of XHTML text for embedding in arbitrary other
    XHTML text.

    The fragment as a whole (not each line) is enclosed in one <span>
    element whose dir attribute is xhtml_dir ("ltr", the default, or
    "rtl"). When xml_lang is neither empty nor None the span also gets
    an xml:lang attribute with the canonical form of xml_lang.

    An empty string or None is simply returned. The result is returned
    as a UTF-8 string.

    Raises AssertionError if xhtml_dir is neither "ltr" nor "rtl".
    '''
    if xhtml is None:
        return None
    if not xhtml:
        return to_utf8(xhtml)
    xhtml = to_utf8(xhtml)
    # Checked with an explicit raise so that python -O cannot strip the
    # validation of a value that ends up inside markup.
    if xhtml_dir not in ('ltr', 'rtl'):
        raise AssertionError('xhtml_dir must be "ltr" or "rtl"')
    attributes = ''
    if xml_lang:
        attributes = ' xml:lang="%s"' % _canon_lang(xml_lang)
    return to_utf8('<span dir="%s"%s>%s</span>' %
                   (to_utf8(xhtml_dir), attributes, xhtml))
def canon_uri(uri, unsafe=False):
    '''
    Canonicalize a URI and return it.

    A URI without a scheme is given the "http://" prefix before it is
    escaped, has its escapes normalized, has its host IDNA-encoded and
    has the spaces in its query string fixed.

    Unless unsafe is true, None is returned when the canonical URI
    does not match safe_uri_re and valid_uri_re or is not pure
    US-ASCII.
    '''
    uri = to_utf8(uri)
    scheme = urlparse.urlparse(uri)[0]
    if scheme == '':
        uri = 'http://' + uri
    uri = escape_non_uri(uri)
    uri = normalize_uri_escapes(uri)
    uri = uri_encode_idna(uri)
    uri = fix_query_string_spaces(uri)
    if unsafe:
        return uri
    if not safe_uri_re.match(uri):
        return None
    if not valid_uri_re.match(uri):
        return None
    # A URI that survives an ASCII round trip unchanged is pure ASCII.
    ascii_only = uri.decode('us-ascii', 'replace').encode('us-ascii',
                                                         'replace')
    if uri != ascii_only:
        return None
    return uri
def demoronize(chars):
    '''
    Map Unicode C1 codepoints to non-C1 codepoints, treating Unicode
    as though it were a superset of windows-1252 instead of
    ISO-8859-1. C1 characters that windows-1252 does not map are
    mapped according to macroman.

    The result is a UTF-8 string.
    '''
    def _remap(c1_text):
        # Recover the single byte, then reinterpret it as windows-1252,
        # falling back to macroman when windows-1252 has no mapping.
        raw = c1_text.decode('utf-8').encode('iso-8859-1')
        fixed = raw.decode('windows-1252', 'ignore')
        if not fixed:
            fixed = raw.decode('macroman')
        return fixed.encode('utf-8')

    pieces = []
    for found in c1_re.finditer(to_utf8(chars)):
        whole = found.group(0)
        if found.group(1):
            pieces.append(_remap(whole))
        else:
            pieces.append(whole)
    return ''.join(pieces)
def demoronize_idna(chars):
    '''
    Decode IDNA tokens, wrapping each decoded host name in Unicode
    bidirectional embedding control characters so that it keeps
    left-to-right directionality.

    Note that such control characters are not valid in IRIs or IDNs.

    Tokens that cannot be canonicalized are copied through unchanged.
    '''
    opener = u'\N{LEFT-TO-RIGHT EMBEDDING}'.encode('utf-8')
    closer = u'\N{POP DIRECTIONAL FORMATTING}'.encode('utf-8')
    pieces = []
    for found in idn_re.finditer(to_utf8(chars)):
        piece = found.group(0)
        if found.group(1):
            try:
                piece = opener + canon_idn(piece) + closer
            except:
                # Best effort: keep the original token on any failure.
                pass
        pieces.append(piece)
    return ''.join(pieces)
def decode_idna(hostname):
    '''
    Decode the IDNA ("xn--") labels of a host name to UTF-8.

    Each dot-separated label that starts with "xn--" (in any case) is
    IDNA-decoded. The decoded label is given the same casing style
    (lower, upper, capitalized or title) as the encoded label when one
    of those applies. Labels that are not IDNA, or that fail to
    decode, are kept as they are.
    '''
    prefix = 'xn--'
    decoded = []
    for segment in to_utf8(hostname).split('.'):
        if segment[:len(prefix)].lower() == prefix.lower():
            try:
                label = segment.lower().decode('idna')
                encoded = unicode(segment)
                # Carry the casing style of the encoded label over.
                if encoded == encoded.lower():
                    label = label.lower()
                elif encoded == encoded.upper():
                    label = label.upper()
                elif encoded == encoded.capitalize():
                    label = label.capitalize()
                elif encoded == encoded.title():
                    label = label.title()
                segment = label.encode('utf-8')
            except:
                # Best effort: leave the label untouched on any failure.
                pass
        decoded.append(segment)
    return '.'.join(decoded)
def decode_xmlchars(chars):
    '''
    Expand the XML references found by xmlchars_re.

    For each match, group 1 is looked up in xmlentities, group 2 is
    read as a hexadecimal code point and group 3 as a decimal code
    point; anything else is copied through. A numeric reference that
    cannot be turned into a character becomes the UTF-8 encoding of
    the Unicode replacement character.

    The pieces are joined with ''.join() and returned.
    '''
    # Kept as UTF-8 bytes like the other pieces: appending the unicode
    # object instead would make ''.join() coerce every piece to unicode
    # and fail on non-ASCII bytes.
    replacement = u'\N{REPLACEMENT CHARACTER}'.encode('utf-8')

    def _from_codepoint(digits, base):
        # UTF-8 for one numeric character reference.
        try:
            return unichr(int(digits, base)).encode('utf-8', 'replace')
        except (ValueError, OverflowError):
            return replacement

    chars = to_utf8(chars)
    o = []
    for match in xmlchars_re.finditer(chars):
        if match.group(1):
            o.append(xmlentities[match.group(1)])
        elif match.group(2):
            o.append(_from_codepoint(match.group(2), 16))
        elif match.group(3):
            o.append(_from_codepoint(match.group(3), 10))
        else:
            o.append(match.group(0))
    return ''.join(o)
def encode_idna(hostname):
    '''
    IDNA-encode the labels of a host name that need it.

    Each dot-separated label is decoded from UTF-8 and IDNA-encoded.
    When the encoded form starts with "xn--" it replaces the label,
    given the same casing style (lower, upper, capitalized or title)
    as the original when one of those applies. Labels whose encoding
    does not start with "xn--", or that fail to encode, are kept as
    they are.
    '''
    ace_prefix = 'xn--'
    encoded = []
    for segment in to_utf8(hostname).split('.'):
        try:
            label = segment.decode('utf-8')
            ace = label.encode('idna')
            if ace[:len(ace_prefix)] == ace_prefix:
                # Carry the casing style of the original label over.
                if label == label.lower():
                    segment = ace.lower()
                elif label == label.upper():
                    segment = ace.upper()
                elif label == label.capitalize():
                    segment = ace.capitalize()
                elif label == label.title():
                    segment = ace.title()
                else:
                    segment = ace
        except:
            # Best effort: leave the label untouched on any failure.
            pass
        encoded.append(segment)
    return '.'.join(encoded)
def urlify_username(username):
    '''
    Escape a username for use in a URI.

    The name is canonicalized with canon_username(), IDNA-decoded,
    converted to UTF-8 and percent-quoted with no characters treated
    as safe.
    '''
    canonical = canon_username(username, True)
    readable = to_utf8(canonical.decode('idna'))
    return urllib.quote(readable, safe='')
def xhtml_to_text(value):
    '''
    Converts an XHTML fragment to plain text.

    The fragment is wrapped in an XHTML <div>, parsed with
    xml.dom.minidom, and the text and CDATA content of the tree is
    concatenated. None is returned unchanged.

    FIXME: This should support special XHTML rules for <br />, <img alt="..." />, etc.
    '''
    def _innerText(node):
        '''
        Returns the concatenated plain text from a DOM node.
        '''
        if node.nodeType in (node.CDATA_SECTION_NODE, node.TEXT_NODE):
            return node.nodeValue
        if node.nodeType in (node.ELEMENT_NODE, node.DOCUMENT_NODE,
                             node.DOCUMENT_FRAGMENT_NODE):
            return u''.join([_innerText(child) for child in node.childNodes])
        # Comments, processing instructions and other node types carry no
        # text; returning None here would make the join above raise
        # TypeError.
        return u''

    if value is None:
        return value
    xdoc = '<?xml version="1.0" encoding="utf-8"?>\n<div xmlns="http://www.w3.org/1999/xhtml">%s</div>' % to_utf8(value)
    dom = xml.dom.minidom.parseString(xdoc)
    return _innerText(dom.documentElement)
(u'<\"\'>&;/! \0\a\b\n\r\t\v\f', ur'<\"\'>&;/! \0\7\b\n\r\t\v\f', r'<\"\'>\x26;/! \x00\x07\x08\n\r\t\x0b\x0c'), (u"// This <hack /> shouldn't close the </script> tag! < / > Nor should this close a <![CDATA[ CDATA section ]]>", ur"// This <hack /> shouldn't close the </script> tag! < / > Nor should this close a <![CDATA[ CDATA section ]]>", r"// This <hack \x2f> shouldn\'t close the <\x2fscript> tag! < / > Nor should this close a <![CDATA[ CDATA section \x5d]>" ), ): assert js_unescape(js_escape(i)) == i assert js_escape(i) == o assert js_unescape(o) == i assert js_unescape(o2) == i for i, o_xhtml, o_text in ( ((None, ), None, None), (('', ), '', ''), (('x', ), '<span dir="ltr">x</span>', to_utf8(u'\N{LEFT-TO-RIGHT EMBEDDING}x\N{POP DIRECTIONAL FORMATTING}') ), (('<>&<img />', ), '<span dir="ltr"><>&<img /></span>', to_utf8( u'\N{LEFT-TO-RIGHT EMBEDDING}<>&<img />\N{POP DIRECTIONAL FORMATTING}' )), (('a\nb', ), '<span dir="ltr">a\nb</span>', to_utf8( u'\N{LEFT-TO-RIGHT EMBEDDING}a\N{POP DIRECTIONAL FORMATTING}\r\n\N{LEFT-TO-RIGHT EMBEDDING}b\N{POP DIRECTIONAL FORMATTING}' )), (('a\nb\n', ), '<span dir="ltr">a\nb\n</span>', to_utf8( u'\N{LEFT-TO-RIGHT EMBEDDING}a\N{POP DIRECTIONAL FORMATTING}\r\n\N{LEFT-TO-RIGHT EMBEDDING}b\N{POP DIRECTIONAL FORMATTING}\r\n' )), ((
def xhtml_to_text(value):
    '''
    Converts an XHTML fragment to plain text.

    The fragment is parsed inside a wrapping XHTML <div> and the
    contents of its text and CDATA nodes are joined together; markup,
    comments and processing instructions contribute nothing. None is
    returned unchanged.

    FIXME: This should support special XHTML rules for <br />, <img alt="..." />, etc.
    '''
    def _innerText(node):
        '''
        Returns the concatenated plain text from a DOM node.
        '''
        node_type = node.nodeType
        if node_type in (node.CDATA_SECTION_NODE, node.TEXT_NODE):
            return node.nodeValue
        if node_type in (node.ELEMENT_NODE, node.DOCUMENT_NODE,
                         node.DOCUMENT_FRAGMENT_NODE):
            return u''.join([_innerText(child) for child in node.childNodes])
        # Any other node type (e.g. a comment) yields an empty string so
        # that the join in the parent call never receives None.
        return u''

    if value is None:
        return value
    xdoc = '<?xml version="1.0" encoding="utf-8"?>\n<div xmlns="http://www.w3.org/1999/xhtml">%s</div>' % to_utf8(
        value)
    dom = xml.dom.minidom.parseString(xdoc)
    return _innerText(dom.documentElement)
def test(): ''' Tiny smoke test to make sure this module works. ''' for i, o in ( (None, None), ('', u''), (u'', u''), ('Hello, world!', u'Hello, world!'), (u'Hello, world!', u'Hello, world!'), (u' ', u' '), (u' foo ', u' foo '), (u'<', u'<'), (u'>', u'>'), (u'&', u'&'), (u'>', u'>'), (u'\"', u'"'), (u'\'', u'\''), (u' ', u' '), (u' ', u'\n'), (u' ', u'\r'), (u'\t', u'\t'), (u'\n', u'\n'), (u'\r\n', u'\n'), (u'\r', u'\n'), (u'
', u'\n'), (u'
', u'\n'), (u' ', u'\r'), (u'
', u'\r'), (u'line 1\nline 2\nline 3\nline 4', u'line 1\nline 2\nline 3\nline 4'), (u'line 1\rline 2\rline 3\rline 4', u'line 1\nline 2\nline 3\nline 4'), (u'line 1\r\nline 2\r\nline 3\r\nline 4', u'line 1\nline 2\nline 3\nline 4'), (u'line 1\n\rline 2\n\rline 3\n\rline 4', u'line 1\n\nline 2\n\nline 3\n\nline 4'), (u'line 1\nline 2\r\nline 3\rline 4', u'line 1\nline 2\nline 3\nline 4'), (u'line 1\nline 2 \nline 3 line 4', u'line 1\nline 2\r\nline 3\rline 4'), (u'<![CDATA[]]>', u''), (u'<![CDATA[Hello, world!]]>', u'<![CDATA[Hello, world!]]>'), (u'<![CDATA[<\"\'>&;/! \n\r\t]]>', u'<![CDATA[<\"\'>&;/! \n\n\t]]>'), (u'<![CDATA[<]]>\"\'><![CDATA[&]]>;/! \n\r\t', u'<![CDATA[<]]>"\'><![CDATA[&]]>;/! \n\n\t'), (u'\ufffd', u'\ufffd'), (u'\x85', u'\x85'), (u'\x80', u'\x80'), (u'\x7f', u'\x7f'), (u'\x9f', u'\x9f'), (u'\xa0', u'\xa0'), (u'\t\n\r\r ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd', u'\t\n\n\n ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd' ), ): assert canon_xhtml(i) == o if i == o: assert verify_xhtml(i) == o else: try: verify_xhtml(i) assert "XHTML fragment verification should have failed."[:0] except: pass for i, o in ( (None, None), ('', u''), (u'', u''), ('Hello, world!', u'Hello, world!'), (u'Hello, world!', u'Hello, world!'), (u' ', u' '), (u' foo ', u' foo '), (u'<', u'<'), (u'>', u'>'), (u'&', u'&'), (u'>', u'>'), (u'\"', u'\"'), (u'\'', u'\''), (u' ', u' '), (u'\n', 
u'\n'), (u'\r', u'\n'), (u'\t', u'\t'), (u'\n', u'\n'), (u'\r\n', u'\n'), (u'line 1\nline 2\nline 3\nline 4', u'line 1\nline 2\nline 3\nline 4'), (u'line 1\rline 2\rline 3\rline 4', u'line 1\nline 2\nline 3\nline 4'), (u'line 1\r\nline 2\r\nline 3\r\nline 4', u'line 1\nline 2\nline 3\nline 4'), (u'line 1\n\rline 2\n\rline 3\n\rline 4', u'line 1\n\nline 2\n\nline 3\n\nline 4'), (u'line 1\nline 2\r\nline 3\rline 4', u'line 1\nline 2\nline 3\nline 4'), (u'<\"\'>&;/! \n\r\t', u'<\"\'>&;/! \n\n\t'), (u'\ufffd', u'\ufffd'), (u'\x85', u'\u2026'), (u'\x80', u'\u20ac'), (u'\x7f', u'\ufffd'), (u'\x9f', u'\u0178'), (u'\xa0', u'\xa0'), (u'\t\n\r\r ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd', u'\t\n\n\n ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd' ), (u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}', u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}'), ('\x00', u'\ufffd'), ('\x1f', u'\ufffd'), ('\x7f', u'\ufffd'), ('\x80', u'\ufffd'), ('\x84', u'\ufffd'), ('\x86', u'\ufffd'), ('\x9f', u'\ufffd'), ('\xa0', u'\ufffd'), (u'\x80', u'\u20ac'), (u'\x84', u'\u201e'), (u'\x85', u'\u2026'), (u'\x86', u'\u2020'), (u'\x8f', u'\xe8'), (u'\xa0', u'\xa0'), (u'\xa1', u'\xa1'), (u'\u3000', u'\u3000'), (u'\ud800', u''.join([u'\ufffd' for x in to_utf8(u'\ud800')])), (u'\udbff', u''.join([u'\ufffd' for x in to_utf8(u'\udbff')])), (u'\udc00', u''.join([u'\ufffd' for x in 
to_utf8(u'\udc00')])), (u'\udfff', u''.join([u'\ufffd' for x in to_utf8(u'\udfff')])), (u'\uff21', u'\uff21'), (u'\ufffd', u'\ufffd'), (u'\uffff', u''.join([u'\ufffd' for x in to_utf8(u'\uffff')])), (u'\U00010000', u'\U00010000'), (u'\U0001ffff', u''.join([u'\ufffd' for x in to_utf8(u'\U0001ffff')])), (u'\U0010fffd', u'\U0010fffd'), (u'\U0010ffff', u''.join([u'\ufffd' for x in to_utf8(u'\U0010ffff')])), ('<\"\'>&;/! \0\a\b\n\r\t\v\f', u'<\"\'>&;/! \ufffd\ufffd\ufffd\n\n\t\ufffd\ufffd'), (u'\t\n\r\r ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd', u'\t\n\n\n ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd' ), (u'\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f', u'\u20ac\xc5\u201a\u0192\u201e\u2026\u2020\u2021\u02c6\u2030\u0160\u2039\u0152\xe7\u017d\xe8\xea\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u02dc\u2122\u0161\u203a\u0153\xf9\u017e\u0178' ), 
(u'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\udc00\udfff\ud800\udbff\ufffe\uffff\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe', u''.join([ u'\ufffd' for ch in u'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\udc00\udfff\ud800\udbff\ufffe\uffff\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe' .encode('utf-8') ])), ): assert canon_text(i) == o if to_utf8(i) == to_utf8(o): assert verify_text(i) == o else: try: verify_text(i) assert "plain text verification should have failed."[:0] except: pass for i, o in ( ('', ''), ('Hello, world!', 'Hello, world!'), (u'Hello, world!', 'Hello, world!'), (u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}', u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}'), ('<', '<'), ('>', '>'), ('&', '&'), ('\"', '"'), ('\'', '\''), (' ', ' '), ('\n', '\n'), ('\r', '\r'), ('\t', '\t'), ('\x00', u'\ufffd'), ('\x1f', u'\ufffd'), ('\x7f', u'\ufffd'), ('\x80', u'\ufffd'), ('\x84', u'\ufffd'), ('\x86', u'\ufffd'), ('\x9f', u'\ufffd'), ('\xa0', u'\ufffd'), (u'\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f', u'\u20ac\xc5\u201a\u0192\u201e\u2026\u2020\u2021\u02c6\u2030\u0160\u2039\u0152\xe7\u017d\xe8\xea\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u02dc\u2122\u0161\u203a\u0153\xf9\u017e\u0178' 
), (u'\xa0', u'\xa0'), (u'\xa1', u'\xa1'), (u'\u3000', u'\u3000'), (u'\ud800', u''.join([u'\ufffd' for x in to_utf8(u'\ud800')])), (u'\udbff', u''.join([u'\ufffd' for x in to_utf8(u'\udbff')])), (u'\udc00', u''.join([u'\ufffd' for x in to_utf8(u'\udc00')])), (u'\udfff', u''.join([u'\ufffd' for x in to_utf8(u'\udfff')])), (u'\uff21', u'\uff21'), (u'\ufffd', u'\ufffd'), (u'\uffff', u''.join([u'\ufffd' for x in to_utf8(u'\uffff')])), (u'\U00010000', u'\U00010000'), (u'\U0001ffff', u''.join([u'\ufffd' for x in to_utf8(u'\U0001ffff')])), (u'\U0010fffd', u'\U0010fffd'), (u'\U0010ffff', u''.join([u'\ufffd' for x in to_utf8(u'\U0010ffff')])), ('<\"\'>&;/! \0\a\b\n\r\t\v\f', u'<"\'>&;/! \ufffd\ufffd\ufffd\n\r\t\ufffd\ufffd'), (u'\t\n\r\r ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd', u'\t\n\r\r ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd' ), (u'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\udc00\udfff\ud800\udbff\ufffe\uffff\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe', u''.join([ u'\ufffd' for ch in 
u'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\udc00\udfff\ud800\udbff\ufffe\uffff\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe' .encode('utf-8') ])), ): assert text_to_xhtml(i) == o assert xhtml_to_text(o) == canon_text(i) for i, o in ( ('', ''), ('Hello, world!', 'Hello, world!'), (u'Hello, world!', 'Hello, world!'), (u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}', u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}'), ('<', '<'), ('>', '>'), ('&', '&'), ('\"', '"'), ('\'', '\''), (' ', ' '), ('\n', '\n'), ('\r', '\n'), ('\t', '\t'), ('\x00', u'\ufffd'), ('\x1f', u'\ufffd'), ('\x7f', u'\ufffd'), ('\x80', u'\ufffd'), ('\x84', u'\ufffd'), ('\x86', u'\ufffd'), ('\x9f', u'\ufffd'), ('\xa0', u'\ufffd'), (u'\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f', u'\u20ac\xc5\u201a\u0192\u201e\u2026\u2020\u2021\u02c6\u2030\u0160\u2039\u0152\xe7\u017d\xe8\xea\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u02dc\u2122\u0161\u203a\u0153\xf9\u017e\u0178' ), (u'\xa0', u'\xa0'), (u'\xa1', u'\xa1'), (u'\u3000', u'\u3000'), (u'\ud800', u''.join([u'\ufffd' for x in to_utf8(u'\ud800')])), (u'\udbff', u''.join([u'\ufffd' for x in to_utf8(u'\udbff')])), (u'\udc00', u''.join([u'\ufffd' for x in to_utf8(u'\udc00')])), (u'\udfff', u''.join([u'\ufffd' for x in to_utf8(u'\udfff')])), (u'\uff21', u'\uff21'), (u'\ufffd', u'\ufffd'), (u'\uffff', u''.join([u'\ufffd' for x in to_utf8(u'\uffff')])), (u'\U00010000', u'\U00010000'), (u'\U0001ffff', u''.join([u'\ufffd' for x in to_utf8(u'\U0001ffff')])), (u'\U0010fffd', u'\U0010fffd'), (u'\U0010ffff', u''.join([u'\ufffd' for x 
in to_utf8(u'\U0010ffff')])), ('<\"\'>&;/! \0\a\b\n\r\t\v\f', u'<"\'>&;/! \ufffd\ufffd\ufffd\n\n\t\ufffd\ufffd'), (u'\t\n\r\r ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd', u'\t\n\n\n ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd' ), (u'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\udc00\udfff\ud800\udbff\ufffe\uffff\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe', u''.join([ u'\ufffd' for ch in u'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\udc00\udfff\ud800\udbff\ufffe\uffff\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe' .encode('utf-8') ])), (u'<i>Italics</i> are permitted.', u'<em>Italics</em> are permitted.'), (u'<em>Emphasis</em> is not permitted.', u'<em>Emphasis</em> is not permitted.'), (u'A bare set of 
curly braces works: {}', u'A bare set of curly braces works: {}'), (u'A bare set of curly braces with garbage inside works: {garbage}', u'A bare set of curly braces with garbage inside works: {garbage}'), (u'The game of {*Chess} is ancient and has many variations', u'The game of <a href="tag:bittorrent.com,2006-01-01:Game/Chess">Chess</a> is ancient and has many variations' ), (u'The show {#Monkeyspit} does not exist, AFAIK.', u'The show <a href="tag:bittorrent.com,2006-01-01:Show/Monkeyspit">Monkeyspit</a> does not exist, AFAIK.' ), (u'{//}', u'<a href="http:///">/</a>'), (u'{/www.bittorrent.com}', u'<a href="http://www.bittorrent.com">www.bittorrent.com</a>'), (u'{/http://www.bittorrent.com}', u'<a href="http://www.bittorrent.com">http://www.bittorrent.com</a>'), (u'{/https://www.bittorrent.com}', u'<a href="https://www.bittorrent.com">https://www.bittorrent.com</a>' ), (u'{/ftp://ftp.ubuntu.com}', u'<a href="ftp://ftp.ubuntu.com">ftp://ftp.ubuntu.com</a>'), (u'{/<i>italics</i>}', u'<a href="http://italics"><em>italics</em></a>'), (u'<i>{/italics</i>}', u'<i><a href="http://italics%3C/i%3E">italics</i></a>'), (u'<i>{/italics}</i>', u'<i><a href="http://italics">italics</a></i>'), (u'The show <i>{#Monkeyspit}</i> does not exist, AFAIK.', u'The show <i><a href="tag:bittorrent.com,2006-01-01:Show/Monkeyspit">Monkeyspit</a></i> does not exist, AFAIK.' ), (u'The show {#<i>Monkeyspit</i>} does not exist, AFAIK.', u'The show <a href="tag:bittorrent.com,2006-01-01:Show/Monkeyspit"><em>Monkeyspit</em></a> does not exist, AFAIK.' ), (u'<i>i</i>{* }{# }{^ }{= }{& }{+ }{@ }{$ }{% }{~ }{! 
}{- }{/ }{\\ }{| }', u'<em>i</em><a href="tag:bittorrent.com,2006-01-01:Game/%20"> </a><a href="tag:bittorrent.com,2006-01-01:Show/%20"> </a><a href="tag:bittorrent.com,2006-01-01:Album/%20"> </a><a href="tag:bittorrent.com,2006-01-01:DVD/%20"> </a><a href="tag:bittorrent.com,2006-01-01:Music/%20"> </a><a href="tag:bittorrent.com,2006-01-01:Show/%20"> </a><a href="tag:bittorrent.com,2006-01-01:Company/%20"> </a><a href="tag:bittorrent.com,2006-01-01:Name/%20"> </a><a href="tag:bittorrent.com,2006-01-01:Unlisted%20Name/%20"> </a><a href="tag:bittorrent.com,2006-01-01:Place/%20"> </a><a href="tag:bittorrent.com,2006-01-01:Medium/%20"> </a><a href="tag:bittorrent.com,2006-01-01:Book/%20"> </a><a href="http://%20"> </a><a href="tag:bittorrent.com,2006-01-01:Term/%20"> </a><a href="tag:bittorrent.com,2006-01-01:VHS/%20"> </a>' ), ): assert dinotext_to_xhtml(i) == o pass
def canon_xhtml(value):
    '''
    Canonicalize an XHTML fragment by round-tripping it through minidom.

    The fragment is converted with to_utf8, wrapped in a temporary XHTML
    <div> element, parsed, normalized, and the wrapper's child nodes are
    serialized as UTF-8, joined, and passed through to_unicode.
    None is returned unchanged.

    FIXME: This should perform XML canonicalization.
    FIXME: This should preserve explicitly encoded whitespace.
    '''
    # Nothing to canonicalize; hand None straight back to the caller.
    if value is None:
        return value
    wrapper = '<?xml version="1.0" encoding="utf-8"?>\n<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
    document = xml.dom.minidom.parseString(wrapper % to_utf8(value))
    document.normalize()
    # Serialize only the wrapper's children so the <div> itself is dropped.
    pieces = []
    for child in document.documentElement.childNodes:
        pieces.append(child.toxml(encoding='utf-8'))
    return to_unicode(''.join(pieces))
def test(): ''' Tiny smoke test to make sure the funcitons in this library work. ''' for i, o in ( ('', ''), ('Hello, world!', 'Hello, world!'), (u'Hello, world!', 'Hello, world!'), (u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}', u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}'.encode('utf-8')), ('\x00', to_utf8(u'\ufffd')), ('\x1f', to_utf8(u'\ufffd')), ('\x7f', to_utf8(u'\ufffd')), ('\x80', to_utf8(u'\ufffd')), ('\x84', to_utf8(u'\ufffd')), ('\x86', to_utf8(u'\ufffd')), ('\x9f', to_utf8(u'\ufffd')), ('\xa0', to_utf8(u'\ufffd')), (u'\x80', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\x80') ])), (u'\x84', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\x84') ])), (u'\x85', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\x85') ])), (u'\x86', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\x86') ])), (u'\x8f', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\x8f') ])), (u'\xa0', to_utf8(u'\xa0')), (u'\xa1', to_utf8(u'\xa1')), (u'\u3000', to_utf8(u'\u3000')), (u'\ud800', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\ud800') ])), (u'\udbff', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\udbff') ])), (u'\udc00', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\udc00') ])), (u'\udfff', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\udfff') ])), (u'\uff21', to_utf8(u'\uff21')), (u'\ufffd', to_utf8(u'\ufffd')), (u'\uffff', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\uffff') ])), (u'\U00010000', to_utf8(u'\U00010000')), (u'\U0001ffff', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\U0001ffff') ])), (u'\U0010fffd', to_utf8(u'\U0010fffd')), (u'\U0010ffff', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\U0010ffff') ])), ('<\"\'>&;/! \0\a\b\n\r\t\v\f', to_utf8(u'<\"\'>&;/! 
\ufffd\ufffd\ufffd\n\r\t\ufffd\ufffd')), (u'\t\n\r\r ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd', u'\t\n\r\r ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd'.encode('utf-8')), (u'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\udc00\udfff\ud800\udbff\ufffe\uffff\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe', u''.join([ u'\ufffd' for ch in 
u'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\udc00\udfff\ud800\udbff\ufffe\uffff\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe'.encode('utf-8') ]).encode('utf-8')), ): assert fix_xmlutf8(i) == o for i, o in ( ('', ''), ('Hello, world!', 'Hello, world!'), (u'Hello, world!', 'Hello, world!'), (u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}', u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}'.encode('utf-8')), ('<', '<'), ('>', '>'), ('&', '&'), ('\"', '"'), ('\'', '''), (' ', ' '), ('\n', ' '), ('\r', ' '), ('\t', '	'), ('\x00', to_utf8(u'\ufffd')), ('\x1f', to_utf8(u'\ufffd')), ('\x7f', to_utf8(u'\ufffd')), ('\x80', to_utf8(u'\ufffd')), ('\x84', to_utf8(u'\ufffd')), ('\x86', to_utf8(u'\ufffd')), ('\x9f', to_utf8(u'\ufffd')), ('\xa0', to_utf8(u'\ufffd')), (u'\x80', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\x80') ])), (u'\x84', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\x84') ])), (u'\x85', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\x85') ])), (u'\x86', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\x86') ])), (u'\x8f', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\x8f') ])), (u'\xa0', ' '), (u'\xa1', to_utf8(u'\xa1')), (u'\u3000', ' '), (u'\ud800', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\ud800') ])), (u'\udbff', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\udbff') ])), (u'\udc00', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\udc00') ])), (u'\udfff', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\udfff') ])), (u'\uff21', to_utf8(u'\uff21')), 
(u'\ufffd', to_utf8(u'\ufffd')), (u'\uffff', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\uffff') ])), (u'\U00010000', to_utf8(u'\U00010000')), (u'\U0001ffff', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\U0001ffff') ])), (u'\U0010fffd', to_utf8(u'\U0010fffd')), (u'\U0010ffff', ''.join([ to_utf8(u'\ufffd') for x in to_utf8(u'\U0010ffff') ])), ('<\"\'>&;/! \0\a\b\n\r\t\v\f', to_utf8(u'<"'>&;/! \ufffd\ufffd\ufffd 	\ufffd\ufffd')), (u'\t\n\r\r ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd', u'	  ~ \ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd'.encode('utf-8')), (u'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\udc00\udfff\ud800\udbff\ufffe\uffff\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe', u''.join([ u'\ufffd' for ch in 
u'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\udc00\udfff\ud800\udbff\ufffe\uffff\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe'.encode('utf-8') ]).encode('utf-8')), ): assert xml_escape(i) == o for i, o in ( ('hello, world!', 'hello%2C%20world%21'), (':/@[.]?=;&#', '%3A%2F%40%5B.%5D%3F%3D%3B%26%23'), ('\x00\x1f\x20\x7f\x80\xff', '%00%1F%20%7F%80%FF'), (u'\x00\x1f\x20\x7f\x80\xff\u0100\ufffd\U00010000\U0010fffd', '%00%1F%20%7F%C2%80%C3%BF%C4%80%EF%BF%BD%F0%90%80%80%F4%8F%BF%BD'), (u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}', '%24%C2%A3%E2%82%AC%C2%A5'), ): assert uri_escape(i) == o for i, o2, o in ( ('\n', r'\12', r'\n'), ('\r', r'\15', r'\r'), ('\t', r'\11', r'\t'), ('\"', r'\42', r'\"'), ('\'', r'\47', r'\''), ('\\', r'\134', r'\\'), ('A', r'\101', 'A'), (u'A', r'\101', 'A'), ('Hello, world!', '\110\145\154\154\157\54\40\167\157\162\154\144\41', 'Hello, world!'), (u'\0\a\b\n\r\t\v\f\"\\\"\'\\\'\x7f\x80\x81\xff\u0100\ufffd\U00010000\U0010fffd', ur'\0\x07\b\n\r\t\v\f\"\\\"\'\\\'\x7f\x80\x81\xff\u0100\ufffd\ud800\udc00\udbff\udffd', r'\x00\x07\x08\n\r\t\x0b\x0c\"\\\"\'\\\'\x7f\x80\x81\xff\u0100\ufffd\ud800\udc00\udbff\udffd'), (u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}', ur'\u0024\u00a3\u20ac\u00a5', r'$\xa3\u20ac\xa5'), (u'<\"\'>&;/! \0\a\b\n\r\t\v\f', ur'<\"\'>&;/! \0\7\b\n\r\t\v\f', r'<\"\'>\x26;/! \x00\x07\x08\n\r\t\x0b\x0c'), (u"// This <hack /> shouldn't close the </script> tag! 
< / > Nor should this close a <![CDATA[ CDATA section ]]>", ur"// This <hack /> shouldn't close the </script> tag! < / > Nor should this close a <![CDATA[ CDATA section ]]>", r"// This <hack \x2f> shouldn\'t close the <\x2fscript> tag! < / > Nor should this close a <![CDATA[ CDATA section \x5d]>"), ): assert js_unescape(js_escape(i)) == i assert js_escape(i) == o assert js_unescape(o) == i assert js_unescape(o2) == i
def uri_escape(s):
    '''
    Percent-encode a UTF-8 or Unicode string for use inside a URI or IRI.

    The input is first converted with to_utf8 and then quoted with an
    empty safe set, so '/' is escaped as well.
    '''
    utf8_value = to_utf8(s)
    return urllib.quote(utf8_value, safe='')
def urlify_filename(filename):
    '''
    Percent-encode a file name for use in a URL.

    The name is passed through canonical_filename and to_utf8 before
    quoting; urllib.quote's default safe set is used, which leaves '/'
    unescaped.
    '''
    canonical = canonical_filename(filename)
    utf8_name = to_utf8(canonical)
    return urllib.quote(utf8_name)
def uri_escape(s):
    '''
    Return s percent-encoded for embedding in a URI or IRI.

    s may be a UTF-8 byte string or a Unicode string; it is converted
    with to_utf8 before quoting. No character is treated as safe, so
    '/' is escaped too.
    '''
    encoded = to_utf8(s)
    no_safe_chars = ''
    return urllib.quote(encoded, safe=no_safe_chars)
def test(): ''' Tiny smoke test to make sure this module works. ''' for i, o in ( (None, None), ('', u''), (u'', u''), ('Hello, world!', u'Hello, world!'), (u'Hello, world!', u'Hello, world!'), (u' ', u' '), (u' foo ', u' foo '), (u'<', u'<'), (u'>', u'>'), (u'&', u'&'), (u'>', u'>'), (u'\"', u'"'), (u'\'', u'\''), (u' ', u' '), (u' ', u'\n'), (u' ', u'\r'), (u'\t', u'\t'), (u'\n', u'\n'), (u'\r\n', u'\n'), (u'\r', u'\n'), (u'
', u'\n'), (u'
', u'\n'), (u' ', u'\r'), (u'
', u'\r'), (u'line 1\nline 2\nline 3\nline 4', u'line 1\nline 2\nline 3\nline 4'), (u'line 1\rline 2\rline 3\rline 4', u'line 1\nline 2\nline 3\nline 4'), (u'line 1\r\nline 2\r\nline 3\r\nline 4', u'line 1\nline 2\nline 3\nline 4'), (u'line 1\n\rline 2\n\rline 3\n\rline 4', u'line 1\n\nline 2\n\nline 3\n\nline 4'), (u'line 1\nline 2\r\nline 3\rline 4', u'line 1\nline 2\nline 3\nline 4'), (u'line 1\nline 2 \nline 3 line 4', u'line 1\nline 2\r\nline 3\rline 4'), (u'<![CDATA[]]>', u''), (u'<![CDATA[Hello, world!]]>', u'<![CDATA[Hello, world!]]>'), (u'<![CDATA[<\"\'>&;/! \n\r\t]]>', u'<![CDATA[<\"\'>&;/! \n\n\t]]>'), (u'<![CDATA[<]]>\"\'><![CDATA[&]]>;/! \n\r\t', u'<![CDATA[<]]>"\'><![CDATA[&]]>;/! \n\n\t'), (u'\ufffd', u'\ufffd'), (u'\x85', u'\x85'), (u'\x80', u'\x80'), (u'\x7f', u'\x7f'), (u'\x9f', u'\x9f'), (u'\xa0', u'\xa0'), (u'\t\n\r\r ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd', u'\t\n\n\n ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd'), ): assert canon_xhtml(i) == o if i == o: assert verify_xhtml(i) == o else: try: verify_xhtml(i) assert "XHTML fragment verification should have failed." 
[:0] except: pass for i, o in ( (None, None), ('', u''), (u'', u''), ('Hello, world!', u'Hello, world!'), (u'Hello, world!', u'Hello, world!'), (u' ', u' '), (u' foo ', u' foo '), (u'<', u'<'), (u'>', u'>'), (u'&', u'&'), (u'>', u'>'), (u'\"', u'\"'), (u'\'', u'\''), (u' ', u' '), (u'\n', u'\n'), (u'\r', u'\n'), (u'\t', u'\t'), (u'\n', u'\n'), (u'\r\n', u'\n'), (u'line 1\nline 2\nline 3\nline 4', u'line 1\nline 2\nline 3\nline 4'), (u'line 1\rline 2\rline 3\rline 4', u'line 1\nline 2\nline 3\nline 4'), (u'line 1\r\nline 2\r\nline 3\r\nline 4', u'line 1\nline 2\nline 3\nline 4'), (u'line 1\n\rline 2\n\rline 3\n\rline 4', u'line 1\n\nline 2\n\nline 3\n\nline 4'), (u'line 1\nline 2\r\nline 3\rline 4', u'line 1\nline 2\nline 3\nline 4'), (u'<\"\'>&;/! \n\r\t', u'<\"\'>&;/! \n\n\t'), (u'\ufffd', u'\ufffd'), (u'\x85', u'\u2026'), (u'\x80', u'\u20ac'), (u'\x7f', u'\ufffd'), (u'\x9f', u'\u0178'), (u'\xa0', u'\xa0'), (u'\t\n\r\r ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd', u'\t\n\n\n ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd'), (u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}', u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}'), ('\x00', u'\ufffd'), ('\x1f', u'\ufffd'), ('\x7f', u'\ufffd'), ('\x80', u'\ufffd'), ('\x84', u'\ufffd'), ('\x86', u'\ufffd'), ('\x9f', u'\ufffd'), ('\xa0', u'\ufffd'), (u'\x80', u'\u20ac'), (u'\x84', u'\u201e'), (u'\x85', u'\u2026'), 
(u'\x86', u'\u2020'), (u'\x8f', u'\xe8'), (u'\xa0', u'\xa0'), (u'\xa1', u'\xa1'), (u'\u3000', u'\u3000'), (u'\ud800', u''.join([ u'\ufffd' for x in to_utf8(u'\ud800') ])), (u'\udbff', u''.join([ u'\ufffd' for x in to_utf8(u'\udbff') ])), (u'\udc00', u''.join([ u'\ufffd' for x in to_utf8(u'\udc00') ])), (u'\udfff', u''.join([ u'\ufffd' for x in to_utf8(u'\udfff') ])), (u'\uff21', u'\uff21'), (u'\ufffd', u'\ufffd'), (u'\uffff', u''.join([ u'\ufffd' for x in to_utf8(u'\uffff') ])), (u'\U00010000', u'\U00010000'), (u'\U0001ffff', u''.join([ u'\ufffd' for x in to_utf8(u'\U0001ffff') ])), (u'\U0010fffd', u'\U0010fffd'), (u'\U0010ffff', u''.join([ u'\ufffd' for x in to_utf8(u'\U0010ffff') ])), ('<\"\'>&;/! \0\a\b\n\r\t\v\f', u'<\"\'>&;/! \ufffd\ufffd\ufffd\n\n\t\ufffd\ufffd'), (u'\t\n\r\r ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd', u'\t\n\n\n ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd'), (u'\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f', u'\u20ac\xc5\u201a\u0192\u201e\u2026\u2020\u2021\u02c6\u2030\u0160\u2039\u0152\xe7\u017d\xe8\xea\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u02dc\u2122\u0161\u203a\u0153\xf9\u017e\u0178'), 
(u'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\udc00\udfff\ud800\udbff\ufffe\uffff\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe', u''.join([ u'\ufffd' for ch in u'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\udc00\udfff\ud800\udbff\ufffe\uffff\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe'.encode('utf-8') ])), ): assert canon_text(i) == o if to_utf8(i) == to_utf8(o): assert verify_text(i) == o else: try: verify_text(i) assert "plain text verification should have failed." 
[:0] except: pass for i, o in ( ('', ''), ('Hello, world!', 'Hello, world!'), (u'Hello, world!', 'Hello, world!'), (u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}', u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}'), ('<', '<'), ('>', '>'), ('&', '&'), ('\"', '"'), ('\'', '\''), (' ', ' '), ('\n', '\n'), ('\r', '\r'), ('\t', '\t'), ('\x00', u'\ufffd'), ('\x1f', u'\ufffd'), ('\x7f', u'\ufffd'), ('\x80', u'\ufffd'), ('\x84', u'\ufffd'), ('\x86', u'\ufffd'), ('\x9f', u'\ufffd'), ('\xa0', u'\ufffd'), (u'\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f', u'\u20ac\xc5\u201a\u0192\u201e\u2026\u2020\u2021\u02c6\u2030\u0160\u2039\u0152\xe7\u017d\xe8\xea\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u02dc\u2122\u0161\u203a\u0153\xf9\u017e\u0178'), (u'\xa0', u'\xa0'), (u'\xa1', u'\xa1'), (u'\u3000', u'\u3000'), (u'\ud800', u''.join([ u'\ufffd' for x in to_utf8(u'\ud800') ])), (u'\udbff', u''.join([ u'\ufffd' for x in to_utf8(u'\udbff') ])), (u'\udc00', u''.join([ u'\ufffd' for x in to_utf8(u'\udc00') ])), (u'\udfff', u''.join([ u'\ufffd' for x in to_utf8(u'\udfff') ])), (u'\uff21', u'\uff21'), (u'\ufffd', u'\ufffd'), (u'\uffff', u''.join([ u'\ufffd' for x in to_utf8(u'\uffff') ])), (u'\U00010000', u'\U00010000'), (u'\U0001ffff', u''.join([ u'\ufffd' for x in to_utf8(u'\U0001ffff') ])), (u'\U0010fffd', u'\U0010fffd'), (u'\U0010ffff', u''.join([ u'\ufffd' for x in to_utf8(u'\U0010ffff') ])), ('<\"\'>&;/! \0\a\b\n\r\t\v\f', u'<"\'>&;/! 
\ufffd\ufffd\ufffd\n\r\t\ufffd\ufffd'), (u'\t\n\r\r ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd', u'\t\n\r\r ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd'), (u'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\udc00\udfff\ud800\udbff\ufffe\uffff\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe', u''.join([ u'\ufffd' for ch in u'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\udc00\udfff\ud800\udbff\ufffe\uffff\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe'.encode('utf-8') ])), ): assert text_to_xhtml(i) == o assert xhtml_to_text(o) == canon_text(i) for i, o in ( ('', ''), ('Hello, world!', 'Hello, world!'), (u'Hello, world!', 'Hello, world!'), (u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}', u'\N{DOLLAR 
SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}'), ('<', '<'), ('>', '>'), ('&', '&'), ('\"', '"'), ('\'', '\''), (' ', ' '), ('\n', '\n'), ('\r', '\n'), ('\t', '\t'), ('\x00', u'\ufffd'), ('\x1f', u'\ufffd'), ('\x7f', u'\ufffd'), ('\x80', u'\ufffd'), ('\x84', u'\ufffd'), ('\x86', u'\ufffd'), ('\x9f', u'\ufffd'), ('\xa0', u'\ufffd'), (u'\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f', u'\u20ac\xc5\u201a\u0192\u201e\u2026\u2020\u2021\u02c6\u2030\u0160\u2039\u0152\xe7\u017d\xe8\xea\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u02dc\u2122\u0161\u203a\u0153\xf9\u017e\u0178'), (u'\xa0', u'\xa0'), (u'\xa1', u'\xa1'), (u'\u3000', u'\u3000'), (u'\ud800', u''.join([ u'\ufffd' for x in to_utf8(u'\ud800') ])), (u'\udbff', u''.join([ u'\ufffd' for x in to_utf8(u'\udbff') ])), (u'\udc00', u''.join([ u'\ufffd' for x in to_utf8(u'\udc00') ])), (u'\udfff', u''.join([ u'\ufffd' for x in to_utf8(u'\udfff') ])), (u'\uff21', u'\uff21'), (u'\ufffd', u'\ufffd'), (u'\uffff', u''.join([ u'\ufffd' for x in to_utf8(u'\uffff') ])), (u'\U00010000', u'\U00010000'), (u'\U0001ffff', u''.join([ u'\ufffd' for x in to_utf8(u'\U0001ffff') ])), (u'\U0010fffd', u'\U0010fffd'), (u'\U0010ffff', u''.join([ u'\ufffd' for x in to_utf8(u'\U0010ffff') ])), ('<\"\'>&;/! \0\a\b\n\r\t\v\f', u'<"\'>&;/! 
\ufffd\ufffd\ufffd\n\n\t\ufffd\ufffd'), (u'\t\n\r\r ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd', u'\t\n\n\n ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd'), (u'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\udc00\udfff\ud800\udbff\ufffe\uffff\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe', u''.join([ u'\ufffd' for ch in u'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\udc00\udfff\ud800\udbff\ufffe\uffff\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe'.encode('utf-8') ])), (u'<i>Italics</i> are permitted.', u'<em>Italics</em> are permitted.'), (u'<em>Emphasis</em> is not permitted.', u'<em>Emphasis</em> is not permitted.'), (u'A bare set of curly braces works: {}', u'A bare set of curly braces works: {}'), (u'A bare 
set of curly braces with garbage inside works: {garbage}', u'A bare set of curly braces with garbage inside works: {garbage}'), (u'The game of {*Chess} is ancient and has many variations', u'The game of <a href="tag:bittorrent.com,2006-01-01:Game/Chess">Chess</a> is ancient and has many variations'), (u'The show {#Monkeyspit} does not exist, AFAIK.', u'The show <a href="tag:bittorrent.com,2006-01-01:Show/Monkeyspit">Monkeyspit</a> does not exist, AFAIK.'), (u'{//}', u'<a href="http:///">/</a>'), (u'{/www.bittorrent.com}', u'<a href="http://www.bittorrent.com">www.bittorrent.com</a>'), (u'{/http://www.bittorrent.com}', u'<a href="http://www.bittorrent.com">http://www.bittorrent.com</a>'), (u'{/https://www.bittorrent.com}', u'<a href="https://www.bittorrent.com">https://www.bittorrent.com</a>'), (u'{/ftp://ftp.ubuntu.com}', u'<a href="ftp://ftp.ubuntu.com">ftp://ftp.ubuntu.com</a>'), (u'{/<i>italics</i>}', u'<a href="http://italics"><em>italics</em></a>'), (u'<i>{/italics</i>}', u'<i><a href="http://italics%3C/i%3E">italics</i></a>'), (u'<i>{/italics}</i>', u'<i><a href="http://italics">italics</a></i>'), (u'The show <i>{#Monkeyspit}</i> does not exist, AFAIK.', u'The show <i><a href="tag:bittorrent.com,2006-01-01:Show/Monkeyspit">Monkeyspit</a></i> does not exist, AFAIK.'), (u'The show {#<i>Monkeyspit</i>} does not exist, AFAIK.', u'The show <a href="tag:bittorrent.com,2006-01-01:Show/Monkeyspit"><em>Monkeyspit</em></a> does not exist, AFAIK.'), (u'<i>i</i>{* }{# }{^ }{= }{& }{+ }{@ }{$ }{% }{~ }{! 
}{- }{/ }{\\ }{| }', u'<em>i</em><a href="tag:bittorrent.com,2006-01-01:Game/%20"> </a><a href="tag:bittorrent.com,2006-01-01:Show/%20"> </a><a href="tag:bittorrent.com,2006-01-01:Album/%20"> </a><a href="tag:bittorrent.com,2006-01-01:DVD/%20"> </a><a href="tag:bittorrent.com,2006-01-01:Music/%20"> </a><a href="tag:bittorrent.com,2006-01-01:Show/%20"> </a><a href="tag:bittorrent.com,2006-01-01:Company/%20"> </a><a href="tag:bittorrent.com,2006-01-01:Name/%20"> </a><a href="tag:bittorrent.com,2006-01-01:Unlisted%20Name/%20"> </a><a href="tag:bittorrent.com,2006-01-01:Place/%20"> </a><a href="tag:bittorrent.com,2006-01-01:Medium/%20"> </a><a href="tag:bittorrent.com,2006-01-01:Book/%20"> </a><a href="http://%20"> </a><a href="tag:bittorrent.com,2006-01-01:Term/%20"> </a><a href="tag:bittorrent.com,2006-01-01:VHS/%20"> </a>'), ): assert dinotext_to_xhtml(i) == o pass
def test(): ''' Tiny smoke test to make sure the funcitons in this library work. ''' for i, o in ( ('', ''), ('Hello, world!', 'Hello, world!'), (u'Hello, world!', 'Hello, world!'), (u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}', u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}'.encode( 'utf-8')), ('\x00', to_utf8(u'\ufffd')), ('\x1f', to_utf8(u'\ufffd')), ('\x7f', to_utf8(u'\ufffd')), ('\x80', to_utf8(u'\ufffd')), ('\x84', to_utf8(u'\ufffd')), ('\x86', to_utf8(u'\ufffd')), ('\x9f', to_utf8(u'\ufffd')), ('\xa0', to_utf8(u'\ufffd')), (u'\x80', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\x80')])), (u'\x84', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\x84')])), (u'\x85', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\x85')])), (u'\x86', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\x86')])), (u'\x8f', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\x8f')])), (u'\xa0', to_utf8(u'\xa0')), (u'\xa1', to_utf8(u'\xa1')), (u'\u3000', to_utf8(u'\u3000')), (u'\ud800', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\ud800')])), (u'\udbff', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\udbff')])), (u'\udc00', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\udc00')])), (u'\udfff', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\udfff')])), (u'\uff21', to_utf8(u'\uff21')), (u'\ufffd', to_utf8(u'\ufffd')), (u'\uffff', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\uffff')])), (u'\U00010000', to_utf8(u'\U00010000')), (u'\U0001ffff', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\U0001ffff')])), (u'\U0010fffd', to_utf8(u'\U0010fffd')), (u'\U0010ffff', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\U0010ffff')])), ('<\"\'>&;/! \0\a\b\n\r\t\v\f', to_utf8(u'<\"\'>&;/! 
\ufffd\ufffd\ufffd\n\r\t\ufffd\ufffd')), (u'\t\n\r\r ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd', u'\t\n\r\r ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd' .encode('utf-8')), (u'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\udc00\udfff\ud800\udbff\ufffe\uffff\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe', u''.join([ u'\ufffd' for ch in 
u'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\udc00\udfff\ud800\udbff\ufffe\uffff\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe' .encode('utf-8') ]).encode('utf-8')), ): assert fix_xmlutf8(i) == o for i, o in ( ('', ''), ('Hello, world!', 'Hello, world!'), (u'Hello, world!', 'Hello, world!'), (u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}', u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}'.encode( 'utf-8')), ('<', '<'), ('>', '>'), ('&', '&'), ('\"', '"'), ('\'', '''), (' ', ' '), ('\n', ' '), ('\r', ' '), ('\t', '	'), ('\x00', to_utf8(u'\ufffd')), ('\x1f', to_utf8(u'\ufffd')), ('\x7f', to_utf8(u'\ufffd')), ('\x80', to_utf8(u'\ufffd')), ('\x84', to_utf8(u'\ufffd')), ('\x86', to_utf8(u'\ufffd')), ('\x9f', to_utf8(u'\ufffd')), ('\xa0', to_utf8(u'\ufffd')), (u'\x80', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\x80')])), (u'\x84', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\x84')])), (u'\x85', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\x85')])), (u'\x86', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\x86')])), (u'\x8f', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\x8f')])), (u'\xa0', ' '), (u'\xa1', to_utf8(u'\xa1')), (u'\u3000', ' '), (u'\ud800', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\ud800')])), (u'\udbff', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\udbff')])), (u'\udc00', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\udc00')])), (u'\udfff', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\udfff')])), (u'\uff21', to_utf8(u'\uff21')), (u'\ufffd', 
to_utf8(u'\ufffd')), (u'\uffff', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\uffff')])), (u'\U00010000', to_utf8(u'\U00010000')), (u'\U0001ffff', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\U0001ffff')])), (u'\U0010fffd', to_utf8(u'\U0010fffd')), (u'\U0010ffff', ''.join([to_utf8(u'\ufffd') for x in to_utf8(u'\U0010ffff')])), ('<\"\'>&;/! \0\a\b\n\r\t\v\f', to_utf8( u'<"'>&;/! \ufffd\ufffd\ufffd 	\ufffd\ufffd' )), (u'\t\n\r\r ~\xa0\ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd', u'	  ~ \ud7ff\ue000\ufffd\U00010000\U0001fffd\U00020000\U0002fffd\U00030000\U0003fffd\U00040000\U0004fffd\U00050000\U0005fffd\U00060000\U0006fffd\U00070000\U0007fffd\U00080000\U0008fffd\U00090000\U0009fffd\U000a0000\U000afffd\U000b0000\U000bfffd\U000c0000\U000cfffd\U000d0000\U000dfffd\U000e0000\U000efffd\U000f0000\U000ffffd\U00100000\U0010fffd' .encode('utf-8')), (u'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\udc00\udfff\ud800\udbff\ufffe\uffff\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe', u''.join([ u'\ufffd' for ch in 
u'\x00\x01\x02\x03\x04\x05\x06\x07\x08\x0b\x0c\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f\x7f\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f\udc00\udfff\ud800\udbff\ufffe\uffff\U0001fffe\U0001ffff\U0002fffe\U0002ffff\U0003fffe\U0003ffff\U0004fffe\U0004ffff\U0005fffe\U0005ffff\U0006fffe\U0006ffff\U0007fffe\U0007ffff\U0008fffe\U0008ffff\U0009fffe\U0009ffff\U000afffe\U000affff\U000bfffe\U000bffff\U000cfffe\U000cffff\U000dfffe\U000dffff\U000efffe\U000effff\U000ffffe\U000fffff\U0010fffe' .encode('utf-8') ]).encode('utf-8')), ): assert xml_escape(i) == o for i, o in ( ('hello, world!', 'hello%2C%20world%21'), (':/@[.]?=;&#', '%3A%2F%40%5B.%5D%3F%3D%3B%26%23'), ('\x00\x1f\x20\x7f\x80\xff', '%00%1F%20%7F%80%FF'), (u'\x00\x1f\x20\x7f\x80\xff\u0100\ufffd\U00010000\U0010fffd', '%00%1F%20%7F%C2%80%C3%BF%C4%80%EF%BF%BD%F0%90%80%80%F4%8F%BF%BD'), (u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}', '%24%C2%A3%E2%82%AC%C2%A5'), ): assert uri_escape(i) == o for i, o2, o in ( ('\n', r'\12', r'\n'), ('\r', r'\15', r'\r'), ('\t', r'\11', r'\t'), ('\"', r'\42', r'\"'), ('\'', r'\47', r'\''), ('\\', r'\134', r'\\'), ('A', r'\101', 'A'), (u'A', r'\101', 'A'), ('Hello, world!', '\110\145\154\154\157\54\40\167\157\162\154\144\41', 'Hello, world!'), (u'\0\a\b\n\r\t\v\f\"\\\"\'\\\'\x7f\x80\x81\xff\u0100\ufffd\U00010000\U0010fffd', ur'\0\x07\b\n\r\t\v\f\"\\\"\'\\\'\x7f\x80\x81\xff\u0100\ufffd\ud800\udc00\udbff\udffd', r'\x00\x07\x08\n\r\t\x0b\x0c\"\\\"\'\\\'\x7f\x80\x81\xff\u0100\ufffd\ud800\udc00\udbff\udffd' ), (u'\N{DOLLAR SIGN}\N{POUND SIGN}\N{EURO SIGN}\N{YEN SIGN}', ur'\u0024\u00a3\u20ac\u00a5', r'$\xa3\u20ac\xa5'), (u'<\"\'>&;/! \0\a\b\n\r\t\v\f', ur'<\"\'>&;/! \0\7\b\n\r\t\v\f', r'<\"\'>\x26;/! \x00\x07\x08\n\r\t\x0b\x0c'), (u"// This <hack /> shouldn't close the </script> tag! 
< / > Nor should this close a <![CDATA[ CDATA section ]]>", ur"// This <hack /> shouldn't close the </script> tag! < / > Nor should this close a <![CDATA[ CDATA section ]]>", r"// This <hack \x2f> shouldn\'t close the <\x2fscript> tag! < / > Nor should this close a <![CDATA[ CDATA section \x5d]>" ), ): assert js_unescape(js_escape(i)) == i assert js_escape(i) == o assert js_unescape(o) == i assert js_unescape(o2) == i
def test(): '''
    Run a quick smoke test to make sure this module still works.

    Takes no arguments and returns nothing; every check is a bare assert,
    so the first mismatch surfaces as an AssertionError.

    The checks are table-driven and run in this order:

    - canon_uri / canon_iri: each row is (unsafe, input, expected URI,
      expected IRI); both functions are called with the same input and
      the same unsafe flag.
    - canon_idn: (input, expected) pairs.
    - decode_xmlchars: (input, expected) pairs.
    - canon_email: (input, expected) pairs compared after to_utf8(),
      followed by inputs that must be rejected.
    - canon_username: (input, expected) pairs compared after to_utf8(),
      followed by rejected inputs for the default call and for
      allow_reserved=False.
    - urlify_username: (input, expected) pairs, then rejected inputs.
    - account_name_graphic: pairs of names that must map to the same
      result, then rejected inputs.
    - demoronize: (input, expected) pairs compared against to_utf8().
    - canon_lang: each tag must come back unchanged when given as-is,
      with underscores in place of hyphens, lower-cased, upper-cased and
      title-cased.

    Rejected inputs are checked by calling the function inside a
    try / bare except; any exception at all counts as a pass. If no
    exception is raised, the else branch asserts on the message sliced
    with [:0], i.e. on an empty (false) string, so the assert fails.

    NOTE(review): some table literals (for example the e-mail rows and
    the XML character-reference rows) look masked or decoded in this
    copy of the file; verify them against the upstream source.
    ''' for unsafe, uri_in, uri_out, iri_out in ( ( False, 'http://www.example.org', 'http://www.example.org', 'http://www.example.org', ), ( True, 'http://www.example.org', 'http://www.example.org', 'http://www.example.org', ), ( False, 'javascript:alert("0wn3d!")', None, None, ), ( True, 'javascript:alert("0wn3d!")', 'javascript:alert(%220wn3d!%22)', 'javascript:alert(%220wn3d!%22)', ), ( False, 'http://www.example.org/?q=monkey%20rockets', 'http://www.example.org/?q=monkey+rockets', 'http://www.example.org/?q=monkey+rockets', ), ( True, 'http://www.example.org/?q=monkey%20rockets', 'http://www.example.org/?q=monkey+rockets', 'http://www.example.org/?q=monkey+rockets', ), ( False, 'http://www.example.org/?q=monkey rockets', 'http://www.example.org/?q=monkey+rockets', 'http://www.example.org/?q=monkey+rockets', ), ( True, 'http://www.example.org/?q=monkey rockets', 'http://www.example.org/?q=monkey+rockets', 'http://www.example.org/?q=monkey+rockets', ), ( True, 'http://www.example.org/', 'http://www.example.org/', 'http://www.example.org/', ), ( True, u'http://www.example.org/', 'http://www.example.org/', 'http://www.example.org/', ), ( True, u'http://\N{LEFT-TO-RIGHT MARK}\N{RIGHT-TO-LEFT MARK}\N{LEFT-TO-RIGHT EMBEDDING}\N{RIGHT-TO-LEFT EMBEDDING}\N{LEFT-TO-RIGHT OVERRIDE}\N{RIGHT-TO-LEFT OVERRIDE}\N{POP DIRECTIONAL FORMATTING}/', 'http://%E2%80%8E%E2%80%8F%E2%80%AA%E2%80%AB%E2%80%AD%E2%80%AE%E2%80%AC/', 'http://%E2%80%8E%E2%80%8F%E2%80%AA%E2%80%AB%E2%80%AD%E2%80%AE%E2%80%AC/', ), ( True, 'http://\xe2\x80\x8e\xe2\x80\x8f\xe2\x80\xaa\xe2\x80\xab\xe2\x80\xad\xe2\x80\xae\xe2\x80\xac/', 'http://%E2%80%8E%E2%80%8F%E2%80%AA%E2%80%AB%E2%80%AD%E2%80%AE%E2%80%AC/', 'http://%E2%80%8E%E2%80%8F%E2%80%AA%E2%80%AB%E2%80%AD%E2%80%AE%E2%80%AC/', ), ( True, 'http://xn--fiqz9szqa.net/', 'http://xn--fiqz9szqa.net/', 'http://\xe4\xb8\xad\xe5\x9c\x8b\xe5\x9f\x8e.net/', ), ( True, 
'http://\xe4\xb8\xad\xe5\x9c\x8b\xe5\x9f\x8e.net/', 'http://xn--fiqz9szqa.net/', 'http://\xe4\xb8\xad\xe5\x9c\x8b\xe5\x9f\x8e.net/', ), ( True, u'http://\u4e2d\u570b\u57ce.net/', 'http://xn--fiqz9szqa.net/', 'http://\xe4\xb8\xad\xe5\x9c\x8b\xe5\x9f\x8e.net/', ), ( True, 'http://%E4%B8%AD%E5%9C%8B%E5%9F%8E.net/', 'http://xn--fiqz9szqa.net/', 'http://\xe4\xb8\xad\xe5\x9c\x8b\xe5\x9f\x8e.net/', ), ( True, 'http://xn--afiqz9szqa.net/', 'http://xn--afiqz9szqa.net/', 'http://xn--afiqz9szqa.net/', ), ( False, 'www.example.org', 'http://www.example.org', 'http://www.example.org', ), ( True, 'www.example.org', 'http://www.example.org', 'http://www.example.org', ), ( False, 'www.example.org/?q=monkey%20rockets', 'http://www.example.org/?q=monkey+rockets', 'http://www.example.org/?q=monkey+rockets', ), ( True, 'www.example.org/?q=monkey%20rockets', 'http://www.example.org/?q=monkey+rockets', 'http://www.example.org/?q=monkey+rockets', ), ( False, 'www.example.org/?q=monkey rockets', 'http://www.example.org/?q=monkey+rockets', 'http://www.example.org/?q=monkey+rockets', ), ( True, 'www.example.org/?q=monkey rockets', 'http://www.example.org/?q=monkey+rockets', 'http://www.example.org/?q=monkey+rockets', ), ( True, 'www.example.org/', 'http://www.example.org/', 'http://www.example.org/', ), ( True, u'www.example.org/', 'http://www.example.org/', 'http://www.example.org/', ), ( True, u'\N{LEFT-TO-RIGHT MARK}\N{RIGHT-TO-LEFT MARK}\N{LEFT-TO-RIGHT EMBEDDING}\N{RIGHT-TO-LEFT EMBEDDING}\N{LEFT-TO-RIGHT OVERRIDE}\N{RIGHT-TO-LEFT OVERRIDE}\N{POP DIRECTIONAL FORMATTING}/', 'http://%E2%80%8E%E2%80%8F%E2%80%AA%E2%80%AB%E2%80%AD%E2%80%AE%E2%80%AC/', 'http://%E2%80%8E%E2%80%8F%E2%80%AA%E2%80%AB%E2%80%AD%E2%80%AE%E2%80%AC/', ), ( True, '\xe2\x80\x8e\xe2\x80\x8f\xe2\x80\xaa\xe2\x80\xab\xe2\x80\xad\xe2\x80\xae\xe2\x80\xac/', 'http://%E2%80%8E%E2%80%8F%E2%80%AA%E2%80%AB%E2%80%AD%E2%80%AE%E2%80%AC/', 'http://%E2%80%8E%E2%80%8F%E2%80%AA%E2%80%AB%E2%80%AD%E2%80%AE%E2%80%AC/', ), ( True, 
'xn--fiqz9szqa.net/', 'http://xn--fiqz9szqa.net/', 'http://\xe4\xb8\xad\xe5\x9c\x8b\xe5\x9f\x8e.net/', ), ( True, '\xe4\xb8\xad\xe5\x9c\x8b\xe5\x9f\x8e.net/', 'http://xn--fiqz9szqa.net/', 'http://\xe4\xb8\xad\xe5\x9c\x8b\xe5\x9f\x8e.net/', ), ( True, u'\u4e2d\u570b\u57ce.net/', 'http://xn--fiqz9szqa.net/', 'http://\xe4\xb8\xad\xe5\x9c\x8b\xe5\x9f\x8e.net/', ), ( True, '%E4%B8%AD%E5%9C%8B%E5%9F%8E.net/', 'http://xn--fiqz9szqa.net/', 'http://\xe4\xb8\xad\xe5\x9c\x8b\xe5\x9f\x8e.net/', ), ( True, 'xn--afiqz9szqa.net/', 'http://xn--afiqz9szqa.net/', 'http://xn--afiqz9szqa.net/', ), ( True, 'xn--afiqz9szqa.net/', 'http://xn--afiqz9szqa.net/', 'http://xn--afiqz9szqa.net/', ), ): # NOTE: please fix canon_uri/canon_iri rather than commenting these out if the tests fail on your system assert canon_uri(uri=uri_in, unsafe=unsafe) == uri_out assert canon_iri(iri=uri_in, unsafe=unsafe) == iri_out pass for idn_in, idn_out in ( ( 'www.example.org', 'www.example.org', ), ( u'www.example.org', 'www.example.org', ), ( u'\N{LEFT-TO-RIGHT MARK}\N{RIGHT-TO-LEFT MARK}\N{LEFT-TO-RIGHT EMBEDDING}\N{RIGHT-TO-LEFT EMBEDDING}\N{LEFT-TO-RIGHT OVERRIDE}\N{RIGHT-TO-LEFT OVERRIDE}\N{POP DIRECTIONAL FORMATTING}', '%E2%80%8E%E2%80%8F%E2%80%AA%E2%80%AB%E2%80%AD%E2%80%AE%E2%80%AC', ), ( '\xe2\x80\x8e\xe2\x80\x8f\xe2\x80\xaa\xe2\x80\xab\xe2\x80\xad\xe2\x80\xae\xe2\x80\xac', '%E2%80%8E%E2%80%8F%E2%80%AA%E2%80%AB%E2%80%AD%E2%80%AE%E2%80%AC', ), ( 'xn--fiqz9szqa.net', '\xe4\xb8\xad\xe5\x9c\x8b\xe5\x9f\x8e.net', ), ( '\xe4\xb8\xad\xe5\x9c\x8b\xe5\x9f\x8e.net', '\xe4\xb8\xad\xe5\x9c\x8b\xe5\x9f\x8e.net', ), ( u'\u4e2d\u570b\u57ce.net', '\xe4\xb8\xad\xe5\x9c\x8b\xe5\x9f\x8e.net', ), ( '%E4%B8%AD%E5%9C%8B%E5%9F%8E.net', '\xe4\xb8\xad\xe5\x9c\x8b\xe5\x9f\x8e.net', ), ( 'xn--afiqz9szqa.net', 'xn--afiqz9szqa.net', ), ): assert canon_idn(idn=idn_in) == idn_out for xmlchars, chars in ( ( 'This is a simple test.', 'This is a simple test.', ), ( '< < << > > >> & & && " " "" \' &apos &apos'', '< < << > > >> & & && 
" " "" \' \' \'\'', ), ( '< < << > > >> & & && " " "" \' ' ''', '< < << > > >> & & && " " "" \' \' \'\'', ), ( '< < << > > >> & & && " " "" \' ' ''', '< < << > > >> & & && " " "" \' \' \'\'', ), ( '< < << > > >> & & && " " "" \' ' ''', '< < << > > >> & & && " " "" \' \' \'\'', ), ( '< < << > > >> & & && " " "" \' ' ''', '< < << > > >> & & && " " "" \' \' \'\'', ), ( '< < << > > >> & & && " " "" \' ' ''', '< < << > > >> & & && " " "" \' \' \'\'', ), ( '< < << > > >> & & && " " "" \' ' ''', '< < << > > >> & & && " " "" \' \' \'\'', ), ( '< < << > > >> & & && " " "" \' ' ''', '< < << > > >> & & && " " "" \' \' \'\'', ), ( '< < << > > >> & & && " " "" \' ' ''', '< < << > > >> & & && " " "" \' \' \'\'', ), ): assert decode_xmlchars(xmlchars) == chars for i, o in ( (u'*****@*****.**', u'*****@*****.**'), (u'*****@*****.**', u'*****@*****.**'), (u'*****@*****.**', u'*****@*****.**'), (u'*****@*****.**', u'*****@*****.**'), (u'[email protected]', u'[email protected]'), (u'[email protected]', u'[email protected]'), ( '*****@*****.**', '*****@*****.**', ), ( u'*****@*****.**', '*****@*****.**', ), ( '*****@*****.**', '*****@*****.**', ), ( '*****@*****.**', '*****@*****.**', ), ( 'Shrubbery@\xe4\xb8\xad\xe5\x9c\x8b\xe5\x9f\x8e.net', '*****@*****.**', ), ( u'Shrubbery@\u4e2d\u570b\u57ce.net', '*****@*****.**', ), ( '*****@*****.**', '*****@*****.**', ), ( '*****@*****.**', '*****@*****.**', ), ): assert to_utf8(canon_email(i)) == to_utf8(o) for i in ( '', # must be non-empty 'root', # missing '@' '@', # empty hostname and local part 'root@', # empty hostname '@127.0.0.1', # empty local part 'root@localhost', # hostname missing required '.' 
): try: canon_email(i) except: pass else: assert "canon_email should have generated an exception"[:0] for i, o in ( ('x' * 6, 'x' * 6), ('x' * 63, 'x' * 63), ('short1', 'short1'), (' strip this ', 'strip-this'), (u'zo\N{LATIN SMALL LETTER E WITH DIAERESIS}hep', 'xn--zohep-osa'), ('xn--zohep-osa', 'xn--zohep-osa'), ): assert to_utf8(canon_username(i)) == to_utf8(o) for i in ( '', # must be non-empty 'x', # too short 'x' * 64, # too long 'way too long' * 256, 'xn--foo', # characters not allowed ): try: canon_username(i) except: pass else: assert "canon_username should have generated an exception"[:0] for i in ( '', # must be non-empty 'xx', # too short 'x' * 64, # too long 'sh', 'way too long' * 256, 'xn--foo', # characters not allowed ): try: canon_username(i, allow_reserved=False) except: pass else: assert "canon_username should have generated an exception"[:0] for i, o in ( ('x' * 6, 'x' * 6), ('x' * 63, 'x' * 63), ('short1', 'short1'), (' strip this ', 'strip-this'), (u'zo\N{LATIN SMALL LETTER E WITH DIAERESIS}hep', 'zo%C3%ABhep'), ('xn--zohep-osa', 'zo%C3%ABhep'), ): assert urlify_username(i) == to_utf8(o) for i in ( '', # must be non-empty 'x', # too short 'x' * 64, # too long 'way too long' * 256, 'xn--foo', # characters not allowed ): try: urlify_username(i) except: pass else: assert "urlify_username should have generated an exception"[:0] for i, o in ( ('', ''), ('x' * 6, u'x' * 6), ('x' * 63, u'x' * 63), ('short1', u'short1'), (' strip this ', u'strip-this'), (u'zo\N{LATIN SMALL LETTER E WITH DIAERESIS}hep', 'zo\xC3\xABhep'), (u'zo\N{LATIN SMALL LETTER E WITH DIAERESIS}hep', 'z0\xC3\xABhep'), (u'zo\N{LATIN SMALL LETTER E WITH DIAERESIS}hep', 'zO\xC3\xABhep'), ('x' * 5, 'x' * 5), ('x' * 64, 'x' * 64), ('short', 'short'), ('way too long' * 256, 'way too long' * 256), ): assert account_name_graphic(i) == account_name_graphic(o) for i in ( 'xn--foo', # characters not allowed 'xn--zohep-osa', ): try: account_name_graphic(i) except: pass else: assert 
"account_name_graphic should have generated an exception"[: 0] for i, o in ( (u'', u''), ('Hello, world!', u'Hello, world!'), (u'Hello, world!', u'Hello, world!'), (u'\x00', u'\x00'), (u'\x7f', u'\x7f'), (u'\xa0', u'\xa0'), (u'\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f', u'\u20ac\xc5\u201a\u0192\u201e\u2026\u2020\u2021\u02c6\u2030\u0160\u2039\u0152\xe7\u017d\xe8\xea\u2018\u2019\u201c\u201d\u2022\u2013\u2014\u02dc\u2122\u0161\u203a\u0153\xf9\u017e\u0178' ), ): assert demoronize(i) == to_utf8(o) for i in ( 'en', 'en-US', 'en-scouse', 'en-US-tx', 'sgn-US-ma', 'i-tsolyani', 'x-37334', ): assert canon_lang(i) == i assert canon_lang('_'.join(i.split('-'))) == i assert canon_lang(i.lower()) == i assert canon_lang(i.upper()) == i assert canon_lang(i.title()) == i pass
def canon_xhtml(value):
    '''
    Canonicalizes an XHTML fragment.

    The fragment is wrapped in a <div> element in the XHTML namespace,
    parsed with xml.dom.minidom, normalized, and then every child node
    of the wrapper is serialized back to UTF-8 XML. The serialized
    pieces are concatenated and passed through to_unicode().

    value -- the XHTML fragment to canonicalize, or None.

    Returns the re-serialized fragment as produced by to_unicode(), or
    None when value is None.

    A fragment that is not well-formed XML makes the parser raise; that
    exception (xml.parsers.expat.ExpatError with the standard library
    parser) is propagated to the caller unchanged.

    FIXME: This should perform XML canonicalization.

    FIXME: This should preserve explicitly encoded whitespace.
    '''
    if value is None:
        return value
    xdoc = '<?xml version="1.0" encoding="utf-8"?>\n<div xmlns="http://www.w3.org/1999/xhtml">%s</div>' % to_utf8(value)
    dom = xml.dom.minidom.parseString(xdoc)
    try:
        dom.normalize()
        return to_unicode(''.join([
            elt.toxml(encoding='utf-8')
            for elt in dom.documentElement.childNodes
        ]))
    finally:
        # minidom trees are full of parent/child reference cycles; unlink
        # the document as soon as the serialized text has been built so the
        # memory is released promptly instead of waiting for the cyclic GC.
        dom.unlink()
r'$\xa3\u20ac\xa5'), (u'<\"\'>&;/! \0\a\b\n\r\t\v\f', ur'<\"\'>&;/! \0\7\b\n\r\t\v\f', r'<\"\'>\x26;/! \x00\x07\x08\n\r\t\x0b\x0c'), (u"// This <hack /> shouldn't close the </script> tag! < / > Nor should this close a <![CDATA[ CDATA section ]]>", ur"// This <hack /> shouldn't close the </script> tag! < / > Nor should this close a <![CDATA[ CDATA section ]]>", r"// This <hack \x2f> shouldn\'t close the <\x2fscript> tag! < / > Nor should this close a <![CDATA[ CDATA section \x5d]>"), ): assert js_unescape(js_escape(i)) == i assert js_escape(i) == o assert js_unescape(o) == i assert js_unescape(o2) == i for i, o_xhtml, o_text in ( ((None, ), None, None), (('', ), '', ''), (('x', ), '<span dir="ltr">x</span>', to_utf8(u'\N{LEFT-TO-RIGHT EMBEDDING}x\N{POP DIRECTIONAL FORMATTING}')), (('<>&<img />', ), '<span dir="ltr"><>&<img /></span>', to_utf8(u'\N{LEFT-TO-RIGHT EMBEDDING}<>&<img />\N{POP DIRECTIONAL FORMATTING}')), (('a\nb', ), '<span dir="ltr">a\nb</span>', to_utf8(u'\N{LEFT-TO-RIGHT EMBEDDING}a\N{POP DIRECTIONAL FORMATTING}\r\n\N{LEFT-TO-RIGHT EMBEDDING}b\N{POP DIRECTIONAL FORMATTING}')), (('a\nb\n', ), '<span dir="ltr">a\nb\n</span>', to_utf8(u'\N{LEFT-TO-RIGHT EMBEDDING}a\N{POP DIRECTIONAL FORMATTING}\r\n\N{LEFT-TO-RIGHT EMBEDDING}b\N{POP DIRECTIONAL FORMATTING}\r\n')), ((None, 'EN-US', ), None, None), (('', 'EN-US', ), '', ''), (('x', 'EN-US', ), '<span dir="ltr" xml:lang="en-US">x</span>', to_utf8(u'\N{LEFT-TO-RIGHT EMBEDDING}\N{LANGUAGE TAG}\N{TAG LATIN SMALL LETTER E}\N{TAG LATIN SMALL LETTER N}\N{TAG HYPHEN-MINUS}\N{TAG LATIN CAPITAL LETTER U}\N{TAG LATIN CAPITAL LETTER S}x\N{LANGUAGE TAG}\N{CANCEL TAG}\N{POP DIRECTIONAL FORMATTING}')), (('<>&<img />', 'EN-US', ), '<span dir="ltr" xml:lang="en-US"><>&<img /></span>', to_utf8(u'\N{LEFT-TO-RIGHT EMBEDDING}\N{LANGUAGE TAG}\N{TAG LATIN SMALL LETTER E}\N{TAG LATIN SMALL LETTER N}\N{TAG HYPHEN-MINUS}\N{TAG LATIN CAPITAL LETTER U}\N{TAG LATIN CAPITAL LETTER S}<>&<img />\N{LANGUAGE TAG}\N{CANCEL TAG}\N{POP 
DIRECTIONAL FORMATTING}')), (('a\nb', 'EN-US', ), '<span dir="ltr" xml:lang="en-US">a\nb</span>', to_utf8(u'\N{LEFT-TO-RIGHT EMBEDDING}\N{LANGUAGE TAG}\N{TAG LATIN SMALL LETTER E}\N{TAG LATIN SMALL LETTER N}\N{TAG HYPHEN-MINUS}\N{TAG LATIN CAPITAL LETTER U}\N{TAG LATIN CAPITAL LETTER S}a\N{LANGUAGE TAG}\N{CANCEL TAG}\N{POP DIRECTIONAL FORMATTING}\r\n\N{LEFT-TO-RIGHT EMBEDDING}\N{LANGUAGE TAG}\N{TAG LATIN SMALL LETTER E}\N{TAG LATIN SMALL LETTER N}\N{TAG HYPHEN-MINUS}\N{TAG LATIN CAPITAL LETTER U}\N{TAG LATIN CAPITAL LETTER S}b\N{LANGUAGE TAG}\N{CANCEL TAG}\N{POP DIRECTIONAL FORMATTING}')), (('a\nb\n', 'EN-US', ), '<span dir="ltr" xml:lang="en-US">a\nb\n</span>', to_utf8(u'\N{LEFT-TO-RIGHT EMBEDDING}\N{LANGUAGE TAG}\N{TAG LATIN SMALL LETTER E}\N{TAG LATIN SMALL LETTER N}\N{TAG HYPHEN-MINUS}\N{TAG LATIN CAPITAL LETTER U}\N{TAG LATIN CAPITAL LETTER S}a\N{LANGUAGE TAG}\N{CANCEL TAG}\N{POP DIRECTIONAL FORMATTING}\r\n\N{LEFT-TO-RIGHT EMBEDDING}\N{LANGUAGE TAG}\N{TAG LATIN SMALL LETTER E}\N{TAG LATIN SMALL LETTER N}\N{TAG HYPHEN-MINUS}\N{TAG LATIN CAPITAL LETTER U}\N{TAG LATIN CAPITAL LETTER S}b\N{LANGUAGE TAG}\N{CANCEL TAG}\N{POP DIRECTIONAL FORMATTING}\r\n')), ((None, None, 'ltr', ), None, None), (('', None, 'ltr', ), '', ''), (('x', None, 'ltr', ), '<span dir="ltr">x</span>', to_utf8(u'\N{LEFT-TO-RIGHT EMBEDDING}x\N{POP DIRECTIONAL FORMATTING}')), (('<>&<img />', None, 'ltr', ), '<span dir="ltr"><>&<img /></span>', to_utf8(u'\N{LEFT-TO-RIGHT EMBEDDING}<>&<img />\N{POP DIRECTIONAL FORMATTING}')), (('a\nb', None, 'ltr', ), '<span dir="ltr">a\nb</span>', to_utf8(u'\N{LEFT-TO-RIGHT EMBEDDING}a\N{POP DIRECTIONAL FORMATTING}\r\n\N{LEFT-TO-RIGHT EMBEDDING}b\N{POP DIRECTIONAL FORMATTING}')), (('a\nb\n', None, 'ltr', ), '<span dir="ltr">a\nb\n</span>', to_utf8(u'\N{LEFT-TO-RIGHT EMBEDDING}a\N{POP DIRECTIONAL FORMATTING}\r\n\N{LEFT-TO-RIGHT EMBEDDING}b\N{POP DIRECTIONAL FORMATTING}\r\n')),