def js_unescape(s):
    '''
    Turn the body of a JavaScript or ECMAScript string literal back into
    the text it denotes.

    Note that this also decodes \\v, which is not part of the ECMAScript
    standard.

    The input is converted with to_unicode(), every match of
    _js_unescape_re is decoded with Python's 'unicode-escape' codec, and
    the result is passed through fix_utf16() and to_unicode() before
    being returned.
    '''
    def _decode_escape(found):
        # Whatever _js_unescape_re matched (presumably one escape
        # sequence) is handed to the unicode-escape codec as-is.
        return found.group(0).decode('unicode-escape')

    literal = to_unicode(s)
    decoded = _js_unescape_re.sub(_decode_escape, literal)
    return to_unicode(fix_utf16(decoded))
def js_unescape(s):
    '''
    Unescape the contents of a JavaScript or ECMAScript string literal.

    Note that this also decodes \\v, which is not part of the ECMAScript
    standard.

    Steps: to_unicode() the input, replace each _js_unescape_re match
    with its 'unicode-escape' decoding, then run fix_utf16() and
    to_unicode() over the substituted text.
    '''
    def _expand(hit):
        # Decode the matched text with the unicode-escape codec.
        return hit.group(0).decode('unicode-escape')

    unescaped = _js_unescape_re.sub(_expand, to_unicode(s))
    repaired = fix_utf16(unescaped)
    return to_unicode(repaired)
def _tag_line(line):
    '''
    Wrap a single line in language tag characters.

    `xml_lang` is not a parameter; it is read from the enclosing scope.
    An empty line, or an empty/None xml_lang, yields the to_unicode()'d
    line unchanged.
    '''
    line = to_unicode(line)
    if not line or not xml_lang:
        return line

    # Shift every character of the canonical language code by the
    # distance between u'A' and TAG LATIN CAPITAL LETTER A.
    tag_chars = []
    for ch in to_unicode(_canon_lang(xml_lang)):
        tag_chars.append(unichr(
            uniord(u'\N{TAG LATIN CAPITAL LETTER A}') - uniord(u'A') + uniord(ch)))

    # LANGUAGE TAG + tag + text, closed by LANGUAGE TAG + CANCEL TAG.
    return (u'\N{LANGUAGE TAG}' + u''.join(tag_chars) + line +
            u'\N{LANGUAGE TAG}\N{CANCEL TAG}')
def embed_text(text, xml_lang=None, xhtml_dir='ltr'):
    '''
    Prepare a span of plain text for embedding in arbitrary other plain
    text.

    Each line of the span is wrapped in [RFC2482] language tags
    corresponding to xml_lang (if xml_lang is neither empty nor None)
    and a Unicode bidirectional embedding of the xhtml_dir direction
    (default is "ltr" for left-to-right; "rtl" for right-to-left is
    also allowed.) Empty lines are emitted without any wrapping.

    None is returned as None; an empty string is returned via
    to_utf8(). The result is returned as a UTF-8 string whose lines
    are joined with CRLF.

    Raises KeyError if xhtml_dir is neither 'ltr' nor 'rtl' and the
    text contains at least one non-empty line.

    References

    [RFC2482] Whistler, K. and Adams, G., "Language Tagging in Unicode
    Plain Text", RFC 2482, January 1999.
    '''
    if text is None:
        return None
    if not text:
        return to_utf8(text)
    text = to_unicode(text)

    def _tag_line(line):
        # Wrap a non-empty line in language tag characters when
        # xml_lang is set; otherwise return it unchanged.
        line = to_unicode(line)
        if line and xml_lang:
            # Shift each character of the canonical language code by
            # the distance from u'A' to TAG LATIN CAPITAL LETTER A.
            tag = u''.join([
                unichr(uniord(u'\N{TAG LATIN CAPITAL LETTER A}') - uniord(u'A') + uniord(ch))
                for ch in to_unicode(_canon_lang(xml_lang))
            ])
            line = u'\N{LANGUAGE TAG}' + tag + line + u'\N{LANGUAGE TAG}\N{CANCEL TAG}'
        return line

    def _splitlines(text):
        # Normalise CRLF and bare CR to LF, then split on LF.
        return u'\n'.join(u'\n'.join(text.split(u'\r\n')).split('\r')).split('\n')

    embeddings = {
        'ltr': u'\N{LEFT-TO-RIGHT EMBEDDING}',
        'rtl': u'\N{RIGHT-TO-LEFT EMBEDDING}',
    }

    def _embed_line(line):
        # Tag the line exactly once; an empty result means the line was
        # empty and is emitted bare.
        tagged = _tag_line(line)
        if not tagged:
            return to_utf8(line)
        # The direction lookup stays inside this branch so an invalid
        # xhtml_dir only fails when there is something to wrap.
        return to_utf8(u'%s%s\N{POP DIRECTIONAL FORMATTING}' % (
            embeddings[xhtml_dir],
            tagged,
        ))

    return '\r\n'.join([
        _embed_line(line) for line in _splitlines(to_unicode(text))
    ])
def js_escape(s):
    '''
    Escape a UTF-8 or Unicode string for use inside a JavaScript or
    ECMAScript string literal, potentially for embedding in SGML or XML
    PCDATA or CDATA.

    Every match of _js_escape_re in the to_unicode()'d input is replaced
    by an escape sequence; the result is returned via to_utf8().

    Raises AssertionError if a matched character has a code point above
    0x10ffff.
    '''
    def _js_escape_char(match):
        ch = match.group(0)
        # Look the named groups up once instead of once per test.
        groups = match.groupdict()
        if groups['pyquote']:
            return ch.encode('unicode-escape')
        if groups['backslash']:
            return r'\%s' % ch
        # The next four replacements hex-escape one character of the
        # match, presumably so markup-significant sequences never
        # appear literally in the output.
        if groups['empty']:
            return r'\x2f>'
        if groups['close']:
            return r'<\x2f'
        if groups['ccdata']:
            return r'\x5d]>'
        if groups['entityref']:
            return r'\x26'
        och = uniord(ch)
        if och > 0x10ffff:
            # Raised explicitly (not via ``assert``) so the check is
            # still enforced under ``python -O`` and carries a message.
            raise AssertionError(
                "Codepoints outside the UTF-16 range are not supported.")
        if och > 0xffff:
            # do UTF-16 encoding for chars outside the BMP
            return r'\u%04.4x\u%04.4x' % (
                ((och - 0x10000) >> 10) | 0xd800,
                ((och - 0x10000) & 0x3ff) | 0xdc00)
        if och > 0xff:
            return r'\u%04.04x' % och
        return r'\x%02.2x' % och

    return to_utf8(_js_escape_re.sub(_js_escape_char, to_unicode(s)))
def js_escape(s):
    '''
    Escape a UTF-8 or Unicode string for use inside a JavaScript or
    ECMAScript string literal, potentially for embedding in SGML or XML
    PCDATA or CDATA.

    The input is converted with to_unicode(), each _js_escape_re match
    is replaced by an escape sequence, and the result is returned via
    to_utf8().

    Raises AssertionError if a matched character has a code point above
    0x10ffff.
    '''
    def _js_escape_char(match):
        ch = match.group(0)
        # One groupdict() call serves every branch below.
        groups = match.groupdict()
        if groups['pyquote']:
            return ch.encode('unicode-escape')
        if groups['backslash']:
            return r'\%s' % ch
        # Fixed replacements: each hex-escapes one character of the
        # match (presumably to keep markup delimiters out of the
        # literal).
        if groups['empty']:
            return r'\x2f>'
        if groups['close']:
            return r'<\x2f'
        if groups['ccdata']:
            return r'\x5d]>'
        if groups['entityref']:
            return r'\x26'
        och = uniord(ch)
        if och > 0x10ffff:
            # An ``assert`` would vanish under ``python -O``; raise the
            # same exception type explicitly, with a message.
            raise AssertionError(
                "Codepoints outside the UTF-16 range are not supported.")
        if och > 0xffff:
            # do UTF-16 encoding for chars outside the BMP
            return r'\u%04.4x\u%04.4x' % (
                ((och - 0x10000) >> 10) | 0xd800,
                ((och - 0x10000) & 0x3ff) | 0xdc00)
        if och > 0xff:
            return r'\u%04.04x' % och
        return r'\x%02.2x' % och

    return to_utf8(_js_escape_re.sub(_js_escape_char, to_unicode(s)))
def embed_text(text, xml_lang=None, xhtml_dir='ltr'):
    '''
    Prepare a span of plain text for embedding in arbitrary other plain
    text.

    Each line of the span is wrapped in [RFC2482] language tags
    corresponding to xml_lang (if xml_lang is neither empty nor None)
    and a Unicode bidirectional embedding of the xhtml_dir direction
    (default is "ltr" for left-to-right; "rtl" for right-to-left is
    also allowed.) Empty lines are emitted without any wrapping.

    None is returned as None; an empty string is returned via
    to_utf8(). The result is returned as a UTF-8 string whose lines
    are joined with CRLF.

    Raises KeyError if xhtml_dir is neither 'ltr' nor 'rtl' and the
    text contains at least one non-empty line.

    References

    [RFC2482] Whistler, K. and Adams, G., "Language Tagging in Unicode
    Plain Text", RFC 2482, January 1999.
    '''
    if text is None:
        return None
    if not text:
        return to_utf8(text)
    text = to_unicode(text)

    def _tag_line(line):
        # Wrap a non-empty line in language tag characters when
        # xml_lang is set; otherwise return it unchanged.
        line = to_unicode(line)
        if line and xml_lang:
            # Shift each character of the canonical language code by
            # the distance from u'A' to TAG LATIN CAPITAL LETTER A.
            tag = u''.join([
                unichr(uniord(u'\N{TAG LATIN CAPITAL LETTER A}') - uniord(u'A') + uniord(ch))
                for ch in to_unicode(_canon_lang(xml_lang))
            ])
            line = u'\N{LANGUAGE TAG}' + tag + line + u'\N{LANGUAGE TAG}\N{CANCEL TAG}'
        return line

    def _splitlines(text):
        # Normalise CRLF and bare CR to LF, then split on LF.
        return u'\n'.join(u'\n'.join(text.split(u'\r\n')).split('\r')).split('\n')

    embeddings = {
        'ltr': u'\N{LEFT-TO-RIGHT EMBEDDING}',
        'rtl': u'\N{RIGHT-TO-LEFT EMBEDDING}',
    }

    def _embed_line(line):
        # Tag the line exactly once; an empty result means the line was
        # empty and is emitted bare.
        tagged = _tag_line(line)
        if not tagged:
            return to_utf8(line)
        # The direction lookup stays inside this branch so an invalid
        # xhtml_dir only fails when there is something to wrap.
        return to_utf8(u'%s%s\N{POP DIRECTIONAL FORMATTING}' % (
            embeddings[xhtml_dir],
            tagged,
        ))

    return '\r\n'.join([
        _embed_line(line) for line in _splitlines(to_unicode(text))
    ])
def verify_text(value):
    '''
    Check that plain text is already in canonical form.

    None is passed straight through. Any other value is converted with
    to_unicode() and compared with canon_text() of itself; if they
    differ, ValueError is raised. Returns the converted value.
    '''
    if value is None:
        return value

    text = to_unicode(value)
    canonical = canon_text(text)
    if text != canonical:
        raise ValueError("plain text is not in canonical form")
    return text
def text_to_xhtml(value):
    '''
    Convert plain text to an XHTML fragment.

    Converts C1 control characters as if they were Windows-1252 or
    MacRoman codepoints rather than ISO-8859-1 codepoints. Converts
    characters not allowed in XHTML to the Unicode replacement
    character.

    A non-None value goes through demoronize(), xml_escape() and
    to_unicode(), in that order; the result (or None) is then handed to
    canon_xhtml(), whose return value is returned.
    '''
    if value is not None:
        cleaned = demoronize(value)
        escaped = xml_escape(cleaned)
        value = to_unicode(escaped)
    return canon_xhtml(value)
def canon_xhtml(value):
    '''
    Canonicalize an XHTML fragment.

    The fragment is wrapped in an XHTML-namespaced <div>, parsed with
    xml.dom.minidom, normalized, and the <div>'s children are
    serialized back to UTF-8 and returned via to_unicode(). None is
    returned unchanged.

    FIXME: This should perform XML canonicalization.

    FIXME: This should preserve explicitly encoded whitespace.
    '''
    if value is None:
        return value

    # A fragment may have several top-level nodes; the wrapper <div>
    # gives the parser a single document element.
    wrapped = (
        '<?xml version="1.0" encoding="utf-8"?>\n'
        '<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
    ) % to_utf8(value)
    document = xml.dom.minidom.parseString(wrapped)
    document.normalize()

    # Serialize only the wrapper's children, dropping the wrapper.
    pieces = []
    for child in document.documentElement.childNodes:
        pieces.append(child.toxml(encoding='utf-8'))
    return to_unicode(''.join(pieces))
def xml_escape(s):
    '''
    Serialize a UTF-8 or Unicode string for use inside a UTF-8 XML or
    XHTML document.

    Characters not allowed in XML or XHTML documents are converted to
    the Unicode replacement character. Whitespace characters are
    encoded as numeric character references.

    The input passes through fix_xmlutf8() and to_unicode() and is then
    scanned with _xml_escapes_re; the rendered pieces are concatenated
    and returned.
    '''
    def _render(found):
        token = found.group(0)
        if found.group(1):
            # Group 1: escape with cgi.escape, quotes included.
            return cgi.escape(token.encode('utf-8', 'replace'), quote=True)
        if found.group(2):
            # Group 2: emit a decimal numeric character reference.
            return '&#%d;' % uniord(token)
        # Anything else is emitted as plain UTF-8.
        return token.encode('utf-8', 'replace')

    source = to_unicode(fix_xmlutf8(s))
    return ''.join([_render(m) for m in _xml_escapes_re.finditer(source)])
def canon_xhtml(value):
    '''
    Canonicalize an XHTML fragment.

    None is returned unchanged. Otherwise the fragment is embedded in
    an XHTML-namespaced <div>, parsed with xml.dom.minidom and
    normalized; the <div>'s child nodes are then serialized as UTF-8,
    joined, and returned via to_unicode().

    FIXME: This should perform XML canonicalization.

    FIXME: This should preserve explicitly encoded whitespace.
    '''
    if value is None:
        return value

    # The wrapper <div> provides the single root element a document
    # needs.
    template = (
        '<?xml version="1.0" encoding="utf-8"?>\n'
        '<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
    )
    tree = xml.dom.minidom.parseString(template % to_utf8(value))
    tree.normalize()

    # Emit just the children so the wrapper is not part of the result.
    serialized = []
    for node in tree.documentElement.childNodes:
        serialized.append(node.toxml(encoding='utf-8'))
    return to_unicode(''.join(serialized))
def xml_escape(s):
    '''
    Serialize a UTF-8 or Unicode string for use inside a UTF-8 XML or
    XHTML document.

    Characters not allowed in XML or XHTML documents are converted to
    the Unicode replacement character. Whitespace characters are
    encoded as numeric character references.

    The input is run through fix_xmlutf8() and to_unicode(), tokenized
    with _xml_escapes_re, and each token is rendered and appended to
    the returned string.
    '''
    def _encode_token(hit):
        text = hit.group(0)
        if hit.group(1):
            # Group 1 tokens are escaped by cgi.escape, quotes included.
            return cgi.escape(text.encode('utf-8', 'replace'), quote=True)
        if hit.group(2):
            # Group 2 tokens become decimal numeric character
            # references.
            return '&#%d;' % uniord(text)
        # Remaining tokens are encoded to UTF-8 unchanged.
        return text.encode('utf-8', 'replace')

    prepared = to_unicode(fix_xmlutf8(s))
    return ''.join([_encode_token(hit) for hit in _xml_escapes_re.finditer(prepared)])
def account_name_graphic(value, prefix_reserved=(u'xn--',), full_name_reserved=()):
    '''
    Transform an account name for graphic form collation.

    value is converted with to_unicode(), NFC-normalized, stripped, has
    its whitespace runs collapsed to u'-', and is mapped through
    username_premap. The same strip/collapse/premap steps are then
    repeated using confuse()'d separators and a confuse()'d copy of the
    premap, and the whole value is finally passed through confuse().

    prefix_reserved and full_name_reserved are iterables of names; they
    are compared after confuse() has been applied to each entry.

    Returns the transformed value, or None if value is None.

    Raises ValueError if the transformed value starts with a reserved
    prefix or equals a reserved full name.
    '''
    # The confuse()'d premap and its regex are built lazily and cached
    # in module globals.
    global _account_name_graphic_premap, _account_name_graphic_premap_re

    if value is None:
        return value

    from BTL.canonical.identifier import confuse

    value = to_unicode(value)
    value = unicodedata.normalize('NFC', value)
    value = value.strip()
    value = u'-'.join(value.split())
    value = username_premap_re.sub(lambda m: username_premap[m.group(0)], value)

    # The premap substitution may have changed the edges or whitespace,
    # so strip and collapse again, now with the confuse()'d separator.
    value = value.strip()
    value = confuse(u'-').join(value.split())

    if _account_name_graphic_premap is None:
        _account_name_graphic_premap = dict([
            (confuse(k), confuse(v)) for k, v in username_premap.iteritems()
        ])
    if _account_name_graphic_premap_re is None:
        _account_name_graphic_premap_re = re.compile(
            u'|'.join([
                u'(?:%s)' % re.escape(i) for i in _account_name_graphic_premap
            ]),
            re.UNICODE)
    value = _account_name_graphic_premap_re.sub(
        lambda m: _account_name_graphic_premap[m.group(0)], value)
    value = confuse(value)

    for prefix in prefix_reserved:
        if value.startswith(confuse(prefix)):
            raise ValueError(
                'The requested user name is reserved. It starts with something a lot like ' + prefix)
    for full_name in full_name_reserved:
        if value == confuse(full_name):
            raise ValueError(
                'The requested user name is reserved. It is too much like ' + full_name)
    return value
def _tag_line(line):
    '''
    Surround one line with language tag characters.

    `xml_lang` comes from the enclosing scope rather than a parameter.
    If the line is empty, or xml_lang is empty/None, the to_unicode()'d
    line is returned as-is.
    '''
    line = to_unicode(line)
    if not line or not xml_lang:
        return line

    # Each character of the canonical language code is moved by the
    # offset between u'A' and TAG LATIN CAPITAL LETTER A.
    shifted = []
    for ch in to_unicode(_canon_lang(xml_lang)):
        shifted.append(unichr(
            uniord(u'\N{TAG LATIN CAPITAL LETTER A}') - uniord(u'A') + uniord(ch)))
    tag = u''.join(shifted)

    # Opening LANGUAGE TAG + code, then the text, then LANGUAGE TAG +
    # CANCEL TAG to close.
    return u'\N{LANGUAGE TAG}' + tag + line + u'\N{LANGUAGE TAG}\N{CANCEL TAG}'