Ejemplo n.º 1
0
def js_unescape(s):
    '''
    Decode the escape sequences found in the body of a JavaScript or
    ECMAScript string literal.

    The input is converted with to_unicode(); every match of the
    module-level _js_unescape_re pattern is decoded through the
    'unicode-escape' codec; the outcome is passed through fix_utf16()
    and to_unicode() and returned.

    Note that this also decodes \\v, which is not part of the
    ECMAScript standard.
    '''
    def _decode_escape(found):
        # The matched text is itself a valid Python escape sequence.
        escape_text = found.group(0)
        return escape_text.decode('unicode-escape')

    source = to_unicode(s)
    unescaped = _js_unescape_re.sub(_decode_escape, source)
    return to_unicode(fix_utf16(unescaped))
Ejemplo n.º 2
0
def js_unescape(s):
    '''
    Turn the body of a JavaScript or ECMAScript string literal back
    into the text it denotes.

    Processing steps: to_unicode() on the input, replacement of each
    _js_unescape_re match by its 'unicode-escape' decoding, then
    fix_utf16() and a final to_unicode() on the result.

    Note that this also decodes \\v, which is not part of the
    ECMAScript standard.
    '''
    def _expand(found):
        # Let Python's own escape decoder interpret the matched escape.
        raw_escape = found.group(0)
        return raw_escape.decode('unicode-escape')

    literal_body = to_unicode(s)
    expanded = _js_unescape_re.sub(_expand, literal_body)
    repaired = fix_utf16(expanded)
    return to_unicode(repaired)
Ejemplo n.º 3
0
 def _tag_line(line):
     # Surround a non-empty line with language-tag characters built from
     # xml_lang (a name taken from the enclosing scope).  An empty line,
     # or any line when xml_lang is falsy, is returned after to_unicode()
     # with nothing added.
     text = to_unicode(line)
     if not (text and xml_lang):
         return text
     lang = to_unicode(_canon_lang(xml_lang))
     # Distance between a character and its TAG counterpart, measured
     # from 'A' to TAG LATIN CAPITAL LETTER A.
     shift = uniord(u'\N{TAG LATIN CAPITAL LETTER A}') - uniord(u'A')
     tag_chars = []
     for ch in lang:
         tag_chars.append(unichr(shift + uniord(ch)))
     opening = u'\N{LANGUAGE TAG}' + u''.join(tag_chars)
     return opening + text + u'\N{LANGUAGE TAG}\N{CANCEL TAG}'
Ejemplo n.º 4
0
def embed_text(text, xml_lang=None, xhtml_dir='ltr'):
    '''
    Prepare a span of plain text for embedding in arbitrary other
    plain text.  Each non-empty line of the span is wrapped in
    [RFC2482] language tags corresponding to xml_lang (if xml_lang is
    neither empty nor None) and in a Unicode bidirectional embedding of
    the xhtml_dir direction (default is "ltr" for left-to-right; "rtl"
    for right-to-left is also allowed.)

    Lines are split on CR LF, CR or LF and re-joined with CR LF.  Empty
    lines are passed through without any tagging or embedding.

    None is returned as None; an empty string is returned through
    to_utf8() unchanged otherwise.

    The result is returned as a UTF-8 string.

    Raises KeyError if xhtml_dir is neither 'ltr' nor 'rtl' and the
    text contains at least one non-empty line.

    References

    [RFC2482] Whistler, K. and Adams, G., "Language Tagging in Unicode
    Plain Text", RFC 2482, January 1999.
    '''
    if text is None:
        return None
    if not text:
        return to_utf8(text)
    text = to_unicode(text)

    # Built once per call instead of once per line.
    embedding_marks = {
        'ltr': u'\N{LEFT-TO-RIGHT EMBEDDING}',
        'rtl': u'\N{RIGHT-TO-LEFT EMBEDDING}',
    }

    def _tag_line(line):
        # Wrap a non-empty line in language-tag characters for xml_lang.
        line = to_unicode(line)
        if line and xml_lang:
            tag = u''.join([
                unichr(
                    uniord(u'\N{TAG LATIN CAPITAL LETTER A}') - uniord(u'A') +
                    uniord(ch)) for ch in to_unicode(_canon_lang(xml_lang))
            ])
            line = u'\N{LANGUAGE TAG}' + tag + line + u'\N{LANGUAGE TAG}\N{CANCEL TAG}'
        return line

    def _splitlines(text):
        # Normalise CR LF and bare CR to LF, then split on LF.
        return text.replace(u'\r\n', u'\n').replace(u'\r', u'\n').split(u'\n')

    def _embed_line(line):
        # Tag the line once; empty lines get no directional embedding.
        tagged = _tag_line(line)
        if not tagged:
            return to_utf8(line)
        # The direction is looked up lazily so that an unknown xhtml_dir
        # only fails when there is actually a line to embed.
        return to_utf8(u'%s%s\N{POP DIRECTIONAL FORMATTING}' % (
            embedding_marks[xhtml_dir],
            tagged,
        ))

    return '\r\n'.join([_embed_line(line) for line in _splitlines(text)])
Ejemplo n.º 5
0
def js_escape(s):
    '''
    Escape a UTF-8 or Unicode string for use inside a JavaScript or
    ECMAScript string literal, potentially for embedding in SGML or
    XML PCDATA or CDATA.

    The input is converted with to_unicode(), each match of the
    module-level _js_escape_re pattern is replaced by an escaped form,
    and the result is returned through to_utf8().

    Raises AssertionError if a matched character has a code point above
    0x10ffff.
    '''
    def _js_escape_char(match):
        ch = match.group(0)
        # Fetch the named groups once; the pattern that defines them
        # lives outside this function.
        groups = match.groupdict()
        if groups['pyquote']:
            return ch.encode('unicode-escape')
        if groups['backslash']:
            return r'\%s' % ch
        if groups['empty']:
            # '/' is emitted as a hex escape, the '>' stays literal.
            return r'\x2f>'
        if groups['close']:
            # '<' stays literal, '/' is emitted as a hex escape.
            return r'<\x2f'
        if groups['ccdata']:
            # First ']' is emitted as a hex escape.
            return r'\x5d]>'
        if groups['entityref']:
            # '&' is emitted as a hex escape.
            return r'\x26'
        och = uniord(ch)
        if och > 0x10ffff:
            # Raised explicitly so the check survives "python -O" and
            # the message is actually reported.
            raise AssertionError(
                "Codepoints outside the UTF-16 range are not supported.")
        if och > 0xffff:
            # do UTF-16 encoding for chars outside the BMP
            return r'\u%04.4x\u%04.4x' % (
                ((och - 0x10000) >> 10) | 0xd800,
                ((och - 0x10000) & 0x3ff) | 0xdc00)
        if och > 0xff:
            return r'\u%04.04x' % och
        return r'\x%02.2x' % och

    return to_utf8(_js_escape_re.sub(_js_escape_char, to_unicode(s)))
Ejemplo n.º 6
0
def js_escape(s):
    '''
    Escape a UTF-8 or Unicode string for use inside a JavaScript or
    ECMAScript string literal, potentially for embedding in SGML or
    XML PCDATA or CDATA.

    Every match of the module-level _js_escape_re pattern in the
    to_unicode() form of the input is replaced by an escaped
    equivalent; the result is returned through to_utf8().

    Raises AssertionError if a matched character has a code point above
    0x10ffff.
    '''
    def _js_escape_char(match):
        ch = match.group(0)
        # One groupdict() call per match instead of one per test.
        named = match.groupdict()
        if named['pyquote']:
            return ch.encode('unicode-escape')
        if named['backslash']:
            return r'\%s' % ch
        if named['empty']:
            # Hex-escape the '/', keep the '>'.
            return r'\x2f>'
        if named['close']:
            # Keep the '<', hex-escape the '/'.
            return r'<\x2f'
        if named['ccdata']:
            # Hex-escape the first ']'.
            return r'\x5d]>'
        if named['entityref']:
            # Hex-escape the '&'.
            return r'\x26'
        och = uniord(ch)
        if och > 0x10ffff:
            # An explicit raise is not stripped by "python -O" and
            # carries the explanation as the exception message.
            raise AssertionError(
                "Codepoints outside the UTF-16 range are not supported.")
        if och > 0xffff:
            # do UTF-16 encoding for chars outside the BMP
            return r'\u%04.4x\u%04.4x' % (((och - 0x10000) >> 10) | 0xd800,
                                          ((och - 0x10000) & 0x3ff) | 0xdc00)
        if och > 0xff:
            return r'\u%04.04x' % och
        return r'\x%02.2x' % och

    return to_utf8(_js_escape_re.sub(_js_escape_char, to_unicode(s)))
Ejemplo n.º 7
0
def embed_text(text, xml_lang=None, xhtml_dir='ltr'):
    '''
    Prepare a span of plain text for embedding in arbitrary other
    plain text.  Each non-empty line of the span is wrapped in
    [RFC2482] language tags corresponding to xml_lang (if xml_lang is
    neither empty nor None) and in a Unicode bidirectional embedding of
    the xhtml_dir direction (default is "ltr" for left-to-right; "rtl"
    for right-to-left is also allowed.)

    Lines are split on CR LF, CR or LF and re-joined with CR LF.  Empty
    lines are passed through without any tagging or embedding.

    None is returned as None; an empty string is returned through
    to_utf8() unchanged otherwise.

    The result is returned as a UTF-8 string.

    Raises KeyError if xhtml_dir is neither 'ltr' nor 'rtl' and the
    text contains at least one non-empty line.

    References

    [RFC2482] Whistler, K. and Adams, G., "Language Tagging in Unicode
    Plain Text", RFC 2482, January 1999.
    '''
    if text is None:
        return None
    if not text:
        return to_utf8(text)
    text = to_unicode(text)

    # Direction name -> opening embedding character; built once per call.
    openers = {
        'ltr': u'\N{LEFT-TO-RIGHT EMBEDDING}',
        'rtl': u'\N{RIGHT-TO-LEFT EMBEDDING}',
    }

    def _tag_line(line):
        # Wrap a non-empty line in language-tag characters for xml_lang.
        line = to_unicode(line)
        if line and xml_lang:
            tag = u''.join([
                unichr(uniord(u'\N{TAG LATIN CAPITAL LETTER A}') -
                       uniord(u'A') + uniord(ch))
                for ch in to_unicode(_canon_lang(xml_lang))
            ])
            line = u'\N{LANGUAGE TAG}' + tag + line + u'\N{LANGUAGE TAG}\N{CANCEL TAG}'
        return line

    def _splitlines(text):
        # Normalise CR LF and bare CR to LF, then split on LF.
        return text.replace(u'\r\n', u'\n').replace(u'\r', u'\n').split(u'\n')

    encoded_lines = []
    for line in _splitlines(text):
        # Tag each line exactly once.
        tagged = _tag_line(line)
        if tagged:
            # Lazy lookup: an unknown xhtml_dir only fails when there is
            # a non-empty line to embed.
            encoded_lines.append(
                to_utf8(u'%s%s\N{POP DIRECTIONAL FORMATTING}' % (
                    openers[xhtml_dir],
                    tagged,
                )))
        else:
            # Empty lines are emitted without any embedding characters.
            encoded_lines.append(to_utf8(line))
    return '\r\n'.join(encoded_lines)
Ejemplo n.º 8
0
def verify_text(value):
    '''
    Check that plain text is already in canonical form.

    A value other than None is first converted with to_unicode().  The
    (possibly converted) value is compared with canon_text() applied to
    it and returned when the two are equal.

    Raises ValueError when the value differs from its canonical form.
    '''
    if value is None:
        text = None
    else:
        text = to_unicode(value)
    canonical = canon_text(text)
    if text != canonical:
        raise ValueError("plain text is not in canonical form")
    return text
Ejemplo n.º 9
0
def verify_text(value):
    '''
    Return the given plain text if it is in canonical form.

    None is left as it is; any other value goes through to_unicode()
    before the check.  The check compares the value against the result
    of canon_text() for that same value.

    Raises ValueError when the comparison shows a difference.
    '''
    candidate = value
    if candidate is not None:
        candidate = to_unicode(candidate)
    expected = canon_text(candidate)
    if candidate != expected:
        raise ValueError("plain text is not in canonical form")
    return candidate
Ejemplo n.º 10
0
def text_to_xhtml(value):
    '''
    Converts plain text to an XHTML fragment.

    A value other than None is passed through demoronize(), then
    xml_escape(), then to_unicode(); the result (or None) is finally
    handed to canon_xhtml(), whose return value is returned.

    Converts C1 control characters as if they were Windows-1252 or
    MacRoman codepoints rather than ISO-8859-1 codepoints.

    Converts characters not allowed in XHTML to the Unicode replacement character.
    '''
    if value is None:
        return canon_xhtml(value)
    cleaned = demoronize(value)
    escaped = xml_escape(cleaned)
    return canon_xhtml(to_unicode(escaped))
Ejemplo n.º 11
0
def text_to_xhtml(value):
    '''
    Converts plain text to an XHTML fragment.

    Pipeline for a value other than None: demoronize(), xml_escape(),
    to_unicode().  canon_xhtml() is then applied in every case,
    including None.

    Converts C1 control characters as if they were Windows-1252 or
    MacRoman codepoints rather than ISO-8859-1 codepoints.

    Converts characters not allowed in XHTML to the Unicode replacement character.
    '''
    fragment = value
    if fragment is not None:
        fragment = demoronize(fragment)
        fragment = xml_escape(fragment)
        fragment = to_unicode(fragment)
    return canon_xhtml(fragment)
Ejemplo n.º 12
0
def canon_xhtml(value):
    '''
    Canonicalizes an XHTML fragment.

    The fragment is wrapped in an XHTML-namespaced <div>, parsed with
    xml.dom.minidom, normalized, and the wrapper's child nodes are
    serialized again and concatenated.  None is returned unchanged;
    malformed input makes the parser raise.

    FIXME: This should perform XML canonicalization.

    FIXME: This should preserve explicitly encoded whitespace.
    '''
    if value is None:
        return value
    wrapper = '<?xml version="1.0" encoding="utf-8"?>\n<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
    document = xml.dom.minidom.parseString(wrapper % to_utf8(value))
    document.normalize()
    # Serialize only what is inside the wrapper element.
    pieces = []
    for node in document.documentElement.childNodes:
        pieces.append(node.toxml(encoding='utf-8'))
    return to_unicode(''.join(pieces))
Ejemplo n.º 13
0
def xml_escape(s):
    '''
    Serialize a UTF-8 or Unicode string for use inside a UTF-8 XML or XHTML document.

    Characters not allowed in XML or XHTML documents are converted to the Unicode replacement character.

    Whitespace characters are encoded as numeric character references.

    The input goes through fix_xmlutf8() and to_unicode() and is then
    scanned with the module-level _xml_escapes_re pattern: a match with
    group 1 set is UTF-8 encoded and passed to cgi.escape() with
    quoting enabled, a match with group 2 set becomes a decimal
    character reference, and any other match is just UTF-8 encoded.
    '''
    def _render(found):
        piece = found.group(0)
        if found.group(1):
            return cgi.escape(piece.encode('utf-8', 'replace'), quote=True)
        if found.group(2):
            return '&#%d;' % uniord(piece)
        return piece.encode('utf-8', 'replace')

    matches = _xml_escapes_re.finditer(to_unicode(fix_xmlutf8(s)))
    return ''.join([_render(found) for found in matches])
Ejemplo n.º 14
0
def canon_xhtml(value):
    '''
    Canonicalizes an XHTML fragment.

    A value other than None is embedded in a <div> carrying the XHTML
    namespace, parsed and normalized with xml.dom.minidom, and rebuilt
    from the serialized children of that <div>.  None is passed
    through; input that does not parse raises the parser's exception.

    FIXME: This should perform XML canonicalization.

    FIXME: This should preserve explicitly encoded whitespace.
    '''
    if value is None:
        return value
    prologue = '<?xml version="1.0" encoding="utf-8"?>\n<div xmlns="http://www.w3.org/1999/xhtml">%s</div>'
    xdoc = prologue % to_utf8(value)
    tree = xml.dom.minidom.parseString(xdoc)
    tree.normalize()
    children = tree.documentElement.childNodes
    # The wrapper <div> itself is not part of the result.
    serialized = ''.join([child.toxml(encoding='utf-8') for child in children])
    return to_unicode(serialized)
Ejemplo n.º 15
0
def xml_escape(s):
    '''
    Serialize a UTF-8 or Unicode string for use inside a UTF-8 XML or XHTML document.

    Characters not allowed in XML or XHTML documents are converted to the Unicode replacement character.

    Whitespace characters are encoded as numeric character references.

    Each match of the module-level _xml_escapes_re pattern over the
    fix_xmlutf8()/to_unicode() form of the input yields one output
    piece: cgi.escape() of the UTF-8 bytes when group 1 matched, a
    decimal character reference when group 2 matched, and the plain
    UTF-8 bytes otherwise.
    '''
    prepared = to_unicode(fix_xmlutf8(s))
    pieces = []
    for found in _xml_escapes_re.finditer(prepared):
        token = found.group(0)
        if found.group(1):
            piece = cgi.escape(token.encode('utf-8', 'replace'), quote=True)
        elif found.group(2):
            piece = '&#%d;' % uniord(token)
        else:
            piece = token.encode('utf-8', 'replace')
        pieces.append(piece)
    return ''.join(pieces)
Ejemplo n.º 16
0
def account_name_graphic(value,
                         prefix_reserved=[u'xn--'],
                         full_name_reserved=[]):
    '''
    Transform an account name for graphic form collation.

    None is returned unchanged.  Any other value is converted with
    to_unicode(), NFC-normalized, stripped, has its whitespace runs
    replaced by '-', is rewritten through username_premap, and is then
    mapped through confuse() (imported from BTL.canonical.identifier;
    presumably a confusable-character folding -- confirm there).

    The confuse()-mapped copy of username_premap and the regular
    expression built from its keys are cached in module globals the
    first time they are needed.

    Raises ValueError if the transformed name starts with the
    confuse() form of an entry of prefix_reserved, or equals the
    confuse() form of an entry of full_name_reserved.  Both arguments
    are only iterated, never modified.
    '''
    global _account_name_graphic_premap, _account_name_graphic_premap_re
    if value is None:
        return value
    from BTL.canonical.identifier import confuse

    # Plain-text clean-up: NFC, trim, collapse whitespace to '-', premap.
    name = unicodedata.normalize('NFC', to_unicode(value))
    name = u'-'.join(name.strip().split())
    name = username_premap_re.sub(
        lambda found: username_premap[found.group(0)], name)

    # The premap may have produced new whitespace; collapse it again,
    # this time joining with the confuse() form of '-'.
    name = name.strip()
    name = confuse(u'-').join(name.split())

    # Lazily build the confuse()-space premap and its matching pattern.
    if _account_name_graphic_premap is None:
        _account_name_graphic_premap = dict(
            (confuse(source), confuse(target))
            for source, target in username_premap.iteritems())
    if _account_name_graphic_premap_re is None:
        alternatives = [
            u'(?:%s)' % re.escape(key)
            for key in _account_name_graphic_premap
        ]
        _account_name_graphic_premap_re = re.compile(
            u'|'.join(alternatives), re.UNICODE)

    def _graphic_premap(found):
        return _account_name_graphic_premap[found.group(0)]

    name = confuse(_account_name_graphic_premap_re.sub(_graphic_premap, name))

    # Reject names that collide with reserved prefixes or full names.
    for prefix in prefix_reserved:
        if name.startswith(confuse(prefix)):
            raise ValueError(
                'The requested user name is reserved.  It starts with something a lot like '
                + prefix)
    for full_name in full_name_reserved:
        if name == confuse(full_name):
            raise ValueError(
                'The requested user name is reserved.  It is too much like '
                + full_name)
    return name
Ejemplo n.º 17
0
 def _tag_line(line):
     # Return the line (after to_unicode) unchanged when it is empty or
     # when xml_lang, taken from the enclosing scope, is falsy.
     # Otherwise prefix it with LANGUAGE TAG plus the TAG-character
     # spelling of the canonical language, and close it with
     # LANGUAGE TAG + CANCEL TAG.
     text = to_unicode(line)
     if not (text and xml_lang):
         return text
     lang = to_unicode(_canon_lang(xml_lang))
     # Offset from 'A' to TAG LATIN CAPITAL LETTER A, applied to each
     # character of the language code.
     offset = uniord(u'\N{TAG LATIN CAPITAL LETTER A}') - uniord(u'A')
     tag = u''.join([unichr(offset + uniord(ch)) for ch in lang])
     return u'\N{LANGUAGE TAG}' + tag + text + u'\N{LANGUAGE TAG}\N{CANCEL TAG}'