Example #1
0
 def utf8_to_code(text, encoding):
     """Re-encode UTF-8 text into the named target encoding.

     text     -- a UTF-8 encoded string
     encoding -- name of the target encoding; compared case-insensitively

     Returns text unchanged when the target encoding is UTF-8, otherwise
     the result of converting it through wstring.
     """
     # Use the string method rather than string.upper(): the module-level
     # function is deprecated and no longer exists in Python 3.
     encoding = encoding.upper()
     if encoding == 'UTF-8':
         return text
     from xml.unicode.iso8859 import wstring
     wstring.install_alias('ISO-8859-1', 'ISO_8859-1:1987')
     # Note: this passes straight through to wstrop, so characters that are
     # not available in the target encoding are not escaped.
     ws = wstring.from_utf8(text)
     # ws.encode(encoding, 1) would instead skip all untranslatable
     # characters; see wstrop.c.
     return ws.encode(encoding)
Example #2
0
 def utf8_to_code(text, encoding):
     """Convert a UTF-8 encoded string to another encoding.

     text     -- the UTF-8 encoded input string
     encoding -- target encoding name (matched case-insensitively)

     When the target is UTF-8 the input is returned as-is; any other
     target is handled by wstring.
     """
     # str.upper() replaces the deprecated string.upper() helper, which was
     # removed in Python 3; the result is the same.
     encoding = encoding.upper()
     if encoding == 'UTF-8':
         return text
     from xml.unicode.iso8859 import wstring
     wstring.install_alias('ISO-8859-1', 'ISO_8859-1:1987')
     # Note: pass-through to wstrop.  Characters missing from the target
     # encoding are not escaped.
     ws = wstring.from_utf8(text)
     # Alternative: ws.encode(encoding, 1) skips all untranslatable
     # characters (see wstrop.c).
     return ws.encode(encoding)
Example #3
0
            text = unicode(text, "utf-8")
        return encoder(text)[0] # result,size

    def ConvertChar(m):
        """Return the HTML entity reference for the character matched by m.

        The entity name is looked up in HTML_CHARACTER_ENTITIES using the
        ordinal of the matched character.
        """
        code_point = ord(m.group())
        entity_name = HTML_CHARACTER_ENTITIES[code_point]
        return '&' + entity_name + ';'

    def UseHtmlCharEntities(text):
        """Replace characters matching g_htmlUniCharEntityPattern with entities.

        Input that is not already a unicode object is first decoded as
        UTF-8.  Each match is rewritten by ConvertChar; only the resulting
        text is returned.
        """
        if type(text) is not UnicodeType:
            text = unicode(text, "utf-8")
        # subn() yields (new_text, substitution_count); the count is unused.
        substituted = re.subn(g_htmlUniCharEntityPattern, ConvertChar, text)
        return substituted[0]

except ImportError:
    from xml.unicode.iso8859 import wstring
    wstring.install_alias('ISO-8859-1', 'ISO_8859-1:1987')

    def utf8_to_code(text, encoding):
        """Re-encode UTF-8 text into the named target encoding via wstring.

        text     -- a UTF-8 encoded string
        encoding -- name of the target encoding; compared case-insensitively

        Returns text unchanged when the target encoding is UTF-8.
        """
        # Use the string method rather than string.upper(): the module-level
        # function is deprecated and no longer exists in Python 3.
        encoding = encoding.upper()
        if encoding == 'UTF-8':
            return text
        # Note: this passes straight through to wstrop, so characters that
        # are not available in the target encoding are not escaped.
        ws = wstring.from_utf8(text)
        # ws.encode(encoding, 1) would instead skip all untranslatable
        # characters; see wstrop.c.
        return ws.encode(encoding)

    def ConvertChar(m):
        char = ((int(ord(m.group(1))) & 0x03) << 6) | (int(ord(m.group(2))) & 0x3F)
Example #4
0
from xml.dom.html import HTML_DTD, HTML_CHARACTER_ENTITIES

DEFAULT_CHARSET = 'ISO-8859-1'

# Building blocks for the DOCTYPE declaration pattern.
# Name of the document's root element.
_root = '(?P<root>[a-zA-Z][a-zA-Z0-9]*)'
# A double- or single-quoted literal.  The alternation is not grouped here,
# so this fragment must always be embedded inside an enclosing group.
_quoted = '("[^"]*")|' + "('[^']*')"
# Template for a quoted system identifier; %d numbers the named group so the
# template can be used twice within one pattern.
_sysId = r'\s*(?P<system%d>' + _quoted + ')'
# PUBLIC identifier, optionally followed by a system identifier.  The
# "public" group is closed directly after the quoted literal; otherwise the
# alternation inside _quoted attaches the optional system identifier to the
# single-quoted alternative only, so a double-quoted public id never
# captured its system id.
_pubId = r'\s*PUBLIC\s*(?P<public>' + _quoted + ')(' + (_sysId % 1) + ')?'
# SYSTEM identifier on its own.  Leading whitespace is accepted, as for
# PUBLIC, so that "DOCTYPE root SYSTEM ..." is recognised.
_sysId = r'\s*SYSTEM' + (_sysId % 2)
_doctype = re.compile('DOCTYPE ' + _root + '(%s|%s)?' % (_pubId, _sysId), re.I)

try:
    unicode()
except:
    # Calling unicode() failed, so provide wstring-based stand-ins for
    # unicode() and unichr() that return UTF-8 strings.
    from xml.unicode.iso8859 import wstring
    wstring.install_alias('ISO-8859-1', 'ISO_8859-1:1987')

    def unicode(str, encoding='US-ASCII'):
        """Decode str from the given encoding and return it as UTF-8.

        The encoding name is upper-cased before it is handed to wstring.
        If the conversion fails for any reason, str is returned unchanged.
        """
        try:
            charset = string.upper(encoding)
            decoded = wstring.decode(charset, str)
            converted = decoded.utf8()
        except:
            converted = str
        return converted

    def unichr(char):
        """Return the UTF-8 string for the Unicode character code char.

        If the conversion fails for any reason, char is returned unchanged.
        """
        try:
            wide_char = wstring.chr(char)
            converted = wide_char.utf8()
        except:
            converted = char
        return converted
Example #5
0
DEFAULT_CHARSET = "ISO-8859-1"

# Pieces of the DOCTYPE declaration pattern, assembled into _doctype below.
# Root element name.
_root = "(?P<root>[a-zA-Z][a-zA-Z0-9]*)"
# Quoted literal in either quote style.  The "|" is left un-grouped, so the
# fragment is only safe inside an enclosing group.
_quoted = '("[^"]*")|' + "('[^']*')"
# System identifier template; the %d slot numbers the named group because
# the template is instantiated twice.
_sysId = r"\s*(?P<system%d>" + _quoted + ")"
# PUBLIC identifier with an optional trailing system identifier.  The
# "public" group now ends right after the quoted literal: with the optional
# system id inside it, the un-grouped alternation in _quoted bound the
# system id to the single-quoted branch only, so the double-quoted form
# never captured "system1".
_pubId = r"\s*PUBLIC\s*(?P<public>" + _quoted + ")(" + (_sysId % 1) + ")?"
# Stand-alone SYSTEM identifier.  Whitespace before the keyword is allowed,
# matching the PUBLIC branch, so "DOCTYPE root SYSTEM ..." is recognised.
_sysId = r"\s*SYSTEM" + (_sysId % 2)
_doctype = re.compile("DOCTYPE " + _root + "(%s|%s)?" % (_pubId, _sysId), re.I)

try:
    unicode()
except:
    # unicode() could not be called; fall back to wstring-backed versions of
    # unicode() and unichr() that hand back UTF-8 strings.
    from xml.unicode.iso8859 import wstring

    wstring.install_alias("ISO-8859-1", "ISO_8859-1:1987")

    def unicode(str, encoding="US-ASCII"):
        """Convert str from the named encoding into a UTF-8 string.

        The encoding name is upper-cased before use.  On any failure the
        original str is returned untouched.
        """
        try:
            codec_name = string.upper(encoding)
            wide = wstring.decode(codec_name, str)
            utf8_text = wide.utf8()
        except:
            utf8_text = str
        return utf8_text

    def unichr(char):
        """Convert the Unicode character code char into a UTF-8 string.

        On any failure the original char value is returned untouched.
        """
        try:
            wide = wstring.chr(char)
            utf8_text = wide.utf8()
        except:
            utf8_text = char
        return utf8_text