def utf8_to_code(text, encoding):
    """Re-encode a UTF-8 string into the named target encoding.

    text     -- the UTF-8 encoded input string.
    encoding -- name of the target encoding; compared case-insensitively.

    Returns text unchanged when the target encoding is UTF-8.  Otherwise
    the text is loaded with wstring.from_utf8() and the result of its
    encode(encoding) call is returned.  Errors raised by wstring are not
    caught here.
    """
    # Use the str method rather than string.upper(): the string-module
    # function was removed in Python 3, while the method exists on every
    # interpreter from Python 2.0 on.
    encoding = encoding.upper()
    if encoding == 'UTF-8':
        return text

    # Imported lazily so the UTF-8 pass-through above works without wstring.
    from xml.unicode.iso8859 import wstring
    wstring.install_alias('ISO-8859-1', 'ISO_8859-1:1987')

    # Note: Pass through to wstrop.  This means we don't play nice and
    # escape characters that are not in the target encoding.
    # ws.encode(encoding, 1) would instead skip all untranslatable
    # characters: see wstrop.c
    ws = wstring.from_utf8(text)
    return ws.encode(encoding)
text = unicode(text, "utf-8") return encoder(text)[0] # result,size def ConvertChar(m): return '&'+HTML_CHARACTER_ENTITIES[ord(m.group())]+';' def UseHtmlCharEntities(text): if type(text) is not UnicodeType: text = unicode(text, "utf-8") new_text, num_subst = re.subn(g_htmlUniCharEntityPattern, ConvertChar, text) return new_text except ImportError: from xml.unicode.iso8859 import wstring wstring.install_alias('ISO-8859-1', 'ISO_8859-1:1987') def utf8_to_code(text, encoding): encoding = string.upper(encoding) if encoding == 'UTF-8': return text #Note: Pass through to wstrop. This means we don't play nice and #Escape characters that are not in the target encoding. ws = wstring.from_utf8(text) text = ws.encode(encoding) #This version would skip all untranslatable chars: see wstrop.c #text = ws.encode(encoding, 1) return text def ConvertChar(m): char = ((int(ord(m.group(1))) & 0x03) << 6) | (int(ord(m.group(2))) & 0x3F)
from xml.dom.html import HTML_DTD, HTML_CHARACTER_ENTITIES

# Default character set name; where it is applied is not visible in this
# part of the file.
DEFAULT_CHARSET = 'ISO-8859-1'

# Regular-expression fragments assembled into the _doctype matcher below.

# Root element name: a letter followed by letters or digits, captured in
# the named group "root".
_root = '(?P<root>[a-zA-Z][a-zA-Z0-9]*)'

# A double-quoted or a single-quoted literal.
# NOTE(review): the alternation is not wrapped in a group of its own, so
# when more pattern text follows this fragment (as in _pubId) that text
# binds only to the single-quoted alternative -- confirm this is intended.
_quoted = '("[^"]*")|' + "('[^']*')"

# Template for a system identifier; the %d slot numbers the named group
# ("system1" / "system2").
_sysId = r'\s*(?P<system%d>' + _quoted + ')'

# PUBLIC form: the named group "public" wraps the quoted literal together
# with the optional "system1" part.
_pubId = r'\s*PUBLIC\s*(?P<public>' + _quoted + '(' + (_sysId % 1) + ')?)'

# Rebind _sysId to the SYSTEM form (group "system2").  This must stay after
# _pubId, which needs the still-unformatted template.
_sysId = 'SYSTEM' + (_sysId % 2)

# Case-insensitive matcher: "DOCTYPE ", the root name, then an optional
# PUBLIC or SYSTEM identifier section.
_doctype = re.compile('DOCTYPE ' + _root + '(%s|%s)?' % (_pubId, _sysId), re.I)

# Probe for a usable unicode(); if calling it raises anything at all,
# install wstring-based replacements for unicode() and unichr().
try:
    unicode()
except:
    from xml.unicode.iso8859 import wstring
    wstring.install_alias('ISO-8859-1', 'ISO_8859-1:1987')

    def unicode(str, encoding='US-ASCII'):
        """Create a UTF-8 string

        Decodes str from the given encoding (name upper-cased first) with
        wstring and returns the result as UTF-8.  If the conversion raises
        anything, str is returned unchanged.
        """
        try:
            return wstring.decode(string.upper(encoding), str).utf8()
        except:
            # Best effort: hand back the input untouched.
            return str

    def unichr(char):
        """Create a UTF-8 string from a Unicode character code

        If wstring cannot convert the code, char itself is returned
        unchanged.
        """
        try:
            return wstring.chr(char).utf8()
        except:
            # Best effort: hand back the input untouched.
            return char
# Default character set name; where it is applied is not visible in this
# part of the file.
DEFAULT_CHARSET = "ISO-8859-1"

# Regular-expression fragments assembled into the _doctype matcher below.

# Root element name: a letter followed by letters or digits, captured in
# the named group "root".
_root = "(?P<root>[a-zA-Z][a-zA-Z0-9]*)"

# A double-quoted or a single-quoted literal.
# NOTE(review): the alternation is not wrapped in a group of its own, so
# when more pattern text follows this fragment (as in _pubId) that text
# binds only to the single-quoted alternative -- confirm this is intended.
_quoted = '("[^"]*")|' + "('[^']*')"

# Template for a system identifier; the %d slot numbers the named group
# ("system1" / "system2").
_sysId = r"\s*(?P<system%d>" + _quoted + ")"

# PUBLIC form: the named group "public" wraps the quoted literal together
# with the optional "system1" part.
_pubId = r"\s*PUBLIC\s*(?P<public>" + _quoted + "(" + (_sysId % 1) + ")?)"

# Rebind _sysId to the SYSTEM form (group "system2").  This must stay after
# _pubId, which needs the still-unformatted template.
_sysId = "SYSTEM" + (_sysId % 2)

# Case-insensitive matcher: "DOCTYPE ", the root name, then an optional
# PUBLIC or SYSTEM identifier section.
_doctype = re.compile("DOCTYPE " + _root + "(%s|%s)?" % (_pubId, _sysId), re.I)

# Probe for a usable unicode(); if calling it raises anything at all,
# install wstring-based replacements for unicode() and unichr().
try:
    unicode()
except:
    from xml.unicode.iso8859 import wstring
    wstring.install_alias("ISO-8859-1", "ISO_8859-1:1987")

    def unicode(str, encoding="US-ASCII"):
        """Create a UTF-8 string

        Decodes str from the given encoding (name upper-cased first) with
        wstring and returns the result as UTF-8.  If the conversion raises
        anything, str is returned unchanged.
        """
        try:
            return wstring.decode(string.upper(encoding), str).utf8()
        except:
            # Best effort: hand back the input untouched.
            return str

    def unichr(char):
        """Create a UTF-8 string from a Unicode character code

        If wstring cannot convert the code, char itself is returned
        unchanged.
        """
        try:
            return wstring.chr(char).utf8()
        except:
            # Best effort: hand back the input untouched.
            return char