def getUnicode(value, encoding=None, noneToNull=False):
    """
    Returns the unicode representation of the supplied value

    Text values are returned unchanged. Byte strings are decoded by trying
    an ordered list of candidate encodings (the order depends on what the
    content looks like); list-like values are converted element by element
    into a new list; anything else is passed through six.text_type().

    :param value: object to convert
    :param encoding: preferred encoding for byte strings (always tried first
        when given)
    :param noneToNull: when True, a None value yields the NULL constant
        instead of the text 'None'
    :return: unicode text, or a list of converted items for list-like input

    >>> getUnicode('test') == u'test'
    True
    >>> getUnicode(1) == u'1'
    True
    >>> getUnicode(None) == 'None'
    True
    """

    if noneToNull and value is None:
        return NULL

    if isinstance(value, six.text_type):
        return value

    if isinstance(value, six.binary_type):
        page_encoding = kb.get("pageEncoding") if kb.get("originalPage") else None
        conf_encoding = conf.get("encoding")
        fs_encoding = sys.getfilesystemencoding()

        # Heuristics (if encoding not explicitly specified)
        # NOTE(review): filterNone() presumably drops the None entries - confirm
        if all(_ in value for _ in (b'<', b'>')):
            # Contains both angle brackets: page encoding takes precedence
            candidates = filterNone((encoding, page_encoding, conf_encoding, UNICODE_ENCODING, fs_encoding))
        elif any(_ in value for _ in (b":\\", b'/', b'.')) and b'\n' not in value:
            # Single line with path-like markers: filesystem encoding takes precedence
            candidates = filterNone((encoding, fs_encoding, page_encoding, UNICODE_ENCODING, conf_encoding))
        elif conf_encoding and b'\n' not in value:
            # Single line and a configured encoding is available: try it early
            candidates = filterNone((encoding, conf_encoding, page_encoding, fs_encoding, UNICODE_ENCODING))
        else:
            candidates = filterNone((encoding, page_encoding, conf_encoding, UNICODE_ENCODING, fs_encoding))

        for candidate in candidates:
            try:
                return six.text_type(value, candidate)
            except (UnicodeDecodeError, LookupError):
                pass

        # No candidate worked. The preferred encoding is retried here, so an
        # unknown codec name (LookupError) has to be tolerated just like in
        # the loop above, otherwise it would escape to the caller.
        try:
            return six.text_type(value, encoding or page_encoding or UNICODE_ENCODING)
        except (UnicodeDecodeError, LookupError):
            # NOTE(review): "reversible" is not a stdlib error handler -
            # presumably registered elsewhere in the project; confirm
            return six.text_type(value, UNICODE_ENCODING, errors="reversible")

    if isListLike(value):
        return [getUnicode(_, encoding, noneToNull) for _ in value]

    try:
        return six.text_type(value)
    except UnicodeDecodeError:
        return six.text_type(str(value), errors="ignore")  # encoding ignored for non-basestring instances
def to_str(input_, encoding='utf-8', errors='replace'):
    '''Encode an object to a byte string using the given encoding.

    Values that already are ``six.binary_type`` are returned untouched.
    Objects without an ``encode`` method are first converted with
    ``six.text_type`` and the resulting text is then encoded.

    :param input_: object to convert
    :param encoding: codec name passed to ``encode``
    :param errors: error handling scheme passed to ``encode``
    :rtype: str

    >>> to_str('a')
    b'a'
    >>> to_str(u'a')
    b'a'
    >>> to_str(b'a')
    b'a'
    >>> class Foo(object): __str__ = lambda s: u'a'
    >>> to_str(Foo())
    'a'
    >>> to_str(Foo)
    "<class 'python_utils.converters.Foo'>"
    '''
    # Already binary: nothing to do.
    if isinstance(input_, six.binary_type):
        return input_

    # Anything that cannot encode itself goes through its text form first.
    text = input_ if hasattr(input_, 'encode') else six.text_type(input_)
    return text.encode(encoding, errors)
def get(self):
    """
    Serialize and return the matched nodes in a single unicode string.
    Percent encoded content is unquoted.

    When ``self.root`` cannot be serialized by ``etree.tostring`` (it raises
    AttributeError or TypeError), booleans are rendered as u'1' / u'0' and
    any other value is converted with ``six.text_type``.
    """
    try:
        serialized = etree.tostring(
            self.root,
            method=self._tostring_method,
            encoding='unicode',
            with_tail=False,
        )
    except (AttributeError, TypeError):
        node = self.root
        # Identity checks so that only the real booleans are mapped.
        if node is True:
            return u'1'
        if node is False:
            return u'0'
        return six.text_type(node)
    return serialized
def to_unicode(input_, encoding='utf-8', errors='replace'):
    '''Return the text (unicode) form of an object.

    ``six.binary_type`` values are decoded with the given encoding and
    error handling scheme; every other object is converted with
    ``six.text_type``.

    :param input_: object to convert
    :param encoding: codec name passed to ``decode`` for binary input
    :param errors: error handling scheme passed to ``decode``
    :rtype: unicode

    >>> to_unicode(b'a')
    'a'
    >>> to_unicode('a')
    'a'
    >>> to_unicode(u'a')
    'a'
    >>> class Foo(object): __str__ = lambda s: u'a'
    >>> to_unicode(Foo())
    'a'
    >>> to_unicode(Foo)
    "<class 'python_utils.converters.Foo'>"
    '''
    # Non-binary objects only need the generic text conversion.
    if not isinstance(input_, six.binary_type):
        return six.text_type(input_)

    return input_.decode(encoding, errors)
def checkCharEncoding(encoding, warn=True):
    """
    Checks encoding name, repairs common misspellings and adjusts to
    proper namings used in codecs module

    :param encoding: charset name to check (byte strings are converted with
        getUnicode(); list-like values are reduced with unArrayizeValue())
    :param warn: when True, log a one-time warning if the resulting codec
        cannot be used for decoding
    :return: the adjusted encoding name, the original value when it is
        empty, or None when the name is a placeholder or unknown to codecs

    >>> checkCharEncoding('iso-8858', False)
    'iso8859-1'
    >>> checkCharEncoding('en_us', False)
    'utf8'
    """

    if isinstance(encoding, six.binary_type):
        encoding = getUnicode(encoding)

    if isListLike(encoding):
        encoding = unArrayizeValue(encoding)

    if encoding:
        encoding = encoding.lower()
    else:
        return encoding

    # Reference: http://www.destructor.de/charsets/index.htm
    translate = {
        "windows-874": "iso-8859-11",
        "utf-8859-1": "utf8",
        "en_us": "utf8",
        "macintosh": "iso-8859-1",
        "euc_tw": "big5_tw",
        "th": "tis-620",
        "unicode": "utf8",
        "utc8": "utf8",
        "ebcdic": "ebcdic-cp-be",
        "iso-8859": "iso8859-1",
        "iso-8859-0": "iso8859-1",
        "ansi": "ascii",
        "gbk2312": "gbk",
        "windows-31j": "cp932",
        "en": "us",
    }

    # Keep only the part in front of the first delimiter found
    for delimiter in (';', ',', '('):
        if delimiter in encoding:
            encoding = encoding[:encoding.find(delimiter)].strip()

    # Strip leftover HTML-escaped double quotes from the name
    encoding = encoding.replace("&quot;", "")

    # popular typos/errors (only the first matching repair is applied)
    if "8858" in encoding:
        encoding = encoding.replace("8858", "8859")  # iso-8858 -> iso-8859
    elif "8559" in encoding:
        encoding = encoding.replace("8559", "8859")  # iso-8559 -> iso-8859
    elif "8895" in encoding:
        encoding = encoding.replace("8895", "8859")  # iso-8895 -> iso-8859
    elif "5889" in encoding:
        encoding = encoding.replace("5889", "8859")  # iso-5889 -> iso-8859
    elif "5589" in encoding:
        encoding = encoding.replace("5589", "8859")  # iso-5589 -> iso-8859
    elif "2313" in encoding:
        encoding = encoding.replace("2313", "2312")  # gb2313 -> gb2312
    elif encoding.startswith("x-"):
        encoding = encoding[len("x-"):]  # x-euc-kr -> euc-kr / x-mac-turkish -> mac-turkish
    elif "windows-cp" in encoding:
        encoding = encoding.replace("windows-cp", "windows")  # windows-cp-1254 -> windows-1254

    # name adjustment for compatibility
    if encoding.startswith("8859"):
        encoding = "iso-%s" % encoding
    elif encoding.startswith("cp-"):
        encoding = "cp%s" % encoding[3:]
    elif encoding.startswith("euc-"):
        encoding = "euc_%s" % encoding[4:]
    elif encoding.startswith("windows") and not encoding.startswith("windows-"):
        encoding = "windows-%s" % encoding[7:]
    elif encoding.find("iso-88") > 0:
        # Drop whatever precedes the recognizable name
        encoding = encoding[encoding.find("iso-88"):]
    elif encoding.startswith("is0-"):
        encoding = "iso%s" % encoding[4:]
    elif encoding.find("ascii") > 0:
        encoding = "ascii"
    elif encoding.find("utf8") > 0:
        encoding = "utf8"
    elif encoding.find("utf-8") > 0:
        encoding = "utf-8"

    # Reference: http://philip.html5.org/data/charsets-2.html
    if encoding in translate:
        encoding = translate[encoding]
    elif encoding in ("null", "{charset}", "charset", "*") or not re.search(r"\w", encoding):
        return None

    # Reference: http://www.iana.org/assignments/character-sets
    # Reference: http://docs.python.org/library/codecs.html
    try:
        codecs.lookup(encoding)
    except Exception:
        # Best effort: any failure to resolve the codec means "unknown"
        encoding = None

    if encoding:
        # Make sure the codec can actually be used for decoding
        try:
            six.text_type(getBytes(randomStr()), encoding)
        except Exception:
            if warn:
                warnMsg = "invalid web page charset '%s'" % encoding
                singleTimeLogMessage(warnMsg, logging.WARN, encoding)
            encoding = None

    return encoding
def checkCharEncoding(encoding, warn=True):
    """
    Checks encoding name, repairs common misspellings and adjusts to
    proper namings used in codecs module

    :param encoding: charset name to check (list-like values are reduced
        with unArrayizeValue())
    :param warn: when True, log a one-time warning if the resulting codec
        cannot be used for decoding
    :return: the adjusted encoding name, the original value when it is
        empty, or None when the name is a placeholder or unknown to codecs

    >>> checkCharEncoding('iso-8858', False)
    'iso8859-1'
    >>> checkCharEncoding('en_us', False)
    'utf8'
    """

    if isListLike(encoding):
        encoding = unArrayizeValue(encoding)

    if encoding:
        encoding = encoding.lower()
    else:
        return encoding

    # Reference: http://www.destructor.de/charsets/index.htm
    translate = {
        "windows-874": "iso-8859-11",
        "utf-8859-1": "utf8",
        "en_us": "utf8",
        "macintosh": "iso-8859-1",
        "euc_tw": "big5_tw",
        "th": "tis-620",
        "unicode": "utf8",
        "utc8": "utf8",
        "ebcdic": "ebcdic-cp-be",
        "iso-8859": "iso8859-1",
        "iso-8859-0": "iso8859-1",
        "ansi": "ascii",
        "gbk2312": "gbk",
        "windows-31j": "cp932",
        "en": "us",
    }

    # Keep only the part in front of the first delimiter found
    for delimiter in (';', ',', '('):
        if delimiter in encoding:
            encoding = encoding[:encoding.find(delimiter)].strip()

    # Strip leftover HTML-escaped double quotes from the name
    encoding = encoding.replace("&quot;", "")

    # popular typos/errors (only the first matching repair is applied)
    if "8858" in encoding:
        encoding = encoding.replace("8858", "8859")  # iso-8858 -> iso-8859
    elif "8559" in encoding:
        encoding = encoding.replace("8559", "8859")  # iso-8559 -> iso-8859
    elif "8895" in encoding:
        encoding = encoding.replace("8895", "8859")  # iso-8895 -> iso-8859
    elif "5889" in encoding:
        encoding = encoding.replace("5889", "8859")  # iso-5889 -> iso-8859
    elif "5589" in encoding:
        encoding = encoding.replace("5589", "8859")  # iso-5589 -> iso-8859
    elif "2313" in encoding:
        encoding = encoding.replace("2313", "2312")  # gb2313 -> gb2312
    elif encoding.startswith("x-"):
        encoding = encoding[len("x-"):]  # x-euc-kr -> euc-kr / x-mac-turkish -> mac-turkish
    elif "windows-cp" in encoding:
        encoding = encoding.replace("windows-cp", "windows")  # windows-cp-1254 -> windows-1254

    # name adjustment for compatibility
    if encoding.startswith("8859"):
        encoding = "iso-%s" % encoding
    elif encoding.startswith("cp-"):
        encoding = "cp%s" % encoding[3:]
    elif encoding.startswith("euc-"):
        encoding = "euc_%s" % encoding[4:]
    elif encoding.startswith("windows") and not encoding.startswith("windows-"):
        encoding = "windows-%s" % encoding[7:]
    elif encoding.find("iso-88") > 0:
        # Drop whatever precedes the recognizable name
        encoding = encoding[encoding.find("iso-88"):]
    elif encoding.startswith("is0-"):
        encoding = "iso%s" % encoding[4:]
    elif encoding.find("ascii") > 0:
        encoding = "ascii"
    elif encoding.find("utf8") > 0:
        encoding = "utf8"
    elif encoding.find("utf-8") > 0:
        encoding = "utf-8"

    # Reference: http://philip.html5.org/data/charsets-2.html
    if encoding in translate:
        encoding = translate[encoding]
    elif encoding in ("null", "{charset}", "charset", "*") or not re.search(r"\w", encoding):
        return None

    # Reference: http://www.iana.org/assignments/character-sets
    # Reference: http://docs.python.org/library/codecs.html
    try:
        codecs.lookup(encoding)
    except Exception:
        # Best effort: any failure to resolve the codec means "unknown"
        encoding = None

    if encoding:
        # Make sure the codec can actually be used for decoding
        try:
            six.text_type(getBytes(randomStr()), encoding)
        except Exception:
            if warn:
                warnMsg = "invalid web page charset '%s'" % encoding
                singleTimeLogMessage(warnMsg, logging.WARN, encoding)
            encoding = None

    return encoding