Esempio n. 1
0
def getUnicode(value, encoding=None, noneToNull=False):
    """
    Returns the unicode representation of the supplied value
    >>> getUnicode('test') == u'test'
    True
    >>> getUnicode(1) == u'1'
    True
    >>> getUnicode(None) == 'None'
    True
    """

    if noneToNull and value is None:
        return NULL

    if isinstance(value, six.text_type):
        return value
    elif isinstance(value, six.binary_type):
        # Heuristics (if encoding not explicitly specified)
        candidates = filterNone(
            (encoding,
             kb.get("pageEncoding") if kb.get("originalPage") else None,
             conf.get("encoding"), UNICODE_ENCODING,
             sys.getfilesystemencoding()))
        if all(_ in value for _ in (b'<', b'>')):
            pass
        elif any(_ in value
                 for _ in (b":\\", b'/', b'.')) and b'\n' not in value:
            candidates = filterNone(
                (encoding, sys.getfilesystemencoding(),
                 kb.get("pageEncoding") if kb.get("originalPage") else None,
                 UNICODE_ENCODING, conf.get("encoding")))
        elif conf.get("encoding") and b'\n' not in value:
            candidates = filterNone(
                (encoding, conf.get("encoding"),
                 kb.get("pageEncoding") if kb.get("originalPage") else None,
                 sys.getfilesystemencoding(), UNICODE_ENCODING))

        for candidate in candidates:
            try:
                return six.text_type(value, candidate)
            except (UnicodeDecodeError, LookupError):
                pass

        try:
            return six.text_type(
                value, encoding
                or (kb.get("pageEncoding") if kb.get("originalPage") else None)
                or UNICODE_ENCODING)
        except UnicodeDecodeError:
            return six.text_type(value, UNICODE_ENCODING, errors="reversible")
    elif isListLike(value):
        value = list(getUnicode(_, encoding, noneToNull) for _ in value)
        return value
    else:
        try:
            return six.text_type(value)
        except UnicodeDecodeError:
            return six.text_type(
                str(value), errors="ignore"
            )  # encoding ignored for non-basestring instances
Esempio n. 2
0
def to_str(input_, encoding='utf-8', errors='replace'):
    '''Convert objects to string, encodes to the given encoding

    :rtype: str

    >>> to_str('a')
    b'a'
    >>> to_str(u'a')
    b'a'
    >>> to_str(b'a')
    b'a'
    >>> class Foo(object): __str__ = lambda s: u'a'
    >>> to_str(Foo())
    'a'
    >>> to_str(Foo)
    "<class 'python_utils.converters.Foo'>"
    '''
    if isinstance(input_, six.binary_type):
        pass
    else:
        if not hasattr(input_, 'encode'):
            input_ = six.text_type(input_)

        input_ = input_.encode(encoding, errors)
    return input_
Esempio n. 3
0
 def get(self):
     """
     Serialize and return the matched nodes in a single unicode string.
     Percent encoded content is unquoted.
     """
     try:
         return etree.tostring(self.root,
                               method=self._tostring_method,
                               encoding='unicode',
                               with_tail=False)
     except (AttributeError, TypeError):
         if self.root is True:
             return u'1'
         elif self.root is False:
             return u'0'
         else:
             return six.text_type(self.root)
Esempio n. 4
0
def to_unicode(input_, encoding='utf-8', errors='replace'):
    '''Convert objects to unicode, if needed decodes string with the given
    encoding and errors settings.

    :rtype: unicode

    >>> to_unicode(b'a')
    'a'
    >>> to_unicode('a')
    'a'
    >>> to_unicode(u'a')
    'a'
    >>> class Foo(object): __str__ = lambda s: u'a'
    >>> to_unicode(Foo())
    'a'
    >>> to_unicode(Foo)
    "<class 'python_utils.converters.Foo'>"
    '''
    if isinstance(input_, six.binary_type):
        input_ = input_.decode(encoding, errors)
    else:
        input_ = six.text_type(input_)
    return input_
Esempio n. 5
0
def checkCharEncoding(encoding, warn=True):
    """
    Checks encoding name, repairs common misspellings and adjusts to
    proper namings used in codecs module

    >>> checkCharEncoding('iso-8858', False)
    'iso8859-1'
    >>> checkCharEncoding('en_us', False)
    'utf8'
    """

    if isinstance(encoding, six.binary_type):
        encoding = getUnicode(encoding)

    if isListLike(encoding):
        encoding = unArrayizeValue(encoding)

    if encoding:
        encoding = encoding.lower()
    else:
        return encoding

    # Reference: http://www.destructor.de/charsets/index.htm
    translate = {
        "windows-874": "iso-8859-11",
        "utf-8859-1": "utf8",
        "en_us": "utf8",
        "macintosh": "iso-8859-1",
        "euc_tw": "big5_tw",
        "th": "tis-620",
        "unicode": "utf8",
        "utc8": "utf8",
        "ebcdic": "ebcdic-cp-be",
        "iso-8859": "iso8859-1",
        "iso-8859-0": "iso8859-1",
        "ansi": "ascii",
        "gbk2312": "gbk",
        "windows-31j": "cp932",
        "en": "us"
    }

    for delimiter in (';', ',', '('):
        if delimiter in encoding:
            encoding = encoding[:encoding.find(delimiter)].strip()

    encoding = encoding.replace("&quot", "")

    # popular typos/errors
    if "8858" in encoding:
        encoding = encoding.replace("8858", "8859")  # iso-8858 -> iso-8859
    elif "8559" in encoding:
        encoding = encoding.replace("8559", "8859")  # iso-8559 -> iso-8859
    elif "8895" in encoding:
        encoding = encoding.replace("8895", "8859")  # iso-8895 -> iso-8859
    elif "5889" in encoding:
        encoding = encoding.replace("5889", "8859")  # iso-5889 -> iso-8859
    elif "5589" in encoding:
        encoding = encoding.replace("5589", "8859")  # iso-5589 -> iso-8859
    elif "2313" in encoding:
        encoding = encoding.replace("2313", "2312")  # gb2313 -> gb2312
    elif encoding.startswith("x-"):
        encoding = encoding[len(
            "x-"):]  # x-euc-kr -> euc-kr  /  x-mac-turkish -> mac-turkish
    elif "windows-cp" in encoding:
        encoding = encoding.replace(
            "windows-cp", "windows")  # windows-cp-1254 -> windows-1254

    # name adjustment for compatibility
    if encoding.startswith("8859"):
        encoding = "iso-%s" % encoding
    elif encoding.startswith("cp-"):
        encoding = "cp%s" % encoding[3:]
    elif encoding.startswith("euc-"):
        encoding = "euc_%s" % encoding[4:]
    elif encoding.startswith(
            "windows") and not encoding.startswith("windows-"):
        encoding = "windows-%s" % encoding[7:]
    elif encoding.find("iso-88") > 0:
        encoding = encoding[encoding.find("iso-88"):]
    elif encoding.startswith("is0-"):
        encoding = "iso%s" % encoding[4:]
    elif encoding.find("ascii") > 0:
        encoding = "ascii"
    elif encoding.find("utf8") > 0:
        encoding = "utf8"
    elif encoding.find("utf-8") > 0:
        encoding = "utf-8"

    # Reference: http://philip.html5.org/data/charsets-2.html
    if encoding in translate:
        encoding = translate[encoding]
    elif encoding in ("null", "{charset}", "charset",
                      "*") or not re.search(r"\w", encoding):
        return None

    # Reference: http://www.iana.org/assignments/character-sets
    # Reference: http://docs.python.org/library/codecs.html
    try:
        codecs.lookup(encoding)
    except:
        encoding = None

    if encoding:
        try:
            six.text_type(getBytes(randomStr()), encoding)
        except:
            if warn:
                warnMsg = "invalid web page charset '%s'" % encoding
                singleTimeLogMessage(warnMsg, logging.WARN, encoding)
            encoding = None

    return encoding
Esempio n. 6
0
def checkCharEncoding(encoding, warn=True):
    """
    Checks encoding name, repairs common misspellings and adjusts to
    proper namings used in codecs module

    >>> checkCharEncoding('iso-8858', False)
    'iso8859-1'
    >>> checkCharEncoding('en_us', False)
    'utf8'
    """

    if isListLike(encoding):
        encoding = unArrayizeValue(encoding)

    if encoding:
        encoding = encoding.lower()
    else:
        return encoding

    # Reference: http://www.destructor.de/charsets/index.htm
    translate = {"windows-874": "iso-8859-11", "utf-8859-1": "utf8", "en_us": "utf8", "macintosh": "iso-8859-1", "euc_tw": "big5_tw", "th": "tis-620", "unicode": "utf8", "utc8": "utf8", "ebcdic": "ebcdic-cp-be", "iso-8859": "iso8859-1", "iso-8859-0": "iso8859-1", "ansi": "ascii", "gbk2312": "gbk", "windows-31j": "cp932", "en": "us"}

    for delimiter in (';', ',', '('):
        if delimiter in encoding:
            encoding = encoding[:encoding.find(delimiter)].strip()

    encoding = encoding.replace("&quot", "")

    # popular typos/errors
    if "8858" in encoding:
        encoding = encoding.replace("8858", "8859")  # iso-8858 -> iso-8859
    elif "8559" in encoding:
        encoding = encoding.replace("8559", "8859")  # iso-8559 -> iso-8859
    elif "8895" in encoding:
        encoding = encoding.replace("8895", "8859")  # iso-8895 -> iso-8859
    elif "5889" in encoding:
        encoding = encoding.replace("5889", "8859")  # iso-5889 -> iso-8859
    elif "5589" in encoding:
        encoding = encoding.replace("5589", "8859")  # iso-5589 -> iso-8859
    elif "2313" in encoding:
        encoding = encoding.replace("2313", "2312")  # gb2313 -> gb2312
    elif encoding.startswith("x-"):
        encoding = encoding[len("x-"):]              # x-euc-kr -> euc-kr  /  x-mac-turkish -> mac-turkish
    elif "windows-cp" in encoding:
        encoding = encoding.replace("windows-cp", "windows")  # windows-cp-1254 -> windows-1254

    # name adjustment for compatibility
    if encoding.startswith("8859"):
        encoding = "iso-%s" % encoding
    elif encoding.startswith("cp-"):
        encoding = "cp%s" % encoding[3:]
    elif encoding.startswith("euc-"):
        encoding = "euc_%s" % encoding[4:]
    elif encoding.startswith("windows") and not encoding.startswith("windows-"):
        encoding = "windows-%s" % encoding[7:]
    elif encoding.find("iso-88") > 0:
        encoding = encoding[encoding.find("iso-88"):]
    elif encoding.startswith("is0-"):
        encoding = "iso%s" % encoding[4:]
    elif encoding.find("ascii") > 0:
        encoding = "ascii"
    elif encoding.find("utf8") > 0:
        encoding = "utf8"
    elif encoding.find("utf-8") > 0:
        encoding = "utf-8"

    # Reference: http://philip.html5.org/data/charsets-2.html
    if encoding in translate:
        encoding = translate[encoding]
    elif encoding in ("null", "{charset}", "charset", "*") or not re.search(r"\w", encoding):
        return None

    # Reference: http://www.iana.org/assignments/character-sets
    # Reference: http://docs.python.org/library/codecs.html
    try:
        codecs.lookup(encoding)
    except:
        encoding = None

    if encoding:
        try:
            six.text_type(getBytes(randomStr()), encoding)
        except:
            if warn:
                warnMsg = "invalid web page charset '%s'" % encoding
                singleTimeLogMessage(warnMsg, logging.WARN, encoding)
            encoding = None

    return encoding