Beispiel #1
0
    def test_guess_encoding_no_chardet(self):
        # Exercises misc.guess_encoding() with chardet explicitly disabled.
        # Per the assertions below, the utf-8 fixtures are reported as
        # 'utf-8' and the remaining fixtures as 'latin-1'.

        # A unicode string (rather than a byte string) must be rejected
        tools.assert_raises(TypeError, misc.guess_encoding, self.u_spanish)

        spanish_utf8_guess = misc.guess_encoding(self.utf8_spanish,
                                                 disable_chardet=True)
        tools.ok_(spanish_utf8_guess == 'utf-8')

        spanish_latin1_guess = misc.guess_encoding(self.latin1_spanish,
                                                   disable_chardet=True)
        tools.ok_(spanish_latin1_guess == 'latin-1')

        japanese_utf8_guess = misc.guess_encoding(self.utf8_japanese,
                                                  disable_chardet=True)
        tools.ok_(japanese_utf8_guess == 'utf-8')

        # euc-jp bytes are expected to come back labelled as latin-1 here
        japanese_euc_jp_guess = misc.guess_encoding(self.euc_jp_japanese,
                                                    disable_chardet=True)
        tools.ok_(japanese_euc_jp_guess == 'latin-1')
Beispiel #2
0
 def test_guess_encoding_with_chardet(self):
     # Several encodings can produce the same byte sequence, so the name
     # returned by guess_encoding() is not compared directly.  Instead each
     # fixture is decoded with the guessed encoding and the result must
     # equal the original unicode string.
     spanish_utf8_guess = misc.guess_encoding(self.utf8_spanish)
     tools.ok_(to_unicode(self.utf8_spanish, spanish_utf8_guess)
               == self.u_spanish)

     spanish_latin1_guess = misc.guess_encoding(self.latin1_spanish)
     tools.ok_(to_unicode(self.latin1_spanish, spanish_latin1_guess)
               == self.u_spanish)

     japanese_utf8_guess = misc.guess_encoding(self.utf8_japanese)
     tools.ok_(to_unicode(self.utf8_japanese, japanese_utf8_guess)
               == self.u_japanese)
Beispiel #3
0
 def test_guess_encoding_with_chardet(self):
     # Different encodings may yield identical bytes, so the test checks a
     # round trip: decoding each byte-string fixture with whatever
     # guess_encoding() reports must reproduce the matching unicode
     # fixture.
     #
     # Each entry is (byte-string fixture name, unicode fixture name);
     # attributes are fetched lazily so fixtures are read in the same
     # order as the assertions run.
     round_trips = (
         ('utf8_spanish', 'u_spanish'),
         ('latin1_spanish', 'u_spanish'),
         ('utf8_japanese', 'u_japanese'),
     )
     for encoded_name, unicode_name in round_trips:
         encoded = getattr(self, encoded_name)
         decoded = to_unicode(encoded, misc.guess_encoding(encoded))
         tools.ok_(decoded == getattr(self, unicode_name))
Beispiel #4
0
    def test_guess_encoding_no_chardet(self):
        # Passing a unicode string instead of a byte string must raise
        # TypeError
        tools.assert_raises(TypeError, misc.guess_encoding, self.u_spanish)

        # (fixture attribute, encoding name expected with chardet disabled)
        expectations = (
            ('utf8_spanish', 'utf-8'),
            ('latin1_spanish', 'latin-1'),
            ('utf8_japanese', 'utf-8'),
            ('euc_jp_japanese', 'latin-1'),
        )
        for fixture_name, expected_encoding in expectations:
            guessed = misc.guess_encoding(getattr(self, fixture_name),
                                          disable_chardet=True)
            tools.ok_(guessed == expected_encoding)
Beispiel #5
0
 def test_guess_encoding_with_chardet_uninstalled(self):
     # This scenario only applies when chardet is unavailable; skip
     # otherwise.
     if chardet:
         raise SkipTest('chardet installed, euc_jp will not be mangled')

     # Decoding the euc-jp fixture with the guessed encoding must give the
     # mangled (euc-jp read as latin-1) unicode fixture.
     guessed = misc.guess_encoding(self.euc_jp_japanese)
     decoded = to_unicode(self.euc_jp_japanese, guessed)
     tools.ok_(decoded == self.u_mangled_euc_jp_as_latin1)
Beispiel #6
0
def guess_encoding_to_xml(string, output_encoding='utf-8', attrib=False,
        control_chars='replace'):
    '''Convert text of unknown encoding into a byte :class:`bytes` for xml

    :arg string: :class:`str` or byte :class:`bytes` to convert.
        A :class:`str` is handed directly to :func:`unicode_to_xml`.  For
        a byte :class:`bytes` the input encoding is first chosen by
        :func:`guess_encoding` and the conversion is then done by
        :func:`byte_string_to_xml` with ``errors='replace'``.
    :kwarg output_encoding: Encoding requested for the returned byte
        :class:`bytes`.  This should match the encoding of your xml file.
        Defaults to ``utf-8``.
    :kwarg attrib: If :data:`True`, escape the item for use in an xml
        attribute.  If :data:`False` (default) escape the item for use in
        a text node.
    :kwarg control_chars: Forwarded unchanged as the ``control_chars``
        argument of the conversion function that is called.  Defaults to
        ``replace``.
    :returns: Whatever :func:`unicode_to_xml` or
        :func:`byte_string_to_xml` returns for the requested
        ``output_encoding``.

    '''
    # Options that both conversion paths receive unchanged
    escape_options = {'attrib': attrib, 'control_chars': control_chars}

    if not isunicodestring(string):
        # Byte string: guess how it is encoded, then transcode and escape
        return byte_string_to_xml(string,
                input_encoding=guess_encoding(string), errors='replace',
                output_encoding=output_encoding, **escape_options)

    # Already unicode: no guessing is needed
    return unicode_to_xml(string, encoding=output_encoding,
            **escape_options)
Beispiel #7
0
def guess_encoding_to_xml(string,
                          output_encoding='utf8',
                          attrib=False,
                          control_chars='replace'):
    '''Convert a string of unknown encoding into a byte string for xml

    :arg string: unicode or byte string to convert.  A unicode string is
        passed directly to unicode_to_xml().  For a byte string the input
        encoding is chosen by guess_encoding() and the conversion is done
        by byte_string_to_xml() with errors='replace'.
    :kwarg output_encoding: Encoding requested for the returned byte
        string.  This should match the encoding of your xml file.
        Defaults to utf8.
    :kwarg attrib: If True, escape the item for use in an attribute.  If
        False (default) escape the item for use in a text node.
    :kwarg control_chars: Forwarded unchanged as the control_chars
        argument of the conversion function that is called.  Defaults to
        'replace'.
    :returns: whatever unicode_to_xml() or byte_string_to_xml() returns
        for the requested output_encoding

    '''
    if isinstance(string, unicode):
        # Unicode needs no encoding detection
        xml_bytes = unicode_to_xml(string,
                                   encoding=output_encoding,
                                   attrib=attrib,
                                   control_chars=control_chars)
    else:
        # Byte string: detect the source encoding, then transcode and
        # escape it
        xml_bytes = byte_string_to_xml(string,
                                       input_encoding=guess_encoding(string),
                                       errors='replace',
                                       output_encoding=output_encoding,
                                       attrib=attrib,
                                       control_chars=control_chars)
    return xml_bytes
Beispiel #8
0
 def test_guess_encoding_with_chardet_uninstalled(self):
     # Without chardet, decoding the euc-jp fixture with the guessed
     # encoding is expected to give the mangled unicode fixture.  With
     # chardet present the test is skipped.
     if not chardet:
         raw = self.euc_jp_japanese
         decoded = to_unicode(raw, misc.guess_encoding(raw))
         tools.ok_(decoded == self.u_mangled_euc_jp_as_latin1)
     else:
         raise SkipTest('chardet installed, euc_jp will not be mangled')
Beispiel #9
0
 def test_guess_encoding_with_chardet_installed(self):
     # With chardet available, decoding the euc-jp fixture with the
     # guessed encoding must reproduce the japanese unicode fixture.
     if chardet:
         raw = self.euc_jp_japanese
         guessed = misc.guess_encoding(raw)
         tools.ok_(to_unicode(raw, guessed) == self.u_japanese)
         return

     # No chardet: nothing meaningful to check
     raise SkipTest(
         'chardet not installed, euc_jp will not be guessed correctly')
Beispiel #10
0
 def test_guess_encoding_with_chardet_installed(self):
     # This check requires chardet; skip when it is unavailable
     if not chardet:
         raise SkipTest(
             'chardet not installed, euc_jp will not be guessed correctly')

     # Decoding the euc-jp bytes with the guessed encoding must give back
     # the japanese unicode fixture
     guessed = misc.guess_encoding(self.euc_jp_japanese)
     decoded = to_unicode(self.euc_jp_japanese, guessed)
     tools.ok_(decoded == self.u_japanese)