Exemple #1
0
 def test004_get_encoding(self):
     # Round trip: encode one sample text with every candidate codec, then
     # expect get_encoding to report the same codec names in the same order.
     candidates = ['utf-8', 'iso8859-7', 'utf-16', 'utf-16le', 'utf-16be']
     sample_text = u'Ένα test string for encoding detection purposes\n'
     payloads = [sample_text.encode(codec) for codec in candidates]
     detected = [
         get_encoding(payload, encodings=candidates)
         for payload in payloads
     ]
     self.assertEqual(candidates, detected)
Exemple #2
0
 def test007_sample_picker_does_not_break_utf16be(self):
     # Background carried over from the original notes on this group of
     # tests: iso8859-7 needs no such check because every character is a
     # single byte and cannot be split, whereas for utf-16 the .rpartition
     # step can leave the sample with an odd number of bytes (broken).
     # This test exercises only the utf-16be case; the utf-16le and
     # broken utf-8 cases mentioned by those notes are not checked here.
     encoded = 'a\nb'.encode('utf-16be')
     encoding = get_encoding(pick_sample(encoded))
     assert encoding == 'utf-16be'
Exemple #3
0
 def test007_sample_picker_does_not_break_utf16be(self):
     # Notes inherited from the original test: single-byte iso8859-7 text
     # cannot be split mid-character, so it is not tested; utf-16 can end
     # up with an odd number of bytes (broken) because of the .rpartition
     # step. Only the utf-16be case is exercised in this method; utf-16le
     # and the broken utf-8 input are referred to by the notes but are not
     # checked here.
     codec = 'utf-16be'
     raw = u'a\nb'.encode(codec)
     picked = pick_sample(raw)
     self.assertEqual(get_encoding(picked), 'utf-16be')
Exemple #4
0
 def test009_sample_picker_fixes_utf_8(self):
     # Encode the text as utf-8 and drop its final byte; the literal itself
     # states that its last character is multibyte in utf-8.
     text = ("The last char is multibyte in utf-8\n"
             "will be trancated ->\u0a01")
     truncated = text.encode('utf-8')[:-1]
     # Record whether the truncated bytes really fail to decode, so the
     # final assertion also proves the input was broken to begin with.
     try:
         truncated.decode('utf-8')
     except UnicodeDecodeError:
         is_broken = True
     else:
         is_broken = False
     encoding = get_encoding(pick_sample(truncated))
     assert (encoding, is_broken) == ('utf-8', True)
Exemple #5
0
 def test009_sample_picker_fixes_utf_8(self):
     # Build utf-8 bytes from a text ending in \u0a01 (described by the
     # literal as multibyte in utf-8) and cut off the very last byte.
     source_text = (u"The last char is multibyte in utf-8\n"
                    u"will be trancated ->\u0a01")
     damaged = source_text.encode('utf-8')[:-1]
     # Confirm up front that the damaged bytes cannot be decoded; the flag
     # is checked together with the detected encoding below.
     try:
         damaged.decode('utf-8')
     except UnicodeDecodeError:
         undecodable = True
     else:
         undecodable = False
     detected = get_encoding(pick_sample(damaged))
     self.assertEqual((detected, undecodable), ('utf-8', True))
Exemple #6
0
 def test008_sample_picker_does_not_break_utf16le(self):
     # Encode as utf-16le, run the bytes through pick_sample, and expect
     # get_encoding to still report utf-16le for the resulting sample.
     encoded = 'test\u0a01input'.encode('utf-16le')
     encoding = get_encoding(pick_sample(encoded))
     assert encoding == 'utf-16le'
Exemple #7
0
 def test008_sample_picker_does_not_break_utf16le(self):
     # The sample taken from utf-16le bytes must still be detected as
     # utf-16le once it has passed through pick_sample.
     codec = 'utf-16le'
     raw = u'test\u0a01input'.encode(codec)
     picked = pick_sample(raw)
     self.assertEqual(get_encoding(picked), 'utf-16le')