def test004_get_encoding(self):
    # Encode one reference text with every candidate encoding, run
    # get_encoding over each result (passing the same candidate list), and
    # expect the detected names to come back in the same order.
    candidates = ['utf-8', 'iso8859-7', 'utf-16', 'utf-16le', 'utf-16be']
    reference_text = u'Ένα test string for encoding detection purposes\n'

    # Encode everything first, then detect, so the call order is unchanged.
    payloads = [reference_text.encode(name) for name in candidates]
    detected = [
        get_encoding(payload, encodings=candidates) for payload in payloads
    ]

    self.assertEqual(candidates, detected)
def test007_sample_picker_does_not_break_utf16be(self):
    # Check that the sample pick_sample returns for utf-16be input is still
    # detected as utf-16be by get_encoding.
    #
    # iso8859-7 doesn't have to be tested because every character is
    # encoded in 1 byte and cannot be broken.
    # utf-16le and utf-16be are tested because the .rpartition can make the
    # string end up with an odd number of bytes (broken).
    # pick_sample is also tested with a broken utf-8 input to see whether it
    # can fix it.
    inp = u'a\nb'  # explicit unicode literal, like the other tests' inputs
    inp = inp.encode('utf-16be')
    sample = pick_sample(inp)
    encoding = get_encoding(sample)
    # assertEqual rather than a bare assert: it is not stripped under
    # ``python -O`` and it reports both values when the check fails.
    self.assertEqual(encoding, 'utf-16be')
def test007_sample_picker_does_not_break_utf16be(self):
    # Single-byte encodings such as iso8859-7 need no test of this kind: a
    # character can never be cut in half there.
    # The two-byte utf-16 variants do need one, because the .rpartition step
    # may leave an odd (i.e. broken) number of bytes behind.
    # Broken utf-8 input to pick_sample is exercised separately.
    expected = 'utf-16be'
    raw = u'a\nb'.encode('utf-16be')
    detected = get_encoding(pick_sample(raw))
    self.assertEqual(detected, expected)
def test009_sample_picker_fixes_utf_8(self):
    # Feed pick_sample utf-8 bytes whose final multibyte character has been
    # cut short, then check that the resulting sample is detected as utf-8.
    #
    # The literals carry the u prefix so that \u0a01 is a real unicode
    # escape on Python 2 as well. As a plain str literal it would be six
    # ASCII characters there, and dropping the last byte would not produce
    # undecodable input.
    inp = (u"The last char is multibyte in utf-8\n"
           u"will be trancated ->\u0a01").encode('utf-8')
    broken_inp = inp[:-1]
    # broken_inp is broken and can't be decoded; record that as a sanity
    # check on the test input itself.
    isBroken = False
    try:
        broken_inp.decode('utf-8')
    except UnicodeDecodeError:
        isBroken = True
    sample = pick_sample(broken_inp)
    encoding = get_encoding(sample)
    # assertEqual rather than a bare assert: it is not stripped under
    # ``python -O`` and it reports both tuples when the check fails.
    self.assertEqual((encoding, isBroken), ('utf-8', True))
def test009_sample_picker_fixes_utf_8(self):
    # Build utf-8 bytes that end in a multibyte character, then drop the
    # final byte so the tail is an incomplete sequence.
    text = (u"The last char is multibyte in utf-8\n"
            u"will be trancated ->\u0a01")
    truncated = text.encode('utf-8')[:-1]

    # Confirm the fixture really is undecodable before handing it over.
    try:
        truncated.decode('utf-8')
    except UnicodeDecodeError:
        undecodable = True
    else:
        undecodable = False

    detected = get_encoding(pick_sample(truncated))
    self.assertEqual((detected, undecodable), ('utf-8', True))
def test008_sample_picker_does_not_break_utf16le(self):
    # Check that the sample pick_sample returns for utf-16le input is still
    # detected as utf-16le by get_encoding.
    #
    # The u prefix makes \u0a01 a real unicode escape on Python 2 as well.
    # Without it the input there would be plain ASCII containing a literal
    # backslash sequence, and no non-ASCII code point would be tested.
    inp = u'test\u0a01input'
    inp = inp.encode('utf-16le')
    sample = pick_sample(inp)
    encoding = get_encoding(sample)
    # assertEqual rather than a bare assert: it is not stripped under
    # ``python -O`` and it reports both values when the check fails.
    self.assertEqual(encoding, 'utf-16le')
def test008_sample_picker_does_not_break_utf16le(self):
    # utf-16le counterpart of the utf-16be sample test: encode a text that
    # contains a non-ASCII code point, pick a sample from the bytes, and
    # expect the sample to be detected as utf-16le.
    expected = 'utf-16le'
    raw = u'test\u0a01input'.encode('utf-16le')
    detected = get_encoding(pick_sample(raw))
    self.assertEqual(detected, expected)