def test_control_characters(self):
    # Feed every code point 0..31 through sanitize_utf8() and check both the
    # sanitized output and the warnings emitted along the way.
    delivered = []

    def show(message, category, filename, lineno, file=None, line=None):
        # Record the warning so the test can tell afterwards that the hook
        # really ran, then verify its type and text by re-raising it.
        delivered.append(message)
        with assert_raises_regex(EncodingWarning, '.*control character.*'):
            raise message

    s = ''.join(map(chr, xrange(32)))
    with warnings.catch_warnings():
        # Without an explicit filter, a warning already registered for this
        # location (e.g. on a second run in the same process) would be
        # suppressed and never reach show().
        warnings.filterwarnings('always', category=EncodingWarning)
        warnings.showwarning = show
        t = sanitize_utf8(s).decode('UTF-8')
    # Everything except TAB, LF and CR is expected to become U+FFFD.
    assert_equal(
        t,
        u'\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD'
        u'\uFFFD\t\n\uFFFD\uFFFD\r\uFFFD\uFFFD'
        u'\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD'
        u'\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD\uFFFD')
    # Previously the test passed silently when no warning was emitted at all.
    if not delivered:
        raise AssertionError('no EncodingWarning was emitted')
def test_non_utf8(self):
    # Input containing a byte sequence that is not valid UTF-8 must have the
    # offending part replaced by U+FFFD, and a warning must be emitted.
    delivered = []

    def show(message, category, filename, lineno, file=None, line=None):
        # Record the warning so the test can tell afterwards that the hook
        # really ran, then verify its type and text by re-raising it.
        delivered.append(message)
        with assert_raises_regex(EncodingWarning, '.* invalid continuation byte'):
            raise message

    s0 = 'Jeżu klątw, spłódź Finom część gry hańb'
    good = 'ó'
    # Re-encode one character as ISO-8859-2 to corrupt the UTF-8 input.
    bad = good.decode('UTF-8').encode('ISO-8859-2')
    s1 = s0.replace(good, bad)
    # Expected output: the same text with that character replaced by U+FFFD.
    s2 = s0.replace(good, u'\N{REPLACEMENT CHARACTER}'.encode('UTF-8'))
    with warnings.catch_warnings():
        # Without an explicit filter, a warning already registered for this
        # location (e.g. on a second run in the same process) would be
        # suppressed and never reach show().
        warnings.filterwarnings('always', category=EncodingWarning)
        warnings.showwarning = show
        t = sanitize_utf8(s1)
    assert_equal(s2, t)
    # Previously the test passed silently when no warning was emitted at all.
    if not delivered:
        raise AssertionError('no EncodingWarning was emitted')
def test_utf8(self):
    # Non-ASCII input is expected to pass through sanitize_utf8() unchanged.
    # EncodingWarning is escalated to an exception, so any such warning
    # fails the test instead of going unnoticed.
    original = 'Jeżu klątw, spłódź Finom część gry hańb'
    with warnings.catch_warnings():
        warnings.simplefilter('error', EncodingWarning)
        sanitized = sanitize_utf8(original)
    assert_equal(original, sanitized)
def test_ascii(self):
    # Plain ASCII input is expected to pass through sanitize_utf8()
    # unchanged. EncodingWarning is escalated to an exception, so any such
    # warning fails the test instead of going unnoticed.
    original = 'The quick brown fox jumps over the lazy dog'
    with warnings.catch_warnings():
        warnings.simplefilter('error', EncodingWarning)
        sanitized = sanitize_utf8(original)
    assert_equal(original, sanitized)