# Imports for the names these tests use. The module paths are an assumption
# based on ftfy's layout at the time these tests were written. TEST_CASES is
# this module's table of (mojibake, intended_text) pairs, defined elsewhere
# in the file.
import unicodedata

from nose.tools import eq_

from ftfy import fix_text
from ftfy.badness import sequence_weirdness
from ftfy.fixes import apply_plan, fix_encoding_and_explain

try:  # unichr is the Python 2 name; on Python 3, chr does the same job
    unichr
except NameError:
    unichr = chr


def test_real_text():
    """
    Test with text actually found in the wild (mostly on Twitter).

    I collected test cases by listening to the Twitter streaming API for a
    million or so tweets, picking out examples with high weirdness according
    to ftfy version 2, and seeing what ftfy decoded them to. There are some
    impressive things that can happen to text, even in an ecosystem that is
    supposedly entirely UTF-8.

    TEST_CASES contains the most interesting examples of these, often with
    some trickiness in how to decode them into the actually intended text.

    For some reason, sampling Twitter gives no examples of text being
    accidentally decoded as Windows-1250, even though it's one of the more
    common encodings and this mojibake has been spotted in the wild. It may
    be that Windows-1250 is used in places that culturally don't use Twitter
    much (Central and Eastern Europe), and therefore nobody designs a
    Twitter app or bot to use Windows-1250. I've collected a couple of
    examples of Windows-1250 mojibake from elsewhere.
    """
    for orig, target in TEST_CASES:
        # Make sure that the fix_encoding step outputs a plan that we can
        # successfully run to reproduce its result
        encoding_fix, plan = fix_encoding_and_explain(orig)
        eq_(apply_plan(orig, plan), encoding_fix)

        # Make sure we can decode the text as intended
        eq_(fix_text(orig), target)

        # Make sure we can decode as intended even with an extra layer of
        # badness
        extra_bad = orig.encode('utf-8').decode('latin-1')
        eq_(fix_text(extra_bad), target)
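
# A minimal sketch of the kind of mojibake these assertions exercise: encode
# the intended text as UTF-8, mis-decode the bytes as Latin-1, and check that
# fix_text undoes the damage. The sample string is illustrative, not one of
# the collected TEST_CASES, and the function name is ours, not ftfy's.
def example_single_layer_mojibake():
    intended = 'schön'
    mojibake = intended.encode('utf-8').decode('latin-1')  # 'schÃ¶n'
    eq_(fix_text(mojibake), intended)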
def test_bmp_characters():
    for index in range(0xa0, 0xfffd):
        char = unichr(index)
        # Exclude code points that are unassigned, private use, surrogates,
        # or combining marks
        if unicodedata.category(char) not in ('Co', 'Cn', 'Cs', 'Mc', 'Mn'):
            garble = char.encode('utf-8').decode('latin-1')
            # Exclude characters whose re-encoding is protected by the
            # 'sequence_weirdness' metric
            if sequence_weirdness(garble) >= 0:
                garble2 = garble.encode('utf-8').decode('latin-1')
                for garb in (garble, garble2):
                    fixed, plan = fix_encoding_and_explain(garb)
                    eq_(fixed, char)
                    eq_(apply_plan(garb, plan), char)
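
# A worked instance of the round trip test_bmp_characters performs on every
# character: garble one code point through a UTF-8-as-Latin-1 mis-decoding,
# then check that fix_encoding_and_explain recovers it and that replaying the
# returned plan reproduces the same answer. The choice of 'é' is illustrative,
# and the function name is ours.
def example_bmp_round_trip():
    char = '\u00e9'                                  # 'é'
    garble = char.encode('utf-8').decode('latin-1')  # 'Ã©'
    if sequence_weirdness(garble) >= 0:              # same guard as the test
        fixed, plan = fix_encoding_and_explain(garble)
        eq_(fixed, char)
        eq_(apply_plan(garble, plan), char)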
def test_json_example(test_case):
    # Run one example from the data file
    orig = test_case['original']
    fixed = test_case['fixed']

    # Make sure that the fix_encoding step outputs a plan that we can
    # successfully run to reproduce its result
    encoding_fix, plan = fix_encoding_and_explain(orig)
    assert apply_plan(orig, plan) == encoding_fix

    # Make sure we can decode the text as intended
    assert fix_text(orig) == fixed
    assert encoding_fix == test_case.get('fixed-encoding', fixed)

    # Make sure we can decode as intended even with an extra layer of badness
    extra_bad = orig.encode('utf-8').decode('latin-1')
    assert fix_text(extra_bad) == fixed
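
# A hypothetical record shaped like the entries test_json_example reads from
# the data file: an 'original' and a 'fixed' key, plus an optional
# 'fixed-encoding' key for cases where fix_text does more than the encoding
# repair alone. The values below are illustrative, not taken from the real
# data file.
def example_json_case():
    test_case = {
        'original': 'schÃ¶n',  # UTF-8 for 'schön', mis-decoded as Latin-1
        'fixed': 'schön',
    }
    test_json_example(test_case)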