Example #1
def test_real_tweets():
    """
    Test with text actually found on Twitter.

    I collected these test cases by listening to the Twitter streaming API for
    a million or so tweets, picking out examples with high weirdness according
    to ftfy version 2, and seeing what ftfy decoded them to. There are some
    impressive things that can happen to text, even in an ecosystem that is
    supposedly entirely UTF-8.

    The tweets that appear in TEST_CASES are the most interesting examples of
    these, often involving some trickiness in decoding them into the text that
    was actually intended.
    """
    for orig, target in TEST_CASES:
        # make sure that the fix_encoding step outputs a plan that we can
        # successfully run to reproduce its result
        encoding_fix, plan = fix_encoding_and_explain(orig)
        eq_(apply_plan(orig, plan), encoding_fix)

        # make sure we can decode the text as intended
        eq_(fix_text(orig), target)

        # make sure we can decode as intended even with an extra layer of badness
        extra_bad = orig.encode('utf-8').decode('latin-1')
        eq_(fix_text(extra_bad), target)
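
A minimal sketch of the mojibake pattern this test exercises, using a made-up sample phrase rather than one of the collected tweets: UTF-8 bytes mis-read as Latin-1 produce the familiar "Ã©"-style garbage, and ftfy.fix_text is expected to undo it, even when the damage is applied twice.

import ftfy

# 'é' encodes to the UTF-8 bytes 0xC3 0xA9; reading those bytes as Latin-1
# yields the two-character mojibake 'Ã©'.
original = "café crème"
garbled = original.encode("utf-8").decode("latin-1")
print(repr(garbled))              # 'cafÃ© crÃ¨me'
print(ftfy.fix_text(garbled))     # should recover 'café crème'

# The "extra layer of badness" from the test: garble the garbled text again.
extra_bad = garbled.encode("utf-8").decode("latin-1")
print(repr(extra_bad))            # 'cafÃ\x83Â© crÃ\x83Â¨me'
print(ftfy.fix_text(extra_bad))   # ftfy is designed to peel off both layers
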
Example #2
def test_real_text():
    """
    Test with text actually found in the wild (mostly on Twitter).

    I collected test cases by listening to the Twitter streaming API for
    a million or so tweets, picking out examples with high weirdness according
    to ftfy version 2, and seeing what ftfy decoded them to. There are some
    impressive things that can happen to text, even in an ecosystem that is
    supposedly entirely UTF-8.

    TEST_CASES contains the most interesting examples of these, often involving
    some trickiness in decoding them into the text that was actually intended.

    For some reason, sampling Twitter gives no examples of text being
    accidentally decoded as Windows-1250, even though it's one of the more
    common encodings and this mojibake has been spotted in the wild. It may be
    that Windows-1250 is used in places that culturally don't use Twitter much
    (Central and Eastern Europe), and therefore nobody designs a Twitter app or
    bot to use Windows-1250. I've collected a couple of examples of
    Windows-1250 mojibake from elsewhere.
    """
    for orig, target in TEST_CASES:
        # make sure that the fix_encoding step outputs a plan that we can
        # successfully run to reproduce its result
        encoding_fix, plan = fix_encoding_and_explain(orig)
        eq_(apply_plan(orig, plan), encoding_fix)

        # make sure we can decode the text as intended
        eq_(fix_text(orig), target)

        # make sure we can decode as intended even with an extra layer of badness
        extra_bad = orig.encode('utf-8').decode('latin-1')
        eq_(fix_text(extra_bad), target)
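
The Windows-1250 mojibake mentioned in the docstring can be reproduced the same way. A sketch with an illustrative Czech phrase (not one of the collected test cases): encode it as UTF-8, mis-decode the bytes as Windows-1250, and see what fix_text makes of the result; very short or ambiguous inputs may be left alone.

import ftfy

phrase = "Česká republika"
# UTF-8 bytes of the Czech characters, read back as Windows-1250, turn into
# sequences like 'ÄŚ' and 'Ăˇ'.
mojibake = phrase.encode("utf-8").decode("windows-1250")
print(repr(mojibake))           # 'ÄŚeskĂˇ republika'
print(ftfy.fix_text(mojibake))  # should detect the Windows-1250 layer and restore the phrase
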
Example #5
def test_all_bmp_characters():
    for index in range(0xa0, 0xfffd):
        char = unichr(index)
        # Exclude code points that are not assigned
        if unicodedata.category(char) not in ('Co', 'Cn', 'Cs', 'Mc', 'Mn'):
            garble = char.encode('utf-8').decode('latin-1')
            garble2 = char.encode('utf-8').decode('latin-1').encode('utf-8').decode('latin-1')
            for garb in (garble, garble2):
                fixed, plan = fix_encoding_and_explain(garb)
                eq_(fixed, char)
                eq_(apply_plan(garb, plan), char)
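
For one concrete character, here is what garble and garble2 look like (a sketch, not part of the test): the single mojibake of 'é' is 'Ã©', and pushing it through the UTF-8/Latin-1 round trip a second time yields four characters, one of them a C1 control character. In the ftfy release these tests target, fix_encoding_and_explain maps both forms back to 'é'; other releases may treat one-character inputs more conservatively.

from ftfy import fix_encoding_and_explain

char = "é"                                         # U+00E9
garble = char.encode("utf-8").decode("latin-1")
garble2 = garble.encode("utf-8").decode("latin-1")
print(repr(garble))      # 'Ã©'
print(repr(garble2))     # 'Ã\x83Â©' -- the second pass introduces the control char U+0083

fixed, plan = fix_encoding_and_explain(garble2)
print(repr(fixed))       # the test above expects 'é' here
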
Example #6
def test_all_bmp_characters():
    for index in range(0xa0, 0xfffd):
        char = unichr(index)
        # Exclude code points that are not assigned
        if unicodedata.category(char) not in ('Co', 'Cn', 'Cs', 'Mc', 'Mn'):
            garble = char.encode('utf-8').decode('latin-1')
            if not (index < 0x800 and ENDING_PUNCT_RE.search(garble)):
                garble2 = char.encode('utf-8').decode('latin-1').encode(
                    'utf-8').decode('latin-1')
                for garb in (garble, garble2):
                    fixed, plan = fix_encoding_and_explain(garb)
                    eq_(fixed, char)
                    eq_(apply_plan(garb, plan), char)
Example #7
def test_bmp_characters():
    for index in range(0xa0, 0xfffd):
        char = unichr(index)
        # Exclude code points that are not assigned
        if unicodedata.category(char) not in ('Co', 'Cn', 'Cs', 'Mc', 'Mn', 'Sk'):
            garble = char.encode('utf-8').decode('latin-1')
            # Exclude characters whose re-encoding is protected by the
            # 'sequence_weirdness' metric
            if sequence_weirdness(garble) >= 0:
                garble2 = char.encode('utf-8').decode('latin-1').encode('utf-8').decode('latin-1')
                for garb in (garble, garble2):
                    fixed, plan = fix_encoding_and_explain(garb)
                    eq_(fixed, char)
                    eq_(apply_plan(garb, plan), char)
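
A sketch of the sequence_weirdness exclusion, assuming an ftfy release (4.x or 5.x) that still ships ftfy.badness.sequence_weirdness (ftfy 6 replaced it with a different badness metric): a non-negative score means the garbled form looks weird enough that ftfy may re-decode it, while a negative score marks text the heuristic treats as plausible, which the loop above skips.

from ftfy.badness import sequence_weirdness   # available in ftfy 4.x/5.x

char = "\u2014"                                  # EM DASH
garble = char.encode("utf-8").decode("latin-1")  # 'â\x80\x94'
print(repr(garble), sequence_weirdness(garble))

# The loop above only tests garbles scoring >= 0; anything negative is
# "protected" -- it looks like ordinary text, so ftfy won't touch it.
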
Example #8
def test_bmp_characters():
    for index in range(0xa0, 0xfffd):
        char = unichr(index)
        # Exclude code points that are not assigned
        if unicodedata.category(char) not in ('Co', 'Cn', 'Cs', 'Mc', 'Mn'):
            garble = char.encode('utf-8').decode('latin-1')
            # Exclude characters whose re-encoding is protected by the
            # 'sequence_weirdness' metric
            if sequence_weirdness(garble) >= 0:
                garble2 = char.encode('utf-8').decode('latin-1').encode('utf-8').decode('latin-1')
                for garb in (garble, garble2):
                    fixed, plan = fix_encoding_and_explain(garb)
                    eq_(fixed, char)
                    eq_(apply_plan(garb, plan), char)
Example #9
def test_json_example(test_case):
    # Run one example from the data file
    orig = test_case['original']
    fixed = test_case['fixed']

    # Make sure that the fix_encoding step outputs a plan that we can
    # successfully run to reproduce its result
    encoding_fix, plan = fix_encoding_and_explain(orig)
    assert apply_plan(orig, plan) == encoding_fix

    # Make sure we can decode the text as intended
    assert fix_text(orig) == fixed
    assert encoding_fix == test_case.get('fixed-encoding', fixed)

    # Make sure we can decode as intended even with an extra layer of badness
    extra_bad = orig.encode('utf-8').decode('latin-1')
    assert fix_text(extra_bad) == fixed
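
A hedged sketch of how test_json_example might be driven, assuming pytest and a JSON file laid out as a list of objects with the keys the test reads ('original', 'fixed', and an optional 'fixed-encoding'); the file name and the parametrization are assumptions, not the project's actual harness.

import json
import pytest
from ftfy import fix_text

def load_test_cases(path="test_cases.json"):
    # Each entry is expected to carry at least 'original' and 'fixed';
    # 'fixed-encoding' is optional and defaults to 'fixed' in the test above.
    with open(path, encoding="utf-8") as f:
        return json.load(f)

@pytest.mark.parametrize("test_case", load_test_cases())
def test_json_example_sketch(test_case):
    # Only the fix_text assertion is repeated here; the full set of checks
    # is shown in the example above.
    assert fix_text(test_case["original"]) == test_case["fixed"]
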
Example #10
def check_example(val):
    # Run one example from the data file
    orig = val['original']
    fixed = val['fixed']

    # Make sure that the fix_encoding step outputs a plan that we can
    # successfully run to reproduce its result
    encoding_fix, plan = fix_encoding_and_explain(orig)
    eq_(apply_plan(orig, plan), encoding_fix)

    # Make sure we can decode the text as intended
    eq_(fix_text(orig), fixed)
    eq_(encoding_fix, val.get('fixed-encoding', fixed))

    # Make sure we can decode as intended even with an extra layer of badness
    extra_bad = orig.encode('utf-8').decode('latin-1')
    eq_(fix_text(extra_bad), fixed)