def test_real_text():
    """
    Check ftfy against mojibake that was really observed in the wild, most of
    it on Twitter.

    The cases were gathered by sampling roughly a million tweets from the
    Twitter streaming API, keeping the ones that ftfy version 2 scored as
    highly weird, and looking at what ftfy turned them into. Even an
    ecosystem that is nominally all UTF-8 manages to mangle text in
    impressive ways.

    TEST_CASES holds the most interesting of those finds, many of which are
    tricky to decode back to the text that was actually meant.

    Curiously, the Twitter sample never showed text mis-decoded as
    Windows-1250, although that encoding is fairly common and its mojibake
    has been seen elsewhere. A possible explanation is that Windows-1250 is
    used in regions (Central and Eastern Europe) where Twitter is culturally
    less popular, so nobody builds Twitter apps or bots around it. A couple
    of Windows-1250 examples were therefore collected from other sources.
    """
    for mangled, expected in TEST_CASES:
        # The plan reported by fix_encoding_and_explain has to be replayable:
        # applying it to the input must give the same fixed string.
        fixed_encoding, plan = fix_encoding_and_explain(mangled)
        replayed = apply_plan(mangled, plan)
        eq_(replayed, fixed_encoding)

        # The full pipeline has to recover the intended text.
        eq_(fix_text(mangled), expected)

        # Pile one more UTF-8-read-as-Latin-1 layer on top; the result must
        # still come out as the intended text.
        doubly_mangled = mangled.encode('utf-8').decode('latin-1')
        eq_(fix_text(doubly_mangled), expected)
def test_real_tweets():
    """
    Check ftfy against mojibake that was really observed on Twitter.

    The cases were gathered by sampling roughly a million tweets from the
    Twitter streaming API, keeping the ones that ftfy version 2 scored as
    highly weird, and looking at what ftfy turned them into. Even an
    ecosystem that is nominally all UTF-8 manages to mangle text in
    impressive ways.

    The tweets in TEST_CASES are the most interesting of those finds, each
    with some trickiness in decoding it back to the text that was actually
    meant.
    """
    for tweet, expected in TEST_CASES:
        # The plan reported by fix_encoding_and_explain has to be replayable:
        # applying it to the input must give the same fixed string.
        fixed_encoding, plan = fix_encoding_and_explain(tweet)
        replayed = apply_plan(tweet, plan)
        eq_(replayed, fixed_encoding)

        # The full pipeline has to recover the intended text.
        eq_(fix_text(tweet), expected)

        # Pile one more UTF-8-read-as-Latin-1 layer on top; the result must
        # still come out as the intended text.
        doubly_mangled = tweet.encode('utf-8').decode('latin-1')
        eq_(fix_text(doubly_mangled), expected)
Example #3
0
def test_failing_json_example(test_case):
    """
    Run one data-file example that is believed to fail because ftfy's
    heuristic is insufficient for it.

    `test_case` is a mapping with an 'original' string, a 'fixed' string,
    and optionally a 'fixed-encoding' string that overrides 'fixed' as the
    expected output of the encoding-fix step.
    """
    mangled = test_case['original']
    fully_fixed = test_case['fixed']
    # When no separate 'fixed-encoding' value is given, the encoding step is
    # expected to produce the fully fixed text by itself.
    expected = test_case.get('fixed-encoding', fully_fixed)

    # Only the fixed text is checked here; the plan is not used.
    encoding_fix, _plan = fix_encoding_and_explain(mangled)
    assert encoding_fix == expected
Example #4
0
def test_all_bmp_characters():
    """
    Round-trip code points U+00A0..U+FFFC through one and two layers of
    UTF-8-read-as-Latin-1 mojibake and check that each is restored, both
    directly and by replaying the reported plan.
    """
    # General categories left out of the test: private use (Co), unassigned
    # (Cn), surrogates (Cs), and the combining marks Mc and Mn.
    skipped_categories = ('Co', 'Cn', 'Cs', 'Mc', 'Mn')

    for codepoint in range(0xa0, 0xfffd):
        char = unichr(codepoint)
        if unicodedata.category(char) in skipped_categories:
            continue

        garble = char.encode('utf-8').decode('latin-1')
        # Skip code points below U+0800 whose garbled form matches
        # ENDING_PUNCT_RE.
        if codepoint < 0x800 and ENDING_PUNCT_RE.search(garble):
            continue

        # A second layer of the same mis-decoding on top of the first.
        garble2 = garble.encode('utf-8').decode('latin-1')
        for garb in (garble, garble2):
            fixed, plan = fix_encoding_and_explain(garb)
            eq_(fixed, char)
            eq_(apply_plan(garb, plan), char)
Example #5
0
def test_bmp_characters():
    """
    Round-trip code points U+00A0..U+FFFC through one and two layers of
    UTF-8-read-as-Latin-1 mojibake and check that each is restored, both
    directly and by replaying the reported plan.
    """
    # General categories left out of the test: private use (Co), unassigned
    # (Cn), surrogates (Cs), and the combining marks Mc and Mn.
    skipped_categories = ('Co', 'Cn', 'Cs', 'Mc', 'Mn')

    for codepoint in range(0xa0, 0xfffd):
        char = unichr(codepoint)
        if unicodedata.category(char) in skipped_categories:
            continue

        garble = char.encode('utf-8').decode('latin-1')
        # Skip characters whose re-encoding is protected by the
        # 'sequence_weirdness' metric (a negative score for the garbled form).
        if sequence_weirdness(garble) < 0:
            continue

        # A second layer of the same mis-decoding on top of the first.
        garble2 = garble.encode('utf-8').decode('latin-1')
        for garb in (garble, garble2):
            fixed, plan = fix_encoding_and_explain(garb)
            eq_(fixed, char)
            eq_(apply_plan(garb, plan), char)
Example #6
0
def test_json_example(test_case):
    """
    Run one example from the data file.

    `test_case` is a mapping with an 'original' string, a 'fixed' string,
    and optionally a 'fixed-encoding' string that overrides 'fixed' as the
    expected output of the encoding-fix step.
    """
    mangled = test_case['original']
    expected = test_case['fixed']

    # The plan reported by fix_encoding_and_explain has to be replayable:
    # applying it to the input must give the same fixed string.
    encoding_fix, plan = fix_encoding_and_explain(mangled)
    replayed = apply_plan(mangled, plan)
    assert replayed == encoding_fix

    # The full pipeline has to recover the intended text, and the encoding
    # step alone has to produce its own expected value (which falls back to
    # the fully fixed text when 'fixed-encoding' is absent).
    assert fix_text(mangled) == expected
    assert encoding_fix == test_case.get('fixed-encoding', expected)

    # Pile one more UTF-8-read-as-Latin-1 layer on top; the result must
    # still come out as the intended text.
    doubly_mangled = mangled.encode('utf-8').decode('latin-1')
    assert fix_text(doubly_mangled) == expected
Example #7
0
def check_example(val):
    """
    Run one example from the data file.

    `val` is a mapping with an 'original' string, a 'fixed' string, and
    optionally a 'fixed-encoding' string that overrides 'fixed' as the
    expected output of the encoding-fix step.
    """
    mangled = val['original']
    expected = val['fixed']

    # The plan reported by fix_encoding_and_explain has to be replayable:
    # applying it to the input must give the same fixed string.
    encoding_fix, plan = fix_encoding_and_explain(mangled)
    replayed = apply_plan(mangled, plan)
    eq_(replayed, encoding_fix)

    # The full pipeline has to recover the intended text, and the encoding
    # step alone has to produce its own expected value (which falls back to
    # the fully fixed text when 'fixed-encoding' is absent).
    eq_(fix_text(mangled), expected)
    eq_(encoding_fix, val.get('fixed-encoding', expected))

    # Pile one more UTF-8-read-as-Latin-1 layer on top; the result must
    # still come out as the intended text.
    doubly_mangled = mangled.encode('utf-8').decode('latin-1')
    eq_(fix_text(doubly_mangled), expected)
Example #8
0
async def handle_request(request):
    """
    Render the index page.

    When the request carries an ``s`` query argument, its first value is
    stripped, run through fix_encoding_and_explain, and the page shows the
    fixed text plus the steps rendered by steps_to_python. Otherwise the
    page is rendered with empty result fields. The example list is included
    either way.
    """
    values = request.args.getlist('s')
    if not values:
        # Nothing submitted: blank form and results.
        return response.html(INDEX.format(
            output='',
            s='',
            steps='',
            examples='\n'.join(examples),
        ))

    s = values[0].strip()
    fixed, steps = fix_encoding_and_explain(s)

    # Everything derived from the submitted text is escaped before it is
    # placed into the page.
    output_html = '<textarea>{}</textarea>'.format(escape(fixed))
    steps_html = escape(steps_to_python(s, steps))
    page = INDEX.format(
        output=output_html,
        steps=steps_html,
        s=escape(s),
        examples='\n'.join(examples),
    )
    return response.html(page)
Example #9
0
async def homepage(request):
    """
    Render the index page.

    When the request carries an ``s`` query parameter, its first value is
    stripped, run through fix_encoding_and_explain, and the page shows the
    fixed text plus the steps rendered by steps_to_python. Otherwise the
    page is rendered with empty result fields. The example list is included
    either way.
    """
    values = request.query_params.getlist("s")
    if not values:
        # Nothing submitted: blank form and results.
        return HTMLResponse(
            INDEX.format(
                output="",
                s="",
                steps="",
                examples="\n".join(examples),
            ))

    s = values[0].strip()
    fixed, steps = fix_encoding_and_explain(s)

    # Everything derived from the submitted text is escaped before it is
    # placed into the page.
    output_html = "<textarea>{}</textarea>".format(escape(fixed))
    steps_html = escape(steps_to_python(s, steps))
    page = INDEX.format(
        output=output_html,
        steps=steps_html,
        s=escape(s),
        examples="\n".join(examples),
    )
    return HTMLResponse(page)
Example #10
0
    </p>
    <p>
        <input type="submit" value="Figure out encoding errors">
    </p>
</form>
<pre>{steps}</pre>
{output}
<h3>Examples</h3>
{examples}
<p style="font-size: 0.7em">Web app <a href="https://github.com/simonw/ftfy-web">source code on GitHub</a></p>
</html>
"""

# Build the HTML fragments for the example list once, at import time. Only
# entries of EXAMPLES for which fix_encoding_and_explain reports a non-empty
# list of steps get a link.
examples = ["<ul>"]
for example in EXAMPLES:
    steps = fix_encoding_and_explain(example)[1]
    if not steps:
        continue
    examples.append(
        '<li><a href="?{}">{}</a></li>'.format(
            urlencode({"s": example}),
            escape(example),
        )
    )
examples.append("</ul>")


def steps_to_python(s, steps):
    python = ["s = {}".format(repr(s))]
    lines = []
    has_sloppy = False
    extra_imports = set()
    for method, encoding, _ in steps:
        if method == "transcode":
            extra_imports.add(encoding)
            line = "s = {}(s)".format(encoding)