コード例 #1
0
def test_unicode_9():
    """A codepoint added in Unicode 9 should not be flagged as weird."""
    # 'bɪg'.upper() on Python 3.6+ yields this string, containing the
    # Unicode 9 addition U+A7AE LATIN CAPITAL LETTER SMALL CAPITAL I.
    uppercased = "BꞮG"
    assert sequence_weirdness(uppercased) == 0

    # By contrast, a string holding a definitely-unassigned codepoint
    # should register as weirder.
    with_unassigned = "B\U00090000G"
    assert sequence_weirdness(with_unassigned) == 2
コード例 #2
0
def test_unicode_9():
    """A codepoint added in Unicode 9 should not be flagged as weird."""
    # 'bɪg'.upper() on Python 3.6+ produces this string, which contains
    # the new codepoint U+A7AE LATIN CAPITAL LETTER SMALL CAPITAL I.
    capitalized = "BꞮG"
    eq_(sequence_weirdness(capitalized), 0)

    # A definitely-unassigned codepoint, however, should score weirder.
    unassigned_sample = "B\U00090000G"
    eq_(sequence_weirdness(unassigned_sample), 2)
コード例 #3
0
def test_unicode_10():
    """Zanabazar Square text (new in Unicode 10) must not look weird."""
    # The word "thalīṃ" written in the Zanabazar Square script, a script
    # added in Unicode 10. Python 3.7 reports these codepoints as
    # assigned, and ftfy should recognize them on all versions for
    # consistency.
    zanabazar_word = "\U00011A1A\U00011A2C\U00011A01\U00011A38"
    assert sequence_weirdness(zanabazar_word) == 0
コード例 #4
0
def test_unicode_10():
    """Characters from Unicode 10 should be scored as ordinary text."""
    # "thalīṃ" spelled in the Zanabazar Square script (added in
    # Unicode 10). Python 3.7 knows these codepoints are assigned, so
    # ftfy should treat them consistently on every version.
    word = "\U00011A1A\U00011A2C\U00011A01\U00011A38"
    assert sequence_weirdness(word) == 0
コード例 #5
0
def is_mojibake(field):
    """Return True if *field* appears to contain mojibake.

    CSV files we commonly handle are UTF-8 text that was mistakenly
    *decoded* as something else, such as CP-1252 (Windows Latin). The
    damage shows up as "mojibake", for example:

        - CIAT Publicaçao
        - CIAT Publicación

    The excellent "fixes text for you" (ftfy) library supplies the
    weirdness metric used here to decide whether a string was encoded in
    one encoding and decoded in another.

    Inspired by a code snippet from Martijn Pieters on StackOverflow:
    https://stackoverflow.com/questions/29071995/identify-garbage-unicode-string-using-python

    Return boolean.
    """
    # Nothing weird in the sequence: assume the text is fine.
    if not sequence_weirdness(field):
        return False
    try:
        field.encode("sloppy-windows-1252")
    except UnicodeEncodeError:
        # Not representable in CP-1252 — probably genuine text.
        return False
    # Weird *and* CP-1252 encodable: mojibake alert level high.
    return True
コード例 #6
0
def test_bmp_characters():
    """Round-trip every eligible BMP character through latin-1 mangling."""
    for codepoint in range(0xa0, 0xfffd):
        char = unichr(codepoint)
        # Skip code points that are not assigned (plus private-use,
        # surrogate, mark, and modifier-symbol categories).
        if unicodedata.category(char) in ('Co', 'Cn', 'Cs', 'Mc', 'Mn', 'Sk'):
            continue
        mangled_once = char.encode('utf-8').decode('latin-1')
        # Skip characters whose re-encoding the 'sequence_weirdness'
        # metric already protects against.
        if sequence_weirdness(mangled_once) < 0:
            continue
        mangled_twice = mangled_once.encode('utf-8').decode('latin-1')
        for candidate in (mangled_once, mangled_twice):
            repaired, plan = fix_encoding_and_explain(candidate)
            eq_(repaired, char)
            eq_(apply_plan(candidate, plan), char)
コード例 #7
0
def test_bmp_characters():
    """Check that mangled BMP characters are repaired to the original."""
    skipped_categories = ('Co', 'Cn', 'Cs', 'Mc', 'Mn')
    for index in range(0xa0, 0xfffd):
        char = unichr(index)
        # Only consider assigned, non-mark, non-private code points.
        if unicodedata.category(char) not in skipped_categories:
            once = char.encode('utf-8').decode('latin-1')
            # Characters whose re-encoding is protected by the
            # 'sequence_weirdness' metric are excluded.
            if sequence_weirdness(once) >= 0:
                twice = once.encode('utf-8').decode('latin-1')
                for mangled in (once, twice):
                    repaired, plan = fix_encoding_and_explain(mangled)
                    eq_(repaired, char)
                    eq_(apply_plan(mangled, plan), char)
コード例 #8
0
def test_unicode_11():
    """Mtavruli Georgian letters (Unicode 11) should read as normal text."""
    # Unicode 11 implemented the mtavruli form of the Georgian script,
    # analogous to capital letters: it can emphasize text or write a
    # headline. Python converts to this form via .upper() on Georgian
    # text starting in 3.7.0, so the result must be recognized as
    # reasonable text on all versions.
    #
    # The sample below is the mtavruli form of "ქართული ენა", meaning
    # "Georgian language".
    headline = 'ᲥᲐᲠᲗᲣᲚᲘ ᲔᲜᲐ'
    assert sequence_weirdness(headline) == 0

    # Its CP-1252 mojibake should be repaired back to the original.
    garbled = headline.encode('utf-8').decode('sloppy-windows-1252')
    assert fix_encoding(garbled) == headline
コード例 #9
0
def test_unicode_11():
    """Georgian mtavruli text (added in Unicode 11) is ordinary text."""
    # Unicode 11 introduced the mtavruli form of the Georgian script,
    # which plays the role of capital letters for emphasis and headlines.
    # From Python 3.7.0, str.upper() on Georgian text yields this form,
    # and it must look reasonable on every Python version.
    #
    # This is the mtavruli spelling of "ქართული ენა" ("Georgian
    # language").
    sample = 'ᲥᲐᲠᲗᲣᲚᲘ ᲔᲜᲐ'
    assert sequence_weirdness(sample) == 0

    # Mojibake built from the sample should be restored exactly.
    broken = sample.encode('utf-8').decode('sloppy-windows-1252')
    assert fix_encoding(broken) == sample
コード例 #10
0
def test_emoji_skintone_selector():
    """Skin-tone selectors after emoji must not be counted as weird."""
    # Four Santa Claus emoji, each followed by a different skin-tone
    # selector — Santa Clauses of all colors are welcome.
    santas = '🎅🏿🎅🏽🎅🏼🎅🏻'
    eq_(sequence_weirdness(santas), 0)
コード例 #11
0
def test_emoji_variation_selector():
    """U+FE0F emoji variation selectors must not be counted as weird."""
    # Ten hearts, each explicitly marked as emoji using the variation
    # selector U+FE0F.
    hearts = '❤\ufe0f' * 10
    eq_(sequence_weirdness(hearts), 0)
コード例 #12
0
 def is_string_encoding_corrupted(text):
     """Returns True iff the provided text contains corrupted characters"""
     return sequence_weirdness(
         text.encode('sloppy-windows-1252').decode('gb2312', 'replace')) > 0
コード例 #13
0
def test_emoji_skintone_selector():
    """The heuristic no longer treats skin-tone selectors as weird."""
    # Santa Clauses of all colors: each emoji carries a different
    # skin-tone selector.
    sample = '🎅🏿🎅🏽🎅🏼🎅🏻'
    assert sequence_weirdness(sample) == 0
コード例 #14
0
def test_emoji_variation_selector():
    """Hearts marked as emoji via U+FE0F should not look weird."""
    # The variation selector U+FE0F explicitly requests emoji
    # presentation; repeating the pair ten times is still fine.
    repeated_hearts = '❤\ufe0f' * 10
    assert sequence_weirdness(repeated_hearts) == 0
コード例 #15
0
def main():
    """Build noised/clean train and validation corpora from *.txt files.

    Reads every ``*.txt`` file in the working directory (except previous
    outputs), transliterates and deduplicates the lines, filters out
    corrupted ones, generates noisy variants in parallel with
    ``noise_generator``, and writes an 85/15 train/validation split.
    """
    ignore_files = [
        "train_correct.txt", "train_error.txt", "validation_correct.txt",
        "validation_error.txt"
    ]
    lines = []
    dirs = glob.glob("*.txt")
    for f in ignore_files:
        dirs.remove(f)
    print("Successfully read lines from the following files:")
    for filename in dirs:
        d = os.path.join(os.getcwd(), filename)
        with open(d, "r", encoding='UTF-8') as f:
            if filename != "nmt_combined.txt":
                # Tab-separated input: keep only the first column.
                for r_l in f:
                    split_l = r_l.split("\t")
                    lines.append(unidecode(split_l[0].strip()))
            else:
                for raw_line in f.read().splitlines():
                    lines.append(unidecode(raw_line))

        print(filename)

    lines = list(OrderedDict.fromkeys(lines))  # removes all duplicates

    # Separate corrupted lines (nonzero ftfy weirdness) in a single
    # forward pass instead of the previous O(n^2) reverse-index deletion.
    kept = []
    corrupted_lines = []
    for line in lines:
        if sequence_weirdness(line) != 0:
            corrupted_lines.append(line)
        else:
            kept.append(line)
    lines = kept
    corrupted_lines.reverse()  # keep the historical reverse-scan file order

    with open("corrupted_lines", "w", encoding="utf-8") as f:
        for corrupted in corrupted_lines:
            f.write(corrupted + "\n")
    print("Removed {} corrupted lines".format(len(corrupted_lines)))
    # BUG FIX: this message previously said "corrected_lines", but the
    # file written above is named "corrupted_lines".
    print("Corrupted lines extracted and written to corrupted_lines")
    print("Extracted {} unique clean lines".format(len(lines)))

    num_dups = 3
    lines = [val for val in lines
             for _ in range(num_dups)]  # create duplicates

    print("Creating {} duplicates for every clean line".format(num_dups))
    print("Total clean lines now: {}".format(len(lines)))

    noised_lines = []
    chunk_size = int(max(len(lines) / 10, 1))
    manager = Manager()
    process_dict = manager.dict()
    process_list = []
    for i in range(0, len(lines), chunk_size):
        process_dict[i] = []
        index = min(i + chunk_size, len(lines))
        p = Process(target=noise_generator,
                    args=(lines[i:index], i, process_dict))
        process_list.append(p)

    print("Generating noise - {} processes spawned for noise generation...".
          format(len(process_list)))

    for p in process_list:
        p.start()

    for p in process_list:
        p.join()

    # Collect chunks in ascending start-offset order so noised lines stay
    # aligned with their clean counterparts before being zipped together.
    for start in sorted(process_dict.keys()):
        noised_lines.extend(process_dict[start])

    c = list(zip(noised_lines, lines))
    random.shuffle(c)
    noised_lines, lines = zip(*c)

    train_thres = int(len(lines) * (85 / 100))  # 85/15 train/val split

    with open("validation_error.txt", "w",
              encoding="utf-8") as f, open("validation_correct.txt",
                                           "w",
                                           encoding="utf-8") as f2:
        for l in noised_lines[train_thres:]:
            f.write(l.strip() + "\n")

        for l in lines[train_thres:]:
            f2.write(l.strip() + "\n")

    with open("train_error.txt", "w",
              encoding="utf-8") as f, open("train_correct.txt",
                                           "w",
                                           encoding="utf-8") as f2:
        for l in noised_lines[:train_thres]:
            f.write(l.strip() + "\n")

        for l in lines[:train_thres]:
            f2.write(l.strip() + "\n")

    print(
        "Created the following files: validation_error.txt, validation_correct.txt, train_correct.txt, train_error.txt"
    )