def test_unicode_9():
    # This string is 'bɪg'.upper() in Python 3.6 or later, containing the
    # new codepoint U+A7AE LATIN CAPITAL LETTER SMALL CAPITAL I.
    assert sequence_weirdness("BꞮG") == 0

    # That should be less weird than having a definitely-unassigned character
    # in the string.
    assert sequence_weirdness("B\U00090000G") == 2
def test_unicode_9():
    # This string is 'bɪg'.upper() in Python 3.6 or later, containing the
    # new codepoint U+A7AE LATIN CAPITAL LETTER SMALL CAPITAL I.
    eq_(sequence_weirdness("BꞮG"), 0)

    # That should be less weird than having a definitely-unassigned character
    # in the string.
    eq_(sequence_weirdness("B\U00090000G"), 2)
def test_unicode_10():
    # This string is the word "thalīṃ" in the Zanabazar Square Script,
    # a script added in Unicode 10. These characters are recognized as being
    # assigned by Python 3.7, and therefore ftfy should recognize them on
    # all versions for consistency.
    thalim = "\U00011A1A\U00011A2C\U00011A01\U00011A38"
    assert sequence_weirdness(thalim) == 0
def is_mojibake(field):
    """Determines whether a string contains mojibake.

    We commonly deal with CSV files that were *encoded* in UTF-8, but decoded
    as something else like CP-1252 (Windows Latin). This manifests in the
    form of "mojibake", for example:

    - CIAT Publicaçao
    - CIAT Publicación

    This uses the excellent "fixes text for you" (ftfy) library to determine
    whether a string contains characters that have been encoded in one
    encoding and decoded in another.

    Inspired by this code snippet from Martijn Pieters on StackOverflow:
    https://stackoverflow.com/questions/29071995/identify-garbage-unicode-string-using-python

    Return boolean.
    """
    if not sequence_weirdness(field):
        # Nothing weird, should be okay
        return False

    try:
        field.encode("sloppy-windows-1252")
    except UnicodeEncodeError:
        # Not CP-1252 encodable, probably fine
        return False
    else:
        # Encodable as CP-1252, Mojibake alert level high
        return True
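# --- Usage sketch (not part of the original source) ---
# A minimal, hedged example of calling is_mojibake(). It assumes ftfy 5.x,
# where sequence_weirdness lives in ftfy.badness and importing ftfy.bad_codecs
# registers the 'sloppy-windows-1252' codec. The sample strings are invented
# for illustration, and the expected outputs depend on ftfy's heuristic.
import ftfy.bad_codecs  # noqa: F401 -- importing registers the 'sloppy-*' codecs
from ftfy.badness import sequence_weirdness

if __name__ == "__main__":
    # UTF-8 bytes mis-decoded as CP-1252 show the telltale "Ã" pattern,
    # which the heuristic should flag.
    print(is_mojibake("CIAT PublicaciÃ³n"))   # expected (under ftfy 5.x): True
    # Correctly decoded text should not be flagged.
    print(is_mojibake("CIAT Publicación"))    # expected: False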
def test_bmp_characters():
    for index in range(0xa0, 0xfffd):
        char = unichr(index)
        # Exclude code points that are not assigned
        if unicodedata.category(char) not in ('Co', 'Cn', 'Cs', 'Mc', 'Mn', 'Sk'):
            garble = char.encode('utf-8').decode('latin-1')
            # Exclude characters whose re-encoding is protected by the
            # 'sequence_weirdness' metric
            if sequence_weirdness(garble) >= 0:
                garble2 = char.encode('utf-8').decode('latin-1').encode('utf-8').decode('latin-1')
                for garb in (garble, garble2):
                    fixed, plan = fix_encoding_and_explain(garb)
                    eq_(fixed, char)
                    eq_(apply_plan(garb, plan), char)
def test_bmp_characters():
    for index in range(0xa0, 0xfffd):
        char = unichr(index)
        # Exclude code points that are not assigned
        if unicodedata.category(char) not in ('Co', 'Cn', 'Cs', 'Mc', 'Mn'):
            garble = char.encode('utf-8').decode('latin-1')
            # Exclude characters whose re-encoding is protected by the
            # 'sequence_weirdness' metric
            if sequence_weirdness(garble) >= 0:
                garble2 = char.encode('utf-8').decode('latin-1').encode('utf-8').decode('latin-1')
                for garb in (garble, garble2):
                    fixed, plan = fix_encoding_and_explain(garb)
                    eq_(fixed, char)
                    eq_(apply_plan(garb, plan), char)
def test_unicode_11():
    # Unicode 11 has implemented the mtavruli form of the Georgian script.
    # They are analogous to capital letters in that they can be used to
    # emphasize text or write a headline.
    #
    # Python will convert to that form when running .upper() on Georgian text,
    # starting in version 3.7.0. We want to recognize the result as reasonable
    # text on all versions.
    #
    # This text is the mtavruli form of "ქართული ენა", meaning "Georgian
    # language".
    georgian_mtavruli_text = 'ᲥᲐᲠᲗᲣᲚᲘ ᲔᲜᲐ'
    assert sequence_weirdness(georgian_mtavruli_text) == 0

    mojibake = georgian_mtavruli_text.encode('utf-8').decode('sloppy-windows-1252')
    assert fix_encoding(mojibake) == georgian_mtavruli_text
def test_emoji_skintone_selector():
    # Dear heuristic, you can't call skin-tone selectors weird anymore.
    # We welcome Santa Clauses of all colors.
    eq_(sequence_weirdness('🎅🏿🎅🏽🎅🏼🎅🏻'), 0)
def test_emoji_variation_selector():
    # The hearts here are explicitly marked as emoji using the variation
    # selector U+FE0F. This is not weird.
    eq_(sequence_weirdness('❤\ufe0f' * 10), 0)
def is_string_encoding_corrupted(text):
    """Returns True iff the provided text contains corrupted characters"""
    return sequence_weirdness(
        text.encode('sloppy-windows-1252').decode('gb2312', 'replace')) > 0
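# --- Usage sketch (not part of the original source) ---
# A small, hedged example of calling is_string_encoding_corrupted(). It
# assumes ftfy 5.x for sequence_weirdness and for the 'sloppy-windows-1252'
# codec. Note the check only accepts text that is itself encodable as
# CP-1252; other input raises UnicodeEncodeError from .encode().
import ftfy.bad_codecs  # noqa: F401 -- importing registers the 'sloppy-*' codecs
from ftfy.badness import sequence_weirdness

if __name__ == "__main__":
    for sample in ("plain ASCII text", "café au lait"):
        print(sample, "->", is_string_encoding_corrupted(sample))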
def test_emoji_skintone_selector():
    # Dear heuristic, you can't call skin-tone selectors weird anymore.
    # We welcome Santa Clauses of all colors.
    assert sequence_weirdness('🎅🏿🎅🏽🎅🏼🎅🏻') == 0
def test_emoji_variation_selector():
    # The hearts here are explicitly marked as emoji using the variation
    # selector U+FE0F. This is not weird.
    assert sequence_weirdness('❤\ufe0f' * 10) == 0
def main():
    ignore_files = [
        "train_correct.txt", "train_error.txt", "validation_correct.txt",
        "validation_error.txt"
    ]
    lines = []
    dirs = glob.glob("*.txt")
    for f in ignore_files:
        dirs.remove(f)

    print("Successfully read lines from the following files:")
    for filename in dirs:
        d = os.path.join(os.getcwd(), filename)
        with open(d, "r", encoding='UTF-8') as f:
            if filename != "nmt_combined.txt":
                for r_l in f:
                    split_l = r_l.split("\t")
                    line = unidecode(split_l[0].strip())
                    lines.append(line)
            else:
                spl_lines = f.read().splitlines()
                for l in spl_lines:
                    lines.append(unidecode(l))
        print(filename)

    lines = list(OrderedDict.fromkeys(lines))  # removes all duplicates

    corrupted_lines = []
    for i in range(len(lines) - 1, -1, -1):  # remove all corrupted lines
        if sequence_weirdness(lines[i]) != 0:
            corrupted_lines.append(lines[i])
            del lines[i]

    with open("corrupted_lines", "w") as f:
        for i in corrupted_lines:
            f.write(i + "\n")

    print("Removed {} corrupted lines".format(len(corrupted_lines)))
    print("Corrupted lines extracted and written to corrupted_lines")
    print("Extracted {} unique clean lines".format(len(lines)))

    num_dups = 3
    lines = [val for val in lines for _ in range(num_dups)]  # create duplicates
    print("Creating {} duplicates for every clean line".format(num_dups))
    print("Total clean lines now: {}".format(len(lines)))

    noised_lines = []
    chunk_size = int(max(len(lines) / 10, 1))
    manager = Manager()
    process_dict = manager.dict()
    process_list = []
    for i in range(0, len(lines), chunk_size):
        process_dict[i] = []
        index = min(i + chunk_size, len(lines))
        p = Process(target=noise_generator, args=(lines[i:index], i, process_dict))
        process_list.append(p)

    print("Generating noise - {} processes spawned for noise generation...".format(
        len(process_list)))
    for p in process_list:
        p.start()
    for p in process_list:
        p.join()

    for k, v in process_dict.items():
        noised_lines.extend(v)

    c = list(zip(noised_lines, lines))
    random.shuffle(c)
    noised_lines, lines = zip(*c)

    train_thres = int(len(lines) * (85 / 100))
    with open("validation_error.txt", "w", encoding="utf-8") as f, \
            open("validation_correct.txt", "w", encoding="utf-8") as f2:
        for l in noised_lines[train_thres:]:
            f.write(l.strip() + "\n")
        for l in lines[train_thres:]:
            f2.write(l.strip() + "\n")

    with open("train_error.txt", "w", encoding="utf-8") as f, \
            open("train_correct.txt", "w", encoding="utf-8") as f2:
        for l in noised_lines[:train_thres]:
            f.write(l.strip() + "\n")
        for l in lines[:train_thres]:
            f2.write(l.strip() + "\n")

    print("Created the following files: validation_error.txt, "
          "validation_correct.txt, train_correct.txt, train_error.txt")
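# --- Interface sketch (not part of the original source) ---
# The noise_generator() worker that main() spawns is not shown here. This is a
# hypothetical minimal stub, written only to document the interface main()
# relies on: a chunk of lines, the chunk's starting offset, and the Manager
# dict, with one noised line stored per input line under that offset. The
# corruption logic below is a placeholder, not the original implementation.
import random

def noise_generator(chunk, start_index, shared_dict):
    noised = []
    for line in chunk:
        if len(line) > 1:
            # Placeholder corruption: drop one randomly chosen character.
            pos = random.randrange(len(line))
            noised.append(line[:pos] + line[pos + 1:])
        else:
            noised.append(line)
    # main() collects results by iterating over the shared dict's values.
    shared_dict[start_index] = noised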