import json

import mmh3

from ftfy.fixes import fix_line_breaks, fix_surrogates, unescape_html

# URL_RE, BANNED_SUBREDDITS, strip_markdown, and detect_language are
# module-level helpers assumed to be defined elsewhere in this module.


def preprocess_reddit(infile, outfile):
    """
    Read Reddit text from a JSON-lines file, parse the Markdown, and tag
    what language each post is in. Filter the posts to enforce _some_
    standard of quality:

    - Posts in English should have score >= 2 (they should have net upvotes)
    - Other posts should have score >= 1 (no net downvotes)
    - Posts from subreddits that were banned in 2018 are skipped
    """
    for line in infile:
        data = json.loads(line)
        if ('score' in data and 'body' in data
                and data["score"] is not None and data["score"] >= 1
                and data["body"] != "[deleted]"):
            subreddit = data["subreddit"]
            subreddit_hash = mmh3.hash(subreddit)
            if subreddit_hash not in BANNED_SUBREDDITS:
                md = fix_surrogates(
                    unescape_html(fix_line_breaks(data["body"])))
                text = strip_markdown(md)
                text = text.replace("\n", " ").replace("\u200b", "")
                text = URL_RE.sub("", text)
                if text:
                    lang, confident = detect_language(text)
                    if confident:
                        # There are more English posts than we need, so
                        # filter them for score >= 2
                        if lang != "en" or data["score"] > 1:
                            print(f"{lang}\t{text}", file=outfile)
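A minimal sketch of driving the function above, assuming a newline-delimited JSON dump of Reddit comments; the file names here are hypothetical.

# Hypothetical file names; any JSON-lines dump of Reddit comments works.
with open("RC_2018-01.jsonl", encoding="utf-8") as infile, \
        open("reddit_tagged.tsv", "w", encoding="utf-8") as outfile:
    preprocess_reddit(infile, outfile)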
import unicodedata

from ftfy import fixes


def fix_text_segment(
    text,
    *,
    fix_entities='auto',
    remove_terminal_escapes=True,
    fix_encoding=True,
    fix_latin_ligatures=True,
    fix_character_width=True,
    uncurl_quotes=True,
    fix_line_breaks=True,
    fix_surrogates=True,
    remove_control_chars=True,
    remove_bom=True,
    normalization='NFC',
):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount of
    text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
            # because it would be redundant.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text
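The `while True` loop runs every enabled fixer repeatedly until a full pass leaves the text unchanged, so a fix that exposes a further problem (say, decoded mojibake that turns out to contain HTML entities) still gets cleaned up on the next pass. A small sketch of calling it directly, using a mojibake string of the kind ftfy's documentation demonstrates:

# The UTF-8-as-Latin-1 curly quotes are repaired and then uncurled.
print(fix_text_segment('This text should be in â€œquotesâ€\x9d.'))
# -> This text should be in "quotes".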
def fix_text_segment(text, fix_entities='auto', remove_terminal_escapes=True,
                     fix_encoding=True, fix_latin_ligatures=True,
                     fix_character_width=True, uncurl_quotes=True,
                     fix_line_breaks=True, fix_surrogates=True,
                     remove_control_chars=True, remove_bom=True,
                     normalization='NFC'):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount of
    text that you are certain is in a consistent encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_encoding(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if fix_latin_ligatures:
            text = fixes.fix_latin_ligatures(text)
        if fix_character_width:
            text = fixes.fix_character_width(text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom and not remove_control_chars:
            # Skip this step if we've already done `remove_control_chars`,
            # because it would be redundant.
            text = fixes.remove_bom(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if text == origtext:
            return text
def fix_text_segment(text,
                     remove_unsafe_private_use=False,
                     fix_entities='auto',
                     remove_terminal_escapes=True,
                     fix_encoding=True,
                     normalization='NFKC',
                     uncurl_quotes=True,
                     fix_line_breaks=True,
                     fix_surrogates=True,
                     remove_control_chars=True,
                     remove_bom=True):
    """
    Apply fixes to text in a single chunk. This could be a line of text
    within a larger run of `fix_text`, or it could be a larger amount of
    text that you are certain is all in the same encoding.

    See `fix_text` for a description of the parameters.
    """
    if isinstance(text, bytes):
        raise UnicodeError(fixes.BYTES_ERROR_TEXT)

    if fix_entities == 'auto' and '<' in text and '>' in text:
        fix_entities = False
    while True:
        origtext = text
        if remove_unsafe_private_use:
            text = fixes.remove_unsafe_private_use(text)
        if fix_entities:
            text = fixes.unescape_html(text)
        if remove_terminal_escapes:
            text = fixes.remove_terminal_escapes(text)
        if fix_encoding:
            text = fixes.fix_text_encoding(text)
        if normalization is not None:
            text = unicodedata.normalize(normalization, text)
        if uncurl_quotes:
            text = fixes.uncurl_quotes(text)
        if fix_line_breaks:
            text = fixes.fix_line_breaks(text)
        if fix_surrogates:
            text = fixes.fix_surrogates(text)
        if remove_control_chars:
            text = fixes.remove_control_chars(text)
        if remove_bom:
            text = fixes.remove_bom(text)
        if text == origtext:
            return text
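Note that this variant defaults to normalization='NFKC', while the versions above default to 'NFC'. NFKC is more aggressive: it folds compatibility characters into their plain equivalents, which loses distinctions that NFC preserves. A quick illustration using only the standard library:

import unicodedata

# NFKC folds compatibility characters; NFC leaves them alone.
print(unicodedata.normalize('NFKC', 'ﬁ ①'))  # -> 'fi 1'
print(unicodedata.normalize('NFC', 'ﬁ ①'))   # -> 'ﬁ ①' (unchanged)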
def preprocess_reddit_lines(input_lines):
    """
    Like `preprocess_reddit`, but as a generator: take an iterable of JSON
    lines and yield (language, text) pairs instead of writing to a file.
    This version also skips "[removed]" posts, casefolds the subreddit
    name before hashing, and raises each score threshold by 1.
    """
    for line in input_lines:
        data = json.loads(line)
        if ('score' in data and 'body' in data
                and data["score"] is not None and data["score"] >= 2
                and data["body"] != "[deleted]"
                and data["body"] != "[removed]"):
            subreddit = data["subreddit"].casefold()
            subreddit_hash = mmh3.hash(subreddit)
            if subreddit_hash not in BANNED_SUBREDDITS:
                md = fix_surrogates(
                    unescape_html(fix_line_breaks(data["body"])))
                text = strip_markdown(md)
                text = text.replace("\n", " ").replace("\u200b", "")
                text = URL_RE.sub("", text)
                if text:
                    lang, _confidence = detect_language_checked(text)
                    if lang != 'und':
                        # There are more English posts than we need, so
                        # filter them for score >= 3
                        if lang != "en" or data["score"] > 2:
                            yield (lang, text)
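A sketch of driving the generator version, again with hypothetical file names; the caller decides what to do with the (language, text) pairs:

with open("RC_2018-01.jsonl", encoding="utf-8") as infile, \
        open("reddit_tagged.tsv", "w", encoding="utf-8") as outfile:
    for lang, text in preprocess_reddit_lines(infile):
        print(f"{lang}\t{text}", file=outfile)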
from ftfy import explain_unicode, fix_text

print(fix_text('ünicode'))             # -> ünicode
print(fix_text('&lt;3'))                   # -> <3
print(fix_text("¯\\_(ã\x83\x84)_/¯"))      # -> ¯\_(ツ)_/¯
len(fix_text(''))                          # -> 0
explain_unicode('ノ( º _ ºノ) 테스트')     # prints one line per codepoint:
                                           # number, glyph, category, name

from ftfy.fixes import (fix_encoding, unescape_html, uncurl_quotes,
                        fix_line_breaks, decode_escapes)

print(fix_encoding('â\x81”.'))             # -> ⁔.
print(unescape_html('&lt;hr&gt;'))         # -> <hr>
print(uncurl_quotes('\u201ctest\u201d'))   # -> "test"
print(fix_line_breaks("1. hello\u2028"     # \u2028 (LINE SEPARATOR)
                      "2. world"))         # becomes a plain \n
factoid = '\\u20a2'
print(decode_escapes(factoid))             # -> ₢

from ftfy.formatting import character_width, display_center

print(character_width('A'))                # -> 1
print(character_width('가'))               # -> 2
lines = ['Display center', 'center']
for line in lines:
    # Pads both sides to 20 display columns, not 20 characters.
    print(display_center(line, 20, '▒'))
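ftfy.formatting also exposes monospaced_width, which sums character_width over a whole string; a quick sketch with arbitrary sample strings:

from ftfy.formatting import monospaced_width

# Wide CJK/Hangul characters count as 2 columns each, so the 3-character
# Korean string '테스트' occupies 6 terminal columns.
print(monospaced_width('테스트'))   # -> 6
print(monospaced_width('test'))     # -> 4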