import unicodedata

# WEIRDNESS_RE, MOJIBAKE_SYMBOL_RE, COMMON_SYMBOL_RE, and chars_to_classes are
# assumed to be helpers defined elsewhere in the surrounding module.


def sequence_weirdness(text):
    """
    Determine how often a text has unexpected characters or sequences of
    characters. This metric is used to disambiguate when text should be
    re-decoded or left as is.

    We start by normalizing text in NFC form, so that penalties for
    diacritical marks don't apply to characters that know what to do with
    them.

    The following things are deemed weird:

    - Lowercase letters followed by non-ASCII uppercase letters
    - Non-Latin characters next to Latin characters
    - Un-combined diacritical marks, unless they're stacking on non-alphabetic
      characters (in languages that do that kind of thing a lot) or other
      marks
    - C1 control characters
    - Adjacent symbols from any different pair of these categories:

      - Modifier marks
      - Letter modifiers
      - Non-digit numbers
      - Symbols (including math and currency)

    The return value is the number of instances of weirdness.
    """
    text2 = unicodedata.normalize('NFC', text)
    weirdness = len(WEIRDNESS_RE.findall(chars_to_classes(text2)))
    # Symbols typical of mojibake add to the score; symbols common in
    # ordinary text subtract from it.
    adjustment = len(MOJIBAKE_SYMBOL_RE.findall(text2)) * 2 - len(
        COMMON_SYMBOL_RE.findall(text2)
    )
    return weirdness * 2 + adjustment
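# A hedged, self-contained illustration (the function name below is
# hypothetical, not part of the module): the first "weird" pattern in the
# docstring, a lowercase letter followed by a non-ASCII uppercase letter, is
# exactly what UTF-8 text mis-decoded as Latin-1 tends to produce.
def _demo_lowercase_then_uppercase():
    original = 'café'
    # Encode correctly, then decode the bytes with the wrong codec.
    mojibake = original.encode('utf-8').decode('latin-1')
    print(mojibake)                   # 'cafÃ©'
    print(mojibake[2], mojibake[3])   # 'f' 'Ã': lowercase, then non-ASCII uppercase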
def sequence_weirdness(text):
    """
    Determine how often a text has unexpected characters or sequences of
    characters. This metric is used to disambiguate when text should be
    re-decoded or left as is.

    We start by normalizing text in NFC form, so that penalties for
    diacritical marks don't apply to characters that know what to do with
    them.

    The following things are deemed weird:

    - Lowercase letters followed by non-ASCII uppercase letters
    - Non-Latin characters next to Latin characters
    - Un-combined diacritical marks, unless they're stacking on non-alphabetic
      characters (in languages that do that kind of thing a lot) or other
      marks
    - C1 control characters
    - Adjacent symbols from any different pair of these categories:

      - Modifier marks
      - Letter modifiers
      - Non-digit numbers
      - Symbols (including math and currency)

    The return value is the number of instances of weirdness.
    """
    text2 = unicodedata.normalize('NFC', text)
    weirdness = len(WEIRDNESS_RE.findall(chars_to_classes(text2)))
    # Symbols common in ordinary text earn a discount on the score.
    punct_discount = len(COMMON_SYMBOL_RE.findall(text2))
    return weirdness * 2 - punct_discount
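# Another hedged illustration (hypothetical name, not part of the module): the
# "C1 control characters" bullet covers a further mojibake symptom. Windows-1252
# curly quotes decoded as Latin-1 land in the C1 control range.
def _demo_c1_controls():
    raw = b'\x93quoted\x94'              # Windows-1252 curly quotes as bytes
    as_cp1252 = raw.decode('cp1252')     # '\u201cquoted\u201d' -- real quotation marks
    as_latin1 = raw.decode('latin-1')    # '\x93quoted\x94' -- C1 control characters
    print(unicodedata.category(as_cp1252[0]))   # 'Pi' (initial punctuation)
    print(unicodedata.category(as_latin1[0]))   # 'Cc' (control character)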
def sequence_weirdness(text):
    """
    Determine how often a text has unexpected characters or sequences of
    characters. This metric is used to disambiguate when text should be
    re-decoded or left as is.

    We start by normalizing text in NFC form, so that penalties for
    diacritical marks don't apply to characters that know what to do with
    them.
    """
    text2 = unicodedata.normalize('NFC', text)
    return len(WEIRDNESS_RE.findall(chars_to_classes(text2)))
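# Hedged illustration (hypothetical name, not part of the module): the NFC step
# composes combining marks onto letters that "know what to do with them", so a
# decomposed 'e' + COMBINING ACUTE ACCENT is not penalized as an un-combined
# diacritical mark.
def _demo_nfc_composition():
    decomposed = 'e\u0301'                          # 'e' plus a combining acute accent
    composed = unicodedata.normalize('NFC', decomposed)
    print(len(decomposed), len(composed))           # 2 1
    print(composed == '\u00e9')                     # True: the single character 'é'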