def codepoint_from_input(raw_emoji: tuple[str, ...]) -> str:
    """
    Return the codepoint corresponding to the passed tuple, separated by "-".

    The return format matches the format used in URLs for Twemoji source files.

    The input may be either an already-rendered emoji (detected via
    ``is_emoji`` on the first entry) or one or more textual codepoints
    to be resolved through ``Twemoji``.

    Example usages:
    >>> codepoint_from_input(("🐍",))
    '1f40d'
    >>> codepoint_from_input(("1f1f8", "1f1ea"))
    '1f1f8-1f1ea'
    >>> codepoint_from_input(("👨👧👦",))
    '1f468-200d-1f467-200d-1f466'

    Raises:
        ValueError: if no codepoint could be obtained from the given input.
    """
    # Lowercase once so textual codepoint lookups are case-insensitive.
    normalized = [item.lower() for item in raw_emoji]

    # Case 1: the first entry is already an emoji — map each of its
    # characters (incl. ZWJ members) to its codepoint directly.
    if is_emoji(normalized[0]):
        codepoints = (Twemoji.codepoint(char) or "" for char in normalized[0])
        return "-".join(codepoints)

    # Case 2: entries are textual codepoints — resolve each to an emoji
    # character first, then map the assembled emoji back to codepoints.
    # NOTE: the original bound this result to a local named `emoji`,
    # shadowing the module-level `emoji` package used elsewhere in the
    # file; renamed to `candidate` to remove the shadow.
    candidate = "".join(
        Twemoji.emoji(Twemoji.trim_code(code)) or "" for code in normalized
    )
    if is_emoji(candidate):
        return "-".join(Twemoji.codepoint(char) or "" for char in candidate)

    raise ValueError("No codepoint could be obtained from the given input")
def clean_text(text: str) -> str:
    """Prepare text for syllable counting.

    Drops characters whose unicode-escape form appears in UNICODE_IGNORE,
    ASCII-folds every character that is neither an emoji nor listed in
    UNICODE_KEEP, and collapses whitespace runs (including newlines) into
    single spaces.
    """
    # Defensive: pass None straight through despite the str annotation.
    if text is None:
        return text

    # Pass 1: remove ignored unicode characters from each word.
    stripped_words = []
    for word in fix_text(text).split():
        kept = [
            ch
            for ch in word
            if ch.encode("unicode-escape").decode() not in UNICODE_IGNORE
        ]
        stripped_words.append("".join(kept))

    # Re-split so words emptied by pass 1 disappear, then pass 2:
    # ASCII-fold each character unless it is an emoji or explicitly kept.
    folded_words = []
    for word in " ".join(stripped_words).split():
        chars = []
        for ch in word:
            if emoji.is_emoji(ch) or ch in UNICODE_KEEP:
                chars.append(ch)
            else:
                chars.append(unidecode(ch))
        folded_words.append("".join(chars))

    return " ".join(folded_words)
def clean_text(text: str) -> str:
    """Normalize raw text into clean, whitespace-separated tokens.

    Pipeline (order matters): drop tokens containing an ellipsis, strip
    ignored unicode characters, ASCII-fold non-emoji characters,
    NFKC-normalize, re-tokenize so emojis stand alone, then clean
    punctuation per token.
    """
    # Remove token if it contains an ellipsis; assume it is a truncated word
    text_cleaned = " ".join(
        [
            token
            for token in text.split()
            if not (UNICODE_ELLIPSIS in token.encode("unicode-escape").decode())
        ]
    )
    # Remove some unicode letters
    # (fix_text presumably repairs mojibake before filtering — from ftfy; confirm)
    text_cleaned = " ".join(
        [
            "".join(
                [
                    letter
                    for letter in word
                    if letter.encode("unicode-escape").decode() not in UNICODE_IGNORE
                ]
            )
            for word in fix_text(text_cleaned).split()
        ]
    )
    # Decode unicode letters and keep emojis
    text_cleaned = " ".join(
        [
            "".join(
                [
                    unidecode(letter) if not emoji.is_emoji(letter) else letter
                    for letter in word
                ]
            )
            for word in text_cleaned.split()
        ]
    )
    # Normalize unicode letters
    # NFKD: decomposes, NFKC: composes pre-combined characters again
    text_cleaned = unicodedata.normalize("NFKC", text_cleaned)
    # Ensure emojis are surrounded by whitespace
    tokens = split_text(text_cleaned)
    # Clean up punctuation
    tokens = [clean_token(token) for token in tokens]
    return " ".join(tokens)
def test_is_emoji():
    """emoji.is_emoji accepts emoji (incl. flag sequences), rejects plain chars."""
    cases = [
        ('😁', True),
        ('H', False),
        ('🇫🇷', True),
    ]
    for candidate, expected in cases:
        assert emoji.is_emoji(candidate) is expected
def legacy_demojizer(x: str) -> str:
    """Return *x* with every emoji character removed, all else unchanged."""
    return "".join(ch for ch in x if not emoji.is_emoji(ch))
def __call__(self, form, field) -> None:
    """Validate that the field's value is a single emoji; raise otherwise."""
    field_is_emoji = emoji.is_emoji(field.data)  # type: ignore[attr-defined]
    if not field_is_emoji:
        raise ValidationError(self.message)