Beispiel #1
0
    def codepoint_from_input(raw_emoji: tuple[str, ...]) -> str:
        """
        Returns the codepoint corresponding to the passed tuple, separated by "-".

        The return format matches the format used in URLs for Twemoji source files.

        Two input shapes are tried, in this order:

        1. The first element is itself an emoji: only that element is used and
           every character in it is converted to a codepoint.
        2. Otherwise every element is treated as a codepoint string; the
           characters they resolve to are concatenated and the result must be
           an emoji.

        All elements are lower-cased before being examined.

        Example usages:
        >>> codepoint_from_input(("🐍",))
        '1f40d'
        >>> codepoint_from_input(("1f1f8", "1f1ea"))
        '1f1f8-1f1ea'
        >>> codepoint_from_input(("👨‍👧‍👦",))
        '1f468-200d-1f467-200d-1f466'

        Raises:
            ValueError: If the input is empty, or no codepoint could be
                obtained from it.
        """
        if not raw_emoji:
            # Indexing an empty tuple below would surface as an IndexError;
            # report it the same way as any other unusable input instead.
            raise ValueError("No codepoint could be obtained from the given input")

        # Work on a separate list so the tuple-typed parameter is not rebound.
        parts = [part.lower() for part in raw_emoji]

        first = parts[0]
        if is_emoji(first):
            # Characters without a codepoint contribute an empty segment.
            return "-".join(Twemoji.codepoint(char) or "" for char in first)

        # NOTE(review): trim_code/emoji are assumed to normalise a code string
        # and map it to its character (falsy when unknown) - confirm in Twemoji.
        combined = "".join(
            Twemoji.emoji(Twemoji.trim_code(code)) or "" for code in parts
        )
        if is_emoji(combined):
            return "-".join(Twemoji.codepoint(char) or "" for char in combined)

        raise ValueError("No codepoint could be obtained from the given input")
Beispiel #2
0
def clean_text(text: str) -> str:
    """Prepare text for syllable counting.

    Characters that are hard to count syllables for are removed or
    transliterated, while emojis are kept. Splitting on whitespace and
    rejoining with single spaces also collapses repeated spaces and newlines.

    ``None`` is passed through unchanged.
    """
    if text is None:
        return text

    # Pass 1: drop every character whose unicode-escape form is ignored.
    kept_words = []
    for word in fix_text(text).split():
        kept_chars = [
            char for char in word
            if char.encode("unicode-escape").decode() not in UNICODE_IGNORE
        ]
        kept_words.append("".join(kept_chars))
    stripped = " ".join(kept_words)

    # Pass 2: transliterate everything except emojis and explicitly kept
    # characters. Re-splitting discards words emptied by pass 1.
    decoded_words = []
    for word in stripped.split():
        pieces = []
        for char in word:
            if emoji.is_emoji(char) or char in UNICODE_KEEP:
                pieces.append(char)
            else:
                pieces.append(unidecode(char))
        decoded_words.append("".join(pieces))

    return " ".join(decoded_words)
def clean_text(text: str) -> str:
    """Clean *text* token by token and return the tokens joined by spaces.

    Stages, in order: drop tokens containing an ellipsis, remove ignored
    unicode characters, transliterate non-emoji characters, apply NFKC
    normalisation, split into tokens, and clean each token.
    """
    # A token containing an ellipsis is assumed to be a truncated word.
    complete_tokens = [
        token
        for token in text.split()
        if UNICODE_ELLIPSIS not in token.encode("unicode-escape").decode()
    ]
    without_truncated = " ".join(complete_tokens)

    # Remove some unicode letters.
    filtered_words = []
    for word in fix_text(without_truncated).split():
        filtered_words.append(
            "".join(
                char
                for char in word
                if char.encode("unicode-escape").decode() not in UNICODE_IGNORE
            )
        )
    filtered = " ".join(filtered_words)

    # Decode unicode letters and keep emojis.
    decoded_words = []
    for word in filtered.split():
        decoded_words.append(
            "".join(
                char if emoji.is_emoji(char) else unidecode(char)
                for char in word
            )
        )
    decoded = " ".join(decoded_words)

    # Normalize unicode letters.
    # NFKD: decomposes, NFKC: composes pre-combined characters again
    normalized = unicodedata.normalize("NFKC", decoded)

    # split_text ensures emojis are surrounded by whitespace; clean_token
    # then tidies up the punctuation of each resulting token.
    return " ".join(clean_token(token) for token in split_text(normalized))
Beispiel #4
0
def test_is_emoji():
    """Check ``emoji.is_emoji`` on a single emoji, a plain letter and a flag."""
    expectations = (
        ('😁', True),
        ('H', False),
        ('🇫🇷', True),
    )
    for candidate, expected in expectations:
        assert bool(emoji.is_emoji(candidate)) is expected
Beispiel #5
0
def legacy_demojizer(x: str) -> str:
    """Return *x* without the characters ``emoji.is_emoji`` reports as emoji.

    The check is made one character at a time.
    """
    kept_chars = [char for char in x if not emoji.is_emoji(char)]
    return "".join(kept_chars)
Beispiel #6
0
 def __call__(self, form, field) -> None:
     """Validate that ``field.data`` is an emoji.

     Raises ``ValidationError`` carrying ``self.message`` when it is not.
     ``form`` is accepted but not used here.
     """
     value = field.data
     if emoji.is_emoji(value):  # type: ignore[attr-defined]
         return
     raise ValidationError(self.message)