Example #1
def normalize(text,
              lowercase=True,
              collapse=True,
              latinize=False,
              ascii=False,
              decompose=False,
              replace_categories=UNICODE_CATEGORIES):
    """The main normalization function for text.

    This will take a string and apply a set of transformations to it so
    that it can be processed more easily afterwards. Arguments:

    * ``lowercase``: convert all characters to lower case.
    * ``collapse``: replace multiple whitespace-like characters with a
      single whitespace. This is especially useful with category replacement,
      which can lead to a lot of whitespace.
    * ``latinize``: transliterate non-latin scripts (e.g. cyrillic or CJK)
      into latin characters.
    * ``ascii``: a stricter form of transliteration that leaves only
      ASCII characters.
    * ``decompose``: apply a unicode normalization (NFKD) to separate
      simple characters and their diacritics.
    * ``replace_categories``: This will perform a replacement of whole
      classes of unicode characters (e.g. symbols, marks, numbers) with a
      given character. It is used to replace any non-text elements of the
      input string.
    """
    if not isinstance(text, six.string_types):
        return

    # TODO: Python 3?
    if six.PY2 and not isinstance(text, six.text_type):
        encoding = guess_encoding(text, 'utf-8')
        text = text.decode(encoding)

    if lowercase:
        # Yeah I made a Python package for this.
        text = text.lower()

    if decompose:
        text = decompose_nfkd(text)

    if ascii:
        # A stricter form of transliteration that leaves only ASCII
        # characters.
        text = ascii_text(text)
    elif latinize:
        # Perform unicode-based transliteration, e.g. of cyrillic
        # or CJK scripts into latin.
        text = latinize_text(text)

    # Perform unicode category-based character replacement. This is
    # used to filter out whole classes of characters, such as symbols,
    # punctuation, or whitespace-like characters.
    text = category_replace(text, replace_categories)

    if collapse:
        # Remove consecutive whitespace.
        text = collapse_spaces(text)

    return text
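A short usage sketch, assuming the snippet above is the public normalize() from the normality package; the outputs in the comments are approximate, not exact:

from normality import normalize  # assumption: the snippet is normality's normalize()

# Defaults: lowercase, replace punctuation/symbols with whitespace, collapse runs.
print(normalize("Qu'est-ce   que c'est?"))          # e.g. "qu est ce que c est"

# Transliterate non-latin scripts while keeping unicode output.
print(normalize("Петро Порошенко", latinize=True))  # e.g. "petro poroshenko"

# Force a pure-ASCII result (stricter than latinize).
print(normalize("Häuser & Gärten", ascii=True))     # e.g. "hauser garten"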
Example #2
def reactions_for_message_content(
    content: str,
    emoji_map: Dict[str, discord.Emoji],
    reaction_options: Dict[str, Dict[str, Any]],
) -> List[discord.Emoji]:
    # Normalize lookalike characters so emoji names can be matched in the
    # plain message text: map an "i"-lookalike modifier letter back to "i"
    # and the six-per-em space (U+2006) back to a regular space.
    searchtext = content.replace("ꜞ", "i").replace("\u2006", " ")
    # Transliterate to ASCII and lowercase so matching is accent- and
    # case-insensitive.
    searchtext = ascii_text(searchtext).lower()
    log.debug("searchtext transformed %r -> %r", content, searchtext)
    reactions_by_index = {}
    for ename in set(emoji_map).intersection(set(reaction_options)):
        start = 0
        overrides = reaction_options.get(ename)
        while True:
            try:
                idx = searchtext.index(ename, start)
            except ValueError:
                break
            start = idx + len(ename)
            override_found = False
            if overrides:
                # Words that suppress the reaction when they appear directly
                # before or after the matched name (separated by a space).
                nobefore = overrides.get("nobefore", [])
                noafter = overrides.get("noafter", [])
                for override in nobefore:
                    first = idx - len(override) - 1
                    if first >= 0:
                        searchbefore = searchtext[first:idx]
                        if searchbefore == override + " ":
                            override_found = True
                            break
                if not override_found:
                    for override in noafter:
                        last = start + len(override) + 1
                        if last <= len(searchtext):
                            searchafter = searchtext[start:last]
                            if searchafter == " " + override:
                                override_found = True
                                break
            # Skip matches that already sit inside a custom emoji token,
            # i.e. between two colons as in ":name:".
            inside_an_emoji = (idx > 0 and searchtext[idx - 1] == ":"
                               and start < len(searchtext)
                               and searchtext[start] == ":")
            if inside_an_emoji:
                override_found = True
            # Require word boundaries: skip matches glued to other
            # alphanumeric characters.
            if idx > 0 and searchtext[idx - 1].isalnum():
                override_found = True
            elif searchtext[start:start + 1].isalnum():
                override_found = True
            if not override_found:
                reactions_by_index[idx] = ename
    # Deduplicate emoji while preserving first-appearance order.
    reactions = []
    for idx, ename in sorted(reactions_by_index.items()):
        emoji = emoji_map[ename]
        if emoji not in reactions:
            reactions.append(emoji)
    return reactions
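A minimal call sketch for the function above. It is illustrative only: plain strings stand in for discord.Emoji objects (the function only stores and compares the mapped values), and it assumes the module-level helpers it uses (ascii_text, log) are available:

# Hypothetical stand-in values; real code would pass discord.Emoji objects.
emoji_map = {"guitar": "<:guitar:111>", "salt": "<:salt:222>"}
reaction_options = {"guitar": {}, "salt": {"nobefore": ["no"]}}

# Both names occur at word boundaries, so both reactions are returned in
# message order; "no salt please" would suppress salt via the nobefore rule.
print(reactions_for_message_content("salt and a guitar solo",
                                     emoji_map, reaction_options))
# expected (with these stand-ins): ['<:salt:222>', '<:guitar:111>']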
Example #3
def normalize(text: Any,
              lowercase: bool = True,
              collapse: bool = True,
              latinize: bool = False,
              ascii: bool = False,
              encoding_default: Encoding = DEFAULT_ENCODING,
              encoding: Optional[str] = None,
              replace_categories: Categories = UNICODE_CATEGORIES):
    """The main normalization function for text.

    This will take a string and apply a set of transformations to it so
    that it can be processed more easily afterwards. Arguments:

    * ``lowercase``: convert all characters to lower case.
    * ``collapse``: replace multiple whitespace-like characters with a
      single whitespace. This is especially useful with category replacement,
      which can lead to a lot of whitespace.
    * ``latinize``: transliterate non-latin scripts (e.g. cyrillic or CJK)
      into latin characters.
    * ``ascii``: a stricter form of transliteration that leaves only
      ASCII characters.
    * ``encoding``/``encoding_default``: passed to ``stringify`` to decode
      byte input before normalization.
    * ``replace_categories``: This will perform a replacement of whole
      classes of unicode characters (e.g. symbols, marks, numbers) with a
      given character. It is used to replace any non-text elements of the
      input string.
    """
    text = stringify(text,
                     encoding_default=encoding_default,
                     encoding=encoding)
    if text is None:
        return

    if lowercase:
        # Yeah I made a Python package for this.
        text = text.lower()

    if ascii:
        # A stricter form of transliteration that leaves only ASCII
        # characters.
        text = ascii_text(text)
    elif latinize:
        # Perform unicode-based transliteration, e.g. of cyrillic
        # or CJK scripts into latin.
        text = latinize_text(text)

    if text is None:
        return

    # Perform unicode category-based character replacement. This is
    # used to filter out whole classes of characters, such as symbols,
    # punctuation, or whitespace-like characters.
    text = category_replace(text, replace_categories)

    if collapse:
        # Remove consecutive whitespace.
        text = collapse_spaces(text)
    return text
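Compared to Example #1, byte decoding is delegated to stringify(), so callers can pass raw bytes plus an encoding. A hedged sketch of what that enables, again assuming this is normality's normalize; the output comment is only indicative:

from normality import normalize  # assumption, as in Example #1

raw = "Čeština — slovník".encode("cp1250")
# stringify() decodes the bytes with the given encoding before the lowercase,
# transliteration and category-replacement steps run.
print(normalize(raw, encoding="cp1250", latinize=True))  # e.g. "cestina slovnik"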
Example #4
def _safe_name(file_name: Optional[str], sep: str) -> Optional[str]:
    """Convert the file name to ASCII and normalize the string."""
    file_name = stringify(file_name)
    if file_name is None:
        return None
    file_name = ascii_text(file_name)
    file_name = category_replace(file_name, UNICODE_CATEGORIES)
    file_name = collapse_spaces(file_name)
    if file_name is None or not len(file_name):
        return None
    return file_name.replace(WS, sep)
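A hypothetical call, assuming WS is a single space and the helpers come from normality; the exact result depends on the category table, so the comment is only indicative:

print(_safe_name("Häufige Fragen – Teil 2", sep="_"))
# e.g. "Haufige_Fragen_Teil_2": transliterated to ASCII, punctuation replaced,
# whitespace collapsed and then swapped for the separator.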