Exemple #1
0
    def normalize(cls, word: Word) -> str:
        """Return the normalized form of a word's source text.

        The source is stripped and upper-cased, then run through these steps
        in order: yer removal before a line break (skipped when the tagset
        note contains "+ъ" or "+ь"), milestone removal, numeral handling,
        Latin-to-Cyrillic special character replacement, positional izhitsa
        replacement and orthography normalization via ``modif``.

        Args:
            word: The word to normalize. Its ``source`` and ``tagset`` are
                read; ``tagset`` may be ``None``.

        Returns:
            For cardinal numbers, the tagset's ``pos`` value. For ordinal
            numbers, ``str(Number(...))`` built from the source with
            parentheses and spaces removed. Otherwise the normalized string
            with "#", "(" and ")" removed.
        """
        res = word.source.strip().upper()

        # The orthography step below already allows for a missing tagset, so
        # the note lookup must tolerate it too instead of raising here.
        tagset = word.tagset
        note = tagset.note if tagset is not None else None

        # Remove yer before linebreak unless tagged otherwise
        # NOTE(review): a word without any note also keeps its yer here;
        # confirm that this is intended.
        if note is not None and "+ъ" not in note and "+ь" not in note:
            res = cls._replace_yer_before_linebreak(res)

        # Remove milestones
        res = re.sub(Milestone.REGEX, "", res)

        if word.is_cardinal_number():
            return word.tagset.pos  # Non-spelled out numerals
        if word.is_ordinal_number():
            # Spelled-out numerals: drop parentheses and spaces first
            return str(Number(res.translate(str.maketrans("", "", "() "))))

        res = replace_chars(
            res,
            characters.latin_special_characters,
            characters.cyrillic_special_characters,
        )

        # Izhitsa positional replacement. Positions are collected up front
        # because ``res`` is rebuilt on every pass; this assumes each
        # replacement is exactly one character long - TODO confirm.
        izhitsa_positions = [i for i, char in enumerate(res) if char == "V"]
        for i in izhitsa_positions:
            res = res[:i] + cls._replace_izhitsa(res, i) + res[i + 1:]

        # Orthography normalization
        res = modif(res, tagset.pos if tagset is not None else "")

        # Strip the remaining markup characters in a single pass
        return res.translate(str.maketrans("", "", "#()"))
Exemple #2
0
 def __get_upos(word: Word) -> str:
     """Map the word's part-of-speech tag to an upper-case POS label.

     Args:
         word: The word to classify. ``pos``, ``is_proper``,
             ``is_cardinal_number()`` and ``tagset.role`` are consulted as
             needed.

     Returns:
         One of "PROPN", "NOUN", "ADJ", "NUM", "PRON", "AUX", "VERB", "ADV",
         "ADP", "CCONJ", "PART" or "INTJ"; "X" when no rule matches.
     """
     pos = word.pos

     # Tags whose label depends on more than the tag itself
     if pos == "сущ":
         if word.is_proper:
             return "PROPN"
         return "NOUN"
     if pos in ("прил", "прил/ср", "прил/н", "числ/п"):
         return "ADJ"
     if pos == "числ" or word.is_cardinal_number():
         return "NUM"
     if pos in ("гл", "гл/в", "прич", "прич/в", "инф", "инф/в", "суп"):
         if word.tagset.role == "св":
             return "AUX"
         return "VERB"

     # Remaining tags map one-to-one onto a label
     simple_labels = {
         "мест": "PRON",  # TODO Distinguish DET
         "нар": "ADV",
         "пред": "ADP",
         "посл": "ADP",
         "союз": "CCONJ",  # TODO Distinguish SCONJ
         "част": "PART",
         "межд": "INTJ",
     }
     return simple_labels.get(pos, "X")