def normalize(cls, word: Word) -> str:
    """Build the normalized string for ``word``.

    The starting point is ``word.source``, stripped and upper-cased. The
    steps below are then applied in order:

    1. ``cls._replace_yer_before_linebreak`` runs only when the tagset note
       is set and contains neither ``"+ъ"`` nor ``"+ь"`` (a missing note
       skips this step).
    2. Every match of ``Milestone.REGEX`` is deleted.
    3. Cardinal numbers return the tagset's ``pos`` as-is; ordinal numbers
       return ``str(Number(...))`` of the text without parentheses/spaces.
    4. Otherwise the text goes through ``replace_chars``, the per-position
       ``cls._replace_izhitsa`` substitution for each ``"V"``, and
       ``modif``.
    5. ``"#"``, ``"("`` and ``")"`` are stripped from the result.

    Args:
        word: The word to normalize; its ``source``, ``tagset`` and the
            numeral predicates are consulted.

    Returns:
        The normalized string (or the tagset ``pos`` for cardinal numbers).
    """
    text = word.source.strip().upper()

    # Remove yer before linebreak unless tagged otherwise
    note = word.tagset.note
    if note is not None and "+ъ" not in note and "+ь" not in note:
        text = cls._replace_yer_before_linebreak(text)

    # Remove milestones
    text = re.sub(Milestone.REGEX, "", text)

    # Numerals leave early and skip the orthographic processing below.
    if word.is_cardinal_number():
        return word.tagset.pos  # Non-spelled out numerals
    if word.is_ordinal_number():
        numeral = text
        for unwanted in ("(", ")", " "):
            numeral = numeral.replace(unwanted, "")
        return str(Number(numeral))  # Spelled-out numerals

    text = replace_chars(
        text,
        characters.latin_special_characters,
        characters.cyrillic_special_characters,
    )

    # Izhitsa positional replacement. The positions are collected once, up
    # front, and each substitution replaces exactly the character at that
    # index in the current text.
    izhitsa_positions = [
        position for position, letter in enumerate(text) if letter == "V"
    ]
    for position in izhitsa_positions:
        head = text[:position]
        substitute = cls._replace_izhitsa(text, position)
        tail = text[position + 1:]
        text = head + substitute + tail

    # Orthography normalization
    if word.tagset is not None:
        pos_tag = word.tagset.pos
    else:
        pos_tag = ""
    text = modif(text, pos_tag)

    # Drop the remaining markup characters.
    for marker in ("#", "(", ")"):
        text = text.replace(marker, "")
    return text
def __get_upos(word: Word) -> str:
    """Return the UPOS label that corresponds to ``word``'s part of speech.

    Nouns map to ``"PROPN"`` or ``"NOUN"`` depending on ``word.is_proper``;
    verbal forms map to ``"AUX"`` when the tagset role is ``"св"`` and to
    ``"VERB"`` otherwise. A word is ``"NUM"`` if its tag is ``"числ"`` or
    ``word.is_cardinal_number()`` is true; this check runs after the noun
    and adjective checks and before all others. Unrecognised tags yield
    ``"X"``.

    Args:
        word: The word whose ``pos`` (and, where needed, ``is_proper``,
            ``is_cardinal_number()`` and ``tagset.role``) is inspected.

    Returns:
        One of the UPOS strings listed above.
    """
    pos = word.pos

    adjectival = ("прил", "прил/ср", "прил/н", "числ/п")
    verbal = ("гл", "гл/в", "прич", "прич/в", "инф", "инф/в", "суп")
    # Tags with a one-to-one UPOS counterpart.
    direct = {
        "мест": "PRON",  # TODO Distinguish DET
        "нар": "ADV",
        "пред": "ADP",
        "посл": "ADP",
        "союз": "CCONJ",  # TODO Distinguish SCONJ
        "част": "PART",
        "межд": "INTJ",
    }

    if pos == "сущ":
        if word.is_proper:
            return "PROPN"
        return "NOUN"
    if pos in adjectival:
        return "ADJ"
    # Must precede the remaining lookups: a cardinal number wins over any
    # tag other than the noun/adjective ones handled above.
    if pos == "числ" or word.is_cardinal_number():
        return "NUM"
    if pos in verbal:
        if word.tagset.role == "св":
            return "AUX"
        return "VERB"
    return direct.get(pos, "X")