def process(self, word : str) -> typing.List['MorphWordForm']: """ Обработка одного слова Args: word(str): слово должно быть в верхнем регистре """ if (Utils.isNullOrEmpty(word)): return None res = None if (len(word) > 1): i = 0 while i < len(word): ch = word[i] if (LanguageHelper.isCyrillicVowel(ch) or LanguageHelper.isLatinVowel(ch)): break i += 1 if (i >= len(word)): return res mvs = [ ] tn = self.m_root i = 0 while i <= len(word): if (tn.lazy_pos > 0): self.__loadTreeNode(tn) if (tn.rules is not None): word_begin = None word_end = None if (i == 0): word_end = word elif (i < len(word)): word_end = word[i:] else: word_end = "" if (res is None): res = list() for r in tn.rules: wrapmvs14 = RefOutArgWrapper(None) inoutres15 = Utils.tryGetValue(r.variants, word_end, wrapmvs14) mvs = wrapmvs14.value if (inoutres15): if (word_begin is None): if (i == len(word)): word_begin = word elif (i > 0): word_begin = word[0:0+i] else: word_begin = "" r.processResult(res, word_begin, mvs) if (tn.nodes is None or i >= len(word)): break ch = ord(word[i]) wraptn16 = RefOutArgWrapper(None) inoutres17 = Utils.tryGetValue(tn.nodes, ch, wraptn16) tn = wraptn16.value if (not inoutres17): break i += 1 need_test_unknown_vars = True if (res is not None): for r in res: if ((r.class0_.is_pronoun or r.class0_.is_noun or r.class0_.is_adjective) or (r.class0_.is_misc and r.class0_.is_conjunction) or r.class0_.is_preposition): need_test_unknown_vars = False elif (r.class0_.is_adverb and r.normal_case is not None): if (not LanguageHelper.endsWithEx(r.normal_case, "О", "А", None, None)): need_test_unknown_vars = False elif (r.normal_case == "МНОГО"): need_test_unknown_vars = False elif (r.class0_.is_verb and len(res) > 1): ok = False for rr in res: if (rr != r and rr.class0_ != r.class0_): ok = True break if (ok and not LanguageHelper.endsWith(word, "ИМ")): need_test_unknown_vars = False if (need_test_unknown_vars and LanguageHelper.isCyrillicChar(word[0])): gl = 0 sog = 0 j = 0 while j < len(word): if (LanguageHelper.isCyrillicVowel(word[j])): gl += 1 else: sog += 1 j += 1 if ((gl < 2) or (sog < 2)): need_test_unknown_vars = False if (need_test_unknown_vars and res is not None and len(res) == 1): if (res[0].class0_.is_verb): if ("н.вр." in res[0].misc.attrs and "нес.в." in res[0].misc.attrs and not "страд.з." in res[0].misc.attrs): need_test_unknown_vars = False elif ("б.вр." in res[0].misc.attrs and "сов.в." in res[0].misc.attrs): need_test_unknown_vars = False elif (res[0].normal_case is not None and LanguageHelper.endsWith(res[0].normal_case, "СЯ")): need_test_unknown_vars = False if (res[0].class0_.is_undefined and "прдктв." in res[0].misc.attrs): need_test_unknown_vars = False if (need_test_unknown_vars): if (self.m_root_reverce is None): return res tn = self.m_root_reverce tn0 = None for i in range(len(word) - 1, -1, -1): if (tn.lazy_pos > 0): self.__loadTreeNode(tn) ch = ord(word[i]) if (tn.nodes is None): break wrapnext18 = RefOutArgWrapper(None) inoutres19 = Utils.tryGetValue(tn.nodes, ch, wrapnext18) next0_ = wrapnext18.value if (not inoutres19): break tn = next0_ if (tn.lazy_pos > 0): self.__loadTreeNode(tn) if (tn.reverce_variants is not None): tn0 = tn break else: i = -1 if (tn0 is not None): glas = i < 4 while i >= 0: if (LanguageHelper.isCyrillicVowel(word[i]) or LanguageHelper.isLatinVowel(word[i])): glas = True break i -= 1 if (glas): for mv in tn0.reverce_variants: if (((not mv.class0_.is_verb and not mv.class0_.is_adjective and not mv.class0_.is_noun) and not mv.class0_.is_proper_surname and not mv.class0_.is_proper_geo) and not mv.class0_.is_proper_secname): continue ok = False for rr in res: if (rr.is_in_dictionary): if (rr.class0_ == mv.class0_ or rr.class0_.is_noun): ok = True break if (not mv.class0_.is_adjective and rr.class0_.is_verb): ok = True break if (ok): continue if (len(mv.tail) > 0 and not LanguageHelper.endsWith(word, mv.tail)): continue r = MorphWordForm(mv, word) if (not MorphWordForm._hasMorphEquals(res, r)): r.undef_coef = mv.coef if (res is None): res = list() res.append(r) if (word == "ПРИ" and res is not None): for i in range(len(res) - 1, -1, -1): if (res[i].class0_.is_proper_geo): del res[i] else: i = -1 if (res is None or len(res) == 0): return None MorphEngine.__sort(res, word) for v in res: if (v.normal_case is None): v.normal_case = word if (v.class0_.is_verb): if (v.normal_full is None and LanguageHelper.endsWith(v.normal_case, "ТЬСЯ")): v.normal_full = v.normal_case[0:0+len(v.normal_case) - 2] v.language = self.language if (v.class0_.is_preposition): v.normal_case = LanguageHelper.normalizePreposition(v.normal_case) mc = MorphClass() for i in range(len(res) - 1, -1, -1): if (not res[i].is_in_dictionary and res[i].class0_.is_adjective and len(res) > 1): if ("к.ф." in res[i].misc.attrs or "неизм." in res[i].misc.attrs): del res[i] continue if (res[i].is_in_dictionary): mc.value |= res[i].class0_.value else: i = -1 if (mc == MorphClass.VERB and len(res) > 1): for r in res: if (r.undef_coef > (100) and r.class0_ == MorphClass.ADJECTIVE): r.undef_coef = (0) if (len(res) == 0): return None return res
def getNormalCaseText(self, mc: 'MorphClass' = None, single_number: bool = False, gender: 'MorphGender' = MorphGender.UNDEFINED, keep_chars: bool = False) -> str: from pullenti.ner.core.MiscHelper import MiscHelper empty = True if (mc is not None and mc.is_preposition): return LanguageHelper.normalizePreposition(self.term) for it in self.morph.items: if (mc is not None and not mc.is_undefined): cc = (it.class0_.value) & (mc.value) if (cc == 0): continue if (MorphClass.isMiscInt(cc) and not MorphClass.isProperInt(cc) and mc.value != it.class0_.value): continue wf = Utils.asObjectOrNull(it, MorphWordForm) normal_full = False if (gender != MorphGender.UNDEFINED): if ((((it.gender) & (gender))) == (MorphGender.UNDEFINED)): if ((gender == MorphGender.MASCULINE and ((it.gender != MorphGender.UNDEFINED or it.number == MorphNumber.PLURAL)) and wf is not None) and wf.normal_full is not None): normal_full = True elif (gender == MorphGender.MASCULINE and it.class0_.is_personal_pronoun): pass else: continue if (not it.case_.is_undefined): empty = False if (wf is not None): if (single_number and it.number == MorphNumber.PLURAL and wf.normal_full is not None): le = len(wf.normal_case) if ((le == (len(wf.normal_full) + 2) and le > 4 and wf.normal_case[le - 2] == 'С') and wf.normal_case[le - 1] == 'Я'): res = wf.normal_case else: res = (wf.normal_full if normal_full else wf.normal_full) else: res = (wf.normal_full if normal_full else (Utils.ifNotNull(wf.normal_case, self.term))) if (single_number and mc is not None and mc == MorphClass.NOUN): if (res == "ДЕТИ"): res = "РЕБЕНОК" if (keep_chars): if (self.chars.is_all_lower): res = res.lower() elif (self.chars.is_capital_upper): res = MiscHelper.convertFirstCharUpperAndOtherLower( res) return res if (not empty): return None te = None if (single_number and mc is not None): bi = MorphBaseInfo._new549(MorphClass(mc), gender, MorphNumber.SINGULAR, self.morph.language) vars0_ = Morphology.getWordform(self.term, bi) if (vars0_ is not None): te = vars0_ if (self.chars.is_cyrillic_letter and te is None and len(self.term) > 3): ch0 = self.term[len(self.term) - 1] ch1 = self.term[len(self.term) - 2] if (ch0 == 'М' and ((ch1 == 'О' or ch1 == 'А'))): te = self.term[0:0 + len(self.term) - 2] elif (not LanguageHelper.isCyrillicVowel(ch1) and LanguageHelper.isCyrillicVowel(ch0)): te = self.term[0:0 + len(self.term) - 1] if (te is None): te = self.term if (keep_chars): if (self.chars.is_all_lower): return te.lower() elif (self.chars.is_capital_upper): return MiscHelper.convertFirstCharUpperAndOtherLower(te) return te