def __calc_coef(wf: 'MorphWordForm') -> int:
    """Compute an integer weight for a word form.

    One point is added for each of case, gender and number that is defined,
    and three points are subtracted for a synonym form.  For adjectives whose
    normal form has at least four characters, extra points are granted when
    the ending of the normal form fits the gender/number of the form.

    Args:
        wf: the word form to score.

    Returns:
        The accumulated weight (may be negative).
    """
    score = 0
    if not wf.case_.is_undefined:
        score += 1
    if wf.gender != MorphGender.UNDEFINED:
        score += 1
    if wf.number != MorphNumber.UNDEFINED:
        score += 1
    if wf.misc.is_synonym_form:
        score -= 3

    normal = wf.normal_case
    if normal is None or len(normal) < 4:
        return score

    if wf.class0_.is_adjective and wf.number != MorphNumber.PLURAL:
        final = normal[-1]
        penult = normal[-2]
        ending_fits = False
        if wf.gender == MorphGender.FEMINIE and final == 'Я':
            ending_fits = True
        if wf.gender == MorphGender.MASCULINE and final == 'Й':
            # the "-ИЙ" ending earns an additional point of its own
            if penult == 'И':
                score += 1
            ending_fits = True
        if wf.gender == MorphGender.NEUTER and final == 'Е':
            ending_fits = True
        if ending_fits and LanguageHelper.is_cyrillic_vowel(penult):
            score += 1
    elif wf.class0_.is_adjective and wf.number == MorphNumber.PLURAL:
        final = normal[-1]
        if final in ('Й', 'Е'):
            score += 1
    return score
def find(self, word : str, try_create : bool, lang_ : 'MorphLang') -> typing.List['DerivateGroup']:
    """Look up the derivate groups stored for a word.

    The word is traced character by character through the tree rooted at
    ``self._m_root`` (lazily loading nodes on the way).  When the exact word
    yields no groups and the word has at least four characters, a few
    respellings of its ending are tried recursively.

    Args:
        word: the word to look up.
        try_create: enables the respelling with a doubled / single "Н" for
            words ending in "Н?Й"; recursive calls always pass False.
        lang_: when given and not undefined, groups that do not contain the
            word for this language are dropped.

    Returns:
        A non-empty list of groups, or None when nothing was found.
    """
    if Utils.isNullOrEmpty(word):
        return None

    size = len(word)
    node = self._m_root
    depth = 0
    while depth < size:
        code = ord(word[depth])
        if node.nodes is None or code not in node.nodes:
            break
        node = node.nodes[code]
        if node.lazy_pos > 0:
            self.__load_tree_node(node)
        depth += 1

    groups = None
    if depth >= size and node.groups is not None:
        groups = [self._get_group(ref) for ref in node.groups]
        generated = [bool(g.is_generated) for g in groups]
        # when real and generated groups coexist, only the real ones are kept
        if any(generated) and not all(generated):
            groups = [g for g, is_gen in zip(groups, generated) if not is_gen]

    if groups is not None and lang_ is not None and not lang_.is_undefined:
        for idx in reversed(range(len(groups))):
            if not groups[idx].contains_word(word, lang_):
                del groups[idx]
    if groups:
        return groups

    if size < 4:
        return None

    last, prev, third = word[-1], word[-2], word[-3]
    if last == 'О' or (last == 'И' and prev == 'К'):
        stem = word[:-1]
        for ending in ("ИЙ", "ЫЙ"):
            hit = self.find(stem + ending, False, lang_)
            if hit is not None:
                return hit
        if last == 'О' and prev == 'Н':
            hit = self.find(stem + "СКИЙ", False, lang_)
            if hit is not None:
                return hit
    elif last in ('Я', 'Ь') and prev == 'С':
        stem = word[:-2]
        if stem == "ЯТЬ":
            return None
        hit = self.find(stem, False, lang_)
        if hit is not None:
            return hit
    elif last == 'Е' and prev == 'Ь':
        hit = self.find(word[:-2] + "ИЕ", False, lang_)
        if hit is not None:
            return hit
    elif last == 'Й' and third == 'Н' and try_create:
        fourth = word[-4]
        respelled = None
        if fourth != 'Н':
            # single "Н" after a vowel: retry with the "Н" doubled
            if LanguageHelper.is_cyrillic_vowel(fourth):
                respelled = word[:-3] + "Н" + word[-3:]
        else:
            # "НН": retry with one "Н" removed
            respelled = word[:-4] + word[-3:]
        if respelled is not None:
            hit = self.find(respelled, False, lang_)
            if hit is not None:
                return hit

    if last == 'Й' and prev == 'О':
        stem = word[:-2]
        for ending in ("ИЙ", "ЫЙ"):
            hit = self.find(stem + ending, False, lang_)
            if hit is not None:
                return hit
    return None
def process(self, word: str) -> typing.List['MorphWordForm']:
    """Process a single word and return its candidate word forms.

    The word is first matched against the rules stored along its path in the
    tree rooted at ``self.m_root``.  Depending on what was found, additional
    guessed forms may be taken from the reverse tree ``self.m_root_reverce``,
    which is walked from the last character backwards.  The result is then
    sorted and normalised.

    Args:
        word(str): the word; it must be in upper case

    Returns:
        The list of word forms, or None when the word is empty, has more than
        one character but no vowel, or nothing was found.  When guessing is
        required but there is no reverse tree, the dictionary result is
        returned as is (unsorted, possibly None or empty).
    """
    if (Utils.isNullOrEmpty(word)):
        return None
    res = None
    if (len(word) > 1):
        # a multi-character word without a single vowel is not analysed
        i = 0
        while i < len(word):
            ch = word[i]
            if (LanguageHelper.is_cyrillic_vowel(ch) or LanguageHelper.is_latin_vowel(ch)):
                break
            i += 1
        if (i >= len(word)):
            return res

    # Dictionary pass: at depth i the node's rules are asked for the tail word[i:].
    tn = self.m_root
    i = 0
    while i <= len(word):
        if (tn.lazy_pos > 0):
            self.__load_tree_node(tn)
        if (tn.rules is not None):
            word_begin = None
            word_end = word[i:]
            if (res is None):
                res = list()
            for r in tn.rules:
                wrapmvs20 = RefOutArgWrapper(None)
                inoutres21 = Utils.tryGetValue(r.variants, word_end, wrapmvs20)
                mvs = wrapmvs20.value
                if (inoutres21):
                    if (word_begin is None):
                        word_begin = word[:i]
                    r.process_result(res, word_begin, mvs)
        if (tn.nodes is None or i >= len(word)):
            break
        ch = ord(word[i])
        wraptn22 = RefOutArgWrapper(None)
        inoutres23 = Utils.tryGetValue(tn.nodes, ch, wraptn22)
        tn = wraptn22.value
        if (not inoutres23):
            break
        i += 1

    # Decide whether forms must additionally be guessed from the reverse tree.
    need_test_unknown_vars = True
    if (res is not None):
        for r in res:
            if ((r.class0_.is_pronoun or r.class0_.is_noun or r.class0_.is_adjective)
                    or (r.class0_.is_misc and r.class0_.is_conjunction)
                    or r.class0_.is_preposition):
                need_test_unknown_vars = False
            elif (r.class0_.is_adverb and r.normal_case is not None):
                if (not LanguageHelper.ends_with_ex(r.normal_case, "О", "А", None, None)):
                    need_test_unknown_vars = False
                elif (r.normal_case == "МНОГО"):
                    need_test_unknown_vars = False
            elif (r.class0_.is_verb and len(res) > 1):
                # a verb accompanied by a form of another class
                ok = False
                for rr in res:
                    if (rr != r and rr.class0_ != r.class0_):
                        ok = True
                        break
                if (ok and not LanguageHelper.ends_with(word, "ИМ")):
                    need_test_unknown_vars = False
    if (need_test_unknown_vars and LanguageHelper.is_cyrillic_char(word[0])):
        # gl counts Cyrillic vowels, sog counts every other character
        gl = 0
        sog = 0
        j = 0
        while j < len(word):
            if (LanguageHelper.is_cyrillic_vowel(word[j])):
                gl += 1
            else:
                sog += 1
            j += 1
        if ((gl < 2) or (sog < 2)):
            need_test_unknown_vars = False
    if (need_test_unknown_vars and res is not None and len(res) == 1):
        if (res[0].class0_.is_verb):
            if ("н.вр." in res[0].misc.attrs and "нес.в." in res[0].misc.attrs
                    and not "страд.з." in res[0].misc.attrs):
                need_test_unknown_vars = False
            elif ("б.вр." in res[0].misc.attrs and "сов.в." in res[0].misc.attrs):
                need_test_unknown_vars = False
            elif (res[0].normal_case is not None
                    and LanguageHelper.ends_with(res[0].normal_case, "СЯ")):
                need_test_unknown_vars = False
        if (res[0].class0_.is_undefined and "прдктв." in res[0].misc.attrs):
            need_test_unknown_vars = False

    if (need_test_unknown_vars):
        if (self.m_root_reverce is None):
            return res
        # Walk the reverse tree from the last character until a node with
        # reverse variants is met; i keeps the index where the walk stopped.
        tn = self.m_root_reverce
        tn0 = None
        for i in range(len(word) - 1, -1, -1):
            if (tn.lazy_pos > 0):
                self.__load_tree_node(tn)
            ch = ord(word[i])
            if (tn.nodes is None):
                break
            wrapnext24 = RefOutArgWrapper(None)
            inoutres25 = Utils.tryGetValue(tn.nodes, ch, wrapnext24)
            next0_ = wrapnext24.value
            if (not inoutres25):
                break
            tn = next0_
            if (tn.lazy_pos > 0):
                self.__load_tree_node(tn)
            if (tn.reverce_variants is not None):
                tn0 = tn
                break
        else:
            i = -1
        if (tn0 is not None):
            # unless the matched tail starts before index 4, a vowel is
            # required at or before the stop position
            glas = i < 4
            while i >= 0:
                if (LanguageHelper.is_cyrillic_vowel(word[i]) or LanguageHelper.is_latin_vowel(word[i])):
                    glas = True
                    break
                i -= 1
            if (glas):
                for mv in tn0.reverce_variants:
                    if (((not mv.class0_.is_verb and not mv.class0_.is_adjective and not mv.class0_.is_noun)
                            and not mv.class0_.is_proper_surname and not mv.class0_.is_proper_geo)
                            and not mv.class0_.is_proper_secname):
                        continue
                    # skip the guess when a dictionary form already covers it;
                    # res may still be None here (no rule node was met), so the
                    # scan is guarded instead of iterating None
                    ok = False
                    for rr in (res or ()):
                        if (rr.is_in_dictionary):
                            if (rr.class0_ == mv.class0_ or rr.class0_.is_noun):
                                ok = True
                                break
                            if (not mv.class0_.is_adjective and rr.class0_.is_verb):
                                ok = True
                                break
                    if (ok):
                        continue
                    if (len(mv.tail) > 0 and not LanguageHelper.ends_with(word, mv.tail)):
                        continue
                    r = MorphWordForm(mv, word)
                    # with no result list yet there is nothing to compare against
                    if (res is None or not MorphWordForm._has_morph_equals(res, r)):
                        r.undef_coef = mv.coef
                        if (res is None):
                            res = list()
                        res.append(r)

    if (word == "ПРИ" and res is not None):
        for i in range(len(res) - 1, -1, -1):
            if (res[i].class0_.is_proper_geo):
                del res[i]
    if (res is None or len(res) == 0):
        return None
    MorphEngine.__sort(res, word)
    for v in res:
        if (v.normal_case is None):
            v.normal_case = word
        if (v.class0_.is_verb):
            if (v.normal_full is None and LanguageHelper.ends_with(v.normal_case, "ТЬСЯ")):
                # full normal form is the normal case without its last two characters
                v.normal_full = v.normal_case[0:0 + len(v.normal_case) - 2]
        v.language = self.language
        if (v.class0_.is_preposition):
            v.normal_case = LanguageHelper.normalize_preposition(v.normal_case)

    # Drop guessed adjectives marked "к.ф." / "неизм." when other forms exist,
    # and collect the classes of the dictionary forms into mc.
    mc = MorphClass()
    for i in range(len(res) - 1, -1, -1):
        if (not res[i].is_in_dictionary and res[i].class0_.is_adjective and len(res) > 1):
            if ("к.ф." in res[i].misc.attrs or "неизм." in res[i].misc.attrs):
                del res[i]
                continue
        if (res[i].is_in_dictionary):
            mc.value |= res[i].class0_.value
    if (mc == MorphClass.VERB and len(res) > 1):
        for r in res:
            if (r.undef_coef > (100) and r.class0_ == MorphClass.ADJECTIVE):
                r.undef_coef = (0)
    if (len(res) == 0):
        return None
    return res
def get_wordform(self, word: str, cla: 'MorphClass', gender: 'MorphGender', cas: 'MorphCase', num: 'MorphNumber', add_info: 'MorphWordForm') -> str:
    """Build the form of a word for the requested class, gender, case and number.

    The rules found along the word's path in the tree rooted at
    ``self.m_root`` are tried first.  If none of their variants is applicable,
    the reverse tree ``self.m_root_reverce`` is walked from the end of the
    word, and finally a few hard-coded endings are applied for surnames.

    Args:
        word: the source word.
        cla: requested class; a variant must share at least one class bit.
        gender: requested gender (``MorphGender.UNDEFINED`` to ignore).
        cas: requested case; when undefined, only nominative or case-less
            variants are accepted in the dictionary pass.
        num: requested number (``MorphNumber.UNDEFINED`` to ignore).
        add_info: optional word form passed to ``calc_eq_coef`` of each
            variant to rank (dictionary pass) or reject (reverse pass) it.

    Returns:
        The built form, or None when nothing suitable was found (including
        the case where no reverse tree is available).
    """
    tn = self.m_root
    found = False
    res = None
    max_coef = -10
    i = 0
    while i <= len(word):
        if (tn.lazy_pos > 0):
            self.__load_tree_node(tn)
        if (tn.rules is not None):
            word_begin = word[:i]
            word_end = word[i:]
            for r in tn.rules:
                if (word_end in r.variants):
                    for li in r.variants_list:
                        for v in li:
                            if ((((cla.value) & (v.class0_.value))) != 0 and v.normal_tail is not None):
                                if (cas.is_undefined):
                                    if (v.case_.is_nominative or v.case_.is_undefined):
                                        pass
                                    else:
                                        continue
                                elif (((v.case_) & cas).is_undefined):
                                    continue
                                # surname and non-surname variants never mix
                                sur = cla.is_proper_surname
                                sur0 = v.class0_.is_proper_surname
                                if (sur or sur0):
                                    if (sur != sur0):
                                        continue
                                found = True
                                if (gender != MorphGender.UNDEFINED):
                                    if ((((gender) & (v.gender))) == (MorphGender.UNDEFINED)):
                                        # a gender mismatch is tolerated for plural requests
                                        if (num is not None and num == MorphNumber.PLURAL):
                                            pass
                                        else:
                                            continue
                                if (num != MorphNumber.UNDEFINED):
                                    if ((((num) & (v.number))) == (MorphNumber.UNDEFINED)):
                                        continue
                                candidate = word_begin + v.tail
                                co = 0
                                if (add_info is not None):
                                    co = v.calc_eq_coef(add_info)
                                if (res is None or co > max_coef):
                                    res = candidate
                                    max_coef = co
                                if (max_coef == 0):
                                    # the variant's normal form reproduces the input word
                                    if ((word_begin + v.normal_tail) == word):
                                        return candidate
        if (tn.nodes is None or i >= len(word)):
            break
        ch = ord(word[i])
        wraptn28 = RefOutArgWrapper(None)
        inoutres29 = Utils.tryGetValue(tn.nodes, ch, wraptn28)
        tn = wraptn28.value
        if (not inoutres29):
            break
        i += 1
    if (found):
        return res

    # Without a reverse tree nothing can be guessed; previously this case
    # failed with AttributeError on the first node access.
    if (self.m_root_reverce is None):
        return None
    tn = self.m_root_reverce
    tn0 = None
    for idx in range(len(word) - 1, -1, -1):
        if (tn.lazy_pos > 0):
            self.__load_tree_node(tn)
        ch = ord(word[idx])
        if (tn.nodes is None):
            break
        wrapnext30 = RefOutArgWrapper(None)
        inoutres31 = Utils.tryGetValue(tn.nodes, ch, wrapnext30)
        next0_ = wrapnext30.value
        if (not inoutres31):
            break
        tn = next0_
        if (tn.lazy_pos > 0):
            self.__load_tree_node(tn)
        if (tn.reverce_variants is not None):
            tn0 = tn
            break
    if (tn0 is None):
        return None

    for mv in tn0.reverce_variants:
        if ((((mv.class0_.value) & (cla.value))) != 0 and mv.rule is not None):
            if (len(mv.tail) > 0 and not LanguageHelper.ends_with(word, mv.tail)):
                continue
            word_begin = word[0:0 + len(word) - len(mv.tail)]
            for liv in mv.rule.variants_list:
                for v in liv:
                    if ((((v.class0_.value) & (cla.value))) != 0):
                        sur = cla.is_proper_surname
                        sur0 = v.class0_.is_proper_surname
                        if (sur or sur0):
                            if (sur != sur0):
                                continue
                        if (not cas.is_undefined):
                            if (((cas) & v.case_).is_undefined and not v.case_.is_undefined):
                                continue
                        if (num != MorphNumber.UNDEFINED):
                            if (v.number != MorphNumber.UNDEFINED):
                                if ((((v.number) & (num))) == (MorphNumber.UNDEFINED)):
                                    continue
                        if (gender != MorphGender.UNDEFINED):
                            if (v.gender != MorphGender.UNDEFINED):
                                if ((((v.gender) & (gender))) == (MorphGender.UNDEFINED)):
                                    continue
                        if (add_info is not None):
                            if (v.calc_eq_coef(add_info) < 0):
                                continue
                        # the first acceptable variant wins
                        res = (word_begin + v.tail)
                        if (res == word):
                            return word
                        return res

    if (cla.is_proper_surname):
        # hard-coded endings for feminine surnames
        if ((gender == MorphGender.FEMINIE and cla.is_proper_surname and not cas.is_undefined)
                and not cas.is_nominative):
            if (word.endswith("ВА") or word.endswith("НА")):
                if (cas.is_accusative):
                    return word[0:0 + len(word) - 1] + "У"
                return word[0:0 + len(word) - 1] + "ОЙ"
        if (gender == MorphGender.FEMINIE):
            last = word[len(word) - 1]
            if (last == 'А' or last == 'Я' or last == 'О'):
                return word
            if (LanguageHelper.is_cyrillic_vowel(last)):
                return word[0:0 + len(word) - 1] + "А"
            elif (last == 'Й'):
                return word[0:0 + len(word) - 2] + "АЯ"
            else:
                return word + "А"
    return res
def find(self, word: str, try_create: bool, lang_: 'MorphLang') -> typing.List['DerivateGroup']:
    """Look up, and optionally generate, the derivate groups for a word.

    The word is traced through the tree rooted at ``self._m_root``; nodes
    with a positive ``lazy_pos`` are deserialized from ``self.__m_buf`` on
    first visit.  If the exact word gives nothing and it has at least four
    characters, several respellings of its ending are tried.  With
    ``try_create`` set, the word is finally split into prefix + known rest
    and new groups are created from the rest's groups and registered via
    ``self.add``.

    Args:
        word: the word to look up.
        try_create: allow the "Н"/"НН" respelling and prefix-based generation.
        lang_: when given and not undefined, groups that do not contain the
            word for this language are dropped.

    Returns:
        A non-empty list of groups, or None.
    """
    if Utils.isNullOrEmpty(word):
        return None

    size = len(word)
    node = self._m_root
    depth = 0
    while depth < size:
        code = ord(word[depth])
        if node.nodes is None:
            break
        child_ref = RefOutArgWrapper(None)
        present = Utils.tryGetValue(node.nodes, code, child_ref)
        if not present:
            break
        node = child_ref.value
        if node.lazy_pos > 0:
            pos_ref = RefOutArgWrapper(node.lazy_pos)
            DeserializeHelper.deserialize_tree_node(
                self.__m_buf, self, node, True, pos_ref)
            node.lazy_pos = 0
        depth += 1

    # node.groups may hold either a list of groups or a single group
    payload = None if depth < size else node.groups
    groups = None
    if isinstance(payload, list):
        groups = list(Utils.asObjectOrNull(payload, list))
        generated = [bool(g.is_generated) for g in groups]
        # when real and generated groups coexist, only the real ones are kept
        if any(generated) and not all(generated):
            groups = [g for g, is_gen in zip(groups, generated) if not is_gen]
    elif isinstance(payload, DerivateGroup):
        groups = [Utils.asObjectOrNull(payload, DerivateGroup)]

    if groups is not None and lang_ is not None and not lang_.is_undefined:
        for idx in reversed(range(len(groups))):
            if not groups[idx].contains_word(word, lang_):
                del groups[idx]
    if groups:
        return groups

    if size < 4:
        return None

    last, prev, third = word[-1], word[-2], word[-3]
    if last == 'О' or (last == 'И' and prev == 'К'):
        stem = word[:-1]
        for ending in ("ИЙ", "ЫЙ"):
            hit = self.find(stem + ending, False, lang_)
            if hit is not None:
                return hit
        if last == 'О' and prev == 'Н':
            hit = self.find(stem + "СКИЙ", False, lang_)
            if hit is not None:
                return hit
    elif last in ('Я', 'Ь') and prev == 'С':
        stem = word[:-2]
        if stem == "ЯТЬ":
            return None
        hit = self.find(stem, False, lang_)
        if hit is not None:
            return hit
    elif last == 'Е' and prev == 'Ь':
        hit = self.find(word[:-2] + "ИЕ", False, lang_)
        if hit is not None:
            return hit
    elif last == 'Й' and third == 'Н' and try_create:
        fourth = word[-4]
        respelled = None
        if fourth != 'Н':
            # single "Н" after a vowel: retry with the "Н" doubled
            if LanguageHelper.is_cyrillic_vowel(fourth):
                respelled = word[:-3] + "Н" + word[-3:]
        else:
            # "НН": retry with one "Н" removed
            respelled = word[:-4] + word[-3:]
        if respelled is not None:
            hit = self.find(respelled, False, lang_)
            if hit is not None:
                return hit

    if last == 'Й' and prev == 'О':
        stem = word[:-2]
        for ending in ("ИЙ", "ЫЙ"):
            hit = self.find(stem + ending, False, lang_)
            if hit is not None:
                return hit

    if not try_create:
        return None

    # Prefix generation: the first split whose rest is known decides the result.
    for cut in range(1, size - 4 + 1):
        rest = word[cut:]
        base_groups = self.find(rest, False, lang_)
        if base_groups is None:
            continue
        prefix = word[:cut]
        created = []
        for dg in base_groups:
            if dg.is_dummy or dg.is_generated:
                continue
            if dg.not_generate and len(rest) < 5:
                continue
            derived = dg.create_by_prefix(prefix, lang_)
            if derived is not None:
                created.append(derived)
                self.add(derived)
        if len(created) == 0:
            return None
        return created
    return None
def get_variants(rus_or_lat: str) -> typing.List[str]:
    """Enumerate the counterpart spellings of a string via the accord table.

    The upper-cased input is cut, left to right, into fragments; at each
    position only the longest matching accords are kept.  Every combination
    of the counterpart strings of those accords is then assembled.  When the
    input does not start with a Cyrillic character, assembled strings in
    which "Й" is first or does not follow a Cyrillic vowel are dropped.

    Args:
        rus_or_lat: the source string.

    Returns:
        The list of assembled strings; empty when the input is empty or some
        position cannot be matched by any accord.
    """
    variants = list()
    if Utils.isNullOrEmpty(rus_or_lat):
        return variants
    rus_or_lat = rus_or_lat.upper()
    is_rus = LanguageHelper.is_cyrillic_char(rus_or_lat[0])
    total = len(rus_or_lat)

    # layers[k] holds the alternative accords for the k-th fragment
    layers = list()
    pos = 0
    while pos < total:
        best = list()
        best_len = 0
        for acc in RusLatAccord.__get_accords():
            if is_rus and len(acc.rus) > 0:
                key = acc.rus
            elif not is_rus and len(acc.lat) > 0:
                key = acc.lat
            else:
                continue
            if len(key) < best_len:
                continue
            if not RusLatAccord.__is_pref(rus_or_lat, pos, key):
                continue
            # tail-only accords must reach the end of the string
            if acc.on_tail and (len(key) + pos) < total:
                continue
            if len(key) > best_len:
                best_len = len(key)
                best.clear()
            best.append(acc)
        if len(best) == 0 or best_len == 0:
            return variants
        layers.append(best)
        pos += best_len

    if not layers:
        return variants

    picks = [0] * len(layers)
    buf = io.StringIO()
    while True:
        Utils.setLengthStringIO(buf, 0)
        for layer, pick in zip(layers, picks):
            acc = layer[pick]
            print((acc.lat if is_rus else acc.rus), end="", file=buf)

        valid = True
        if not is_rus:
            k = 0
            while k < buf.tell():
                if Utils.getCharAtStringIO(buf, k) == 'Й':
                    if k == 0:
                        valid = False
                        break
                    if not LanguageHelper.is_cyrillic_vowel(
                            Utils.getCharAtStringIO(buf, k - 1)):
                        valid = False
                        break
                k += 1
        if valid:
            variants.append(Utils.toStringStringIO(buf))

        # advance the rightmost pick, carrying to the left on overflow
        slot = len(picks) - 1
        while slot >= 0:
            picks[slot] += 1
            if picks[slot] < len(layers[slot]):
                break
            picks[slot] = 0
            slot -= 1
        if slot < 0:
            break
    return variants