Example #1
0
 def __calc_coef(wf: 'MorphWordForm') -> int:
     """Score a word form by the grammatical detail it carries.

     One point is added for each of: a defined case, a defined gender and
     a defined number.  Three points are subtracted for a synonym form.
     For adjectives whose normal form has at least four characters, extra
     points depend on the last two characters of that normal form.

     Args:
         wf: the word form to score.

     Returns:
         The accumulated score (it may be negative).
     """
     score = 0
     if not wf.case_.is_undefined:
         score += 1
     if wf.gender != MorphGender.UNDEFINED:
         score += 1
     if wf.number != MorphNumber.UNDEFINED:
         score += 1
     if wf.misc.is_synonym_form:
         score -= 3

     normal = wf.normal_case
     if normal is None or len(normal) < 4:
         return score
     if not wf.class0_.is_adjective:
         return score

     ending = normal[-1]
     if wf.number != MorphNumber.PLURAL:
         before_ending = normal[-2]
         # The ending of the normal form has to agree with the gender.
         agrees = False
         if wf.gender == MorphGender.FEMINIE:
             agrees = ending == 'Я'
         if wf.gender == MorphGender.MASCULINE and ending == 'Й':
             if before_ending == 'И':
                 score += 1
             agrees = True
         if wf.gender == MorphGender.NEUTER and ending == 'Е':
             agrees = True
         if agrees and LanguageHelper.is_cyrillic_vowel(before_ending):
             score += 1
     elif wf.number == MorphNumber.PLURAL:
         if ending in ('Й', 'Е'):
             score += 1
     return score
 def find(self, word: str, try_create: bool, lang_: 'MorphLang') -> typing.List['DerivateGroup']:
     """Look up the derivate groups registered for a word.

     The word is first searched in the character tree rooted at
     ``self._m_root``.  If nothing is found there and the word has at
     least four characters, a few ending substitutions are tried by
     calling ``find`` again on the rewritten word.

     Args:
         word: the word to look up.
         try_create: in this variant it only enables the rewrite used for
             words ending in 'Й' with 'Н' third from the end.
         lang_: when given and not undefined, groups that do not contain
             the word for this language are dropped.

     Returns:
         A non-empty list of groups, or None when nothing was found.
     """
     if Utils.isNullOrEmpty(word):
         return None

     # Descend the tree one character at a time, loading lazy nodes.
     node = self._m_root
     matched = 0
     for ch in word:
         children = node.nodes
         if children is None:
             break
         code = ord(ch)
         if code not in children:
             break
         node = children[code]
         if node.lazy_pos > 0:
             self.__load_tree_node(node)
         matched += 1

     groups = None
     if matched >= len(word) and node.groups is not None:
         groups = [self._get_group(g) for g in node.groups]
         has_generated = any(g.is_generated for g in groups)
         has_regular = any(not g.is_generated for g in groups)
         # Generated groups are dropped only when regular ones exist too.
         if has_generated and has_regular:
             groups = [g for g in groups if not g.is_generated]
     if groups is not None and lang_ is not None and not lang_.is_undefined:
         groups = [g for g in groups if g.contains_word(word, lang_)]
     if groups:
         return groups

     if len(word) < 4:
         return None
     last, prev, third = word[-1], word[-2], word[-3]
     without_last = word[:-1]
     without_two = word[:-2]

     if last == 'О' or (last == 'И' and prev == 'К'):
         candidates = [without_last + "ИЙ", without_last + "ЫЙ"]
         if last == 'О' and prev == 'Н':
             candidates.append(without_last + "СКИЙ")
         for candidate in candidates:
             found = self.find(candidate, False, lang_)
             if found is not None:
                 return found
     elif last in ('Я', 'Ь') and prev == 'С':
         if without_two == "ЯТЬ":
             return None
         found = self.find(without_two, False, lang_)
         if found is not None:
             return found
     elif last == 'Е' and prev == 'Ь':
         found = self.find(without_two + "ИЕ", False, lang_)
         if found is not None:
             return found
     elif last == 'Й' and third == 'Н' and try_create:
         fourth = word[-4]
         candidate = None
         if fourth == 'Н':
             # Drop the 'Н' that stands fourth from the end.
             candidate = word[:-4] + word[-3:]
         elif LanguageHelper.is_cyrillic_vowel(fourth):
             # Insert an extra 'Н' before the last three characters.
             candidate = word[:-3] + "Н" + word[-3:]
         if candidate is not None:
             found = self.find(candidate, False, lang_)
             if found is not None:
                 return found

     if last == 'Й' and prev == 'О':
         for tail in ("ИЙ", "ЫЙ"):
             found = self.find(without_two + tail, False, lang_)
             if found is not None:
                 return found
     return None
Example #3
0
 def process(self, word: str) -> typing.List['MorphWordForm']:
     """Process a single word.

     The word is matched against the rule tree rooted at ``self.m_root``.
     Depending on what was found, additional forms may be guessed from the
     reversed tree ``self.m_root_reverce``.  The result is then sorted and
     normalised.

     Args:
         word(str): the word; it must be in upper case

     Returns:
         The list of word forms, or None when the word is empty, has more
         than one character but no vowel, or no form was produced.
     """
     if (Utils.isNullOrEmpty(word)):
         return None
     res = None
     if (len(word) > 1):
         # A multi-character word without any vowel is not analysed.
         if (not any(LanguageHelper.is_cyrillic_vowel(ch)
                     or LanguageHelper.is_latin_vowel(ch) for ch in word)):
             return res
     tn = self.m_root
     i = 0
     while i <= len(word):
         if (tn.lazy_pos > 0):
             self.__load_tree_node(tn)
         if (tn.rules is not None):
             # After i characters the word is split into the part already
             # consumed (word_begin) and the remaining part (word_end).
             word_begin = word[:i]
             word_end = word[i:]
             if (res is None):
                 res = list()
             for r in tn.rules:
                 wrapmvs = RefOutArgWrapper(None)
                 if (Utils.tryGetValue(r.variants, word_end, wrapmvs)):
                     r.process_result(res, word_begin, wrapmvs.value)
         if (tn.nodes is None or i >= len(word)):
             break
         wraptn = RefOutArgWrapper(None)
         inoutres = Utils.tryGetValue(tn.nodes, ord(word[i]), wraptn)
         tn = wraptn.value
         if (not inoutres):
             break
         i += 1
     # Decide whether forms should also be guessed from the reversed tree.
     need_test_unknown_vars = True
     if (res is not None):
         for r in res:
             if ((r.class0_.is_pronoun or r.class0_.is_noun
                  or r.class0_.is_adjective)
                     or (r.class0_.is_misc and r.class0_.is_conjunction)
                     or r.class0_.is_preposition):
                 need_test_unknown_vars = False
             elif (r.class0_.is_adverb and r.normal_case is not None):
                 if (not LanguageHelper.ends_with_ex(
                         r.normal_case, "О", "А", None, None)):
                     need_test_unknown_vars = False
                 elif (r.normal_case == "МНОГО"):
                     need_test_unknown_vars = False
             elif (r.class0_.is_verb and len(res) > 1):
                 ok = False
                 for rr in res:
                     if (rr != r and rr.class0_ != r.class0_):
                         ok = True
                         break
                 if (ok and not LanguageHelper.ends_with(word, "ИМ")):
                     need_test_unknown_vars = False
     if (need_test_unknown_vars
             and LanguageHelper.is_cyrillic_char(word[0])):
         # Cyrillic words need at least two vowels and two other
         # characters to be worth guessing.
         gl = 0
         sog = 0
         for ch in word:
             if (LanguageHelper.is_cyrillic_vowel(ch)):
                 gl += 1
             else:
                 sog += 1
         if ((gl < 2) or (sog < 2)):
             need_test_unknown_vars = False
     if (need_test_unknown_vars and res is not None and len(res) == 1):
         if (res[0].class0_.is_verb):
             if ("н.вр." in res[0].misc.attrs
                     and "нес.в." in res[0].misc.attrs
                     and not "страд.з." in res[0].misc.attrs):
                 need_test_unknown_vars = False
             elif ("б.вр." in res[0].misc.attrs
                   and "сов.в." in res[0].misc.attrs):
                 need_test_unknown_vars = False
             elif (res[0].normal_case is not None
                   and LanguageHelper.ends_with(res[0].normal_case, "СЯ")):
                 need_test_unknown_vars = False
         if (res[0].class0_.is_undefined
                 and "прдктв." in res[0].misc.attrs):
             need_test_unknown_vars = False
     if (need_test_unknown_vars):
         if (self.m_root_reverce is None):
             return res
         # Walk the reversed tree from the last character backwards until
         # a node carrying reverse variants is reached.
         tn = self.m_root_reverce
         tn0 = None
         for i in range(len(word) - 1, -1, -1):
             if (tn.lazy_pos > 0):
                 self.__load_tree_node(tn)
             ch = ord(word[i])
             if (tn.nodes is None):
                 break
             wrapnext = RefOutArgWrapper(None)
             inoutres = Utils.tryGetValue(tn.nodes, ch, wrapnext)
             next0_ = wrapnext.value
             if (not inoutres):
                 break
             tn = next0_
             if (tn.lazy_pos > 0):
                 self.__load_tree_node(tn)
             if (tn.reverce_variants is not None):
                 tn0 = tn
                 break
         else:
             # The whole word was consumed; i is read below.
             i = -1
         if (tn0 is not None):
             # Accept when the match started before index 4, or when a
             # vowel occurs at or before the position where it stopped.
             glas = i < 4
             while i >= 0:
                 if (LanguageHelper.is_cyrillic_vowel(word[i])
                         or LanguageHelper.is_latin_vowel(word[i])):
                     glas = True
                     break
                 i -= 1
             if (glas):
                 for mv in tn0.reverce_variants:
                     if (((not mv.class0_.is_verb
                           and not mv.class0_.is_adjective
                           and not mv.class0_.is_noun)
                          and not mv.class0_.is_proper_surname
                          and not mv.class0_.is_proper_geo)
                             and not mv.class0_.is_proper_secname):
                         continue
                     ok = False
                     # res is still None when no visited node had rules;
                     # iterating it directly would raise TypeError.
                     for rr in (res if res is not None else ()):
                         if (rr.is_in_dictionary):
                             if (rr.class0_ == mv.class0_
                                     or rr.class0_.is_noun):
                                 ok = True
                                 break
                             if (not mv.class0_.is_adjective
                                     and rr.class0_.is_verb):
                                 ok = True
                                 break
                     if (ok):
                         continue
                     if (len(mv.tail) > 0 and
                             not LanguageHelper.ends_with(word, mv.tail)):
                         continue
                     r = MorphWordForm(mv, word)
                     # With no results yet there is nothing to compare to.
                     if (res is None
                             or not MorphWordForm._has_morph_equals(res, r)):
                         r.undef_coef = mv.coef
                         if (res is None):
                             res = list()
                         res.append(r)
     if (word == "ПРИ" and res is not None):
         # For this word the proper-geo forms are discarded.
         for i in range(len(res) - 1, -1, -1):
             if (res[i].class0_.is_proper_geo):
                 del res[i]
     if (res is None or len(res) == 0):
         return None
     MorphEngine.__sort(res, word)
     for v in res:
         if (v.normal_case is None):
             v.normal_case = word
         if (v.class0_.is_verb):
             if (v.normal_full is None
                     and LanguageHelper.ends_with(v.normal_case, "ТЬСЯ")):
                 # The full normal form is the normal case minus its last
                 # two characters.
                 v.normal_full = v.normal_case[0:0 + len(v.normal_case) - 2]
         v.language = self.language
         if (v.class0_.is_preposition):
             v.normal_case = LanguageHelper.normalize_preposition(
                 v.normal_case)
     # mc collects the class bits of the forms that are in the dictionary.
     # Iterate backwards because items are deleted on the way.
     mc = MorphClass()
     for i in range(len(res) - 1, -1, -1):
         if (not res[i].is_in_dictionary and res[i].class0_.is_adjective
                 and len(res) > 1):
             if ("к.ф." in res[i].misc.attrs
                     or "неизм." in res[i].misc.attrs):
                 del res[i]
                 continue
         if (res[i].is_in_dictionary):
             mc.value |= res[i].class0_.value
     if (mc == MorphClass.VERB and len(res) > 1):
         # Only verbs came from the dictionary: reset the coefficient of
         # adjective forms whose undef_coef exceeds 100.
         for r in res:
             if (r.undef_coef > (100)
                     and r.class0_ == MorphClass.ADJECTIVE):
                 r.undef_coef = (0)
     if (len(res) == 0):
         return None
     return res
Example #4
0
 def get_wordform(self, word: str, cla: 'MorphClass', gender: 'MorphGender',
                  cas: 'MorphCase', num: 'MorphNumber',
                  add_info: 'MorphWordForm') -> str:
     """Build the form of a word for the requested class, gender, case and number.

     The rule tree rooted at ``self.m_root`` is searched first.  If no rule
     variant passes the class, case and surname filters there, the reversed
     tree ``self.m_root_reverce`` is used, followed by a fallback for
     feminine proper surnames.

     Args:
         word: the source word.
         cla: the class mask; variants must share at least one bit with it.
         gender: the requested gender (``MorphGender.UNDEFINED`` for any).
         cas: the requested case; when undefined, only nominative or
             case-less variants are accepted by the forward search.
         num: the requested number (``MorphNumber.UNDEFINED`` for any).
         add_info: optional word form; when given, candidates are ranked
             or filtered with ``calc_eq_coef(add_info)``.

     Returns:
         The generated form, or None when nothing suitable was found.
     """
     tn = self.m_root
     find = False
     res = None
     max_coef = -10
     i = 0
     while i <= len(word):
         if (tn.lazy_pos > 0):
             self.__load_tree_node(tn)
         if (tn.rules is not None):
             # Split the word at the current depth of the tree.
             word_begin = word[:i]
             word_end = word[i:]
             for r in tn.rules:
                 if (word_end not in r.variants):
                     continue
                 for li in r.variants_list:
                     for v in li:
                         if ((((cla.value) & (v.class0_.value))) == 0
                                 or v.normal_tail is None):
                             continue
                         if (cas.is_undefined):
                             if (not (v.case_.is_nominative
                                      or v.case_.is_undefined)):
                                 continue
                         elif (((v.case_) & cas).is_undefined):
                             continue
                         # Surname and non-surname variants never mix.
                         sur = cla.is_proper_surname
                         sur0 = v.class0_.is_proper_surname
                         if ((sur or sur0) and sur != sur0):
                             continue
                         # A candidate passed class, case and surname
                         # filters; the reversed tree is skipped even if
                         # the checks below reject it.
                         find = True
                         if (gender != MorphGender.UNDEFINED):
                             if ((((gender) & (v.gender))) == (
                                     MorphGender.UNDEFINED)):
                                 # A gender mismatch is tolerated only
                                 # when plural is requested.
                                 if (not (num is not None
                                          and num == MorphNumber.PLURAL)):
                                     continue
                         if (num != MorphNumber.UNDEFINED):
                             if ((((num) & (v.number))) == (
                                     MorphNumber.UNDEFINED)):
                                 continue
                         re = word_begin + v.tail
                         co = 0
                         if (add_info is not None):
                             co = v.calc_eq_coef(add_info)
                         if (res is None or co > max_coef):
                             res = re
                             max_coef = co
                         if (max_coef == 0):
                             if ((word_begin + v.normal_tail) == word):
                                 return re
         if (tn.nodes is None or i >= len(word)):
             break
         wraptn = RefOutArgWrapper(None)
         inoutres = Utils.tryGetValue(tn.nodes, ord(word[i]), wraptn)
         tn = wraptn.value
         if (not inoutres):
             break
         i += 1
     if (find):
         return res
     tn = self.m_root_reverce
     if (tn is None):
         # No reversed tree is available: same outcome as finding no
         # reverse node below.
         return None
     # Walk the reversed tree from the last character backwards until a
     # node carrying reverse variants is reached.
     tn0 = None
     for i in range(len(word) - 1, -1, -1):
         if (tn.lazy_pos > 0):
             self.__load_tree_node(tn)
         ch = ord(word[i])
         if (tn.nodes is None):
             break
         wrapnext = RefOutArgWrapper(None)
         inoutres = Utils.tryGetValue(tn.nodes, ch, wrapnext)
         next0_ = wrapnext.value
         if (not inoutres):
             break
         tn = next0_
         if (tn.lazy_pos > 0):
             self.__load_tree_node(tn)
         if (tn.reverce_variants is not None):
             tn0 = tn
             break
     if (tn0 is None):
         return None
     for mv in tn0.reverce_variants:
         if ((((mv.class0_.value) & (cla.value))) == 0
                 or mv.rule is None):
             continue
         if (len(mv.tail) > 0
                 and not LanguageHelper.ends_with(word, mv.tail)):
             continue
         word_begin = word[:len(word) - len(mv.tail)]
         for liv in mv.rule.variants_list:
             for v in liv:
                 if ((((v.class0_.value) & (cla.value))) == 0):
                     continue
                 sur = cla.is_proper_surname
                 sur0 = v.class0_.is_proper_surname
                 if ((sur or sur0) and sur != sur0):
                     continue
                 if (not cas.is_undefined):
                     if (((cas) & v.case_).is_undefined
                             and not v.case_.is_undefined):
                         continue
                 if (num != MorphNumber.UNDEFINED):
                     if (v.number != MorphNumber.UNDEFINED):
                         if ((((v.number) &
                               (num))) == (MorphNumber.UNDEFINED)):
                             continue
                 if (gender != MorphGender.UNDEFINED):
                     if (v.gender != MorphGender.UNDEFINED):
                         if ((((v.gender) & (gender))) == (
                                 MorphGender.UNDEFINED)):
                             continue
                 if (add_info is not None):
                     if (v.calc_eq_coef(add_info) < 0):
                         continue
                 # The first variant passing every filter wins.
                 return word_begin + v.tail
     if (cla.is_proper_surname):
         # Fallback for feminine surnames that matched no variant.
         if ((gender == MorphGender.FEMINIE and not cas.is_undefined)
                 and not cas.is_nominative):
             if (word.endswith(("ВА", "НА"))):
                 if (cas.is_accusative):
                     return word[:-1] + "У"
                 return word[:-1] + "ОЙ"
         if (gender == MorphGender.FEMINIE):
             last = word[-1]
             if (last == 'А' or last == 'Я' or last == 'О'):
                 return word
             if (LanguageHelper.is_cyrillic_vowel(last)):
                 return word[:-1] + "А"
             elif (last == 'Й'):
                 return word[:-2] + "АЯ"
             else:
                 return word + "А"
     return res
Example #5
0
 def find(self, word: str, try_create: bool,
          lang_: 'MorphLang') -> typing.List['DerivateGroup']:
     """Look up the derivate groups for a word, optionally generating them.

     The word is searched in the character tree rooted at ``self._m_root``
     (lazy nodes are deserialized on the way).  If nothing is found and the
     word has at least four characters, several ending substitutions are
     tried.  Finally, when ``try_create`` is set, the word is split into a
     prefix and a known remainder, and new groups are created from the
     remainder's groups and registered with ``self.add``.

     Args:
         word: the word to look up.
         try_create: allow the 'Й'/'Н' rewrite and prefix-based generation.
         lang_: when given and not undefined, groups that do not contain
             the word for this language are dropped.

     Returns:
         A non-empty list of groups, or None when nothing was found.
     """
     if Utils.isNullOrEmpty(word):
         return None

     node = self._m_root
     matched = 0
     for ch in word:
         if node.nodes is None:
             break
         holder = RefOutArgWrapper(None)
         if not Utils.tryGetValue(node.nodes, ord(ch), holder):
             break
         node = holder.value
         if node.lazy_pos > 0:
             # Deserialize the node on first visit and mark it as loaded.
             DeserializeHelper.deserialize_tree_node(
                 self.__m_buf, self, node, True,
                 RefOutArgWrapper(node.lazy_pos))
             node.lazy_pos = 0
         matched += 1

     # A node stores either a list of groups or a single group.
     stored = node.groups if matched >= len(word) else None
     groups = None
     if isinstance(stored, list):
         groups = list(Utils.asObjectOrNull(stored, list))
         has_generated = any(g.is_generated for g in groups)
         has_regular = any(not g.is_generated for g in groups)
         # Generated groups are dropped only when regular ones exist too.
         if has_generated and has_regular:
             groups = [g for g in groups if not g.is_generated]
     elif isinstance(stored, DerivateGroup):
         groups = [Utils.asObjectOrNull(stored, DerivateGroup)]
     if groups is not None and lang_ is not None and not lang_.is_undefined:
         groups = [g for g in groups if g.contains_word(word, lang_)]
     if groups:
         return groups

     if len(word) < 4:
         return None
     last, prev, third = word[-1], word[-2], word[-3]
     without_last = word[:-1]
     without_two = word[:-2]

     if last == 'О' or (last == 'И' and prev == 'К'):
         candidates = [without_last + "ИЙ", without_last + "ЫЙ"]
         if last == 'О' and prev == 'Н':
             candidates.append(without_last + "СКИЙ")
         for candidate in candidates:
             found = self.find(candidate, False, lang_)
             if found is not None:
                 return found
     elif last in ('Я', 'Ь') and prev == 'С':
         if without_two == "ЯТЬ":
             return None
         found = self.find(without_two, False, lang_)
         if found is not None:
             return found
     elif last == 'Е' and prev == 'Ь':
         found = self.find(without_two + "ИЕ", False, lang_)
         if found is not None:
             return found
     elif last == 'Й' and third == 'Н' and try_create:
         fourth = word[-4]
         candidate = None
         if fourth == 'Н':
             # Drop the 'Н' that stands fourth from the end.
             candidate = word[:-4] + word[-3:]
         elif LanguageHelper.is_cyrillic_vowel(fourth):
             # Insert an extra 'Н' before the last three characters.
             candidate = word[:-3] + "Н" + word[-3:]
         if candidate is not None:
             found = self.find(candidate, False, lang_)
             if found is not None:
                 return found

     if last == 'Й' and prev == 'О':
         for tail in ("ИЙ", "ЫЙ"):
             found = self.find(without_two + tail, False, lang_)
             if found is not None:
                 return found

     if not try_create:
         return None

     # Try every split that leaves at least four characters in the
     # remainder; the first remainder with known groups decides the result.
     for split in range(1, len(word) - 3):
         rest = word[split:]
         base_groups = self.find(rest, False, lang_)
         if base_groups is None:
             continue
         prefix = word[:split]
         generated = []
         for dg in base_groups:
             if dg.is_dummy or dg.is_generated:
                 continue
             if dg.not_generate and len(rest) < 5:
                 continue
             created = dg.create_by_prefix(prefix, lang_)
             if created is not None:
                 generated.append(created)
                 self.add(created)
         return generated if generated else None
     return None
Example #6
0
 def get_variants(rus_or_lat: str) -> typing.List[str]:
     """Produce every transliteration variant of a Cyrillic or Latin string.

     The input is upper-cased and its first character decides the
     direction.  The string is cut, left to right, into the longest
     matching accords; each piece may have several counterparts, and every
     combination of counterparts becomes one variant.

     Args:
         rus_or_lat: the source string.

     Returns:
         The list of variants.  It is empty for empty input or when some
         position matches no accord.
     """
     variants = list()
     if Utils.isNullOrEmpty(rus_or_lat):
         return variants
     text = rus_or_lat.upper()
     from_rus = LanguageHelper.is_cyrillic_char(text[0])

     # slots[k] holds the accords matching the k-th piece of the text.
     slots = []
     pos = 0
     while pos < len(text):
         best = []
         best_len = 0
         for acc in RusLatAccord.__get_accords():
             if from_rus and len(acc.rus) > 0:
                 key = acc.rus
             elif not from_rus and len(acc.lat) > 0:
                 key = acc.lat
             else:
                 continue
             key_len = len(key)
             if key_len < best_len:
                 continue
             if not RusLatAccord.__is_pref(text, pos, key):
                 continue
             # A tail-only accord must reach the end of the text.
             if acc.on_tail and key_len + pos < len(text):
                 continue
             if key_len > best_len:
                 best_len = key_len
                 del best[:]
             best.append(acc)
         if not best or best_len == 0:
             return variants
         slots.append(best)
         pos += best_len
     if not slots:
         return variants

     # Enumerate all combinations, the last slot changing fastest.
     choice = [0] * len(slots)
     while True:
         picked = [slot[k] for slot, k in zip(slots, choice)]
         candidate = "".join(
             str(acc.lat if from_rus else acc.rus) for acc in picked)
         acceptable = True
         if not from_rus:
             # In Cyrillic output 'Й' is allowed only after a vowel.
             for idx, ch in enumerate(candidate):
                 if ch != 'Й':
                     continue
                 if (idx == 0 or not LanguageHelper.is_cyrillic_vowel(
                         candidate[idx - 1])):
                     acceptable = False
                     break
         if acceptable:
             variants.append(candidate)
         slot_idx = len(choice) - 1
         while slot_idx >= 0:
             choice[slot_idx] += 1
             if choice[slot_idx] < len(slots[slot_idx]):
                 break
             choice[slot_idx] = 0
             slot_idx -= 1
         if slot_idx < 0:
             break
     return variants