Esempio n. 1
0
 def __init__(self, source: 'MorphCollection' = None) -> None:
     """Create an empty collection or a copy of ``source``.

     Without ``source`` every summary field keeps its default value, no item
     list is allocated and the collection is flagged for recalculation.
     With ``source`` each of its items is cloned, the summary fields are
     copied over and the recalculation flag is cleared.

     Args:
         source(MorphCollection): collection to copy from, or None
     """
     super().__init__()
     self.__m_class = MorphClass()
     self.__m_gender = MorphGender.UNDEFINED
     self.__m_number = MorphNumber.UNDEFINED
     self.__m_case = MorphCase()
     self.__m_language = MorphLang()
     self.__m_voice = MorphVoice.UNDEFINED
     self.__m_need_recalc = True
     self.__m_items = None
     if source is None:
         return

     def clone_of(item):
         # Word forms are copied as word forms, everything else as base info.
         if isinstance(item, MorphWordForm):
             duplicate = MorphWordForm()
             duplicate.copy_from_word_form(
                 Utils.asObjectOrNull(item, MorphWordForm))
             return duplicate
         duplicate = MorphBaseInfo()
         duplicate.copy_from(item)
         return duplicate

     clones = [clone_of(item) for item in source.items]
     # The item list stays None when the source has nothing to copy.
     if clones:
         self.__m_items = clones
     self.__m_class = MorphClass._new53(source.__m_class.value)
     self.__m_case = MorphCase._new29(source.__m_case.value)
     self.__m_language = MorphLang._new56(source.__m_language.value)
     self.__m_gender = source.__m_gender
     self.__m_number = source.__m_number
     self.__m_voice = source.__m_voice
     self.__m_need_recalc = False
Esempio n. 2
0
 def __process_proper_variants(self, word: str,
                               res: typing.List['MorphWordForm'],
                               geo: bool) -> None:
     """Append proper-name interpretations of ``word`` to ``res``.

     The word is matched from its last character towards its first against
     the tree rooted at ``self.m_root_reverce``; every node reached that
     carries ``reverce_variants`` is remembered.  The remembered nodes are
     then examined starting from the one reached last, and the first node
     that yields at least one variant of the requested kind ends the search.

     Args:
         word(str): word to analyse
         res(typing.List[MorphWordForm]): result list, extended in place
         geo(bool): True selects variants whose class is ``is_proper_geo``,
             False selects variants whose class is ``is_proper_surname``
     """
     node = self.m_root_reverce
     candidates = None
     for letter in reversed(word):
         if node.lazy_pos > 0:
             self.__load_tree_node(node)
         if node.nodes is None:
             break
         child_ref = RefOutArgWrapper(None)
         if not Utils.tryGetValue(node.nodes, ord(letter), child_ref):
             break
         node = child_ref.value
         if node.lazy_pos > 0:
             self.__load_tree_node(node)
         if node.reverce_variants is not None:
             if candidates is None:
                 candidates = []
             candidates.append(node)
     if candidates is None:
         return
     for node in reversed(candidates):
         if node.lazy_pos > 0:
             self.__load_tree_node(node)
         matched = False
         for variant in node.reverce_variants:
             if geo:
                 wanted = variant.class0_.is_proper_geo
             else:
                 wanted = variant.class0_.is_proper_surname
             if not wanted:
                 continue
             form = MorphWordForm(variant, word)
             # Only forms not already present in res are added, but any
             # variant of the requested kind counts as a match.
             if not MorphWordForm._has_morph_equals(res, form):
                 form.undef_coef = variant.coef
                 res.append(form)
             matched = True
         if matched:
             break
Esempio n. 3
0
 def __deserialize_item(self, stream: Stream) -> 'MorphBaseInfo':
     """Read one morphological item from ``stream``.

     The first byte selects the item type: 0 produces a ``MorphBaseInfo``,
     any other value a ``MorphWordForm``.  Class, case, gender, number and
     language follow as shorts.  A word form additionally carries two
     strings (``normal_case`` and ``normal_full``), a short ``undef_coef``
     and an int count followed by that many strings appended to
     ``misc.attrs``.

     Args:
         stream(Stream): source of the serialized data

     Returns:
         MorphBaseInfo: the item that was read
     """
     from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
     read_short = SerializerHelper.deserialize_short
     read_string = SerializerHelper.deserialize_string
     kind = stream.readbyte()
     if kind == 0:
         item = MorphBaseInfo()
     else:
         item = MorphWordForm()
     item.class0_ = MorphClass._new53(read_short(stream))
     item.case_ = MorphCase._new29(read_short(stream))
     item.gender = Utils.valToEnum(read_short(stream), MorphGender)
     item.number = Utils.valToEnum(read_short(stream), MorphNumber)
     item.language = MorphLang._new56(read_short(stream))
     if kind == 0:
         return item
     word_form = Utils.asObjectOrNull(item, MorphWordForm)
     word_form.normal_case = read_string(stream)
     word_form.normal_full = read_string(stream)
     word_form.undef_coef = read_short(stream)
     attr_count = SerializerHelper.deserialize_int(stream)
     for _ in range(attr_count):
         # misc is created only when at least one attribute is present.
         if word_form.misc is None:
             word_form.misc = MorphMiscInfo()
         word_form.misc.attrs.append(read_string(stream))
     return item
Esempio n. 4
0
 def __process_proper_variants(self, word : str, res : typing.List['MorphWordForm'], geo : bool) -> None:
     """Append proper-name interpretations of ``word`` to ``res``.

     Characters of the word are consumed from the last one to the first
     while descending from ``self.m_root_reverce``; nodes that carry
     ``reverce_variants`` are collected on the way.  The collected nodes are
     then tried starting from the one reached last.  Each variant reference
     is resolved through ``get_rule_var`` and the search stops at the first
     node that yields a variant of the requested kind.

     Args:
         word(str): word to analyse
         res(typing.List[MorphWordForm]): result list, extended in place
         geo(bool): True selects variants whose class is ``is_proper_geo``,
             False selects variants whose class is ``is_proper_surname``
     """
     node = self.m_root_reverce
     candidates = None
     for letter in reversed(word):
         if node.lazy_pos > 0:
             self.__load_tree_node(node)
         code = ord(letter)
         if node.nodes is None:
             break
         if code not in node.nodes:
             break
         node = node.nodes[code]
         if node.lazy_pos > 0:
             self.__load_tree_node(node)
         if node.reverce_variants is not None:
             if candidates is None:
                 candidates = []
             candidates.append(node)
     if candidates is None:
         return
     for node in reversed(candidates):
         if node.lazy_pos > 0:
             self.__load_tree_node(node)
         matched = False
         for ref in node.reverce_variants:
             variant = self.get_rule_var(ref.rule_id, ref.variant_id)
             if variant is None:
                 continue
             if geo:
                 wanted = variant.class0_.is_proper_geo
             else:
                 wanted = variant.class0_.is_proper_surname
             if not wanted:
                 continue
             form = MorphWordForm(variant, word, self.get_misc_info(variant.misc_info_id))
             # Only forms not already present in res are added, but any
             # variant of the requested kind counts as a match.
             if not form._has_morph_equals(res):
                 form.undef_coef = ref.coef
                 res.append(form)
             matched = True
         if matched:
             break
Esempio n. 5
0
 def process_result(self, res : typing.List['MorphWordForm'], word_begin : str, mvs : typing.List['MorphRuleVariant']) -> None:
     """Build a word form for every rule variant and append the new ones.

     The normal form of each word form is ``word_begin`` plus the variant's
     normal tail; a tail that is missing, empty or starts with '-' leaves
     just ``word_begin``.  The full normal form is produced by the same rule
     but is assigned only when the variant has a full normal tail at all.
     A form is appended to ``res`` (with ``undef_coef`` 0) only when
     ``MorphWordForm._has_morph_equals`` reports no equal form there.

     Args:
         res(typing.List[MorphWordForm]): result list, extended in place
         word_begin(str): leading part of the word the tails are attached to
         mvs(typing.List[MorphRuleVariant]): rule variants to convert
     """
     for variant in mvs:
         form = MorphWordForm(variant, None)
         tail = variant.normal_tail
         form.normal_case = (
             word_begin + tail
             if tail is not None and len(tail) > 0 and tail[0] != '-'
             else word_begin)
         full_tail = variant.full_normal_tail
         if full_tail is not None:
             form.normal_full = (
                 word_begin + full_tail
                 if len(full_tail) > 0 and full_tail[0] != '-'
                 else word_begin)
         if MorphWordForm._has_morph_equals(res, form):
             continue
         form.undef_coef = 0
         res.append(form)
Esempio n. 6
0
 def __init__(self, graph_: 'SemGraph') -> None:
     """Create an empty object attached to ``graph_``.

     Every other attribute starts from its neutral value: undefined type and
     measure, a fresh ``MorphWordForm``, empty lists, ``None`` references
     and a cleared ``not0_`` flag.

     Args:
         graph_(SemGraph): value stored in ``self.graph``
     """
     self.graph = graph_
     self.morph = MorphWordForm()
     self.typ = SemObjectType.UNDEFINED
     self.quantity = None
     self.concept = None
     self.attrs = []
     self.measure = MeasureKind.UNDEFINED
     self.not0_ = False
     self.tokens = []
     self.links_from = []
     self.links_to = []
     self.tag = None
Esempio n. 7
0
 def __remove_items_morph_case(self, cas: 'MorphCase') -> None:
     """Restrict the stored items to the cases present in ``cas``.

     Does nothing when the item list has not been created.  When the list
     exists but is empty, the collection's own case is intersected with
     ``cas``.  Otherwise the items are visited from the end of the list to
     its start: an item whose case has nothing in common with ``cas`` is
     deleted, and an item whose case is only partly covered is replaced by a
     copy whose case is narrowed to the common part.  The collection is
     flagged for recalculation in every case where the list exists.

     Args:
         cas(MorphCase): cases to keep
     """
     items = self.__m_items
     if items is None:
         return
     if not items:
         self.__m_case = self.__m_case & cas
     for idx in reversed(range(len(items))):
         current = items[idx]
         common = current.case_ & cas
         if common.is_undefined:
             del items[idx]
             self.__m_need_recalc = True
             continue
         if common != current.case_:
             # Replace the item with a narrowed copy instead of mutating it.
             if isinstance(current, MorphWordForm):
                 narrowed = MorphWordForm()
                 narrowed.copy_from_word_form(
                     Utils.asObjectOrNull(current, MorphWordForm))
             else:
                 narrowed = MorphBaseInfo()
                 narrowed.copy_from(current)
             narrowed.case_ = narrowed.case_ & cas
             items[idx] = narrowed
             self.__m_need_recalc = True
     self.__m_need_recalc = True
 def verb_morph(self) -> 'MorphWordForm':
     """Full morphological information (for a verb).

     A value already stored in ``__m_verb_morph`` is returned as is.
     Otherwise the items of ``self.morph`` are searched in this order of
     preference: a third-person verb word form whose normal case ends with
     "СЯ", any third-person verb word form, any verb word form, any
     adjective word form.  If nothing matches and the stored normal form is
     "НЕТ", a new verb word form is created; otherwise None is returned.
     """
     cached = self.__m_verb_morph
     if cached is not None:
         return cached

     def is_verb_form(form):
         return form.class0_.is_verb and isinstance(form, MorphWordForm)

     def is_third_person_verb(form):
         return (is_verb_form(form)
                 and (form.misc.person & MorphPerson.THIRD) != MorphPerson.UNDEFINED)

     for form in self.morph.items:
         if is_third_person_verb(form) and form.normal_case.endswith("СЯ"):
             return Utils.asObjectOrNull(form, MorphWordForm)
     for form in self.morph.items:
         if is_third_person_verb(form):
             return Utils.asObjectOrNull(form, MorphWordForm)
     for form in self.morph.items:
         if is_verb_form(form):
             return Utils.asObjectOrNull(form, MorphWordForm)
     for form in self.morph.items:
         if form.class0_.is_adjective and isinstance(form, MorphWordForm):
             return Utils.asObjectOrNull(form, MorphWordForm)
     if self.__m_normal != "НЕТ":
         return None
     return MorphWordForm._new605(MorphClass.VERB, MorphMiscInfo())
Esempio n. 9
0
 def get_word_base_info(word: str,
                        lang: 'MorphLang' = None,
                        is_case_nominative: bool = False,
                        in_dict_only: bool = False) -> 'MorphBaseInfo':
     """Get the gender / number / case information of a word form.

     The word is analysed with the inner morphology engine and only the
     first resulting token is inspected.  Class, gender, case and number of
     every accepted variant are OR-ed together, and the first non-empty
     ``misc`` found is kept.  Variants found in the dictionary are tried
     first; variants outside the dictionary are considered only when no
     dictionary variant was accepted and ``in_dict_only`` is false.

     Args:
         word(str): word form
         lang(MorphLang): possible language
         is_case_nominative(bool): the word is in the nominative case, so
             variants whose case is neither nominative nor undefined are
             ignored (otherwise any case is accepted)
         in_dict_only(bool): when true, do not fall back to variants that
             are not in the dictionary

     Returns:
         MorphBaseInfo: combined morphological information; it stays at its
         defaults when the analysis yields nothing
     """
     mt = Morphology.__m_inner.run(word, False, lang, None, False)
     bi = MorphWordForm()
     cla = MorphClass()
     if mt:
         # First pass: dictionary variants; second pass: all the others.
         for dictionary_pass in (True, False):
             accepted = False
             for wf in mt[0].word_forms:
                 if bool(wf.is_in_dictionary) != dictionary_pass:
                     continue
                 if (is_case_nominative and not wf.case_.is_nominative
                         and not wf.case_.is_undefined):
                     continue
                 cla.value |= wf.class0_.value
                 bi.gender = Utils.valToEnum(bi.gender | wf.gender,
                                             MorphGender)
                 bi.case_ = bi.case_ | wf.case_
                 bi.number = Utils.valToEnum(bi.number | wf.number,
                                             MorphNumber)
                 if wf.misc is not None and bi.misc is None:
                     bi.misc = wf.misc
                 accepted = True
             if accepted or in_dict_only:
                 break
     bi.class0_ = cla
     return bi
 def __try_parse_en(first: 'Token', typ: 'NounPhraseParseAttr',
                    max_char_pos: int) -> 'NounPhraseToken':
     """Try to build a noun phrase from Latin-letter tokens starting at ``first``.

     Tokens are collected one by one via ``next0_`` until a stop condition
     is met.  The last collected item becomes the noun of the result and all
     preceding items become its adjectives.

     Args:
         first(Token): token the phrase must start at
         typ(NounPhraseParseAttr): flags; MULTILINES, REFERENTCANBENOUN and
             PARSEPRONOUNS are consulted here
         max_char_pos(int): when greater than 0, tokens that begin after
             this position are not considered

     Returns:
         NounPhraseToken: the parsed phrase, or None when ``first`` is None
         or no item could be collected
     """
     if (first is None):
         return None
     items = None
     has_article = False
     has_prop = False
     has_misc = False
     # A preposition right before the phrase (fewer than 3 whitespaces away)
     # relaxes the word-class check for the first item below.
     if (first.previous is not None
             and first.previous.morph.class0_.is_preposition
             and (first.whitespaces_before_count < 3)):
         has_prop = True
     t = first
     first_pass3048 = True
     # Emulated for-loop: every iteration after the first (including those
     # reached through `continue`) starts by advancing t to t.next0_.
     while True:
         if first_pass3048: first_pass3048 = False
         else: t = t.next0_
         if (not (t is not None)): break
         if (max_char_pos > 0 and t.begin_char > max_char_pos):
             break
         if (not t.chars.is_latin_letter):
             break
         # A gap of more than 2 whitespaces ends the phrase unless MULTILINES
         # is set or the previous token is an English article.
         if (t != first and t.whitespaces_before_count > 2):
             if ((((typ) & (NounPhraseParseAttr.MULTILINES))) !=
                 (NounPhraseParseAttr.NO)):
                 pass
             elif (MiscHelper.is_eng_article(t.previous)):
                 pass
             else:
                 break
         tt = Utils.asObjectOrNull(t, TextToken)
         # A leading article is remembered but not stored as an item.
         if (t == first and tt is not None):
             if (MiscHelper.is_eng_article(tt)):
                 has_article = True
                 continue
         # Referent tokens are accepted only with REFERENTCANBENOUN; any
         # other token that is not a TextToken ends the phrase.
         if (isinstance(t, ReferentToken)):
             if ((((typ) & (NounPhraseParseAttr.REFERENTCANBENOUN))) == (
                     NounPhraseParseAttr.NO)):
                 break
         elif (tt is None):
             break
         # "SO" + hyphen + "CALL...": jump to the "CALL" token and restart the
         # loop, which then advances past it (presumably "so-called" - TODO confirm).
         if ((t.is_value("SO", None) and t.next0_ is not None
              and t.next0_.is_hiphen) and t.next0_.next0_ is not None):
             if (t.next0_.next0_.is_value("CALL", None)):
                 t = t.next0_.next0_
                 continue
         mc = t.get_morph_class_in_dictionary()
         if (mc.is_conjunction or mc.is_preposition):
             break
         # Pronouns, and THIS / THAT classified as misc, end the phrase unless
         # PARSEPRONOUNS is set.
         if (mc.is_pronoun or mc.is_personal_pronoun):
             if ((((typ) & (NounPhraseParseAttr.PARSEPRONOUNS))) == (
                     NounPhraseParseAttr.NO)):
                 break
         elif (mc.is_misc):
             if (t.is_value("THIS", None) or t.is_value("THAT", None)):
                 has_misc = True
                 if ((((typ) & (NounPhraseParseAttr.PARSEPRONOUNS))) == (
                         NounPhraseParseAttr.NO)):
                     break
         is_adj = False
         # The word-class checks are skipped for the first item after an
         # article / preposition / THIS-THAT marker and for referent tokens.
         if (((has_article or has_prop or has_misc)) and items is None):
             pass
         elif (isinstance(t, ReferentToken)):
             pass
         else:
             # Neither noun nor adjective: tolerated only in the listed cases.
             if (not mc.is_noun and not mc.is_adjective):
                 if (mc.is_undefined and has_article):
                     pass
                 elif (items is None and mc.is_undefined
                       and t.chars.is_capital_upper):
                     pass
                 elif (mc.is_pronoun):
                     pass
                 elif (tt.term.endswith("EAN")):
                     is_adj = True
                 elif (MiscHelper.is_eng_adj_suffix(tt.next0_)):
                     pass
                 else:
                     break
             # A possible verb is tolerated only in the listed cases.
             if (mc.is_verb):
                 if (t.next0_ is not None and t.next0_.morph.class0_.is_verb
                         and (t.whitespaces_after_count < 2)):
                     pass
                 elif (t.chars.is_capital_upper
                       and not MiscHelper.can_be_start_of_sentence(t)):
                     pass
                 elif ((t.chars.is_capital_upper and mc.is_noun and
                        (isinstance(t.next0_, TextToken)))
                       and t.next0_.chars.is_capital_upper):
                     pass
                 elif (isinstance(t, ReferentToken)):
                     pass
                 else:
                     break
         if (items is None):
             items = list()
         it = NounPhraseItem(t, t)
         if (mc.is_noun):
             it.can_be_noun = True
         if (mc.is_adjective or mc.is_pronoun or is_adj):
             it.can_be_adj = True
         items.append(it)
         t = it.end_token
         if (len(items) == 1):
             # After the first item, skip two tokens when an adjective suffix
             # follows.
             # NOTE(review): mc is modified here after `it` was already
             # flagged from it, so these two assignments do not change `it` -
             # confirm whether it.can_be_noun / it.can_be_adj was intended.
             if (MiscHelper.is_eng_adj_suffix(t.next0_)):
                 mc.is_noun = False
                 mc.is_adjective = True
                 t = t.next0_.next0_
     if (items is None):
         return None
     # The last collected item is the noun of the phrase.
     noun = items[len(items) - 1]
     res = NounPhraseToken(first, noun.end_token)
     res.noun = (noun)
     res.morph = MorphCollection()
     # Copy the morphology of the noun's end token, leaving out verb variants
     # and proper-name variants of an all-lowercase word.
     for v in noun.end_token.morph.items:
         if (v.class0_.is_verb):
             continue
         if (v.class0_.is_proper and noun.begin_token.chars.is_all_lower):
             continue
         if (isinstance(v, MorphWordForm)):
             wf = MorphWordForm()
             wf.copy_from_word_form(Utils.asObjectOrNull(v, MorphWordForm))
             # With a leading article the copied number is forced to singular.
             if (has_article and v.number != MorphNumber.SINGULAR):
                 wf.number = MorphNumber.SINGULAR
             res.morph.add_item(wf)
         else:
             bi = MorphBaseInfo()
             bi.copy_from(v)
             if (has_article and v.number != MorphNumber.SINGULAR):
                 bi.number = MorphNumber.SINGULAR
             res.morph.add_item(bi)
     # Nothing copied but an article was seen: fall back to a singular noun.
     if (res.morph.items_count == 0 and has_article):
         res.morph.add_item(
             MorphBaseInfo._new192(MorphClass.NOUN, MorphNumber.SINGULAR))
     # Every item before the noun is stored as an adjective.
     i = 0
     while i < (len(items) - 1):
         res.adjectives.append(items[i])
         i += 1
     return res
Esempio n. 11
0
 def run(self, text: str, only_tokenizing: bool, dlang: 'MorphLang',
         progress: EventHandler,
         good_text: bool) -> typing.List['MorphToken']:
     """ Произвести морфологический анализ текста
     
     Args:
         text(str): исходный текст
         lang: язык (если null, то попробует определить)
     
     Returns:
         typing.List[MorphToken]: последовательность результирующих морфем
     """
     if (Utils.isNullOrEmpty(text)):
         return None
     twr = TextWrapper(text, good_text)
     twrch = twr.chars
     res = list()
     uni_lex = dict()
     term0 = None
     pure_rus_words = 0
     pure_ukr_words = 0
     pure_by_words = 0
     pure_kz_words = 0
     tot_rus_words = 0
     tot_ukr_words = 0
     tot_by_words = 0
     tot_kz_words = 0
     i = 0
     first_pass2708 = True
     while True:
         if first_pass2708: first_pass2708 = False
         else: i += 1
         if (not (i < twr.length)): break
         ty = InnerMorphology._getCharTyp(twrch[i])
         if (ty == 0):
             continue
         if (ty > 2):
             j = (i + 1)
         else:
             j = (i + 1)
             while j < twr.length:
                 if (InnerMorphology._getCharTyp(twrch[j]) != ty):
                     break
                 j += 1
         wstr = text[i:i + j - i]
         term = None
         if (good_text):
             term = wstr
         else:
             trstr = LanguageHelper.transliteralCorrection(
                 wstr, term0, False)
             term = LanguageHelper.correctWord(trstr)
         if (Utils.isNullOrEmpty(term)):
             i = (j - 1)
             continue
         lang = InnerMorphology.__detectLang(twr, i, j - 1, term)
         if (lang == MorphLang.UA):
             pure_ukr_words += 1
         elif (lang == MorphLang.RU):
             pure_rus_words += 1
         elif (lang == MorphLang.BY):
             pure_by_words += 1
         elif (lang == MorphLang.KZ):
             pure_kz_words += 1
         if ((((lang) & MorphLang.RU)) != MorphLang.UNKNOWN):
             tot_rus_words += 1
         if ((((lang) & MorphLang.UA)) != MorphLang.UNKNOWN):
             tot_ukr_words += 1
         if ((((lang) & MorphLang.BY)) != MorphLang.UNKNOWN):
             tot_by_words += 1
         if ((((lang) & MorphLang.KZ)) != MorphLang.UNKNOWN):
             tot_kz_words += 1
         if (ty == 1):
             term0 = term
         lemmas = None
         if (ty == 1 and not only_tokenizing):
             wraplemmas7 = RefOutArgWrapper(None)
             inoutres8 = Utils.tryGetValue(uni_lex, term, wraplemmas7)
             lemmas = wraplemmas7.value
             if (not inoutres8):
                 lemmas = InnerMorphology.UniLexWrap._new6(lang)
                 uni_lex[term] = lemmas
         tok = MorphToken()
         tok.term = term
         tok.begin_char = i
         if (i == 733860):
             pass
         tok.end_char = (j - 1)
         tok.tag = (lemmas)
         res.append(tok)
         i = (j - 1)
     def_lang = MorphLang(dlang)
     if (pure_rus_words > pure_ukr_words and pure_rus_words > pure_by_words
             and pure_rus_words > pure_kz_words):
         def_lang = MorphLang.RU
     elif (tot_rus_words > tot_ukr_words and tot_rus_words > tot_by_words
           and tot_rus_words > tot_kz_words):
         def_lang = MorphLang.RU
     elif (pure_ukr_words > pure_rus_words
           and pure_ukr_words > pure_by_words
           and pure_ukr_words > pure_kz_words):
         def_lang = MorphLang.UA
     elif (tot_ukr_words > tot_rus_words and tot_ukr_words > tot_by_words
           and tot_ukr_words > tot_kz_words):
         def_lang = MorphLang.UA
     elif (pure_kz_words > pure_rus_words and pure_kz_words > pure_ukr_words
           and pure_kz_words > pure_by_words):
         def_lang = MorphLang.KZ
     elif (tot_kz_words > tot_rus_words and tot_kz_words > tot_ukr_words
           and tot_kz_words > tot_by_words):
         def_lang = MorphLang.KZ
     elif (pure_by_words > pure_rus_words and pure_by_words > pure_ukr_words
           and pure_by_words > pure_kz_words):
         def_lang = MorphLang.BY
     elif (tot_by_words > tot_rus_words and tot_by_words > tot_ukr_words
           and tot_by_words > tot_kz_words):
         if (tot_rus_words > 10 and tot_by_words > (tot_rus_words + 20)):
             def_lang = MorphLang.BY
         elif (tot_rus_words == 0 or tot_by_words >= (tot_rus_words * 2)):
             def_lang = MorphLang.BY
     if (((def_lang.is_undefined or def_lang.is_ua)) and tot_rus_words > 0):
         if (((tot_ukr_words > tot_rus_words
               and InnerMorphology.M_ENGINE_UA.language.is_ua))
                 or ((tot_by_words > tot_rus_words
                      and InnerMorphology.M_ENGINE_BY.language.is_by))
                 or ((tot_kz_words > tot_rus_words
                      and InnerMorphology.M_ENGINE_KZ.language.is_kz))):
             cou0 = 0
             tot_kz_words = 0
             tot_ukr_words = tot_kz_words
             tot_by_words = tot_ukr_words
             tot_rus_words = tot_by_words
             for kp in uni_lex.items():
                 lang = MorphLang()
                 wraplang9 = RefOutArgWrapper(lang)
                 kp[1].word_forms = self.__processOneWord(kp[0], wraplang9)
                 lang = wraplang9.value
                 if (kp[1].word_forms is not None):
                     for wf in kp[1].word_forms:
                         lang |= wf.language
                 kp[1].lang = lang
                 if (lang.is_ru):
                     tot_rus_words += 1
                 if (lang.is_ua):
                     tot_ukr_words += 1
                 if (lang.is_by):
                     tot_by_words += 1
                 if (lang.is_kz):
                     tot_kz_words += 1
                 if (lang.is_cyrillic):
                     cou0 += 1
                 if (cou0 >= 100):
                     break
             if (tot_rus_words > ((math.floor(tot_by_words / 2)))
                     and tot_rus_words > ((math.floor(tot_ukr_words / 2)))):
                 def_lang = MorphLang.RU
             elif (tot_ukr_words > ((math.floor(tot_rus_words / 2)))
                   and tot_ukr_words > ((math.floor(tot_by_words / 2)))):
                 def_lang = MorphLang.UA
             elif (tot_by_words > ((math.floor(tot_rus_words / 2)))
                   and tot_by_words > ((math.floor(tot_ukr_words / 2)))):
                 def_lang = MorphLang.BY
         elif (def_lang.is_undefined):
             def_lang = MorphLang.RU
     cou = 0
     tot_kz_words = 0
     tot_ukr_words = tot_kz_words
     tot_by_words = tot_ukr_words
     tot_rus_words = tot_by_words
     for kp in uni_lex.items():
         lang = def_lang
         if (lang.is_undefined):
             if (tot_rus_words > tot_by_words
                     and tot_rus_words > tot_ukr_words
                     and tot_rus_words > tot_kz_words):
                 lang = MorphLang.RU
             elif (tot_ukr_words > tot_rus_words
                   and tot_ukr_words > tot_by_words
                   and tot_ukr_words > tot_kz_words):
                 lang = MorphLang.UA
             elif (tot_by_words > tot_rus_words
                   and tot_by_words > tot_ukr_words
                   and tot_by_words > tot_kz_words):
                 lang = MorphLang.BY
             elif (tot_kz_words > tot_rus_words
                   and tot_kz_words > tot_ukr_words
                   and tot_kz_words > tot_by_words):
                 lang = MorphLang.KZ
         wraplang10 = RefOutArgWrapper(lang)
         kp[1].word_forms = self.__processOneWord(kp[0], wraplang10)
         lang = wraplang10.value
         kp[1].lang = lang
         if ((((lang) & MorphLang.RU)) != MorphLang.UNKNOWN):
             tot_rus_words += 1
         if ((((lang) & MorphLang.UA)) != MorphLang.UNKNOWN):
             tot_ukr_words += 1
         if ((((lang) & MorphLang.BY)) != MorphLang.UNKNOWN):
             tot_by_words += 1
         if ((((lang) & MorphLang.KZ)) != MorphLang.UNKNOWN):
             tot_kz_words += 1
         if (progress is not None):
             self.__onProgress(cou, len(uni_lex), progress)
         cou += 1
     debug_token = None
     empty_list = None
     for r in res:
         uni = Utils.asObjectOrNull(r.tag, InnerMorphology.UniLexWrap)
         r.tag = None
         if (uni is None or uni.word_forms is None
                 or len(uni.word_forms) == 0):
             if (empty_list is None):
                 empty_list = list()
             r.word_forms = empty_list
             if (uni is not None):
                 r.language = uni.lang
         else:
             r.word_forms = uni.word_forms
         if (r.begin_char == 733860):
             debug_token = r
     if (not good_text):
         i = 0
         first_pass2709 = True
         while True:
             if first_pass2709: first_pass2709 = False
             else: i += 1
             if (not (i < (len(res) - 2))): break
             ui0 = twrch[res[i].begin_char]
             ui1 = twrch[res[i + 1].begin_char]
             ui2 = twrch[res[i + 2].begin_char]
             if (ui1.is_quot):
                 p = res[i + 1].begin_char
                 if ((p >= 2 and "БбТт".find(text[p - 1]) >= 0 and
                      ((p + 3) < len(text)))
                         and "ЕеЯяЁё".find(text[p + 1]) >= 0):
                     wstr = LanguageHelper.transliteralCorrection(
                         LanguageHelper.correctWord("{0}Ъ{1}".format(
                             res[i].getSourceText(text),
                             res[i + 2].getSourceText(text))), None, False)
                     li = self.__processOneWord0(wstr)
                     if (li is not None and len(li) > 0
                             and li[0].is_in_dictionary):
                         res[i].end_char = res[i + 2].end_char
                         res[i].term = wstr
                         res[i].word_forms = li
                         del res[i + 1:i + 1 + 2]
                 elif ((ui1.is_apos and p > 0 and str.isalpha(text[p - 1]))
                       and ((p + 1) < len(text))
                       and str.isalpha(text[p + 1])):
                     if (def_lang == MorphLang.UA
                             or (((res[i].language) & MorphLang.UA)) !=
                             MorphLang.UNKNOWN
                             or (((res[i + 2].language) & MorphLang.UA)) !=
                             MorphLang.UNKNOWN):
                         wstr = LanguageHelper.transliteralCorrection(
                             LanguageHelper.correctWord("{0}{1}".format(
                                 res[i].getSourceText(text),
                                 res[i + 2].getSourceText(text))), None,
                             False)
                         li = self.__processOneWord0(wstr)
                         okk = True
                         if (okk):
                             res[i].end_char = res[i + 2].end_char
                             res[i].term = wstr
                             if (li is None):
                                 li = list()
                             res[i].word_forms = li
                             if (li is not None and len(li) > 0):
                                 res[i].language = li[0].language
                             del res[i + 1:i + 1 + 2]
             elif (((ui1.uni_char == '3' or ui1.uni_char == '4'))
                   and res[i + 1].length == 1):
                 src = ("З" if ui1.uni_char == '3' else "Ч")
                 i0 = i + 1
                 if ((res[i].end_char + 1) == res[i + 1].begin_char
                         and ui0.is_cyrillic):
                     i0 -= 1
                     src = (res[i0].getSourceText(text) + src)
                 i1 = i + 1
                 if ((res[i + 1].end_char + 1) == res[i + 2].begin_char
                         and ui2.is_cyrillic):
                     i1 += 1
                     src += res[i1].getSourceText(text)
                 if (len(src) > 2):
                     wstr = LanguageHelper.transliteralCorrection(
                         LanguageHelper.correctWord(src), None, False)
                     li = self.__processOneWord0(wstr)
                     if (li is not None and len(li) > 0
                             and li[0].is_in_dictionary):
                         res[i0].end_char = res[i1].end_char
                         res[i0].term = wstr
                         res[i0].word_forms = li
                         del res[i0 + 1:i0 + 1 + i1 - i0]
             elif ((ui1.is_hiphen and ui0.is_letter and ui2.is_letter)
                   and res[i].end_char > res[i].begin_char
                   and res[i + 2].end_char > res[i + 2].begin_char):
                 newline = False
                 sps = 0
                 j = (res[i + 1].end_char + 1)
                 while j < res[i + 2].begin_char:
                     if (text[j] == '\r' or text[j] == '\n'):
                         newline = True
                         sps += 1
                     elif (not Utils.isWhitespace(text[j])):
                         break
                     else:
                         sps += 1
                     j += 1
                 full_word = LanguageHelper.correctWord(
                     res[i].getSourceText(text) +
                     res[i + 2].getSourceText(text))
                 if (not newline):
                     if (full_word in uni_lex or full_word == "ИЗЗА"):
                         newline = True
                     elif (text[res[i + 1].begin_char] == (chr(0x00AD))):
                         newline = True
                     elif (LanguageHelper.endsWithEx(
                             res[i].getSourceText(text), "О", "о", None,
                             None) and len(res[i + 2].word_forms) > 0
                           and res[i + 2].word_forms[0].is_in_dictionary):
                         if (text[res[i + 1].begin_char] == '¬'):
                             li = self.__processOneWord0(full_word)
                             if (li is not None and len(li) > 0
                                     and li[0].is_in_dictionary):
                                 newline = True
                     elif ((res[i].end_char + 2) == res[i + 2].begin_char):
                         if (not str.isupper(text[res[i + 2].begin_char])
                                 and (sps < 2) and len(full_word) > 4):
                             newline = True
                             if ((i + 3) < len(res)):
                                 ui3 = twrch[res[i + 3].begin_char]
                                 if (ui3.is_hiphen):
                                     newline = False
                     elif (((res[i].end_char + 1) == res[i + 1].begin_char
                            and sps > 0 and (sps < 3))
                           and len(full_word) > 4):
                         newline = True
                 if (newline):
                     li = self.__processOneWord0(full_word)
                     if (li is not None and len(li) > 0
                             and ((li[0].is_in_dictionary
                                   or full_word in uni_lex))):
                         res[i].end_char = res[i + 2].end_char
                         res[i].term = full_word
                         res[i].word_forms = li
                         del res[i + 1:i + 1 + 2]
                 else:
                     pass
             elif ((ui1.is_letter and ui0.is_letter and res[i].length > 2)
                   and res[i + 1].length > 1):
                 if (ui0.is_upper != ui1.is_upper):
                     continue
                 if (not ui0.is_cyrillic or not ui1.is_cyrillic):
                     continue
                 newline = False
                 j = (res[i].end_char + 1)
                 while j < res[i + 1].begin_char:
                     if (twrch[j].code == 0xD or twrch[j].code == 0xA):
                         newline = True
                         break
                     j += 1
                 if (not newline):
                     continue
                 full_word = LanguageHelper.correctWord(
                     res[i].getSourceText(text) +
                     res[i + 1].getSourceText(text))
                 if (not full_word in uni_lex):
                     continue
                 li = self.__processOneWord0(full_word)
                 if (li is not None and len(li) > 0
                         and li[0].is_in_dictionary):
                     res[i].end_char = res[i + 1].end_char
                     res[i].term = full_word
                     res[i].word_forms = li
                     del res[i + 1]
     i = 0
     first_pass2710 = True
     while True:
         if first_pass2710: first_pass2710 = False
         else: i += 1
         if (not (i < len(res))): break
         mt = res[i]
         mt.char_info = CharsInfo()
         ui0 = twrch[mt.begin_char]
         ui00 = UnicodeInfo.ALL_CHARS[ord((res[i].term[0]))]
         j = (mt.begin_char + 1)
         while j <= mt.end_char:
             if (ui0.is_letter):
                 break
             ui0 = twrch[j]
             j += 1
         if (ui0.is_letter):
             res[i].char_info.is_letter = True
             if (ui00.is_latin):
                 res[i].char_info.is_latin_letter = True
             elif (ui00.is_cyrillic):
                 res[i].char_info.is_cyrillic_letter = True
             if (res[i].language == MorphLang.UNKNOWN):
                 if (LanguageHelper.isCyrillic(mt.term)):
                     res[i].language = (MorphLang.RU if
                                        def_lang.is_undefined else def_lang)
             if (good_text):
                 continue
             all_up = True
             all_lo = True
             j = mt.begin_char
             while j <= mt.end_char:
                 if (twrch[j].is_upper or twrch[j].is_digit):
                     all_lo = False
                 else:
                     all_up = False
                 j += 1
             if (all_up):
                 mt.char_info.is_all_upper = True
             elif (all_lo):
                 mt.char_info.is_all_lower = True
             elif (((ui0.is_upper or twrch[mt.begin_char].is_digit))
                   and mt.end_char > mt.begin_char):
                 all_lo = True
                 j = (mt.begin_char + 1)
                 while j <= mt.end_char:
                     if (twrch[j].is_upper or twrch[j].is_digit):
                         all_lo = False
                         break
                     j += 1
                 if (all_lo):
                     mt.char_info.is_capital_upper = True
                 elif (twrch[mt.end_char].is_lower
                       and (mt.end_char - mt.begin_char) > 1):
                     all_up = True
                     j = mt.begin_char
                     while j < mt.end_char:
                         if (twrch[j].is_lower):
                             all_up = False
                             break
                         j += 1
                     if (all_up):
                         mt.char_info.is_last_lower = True
         if (mt.char_info.is_last_lower and mt.length > 2
                 and mt.char_info.is_cyrillic_letter):
             pref = text[mt.begin_char:mt.begin_char + mt.end_char -
                         mt.begin_char]
             ok = False
             for wf in mt.word_forms:
                 if (wf.normal_case == pref or wf.normal_full == pref):
                     ok = True
                     break
             if (not ok):
                 mt.word_forms = list(mt.word_forms)
                 mt.word_forms.insert(
                     0, MorphWordForm._new11(pref, MorphClass.NOUN, 1))
     if (good_text or only_tokenizing):
         return res
     i = 0
     first_pass2711 = True
     while True:
         if first_pass2711: first_pass2711 = False
         else: i += 1
         if (not (i < len(res))): break
         if (res[i].length == 1 and res[i].char_info.is_latin_letter):
             ch = res[i].term[0]
             if (ch == 'C' or ch == 'A' or ch == 'P'):
                 pass
             else:
                 continue
             is_rus = False
             for ii in range(i - 1, -1, -1):
                 if ((res[ii].end_char + 1) != res[ii + 1].begin_char):
                     break
                 elif (res[ii].char_info.is_letter):
                     is_rus = res[ii].char_info.is_cyrillic_letter
                     break
             if (not is_rus):
                 ii = i + 1
                 while ii < len(res):
                     if ((res[ii - 1].end_char + 1) != res[ii].begin_char):
                         break
                     elif (res[ii].char_info.is_letter):
                         is_rus = res[ii].char_info.is_cyrillic_letter
                         break
                     ii += 1
             if (is_rus):
                 res[i].term = LanguageHelper.transliteralCorrection(
                     res[i].term, None, True)
                 res[i].char_info.is_cyrillic_letter = True
                 res[i].char_info.is_latin_letter = True
     for r in res:
         if (r.char_info.is_all_upper or r.char_info.is_capital_upper):
             if (r.language.is_cyrillic):
                 ok = False
                 for wf in r.word_forms:
                     if (wf.class0_.is_proper_surname):
                         ok = True
                         break
                 if (not ok):
                     r.word_forms = list(r.word_forms)
                     InnerMorphology.M_ENGINE_RU.processSurnameVariants(
                         r.term, r.word_forms)
     for r in res:
         for mv in r.word_forms:
             if (mv.normal_case is None):
                 mv.normal_case = r.term
     i = 0
     while i < (len(res) - 2):
         if (res[i].char_info.is_latin_letter
                 and res[i].char_info.is_all_upper and res[i].length == 1):
             if (twrch[res[i + 1].begin_char].is_quot
                     and res[i + 2].char_info.is_latin_letter
                     and res[i + 2].length > 2):
                 if ((res[i].end_char + 1) == res[i + 1].begin_char and
                     (res[i + 1].end_char + 1) == res[i + 2].begin_char):
                     wstr = "{0}{1}".format(res[i].term, res[i + 2].term)
                     li = self.__processOneWord0(wstr)
                     if (li is not None):
                         res[i].word_forms = li
                     res[i].end_char = res[i + 2].end_char
                     res[i].term = wstr
                     if (res[i + 2].char_info.is_all_lower):
                         res[i].char_info.is_all_upper = False
                         res[i].char_info.is_capital_upper = True
                     elif (not res[i + 2].char_info.is_all_upper):
                         res[i].char_info.is_all_upper = False
                     del res[i + 1:i + 1 + 2]
         i += 1
     i = 0
     first_pass2712 = True
     while True:
         if first_pass2712: first_pass2712 = False
         else: i += 1
         if (not (i < (len(res) - 1))): break
         if (not res[i].char_info.is_letter
                 and not res[i + 1].char_info.is_letter
                 and (res[i].end_char + 1) == res[i + 1].begin_char):
             if (twrch[res[i].begin_char].is_hiphen
                     and twrch[res[i + 1].begin_char].is_hiphen):
                 if (i == 0 or not twrch[res[i - 1].begin_char].is_hiphen):
                     pass
                 else:
                     continue
                 if ((i + 2) == len(res)
                         or not twrch[res[i + 2].begin_char].is_hiphen):
                     pass
                 else:
                     continue
                 res[i].end_char = res[i + 1].end_char
                 del res[i + 1]
     return res
Esempio n. 12
0
 def __try_parse_ru(t: 'Token', can_be_partition: bool,
                    can_be_adj_partition: bool,
                    force_parse: bool) -> 'VerbPhraseToken':
     """Try to build a verb phrase from the run of text tokens starting at ``t``.

     Tokens are visited through ``next0_`` and each one is classified into a
     local code ``ty``:

     * 0 - the token does not belong to the phrase (the scan stops);
     * 1 - verb-like item (normalised as ``MorphClass.VERB``);
     * 2 - adverb item (``is_adverb`` is set on the produced item);
     * 3 - adjective-like item (normalised as ``MorphClass.ADJECTIVE``).

     NOTE(review): the function takes no ``self``; it is presumably declared
     as a static method - the decorator is outside this view.

     Args:
         t: first token to examine.
         can_be_partition: allows tokens that look like participles, and
             allows a leading preposition before the first item.
         can_be_adj_partition: additionally allows adjectives, including
             those carrying the "к.ф." attribute.
         force_parse: accept a dictionary noun/verb homonym as a verb when
             none of the other noun-branch conditions matched.

     Returns:
         A ``VerbPhraseToken`` covering the collected items, or ``None`` when
         no item of type 1 or 3 was found, or when the case required by a
         leading preposition is incompatible with the first item.
     """
     res = None
     t0 = t
     # token of a pending "НЕ"; attached to the next item that is created
     not0_ = None
     has_verb = False
     # True while the most recent type-1/3 item was a "to be" verb (is_verb_be)
     verb_be_before = False
     prep = None
     # emulated for-loop: every pass except the first advances to t.next0_
     first_pass3070 = True
     while True:
         if first_pass3070: first_pass3070 = False
         else: t = t.next0_
         if (not (t is not None)): break
         # only text tokens can take part in the phrase
         if (not (isinstance(t, TextToken))):
             break
         tt = Utils.asObjectOrNull(t, TextToken)
         is_participle = False
         if (tt.term == "НЕ"):
             # remember the particle and look at the following token
             not0_ = t
             continue
         ty = 0
         norm = None
         mc = tt.get_morph_class_in_dictionary()
         if (tt.term == "НЕТ"):
             # "НЕТ" is accepted only as the first verb-like item
             if (has_verb):
                 break
             ty = 1
         elif (tt.term == "ДОПУСТИМО"):
             ty = 3
         elif (mc.is_adverb and not mc.is_verb):
             ty = 2
         elif (tt.is_pure_verb or tt.is_verb_be):
             ty = 1
             if (has_verb):
                 # a further verb without the "инф." attribute is accepted
                 # only right after a "to be" verb
                 if (not tt.morph.contains_attr("инф.", None)):
                     if (verb_be_before):
                         pass
                     else:
                         break
         elif (mc.is_verb):
             # the dictionary class contains "verb" together with other classes
             if (mc.is_preposition or mc.is_misc or mc.is_pronoun):
                 pass
             elif (mc.is_noun):
                 # noun/verb homonym: treated as a verb only in the cases below
                 if (tt.term == "СТАЛИ" or tt.term == "СТЕКЛО"
                         or tt.term == "БЫЛИ"):
                     ty = 1
                 elif (not tt.chars.is_all_lower
                       and not MiscHelper.can_be_start_of_sentence(tt)):
                     ty = 1
                 elif (mc.is_adjective and can_be_partition):
                     ty = 1
                 elif (force_parse):
                     ty = 1
             elif (mc.is_proper):
                 if (tt.chars.is_all_lower):
                     ty = 1
             else:
                 ty = 1
             # an adjective class or a defined case marks the token as a participle
             if (mc.is_adjective):
                 is_participle = True
             if (not tt.morph.case_.is_undefined):
                 is_participle = True
             if (not can_be_partition and is_participle):
                 break
             if (has_verb):
                 # after a verb only an "инф." form or a non-participle may follow
                 if (tt.morph.contains_attr("инф.", None)):
                     pass
                 elif (not is_participle):
                     pass
                 else:
                     break
         elif ((mc.is_adjective and tt.morph.contains_attr("к.ф.", None)
                and tt.term.endswith("О")) and NounPhraseHelper.try_parse(
                    tt, NounPhraseParseAttr.NO, 0, None) is None):
             # "к.ф." adjective ending in "О" that does not start a noun
             # phrase is handled as an adverb
             ty = 2
         elif (mc.is_adjective
               and ((can_be_partition or can_be_adj_partition))):
             if (tt.morph.contains_attr("к.ф.", None)
                     and not can_be_adj_partition):
                 break
             norm = tt.get_normal_case_text(MorphClass.ADJECTIVE,
                                            MorphNumber.SINGULAR,
                                            MorphGender.MASCULINE, False)
             # NOTE(review): norm is used without a None check; assumes
             # get_normal_case_text always returns a string here - confirm
             if (norm.endswith("ЙШИЙ")):
                 pass
             else:
                 grs = DerivateService.find_derivates(norm, True, None)
                 if (grs is not None and len(grs) > 0):
                     # hpart: a word that is both adjective and verb and is
                     #        spelled exactly like norm
                     # hverb: a verb (not also an adjective) in the groups
                     hverb = False
                     hpart = False
                     for gr in grs:
                         for w in gr.words:
                             if (w.class0_.is_adjective
                                     and w.class0_.is_verb):
                                 if (w.spelling == norm):
                                     hpart = True
                             elif (w.class0_.is_verb):
                                 hverb = True
                     if (hpart and hverb):
                         ty = 3
                     elif (can_be_adj_partition):
                         ty = 3
                     # retry the same check with the first group's prefix
                     # stripped from the normal form
                     if (ty != 3 and not Utils.isNullOrEmpty(grs[0].prefix)
                             and norm.startswith(grs[0].prefix)):
                         hverb = False
                         hpart = False
                         norm1 = norm[len(grs[0].prefix):]
                         grs = DerivateService.find_derivates(
                             norm1, True, None)
                         if (grs is not None and len(grs) > 0):
                             for gr in grs:
                                 for w in gr.words:
                                     if (w.class0_.is_adjective
                                             and w.class0_.is_verb):
                                         if (w.spelling == norm1):
                                             hpart = True
                                     elif (w.class0_.is_verb):
                                         hverb = True
                         if (hpart and hverb):
                             ty = 3
         # an unclassified first token may be a preposition: jump to its end
         # token and let the loop advance to the token after it
         if (ty == 0 and t == t0 and can_be_partition):
             prep = PrepositionHelper.try_parse(t)
             if (prep is not None):
                 t = prep.end_token
                 continue
         if (ty == 0):
             break
         if (res is None):
             res = VerbPhraseToken(t0, t)
         res.end_token = t
         it = VerbPhraseItemToken._new603(t, t, MorphCollection(t.morph))
         if (not0_ is not None):
             # extend the item back to the pending "НЕ" and flag it
             it.begin_token = not0_
             it.not0_ = True
             not0_ = (None)
         it.is_adverb = ty == 2
         # first item after a preposition: its case must intersect the case
         # required by the preposition, otherwise the whole parse fails
         if (prep is not None and not t.morph.case_.is_undefined
                 and len(res.items) == 0):
             if (((prep.next_case) & t.morph.case_).is_undefined):
                 return None
             it.morph.remove_items(prep.next_case, False)
             res.preposition = prep
         if (norm is None):
             # normal form is requested in the class matching the item type
             norm = t.get_normal_case_text(
                 (MorphClass.ADJECTIVE if ty == 3 else
                  (MorphClass.ADVERB if ty == 2 else MorphClass.VERB)),
                 MorphNumber.SINGULAR, MorphGender.MASCULINE, False)
             if (ty == 1 and not tt.morph.case_.is_undefined):
                 # verb-like item with a defined case: ask for the nominative
                 # singular masculine word form, reusing the misc info of the
                 # first MorphWordForm found in the token's morph items
                 mi = MorphWordForm._new604(MorphCase.NOMINATIVE,
                                            MorphNumber.SINGULAR,
                                            MorphGender.MASCULINE)
                 for mit in tt.morph.items:
                     if (isinstance(mit, MorphWordForm)):
                         mi.misc = mit.misc
                         break
                 # the term is prefixed with "КК" for the lookup and the two
                 # leading characters are cut from the answer; presumably this
                 # forces the word to be treated as unknown - TODO confirm
                 nnn = MorphologyService.get_wordform("КК" + t.term, mi)
                 if (nnn is not None):
                     norm = nnn[2:]
         it.normal = norm
         res.items.append(it)
         # the phrase takes its morphology from the first type-1/3 item
         if (not has_verb and ((ty == 1 or ty == 3))):
             res.morph = it.morph
             has_verb = True
         if (ty == 1 or ty == 3):
             if (ty == 1 and tt.is_verb_be):
                 verb_be_before = True
             else:
                 verb_be_before = False
     if (not has_verb):
         return None
     # drop trailing adverb items (index 0 is never removed) and pull the
     # phrase end back to the last remaining item
     for i in range(len(res.items) - 1, 0, -1):
         if (res.items[i].is_adverb):
             del res.items[i]
             res.end_token = res.items[i - 1].end_token
         else:
             break
     return res
Esempio n. 13
0
 def process(self, word : str) -> typing.List['MorphWordForm']:
     """Analyse a single word and return its candidate morphological forms.

     The word is matched against the forward tree rooted at ``self.m_root``:
     at every position the rules attached to the current node are tried on
     the remaining suffix. Depending on what was found, the reverse tree
     (``self.m_root_reverce``) is then consulted to guess additional forms
     from the word ending. The result is finally sorted and normalised.

     Args:
         word(str): the word to analyse. The literals it is compared with
             ("ПРИ", "ИМ") are upper-case, so upper-case input is presumably
             expected - TODO confirm against callers.

     Returns:
         The list of word forms, or ``None`` when the word is empty, when it
         has more than one character but no vowel, or when nothing was found.
         When guessing is required but there is no reverse tree, the forward
         result is returned as is (unsorted, possibly ``None``).
     """
     if (Utils.isNullOrEmpty(word)):
         return None
     res = None
     # a word longer than one character without any vowel is not analysed
     if (len(word) > 1):
         if (not any(LanguageHelper.is_cyrillic_vowel(ch) or LanguageHelper.is_latin_vowel(ch) for ch in word)):
             return None
     # forward tree: descend one character at a time, trying the rules of
     # every visited node on the not yet consumed part of the word
     tn = self.m_root
     i = 0
     while i <= len(word):
         if (tn.lazy_pos > 0):
             self.__load_tree_node(tn)
         if (tn.rule_ids is not None):
             word_begin = None
             word_end = word[i:]
             if (res is None):
                 res = list()
             for rid in tn.rule_ids:
                 r = self.get_rule(rid)
                 mvs = r.get_vars(word_end)
                 if (mvs is None):
                     continue
                 if (word_begin is None):
                     word_begin = word[:i]
                 self.__process_result(res, word_begin, mvs)
         if (tn.nodes is None or i >= len(word)):
             break
         ch = ord(word[i])
         wraptn9 = RefOutArgWrapper(None)
         inoutres10 = Utils.tryGetValue(tn.nodes, ch, wraptn9)
         tn = wraptn9.value
         if (not inoutres10):
             break
         i += 1
     # decide whether forms must additionally be guessed from the ending
     need_test_unknown_vars = True
     if (res is not None):
         for r in res:
             if ((r.class0_.is_pronoun or r.class0_.is_noun or r.class0_.is_adjective) or (r.class0_.is_misc and r.class0_.is_conjunction) or r.class0_.is_preposition):
                 need_test_unknown_vars = False
             elif (r.class0_.is_adverb and r.normal_case is not None):
                 if (not LanguageHelper.ends_with_ex(r.normal_case, "О", "А", None, None)):
                     need_test_unknown_vars = False
                 elif (r.normal_case == "МНОГО"):
                     need_test_unknown_vars = False
             elif (r.class0_.is_verb and len(res) > 1):
                 # a verb next to a form of another class is enough,
                 # unless the word ends with "ИМ"
                 ok = any(rr != r and rr.class0_ != r.class0_ for rr in res)
                 if (ok and not LanguageHelper.ends_with(word, "ИМ")):
                     need_test_unknown_vars = False
     if (need_test_unknown_vars and LanguageHelper.is_cyrillic_char(word[0])):
         # Cyrillic words need at least two vowels and two other characters
         gl = 0
         sog = 0
         for ch in word:
             if (LanguageHelper.is_cyrillic_vowel(ch)):
                 gl += 1
             else:
                 sog += 1
         if ((gl < 2) or (sog < 2)):
             need_test_unknown_vars = False
     if (need_test_unknown_vars and res is not None and len(res) == 1):
         if (res[0].class0_.is_verb):
             attrs = res[0].misc.attrs
             if ("н.вр." in attrs and "нес.в." in attrs and not "страд.з." in attrs):
                 need_test_unknown_vars = False
             elif ("б.вр." in attrs and "сов.в." in attrs):
                 need_test_unknown_vars = False
             elif ("инф." in attrs and "сов.в." in attrs):
                 need_test_unknown_vars = False
             elif (res[0].normal_case is not None and LanguageHelper.ends_with(res[0].normal_case, "СЯ")):
                 need_test_unknown_vars = False
         if (res[0].class0_.is_undefined and "прдктв." in res[0].misc.attrs):
             need_test_unknown_vars = False
     if (need_test_unknown_vars):
         if (self.m_root_reverce is None):
             return res
         # reverse tree: walk from the last character backwards until a node
         # carrying variants is reached
         tn = self.m_root_reverce
         tn0 = self.m_root_reverce
         for i in range(len(word) - 1, -1, -1):
             if (tn.lazy_pos > 0):
                 self.__load_tree_node(tn)
             ch = ord(word[i])
             if (tn.nodes is None):
                 break
             if (not ch in tn.nodes):
                 break
             tn = tn.nodes[ch]
             if (tn.lazy_pos > 0):
                 self.__load_tree_node(tn)
             if (tn.reverce_variants is not None):
                 tn0 = tn
                 break
         else:
             # the whole word was consumed without a break
             i = -1
         if (tn0 != self.m_root_reverce):
             # the part in front of the matched ending must be shorter than
             # four characters or contain a vowel
             glas = i < 4
             while i >= 0:
                 if (LanguageHelper.is_cyrillic_vowel(word[i]) or LanguageHelper.is_latin_vowel(word[i])):
                     glas = True
                     break
                 i -= 1
             if (glas):
                 for mvref in tn0.reverce_variants:
                     mv = self.get_rule_var(mvref.rule_id, mvref.variant_id)
                     if (mv is None):
                         continue
                     if (((not mv.class0_.is_verb and not mv.class0_.is_adjective and not mv.class0_.is_noun) and not mv.class0_.is_proper_surname and not mv.class0_.is_proper_geo) and not mv.class0_.is_proper_secname):
                         continue
                     # skip the guess when a dictionary form already covers it.
                     # res is still None when the forward tree produced nothing;
                     # iterating it unguarded would raise TypeError
                     ok = False
                     if (res is not None):
                         for rr in res:
                             if (rr.is_in_dictionary):
                                 if (rr.class0_ == mv.class0_ or rr.class0_.is_noun):
                                     ok = True
                                     break
                                 if (not mv.class0_.is_adjective and rr.class0_.is_verb):
                                     ok = True
                                     break
                     if (ok):
                         continue
                     if (len(mv.tail) > 0 and not LanguageHelper.ends_with(word, mv.tail)):
                         continue
                     r = MorphWordForm(mv, word, self.get_misc_info(mv.misc_info_id))
                     if (not r._has_morph_equals(res)):
                         r.undef_coef = mvref.coef
                         if (res is None):
                             res = list()
                         res.append(r)
     if (word == "ПРИ" and res is not None):
         # "ПРИ" never keeps proper-geo readings; delete from the end so the
         # remaining indices stay valid
         for i in range(len(res) - 1, -1, -1):
             if (res[i].class0_.is_proper_geo):
                 del res[i]
     if (res is None or len(res) == 0):
         return None
     self.__sort(res, word)
     for v in res:
         if (v.normal_case is None):
             v.normal_case = word
         if (v.class0_.is_verb):
             # for verbs ending with "ТЬСЯ" the full normal form is the
             # normal case without its last two characters
             if (v.normal_full is None and LanguageHelper.ends_with(v.normal_case, "ТЬСЯ")):
                 v.normal_full = v.normal_case[0:0+len(v.normal_case) - 2]
         v.language = self.language
         if (v.class0_.is_preposition):
             v.normal_case = LanguageHelper.normalize_preposition(v.normal_case)
     # drop guessed "к.ф." / "неизм." adjectives while other forms remain and
     # collect the classes of all dictionary forms
     mc = MorphClass()
     for i in range(len(res) - 1, -1, -1):
         if (not res[i].is_in_dictionary and res[i].class0_.is_adjective and len(res) > 1):
             if ("к.ф." in res[i].misc.attrs or "неизм." in res[i].misc.attrs):
                 del res[i]
                 continue
         if (res[i].is_in_dictionary):
             mc.value |= res[i].class0_.value
     if (mc == MorphClass.VERB and len(res) > 1):
         # the dictionary knows the word only as a verb: reset the
         # coefficient of guessed adjectives above 100
         for r in res:
             if (r.undef_coef > (100) and r.class0_ == MorphClass.ADJECTIVE):
                 r.undef_coef = (0)
     if (len(res) == 0):
         return None
     return res
Esempio n. 14
0
 def getAllWordforms(self, word : str) -> typing.List['MorphWordForm']:
     """Collect every word form produced by the rules that match ``word``.

     The forward tree is descended along the characters of ``word``. At each
     node whose rules know the remaining suffix, all variants of that rule
     are turned into word forms whose normal case is the consumed prefix
     plus the variant tail. Afterwards forms differing only in case are
     merged (cases are OR-ed), then forms differing only in gender are merged
     (genders are OR-ed). Forms carrying the "инф." attribute are never merged.

     Args:
         word(str): the word to look up.

     Returns:
         The list of collected word forms (empty when nothing matched).
     """
     forms = []
     node = self.m_root
     pos = 0
     while pos <= len(word):
         if node.lazy_pos > 0:
             self.__loadTreeNode(node)
         if node.rules is not None:
             # split the word at the current depth of the tree
             prefix = word[:pos]
             suffix = word[pos:]
             for rule in node.rules:
                 if suffix not in rule.variants:
                     continue
                 for var_group in rule.variants_list:
                     for var in var_group:
                         cand = MorphWordForm(var, None)
                         if MorphWordForm._hasMorphEquals(forms, cand):
                             continue
                         cand.normal_case = prefix + var.tail
                         cand.undef_coef = 0
                         forms.append(cand)
         if node.nodes is None or pos >= len(word):
             break
         code = ord(word[pos])
         out_node = RefOutArgWrapper(None)
         found = Utils.tryGetValue(node.nodes, code, out_node)
         node = out_node.value
         if not found:
             break
         pos += 1

     # first pass: fold forms that differ only by case into the earlier one
     a = 0
     while a < len(forms):
         base = forms[a]
         if not base.containsAttr("инф.", None):
             b = a + 1
             while b < len(forms):
                 other = forms[b]
                 if (not other.containsAttr("инф.", None)
                         and (base.class0_ == other.class0_ and base.gender == other.gender and base.number == other.number)
                         and base.normal_case == other.normal_case):
                     base.case_ = base.case_ | other.case_
                     # the next candidate slides into position b
                     del forms[b]
                 else:
                     b += 1
         a += 1

     # second pass: fold forms that differ only by gender into the earlier one
     a = 0
     while a < len(forms):
         base = forms[a]
         if not base.containsAttr("инф.", None):
             b = a + 1
             while b < len(forms):
                 other = forms[b]
                 if (not other.containsAttr("инф.", None)
                         and (base.class0_ == other.class0_ and base.case_ == other.case_ and base.number == other.number)
                         and base.normal_case == other.normal_case):
                     base.gender = Utils.valToEnum((base.gender) | (other.gender), MorphGender)
                     del forms[b]
                 else:
                     b += 1
         a += 1
     return forms
Esempio n. 15
0
 def get_all_wordforms(self, word : str) -> typing.List['MorphWordForm']:
     """Collect every word form produced by the rules that match ``word``.

     The forward tree is descended along the characters of ``word``. At each
     node, every rule (resolved through ``self.get_rule``) that contains the
     remaining suffix contributes all of its variants as word forms whose
     normal case is the consumed prefix plus the variant tail. Afterwards
     forms differing only in case are merged (cases are OR-ed), then forms
     differing only in gender are merged (genders are OR-ed). Forms carrying
     the "инф." attribute are never merged.

     Args:
         word(str): the word to look up.

     Returns:
         The list of collected word forms (empty when nothing matched).
     """
     forms = []
     node = self.m_root
     pos = 0
     while pos <= len(word):
         if node.lazy_pos > 0:
             self.__load_tree_node(node)
         if node.rule_ids is not None:
             # split the word at the current depth of the tree
             prefix = word[:pos]
             suffix = word[pos:]
             for rule_id in node.rule_ids:
                 rule = self.get_rule(rule_id)
                 if not rule.contains_var(suffix):
                     continue
                 for var_group in rule.morph_vars:
                     for var in var_group:
                         cand = MorphWordForm(var, None, self.get_misc_info(var.misc_info_id))
                         if cand._has_morph_equals(forms):
                             continue
                         cand.normal_case = prefix + var.tail
                         cand.undef_coef = 0
                         forms.append(cand)
         if node.nodes is None or pos >= len(word):
             break
         code = ord(word[pos])
         out_node = RefOutArgWrapper(None)
         found = Utils.tryGetValue(node.nodes, code, out_node)
         node = out_node.value
         if not found:
             break
         pos += 1

     # first pass: accumulate the cases of forms that differ only by case
     a = 0
     while a < len(forms):
         base = forms[a]
         if not base.contains_attr("инф.", None):
             merged_case = base.case_
             b = a + 1
             while b < len(forms):
                 other = forms[b]
                 if (not other.contains_attr("инф.", None)
                         and (base.class0_ == other.class0_ and base.gender == other.gender and base.number == other.number)
                         and base.normal_case == other.normal_case):
                     merged_case |= other.case_
                     # the next candidate slides into position b
                     del forms[b]
                 else:
                     b += 1
             # write back only when the accumulated value differs
             if merged_case != base.case_:
                 forms[a].case_ = merged_case
         a += 1

     # second pass: fold forms that differ only by gender into the earlier one
     a = 0
     while a < len(forms):
         base = forms[a]
         if not base.contains_attr("инф.", None):
             b = a + 1
             while b < len(forms):
                 other = forms[b]
                 if (not other.contains_attr("инф.", None)
                         and (base.class0_ == other.class0_ and base.case_ == other.case_ and base.number == other.number)
                         and base.normal_case == other.normal_case):
                     base.gender = Utils.valToEnum((base.gender) | (other.gender), MorphGender)
                     del forms[b]
                 else:
                     b += 1
         a += 1
     return forms
Esempio n. 16
0
 def process(self, word : str) -> typing.List['MorphWordForm']:
     """ Обработка одного слова
     
     Args:
         word(str): слово должно быть в верхнем регистре
     
     """
     if (Utils.isNullOrEmpty(word)): 
         return None
     res = None
     if (len(word) > 1): 
         i = 0
         while i < len(word): 
             ch = word[i]
             if (LanguageHelper.isCyrillicVowel(ch) or LanguageHelper.isLatinVowel(ch)): 
                 break
             i += 1
         if (i >= len(word)): 
             return res
     mvs = [ ]
     tn = self.m_root
     i = 0
     while i <= len(word): 
         if (tn.lazy_pos > 0): 
             self.__loadTreeNode(tn)
         if (tn.rules is not None): 
             word_begin = None
             word_end = None
             if (i == 0): 
                 word_end = word
             elif (i < len(word)): 
                 word_end = word[i:]
             else: 
                 word_end = ""
             if (res is None): 
                 res = list()
             for r in tn.rules: 
                 wrapmvs14 = RefOutArgWrapper(None)
                 inoutres15 = Utils.tryGetValue(r.variants, word_end, wrapmvs14)
                 mvs = wrapmvs14.value
                 if (inoutres15): 
                     if (word_begin is None): 
                         if (i == len(word)): 
                             word_begin = word
                         elif (i > 0): 
                             word_begin = word[0:0+i]
                         else: 
                             word_begin = ""
                     r.processResult(res, word_begin, mvs)
         if (tn.nodes is None or i >= len(word)): 
             break
         ch = ord(word[i])
         wraptn16 = RefOutArgWrapper(None)
         inoutres17 = Utils.tryGetValue(tn.nodes, ch, wraptn16)
         tn = wraptn16.value
         if (not inoutres17): 
             break
         i += 1
     need_test_unknown_vars = True
     if (res is not None): 
         for r in res: 
             if ((r.class0_.is_pronoun or r.class0_.is_noun or r.class0_.is_adjective) or (r.class0_.is_misc and r.class0_.is_conjunction) or r.class0_.is_preposition): 
                 need_test_unknown_vars = False
             elif (r.class0_.is_adverb and r.normal_case is not None): 
                 if (not LanguageHelper.endsWithEx(r.normal_case, "О", "А", None, None)): 
                     need_test_unknown_vars = False
                 elif (r.normal_case == "МНОГО"): 
                     need_test_unknown_vars = False
             elif (r.class0_.is_verb and len(res) > 1): 
                 ok = False
                 for rr in res: 
                     if (rr != r and rr.class0_ != r.class0_): 
                         ok = True
                         break
                 if (ok and not LanguageHelper.endsWith(word, "ИМ")): 
                     need_test_unknown_vars = False
     if (need_test_unknown_vars and LanguageHelper.isCyrillicChar(word[0])): 
         gl = 0
         sog = 0
         j = 0
         while j < len(word): 
             if (LanguageHelper.isCyrillicVowel(word[j])): 
                 gl += 1
             else: 
                 sog += 1
             j += 1
         if ((gl < 2) or (sog < 2)): 
             need_test_unknown_vars = False
     if (need_test_unknown_vars and res is not None and len(res) == 1): 
         if (res[0].class0_.is_verb): 
             if ("н.вр." in res[0].misc.attrs and "нес.в." in res[0].misc.attrs and not "страд.з." in res[0].misc.attrs): 
                 need_test_unknown_vars = False
             elif ("б.вр." in res[0].misc.attrs and "сов.в." in res[0].misc.attrs): 
                 need_test_unknown_vars = False
             elif (res[0].normal_case is not None and LanguageHelper.endsWith(res[0].normal_case, "СЯ")): 
                 need_test_unknown_vars = False
         if (res[0].class0_.is_undefined and "прдктв." in res[0].misc.attrs): 
             need_test_unknown_vars = False
     if (need_test_unknown_vars): 
         if (self.m_root_reverce is None): 
             return res
         tn = self.m_root_reverce
         tn0 = None
         for i in range(len(word) - 1, -1, -1):
             if (tn.lazy_pos > 0): 
                 self.__loadTreeNode(tn)
             ch = ord(word[i])
             if (tn.nodes is None): 
                 break
             wrapnext18 = RefOutArgWrapper(None)
             inoutres19 = Utils.tryGetValue(tn.nodes, ch, wrapnext18)
             next0_ = wrapnext18.value
             if (not inoutres19): 
                 break
             tn = next0_
             if (tn.lazy_pos > 0): 
                 self.__loadTreeNode(tn)
             if (tn.reverce_variants is not None): 
                 tn0 = tn
                 break
         else: i = -1
         if (tn0 is not None): 
             glas = i < 4
             while i >= 0: 
                 if (LanguageHelper.isCyrillicVowel(word[i]) or LanguageHelper.isLatinVowel(word[i])): 
                     glas = True
                     break
                 i -= 1
             if (glas): 
                 for mv in tn0.reverce_variants: 
                     if (((not mv.class0_.is_verb and not mv.class0_.is_adjective and not mv.class0_.is_noun) and not mv.class0_.is_proper_surname and not mv.class0_.is_proper_geo) and not mv.class0_.is_proper_secname): 
                         continue
                     ok = False
                     for rr in res: 
                         if (rr.is_in_dictionary): 
                             if (rr.class0_ == mv.class0_ or rr.class0_.is_noun): 
                                 ok = True
                                 break
                             if (not mv.class0_.is_adjective and rr.class0_.is_verb): 
                                 ok = True
                                 break
                     if (ok): 
                         continue
                     if (len(mv.tail) > 0 and not LanguageHelper.endsWith(word, mv.tail)): 
                         continue
                     r = MorphWordForm(mv, word)
                     if (not MorphWordForm._hasMorphEquals(res, r)): 
                         r.undef_coef = mv.coef
                         if (res is None): 
                             res = list()
                         res.append(r)
     if (word == "ПРИ" and res is not None): 
         for i in range(len(res) - 1, -1, -1):
             if (res[i].class0_.is_proper_geo): 
                 del res[i]
         else: i = -1
     if (res is None or len(res) == 0): 
         return None
     MorphEngine.__sort(res, word)
     for v in res: 
         if (v.normal_case is None): 
             v.normal_case = word
         if (v.class0_.is_verb): 
             if (v.normal_full is None and LanguageHelper.endsWith(v.normal_case, "ТЬСЯ")): 
                 v.normal_full = v.normal_case[0:0+len(v.normal_case) - 2]
         v.language = self.language
         if (v.class0_.is_preposition): 
             v.normal_case = LanguageHelper.normalizePreposition(v.normal_case)
     mc = MorphClass()
     for i in range(len(res) - 1, -1, -1):
         if (not res[i].is_in_dictionary and res[i].class0_.is_adjective and len(res) > 1): 
             if ("к.ф." in res[i].misc.attrs or "неизм." in res[i].misc.attrs): 
                 del res[i]
                 continue
         if (res[i].is_in_dictionary): 
             mc.value |= res[i].class0_.value
     else: i = -1
     if (mc == MorphClass.VERB and len(res) > 1): 
         for r in res: 
             if (r.undef_coef > (100) and r.class0_ == MorphClass.ADJECTIVE): 
                 r.undef_coef = (0)
     if (len(res) == 0): 
         return None
     return res
Esempio n. 17
0
 def __create_result(self, blk : 'SemBlock') -> None:
     """Build semantic objects for the sentence items and link them together.

     The visible part of this method works in consecutive passes:

     1. Calls ``correct_morph()`` on every segment of ``self.best_var`` and
        then ``create_alt_links()`` on the variant.
     2. Creates ``it.result`` for every item that has a ``res_graph`` but no
        result yet (noun group, verb group or number, chosen by the type of
        ``it.source``) and copies the item's quantity and attributes onto it.
     3. Runs ``__create_lists`` and then ``__create_links`` for each segment.
     4. Creates a semantic object for each adverb of undefined type and
        attaches it with a DETAIL link to a neighbouring verb or noun.
     5. Carries the agent found for one verb over to the following verbs,
        adding PACIENT/AGENT links when the computed link coefficient is
        positive.
     6. Links DEEPART items to nominative nouns that already take part in an
        AGENT/PACIENT link.
     7. Halves ``self.coef`` for every fragment flagged
        ``can_be_error_structure`` and marks the first object of the first
        fragment as a QUESTION when the sentence ends with '?'.

     Args:
         blk: Not referenced anywhere in the visible part of this method.

     Returns:
         None. Results are written to the items, their graphs and
         ``self.coef``.
     """
     # Step 1: let each segment of the chosen variant adjust its morphology,
     # then build the alternative links of the variant.
     if (self.best_var is not None): 
         for s in self.best_var.segs: 
             if (s is not None): 
                 s.correct_morph()
         self.best_var.create_alt_links()
     # Items that end up with a result are collected here; the list is not
     # read again in the visible part of this method.
     all_items = list()
     # Step 2: create the semantic object for every item bound to a graph.
     for it in self.items: 
         if (it.res_graph is None): 
             continue
         if (it.result is None): 
             if (isinstance(it.source, NounPhraseToken)): 
                 npt = Utils.asObjectOrNull(it.source, NounPhraseToken)
                 # NOTE(review): presumably narrows the source morph to its
                 # plural variants when the item was resolved as plural -
                 # confirm the semantics of remove_items(..., False).
                 if (it.plural == 1 and ((it.source.morph.number) & (MorphNumber.PLURAL)) != (MorphNumber.UNDEFINED)): 
                     it.source.morph.remove_items(MorphNumber.PLURAL, False)
                 it.result = CreateHelper.create_noun_group(it.res_graph, npt)
                 # A multi-noun phrase without a quantity is expanded into
                 # result_list: the main result plus one extra object per
                 # adjective starting from the second one.
                 if (npt.multi_nouns and it.result.quantity is None): 
                     it.result_list = list()
                     it.result_list.append(it.result)
                     # If the first adjective can be singular, force the main
                     # result to singular and prefer its full normal form.
                     if (len(npt.adjectives) > 0 and ((npt.adjectives[0].begin_token.morph.number) & (MorphNumber.SINGULAR)) == (MorphNumber.SINGULAR)): 
                         it.result.morph.number = MorphNumber.SINGULAR
                         if (it.result.morph.normal_full is not None): 
                             it.result.morph.normal_case = it.result.morph.normal_full
                     i = 1
                     while i < len(npt.adjectives): 
                         # Each extra object copies the morph, attributes,
                         # concept and negation flag of the main result.
                         so = SemObject._new2933(it.res_graph, it.result.typ)
                         so.tokens.append(npt.noun)
                         wf = MorphWordForm()
                         wf.copy_from_word_form(it.result.morph)
                         so.morph = wf
                         for a in it.result.attrs: 
                             so.attrs.append(a)
                         so.concept = it.result.concept
                         so.not0_ = it.result.not0_
                         # Link the copy to its own adjective; the link name
                         # "какой" is Russian for "which / what kind of".
                         asem = CreateHelper.create_npt_adj(it.res_graph, npt, npt.adjectives[i])
                         if (asem is not None): 
                             it.res_graph.add_link(SemLinkType.DETAIL, so, asem, "какой", False, None)
                         it.result_list.append(so)
                         it.res_graph.objects.append(so)
                         i += 1
             elif (isinstance(it.source, VerbPhraseToken)): 
                 it.result = CreateHelper.create_verb_group(it.res_graph, Utils.asObjectOrNull(it.source, VerbPhraseToken))
                 # The tag of the last verb is kept as a SemObject (or None
                 # when the tag is not a SemObject).
                 it.result_verb_last = (Utils.asObjectOrNull(it.source.last_verb.tag, SemObject))
             elif (isinstance(it.source, NumbersWithUnitToken)): 
                 it.result = CreateHelper.create_number(it.res_graph, Utils.asObjectOrNull(it.source, NumbersWithUnitToken))
             # Copy the item's quantity and attribute/token pairs onto the
             # freshly created result.
             if (it.result is not None and it.quant is not None): 
                 it.result.quantity = it.quant
             if (it.result is not None and it.attrs is not None): 
                 for a in it.attrs: 
                     it.result.attrs.append(a.attr)
                     it.result.tokens.append(a.token)
         if (it.result is not None): 
             # No-op branch: a mismatch between the result's graph and the
             # item's graph is detected here but nothing is done about it.
             if (it.result.graph != it.res_graph): 
                 pass
             all_items.append(it)
     # Step 3: build lists for all segments first, then links for all segments.
     if (self.best_var is not None): 
         for s in self.best_var.segs: 
             if (s is not None): 
                 self.__create_lists(s)
     if (self.best_var is not None): 
         for s in self.best_var.segs: 
             if (s is not None): 
                 self.__create_links(s)
     # Step 4: adverbs. The first_pass flag emulates a C-style
     # `for (i = 0; i < len; i++)` loop so that `continue` still advances i.
     i = 0
     first_pass3460 = True
     while True:
         if first_pass3460: first_pass3460 = False
         else: i += 1
         if (not (i < len(self.items))): break
         it = self.items[i]
         if (it.typ != SentItemType.ADVERB or it.res_graph is None): 
             continue
         # Only adverbs whose type is UNDEFINED are handled in this pass.
         adv = Utils.asObjectOrNull(it.source, AdverbToken)
         if (adv.typ != SemAttributeType.UNDEFINED): 
             continue
         before = None
         after = None
         # Look backwards for the nearest verb, stepping over adverbs and
         # nouns; any other item type stops the search.
         for ii in range(i - 1, -1, -1):
             it0 = self.items[ii]
             if (it0.typ == SentItemType.VERB): 
                 before = it0
                 break
             elif (it0.typ == SentItemType.ADVERB or it0.typ == SentItemType.NOUN): 
                 pass
             else: 
                 break
         # No verb found: retry, this time accepting the nearest verb or noun
         # and stepping over adverbs only.
         if (before is None): 
             for ii in range(i - 1, -1, -1):
                 it0 = self.items[ii]
                 if (it0.typ == SentItemType.VERB or it0.typ == SentItemType.NOUN): 
                     before = it0
                     break
                 elif (it0.typ == SentItemType.ADVERB): 
                     pass
                 else: 
                     break
         # Look forwards for the nearest verb or noun, stepping over adverbs.
         comma_after = False
         ii = i + 1
         while ii < len(self.items): 
             it0 = self.items[ii]
             if (it0.typ == SentItemType.VERB or it0.typ == SentItemType.NOUN): 
                 after = it0
                 break
             elif (it0.typ == SentItemType.ADVERB): 
                 pass
             elif (it0.can_be_comma_end): 
                 # With a verb already found on the left, this item ends the
                 # forward search.
                 if (before is not None and before.typ == SentItemType.VERB): 
                     break
                 # comma_after is set unless the item right after this one is
                 # an adverb or a verb.
                 if (((ii + 1) < len(self.items)) and ((self.items[ii + 1].typ == SentItemType.ADVERB or self.items[ii + 1].typ == SentItemType.VERB))): 
                     pass
                 else: 
                     comma_after = True
             else: 
                 break
             ii += 1
         # Both sides available: a comma in between drops the right candidate;
         # otherwise the verb wins over the noun.
         if (before is not None and after is not None): 
             if (comma_after): 
                 after = (None)
             elif (before.typ == SentItemType.NOUN and after.typ == SentItemType.VERB): 
                 before = (None)
             elif (before.typ == SentItemType.VERB and after.typ == SentItemType.NOUN): 
                 after = (None)
         it.result = CreateHelper.create_adverb(it.res_graph, adv)
         if (it.attrs is not None): 
             for a in it.attrs: 
                 it.result.attrs.append(a.attr)
                 it.result.tokens.append(a.token)
         # Attach the adverb to `after` when it survived, otherwise to
         # `before`; the link name "как" is Russian for "how".
         if (after is not None or before is not None): 
             it.res_graph.add_link(SemLinkType.DETAIL, (before.result if after is None else after.result), it.result, "как", False, None)
     # `preds` is created but never used in the visible part of this method.
     preds = list()
     # Step 5: carry the most recently found agent over to the verbs that
     # follow it in the sentence.
     agent = None
     for it in self.items: 
         if (it.result is not None and it.typ == SentItemType.VERB and (isinstance(it.source, VerbPhraseToken))): 
             if (agent is not None): 
                 # Try the remembered agent as PACIENT of this verb, but only
                 # when the verb has no PACIENT link yet.
                 has_pac = False
                 for li in it.res_graph.links: 
                     if (li.typ == SemLinkType.PACIENT and li.source == it.result): 
                         has_pac = True
                         break
                 if (not has_pac): 
                     ni0 = NGItem._new2926(agent)
                     gli0 = NGLink._new2939(ni0, Utils.asObjectOrNull(it.source, VerbPhraseToken), NGLinkType.PACIENT)
                     if (agent.result_list is not None): 
                         # Expanded agent: treated as plural and linked to
                         # every object of its result_list.
                         gli0.from_is_plural = True
                         gli0.calc_coef(False)
                         if (gli0.coef > 0 and gli0.plural == 1): 
                             for ii in agent.result_list: 
                                 it.res_graph.add_link(SemLinkType.PACIENT, it.result, ii, None, False, None)
                             self.coef += (1)
                     else: 
                         gli0.calc_coef(True)
                         if (gli0.coef > 0): 
                             it.res_graph.add_link(SemLinkType.PACIENT, it.result, agent.result, None, False, None)
                             self.coef += (1)
             # A verb that already has its own AGENT link supplies the agent
             # remembered for the verbs after it.
             ali = None
             for li in it.res_graph.links: 
                 if (li.typ == SemLinkType.AGENT and li.source == it.result): 
                     ali = li
                     break
             if (ali is not None): 
                 agent = self.__find_item_by_res(ali.target)
                 continue
             if (agent is None): 
                 continue
             # Otherwise try the remembered agent as AGENT of this verb, with
             # the same plural/singular handling as for PACIENT above.
             ni = NGItem._new2926(agent)
             gli = NGLink._new2939(ni, Utils.asObjectOrNull(it.source, VerbPhraseToken), NGLinkType.AGENT)
             if (agent.result_list is not None): 
                 gli.from_is_plural = True
                 gli.calc_coef(False)
                 if (gli.coef > 0 and gli.plural == 1): 
                     for ii in agent.result_list: 
                         it.res_graph.add_link(SemLinkType.AGENT, it.result, ii, None, False, None)
                     self.coef += (1)
             else: 
                 gli.calc_coef(True)
                 if (gli.coef > 0): 
                     it.res_graph.add_link(SemLinkType.AGENT, it.result, agent.result, None, False, None)
                     self.coef += (1)
     # `agent` is reset here and not read again in the visible part.
     agent = (None)
     # Step 6: DEEPART items (presumably adverbial participles - confirm
     # against SentItemType). Same emulated C-style loop as for adverbs.
     i = 0
     first_pass3461 = True
     while True:
         if first_pass3461: first_pass3461 = False
         else: i += 1
         if (not (i < len(self.items))): break
         it = self.items[i]
         if (it.result is not None and it.typ == SentItemType.DEEPART): 
             pass
         else: 
             continue
         link = None
         # Scan backwards over nominative nouns that are already the target of
         # an AGENT or PACIENT link. The first match becomes the AGENT link,
         # the second becomes its alternative and ends the scan.
         for j in range(i - 1, -1, -1):
             itt = self.items[j]
             if (itt.typ != SentItemType.NOUN): 
                 continue
             if (not ((itt.source.morph.case_.is_nominative))): 
                 continue
             ispacad = False
             for li in itt.res_graph.links: 
                 if (((li.typ == SemLinkType.AGENT or li.typ == SemLinkType.PACIENT)) and li.target == itt.result): 
                     ispacad = True
             if (not ispacad): 
                 continue
             if (link is None): 
                 link = itt.res_graph.add_link(SemLinkType.AGENT, it.result, itt.result, None, False, None)
             elif (link.alt_link is None): 
                 # The two links reference each other as alternatives.
                 link.alt_link = itt.res_graph.add_link(SemLinkType.AGENT, it.result, itt.result, None, False, None)
                 link.alt_link.alt_link = link
                 break
         # Nothing found on the left: apply the same search to the items on
         # the right of the DEEPART item.
         if (link is None): 
             j = i + 1
             first_pass3462 = True
             while True:
                 if first_pass3462: first_pass3462 = False
                 else: j += 1
                 if (not (j < len(self.items))): break
                 itt = self.items[j]
                 if (itt.typ != SentItemType.NOUN): 
                     continue
                 if (not ((itt.source.morph.case_.is_nominative))): 
                     continue
                 ispacad = False
                 for li in itt.res_graph.links: 
                     if (((li.typ == SemLinkType.AGENT or li.typ == SemLinkType.PACIENT)) and li.target == itt.result): 
                         ispacad = True
                 if (not ispacad): 
                     continue
                 if (link is None): 
                     link = itt.res_graph.add_link(SemLinkType.AGENT, it.result, itt.result, None, False, None)
                 elif (link.alt_link is None): 
                     link.alt_link = itt.res_graph.add_link(SemLinkType.AGENT, it.result, itt.result, None, False, None)
                     link.alt_link.alt_link = link
                     break
         if (link is not None): 
             self.coef += 1
     # Step 7: halve the coefficient once per fragment flagged as a possible
     # error structure. `/=` is true division, so coef becomes a float here.
     for fr in self.res_block.fragments: 
         if (fr.can_be_error_structure): 
             self.coef /= (2)
     # A sentence ending with '?' whose first object is "КАКОЙ" ("which") or
     # "СКОЛЬКО" ("how many") gets that object typed as a QUESTION.
     if (len(self.res_block.fragments) > 0 and len(self.res_block.fragments[0].graph.objects) > 0): 
         it = self.res_block.fragments[0].graph.objects[0]
         if (self.last_char is not None and self.last_char.is_char('?')): 
             if (it.morph.normal_full == "КАКОЙ" or it.morph.normal_full == "СКОЛЬКО"): 
                 it.typ = SemObjectType.QUESTION