Ejemplo n.º 1
0
 def __init__(self, source: 'MorphCollection' = None) -> None:
     """Create an empty collection or a copy of ``source``.

     Args:
         source: optional collection to copy. Each of its items is
             duplicated (word forms through ``copy_from_word_form``,
             everything else through ``copy_from``) and its cached
             aggregate values are taken over, so the copy is marked as
             not needing a recalculation.
     """
     super().__init__()
     # Start from the "nothing known yet" state.
     self.__m_class = MorphClass()
     self.__m_gender = MorphGender.UNDEFINED
     self.__m_number = MorphNumber.UNDEFINED
     self.__m_case = MorphCase()
     self.__m_language = MorphLang()
     self.__m_voice = MorphVoice.UNDEFINED
     self.__m_need_recalc = True
     self.__m_items = None
     if source is None:
         return

     for original in source.items:
         if isinstance(original, MorphWordForm):
             duplicate = MorphWordForm()
             duplicate.copy_from_word_form(
                 Utils.asObjectOrNull(original, MorphWordForm))
         else:
             duplicate = MorphBaseInfo()
             duplicate.copy_from(original)
         # The list is created lazily, so an empty source leaves it None.
         if self.__m_items is None:
             self.__m_items = list()
         self.__m_items.append(duplicate)

     # Take over the cached aggregates as fresh wrapper objects.
     self.__m_class = MorphClass._new53(source.__m_class.value)
     self.__m_gender = source.__m_gender
     self.__m_case = MorphCase._new29(source.__m_case.value)
     self.__m_number = source.__m_number
     self.__m_language = MorphLang._new56(source.__m_language.value)
     self.__m_voice = source.__m_voice
     self.__m_need_recalc = False
Ejemplo n.º 2
0
 def _deserialize(self, str0_: 'ByteArrayWrapper', pos: int) -> bool:
     """Fill this object from the serialized data in ``str0_``.

     Fields are read in a fixed order: id (short), class (short),
     gender (byte), number (byte), case (byte), normal tail (string),
     full normal tail (string).

     Args:
         str0_: wrapper the values are read from.
         pos: passed unchanged to every read call.
             NOTE(review): presumably a position holder advanced by the
             wrapper - confirm against ByteArrayWrapper.

     Returns:
         False when the leading id is not positive (nothing else is read
         or assigned in that case), True otherwise.
     """
     ident = str0_.deserialize_short(pos)
     if ident <= 0:
         return False
     self.misc_info_id = ident

     raw_class = str0_.deserialize_short(pos)
     morph_class = MorphClass()
     morph_class.value = raw_class
     # A class flagged both "misc" and "proper" keeps only "proper".
     if morph_class.is_misc and morph_class.is_proper:
         morph_class.is_misc = False
     self.class0_ = morph_class

     self.gender = Utils.valToEnum(str0_.deserialize_byte(pos), MorphGender)
     self.number = Utils.valToEnum(str0_.deserialize_byte(pos), MorphNumber)

     raw_case = str0_.deserialize_byte(pos)
     morph_case = MorphCase()
     morph_case.value = raw_case
     self.case_ = morph_case

     self.normal_tail = str0_.deserialize_string(pos)
     self.full_normal_tail = str0_.deserialize_string(pos)
     return True
Ejemplo n.º 3
0
 def copy_from(self, src: 'MorphBaseInfo') -> None:
     """Copy class, gender, number, case and language from ``src``.

     Class, case and language are rebuilt as new wrapper objects that
     carry the same ``value``; gender and number are assigned as they
     are.

     Args:
         src: the object whose morphological attributes are copied.
     """
     class_copy = MorphClass()
     class_copy.value = src.class0_.value
     self.class0_ = class_copy

     self.gender = src.gender
     self.number = src.number

     case_copy = MorphCase()
     case_copy.value = src.case_.value
     self.case_ = case_copy

     lang_copy = MorphLang()
     lang_copy.value = src.language.value
     self.language = lang_copy
Ejemplo n.º 4
0
 def deserializeDerivateGroup(str0_: 'ByteArrayWrapper', dg: 'DerivateGroup') -> None:
     """Fill the derivate group ``dg`` from the serialized data in ``str0_``.

     Read order: attribute flags (short), prefix (string), word count
     (short) followed by that many words, then a count (short) followed
     by that many (prefix string, case short) pairs stored in
     ``dg.nexts``.

     Args:
         str0_: wrapper the values are read from.
         dg: group that receives the flags, prefix, words and next-cases.
     """
     flags = str0_.deserializeShort()
     if flags & 1:
         dg.is_dummy = True
     if flags & 2:
         dg.not_generate = True
     # Bit 4 is applied before bit 8, so bit 8 wins when both are set.
     for mask, transitive in ((4, 0), (8, 1)):
         if flags & mask:
             dg.m_transitive = transitive
     dg.prefix = str0_.deserializeString()

     for _ in range(str0_.deserializeShort()):
         word = DerivateWord(dg)
         word.spelling = str0_.deserializeString()
         word.class0_ = MorphClass()
         word.class0_.value = str0_.deserializeShort()
         word.lang = MorphLang._new5(str0_.deserializeShort())
         word.attrs.value = str0_.deserializeShort()
         dg.words.append(word)

     for _ in range(str0_.deserializeShort()):
         # A missing prefix string is stored under the empty key.
         key = Utils.ifNotNull(str0_.deserializeString(), "")
         case = MorphCase()
         case.value = str0_.deserializeShort()
         # The dictionary is only created once there is an entry for it.
         if dg.nexts is None:
             dg.nexts = dict()
         dg.nexts[key] = case
Ejemplo n.º 5
0
 def __deserialize_item(self, stream: Stream) -> 'MorphBaseInfo':
     """Read one collection item from ``stream``.

     The leading byte selects the type: 0 yields a ``MorphBaseInfo``,
     anything else a ``MorphWordForm``. Class, case, gender, number and
     language follow as shorts; a word form additionally carries its
     normal case, normal full form, undef coefficient and a counted list
     of misc attribute strings.

     Args:
         stream: the stream the item is read from.

     Returns:
         The deserialized item.
     """
     from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
     read_short = SerializerHelper.deserialize_short
     read_string = SerializerHelper.deserialize_string

     kind = stream.readbyte()
     if kind == 0:
         res = MorphBaseInfo()
     else:
         res = MorphWordForm()
     res.class0_ = MorphClass._new53(read_short(stream))
     res.case_ = MorphCase._new29(read_short(stream))
     res.gender = Utils.valToEnum(read_short(stream), MorphGender)
     res.number = Utils.valToEnum(read_short(stream), MorphNumber)
     res.language = MorphLang._new56(read_short(stream))
     if kind == 0:
         return res

     # Word-form specific tail of the record.
     word_form = Utils.asObjectOrNull(res, MorphWordForm)
     word_form.normal_case = read_string(stream)
     word_form.normal_full = read_string(stream)
     word_form.undef_coef = read_short(stream)
     attr_count = SerializerHelper.deserialize_int(stream)
     for _ in range(attr_count):
         # The misc container is only created when there is an attribute.
         if word_form.misc is None:
             word_form.misc = MorphMiscInfo()
         word_form.misc.attrs.append(read_string(stream))
     return res
 def add(self,
         val: str,
         shortval: str,
         gen: 'MorphGender',
         add_other_gender_var: bool = False) -> None:
     """Register a value variant for the given gender.

     Args:
         val: the value; ``None`` is ignored.
         shortval: short value stored alongside ``val``.
         gen: gender of the variant. Anything other than masculine or
             feminine registers the value once for each of the two.
         add_other_gender_var: when true, also ask the morphology
             service for the form of ``val`` in the opposite gender and
             register it if one is returned.
     """
     if val is None:
         return
     # The head is set once, from the first value seen: at most 3 chars.
     if self.head is None:
         self.head = val[0:3] if len(val) > 3 else val

     gender_given = (gen == MorphGender.MASCULINE
                     or gen == MorphGender.FEMINIE)
     if not gender_given:
         self.add(val, shortval, MorphGender.MASCULINE, False)
         self.add(val, shortval, MorphGender.FEMINIE, False)
         return

     # Nothing to do when the same value/gender pair is already present.
     if any(it.value == val and it.gender == gen for it in self.items):
         return
     make_variant = PersonMorphCollection.PersonMorphVariant._new2591
     self.items.append(make_variant(val, gen, shortval))
     if not add_other_gender_var:
         return

     opposite = (MorphGender.MASCULINE
                 if gen == MorphGender.FEMINIE else MorphGender.FEMINIE)
     other_form = MorphologyService.get_wordform(
         val, MorphBaseInfo._new193(MorphClass._new2572(True), opposite))
     if other_form is not None:
         self.items.append(make_variant(other_form, opposite, shortval))
Ejemplo n.º 7
0
 def __init__(self, bi: 'MorphBaseInfo' = None) -> None:
     """Create an undefined morphological description, optionally
     initialised from ``bi``.

     Args:
         bi: when given, its ``copy_to`` is called with the new object
             as the destination.
     """
     self.__m_cla = MorphClass()
     self.__gender = MorphGender.UNDEFINED
     self.__number = MorphNumber.UNDEFINED
     self.__m_cas = MorphCase()
     self.__m_lang = MorphLang()
     if bi is None:
         return
     bi.copy_to(self)
Ejemplo n.º 8
0
 def __deserializeMorphRuleVariant(str0_: 'ByteArrayWrapper',
                                   me: 'MorphEngine') -> 'MorphRuleVariant':
     """Read one rule variant from ``str0_``.

     The stored short is a 1-based index into ``me._m_vars``; the entry
     found there seeds the new variant, after which class, gender,
     number, case and the two normal tails are read.

     Args:
         str0_: wrapper the values are read from.
         me: engine whose ``_m_vars`` list is indexed.

     Returns:
         The variant, or None when the index is outside ``me._m_vars``.
     """
     index = str0_.deserializeShort() - 1
     if not (0 <= index < len(me._m_vars)):
         return None
     variant = MorphRuleVariant._new36(me._m_vars[index])

     morph_class = MorphClass()
     morph_class.value = str0_.deserializeShort()
     # A class flagged both "misc" and "proper" keeps only "proper".
     if morph_class.is_misc and morph_class.is_proper:
         morph_class.is_misc = False
     variant.class0_ = morph_class

     variant.gender = Utils.valToEnum(str0_.deserializeByte(), MorphGender)
     variant.number = Utils.valToEnum(str0_.deserializeByte(), MorphNumber)

     morph_case = MorphCase()
     morph_case.value = str0_.deserializeByte()
     variant.case_ = morph_case

     variant.normal_tail = str0_.deserializeString()
     variant.full_normal_tail = str0_.deserializeString()
     return variant
Ejemplo n.º 9
0
 def deserialize_derivate_group(str0_: 'ByteArrayWrapper',
                                dg: 'DerivateGroup', pos: int) -> None:
     """Fill the derivate group ``dg`` from the serialized data in ``str0_``.

     Read order: attribute flags (short), questions (short), reflexive
     questions (short), prefix (string), a counted list of words, then
     two counted lists of (prefix string, case short) pairs stored in
     ``dg.nexts`` and ``dg.nexts_ref`` respectively.

     Args:
         str0_: wrapper the values are read from.
         dg: group that receives the data.
         pos: passed unchanged to every read call.
             NOTE(review): presumably a position holder advanced by the
             wrapper - confirm against ByteArrayWrapper.
     """
     flags = str0_.deserialize_short(pos)
     if flags & 1:
         dg.is_dummy = True
     if flags & 2:
         dg.not_generate = True
     # Within each group the masks are applied in ascending order, so
     # the highest set bit decides the final value.
     for mask, transitive in ((4, 0), (8, 1)):
         if flags & mask:
             dg.m_transitive = transitive
     for mask, agent_case in ((0x10, 0), (0x20, 1), (0x40, 2)):
         if flags & mask:
             dg.m_rev_agent_case = agent_case

     dg.questions = Utils.valToEnum(str0_.deserialize_short(pos),
                                    NextModelQuestion)
     dg.questions_ref = Utils.valToEnum(str0_.deserialize_short(pos),
                                        NextModelQuestion)
     dg.prefix = str0_.deserialize_string(pos)

     for _ in range(str0_.deserialize_short(pos)):
         word = DerivateWord(dg)
         word.spelling = str0_.deserialize_string(pos)
         word.class0_ = MorphClass()
         word.class0_.value = str0_.deserialize_short(pos)
         word.lang = MorphLang._new10(str0_.deserialize_short(pos))
         word.attrs.value = str0_.deserialize_short(pos)
         dg.words.append(word)

     # The two case maps share one layout and follow each other.
     for target in ("nexts", "nexts_ref"):
         for _ in range(str0_.deserialize_short(pos)):
             # A missing prefix string is stored under the empty key.
             key = Utils.ifNotNull(str0_.deserialize_string(pos), "")
             case = MorphCase()
             case.value = str0_.deserialize_short(pos)
             # The dictionary is only created once there is an entry.
             if getattr(dg, target) is None:
                 setattr(dg, target, dict())
             getattr(dg, target)[key] = case
Ejemplo n.º 10
0
 def get_word_base_info(word: str,
                        lang: 'MorphLang' = None,
                        is_case_nominative: bool = False,
                        in_dict_only: bool = False) -> 'MorphBaseInfo':
     """Collect the combined gender / number / case of a word form.

     The class, gender, number and case of every accepted word form of
     the first analysed token are OR-ed together.

     Args:
         word(str): the word form
         lang(MorphLang): possible language
         is_case_nominative(bool): the word is in the nominative case, so
             word forms with another (defined) case are ignored;
             otherwise any case is accepted
         in_dict_only(bool): when true, do not fall back to the
             non-dictionary word forms

     Returns:
         MorphBaseInfo: the combined morphological information
     """
     mt = Morphology.__m_inner.run(word, False, lang, None, False)
     bi = MorphWordForm()
     cla = MorphClass()
     if (mt is not None and len(mt) > 0):
         # First pass: dictionary word forms only. Second pass (reached
         # only if the first accepted nothing and in_dict_only is
         # false): non-dictionary word forms only.
         for dictionary_pass in (True, False):
             ok = False
             for wf in mt[0].word_forms:
                 if (bool(wf.is_in_dictionary) != dictionary_pass):
                     continue
                 if (is_case_nominative):
                     if (not wf.case_.is_nominative
                             and not wf.case_.is_undefined):
                         continue
                 cla.value |= wf.class0_.value
                 bi.gender = Utils.valToEnum((bi.gender) | (wf.gender),
                                             MorphGender)
                 bi.case_ = (bi.case_) | wf.case_
                 bi.number = Utils.valToEnum((bi.number) | (wf.number),
                                             MorphNumber)
                 # Only the first available misc info is kept.
                 if (wf.misc is not None and bi.misc is None):
                     bi.misc = wf.misc
                 ok = True
             if (ok or in_dict_only):
                 break
     bi.class0_ = cla
     return bi
Ejemplo n.º 11
0
 def clone(self) -> 'MorphCollection':
     """Create a copy of this collection.

     The item list is copied shallowly (the new list references the same
     item objects). The cached aggregate values are transferred only
     when they are up to date; otherwise the copy is left in its freshly
     constructed state.

     Returns:
         The new collection.
     """
     res = MorphCollection()
     own_items = self.__m_items
     if own_items is not None:
         copied = list()
         res.__m_items = copied
         # Best effort: a failure while copying leaves whatever was
         # copied so far.
         try:
             copied.extend(own_items)
         except Exception:
             pass
     if self.__m_need_recalc:
         return res
     res.__m_class = MorphClass._new53(self.__m_class.value)
     res.__m_gender = self.__m_gender
     res.__m_case = MorphCase._new29(self.__m_case.value)
     res.__m_number = self.__m_number
     res.__m_language = MorphLang._new56(self.__m_language.value)
     res.__m_need_recalc = False
     res.__m_voice = self.__m_voice
     return res
Ejemplo n.º 12
0
 def _deserialize(self, str0_: 'ByteArrayWrapper', pos: int) -> None:
     """Fill this group from the serialized data in ``str0_``.

     Read order: attribute flags (short), prefix (string), the ``model``,
     ``cm`` and ``cm_rev`` sub-objects (each through its own
     ``_deserialize``), then a counted list of words. Each word consists
     of spelling, class, language, attributes, aspect, tense, voice and
     a byte-counted list of next-word strings.

     Args:
         str0_: wrapper the values are read from.
         pos: passed unchanged to every read call.
             NOTE(review): presumably a position holder advanced by the
             wrapper - confirm against ByteArrayWrapper.
     """
     flags = str0_.deserialize_short(pos)
     if flags & 1:
         self.is_dummy = True
     if flags & 2:
         self.not_generate = True
     self.prefix = str0_.deserialize_string(pos)
     self.model._deserialize(str0_, pos)
     self.cm._deserialize(str0_, pos)
     self.cm_rev._deserialize(str0_, pos)

     for _word_no in range(str0_.deserialize_short(pos)):
         word = DerivateWord()
         word.spelling = str0_.deserialize_string(pos)

         raw = str0_.deserialize_short(pos)
         word.class0_ = MorphClass()
         word.class0_.value = raw

         raw = str0_.deserialize_short(pos)
         word.lang = MorphLang()
         word.lang.value = raw

         word.attrs.value = str0_.deserialize_short(pos)
         word.aspect = Utils.valToEnum(str0_.deserialize_byte(pos),
                                       MorphAspect)
         word.tense = Utils.valToEnum(str0_.deserialize_byte(pos),
                                      MorphTense)
         word.voice = Utils.valToEnum(str0_.deserialize_byte(pos),
                                      MorphVoice)

         for _next_no in range(str0_.deserialize_byte(pos)):
             follower = str0_.deserialize_string(pos)
             # The list is created as soon as one entry is announced,
             # even if that entry turns out to be None and is skipped.
             if word.next_words is None:
                 word.next_words = list()
             if follower is not None:
                 word.next_words.append(follower)
         self.words.append(word)
Ejemplo n.º 13
0
 def _deserialize(self, stream: Stream) -> None:
     """Restore the collection from ``stream``.

     Read order: class, case, gender, number, voice and language (each a
     short), then an int item count followed by that many items. The
     cached aggregates are taken from the stream as stored, so the
     collection is marked as not needing a recalculation.

     Args:
         stream: the stream the collection is read from.
     """
     from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
     read_short = SerializerHelper.deserialize_short

     self.__m_class = MorphClass._new53(read_short(stream))
     self.__m_case = MorphCase._new29(read_short(stream))
     self.__m_gender = Utils.valToEnum(read_short(stream), MorphGender)
     self.__m_number = Utils.valToEnum(read_short(stream), MorphNumber)
     self.__m_voice = Utils.valToEnum(read_short(stream), MorphVoice)
     self.__m_language = MorphLang._new56(read_short(stream))

     item_count = SerializerHelper.deserialize_int(stream)
     self.__m_items = list()
     for _ in range(item_count):
         item = self.__deserialize_item(stream)
         if item is not None:
             self.__m_items.append(item)
     self.__m_need_recalc = False
Ejemplo n.º 14
0
 def __recalc(self) -> None:
     """Recompute the cached class, gender, number, case, language and
     voice from the items.

     Nothing but the "needs recalculation" flag is touched when there
     are no items. Otherwise class, gender, number and case are reset
     and rebuilt as the union (bitwise OR) over all items. The language
     is only merged when it was unset or undefined on entry. The voice
     is the union of the voices of the verb word forms, but falls back
     to undefined as soon as one of them has no defined voice.
     """
     self.__m_need_recalc = False
     if (not self.__m_items):
         return
     self.__m_class = MorphClass()
     self.__m_gender = MorphGender.UNDEFINED
     self.__m_number = MorphNumber.UNDEFINED
     self.__m_case = MorphCase()
     ca = self.__m_case.is_undefined
     if (self.__m_language is None):
         # The language may be unset; create the holder before merging
         # into it instead of dereferencing None below.
         self.__m_language = MorphLang()
         la = True
     else:
         la = self.__m_language.is_undefined
     self.__m_voice = MorphVoice.UNDEFINED
     verb_has_undef = False
     for it in self.__m_items:
         self.__m_class.value |= it.class0_.value
         self.__m_gender = (Utils.valToEnum(
             (self.__m_gender) | (it.gender), MorphGender))
         if (ca):
             self.__m_case |= it.case_
         self.__m_number = (Utils.valToEnum(
             (self.__m_number) | (it.number), MorphNumber))
         if (la):
             self.__m_language.value |= it.language.value
         if (it.class0_.is_verb and isinstance(it, MorphWordForm)):
             # A word form may carry no misc info at all; its voice is
             # then treated as undefined.
             misc = it.misc
             v = (MorphVoice.UNDEFINED if misc is None else misc.voice)
             if (v == MorphVoice.UNDEFINED):
                 verb_has_undef = True
             else:
                 self.__m_voice = (Utils.valToEnum(
                     (self.__m_voice) | (v), MorphVoice))
     if (verb_has_undef):
         self.__m_voice = MorphVoice.UNDEFINED
Ejemplo n.º 15
0
 def process(self, word: str) -> typing.List['MorphWordForm']:
     """Analyse a single word and return its candidate word forms.

     The word is first matched against the rule tree (``m_root``). If the
     variants found there do not look sufficient, further variants are
     guessed from the word ending through the reverse tree
     (``m_root_reverce``). The result is then sorted and normalised.

     Args:
         word(str): the word; it must be in upper case

     Returns:
         The list of word forms, or None when ``word`` is empty, when a
         word longer than one character contains no vowel, or when no
         variant was found.
     """
     if (Utils.isNullOrEmpty(word)):
         return None
     res = None
     # A word longer than one character must contain at least one vowel.
     if (len(word) > 1):
         i = 0
         while i < len(word):
             ch = word[i]
             if (LanguageHelper.isCyrillicVowel(ch) or LanguageHelper.isLatinVowel(ch)):
                 break
             i += 1
         if (i >= len(word)):
             return res
     mvs = [ ]
     tn = self.m_root
     i = 0
     # Walk the tree one character at a time. At every node the rules are
     # looked up by the not yet consumed remainder of the word.
     while i <= len(word):
         if (tn.lazy_pos > 0):
             self.__loadTreeNode(tn)
         if (tn.rules is not None):
             word_begin = None
             word_end = None
             if (i == 0):
                 word_end = word
             elif (i < len(word)):
                 word_end = word[i:]
             else:
                 word_end = ""
             if (res is None):
                 res = list()
             for r in tn.rules:
                 wrapmvs14 = RefOutArgWrapper(None)
                 inoutres15 = Utils.tryGetValue(r.variants, word_end, wrapmvs14)
                 mvs = wrapmvs14.value
                 if (inoutres15):
                     # The consumed prefix is computed once per node.
                     if (word_begin is None):
                         if (i == len(word)):
                             word_begin = word
                         elif (i > 0):
                             word_begin = word[0:0+i]
                         else:
                             word_begin = ""
                     r.processResult(res, word_begin, mvs)
         if (tn.nodes is None or i >= len(word)):
             break
         ch = ord(word[i])
         wraptn16 = RefOutArgWrapper(None)
         inoutres17 = Utils.tryGetValue(tn.nodes, ch, wraptn16)
         tn = wraptn16.value
         if (not inoutres17):
             break
         i += 1
     # Decide whether variants must additionally be guessed from the ending.
     need_test_unknown_vars = True
     if (res is not None):
         for r in res:
             if ((r.class0_.is_pronoun or r.class0_.is_noun or r.class0_.is_adjective) or (r.class0_.is_misc and r.class0_.is_conjunction) or r.class0_.is_preposition):
                 need_test_unknown_vars = False
             elif (r.class0_.is_adverb and r.normal_case is not None):
                 if (not LanguageHelper.endsWithEx(r.normal_case, "О", "А", None, None)):
                     need_test_unknown_vars = False
                 elif (r.normal_case == "МНОГО"):
                     need_test_unknown_vars = False
             elif (r.class0_.is_verb and len(res) > 1):
                 ok = False
                 for rr in res:
                     if (rr != r and rr.class0_ != r.class0_):
                         ok = True
                         break
                 if (ok and not LanguageHelper.endsWith(word, "ИМ")):
                     need_test_unknown_vars = False
     # Cyrillic words with fewer than two vowels or fewer than two other
     # characters are not guessed.
     if (need_test_unknown_vars and LanguageHelper.isCyrillicChar(word[0])):
         gl = 0
         sog = 0
         j = 0
         while j < len(word):
             if (LanguageHelper.isCyrillicVowel(word[j])):
                 gl += 1
             else:
                 sog += 1
             j += 1
         if ((gl < 2) or (sog < 2)):
             need_test_unknown_vars = False
     # A single variant with certain attribute combinations is accepted
     # as it is.
     if (need_test_unknown_vars and res is not None and len(res) == 1):
         if (res[0].class0_.is_verb):
             if ("н.вр." in res[0].misc.attrs and "нес.в." in res[0].misc.attrs and not "страд.з." in res[0].misc.attrs):
                 need_test_unknown_vars = False
             elif ("б.вр." in res[0].misc.attrs and "сов.в." in res[0].misc.attrs):
                 need_test_unknown_vars = False
             elif (res[0].normal_case is not None and LanguageHelper.endsWith(res[0].normal_case, "СЯ")):
                 need_test_unknown_vars = False
         if (res[0].class0_.is_undefined and "прдктв." in res[0].misc.attrs):
             need_test_unknown_vars = False
     if (need_test_unknown_vars):
         if (self.m_root_reverce is None):
             return res
         tn = self.m_root_reverce
         tn0 = None
         # Walk the reverse tree from the last character towards the
         # first until a node with reverse variants is reached.
         for i in range(len(word) - 1, -1, -1):
             if (tn.lazy_pos > 0):
                 self.__loadTreeNode(tn)
             ch = ord(word[i])
             if (tn.nodes is None):
                 break
             wrapnext18 = RefOutArgWrapper(None)
             inoutres19 = Utils.tryGetValue(tn.nodes, ch, wrapnext18)
             next0_ = wrapnext18.value
             if (not inoutres19):
                 break
             tn = next0_
             if (tn.lazy_pos > 0):
                 self.__loadTreeNode(tn)
             if (tn.reverce_variants is not None):
                 tn0 = tn
                 break
         else: i = -1
         if (tn0 is not None):
             # The part of the word in front of the matched ending must
             # be short (i < 4) or contain a vowel at or before index i.
             glas = i < 4
             while i >= 0:
                 if (LanguageHelper.isCyrillicVowel(word[i]) or LanguageHelper.isLatinVowel(word[i])):
                     glas = True
                     break
                 i -= 1
             if (glas):
                 for mv in tn0.reverce_variants:
                     if (((not mv.class0_.is_verb and not mv.class0_.is_adjective and not mv.class0_.is_noun) and not mv.class0_.is_proper_surname and not mv.class0_.is_proper_geo) and not mv.class0_.is_proper_secname):
                         continue
                     ok = False
                     # res is still None when no tree rule produced a
                     # list; there is then no dictionary form to compare
                     # with.
                     for rr in (res or ()):
                         if (rr.is_in_dictionary):
                             if (rr.class0_ == mv.class0_ or rr.class0_.is_noun):
                                 ok = True
                                 break
                             if (not mv.class0_.is_adjective and rr.class0_.is_verb):
                                 ok = True
                                 break
                     if (ok):
                         continue
                     if (len(mv.tail) > 0 and not LanguageHelper.endsWith(word, mv.tail)):
                         continue
                     r = MorphWordForm(mv, word)
                     # Without a result list nothing can be equal to r.
                     if (res is None or not MorphWordForm._hasMorphEquals(res, r)):
                         r.undef_coef = mv.coef
                         if (res is None):
                             res = list()
                         res.append(r)
     # Special case: "ПРИ" never keeps its geo variants.
     if (word == "ПРИ" and res is not None):
         for i in range(len(res) - 1, -1, -1):
             if (res[i].class0_.is_proper_geo):
                 del res[i]
         else: i = -1
     if (res is None or len(res) == 0):
         return None
     MorphEngine.__sort(res, word)
     for v in res:
         if (v.normal_case is None):
             v.normal_case = word
         if (v.class0_.is_verb):
             # For verbs ending in "ТЬСЯ" the full normal form is the
             # normal case without its last two characters.
             if (v.normal_full is None and LanguageHelper.endsWith(v.normal_case, "ТЬСЯ")):
                 v.normal_full = v.normal_case[0:0+len(v.normal_case) - 2]
         v.language = self.language
         if (v.class0_.is_preposition):
             v.normal_case = LanguageHelper.normalizePreposition(v.normal_case)
     mc = MorphClass()
     # Iterate backwards so that deleting an element does not shift the
     # indices still to be visited.
     for i in range(len(res) - 1, -1, -1):
         if (not res[i].is_in_dictionary and res[i].class0_.is_adjective and len(res) > 1):
             if ("к.ф." in res[i].misc.attrs or "неизм." in res[i].misc.attrs):
                 del res[i]
                 continue
         if (res[i].is_in_dictionary):
             mc.value |= res[i].class0_.value
     else: i = -1
     # If all dictionary variants together are exactly "verb", the
     # coefficient of adjective variants above 100 is reset to 0.
     if (mc == MorphClass.VERB and len(res) > 1):
         for r in res:
             if (r.undef_coef > (100) and r.class0_ == MorphClass.ADJECTIVE):
                 r.undef_coef = (0)
     if (len(res) == 0):
         return None
     return res
Ejemplo n.º 16
0
 def __init__(self) -> None:
     """Create a description in which class, case and language are fresh
     empty objects and gender and number are undefined."""
     # Wrapper objects first ...
     self.__m_cla = MorphClass()
     self.__m_cas = MorphCase()
     self.__m_lang = MorphLang()
     # ... then the plain enumeration values.
     self.__gender = MorphGender.UNDEFINED
     self.__number = MorphNumber.UNDEFINED
Ejemplo n.º 17
0
 def get_morph_class_in_dictionary(self) -> 'MorphClass':
     """Return the union of the classes of all dictionary word forms.

     Only items of ``self.morph`` that are ``MorphWordForm`` instances
     with ``is_in_dictionary`` set contribute; without any such item the
     result is a freshly created ``MorphClass``.
     """
     combined = MorphClass()
     for item in self.morph.items:
         if not isinstance(item, MorphWordForm):
             continue
         if item.is_in_dictionary:
             combined |= item.class0_
     return combined
Ejemplo n.º 18
0
 def get_normal_case_text(self,
                          mc: 'MorphClass' = None,
                          num: 'MorphNumber' = MorphNumber.UNDEFINED,
                          gender: 'MorphGender' = MorphGender.UNDEFINED,
                          keep_chars: bool = False) -> str:
     """Return the normalized text of this token.

     The first word form in ``self.morph.items`` that passes the class
     and gender filters determines the result. If no word form passes,
     the token's own ``term`` (or, for a singular request with a class,
     the form returned by the morphology service) is used instead.

     Args:
         mc: required class; None or an undefined class disables the
             class filter. A preposition class short-circuits to the
             normalized preposition of ``self.term``.
         num: requested number; only SINGULAR changes the result.
         gender: requested gender; UNDEFINED disables the gender filter.
         keep_chars: when true, the result is lower-cased or capitalized
             according to ``self.chars``.

     Returns:
         The normalized text. NOTE(review): despite the ``str``
         annotation, None is returned when no word form was selected but
         some item that passed the filters has a defined case.
     """
     from pullenti.ner.core.MiscHelper import MiscHelper
     # Stays True until an item that passed the filters has a defined case.
     empty = True
     if (mc is not None and mc.is_preposition):
         return LanguageHelper.normalize_preposition(self.term)
     for it in self.morph.items:
         # Class filter: the item's class must intersect the requested one.
         if (mc is not None and not mc.is_undefined):
             cc = (it.class0_) & mc
             if (cc.is_undefined):
                 continue
             # A non-proper "misc" intersection only counts on an exact
             # class match.
             if (cc.is_misc and not cc.is_proper and mc != it.class0_):
                 continue
         wf = Utils.asObjectOrNull(it, MorphWordForm)
         # Set when the full normal form is preferred over the normal case.
         normal_full = False
         if (gender != MorphGender.UNDEFINED):
             # The item's gender does not include the requested one.
             if (((it.gender) & (gender)) == (MorphGender.UNDEFINED)):
                 if ((gender == MorphGender.MASCULINE and
                      ((it.gender != MorphGender.UNDEFINED or it.number
                        == MorphNumber.PLURAL)) and wf is not None)
                         and wf.normal_full is not None):
                     normal_full = True
                 elif (gender == MorphGender.MASCULINE
                       and it.class0_.is_personal_pronoun):
                     pass
                 else:
                     continue
         if (not it.case_.is_undefined):
             empty = False
         # The first word form reaching this point decides the result.
         if (wf is not None):
             res = None
             if (num == MorphNumber.SINGULAR
                     and it.number == MorphNumber.PLURAL
                     and wf.normal_full is not None):
                 le = len(wf.normal_case)
                 # Keep the normal case when it is the full form plus a
                 # trailing "СЯ".
                 if ((le == (len(wf.normal_full) + 2) and le > 4
                      and wf.normal_case[le - 2] == 'С')
                         and wf.normal_case[le - 1] == 'Я'):
                     res = wf.normal_case
                 else:
                     # NOTE(review): both branches of this conditional
                     # are identical - confirm the intended alternative.
                     res = (wf.normal_full
                            if normal_full else wf.normal_full)
             else:
                 res = (wf.normal_full if normal_full else
                        (Utils.ifNotNull(wf.normal_case, self.term)))
             # Irregular singular: "ДЕТИ" becomes "РЕБЕНОК".
             if (num == MorphNumber.SINGULAR and mc is not None
                     and mc == MorphClass.NOUN):
                 if (res == "ДЕТИ"):
                     res = "РЕБЕНОК"
             if (keep_chars):
                 if (self.chars.is_all_lower):
                     res = res.lower()
                 elif (self.chars.is_capital_upper):
                     res = MiscHelper.convert_first_char_upper_and_other_lower(
                         res)
             return res
     if (not empty):
         return None
     # Fallback: no usable morphology; start from the term itself.
     te = None
     if (num == MorphNumber.SINGULAR and mc is not None):
         bi = MorphBaseInfo._new492(MorphClass._new53(mc.value), gender,
                                    MorphNumber.SINGULAR,
                                    self.morph.language)
         vars0_ = MorphologyService.get_wordform(self.term, bi)
         if (vars0_ is not None):
             te = vars0_
     if (te is None):
         te = self.term
     if (keep_chars):
         if (self.chars.is_all_lower):
             return te.lower()
         elif (self.chars.is_capital_upper):
             return MiscHelper.convert_first_char_upper_and_other_lower(te)
     return te
Ejemplo n.º 19
0
 def copy_to(self, dst: 'MorphBaseInfo') -> None:
     """Copy class, gender, number, case and language into ``dst``.

     Class, case and language are passed through the ``MorphClass``,
     ``MorphCase`` and ``MorphLang`` constructors; gender and number are
     assigned as they are.

     Args:
         dst: the object that receives the values.
     """
     class_copy = MorphClass(self.class0_)
     dst.class0_ = class_copy

     dst.gender = self.gender
     dst.number = self.number

     case_copy = MorphCase(self.case_)
     dst.case_ = case_copy

     language_copy = MorphLang(self.language)
     dst.language = language_copy
Ejemplo n.º 20
0
 def get_normal_case_text(self,
                          mc: 'MorphClass' = None,
                          num: 'MorphNumber' = MorphNumber.UNDEFINED,
                          gender: 'MorphGender' = MorphGender.UNDEFINED,
                          keep_chars: bool = False) -> str:
     """Return the normalized text of this item.

     The text variants in ``self.morph.items`` are scanned in order and
     the best one is chosen using their ``undef_coef`` and a small
     preference score. If no variant yields a value, the text is built
     from the underlying tokens.

     Args:
         mc: requested class; only its adjective flag is inspected here,
             otherwise it is passed on to the tokens.
         num: requested number (SINGULAR and PLURAL influence the choice).
         gender: requested gender, used for adjective forms.
         keep_chars: passed to ``__corr_chars`` and to the token-level
             fallbacks to control the handling of the original letter
             case.

     Returns:
         The normalized text; "?" when nothing could be produced.
     """
     # A single referent token answers for itself.
     if ((isinstance(self.begin_token, ReferentToken))
             and self.begin_token == self.end_token):
         return self.begin_token.get_normal_case_text(
             mc, num, gender, keep_chars)
     res = None
     # Coefficient of the most recently accepted variant.
     max_coef = 0
     # Preference score of the accepted variant; -1 while none was scored.
     def_coef = -1
     for it in self.morph.items:
         v = Utils.asObjectOrNull(it, NounPhraseItemTextVar)
         if (v is None):
             continue
         # A variant with a positive coefficient is ignored once a scored
         # variant exists or its coefficient is below the current one.
         if (v.undef_coef > 0
                 and (((v.undef_coef < max_coef) or def_coef >= 0))):
             continue
         # Singular requested and the variant carries a singular value.
         if (num == MorphNumber.SINGULAR
                 and v.single_number_value is not None):
             if (mc is not None and ((gender == MorphGender.NEUTER
                                      or gender == MorphGender.FEMINIE))
                     and mc.is_adjective):
                 # Ask the morphology service for the nominative singular
                 # adjective form in the requested gender.
                 bi = MorphBaseInfo._new401(MorphClass._new53(mc.value),
                                            gender, MorphNumber.SINGULAR,
                                            MorphCase.NOMINATIVE,
                                            self.morph.language)
                 str0_ = MorphologyService.get_wordform(
                     v.single_number_value, bi)
                 if (str0_ is not None):
                     res = str0_
             else:
                 res = v.single_number_value
             if (v.undef_coef == 0):
                 break
             max_coef = v.undef_coef
             continue
         if (Utils.isNullOrEmpty(v.normal_value)):
             continue
         # A value starting with a digit, requested as an adjective, is
         # converted through the number helper.
         if (str.isdigit(v.normal_value[0]) and mc is not None
                 and mc.is_adjective):
             val = 0
             wrapval402 = RefOutArgWrapper(0)
             inoutres403 = Utils.tryParseInt(v.normal_value, wrapval402)
             val = wrapval402.value
             if (inoutres403):
                 str0_ = NumberHelper.get_number_adjective(
                     val, gender,
                     (MorphNumber.SINGULAR if num == MorphNumber.SINGULAR
                      or val == 1 else MorphNumber.PLURAL))
                 if (str0_ is not None):
                     res = str0_
                     if (v.undef_coef == 0):
                         break
                     max_coef = v.undef_coef
                     continue
         res1 = it.normal_value
         # Irregular singular forms.
         if (num == MorphNumber.SINGULAR):
             if (res1 == "ДЕТИ"):
                 res1 = "РЕБЕНОК"
             elif (res1 == "ЛЮДИ"):
                 res1 = "ЧЕЛОВЕК"
         max_coef = v.undef_coef
         # A variant with a positive coefficient is taken provisionally;
         # the scan continues.
         if (v.undef_coef > 0):
             res = res1
             continue
         # Preference score of this variant.
         def_co = 0
         if (mc is not None and mc.is_adjective and v.undef_coef == 0):
             pass
         elif (
             ((isinstance(self.begin_token, TextToken))
              and res1 == self.begin_token.term and it.case_.is_nominative)
                 and it.number == MorphNumber.SINGULAR):
             # The value equals the token text and is nominative singular.
             def_co = 1
         if (num == MorphNumber.PLURAL and
             ((v.number) & (MorphNumber.PLURAL)) == (MorphNumber.PLURAL)):
             def_co += 3
         if (res is None or def_co > def_coef):
             res = res1
             def_coef = def_co
             # Any positive score ends the search.
             if (def_co > 0):
                 break
     if (res is not None):
         return self.__corr_chars(res, keep_chars)
     # No variant produced a value: fall back to the tokens themselves.
     if (res is None and self.begin_token == self.end_token):
         res = self.begin_token.get_normal_case_text(
             mc, num, gender, keep_chars)
     elif (res is None):
         res = self.begin_token.get_normal_case_text(
             mc, num, gender, keep_chars)
         if (res is None):
             res = MiscHelper.get_text_value_of_meta_token(
                 self, (GetTextAttr.KEEPREGISTER
                        if keep_chars else GetTextAttr.NO))
         else:
             # Normalized first token followed by the text of the rest.
             res = "{0} {1}".format(
                 res,
                 MiscHelper.get_text_value(
                     self.begin_token.next0_, self.end_token,
                     (GetTextAttr.KEEPREGISTER
                      if keep_chars else GetTextAttr.NO)))
     return Utils.ifNotNull(res, "?")
Ejemplo n.º 21
0
 def getNormalCaseText(self,
                       mc: 'MorphClass' = None,
                       single_number: bool = False,
                       gender: 'MorphGender' = MorphGender.UNDEFINED,
                       keep_chars: bool = False) -> str:
     """Pick a normalized text for this token from its morph variants.

     When the token wraps exactly one ReferentToken the call is delegated
     to that inner token. Otherwise the items of ``self.morph`` are scanned
     and one candidate string is selected; the chosen string is passed
     through ``self.__corrChars`` together with ``keep_chars``.

     Args:
         mc: optional morph class; when it is an adjective class, numeric
             values and (for neuter/feminine gender) singular values are
             converted through the morphology helpers.
         single_number: prefer the singular value of a variant if it has one.
         gender: gender forwarded to the word-form helpers.
         keep_chars: forwarded to ``__corrChars`` / the delegated call.

     Returns:
         The selected text, or "?" when nothing could be produced.
     """
     first = self.begin_token
     if isinstance(first, ReferentToken) and first == self.end_token:
         return first.getNormalCaseText(mc, single_number, gender,
                                        keep_chars)

     # Substitutions applied to a candidate when a singular form is wanted.
     singular_forms = {"ДЕТИ": "РЕБЕНОК", "ЛЮДИ": "ЧЕЛОВЕК"}

     best = None
     max_coef = 0
     def_coef = -1
     for it in self.morph.items:
         v = Utils.asObjectOrNull(it, NounPhraseItemTextVar)
         coef = v.undef_coef
         # Skip uncertain variants once a better or definite one was seen.
         if coef > 0 and (coef < max_coef or def_coef >= 0):
             continue

         if single_number and v.single_number_value is not None:
             inflect = (mc is not None
                        and gender in (MorphGender.NEUTER,
                                       MorphGender.FEMINIE)
                        and mc.is_adjective)
             if inflect:
                 info = MorphBaseInfo._new467(MorphClass(mc), gender,
                                              MorphNumber.SINGULAR,
                                              MorphCase.NOMINATIVE,
                                              self.morph.language)
                 form = Morphology.getWordform(v.single_number_value, info)
                 if form is not None:
                     best = form
             else:
                 best = v.single_number_value
             if coef == 0:
                 break
             max_coef = coef
             continue

         if Utils.isNullOrEmpty(v.normal_value):
             continue

         if (v.normal_value[0].isdigit() and mc is not None
                 and mc.is_adjective):
             parsed = RefOutArgWrapper(0)
             if Utils.tryParseInt(v.normal_value, parsed):
                 val = parsed.value
                 if single_number or val == 1:
                     target_number = MorphNumber.SINGULAR
                 else:
                     target_number = MorphNumber.PLURAL
                 adjective = NumberHelper.getNumberAdjective(
                     val, gender, target_number)
                 if adjective is not None:
                     best = adjective
                     if coef == 0:
                         break
                     max_coef = coef
                     continue

         text = it.normal_value
         if single_number:
             text = singular_forms.get(text, text)
         max_coef = coef
         if coef > 0:
             best = text
             continue

         # Score 1 means the candidate equals the first token's own term in
         # nominative singular; adjective requests never get that bonus.
         adjective_request = (mc is not None and mc.is_adjective
                              and coef == 0)
         if adjective_request:
             score = 0
         elif (isinstance(self.begin_token, TextToken)
               and text == self.begin_token.term
               and it.case_.is_nominative
               and it.number == MorphNumber.SINGULAR):
             score = 1
         else:
             score = 0

         if best is None or score > def_coef:
             best = text
             def_coef = score
             if score > 0:
                 break

     if best is not None:
         return self.__corrChars(best, keep_chars)
     if self.begin_token == self.end_token:
         best = self.begin_token.getNormalCaseText(mc, single_number,
                                                   gender, keep_chars)
     return Utils.ifNotNull(best, "?")
Ejemplo n.º 22
0
 def process(self, word: str) -> typing.List['MorphWordForm']:
     """Collect the morphological word forms for ``word``.

     The word is first matched against the forward tree rooted at
     ``self.m_root``; every rule attached to a visited node contributes
     variants through ``__process_result``. If the collected forms do not
     rule it out, the word ending is additionally matched against the
     reverse tree (``self.m_root_reverce``) to add forms that are not
     backed by the dictionary. The result is sorted and post-processed
     before being returned.

     Args:
         word: the word to analyse (the checks below compare against
             upper-case literals such as "ПРИ" — presumably callers pass
             upper-cased text; confirm against the caller).

     Returns:
         A list of word forms, or ``None`` when ``word`` is empty, when a
         multi-character word contains no vowel, or when no form remains.
     """
     if (Utils.isNullOrEmpty(word)):
         return None
     res = None
     # A word of two or more characters without any vowel is not analysed.
     if (len(word) > 1):
         if (not any(LanguageHelper.is_cyrillic_vowel(ch)
                     or LanguageHelper.is_latin_vowel(ch) for ch in word)):
             return None

     # Forward walk: descend the tree one character at a time and apply the
     # rules of every node on the path to the remaining tail of the word.
     tn = self.m_root
     i = 0
     while i <= len(word):
         if (tn.lazy_pos > 0):
             self.__load_tree_node(tn)
         if (tn.rule_ids is not None):
             word_begin = word[:i]
             word_end = word[i:]
             if (res is None):
                 res = list()
             for rid in tn.rule_ids:
                 r = self.get_rule(rid)
                 mvs = r.get_vars(word_end)
                 if (mvs is None):
                     continue
                 self.__process_result(res, word_begin, mvs)
         if (tn.nodes is None or i >= len(word)):
             break
         ch = ord(word[i])
         wraptn9 = RefOutArgWrapper(None)
         inoutres10 = Utils.tryGetValue(tn.nodes, ch, wraptn9)
         tn = wraptn9.value
         if (not inoutres10):
             break
         i += 1

     # Decide whether the reverse-tree fallback is still needed.
     need_test_unknown_vars = True
     if (res is not None):
         for r in res:
             if ((r.class0_.is_pronoun or r.class0_.is_noun or r.class0_.is_adjective) or (r.class0_.is_misc and r.class0_.is_conjunction) or r.class0_.is_preposition):
                 need_test_unknown_vars = False
             elif (r.class0_.is_adverb and r.normal_case is not None):
                 if (not LanguageHelper.ends_with_ex(r.normal_case, "О", "А", None, None)):
                     need_test_unknown_vars = False
                 elif (r.normal_case == "МНОГО"):
                     need_test_unknown_vars = False
             elif (r.class0_.is_verb and len(res) > 1):
                 ok = False
                 for rr in res:
                     if (rr != r and rr.class0_ != r.class0_):
                         ok = True
                         break
                 if (ok and not LanguageHelper.ends_with(word, "ИМ")):
                     need_test_unknown_vars = False
     if (need_test_unknown_vars and LanguageHelper.is_cyrillic_char(word[0])):
         # Require at least two vowels and two non-vowels in the word.
         gl = 0
         sog = 0
         for letter in word:
             if (LanguageHelper.is_cyrillic_vowel(letter)):
                 gl += 1
             else:
                 sog += 1
         if ((gl < 2) or (sog < 2)):
             need_test_unknown_vars = False
     if (need_test_unknown_vars and res is not None and len(res) == 1):
         only = res[0]
         if (only.class0_.is_verb):
             if ("н.вр." in only.misc.attrs and "нес.в." in only.misc.attrs and "страд.з." not in only.misc.attrs):
                 need_test_unknown_vars = False
             elif ("б.вр." in only.misc.attrs and "сов.в." in only.misc.attrs):
                 need_test_unknown_vars = False
             elif ("инф." in only.misc.attrs and "сов.в." in only.misc.attrs):
                 need_test_unknown_vars = False
             elif (only.normal_case is not None and LanguageHelper.ends_with(only.normal_case, "СЯ")):
                 need_test_unknown_vars = False
         if (only.class0_.is_undefined and "прдктв." in only.misc.attrs):
             need_test_unknown_vars = False

     if (need_test_unknown_vars):
         if (self.m_root_reverce is None):
             return res
         # Reverse walk: follow the word from its last character until a
         # node carrying reverse variants is reached.
         tn = self.m_root_reverce
         tn0 = self.m_root_reverce
         for i in range(len(word) - 1, -1, -1):
             if (tn.lazy_pos > 0):
                 self.__load_tree_node(tn)
             ch = ord(word[i])
             if (tn.nodes is None):
                 break
             if (ch not in tn.nodes):
                 break
             tn = tn.nodes[ch]
             if (tn.lazy_pos > 0):
                 self.__load_tree_node(tn)
             if (tn.reverce_variants is not None):
                 tn0 = tn
                 break
         else:
             # Loop ran to completion: mark that no position remains.
             i = -1
         if (tn0 != self.m_root_reverce):
             # The part of the word before the matched ending must contain
             # a vowel unless the ending starts within the first 4 chars.
             glas = i < 4
             while i >= 0:
                 if (LanguageHelper.is_cyrillic_vowel(word[i]) or LanguageHelper.is_latin_vowel(word[i])):
                     glas = True
                     break
                 i -= 1
             if (glas):
                 for mvref in tn0.reverce_variants:
                     mv = self.get_rule_var(mvref.rule_id, mvref.variant_id)
                     if (mv is None):
                         continue
                     if (((not mv.class0_.is_verb and not mv.class0_.is_adjective and not mv.class0_.is_noun) and not mv.class0_.is_proper_surname and not mv.class0_.is_proper_geo) and not mv.class0_.is_proper_secname):
                         continue
                     ok = False
                     # res is still None when the forward walk matched no
                     # rule; iterating it unguarded raised TypeError.
                     if (res is not None):
                         for rr in res:
                             if (rr.is_in_dictionary):
                                 if (rr.class0_ == mv.class0_ or rr.class0_.is_noun):
                                     ok = True
                                     break
                                 if (not mv.class0_.is_adjective and rr.class0_.is_verb):
                                     ok = True
                                     break
                     if (ok):
                         continue
                     if (len(mv.tail) > 0 and not LanguageHelper.ends_with(word, mv.tail)):
                         continue
                     r = MorphWordForm(mv, word, self.get_misc_info(mv.misc_info_id))
                     # NOTE(review): _has_morph_equals receives res even when
                     # it is None — confirm that helper tolerates None.
                     if (not r._has_morph_equals(res)):
                         r.undef_coef = mvref.coef
                         if (res is None):
                             res = list()
                         res.append(r)

     if (word == "ПРИ" and res is not None):
         # Drop geographic proper-name readings of this word; iterate
         # backwards so deletions do not shift pending indexes.
         for i in range(len(res) - 1, -1, -1):
             if (res[i].class0_.is_proper_geo):
                 del res[i]
     if (res is None or len(res) == 0):
         return None
     self.__sort(res, word)
     for v in res:
         if (v.normal_case is None):
             v.normal_case = word
         if (v.class0_.is_verb):
             if (v.normal_full is None and LanguageHelper.ends_with(v.normal_case, "ТЬСЯ")):
                 # Strip the trailing two characters of the reflexive form.
                 v.normal_full = v.normal_case[:-2]
         v.language = self.language
         if (v.class0_.is_preposition):
             v.normal_case = LanguageHelper.normalize_preposition(v.normal_case)
     # Accumulate the classes of dictionary-backed forms while removing
     # non-dictionary adjective forms flagged "к.ф." or "неизм.".
     mc = MorphClass()
     for i in range(len(res) - 1, -1, -1):
         if (not res[i].is_in_dictionary and res[i].class0_.is_adjective and len(res) > 1):
             if ("к.ф." in res[i].misc.attrs or "неизм." in res[i].misc.attrs):
                 del res[i]
                 continue
         if (res[i].is_in_dictionary):
             mc.value |= res[i].class0_.value
     if (mc == MorphClass.VERB and len(res) > 1):
         for r in res:
             if (r.undef_coef > (100) and r.class0_ == MorphClass.ADJECTIVE):
                 r.undef_coef = (0)
     if (len(res) == 0):
         return None
     return res
Ejemplo n.º 23
0
 def getNormalCaseText(self,
                       mc: 'MorphClass' = None,
                       single_number: bool = False,
                       gender: 'MorphGender' = MorphGender.UNDEFINED,
                       keep_chars: bool = False) -> str:
     """Return a normalized form of this token's term.

     The first item of ``self.morph`` that is a MorphWordForm and passes
     the class and gender filters determines the result. If no such item
     exists and none of the accepted items had a defined case, a fallback
     is derived from ``self.term`` itself.

     Args:
         mc: optional morph class filter; a preposition class short-cuts
             to ``LanguageHelper.normalizePreposition``.
         single_number: request a singular form where one is available.
         gender: gender filter applied to the morph items.
         keep_chars: re-apply the token's casing (all-lower or
             capitalized) to the result.

     Returns:
         The normalized text, or ``None`` when some accepted item had a
         defined case but no word form produced a result.
     """
     from pullenti.ner.core.MiscHelper import MiscHelper
     empty = True
     if mc is not None and mc.is_preposition:
         return LanguageHelper.normalizePreposition(self.term)

     for it in self.morph.items:
         # Class filter.
         if mc is not None and not mc.is_undefined:
             common = it.class0_.value & mc.value
             if common == 0:
                 continue
             if (MorphClass.isMiscInt(common)
                     and not MorphClass.isProperInt(common)
                     and mc.value != it.class0_.value):
                 continue

         wf = Utils.asObjectOrNull(it, MorphWordForm)

         # Gender filter: a mismatch is tolerated only for a masculine
         # request, either via the full normal form or for a personal
         # pronoun.
         use_full = False
         if (gender != MorphGender.UNDEFINED
                 and (it.gender & gender) == MorphGender.UNDEFINED):
             if gender != MorphGender.MASCULINE:
                 continue
             if ((it.gender != MorphGender.UNDEFINED
                  or it.number == MorphNumber.PLURAL)
                     and wf is not None and wf.normal_full is not None):
                 use_full = True
             elif not it.class0_.is_personal_pronoun:
                 continue

         if not it.case_.is_undefined:
             empty = False
         if wf is None:
             continue

         if (single_number and it.number == MorphNumber.PLURAL
                 and wf.normal_full is not None):
             le = len(wf.normal_case)
             # Keep normal_case when it is normal_full plus a two-letter
             # reflexive ending.
             if (le == len(wf.normal_full) + 2 and le > 4
                     and wf.normal_case[-2] == 'С'
                     and wf.normal_case[-1] == 'Я'):
                 res = wf.normal_case
             else:
                 res = wf.normal_full
         elif use_full:
             res = wf.normal_full
         else:
             res = Utils.ifNotNull(wf.normal_case, self.term)

         if (single_number and mc is not None and mc == MorphClass.NOUN
                 and res == "ДЕТИ"):
             res = "РЕБЕНОК"

         if keep_chars:
             if self.chars.is_all_lower:
                 res = res.lower()
             elif self.chars.is_capital_upper:
                 res = MiscHelper.convertFirstCharUpperAndOtherLower(res)
         return res

     if not empty:
         return None

     # Fallback: work from the raw term.
     te = None
     if single_number and mc is not None:
         info = MorphBaseInfo._new549(MorphClass(mc), gender,
                                      MorphNumber.SINGULAR,
                                      self.morph.language)
         te = Morphology.getWordform(self.term, info)
     if (self.chars.is_cyrillic_letter and te is None
             and len(self.term) > 3):
         last = self.term[-1]
         before_last = self.term[-2]
         if last == 'М' and before_last in ('О', 'А'):
             te = self.term[:-2]
         elif (not LanguageHelper.isCyrillicVowel(before_last)
               and LanguageHelper.isCyrillicVowel(last)):
             te = self.term[:-1]
     if te is None:
         te = self.term

     if keep_chars:
         if self.chars.is_all_lower:
             return te.lower()
         if self.chars.is_capital_upper:
             return MiscHelper.convertFirstCharUpperAndOtherLower(te)
     return te