Esempio n. 1
0
 def initialize(lang: 'MorphLang' = None) -> None:
     """ Initialize the service. Each analyzer has to be initialized separately.
     If Sdk.Initialize() is called, it performs the initialization of the service
     and of all analyzers.

     The call does nothing when the service is already marked as initialized.

     Args:
         lang(MorphLang): required languages (by default, Russian and English)

     """
     from pullenti.ner.core.internal.NumberExHelper import NumberExHelper
     from pullenti.ner.core.internal.BlockLine import BlockLine
     from pullenti.ner.core.internal.NounPhraseItem import NounPhraseItem
     from pullenti.ner.core.PrepositionHelper import PrepositionHelper
     from pullenti.ner.core.ConjunctionHelper import ConjunctionHelper
     if (ProcessorService.__m_inited):
         return
     ProcessorService.__m_inited = True
     MorphologyService.initialize(lang)
     DerivateService.initialize(lang)
     # ASSIGN_ALL_TEXTS_AS_NORMAL is a class-level switch on Termin that is turned on
     # only for the duration of the helper initializers below. try/finally makes sure
     # it is switched off again even when one of them raises, so a failed start-up
     # does not leave the flag set for the rest of the process.
     Termin.ASSIGN_ALL_TEXTS_AS_NORMAL = True
     try:
         PrepositionHelper._initialize()
         ConjunctionHelper._initialize()
         NounPhraseItem._initialize()
         NumberHelper._initialize()
         NumberExHelper._initialize()
         BlockLine.initialize()
     finally:
         Termin.ASSIGN_ALL_TEXTS_AS_NORMAL = False
 def add(self,
         val: str,
         shortval: str,
         gen: 'MorphGender',
         add_other_gender_var: bool = False) -> None:
     """ Register a value variant for the given gender.

     Nothing happens when ``val`` is None. If ``self.head`` is not set yet it becomes
     the first three characters of ``val`` (or the whole value when it is not longer
     than three characters).

     For a masculine or feminine ``gen`` the variant is appended to ``self.items``
     unless an item with the same value and gender is already present. For any other
     ``gen`` the value is registered twice: once as masculine and once as feminine.

     Args:
         val(str): the value to register
         shortval(str): short value stored together with the variant
         gen(MorphGender): gender of the variant
         add_other_gender_var(bool): also add the word form of ``val`` in the opposite
             gender, when MorphologyService.get_wordform returns one
     """
     if val is None:
         return
     if self.head is None:
         self.head = val[0:3] if len(val) > 3 else val

     if gen not in (MorphGender.MASCULINE, MorphGender.FEMINIE):
         # Gender is not a single concrete one: register the value for both genders.
         self.add(val, shortval, MorphGender.MASCULINE, False)
         self.add(val, shortval, MorphGender.FEMINIE, False)
         return

     if any(known.value == val and known.gender == gen for known in self.items):
         return

     make_variant = PersonMorphCollection.PersonMorphVariant._new2591
     self.items.append(make_variant(val, gen, shortval))
     if not add_other_gender_var:
         return

     if gen == MorphGender.FEMINIE:
         opposite = MorphGender.MASCULINE
     else:
         opposite = MorphGender.FEMINIE
     target_info = MorphBaseInfo._new193(MorphClass._new2572(True), opposite)
     opposite_form = MorphologyService.get_wordform(val, target_info)
     if opposite_form is not None:
         self.items.append(make_variant(opposite_form, opposite, shortval))
Esempio n. 3
0
 def __merge_letters(self) -> None:
     """ Merge a run of single-character tokens into one word token.

     Walks the token chain from ``self.first_token``. A run starts at a one-letter
     token preceded by more than two whitespaces (or exactly two, when the previously
     examined run was merged into a dictionary word) and continues over following
     one-character tokens that are each preceded by exactly one whitespace. The run
     must contain more than three continuation tokens (more than one right after a
     merged word). The source texts of the run are concatenated and analysed with
     MorphologyService.process; when that gives exactly one morph token having a
     word form that is in the dictionary, the run is replaced in the chain by a new
     TextToken spanning it.
     """
     before_word = False
     tmp = io.StringIO()
     t = self.first_token
     first_pass3055 = True
     while True:
         # Generated "for" loop: step to the next token on every pass except the first.
         if first_pass3055: first_pass3055 = False
         else: t = t.next0_
         if (t is None): break
         tt = Utils.asObjectOrNull(t, TextToken)
         # Utils.asObjectOrNull is expected (per its name) to give None for a token
         # that is not a TextToken; such a token cannot start a run, so it is handled
         # like any other non-letter token instead of being dereferenced.
         if (tt is None or not tt.chars.is_letter or tt.length_char != 1):
             before_word = False
             continue
         i = t.whitespaces_before_count
         if (not (i > 2 or ((i == 2 and before_word)))):
             before_word = False
             continue
         # From here on i counts the continuation tokens appended after the first letter.
         i = 0
         Utils.setLengthStringIO(tmp, 0)
         print(tt.get_source_text(), end="", file=tmp)
         t1 = t
         while t1.next0_ is not None:
             tt = (Utils.asObjectOrNull(t1.next0_, TextToken))
             # A token that is not a TextToken ends the run (same None guard as above).
             if (tt is None or tt.length_char != 1
                     or tt.whitespaces_before_count != 1):
                 break
             i += 1
             print(tt.get_source_text(), end="", file=tmp)
             t1 = t1.next0_
         if (not (i > 3 or ((i > 1 and before_word)))):
             before_word = False
             continue
         before_word = False
         mt = MorphologyService.process(Utils.toStringStringIO(tmp), None,
                                        None)
         if (mt is None or len(mt) != 1):
             # Not a single word: skip the whole run.
             t = t1
             continue
         before_word = any(wf.is_in_dictionary for wf in mt[0].word_forms)
         if (not before_word):
             t = t1
             continue
         # Splice the merged token into the chain in place of the run t..t1.
         tt = TextToken(mt[0], self, t.begin_char, t1.end_char)
         if (t == self.first_token):
             self.first_token = (tt)
         else:
             tt.previous = t.previous
         tt.next0_ = t1.next0_
         t = (tt)
Esempio n. 4
0
 def get_morph_variant(self, cas : 'MorphCase', plural : bool) -> str:
     """ Generate the text of the noun phrase in the required case and number.

     Every adjective, followed by the noun (when there is one), is rendered and the
     parts are joined with single spaces. A part that consists of exactly one
     TextToken is replaced by the word form returned by
     MorphologyService.get_wordform, when that call returns one.

     Args:
         cas(MorphCase): required case
         plural(bool): required number (True for plural, False for singular)

     Returns:
         str: the resulting string (None when there are no adjectives and no noun)
     """
     target = MorphBaseInfo._new499(cas, MorphLang.RU)
     target.number = MorphNumber.PLURAL if plural else MorphNumber.SINGULAR

     def render(part):
         # Plain text of the part, inflected only when the part is a single TextToken.
         text = MiscHelper.get_text_value_of_meta_token(part, GetTextAttr.NO)
         if part.begin_token == part.end_token and isinstance(part.begin_token, TextToken):
             inflected = MorphologyService.get_wordform(text, target)
             if inflected is not None:
                 text = inflected
         return text

     res = None
     for adjective in self.adjectives:
         piece = render(adjective)
         res = piece if res is None else "{0} {1}".format(res, piece)
     if self.noun is not None:
         piece = render(self.noun)
         res = piece if res is None else "{0} {1}".format(res, piece)
     return res
Esempio n. 5
0
 def __correct_words_by_morph(self, lang: 'MorphLang') -> None:
     """ Replace words that are not in the dictionary by their corrected variants.

     Walks the token chain from ``self.first_token``. For every eligible TextToken a
     correction is requested from MorphologyService.correct_word; when the corrected
     word is analysed into exactly one morph token, a new TextToken takes the place
     of the old one in the chain and is recorded in ``self.corrected_tokens``.

     Args:
         lang(MorphLang): language used for the analysis, and for the correction when
             the token's own language is undefined
     """

     def make_replacement(tok):
         # Returns the corrected token for tok, or None when tok must stay as it is.
         if not isinstance(tok, TextToken):
             return None
         if tok.morph.contains_attr("прдктв.", None):
             return None
         # Only words of 4+ characters whose dictionary class is undefined qualify.
         dict_class = tok.get_morph_class_in_dictionary()
         if not dict_class.is_undefined or tok.length_char < 4:
             return None
         if tok.morph.class0_.is_proper_surname and not tok.chars.is_all_lower:
             return None
         if tok.chars.is_all_upper:
             return None
         fixed_word = MorphologyService.correct_word(
             tok.term,
             lang if tok.morph.language.is_undefined else tok.morph.language)
         if fixed_word is None:
             return None
         analysed = MorphologyService.process(fixed_word, lang, None)
         if analysed is None or len(analysed) != 1:
             return None
         candidate = TextToken._new473(analysed[0], self, tok.begin_char,
                                       tok.end_char, tok.chars, tok.term)
         if candidate.get_morph_class_in_dictionary().is_proper_surname:
             return None
         return candidate

     current = self.first_token
     while current is not None:
         replacement = make_replacement(current)
         if replacement is not None:
             # Splice the replacement into the chain instead of the current token.
             if current == self.first_token:
                 self.first_token = replacement
             else:
                 current.previous.next0_ = replacement
             replacement.next0_ = current.next0_
             current = replacement
             if self.corrected_tokens is None:
                 self.corrected_tokens = dict()
             self.corrected_tokens[current] = current.get_source_text()
         current = current.next0_
Esempio n. 6
0
 def __correct_words_by_merging(self, lang: 'MorphLang') -> None:
     """ Join a word token with the following one when the joined text is a dictionary word.

     Walks the token chain from ``self.first_token``. For a letter token ``t`` and its
     successor ``t1`` (one hyphen between them may be skipped) the two terms are
     concatenated and analysed with MorphologyService.process. When the analysis gives
     exactly one morph token and the resulting TextToken has a defined dictionary
     class, that token replaces the pair in the chain.

     Args:
         lang(MorphLang): language passed to MorphologyService.process
     """
     t = self.first_token
     first_pass3053 = True
     while True:
         # Generated "for" loop: step to the next token on every pass except the first.
         if first_pass3053: first_pass3053 = False
         else: t = t.next0_
         # The last token has nothing to be merged with, so the walk stops there.
         if (not (t is not None and t.next0_ is not None)): break
         if (not t.chars.is_letter or (t.length_char < 2)):
             continue
         mc0 = t.get_morph_class_in_dictionary()
         if (t.morph.contains_attr("прдктв.", None)):
             continue
         t1 = t.next0_
         # Skip a hyphen between the parts, unless a line break follows the hyphen.
         if (t1.is_hiphen and t1.next0_ is not None
                 and not t1.is_newline_after):
             t1 = t1.next0_
         if (t1.length_char == 1):
             continue
         # Both parts must be letters of the same script (Latin or not).
         if (not t1.chars.is_letter or not t.chars.is_letter
                 or t1.chars.is_latin_letter != t.chars.is_latin_letter):
             continue
         # Case filter: the chain continues only when t1 is all lower-case and t is
         # not all upper-case.
         # NOTE(review): the first branch looks redundant with the second one
         # (an all-upper t1 is presumably never all-lower) - confirm.
         if (t1.chars.is_all_upper and not t.chars.is_all_upper):
             continue
         elif (not t1.chars.is_all_lower):
             continue
         elif (t.chars.is_all_upper):
             continue
         if (t1.morph.contains_attr("прдктв.", None)):
             continue
         mc1 = t1.get_morph_class_in_dictionary()
         # At least one of the two parts must have an undefined dictionary class.
         if (not mc1.is_undefined and not mc0.is_undefined):
             continue
         if ((len(t.term) + len(t1.term)) < 6):
             continue
         corw = t.term + t1.term
         ccc = MorphologyService.process(corw, lang, None)
         if (ccc is None or len(ccc) != 1):
             continue
         # NOTE(review): corw is at least 6 characters long here (see the length check
         # above), so it can never equal these 4-letter words - this test is unreachable.
         if (corw == "ПОСТ" or corw == "ВРЕД"):
             continue
         tt = TextToken(ccc[0], self, t.begin_char, t1.end_char)
         if (tt.get_morph_class_in_dictionary().is_undefined):
             continue
         tt.chars = t.chars
         # Splice the merged token into the chain in place of t..t1.
         if (t == self.first_token):
             self.first_token = (tt)
         else:
             t.previous.next0_ = tt
         if (t1.next0_ is not None):
             tt.next0_ = t1.next0_
         t = (tt)
 def is_participle(self) -> bool:
     """ Whether this item is a participle.

     A non-negative ``__m_is_participle`` is used as a cached answer. Otherwise the
     morphological variants are examined first; failing that, a word ending in "СЯ"
     counts as a participle when the word without that ending is an adjective
     according to MorphologyService.get_word_base_info.
     """
     if self.__m_is_participle >= 0:
         return self.__m_is_participle > 0

     def is_participial_form(form) -> bool:
         # An adjective word form without the "к.ф." attribute, or a verb with a case.
         if (form.class0_.is_adjective and isinstance(form, MorphWordForm)
                 and "к.ф." not in form.misc.attrs):
             return True
         return form.class0_.is_verb and not form.case_.is_undefined

     if any(is_participial_form(form) for form in self.morph.items):
         return True

     self.__m_is_participle = 0
     last = Utils.asObjectOrNull(self.end_token, TextToken)
     if last is None or not last.term.endswith("СЯ"):
         return False
     # Drop the two-letter "СЯ" ending and look at the remaining word.
     base_info = MorphologyService.get_word_base_info(last.term[:-2], None, False, False)
     if base_info is not None and base_info.class0_.is_adjective:
         self.__m_is_participle = 1
     return self.__m_is_participle > 0
Esempio n. 8
0
 def get_normal_case_text(self,
                          mc: 'MorphClass' = None,
                          num: 'MorphNumber' = MorphNumber.UNDEFINED,
                          gender: 'MorphGender' = MorphGender.UNDEFINED,
                          keep_chars: bool = False) -> str:
     """ Build the normalized text of this item.

     The value is chosen among the NounPhraseItemTextVar variants in
     ``self.morph.items``; when no variant gives a value, the text is built from the
     underlying tokens instead.

     Args:
         mc(MorphClass): requested morph class (may be None)
         num(MorphNumber): requested number
         gender(MorphGender): requested gender
         keep_chars(bool): passed to __corr_chars / the token-level calls; selects
             GetTextAttr.KEEPREGISTER instead of GetTextAttr.NO in the fallback

     Returns:
         str: the normalized text, or "?" when nothing could be produced
     """
     # A single referent token produces its own normalized text.
     if ((isinstance(self.begin_token, ReferentToken))
             and self.begin_token == self.end_token):
         return self.begin_token.get_normal_case_text(
             mc, num, gender, keep_chars)
     res = None
     # max_coef: undef_coef of the last variant that was taken into account;
     # def_coef: score of the best variant with undef_coef == 0 (-1 = none yet).
     max_coef = 0
     def_coef = -1
     for it in self.morph.items:
         v = Utils.asObjectOrNull(it, NounPhraseItemTextVar)
         if (v is None):
             continue
         # A variant with a positive undef_coef is skipped when its coefficient is
         # below the current one or when a zero-coefficient variant was already scored.
         if (v.undef_coef > 0
                 and (((v.undef_coef < max_coef) or def_coef >= 0))):
             continue
         # Singular requested and the variant carries a ready singular value.
         if (num == MorphNumber.SINGULAR
                 and v.single_number_value is not None):
             if (mc is not None and ((gender == MorphGender.NEUTER
                                      or gender == MorphGender.FEMINIE))
                     and mc.is_adjective):
                 # Adjective in neuter/feminine: ask for the nominative singular form
                 # of the singular value in that gender.
                 bi = MorphBaseInfo._new401(MorphClass._new53(mc.value),
                                            gender, MorphNumber.SINGULAR,
                                            MorphCase.NOMINATIVE,
                                            self.morph.language)
                 str0_ = MorphologyService.get_wordform(
                     v.single_number_value, bi)
                 if (str0_ is not None):
                     res = str0_
             else:
                 res = v.single_number_value
             if (v.undef_coef == 0):
                 break
             max_coef = v.undef_coef
             continue
         if (Utils.isNullOrEmpty(v.normal_value)):
             continue
         # Normal value starting with a digit, requested as an adjective: try to turn
         # the parsed number into a number adjective via NumberHelper.
         if (str.isdigit(v.normal_value[0]) and mc is not None
                 and mc.is_adjective):
             val = 0
             wrapval402 = RefOutArgWrapper(0)
             inoutres403 = Utils.tryParseInt(v.normal_value, wrapval402)
             val = wrapval402.value
             if (inoutres403):
                 str0_ = NumberHelper.get_number_adjective(
                     val, gender,
                     (MorphNumber.SINGULAR if num == MorphNumber.SINGULAR
                      or val == 1 else MorphNumber.PLURAL))
                 if (str0_ is not None):
                     res = str0_
                     if (v.undef_coef == 0):
                         break
                     max_coef = v.undef_coef
                     continue
         res1 = it.normal_value
         # Hard-coded singular counterparts for two plural-only normal values.
         if (num == MorphNumber.SINGULAR):
             if (res1 == "ДЕТИ"):
                 res1 = "РЕБЕНОК"
             elif (res1 == "ЛЮДИ"):
                 res1 = "ЧЕЛОВЕК"
         max_coef = v.undef_coef
         if (v.undef_coef > 0):
             res = res1
             continue
         # Score a zero-coefficient variant: +1 when the value equals the term of the
         # first token in nominative singular (not applied for adjectives), +3 when
         # plural is requested and the variant's number includes plural.
         def_co = 0
         if (mc is not None and mc.is_adjective and v.undef_coef == 0):
             pass
         elif (
             ((isinstance(self.begin_token, TextToken))
              and res1 == self.begin_token.term and it.case_.is_nominative)
                 and it.number == MorphNumber.SINGULAR):
             def_co = 1
         if (num == MorphNumber.PLURAL and
             ((v.number) & (MorphNumber.PLURAL)) == (MorphNumber.PLURAL)):
             def_co += 3
         if (res is None or def_co > def_coef):
             res = res1
             def_coef = def_co
             # Any positive score ends the search.
             if (def_co > 0):
                 break
     if (res is not None):
         return self.__corr_chars(res, keep_chars)
     # No variant gave a value (res is None from here on): fall back to the tokens.
     if (res is None and self.begin_token == self.end_token):
         res = self.begin_token.get_normal_case_text(
             mc, num, gender, keep_chars)
     elif (res is None):
         # Several tokens: normalize the first one and append the rest as plain text;
         # when the first token gives nothing, take the plain text of the whole span.
         res = self.begin_token.get_normal_case_text(
             mc, num, gender, keep_chars)
         if (res is None):
             res = MiscHelper.get_text_value_of_meta_token(
                 self, (GetTextAttr.KEEPREGISTER
                        if keep_chars else GetTextAttr.NO))
         else:
             res = "{0} {1}".format(
                 res,
                 MiscHelper.get_text_value(
                     self.begin_token.next0_, self.end_token,
                     (GetTextAttr.KEEPREGISTER
                      if keep_chars else GetTextAttr.NO)))
     return Utils.ifNotNull(res, "?")
 def __get_name_without_brackets(begin: 'Token',
                                 end: 'Token',
                                 normalize_first_noun_group: bool = False,
                                 normal_first_group_single: bool = False,
                                 ignore_geo_referent: bool = False) -> str:
     """ Build the name text for the token span ``begin``..``end``.

     When the span is enclosed in brackets (as judged by BracketHelper), the
     enclosing tokens are dropped first. Optionally the leading noun group is put
     into its normal form; the remaining tokens are rendered with
     ProperNameHelper.get_name_ex. Trailing '*' and whitespace characters are
     stripped from the result.

     Args:
         begin(Token): first token of the span
         end(Token): last token of the span
         normalize_first_noun_group(bool): normalize the leading noun phrase (or,
             failing that, a leading Cyrillic dictionary word)
         normal_first_group_single(bool): request singular number for that noun phrase
         ignore_geo_referent(bool): passed through to ProperNameHelper.get_name_ex

     Returns:
         str: the name, or None when the text consists only of '*' / whitespace
     """
     res = None
     # Drop the enclosing brackets when the span both starts and ends a bracket sequence.
     if (BracketHelper.can_be_start_of_sequence(begin, False, False)
             and BracketHelper.can_be_end_of_sequence(
                 end, False, begin, False)):
         begin = begin.next0_
         end = end.previous
     if (normalize_first_noun_group
             and not begin.morph.class0_.is_preposition):
         npt = NounPhraseHelper.try_parse(
             begin, NounPhraseParseAttr.REFERENTCANBENOUN, 0, None)
         if (npt is not None):
             # A bare noun with an undefined dictionary class is not worth normalizing.
             if (npt.noun.get_morph_class_in_dictionary().is_undefined
                     and len(npt.adjectives) == 0):
                 npt = (None)
         # The noun phrase must not extend beyond the span.
         if (npt is not None and npt.end_token.end_char > end.end_char):
             npt = (None)
         if (npt is not None):
             res = npt.get_normal_case_text(
                 None, (MorphNumber.SINGULAR if normal_first_group_single
                        else MorphNumber.UNDEFINED), MorphGender.UNDEFINED,
                 False)
             te = npt.end_token.next0_
             # ", <word>" right after the noun phrase, where the word is a text token
             # inside the span whose morph class is both verb and adjective: try to
             # append it in a form agreeing with the noun phrase.
             if (((te is not None and te.next0_ is not None and te.is_comma)
                  and (isinstance(te.next0_, TextToken))
                  and te.next0_.end_char <= end.end_char)
                     and te.next0_.morph.class0_.is_verb
                     and te.next0_.morph.class0_.is_adjective):
                 for it in te.next0_.morph.items:
                     # Take the first variant compatible with the noun phrase in
                     # gender, case and number.
                     if (it.gender == npt.morph.gender
                             or ((it.gender) & (npt.morph.gender)) !=
                         (MorphGender.UNDEFINED)):
                         if (not (
                             (it.case_) & npt.morph.case_).is_undefined):
                             if (it.number == npt.morph.number or
                                 ((it.number) & (npt.morph.number)) !=
                                 (MorphNumber.UNDEFINED)):
                                 var = te.next0_.term
                                 if (isinstance(it, MorphWordForm)):
                                     var = it.normal_case
                                 bi = MorphBaseInfo._new492(
                                     MorphClass.ADJECTIVE, npt.morph.gender,
                                     npt.morph.number, npt.morph.language)
                                 var = MorphologyService.get_wordform(
                                     var, bi)
                                 if (var is not None):
                                     res = "{0}, {1}".format(res, var)
                                     # Continue after the comma and the appended word.
                                     te = te.next0_.next0_
                                 break
             # Append the rest of the span; no space is inserted when the rest starts
             # with a non-alphanumeric character.
             if (te is not None and te.end_char <= end.end_char):
                 s = ProperNameHelper.get_name_ex(te, end,
                                                  MorphClass.UNDEFINED,
                                                  MorphCase.UNDEFINED,
                                                  MorphGender.UNDEFINED,
                                                  True, ignore_geo_referent)
                 if (not Utils.isNullOrEmpty(s)):
                     if (not str.isalnum(s[0])):
                         res = "{0}{1}".format(res, s)
                     else:
                         res = "{0} {1}".format(res, s)
         elif ((isinstance(begin, TextToken))
               and begin.chars.is_cyrillic_letter):
             # No usable noun phrase: normalize just the first Cyrillic word when it
             # has a defined dictionary class, then append the rest of the span.
             mm = begin.get_morph_class_in_dictionary()
             if (not mm.is_undefined):
                 res = begin.get_normal_case_text(mm, MorphNumber.UNDEFINED,
                                                  MorphGender.UNDEFINED,
                                                  False)
                 if (begin.end_char < end.end_char):
                     res = "{0} {1}".format(
                         res,
                         ProperNameHelper.get_name_ex(
                             begin.next0_, end, MorphClass.UNDEFINED,
                             MorphCase.UNDEFINED, MorphGender.UNDEFINED,
                             True, False))
     # Nothing was normalized: render the whole span as is.
     if (res is None):
         res = ProperNameHelper.get_name_ex(begin, end,
                                            MorphClass.UNDEFINED,
                                            MorphCase.UNDEFINED,
                                            MorphGender.UNDEFINED, True,
                                            ignore_geo_referent)
     if (not Utils.isNullOrEmpty(res)):
         # Count the trailing '*' and whitespace characters (k) and cut them off.
         k = 0
         i = len(res) - 1
         while i >= 0:
             if (res[i] == '*' or Utils.isWhitespace(res[i])):
                 pass
             else:
                 break
             i -= 1
             k += 1
         if (k > 0):
             # The text consisted only of such characters.
             if (k == len(res)):
                 return None
             res = res[0:0 + len(res) - k]
     return res
Esempio n. 10
0
 def __init__(self,
              sofa_: 'SourceOfAnalysis' = None,
              only_tokenizing: bool = False,
              lang: 'MorphLang' = None,
              progress: EventHandler = None) -> None:
     """ Set up the fields and, when a source is given, build the token chain for it.

     The source text is analysed with MorphologyService.process and every morph token
     becomes a TextToken linked into a chain that starts at ``self.first_token``.
     Depending on the flags of ``sofa_`` the chain is then cleaned and corrected,
     single letters are merged, the base language is defined and number tokens are
     embedded. Unless ``only_tokenizing`` is set, the morphology of unknown Cyrillic
     words is afterwards narrowed by a neighbouring word with the same ending, and
     statistics are created.

     Args:
         sofa_(SourceOfAnalysis): the source to analyse; with None only the fields
             are initialized
         only_tokenizing(bool): stop after the token chain is built
         lang(MorphLang): language passed to the morphological analysis
         progress(EventHandler): not used in this constructor body
     """
     self._start_date = datetime.datetime(1, 1, 1, 0, 0, 0)
     self.corrected_tokens = None
     self.first_token = None
     self.__m_entities = list()
     self.ontology = None
     self.base_language = MorphLang()
     self.__m_sofa = None
     self.statistics = None
     self.__m_datas = dict()
     self.misc_data = dict()
     self.processor = None
     self.recurse_level = 0
     self._m_analyzer_stack = list()
     self.onto_regime = False
     if (sofa_ is None):
         return
     self.__m_sofa = sofa_
     self._start_date = datetime.datetime.now()
     tokens = MorphologyService.process(sofa_.text, lang, None)
     # t0 is the last token appended to the chain so far.
     t0 = None
     if (tokens is not None):
         ii = 0
         while ii < len(tokens):
             mt = tokens[ii]
             # NOTE(review): no-op condition, looks like a leftover debugger hook.
             if (mt.begin_char == 733860):
                 pass
             tt = TextToken(mt, self)
             # Explicit corrections: when the term is a key of correction_dict and the
             # corrected word is analysed into exactly one morph token, a token built
             # from that analysis is used instead (same position and chars) and is
             # recorded in corrected_tokens.
             if (sofa_.correction_dict is not None):
                 corw = None
                 wrapcorw471 = RefOutArgWrapper(None)
                 inoutres472 = Utils.tryGetValue(sofa_.correction_dict,
                                                 mt.term, wrapcorw471)
                 corw = wrapcorw471.value
                 if (inoutres472):
                     ccc = MorphologyService.process(corw, lang, None)
                     if (ccc is not None and len(ccc) == 1):
                         tt1 = TextToken._new470(ccc[0], self,
                                                 tt.begin_char, tt.end_char,
                                                 tt.term)
                         tt1.chars = tt.chars
                         tt = tt1
                         if (self.corrected_tokens is None):
                             self.corrected_tokens = dict()
                         self.corrected_tokens[tt] = tt.get_source_text()
             # Append the token to the chain.
             if (t0 is None):
                 self.first_token = (tt)
             else:
                 t0.next0_ = tt
             t0 = (tt)
             ii += 1
     # Optional clean-up / correction passes selected by the source flags.
     if (sofa_.clear_dust):
         self.__clear_dust()
     if (sofa_.do_words_merging_by_morph):
         self.__correct_words_by_merging(lang)
     if (sofa_.do_word_correction_by_morph):
         self.__correct_words_by_morph(lang)
     self.__merge_letters()
     self.__define_base_language()
     if (sofa_.create_number_tokens):
         t = self.first_token
         first_pass3049 = True
         while True:
             # Generated "for" loop: step to the next token on every pass except the first.
             if first_pass3049: first_pass3049 = False
             else: t = t.next0_
             if (not (t is not None)): break
             nt = NumberHelper._try_parse_number(t)
             if (nt is None):
                 continue
             # Embed the number token and continue the walk after it.
             self.embed_token(nt)
             t = (nt)
     if (only_tokenizing):
         return
     t = self.first_token
     first_pass3050 = True
     while True:
         if first_pass3050: first_pass3050 = False
         else: t = t.next0_
         if (not (t is not None)): break
         if (t.morph.class0_.is_preposition):
             continue
         mc = t.get_morph_class_in_dictionary()
         # A Cyrillic word longer than 4 characters with an undefined dictionary class:
         # look for a neighbouring dictionary word with the same ending and use its
         # morphology to filter the variants of this word.
         if (mc.is_undefined and t.chars.is_cyrillic_letter
                 and t.length_char > 4):
             # Two characters of the source text starting at end_char - 1
             # (presumably the last two letters of the word - verify end_char semantics).
             tail = sofa_.text[t.end_char - 1:t.end_char - 1 + 2]
             tte = None
             # Look at the previous word first, skipping one comma/"and", preposition
             # or conjunction.
             tt = t.previous
             if (tt is not None and
                 ((tt.is_comma_and or tt.morph.class0_.is_preposition
                   or tt.morph.class0_.is_conjunction))):
                 tt = tt.previous
             # The neighbour must have a defined dictionary class, share a class bit
             # with t and be longer than 4 characters.
             if ((tt is not None
                  and not tt.get_morph_class_in_dictionary().is_undefined
                  and (((tt.morph.class0_.value) &
                        (t.morph.class0_.value))) != 0)
                     and tt.length_char > 4):
                 tail2 = sofa_.text[tt.end_char - 1:tt.end_char - 1 + 2]
                 if (tail2 == tail):
                     tte = tt
             if (tte is None):
                 # Same check for the following word.
                 tt = t.next0_
                 if (tt is not None and
                     ((tt.is_comma_and or tt.morph.class0_.is_preposition
                       or tt.morph.class0_.is_conjunction))):
                     tt = tt.next0_
                 if ((tt is not None and
                      not tt.get_morph_class_in_dictionary().is_undefined
                      and (((tt.morph.class0_.value) &
                            (t.morph.class0_.value))) != 0)
                         and tt.length_char > 4):
                     tail2 = sofa_.text[tt.end_char - 1:tt.end_char - 1 + 2]
                     if (tail2 == tail):
                         tte = tt
             if (tte is not None):
                 t.morph.remove_items_ex(
                     tte.morph, tte.get_morph_class_in_dictionary())
         # Redundant: this is already the last statement of the loop body.
         continue
     self.__create_statistics()
Esempio n. 11
0
 def get_normal_case_text(self,
                          mc: 'MorphClass' = None,
                          num: 'MorphNumber' = MorphNumber.UNDEFINED,
                          gender: 'MorphGender' = MorphGender.UNDEFINED,
                          keep_chars: bool = False) -> str:
     """ Get the normalized text of this token.

     The value comes from the first word form in ``self.morph.items`` that passes the
     class and gender filters. Without such a word form the result is None when one of
     the passing variants has a defined case; otherwise it is a form generated by
     MorphologyService.get_wordform (only for singular with ``mc`` given), or the
     term itself.

     Args:
         mc(MorphClass): requested morph class; a preposition class returns the
             normalized preposition straight away
         num(MorphNumber): requested number
         gender(MorphGender): requested gender
         keep_chars(bool): adapt the letter case of the result to the token: lower
             for an all-lower token, capitalized for a capital-upper token

     Returns:
         str: the normalized text (may be None, see above)
     """
     from pullenti.ner.core.MiscHelper import MiscHelper
     # Stays True until a variant with a defined case passes the filters below.
     empty = True
     if (mc is not None and mc.is_preposition):
         return LanguageHelper.normalize_preposition(self.term)
     for it in self.morph.items:
         # Class filter: the variant must intersect with the requested class.
         if (mc is not None and not mc.is_undefined):
             cc = (it.class0_) & mc
             if (cc.is_undefined):
                 continue
             if (cc.is_misc and not cc.is_proper and mc != it.class0_):
                 continue
         wf = Utils.asObjectOrNull(it, MorphWordForm)
         normal_full = False
         # Gender filter: a variant of another gender is skipped, except for two
         # masculine cases - a word form having normal_full (that value is then
         # used) and a personal pronoun.
         if (gender != MorphGender.UNDEFINED):
             if (((it.gender) & (gender)) == (MorphGender.UNDEFINED)):
                 if ((gender == MorphGender.MASCULINE and
                      ((it.gender != MorphGender.UNDEFINED or it.number
                        == MorphNumber.PLURAL)) and wf is not None)
                         and wf.normal_full is not None):
                     normal_full = True
                 elif (gender == MorphGender.MASCULINE
                       and it.class0_.is_personal_pronoun):
                     pass
                 else:
                     continue
         if (not it.case_.is_undefined):
             empty = False
         # The first word form that got this far decides the result.
         if (wf is not None):
             res = None
             if (num == MorphNumber.SINGULAR
                     and it.number == MorphNumber.PLURAL
                     and wf.normal_full is not None):
                 le = len(wf.normal_case)
                 # normal_case is used when it is longer than 4 characters, ends with
                 # "СЯ" and is exactly two characters longer than normal_full.
                 if ((le == (len(wf.normal_full) + 2) and le > 4
                      and wf.normal_case[le - 2] == 'С')
                         and wf.normal_case[le - 1] == 'Я'):
                     res = wf.normal_case
                 else:
                     # NOTE(review): both branches of this conditional are identical,
                     # so normal_full has no effect here - check the intent.
                     res = (wf.normal_full
                            if normal_full else wf.normal_full)
             else:
                 res = (wf.normal_full if normal_full else
                        (Utils.ifNotNull(wf.normal_case, self.term)))
             # Hard-coded singular counterpart for a plural-only noun.
             if (num == MorphNumber.SINGULAR and mc is not None
                     and mc == MorphClass.NOUN):
                 if (res == "ДЕТИ"):
                     res = "РЕБЕНОК"
             if (keep_chars):
                 if (self.chars.is_all_lower):
                     res = res.lower()
                 elif (self.chars.is_capital_upper):
                     res = MiscHelper.convert_first_char_upper_and_other_lower(
                         res)
             return res
     # No word form was returned, but a variant with a defined case was seen.
     if (not empty):
         return None
     te = None
     # Singular with a class given: try to generate the form from the term itself.
     if (num == MorphNumber.SINGULAR and mc is not None):
         bi = MorphBaseInfo._new492(MorphClass._new53(mc.value), gender,
                                    MorphNumber.SINGULAR,
                                    self.morph.language)
         vars0_ = MorphologyService.get_wordform(self.term, bi)
         if (vars0_ is not None):
             te = vars0_
     if (te is None):
         te = self.term
     if (keep_chars):
         if (self.chars.is_all_lower):
             return te.lower()
         elif (self.chars.is_capital_upper):
             return MiscHelper.convert_first_char_upper_and_other_lower(te)
     return te
Esempio n. 12
0
 def __try_parse_ru(t: 'Token', can_be_partition: bool,
                    can_be_adj_partition: bool,
                    force_parse: bool) -> 'VerbPhraseToken':
     res = None
     t0 = t
     not0_ = None
     has_verb = False
     verb_be_before = False
     prep = None
     first_pass3070 = True
     while True:
         if first_pass3070: first_pass3070 = False
         else: t = t.next0_
         if (not (t is not None)): break
         if (not (isinstance(t, TextToken))):
             break
         tt = Utils.asObjectOrNull(t, TextToken)
         is_participle = False
         if (tt.term == "НЕ"):
             not0_ = t
             continue
         ty = 0
         norm = None
         mc = tt.get_morph_class_in_dictionary()
         if (tt.term == "НЕТ"):
             if (has_verb):
                 break
             ty = 1
         elif (tt.term == "ДОПУСТИМО"):
             ty = 3
         elif (mc.is_adverb and not mc.is_verb):
             ty = 2
         elif (tt.is_pure_verb or tt.is_verb_be):
             ty = 1
             if (has_verb):
                 if (not tt.morph.contains_attr("инф.", None)):
                     if (verb_be_before):
                         pass
                     else:
                         break
         elif (mc.is_verb):
             if (mc.is_preposition or mc.is_misc or mc.is_pronoun):
                 pass
             elif (mc.is_noun):
                 if (tt.term == "СТАЛИ" or tt.term == "СТЕКЛО"
                         or tt.term == "БЫЛИ"):
                     ty = 1
                 elif (not tt.chars.is_all_lower
                       and not MiscHelper.can_be_start_of_sentence(tt)):
                     ty = 1
                 elif (mc.is_adjective and can_be_partition):
                     ty = 1
                 elif (force_parse):
                     ty = 1
             elif (mc.is_proper):
                 if (tt.chars.is_all_lower):
                     ty = 1
             else:
                 ty = 1
             if (mc.is_adjective):
                 is_participle = True
             if (not tt.morph.case_.is_undefined):
                 is_participle = True
             if (not can_be_partition and is_participle):
                 break
             if (has_verb):
                 if (tt.morph.contains_attr("инф.", None)):
                     pass
                 elif (not is_participle):
                     pass
                 else:
                     break
         elif ((mc.is_adjective and tt.morph.contains_attr("к.ф.", None)
                and tt.term.endswith("О")) and NounPhraseHelper.try_parse(
                    tt, NounPhraseParseAttr.NO, 0, None) is None):
             ty = 2
         elif (mc.is_adjective
               and ((can_be_partition or can_be_adj_partition))):
             if (tt.morph.contains_attr("к.ф.", None)
                     and not can_be_adj_partition):
                 break
             norm = tt.get_normal_case_text(MorphClass.ADJECTIVE,
                                            MorphNumber.SINGULAR,
                                            MorphGender.MASCULINE, False)
             if (norm.endswith("ЙШИЙ")):
                 pass
             else:
                 grs = DerivateService.find_derivates(norm, True, None)
                 if (grs is not None and len(grs) > 0):
                     hverb = False
                     hpart = False
                     for gr in grs:
                         for w in gr.words:
                             if (w.class0_.is_adjective
                                     and w.class0_.is_verb):
                                 if (w.spelling == norm):
                                     hpart = True
                             elif (w.class0_.is_verb):
                                 hverb = True
                     if (hpart and hverb):
                         ty = 3
                     elif (can_be_adj_partition):
                         ty = 3
                     if (ty != 3 and not Utils.isNullOrEmpty(grs[0].prefix)
                             and norm.startswith(grs[0].prefix)):
                         hverb = False
                         hpart = False
                         norm1 = norm[len(grs[0].prefix):]
                         grs = DerivateService.find_derivates(
                             norm1, True, None)
                         if (grs is not None and len(grs) > 0):
                             for gr in grs:
                                 for w in gr.words:
                                     if (w.class0_.is_adjective
                                             and w.class0_.is_verb):
                                         if (w.spelling == norm1):
                                             hpart = True
                                     elif (w.class0_.is_verb):
                                         hverb = True
                         if (hpart and hverb):
                             ty = 3
         if (ty == 0 and t == t0 and can_be_partition):
             prep = PrepositionHelper.try_parse(t)
             if (prep is not None):
                 t = prep.end_token
                 continue
         if (ty == 0):
             break
         if (res is None):
             res = VerbPhraseToken(t0, t)
         res.end_token = t
         it = VerbPhraseItemToken._new603(t, t, MorphCollection(t.morph))
         if (not0_ is not None):
             it.begin_token = not0_
             it.not0_ = True
             not0_ = (None)
         it.is_adverb = ty == 2
         if (prep is not None and not t.morph.case_.is_undefined
                 and len(res.items) == 0):
             if (((prep.next_case) & t.morph.case_).is_undefined):
                 return None
             it.morph.remove_items(prep.next_case, False)
             res.preposition = prep
         if (norm is None):
             norm = t.get_normal_case_text(
                 (MorphClass.ADJECTIVE if ty == 3 else
                  (MorphClass.ADVERB if ty == 2 else MorphClass.VERB)),
                 MorphNumber.SINGULAR, MorphGender.MASCULINE, False)
             if (ty == 1 and not tt.morph.case_.is_undefined):
                 mi = MorphWordForm._new604(MorphCase.NOMINATIVE,
                                            MorphNumber.SINGULAR,
                                            MorphGender.MASCULINE)
                 for mit in tt.morph.items:
                     if (isinstance(mit, MorphWordForm)):
                         mi.misc = mit.misc
                         break
                 nnn = MorphologyService.get_wordform("КК" + t.term, mi)
                 if (nnn is not None):
                     norm = nnn[2:]
         it.normal = norm
         res.items.append(it)
         if (not has_verb and ((ty == 1 or ty == 3))):
             res.morph = it.morph
             has_verb = True
         if (ty == 1 or ty == 3):
             if (ty == 1 and tt.is_verb_be):
                 verb_be_before = True
             else:
                 verb_be_before = False
     if (not has_verb):
         return None
     for i in range(len(res.items) - 1, 0, -1):
         if (res.items[i].is_adverb):
             del res.items[i]
             res.end_token = res.items[i - 1].end_token
         else:
             break
     return res
 def _try_parse(t : 'Token', add_units : 'TerminCollection', second : bool, can_omit_number : bool, can_be_nan : bool) -> 'NumbersWithUnitToken':
     """ Try to parse a numeric value (single value, range or "value +/- delta")
     with optional measurement units, starting from token t.

     Args:
         t(Token): first token of the candidate fragment
         add_units(TerminCollection): forwarded unchanged to UnitToken.try_parse_list
         second(bool): True when this call parses the right-hand part of an already
             started construction (the recursive calls below pass True); it changes
             which leading delimiters are accepted
         can_omit_number(bool): when False, a missing number is an error unless a
             keyword was seen or can_be_nan is set
         can_be_nan(bool): allow a result whose value is NaN when a keyword was
             found but no number follows it

     Returns:
         NumbersWithUnitToken: the parsed token, or None when nothing suitable starts at t
     """
     if (t is None):
         return None
     # Skip leading separators (comma / "and" tokens and the word "НО").
     while t is not None:
         if (t.is_comma_and or t.is_value("НО", None)):
             t = t.next0_
         else:
             break
     t0 = t
     about_ = False
     has_keyw = False
     # NOTE(review): is_diap_keyw is assigned below but never read in this function.
     is_diap_keyw = False
     min_max = 0
     wrapmin_max1633 = RefOutArgWrapper(min_max)
     ttt = NumbersWithUnitToken._is_min_or_max(t, wrapmin_max1633)
     min_max = wrapmin_max1633.value
     if (ttt is not None):
         t = ttt.next0_
         if (t is None):
             return None
     if (t is None):
         return None
     # Approximation markers: '~', "ОКОЛО", "ПРИМЕРНО".
     if (t.is_char('~') or t.is_value("ОКОЛО", None) or t.is_value("ПРИМЕРНО", None)):
         t = t.next0_
         about_ = True
         has_keyw = True
         if (t is None):
             return None
     if (t.is_value("В", None) and t.next0_ is not None):
         # FIX: the second check used to be applied to t itself (already known to be
         # "В"), so "В ДИАПАЗОН..." could never match; both words must be tested
         # on the token that follows the preposition.
         if (t.next0_.is_value("ПРЕДЕЛ", None) or t.next0_.is_value("ДИАПАЗОН", None)):
             t = t.next0_.next0_
             if (t is None):
                 return None
             is_diap_keyw = True
     # Whole construction wrapped in parentheses: "( ... ) [units]".
     if (t0.is_char('(')):
         mt0 = NumbersWithUnitToken._try_parse(t.next0_, add_units, False, False, False)
         if (mt0 is not None and mt0.end_token.next0_ is not None and mt0.end_token.next0_.is_char(')')):
             if (second):
                 # As a second part only a symmetric range (from == -to) is accepted.
                 if (mt0.from_val is not None and mt0.to_val is not None and mt0.from_val == (- mt0.to_val)):
                     pass
                 else:
                     return None
             mt0.begin_token = t0
             mt0.end_token = mt0.end_token.next0_
             uu = UnitToken.try_parse_list(mt0.end_token.next0_, add_units, False)
             if (uu is not None and len(mt0.units) == 0):
                 mt0.units = uu
                 mt0.end_token = uu[len(uu) - 1].end_token
             return mt0
     plusminus = False
     unit_before = False
     is_age_ = False
     dty = NumbersWithUnitToken.DiapTyp.UNDEFINED
     whd = None
     uni = None
     tok = (None if NumbersWithUnitToken.M_TERMINS is None else NumbersWithUnitToken.M_TERMINS.try_parse(t, TerminParseAttr.NO))
     if (tok is not None):
         # A range keyword from M_TERMINS; its tag carries the DiapTyp value.
         if (tok.end_token.is_value("СТАРШЕ", None) or tok.end_token.is_value("МЛАДШЕ", None)):
             is_age_ = True
         t = tok.end_token.next0_
         dty = (Utils.valToEnum(tok.termin.tag, NumbersWithUnitToken.DiapTyp))
         has_keyw = True
         if (not tok.is_whitespace_after):
             if (t is None):
                 return None
             if (isinstance(t, NumberToken)):
                 if (tok.begin_token == tok.end_token and not tok.chars.is_all_lower):
                     return None
             elif (t.is_comma and t.next0_ is not None and t.next0_.is_value("ЧЕМ", None)):
                 t = t.next0_.next0_
                 if (t is not None and t.morph.class0_.is_preposition):
                     t = t.next0_
             elif (t.is_char_of(":,(") or t.is_table_control_char):
                 pass
             else:
                 return None
         if (t is not None and t.is_char('(')):
             # Units given in parentheses right after the keyword.
             uni = UnitToken.try_parse_list(t.next0_, add_units, False)
             if (uni is not None):
                 t = uni[len(uni) - 1].end_token.next0_
                 while t is not None:
                     if (t.is_char_of("):")):
                         t = t.next0_
                     else:
                         break
                 mt0 = NumbersWithUnitToken._try_parse(t, add_units, False, can_omit_number, False)
                 if (mt0 is not None and len(mt0.units) == 0):
                     mt0.begin_token = t0
                     mt0.units = uni
                     return mt0
             whd = NumbersWithUnitToken._try_parsewhl(t)
             if (whd is not None):
                 t = whd.end_token.next0_
         elif (t is not None and t.is_value("IP", None)):
             uni = UnitToken.try_parse_list(t, add_units, False)
             if (uni is not None):
                 t = uni[len(uni) - 1].end_token.next0_
         if ((t is not None and t.is_hiphen and t.is_whitespace_before) and t.is_whitespace_after):
             t = t.next0_
     elif (t.is_char('<')):
         dty = NumbersWithUnitToken.DiapTyp.LS
         t = t.next0_
         has_keyw = True
         if (t is not None and t.is_char('=')):
             t = t.next0_
             dty = NumbersWithUnitToken.DiapTyp.LE
     elif (t.is_char('>')):
         dty = NumbersWithUnitToken.DiapTyp.GT
         t = t.next0_
         has_keyw = True
         if (t is not None and t.is_char('=')):
             t = t.next0_
             dty = NumbersWithUnitToken.DiapTyp.GE
     elif (t.is_char('≤')):
         dty = NumbersWithUnitToken.DiapTyp.LE
         has_keyw = True
         t = t.next0_
     elif (t.is_char('≥')):
         dty = NumbersWithUnitToken.DiapTyp.GE
         has_keyw = True
         t = t.next0_
     elif (t.is_value("IP", None)):
         uni = UnitToken.try_parse_list(t, add_units, False)
         if (uni is not None):
             t = uni[len(uni) - 1].end_token.next0_
     elif (t.is_value("ЗА", None) and (isinstance(t.next0_, NumberToken))):
         dty = NumbersWithUnitToken.DiapTyp.GE
         t = t.next0_
     # Skip delimiters between the keyword and the number.
     while t is not None and ((t.is_char_of(":,") or t.is_value("ЧЕМ", None) or t.is_table_control_char)):
         t = t.next0_
     if (t is not None):
         if (t.is_char('+') or t.is_value("ПЛЮС", None)):
             t = t.next0_
             # "+-" and "+/-" written with separate characters.
             if (t is not None and not t.is_whitespace_before):
                 if (t.is_hiphen):
                     t = t.next0_
                     plusminus = True
                 elif ((t.is_char_of("\\/") and t.next0_ is not None and not t.is_newline_after) and t.next0_.is_hiphen):
                     t = t.next0_.next0_
                     plusminus = True
         elif (second and ((t.is_char_of("\\/÷…~")))):
             t = t.next0_
         # FIX: M_TERMINS may be None (the lookup above already guards for it),
         # so guard here too instead of raising AttributeError.
         elif ((t.is_hiphen and t == t0 and not second) and NumbersWithUnitToken.M_TERMINS is not None and NumbersWithUnitToken.M_TERMINS.try_parse(t.next0_, TerminParseAttr.NO) is not None):
             tok = NumbersWithUnitToken.M_TERMINS.try_parse(t.next0_, TerminParseAttr.NO)
             t = tok.end_token.next0_
             dty = (Utils.valToEnum(tok.termin.tag, NumbersWithUnitToken.DiapTyp))
         elif (t.is_hiphen and t == t0 and ((t.is_whitespace_after or second))):
             t = t.next0_
         elif (t.is_char('±')):
             t = t.next0_
             plusminus = True
             has_keyw = True
         elif ((second and t.is_char('.') and t.next0_ is not None) and t.next0_.is_char('.')):
             # ".." or "..." used as a range delimiter.
             t = t.next0_.next0_
             if (t is not None and t.is_char('.')):
                 t = t.next0_
     num = NumberHelper.try_parse_real_number(t, True, False)
     if (num is None):
         # No number here - maybe the units precede the number.
         uni = UnitToken.try_parse_list(t, add_units, False)
         if (uni is not None):
             unit_before = True
             t = uni[len(uni) - 1].end_token.next0_
             delim = False
             while t is not None:
                 if (t.is_char_of(":,")):
                     delim = True
                     t = t.next0_
                 elif (t.is_hiphen and t.is_whitespace_after):
                     delim = True
                     t = t.next0_
                 else:
                     break
             if (not delim):
                 if (t is None):
                     if (has_keyw and can_be_nan):
                         pass
                     else:
                         return None
                 elif (not t.is_whitespace_before):
                     return None
                 # FIX: t is None when the keyword/NaN case above fell through;
                 # the unguarded t.next0_ access used to raise AttributeError.
                 if (t is not None and t.next0_ is not None and t.is_hiphen and t.is_whitespace_after):
                     delim = True
                     t = t.next0_
             num = NumberHelper.try_parse_real_number(t, True, False)
     res = None
     rval = 0
     if (num is None):
         # Still no number: try special terms from M_SPEC (tag = value, tag2 = unit name).
         tt = NumbersWithUnitToken.M_SPEC.try_parse(t, TerminParseAttr.NO)
         if (tt is not None):
             rval = (tt.termin.tag)
             unam = tt.termin.tag2
             for u in UnitsHelper.UNITS:
                 if (u.fullname_cyr == unam):
                     uni = list()
                     uni.append(UnitToken._new1626(t, t, u))
                     break
             if (uni is None):
                 return None
             res = NumbersWithUnitToken._new1628(t0, tt.end_token, about_)
             t = tt.end_token.next0_
         else:
             if (not can_omit_number and not has_keyw and not can_be_nan):
                 return None
             if ((uni is not None and len(uni) == 1 and uni[0].begin_token == uni[0].end_token) and uni[0].length_char > 3):
                 # A single one-token unit longer than 3 chars is taken with value 1.
                 rval = (1)
                 res = NumbersWithUnitToken._new1628(t0, uni[len(uni) - 1].end_token, about_)
                 t = res.end_token.next0_
             elif (has_keyw and can_be_nan):
                 rval = math.nan
                 res = NumbersWithUnitToken._new1628(t0, t0, about_)
                 if (t is not None):
                     res.end_token = t.previous
                 else:
                     # Ran off the end of the text: extend the result to the last token.
                     t = t0
                     while t is not None:
                         res.end_token = t
                         t = t.next0_
             else:
                 return None
     else:
         if ((t == t0 and t0.is_hiphen and not t.is_whitespace_before) and not t.is_whitespace_after and (num.real_value < 0)):
             # A glued leading hyphen is not treated as a minus sign: re-parse after it.
             num = NumberHelper.try_parse_real_number(t.next0_, True, False)
             if (num is None):
                 return None
         if (t == t0 and (isinstance(t, NumberToken)) and t.morph.class0_.is_adjective):
             nn = Utils.asObjectOrNull(t.end_token, TextToken)
             if (nn is None):
                 return None
             norm = nn.get_normal_case_text(MorphClass.ADJECTIVE, MorphNumber.SINGULAR, MorphGender.UNDEFINED, False)
             if ((norm.endswith("Ь") or norm == "ЧЕТЫРЕ" or norm == "ТРИ") or norm == "ДВА"):
                 pass
             else:
                 # Reject the number when the word (with an artificial prefix) is classified as an adjective.
                 mi = MorphologyService.get_word_base_info("КОКО" + nn.term, None, False, False)
                 if (mi.class0_.is_adjective):
                     return None
         t = num.end_token.next0_
         res = NumbersWithUnitToken._new1628(t0, num.end_token, about_)
         rval = num.real_value
     if (uni is None):
         # Units after the number.
         uni = UnitToken.try_parse_list(t, add_units, False)
         if (uni is not None):
             if ((plusminus and second and len(uni) >= 1) and uni[0].unit == UnitsHelper.UPERCENT):
                 # "+/- N %": the percent belongs to the delta, real units may follow.
                 res.end_token = uni[0].end_token
                 res.plus_minus_percent = True
                 tt1 = uni[0].end_token.next0_
                 uni = UnitToken.try_parse_list(tt1, add_units, False)
                 if (uni is not None):
                     res.units = uni
                     res.end_token = uni[len(uni) - 1].end_token
             else:
                 res.units = uni
                 res.end_token = uni[len(uni) - 1].end_token
             t = res.end_token.next0_
     else:
         res.units = uni
         if (len(uni) > 1):
             # Units were before the number; check for "unit1 / number unit2" after it.
             uni1 = UnitToken.try_parse_list(t, add_units, False)
             if (((uni1 is not None and uni1[0].unit == uni[0].unit and (len(uni1) < len(uni))) and uni[len(uni1)].pow0_ == -1 and uni1[len(uni1) - 1].end_token.next0_ is not None) and uni1[len(uni1) - 1].end_token.next0_.is_char_of("/\\")):
                 num2 = NumbersWithUnitToken._try_parse(uni1[len(uni1) - 1].end_token.next0_.next0_, add_units, False, False, False)
                 if (num2 is not None and num2.units is not None and num2.units[0].unit == uni[len(uni1)].unit):
                     res.units = uni1
                     res.div_num = num2
                     res.end_token = num2.end_token
     res.whl = whd
     # Apply the range type found earlier to the parsed value.
     if (dty != NumbersWithUnitToken.DiapTyp.UNDEFINED):
         if (dty == NumbersWithUnitToken.DiapTyp.GE or dty == NumbersWithUnitToken.DiapTyp.FROM):
             res.from_include = True
             res.from_val = rval
         elif (dty == NumbersWithUnitToken.DiapTyp.GT):
             res.from_include = False
             res.from_val = rval
         elif (dty == NumbersWithUnitToken.DiapTyp.LE or dty == NumbersWithUnitToken.DiapTyp.TO):
             res.to_include = True
             res.to_val = rval
         elif (dty == NumbersWithUnitToken.DiapTyp.LS):
             res.to_include = False
             res.to_val = rval
     is_second_max = False
     if (not second):
         iii = 0
         wrapiii1632 = RefOutArgWrapper(iii)
         ttt = NumbersWithUnitToken._is_min_or_max(t, wrapiii1632)
         iii = wrapiii1632.value
         if (ttt is not None and iii > 0):
             is_second_max = True
             t = ttt.next0_
     # Try to parse a following second part (range end, +/- delta, divisor).
     next0__ = (None if second or plusminus or ((t is not None and ((t.is_table_control_char or t.is_newline_before)))) else NumbersWithUnitToken._try_parse(t, add_units, True, False, can_be_nan))
     if (next0__ is not None and (isinstance(t.previous, NumberToken))):
         if (MeasureHelper.is_mult_char(t.previous.end_token)):
             next0__ = (None)
     if (next0__ is not None and ((next0__.to_val is not None or next0__.single_val is not None)) and next0__.from_val is None):
         if ((((next0__.begin_token.is_char('+') and next0__.single_val is not None and not math.isnan(next0__.single_val)) and next0__.end_token.next0_ is not None and next0__.end_token.next0_.is_char_of("\\/")) and next0__.end_token.next0_.next0_ is not None and next0__.end_token.next0_.next0_.is_hiphen) and not has_keyw and not math.isnan(rval)):
             # "V +a/-b": asymmetric tolerance around rval.
             next2 = NumbersWithUnitToken._try_parse(next0__.end_token.next0_.next0_.next0_, add_units, True, False, False)
             if (next2 is not None and next2.single_val is not None and not math.isnan(next2.single_val)):
                 res.from_val = (rval - next2.single_val)
                 res.from_include = True
                 res.to_val = (rval + next0__.single_val)
                 res.to_include = True
                 if (next2.units is not None and len(res.units) == 0):
                     res.units = next2.units
                 res.end_token = next2.end_token
                 return res
         if (len(next0__.units) > 0):
             if (len(res.units) == 0):
                 res.units = next0__.units
             elif (not UnitToken.can_be_equals(res.units, next0__.units)):
                 next0__ = (None)
         elif (len(res.units) > 0 and not unit_before and not next0__.plus_minus_percent):
             next0__ = (None)
         if (next0__ is not None):
             res.end_token = next0__.end_token
         if (next0__ is not None and next0__.to_val is not None):
             res.to_val = next0__.to_val
             res.to_include = next0__.to_include
         elif (next0__ is not None and next0__.single_val is not None):
             if (next0__.begin_token.is_char_of("/\\")):
                 res.div_num = next0__
                 res.single_val = rval
                 return res
             elif (next0__.plus_minus_percent):
                 res.single_val = rval
                 res.plus_minus = next0__.single_val
                 res.plus_minus_percent = True
                 res.to_include = True
             else:
                 res.to_val = next0__.single_val
                 res.to_include = True
         if (next0__ is not None):
             if (res.from_val is None):
                 res.from_val = rval
                 res.from_include = True
             return res
     elif ((next0__ is not None and next0__.from_val is not None and next0__.to_val is not None) and next0__.to_val == (- next0__.from_val)):
         # The second part is a symmetric range, i.e. "rval +/- delta".
         if (len(next0__.units) == 1 and next0__.units[0].unit == UnitsHelper.UPERCENT and len(res.units) > 0):
             res.single_val = rval
             res.plus_minus = next0__.to_val
             res.plus_minus_percent = True
             res.end_token = next0__.end_token
             return res
         if (len(next0__.units) == 0):
             res.single_val = rval
             res.plus_minus = next0__.to_val
             res.end_token = next0__.end_token
             return res
         res.from_val = (next0__.from_val + rval)
         res.from_include = True
         res.to_val = (next0__.to_val + rval)
         res.to_include = True
         res.end_token = next0__.end_token
         if (len(next0__.units) > 0):
             res.units = next0__.units
         return res
     if (dty == NumbersWithUnitToken.DiapTyp.UNDEFINED):
         if (plusminus and ((not res.plus_minus_percent or not second))):
             # A bare "+/- V" is stored as the symmetric range [-V, V].
             res.from_include = True
             res.from_val = (- rval)
             res.to_include = True
             res.to_val = rval
         else:
             res.single_val = rval
             res.plus_minus_percent = plusminus
     if (is_age_):
         res.is_age = True
     return res