def initialize(lang: 'MorphLang' = None) -> None:
    """Initialize the processing service.

    Every analyzer has to be initialized separately. When Sdk.Initialize()
    is called, it initializes both this service and all analyzers.

    Repeated calls are no-ops: the "already initialized" flag is set on the
    first call, before the helpers are loaded.

    Args:
        lang(MorphLang): languages to load (by default, Russian and English)
    """
    from pullenti.ner.core.internal.NumberExHelper import NumberExHelper
    from pullenti.ner.core.internal.BlockLine import BlockLine
    from pullenti.ner.core.internal.NounPhraseItem import NounPhraseItem
    from pullenti.ner.core.PrepositionHelper import PrepositionHelper
    from pullenti.ner.core.ConjunctionHelper import ConjunctionHelper
    if (ProcessorService.__m_inited):
        return
    ProcessorService.__m_inited = True
    MorphologyService.initialize(lang)
    DerivateService.initialize(lang)
    # The flag is global state on Termin; it is switched on only while the
    # helper dictionaries are built and must be restored even if one of the
    # initializers raises, otherwise it would stay enabled for the process.
    Termin.ASSIGN_ALL_TEXTS_AS_NORMAL = True
    try:
        PrepositionHelper._initialize()
        ConjunctionHelper._initialize()
        NounPhraseItem._initialize()
        NumberHelper._initialize()
        NumberExHelper._initialize()
        BlockLine.initialize()
    finally:
        Termin.ASSIGN_ALL_TEXTS_AS_NORMAL = False
def add(self, val: str, shortval: str, gen: 'MorphGender', add_other_gender_var: bool = False) -> None:
    """Add a value variant to the collection.

    Args:
        val(str): full value; ``None`` is ignored
        shortval(str): short value stored alongside the full one
        gen(MorphGender): gender of the variant; anything other than
            masculine or feminine stores the value for both genders
        add_other_gender_var(bool): also try to add the word form of the
            opposite gender
    """
    if val is None:
        return
    if self.head is None:
        # Slicing leaves values of three characters or fewer untouched.
        self.head = val[:3]
    if not (gen == MorphGender.MASCULINE or gen == MorphGender.FEMINIE):
        self.add(val, shortval, MorphGender.MASCULINE, False)
        self.add(val, shortval, MorphGender.FEMINIE, False)
        return
    for existing in self.items:
        if existing.value == val and existing.gender == gen:
            # this exact variant is already present
            return
    self.items.append(
        PersonMorphCollection.PersonMorphVariant._new2591(val, gen, shortval))
    if not add_other_gender_var:
        return
    if gen == MorphGender.FEMINIE:
        other_gender = MorphGender.MASCULINE
    else:
        other_gender = MorphGender.FEMINIE
    other_form = MorphologyService.get_wordform(
        val, MorphBaseInfo._new193(MorphClass._new2572(True), other_gender))
    if other_form is not None:
        self.items.append(
            PersonMorphCollection.PersonMorphVariant._new2591(
                other_form, other_gender, shortval))
def __merge_letters(self) -> None:
    """Glue runs of single letters separated by single spaces into one token.

    A run starts at a one-letter token preceded by more than two whitespace
    characters (or exactly two when the previous run was merged into a
    dictionary word). It is replaced by a single TextToken only when the
    morphology service returns exactly one token for the concatenated text
    and at least one of its word forms is in the dictionary.
    """
    buf = io.StringIO()

    def try_merge(start, after_word):
        # Returns (token to continue iteration from, new "after word" flag).
        letter = Utils.asObjectOrNull(start, TextToken)
        if not letter.chars.is_letter or letter.length_char != 1:
            return start, False
        gap = start.whitespaces_before_count
        if not (gap > 2 or (gap == 2 and after_word)):
            return start, False
        Utils.setLengthStringIO(buf, 0)
        print(letter.get_source_text(), end="", file=buf)
        appended = 0
        last = start
        while last.next0_ is not None:
            letter = Utils.asObjectOrNull(last.next0_, TextToken)
            if letter.length_char != 1 or letter.whitespaces_before_count != 1:
                break
            appended += 1
            print(letter.get_source_text(), end="", file=buf)
            last = last.next0_
        if not (appended > 3 or (appended > 1 and after_word)):
            return start, False
        variants = MorphologyService.process(Utils.toStringStringIO(buf), None, None)
        if variants is None or len(variants) != 1:
            return last, False
        if not any(wf.is_in_dictionary for wf in variants[0].word_forms):
            return last, False
        merged = TextToken(variants[0], self, start.begin_char, last.end_char)
        if start == self.first_token:
            self.first_token = merged
        else:
            merged.previous = start.previous
        merged.next0_ = last.next0_
        return merged, True

    prev_was_word = False
    cur = self.first_token
    while cur is not None:
        cur, prev_was_word = try_merge(cur, prev_was_word)
        cur = cur.next0_
def get_morph_variant(self, cas : 'MorphCase', plural : bool) -> str:
    """Generate the text of the noun group in the requested case and number.

    Args:
        cas(MorphCase): required case
        plural(bool): required number (True for plural)

    Returns:
        str: resulting string (None when there are no adjectives and no noun)
    """
    target = MorphBaseInfo._new499(cas, MorphLang.RU)
    target.number = (MorphNumber.PLURAL if plural else MorphNumber.SINGULAR)
    # Adjectives first, then the noun: both are rendered the same way.
    parts = list(self.adjectives)
    if self.noun is not None:
        parts.append(self.noun)
    result = None
    for part in parts:
        text = MiscHelper.get_text_value_of_meta_token(part, GetTextAttr.NO)
        multi_or_non_text = (part.begin_token != part.end_token
                             or not (isinstance(part.begin_token, TextToken)))
        if not multi_or_non_text:
            # only single text tokens are inflected
            inflected = MorphologyService.get_wordform(text, target)
            if inflected is not None:
                text = inflected
        if result is None:
            result = text
        else:
            result = "{0} {1}".format(result, text)
    return result
def __correct_words_by_morph(self, lang: 'MorphLang') -> None:
    """Replace out-of-dictionary words by their morphological correction.

    Each replaced token is recorded in ``self.corrected_tokens`` together
    with its source text.

    Args:
        lang(MorphLang): language passed to the morphology service (the
            token's own language is preferred for the correction lookup
            when it is defined)
    """
    def corrected(tok):
        # Returns the replacement token, or None when tok must stay as is.
        if not (isinstance(tok, TextToken)):
            return None
        if tok.morph.contains_attr("прдктв.", None):
            return None
        in_dict = tok.get_morph_class_in_dictionary()
        if not in_dict.is_undefined or (tok.length_char < 4):
            return None
        if tok.morph.class0_.is_proper_surname and not tok.chars.is_all_lower:
            return None
        if tok.chars.is_all_upper:
            return None
        word = MorphologyService.correct_word(
            tok.term,
            (lang if tok.morph.language.is_undefined else tok.morph.language))
        if word is None:
            return None
        parsed = MorphologyService.process(word, lang, None)
        if parsed is None or len(parsed) != 1:
            return None
        candidate = TextToken._new473(parsed[0], self, tok.begin_char,
                                      tok.end_char, tok.chars, tok.term)
        if candidate.get_morph_class_in_dictionary().is_proper_surname:
            return None
        return candidate

    cur = self.first_token
    while cur is not None:
        fixed = corrected(cur)
        if fixed is not None:
            if cur == self.first_token:
                self.first_token = fixed
            else:
                cur.previous.next0_ = fixed
            fixed.next0_ = cur.next0_
            cur = fixed
            if self.corrected_tokens is None:
                self.corrected_tokens = dict()
            self.corrected_tokens[cur] = cur.get_source_text()
        cur = cur.next0_
def __correct_words_by_merging(self, lang: 'MorphLang') -> None:
    """Join two adjacent word fragments into one token when the result is a dictionary word.

    The second fragment may be separated from the first by a hyphen. The
    fragments are merged only if their concatenation is parsed by the
    morphology service as exactly one token known to the dictionary.

    Args:
        lang(MorphLang): language passed to the morphology service
    """
    def merged_with_next(first):
        # Returns (merged token, second fragment) or None when no merge applies.
        if not first.chars.is_letter or (first.length_char < 2):
            return None
        first_class = first.get_morph_class_in_dictionary()
        if first.morph.contains_attr("прдктв.", None):
            return None
        second = first.next0_
        if second.is_hiphen and second.next0_ is not None and not second.is_newline_after:
            second = second.next0_
        if second.length_char == 1:
            return None
        if (not second.chars.is_letter or not first.chars.is_letter
                or second.chars.is_latin_letter != first.chars.is_latin_letter):
            return None
        # Accept only an all-lower second fragment after a first fragment
        # that is not all-upper.
        if ((second.chars.is_all_upper and not first.chars.is_all_upper)
                or not second.chars.is_all_lower
                or first.chars.is_all_upper):
            return None
        if second.morph.contains_attr("прдктв.", None):
            return None
        second_class = second.get_morph_class_in_dictionary()
        if not second_class.is_undefined and not first_class.is_undefined:
            # both fragments are valid words on their own
            return None
        if (len(first.term) + len(second.term)) < 6:
            return None
        joined = first.term + second.term
        parsed = MorphologyService.process(joined, lang, None)
        if parsed is None or len(parsed) != 1:
            return None
        # NOTE(review): both words are shorter than the 6-character minimum
        # enforced above, so this exclusion cannot currently trigger.
        if joined == "ПОСТ" or joined == "ВРЕД":
            return None
        merged = TextToken(parsed[0], self, first.begin_char, second.end_char)
        if merged.get_morph_class_in_dictionary().is_undefined:
            return None
        merged.chars = first.chars
        return merged, second

    cur = self.first_token
    while cur is not None and cur.next0_ is not None:
        found = merged_with_next(cur)
        if found is not None:
            merged, second = found
            if cur == self.first_token:
                self.first_token = merged
            else:
                cur.previous.next0_ = merged
            if second.next0_ is not None:
                merged.next0_ = second.next0_
            cur = merged
        cur = cur.next0_
def is_participle(self) -> bool:
    """Whether this item is a participle.

    A non-negative cached value is returned as is. Otherwise the morph
    variants are inspected; a positive answer found there is returned
    without being cached. The fallback check (word ending with "СЯ" whose
    stem is an adjective) stores its outcome in the cache.
    """
    if self.__m_is_participle >= 0:
        return self.__m_is_participle > 0
    for form in self.morph.items:
        if (form.class0_.is_adjective and (isinstance(form, MorphWordForm))
                and "к.ф." not in form.misc.attrs):
            return True
        if form.class0_.is_verb and not form.case_.is_undefined:
            return True
    self.__m_is_participle = 0
    last = Utils.asObjectOrNull(self.end_token, TextToken)
    if last is not None and last.term.endswith("СЯ"):
        # look the word up without its two-character "СЯ" ending
        stem_info = MorphologyService.get_word_base_info(
            last.term[:len(last.term) - 2], None, False, False)
        if stem_info is not None and stem_info.class0_.is_adjective:
            self.__m_is_participle = 1
    return self.__m_is_participle > 0
def get_normal_case_text(self, mc: 'MorphClass' = None, num: 'MorphNumber' = MorphNumber.UNDEFINED, gender: 'MorphGender' = MorphGender.UNDEFINED, keep_chars: bool = False) -> str:
    """Build the normalized text of this phrase.

    Args:
        mc(MorphClass): requested part of speech (may be None)
        num(MorphNumber): requested number
        gender(MorphGender): requested gender
        keep_chars(bool): passed to the character-case correction and to the
            text extraction helpers (KEEPREGISTER when True)

    Returns:
        str: normalized text; "?" when nothing could be produced
    """
    # A phrase consisting of a single referent token delegates to that token.
    if ((isinstance(self.begin_token, ReferentToken)) and self.begin_token == self.end_token):
        return self.begin_token.get_normal_case_text(
            mc, num, gender, keep_chars)
    res = None
    max_coef = 0
    def_coef = -1
    # Pick the best text variant among the morph items; only
    # NounPhraseItemTextVar items carry candidate texts.
    for it in self.morph.items:
        v = Utils.asObjectOrNull(it, NounPhraseItemTextVar)
        if (v is None):
            continue
        # Skip variants with a positive undef_coef once a better one was seen.
        if (v.undef_coef > 0 and (((v.undef_coef < max_coef) or def_coef >= 0))):
            continue
        if (num == MorphNumber.SINGULAR and v.single_number_value is not None):
            if (mc is not None and ((gender == MorphGender.NEUTER or gender == MorphGender.FEMINIE)) and mc.is_adjective):
                # adjective in a non-masculine gender: ask morphology for the form
                bi = MorphBaseInfo._new401(MorphClass._new53(mc.value), gender, MorphNumber.SINGULAR, MorphCase.NOMINATIVE, self.morph.language)
                str0_ = MorphologyService.get_wordform(
                    v.single_number_value, bi)
                if (str0_ is not None):
                    res = str0_
            else:
                res = v.single_number_value
            if (v.undef_coef == 0):
                break
            max_coef = v.undef_coef
            continue
        if (Utils.isNullOrEmpty(v.normal_value)):
            continue
        # Value starting with a digit requested as an adjective: render the
        # number as a numeric adjective.
        if (str.isdigit(v.normal_value[0]) and mc is not None and mc.is_adjective):
            val = 0
            wrapval402 = RefOutArgWrapper(0)
            inoutres403 = Utils.tryParseInt(v.normal_value, wrapval402)
            val = wrapval402.value
            if (inoutres403):
                str0_ = NumberHelper.get_number_adjective(
                    val, gender, (MorphNumber.SINGULAR if num == MorphNumber.SINGULAR or val == 1 else MorphNumber.PLURAL))
                if (str0_ is not None):
                    res = str0_
                    if (v.undef_coef == 0):
                        break
                    max_coef = v.undef_coef
                    continue
        res1 = it.normal_value
        if (num == MorphNumber.SINGULAR):
            # plural-only words get their suppletive singular
            if (res1 == "ДЕТИ"):
                res1 = "РЕБЕНОК"
            elif (res1 == "ЛЮДИ"):
                res1 = "ЧЕЛОВЕК"
        max_coef = v.undef_coef
        if (v.undef_coef > 0):
            res = res1
            continue
        # Score the variant: +1 when it equals the first token's term in
        # nominative singular, +3 when plural was requested and the variant
        # is plural.
        def_co = 0
        if (mc is not None and mc.is_adjective and v.undef_coef == 0):
            pass
        elif (
            ((isinstance(self.begin_token, TextToken)) and res1 == self.begin_token.term and it.case_.is_nominative) and
                it.number == MorphNumber.SINGULAR):
            def_co = 1
        if (num == MorphNumber.PLURAL and ((v.number) & (MorphNumber.PLURAL)) == (MorphNumber.PLURAL)):
            def_co += 3
        if (res is None or def_co > def_coef):
            res = res1
            def_coef = def_co
            if (def_co > 0):
                break
    if (res is not None):
        return self.__corr_chars(res, keep_chars)
    # No variant found: fall back to the first token's normalization and
    # append the text of the remaining tokens.
    if (res is None and self.begin_token == self.end_token):
        res = self.begin_token.get_normal_case_text(
            mc, num, gender, keep_chars)
    elif (res is None):
        res = self.begin_token.get_normal_case_text(
            mc, num, gender, keep_chars)
        if (res is None):
            res = MiscHelper.get_text_value_of_meta_token(
                self, (GetTextAttr.KEEPREGISTER if keep_chars else GetTextAttr.NO))
        else:
            res = "{0} {1}".format(
                res, MiscHelper.get_text_value(
                    self.begin_token.next0_, self.end_token, (GetTextAttr.KEEPREGISTER if keep_chars else GetTextAttr.NO)))
    return Utils.ifNotNull(res, "?")
def __get_name_without_brackets(begin: 'Token', end: 'Token', normalize_first_noun_group: bool = False, normal_first_group_single: bool = False, ignore_geo_referent: bool = False) -> str:
    """Build a name string for the token span [begin, end].

    Args:
        begin(Token): first token of the span
        end(Token): last token of the span
        normalize_first_noun_group(bool): normalize the leading noun group
            (or leading Cyrillic word) before appending the rest
        normal_first_group_single(bool): request singular number for the
            leading noun group
        ignore_geo_referent(bool): forwarded to ProperNameHelper.get_name_ex

    Returns:
        str: the name with trailing '*' and whitespace removed, or None when
            nothing but such characters is left
    """
    res = None
    # Drop the enclosing pair of brackets/quotes, if the span has one.
    if (BracketHelper.can_be_start_of_sequence(begin, False, False) and BracketHelper.can_be_end_of_sequence(
            end, False, begin, False)):
        begin = begin.next0_
        end = end.previous
    if (normalize_first_noun_group and not begin.morph.class0_.is_preposition):
        npt = NounPhraseHelper.try_parse(
            begin, NounPhraseParseAttr.REFERENTCANBENOUN, 0, None)
        if (npt is not None):
            # a bare noun unknown to the dictionary is not worth normalizing
            if (npt.noun.get_morph_class_in_dictionary().is_undefined and len(npt.adjectives) == 0):
                npt = (None)
        # the noun phrase must not run past the end of the span
        if (npt is not None and npt.end_token.end_char > end.end_char):
            npt = (None)
        if (npt is not None):
            res = npt.get_normal_case_text(
                None, (MorphNumber.SINGULAR if normal_first_group_single else MorphNumber.UNDEFINED), MorphGender.UNDEFINED, False)
            te = npt.end_token.next0_
            # ", <verb+adjective word>" right after the noun phrase: append
            # the word agreed with the phrase in gender, case and number.
            if (((te is not None and te.next0_ is not None and te.is_comma) and (isinstance(te.next0_, TextToken)) and te.next0_.end_char <= end.end_char) and te.next0_.morph.class0_.is_verb and te.next0_.morph.class0_.is_adjective):
                for it in te.next0_.morph.items:
                    if (it.gender == npt.morph.gender or ((it.gender) & (npt.morph.gender)) != (MorphGender.UNDEFINED)):
                        if (not (
                                (it.case_) & npt.morph.case_).is_undefined):
                            if (it.number == npt.morph.number or ((it.number) & (npt.morph.number)) != (MorphNumber.UNDEFINED)):
                                var = te.next0_.term
                                if (isinstance(it, MorphWordForm)):
                                    var = it.normal_case
                                bi = MorphBaseInfo._new492(
                                    MorphClass.ADJECTIVE, npt.morph.gender, npt.morph.number, npt.morph.language)
                                var = MorphologyService.get_wordform(
                                    var, bi)
                                if (var is not None):
                                    res = "{0}, {1}".format(res, var)
                                    te = te.next0_.next0_
                                break
            # Append the rest of the span; no space before a non-alphanumeric start.
            if (te is not None and te.end_char <= end.end_char):
                s = ProperNameHelper.get_name_ex(te, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED, MorphGender.UNDEFINED, True, ignore_geo_referent)
                if (not Utils.isNullOrEmpty(s)):
                    if (not
                            str.isalnum(s[0])):
                        res = "{0}{1}".format(res, s)
                    else:
                        res = "{0} {1}".format(res, s)
        elif ((isinstance(begin, TextToken)) and begin.chars.is_cyrillic_letter):
            # No noun phrase: normalize just the first dictionary word.
            mm = begin.get_morph_class_in_dictionary()
            if (not mm.is_undefined):
                res = begin.get_normal_case_text(mm, MorphNumber.UNDEFINED, MorphGender.UNDEFINED, False)
                if (begin.end_char < end.end_char):
                    res = "{0} {1}".format(
                        res, ProperNameHelper.get_name_ex(
                            begin.next0_, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED, MorphGender.UNDEFINED, True, False))
    if (res is None):
        res = ProperNameHelper.get_name_ex(begin, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED, MorphGender.UNDEFINED, True, ignore_geo_referent)
    if (not Utils.isNullOrEmpty(res)):
        # Count trailing '*' and whitespace characters and cut them off.
        k = 0
        i = len(res) - 1
        while i >= 0:
            if (res[i] == '*' or Utils.isWhitespace(res[i])):
                pass
            else:
                break
            i -= 1
            k += 1
        if (k > 0):
            if (k == len(res)):
                return None
            res = res[0:0 + len(res) - k]
    return res
def __init__(self, sofa_: 'SourceOfAnalysis' = None, only_tokenizing: bool = False, lang: 'MorphLang' = None, progress: EventHandler = None) -> None:
    """Create the analysis container and, when a source is given, tokenize it.

    Args:
        sofa_(SourceOfAnalysis): text source; with None only the fields are
            initialized
        only_tokenizing(bool): stop after the token chain is built (no
            morphology refinement, no statistics)
        lang(MorphLang): language passed to the morphology service
        progress(EventHandler): not used by this constructor
    """
    self._start_date = datetime.datetime(1, 1, 1, 0, 0, 0)
    self.corrected_tokens = None
    self.first_token = None
    self.__m_entities = list()
    self.ontology = None
    self.base_language = MorphLang()
    self.__m_sofa = None
    self.statistics = None
    self.__m_datas = dict()
    self.misc_data = dict()
    self.processor = None
    self.recurse_level = 0
    self._m_analyzer_stack = list()
    self.onto_regime = False
    if (sofa_ is None):
        return
    self.__m_sofa = sofa_
    self._start_date = datetime.datetime.now()
    tokens = MorphologyService.process(sofa_.text, lang, None)
    t0 = None
    if (tokens is not None):
        for mt in tokens:
            tt = TextToken(mt, self)
            if (sofa_.correction_dict is not None):
                # A user-supplied correction replaces the token when the
                # corrected word parses as exactly one morph token.
                wrapcorw471 = RefOutArgWrapper(None)
                inoutres472 = Utils.tryGetValue(sofa_.correction_dict, mt.term, wrapcorw471)
                corw = wrapcorw471.value
                if (inoutres472):
                    ccc = MorphologyService.process(corw, lang, None)
                    if (ccc is not None and len(ccc) == 1):
                        tt1 = TextToken._new470(ccc[0], self, tt.begin_char, tt.end_char, tt.term)
                        tt1.chars = tt.chars
                        tt = tt1
                        if (self.corrected_tokens is None):
                            self.corrected_tokens = dict()
                        self.corrected_tokens[tt] = tt.get_source_text()
            # link the token into the chain
            if (t0 is None):
                self.first_token = (tt)
            else:
                t0.next0_ = tt
            t0 = (tt)
    if (sofa_.clear_dust):
        self.__clear_dust()
    if (sofa_.do_words_merging_by_morph):
        self.__correct_words_by_merging(lang)
    if (sofa_.do_word_correction_by_morph):
        self.__correct_words_by_morph(lang)
    self.__merge_letters()
    self.__define_base_language()
    if (sofa_.create_number_tokens):
        t = self.first_token
        while t is not None:
            nt = NumberHelper._try_parse_number(t)
            if (nt is not None):
                self.embed_token(nt)
                t = (nt)
            t = t.next0_
    if (only_tokenizing):
        return
    # For long Cyrillic words unknown to the dictionary, narrow the morph
    # variants using a known neighbouring word (optionally across a comma /
    # "and", preposition or conjunction) with the same two-character ending
    # and an overlapping morph class.
    t = self.first_token
    while t is not None:
        if (not t.morph.class0_.is_preposition):
            mc = t.get_morph_class_in_dictionary()
            if (mc.is_undefined and t.chars.is_cyrillic_letter and t.length_char > 4):
                tail = sofa_.text[t.end_char - 1:t.end_char - 1 + 2]
                tte = None
                tt = t.previous
                if (tt is not None and ((tt.is_comma_and or tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction))):
                    tt = tt.previous
                if ((tt is not None and not tt.get_morph_class_in_dictionary().is_undefined and (((tt.morph.class0_.value) & (t.morph.class0_.value))) != 0) and tt.length_char > 4):
                    tail2 = sofa_.text[tt.end_char - 1:tt.end_char - 1 + 2]
                    if (tail2 == tail):
                        tte = tt
                if (tte is None):
                    tt = t.next0_
                    if (tt is not None and ((tt.is_comma_and or tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction))):
                        tt = tt.next0_
                    if ((tt is not None and not tt.get_morph_class_in_dictionary().is_undefined and (((tt.morph.class0_.value) & (t.morph.class0_.value))) != 0) and tt.length_char > 4):
                        tail2 = sofa_.text[tt.end_char - 1:tt.end_char - 1 + 2]
                        if (tail2 == tail):
                            tte = tt
                if (tte is not None):
                    t.morph.remove_items_ex(
                        tte.morph, tte.get_morph_class_in_dictionary())
        t = t.next0_
    self.__create_statistics()
def get_normal_case_text(self, mc: 'MorphClass' = None, num: 'MorphNumber' = MorphNumber.UNDEFINED, gender: 'MorphGender' = MorphGender.UNDEFINED, keep_chars: bool = False) -> str:
    """Return the normalized form of this text token.

    Args:
        mc(MorphClass): requested part of speech; morph variants of other
            classes are skipped. A preposition class short-circuits to
            LanguageHelper.normalize_preposition.
        num(MorphNumber): requested number
        gender(MorphGender): requested gender; variants of another gender
            are skipped (with exceptions for masculine requests)
        keep_chars(bool): re-apply the token's letter case (all-lower or
            capitalized) to the result

    Returns:
        str: normalized text. None when no word form was usable but some
            variant has a defined case; otherwise falls back to a generated
            word form or the token's term.
    """
    from pullenti.ner.core.MiscHelper import MiscHelper
    empty = True
    if (mc is not None and mc.is_preposition):
        return LanguageHelper.normalize_preposition(self.term)
    for it in self.morph.items:
        if (mc is not None and not mc.is_undefined):
            cc = (it.class0_) & mc
            if (cc.is_undefined):
                continue
            if (cc.is_misc and not cc.is_proper and mc != it.class0_):
                continue
        wf = Utils.asObjectOrNull(it, MorphWordForm)
        normal_full = False
        if (gender != MorphGender.UNDEFINED):
            if (((it.gender) & (gender)) == (MorphGender.UNDEFINED)):
                # Gender mismatch is tolerated for a masculine request when a
                # full normal form exists, and for personal pronouns.
                if ((gender == MorphGender.MASCULINE and ((it.gender != MorphGender.UNDEFINED or it.number == MorphNumber.PLURAL)) and wf is not None) and wf.normal_full is not None):
                    normal_full = True
                elif (gender == MorphGender.MASCULINE and it.class0_.is_personal_pronoun):
                    pass
                else:
                    continue
        if (not it.case_.is_undefined):
            empty = False
        if (wf is not None):
            res = None
            if (num == MorphNumber.SINGULAR and it.number == MorphNumber.PLURAL and wf.normal_full is not None):
                # Keep normal_case when it is normal_full plus a two-letter
                # "СЯ" ending. normal_case may be None (it is guarded with
                # ifNotNull below), so check it before measuring.
                nc = wf.normal_case
                if (nc is not None and len(nc) == (len(wf.normal_full) + 2) and len(nc) > 4 and nc.endswith("СЯ")):
                    res = nc
                else:
                    res = wf.normal_full
            else:
                res = (wf.normal_full if normal_full else (Utils.ifNotNull(wf.normal_case, self.term)))
            if (num == MorphNumber.SINGULAR and mc is not None and mc == MorphClass.NOUN):
                if (res == "ДЕТИ"):
                    res = "РЕБЕНОК"
            if (keep_chars):
                if (self.chars.is_all_lower):
                    res = res.lower()
                elif (self.chars.is_capital_upper):
                    res = MiscHelper.convert_first_char_upper_and_other_lower(
                        res)
            # the first acceptable word form wins
            return res
    if (not empty):
        return None
    te = None
    if (num == MorphNumber.SINGULAR and mc is not None):
        bi = MorphBaseInfo._new492(MorphClass._new53(mc.value), gender, MorphNumber.SINGULAR, self.morph.language)
        vars0_ = MorphologyService.get_wordform(self.term, bi)
        if (vars0_ is not None):
            te = vars0_
    if (te is None):
        te = self.term
    if (keep_chars):
        if (self.chars.is_all_lower):
            return te.lower()
        elif (self.chars.is_capital_upper):
            return MiscHelper.convert_first_char_upper_and_other_lower(te)
    return te
def __try_parse_ru(t: 'Token', can_be_partition: bool, can_be_adj_partition: bool, force_parse: bool) -> 'VerbPhraseToken':
    """Try to parse a Russian verb phrase starting at token t.

    Args:
        t(Token): first token to examine
        can_be_partition(bool): allow participle-like words (and a leading
            preposition) in the phrase
        can_be_adj_partition(bool): allow adjectives to be treated as
            participles
        force_parse(bool): accept a verb/noun homonym as a verb

    Returns:
        VerbPhraseToken: parsed phrase, or None when no verb was found
    """
    res = None
    t0 = t
    not0_ = None
    has_verb = False
    verb_be_before = False
    prep = None
    first_pass3070 = True
    while True:
        if first_pass3070:
            first_pass3070 = False
        else:
            t = t.next0_
        if (not (t is not None)):
            break
        if (not (isinstance(t, TextToken))):
            break
        tt = Utils.asObjectOrNull(t, TextToken)
        is_participle = False
        # remember the negation particle and attach it to the next item
        if (tt.term == "НЕ"):
            not0_ = t
            continue
        # ty: 0 - not part of the phrase, 1 - verb, 2 - adverb,
        # 3 - participle/adjective (normalized as ADJECTIVE below)
        ty = 0
        norm = None
        mc = tt.get_morph_class_in_dictionary()
        if (tt.term == "НЕТ"):
            if (has_verb):
                break
            ty = 1
        elif (tt.term == "ДОПУСТИМО"):
            ty = 3
        elif (mc.is_adverb and not mc.is_verb):
            ty = 2
        elif (tt.is_pure_verb or tt.is_verb_be):
            ty = 1
            # a second finite verb ends the phrase unless it follows "to be"
            if (has_verb):
                if (not tt.morph.contains_attr("инф.", None)):
                    if (verb_be_before):
                        pass
                    else:
                        break
        elif (mc.is_verb):
            # verb homonymous with other parts of speech
            if (mc.is_preposition or mc.is_misc or mc.is_pronoun):
                pass
            elif (mc.is_noun):
                if (tt.term == "СТАЛИ" or tt.term == "СТЕКЛО" or tt.term == "БЫЛИ"):
                    ty = 1
                elif (not tt.chars.is_all_lower and not MiscHelper.can_be_start_of_sentence(tt)):
                    ty = 1
                elif (mc.is_adjective and can_be_partition):
                    ty = 1
                elif (force_parse):
                    ty = 1
            elif (mc.is_proper):
                if (tt.chars.is_all_lower):
                    ty = 1
            else:
                ty = 1
            if (mc.is_adjective):
                is_participle = True
            if (not tt.morph.case_.is_undefined):
                is_participle = True
            if (not can_be_partition and is_participle):
                break
            if (has_verb):
                if (tt.morph.contains_attr("инф.", None)):
                    pass
                elif (not is_participle):
                    pass
                else:
                    break
        elif ((mc.is_adjective and tt.morph.contains_attr("к.ф.", None) and tt.term.endswith("О")) and NounPhraseHelper.try_parse(
                tt, NounPhraseParseAttr.NO, 0, None) is None):
            # short-form adjective ending with "О" that does not start a noun phrase
            ty = 2
        elif (mc.is_adjective and ((can_be_partition or can_be_adj_partition))):
            if (tt.morph.contains_attr("к.ф.", None) and not can_be_adj_partition):
                break
            norm = tt.get_normal_case_text(MorphClass.ADJECTIVE, MorphNumber.SINGULAR, MorphGender.MASCULINE, False)
            if (norm.endswith("ЙШИЙ")):
                pass
            else:
                # Check the derivation groups: the word counts as a participle
                # when a group lists it as adjective+verb and also has a verb.
                grs = DerivateService.find_derivates(norm, True, None)
                if (grs is not None and len(grs) > 0):
                    hverb = False
                    hpart = False
                    for gr in grs:
                        for w in gr.words:
                            if (w.class0_.is_adjective and w.class0_.is_verb):
                                if (w.spelling == norm):
                                    hpart = True
                            elif (w.class0_.is_verb):
                                hverb = True
                    if (hpart and hverb):
                        ty = 3
                    elif (can_be_adj_partition):
                        ty = 3
                    # retry without the group's prefix
                    if (ty != 3 and not Utils.isNullOrEmpty(grs[0].prefix) and norm.startswith(grs[0].prefix)):
                        hverb = False
                        hpart = False
                        norm1 = norm[len(grs[0].prefix):]
                        grs = DerivateService.find_derivates(
                            norm1, True, None)
                        if (grs is not None and len(grs) > 0):
                            for gr in grs:
                                for w in gr.words:
                                    if (w.class0_.is_adjective and w.class0_.is_verb):
                                        if (w.spelling == norm1):
                                            hpart = True
                                    elif (w.class0_.is_verb):
                                        hverb = True
                        if (hpart and hverb):
                            ty = 3
        # a preposition is allowed only at the very start of the phrase
        if (ty == 0 and t == t0 and can_be_partition):
            prep = PrepositionHelper.try_parse(t)
            if (prep is not None):
                t = prep.end_token
                continue
        if (ty == 0):
            break
        if (res is None):
            res = VerbPhraseToken(t0, t)
        res.end_token = t
        it = VerbPhraseItemToken._new603(t, t, MorphCollection(t.morph))
        if (not0_ is not None):
            it.begin_token = not0_
            it.not0_ = True
            not0_ = (None)
        it.is_adverb = ty == 2
        # the first item must agree in case with the leading preposition
        if (prep is not None and not t.morph.case_.is_undefined and len(res.items) == 0):
            if (((prep.next_case) & t.morph.case_).is_undefined):
                return None
            it.morph.remove_items(prep.next_case, False)
            res.preposition = prep
        if (norm is None):
            norm = t.get_normal_case_text(
                (MorphClass.ADJECTIVE if ty == 3 else (MorphClass.ADVERB if ty == 2 else MorphClass.VERB)), MorphNumber.SINGULAR, MorphGender.MASCULINE, False)
            if (ty == 1 and not tt.morph.case_.is_undefined):
                # Verb with a case: build the nominative singular masculine
                # form. The word is looked up with a two-character "КК"
                # prefix, which is cut off the result again.
                mi = MorphWordForm._new604(MorphCase.NOMINATIVE, MorphNumber.SINGULAR, MorphGender.MASCULINE)
                for mit in tt.morph.items:
                    if (isinstance(mit, MorphWordForm)):
                        mi.misc = mit.misc
                        break
                nnn = MorphologyService.get_wordform("КК" + t.term, mi)
                if (nnn is not None):
                    norm = nnn[2:]
        it.normal = norm
        res.items.append(it)
        if (not has_verb and ((ty == 1 or ty == 3))):
            res.morph = it.morph
            has_verb = True
        if (ty == 1 or ty == 3):
            if (ty == 1 and tt.is_verb_be):
                verb_be_before = True
            else:
                verb_be_before = False
    if (not has_verb):
        return None
    # strip trailing adverbs (the first item is never removed)
    for i in range(len(res.items) - 1, 0, -1):
        if (res.items[i].is_adverb):
            del res.items[i]
            res.end_token = res.items[i - 1].end_token
        else:
            break
    return res
def _try_parse(t : 'Token', add_units : 'TerminCollection', second : bool, can_omit_number : bool, can_be_nan : bool) -> 'NumbersWithUnitToken':
    """Try to parse a number (single value, range or +/- deviation) with optional units.

    Args:
        t(Token): token to start from; leading commas / "and" / "НО" are skipped
        add_units(TerminCollection): extra units forwarded to
            UnitToken.try_parse_list
        second(bool): True when parsing the second part of a range (set by
            the recursive call)
        can_omit_number(bool): allow a unit without an explicit number
        can_be_nan(bool): allow a keyword without any number; the value is
            then math.nan

    Returns:
        NumbersWithUnitToken: parsed token or None
    """
    if (t is None):
        return None
    while t is not None:
        if (t.is_comma_and or t.is_value("НО", None)):
            t = t.next0_
        else:
            break
    t0 = t
    about_ = False
    has_keyw = False
    is_diap_keyw = False
    min_max = 0
    wrapmin_max1633 = RefOutArgWrapper(min_max)
    ttt = NumbersWithUnitToken._is_min_or_max(t, wrapmin_max1633)
    min_max = wrapmin_max1633.value
    if (ttt is not None):
        t = ttt.next0_
        if (t is None):
            return None
    if (t is None):
        return None
    # approximation keyword
    if (t.is_char('~') or t.is_value("ОКОЛО", None) or t.is_value("ПРИМЕРНО", None)):
        t = t.next0_
        about_ = True
        has_keyw = True
        if (t is None):
            return None
    if (t.is_value("В", None) and t.next0_ is not None):
        # Both keywords follow the preposition, so both must be tested on the
        # next token (the original tested "ДИАПАЗОН" on "В" itself, which
        # could never match).
        if (t.next0_.is_value("ПРЕДЕЛ", None) or t.next0_.is_value("ДИАПАЗОН", None)):
            t = t.next0_.next0_
            if (t is None):
                return None
            is_diap_keyw = True
    # value enclosed in parentheses
    if (t0.is_char('(')):
        mt0 = NumbersWithUnitToken._try_parse(t.next0_, add_units, False, False, False)
        if (mt0 is not None and mt0.end_token.next0_ is not None and mt0.end_token.next0_.is_char(')')):
            if (second):
                # as a second part only a symmetric +/- range is accepted
                if (mt0.from_val is not None and mt0.to_val is not None and mt0.from_val == (- mt0.to_val)):
                    pass
                else:
                    return None
            mt0.begin_token = t0
            mt0.end_token = mt0.end_token.next0_
            uu = UnitToken.try_parse_list(mt0.end_token.next0_, add_units, False)
            if (uu is not None and len(mt0.units) == 0):
                mt0.units = uu
                mt0.end_token = uu[len(uu) - 1].end_token
            return mt0
    plusminus = False
    unit_before = False
    is_age_ = False
    dty = NumbersWithUnitToken.DiapTyp.UNDEFINED
    whd = None
    uni = None
    # range keyword from the termin collection, or a comparison sign
    tok = (None if NumbersWithUnitToken.M_TERMINS is None else NumbersWithUnitToken.M_TERMINS.try_parse(t, TerminParseAttr.NO))
    if (tok is not None):
        if (tok.end_token.is_value("СТАРШЕ", None) or tok.end_token.is_value("МЛАДШЕ", None)):
            is_age_ = True
        t = tok.end_token.next0_
        dty = (Utils.valToEnum(tok.termin.tag, NumbersWithUnitToken.DiapTyp))
        has_keyw = True
        if (not tok.is_whitespace_after):
            if (t is None):
                return None
            if (isinstance(t, NumberToken)):
                if (tok.begin_token == tok.end_token and not tok.chars.is_all_lower):
                    return None
            elif (t.is_comma and t.next0_ is not None and t.next0_.is_value("ЧЕМ", None)):
                t = t.next0_.next0_
                if (t is not None and t.morph.class0_.is_preposition):
                    t = t.next0_
            elif (t.is_char_of(":,(") or t.is_table_control_char):
                pass
            else:
                return None
        # units in parentheses right after the keyword
        if (t is not None and t.is_char('(')):
            uni = UnitToken.try_parse_list(t.next0_, add_units, False)
            if (uni is not None):
                t = uni[len(uni) - 1].end_token.next0_
                while t is not None:
                    if (t.is_char_of("):")):
                        t = t.next0_
                    else:
                        break
                mt0 = NumbersWithUnitToken._try_parse(t, add_units, False, can_omit_number, False)
                if (mt0 is not None and len(mt0.units) == 0):
                    mt0.begin_token = t0
                    mt0.units = uni
                    return mt0
        whd = NumbersWithUnitToken._try_parsewhl(t)
        if (whd is not None):
            t = whd.end_token.next0_
        elif (t is not None and t.is_value("IP", None)):
            uni = UnitToken.try_parse_list(t, add_units, False)
            if (uni is not None):
                t = uni[len(uni) - 1].end_token.next0_
        if ((t is not None and t.is_hiphen and t.is_whitespace_before) and t.is_whitespace_after):
            t = t.next0_
    elif (t.is_char('<')):
        dty = NumbersWithUnitToken.DiapTyp.LS
        t = t.next0_
        has_keyw = True
        if (t is not None and t.is_char('=')):
            t = t.next0_
            dty = NumbersWithUnitToken.DiapTyp.LE
    elif (t.is_char('>')):
        dty = NumbersWithUnitToken.DiapTyp.GT
        t = t.next0_
        has_keyw = True
        if (t is not None and t.is_char('=')):
            t = t.next0_
            dty = NumbersWithUnitToken.DiapTyp.GE
    elif (t.is_char('≤')):
        dty = NumbersWithUnitToken.DiapTyp.LE
        has_keyw = True
        t = t.next0_
    elif (t.is_char('≥')):
        dty = NumbersWithUnitToken.DiapTyp.GE
        has_keyw = True
        t = t.next0_
    elif (t.is_value("IP", None)):
        uni = UnitToken.try_parse_list(t, add_units, False)
        if (uni is not None):
            t = uni[len(uni) - 1].end_token.next0_
    elif (t.is_value("ЗА", None) and (isinstance(t.next0_, NumberToken))):
        dty = NumbersWithUnitToken.DiapTyp.GE
        t = t.next0_
    while t is not None and ((t.is_char_of(":,") or t.is_value("ЧЕМ", None) or t.is_table_control_char)):
        t = t.next0_
    # sign / separator before the number
    if (t is not None):
        if (t.is_char('+') or t.is_value("ПЛЮС", None)):
            t = t.next0_
            if (t is not None and not t.is_whitespace_before):
                if (t.is_hiphen):
                    t = t.next0_
                    plusminus = True
                elif ((t.is_char_of("\\/") and t.next0_ is not None and not t.is_newline_after) and t.next0_.is_hiphen):
                    t = t.next0_.next0_
                    plusminus = True
        elif (second and ((t.is_char_of("\\/÷…~")))):
            t = t.next0_
        elif ((t.is_hiphen and t == t0 and not second) and NumbersWithUnitToken.M_TERMINS.try_parse(t.next0_, TerminParseAttr.NO) is not None):
            tok = NumbersWithUnitToken.M_TERMINS.try_parse(t.next0_, TerminParseAttr.NO)
            t = tok.end_token.next0_
            dty = (Utils.valToEnum(tok.termin.tag, NumbersWithUnitToken.DiapTyp))
        elif (t.is_hiphen and t == t0 and ((t.is_whitespace_after or second))):
            t = t.next0_
        elif (t.is_char('±')):
            t = t.next0_
            plusminus = True
            has_keyw = True
        elif ((second and t.is_char('.') and t.next0_ is not None) and t.next0_.is_char('.')):
            t = t.next0_.next0_
            if (t is not None and t.is_char('.')):
                t = t.next0_
    num = NumberHelper.try_parse_real_number(t, True, False)
    if (num is None):
        # the unit may precede the number
        uni = UnitToken.try_parse_list(t, add_units, False)
        if (uni is not None):
            unit_before = True
            t = uni[len(uni) - 1].end_token.next0_
            delim = False
            while t is not None:
                if (t.is_char_of(":,")):
                    delim = True
                    t = t.next0_
                elif (t.is_hiphen and t.is_whitespace_after):
                    delim = True
                    t = t.next0_
                else:
                    break
            if (not delim):
                if (t is None):
                    if (has_keyw and can_be_nan):
                        pass
                    else:
                        return None
                elif (not t.is_whitespace_before):
                    return None
                # t may legitimately be None here (keyword + NaN case), so it
                # must be checked before it is dereferenced.
                if (t is not None and t.next0_ is not None and t.is_hiphen and t.is_whitespace_after):
                    delim = True
                    t = t.next0_
            num = NumberHelper.try_parse_real_number(t, True, False)
    res = None
    rval = 0
    if (num is None):
        # no number: special termin, implied "1" for a long unit, or NaN
        tt = NumbersWithUnitToken.M_SPEC.try_parse(t, TerminParseAttr.NO)
        if (tt is not None):
            rval = (tt.termin.tag)
            unam = tt.termin.tag2
            for u in UnitsHelper.UNITS:
                if (u.fullname_cyr == unam):
                    uni = list()
                    uni.append(UnitToken._new1626(t, t, u))
                    break
            if (uni is None):
                return None
            res = NumbersWithUnitToken._new1628(t0, tt.end_token, about_)
            t = tt.end_token.next0_
        else:
            if (not can_omit_number and not has_keyw and not can_be_nan):
                return None
            if ((uni is not None and len(uni) == 1 and uni[0].begin_token == uni[0].end_token) and uni[0].length_char > 3):
                rval = (1)
                res = NumbersWithUnitToken._new1628(t0, uni[len(uni) - 1].end_token, about_)
                t = res.end_token.next0_
            elif (has_keyw and can_be_nan):
                rval = math.nan
                res = NumbersWithUnitToken._new1628(t0, t0, about_)
                if (t is not None):
                    res.end_token = t.previous
                else:
                    # ran off the end of the chain: the token spans to the last one
                    t = t0
                    while t is not None:
                        res.end_token = t
                        t = t.next0_
            else:
                return None
    else:
        # a tight leading hyphen was read as a minus sign: reparse without it
        if ((t == t0 and t0.is_hiphen and not t.is_whitespace_before) and not t.is_whitespace_after and (num.real_value < 0)):
            num = NumberHelper.try_parse_real_number(t.next0_, True, False)
            if (num is None):
                return None
        if (t == t0 and (isinstance(t, NumberToken)) and t.morph.class0_.is_adjective):
            nn = Utils.asObjectOrNull(t.end_token, TextToken)
            if (nn is None):
                return None
            norm = nn.get_normal_case_text(MorphClass.ADJECTIVE, MorphNumber.SINGULAR, MorphGender.UNDEFINED, False)
            if ((norm.endswith("Ь") or norm == "ЧЕТЫРЕ" or norm == "ТРИ") or norm == "ДВА"):
                pass
            else:
                # reject number words that morphology classifies as adjectives
                mi = MorphologyService.get_word_base_info("КОКО" + nn.term, None, False, False)
                if (mi is not None and mi.class0_.is_adjective):
                    return None
        t = num.end_token.next0_
        res = NumbersWithUnitToken._new1628(t0, num.end_token, about_)
        rval = num.real_value
    if (uni is None):
        uni = UnitToken.try_parse_list(t, add_units, False)
        if (uni is not None):
            if ((plusminus and second and len(uni) >= 1) and uni[0].unit == UnitsHelper.UPERCENT):
                # "+/- N %": the percent belongs to the deviation, real units may follow
                res.end_token = uni[0].end_token
                res.plus_minus_percent = True
                tt1 = uni[0].end_token.next0_
                uni = UnitToken.try_parse_list(tt1, add_units, False)
                if (uni is not None):
                    res.units = uni
                    res.end_token = uni[len(uni) - 1].end_token
            else:
                res.units = uni
                res.end_token = uni[len(uni) - 1].end_token
            t = res.end_token.next0_
    else:
        res.units = uni
        if (len(uni) > 1):
            # units given before the number may be split as "<unit> N / M <unit>"
            uni1 = UnitToken.try_parse_list(t, add_units, False)
            if (((uni1 is not None and uni1[0].unit == uni[0].unit and (len(uni1) < len(uni))) and uni[len(uni1)].pow0_ == -1 and uni1[len(uni1) - 1].end_token.next0_ is not None) and uni1[len(uni1) - 1].end_token.next0_.is_char_of("/\\")):
                num2 = NumbersWithUnitToken._try_parse(uni1[len(uni1) - 1].end_token.next0_.next0_, add_units, False, False, False)
                if (num2 is not None and num2.units is not None and num2.units[0].unit == uni[len(uni1)].unit):
                    res.units = uni1
                    res.div_num = num2
                    res.end_token = num2.end_token
    res.whl = whd
    # apply the comparison type found before the number
    if (dty != NumbersWithUnitToken.DiapTyp.UNDEFINED):
        if (dty == NumbersWithUnitToken.DiapTyp.GE or dty == NumbersWithUnitToken.DiapTyp.FROM):
            res.from_include = True
            res.from_val = rval
        elif (dty == NumbersWithUnitToken.DiapTyp.GT):
            res.from_include = False
            res.from_val = rval
        elif (dty == NumbersWithUnitToken.DiapTyp.LE or dty == NumbersWithUnitToken.DiapTyp.TO):
            res.to_include = True
            res.to_val = rval
        elif (dty == NumbersWithUnitToken.DiapTyp.LS):
            res.to_include = False
            res.to_val = rval
    is_second_max = False
    if (not second):
        iii = 0
        wrapiii1632 = RefOutArgWrapper(iii)
        ttt = NumbersWithUnitToken._is_min_or_max(t, wrapiii1632)
        iii = wrapiii1632.value
        if (ttt is not None and iii > 0):
            is_second_max = True
            t = ttt.next0_
    # try to read the second part of a range / deviation
    next0__ = (None if second or plusminus or ((t is not None and ((t.is_table_control_char or t.is_newline_before)))) else NumbersWithUnitToken._try_parse(t, add_units, True, False, can_be_nan))
    if (next0__ is not None and (isinstance(t.previous, NumberToken))):
        if (MeasureHelper.is_mult_char(t.previous.end_token)):
            next0__ = (None)
    if (next0__ is not None and ((next0__.to_val is not None or next0__.single_val is not None)) and next0__.from_val is None):
        # "N +a /-b": asymmetric deviation
        if ((((next0__.begin_token.is_char('+') and next0__.single_val is not None and not math.isnan(next0__.single_val)) and next0__.end_token.next0_ is not None and
                next0__.end_token.next0_.is_char_of("\\/")) and next0__.end_token.next0_.next0_ is not None and next0__.end_token.next0_.next0_.is_hiphen) and not has_keyw and not math.isnan(rval)):
            next2 = NumbersWithUnitToken._try_parse(next0__.end_token.next0_.next0_.next0_, add_units, True, False, False)
            if (next2 is not None and next2.single_val is not None and not math.isnan(next2.single_val)):
                res.from_val = (rval - next2.single_val)
                res.from_include = True
                res.to_val = (rval + next0__.single_val)
                res.to_include = True
                if (next2.units is not None and len(res.units) == 0):
                    res.units = next2.units
                res.end_token = next2.end_token
                return res
        # the units of both parts must be compatible
        if (len(next0__.units) > 0):
            if (len(res.units) == 0):
                res.units = next0__.units
            elif (not UnitToken.can_be_equals(res.units, next0__.units)):
                next0__ = (None)
        elif (len(res.units) > 0 and not unit_before and not next0__.plus_minus_percent):
            next0__ = (None)
        if (next0__ is not None):
            res.end_token = next0__.end_token
        if (next0__ is not None and next0__.to_val is not None):
            res.to_val = next0__.to_val
            res.to_include = next0__.to_include
        elif (next0__ is not None and next0__.single_val is not None):
            if (next0__.begin_token.is_char_of("/\\")):
                res.div_num = next0__
                res.single_val = rval
                return res
            elif (next0__.plus_minus_percent):
                res.single_val = rval
                res.plus_minus = next0__.single_val
                res.plus_minus_percent = True
                res.to_include = True
            else:
                res.to_val = next0__.single_val
                res.to_include = True
        if (next0__ is not None):
            if (res.from_val is None):
                res.from_val = rval
                res.from_include = True
            return res
    elif ((next0__ is not None and next0__.from_val is not None and next0__.to_val is not None) and next0__.to_val == (- next0__.from_val)):
        # second part is a symmetric "+/- x"
        if (len(next0__.units) == 1 and next0__.units[0].unit == UnitsHelper.UPERCENT and len(res.units) > 0):
            res.single_val = rval
            res.plus_minus = next0__.to_val
            res.plus_minus_percent = True
            res.end_token = next0__.end_token
            return res
        if (len(next0__.units) == 0):
            res.single_val = rval
            res.plus_minus = next0__.to_val
            res.end_token = next0__.end_token
            return res
        res.from_val = (next0__.from_val + rval)
        res.from_include = True
        res.to_val = (next0__.to_val + rval)
        res.to_include = True
        res.end_token = next0__.end_token
        if (len(next0__.units) > 0):
            res.units = next0__.units
        return res
    if (dty == NumbersWithUnitToken.DiapTyp.UNDEFINED):
        if (plusminus and ((not res.plus_minus_percent or not second))):
            res.from_include = True
            res.from_val = (- rval)
            res.to_include = True
            res.to_val = rval
        else:
            res.single_val = rval
            res.plus_minus_percent = plusminus
    if (is_age_):
        res.is_age = True
    return res