def get_normal_case_text(self, mc: 'MorphClass'=None, num: 'MorphNumber'=MorphNumber.UNDEFINED, gender: 'MorphGender'=MorphGender.UNDEFINED, keep_chars: bool=False) -> str:
    """Return the text of this noun-phrase meta-token normalized to nominative case.

    NOTE(review): this file reached review with all statements collapsed onto a
    few physical lines; the indentation below is a reconstruction (statement
    order unchanged) — verify nesting against the upstream pullenti sources.

    Args:
        mc: requested morphological class (e.g. adjective); None = no constraint.
        num: requested number; SINGULAR forces singular word forms.
        gender: requested gender, used when rebuilding adjective forms.
        keep_chars: when True, preserve the original character casing.

    Returns:
        The normalized text, or "?" when nothing could be produced.
    """
    # Single referent token: delegate normalization to it directly.
    if ((isinstance(self.begin_token, ReferentToken)) and self.begin_token == self.end_token):
        return self.begin_token.get_normal_case_text(mc, num, gender, keep_chars)
    res = None
    max_coef = 0
    def_coef = -1
    # Scan morphological variants, keeping the best candidate normal form.
    for it in self.morph.items:
        v = Utils.asObjectOrNull(it, NounPhraseItemTextVar)
        if (v is None):
            continue
        # Skip less-certain variants once a better (or definite) one was seen.
        if (v.undef_coef > 0 and (((v.undef_coef < max_coef) or def_coef >= 0))):
            continue
        # Singular requested and the variant carries an explicit singular form.
        if (num == MorphNumber.SINGULAR and v.single_number_value is not None):
            if (mc is not None and ((gender == MorphGender.NEUTER or gender == MorphGender.FEMINIE)) and mc.is_adjective):
                # Rebuild the adjective in the requested gender/nominative case.
                bi = MorphBaseInfo._new401(MorphClass._new53(mc.value), gender, MorphNumber.SINGULAR, MorphCase.NOMINATIVE, self.morph.language)
                str0_ = MorphologyService.get_wordform(v.single_number_value, bi)
                if (str0_ is not None):
                    res = str0_
            else:
                res = v.single_number_value
            if (v.undef_coef == 0):
                break
            max_coef = v.undef_coef
            continue
        if (Utils.isNullOrEmpty(v.normal_value)):
            continue
        # Numeric normal value + adjective class requested: spell the number
        # out as an adjective (e.g. Russian ordinal-like forms).
        if (str.isdigit(v.normal_value[0]) and mc is not None and mc.is_adjective):
            val = 0
            # RefOutArgWrapper emulates a C# out-parameter (converted code).
            wrapval402 = RefOutArgWrapper(0)
            inoutres403 = Utils.tryParseInt(v.normal_value, wrapval402)
            val = wrapval402.value
            if (inoutres403):
                str0_ = NumberHelper.get_number_adjective(val, gender, (MorphNumber.SINGULAR if num == MorphNumber.SINGULAR or val == 1 else MorphNumber.PLURAL))
                if (str0_ is not None):
                    res = str0_
                    if (v.undef_coef == 0):
                        break
                    max_coef = v.undef_coef
                    continue
        res1 = it.normal_value
        # Hard-coded suppletive singulars for Russian plural-only lemmas.
        if (num == MorphNumber.SINGULAR):
            if (res1 == "ДЕТИ"):
                res1 = "РЕБЕНОК"
            elif (res1 == "ЛЮДИ"):
                res1 = "ЧЕЛОВЕК"
        max_coef = v.undef_coef
        if (v.undef_coef > 0):
            res = res1
            continue
        # Score definite variants: prefer ones matching the source term and
        # the requested number.
        def_co = 0
        if (mc is not None and mc.is_adjective and v.undef_coef == 0):
            pass
        elif (((isinstance(self.begin_token, TextToken)) and res1 == self.begin_token.term and it.case_.is_nominative) and it.number == MorphNumber.SINGULAR):
            def_co = 1
        if (num == MorphNumber.PLURAL and ((v.number) & (MorphNumber.PLURAL)) == (MorphNumber.PLURAL)):
            def_co += 3
        if (res is None or def_co > def_coef):
            res = res1
            def_coef = def_co
            if (def_co > 0):
                break
    if (res is not None):
        return self.__corr_chars(res, keep_chars)
    # Fallbacks: no variant matched. Both branches start with the same call;
    # the multi-token branch additionally appends the tail tokens' text.
    if (res is None and self.begin_token == self.end_token):
        res = self.begin_token.get_normal_case_text(mc, num, gender, keep_chars)
    elif (res is None):
        res = self.begin_token.get_normal_case_text(mc, num, gender, keep_chars)
        if (res is None):
            res = MiscHelper.get_text_value_of_meta_token(self, (GetTextAttr.KEEPREGISTER if keep_chars else GetTextAttr.NO))
        else:
            res = "{0} {1}".format(res, MiscHelper.get_text_value(self.begin_token.next0_, self.end_token, (GetTextAttr.KEEPREGISTER if keep_chars else GetTextAttr.NO)))
    return Utils.ifNotNull(res, "?")
def find(self, word: str, try_create: bool, lang_: 'MorphLang') -> typing.List['DerivateGroup']:
    """Find derivation groups for *word* in the dictionary's prefix tree.

    Walks the tree character by character (lazily deserializing nodes on the
    way), filters the groups found by language, then — on a miss — retries
    with heuristic Russian suffix respellings; finally, when try_create is
    set, synthesizes new groups by splitting off a prefix.

    NOTE(review): indentation reconstructed from a whitespace-mangled source;
    statement order unchanged — verify nesting against upstream pullenti.

    Args:
        word: uppercase word to look up.
        try_create: allow creating (and registering) generated groups.
        lang_: language filter; ignored when undefined.

    Returns:
        A list of DerivateGroup, or None when nothing was found.
    """
    if (Utils.isNullOrEmpty(word)):
        return None
    # Descend the prefix tree one character at a time.
    tn = self._m_root
    i = 0
    while i < len(word):
        k = ord(word[i])
        tn1 = None
        if (tn.nodes is None):
            break
        # RefOutArgWrapper emulates a C# out-parameter (converted code).
        wraptn14 = RefOutArgWrapper(None)
        inoutres5 = Utils.tryGetValue(tn.nodes, k, wraptn14)
        tn1 = wraptn14.value
        if (not inoutres5):
            break
        tn = tn1
        # Node still serialized: materialize it before continuing.
        if (tn.lazy_pos > 0):
            pos = tn.lazy_pos
            wrappos3 = RefOutArgWrapper(pos)
            DeserializeHelper.deserialize_tree_node(self.__m_buf, self, tn, True, wrappos3)
            pos = wrappos3.value
            tn.lazy_pos = 0
        i += 1
    # Full match only: a partial descent yields no groups.
    res = (None if i < len(word) else tn.groups)
    li = None
    if (isinstance(res, list)):
        li = list(Utils.asObjectOrNull(res, list))
        gen = False
        nogen = False
        for g in li:
            if (g.is_generated):
                gen = True
            else:
                nogen = True
        # When both kinds are present, drop the generated ones.
        if (gen and nogen):
            for i in range(len(li) - 1, -1, -1):
                if (li[i].is_generated):
                    del li[i]
                else:
                    i = -1  # NOTE(review): no-op in Python (C# loop-break artifact) — confirm intended semantics upstream
    elif (isinstance(res, DerivateGroup)):
        li = list()
        li.append(Utils.asObjectOrNull(res, DerivateGroup))
    # Language filtering (reverse iteration so deletion is safe).
    if (li is not None and lang_ is not None and not lang_.is_undefined):
        for i in range(len(li) - 1, -1, -1):
            if (not li[i].contains_word(word, lang_)):
                del li[i]
            else:
                i = -1  # NOTE(review): no-op in Python (C# loop-break artifact) — confirm intended semantics upstream
    if (li is not None and len(li) > 0):
        return li
    if (len(word) < 4):
        return None
    # Heuristic suffix respellings over the last characters of the word.
    ch0 = word[len(word) - 1]
    ch1 = word[len(word) - 2]
    ch2 = word[len(word) - 3]
    if (ch0 == 'О' or ((ch0 == 'И' and ch1 == 'К'))):
        # Adverb-like ending: retry as adjective ("-ИЙ"/"-ЫЙ"/"-СКИЙ").
        word1 = word[0:0+len(word) - 1]
        li = self.find(word1 + "ИЙ", False, lang_)
        if ((li) is not None):
            return li
        li = self.find(word1 + "ЫЙ", False, lang_)
        if ((li) is not None):
            return li
        if (ch0 == 'О' and ch1 == 'Н'):
            li = self.find(word1 + "СКИЙ", False, lang_)
            if ((li) is not None):
                return li
    elif (((ch0 == 'Я' or ch0 == 'Ь')) and ((word[len(word) - 2] == 'С'))):
        # Reflexive verb ("-СЯ"/"-СЬ"): retry without the postfix.
        word1 = word[0:0+len(word) - 2]
        if (word1 == "ЯТЬ"):
            return None
        li = self.find(word1, False, lang_)
        if ((li) is not None):
            return li
    elif (ch0 == 'Е' and ch1 == 'Ь'):
        # "-ЬЕ" noun: retry as "-ИЕ".
        word1 = word[0:0+len(word) - 2] + "ИЕ"
        li = self.find(word1, False, lang_)
        if ((li) is not None):
            return li
    elif (ch0 == 'Й' and ch2 == 'Н' and try_create):
        # Single/double "Н" before adjective ending: toggle the doubling.
        ch3 = word[len(word) - 4]
        word1 = None
        if (ch3 != 'Н'):
            if (LanguageHelper.is_cyrillic_vowel(ch3)):
                word1 = (word[0:0+len(word) - 3] + "Н" + word[len(word) - 3:])
        else:
            word1 = (word[0:0+len(word) - 4] + word[len(word) - 3:])
        if (word1 is not None):
            li = self.find(word1, False, lang_)
            if ((li) is not None):
                return li
    if (ch0 == 'Й' and ch1 == 'О'):
        word2 = word[0:0+len(word) - 2]
        li = self.find(word2 + "ИЙ", False, lang_)
        if ((li) is not None):
            return li
        li = self.find(word2 + "ЫЙ", False, lang_)
        if ((li) is not None):
            return li
    if (not try_create):
        return None
    # Last resort: strip a prefix of growing length; when the rest is known,
    # generate prefixed copies of its groups and register them.
    len0_ = len(word) - 4
    i = 1
    first_pass2883 = True
    while True:
        # first_pass flag emulates a C# for-loop increment with continue.
        if first_pass2883: first_pass2883 = False
        else: i += 1
        if (not (i <= len0_)): break
        rest = word[i:]
        li1 = self.find(rest, False, lang_)
        if (li1 is None):
            continue
        pref = word[0:0+i]
        gen = list()
        for dg in li1:
            if (not dg.is_dummy and not dg.is_generated):
                if (dg.not_generate):
                    if (len(rest) < 5):
                        continue
                gg = dg.create_by_prefix(pref, lang_)
                if (gg is not None):
                    gen.append(gg)
                    self.add(gg)
        if (len(gen) == 0):
            return None
        return gen
    return None
def __tryParse(t: 'Token', is_in_lit: bool, max_char: int=0) -> typing.List['ReferentToken']:
    """Try to parse a bibliographic reference (book link) starting at token *t*.

    Accumulates a confidence score (coef) while recognizing, in order: an
    optional leading number, the author region, the title, and the trailing
    imprint region (year, pages, publisher, URL...). Returns the referent
    tokens for the reference and the link to it, or None when confidence
    stays below threshold.

    NOTE(review): this function reached review collapsed onto a few physical
    lines; the indentation below is a reconstruction (statement order is
    unchanged) — verify the nesting against the upstream pullenti sources.

    Args:
        t: first token of the candidate reference.
        is_in_lit: True when inside a bibliography ("literature") section,
            which relaxes several checks and boosts the score.
        max_char: hard right boundary (0 = unlimited).

    Returns:
        List of ReferentToken (BookLinkRefReferent link + BookLinkReferent),
        or None.
    """
    if (t is None):
        return None
    # "(Author, pages)" style inline citation.
    is_bracket_regime = False
    if (t.previous is not None and t.previous.isChar('(')):
        is_bracket_regime = True
    blt = BookLinkToken.tryParse(t, 0)
    if (blt is None):
        blt = BookLinkToken.tryParseAuthor(t, FioTemplateType.UNDEFINED)
    if (blt is None and not is_bracket_regime):
        return None
    t0 = t
    coef = 0
    is_electr_res = False
    decree = None
    regtyp = BookLinkAnalyzer.RegionTyp.UNDEFINED
    num = None
    spec_see = None
    book_prev = None
    if (is_bracket_regime):
        regtyp = BookLinkAnalyzer.RegionTyp.AUTHORS
    elif (blt.typ == BookLinkTyp.PERSON):
        if (not is_in_lit):
            return None
        regtyp = BookLinkAnalyzer.RegionTyp.AUTHORS
    elif (blt.typ == BookLinkTyp.NUMBER):
        # Numbered list entry ("12. Author..."): validate what follows the number.
        num = blt.value
        t = blt.end_token.next0_
        if (t is None or t.is_newline_before):
            return None
        if (not t.is_whitespace_before):
            if (isinstance(t, NumberToken)):
                n = (t).value
                if ((((n == "3" or n == "0")) and not t.is_whitespace_after and (isinstance(t.next0_, TextToken))) and t.next0_.chars.is_all_lower):
                    pass
                else:
                    return None
            elif (not ((isinstance(t, TextToken))) or t.chars.is_all_lower):
                r = t.getReferent()
                if (isinstance(r, PersonReferent)):
                    pass
                elif (is_in_lit and r is not None and r.type_name == "DECREE"):
                    pass
                else:
                    return None
        # Skip "see"/"ibid." ("см."/"там же") prefixes after the number.
        first_pass2757 = True
        while True:
            # first_pass flag emulates a C# for-loop increment with continue.
            if first_pass2757: first_pass2757 = False
            else: t = t.next0_
            if (not (t is not None)): break
            if (isinstance(t, NumberToken)):
                break
            if (not ((isinstance(t, TextToken)))):
                break
            if (BracketHelper.canBeStartOfSequence(t, True, False)):
                break
            if (not t.chars.is_letter):
                continue
            bbb = BookLinkToken.tryParse(t, 0)
            if (bbb is not None):
                if (bbb.typ == BookLinkTyp.TAMZE):
                    spec_see = bbb
                    t = bbb.end_token.next0_
                    break
                if (bbb.typ == BookLinkTyp.SEE):
                    t = bbb.end_token
                    continue
            break
        # "Ibid.": find the previously parsed book to reuse it.
        if (spec_see is not None and spec_see.typ == BookLinkTyp.TAMZE):
            coef += 1
            max0_ = 1000
            tt = t0
            while tt is not None and max0_ > 0:
                if (isinstance(tt.getReferent(), BookLinkRefReferent)):
                    book_prev = (tt.getReferent()).book
                    break
                tt = tt.previous
                max0_ -= 1
        blt1 = BookLinkToken.tryParseAuthor(t, FioTemplateType.UNDEFINED)
        if (blt1 is not None and blt1.typ == BookLinkTyp.PERSON):
            regtyp = BookLinkAnalyzer.RegionTyp.AUTHORS
        else:
            # No author right after the number: look ahead on this line for
            # evidence this is still a reference (decree, e-resource, editors...).
            ok = False
            tt = t
            first_pass2758 = True
            while True:
                if first_pass2758: first_pass2758 = False
                else: tt = (None if tt is None else tt.next0_)
                if (not (tt is not None)): break
                if (tt.is_newline_before):
                    break
                if (is_in_lit and tt.getReferent() is not None and tt.getReferent().type_name == "DECREE"):
                    ok = True
                    decree = tt
                    break
                bbb = BookLinkToken.tryParse(tt, 0)
                if (bbb is None):
                    continue
                if (bbb.typ == BookLinkTyp.ELECTRONRES):
                    is_electr_res = True
                    ok = True
                    break
                if (bbb.typ == BookLinkTyp.DELIMETER):
                    tt = bbb.end_token.next0_
                    if (BookLinkToken.tryParseAuthor(tt, FioTemplateType.UNDEFINED) is not None):
                        ok = True
                        break
                    bbb = BookLinkToken.tryParse(tt, 0)
                    if (bbb is not None):
                        if (bbb.typ == BookLinkTyp.EDITORS or bbb.typ == BookLinkTyp.TRANSLATE or bbb.typ == BookLinkTyp.SOSTAVITEL):
                            ok = True
                            break
            if (not ok and not is_in_lit):
                if (BookLinkToken.checkLinkBefore(t0, num)):
                    pass
                else:
                    return None
            regtyp = BookLinkAnalyzer.RegionTyp.NAME
    else:
        return None
    res = BookLinkReferent()
    corr_authors = list()
    t00 = t
    blt00 = None
    start_of_name = None
    prev_pers_templ = FioTemplateType.UNDEFINED
    # ---- Author region: collect person tokens separated by punctuation. ----
    if (regtyp == BookLinkAnalyzer.RegionTyp.AUTHORS):
        first_pass2759 = True
        while True:
            if first_pass2759: first_pass2759 = False
            else: t = t.next0_
            if (not (t is not None)): break
            if (max_char > 0 and t.begin_char >= max_char):
                break
            if (t.isCharOf(".;") or t.is_comma_and):
                continue
            if (t.isChar('/')):
                break
            # "(EDS)" marker terminates the author region.
            if ((t.isChar('(') and t.next0_ is not None and t.next0_.isValue("EDS", None)) and t.next0_.next0_ is not None and t.next0_.next0_.isChar(')')):
                t = t.next0_.next0_.next0_
                break
            blt = BookLinkToken.tryParseAuthor(t, prev_pers_templ)
            if (blt is None and t.previous is not None and t.previous.is_and):
                blt = BookLinkToken.tryParseAuthor(t.previous, FioTemplateType.UNDEFINED)
            if (blt is None):
                # Organization followed by a year may act as a corporate author.
                if ((isinstance(t.getReferent(), OrganizationReferent)) and blt00 is not None):
                    bbb2 = BookLinkToken.tryParse(t.next0_, 0)
                    if (bbb2 is not None):
                        if (bbb2.typ == BookLinkTyp.YEAR):
                            res.addSlot(BookLinkReferent.ATTR_AUTHOR, t.getReferent(), False, 0)
                            res.year = int(bbb2.value)
                            coef += .5
                            t = bbb2.end_token.next0_
                break
            if (blt.typ == BookLinkTyp.PERSON):
                # Year right after the person ("Smith, 2001"): absorb it.
                tt2 = blt.end_token.next0_
                bbb2 = BookLinkToken.tryParse(tt2, 0)
                if (bbb2 is not None):
                    if (bbb2.typ == BookLinkTyp.YEAR):
                        res.year = int(bbb2.value)
                        coef += .5
                        blt.end_token = bbb2.end_token
                        blt00 = (None)
                # Sanity checks that the next person really continues the list.
                if (blt00 is not None and ((blt00.end_token.next0_ == blt.begin_token or blt.begin_token.previous.isChar('.')))):
                    tt11 = blt.end_token.next0_
                    nex = BookLinkToken.tryParse(tt11, 0)
                    if (nex is not None and nex.typ == BookLinkTyp.ANDOTHERS):
                        pass
                    else:
                        if (tt11 is None):
                            break
                        if (tt11.isChar('/') and tt11.next0_ is not None and tt11.next0_.isChar('/')):
                            break
                        if (tt11.isChar(':')):
                            break
                        if ((str(blt).find('.') < 0) and str(blt00).find('.') > 0):
                            break
                        if ((isinstance(tt11, TextToken)) and tt11.chars.is_all_lower):
                            break
                        if (tt11.isCharOf(",.;") and tt11.next0_ is not None):
                            tt11 = tt11.next0_
                        nex = BookLinkToken.tryParse(tt11, 0)
                        if (nex is not None and nex.typ != BookLinkTyp.PERSON and nex.typ != BookLinkTyp.ANDOTHERS):
                            break
                elif ((blt00 is not None and blt00.person_template != FioTemplateType.UNDEFINED and blt.person_template != blt00.person_template) and blt.person_template == FioTemplateType.NAMESURNAME):
                    # Template switch is only trusted when another author follows.
                    if (blt.end_token.next0_ is None or not blt.end_token.next0_.is_comma_and):
                        break
                    if (BookLinkToken.tryParseAuthor(blt.end_token.next0_.next0_, FioTemplateType.UNDEFINED) is not None):
                        pass
                    else:
                        break
                if (blt00 is None and blt.person_template == FioTemplateType.NAMESURNAME):
                    tt = blt.end_token.next0_
                    if (tt is not None and tt.is_hiphen):
                        tt = tt.next0_
                    if (isinstance(tt, NumberToken)):
                        break
                BookLinkAnalyzer.__addAuthor(res, blt)
                coef += 1
                t = blt.end_token
                if (isinstance(t.getReferent(), PersonReferent)):
                    corr_authors.append(Utils.asObjectOrNull(t, ReferentToken))
                blt00 = blt
                prev_pers_templ = blt.person_template
                start_of_name = blt.start_of_name
                if ((start_of_name) is not None):
                    t = t.next0_
                    break
                continue
            if (blt.typ == BookLinkTyp.ANDOTHERS):
                coef += .5
                t = blt.end_token.next0_
                res.authors_and_other = True
                break
            break
    if (t is None):
        return None
    if ((t.is_newline_before and t != t0 and num is None) and res.findSlot(BookLinkReferent.ATTR_AUTHOR, None, True) is None):
        return None
    if (start_of_name is None):
        if (t.chars.is_all_lower):
            coef -= (1)
        if (t.chars.is_latin_letter and not is_electr_res and num is None):
            if (res.getSlotValue(BookLinkReferent.ATTR_AUTHOR) is None):
                return None
    # ---- Title region: find its token span [tn0..tn1]. ----
    tn0 = t
    tn1 = None
    uri = None
    next_num = None
    # Pre-compute the expected number of the next list entry.
    wrapnn393 = RefOutArgWrapper(0)
    inoutres394 = Utils.tryParseInt(Utils.ifNotNull(num, ""), wrapnn393)
    nn = wrapnn393.value
    if (inoutres394):
        next_num = str((nn + 1))
    br = (BracketHelper.tryParse(t, Utils.valToEnum((BracketParseAttr.CANCONTAINSVERBS) | (BracketParseAttr.CANBEMANYLINES), BracketParseAttr), 100) if BracketHelper.canBeStartOfSequence(t, True, False) else None)
    if (br is not None):
        t = t.next0_
    pages = None
    first_pass2760 = True
    while True:
        if first_pass2760: first_pass2760 = False
        else: t = t.next0_
        if (not (t is not None)): break
        if (max_char > 0 and t.begin_char >= max_char):
            break
        if (br is not None and br.end_token == t):
            tn1 = t
            break
        tit = TitleItemToken.tryAttach(t)
        if (tit is not None):
            # Title-type word followed by a quoted name ("монография «...»").
            if ((tit.typ == TitleItemToken.Types.TYP and tn0 == t and br is None) and BracketHelper.canBeStartOfSequence(tit.end_token.next0_, True, False)):
                br = BracketHelper.tryParse(tit.end_token.next0_, BracketParseAttr.NO, 100)
                if (br is not None):
                    coef += (1)
                    if (num is not None):
                        coef += 1
                    tn0 = br.begin_token
                    tn1 = br.end_token
                    res.typ = tit.value.lower()
                    t = br.end_token.next0_
                    break
        # Newline inside the title: decide whether the title continues.
        if (t.is_newline_before and t != tn0):
            if (br is not None and (t.end_char < br.end_char)):
                pass
            elif (not MiscHelper.canBeStartOfSentence(t)):
                pass
            else:
                if (t.newlines_before_count > 1):
                    break
                if ((isinstance(t, NumberToken)) and num is not None and (t).int_value is not None):
                    if (num == str(((t).int_value - 1))):
                        break
                elif (num is not None):
                    pass
                else:
                    nnn = NounPhraseHelper.tryParse(t.previous, Utils.valToEnum(((NounPhraseParseAttr.PARSEPREPOSITION) | (NounPhraseParseAttr.PARSEADVERBS) | (NounPhraseParseAttr.PARSENUMERICASADJECTIVE)) | (NounPhraseParseAttr.MULTILINES), NounPhraseParseAttr), 0)
                    if (nnn is not None and nnn.end_char >= t.end_char):
                        pass
                    else:
                        break
        # Terminator punctuation: look ahead to decide if the title ends here.
        if (t.isCharOf(".;") and t.whitespaces_after_count > 0):
            tit = TitleItemToken.tryAttach(t.next0_)
            if ((tit) is not None):
                if (tit.typ == TitleItemToken.Types.TYP):
                    break
            stop = True
            words = 0
            notwords = 0
            tt = t.next0_
            first_pass2761 = True
            while True:
                if first_pass2761: first_pass2761 = False
                else: tt = tt.next0_
                if (not (tt is not None)): break
                blt0 = BookLinkToken.tryParse(tt, 0)
                if (blt0 is None):
                    if (tt.is_newline_before):
                        break
                    if ((isinstance(tt, TextToken)) and not tt.getMorphClassInDictionary().is_undefined):
                        words += 1
                    else:
                        notwords += 1
                    if (words > 6 and words > (notwords * 4)):
                        stop = False
                        break
                    continue
                if ((blt0.typ == BookLinkTyp.DELIMETER or blt0.typ == BookLinkTyp.TRANSLATE or blt0.typ == BookLinkTyp.TYPE) or blt0.typ == BookLinkTyp.GEO or blt0.typ == BookLinkTyp.PRESS):
                    stop = False
                break
            if (br is not None and br.end_token.previous.end_char > t.end_char):
                stop = False
            if (stop):
                break
        if (t == decree):
            t = t.next0_
            break
        blt = BookLinkToken.tryParse(t, 0)
        if (blt is None):
            tn1 = t
            continue
        if (blt.typ == BookLinkTyp.DELIMETER):
            break
        if (((blt.typ == BookLinkTyp.MISC or blt.typ == BookLinkTyp.TRANSLATE or blt.typ == BookLinkTyp.NAMETAIL) or blt.typ == BookLinkTyp.TYPE or blt.typ == BookLinkTyp.VOLUME) or blt.typ == BookLinkTyp.PAGERANGE or blt.typ == BookLinkTyp.PAGES):
            coef += 1
            break
        if (blt.typ == BookLinkTyp.GEO or blt.typ == BookLinkTyp.PRESS):
            if (t.previous.is_hiphen or t.previous.isCharOf(".;") or blt.add_coef > 0):
                break
        if (blt.typ == BookLinkTyp.YEAR):
            if (t.previous is not None and t.previous.is_comma):
                break
        if (blt.typ == BookLinkTyp.ELECTRONRES):
            is_electr_res = True
            break
        if (blt.typ == BookLinkTyp.URL):
            if (t == tn0 or t.previous.isCharOf(":.")):
                is_electr_res = True
                break
        tn1 = t
    # ---- No title found: try the special cases (URL-only, decree, ibid). ----
    if (tn1 is None and start_of_name is None):
        if (is_electr_res):
            # URL-only electronic resource entry.
            uri_re = BookLinkReferent()
            rt0 = ReferentToken(uri_re, t00, t)
            rts0 = list()
            bref0 = BookLinkRefReferent._new389(uri_re)
            if (num is not None):
                bref0.number = num
            rt01 = ReferentToken(bref0, t0, rt0.end_token)
            ok = False
            while t is not None:
                if (t.is_newline_before):
                    break
                blt0 = BookLinkToken.tryParse(t, 0)
                if (blt0 is not None):
                    if (isinstance(blt0.ref, UriReferent)):
                        uri_re.addSlot(BookLinkReferent.ATTR_URL, Utils.asObjectOrNull(blt0.ref, UriReferent), False, 0)
                        ok = True
                    t = blt0.end_token
                rt0.end_token = rt01.end_token = t
                t = t.next0_
            if (ok):
                rts0.append(rt01)
                rts0.append(rt0)
                return rts0
        if (decree is not None and num is not None):
            # Numbered entry pointing at a decree referent.
            rts0 = list()
            bref0 = BookLinkRefReferent._new389(decree.getReferent())
            if (num is not None):
                bref0.number = num
            rt01 = ReferentToken(bref0, t0, decree)
            t = decree.next0_
            while t is not None:
                if (t.is_newline_before):
                    break
                if (isinstance(t, TextToken)):
                    if ((t).is_pure_verb):
                        return None
                rt01.end_token = t
                t = t.next0_
            rts0.append(rt01)
            return rts0
        if (book_prev is not None):
            # "Ibid., pp. N-M": link to the previous book with new pages.
            tt = t
            while tt is not None and ((tt.isCharOf(",.") or tt.is_hiphen)):
                tt = tt.next0_
            blt0 = BookLinkToken.tryParse(tt, 0)
            if (blt0 is not None and blt0.typ == BookLinkTyp.PAGERANGE):
                rts0 = list()
                bref0 = BookLinkRefReferent._new389(book_prev)
                if (num is not None):
                    bref0.number = num
                bref0.pages = blt0.value
                rt00 = ReferentToken(bref0, t0, blt0.end_token)
                rts0.append(rt00)
                return rts0
        return None
    # Strip surrounding quote brackets from the title span.
    if (br is not None and ((tn1 == br.end_token or tn1 == br.end_token.previous))):
        tn0 = tn0.next0_
        tn1 = tn1.previous
    if (start_of_name is None):
        # Trim leading/trailing punctuation from the title span.
        while tn0 is not None:
            if (tn0.isCharOf(":,~")):
                tn0 = tn0.next0_
            else:
                break
        while tn1 is not None and tn1.begin_char > tn0.begin_char:
            if (tn1.isCharOf(".;,:(~") or tn1.is_hiphen or tn1.isValue("РЕД", None)):
                pass
            else:
                break
            tn1 = tn1.previous
    nam = MiscHelper.getTextValue(tn0, tn1, Utils.valToEnum((GetTextAttr.KEEPQUOTES) | (GetTextAttr.KEEPREGISTER), GetTextAttr))
    if (start_of_name is not None):
        if (nam is None or (len(nam) < 3)):
            nam = start_of_name
        else:
            nam = "{0}{1}{2}".format(start_of_name, (" " if tn0.is_whitespace_before else ""), nam)
    if (nam is None):
        return None
    res.name = nam
    # Score penalties for implausible titles.
    if (num is None and not is_in_lit):
        if (len(nam) < 20):
            return None
        coef -= (2)
    if (len(nam) > 500):
        coef -= (math.floor(len(nam) / 500))
    if (is_bracket_regime):
        coef -= 1
    if (len(nam) > 200):
        if (num is None):
            return None
        if (res.findSlot(BookLinkReferent.ATTR_AUTHOR, None, True) is None and not BookLinkToken.checkLinkBefore(t0, num)):
            return None
    # ---- Language detection over the title tokens. ----
    en = 0
    ru = 0
    ua = 0
    cha = 0
    nocha = 0
    chalen = 0
    lt0 = tn0
    lt1 = tn1
    if (tn1 is None):
        if (t is None):
            return None
        lt0 = t0
        lt1 = t
        tn1 = t.previous
    tt = lt0
    while tt is not None and tt.end_char <= lt1.end_char:
        if ((isinstance(tt, TextToken)) and tt.chars.is_letter):
            if (tt.chars.is_latin_letter):
                en += 1
            elif (tt.morph.language.is_ua):
                ua += 1
            elif (tt.morph.language.is_ru):
                ru += 1
            if (tt.length_char > 2):
                cha += 1
                chalen += tt.length_char
        elif (not ((isinstance(tt, ReferentToken)))):
            nocha += 1
        tt = tt.next0_
    if (ru > (ua + en)):
        res.lang = "RU"
    elif (ua > (ru + en)):
        res.lang = "UA"
    elif (en > (ru + ua)):
        res.lang = "EN"
    # Too much non-letter noise inside the title: penalize.
    if (nocha > 3 and nocha > cha and start_of_name is None):
        if (nocha > (math.floor(chalen / 3))):
            coef -= (2)
    # English titles: a comma followed by a capitalized word usually starts
    # the imprint — shorten the title to the part before it.
    if (res.lang == "EN"):
        tt = tn0.next0_
        first_pass2762 = True
        while True:
            if first_pass2762: first_pass2762 = False
            else: tt = tt.next0_
            if (not (tt is not None and (tt.end_char < tn1.end_char))): break
            if (tt.is_comma and tt.next0_ is not None and ((not tt.next0_.chars.is_all_lower or (isinstance(tt.next0_, ReferentToken))))):
                if (tt.next0_.next0_ is not None and tt.next0_.next0_.is_comma_and):
                    if (isinstance(tt.next0_, ReferentToken)):
                        pass
                    else:
                        continue
                nam = MiscHelper.getTextValue(tn0, tt.previous, Utils.valToEnum((GetTextAttr.KEEPQUOTES) | (GetTextAttr.KEEPREGISTER), GetTextAttr))
                if (nam is not None and len(nam) > 15):
                    res.name = nam
                break
    # ---- Imprint region: year, pages, publisher, URL, editors... ----
    rt = ReferentToken(res, t00, tn1)
    authors = True
    edits = False
    br = (None)
    first_pass2763 = True
    while True:
        if first_pass2763: first_pass2763 = False
        else: t = t.next0_
        if (not (t is not None)): break
        if (max_char > 0 and t.begin_char >= max_char):
            break
        if (BracketHelper.canBeStartOfSequence(t, False, False)):
            br = BracketHelper.tryParse(t, BracketParseAttr.CANBEMANYLINES, 100)
            if (br is not None and br.length_char > 300):
                br = (None)
        blt = BookLinkToken.tryParse(t, 0)
        # Newline: decide whether the reference continues on the next line.
        if (t.is_newline_before and not t.isChar('/') and not t.previous.isChar('/')):
            if (blt is not None and blt.typ == BookLinkTyp.NUMBER):
                break
            if (t.previous.isCharOf(":")):
                pass
            elif (blt is not None and ((((blt.typ == BookLinkTyp.DELIMETER or blt.typ == BookLinkTyp.PAGERANGE or blt.typ == BookLinkTyp.PAGES) or blt.typ == BookLinkTyp.GEO or blt.typ == BookLinkTyp.PRESS) or blt.typ == BookLinkTyp.N))):
                pass
            elif (num is not None and BookLinkToken.tryParseAuthor(t, FioTemplateType.UNDEFINED) is not None):
                pass
            elif (num is not None and blt is not None and blt.typ != BookLinkTyp.NUMBER):
                pass
            elif (br is not None and (t.end_char < br.end_char) and t.begin_char > br.begin_char):
                pass
            else:
                # Look ahead for the next numbered entry or imprint tokens.
                ok = False
                mmm = 50
                tt = t.next0_
                while tt is not None and mmm > 0:
                    if (tt.is_newline_before):
                        blt2 = BookLinkToken.tryParse(tt, 0)
                        if (blt2 is not None and blt2.typ == BookLinkTyp.NUMBER and blt2.value == next_num):
                            ok = True
                            break
                        if (blt2 is not None):
                            if (blt2.typ == BookLinkTyp.PAGES or blt2.typ == BookLinkTyp.GEO or blt2.typ == BookLinkTyp.PRESS):
                                ok = True
                                break
                    tt = tt.next0_
                    mmm -= 1
                if (not ok):
                    npt = NounPhraseHelper.tryParse(t.previous, Utils.valToEnum(((NounPhraseParseAttr.MULTILINES) | (NounPhraseParseAttr.PARSEADVERBS) | (NounPhraseParseAttr.PARSEPREPOSITION)) | (NounPhraseParseAttr.PARSEVERBS) | (NounPhraseParseAttr.PARSEPRONOUNS), NounPhraseParseAttr), 0)
                    if (npt is not None and npt.end_char >= t.end_char):
                        ok = True
                if (not ok):
                    break
        rt.end_token = t
        if (blt is not None):
            rt.end_token = blt.end_token
        if (t.isCharOf(".,") or t.is_hiphen):
            continue
        if (t.isValue("С", None)):
            pass
        if (regtyp == BookLinkAnalyzer.RegionTyp.FIRST and blt is not None and blt.typ == BookLinkTyp.EDITORS):
            edits = True
            t = blt.end_token
            coef += 1
            continue
        if (regtyp == BookLinkAnalyzer.RegionTyp.FIRST and blt is not None and blt.typ == BookLinkTyp.SOSTAVITEL):
            edits = False
            t = blt.end_token
            coef += 1
            continue
        if (regtyp == BookLinkAnalyzer.RegionTyp.FIRST and authors):
            blt2 = BookLinkToken.tryParseAuthor(t, prev_pers_templ)
            if (blt2 is not None and blt2.typ == BookLinkTyp.PERSON):
                prev_pers_templ = blt2.person_template
                if (not edits):
                    BookLinkAnalyzer.__addAuthor(res, blt2)
                coef += 1
                t = blt2.end_token
                continue
            if (blt2 is not None and blt2.typ == BookLinkTyp.ANDOTHERS):
                if (not edits):
                    res.authors_and_other = True
                coef += 1
                t = blt2.end_token
                continue
            authors = False
        if (blt is None):
            continue
        if (blt.typ == BookLinkTyp.ELECTRONRES or blt.typ == BookLinkTyp.URL):
            is_electr_res = True
            if (blt.typ == BookLinkTyp.ELECTRONRES):
                coef += 1.5
            else:
                coef += .5
            if (isinstance(blt.ref, UriReferent)):
                res.addSlot(BookLinkReferent.ATTR_URL, Utils.asObjectOrNull(blt.ref, UriReferent), False, 0)
        elif (blt.typ == BookLinkTyp.YEAR):
            if (res.year == 0):
                res.year = int(blt.value)
                coef += .5
        elif (blt.typ == BookLinkTyp.DELIMETER):
            coef += 1
            if (blt.length_char == 2):
                regtyp = BookLinkAnalyzer.RegionTyp.SECOND
            else:
                regtyp = BookLinkAnalyzer.RegionTyp.FIRST
        elif ((((blt.typ == BookLinkTyp.MISC or blt.typ == BookLinkTyp.TYPE or blt.typ == BookLinkTyp.PAGES) or blt.typ == BookLinkTyp.NAMETAIL or blt.typ == BookLinkTyp.TRANSLATE) or blt.typ == BookLinkTyp.PRESS or blt.typ == BookLinkTyp.VOLUME) or blt.typ == BookLinkTyp.N):
            coef += 1
        elif (blt.typ == BookLinkTyp.PAGERANGE):
            pages = blt
            coef += 1
            # Closing bracket right after pages in "(Author ... p. N)" style.
            if (is_bracket_regime and blt.end_token.next0_ is not None and blt.end_token.next0_.isChar(')')):
                coef += (2)
                if (res.name is not None and res.findSlot(BookLinkReferent.ATTR_AUTHOR, None, True) is not None):
                    coef = (10)
        elif (blt.typ == BookLinkTyp.GEO and ((regtyp == BookLinkAnalyzer.RegionTyp.SECOND or regtyp == BookLinkAnalyzer.RegionTyp.FIRST))):
            coef += 1
        elif (blt.typ == BookLinkTyp.GEO and t.previous is not None and t.previous.isChar('.')):
            coef += 1
        elif (blt.typ == BookLinkTyp.ANDOTHERS):
            coef += 1
            if (authors):
                res.authors_and_other = True
        coef += blt.add_coef
        t = blt.end_token
    # ---- Final confidence decision. ----
    if ((coef < 2.5) and num is not None):
        if (BookLinkToken.checkLinkBefore(t0, num)):
            coef += (2)
        elif (BookLinkToken.checkLinkAfter(rt.end_token, num)):
            coef += (1)
    if (rt.length_char > 500):
        return None
    if (is_in_lit):
        coef += 1
    if (coef < 2.5):
        if (is_electr_res and uri is not None):
            pass
        elif (coef >= 2 and is_in_lit):
            pass
        else:
            return None
    # Correct author persons whose surname was glued into initials.
    for rr in corr_authors:
        pits0 = PersonItemToken.tryAttachList(rr.begin_token, None, PersonItemToken.ParseAttr.CANINITIALBEDIGIT, 10)
        if (pits0 is None or (len(pits0) < 2)):
            continue
        if (pits0[0].typ == PersonItemToken.ItemType.VALUE):
            exi = False
            for i in range(len(rr.referent.slots) - 1, -1, -1):
                s = rr.referent.slots[i]
                if (s.type_name == PersonReferent.ATTR_LASTNAME):
                    ln = Utils.asObjectOrNull(s.value, str)
                    if (ln is None):
                        continue
                    if (ln == pits0[0].value):
                        exi = True
                        continue
                    if (ln.find('-') > 0):
                        ln = ln[0:0+ln.find('-')]
                    if (pits0[0].begin_token.isValue(ln, None)):
                        del rr.referent.slots[i]
            if (not exi):
                rr.referent.addSlot(PersonReferent.ATTR_LASTNAME, pits0[0].value, False, 0)
    rts = list()
    bref = BookLinkRefReferent._new389(res)
    if (num is not None):
        bref.number = num
    rt1 = ReferentToken(bref, t0, rt.end_token)
    if (pages is not None):
        if (pages.value is not None):
            bref.pages = pages.value
        rt.end_token = pages.begin_token.previous
    rts.append(rt1)
    rts.append(rt)
    return rts
def try_attach(self, t: 'Token', for_ontology: bool=False) -> 'ReferentToken':
    """Try to attach a denomination (product/model designation like "АК-47",
    "C++") starting at token *t*.

    Glues together adjacent (non-whitespace-separated) short letter, digit
    and special-symbol tokens, then validates the assembled string.

    NOTE(review): indentation reconstructed from a whitespace-mangled source;
    statement order unchanged — verify nesting against upstream pullenti.

    Args:
        t: first candidate token.
        for_ontology: relax validation when attaching ontology entries.

    Returns:
        A ReferentToken wrapping a new DenominationReferent, or None.
    """
    if (t is None):
        return None
    rt0 = self.__try_attach_spec(t)
    if (rt0 is not None):
        return rt0
    # Lowercase start is accepted only in the "ab12"-glued-to-number shape.
    if (t.chars.is_all_lower):
        if (not t.is_whitespace_after and (isinstance(t.next0_, NumberToken))):
            if (t.previous is None or t.is_whitespace_before or t.previous.is_char_of(",:")):
                pass
            else:
                return None
        else:
            return None
    tmp = io.StringIO()
    t1 = t
    hiph = False
    ok = True
    nums = 0
    chars = 0
    # Accumulate the glued tail tokens after t into tmp.
    w = t1.next0_
    first_pass3148 = True
    while True:
        # first_pass flag emulates a C# for-loop increment with continue.
        if first_pass3148: first_pass3148 = False
        else: w = w.next0_
        if (not (w is not None)): break
        if (w.is_whitespace_before and not for_ontology):
            break
        # Separators are normalized to '-'.
        if (w.is_char_of("/\\_") or w.is_hiphen):
            hiph = True
            print('-', end="", file=tmp)
            continue
        hiph = False
        nt = Utils.asObjectOrNull(w, NumberToken)
        if (nt is not None):
            if (nt.typ != NumberSpellingType.DIGIT):
                break
            t1 = (nt)
            print(nt.get_source_text(), end="", file=tmp)
            nums += 1
            continue
        tt = Utils.asObjectOrNull(w, TextToken)
        if (tt is None):
            break
        if (tt.length_char > 3):
            ok = False
            break
        if (not str.isalpha(tt.term[0])):
            if (tt.is_char_of(",:") or BracketHelper.can_be_end_of_sequence(tt, False, None, False)):
                break
            # Only a short whitelist of special symbols may appear inside.
            if (not tt.is_char_of("+*&^#@!")):
                ok = False
                break
        chars += 1
        t1 = (tt)
        print(tt.get_source_text(), end="", file=tmp)
    if (not for_ontology):
        # Validation: non-empty, short, no trailing '!', no broken hyphen,
        # and at least one glued token beyond the first.
        if ((tmp.tell() < 1) or not ok or hiph):
            return None
        if (tmp.tell() > 12):
            return None
        last = Utils.getCharAtStringIO(tmp, tmp.tell() - 1)
        if (last == '!'):
            return None
        if ((nums + chars) == 0):
            return None
    if (not self.__check_attach(t, t1)):
        return None
    new_dr = DenominationReferent()
    new_dr._add_value(t, t1)
    return ReferentToken(new_dr, t, t1)
def org0_(self) -> 'OrganizationReferent':
    """Organization bound to this referent (ATTR_ORG slot), or None."""
    slot_val = self.get_slot_value(TitlePageReferent.ATTR_ORG)
    return Utils.asObjectOrNull(slot_val, OrganizationReferent)
def url(self) -> 'UriReferent':
    """URL of the referenced resource (ATTR_URL slot), or None."""
    raw = self.get_slot_value(BookLinkReferent.ATTR_URL)
    return Utils.asObjectOrNull(raw, UriReferent)
def url(self) -> 'UriReferent':
    """URL of the referenced resource (ATTR_URL slot), or None."""
    stored = self.getSlotValue(BookLinkReferent.ATTR_URL)
    return Utils.asObjectOrNull(stored, UriReferent)
def __tryNounName(li: typing.List['CityItemToken'], oi: 'IntOntologyItem', always: bool) -> 'ReferentToken':
    """Try to build a city GeoReferent from a NOUN/MISC item ("город",
    "село", "поселок", ...) followed by a proper name.

    Args:
        li: parsed city items; li[0] must be NOUN or MISC, li[1..] the name part.
        oi: out-parameter; oi.value receives the matched ontology item (or None).
        always: when True, attach even if the plausibility checks failed.

    Returns:
        A ReferentToken over the matched span, or None.

    NOTE(review): re-indented from a collapsed one-line source; the nesting of
    a few statements (e.g. around oi.value checks and the slot-upload loop)
    was inferred from token order — confirm against the original file.
    """
    oi.value = (None)
    if (li is None or (len(li) < 2) or ((li[0].typ != CityItemToken.ItemType.NOUN and li[0].typ != CityItemToken.ItemType.MISC))): 
        return None
    ok = not li[0].doubtful
    if (ok and li[0].typ == CityItemToken.ItemType.MISC): 
        ok = False
    typ = (None if li[0].typ == CityItemToken.ItemType.MISC else li[0].value)
    typ2 = (None if li[0].typ == CityItemToken.ItemType.MISC else li[0].alt_value)
    prob_adj = None
    i1 = 1
    org0_ = None
    # Double noun like "поселок станция X": fold the second noun into typ2.
    if ((typ is not None and li[i1].typ == CityItemToken.ItemType.NOUN and ((i1 + 1) < len(li))) and li[0].whitespaces_after_count <= 1 and (((LanguageHelper.endsWith(typ, "ПОСЕЛОК") or LanguageHelper.endsWith(typ, "СЕЛИЩЕ") or typ == "ДЕРЕВНЯ") or typ == "СЕЛО"))): 
        if (li[i1].begin_token == li[i1].end_token): 
            ooo = AddressItemToken.tryAttachOrg(li[i1].begin_token)
            if (ooo is not None and ooo.ref_token is not None): 
                return None
        typ2 = li[i1].value
        if (typ2 == "СТАНЦИЯ" and li[i1].begin_token.isValue("СТ", None) and ((i1 + 1) < len(li))): 
            # "ст." may abbreviate "старая/старый/старое" — pick the variant
            # agreeing with the following item's morphology.
            m = li[i1 + 1].morph
            if (m.number == MorphNumber.PLURAL): 
                prob_adj = "СТАРЫЕ"
            elif (m.gender == MorphGender.FEMINIE): 
                prob_adj = "СТАРАЯ"
            elif (m.gender == MorphGender.MASCULINE): 
                prob_adj = "СТАРЫЙ"
            else: 
                prob_adj = "СТАРОЕ"
        i1 += 1
    name = Utils.ifNotNull(li[i1].value, ((None if li[i1].onto_item is None else li[i1].onto_item.canonic_text)))
    alt_name = li[i1].alt_value
    if (name is None): 
        return None
    mc = li[0].morph
    if (i1 == 1 and li[i1].typ == CityItemToken.ItemType.CITY and ((li[0].value == "ГОРОД" or li[0].value == "МІСТО" or li[0].typ == CityItemToken.ItemType.MISC))): 
        # "город <known city>" case.
        if (typ is None and ((i1 + 1) < len(li)) and li[i1 + 1].typ == CityItemToken.ItemType.NOUN): 
            return None
        oi.value = li[i1].onto_item
        if (oi.value is not None): 
            name = oi.value.canonic_text
        if (len(name) > 2 or oi.value.misc_attr is not None): 
            if (not li[1].doubtful or ((oi.value is not None and oi.value.misc_attr is not None))): 
                ok = True
            elif (not ok and not li[1].is_newline_before): 
                # Doubtful name: accept only with supporting geo/street/date context.
                if (li[0].geo_object_before or li[1].geo_object_after): 
                    ok = True
                elif (StreetDefineHelper.checkStreetAfter(li[1].end_token.next0_)): 
                    ok = True
                elif (li[1].end_token.next0_ is not None and (isinstance(li[1].end_token.next0_.getReferent(), DateReferent))): 
                    ok = True
                elif ((li[1].whitespaces_before_count < 2) and li[1].onto_item is not None): 
                    if (li[1].is_newline_after): 
                        ok = True
        if (li[1].doubtful and li[1].end_token.next0_ is not None and li[1].end_token.chars == li[1].end_token.next0_.chars): 
            ok = False
        if (li[0].begin_token.previous is not None and li[0].begin_token.previous.isValue("В", None)): 
            ok = True
        if (not ok): 
            ok = CityAttachHelper.checkYearAfter(li[1].end_token.next0_)
        if (not ok): 
            ok = CityAttachHelper.checkCityAfter(li[1].end_token.next0_)
    elif ((li[i1].typ == CityItemToken.ItemType.PROPERNAME or li[i1].typ == CityItemToken.ItemType.CITY)): 
        if (((li[0].value == "АДМИНИСТРАЦИЯ" or li[0].value == "АДМІНІСТРАЦІЯ")) and i1 == 1): 
            return None
        if (li[i1].is_newline_before): 
            if (len(li) != 2): 
                return None
        if (not li[0].doubtful): 
            ok = True
        if (len(name) < 2): 
            ok = False
        elif ((len(name) < 3) and li[0].morph.number != MorphNumber.SINGULAR): 
            ok = False
        if (li[i1].doubtful and not li[i1].geo_object_after and not li[0].geo_object_before): 
            if (li[i1].morph.case_.is_genitive): 
                # Genitive doubtful names need geo context on either side
                # (or a house number after) to be accepted.
                if (((li[0].begin_token.previous is None or MiscLocationHelper.checkGeoObjectBefore(li[0].begin_token))) and ((li[i1].end_token.next0_ is None or MiscLocationHelper.checkGeoObjectAfter(li[i1].end_token.next0_) or AddressItemToken.checkHouseAfter(li[i1].end_token.next0_, False, True)))): 
                    pass
                else: 
                    ok = False
            else: 
                # Reject when the span looks like a person/person-property.
                rt0 = li[i1].kit.processReferent("PERSONPROPERTY", li[0].begin_token.previous)
                if (rt0 is not None): 
                    rt1 = li[i1].kit.processReferent("PERSON", li[i1].begin_token)
                    if (rt1 is not None): 
                        ok = False
        npt = NounPhraseHelper.tryParse(li[i1].begin_token, NounPhraseParseAttr.NO, 0)
        if (npt is not None): 
            if (npt.end_token.end_char > li[i1].end_char and len(npt.adjectives) > 0 and not npt.adjectives[0].end_token.next0_.is_comma): 
                ok = False
            elif (TerrItemToken._m_unknown_regions.tryParse(npt.end_token, TerminParseAttr.FULLWORDSONLY) is not None): 
                # Name looks like an unknown region term: require a non-city
                # geo referent adjacent on one of the sides.
                ok1 = False
                if (li[0].begin_token.previous is not None): 
                    ttt = li[0].begin_token.previous
                    if (ttt.is_comma and ttt.previous is not None): 
                        ttt = ttt.previous
                    geo_ = Utils.asObjectOrNull(ttt.getReferent(), GeoReferent)
                    if (geo_ is not None and not geo_.is_city): 
                        ok1 = True
                if (npt.end_token.next0_ is not None): 
                    ttt = npt.end_token.next0_
                    if (ttt.is_comma and ttt.next0_ is not None): 
                        ttt = ttt.next0_
                    geo_ = Utils.asObjectOrNull(ttt.getReferent(), GeoReferent)
                    if (geo_ is not None and not geo_.is_city): 
                        ok1 = True
                if (not ok1): 
                    return None
        if (li[0].value == "ПОРТ"): 
            if (li[i1].chars.is_all_upper or li[i1].chars.is_latin_letter): 
                return None
        elif (li[0].geo_object_before): 
            ok = True
        elif (li[i1].geo_object_after and not li[i1].is_newline_after): 
            ok = True
        else: 
            ok = CityAttachHelper.checkYearAfter(li[i1].end_token.next0_)
            if (not ok): 
                ok = CityAttachHelper.checkStreetAfter(li[i1].end_token.next0_)
            if (not ok and li[0].begin_token.previous is not None and li[0].begin_token.previous.isValue("В", None)): 
                ok = True
    else: 
        return None
    if (not ok and not always): 
        if (MiscLocationHelper.checkNearBefore(li[0].begin_token.previous) is None): 
            return None
    # Drop any trailing items beyond the consumed name.
    if (len(li) > (i1 + 1)): 
        del li[i1 + 1:i1 + 1 + len(li) - i1 - 1]
    city = GeoReferent()
    if (oi.value is not None and oi.value.referent is not None): 
        # Clone the ontology referent instead of building from scratch.
        city = (Utils.asObjectOrNull(oi.value.referent.clone(), GeoReferent))
        city.occurrence.clear()
    if (not li[0].morph.case_.is_undefined and li[0].morph.gender != MorphGender.UNDEFINED): 
        # Single adjective name: re-generate it agreeing with the noun's
        # case and gender.
        if (li[i1].end_token.morph.class0_.is_adjective and li[i1].begin_token == li[i1].end_token): 
            nam = ProperNameHelper.getNameEx(li[i1].begin_token, li[i1].end_token, MorphClass.ADJECTIVE, li[0].morph.case_, li[0].morph.gender, False, False)
            if (nam is not None and nam != name): 
                name = nam
    if (li[0].morph.case_.is_nominative): 
        if (alt_name is not None): 
            city._addName(alt_name)
        alt_name = (None)
    city._addName(name)
    if (prob_adj is not None): 
        city._addName(prob_adj + " " + name)
    if (alt_name is not None): 
        city._addName(alt_name)
        if (prob_adj is not None): 
            city._addName(prob_adj + " " + alt_name)
    if (typ is not None): 
        city._addTyp(typ)
    elif (not city.is_city): 
        city._addTypCity(li[0].kit.base_language)
    if (typ2 is not None): 
        city._addTyp(typ2.lower())
    if (li[0].higher_geo is not None and GeoOwnerHelper.canBeHigher(li[0].higher_geo, city)): 
        city.higher = li[0].higher_geo
    if (li[0].typ == CityItemToken.ItemType.MISC): 
        del li[0]
    res = ReferentToken._new719(city, li[0].begin_token, li[len(li) - 1].end_token, mc)
    # Absorb a glued "-N" suffix (small number) into every name slot.
    if (res.end_token.next0_ is not None and res.end_token.next0_.is_hiphen and (isinstance(res.end_token.next0_.next0_, NumberToken))): 
        num = Utils.asObjectOrNull(res.end_token.next0_.next0_, NumberToken)
        if ((num.typ == NumberSpellingType.DIGIT and not num.morph.class0_.is_adjective and num.int_value is not None) and (num.int_value < 50)): 
            for s in city.slots: 
                if (s.type_name == GeoReferent.ATTR_NAME): 
                    city.uploadSlot(s, "{0}-{1}".format(s.value, num.value))
            res.end_token = num
    if (li[0].begin_token == li[0].end_token and li[0].begin_token.isValue("ГОРОДОК", None)): 
        if (AddressItemToken.checkHouseAfter(res.end_token.next0_, True, False)): 
            return None
    return res
def __tryNameExist(li: typing.List['CityItemToken'], oi: 'IntOntologyItem', always: bool) -> 'ReferentToken':
    """Check some special cases where a bare CITY item (no noun before it)
    can still be attached: well-known names, suffix patterns, and
    surrounding geo context.

    Args:
        li: parsed city items; li[0] must be of type CITY.
        oi: out-parameter; oi.value receives li[0]'s ontology item (or None).
        always: when True, force attachment unless a person match cancels it.

    Returns:
        A ReferentToken over li[0], or None.

    NOTE(review): re-indented from a collapsed one-line source; placement of
    the ok-validation inside the backward scan loop was inferred from the
    trailing break tokens — confirm against the original file.
    """
    oi.value = (None)
    if (li is None or li[0].typ != CityItemToken.ItemType.CITY): 
        return None
    oi.value = li[0].onto_item
    tt = Utils.asObjectOrNull(li[0].begin_token, TextToken)
    if (tt is None): 
        return None
    ok = False
    nam = (li[0].value if oi.value is None else oi.value.canonic_text)
    if (nam is None): 
        return None
    if (nam == "РИМ"): 
        # "РИМ"/"в РИМЕ" is accepted unless followed by what looks like a
        # proper second name (person context).
        if (tt.term == "РИМ"): 
            if ((isinstance(tt.next0_, TextToken)) and tt.next0_.getMorphClassInDictionary().is_proper_secname): 
                pass
            else: 
                ok = True
        elif (tt.previous is not None and tt.previous.isValue("В", None) and tt.term == "РИМЕ"): 
            ok = True
    elif (oi.value is not None and oi.value.referent is not None and oi.value.owner.is_ext_ontology): 
        ok = True
    elif (nam.endswith("ГРАД") or nam.endswith("СК")): 
        ok = True
    elif (nam.endswith("TOWN") or nam.startswith("SAN")): 
        ok = True
    elif (li[0].chars.is_latin_letter and li[0].begin_token.previous is not None and ((li[0].begin_token.previous.isValue("IN", None) or li[0].begin_token.previous.isValue("FROM", None)))): 
        ok = True
    else: 
        # Scan forward over punctuation/prepositions for a geo referent in
        # the same script (cyrillic/latin) as the candidate.
        tt2 = li[0].end_token.next0_
        first_pass2890 = True
        while True:
            if first_pass2890: first_pass2890 = False
            else: tt2 = tt2.next0_
            if (not (tt2 is not None)): break
            if (tt2.is_newline_before): 
                break
            if ((tt2.isCharOf(",(") or tt2.morph.class0_.is_preposition or tt2.morph.class0_.is_conjunction) or tt2.morph.class0_.is_misc): 
                continue
            if ((isinstance(tt2.getReferent(), GeoReferent)) and tt2.chars.is_cyrillic_letter == li[0].chars.is_cyrillic_letter): 
                ok = True
            break
        if (not ok): 
            # Same scan backward; on the first significant token also run
            # the street/morphology/org sanity checks on the candidate.
            tt2 = li[0].begin_token.previous
            first_pass2891 = True
            while True:
                if first_pass2891: first_pass2891 = False
                else: tt2 = tt2.previous
                if (not (tt2 is not None)): break
                if (tt2.is_newline_after): 
                    break
                if ((tt2.isCharOf(",)") or tt2.morph.class0_.is_preposition or tt2.morph.class0_.is_conjunction) or tt2.morph.class0_.is_misc): 
                    continue
                if ((isinstance(tt2.getReferent(), GeoReferent)) and tt2.chars.is_cyrillic_letter == li[0].chars.is_cyrillic_letter): 
                    ok = True
                if (ok): 
                    # Cancel when the candidate actually starts a street description.
                    sits = StreetItemToken.tryParseList(li[0].begin_token, None, 10)
                    if (sits is not None and len(sits) > 1): 
                        ss = StreetDefineHelper._tryParseStreet(sits, False, False)
                        if (ss is not None): 
                            del sits[0]
                            if (StreetDefineHelper._tryParseStreet(sits, False, False) is None): 
                                ok = False
                if (ok): 
                    if (len(li) > 1 and li[1].typ == CityItemToken.ItemType.PROPERNAME and (li[1].whitespaces_before_count < 3)): 
                        ok = False
                else: 
                    mc = li[0].begin_token.getMorphClassInDictionary()
                    if (mc.is_proper_name or mc.is_proper_surname or mc.is_adjective): 
                        ok = False
                    else: 
                        npt = NounPhraseHelper.tryParse(li[0].begin_token, NounPhraseParseAttr.NO, 0)
                        if (npt is not None and npt.end_char > li[0].end_char): 
                            ok = False
                if (AddressItemToken.tryAttachOrg(li[0].begin_token) is not None): 
                    ok = False
                    break
                break
    if (always): 
        # Even a forced attachment yields to a person interpretation of a
        # doubtful surname-like token after a big whitespace gap.
        if (li[0].whitespaces_before_count > 3 and li[0].doubtful and li[0].begin_token.getMorphClassInDictionary().is_proper_surname): 
            pp = li[0].kit.processReferent("PERSON", li[0].begin_token)
            if (pp is not None): 
                always = False
    if (li[0].begin_token.chars.is_latin_letter and li[0].begin_token == li[0].end_token): 
        # Reject "City, XY" patterns where XY is a short latin code.
        tt1 = li[0].end_token.next0_
        if (tt1 is not None and tt1.isChar(',')): 
            tt1 = tt1.next0_
        if (((isinstance(tt1, TextToken)) and tt1.chars.is_latin_letter and (tt1.length_char < 3)) and not tt1.chars.is_all_lower): 
            ok = False
    if (not ok and not always): 
        return None
    city = None
    if (oi.value is not None and (isinstance(oi.value.referent, GeoReferent)) and not oi.value.owner.is_ext_ontology): 
        city = (Utils.asObjectOrNull(oi.value.referent, GeoReferent))
    else: 
        city = GeoReferent()
        city._addName(nam)
        if (oi.value is not None and (isinstance(oi.value.referent, GeoReferent))): 
            city._mergeSlots2(Utils.asObjectOrNull(oi.value.referent, GeoReferent), li[0].kit.base_language)
    if (not city.is_city): 
        city._addTypCity(li[0].kit.base_language)
    return ReferentToken._new719(city, li[0].begin_token, li[0].end_token, li[0].morph)
def try_attach_org(t: 'Token', can_be_cyr: bool = False) -> 'ReferentToken':
    """Try to parse an (mostly English/latin) organization name ending with a
    legal-form item ("Ltd", "Inc", "bank", ...) starting at token ``t``.

    Args:
        t: first token of the candidate span (may be an opening '(').
        can_be_cyr: when True, cyrillic tokens are also allowed in the name.

    Returns:
        A ReferentToken wrapping a new OrganizationReferent, or None.

    NOTE(review): re-indented from a collapsed one-line source; nesting was
    inferred from token order — confirm against the original file.
    """
    from pullenti.ner.org.internal.OrgItemNameToken import OrgItemNameToken
    if (t is None): 
        return None
    br = False
    if (t.is_char('(') and t.next0_ is not None): 
        t = t.next0_
        br = True
    if (isinstance(t, NumberToken)): 
        # A number may only start the name when spelled in words, adjective-like
        # and capitalized (e.g. "First ...").
        if (t.typ == NumberSpellingType.WORDS and t.morph.class0_.is_adjective and t.chars.is_capital_upper): 
            pass
        else: 
            return None
    else: 
        if (t.chars.is_all_lower): 
            return None
        if ((t.length_char < 3) and not t.chars.is_letter): 
            return None
        if (not t.chars.is_latin_letter): 
            if (not can_be_cyr or not t.chars.is_cyrillic_letter): 
                return None
    t0 = t
    t1 = t0
    nam_wo = 0
    tok = None
    geo_ = None
    add_typ = None
    first_pass3312 = True
    # Scan forward collecting name words until a legal-form token is found.
    while True:
        if first_pass3312: first_pass3312 = False
        else: t = t.next0_
        if (not (t is not None)): break
        if (t != t0 and t.whitespaces_before_count > 1): 
            break
        if (t.is_char(')')): 
            break
        if (t.is_char('(') and t.next0_ is not None): 
            # Parenthesized inserts: a geo referent, an org type, or a single
            # capitalized word are consumed; anything else stops the scan.
            if ((isinstance(t.next0_.get_referent(), GeoReferent)) and t.next0_.next0_ is not None and t.next0_.next0_.is_char(')')): 
                geo_ = (Utils.asObjectOrNull(t.next0_.get_referent(), GeoReferent))
                t = t.next0_.next0_
                continue
            typ = OrgItemTypeToken.try_attach(t.next0_, True, None)
            if ((typ is not None and typ.end_token.next0_ is not None and typ.end_token.next0_.is_char(')')) and typ.chars.is_latin_letter): 
                add_typ = typ
                t = typ.end_token.next0_
                continue
            if (((isinstance(t.next0_, TextToken)) and t.next0_.next0_ is not None and t.next0_.next0_.is_char(')')) and t.next0_.chars.is_capital_upper): 
                t = t.next0_.next0_
                t1 = t
                continue
            break
        tok = OrgItemEngItem.try_attach(t, can_be_cyr)
        if (tok is None and t.is_char_of(".,") and t.next0_ is not None): 
            # Retry just past separating punctuation.
            tok = OrgItemEngItem.try_attach(t.next0_, can_be_cyr)
            if (tok is None and t.next0_.is_char_of(",.")): 
                tok = OrgItemEngItem.try_attach(t.next0_.next0_, can_be_cyr)
        if (tok is not None): 
            if (tok.length_char == 1 and t0.chars.is_cyrillic_letter): 
                return None
            break
        if (t.is_hiphen and not t.is_whitespace_after and not t.is_whitespace_before): 
            continue
        if (t.is_char_of("&+") or t.is_and): 
            continue
        if (t.is_char('.')): 
            if (t.previous is not None and t.previous.length_char == 1): 
                continue
            elif (MiscHelper.can_be_start_of_sentence(t.next0_)): 
                break
        if (not t.chars.is_latin_letter): 
            if (not can_be_cyr or not t.chars.is_cyrillic_letter): 
                break
        if (t.chars.is_all_lower): 
            if (t.morph.class0_.is_preposition or t.morph.class0_.is_conjunction): 
                continue
            if (br): 
                continue
            break
        mc = t.get_morph_class_in_dictionary()
        if (mc.is_verb): 
            if (t.next0_ is not None and t.next0_.morph.class0_.is_preposition): 
                break
        if (t.next0_ is not None and t.next0_.is_value("OF", None)): 
            break
        if (isinstance(t, TextToken)): 
            nam_wo += 1
        t1 = t
    if (tok is None): 
        return None
    if (t0 == tok.begin_token): 
        # Legal form came first; the actual name must follow in brackets.
        br2 = BracketHelper.try_parse(tok.end_token.next0_, BracketParseAttr.NO, 100)
        if (br2 is not None): 
            org1 = OrganizationReferent()
            if (tok.short_value is not None): 
                org1.add_type_str(tok.short_value)
            org1.add_type_str(tok.full_value)
            nam1 = MiscHelper.get_text_value(br2.begin_token, br2.end_token, GetTextAttr.NO)
            if (nam1 is not None): 
                org1.add_name(nam1, True, None)
            return ReferentToken(org1, t0, br2.end_token)
        return None
    org0_ = OrganizationReferent()
    te = tok.end_token
    if (tok.is_bank): 
        t1 = tok.end_token
    if (tok.full_value == "company" and (tok.whitespaces_after_count < 3)): 
        # "... Company Ltd": prefer the following, more specific form item.
        tok1 = OrgItemEngItem.try_attach(tok.end_token.next0_, can_be_cyr)
        if (tok1 is not None): 
            t1 = tok.end_token
            tok = tok1
            te = tok.end_token
    if (tok.full_value == "company"): 
        if (nam_wo == 0): 
            return None
    nam = MiscHelper.get_text_value(t0, t1, GetTextAttr.IGNOREARTICLES)
    if (nam == "STOCK" and tok.full_value == "company"): 
        return None
    alt_nam = None
    if (Utils.isNullOrEmpty(nam)): 
        return None
    if (nam.find('(') > 0): 
        # Keep the full string as an alternative name and strip the
        # parenthesized part from the main one.
        i1 = nam.find('(')
        i2 = nam.find(')')
        if (i1 < i2): 
            alt_nam = nam
            tai = None
            if ((i2 + 1) < len(nam)): 
                tai = nam[i2:].strip()
            nam = nam[0:0 + i1].strip()
            if (tai is not None): 
                nam = "{0} {1}".format(nam, tai)
    if (tok.is_bank): 
        org0_.add_type_str(("bank" if tok.kit.base_language.is_en else "банк"))
        org0_.add_profile(OrgProfile.FINANCE)
        if ((t1.next0_ is not None and t1.next0_.is_value("OF", None) and t1.next0_.next0_ is not None) and t1.next0_.next0_.chars.is_latin_letter): 
            # "Bank of X": extend the name over the OF-phrase.
            nam0 = OrgItemNameToken.try_attach(t1.next0_, None, False, False)
            if (nam0 is not None): 
                te = nam0.end_token
            else: 
                te = t1.next0_.next0_
            nam = MiscHelper.get_text_value(t0, te, GetTextAttr.NO)
            if (isinstance(te.get_referent(), GeoReferent)): 
                org0_._add_geo_object(Utils.asObjectOrNull(te.get_referent(), GeoReferent))
        elif (t0 == t1): 
            return None
    else: 
        if (tok.short_value is not None): 
            org0_.add_type_str(tok.short_value)
        org0_.add_type_str(tok.full_value)
    if (Utils.isNullOrEmpty(nam)): 
        return None
    org0_.add_name(nam, True, None)
    if (alt_nam is not None): 
        org0_.add_name(alt_nam, True, None)
    res = ReferentToken(org0_, t0, te)
    t = te
    while t.next0_ is not None: 
        if (t.next0_.is_char_of(",.")): 
            t = t.next0_
        else: 
            break
    if (t.whitespaces_after_count < 2): 
        # A second legal-form item right after ("..., Inc.") extends the span.
        tok = OrgItemEngItem.try_attach(t.next0_, can_be_cyr)
        if (tok is not None): 
            if (tok.short_value is not None): 
                org0_.add_type_str(tok.short_value)
            org0_.add_type_str(tok.full_value)
            res.end_token = tok.end_token
    if (geo_ is not None): 
        org0_._add_geo_object(geo_)
    if (add_typ is not None): 
        org0_.add_type(add_typ, False)
    if (not br): 
        return res
    # The whole candidate was parenthesized: require the closing ')'.
    t = res.end_token
    if (t.next0_ is None or t.next0_.is_char(')')): 
        res.end_token = t.next0_
    else: 
        return None
    return res
def tryParse(t: 'Token', loc_onto: 'IntOntologyCollection') -> 'NamedItemToken':
    """Try to parse a named-entity item (type word, known name, adjective +
    location, bracketed name, or a bare capitalized word) at token ``t``.

    Args:
        t: the starting token.
        loc_onto: local ontology collection passed through to recursive calls.

    Returns:
        A NamedItemToken, or None when nothing matches.

    NOTE(review): re-indented from a collapsed one-line source; nesting was
    inferred from token order — confirm against the original file.
    """
    if (t is None): 
        return None
    if (isinstance(t, ReferentToken)): 
        # Wrap already-recognized referents of interest.
        r = t.getReferent()
        if ((r.type_name == "PERSON" or r.type_name == "PERSONPROPERTY" or (isinstance(r, GeoReferent))) or r.type_name == "ORGANIZATION"): 
            return NamedItemToken._new1635(t, t, r, t.morph)
        return None
    typ = NamedItemToken.__m_types.tryParse(t, TerminParseAttr.NO)
    nam = NamedItemToken.__m_names.tryParse(t, TerminParseAttr.NO)
    if (typ is not None): 
        if (not ((isinstance(t, TextToken)))): 
            return None
        res = NamedItemToken._new1636(typ.begin_token, typ.end_token, typ.morph, typ.chars)
        res.kind = (Utils.valToEnum(typ.termin.tag, NamedEntityKind))
        res.type_value = typ.termin.canonic_text
        # A known name covering the same span and of the same kind makes the
        # result well-known.
        if ((nam is not None and nam.end_token == typ.end_token and not t.chars.is_all_lower) and (Utils.valToEnum(nam.termin.tag, NamedEntityKind)) == res.kind): 
            res.name_value = nam.termin.canonic_text
            res.is_wellknown = True
        return res
    if (nam is not None): 
        if (nam.begin_token.chars.is_all_lower): 
            return None
        res = NamedItemToken._new1636(nam.begin_token, nam.end_token, nam.morph, nam.chars)
        res.kind = (Utils.valToEnum(nam.termin.tag, NamedEntityKind))
        res.name_value = nam.termin.canonic_text
        # The name only counts as well-known when it is not glued to
        # neighbouring tokens.
        ok = True
        if (not t.is_whitespace_before and t.previous is not None): 
            ok = False
        elif (not t.is_whitespace_after and t.next0_ is not None): 
            if (t.next0_.isCharOf(",.;!?") and t.next0_.is_whitespace_after): 
                pass
            else: 
                ok = False
        if (ok): 
            res.is_wellknown = True
            res.type_value = (Utils.asObjectOrNull(nam.termin.tag2, str))
        return res
    adj = MiscLocationHelper.tryAttachNordWest(t)
    if (adj is not None): 
        # Compass-direction prefix ("северо-западный ...").
        if (adj.morph.class0_.is_noun): 
            if (adj.end_token.isValue("ВОСТОК", None)): 
                if (adj.begin_token == adj.end_token): 
                    return None
                re = NamedItemToken._new1638(t, adj.end_token, adj.morph)
                re.kind = NamedEntityKind.LOCATION
                re.name_value = MiscHelper.getTextValue(t, adj.end_token, GetTextAttr.FIRSTNOUNGROUPTONOMINATIVE)
                re.is_wellknown = True
                return re
            return None
        if (adj.whitespaces_after_count > 2): 
            return None
        if ((isinstance(adj.end_token.next0_, ReferentToken)) and (isinstance(adj.end_token.next0_.getReferent(), GeoReferent))): 
            re = NamedItemToken._new1638(t, adj.end_token.next0_, adj.end_token.next0_.morph)
            re.kind = NamedEntityKind.LOCATION
            re.name_value = MiscHelper.getTextValue(t, adj.end_token.next0_, GetTextAttr.FIRSTNOUNGROUPTONOMINATIVE)
            re.is_wellknown = True
            re.ref = adj.end_token.next0_.getReferent()
            return re
        # Otherwise recurse after the adjective and prepend its normal form.
        res = NamedItemToken.tryParse(adj.end_token.next0_, loc_onto)
        if (res is not None and res.kind == NamedEntityKind.LOCATION): 
            s = adj.getNormalCaseText(MorphClass.ADJECTIVE, True, res.morph.gender, False)
            if (s is not None): 
                if (res.name_value is None): 
                    res.name_value = s.upper()
                else: 
                    res.name_value = "{0} {1}".format(s.upper(), res.name_value)
                res.type_value = (None)
                res.begin_token = t
                res.chars = t.chars
                res.is_wellknown = True
                return res
    if (t.chars.is_capital_upper and not MiscHelper.canBeStartOfSentence(t)): 
        # Capitalized adjective(s) + a recognizable noun: build the name from
        # the adjectives' normal forms.
        npt = NounPhraseHelper.tryParse(t, NounPhraseParseAttr.NO, 0)
        if (npt is not None and len(npt.adjectives) > 0): 
            test = NamedItemToken.tryParse(npt.noun.begin_token, loc_onto)
            if (test is not None and test.end_token == npt.end_token and test.type_value is not None): 
                test.begin_token = t
                tmp = io.StringIO()
                for a in npt.adjectives: 
                    s = a.getNormalCaseText(MorphClass.ADJECTIVE, True, test.morph.gender, False)
                    if (tmp.tell() > 0): 
                        print(' ', end="", file=tmp)
                    print(s, end="", file=tmp)
                test.name_value = Utils.toStringStringIO(tmp)
                test.chars = t.chars
                if (test.kind == NamedEntityKind.LOCATION): 
                    test.is_wellknown = True
                return test
    if ((BracketHelper.isBracket(t, True) and t.next0_ is not None and t.next0_.chars.is_letter) and not t.next0_.chars.is_all_lower): 
        # Quoted/bracketed name.
        br = BracketHelper.tryParse(t, BracketParseAttr.NO, 100)
        if (br is not None): 
            res = NamedItemToken(t, br.end_token)
            res.is_in_bracket = True
            res.name_value = MiscHelper.getTextValue(t, br.end_token, GetTextAttr.NO)
            nam = NamedItemToken.__m_names.tryParse(t.next0_, TerminParseAttr.NO)
            if (nam is not None and nam.end_token == br.end_token.previous): 
                res.kind = (Utils.valToEnum(nam.termin.tag, NamedEntityKind))
                res.is_wellknown = True
                res.name_value = nam.termin.canonic_text
            return res
    if (((isinstance(t, TextToken)) and t.chars.is_letter and not t.chars.is_all_lower) and t.length_char > 2): 
        # Fallback: a single capitalized word (optionally hyphenated pair).
        res = NamedItemToken._new1638(t, t, t.morph)
        str0_ = (t).term
        if (str0_.endswith("О") or str0_.endswith("И") or str0_.endswith("Ы")): 
            res.name_value = str0_
        else: 
            res.name_value = t.getNormalCaseText(None, False, MorphGender.UNDEFINED, False)
        res.chars = t.chars
        if (((not t.is_whitespace_after and t.next0_ is not None and t.next0_.is_hiphen) and (isinstance(t.next0_.next0_, TextToken)) and not t.next0_.next0_.is_whitespace_after) and t.chars.is_cyrillic_letter == t.next0_.next0_.chars.is_cyrillic_letter): 
            res.end_token = t.next0_.next0_
            t = res.end_token
            res.name_value = "{0}-{1}".format(res.name_value, t.getNormalCaseText(None, False, MorphGender.UNDEFINED, False))
        return res
    return None
def __try_parse(t: 'Token', lev: int) -> 'BookLinkToken':
    """Try to parse one element of a bibliographic reference at ``t``:
    author, year, pages, volume, place of publication, URL, "см."/"там же"
    markers, delimiters, bracketed numbers, etc.

    Args:
        t: the starting token.
        lev: recursion depth; parsing gives up beyond 3 to avoid loops.

    Returns:
        A BookLinkToken describing the element, or None.

    NOTE(review): re-indented from a collapsed one-line source; nesting was
    inferred from token order — confirm against the original file.
    """
    if (t is None or lev > 3): 
        return None
    if (t.is_char('[')): 
        # A bracketed element: absorb the surrounding brackets.
        re = BookLinkToken.__try_parse(t.next0_, lev + 1)
        if (re is not None and re.end_token.next0_ is not None and re.end_token.next0_.is_char(']')): 
            re.begin_token = t
            re.end_token = re.end_token.next0_
            return re
        if (re is not None and re.end_token.is_char(']')): 
            re.begin_token = t
            return re
        if (re is not None): 
            if (re.typ == BookLinkTyp.SOSTAVITEL or re.typ == BookLinkTyp.EDITORS): 
                return re
    br = BracketHelper.try_parse(t, BracketParseAttr.NO, 100)
    if (br is not None): 
        if ((isinstance(br.end_token.previous, NumberToken)) and (br.length_char < 30)): 
            return BookLinkToken._new329(t, br.end_token, BookLinkTyp.NUMBER, MiscHelper.get_text_value(br.begin_token.next0_, br.end_token.previous, GetTextAttr.NO))
    t0 = t
    if (isinstance(t, ReferentToken)): 
        # Map already-recognized referents to the corresponding link types.
        if (isinstance(t.get_referent(), PersonReferent)): 
            return BookLinkToken.try_parse_author(t, FioTemplateType.UNDEFINED)
        if (isinstance(t.get_referent(), GeoReferent)): 
            return BookLinkToken._new326(t, t, BookLinkTyp.GEO, t.get_referent())
        if (isinstance(t.get_referent(), DateReferent)): 
            dr = Utils.asObjectOrNull(t.get_referent(), DateReferent)
            if (len(dr.slots) == 1 and dr.year > 0): 
                return BookLinkToken._new329(t, t, BookLinkTyp.YEAR, str(dr.year))
            if (dr.year > 0 and t.previous is not None and t.previous.is_comma): 
                return BookLinkToken._new329(t, t, BookLinkTyp.YEAR, str(dr.year))
        if (isinstance(t.get_referent(), OrganizationReferent)): 
            org0_ = Utils.asObjectOrNull(t.get_referent(), OrganizationReferent)
            if (org0_.kind == OrganizationKind.PRESS): 
                return BookLinkToken._new326(t, t, BookLinkTyp.PRESS, org0_)
        if (isinstance(t.get_referent(), UriReferent)): 
            uri = Utils.asObjectOrNull(t.get_referent(), UriReferent)
            if ((uri.scheme == "http" or uri.scheme == "https" or uri.scheme == "ftp") or uri.scheme is None): 
                return BookLinkToken._new326(t, t, BookLinkTyp.URL, uri)
    tok_ = BookLinkToken.__m_termins.try_parse(t, TerminParseAttr.NO)
    if (tok_ is not None): 
        typ_ = Utils.valToEnum(tok_.termin.tag, BookLinkTyp)
        ok = True
        # Some term kinds are only valid right after ".", ":", "[" or a hyphen.
        if (typ_ == BookLinkTyp.TYPE or typ_ == BookLinkTyp.NAMETAIL or typ_ == BookLinkTyp.ELECTRONRES): 
            if (t.previous is not None and ((t.previous.is_char_of(".:[") or t.previous.is_hiphen))): 
                pass
            else: 
                ok = False
        if (ok): 
            return BookLinkToken._new329(t, tok_.end_token, typ_, tok_.termin.canonic_text)
        if (typ_ == BookLinkTyp.ELECTRONRES): 
            # "Электронный ресурс": look ahead for the actual URI.
            tt = tok_.end_token.next0_
            first_pass3019 = True
            while True:
                if first_pass3019: first_pass3019 = False
                else: tt = tt.next0_
                if (not (tt is not None)): break
                if ((isinstance(tt, TextToken)) and not tt.chars.is_letter): 
                    continue
                if (isinstance(tt.get_referent(), UriReferent)): 
                    return BookLinkToken._new326(t, tt, BookLinkTyp.ELECTRONRES, tt.get_referent())
                break
    if (t.is_char('/')): 
        res = BookLinkToken._new329(t, t, BookLinkTyp.DELIMETER, "/")
        if (t.next0_ is not None and t.next0_.is_char('/')): 
            res.end_token = t.next0_
            res.value = "//"
        if (not t.is_whitespace_before and not t.is_whitespace_after): 
            # A glued slash is only a delimiter when a non-number link element
            # follows within the next few tokens.
            coo = 3
            no = True
            tt = t.next0_
            while tt is not None and coo > 0: 
                vvv = BookLinkToken.try_parse(tt, lev + 1)
                if (vvv is not None and vvv.typ != BookLinkTyp.NUMBER): 
                    no = False
                    break
                tt = tt.next0_
                coo -= 1
            if (no): 
                return None
        return res
    if ((isinstance(t, NumberToken)) and t.int_value is not None and t.typ == NumberSpellingType.DIGIT): 
        res = BookLinkToken._new329(t, t, BookLinkTyp.NUMBER, str(t.value))
        val = t.int_value
        if (val >= 1930 and (val < 2030)): 
            # Plausible publication year.
            res.typ = BookLinkTyp.YEAR
        if (t.next0_ is not None and t.next0_.is_char('.')): 
            res.end_token = t.next0_
        elif ((t.next0_ is not None and t.next0_.length_char == 1 and not t.next0_.chars.is_letter) and t.next0_.is_whitespace_after): 
            res.end_token = t.next0_
        elif (isinstance(t.next0_, TextToken)): 
            term = t.next0_.term
            if (((term == "СТР" or term == "C" or term == "С") or term == "P" or term == "S") or term == "PAGES"): 
                res.end_token = t.next0_
                res.typ = BookLinkTyp.PAGES
                res.value = str(t.value)
        return res
    if (isinstance(t, TextToken)): 
        term = t.term
        # Page/volume markers followed by a number (range).
        if ((((( ((term == "СТР" or term == "C" or term == "С") or term == "ТОМ" or term == "T") or term == "Т" or term == "P") or term == "PP" or term == "V") or term == "VOL" or term == "S") or term == "СТОР" or t.is_value("PAGE", None)) or t.is_value("СТРАНИЦА", "СТОРІНКА")): 
            tt = t.next0_
            while tt is not None: 
                if (tt.is_char_of(".:~")): 
                    tt = tt.next0_
                else: 
                    break
            if (isinstance(tt, NumberToken)): 
                res = BookLinkToken._new328(t, tt, BookLinkTyp.PAGERANGE)
                tt0 = tt
                tt1 = tt
                tt = tt.next0_
                first_pass3020 = True
                while True:
                    if first_pass3020: first_pass3020 = False
                    else: tt = tt.next0_
                    if (not (tt is not None)): break
                    if (tt.is_char_of(",") or tt.is_hiphen): 
                        if (isinstance(tt.next0_, NumberToken)): 
                            tt = tt.next0_
                            res.end_token = tt
                            tt1 = tt
                            continue
                    break
                res.value = MiscHelper.get_text_value(tt0, tt1, GetTextAttr.NO)
                return res
        if ((term == "M" or term == "М" or term == "СПБ") or term == "K" or term == "К"): 
            # Abbreviated place of publication ("М.:", "СПб.:", ...).
            if (t.next0_ is not None and t.next0_.is_char_of(":;")): 
                re = BookLinkToken._new328(t, t.next0_, BookLinkTyp.GEO)
                return re
            if (t.next0_ is not None and t.next0_.is_char_of(".")): 
                res = BookLinkToken._new328(t, t.next0_, BookLinkTyp.GEO)
                if (t.next0_.next0_ is not None and t.next0_.next0_.is_char_of(":;")): 
                    res.end_token = t.next0_.next0_
                elif (t.next0_.next0_ is not None and (isinstance(t.next0_.next0_, NumberToken))): 
                    pass
                elif (t.next0_.next0_ is not None and t.next0_.next0_.is_comma and (isinstance(t.next0_.next0_.next0_, NumberToken))): 
                    pass
                else: 
                    return None
                return res
        if (term == "ПЕР" or term == "ПЕРЕВ" or term == "ПЕРЕВОД"): 
            # "пер. с ..." — translation marker.
            tt = t
            if (tt.next0_ is not None and tt.next0_.is_char('.')): 
                tt = tt.next0_
            if (tt.next0_ is not None and ((tt.next0_.is_value("C", None) or tt.next0_.is_value("С", None)))): 
                tt = tt.next0_
            if (tt.next0_ is None or tt.whitespaces_after_count > 2): 
                return None
            re = BookLinkToken._new328(t, tt.next0_, BookLinkTyp.TRANSLATE)
            return re
        if (term == "ТАМ" or term == "ТАМЖЕ"): 
            # "там же" — ibidem marker.
            res = BookLinkToken._new328(t, t, BookLinkTyp.TAMZE)
            if (t.next0_ is not None and t.next0_.is_value("ЖЕ", None)): 
                res.end_token = t.next0_
            return res
        if (((term == "СМ" or term == "CM" or term == "НАПР") or term == "НАПРИМЕР" or term == "SEE") or term == "ПОДРОБНЕЕ" or term == "ПОДРОБНО"): 
            # "см.", "see also", ... — cross-reference marker.
            res = BookLinkToken._new328(t, t, BookLinkTyp.SEE)
            t = t.next0_
            first_pass3021 = True
            while True:
                if first_pass3021: first_pass3021 = False
                else: t = t.next0_
                if (not (t is not None)): break
                if (t.is_char_of(".:") or t.is_value("ALSO", None)): 
                    res.end_token = t
                    continue
                if (t.is_value("В", None) or t.is_value("IN", None)): 
                    res.end_token = t
                    continue
                vvv = BookLinkToken.__try_parse(t, lev + 1)
                if (vvv is not None and vvv.typ == BookLinkTyp.SEE): 
                    res.end_token = vvv.end_token
                    break
                break
            return res
        if (term == "БОЛЕЕ"): 
            vvv = BookLinkToken.__try_parse(t.next0_, lev + 1)
            if (vvv is not None and vvv.typ == BookLinkTyp.SEE): 
                vvv.begin_token = t
                return vvv
        no = MiscHelper.check_number_prefix(t)
        if (isinstance(no, NumberToken)): 
            return BookLinkToken._new328(t, no, BookLinkTyp.N)
        if (((term == "B" or term == "В")) and (isinstance(t.next0_, NumberToken)) and (isinstance(t.next0_.next0_, TextToken))): 
            # "в N т." — volume count marker.
            term2 = t.next0_.next0_.term
            if (((term2 == "Т" or term2 == "T" or term2.startswith("ТОМ")) or term2 == "TT" or term2 == "ТТ") or term2 == "КН" or term2.startswith("КНИГ")): 
                return BookLinkToken._new328(t, t.next0_.next0_, BookLinkTyp.VOLUME)
    if (t.is_char('(')): 
        # "(1999)" — parenthesized year, literal number or date referent.
        if (((isinstance(t.next0_, NumberToken)) and t.next0_.int_value is not None and t.next0_.next0_ is not None) and t.next0_.next0_.is_char(')')): 
            num = t.next0_.int_value
            if (num > 1900 and num <= 2040): 
                if (num <= datetime.datetime.now().year): 
                    return BookLinkToken._new329(t, t.next0_.next0_, BookLinkTyp.YEAR, str(num))
        if (((isinstance(t.next0_, ReferentToken)) and (isinstance(t.next0_.get_referent(), DateReferent)) and t.next0_.next0_ is not None) and t.next0_.next0_.is_char(')')): 
            num = t.next0_.get_referent().year
            if (num > 0): 
                return BookLinkToken._new329(t, t.next0_.next0_, BookLinkTyp.YEAR, str(num))
    return None
def tryAttachTerritory( li: typing.List['TerrItemToken'], ad: 'AnalyzerData', attach_always: bool = False, cits: typing.List['CityItemToken'] = None, exists: typing.List['GeoReferent'] = None) -> 'ReferentToken':
    """Try to build a territory (GeoReferent) ReferentToken from a parsed item sequence.

    Scans ``li`` (territory item tokens) classifying items into an ontology hit
    (``ex_obj``), a new proper name (``new_name``), adjective modifiers
    (``adj_list``) and a type noun (``noun``), then validates the combination
    and assembles a GeoReferent with names/types.

    :param li: parsed territory item tokens (returns None if empty).
    :param ad: analyzer data context (passed to helper attachers).
    :param attach_always: when True, relaxes several plausibility checks.
    :param cits: optional preceding city-item tokens; used only to set the
        ``can_be_city_before`` hint.
    :param exists: optional already-known GeoReferents used to confirm a
        doubtful adjective name.
    :return: a ReferentToken wrapping the new/merged GeoReferent, or None.

    NOTE(review): reconstructed from collapsed formatting; nesting of a few
    deeply nested branches is inferred from the token stream — verify against
    the generator's original output.
    """
    if (li is None or len(li) == 0):
        return None
    ex_obj = None
    new_name = None
    adj_list = list()
    noun = None
    add_noun = None
    # Special-case attachers tried first: Moscow administrative okrugs and
    # pure "территория ..." constructs short-circuit the generic logic.
    rt = TerrAttachHelper.__tryAttachMoscowAO(li, ad)
    if (rt is not None):
        return rt
    if (li[0].termin_item is not None and li[0].termin_item.canonic_text == "ТЕРРИТОРИЯ"):
        res2 = TerrAttachHelper.__tryAttachPureTerr(li, ad)
        return res2
    # Railway (РЖД) direction pairs: noun + direction in either order.
    if (len(li) == 2):
        if (li[0].rzd is not None and li[1].rzd_dir is not None):
            rzd = GeoReferent()
            rzd._addName(li[1].rzd_dir)
            rzd._addTypTer(li[0].kit.base_language)
            rzd.addSlot(GeoReferent.ATTR_REF, li[0].rzd.referent, False, 0)
            rzd.addExtReferent(li[0].rzd)
            return ReferentToken(rzd, li[0].begin_token, li[1].end_token)
        if (li[1].rzd is not None and li[0].rzd_dir is not None):
            rzd = GeoReferent()
            rzd._addName(li[0].rzd_dir)
            rzd._addTypTer(li[0].kit.base_language)
            rzd.addSlot(GeoReferent.ATTR_REF, li[1].rzd.referent, False, 0)
            rzd.addExtReferent(li[1].rzd)
            return ReferentToken(rzd, li[0].begin_token, li[1].end_token)
    can_be_city_before = False
    adj_terr_before = False
    if (cits is not None):
        if (cits[0].typ == CityItemToken.ItemType.CITY):
            can_be_city_before = True
        elif (cits[0].typ == CityItemToken.ItemType.NOUN and len(cits) > 1):
            can_be_city_before = True
    # Classification pass: walk items, filling ex_obj / new_name / noun / adj_list.
    k = 0
    while k < len(li):
        if (li[k].onto_item is not None):
            # Ontology hit — only one allowed, and not after a new name.
            if (ex_obj is not None or new_name is not None):
                break
            if (noun is not None):
                if (k == 1):
                    if (noun.termin_item.canonic_text == "РАЙОН" or noun.termin_item.canonic_text == "ОБЛАСТЬ" or noun.termin_item.canonic_text == "СОЮЗ"):
                        if (isinstance(li[k].onto_item.referent, GeoReferent)):
                            if ((li[k].onto_item.referent).is_state):
                                break
                        # Accept noun+onto pair only with supporting context:
                        # terminator, preceding geo object, or a following street.
                        ok = False
                        tt = li[k].end_token.next0_
                        if (tt is None):
                            ok = True
                        elif (tt.isCharOf(",.")):
                            ok = True
                        if (not ok):
                            ok = MiscLocationHelper.checkGeoObjectBefore(li[0].begin_token)
                        if (not ok):
                            adr = AddressItemToken.tryParse(tt, None, False, False, None)
                            if (adr is not None):
                                if (adr.typ == AddressItemToken.ItemType.STREET):
                                    ok = True
                        if (not ok):
                            break
                if (li[k].onto_item is not None):
                    # "МО"/"ЛО" abbreviations before an ontology object are too ambiguous.
                    if (noun.begin_token.isValue("МО", None) or noun.begin_token.isValue("ЛО", None)):
                        return None
            ex_obj = li[k]
        elif (li[k].termin_item is not None):
            # Type term (noun or adjective like "областной", "район", ...).
            if (noun is not None):
                break
            if (li[k].termin_item.is_always_prefix and k > 0):
                break
            if (k > 0 and li[k].is_doubt):
                if (li[k].begin_token == li[k].end_token and li[k].begin_token.isValue("ЗАО", None)):
                    break
            if (li[k].termin_item.is_adjective or li[k].is_geo_in_dictionary):
                adj_list.append(li[k])
            else:
                if (ex_obj is not None):
                    geo_ = Utils.asObjectOrNull(ex_obj.onto_item.referent, GeoReferent)
                    if (geo_ is None):
                        break
                    if (ex_obj.is_adjective and ((li[k].termin_item.canonic_text == "СОЮЗ" or li[k].termin_item.canonic_text == "ФЕДЕРАЦИЯ"))):
                        str0_ = str(ex_obj.onto_item)
                        if (not li[k].termin_item.canonic_text in str0_):
                            return None
                    if (li[k].termin_item.canonic_text == "РАЙОН" or li[k].termin_item.canonic_text == "ОКРУГ" or li[k].termin_item.canonic_text == "КРАЙ"):
                        # Compare the noun against the onto object's declared types;
                        # on mismatch, reinterpret item 0 as an adjective new name.
                        tmp = io.StringIO()
                        for s in geo_.slots:
                            if (s.type_name == GeoReferent.ATTR_TYPE):
                                print("{0};".format(s.value), end="", file=tmp, flush=True)
                        if (not li[k].termin_item.canonic_text in Utils.toStringStringIO(tmp).upper()):
                            if (k != 1 or new_name is not None):
                                break
                            new_name = li[0]
                            new_name.is_adjective = True
                            new_name.onto_item = (None)
                            ex_obj = (None)
                noun = li[k]
                if (k == 0):
                    tt = TerrItemToken.tryParse(li[k].begin_token.previous, None, True, False)
                    if (tt is not None and tt.morph.class0_.is_adjective):
                        adj_terr_before = True
        else:
            # Plain (unknown) name candidate.
            if (ex_obj is not None):
                break
            if (new_name is not None):
                break
            new_name = li[k]
        k += 1
    name = None
    alt_name = None
    full_name = None
    morph_ = None
    if (ex_obj is not None):
        # --- Case 1: known ontology object ---
        if (ex_obj.is_adjective and not ex_obj.morph.language.is_en and noun is None):
            if (attach_always and ex_obj.end_token.next0_ is not None):
                npt = NounPhraseHelper.tryParse(ex_obj.begin_token, NounPhraseParseAttr.NO, 0)
                if (ex_obj.end_token.next0_.is_comma_and):
                    pass
                elif (npt is None):
                    pass
                else:
                    str0_ = StreetItemToken.tryParse(ex_obj.end_token.next0_, None, False, None, False)
                    if (str0_ is not None):
                        if (str0_.typ == StreetItemType.NOUN and str0_.end_token == npt.end_token):
                            return None
            else:
                # A bare adjective must be part of a city phrase to qualify.
                cit = CityItemToken.tryParse(ex_obj.end_token.next0_, None, False, None)
                if (cit is not None and ((cit.typ == CityItemToken.ItemType.NOUN or cit.typ == CityItemToken.ItemType.CITY))):
                    npt = NounPhraseHelper.tryParse(ex_obj.begin_token, NounPhraseParseAttr.NO, 0)
                    if (npt is not None and npt.end_token == cit.end_token):
                        pass
                    else:
                        return None
                elif (ex_obj.begin_token.isValue("ПОДНЕБЕСНЫЙ", None)):
                    pass
                else:
                    return None
        if (noun is None and ex_obj.can_be_city):
            cit0 = CityItemToken.tryParseBack(ex_obj.begin_token.previous)
            if (cit0 is not None and cit0.typ != CityItemToken.ItemType.PROPERNAME):
                return None
        if (ex_obj.is_doubt and noun is None):
            # Doubtful object without a type noun: demand extra contextual evidence.
            ok2 = False
            if (TerrAttachHelper.__canBeGeoAfter(ex_obj.end_token.next0_)):
                ok2 = True
            elif (not ex_obj.can_be_surname and not ex_obj.can_be_city):
                if ((ex_obj.end_token.next0_ is not None and ex_obj.end_token.next0_.isChar(')') and ex_obj.begin_token.previous is not None) and ex_obj.begin_token.previous.isChar('(')):
                    ok2 = True
            elif (ex_obj.chars.is_latin_letter and ex_obj.begin_token.previous is not None):
                if (ex_obj.begin_token.previous.isValue("IN", None)):
                    ok2 = True
                elif (ex_obj.begin_token.previous.isValue("THE", None) and ex_obj.begin_token.previous.previous is not None and ex_obj.begin_token.previous.previous.isValue("IN", None)):
                    ok2 = True
            if (not ok2):
                cit0 = CityItemToken.tryParseBack(ex_obj.begin_token.previous)
                if (cit0 is not None and cit0.typ != CityItemToken.ItemType.PROPERNAME):
                    pass
                elif (MiscLocationHelper.checkGeoObjectBefore(ex_obj.begin_token.previous)):
                    pass
                else:
                    return None
        name = ex_obj.onto_item.canonic_text
        morph_ = ex_obj.morph
    elif (new_name is not None):
        # --- Case 2: new (unknown) name; requires a type noun ---
        if (noun is None):
            return None
        j = 1
        while j < k:
            if (li[j].is_newline_before and not li[0].is_newline_before):
                return None
            j += 1
        morph_ = noun.morph
        if (new_name.is_adjective):
            if (noun.termin_item.acronym == "АО"):
                if (noun.begin_token != noun.end_token):
                    return None
                if (new_name.morph.gender != MorphGender.FEMINIE):
                    return None
            geo_before = None
            tt0 = li[0].begin_token.previous
            if (tt0 is not None and tt0.is_comma_and):
                tt0 = tt0.previous
            if (not li[0].is_newline_before and tt0 is not None):
                geo_before = (Utils.asObjectOrNull(tt0.getReferent(), GeoReferent))
            if (Utils.indexOfList(li, noun, 0) < Utils.indexOfList(li, new_name, 0)):
                # noun precedes the name ("район Южный")
                if (noun.termin_item.is_state):
                    return None
                if (new_name.can_be_surname and geo_before is None):
                    if (((noun.morph.case_) & new_name.morph.case_).is_undefined):
                        return None
                if (MiscHelper.isExistsInDictionary(new_name.begin_token, new_name.end_token, (MorphClass.ADJECTIVE) | MorphClass.PRONOUN | MorphClass.VERB)):
                    if (noun.begin_token != new_name.begin_token):
                        if (geo_before is None):
                            if (len(li) == 2 and TerrAttachHelper.__canBeGeoAfter(li[1].end_token.next0_)):
                                pass
                            elif (len(li) == 3 and li[2].termin_item is not None and TerrAttachHelper.__canBeGeoAfter(li[2].end_token.next0_)):
                                pass
                            elif (new_name.is_geo_in_dictionary):
                                pass
                            elif (new_name.end_token.is_newline_after):
                                pass
                            else:
                                return None
                npt = NounPhraseHelper.tryParse(new_name.end_token, NounPhraseParseAttr.PARSEPRONOUNS, 0)
                if (npt is not None and npt.end_token != new_name.end_token):
                    if (len(li) >= 3 and li[2].termin_item is not None and npt.end_token == li[2].end_token):
                        add_noun = li[2]
                    else:
                        return None
                # A person interpretation wins over a territory.
                rtp = new_name.kit.processReferent("PERSON", new_name.begin_token)
                if (rtp is not None):
                    return None
                name = ProperNameHelper.getNameEx(new_name.begin_token, new_name.end_token, MorphClass.ADJECTIVE, MorphCase.UNDEFINED, noun.termin_item.gender, False, False)
            else:
                # name precedes the noun ("Южный район") — gather plausibility votes.
                ok = False
                if (((k + 1) < len(li)) and li[k].termin_item is None and li[k + 1].termin_item is not None):
                    ok = True
                elif ((k < len(li)) and li[k].onto_item is not None):
                    ok = True
                elif (k == len(li) and not new_name.is_adj_in_dictionary):
                    ok = True
                elif (MiscLocationHelper.checkGeoObjectBefore(li[0].begin_token) or can_be_city_before):
                    ok = True
                elif (MiscLocationHelper.checkGeoObjectAfter(li[k - 1].end_token)):
                    ok = True
                elif (len(li) == 3 and k == 2):
                    cit = CityItemToken.tryParse(li[2].begin_token, None, False, None)
                    if (cit is not None):
                        if (cit.typ == CityItemToken.ItemType.CITY or cit.typ == CityItemToken.ItemType.NOUN):
                            ok = True
                elif (len(li) == 2):
                    ok = TerrAttachHelper.__canBeGeoAfter(li[len(li) - 1].end_token.next0_)
                if (not ok and not li[0].is_newline_before and not li[0].chars.is_all_lower):
                    rt00 = li[0].kit.processReferent("PERSONPROPERTY", li[0].begin_token.previous)
                    if (rt00 is not None):
                        ok = True
                if (noun.termin_item is not None and noun.termin_item.is_strong and new_name.is_adjective):
                    ok = True
                if (noun.is_doubt and len(adj_list) == 0 and geo_before is None):
                    return None
                name = ProperNameHelper.getNameEx(new_name.begin_token, new_name.end_token, MorphClass.ADJECTIVE, MorphCase.UNDEFINED, noun.termin_item.gender, False, False)
                if (not ok and not attach_always):
                    # Dictionary word as a name: accept only if a known geo already has it.
                    if (MiscHelper.isExistsInDictionary(new_name.begin_token, new_name.end_token, (MorphClass.ADJECTIVE) | MorphClass.PRONOUN | MorphClass.VERB)):
                        if (exists is not None):
                            for e0_ in exists:
                                if (e0_.findSlot(GeoReferent.ATTR_NAME, name, True) is not None):
                                    ok = True
                                    break
                        if (not ok):
                            return None
                full_name = "{0} {1}".format(ProperNameHelper.getNameEx(li[0].begin_token, noun.begin_token.previous, MorphClass.ADJECTIVE, MorphCase.UNDEFINED, noun.termin_item.gender, False, False), noun.termin_item.canonic_text)
        else:
            # Non-adjective new name.
            if (not attach_always or ((noun.termin_item is not None and noun.termin_item.canonic_text == "ФЕДЕРАЦИЯ"))):
                is_latin = noun.chars.is_latin_letter and new_name.chars.is_latin_letter
                if (Utils.indexOfList(li, noun, 0) > Utils.indexOfList(li, new_name, 0)):
                    if (not is_latin):
                        return None
                if (not new_name.is_district_name and not BracketHelper.canBeStartOfSequence(new_name.begin_token, False, False)):
                    if (len(adj_list) == 0 and MiscHelper.isExistsInDictionary(new_name.begin_token, new_name.end_token, (MorphClass.NOUN) | MorphClass.PRONOUN)):
                        if (len(li) == 2 and noun.is_city_region and (noun.whitespaces_after_count < 2)):
                            pass
                        else:
                            return None
                    if (not is_latin):
                        if ((noun.termin_item.is_region and not attach_always and ((not adj_terr_before or new_name.is_doubt))) and not noun.is_city_region and not noun.termin_item.is_specific_prefix):
                            if (not MiscLocationHelper.checkGeoObjectBefore(noun.begin_token)):
                                if (not noun.is_doubt and noun.begin_token != noun.end_token):
                                    pass
                                else:
                                    return None
                        if (noun.is_doubt and len(adj_list) == 0):
                            if (((noun.termin_item.acronym == "МО" or noun.termin_item.acronym == "ЛО")) and k == (len(li) - 1) and li[k].termin_item is not None):
                                add_noun = li[k]
                                k += 1
                            else:
                                return None
                        pers = new_name.kit.processReferent("PERSON", new_name.begin_token)
                        if (pers is not None):
                            return None
            name = MiscHelper.getTextValue(new_name.begin_token, new_name.end_token, GetTextAttr.NO)
            if (new_name.begin_token != new_name.end_token):
                # Trim a trailing repetition of the type word from the name.
                ttt = new_name.begin_token.next0_
                while ttt is not None and ttt.end_char <= new_name.end_char:
                    if (ttt.chars.is_letter):
                        ty = TerrItemToken.tryParse(ttt, None, False, False)
                        if ((ty is not None and ty.termin_item is not None and noun is not None) and ((noun.termin_item.canonic_text in ty.termin_item.canonic_text or ty.termin_item.canonic_text in noun.termin_item.canonic_text))):
                            name = MiscHelper.getTextValue(new_name.begin_token, ttt.previous, GetTextAttr.NO)
                            break
                    ttt = ttt.next0_
        if (len(adj_list) > 0):
            npt = NounPhraseHelper.tryParse(adj_list[0].begin_token, NounPhraseParseAttr.NO, 0)
            if (npt is not None and npt.end_token == noun.end_token):
                alt_name = "{0} {1}".format(npt.getNormalCaseText(None, False, MorphGender.UNDEFINED, False), name)
    else:
        # --- Case 3: only a type noun — try binding it to an adjacent geo object ---
        if ((len(li) == 1 and noun is not None and noun.end_token.next0_ is not None) and (isinstance(noun.end_token.next0_.getReferent(), GeoReferent))):
            g = Utils.asObjectOrNull(noun.end_token.next0_.getReferent(), GeoReferent)
            if (noun.termin_item is not None):
                tyy = noun.termin_item.canonic_text.lower()
                ooo = False
                if (g.findSlot(GeoReferent.ATTR_TYPE, tyy, True) is not None):
                    ooo = True
                elif (tyy.endswith("район") and g.findSlot(GeoReferent.ATTR_TYPE, "район", True) is not None):
                    ooo = True
                if (ooo):
                    return ReferentToken._new719(g, noun.begin_token, noun.end_token.next0_, noun.begin_token.morph)
        if ((len(li) == 1 and noun == li[0] and li[0].termin_item is not None) and TerrItemToken.tryParse(li[0].end_token.next0_, None, True, False) is None and TerrItemToken.tryParse(li[0].begin_token.previous, None, True, False) is None):
            if (li[0].morph.number == MorphNumber.PLURAL):
                return None
            # Look back up to ~500 weighted tokens for a geo of the same type
            # (newlines count as 10), provided nothing territory-like follows.
            cou = 0
            str0_ = li[0].termin_item.canonic_text.lower()
            tt = li[0].begin_token.previous
            first_pass2898 = True
            while True:
                if first_pass2898: first_pass2898 = False
                else: tt = tt.previous
                if (not (tt is not None)): break
                if (tt.is_newline_after):
                    cou += 10
                else:
                    cou += 1
                if (cou > 500):
                    break
                g = Utils.asObjectOrNull(tt.getReferent(), GeoReferent)
                if (g is None):
                    continue
                ok = True
                cou = 0
                tt = li[0].end_token.next0_
                first_pass2899 = True
                while True:
                    if first_pass2899: first_pass2899 = False
                    else: tt = tt.next0_
                    if (not (tt is not None)): break
                    if (tt.is_newline_before):
                        cou += 10
                    else:
                        cou += 1
                    if (cou > 500):
                        break
                    tee = TerrItemToken.tryParse(tt, None, True, False)
                    if (tee is None):
                        continue
                    ok = False
                    break
                if (ok):
                    ii = 0
                    while g is not None and (ii < 3):
                        if (g.findSlot(GeoReferent.ATTR_TYPE, str0_, True) is not None):
                            return ReferentToken._new719(g, li[0].begin_token, li[0].end_token, noun.begin_token.morph)
                        g = g.higher
                        ii += 1
                break
        return None
    # --- Assembly: build or reuse the GeoReferent and fill names/types ---
    ter = None
    if (ex_obj is not None and (isinstance(ex_obj.tag, GeoReferent))):
        ter = (Utils.asObjectOrNull(ex_obj.tag, GeoReferent))
    else:
        ter = GeoReferent()
    if (ex_obj is not None):
        geo_ = Utils.asObjectOrNull(ex_obj.onto_item.referent, GeoReferent)
        if (geo_ is not None and not geo_.is_city):
            ter._mergeSlots2(geo_, li[0].kit.base_language)
        else:
            ter._addName(name)
        if (noun is None and ex_obj.can_be_city):
            ter._addTypCity(li[0].kit.base_language)
        else:
            pass
    elif (new_name is not None):
        ter._addName(name)
        if (alt_name is not None):
            ter._addName(alt_name)
    if (noun is not None):
        if (noun.termin_item.canonic_text == "АО"):
            ter._addTyp(("АВТОНОМНИЙ ОКРУГ" if li[0].kit.base_language.is_ua else "АВТОНОМНЫЙ ОКРУГ"))
        elif (noun.termin_item.canonic_text == "МУНИЦИПАЛЬНОЕ СОБРАНИЕ" or noun.termin_item.canonic_text == "МУНІЦИПАЛЬНЕ ЗБОРИ"):
            ter._addTyp(("МУНІЦИПАЛЬНЕ УТВОРЕННЯ" if li[0].kit.base_language.is_ua else "МУНИЦИПАЛЬНОЕ ОБРАЗОВАНИЕ"))
        elif (noun.termin_item.acronym == "МО" and add_noun is not None):
            ter._addTyp(add_noun.termin_item.canonic_text)
        else:
            if (noun.termin_item.canonic_text == "СОЮЗ" and ex_obj is not None and ex_obj.end_char > noun.end_char):
                return ReferentToken._new719(ter, ex_obj.begin_token, ex_obj.end_token, ex_obj.morph)
            ter._addTyp(noun.termin_item.canonic_text)
            if (noun.termin_item.is_region and ter.is_state):
                ter._addTypReg(li[0].kit.base_language)
    if (ter.is_state and ter.is_region):
        for a in adj_list:
            if (a.termin_item.is_region):
                ter._addTypReg(li[0].kit.base_language)
                break
    if (ter.is_state):
        if (full_name is not None):
            ter._addName(full_name)
    res = ReferentToken(ter, li[0].begin_token, li[k - 1].end_token)
    if (noun is not None and noun.morph.class0_.is_noun):
        res.morph = noun.morph
    else:
        # Aggregate morphology from the items; adjectives are re-labelled as nouns.
        res.morph = MorphCollection()
        ii = 0
        while ii < k:
            for v in li[ii].morph.items:
                bi = MorphBaseInfo(v)
                if (noun is not None):
                    if (bi.class0_.is_adjective):
                        bi.class0_ = MorphClass.NOUN
                res.morph.addItem(bi)
            ii += 1
    if (li[0].termin_item is not None and li[0].termin_item.is_specific_prefix):
        res.begin_token = li[0].end_token.next0_
    if (add_noun is not None and add_noun.end_char > res.end_char):
        res.end_token = add_noun.end_token
    # Absorb a preceding "АР" token for republics (e.g. "АР Крым").
    if ((isinstance(res.begin_token.previous, TextToken)) and (res.whitespaces_before_count < 2)):
        tt = Utils.asObjectOrNull(res.begin_token.previous, TextToken)
        if (tt.term == "АР"):
            for ty in ter.typs:
                if ("республика" in ty or "республіка" in ty):
                    res.begin_token = tt
                    break
    return res
def _canBeHigherToken(rhi: 'Token', rlo: 'Token') -> bool:
    """Decide whether the geo referent at token ``rhi`` may own (be "higher" than)
    the geo referent at token ``rlo``, using both the referents' types and the
    tokens' surface context (word order, commas, cases, prepositions).

    Both tokens must carry a GeoReferent; otherwise False.

    NOTE(review): reconstructed from collapsed formatting; the attachment of the
    final ``else`` in the "район;" branch is inferred — confirm against the
    generator's original output.
    """
    if (rhi is None or rlo is None):
        return False
    # Instrumental (but not genitive) case on the "higher" token rules it out.
    if (rhi.morph.case_.is_instrumental and not rhi.morph.case_.is_genitive):
        return False
    hi = Utils.asObjectOrNull(rhi.getReferent(), GeoReferent)
    lo = Utils.asObjectOrNull(rlo.getReferent(), GeoReferent)
    if (hi is None or lo is None):
        return False
    # citi_in_reg: special "city contains a district/okrug" relation that may be
    # allowed even when the generic type check below fails.
    citi_in_reg = False
    if (hi.is_city and lo.is_region):
        if (hi.findSlot(GeoReferent.ATTR_TYPE, "город", True) is not None or hi.findSlot(GeoReferent.ATTR_TYPE, "місто", True) is not None or hi.findSlot(GeoReferent.ATTR_TYPE, "city", True) is not None):
            s = GeoOwnerHelper.__getTypesString(lo)
            if ((("район" in s or "административный округ" in s or "муниципальный округ" in s) or "адміністративний округ" in s or "муніципальний округ" in s) or lo.findSlot(GeoReferent.ATTR_TYPE, "округ", True) is not None):
                if (rhi.next0_ == rlo and rlo.morph.case_.is_genitive):
                    citi_in_reg = True
    if (hi.is_region and lo.is_city):
        if (lo.findSlot(GeoReferent.ATTR_TYPE, "город", True) is not None or lo.findSlot(GeoReferent.ATTR_TYPE, "місто", True) is not None or lo.findSlot(GeoReferent.ATTR_TYPE, "city", True) is not None):
            s = GeoOwnerHelper.__getTypesString(hi)
            if (s == "район;"):
                # A bare "район" can own a city only in narrow token configurations.
                if (hi.higher is not None and hi.higher.is_region):
                    citi_in_reg = True
                elif (rhi.end_char <= rlo.begin_char and rhi.next0_.is_comma and not rlo.morph.case_.is_genitive):
                    citi_in_reg = True
                elif (rhi.end_char <= rlo.begin_char and rhi.next0_.is_comma):
                    citi_in_reg = True
            else:
                citi_in_reg = True
    if (rhi.end_char <= rlo.begin_char):
        # Higher token appears before the lower one in the text.
        if (not rhi.morph.class0_.is_adjective):
            if (hi.is_state and not rhi.chars.is_latin_letter):
                return False
        if (rhi.is_newline_after or rlo.is_newline_before):
            if (not citi_in_reg):
                return False
            else:
                pass
    if (rlo.previous is not None and rlo.previous.morph.class0_.is_preposition):
        # Preposition before the lower token constrains its admissible case
        # (Ukrainian "У"/"З" vs Russian "В"/"ИЗ").
        if (rlo.previous.morph.language.is_ua):
            if ((rlo.previous.isValue("У", None) and not rlo.morph.case_.is_dative and not rlo.morph.case_.is_prepositional) and not rlo.morph.case_.is_undefined):
                return False
            if (rlo.previous.isValue("З", None) and not rlo.morph.case_.is_genitive and not rlo.morph.case_.is_undefined):
                return False
        else:
            if ((rlo.previous.isValue("В", None) and not rlo.morph.case_.is_dative and not rlo.morph.case_.is_prepositional) and not rlo.morph.case_.is_undefined):
                return False
            if (rlo.previous.isValue("ИЗ", None) and not rlo.morph.case_.is_genitive and not rlo.morph.case_.is_undefined):
                return False
    # Fall back to the referent-level type compatibility check.
    if (not GeoOwnerHelper.canBeHigher(hi, lo)):
        return citi_in_reg
    return True
def toString(self, short_variant: bool, lang: 'MorphLang', lev: int = 0) -> str:
    """Render a human-readable description of this instrument block.

    Appends, in order: the localized kind (and kind2 in parentheses), the
    number with sub-numbers / a min-number range, expiration or reference
    markers, and a quoted name/value (truncated to 100 chars) or the nested
    referent's own string (recursion bounded by ``lev < 30``).

    :param short_variant: forwarded to the nested referent's toString.
    :param lang: output language for kind labels.
    :param lev: recursion depth guard.
    """
    res = io.StringIO()
    ki = self.kind
    str0_ = (Utils.asObjectOrNull(MetaInstrumentBlock.GLOBAL_META.kind_feature.convertInnerValueToOuterValue(Utils.enumToString(ki), lang), str))
    if (str0_ is not None):
        print(str0_, end="", file=res)
    if (self.kind2 != InstrumentKind.UNDEFINED):
        str0_ = (Utils.asObjectOrNull(MetaInstrumentBlock.GLOBAL_META.kind_feature.convertInnerValueToOuterValue(Utils.enumToString(self.kind2), lang), str))
        if (str0_ is not None):
            print(" ({0})".format(str0_), end="", file=res, flush=True)
    if (self.number > 0):
        if (ki == InstrumentKind.TABLE):
            # Tables show rows x columns instead of a plain number.
            print(" {0} строк, {1} столбцов".format(len(self.children), self.number), end="", file=res, flush=True)
        else:
            print(" №{0}".format(self.number), end="", file=res, flush=True)
        if (self.sub_number > 0):
            print(".{0}".format(self.sub_number), end="", file=res, flush=True)
        if (self.sub_number2 > 0):
            print(".{0}".format(self.sub_number2), end="", file=res, flush=True)
        if (self.sub_number3 > 0):
            print(".{0}".format(self.sub_number3), end="", file=res, flush=True)
        if (self.min_number > 0):
            # Insert "min-" right after the last space or dot, producing a
            # range like "№3-5" / "1.2-4".
            for i in range(res.tell() - 1, -1, -1):
                if (Utils.getCharAtStringIO(res, i) == ' ' or Utils.getCharAtStringIO(res, i) == '.'):
                    Utils.insertStringIO(res, i + 1, "{0}-".format(self.min_number))
                    break
    ignore_ref = False
    if (self.is_expired):
        print(" (утратить силу)", end="", file=res)
        ignore_ref = True
    elif (ki != InstrumentKind.EDITIONS and ki != InstrumentKind.APPROVED and (isinstance(self.ref, DecreeReferent))):
        print(" (*)", end="", file=res)
        ignore_ref = True
    str0_ = self.getStringValue(InstrumentBlockReferent.ATTR_NAME)
    if ((str0_) is None):
        str0_ = self.getStringValue(InstrumentBlockReferent.ATTR_VALUE)
    if (str0_ is not None):
        if (len(str0_) > 100):
            str0_ = (str0_[0:0 + 100] + "...")
        print(" \"{0}\"".format(str0_), end="", file=res, flush=True)
    elif (not ignore_ref and (isinstance(self.ref, Referent)) and (lev < 30)):
        print(" \"{0}\"".format(self.ref.toString(short_variant, lang, lev + 1)), end="", file=res, flush=True)
    return Utils.toStringStringIO(res).strip()
def __try1(li: typing.List['CityItemToken'], oi: 'IntOntologyItem', ad: 'AnalyzerDataWithOntology') -> 'ReferentToken':
    """Try to attach a city referent from a CITY item (optionally followed by a
    type NOUN), or a PROPERNAME+NOUN pair.

    Sets ``oi.value`` to the matched ontology item (output parameter) and
    returns a ReferentToken over the consumed span, or None on rejection.
    Numerous plausibility checks reject person names, street contexts, etc.

    NOTE(review): reconstructed from collapsed formatting; a few nesting
    levels in the plausibility checks are inferred from the token stream.
    """
    oi.value = (None)
    if (li is None or (len(li) < 1)):
        return None
    elif (li[0].typ != CityItemToken.ItemType.CITY):
        # Only PROPERNAME + NOUN is acceptable when item 0 is not a CITY.
        if (len(li) != 2 or li[0].typ != CityItemToken.ItemType.PROPERNAME or li[1].typ != CityItemToken.ItemType.NOUN):
            return None
    i = 1
    oi.value = li[0].onto_item
    ok = not li[0].doubtful
    if ((ok and li[0].onto_item is not None and li[0].onto_item.misc_attr is None) and ad is not None):
        # Global-ontology hits need a supporting "В" before the city.
        if (li[0].onto_item.owner != ad.local_ontology and not li[0].onto_item.owner.is_ext_ontology):
            if (li[0].begin_token.previous is not None and li[0].begin_token.previous.isValue("В", None)):
                pass
            else:
                ok = False
    if (len(li) == 1 and li[0].begin_token.morph.class0_.is_adjective):
        # An adjective starting a street phrase is not a city.
        sits = StreetItemToken.tryParseList(li[0].begin_token, None, 3)
        if (sits is not None and len(sits) == 2 and sits[1].typ == StreetItemType.NOUN):
            return None
    typ = None
    alttyp = None
    mc = li[0].morph
    if (i < len(li)):
        if (li[i].typ == CityItemToken.ItemType.NOUN):
            at = None
            if (not li[i].chars.is_all_lower and (li[i].whitespaces_after_count < 2)):
                # Guard: the noun may actually start an address/street phrase.
                sit = StreetItemToken.tryParse(li[i].end_token.next0_, None, False, None, False)
                if (sit is not None and sit.typ == StreetItemType.NOUN):
                    at = AddressItemToken.tryParse(li[i].begin_token, None, False, False, None)
                    if (at is not None):
                        at2 = AddressItemToken.tryParse(li[i].end_token.next0_, None, False, False, None)
                        if (at2 is not None and at2.typ == AddressItemToken.ItemType.STREET):
                            at = (None)
            if (at is None):
                typ = li[i].value
                alttyp = li[i].alt_value
                if (li[i].begin_token.isValue("СТ", None) and li[i].begin_token.chars.is_all_upper):
                    return None
                if ((i + 1) == len(li)):
                    ok = True
                    if (not li[i].morph.case_.is_undefined):
                        mc = li[i].morph
                    i += 1
                elif (ok):
                    i += 1
                else:
                    # A mayor/head title just before the name confirms the city.
                    tt0 = li[0].begin_token.previous
                    if ((isinstance(tt0, TextToken)) and (tt0.whitespaces_after_count < 3)):
                        if (tt0.isValue("МЭР", "МЕР") or tt0.isValue("ГЛАВА", None) or tt0.isValue("ГРАДОНАЧАЛЬНИК", None)):
                            ok = True
                            i += 1
    if (not ok and oi.value is not None and (len(oi.value.canonic_text) < 4)):
        return None
    if (not ok and li[0].begin_token.morph.class0_.is_proper_name):
        return None
    if (not ok):
        if (not MiscHelper.isExistsInDictionary(li[0].begin_token, li[0].end_token, (MorphClass.ADJECTIVE) | MorphClass.NOUN | MorphClass.PRONOUN)):
            # Unknown word: accept only with geo context on either side.
            ok = (li[0].geo_object_before or li[i - 1].geo_object_after)
            if (ok and li[0].begin_token == li[0].end_token):
                mcc = li[0].begin_token.getMorphClassInDictionary()
                if (mcc.is_proper_name or mcc.is_proper_surname):
                    ok = False
                elif (li[0].geo_object_before and (li[0].whitespaces_after_count < 2)):
                    ad1 = AddressItemToken.tryParse(li[0].begin_token, None, False, False, None)
                    if (ad1 is not None and ad1.typ == AddressItemToken.ItemType.STREET):
                        ad2 = AddressItemToken.tryParse(li[0].end_token.next0_, None, False, False, None)
                        if (ad2 is None or ad2.typ != AddressItemToken.ItemType.STREET):
                            ok = False
                    elif (AddressItemToken.tryAttachOrg(li[0].begin_token) is not None):
                        ok = False
        if (ok):
            # Person interpretation overrides the city one.
            if (li[0].kit.processReferent("PERSON", li[0].begin_token) is not None):
                ok = False
    if (not ok):
        ok = CityAttachHelper.checkYearAfter(li[0].end_token.next0_)
    if (not ok and ((not li[0].begin_token.morph.class0_.is_adjective or li[0].begin_token != li[0].end_token))):
        ok = CityAttachHelper.checkCityAfter(li[0].end_token.next0_)
    if (not ok):
        return None
    if (i < len(li)):
        # Drop unconsumed trailing items.
        del li[i:i + len(li) - i]
    rt = None
    if (oi.value is None):
        if (li[0].value is not None and li[0].higher_geo is not None):
            # Known capital-style value with an explicit higher geo.
            cap = GeoReferent()
            cap._addName(li[0].value)
            cap._addTypCity(li[0].kit.base_language)
            cap.higher = li[0].higher_geo
            if (typ is not None):
                cap._addTyp(typ)
            if (alttyp is not None):
                cap._addTyp(alttyp)
            rt = ReferentToken(cap, li[0].begin_token, li[0].end_token)
        else:
            if (li[0].value is None):
                return None
            if (typ is None):
                # Bare name without a type noun: accept only the
                # "<geo>-<name>" hyphenated pattern.
                if ((len(li) == 1 and li[0].begin_token.previous is not None and li[0].begin_token.previous.is_hiphen) and (isinstance(li[0].begin_token.previous.previous, ReferentToken)) and (isinstance(li[0].begin_token.previous.previous.getReferent(), GeoReferent))):
                    pass
                else:
                    return None
            else:
                if (not LanguageHelper.endsWithEx(typ, "ПУНКТ", "ПОСЕЛЕНИЕ", "ПОСЕЛЕННЯ", "ПОСЕЛОК")):
                    if (not LanguageHelper.endsWith(typ, "CITY")):
                        if (typ == "СТАНЦИЯ" and ((MiscLocationHelper.checkGeoObjectBefore(li[0].begin_token)))):
                            pass
                        elif (len(li) > 1 and li[1].typ == CityItemToken.ItemType.NOUN and li[0].typ == CityItemToken.ItemType.CITY):
                            pass
                        else:
                            return None
                if (li[0].begin_token.morph.class0_.is_adjective):
                    # Normalize an adjectival name against the noun's case/gender.
                    li[0].value = ProperNameHelper.getNameEx(li[0].begin_token, li[0].end_token, MorphClass.ADJECTIVE, li[1].morph.case_, li[1].morph.gender, False, False)
    elif (isinstance(oi.value.referent, GeoReferent)):
        rt = ReferentToken._new719(Utils.asObjectOrNull(oi.value.referent, GeoReferent), li[0].begin_token, li[len(li) - 1].end_token, mc)
    elif (typ is None):
        typ = oi.value.typ
    if (rt is None):
        city = GeoReferent()
        city._addName((li[0].value if oi.value is None else oi.value.canonic_text))
        if (typ is not None):
            city._addTyp(typ)
        else:
            city._addTypCity(li[0].kit.base_language)
        if (alttyp is not None):
            city._addTyp(alttyp)
        rt = ReferentToken._new719(city, li[0].begin_token, li[len(li) - 1].end_token, mc)
    if ((isinstance(rt.referent, GeoReferent)) and len(li) == 1 and (rt.referent).is_city):
        # Absorb the "г." / "г" abbreviation adjacent to the city name.
        if (rt.begin_token.previous is not None and rt.begin_token.previous.isValue("Г", None)):
            rt.begin_token = rt.begin_token.previous
        elif ((rt.begin_token.previous is not None and rt.begin_token.previous.isChar('.') and rt.begin_token.previous.previous is not None) and rt.begin_token.previous.previous.isValue("Г", None)):
            rt.begin_token = rt.begin_token.previous.previous
        elif (rt.end_token.next0_ is not None and (rt.whitespaces_after_count < 2) and rt.end_token.next0_.isValue("Г", None)):
            rt.end_token = rt.end_token.next0_
            if (rt.end_token.next0_ is not None and rt.end_token.next0_.isChar('.')):
                rt.end_token = rt.end_token.next0_
    return rt
def process(self, kit: 'AnalysisKit') -> None:
    """Main pass of the transport analyzer over the token stream in ``kit``.

    First sweep: parse TransItemToken sequences, attach TransportReferents
    (re-using an equal referent found within the previous ~1000 tokens),
    embed them, and index their MODEL values (plain and brand-prefixed) and
    NAME values into termin collections.

    Second sweep: re-scan the text, recognizing further mentions via those
    collections (including bracketed names) and embedding matching tokens.
    """
    ad = kit.getAnalyzerData(self)
    models = TerminCollection()
    objs_by_model = dict()
    obj_by_names = TerminCollection()
    t = kit.first_token
    first_pass3136 = True
    while True:
        # generated loop scaffolding: advance t on every pass but the first
        if first_pass3136: first_pass3136 = False
        else: t = t.next0_
        if (not (t is not None)): break
        its = TransItemToken.tryParseList(t, 10)
        if (its is None):
            continue
        rts = self.__tryAttach(its, False)
        if (rts is not None):
            for rt in rts:
                # Look back (bounded) for an existing referent whose slots all match.
                cou = 0
                tt = t.previous
                first_pass3137 = True
                while True:
                    if first_pass3137: first_pass3137 = False
                    else:
                        tt = tt.previous
                        cou += 1
                    if (not (tt is not None and (cou < 1000))): break
                    tr = Utils.asObjectOrNull(tt.getReferent(), TransportReferent)
                    if (tr is None):
                        continue
                    ok = True
                    for s in rt.referent.slots:
                        if (tr.findSlot(s.type_name, s.value, True) is None):
                            ok = False
                            break
                    if (ok):
                        rt.referent = (tr)
                        break
                rt.referent = ad.registerReferent(rt.referent)
                kit.embedToken(rt)
                t = (rt)
                for s in rt.referent.slots:
                    if (s.type_name == TransportReferent.ATTR_MODEL):
                        mod = str(s.value)
                        # Index the model twice: bare, then "BRAND MODEL".
                        for k in range(2):
                            if (not str.isdigit(mod[0])):
                                li = []
                                wrapli2546 = RefOutArgWrapper(None)
                                inoutres2547 = Utils.tryGetValue(objs_by_model, mod, wrapli2546)
                                li = wrapli2546.value
                                if (not inoutres2547):
                                    li = list()
                                    objs_by_model[mod] = li
                                if (not rt.referent in li):
                                    li.append(rt.referent)
                                models.addStr(mod, li, None, False)
                            if (k > 0):
                                break
                            brand = rt.referent.getStringValue(TransportReferent.ATTR_BRAND)
                            if (brand is None):
                                break
                            mod = "{0} {1}".format(brand, mod)
                    elif (s.type_name == TransportReferent.ATTR_NAME):
                        obj_by_names.add(Termin._new117(str(s.value), rt.referent))
    if (len(objs_by_model) == 0 and len(obj_by_names.termins) == 0):
        return
    # Second sweep: recognize additional mentions via the collected termins.
    t = kit.first_token
    first_pass3138 = True
    while True:
        if first_pass3138: first_pass3138 = False
        else: t = t.next0_
        if (not (t is not None)): break
        br = BracketHelper.tryParse(t, BracketParseAttr.NO, 10)
        if (br is not None):
            # A known name filling a whole bracketed span.
            toks = obj_by_names.tryParse(t.next0_, TerminParseAttr.NO)
            if (toks is not None and toks.end_token.next0_ == br.end_token):
                rt0 = ReferentToken(Utils.asObjectOrNull(toks.termin.tag, Referent), br.begin_token, br.end_token)
                kit.embedToken(rt0)
                t = (rt0)
                continue
        if (not ((isinstance(t, TextToken)))):
            continue
        if (not t.chars.is_letter):
            continue
        tok = models.tryParse(t, TerminParseAttr.NO)
        if (tok is None):
            if (not t.chars.is_all_lower):
                tok = obj_by_names.tryParse(t, TerminParseAttr.NO)
            if (tok is None):
                continue
        if (not tok.is_whitespace_after):
            # Reject matches glued to following text unless ended by ",.)" or a bracket.
            if (tok.end_token.next0_ is None or not tok.end_token.next0_.isCharOf(",.)")):
                if (not BracketHelper.isBracket(tok.end_token.next0_, False)):
                    continue
        tr = None
        li = Utils.asObjectOrNull(tok.termin.tag, list)
        if (li is not None and len(li) == 1):
            # Model tag is a list of referents; use it only when unambiguous.
            tr = li[0]
        else:
            tr = (Utils.asObjectOrNull(tok.termin.tag, Referent))
        if (tr is not None):
            # Absorb a preceding brand item into the new mention.
            tit = TransItemToken.tryParse(tok.begin_token.previous, None, False, True)
            if (tit is not None and tit.typ == TransItemToken.Typs.BRAND):
                tr.addSlot(TransportReferent.ATTR_BRAND, tit.value, False, 0)
                tok.begin_token = tit.begin_token
            rt0 = ReferentToken(tr, tok.begin_token, tok.end_token)
            kit.embedToken(rt0)
            t = (rt0)
            continue
def process(self, kit: 'AnalysisKit') -> None:
    """Main keyword-extraction entry point for this analyzer.

    (The original Russian comment read "main function for extracting phones"
    — presumably copy-pasted from another analyzer; this one builds
    KeywordReferent objects.)

    Stages visible in the code:
      1. ensure DenominationAnalyzer has run;
      2. walk all tokens, turning verbs into PREDICATE keywords and noun
         phrases into OBJECT keywords (single words plus a combined keyword
         for multi-word phrases);
      3. a second pass merges adjacent genitive / "OF"-linked OBJECT keywords;
      4. optionally sorts referents by rank and creates an annotation.

    NOTE(review): indentation was reconstructed from a collapsed source line;
    confirm nesting against the original file.
    """
    ad = kit.get_analyzer_data(self)
    # Run the denomination analyzer ourselves if it is not in the pipeline.
    has_denoms = False
    for a in kit.processor.analyzers:
        if ((isinstance(a, DenominationAnalyzer)) and not a.ignore_this_analyzer):
            has_denoms = True
    if (not has_denoms):
        a = DenominationAnalyzer()
        a.process(kit)
    li = list()
    tmp = io.StringIO()
    tmp2 = list()
    # Total token count, used for rank normalization in __set_rank.
    max0_ = 0
    t = kit.first_token
    while t is not None:
        max0_ += 1
        t = t.next0_
    cur = 0
    t = kit.first_token
    first_pass3292 = True
    while True:
        if first_pass3292: first_pass3292 = False
        else: t = t.next0_; cur += 1
        if (not (t is not None)): break
        r = t.get_referent()
        if (r is not None):
            t = self.__add_referents(ad, t, cur, max0_)
            continue
        if (not (isinstance(t, TextToken))): continue
        if (not t.chars.is_letter or (t.length_char < 3)): continue
        term = t.term
        # "ЕСТЬ" is kept only right after a verb.
        if (term == "ЕСТЬ"):
            if ((isinstance(t.previous, TextToken)) and t.previous.morph.class0_.is_verb):
                pass
            else: continue
        npt = None
        npt = NounPhraseHelper.try_parse(t, Utils.valToEnum((NounPhraseParseAttr.ADJECTIVECANBELAST) | (NounPhraseParseAttr.PARSEPREPOSITION), NounPhraseParseAttr), 0, None)
        if (npt is None):
            # Not a noun phrase: a dictionary verb becomes a PREDICATE keyword.
            mc = t.get_morph_class_in_dictionary()
            if (mc.is_verb and not mc.is_preposition):
                if (t.is_verb_be): continue
                if (t.is_value("МОЧЬ", None) or t.is_value("WOULD", None)): continue
                kref = KeywordReferent._new1595(KeywordType.PREDICATE)
                norm = t.get_normal_case_text(MorphClass.VERB, MorphNumber.SINGULAR, MorphGender.UNDEFINED, False)
                if (norm is None): norm = t.lemma
                # Strip reflexive "-СЯ" ending.
                if (norm.endswith("ЬСЯ")): norm = norm[0:0 + len(norm) - 2]
                kref.add_slot(KeywordReferent.ATTR_VALUE, norm, False, 0)
                drv = DerivateService.find_derivates(norm, True, t.morph.language)
                KeywordAnalyzer.__add_normals(kref, drv, norm)
                kref = (Utils.asObjectOrNull(ad.register_referent(kref), KeywordReferent))
                KeywordAnalyzer.__set_rank(kref, cur, max0_)
                rt1 = ReferentToken._new734(ad.register_referent(kref), t, t, t.morph)
                kit.embed_token(rt1)
                t = (rt1)
                continue
            continue
        if (npt.internal_noun is not None): continue
        # Skip idiomatic prepositional phrases ("в целом", "в частности", "со стороны").
        if (npt.end_token.is_value("ЦЕЛОМ", None) or npt.end_token.is_value("ЧАСТНОСТИ", None)):
            if (npt.preposition is not None):
                t = npt.end_token
                continue
        if (npt.end_token.is_value("СТОРОНЫ", None) and npt.preposition is not None and npt.preposition.normal == "С"):
            t = npt.end_token
            continue
        if (npt.begin_token == npt.end_token):
            mc = t.get_morph_class_in_dictionary()
            if (mc.is_preposition): continue
            elif (mc.is_adverb):
                if (t.is_value("ПОТОМ", None)): continue
                else: pass
        # Create an OBJECT keyword for every significant word of the phrase.
        li.clear()
        t0 = t
        tt = t
        first_pass3293 = True
        while True:
            if first_pass3293: first_pass3293 = False
            else: tt = tt.next0_
            if (not (tt is not None and tt.end_char <= npt.end_char)): break
            if (not (isinstance(tt, TextToken))): continue
            if (tt.is_value("NATURAL", None)): pass
            if ((tt.length_char < 3) or not tt.chars.is_letter): continue
            mc = tt.get_morph_class_in_dictionary()
            if ((mc.is_preposition or mc.is_pronoun or mc.is_personal_pronoun) or mc.is_conjunction):
                if (tt.is_value("ОТНОШЕНИЕ", None)): pass
                else: continue
            if (mc.is_misc):
                if (MiscHelper.is_eng_article(tt)): continue
            kref = KeywordReferent._new1595(KeywordType.OBJECT)
            norm = tt.lemma
            kref.add_slot(KeywordReferent.ATTR_VALUE, norm, False, 0)
            if (norm != "ЕСТЬ"):
                drv = DerivateService.find_derivates(norm, True, tt.morph.language)
                KeywordAnalyzer.__add_normals(kref, drv, norm)
            kref = (Utils.asObjectOrNull(ad.register_referent(kref), KeywordReferent))
            KeywordAnalyzer.__set_rank(kref, cur, max0_)
            rt1 = ReferentToken._new734(kref, tt, tt, tt.morph)
            kit.embed_token(rt1)
            if (tt == t and len(li) == 0): t0 = (rt1)
            t = (rt1)
            li.append(kref)
        # Multi-word phrase: also build one combined keyword referencing
        # the per-word keywords, with a sorted normal form.
        if (len(li) > 1):
            kref = KeywordReferent._new1595(KeywordType.OBJECT)
            Utils.setLengthStringIO(tmp, 0)
            tmp2.clear()
            has_norm = False
            for kw in li:
                s = kw.get_string_value(KeywordReferent.ATTR_VALUE)
                if (tmp.tell() > 0): print(' ', end="", file=tmp)
                print(s, end="", file=tmp)
                n = kw.get_string_value(KeywordReferent.ATTR_NORMAL)
                if (n is not None):
                    has_norm = True
                    tmp2.append(n)
                else:
                    tmp2.append(s)
                kref.add_slot(KeywordReferent.ATTR_REF, kw, False, 0)
            val = npt.get_normal_case_text(None, MorphNumber.SINGULAR, MorphGender.UNDEFINED, False)
            kref.add_slot(KeywordReferent.ATTR_VALUE, val, False, 0)
            Utils.setLengthStringIO(tmp, 0)
            tmp2.sort()
            for s in tmp2:
                if (tmp.tell() > 0): print(' ', end="", file=tmp)
                print(s, end="", file=tmp)
            norm = Utils.toStringStringIO(tmp)
            if (norm != val):
                kref.add_slot(KeywordReferent.ATTR_NORMAL, norm, False, 0)
            kref = (Utils.asObjectOrNull(ad.register_referent(kref), KeywordReferent))
            KeywordAnalyzer.__set_rank(kref, cur, max0_)
            rt1 = ReferentToken._new734(kref, t0, t, npt.morph)
            kit.embed_token(rt1)
            t = (rt1)
    # Pass 2: merge "X of Y" / genitive pairs of OBJECT keywords.
    cur = 0
    t = kit.first_token
    first_pass3294 = True
    while True:
        if first_pass3294: first_pass3294 = False
        else: t = t.next0_; cur += 1
        if (not (t is not None)): break
        kw = Utils.asObjectOrNull(t.get_referent(), KeywordReferent)
        if (kw is None or kw.typ != KeywordType.OBJECT): continue
        if (t.next0_ is None or kw.child_words > 2): continue
        t1 = t.next0_
        if (t1.is_value("OF", None) and (t1.whitespaces_after_count < 3) and t1.next0_ is not None):
            t1 = t1.next0_
            if ((isinstance(t1, TextToken)) and MiscHelper.is_eng_article(t1) and t1.next0_ is not None):
                t1 = t1.next0_
        elif (not t1.morph.case_.is_genitive or t.whitespaces_after_count > 1): continue
        kw2 = Utils.asObjectOrNull(t1.get_referent(), KeywordReferent)
        if (kw2 is None): continue
        if (kw == kw2): continue
        if (kw2.typ != KeywordType.OBJECT or (kw.child_words + kw2.child_words) > 3): continue
        kw_un = KeywordReferent()
        kw_un._union(kw, kw2, MiscHelper.get_text_value(t1, t1, GetTextAttr.NO))
        kw_un = (Utils.asObjectOrNull(ad.register_referent(kw_un), KeywordReferent))
        KeywordAnalyzer.__set_rank(kw_un, cur, max0_)
        rt1 = ReferentToken._new734(kw_un, t, t1, t.morph)
        kit.embed_token(rt1)
        t = (rt1)
    if (KeywordAnalyzer.SORT_KEYWORDS_BY_RANK):
        all0_ = list(ad.referents)
        all0_.sort(key=operator.attrgetter('rank'), reverse=True)
        ad.referents = all0_
    if (KeywordAnalyzer.ANNOTATION_MAX_SENTENCES > 0):
        ano = AutoannoSentToken.create_annotation(kit, KeywordAnalyzer.ANNOTATION_MAX_SENTENCES)
        if (ano is not None):
            ad.register_referent(ano)
def tryParse(t: 'Token', typ: 'BracketParseAttr' = BracketParseAttr.NO, max_tokens: int = 100) -> 'BracketSequenceToken':
    """Try to recover a token sequence framed by quotes/brackets.

    Args:
        t(Token): the token expected to open the sequence
        typ(BracketParseAttr): parsing options
        max_tokens(int): hard cap on inner tokens (in case the closing
            quote was forgotten)

    Returns the recovered BracketSequenceToken or None.

    NOTE(review): indentation was reconstructed from a collapsed source
    line; nesting of the heuristic branches should be confirmed against
    the original file.
    """
    t0 = t
    cou = 0
    if (not BracketHelper.canBeStartOfSequence(t0, False, False)): return None
    # br_list accumulates every bracket/quote char met while scanning forward.
    br_list = list()
    br_list.append(BracketHelper.Bracket(t0))
    cou = 0
    crlf = 0
    last = None
    lev = 1
    # "Assimetric" mode: opening char has a distinct closing counterpart.
    is_assim = br_list[0].char0_ != '«' and BracketHelper.M_ASSYMOPEN_CHARS.find(br_list[0].char0_) >= 0
    t = t0.next0_
    first_pass2802 = True
    while True:
        if first_pass2802: first_pass2802 = False
        else: t = t.next0_
        if (not (t is not None)): break
        if (t.is_table_control_char): break
        last = t
        if (t.isCharOf(BracketHelper.M_OPEN_CHARS) or t.isCharOf(BracketHelper.M_CLOSE_CHARS)):
            # A bracket starting a new line usually starts a new sequence.
            if (t.is_newline_before and (((typ) & (BracketParseAttr.CANBEMANYLINES))) == (BracketParseAttr.NO)):
                if (t.whitespaces_before_count > 10 or BracketHelper.canBeStartOfSequence(t, False, False)):
                    if (t.isChar('(') and not t0.isChar('(')): pass
                    else:
                        last = t.previous
                        break
            bb = BracketHelper.Bracket(t)
            br_list.append(bb)
            if (len(br_list) > 20): break
            # Special case: «...» where the 3rd bracket could close either of
            # the first two — look ahead to decide whether to keep scanning.
            if ((len(br_list) == 3 and br_list[1].can_be_open and bb.can_be_close) and BracketHelper.__mustBeCloseChar(bb.char0_, br_list[1].char0_) and BracketHelper.__mustBeCloseChar(bb.char0_, br_list[0].char0_)):
                ok = False
                tt = t.next0_
                while tt is not None:
                    if (tt.is_newline_before): break
                    if (tt.isChar(',')): break
                    if (tt.isChar('.')):
                        tt = tt.next0_
                        while tt is not None:
                            if (tt.is_newline_before): break
                            elif (tt.isCharOf(BracketHelper.M_OPEN_CHARS) or tt.isCharOf(BracketHelper.M_CLOSE_CHARS)):
                                bb2 = BracketHelper.Bracket(tt)
                                if (BracketHelper.canBeEndOfSequence(tt, False, None, False) and BracketHelper.__canBeCloseChar(bb2.char0_, br_list[0].char0_)):
                                    ok = True
                                break
                            tt = tt.next0_
                        break
                    # NOTE(review): this checks `t` (the bracket just consumed,
                    # always true here) rather than `tt` — possibly a typo for
                    # `tt`; confirm against the original/C# source.
                    if (t.isCharOf(BracketHelper.M_OPEN_CHARS) or t.isCharOf(BracketHelper.M_CLOSE_CHARS)):
                        ok = True
                        break
                    tt = tt.next0_
                if (not ok): break
            if (is_assim):
                # Track nesting level for asymmetric pairs like (), [].
                if (bb.can_be_open and not bb.can_be_close and bb.char0_ == br_list[0].char0_):
                    lev += 1
                elif (bb.can_be_close and not bb.can_be_open and BracketHelper.M_OPEN_CHARS.find(br_list[0].char0_) == BracketHelper.M_CLOSE_CHARS.find(bb.char0_)):
                    lev -= 1
                    if (lev == 0): break
        else:
            cou += 1
            if ((cou) > max_tokens): break
            # Unless allowed, a finite lowercase verb inside quotes aborts
            # the sequence (quoted fragments are normally nominal).
            if ((((typ) & (BracketParseAttr.CANCONTAINSVERBS))) == (BracketParseAttr.NO)):
                if (t.morph.language.is_cyrillic):
                    if (t.getMorphClassInDictionary() == MorphClass.VERB):
                        if (not t.morph.class0_.is_adjective and not t.morph.containsAttr("страд.з.", None)):
                            if (t.chars.is_all_lower):
                                norm = t.getNormalCaseText(None, False, MorphGender.UNDEFINED, False)
                                if (not LanguageHelper.endsWith(norm, "СЯ")):
                                    if (len(br_list) > 1): break
                                    if (br_list[0].char0_ != '('): break
                elif (t.morph.language.is_en):
                    if (t.morph.class0_ == MorphClass.VERB and t.chars.is_all_lower): break
        r = t.getReferent()
        if (r is not None and r.type_name == "ADDRESS"):
            if (not t0.isChar('(')): break
        if ((((typ) & (BracketParseAttr.CANBEMANYLINES))) != (BracketParseAttr.NO)):
            if (t.is_newline_before):
                if (t.newlines_before_count > 1): break
                crlf += 1
            continue
        # Line-break heuristics for the normal (single-paragraph) mode.
        if (t.is_newline_before):
            if (t.whitespaces_before_count > 15): break
            crlf += 1
            if (not t.chars.is_all_lower):
                if (t.previous is not None and t.previous.isChar('.')): break
                if ((isinstance(t.previous, MetaToken)) and BracketHelper.canBeEndOfSequence((t.previous).end_token, False, None, False)): break
            if (crlf > 1):
                if (len(br_list) > 1): break
            if (crlf > 10): break
        if (t.isChar(';') and t.is_newline_after): break
    # Opening bracket only, but a MetaToken ending the line may close it.
    if ((len(br_list) == 1 and br_list[0].can_be_open and (isinstance(last, MetaToken))) and last.is_newline_after):
        if (BracketHelper.canBeEndOfSequence((last).end_token, False, None, False)):
            return BracketSequenceToken(t0, last)
    if (len(br_list) < 1): return None
    # Mark <...> pairs explicitly.
    i = 1
    while i < (len(br_list) - 1):
        if (br_list[i].char0_ == '<' and br_list[i + 1].char0_ == '>'):
            br_list[i].can_be_open = True
            br_list[i + 1].can_be_close = True
        i += 1
    internals = None
    # Collapse trailing matched inner pairs.
    while len(br_list) > 3:
        i = len(br_list) - 1
        if ((br_list[i].can_be_close and br_list[i - 1].can_be_open and not BracketHelper.__canBeCloseChar(br_list[i].char0_, br_list[0].char0_)) and BracketHelper.__canBeCloseChar(br_list[i].char0_, br_list[i - 1].char0_)):
            del br_list[len(br_list) - 2:len(br_list) - 2 + 2]
            continue
        break
    # Collapse remaining inner open/close pairs, remembering them as internals.
    while len(br_list) >= 4:
        changed = False
        i = 1
        while i < (len(br_list) - 2):
            if ((br_list[i].can_be_open and not br_list[i].can_be_close and br_list[i + 1].can_be_close) and not br_list[i + 1].can_be_open):
                ok = False
                if (BracketHelper.__mustBeCloseChar(br_list[i + 1].char0_, br_list[i].char0_) or br_list[i].char0_ != br_list[0].char0_):
                    ok = True
                if ((i == 1 and ((i + 2) < len(br_list)) and br_list[i + 2].char0_ == ')') and br_list[i + 1].char0_ != ')' and BracketHelper.__canBeCloseChar(br_list[i + 1].char0_, br_list[i - 1].char0_)):
                    br_list[i + 2] = br_list[i + 1]
                elif (i > 1 and ((i + 2) < len(br_list)) and BracketHelper.__mustBeCloseChar(br_list[i + 2].char0_, br_list[i - 1].char0_)):
                    ok = True
                if (ok):
                    if (internals is None): internals = list()
                    internals.append(BracketSequenceToken(br_list[i].source, br_list[i + 1].source))
                    del br_list[i:i + 2]
                    changed = True
                    break
            i += 1
        if (not changed): break
    res = None
    # 4 brackets: outer pair wrapping an inner pair.
    if ((len(br_list) >= 4 and br_list[1].can_be_open and br_list[2].can_be_close) and br_list[3].can_be_close and not br_list[3].can_be_open):
        if (BracketHelper.__canBeCloseChar(br_list[3].char0_, br_list[0].char0_)):
            res = BracketSequenceToken(br_list[0].source, br_list[3].source)
            if (br_list[0].source.next0_ != br_list[1].source or br_list[2].source.next0_ != br_list[3].source):
                res.internal.append(BracketSequenceToken(br_list[1].source, br_list[2].source))
                if (internals is not None): res.internal.extend(internals)
    # 3 brackets: decide whether #1 or #2 closes the sequence.
    if ((res is None and len(br_list) >= 3 and br_list[2].can_be_close) and not br_list[2].can_be_open):
        if ((((typ) & (BracketParseAttr.NEARCLOSEBRACKET))) != (BracketParseAttr.NO)):
            if (BracketHelper.__canBeCloseChar(br_list[1].char0_, br_list[0].char0_)):
                return BracketSequenceToken(br_list[0].source, br_list[1].source)
        ok = True
        if (BracketHelper.__canBeCloseChar(br_list[2].char0_, br_list[0].char0_) and BracketHelper.__canBeCloseChar(br_list[1].char0_, br_list[0].char0_) and br_list[1].can_be_close):
            # Inspect the tokens between bracket #1 and #2.
            t = br_list[1].source
            while t != br_list[2].source and t is not None:
                if (t.is_newline_before):
                    ok = False
                    break
                if (t.chars.is_letter and t.chars.is_all_lower):
                    ok = False
                    break
                npt = NounPhraseHelper.tryParse(t, NounPhraseParseAttr.NO, 0)
                if (npt is not None): t = npt.end_token
                t = t.next0_
            if (ok):
                t = br_list[0].source.next0_
                while t != br_list[1].source and t is not None:
                    if (t.is_newline_before):
                        return BracketSequenceToken(br_list[0].source, t.previous)
                    t = t.next0_
                # Look back before the opening bracket for unbalanced closers.
                lev1 = 0
                tt = br_list[0].source.previous
                first_pass2803 = True
                while True:
                    if first_pass2803: first_pass2803 = False
                    else: tt = tt.previous
                    if (not (tt is not None)): break
                    if (tt.is_newline_after or tt.is_table_control_char): break
                    if (not ((isinstance(tt, TextToken)))): continue
                    if (tt.chars.is_letter or tt.length_char > 1): continue
                    ch = (tt).term[0]
                    if (BracketHelper.__canBeCloseChar(ch, br_list[0].char0_)): lev1 += 1
                    elif (BracketHelper.__canBeCloseChar(br_list[1].char0_, ch)):
                        lev1 -= 1
                        if (lev1 < 0):
                            return BracketSequenceToken(br_list[0].source, br_list[1].source)
        if (ok and BracketHelper.__canBeCloseChar(br_list[2].char0_, br_list[0].char0_)):
            intern = BracketSequenceToken(br_list[1].source, br_list[2].source)
            res = BracketSequenceToken(br_list[0].source, br_list[2].source)
            res.internal.append(intern)
        elif (ok and BracketHelper.__canBeCloseChar(br_list[2].char0_, br_list[1].char0_) and br_list[0].can_be_open):
            if (BracketHelper.__canBeCloseChar(br_list[2].char0_, br_list[0].char0_)):
                intern = BracketSequenceToken(br_list[1].source, br_list[2].source)
                res = BracketSequenceToken(br_list[0].source, br_list[2].source)
                res.internal.append(intern)
        elif (len(br_list) == 3): return None
    # Fallbacks: simple two-bracket sequence.
    if (res is None and len(br_list) > 1 and br_list[1].can_be_close):
        res = BracketSequenceToken(br_list[0].source, br_list[1].source)
    if (res is None and len(br_list) > 1 and BracketHelper.__canBeCloseChar(br_list[1].char0_, br_list[0].char0_)):
        res = BracketSequenceToken(br_list[0].source, br_list[1].source)
    if (res is None and len(br_list) == 2 and br_list[0].char0_ == br_list[1].char0_):
        res = BracketSequenceToken(br_list[0].source, br_list[1].source)
    if (res is not None and internals is not None):
        for i in internals:
            if (i.begin_char < res.end_char): res.internal.append(i)
    # Last resort: a MetaToken further on may end with a suitable closer.
    if (res is None):
        cou = 0
        tt = t0.next0_
        first_pass2804 = True
        while True:
            if first_pass2804: first_pass2804 = False
            else:
                tt = tt.next0_
                cou += 1
            if (not (tt is not None)): break
            if (tt.is_table_control_char): break
            if (MiscHelper.canBeStartOfSentence(tt)): break
            if (max_tokens > 0 and cou > max_tokens): break
            mt = Utils.asObjectOrNull(tt, MetaToken)
            if (mt is None): continue
            if (isinstance(mt.end_token, TextToken)):
                if ((mt.end_token).isCharOf(BracketHelper.M_CLOSE_CHARS)):
                    bb = BracketHelper.Bracket(Utils.asObjectOrNull(mt.end_token, TextToken))
                    if (bb.can_be_close and BracketHelper.__canBeCloseChar(bb.char0_, br_list[0].char0_)):
                        return BracketSequenceToken(t0, tt)
    return res
def when(self) -> 'Referent':
    """When the fact happened (a DateReferent or DateRangeReferent)."""
    # Fetch the raw slot value first, then narrow it to Referent (or None).
    raw = self.getSlotValue(BusinessFactReferent.ATTR_WHEN)
    return Utils.asObjectOrNull(raw, Referent)
def __tryAttach(self, t: 'Token', key_word: bool) -> 'ReferentToken':
    """Try to attach a bank-requisites block (BankDataReferent) starting at *t*.

    Scans forward collecting bank-requisite UriReferents (Р/С, Л/С, К/С, ИНН, …
    — filtered by __isBankReq), the bank organization, and optionally the
    correspondent bank; gives up (returns None) or stops on various
    layout/content conditions tracked by the `empty` counter.

    NOTE(review): indentation was reconstructed from a collapsed source line;
    confirm nesting against the original file.
    """
    if (t is None): return None
    t0 = t
    t1 = t
    uris_keys = None
    uris = None
    org0_ = None
    cor_org = None
    org_is_bank = False
    # Count of "noise" tokens since the last requisite; too many aborts scan.
    empty = 0
    last_uri = None
    first_pass2749 = True
    while True:
        if first_pass2749: first_pass2749 = False
        else: t = t.next0_
        if (not (t is not None)): break
        if (t.is_table_control_char and t != t0): break
        if (t.is_comma or t.morph.class0_.is_preposition or t.isCharOf("/\\")): continue
        bank_keyword = False
        # Skip "полное наименование/название" headers.
        if (t.isValue("ПОЛНЫЙ", None) and t.next0_ is not None and ((t.next0_.isValue("НАИМЕНОВАНИЕ", None) or t.next0_.isValue("НАЗВАНИЕ", None)))):
            t = t.next0_.next0_
            if (t is None): break
        # "БАНК ..." keyword possibly followed by a noun phrase and ':'.
        if (t.isValue("БАНК", None)):
            if ((isinstance(t, ReferentToken)) and t.getReferent().type_name == "ORGANIZATION"):
                bank_keyword = True
            tt = t.next0_
            npt = NounPhraseHelper.tryParse(tt, NounPhraseParseAttr.NO, 0)
            if (npt is not None): tt = npt.end_token.next0_
            if (tt is not None and tt.isChar(':')): tt = tt.next0_
            if (tt is not None):
                if (not bank_keyword):
                    t = tt
                    bank_keyword = True
                elif (tt.getReferent() is not None and tt.getReferent().type_name == "ORGANIZATION"):
                    t = tt
        r = t.getReferent()
        if (r is not None and r.type_name == "ORGANIZATION"):
            # Decide whether the organization (or an ancestor, up to 4 levels)
            # is a bank by its KIND attribute.
            is_bank = False
            kk = 0
            rr = r
            while rr is not None and (kk < 4):
                is_bank = Utils.compareStrings(Utils.ifNotNull(rr.getStringValue("KIND"), ""), "Bank", True) == 0
                if (is_bank): break
                rr = rr.parent_referent; kk += 1
            if (not is_bank and bank_keyword): is_bank = True
            if (not is_bank and uris is not None and "ИНН" in uris_keys): return None
            # After "К/С ... в <org>" the org is the correspondent bank.
            if ((last_uri is not None and last_uri.scheme == "К/С" and t.previous is not None) and t.previous.isValue("В", None)):
                cor_org = r
                t1 = t
            elif (org0_ is None or ((not org_is_bank and is_bank))):
                org0_ = r
                t1 = t
                org_is_bank = is_bank
            if (is_bank): continue
            if (uris is None and not key_word): return None
            continue
        if (isinstance(r, UriReferent)):
            u = Utils.asObjectOrNull(r, UriReferent)
            if (uris is None):
                # First requisite must open the block.
                if (not BankAnalyzer.__isBankReq(u.scheme)): return None
                if (u.scheme == "ИНН" and t.is_newline_after): return None
                uris = list()
                uris_keys = list()
            else:
                if (not BankAnalyzer.__isBankReq(u.scheme)): break
                # A duplicate scheme means the next requisites block started.
                if (u.scheme in uris_keys): break
                if (u.scheme == "ИНН"):
                    if (empty > 0): break
            uris_keys.append(u.scheme)
            uris.append(u)
            last_uri = u
            t1 = t
            empty = 0
            continue
        elif (uris is None and not key_word and not org_is_bank): return None
        if (r is not None and ((r.type_name == "GEO" or r.type_name == "ADDRESS"))):
            empty += 1
            continue
        if (isinstance(t, TextToken)):
            if (t.isValue("ПОЛНЫЙ", None) or t.isValue("НАИМЕНОВАНИЕ", None) or t.isValue("НАЗВАНИЕ", None)): pass
            elif (t.chars.is_letter):
                tok = BankAnalyzer.__m_ontology.tryParse(t, TerminParseAttr.NO)
                if (tok is not None):
                    t = tok.end_token
                    empty = 0
                else:
                    empty += 1
                    # A fresh line starting a "<noun phrase>:" looks like a
                    # new section header — stop.
                    if (t.is_newline_before):
                        nnn = NounPhraseHelper.tryParse(t, NounPhraseParseAttr.NO, 0)
                        if (nnn is not None and nnn.end_token.next0_ is not None and nnn.end_token.next0_.isChar(':')): break
        if (uris is None): break
        if (empty > 2): break
        if (empty > 0 and t.isChar(':') and t.is_newline_after): break
        if (((isinstance(t, NumberToken)) and t.is_newline_before and t.next0_ is not None) and not t.next0_.chars.is_letter): break
    if (uris is None): return None
    # An account number (settlement or personal) is mandatory.
    if (not "Р/С" in uris_keys and not "Л/С" in uris_keys): return None
    ok = False
    if ((len(uris) < 2) and org0_ is None): return None
    bdr = BankDataReferent()
    for u in uris:
        bdr.addSlot(BankDataReferent.ATTR_ITEM, u, False, 0)
    if (org0_ is not None): bdr.addSlot(BankDataReferent.ATTR_BANK, org0_, False, 0)
    if (cor_org is not None): bdr.addSlot(BankDataReferent.ATTR_CORBANK, cor_org, False, 0)
    # Pull additional bank requisites from an organization right before t0.
    org0 = (None if t0.previous is None else t0.previous.getReferent())
    if (org0 is not None and org0.type_name == "ORGANIZATION"):
        for s in org0.slots:
            if (isinstance(s.value, UriReferent)):
                u = Utils.asObjectOrNull(s.value, UriReferent)
                if (BankAnalyzer.__isBankReq(u.scheme)):
                    if (not u.scheme in uris_keys):
                        bdr.addSlot(BankDataReferent.ATTR_ITEM, u, False, 0)
    return ReferentToken(bdr, t0, t1)
def who(self) -> 'Referent':
    """Who performed the action (active voice)."""
    # Two-step form: slot lookup, then type narrowing.
    raw = self.getSlotValue(BusinessFactReferent.ATTR_WHO)
    return Utils.asObjectOrNull(raw, Referent)
def date(self) -> 'DateReferent':
    """The date stored on the title page (slot ATTR_DATE), or None."""
    raw = self.get_slot_value(TitlePageReferent.ATTR_DATE)
    return Utils.asObjectOrNull(raw, DateReferent)
def whom(self) -> 'Referent':
    """Whom the action was applied to (passive voice)."""
    # Two-step form: slot lookup, then type narrowing.
    raw = self.getSlotValue(BusinessFactReferent.ATTR_WHOM)
    return Utils.asObjectOrNull(raw, Referent)
def city(self) -> 'GeoReferent':
    """The city stored on the title page (slot ATTR_CITY), or None."""
    raw = self.get_slot_value(TitlePageReferent.ATTR_CITY)
    return Utils.asObjectOrNull(raw, GeoReferent)
def canBeEquals(self, obj: 'Referent', typ: 'EqualType') -> bool:
    """Decide whether this geo object may denote the same entity as *obj*.

    Compares alpha2 country codes, city/union flags, REF slots, region/state
    kind, names, region types, and (recursively) the `higher` chain.

    NOTE(review): indentation was reconstructed from a collapsed source line;
    confirm nesting against the original file.
    """
    geo_ = Utils.asObjectOrNull(obj, GeoReferent)
    if (geo_ is None): return False
    # Matching ISO alpha-2 codes are decisive.
    if (geo_.alpha2 is not None and geo_.alpha2 == self.alpha2): return True
    if (self.is_city != geo_.is_city): return False
    if (self.is_union != geo_.is_union): return False
    # Unions are equal iff their REF slot sets coincide (both directions).
    if (self.is_union):
        for s in self.slots:
            if (s.type_name == GeoReferent.ATTR_REF):
                if (obj.findSlot(GeoReferent.ATTR_REF, s.value, True) is None): return False
        for s in obj.slots:
            if (s.type_name == GeoReferent.ATTR_REF):
                if (self.findSlot(GeoReferent.ATTR_REF, s.value, True) is None): return False
        return True
    ref1 = Utils.asObjectOrNull(self.getSlotValue(GeoReferent.ATTR_REF), Referent)
    ref2 = Utils.asObjectOrNull(geo_.getSlotValue(GeoReferent.ATTR_REF), Referent)
    if (ref1 is not None and ref2 is not None):
        if (ref1 != ref2): return False
    r = self.is_region or self.is_state
    r1 = geo_.is_region or geo_.is_state
    if (r != r1):
        # NOTE(review): both paths of this branch return False, so the
        # is_territory comparison has no effect — possibly a leftover;
        # confirm intended behavior against the original/C# source.
        if (self.is_territory != geo_.is_territory): return False
        return False
    # At least one name must coincide.
    eq_names = False
    for s in self.slots:
        if (s.type_name == GeoReferent.ATTR_NAME):
            if (geo_.findSlot(s.type_name, s.value, True) is not None):
                eq_names = True
                break
    if (not eq_names): return False
    # Region types must overlap (exactly or by suffix).
    if (self.is_region and geo_.is_region):
        typs1 = self.typs
        typs2 = geo_.typs
        ok = False
        for t in typs1:
            if (t in typs2): ok = True
            else:
                for tt in typs2:
                    if (LanguageHelper.endsWith(tt, t) or LanguageHelper.endsWith(t, tt)): ok = True
        if (not ok): return False
    # Parents must be compatible (allowing one skipped level on either side).
    if (self.higher is not None and geo_.higher is not None):
        if (GeoReferent.__checkRoundDep(self) or GeoReferent.__checkRoundDep(geo_)): return False
        if (self.higher.canBeEquals(geo_.higher, typ)): pass
        elif (geo_.higher.higher is not None and self.higher.canBeEquals(geo_.higher.higher, typ)): pass
        elif (self.higher.higher is not None and self.higher.higher.canBeEquals(geo_.higher, typ)): pass
        else: return False
    return True
def process(self, kit: 'AnalysisKit') -> None:
    """Extract bibliographic references (book links) from the token stream.

    Pass 1: recognizes inline parenthesized citations and bibliography-list
    entries (tracking a "literature block" confidence counter), registering
    BookLinkReferent / BookLinkRefReferent pairs and indexing numbered
    entries in `refs_by_num`.
    Pass 2: resolves short inline citations like "[5]" against that index.

    NOTE(review): indentation was reconstructed from a collapsed source line;
    confirm nesting against the original file.
    """
    ad = kit.getAnalyzerData(self)
    # >0 while we believe we are inside a bibliography list (capped at 5).
    is_lit_block = 0
    # citation number -> list of BookLinkRefReferent, in text order
    refs_by_num = dict()
    rts = []
    t = kit.first_token
    first_pass2754 = True
    while True:
        if first_pass2754: first_pass2754 = False
        else: t = t.next0_
        if (not (t is not None)): break
        # Inline citation in parentheses of plausible length (70..400 chars).
        if (t.isChar('(')):
            br = BracketHelper.tryParse(t, BracketParseAttr.NO, 100)
            if (br is not None and br.length_char > 70 and (br.length_char < 400)):
                if (br.is_newline_after or ((br.end_token.next0_ is not None and br.end_token.next0_.isCharOf(".;")))):
                    rts = BookLinkAnalyzer.__tryParse(t.next0_, False, br.end_char)
                    if (rts is not None and len(rts) >= 1):
                        # rts[1] (if present) is the book itself; rts[0] is the
                        # reference to it.
                        if (len(rts) > 1):
                            rts[1].referent = ad.registerReferent(rts[1].referent)
                            kit.embedToken(rts[1])
                            (rts[0].referent).book = Utils.asObjectOrNull(rts[1].referent, BookLinkReferent)
                            if (rts[0].begin_char == rts[1].begin_char): rts[0].begin_token = rts[1]
                            if (rts[0].end_char == rts[1].end_char): rts[0].end_token = rts[1]
                        rts[0].begin_token = t
                        rts[0].end_token = br.end_token
                        (rts[0].referent).typ = BookLinkRefType.INLINE
                        rts[0].referent = ad.registerReferent(rts[0].referent)
                        kit.embedToken(rts[0])
                        t = (rts[0])
                        continue
        if (not t.is_newline_before): continue
        # Detect the start of a "Literature"/"References" section.
        if (is_lit_block <= 0):
            tt = BookLinkToken.parseStartOfLitBlock(t)
            if (tt is not None):
                is_lit_block = 5
                t = tt
                continue
        rts = BookLinkAnalyzer.__tryParse(t, is_lit_block > 0, 0)
        if (rts is None or (len(rts) < 1)):
            # Failed line weakens the literature-block confidence.
            is_lit_block -= 1
            if ((is_lit_block) < 0): is_lit_block = 0
            continue
        is_lit_block += 1
        if ((is_lit_block) > 5): is_lit_block = 5
        if (len(rts) > 1):
            rts[1].referent = ad.registerReferent(rts[1].referent)
            kit.embedToken(rts[1])
            (rts[0].referent).book = Utils.asObjectOrNull(rts[1].referent, BookLinkReferent)
            if (rts[0].begin_char == rts[1].begin_char): rts[0].begin_token = rts[1]
            if (rts[0].end_char == rts[1].end_char): rts[0].end_token = rts[1]
        re = Utils.asObjectOrNull(rts[0].referent, BookLinkRefReferent)
        re = (Utils.asObjectOrNull(ad.registerReferent(re), BookLinkRefReferent))
        rts[0].referent = (re)
        kit.embedToken(rts[0])
        t = (rts[0])
        # Index numbered entries for pass 2.
        if (re.number is not None):
            li = []
            wrapli385 = RefOutArgWrapper(None)
            inoutres386 = Utils.tryGetValue(refs_by_num, re.number, wrapli385)
            li = wrapli385.value
            if (not inoutres386):
                li = list()
                refs_by_num[re.number] = li
            li.append(re)
    # Pass 2: short inline citations referencing numbered entries.
    t = kit.first_token
    first_pass2755 = True
    while True:
        if first_pass2755: first_pass2755 = False
        else: t = t.next0_
        if (not (t is not None)): break
        if (not ((isinstance(t, TextToken)))): continue
        rt = BookLinkAnalyzer.__tryParseShortInline(t)
        if (rt is None): continue
        re = Utils.asObjectOrNull(rt.referent, BookLinkRefReferent)
        li = []
        wrapli387 = RefOutArgWrapper(None)
        inoutres388 = Utils.tryGetValue(refs_by_num, Utils.ifNotNull(re.number, ""), wrapli387)
        li = wrapli387.value
        if (not inoutres388): continue
        # Pick the first indexed entry occurring after this citation… the
        # loop breaks on the first entry whose occurrence is beyond t.
        i = 0
        while i < len(li):
            if (t.begin_char < li[i].occurrence[0].begin_char): break
            i += 1
        if (i >= len(li)): continue
        re.book = li[i].book
        if (re.pages is None): re.pages = li[i].pages
        re.typ = BookLinkRefType.INLINE
        re = (Utils.asObjectOrNull(ad.registerReferent(re), BookLinkRefReferent))
        rt.referent = (re)
        kit.embedToken(rt)
        t = (rt)
def ref(self) -> 'Referent':
    """The object referenced by this block (slot ATTR_REF), or None."""
    raw = self.getSlotValue(InstrumentBlockReferent.ATTR_REF)
    return Utils.asObjectOrNull(raw, Referent)
def _process(begin: 'Token', max_char_pos: int, kit: 'AnalysisKit', end_token: 'Token') -> 'TitlePageReferent':
    """Extract a TitlePageReferent (title, type, authors, org, city, date…)
    from the document fragment starting at *begin*.

    Args:
        begin: first token of the candidate title page
        max_char_pos: hard character limit (0 = unlimited)
        kit: analysis kit for embedding tokens (may be None)
        end_token: out-parameter wrapper — its ``.value`` is advanced to the
            last token consumed (presumably a RefOutArgWrapper-like holder;
            confirm with callers)

    Returns the filled referent, or None when nothing was found.

    NOTE(review): indentation was reconstructed from a collapsed source line;
    confirm nesting against the original file.
    """
    end_token.value = begin
    res = TitlePageReferent()
    term = None
    lines = Line.parse(begin, 30, 1500, max_char_pos)
    if (len(lines) < 1): return None
    cou = len(lines)
    # Typical inter-line gap (in newlines), estimated from line statistics.
    min_newlines_count = 10
    lines_count_stat = dict()
    i = 0
    while i < len(lines):
        if (TitleNameToken.can_be_start_of_text_or_content(lines[i].begin_token, lines[i].end_token)):
            cou = i
            break
        j = lines[i].newlines_before_count
        if (i > 0 and j > 0):
            if (not j in lines_count_stat): lines_count_stat[j] = 1
            else: lines_count_stat[j] += 1
        i += 1
    max0_ = 0
    for kp in lines_count_stat.items():
        if (kp[1] > max0_):
            max0_ = kp[1]
            min_newlines_count = kp[0]
    end_char = (lines[cou - 1].end_char if cou > 0 else 0)
    if (max_char_pos > 0 and end_char > max_char_pos): end_char = max_char_pos
    # Try every window of up to 5 consecutive lines as a candidate title.
    names = list()
    i = 0
    while i < cou:
        if (i == 6): pass
        j = i
        while (j < cou) and (j < (i + 5)):
            if (i == 6 and j == 8): pass
            if (j > i):
                # Do not mix languages or jump over a large vertical gap.
                if (lines[j - 1].is_pure_en and lines[j].is_pure_ru): break
                if (lines[j - 1].is_pure_ru and lines[j].is_pure_en): break
                if (lines[j].newlines_before_count >= (min_newlines_count * 2)): break
            ttt = TitleNameToken.try_parse(lines[i].begin_token, lines[j].end_token, min_newlines_count)
            if (ttt is not None):
                if (lines[i].is_pure_en): ttt.morph.language = MorphLang.EN
                elif (lines[i].is_pure_ru): ttt.morph.language = MorphLang.RU
                names.append(ttt)
            j += 1
        i += 1
    TitleNameToken.sort(names)
    name_rt = None
    if (len(names) > 0):
        # Prefer a ranked Russian candidate over an English best one.
        i0 = 0
        if (names[i0].morph.language.is_en):
            ii = 1
            while ii < len(names):
                if (names[ii].morph.language.is_ru and names[ii].rank > 0):
                    i0 = ii
                    break
                ii += 1
        term = res._add_name(names[i0].begin_name_token, names[i0].end_name_token)
        if (names[i0].type_value is not None): res._add_type(names[i0].type_value)
        if (names[i0].speciality is not None): res.speciality = names[i0].speciality
        rt = ReferentToken(res, names[i0].begin_token, names[i0].end_token)
        if (kit is not None): kit.embed_token(rt)
        else: res.add_occurence(TextAnnotation(rt.begin_token, rt.end_token))
        end_token.value = rt.end_token
        name_rt = rt
        if (begin.begin_char == rt.begin_char): begin = (rt)
    # Embed every other occurrence of the title elsewhere in the text.
    if (term is not None and kit is not None):
        t = kit.first_token
        first_pass3397 = True
        while True:
            if first_pass3397: first_pass3397 = False
            else: t = t.next0_
            if (not (t is not None)): break
            tok = term.try_parse(t, TerminParseAttr.NO)
            if (tok is None): continue
            t0 = t
            t1 = tok.end_token
            if (t1.next0_ is not None and t1.next0_.is_char('.')): t1 = t1.next0_
            if (BracketHelper.can_be_start_of_sequence(t0.previous, False, False) and BracketHelper.can_be_end_of_sequence(t1.next0_, False, None, False)):
                t0 = t0.previous
                t1 = t1.next0_
            rt = ReferentToken(res, t0, t1)
            kit.embed_token(rt)
            t = (rt)
    # Walk the fragment collecting document type, speciality, persons with
    # their roles, organization, city and date.
    pr = PersonRelations()
    pers_typ = TitleItemToken.Types.UNDEFINED
    pers_types = pr.rel_types
    t = begin
    first_pass3398 = True
    while True:
        if first_pass3398: first_pass3398 = False
        else: t = t.next0_
        if (not (t is not None)): break
        if (max_char_pos > 0 and t.begin_char > max_char_pos): break
        if (t == name_rt): continue
        tpt = TitleItemToken.try_attach(t)
        if (tpt is not None):
            pers_typ = TitleItemToken.Types.UNDEFINED
            if (tpt.typ == TitleItemToken.Types.TYP):
                if (len(res.types) == 0): res._add_type(tpt.value)
                elif (len(res.types) == 1):
                    ty = res.types[0].upper()
                    if (ty == "РЕФЕРАТ"): res._add_type(tpt.value)
                    elif (ty == "АВТОРЕФЕРАТ"):
                        # Combine "автореферат" with the dissertation kind
                        # (Russian and Ukrainian variants).
                        if (tpt.value == "КАНДИДАТСКАЯ ДИССЕРТАЦИЯ"): res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат кандидатской диссертации", True, 0)
                        elif (tpt.value == "ДОКТОРСКАЯ ДИССЕРТАЦИЯ"): res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат докторской диссертации", True, 0)
                        elif (tpt.value == "МАГИСТЕРСКАЯ ДИССЕРТАЦИЯ"): res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат магистерской диссертации", True, 0)
                        elif (tpt.value == "КАНДИДАТСЬКА ДИСЕРТАЦІЯ"): res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат кандидатської дисертації", True, 0)
                        elif (tpt.value == "ДОКТОРСЬКА ДИСЕРТАЦІЯ"): res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат докторської дисертації", True, 0)
                        elif (tpt.value == "МАГІСТЕРСЬКА ДИСЕРТАЦІЯ"): res.add_slot(TitlePageReferent.ATTR_TYPE, "автореферат магістерської дисертації", True, 0)
                        else: res._add_type(tpt.value)
                    elif (tpt.value == "РЕФЕРАТ" or tpt.value == "АВТОРЕФЕРАТ"):
                        if (not tpt.value in ty): res._add_type(tpt.value)
            elif (tpt.typ == TitleItemToken.Types.SPECIALITY):
                if (res.speciality is None): res.speciality = tpt.value
            elif (tpt.typ in pers_types): pers_typ = tpt.typ
            t = tpt.end_token
            if (t.end_char > end_token.value.end_char): end_token.value = t
            if (t.next0_ is not None and t.next0_.is_char_of(":-")): t = t.next0_
            continue
        if (t.end_char > end_char): break
        rli = t.get_referents()
        if (rli is None): continue
        # Skip persons named after "имени ..." constructs.
        if (not t.is_newline_before and (isinstance(t.previous, TextToken))):
            s = t.previous.term
            if (s == "ИМЕНИ" or s == "ИМ"): continue
            if (s == "." and t.previous.previous is not None and t.previous.previous.is_value("ИМ", None)): continue
        for r in rli:
            if (isinstance(r, PersonReferent)):
                if (r != rli[0]): continue
                p = Utils.asObjectOrNull(r, PersonReferent)
                if (pers_typ != TitleItemToken.Types.UNDEFINED):
                    if (t.previous is not None and t.previous.is_char('.')): pers_typ = TitleItemToken.Types.UNDEFINED
                # Role from the person's own attributes wins; otherwise use
                # the role carried over from a preceding keyword, '©', or
                # context scanning forward/backward.
                typ = pr.calc_typ_from_attrs(p)
                if (typ != TitleItemToken.Types.UNDEFINED):
                    pr.add(p, typ, 1)
                    pers_typ = typ
                elif (pers_typ != TitleItemToken.Types.UNDEFINED): pr.add(p, pers_typ, 1)
                elif (t.previous is not None and t.previous.is_char('©')):
                    pers_typ = TitleItemToken.Types.WORKER
                    pr.add(p, pers_typ, 1)
                else:
                    tt = t.next0_
                    first_pass3399 = True
                    while True:
                        if first_pass3399: first_pass3399 = False
                        else: tt = tt.next0_
                        if (not (tt is not None)): break
                        rr = tt.get_referent()
                        if (rr == res):
                            pers_typ = TitleItemToken.Types.WORKER
                            break
                        if (isinstance(rr, PersonReferent)):
                            if (pr.calc_typ_from_attrs(Utils.asObjectOrNull(r, PersonReferent)) != TitleItemToken.Types.UNDEFINED): break
                            else: continue
                        if (rr is not None): break
                        tpt = TitleItemToken.try_attach(tt)
                        if (tpt is not None):
                            if (tpt.typ != TitleItemToken.Types.TYP and tpt.typ != TitleItemToken.Types.TYPANDTHEME): break
                            tt = tpt.end_token
                            if (tt.end_char > end_token.value.end_char): end_token.value = tt
                            continue
                    if (pers_typ == TitleItemToken.Types.UNDEFINED):
                        tt = t.previous
                        while tt is not None:
                            rr = tt.get_referent()
                            if (rr == res):
                                pers_typ = TitleItemToken.Types.WORKER
                                break
                            if (rr is not None): break
                            if ((tt.is_value("СТУДЕНТ", None) or tt.is_value("СТУДЕНТКА", None) or tt.is_value("СЛУШАТЕЛЬ", None)) or tt.is_value("ДИПЛОМНИК", None) or tt.is_value("ИСПОЛНИТЕЛЬ", None)):
                                pers_typ = TitleItemToken.Types.WORKER
                                break
                            tpt = TitleItemToken.try_attach(tt)
                            if (tpt is not None and tpt.typ != TitleItemToken.Types.TYP): break
                            tt = tt.previous
                    if (pers_typ != TitleItemToken.Types.UNDEFINED): pr.add(p, pers_typ, 1)
                    else: pr.add(p, pers_typ, 0.5)
                if (t.end_char > end_token.value.end_char): end_token.value = t
                continue
            if (r == rli[0]): pers_typ = TitleItemToken.Types.UNDEFINED
            if (isinstance(r, DateReferent)):
                if (res.date is None):
                    res.date = Utils.asObjectOrNull(r, DateReferent)
                    if (t.end_char > end_token.value.end_char): end_token.value = t
            elif (isinstance(r, GeoReferent)):
                if (res.city is None and r.is_city):
                    res.city = Utils.asObjectOrNull(r, GeoReferent)
                    if (t.end_char > end_token.value.end_char): end_token.value = t
            if (isinstance(r, OrganizationReferent)):
                org0_ = Utils.asObjectOrNull(r, OrganizationReferent)
                # "курс N" organization encodes the student year (1..7).
                if ("курс" in org0_.types and org0_.number is not None):
                    i = 0
                    wrapi2673 = RefOutArgWrapper(0)
                    inoutres2674 = Utils.tryParseInt(org0_.number, wrapi2673)
                    i = wrapi2673.value
                    if (inoutres2674):
                        if (i > 0 and (i < 8)): res.student_year = i
                # Climb out of departments to the owning organization.
                while org0_.higher is not None:
                    if (org0_.kind != OrganizationKind.DEPARTMENT): break
                    org0_ = org0_.higher
                if (org0_.kind != OrganizationKind.DEPARTMENT):
                    if (res.org0_ is None): res.org0_ = org0_
                    elif (OrganizationReferent.can_be_higher(res.org0_, org0_)): res.org0_ = org0_
                if (t.end_char > end_token.value.end_char): end_token.value = t
            if ((isinstance(r, UriReferent)) or (isinstance(r, GeoReferent))):
                if (t.end_char > end_token.value.end_char): end_token.value = t
    # Transfer collected person roles onto the referent's slots.
    for ty in pers_types:
        for p in pr.get_persons(ty):
            if (pr.get_attr_name_for_type(ty) is not None):
                res.add_slot(pr.get_attr_name_for_type(ty), p, False, 0)
    if (res.get_slot_value(TitlePageReferent.ATTR_AUTHOR) is None):
        for p in pr.get_persons(TitleItemToken.Types.UNDEFINED):
            res.add_slot(TitlePageReferent.ATTR_AUTHOR, p, False, 0)
            break
    # Fallback city: take it from the organization's GEO slot.
    if (res.city is None and res.org0_ is not None):
        s = res.org0_.find_slot(OrganizationReferent.ATTR_GEO, None, True)
        if (s is not None and (isinstance(s.value, GeoReferent))):
            if (s.value.is_city): res.city = Utils.asObjectOrNull(s.value, GeoReferent)
    # Fallback date: look for "<city>: <date>"-like patterns.
    if (res.date is None):
        t = begin
        first_pass3400 = True
        while True:
            if first_pass3400: first_pass3400 = False
            else: t = t.next0_
            if (not (t is not None and t.end_char <= end_char)): break
            city = Utils.asObjectOrNull(t.get_referent(), GeoReferent)
            if (city is None): continue
            if (isinstance(t.next0_, TextToken)):
                if (t.next0_.is_char_of(":,") or t.next0_.is_hiphen): t = t.next0_
            rt = t.kit.process_referent(DateAnalyzer.ANALYZER_NAME, t.next0_)
            if (rt is not None):
                rt.save_to_local_ontology()
                res.date = Utils.asObjectOrNull(rt.referent, DateReferent)
                if (kit is not None): kit.embed_token(rt)
                break
    if (len(res.slots) == 0): return None
    else: return res
def try_parse(t: 'Token', items: typing.List['NounPhraseItem'], attrs: 'NounPhraseParseAttr') -> 'NounPhraseItem':
    """Try to parse one noun-phrase item starting at token *t*.

    Builds a NounPhraseItem whose ``adj_morph`` / ``noun_morph`` lists hold
    the morphological readings of the token that can act as an adjective
    (modifier) or as the head noun, accorded with the *items* already
    collected for the phrase.  Returns ``None`` when the token cannot take
    part in a noun phrase under the given *attrs* flags.

    NOTE(review): the quoted dictionary attribute markers are presumed to
    mean: "к.ф." short form, "собир." collective, "сравн." comparative,
    "неизм." indeclinable, "дейст.з." active voice, "в.ср.ст." comparative
    degree — confirm against the morphology service documentation.
    """
    if (t is None):
        return None
    t0 = t
    _can_be_surname = False
    _is_doubt_adj = False
    rt = Utils.asObjectOrNull(t, ReferentToken)
    # Referent wrapping exactly one text token: parse the inner token,
    # then re-point the resulting item at the wrapper token.
    if (rt is not None and rt.begin_token == rt.end_token and (isinstance(rt.begin_token, TextToken))):
        res = NounPhraseItem.try_parse(rt.begin_token, items, attrs)
        if (res is not None):
            res.begin_token = res.end_token = t
            res.can_be_noun = True
            return res
    # Any other referent token can only be the noun itself: one noun
    # variant per morphology item, normal value = referent's string form.
    if (rt is not None):
        res = NounPhraseItem(t, t)
        for m in t.morph.items:
            v = NounPhraseItemTextVar(m, None)
            v.normal_value = str(t.get_referent())
            res.noun_morph.append(v)
        res.can_be_noun = True
        return res
    if (isinstance(t, NumberToken)):
        pass
    has_legal_verb = False
    # ---- Pre-filters on a plain text token: reject tokens that are ----
    # ---- clearly verbs/adverbs, and note surname/name ambiguities. ----
    if (isinstance(t, TextToken)):
        if (not t.chars.is_letter):
            return None
        str0_ = t.term
        # Words ending in 'А'/'О' may be dictionary verbs or adverbs
        # masquerading as nominals — filter those out (with whitelisted
        # exceptions), but remember a legal verb reading was seen.
        if (str0_[len(str0_) - 1] == 'А' or str0_[len(str0_) - 1] == 'О'):
            for wf in t.morph.items:
                if ((isinstance(wf, MorphWordForm)) and wf.is_in_dictionary):
                    if (wf.class0_.is_verb):
                        mc = t.get_morph_class_in_dictionary()
                        if (not mc.is_noun and (((attrs) & (NounPhraseParseAttr.IGNOREPARTICIPLES))) == (NounPhraseParseAttr.NO)):
                            # participle-like endings "ОГО"/"ЕГО" survive
                            if (not LanguageHelper.ends_with_ex(str0_, "ОГО", "ЕГО", None, None)):
                                return None
                        has_legal_verb = True
                    if (wf.class0_.is_adverb):
                        if (t.next0_ is None or not t.next0_.is_hiphen):
                            # whitelisted adverb homonyms that still occur in noun phrases
                            if ((str0_ == "ВСЕГО" or str0_ == "ДОМА" or str0_ == "НЕСКОЛЬКО") or str0_ == "МНОГО" or str0_ == "ПОРЯДКА"):
                                pass
                            else:
                                return None
                    if (wf.class0_.is_adjective):
                        if (wf.contains_attr("к.ф.", None)):
                            # short-form adjective: doubtful unless the
                            # dictionary class is exactly ADJECTIVE
                            if (t.get_morph_class_in_dictionary() == MorphClass.ADJECTIVE):
                                pass
                            else:
                                _is_doubt_adj = True
        mc0 = t.morph.class0_
        # Capitalized proper-surname readings: "-ИН/-ЕН/-ЫН" in the
        # dictionary kills the parse; out-of-dictionary or "-ОВ" just
        # marks the item as a possible surname.
        if (mc0.is_proper_surname and not t.chars.is_all_lower):
            for wf in t.morph.items:
                if (wf.class0_.is_proper_surname and wf.number != MorphNumber.PLURAL):
                    wff = Utils.asObjectOrNull(wf, MorphWordForm)
                    if (wff is None):
                        continue
                    s = Utils.ifNotNull((Utils.ifNotNull(wff.normal_full, wff.normal_case)), "")
                    if (LanguageHelper.ends_with_ex(s, "ИН", "ЕН", "ЫН", None)):
                        if (not wff.is_in_dictionary):
                            _can_be_surname = True
                        else:
                            return None
                    if (wff.is_in_dictionary and LanguageHelper.ends_with(s, "ОВ")):
                        _can_be_surname = True
        # Capitalized proper-name readings are rejected unless explicitly
        # allowed (indeclinable adjective, REFERENTCANBENOUN flag, or a
        # standard-adjective modifier already collected).
        if (mc0.is_proper_name and not t.chars.is_all_lower):
            for wff in t.morph.items:
                wf = Utils.asObjectOrNull(wff, MorphWordForm)
                if (wf is None):
                    continue
                if (wf.normal_case == "ГОР"):
                    continue
                if (wf.class0_.is_proper_name and wf.is_in_dictionary):
                    if (wf.normal_case is None or not wf.normal_case.startswith("ЛЮБ")):
                        if (mc0.is_adjective and t.morph.contains_attr("неизм.", None)):
                            pass
                        elif ((((attrs) & (NounPhraseParseAttr.REFERENTCANBENOUN))) == (NounPhraseParseAttr.REFERENTCANBENOUN)):
                            pass
                        else:
                            if (items is None or (len(items) < 1)):
                                return None
                            if (not items[0].is_std_adjective):
                                return None
        # Sole reading is a comparative-degree adjective — not a noun phrase.
        if (mc0.is_adjective and t.morph.items_count == 1):
            if (t.morph.get_indexer_item(0).contains_attr("в.ср.ст.", None)):
                return None
        mc1 = t.get_morph_class_in_dictionary()
        if (mc1 == MorphClass.VERB and t.morph.case_.is_undefined):
            return None
        # IGNOREPARTICIPLES: drop pure-verb readings in active voice,
        # except reflexive forms ending in "СЯ".
        if (((((attrs) & (NounPhraseParseAttr.IGNOREPARTICIPLES))) == (NounPhraseParseAttr.IGNOREPARTICIPLES) and t.morph.class0_.is_verb and not t.morph.class0_.is_noun) and not t.morph.class0_.is_proper):
            for wf in t.morph.items:
                if (wf.class0_.is_verb):
                    if (wf.contains_attr("дейст.з.", None)):
                        if (LanguageHelper.ends_with(t.term, "СЯ")):
                            pass
                        else:
                            return None
    t1 = None
    # Two passes: k == 0 parses the token as-is (possibly absorbing a
    # hyphenated continuation), k == 1 retries for the "БИЗНЕС-..." style
    # compound discovered at the end of pass 0.
    for k in range(2):
        t = (Utils.ifNotNull(t1, t0))
        if (k == 0):
            # Absorb "X-Y" hyphenated pair into one item when spacing and
            # morphology allow it.
            if (((isinstance(t0, TextToken)) and t0.next0_ is not None and t0.next0_.is_hiphen) and t0.next0_.next0_ is not None):
                if (not t0.is_whitespace_after and not t0.morph.class0_.is_pronoun and not (isinstance(t0.next0_.next0_, NumberToken))):
                    if (not t0.next0_.is_whitespace_after):
                        t = t0.next0_.next0_
                    elif (t0.next0_.next0_.chars.is_all_lower and LanguageHelper.ends_with(t0.term, "О")):
                        t = t0.next0_.next0_
        it = NounPhraseItem._new404(t0, t, _can_be_surname)
        if (t0 == t and (isinstance(t0, ReferentToken))):
            it.can_be_noun = True
            it.morph = MorphCollection(t0.morph)
        can_be_prepos = False
        # ---- Classify every morphology variant of the (last) token ----
        for v in t.morph.items:
            wf = Utils.asObjectOrNull(v, MorphWordForm)
            # Verb form with a defined case — treat as participle/adjective.
            if (v.class0_.is_verb and not v.case_.is_undefined):
                it.can_be_adj = True
                it.adj_morph.append(NounPhraseItemTextVar(v, t))
                continue
            if (v.class0_.is_preposition):
                can_be_prepos = True
            # Adjective-like readings (incl. non-personal pronouns and
            # nouns spelled as numbers).
            if (v.class0_.is_adjective or ((v.class0_.is_pronoun and not v.class0_.is_personal_pronoun and not v.contains_attr("неизм.", None))) or ((v.class0_.is_noun and (isinstance(t, NumberToken))))):
                if (NounPhraseItem.try_accord_variant(items, (0 if items is None else len(items)), v, False)):
                    is_doub = False  # NOTE(review): set but never read
                    if (v.contains_attr("к.ф.", None)):
                        continue
                    if (v.contains_attr("собир.", None) and not (isinstance(t, NumberToken))):
                        if (wf is not None and wf.is_in_dictionary):
                            return None
                        continue
                    if (v.contains_attr("сравн.", None)):
                        continue
                    ok = True
                    if (isinstance(t, TextToken)):
                        s = t.term
                        if (s == "ПРАВО" or s == "ПРАВА"):
                            ok = False
                        elif (LanguageHelper.ends_with(s, "ОВ") and t.get_morph_class_in_dictionary().is_noun):
                            ok = False
                    elif (isinstance(t, NumberToken)):
                        if (v.class0_.is_noun and t.morph.class0_.is_adjective):
                            ok = False
                        elif (t.morph.class0_.is_noun and (((attrs) & (NounPhraseParseAttr.PARSENUMERICASADJECTIVE))) == (NounPhraseParseAttr.NO)):
                            ok = False
                    if (ok):
                        it.adj_morph.append(NounPhraseItemTextVar(v, t))
                        it.can_be_adj = True
                        if (_is_doubt_adj and t0 == t):
                            it.is_doubt_adjective = True
                        if (has_legal_verb and wf is not None and wf.is_in_dictionary):
                            it.can_be_noun = True
                        if (wf is not None and wf.class0_.is_pronoun):
                            it.can_be_noun = True
                            it.noun_morph.append(NounPhraseItemTextVar(v, t))
            # ---- Decide whether this variant can be the head noun ----
            can_be_noun_ = False
            if (isinstance(t, NumberToken)):
                pass
            elif (v.class0_.is_noun or ((wf is not None and wf.normal_case == "САМ"))):
                can_be_noun_ = True
            elif (v.class0_.is_personal_pronoun):
                if (items is None or len(items) == 0):
                    can_be_noun_ = True
                else:
                    for it1 in items:
                        if (it1.is_verb):
                            # pronoun after a single verb item is a
                            # plausible object unless nominative
                            if (len(items) == 1 and not v.case_.is_nominative):
                                can_be_noun_ = True
                            else:
                                return None
                    if (len(items) == 1):
                        if (items[0].can_be_adj_for_personal_pronoun):
                            can_be_noun_ = True
            # Demonstrative/relative pronouns acting as a noun head.
            elif ((v.class0_.is_pronoun and ((items is None or len(items) == 0 or ((len(items) == 1 and items[0].can_be_adj_for_personal_pronoun)))) and wf is not None) and (((((wf.normal_case == "ТОТ" or wf.normal_full == "ТО" or wf.normal_case == "ТО") or wf.normal_case == "ЭТО" or wf.normal_case == "ВСЕ") or wf.normal_case == "ЧТО" or wf.normal_case == "КТО") or wf.normal_full == "КОТОРЫЙ" or wf.normal_case == "КОТОРЫЙ")):
                if (wf.normal_case == "ВСЕ"):
                    # "ВСЕ РАВНО" is an idiom, not a noun phrase
                    if (t.next0_ is not None and t.next0_.is_value("РАВНО", None)):
                        return None
                can_be_noun_ = True
            elif (wf is not None and ((Utils.ifNotNull(wf.normal_full, wf.normal_case))) == "КОТОРЫЙ" and (((attrs) & (NounPhraseParseAttr.PARSEPRONOUNS))) == (NounPhraseParseAttr.NO)):
                return None
            elif (v.class0_.is_proper and (isinstance(t, TextToken))):
                if (t.length_char > 4 or v.class0_.is_proper_name):
                    can_be_noun_ = True
            if (can_be_noun_):
                added = False
                # MULTINOUNS: if every previous item is conjunction-joined,
                # try a plural-accorded multi-noun reading first.
                if (items is not None and len(items) > 1 and (((attrs) & (NounPhraseParseAttr.MULTINOUNS))) != (NounPhraseParseAttr.NO)):
                    ok1 = True
                    ii = 1
                    while ii < len(items):
                        if (not items[ii].conj_before):
                            ok1 = False
                            break
                        ii += 1
                    if (ok1):
                        if (NounPhraseItem.try_accord_variant(items, (0 if items is None else len(items)), v, True)):
                            it.noun_morph.append(NounPhraseItemTextVar(v, t))
                            it.can_be_noun = True
                            it.multi_nouns = True
                            added = True
                if (not added):
                    if (NounPhraseItem.try_accord_variant(items, (0 if items is None else len(items)), v, False)):
                        it.noun_morph.append(NounPhraseItemTextVar(v, t))
                        it.can_be_noun = True
                        # Indeclinable personal pronoun: also usable as an
                        # adjective covering all cases/numbers.
                        if (v.class0_.is_personal_pronoun and t.morph.contains_attr("неизм.", None) and not it.can_be_adj):
                            itt = NounPhraseItemTextVar(v, t)
                            itt.case_ = MorphCase.ALL_CASES
                            itt.number = MorphNumber.UNDEFINED
                            if (itt.normal_value is None):
                                pass
                            it.adj_morph.append(itt)
                            it.can_be_adj = True
                    # Plural-adjective context: accept the noun if a comma-
                    # joined follow-up noun phrase shares a compatible case.
                    elif ((len(items) > 0 and len(items[0].adj_morph) > 0 and items[0].adj_morph[0].number == MorphNumber.PLURAL) and not ((items[0].adj_morph[0].case_) & v.case_).is_undefined and not items[0].adj_morph[0].class0_.is_verb):
                        if (t.next0_ is not None and t.next0_.is_comma_and and (isinstance(t.next0_.next0_, TextToken))):
                            npt2 = NounPhraseHelper.try_parse(t.next0_.next0_, attrs, 0, None)
                            if (npt2 is not None and npt2.preposition is None and not ((npt2.morph.case_) & v.case_ & items[0].adj_morph[0].case_).is_undefined):
                                it.noun_morph.append(NounPhraseItemTextVar(v, t))
                                it.can_be_noun = True
        # Hyphenated pair was absorbed: fix variant prefixes from t0.
        if (t0 != t):
            for v in it.adj_morph:
                v.correct_prefix(Utils.asObjectOrNull(t0, TextToken), False)
            for v in it.noun_morph:
                v.correct_prefix(Utils.asObjectOrNull(t0, TextToken), True)
        # Second pass for a compound ("БИЗНЕС-..."): extend the item and
        # glue the tail onto each noun normal value.
        if (k == 1 and it.can_be_noun and not it.can_be_adj):
            if (t1 is not None):
                it.end_token = t1
            else:
                it.end_token = t0.next0_.next0_
            for v in it.noun_morph:
                if (v.normal_value is not None and (v.normal_value.find('-') < 0)):
                    v.normal_value = "{0}-{1}".format(v.normal_value, it.end_token.get_normal_case_text(None, MorphNumber.UNDEFINED, MorphGender.UNDEFINED, False))
        if (it.can_be_adj):
            if (NounPhraseItem.__m_std_adjectives.try_parse(it.begin_token, TerminParseAttr.NO) is not None):
                it.is_std_adjective = True
        # Preposition homonym: reject if a longer prepositional phrase
        # parse would win from here.
        if (can_be_prepos and it.can_be_noun):
            if (items is not None and len(items) > 0):
                npt1 = NounPhraseHelper.try_parse(t, Utils.valToEnum((NounPhraseParseAttr.PARSEPREPOSITION) | (NounPhraseParseAttr.PARSEPRONOUNS) | (NounPhraseParseAttr.PARSEVERBS), NounPhraseParseAttr), 0, None)
                if (npt1 is not None and npt1.end_char > t.end_char):
                    return None
            else:
                npt1 = NounPhraseHelper.try_parse(t.next0_, Utils.valToEnum((NounPhraseParseAttr.PARSEPRONOUNS) | (NounPhraseParseAttr.PARSEVERBS), NounPhraseParseAttr), 0, None)
                if (npt1 is not None):
                    mc = LanguageHelper.get_case_after_preposition(t.lemma)
                    if (not ((mc) & npt1.morph.case_).is_undefined):
                        return None
        if (it.can_be_noun or it.can_be_adj or k == 1):
            # Pronoun head: absorb enclitic particles ("ЖЕ", "БЫ", "ЛИ",
            # "Ж") and "-НИБУДЬ"/"-ЛИБО"/"-ТО" suffixes.
            if (it.begin_token.morph.class0_.is_pronoun):
                tt2 = it.end_token.next0_
                if ((tt2 is not None and tt2.is_hiphen and not tt2.is_whitespace_after) and not tt2.is_whitespace_before):
                    tt2 = tt2.next0_
                if (isinstance(tt2, TextToken)):
                    ss = tt2.term
                    if ((ss == "ЖЕ" or ss == "БЫ" or ss == "ЛИ") or ss == "Ж"):
                        it.end_token = tt2
                    elif (ss == "НИБУДЬ" or ss == "ЛИБО" or (((ss == "ТО" and tt2.previous.is_hiphen)) and it.can_be_adj)):
                        it.end_token = tt2
                        for m in it.adj_morph:
                            m.normal_value = "{0}-{1}".format(m.normal_value, ss)
                            if (m.single_number_value is not None):
                                m.single_number_value = "{0}-{1}".format(m.single_number_value, ss)
            return it
        if (t0 == t):
            # "БИЗНЕС" + same-case next token: retry as a compound.
            if (t0.is_value("БИЗНЕС", None) and t0.next0_ is not None and t0.next0_.chars == t0.chars):
                t1 = t0.next0_
                continue
            return it
    return None