def _DelSurnameEnd(s : str) -> str:
    """Cut a trailing ending off a surname string.

    Strings shorter than three characters are returned untouched. Otherwise
    the first matching rule wins:

    * ends with "А", "У" or "Е" -> the last character is dropped;
    * ends with "ОМ" or "ЫМ" -> the last two characters are dropped;
    * ends with "Я" or "Ю" and the preceding letter is "Н" or "Л" ->
      the last character is replaced with "Ь".

    Args:
        s(str): the surname to shorten (the rules use upper-case Cyrillic
            literals).

    Returns:
        str: the shortened string, or ``s`` itself when no rule applies.
    """
    size = len(s)
    if size < 3:
        return s
    if LanguageHelper.endsWithEx(s, "А", "У", "Е", None):
        return s[:size - 1]
    for ending in ("ОМ", "ЫМ"):
        if LanguageHelper.endsWith(s, ending):
            return s[:size - 2]
    if LanguageHelper.endsWithEx(s, "Я", "Ю", None, None):
        # Soft stems: "...НЯ"/"...ЛЮ" etc. get a soft sign instead of the vowel.
        if s[size - 2] in ('Н', 'Л'):
            return s[:size - 1] + "Ь"
    return s
def canHasRef(self, r: 'Referent') -> bool:
    """Check whether the given referent may be used as this object's ATTR_REF.

    The decision is driven by ``self.name`` and by the kind of ``r``:
    a GeoReferent is accepted only for a few position names, an
    "ORGANIZATION" referent is accepted unless one of the exclusions below
    fires, and everything else is rejected.

    Args:
        r(Referent): the candidate referent (may be None).

    Returns:
        bool: True when the reference is allowed.
    """
    title = self.name
    if title is None or r is None:
        return False

    if isinstance(r, GeoReferent):
        geo = Utils.asObjectOrNull(r, GeoReferent)
        if LanguageHelper.endsWithEx(title, "президент", "губернатор", None, None):
            return geo.is_state or geo.is_region
        if title in ("мэр", "градоначальник"):
            return geo.is_city
        return title == "глава"

    if r.type_name != "ORGANIZATION":
        return False

    # Territorial positions never refer to an organization.
    if (LanguageHelper.endsWith(title, "губернатор")
            or title in ("мэр", "градоначальник", "президент")):
        return False
    # A minister needs an organization that has a "министерство" slot.
    if "министр" in title and r.findSlot(None, "министерство", True) is None:
        return False
    # A director is rejected for an organization that has a "суд" slot.
    if title.endswith("директор") and r.findSlot(None, "суд", True) is not None:
        return False
    return True
def lemma(self) -> str:
    """Return the lemma (one variant of morphological normalization).

    A value already stored in ``self.__m_lemma`` is returned as is.
    Otherwise the lemma is derived from ``self.word_forms``: the only form if
    there is exactly one, else (for tokens that are not all lower-case) a
    surname form ending in "ОВ"/"ЕВ" or a dictionary proper name, else the
    form preferred by ``self.__compareForms``. A few endings of the result
    are then adjusted. When nothing was derived, ``self.term`` (or "?") is
    returned.

    Returns:
        str: the lemma text.
    """
    cached = self.__m_lemma
    if cached is not None:
        return cached

    forms = self.word_forms
    value = None
    if forms is not None and len(forms) > 0:
        if len(forms) == 1:
            only = forms[0]
            value = Utils.ifNotNull(only.normal_full, only.normal_case)
        if value is None and not self.char_info.is_all_lower:
            for form in forms:
                if form.class0_.is_proper_surname:
                    text = Utils.ifNotNull(form.normal_full, Utils.ifNotNull(form.normal_case, ""))
                    if LanguageHelper.endsWithEx(text, "ОВ", "ЕВ", None, None):
                        value = text
                        break
                elif form.class0_.is_proper_name and form.is_in_dictionary:
                    return form.normal_case
        if value is None:
            best = None
            for form in forms:
                if best is None or self.__compareForms(best, form) > 0:
                    best = form
            value = Utils.ifNotNull(best.normal_full, best.normal_case)

    if value is None:
        return Utils.ifNotNull(self.term, "?")

    if LanguageHelper.endsWithEx(value, "АНЫЙ", "ЕНЫЙ", None, None):
        # "...АНЫЙ" -> "...АННЫЙ"
        return value[0:len(value) - 3] + "ННЫЙ"
    if LanguageHelper.endsWith(value, "ЙСЯ"):
        return value[0:len(value) - 2]
    if LanguageHelper.endsWith(value, "АНИЙ") and value == self.term:
        # Keep the text when any form is a dictionary word, else "...АНИЙ" -> "...АНИЕ".
        for form in self.word_forms:
            if form.is_in_dictionary:
                return value
        return value[0:len(value) - 1] + "Е"
    return value
def process(self, word : str) -> typing.List['MorphWordForm']:
    """Analyse one word and return its candidate morphological forms.

    The word is first walked through the prefix tree rooted at
    ``self.m_root``; every rule met on the way may contribute forms. If the
    collected forms do not look conclusive, extra candidates are taken from
    the reversed tree rooted at ``self.m_root_reverce``. The result is then
    ordered with ``MorphEngine.__sort`` and normalised.

    Args:
        word(str): the word to analyse; per the original documentation it
            must already be in upper case.

    Returns:
        typing.List[MorphWordForm]: the word forms, or None when the word is
        empty, when a word longer than one character contains no vowel, or
        when no form was found.
    """
    if (Utils.isNullOrEmpty(word)):
        return None
    res = None
    if (len(word) > 1):
        # A multi-character word without any vowel is not analysed at all.
        i = 0
        while i < len(word):
            ch = word[i]
            if (LanguageHelper.isCyrillicVowel(ch) or LanguageHelper.isLatinVowel(ch)):
                break
            i += 1
        if (i >= len(word)):
            return res

    # Forward pass: descend the prefix tree one character at a time; at
    # position i the rules are looked up by the remaining tail word[i:].
    tn = self.m_root
    i = 0
    while i <= len(word):
        if (tn.lazy_pos > 0):
            self.__loadTreeNode(tn)
        if (tn.rules is not None):
            word_begin = None
            word_end = None
            if (i == 0):
                word_end = word
            elif (i < len(word)):
                word_end = word[i:]
            else:
                word_end = ""
            if (res is None):
                res = list()
            for r in tn.rules:
                wrapmvs14 = RefOutArgWrapper(None)
                inoutres15 = Utils.tryGetValue(r.variants, word_end, wrapmvs14)
                mvs = wrapmvs14.value
                if (inoutres15):
                    if (word_begin is None):
                        # Computed lazily, only once a rule actually matches.
                        if (i == len(word)):
                            word_begin = word
                        elif (i > 0):
                            word_begin = word[0:0+i]
                        else:
                            word_begin = ""
                    r.processResult(res, word_begin, mvs)
        if (tn.nodes is None or i >= len(word)):
            break
        ch = ord(word[i])
        wraptn16 = RefOutArgWrapper(None)
        inoutres17 = Utils.tryGetValue(tn.nodes, ch, wraptn16)
        tn = wraptn16.value
        if (not inoutres17):
            break
        i += 1

    # Decide whether the forms found so far are conclusive enough to skip
    # the reverse-tree lookup.
    need_test_unknown_vars = True
    if (res is not None):
        for r in res:
            if ((r.class0_.is_pronoun or r.class0_.is_noun or r.class0_.is_adjective) or (r.class0_.is_misc and r.class0_.is_conjunction) or r.class0_.is_preposition):
                need_test_unknown_vars = False
            elif (r.class0_.is_adverb and r.normal_case is not None):
                if (not LanguageHelper.endsWithEx(r.normal_case, "О", "А", None, None)):
                    need_test_unknown_vars = False
                elif (r.normal_case == "МНОГО"):
                    need_test_unknown_vars = False
            elif (r.class0_.is_verb and len(res) > 1):
                ok = False
                for rr in res:
                    if (rr != r and rr.class0_ != r.class0_):
                        ok = True
                        break
                if (ok and not LanguageHelper.endsWith(word, "ИМ")):
                    need_test_unknown_vars = False
    if (need_test_unknown_vars and LanguageHelper.isCyrillicChar(word[0])):
        # Require at least two vowels and two non-vowels for a guess.
        gl = 0
        sog = 0
        j = 0
        while j < len(word):
            if (LanguageHelper.isCyrillicVowel(word[j])):
                gl += 1
            else:
                sog += 1
            j += 1
        if ((gl < 2) or (sog < 2)):
            need_test_unknown_vars = False
    if (need_test_unknown_vars and res is not None and len(res) == 1):
        if (res[0].class0_.is_verb):
            if ("н.вр." in res[0].misc.attrs and "нес.в." in res[0].misc.attrs and "страд.з." not in res[0].misc.attrs):
                need_test_unknown_vars = False
            elif ("б.вр." in res[0].misc.attrs and "сов.в." in res[0].misc.attrs):
                need_test_unknown_vars = False
            elif (res[0].normal_case is not None and LanguageHelper.endsWith(res[0].normal_case, "СЯ")):
                need_test_unknown_vars = False
        if (res[0].class0_.is_undefined and "прдктв." in res[0].misc.attrs):
            need_test_unknown_vars = False

    if (need_test_unknown_vars):
        if (self.m_root_reverce is None):
            # No reversed tree: the forward result is returned as it is
            # (not sorted or normalised below).
            return res
        # Reverse pass: descend the reversed tree from the last character
        # until a node that carries reverse variants is reached.
        tn = self.m_root_reverce
        tn0 = None
        for i in range(len(word) - 1, -1, -1):
            if (tn.lazy_pos > 0):
                self.__loadTreeNode(tn)
            ch = ord(word[i])
            if (tn.nodes is None):
                break
            wrapnext18 = RefOutArgWrapper(None)
            inoutres19 = Utils.tryGetValue(tn.nodes, ch, wrapnext18)
            next0_ = wrapnext18.value
            if (not inoutres19):
                break
            tn = next0_
            if (tn.lazy_pos > 0):
                self.__loadTreeNode(tn)
            if (tn.reverce_variants is not None):
                tn0 = tn
                break
        else:
            # Loop ran to completion: i is read below, so mirror the value
            # a C-style "for (...; i >= 0; i--)" would leave behind.
            i = -1
        if (tn0 is not None):
            # The unmatched head word[0..i] must contain a vowel, unless the
            # match started within the first four characters.
            glas = i < 4
            while i >= 0:
                if (LanguageHelper.isCyrillicVowel(word[i]) or LanguageHelper.isLatinVowel(word[i])):
                    glas = True
                    break
                i -= 1
            if (glas):
                for mv in tn0.reverce_variants:
                    if (((not mv.class0_.is_verb and not mv.class0_.is_adjective and not mv.class0_.is_noun) and not mv.class0_.is_proper_surname and not mv.class0_.is_proper_geo) and not mv.class0_.is_proper_secname):
                        continue
                    ok = False
                    # res may still be None here (it is checked for None just
                    # below), so never iterate it directly.
                    for rr in (res or ()):
                        if (rr.is_in_dictionary):
                            if (rr.class0_ == mv.class0_ or rr.class0_.is_noun):
                                ok = True
                                break
                            if (not mv.class0_.is_adjective and rr.class0_.is_verb):
                                ok = True
                                break
                    if (ok):
                        continue
                    if (len(mv.tail) > 0 and not LanguageHelper.endsWith(word, mv.tail)):
                        continue
                    r = MorphWordForm(mv, word)
                    if (not MorphWordForm._hasMorphEquals(res, r)):
                        r.undef_coef = mv.coef
                        if (res is None):
                            res = list()
                        res.append(r)

    if (word == "ПРИ" and res is not None):
        # Drop geographic readings of "ПРИ"; iterate backwards because items
        # are deleted by index.
        for i in range(len(res) - 1, -1, -1):
            if (res[i].class0_.is_proper_geo):
                del res[i]
    if (res is None or len(res) == 0):
        return None
    MorphEngine.__sort(res, word)
    for v in res:
        if (v.normal_case is None):
            v.normal_case = word
        if (v.class0_.is_verb):
            if (v.normal_full is None and LanguageHelper.endsWith(v.normal_case, "ТЬСЯ")):
                v.normal_full = v.normal_case[0:0+len(v.normal_case) - 2]
        v.language = self.language
        if (v.class0_.is_preposition):
            v.normal_case = LanguageHelper.normalizePreposition(v.normal_case)

    # Remove guessed short-form / invariable adjectives when other forms
    # exist, and collect the classes of the dictionary forms into mc.
    mc = MorphClass()
    for i in range(len(res) - 1, -1, -1):
        if (not res[i].is_in_dictionary and res[i].class0_.is_adjective and len(res) > 1):
            if ("к.ф." in res[i].misc.attrs or "неизм." in res[i].misc.attrs):
                del res[i]
                continue
        if (res[i].is_in_dictionary):
            mc.value |= res[i].class0_.value
    if (mc == MorphClass.VERB and len(res) > 1):
        for r in res:
            if (r.undef_coef > (100) and r.class0_ == MorphClass.ADJECTIVE):
                r.undef_coef = (0)
    if (len(res) == 0):
        return None
    return res
# run: morphological analysis of a text. English rendering of the docstring
# kept below: "Perform morphological analysis of the text. Args: text - the
# source text; lang - the language (if null, detection is attempted).
# Returns: the resulting sequence of morphemes."
# NOTE(review): the docstring documents `lang`, but the parameter is named
# `dlang`; `only_tokenizing`, `progress` and `good_text` are not documented
# there.
# NOTE(review): this definition is stored with its indentation collapsed onto
# a few physical lines, so block nesting cannot be read from the layout; the
# code text is left untouched.
#
# Steps, in source order:
#   1. Returns None for a null/empty text; otherwise wraps it in TextWrapper.
#   2. Tokenizing loop: characters whose _getCharTyp() is 0 are skipped, a
#      type above 2 yields a one-character token, other types are extended
#      while the following characters have the same type. Each token becomes
#      a MorphToken (term, begin_char, end_char); for type 1 tokens, unless
#      only_tokenizing is set, `tag` holds a UniLexWrap shared per distinct
#      term through the uni_lex dict. The pure_*/tot_* counters are updated
#      from the result of __detectLang().
#   3. def_lang starts as MorphLang(dlang) and may be replaced using those
#      counters; in one branch the uni_lex words are analysed with
#      __processOneWord (stopping once 100 Cyrillic ones were counted) to
#      make the decision.
#   4. Every uni_lex entry is analysed with __processOneWord; when `progress`
#      is given, __onProgress is called.
#   5. Tokens receive word_forms (one shared empty list when there are none)
#      and, where a wrapper exists, its language.
#   6. Only when good_text is false: neighbouring tokens are merged back
#      (quote character standing for "Ъ", apostrophe inside a word, digit
#      3/4 standing for "З"/"Ч", hyphenated words, words split by a line
#      break) when the merged word passes the checks shown in the code.
#   7. CharsInfo flags are filled per token (letter / Latin / Cyrillic, and,
#      unless good_text, the upper/lower-case flags).
#   8. Returns early when good_text or only_tokenizing is set; otherwise
#      handles single Latin "C"/"A"/"P" tokens adjacent to Cyrillic letters,
#      adds surname variants through M_ENGINE_RU.processSurnameVariants,
#      fills missing normal_case from the term, merges letter + quote + word
#      sequences and merges pairs of adjacent hyphen tokens.
# NOTE(review): `i == 733860` guards only `pass`, and debug_token is assigned
# but never read in this function - both look like debugging leftovers.
def run(self, text: str, only_tokenizing: bool, dlang: 'MorphLang', progress: EventHandler, good_text: bool) -> typing.List['MorphToken']: """ Произвести морфологический анализ текста Args: text(str): исходный текст lang: язык (если null, то попробует определить) Returns: typing.List[MorphToken]: последовательность результирующих морфем """ if (Utils.isNullOrEmpty(text)): return None twr = TextWrapper(text, good_text) twrch = twr.chars res = list() uni_lex = dict() term0 = None pure_rus_words = 0 pure_ukr_words = 0 pure_by_words = 0 pure_kz_words = 0 tot_rus_words = 0 tot_ukr_words = 0 tot_by_words = 0 tot_kz_words = 0 i = 0 first_pass2708 = True while True: if first_pass2708: first_pass2708 = False else: i += 1 if (not (i < twr.length)): break ty = InnerMorphology._getCharTyp(twrch[i]) if (ty == 0): continue if (ty > 2): j = (i + 1) else: j = (i + 1) while j < twr.length: if (InnerMorphology._getCharTyp(twrch[j]) != ty): break j += 1 wstr = text[i:i + j - i] term = None if (good_text): term = wstr else: trstr = LanguageHelper.transliteralCorrection( wstr, term0, False) term = LanguageHelper.correctWord(trstr) if (Utils.isNullOrEmpty(term)): i = (j - 1) continue lang = InnerMorphology.__detectLang(twr, i, j - 1, term) if (lang == MorphLang.UA): pure_ukr_words += 1 elif (lang == MorphLang.RU): pure_rus_words += 1 elif (lang == MorphLang.BY): pure_by_words += 1 elif (lang == MorphLang.KZ): pure_kz_words += 1 if ((((lang) & MorphLang.RU)) != MorphLang.UNKNOWN): tot_rus_words += 1 if ((((lang) & MorphLang.UA)) != MorphLang.UNKNOWN): tot_ukr_words += 1 if ((((lang) & MorphLang.BY)) != MorphLang.UNKNOWN): tot_by_words += 1 if ((((lang) & MorphLang.KZ)) != MorphLang.UNKNOWN): tot_kz_words += 1 if (ty == 1): term0 = term lemmas = None if (ty == 1 and not only_tokenizing): wraplemmas7 = RefOutArgWrapper(None) inoutres8 = Utils.tryGetValue(uni_lex, term, wraplemmas7) lemmas = wraplemmas7.value if (not inoutres8): lemmas = InnerMorphology.UniLexWrap._new6(lang) uni_lex[term] 
= lemmas tok = MorphToken() tok.term = term tok.begin_char = i if (i == 733860): pass tok.end_char = (j - 1) tok.tag = (lemmas) res.append(tok) i = (j - 1) def_lang = MorphLang(dlang) if (pure_rus_words > pure_ukr_words and pure_rus_words > pure_by_words and pure_rus_words > pure_kz_words): def_lang = MorphLang.RU elif (tot_rus_words > tot_ukr_words and tot_rus_words > tot_by_words and tot_rus_words > tot_kz_words): def_lang = MorphLang.RU elif (pure_ukr_words > pure_rus_words and pure_ukr_words > pure_by_words and pure_ukr_words > pure_kz_words): def_lang = MorphLang.UA elif (tot_ukr_words > tot_rus_words and tot_ukr_words > tot_by_words and tot_ukr_words > tot_kz_words): def_lang = MorphLang.UA elif (pure_kz_words > pure_rus_words and pure_kz_words > pure_ukr_words and pure_kz_words > pure_by_words): def_lang = MorphLang.KZ elif (tot_kz_words > tot_rus_words and tot_kz_words > tot_ukr_words and tot_kz_words > tot_by_words): def_lang = MorphLang.KZ elif (pure_by_words > pure_rus_words and pure_by_words > pure_ukr_words and pure_by_words > pure_kz_words): def_lang = MorphLang.BY elif (tot_by_words > tot_rus_words and tot_by_words > tot_ukr_words and tot_by_words > tot_kz_words): if (tot_rus_words > 10 and tot_by_words > (tot_rus_words + 20)): def_lang = MorphLang.BY elif (tot_rus_words == 0 or tot_by_words >= (tot_rus_words * 2)): def_lang = MorphLang.BY if (((def_lang.is_undefined or def_lang.is_ua)) and tot_rus_words > 0): if (((tot_ukr_words > tot_rus_words and InnerMorphology.M_ENGINE_UA.language.is_ua)) or ((tot_by_words > tot_rus_words and InnerMorphology.M_ENGINE_BY.language.is_by)) or ((tot_kz_words > tot_rus_words and InnerMorphology.M_ENGINE_KZ.language.is_kz))): cou0 = 0 tot_kz_words = 0 tot_ukr_words = tot_kz_words tot_by_words = tot_ukr_words tot_rus_words = tot_by_words for kp in uni_lex.items(): lang = MorphLang() wraplang9 = RefOutArgWrapper(lang) kp[1].word_forms = self.__processOneWord(kp[0], wraplang9) lang = wraplang9.value if (kp[1].word_forms 
is not None): for wf in kp[1].word_forms: lang |= wf.language kp[1].lang = lang if (lang.is_ru): tot_rus_words += 1 if (lang.is_ua): tot_ukr_words += 1 if (lang.is_by): tot_by_words += 1 if (lang.is_kz): tot_kz_words += 1 if (lang.is_cyrillic): cou0 += 1 if (cou0 >= 100): break if (tot_rus_words > ((math.floor(tot_by_words / 2))) and tot_rus_words > ((math.floor(tot_ukr_words / 2)))): def_lang = MorphLang.RU elif (tot_ukr_words > ((math.floor(tot_rus_words / 2))) and tot_ukr_words > ((math.floor(tot_by_words / 2)))): def_lang = MorphLang.UA elif (tot_by_words > ((math.floor(tot_rus_words / 2))) and tot_by_words > ((math.floor(tot_ukr_words / 2)))): def_lang = MorphLang.BY elif (def_lang.is_undefined): def_lang = MorphLang.RU cou = 0 tot_kz_words = 0 tot_ukr_words = tot_kz_words tot_by_words = tot_ukr_words tot_rus_words = tot_by_words for kp in uni_lex.items(): lang = def_lang if (lang.is_undefined): if (tot_rus_words > tot_by_words and tot_rus_words > tot_ukr_words and tot_rus_words > tot_kz_words): lang = MorphLang.RU elif (tot_ukr_words > tot_rus_words and tot_ukr_words > tot_by_words and tot_ukr_words > tot_kz_words): lang = MorphLang.UA elif (tot_by_words > tot_rus_words and tot_by_words > tot_ukr_words and tot_by_words > tot_kz_words): lang = MorphLang.BY elif (tot_kz_words > tot_rus_words and tot_kz_words > tot_ukr_words and tot_kz_words > tot_by_words): lang = MorphLang.KZ wraplang10 = RefOutArgWrapper(lang) kp[1].word_forms = self.__processOneWord(kp[0], wraplang10) lang = wraplang10.value kp[1].lang = lang if ((((lang) & MorphLang.RU)) != MorphLang.UNKNOWN): tot_rus_words += 1 if ((((lang) & MorphLang.UA)) != MorphLang.UNKNOWN): tot_ukr_words += 1 if ((((lang) & MorphLang.BY)) != MorphLang.UNKNOWN): tot_by_words += 1 if ((((lang) & MorphLang.KZ)) != MorphLang.UNKNOWN): tot_kz_words += 1 if (progress is not None): self.__onProgress(cou, len(uni_lex), progress) cou += 1 debug_token = None empty_list = None for r in res: uni = Utils.asObjectOrNull(r.tag, 
InnerMorphology.UniLexWrap) r.tag = None if (uni is None or uni.word_forms is None or len(uni.word_forms) == 0): if (empty_list is None): empty_list = list() r.word_forms = empty_list if (uni is not None): r.language = uni.lang else: r.word_forms = uni.word_forms if (r.begin_char == 733860): debug_token = r if (not good_text): i = 0 first_pass2709 = True while True: if first_pass2709: first_pass2709 = False else: i += 1 if (not (i < (len(res) - 2))): break ui0 = twrch[res[i].begin_char] ui1 = twrch[res[i + 1].begin_char] ui2 = twrch[res[i + 2].begin_char] if (ui1.is_quot): p = res[i + 1].begin_char if ((p >= 2 and "БбТт".find(text[p - 1]) >= 0 and ((p + 3) < len(text))) and "ЕеЯяЁё".find(text[p + 1]) >= 0): wstr = LanguageHelper.transliteralCorrection( LanguageHelper.correctWord("{0}Ъ{1}".format( res[i].getSourceText(text), res[i + 2].getSourceText(text))), None, False) li = self.__processOneWord0(wstr) if (li is not None and len(li) > 0 and li[0].is_in_dictionary): res[i].end_char = res[i + 2].end_char res[i].term = wstr res[i].word_forms = li del res[i + 1:i + 1 + 2] elif ((ui1.is_apos and p > 0 and str.isalpha(text[p - 1])) and ((p + 1) < len(text)) and str.isalpha(text[p + 1])): if (def_lang == MorphLang.UA or (((res[i].language) & MorphLang.UA)) != MorphLang.UNKNOWN or (((res[i + 2].language) & MorphLang.UA)) != MorphLang.UNKNOWN): wstr = LanguageHelper.transliteralCorrection( LanguageHelper.correctWord("{0}{1}".format( res[i].getSourceText(text), res[i + 2].getSourceText(text))), None, False) li = self.__processOneWord0(wstr) okk = True if (okk): res[i].end_char = res[i + 2].end_char res[i].term = wstr if (li is None): li = list() res[i].word_forms = li if (li is not None and len(li) > 0): res[i].language = li[0].language del res[i + 1:i + 1 + 2] elif (((ui1.uni_char == '3' or ui1.uni_char == '4')) and res[i + 1].length == 1): src = ("З" if ui1.uni_char == '3' else "Ч") i0 = i + 1 if ((res[i].end_char + 1) == res[i + 1].begin_char and ui0.is_cyrillic): i0 -= 
1 src = (res[i0].getSourceText(text) + src) i1 = i + 1 if ((res[i + 1].end_char + 1) == res[i + 2].begin_char and ui2.is_cyrillic): i1 += 1 src += res[i1].getSourceText(text) if (len(src) > 2): wstr = LanguageHelper.transliteralCorrection( LanguageHelper.correctWord(src), None, False) li = self.__processOneWord0(wstr) if (li is not None and len(li) > 0 and li[0].is_in_dictionary): res[i0].end_char = res[i1].end_char res[i0].term = wstr res[i0].word_forms = li del res[i0 + 1:i0 + 1 + i1 - i0] elif ((ui1.is_hiphen and ui0.is_letter and ui2.is_letter) and res[i].end_char > res[i].begin_char and res[i + 2].end_char > res[i + 2].begin_char): newline = False sps = 0 j = (res[i + 1].end_char + 1) while j < res[i + 2].begin_char: if (text[j] == '\r' or text[j] == '\n'): newline = True sps += 1 elif (not Utils.isWhitespace(text[j])): break else: sps += 1 j += 1 full_word = LanguageHelper.correctWord( res[i].getSourceText(text) + res[i + 2].getSourceText(text)) if (not newline): if (full_word in uni_lex or full_word == "ИЗЗА"): newline = True elif (text[res[i + 1].begin_char] == (chr(0x00AD))): newline = True elif (LanguageHelper.endsWithEx( res[i].getSourceText(text), "О", "о", None, None) and len(res[i + 2].word_forms) > 0 and res[i + 2].word_forms[0].is_in_dictionary): if (text[res[i + 1].begin_char] == '¬'): li = self.__processOneWord0(full_word) if (li is not None and len(li) > 0 and li[0].is_in_dictionary): newline = True elif ((res[i].end_char + 2) == res[i + 2].begin_char): if (not str.isupper(text[res[i + 2].begin_char]) and (sps < 2) and len(full_word) > 4): newline = True if ((i + 3) < len(res)): ui3 = twrch[res[i + 3].begin_char] if (ui3.is_hiphen): newline = False elif (((res[i].end_char + 1) == res[i + 1].begin_char and sps > 0 and (sps < 3)) and len(full_word) > 4): newline = True if (newline): li = self.__processOneWord0(full_word) if (li is not None and len(li) > 0 and ((li[0].is_in_dictionary or full_word in uni_lex))): res[i].end_char = res[i + 2].end_char 
res[i].term = full_word res[i].word_forms = li del res[i + 1:i + 1 + 2] else: pass elif ((ui1.is_letter and ui0.is_letter and res[i].length > 2) and res[i + 1].length > 1): if (ui0.is_upper != ui1.is_upper): continue if (not ui0.is_cyrillic or not ui1.is_cyrillic): continue newline = False j = (res[i].end_char + 1) while j < res[i + 1].begin_char: if (twrch[j].code == 0xD or twrch[j].code == 0xA): newline = True break j += 1 if (not newline): continue full_word = LanguageHelper.correctWord( res[i].getSourceText(text) + res[i + 1].getSourceText(text)) if (not full_word in uni_lex): continue li = self.__processOneWord0(full_word) if (li is not None and len(li) > 0 and li[0].is_in_dictionary): res[i].end_char = res[i + 1].end_char res[i].term = full_word res[i].word_forms = li del res[i + 1] i = 0 first_pass2710 = True while True: if first_pass2710: first_pass2710 = False else: i += 1 if (not (i < len(res))): break mt = res[i] mt.char_info = CharsInfo() ui0 = twrch[mt.begin_char] ui00 = UnicodeInfo.ALL_CHARS[ord((res[i].term[0]))] j = (mt.begin_char + 1) while j <= mt.end_char: if (ui0.is_letter): break ui0 = twrch[j] j += 1 if (ui0.is_letter): res[i].char_info.is_letter = True if (ui00.is_latin): res[i].char_info.is_latin_letter = True elif (ui00.is_cyrillic): res[i].char_info.is_cyrillic_letter = True if (res[i].language == MorphLang.UNKNOWN): if (LanguageHelper.isCyrillic(mt.term)): res[i].language = (MorphLang.RU if def_lang.is_undefined else def_lang) if (good_text): continue all_up = True all_lo = True j = mt.begin_char while j <= mt.end_char: if (twrch[j].is_upper or twrch[j].is_digit): all_lo = False else: all_up = False j += 1 if (all_up): mt.char_info.is_all_upper = True elif (all_lo): mt.char_info.is_all_lower = True elif (((ui0.is_upper or twrch[mt.begin_char].is_digit)) and mt.end_char > mt.begin_char): all_lo = True j = (mt.begin_char + 1) while j <= mt.end_char: if (twrch[j].is_upper or twrch[j].is_digit): all_lo = False break j += 1 if (all_lo): 
mt.char_info.is_capital_upper = True elif (twrch[mt.end_char].is_lower and (mt.end_char - mt.begin_char) > 1): all_up = True j = mt.begin_char while j < mt.end_char: if (twrch[j].is_lower): all_up = False break j += 1 if (all_up): mt.char_info.is_last_lower = True if (mt.char_info.is_last_lower and mt.length > 2 and mt.char_info.is_cyrillic_letter): pref = text[mt.begin_char:mt.begin_char + mt.end_char - mt.begin_char] ok = False for wf in mt.word_forms: if (wf.normal_case == pref or wf.normal_full == pref): ok = True break if (not ok): mt.word_forms = list(mt.word_forms) mt.word_forms.insert( 0, MorphWordForm._new11(pref, MorphClass.NOUN, 1)) if (good_text or only_tokenizing): return res i = 0 first_pass2711 = True while True: if first_pass2711: first_pass2711 = False else: i += 1 if (not (i < len(res))): break if (res[i].length == 1 and res[i].char_info.is_latin_letter): ch = res[i].term[0] if (ch == 'C' or ch == 'A' or ch == 'P'): pass else: continue is_rus = False for ii in range(i - 1, -1, -1): if ((res[ii].end_char + 1) != res[ii + 1].begin_char): break elif (res[ii].char_info.is_letter): is_rus = res[ii].char_info.is_cyrillic_letter break if (not is_rus): ii = i + 1 while ii < len(res): if ((res[ii - 1].end_char + 1) != res[ii].begin_char): break elif (res[ii].char_info.is_letter): is_rus = res[ii].char_info.is_cyrillic_letter break ii += 1 if (is_rus): res[i].term = LanguageHelper.transliteralCorrection( res[i].term, None, True) res[i].char_info.is_cyrillic_letter = True res[i].char_info.is_latin_letter = True for r in res: if (r.char_info.is_all_upper or r.char_info.is_capital_upper): if (r.language.is_cyrillic): ok = False for wf in r.word_forms: if (wf.class0_.is_proper_surname): ok = True break if (not ok): r.word_forms = list(r.word_forms) InnerMorphology.M_ENGINE_RU.processSurnameVariants( r.term, r.word_forms) for r in res: for mv in r.word_forms: if (mv.normal_case is None): mv.normal_case = r.term i = 0 while i < (len(res) - 2): if 
(res[i].char_info.is_latin_letter and res[i].char_info.is_all_upper and res[i].length == 1): if (twrch[res[i + 1].begin_char].is_quot and res[i + 2].char_info.is_latin_letter and res[i + 2].length > 2): if ((res[i].end_char + 1) == res[i + 1].begin_char and (res[i + 1].end_char + 1) == res[i + 2].begin_char): wstr = "{0}{1}".format(res[i].term, res[i + 2].term) li = self.__processOneWord0(wstr) if (li is not None): res[i].word_forms = li res[i].end_char = res[i + 2].end_char res[i].term = wstr if (res[i + 2].char_info.is_all_lower): res[i].char_info.is_all_upper = False res[i].char_info.is_capital_upper = True elif (not res[i + 2].char_info.is_all_upper): res[i].char_info.is_all_upper = False del res[i + 1:i + 1 + 2] i += 1 i = 0 first_pass2712 = True while True: if first_pass2712: first_pass2712 = False else: i += 1 if (not (i < (len(res) - 1))): break if (not res[i].char_info.is_letter and not res[i + 1].char_info.is_letter and (res[i].end_char + 1) == res[i + 1].begin_char): if (twrch[res[i].begin_char].is_hiphen and twrch[res[i + 1].begin_char].is_hiphen): if (i == 0 or not twrch[res[i - 1].begin_char].is_hiphen): pass else: continue if ((i + 2) == len(res) or not twrch[res[i + 2].begin_char].is_hiphen): pass else: continue res[i].end_char = res[i + 1].end_char del res[i + 1] return res
# __try1: attempts to turn a list of CityItemToken into a ReferentToken that
# wraps a city GeoReferent.
# Args:
#   li: the parsed items. Returns None when the list is None/empty, or when
#       li[0] is not a CITY item and the list is not exactly
#       [PROPERNAME, NOUN]. Once the consumed prefix is known the list is
#       truncated in place (`del li[i:...]`).
#   oi: used as an out-argument holder: oi.value is cleared first and then
#       set to li[0].onto_item.
#       NOTE(review): annotated 'IntOntologyItem' but only accessed through
#       `.value`, so presumably a RefOutArgWrapper - confirm with callers.
#   ad: analyzer data (may be None); its local_ontology is compared with the
#       owner of li[0].onto_item.
# Returns:
#   A ReferentToken whose referent is either the GeoReferent referenced by
#   the ontology item or a newly created GeoReferent, or None when the
#   plausibility checks in the body fail.
# For a single-item city result the token span is finally widened over a
# neighbouring "Г" token (and a following '.'), before or after the name.
# NOTE(review): this definition is stored with its indentation collapsed onto
# a few physical lines, so block nesting cannot be read from the layout; the
# code text is left untouched.
def __try1(li: typing.List['CityItemToken'], oi: 'IntOntologyItem', ad: 'AnalyzerDataWithOntology') -> 'ReferentToken': oi.value = (None) if (li is None or (len(li) < 1)): return None elif (li[0].typ != CityItemToken.ItemType.CITY): if (len(li) != 2 or li[0].typ != CityItemToken.ItemType.PROPERNAME or li[1].typ != CityItemToken.ItemType.NOUN): return None i = 1 oi.value = li[0].onto_item ok = not li[0].doubtful if ((ok and li[0].onto_item is not None and li[0].onto_item.misc_attr is None) and ad is not None): if (li[0].onto_item.owner != ad.local_ontology and not li[0].onto_item.owner.is_ext_ontology): if (li[0].begin_token.previous is not None and li[0].begin_token.previous.isValue("В", None)): pass else: ok = False if (len(li) == 1 and li[0].begin_token.morph.class0_.is_adjective): sits = StreetItemToken.tryParseList(li[0].begin_token, None, 3) if (sits is not None and len(sits) == 2 and sits[1].typ == StreetItemType.NOUN): return None typ = None alttyp = None mc = li[0].morph if (i < len(li)): if (li[i].typ == CityItemToken.ItemType.NOUN): at = None if (not li[i].chars.is_all_lower and (li[i].whitespaces_after_count < 2)): sit = StreetItemToken.tryParse(li[i].end_token.next0_, None, False, None, False) if (sit is not None and sit.typ == StreetItemType.NOUN): at = AddressItemToken.tryParse(li[i].begin_token, None, False, False, None) if (at is not None): at2 = AddressItemToken.tryParse( li[i].end_token.next0_, None, False, False, None) if (at2 is not None and at2.typ == AddressItemToken.ItemType.STREET): at = (None) if (at is None): typ = li[i].value alttyp = li[i].alt_value if (li[i].begin_token.isValue("СТ", None) and li[i].begin_token.chars.is_all_upper): return None if ((i + 1) == len(li)): ok = True if (not li[i].morph.case_.is_undefined): mc = li[i].morph i += 1 elif (ok): i += 1 else: tt0 = li[0].begin_token.previous if ((isinstance(tt0, TextToken)) and (tt0.whitespaces_after_count < 3)): if (tt0.isValue("МЭР", "МЕР") or tt0.isValue("ГЛАВА", None) or 
tt0.isValue("ГРАДОНАЧАЛЬНИК", None)): ok = True i += 1 if (not ok and oi.value is not None and (len(oi.value.canonic_text) < 4)): return None if (not ok and li[0].begin_token.morph.class0_.is_proper_name): return None if (not ok): if (not MiscHelper.isExistsInDictionary( li[0].begin_token, li[0].end_token, (MorphClass.ADJECTIVE) | MorphClass.NOUN | MorphClass.PRONOUN)): ok = (li[0].geo_object_before or li[i - 1].geo_object_after) if (ok and li[0].begin_token == li[0].end_token): mcc = li[0].begin_token.getMorphClassInDictionary() if (mcc.is_proper_name or mcc.is_proper_surname): ok = False elif (li[0].geo_object_before and (li[0].whitespaces_after_count < 2)): ad1 = AddressItemToken.tryParse( li[0].begin_token, None, False, False, None) if (ad1 is not None and ad1.typ == AddressItemToken.ItemType.STREET): ad2 = AddressItemToken.tryParse( li[0].end_token.next0_, None, False, False, None) if (ad2 is None or ad2.typ != AddressItemToken.ItemType.STREET): ok = False elif (AddressItemToken.tryAttachOrg(li[0].begin_token) is not None): ok = False if (ok): if (li[0].kit.processReferent("PERSON", li[0].begin_token) is not None): ok = False if (not ok): ok = CityAttachHelper.checkYearAfter(li[0].end_token.next0_) if (not ok and ((not li[0].begin_token.morph.class0_.is_adjective or li[0].begin_token != li[0].end_token))): ok = CityAttachHelper.checkCityAfter(li[0].end_token.next0_) if (not ok): return None if (i < len(li)): del li[i:i + len(li) - i] rt = None if (oi.value is None): if (li[0].value is not None and li[0].higher_geo is not None): cap = GeoReferent() cap._addName(li[0].value) cap._addTypCity(li[0].kit.base_language) cap.higher = li[0].higher_geo if (typ is not None): cap._addTyp(typ) if (alttyp is not None): cap._addTyp(alttyp) rt = ReferentToken(cap, li[0].begin_token, li[0].end_token) else: if (li[0].value is None): return None if (typ is None): if ((len(li) == 1 and li[0].begin_token.previous is not None and li[0].begin_token.previous.is_hiphen) and 
(isinstance(li[0].begin_token.previous.previous, ReferentToken)) and (isinstance( li[0].begin_token.previous.previous.getReferent(), GeoReferent))): pass else: return None else: if (not LanguageHelper.endsWithEx(typ, "ПУНКТ", "ПОСЕЛЕНИЕ", "ПОСЕЛЕННЯ", "ПОСЕЛОК")): if (not LanguageHelper.endsWith(typ, "CITY")): if (typ == "СТАНЦИЯ" and ((MiscLocationHelper.checkGeoObjectBefore( li[0].begin_token)))): pass elif (len(li) > 1 and li[1].typ == CityItemToken.ItemType.NOUN and li[0].typ == CityItemToken.ItemType.CITY): pass else: return None if (li[0].begin_token.morph.class0_.is_adjective): li[0].value = ProperNameHelper.getNameEx( li[0].begin_token, li[0].end_token, MorphClass.ADJECTIVE, li[1].morph.case_, li[1].morph.gender, False, False) elif (isinstance(oi.value.referent, GeoReferent)): rt = ReferentToken._new719( Utils.asObjectOrNull(oi.value.referent, GeoReferent), li[0].begin_token, li[len(li) - 1].end_token, mc) elif (typ is None): typ = oi.value.typ if (rt is None): city = GeoReferent() city._addName( (li[0].value if oi.value is None else oi.value.canonic_text)) if (typ is not None): city._addTyp(typ) else: city._addTypCity(li[0].kit.base_language) if (alttyp is not None): city._addTyp(alttyp) rt = ReferentToken._new719(city, li[0].begin_token, li[len(li) - 1].end_token, mc) if ((isinstance(rt.referent, GeoReferent)) and len(li) == 1 and (rt.referent).is_city): if (rt.begin_token.previous is not None and rt.begin_token.previous.isValue("Г", None)): rt.begin_token = rt.begin_token.previous elif ((rt.begin_token.previous is not None and rt.begin_token.previous.isChar('.') and rt.begin_token.previous.previous is not None) and rt.begin_token.previous.previous.isValue("Г", None)): rt.begin_token = rt.begin_token.previous.previous elif (rt.end_token.next0_ is not None and (rt.whitespaces_after_count < 2) and rt.end_token.next0_.isValue("Г", None)): rt.end_token = rt.end_token.next0_ if (rt.end_token.next0_ is not None and rt.end_token.next0_.isChar('.')): rt.end_token 
= rt.end_token.next0_ return rt
# Try to build a single noun-phrase item (an adjective and/or noun candidate)
# starting at token ``t``.
#
# Args:
#     t: first token of the candidate item; None yields None.
#     items: items already collected for the phrase (may be None or empty);
#         each morphological variant is checked against them through
#         NounPhraseItem.tryAccordVariant before it is kept.
#     attrs: NounPhraseParseAttr bit flags; this function tests
#         IGNOREPARTICIPLES, REFERENTCANBENOUN and PARSENUMERICASADJECTIVE.
#
# Returns:
#     A NounPhraseItem whose adj_morph / noun_morph lists hold the accepted
#     variants, or None when the token is rejected.
#
# Outline, in source order:
#   * A ReferentToken that wraps exactly one token is parsed through that
#     inner token and a successful result is stretched back over ``t``.
#     A ReferentToken that follows existing items becomes a noun item whose
#     variants use str(t.getReferent()) as their normal value.
#   * TextToken pre-filters return None for non-letter tokens and for several
#     verb / adverb / surname / proper-name / comparative patterns; they also
#     set the has_legal_verb, _is_doubt_adj and _can_be_surname flags.
#   * ``for k in range(2)``: pass 0 may extend the item over a hyphenated
#     second part; a second pass appears to run only when the first one did
#     not return (for example after the "БИЗНЕС" + next-token ``continue``).
#   * An item that starts with a pronoun may absorb a trailing particle
#     (ЖЕ, БЫ, ЛИ, Ж, НИБУДЬ, ЛИБО, or a hyphenated ТО).
#
# NOTE(review): this chunk stores the body with line breaks and indentation
# collapsed, so statement nesting cannot be read from the layout; confirm the
# details above against a properly formatted copy before changing any logic.
# NOTE(review): ``if (isinstance(t, NumberToken)): pass`` has no effect.
def tryParse(t: 'Token', items: typing.List['NounPhraseItem'], attrs: 'NounPhraseParseAttr') -> 'NounPhraseItem': if (t is None): return None t0 = t _can_be_surname = False _is_doubt_adj = False rt = Utils.asObjectOrNull(t, ReferentToken) if (rt is not None and rt.begin_token == rt.end_token): res = NounPhraseItem.tryParse(rt.begin_token, items, attrs) if (res is not None): res.begin_token = res.end_token = t return res if (rt is not None and items is not None and len(items) > 0): res = NounPhraseItem(t, t) for m in t.morph.items: v = NounPhraseItemTextVar(m, None) v.normal_value = str(t.getReferent()) res.noun_morph.append(v) res.can_be_noun = True return res if (isinstance(t, NumberToken)): pass has_legal_verb = False if (isinstance(t, TextToken)): if (not t.chars.is_letter): return None str0_ = (t).term if (str0_[len(str0_) - 1] == 'А' or str0_[len(str0_) - 1] == 'О'): for wf in t.morph.items: if ((isinstance(wf, MorphWordForm)) and (wf).is_in_dictionary): if (wf.class0_.is_verb): mc = t.getMorphClassInDictionary() if (not mc.is_noun and (((attrs) & (NounPhraseParseAttr.IGNOREPARTICIPLES))) == (NounPhraseParseAttr.NO)): if (not LanguageHelper.endsWithEx( str0_, "ОГО", "ЕГО", None, None)): return None has_legal_verb = True if (wf.class0_.is_adverb): if (t.next0_ is None or not t.next0_.is_hiphen): if ((str0_ == "ВСЕГО" or str0_ == "ДОМА" or str0_ == "НЕСКОЛЬКО") or str0_ == "МНОГО" or str0_ == "ПОРЯДКА"): pass else: return None if (wf.class0_.is_adjective): if (wf.containsAttr("к.ф.", None)): if (t.getMorphClassInDictionary() == MorphClass.ADJECTIVE): pass else: _is_doubt_adj = True mc0 = t.morph.class0_ if (mc0.is_proper_surname and not t.chars.is_all_lower): for wf in t.morph.items: if (wf.class0_.is_proper_surname and wf.number != MorphNumber.PLURAL): wff = Utils.asObjectOrNull(wf, MorphWordForm) if (wff is None): continue s = Utils.ifNotNull((Utils.ifNotNull( wff.normal_full, wff.normal_case)), "") if (LanguageHelper.endsWithEx(s, "ИН", "ЕН", "ЫН", None)): if 
(not wff.is_in_dictionary): _can_be_surname = True else: return None if (wff.is_in_dictionary and LanguageHelper.endsWith(s, "ОВ")): _can_be_surname = True if (mc0.is_proper_name and not t.chars.is_all_lower): for wff in t.morph.items: wf = Utils.asObjectOrNull(wff, MorphWordForm) if (wf is None): continue if (wf.normal_case == "ГОР"): continue if (wf.class0_.is_proper_name and wf.is_in_dictionary): if (wf.normal_case is None or not wf.normal_case.startswith("ЛЮБ")): if (mc0.is_adjective and t.morph.containsAttr("неизм.", None)): pass elif ( (((attrs) & (NounPhraseParseAttr.REFERENTCANBENOUN)) ) == (NounPhraseParseAttr.REFERENTCANBENOUN)): pass else: if (items is None or (len(items) < 1)): return None if (not items[0].is_std_adjective): return None if (mc0.is_adjective and t.morph.items_count == 1): if (t.morph.getIndexerItem(0).containsAttr("в.ср.ст.", None)): return None mc1 = t.getMorphClassInDictionary() if (mc1 == MorphClass.VERB): return None if (((((attrs) & (NounPhraseParseAttr.IGNOREPARTICIPLES))) == (NounPhraseParseAttr.IGNOREPARTICIPLES) and t.morph.class0_.is_verb and not t.morph.class0_.is_noun) and not t.morph.class0_.is_proper): for wf in t.morph.items: if (wf.class0_.is_verb): if (wf.containsAttr("дейст.з.", None)): if (LanguageHelper.endsWith((t).term, "СЯ")): pass else: return None t1 = None for k in range(2): t = (Utils.ifNotNull(t1, t0)) if (k == 0): if ((((isinstance(t0, TextToken))) and t0.next0_ is not None and t0.next0_.is_hiphen) and t0.next0_.next0_ is not None): if (not t0.is_whitespace_after and not t0.morph.class0_.is_pronoun): if (not t0.next0_.is_whitespace_after): t = t0.next0_.next0_ elif (t0.next0_.next0_.chars.is_all_lower and LanguageHelper.endsWith((t0).term, "О")): t = t0.next0_.next0_ it = NounPhraseItem._new470(t0, t, _can_be_surname) if (t0 == t and (isinstance(t0, ReferentToken))): it.can_be_noun = True it.morph = MorphCollection(t0.morph) can_be_prepos = False for v in t.morph.items: wf = Utils.asObjectOrNull(v, 
MorphWordForm) if (v.class0_.is_preposition): can_be_prepos = True if (v.class0_.is_adjective or ((v.class0_.is_pronoun and not v.class0_.is_personal_pronoun)) or ((v.class0_.is_noun and (isinstance(t, NumberToken))))): if (NounPhraseItem.tryAccordVariant( items, (0 if items is None else len(items)), v)): is_doub = False if (v.containsAttr("к.ф.", None)): continue if (v.containsAttr("собир.", None) and not ((isinstance(t, NumberToken)))): if (wf is not None and wf.is_in_dictionary): return None continue if (v.containsAttr("сравн.", None)): continue ok = True if (isinstance(t, TextToken)): s = (t).term if (s == "ПРАВО" or s == "ПРАВА"): ok = False elif (LanguageHelper.endsWith(s, "ОВ") and t.getMorphClassInDictionary().is_noun): ok = False elif (wf is not None and ((wf.normal_case == "САМ" or wf.normal_case == "ТО"))): ok = False elif (isinstance(t, NumberToken)): if (v.class0_.is_noun and t.morph.class0_.is_adjective): ok = False elif (t.morph.class0_.is_noun and (( (attrs) & (NounPhraseParseAttr.PARSENUMERICASADJECTIVE))) == (NounPhraseParseAttr.NO)): ok = False if (ok): it.adj_morph.append(NounPhraseItemTextVar(v, t)) it.can_be_adj = True if (_is_doubt_adj and t0 == t): it.is_doubt_adjective = True if (has_legal_verb and wf is not None and wf.is_in_dictionary): it.can_be_noun = True can_be_noun_ = False if (isinstance(t, NumberToken)): pass elif (v.class0_.is_noun or ((wf is not None and wf.normal_case == "САМ"))): can_be_noun_ = True elif (v.class0_.is_personal_pronoun): if (items is None or len(items) == 0): can_be_noun_ = True else: for it1 in items: if (it1.is_verb): return None if (len(items) == 1): if (items[0].can_be_adj_for_personal_pronoun): can_be_noun_ = True elif ((v.class0_.is_pronoun and ((items is None or len(items) == 0 or ((len(items) == 1 and items[0].can_be_adj_for_personal_pronoun)))) and wf is not None) and ((((wf.normal_case == "ТОТ" or wf.normal_full == "ТО" or wf.normal_case == "ТО") or wf.normal_case == "ЭТО" or wf.normal_case == "ВСЕ") 
or wf.normal_case == "ЧТО" or wf.normal_case == "КТО"))): if (wf.normal_case == "ВСЕ"): if (t.next0_ is not None and t.next0_.isValue("РАВНО", None)): return None can_be_noun_ = True elif (wf is not None and ((Utils.ifNotNull( wf.normal_full, wf.normal_case))) == "КОТОРЫЙ"): return None elif (v.class0_.is_proper and (isinstance(t, TextToken))): if (t.length_char > 4 or v.class0_.is_proper_name): can_be_noun_ = True if (can_be_noun_): if (NounPhraseItem.tryAccordVariant( items, (0 if items is None else len(items)), v)): it.noun_morph.append(NounPhraseItemTextVar(v, t)) it.can_be_noun = True if (t0 != t): for v in it.adj_morph: v.correctPrefix(Utils.asObjectOrNull(t0, TextToken), False) for v in it.noun_morph: v.correctPrefix(Utils.asObjectOrNull(t0, TextToken), True) if (k == 1 and it.can_be_noun and not it.can_be_adj): if (t1 is not None): it.end_token = t1 else: it.end_token = t0.next0_.next0_ for v in it.noun_morph: if (v.normal_value is not None and (v.normal_value.find('-') < 0)): v.normal_value = "{0}-{1}".format( v.normal_value, it.end_token.getNormalCaseText( None, False, MorphGender.UNDEFINED, False)) if (it.can_be_adj): if (NounPhraseItem.__m_std_adjectives.tryParse( it.begin_token, TerminParseAttr.NO) is not None): it.is_std_adjective = True if (can_be_prepos and it.can_be_noun): if (items is not None and len(items) > 0): npt1 = NounPhraseHelper.tryParse( t, Utils.valToEnum((NounPhraseParseAttr.PARSEPREPOSITION) | (NounPhraseParseAttr.PARSEPRONOUNS) | (NounPhraseParseAttr.PARSEVERBS), NounPhraseParseAttr), 0) if (npt1 is not None and npt1.end_char > t.end_char): return None else: npt1 = NounPhraseHelper.tryParse( t.next0_, Utils.valToEnum((NounPhraseParseAttr.PARSEPRONOUNS) | (NounPhraseParseAttr.PARSEVERBS), NounPhraseParseAttr), 0) if (npt1 is not None): mc = LanguageHelper.getCaseAfterPreposition((t).lemma) if (not ((mc) & npt1.morph.case_).is_undefined): return None if (it.can_be_noun or it.can_be_adj or k == 1): if 
(it.begin_token.morph.class0_.is_pronoun): tt2 = it.end_token.next0_ if ((tt2 is not None and tt2.is_hiphen and not tt2.is_whitespace_after) and not tt2.is_whitespace_before): tt2 = tt2.next0_ if (isinstance(tt2, TextToken)): ss = (tt2).term if ((ss == "ЖЕ" or ss == "БЫ" or ss == "ЛИ") or ss == "Ж"): it.end_token = tt2 elif (ss == "НИБУДЬ" or ss == "ЛИБО" or (((ss == "ТО" and tt2.previous.is_hiphen)) and it.can_be_adj)): it.end_token = tt2 for m in it.adj_morph: m.normal_value = "{0}-{1}".format( m.normal_value, ss) if (m.single_number_value is not None): m.single_number_value = "{0}-{1}".format( m.single_number_value, ss) return it if (t0 == t): if (t0.isValue("БИЗНЕС", None) and t0.next0_ is not None and t0.next0_.chars == t0.chars): t1 = t0.next0_ continue return it return None
# Try to recognise one transport-description item starting at token ``t``.
#
# Args:
#     t: token to start from; None yields None.
#     prev: the previously recognised TransItemToken, or None; its typ / kind
#         decide whether numbers, models, quoted text and capitalised words
#         are accepted here.
#     after_conj: when True, relaxes the ``prev`` requirements of the
#         quoted-text and capitalised-name branches near the end.
#     attach_high: when True, ontology terms flagged ``is_doubt`` skip the
#         check against ``prev`` that would otherwise reject them.
#
# Returns:
#     A TransItemToken, or None when nothing is recognised.
#
# Branches, in source order:
#   * optional "," and "ПРИНАДЛЕЖАТЬ"/"НАЛЕЖАТИ", then an ORGANIZATION
#     referent -> Typs.ORG;
#   * optional "СЛЕДОВАТЬ"/"ВЫПОЛНЯТЬ", a preposition, "РЕЙС"/"МАРШРУТ", then
#     GeoReferent cities/states -> Typs.ROUTE (needs more than one place or
#     one of the route words); a DateReferent with fewer than 3 whitespaces
#     before it -> Typs.DATE, optionally extended over "В." / "ВЫП" / "ВЫПУСК";
#   * a number prefix found by MiscHelper.checkNumberPrefix ->
#     __attachRusAutoNumber, falling back to _attachNumber;
#   * M_ONTOLOGY lookup at ``t``, after "С"/"C"/"ЗА", inside brackets, or
#     after "МАРКА" (recursive call, result retyped as BRAND);
#   * adjective noun phrase ending on an ontology NOUN -> Typs.NOUN with
#     alt_value taken from the whole phrase;
#   * "КЛАСС" followed by bracketed text -> Typs.CLASS;
#   * NumberToken: only a digit number directly after a BRAND is tried as a
#     model, everything else returns None;
#   * remaining fallbacks: __attachRusAutoNumber, __attachModel, quoted
#     NAME / MODEL text, BRAND after an AUTO noun, free-text NAME after a
#     SHIP / SPACE noun.
#
# NOTE(review): in the bracket branch ``tok.begin_token = t`` is assigned
# twice; the second assignment looks redundant.
# NOTE(review): ``tt = Utils.asObjectOrNull(..., TransItemToken.TransTermin)``
# is dereferenced (``tt.typ``) without a None check - confirm the termin is
# always a TransTermin.
# NOTE(review): the quoted-text branch calls ``TransItemToken.tryParse`` while
# the "МАРКА" branch calls ``__TryParse``; ``tryParse`` is not visible in this
# chunk - presumably a public wrapper, verify.
# NOTE(review): this chunk stores the body with line breaks and indentation
# collapsed, so statement nesting cannot be read from the layout; confirm the
# details above against a properly formatted copy before changing any logic.
def __TryParse(t: 'Token', prev: 'TransItemToken', after_conj: bool, attach_high: bool = False) -> 'TransItemToken': if (t is None): return None t1 = t if (t1.isChar(',')): t1 = t1.next0_ if (t1 is not None and t1.isValue("ПРИНАДЛЕЖАТЬ", "НАЛЕЖАТИ")): t1 = t1.next0_ if (isinstance(t1, ReferentToken)): if (t1.getReferent().type_name == "ORGANIZATION"): return TransItemToken._new2521(t, t1, TransItemToken.Typs.ORG, t1.getReferent(), t1.morph) route = False if (t1 is not None and ((t1.isValue("СЛЕДОВАТЬ", "СЛІДУВАТИ") or t1.isValue("ВЫПОЛНЯТЬ", "ВИКОНУВАТИ")))): t1 = t1.next0_ route = True if (t1 is not None and t1.morph.class0_.is_preposition): t1 = t1.next0_ if (t1 is not None and ((t1.isValue("РЕЙС", None) or t1.isValue("МАРШРУТ", None)))): t1 = t1.next0_ route = True if (isinstance(t1, ReferentToken)): if (isinstance(t1.getReferent(), GeoReferent)): geo_ = Utils.asObjectOrNull(t1.getReferent(), GeoReferent) if (geo_.is_state or geo_.is_city): tit = TransItemToken._new2522(t, t1, TransItemToken.Typs.ROUTE, list()) tit.route_items.append(geo_) t1 = t1.next0_ first_pass3132 = True while True: if first_pass3132: first_pass3132 = False else: t1 = t1.next0_ if (not (t1 is not None)): break if (t1.is_hiphen): continue if (t1.morph.class0_.is_preposition or t1.morph.class0_.is_conjunction): continue geo_ = (Utils.asObjectOrNull(t1.getReferent(), GeoReferent)) if (geo_ is None): break if (not geo_.is_city and not geo_.is_state): break tit.route_items.append(geo_) tit.end_token = t1 if (len(tit.route_items) > 1 or route): return tit elif ((isinstance(t1.getReferent(), DateReferent)) and (t1.whitespaces_before_count < 3)): tit = TransItemToken._new2523(t, t1, TransItemToken.Typs.DATE, t1.getReferent()) if (t1.next0_ is not None): if (t1.next0_.isValue("В", None) and t1.next0_.next0_ is not None and t1.next0_.next0_.isChar('.')): tit.end_token = t1.next0_.next0_ elif (t1.next0_.isValue("ВЫП", None) or t1.next0_.isValue("ВЫПУСК", None)): tit.end_token = t1.next0_ if 
(t1.next0_.next0_ is not None and t1.next0_.next0_.isChar('.')): tit.end_token = t1.next0_.next0_ return tit if (isinstance(t, TextToken)): num = MiscHelper.checkNumberPrefix(t) if (num is not None): tit = TransItemToken.__attachRusAutoNumber(num) if (tit is None): tit = TransItemToken._attachNumber(num, False) if (tit is not None): tit.begin_token = t return tit tok = TransItemToken.M_ONTOLOGY.tryParse(t, TerminParseAttr.NO) if (tok is None and ((t.isValue("С", None) or t.isValue("C", None) or t.isValue("ЗА", None)))): tok = TransItemToken.M_ONTOLOGY.tryParse( t.next0_, TerminParseAttr.NO) if (tok is None and BracketHelper.isBracket(t, True)): tok1 = TransItemToken.M_ONTOLOGY.tryParse( t.next0_, TerminParseAttr.NO) if (tok1 is not None and BracketHelper.isBracket( tok1.end_token.next0_, True)): tok = tok1 tok.begin_token = t tok.end_token = tok.end_token.next0_ tok.begin_token = t elif (tok1 is not None): tt = Utils.asObjectOrNull(tok1.termin, TransItemToken.TransTermin) if (tt.typ == TransItemToken.Typs.BRAND): tok = tok1 tok.begin_token = t if (tok is None and t.isValue("МАРКА", None)): res1 = TransItemToken.__TryParse(t.next0_, prev, after_conj, False) if (res1 is not None): if (res1.typ == TransItemToken.Typs.NAME or res1.typ == TransItemToken.Typs.BRAND): res1.begin_token = t res1.typ = TransItemToken.Typs.BRAND return res1 if (tok is not None): tt = Utils.asObjectOrNull(tok.termin, TransItemToken.TransTermin) if (tt.typ == TransItemToken.Typs.NUMBER): tit = TransItemToken.__attachRusAutoNumber( tok.end_token.next0_) if (tit is None): tit = TransItemToken._attachNumber( tok.end_token.next0_, False) if (tit is not None): tit.begin_token = t return tit else: return None if (tt.is_doubt and not attach_high): if (prev is None or prev.typ != TransItemToken.Typs.NOUN): if ((prev is not None and prev.typ == TransItemToken.Typs.BRAND and tt.typ == TransItemToken.Typs.BRAND) and Utils.compareStrings( tt.canonic_text, prev.value, True) == 0): pass else: return None if 
(tt.canonic_text == "СУДНО"): if ((((tok.morph.number) & (MorphNumber.PLURAL))) != (MorphNumber.UNDEFINED)): if (not BracketHelper.canBeStartOfSequence( tok.end_token.next0_, False, False)): return None tit = TransItemToken._new2524(tok.begin_token, tok.end_token, tt.kind, tt.typ, tt.is_doubt, tok.chars, tok.morph) tit.value = tt.canonic_text if (tit.typ == TransItemToken.Typs.NOUN): tit.value = tit.value.lower() else: tit.value = tit.value.upper() return tit if (tok is None and t.morph.class0_.is_adjective): npt = NounPhraseHelper.tryParse(t, NounPhraseParseAttr.NO, 0) if (npt is not None and len(npt.adjectives) > 0): state_ = None tt = t first_pass3133 = True while True: if first_pass3133: first_pass3133 = False else: tt = tt.next0_ if (not (tt is not None and tt.previous != npt.end_token)): break tok = TransItemToken.M_ONTOLOGY.tryParse( tt, TerminParseAttr.NO) if (tok is None and state_ is None): state_ = tt.kit.processReferent("GEO", tt) if (tok is not None and tok.end_token == npt.end_token): if ((tok.termin).typ == TransItemToken.Typs.NOUN): tit = TransItemToken._new2524( t, tok.end_token, (tok.termin).kind, TransItemToken.Typs.NOUN, (tok.termin).is_doubt, tok.chars, npt.morph) tit.value = (tok.termin).canonic_text.lower() tit.alt_value = npt.getNormalCaseText( None, False, MorphGender.UNDEFINED, False).lower() if (LanguageHelper.endsWithEx( tit.alt_value, "суд", "суда", None, None)): if (not BracketHelper.canBeStartOfSequence( tok.end_token.next0_, False, False)): continue if (state_ is not None): if ((state_.referent).is_state): tit.state = state_ return tit if (t is not None and t.isValue("КЛАСС", None) and t.next0_ is not None): br = BracketHelper.tryParse(t.next0_, BracketParseAttr.NO, 100) if (br is not None): return TransItemToken._new2526( t, br.end_token, TransItemToken.Typs.CLASS, MiscHelper.getTextValueOfMetaToken(br, GetTextAttr.NO)) nt = Utils.asObjectOrNull(t, NumberToken) if (nt is not None): if (prev is None or nt.typ != 
NumberSpellingType.DIGIT): return None if (prev.typ == TransItemToken.Typs.BRAND): return TransItemToken.__attachModel(t, False, prev) else: return None res = TransItemToken.__attachRusAutoNumber(t) if ((res) is not None): if (not res.is_doubt): return res if (prev is not None and prev.typ == TransItemToken.Typs.NOUN and prev.kind == TransportKind.AUTO): return res if (prev is not None and ((prev.typ == TransItemToken.Typs.BRAND or prev.typ == TransItemToken.Typs.MODEL))): return res t1 = t if (t.is_hiphen): t1 = t.next0_ if (prev is not None and prev.typ == TransItemToken.Typs.BRAND and t1 is not None): tit = TransItemToken.__attachModel(t1, True, prev) if (tit is not None): tit.begin_token = t return tit if (prev is not None and ((prev.typ == TransItemToken.Typs.NOUN or after_conj))): br = BracketHelper.tryParse(t, BracketParseAttr.NO, 100) if (br is not None and br.is_quote_type): tit = TransItemToken.tryParse(br.begin_token.next0_, prev, after_conj, False) if (tit is not None and tit.end_token.next0_ == br.end_token): if (not tit.is_doubt or tit.typ == TransItemToken.Typs.BRAND): tit.begin_token = br.begin_token tit.end_token = br.end_token return tit s = MiscHelper.getTextValueOfMetaToken(br, GetTextAttr.NO) if (not Utils.isNullOrEmpty(s) and (len(s) < 30)): chars_ = 0 digs = 0 un = 0 for c in s: if (not Utils.isWhitespace(c)): if (str.isalpha(c)): chars_ += 1 elif (str.isdigit(c)): digs += 1 else: un += 1 if (((digs == 0 and un == 0 and t.next0_.chars.is_capital_upper)) or prev.kind == TransportKind.SHIP or prev.kind == TransportKind.SPACE): return TransItemToken._new2526( br.begin_token, br.end_token, TransItemToken.Typs.NAME, s) if (digs > 0 and (chars_ < 5)): return TransItemToken._new2526( br.begin_token, br.end_token, TransItemToken.Typs.MODEL, s.replace(" ", "")) if (prev is not None and (((prev.typ == TransItemToken.Typs.NOUN or prev.typ == TransItemToken.Typs.BRAND or prev.typ == TransItemToken.Typs.NAME) or prev.typ == TransItemToken.Typs.MODEL))): 
tit = TransItemToken.__attachModel( t, prev.typ != TransItemToken.Typs.NAME, prev) if (tit is not None): return tit if (((prev is not None and prev.typ == TransItemToken.Typs.NOUN and prev.kind == TransportKind.AUTO) and (isinstance(t, TextToken)) and t.chars.is_letter) and not t.chars.is_all_lower and (t.whitespaces_before_count < 2)): pt = t.kit.processReferent("PERSON", t) if (pt is None): tit = TransItemToken._new2529(t, t, TransItemToken.Typs.BRAND) tit.value = (t).term return tit if (((prev is not None and prev.typ == TransItemToken.Typs.NOUN and ((prev.kind == TransportKind.SHIP or prev.kind == TransportKind.SPACE)))) or after_conj): if (t.chars.is_capital_upper): ok = True npt = NounPhraseHelper.tryParse(t, NounPhraseParseAttr.NO, 0) if (npt is not None and len(npt.adjectives) > 0): ok = False else: rt = t.kit.processReferent("PERSON", t) if (rt is not None): ok = False if (t.getMorphClassInDictionary().is_proper_surname): if (not t.morph.case_.is_nominative): ok = False if (ok): t1 = t tt = t.next0_ while tt is not None: if (tt.whitespaces_before_count > 1): break if (tt.chars != t.chars): break tit = TransItemToken.tryParse(tt, None, False, False) if ((tit) is not None): break t1 = tt tt = tt.next0_ s = MiscHelper.getTextValue(t, t1, GetTextAttr.NO) if (s is not None): res1 = TransItemToken._new2530( t, t1, TransItemToken.Typs.NAME, True, s) if (not t1.is_newline_after): br = BracketHelper.tryParse( t1.next0_, BracketParseAttr.NO, 100) if (br is not None): res1.end_token = br.end_token res1.alt_value = res1.value res1.value = MiscHelper.getTextValueOfMetaToken( br, GetTextAttr.NO) return res1 return None
def __compareForms(self, x: 'MorphWordForm', y: 'MorphWordForm') -> int:
    """Rank two word forms as candidates for the lemma of this token.

    The forms are compared by their normal form (``normal_full`` when set,
    otherwise ``normal_case``), their morphological class, dictionary
    membership, grammatical number and the length of the normal form.

    Args:
        x: first word form.
        y: second word form.

    Returns:
        A negative number when ``x`` is the better candidate, a positive
        number when ``y`` is, and 0 when neither is preferred.
    """
    norm_x = Utils.ifNotNull(x.normal_full, x.normal_case)
    norm_y = Utils.ifNotNull(y.normal_full, y.normal_case)
    if norm_x == norm_y:
        return 0
    # A form without any normal value always loses.
    if Utils.isNullOrEmpty(norm_x):
        return 1
    if Utils.isNullOrEmpty(norm_y):
        return -1

    # -1 when x's normal form is shorter, 1 when it is longer, 0 otherwise.
    shorter_first = (len(norm_x) > len(norm_y)) - (len(norm_x) < len(norm_y))
    cls_x = x.class0_
    cls_y = y.class0_

    # Typical surname endings win over non-surnames unless the token is
    # written entirely in lower case.
    if cls_x.is_proper_surname and not self.char_info.is_all_lower:
        if (LanguageHelper.endsWithEx(norm_x, "ОВ", "ЕВ", "ИН", None)
                and not cls_y.is_proper_surname):
            return -1
    if cls_y.is_proper_surname and not self.char_info.is_all_lower:
        if LanguageHelper.endsWithEx(norm_y, "ОВ", "ЕВ", "ИН", None):
            if not cls_x.is_proper_surname:
                return 1
            # Both are surnames: here the longer normal form is preferred.
            return -shorter_first

    if cls_x == cls_y:
        if cls_x.is_adjective:
            x_ends_j = norm_x[-1] == 'Й'
            y_ends_j = norm_y[-1] == 'Й'
            if x_ends_j != y_ends_j:
                return -1 if x_ends_j else 1
            x_ends_oj = LanguageHelper.endsWith(norm_x, "ОЙ")
            y_ends_oj = LanguageHelper.endsWith(norm_y, "ОЙ")
            if x_ends_oj != y_ends_oj:
                return 1 if x_ends_oj else -1
        if cls_x.is_noun:
            if (x.number == MorphNumber.SINGULAR
                    and y.number == MorphNumber.PLURAL
                    and len(norm_x) <= (len(norm_y) + 1)):
                return -1
            if (x.number == MorphNumber.PLURAL
                    and y.number == MorphNumber.SINGULAR
                    and len(norm_x) >= (len(norm_y) - 1)):
                return 1
        return shorter_first

    # From here on the two classes differ; decide on x's class first.
    if cls_x.is_adverb:
        return 1
    if cls_x.is_noun and x.is_in_dictionary:
        # A dictionary adjective beats the noun unless it carries the
        # "к.ф." attribute (presumably "short form" - confirm).
        if (cls_y.is_adjective and y.is_in_dictionary
                and "к.ф." not in y.misc.attrs):
            return 1
        return -1
    if cls_x.is_adjective:
        if not x.is_in_dictionary and cls_y.is_noun and y.is_in_dictionary:
            return 1
        return -1
    if cls_x.is_verb:
        if cls_y.is_noun or cls_y.is_adjective or cls_y.is_preposition:
            return 1
        return -1

    # x's class gave no verdict; decide on y's class.
    if cls_y.is_adverb:
        return -1
    if cls_y.is_noun and y.is_in_dictionary:
        return 1
    if cls_y.is_adjective:
        if (cls_x.is_noun or cls_x.is_proper_secname) and x.is_in_dictionary:
            return -1
        if (cls_x.is_noun and not y.is_in_dictionary
                and len(norm_x) < len(norm_y)):
            return -1
        return 1
    if cls_y.is_verb:
        if (cls_x.is_noun or cls_x.is_adjective or cls_x.is_preposition
                or cls_x.is_proper):
            return -1
        return 1
    return shorter_first