def copy_from(self, src: 'MorphBaseInfo') -> None:
    """Copy the morphological attributes of ``src`` into this object.

    The class, case and language objects are not shared with ``src``: a new
    MorphClass / MorphCase / MorphLang is created for each one and only its
    ``value`` field is carried over.  Gender and number are assigned as-is.

    Args:
        src: object whose ``class0_``, ``gender``, ``number``, ``case_`` and
            ``language`` attributes are read.
    """
    # Part-of-speech class: fresh wrapper, same underlying value.
    class_copy = MorphClass()
    class_copy.value = src.class0_.value
    self.class0_ = class_copy

    # Gender and number are taken over directly.
    self.gender = src.gender
    self.number = src.number

    # Grammatical case: fresh wrapper, same underlying value.
    case_copy = MorphCase()
    case_copy.value = src.case_.value
    self.case_ = case_copy

    # Language: fresh wrapper, same underlying value.
    lang_copy = MorphLang()
    lang_copy.value = src.language.value
    self.language = lang_copy
def run(self, text: str, only_tokenizing: bool, dlang: 'MorphLang',
        good_text: bool, progress: EventHandler) -> typing.List['MorphToken']:
    """Split ``text`` into MorphToken objects and attach morphology to them.

    The method works in consecutive passes over the same token list:

    1. scan the characters, cut them into tokens and collect per-language
       word counts;
    2. pick a default language from those counts (seeded from ``dlang``);
    3. analyse every distinct word term once and hand the resulting word
       forms to all tokens that share the term;
    4. unless ``good_text``: glue tokens that were split by a quote,
       an apostrophe, a digit written instead of a letter, a hyphen or a
       line break;
    5. fill ``char_info`` (letter / script / case flags) for every token;
    6. unless ``good_text`` or ``only_tokenizing``: several clean-up passes
       (single Latin letters inside Cyrillic text, surname variants,
       missing ``normal_case``, Latin letter + quote + word, double hyphens).

    Args:
        text: source string; token ``begin_char`` / ``end_char`` are
            inclusive indexes into it.
        only_tokenizing: when true no dictionary wrappers are created, so
            every token ends up with an empty word-form list, and the method
            returns right after the ``char_info`` pass.
        dlang: optional default language; when not None its ``value`` seeds
            the default language, which the word statistics may override.
        good_text: passed to TextWrapper; when true the terms are the raw
            substrings (no correction), the token-merging pass and the case
            flags are skipped and the method returns after ``char_info``.
        progress: optional handler forwarded to ``self.__on_progress`` once
            per analysed distinct term.

    Returns:
        The list of tokens, or None when ``text`` is null or empty.
    """
    # NOTE(review): this block arrived with its line structure collapsed; the
    # nesting below was reconstructed from the statement sequence. Diff it
    # against a known-good copy before merging.
    if (Utils.isNullOrEmpty(text)):
        return None
    twr = TextWrapper(text, good_text)
    twrch = twr.chars
    res = list()
    # term -> UniLexWrap, one entry per distinct type-1 term.
    uni_lex = dict()
    i = 0
    j = 0
    # Last type-1 term seen; passed to transliteral_correction as context.
    term0 = None
    # pure_*: terms whose detected language is exactly that language.
    pure_rus_words = 0
    pure_ukr_words = 0
    pure_by_words = 0
    pure_kz_words = 0
    # tot_*: terms whose detected language mask includes that language.
    tot_rus_words = 0
    tot_ukr_words = 0
    tot_by_words = 0
    tot_kz_words = 0
    # ---- Pass 1: tokenization -------------------------------------------
    # Emulated C-style "for (i = 0; i < twr.length; i++)" so that
    # ``continue`` still performs the increment.
    i = 0
    first_pass2978 = True
    while True:
        if first_pass2978:
            first_pass2978 = False
        else:
            i += 1
        if (not (i < twr.length)):
            break
        ty = self._get_char_typ(twrch[i])
        # Type 0 characters produce no token.
        if (ty == 0):
            continue
        if (ty > 2):
            # Types above 2 always form a single-character token.
            j = (i + 1)
        else:
            # Types 1 and 2 are grouped into a run of the same type.
            j = (i + 1)
            while j < twr.length:
                if (self._get_char_typ(twrch[j]) != ty):
                    break
                j += 1
        # Equivalent to text[i:j].
        wstr = text[i:i + j - i]
        term = None
        if (good_text):
            term = wstr
        else:
            trstr = LanguageHelper.transliteral_correction(
                wstr, term0, False)
            term = LanguageHelper.correct_word(trstr)
        if (Utils.isNullOrEmpty(term)):
            # Skip the whole run; the loop header adds the final +1.
            i = (j - 1)
            continue
        lang = LanguageHelper._get_word_lang(term)
        if (lang == MorphLang.UA):
            pure_ukr_words += 1
        elif (lang == MorphLang.RU):
            pure_rus_words += 1
        elif (lang == MorphLang.BY):
            pure_by_words += 1
        elif (lang == MorphLang.KZ):
            pure_kz_words += 1
        if ((((lang) & MorphLang.RU)) != MorphLang.UNKNOWN):
            tot_rus_words += 1
        if ((((lang) & MorphLang.UA)) != MorphLang.UNKNOWN):
            tot_ukr_words += 1
        if ((((lang) & MorphLang.BY)) != MorphLang.UNKNOWN):
            tot_by_words += 1
        if ((((lang) & MorphLang.KZ)) != MorphLang.UNKNOWN):
            tot_kz_words += 1
        # Only type-1 tokens are remembered as context and looked up in
        # uni_lex (presumably type 1 means "letters" - confirm against
        # _get_char_typ).
        if (ty == 1):
            term0 = term
        lemmas = None
        if (ty == 1 and not only_tokenizing):
            wraplemmas1 = RefOutArgWrapper(None)
            inoutres2 = Utils.tryGetValue(uni_lex, term, wraplemmas1)
            lemmas = wraplemmas1.value
            if (not inoutres2):
                # First occurrence of this term: register a new wrapper.
                nuni = UniLexWrap(lang)
                uni_lex[term] = nuni
                lemmas = nuni
        tok = MorphToken()
        tok.term = term
        tok.begin_char = i
        # NOTE(review): no-op; looks like a leftover debugger hook.
        if (i == 733860):
            pass
        tok.end_char = (j - 1)
        # The shared wrapper travels in ``tag`` until word forms are
        # distributed to the tokens further below.
        tok.tag = (lemmas)
        res.append(tok)
        i = (j - 1)
    # ---- Pass 2: choose the default language ----------------------------
    def_lang = MorphLang()
    if (dlang is not None):
        def_lang.value = dlang.value
    # A strict majority wins; "pure" counts are tried before "total" counts,
    # in the order RU, UA, KZ, BY.
    if (pure_rus_words > pure_ukr_words
            and pure_rus_words > pure_by_words
            and pure_rus_words > pure_kz_words):
        def_lang = MorphLang.RU
    elif (tot_rus_words > tot_ukr_words
            and tot_rus_words > tot_by_words
            and tot_rus_words > tot_kz_words):
        def_lang = MorphLang.RU
    elif (pure_ukr_words > pure_rus_words
            and pure_ukr_words > pure_by_words
            and pure_ukr_words > pure_kz_words):
        def_lang = MorphLang.UA
    elif (tot_ukr_words > tot_rus_words
            and tot_ukr_words > tot_by_words
            and tot_ukr_words > tot_kz_words):
        def_lang = MorphLang.UA
    elif (pure_kz_words > pure_rus_words
            and pure_kz_words > pure_ukr_words
            and pure_kz_words > pure_by_words):
        def_lang = MorphLang.KZ
    elif (tot_kz_words > tot_rus_words
            and tot_kz_words > tot_ukr_words
            and tot_kz_words > tot_by_words):
        def_lang = MorphLang.KZ
    elif (pure_by_words > pure_rus_words
            and pure_by_words > pure_ukr_words
            and pure_by_words > pure_kz_words):
        def_lang = MorphLang.BY
    elif (tot_by_words > tot_rus_words
            and tot_by_words > tot_ukr_words
            and tot_by_words > tot_kz_words):
        # A BY lead on total counts needs a clear margin over RU.
        if (tot_rus_words > 10 and tot_by_words > (tot_rus_words + 20)):
            def_lang = MorphLang.BY
        elif (tot_rus_words == 0 or tot_by_words >= (tot_rus_words * 2)):
            def_lang = MorphLang.BY
    if (((def_lang.is_undefined or def_lang.is_ua))
            and tot_rus_words > 0):
        if (((tot_ukr_words > tot_rus_words
                and self.__m_engine_ua.language.is_ua))
                or ((tot_by_words > tot_rus_words
                     and self.__m_engine_by.language.is_by))
                or ((tot_kz_words > tot_rus_words
                     and self.__m_engine_kz.language.is_kz))):
            # Another language outnumbers RU: analyse a sample of the
            # distinct terms and recount from the languages of their forms.
            cou0 = 0
            tot_kz_words = 0
            tot_ukr_words = tot_kz_words
            tot_by_words = tot_ukr_words
            tot_rus_words = tot_by_words
            for kp in uni_lex.items():
                lang = MorphLang()
                wraplang3 = RefOutArgWrapper(lang)
                kp[1].word_forms = self.__process_one_word(
                    kp[0], wraplang3)
                lang = wraplang3.value
                if (kp[1].word_forms is not None):
                    for wf in kp[1].word_forms:
                        lang |= wf.language
                kp[1].lang = lang
                if (lang.is_ru):
                    tot_rus_words += 1
                if (lang.is_ua):
                    tot_ukr_words += 1
                if (lang.is_by):
                    tot_by_words += 1
                if (lang.is_kz):
                    tot_kz_words += 1
                if (lang.is_cyrillic):
                    cou0 += 1
                # The sample stops once 100 Cyrillic terms were seen.
                if (cou0 >= 100):
                    break
            if (tot_rus_words > ((math.floor(tot_by_words / 2)))
                    and tot_rus_words > ((math.floor(tot_ukr_words / 2)))):
                def_lang = MorphLang.RU
            elif (tot_ukr_words > ((math.floor(tot_rus_words / 2)))
                    and tot_ukr_words > ((math.floor(tot_by_words / 2)))):
                def_lang = MorphLang.UA
            elif (tot_by_words > ((math.floor(tot_rus_words / 2)))
                    and tot_by_words > ((math.floor(tot_ukr_words / 2)))):
                def_lang = MorphLang.BY
        # NOTE(review): attached here to the engine-availability test, not
        # to the sampling chain above - confirm against a formatted copy.
        elif (def_lang.is_undefined):
            def_lang = MorphLang.RU
    # ---- Pass 3: analyse every distinct term ----------------------------
    # All uni_lex entries are processed here, including any that the
    # sampling loop above already filled in.
    cou = 0
    tot_kz_words = 0
    tot_ukr_words = tot_kz_words
    tot_by_words = tot_ukr_words
    tot_rus_words = tot_by_words
    for kp in uni_lex.items():
        lang = def_lang
        if (lang.is_undefined):
            # No default language: use the one leading in the running
            # counts of the terms analysed so far.
            if (tot_rus_words > tot_by_words
                    and tot_rus_words > tot_ukr_words
                    and tot_rus_words > tot_kz_words):
                lang = MorphLang.RU
            elif (tot_ukr_words > tot_rus_words
                    and tot_ukr_words > tot_by_words
                    and tot_ukr_words > tot_kz_words):
                lang = MorphLang.UA
            elif (tot_by_words > tot_rus_words
                    and tot_by_words > tot_ukr_words
                    and tot_by_words > tot_kz_words):
                lang = MorphLang.BY
            elif (tot_kz_words > tot_rus_words
                    and tot_kz_words > tot_ukr_words
                    and tot_kz_words > tot_by_words):
                lang = MorphLang.KZ
        # The wrapper is an in/out argument: the language is read back
        # from it after the call.
        wraplang4 = RefOutArgWrapper(lang)
        kp[1].word_forms = self.__process_one_word(kp[0], wraplang4)
        lang = wraplang4.value
        kp[1].lang = lang
        if ((((lang) & MorphLang.RU)) != MorphLang.UNKNOWN):
            tot_rus_words += 1
        if ((((lang) & MorphLang.UA)) != MorphLang.UNKNOWN):
            tot_ukr_words += 1
        if ((((lang) & MorphLang.BY)) != MorphLang.UNKNOWN):
            tot_by_words += 1
        if ((((lang) & MorphLang.KZ)) != MorphLang.UNKNOWN):
            tot_kz_words += 1
        if (progress is not None):
            self.__on_progress(cou, len(uni_lex), progress)
        cou += 1
    # ---- Pass 4: hand the word forms over to the tokens -----------------
    # Tokens without forms all share ONE empty list object; later passes
    # copy ``word_forms`` with list(...) before mutating it.
    empty_list = None
    for r in res:
        uni = Utils.asObjectOrNull(r.tag, UniLexWrap)
        r.tag = None
        if (uni is None or uni.word_forms is None
                or len(uni.word_forms) == 0):
            if (empty_list is None):
                empty_list = list()
            r.word_forms = empty_list
            if (uni is not None):
                r.language = uni.lang
        else:
            r.word_forms = uni.word_forms
    # ---- Pass 5: glue tokens that were split by punctuation -------------
    if (not good_text):
        # Looks at the window res[i], res[i + 1], res[i + 2]; merged
        # tokens are written into the left token and the others deleted.
        i = 0
        first_pass2979 = True
        while True:
            if first_pass2979:
                first_pass2979 = False
            else:
                i += 1
            if (not (i < (len(res) - 2))):
                break
            ui0 = twrch[res[i].begin_char]
            ui1 = twrch[res[i + 1].begin_char]
            ui2 = twrch[res[i + 2].begin_char]
            if (ui1.is_quot):
                p = res[i + 1].begin_char
                if ((p >= 2 and "БбТт".find(text[p - 1]) >= 0
                        and ((p + 3) < len(text)))
                        and "ЕеЯяЁё".find(text[p + 1]) >= 0):
                    # Quote between such letters: try the word with a hard
                    # sign in place of the quote and keep it only if the
                    # first form is in the dictionary.
                    wstr = LanguageHelper.transliteral_correction(
                        LanguageHelper.correct_word("{0}Ъ{1}".format(
                            res[i].get_source_text(text),
                            res[i + 2].get_source_text(text))), None, False)
                    li = self.__process_one_word0(wstr)
                    if (li is not None and len(li) > 0
                            and li[0].is_in_dictionary):
                        res[i].end_char = res[i + 2].end_char
                        res[i].term = wstr
                        res[i].word_forms = li
                        del res[i + 1:i + 1 + 2]
                elif ((ui1.is_apos and p > 0 and str.isalpha(text[p - 1]))
                        and ((p + 1) < len(text))
                        and str.isalpha(text[p + 1])):
                    # Apostrophe between letters: joined only when UA is
                    # the default language or the language of a neighbour.
                    if (def_lang == MorphLang.UA
                            or (((res[i].language) & MorphLang.UA))
                            != MorphLang.UNKNOWN
                            or (((res[i + 2].language) & MorphLang.UA))
                            != MorphLang.UNKNOWN):
                        wstr = LanguageHelper.transliteral_correction(
                            LanguageHelper.correct_word("{0}{1}".format(
                                res[i].get_source_text(text),
                                res[i + 2].get_source_text(text))),
                            None, False)
                        li = self.__process_one_word0(wstr)
                        # Always true: the join does not depend on the
                        # dictionary lookup result.
                        okk = True
                        if (okk):
                            res[i].end_char = res[i + 2].end_char
                            res[i].term = wstr
                            if (li is None):
                                li = list()
                            if (li is not None and len(li) > 0):
                                res[i].language = li[0].language
                            res[i].word_forms = li
                            del res[i + 1:i + 1 + 2]
            elif (((ui1.uni_char == '3' or ui1.uni_char == '4'))
                    and res[i + 1].length == 1):
                # A lone digit 3 / 4 is read as the look-alike Cyrillic
                # letter and glued to directly adjacent Cyrillic tokens.
                src = ("З" if ui1.uni_char == '3' else "Ч")
                i0 = i + 1
                if ((res[i].end_char + 1) == res[i + 1].begin_char
                        and ui0.is_cyrillic):
                    i0 -= 1
                    src = (res[i0].get_source_text(text) + src)
                i1 = i + 1
                if ((res[i + 1].end_char + 1) == res[i + 2].begin_char
                        and ui2.is_cyrillic):
                    i1 += 1
                    src += res[i1].get_source_text(text)
                if (len(src) > 2):
                    wstr = LanguageHelper.transliteral_correction(
                        LanguageHelper.correct_word(src), None, False)
                    li = self.__process_one_word0(wstr)
                    if (li is not None and len(li) > 0
                            and li[0].is_in_dictionary):
                        res[i0].end_char = res[i1].end_char
                        res[i0].term = wstr
                        res[i0].word_forms = li
                        # Removes the (i1 - i0) tokens after res[i0].
                        del res[i0 + 1:i0 + 1 + i1 - i0]
            elif ((ui1.is_hiphen and ui0.is_letter and ui2.is_letter)
                    and res[i].end_char > res[i].begin_char
                    and res[i + 2].end_char > res[i + 2].begin_char):
                # Hyphen between two letter tokens, each longer than one
                # character. ``newline`` doubles as "join the parts".
                newline = False
                # Counts whitespace (line breaks included) after the hyphen.
                sps = 0
                j = (res[i + 1].end_char + 1)
                while j < res[i + 2].begin_char:
                    if (text[j] == '\r' or text[j] == '\n'):
                        newline = True
                        sps += 1
                    elif (not Utils.isWhitespace(text[j])):
                        break
                    else:
                        sps += 1
                    j += 1
                full_word = LanguageHelper.correct_word(
                    res[i].get_source_text(text)
                    + res[i + 2].get_source_text(text))
                if (not newline):
                    if (full_word in uni_lex or full_word == "ИЗЗА"):
                        # The joined word occurs elsewhere in the text.
                        newline = True
                    elif (text[res[i + 1].begin_char] == (chr(0x00AD))):
                        # U+00AD is the soft hyphen.
                        newline = True
                    elif (LanguageHelper.ends_with_ex(
                            res[i].get_source_text(text),
                            "О", "о", None, None)
                            and len(res[i + 2].word_forms) > 0
                            and res[i + 2].word_forms[0].is_in_dictionary):
                        # Left part ends with "o" and the right part is a
                        # dictionary word: join only for the '¬' character
                        # and only if the joined word is in the dictionary.
                        if (text[res[i + 1].begin_char] == '¬'):
                            li = self.__process_one_word0(full_word)
                            if (li is not None and len(li) > 0
                                    and li[0].is_in_dictionary):
                                newline = True
                    elif ((res[i].end_char + 2) == res[i + 2].begin_char):
                        # No gap on either side of the hyphen.
                        if (not str.isupper(text[res[i + 2].begin_char])
                                and (sps < 2) and len(full_word) > 4):
                            newline = True
                            # Not when another hyphen follows the right part.
                            if ((i + 3) < len(res)):
                                ui3 = twrch[res[i + 3].begin_char]
                                if (ui3.is_hiphen):
                                    newline = False
                    elif (((res[i].end_char + 1) == res[i + 1].begin_char
                            and sps > 0 and (sps < 3))
                            and len(full_word) > 4):
                        newline = True
                if (newline):
                    li = self.__process_one_word0(full_word)
                    if (li is not None and len(li) > 0
                            and ((li[0].is_in_dictionary
                                  or full_word in uni_lex))):
                        res[i].end_char = res[i + 2].end_char
                        res[i].term = full_word
                        res[i].word_forms = li
                        del res[i + 1:i + 1 + 2]
                else:
                    pass
            elif ((ui1.is_letter and ui0.is_letter and res[i].length > 2)
                    and res[i + 1].length > 1):
                # Two Cyrillic letter tokens of the same case separated by
                # a line break: join if the joined word was already seen
                # in the text and is in the dictionary.
                if (ui0.is_upper != ui1.is_upper):
                    continue
                if (not ui0.is_cyrillic or not ui1.is_cyrillic):
                    continue
                newline = False
                j = (res[i].end_char + 1)
                while j < res[i + 1].begin_char:
                    if (twrch[j].code == 0xD or twrch[j].code == 0xA):
                        newline = True
                        break
                    j += 1
                if (not newline):
                    continue
                full_word = LanguageHelper.correct_word(
                    res[i].get_source_text(text)
                    + res[i + 1].get_source_text(text))
                if (not full_word in uni_lex):
                    continue
                li = self.__process_one_word0(full_word)
                if (li is not None and len(li) > 0
                        and li[0].is_in_dictionary):
                    res[i].end_char = res[i + 1].end_char
                    res[i].term = full_word
                    res[i].word_forms = li
                    del res[i + 1]
    # ---- Pass 6: character info for every token -------------------------
    i = 0
    first_pass2980 = True
    while True:
        if first_pass2980:
            first_pass2980 = False
        else:
            i += 1
        if (not (i < len(res))):
            break
        mt = res[i]
        mt.char_info = CharsInfo()
        # ui0 comes from the source text, ui00 from the first character of
        # the (possibly corrected) term.
        ui0 = twrch[mt.begin_char]
        ui00 = UnicodeInfo.ALL_CHARS[ord((mt.term[0]))]
        # Advance ui0 to the first letter inside the token, if there is one.
        j = (mt.begin_char + 1)
        while j <= mt.end_char:
            if (ui0.is_letter):
                break
            ui0 = twrch[j]
            j += 1
        if (ui0.is_letter):
            mt.char_info.is_letter = True
            if (ui00.is_latin):
                mt.char_info.is_latin_letter = True
            elif (ui00.is_cyrillic):
                mt.char_info.is_cyrillic_letter = True
            if (mt.language == MorphLang.UNKNOWN):
                if (LanguageHelper.is_cyrillic(mt.term)):
                    mt.language = (MorphLang.RU if def_lang.is_undefined
                                   else def_lang)
            # Case flags are not computed for good_text.
            if (good_text):
                continue
            # Digits are counted together with upper-case characters.
            all_up = True
            all_lo = True
            j = mt.begin_char
            while j <= mt.end_char:
                if (twrch[j].is_upper or twrch[j].is_digit):
                    all_lo = False
                else:
                    all_up = False
                j += 1
            if (all_up):
                mt.char_info.is_all_upper = True
            elif (all_lo):
                mt.char_info.is_all_lower = True
            elif (((ui0.is_upper or twrch[mt.begin_char].is_digit))
                    and mt.end_char > mt.begin_char):
                # Upper/digit start: capitalised if nothing after the first
                # character is upper-case or a digit.
                all_lo = True
                j = (mt.begin_char + 1)
                while j <= mt.end_char:
                    if (twrch[j].is_upper or twrch[j].is_digit):
                        all_lo = False
                        break
                    j += 1
                if (all_lo):
                    mt.char_info.is_capital_upper = True
                elif (twrch[mt.end_char].is_lower
                        and (mt.end_char - mt.begin_char) > 1):
                    # Last character lower-case and no lower-case character
                    # before it (token of at least three characters).
                    all_up = True
                    j = mt.begin_char
                    while j < mt.end_char:
                        if (twrch[j].is_lower):
                            all_up = False
                            break
                        j += 1
                    if (all_up):
                        mt.char_info.is_last_lower = True
        if (mt.char_info.is_last_lower and mt.length > 2
                and mt.char_info.is_cyrillic_letter):
            # Equivalent to text[begin_char:end_char]: the token without
            # its final (lower-case) character.
            pref = text[mt.begin_char:
                        mt.begin_char + mt.end_char - mt.begin_char]
            ok = False
            for wf in mt.word_forms:
                if (wf.normal_case == pref or wf.normal_full == pref):
                    ok = True
                    break
            if (not ok):
                # No form normalises to the prefix: put a noun form for it
                # first. The list is copied because it may be shared.
                wf0 = MorphWordForm._new5(pref, MorphClass.NOUN, 1)
                mt.word_forms = list(mt.word_forms)
                mt.word_forms.insert(0, wf0)
    if (good_text or only_tokenizing):
        return res
    # ---- Pass 7: single Latin letters C / A / P next to Cyrillic text ---
    i = 0
    first_pass2981 = True
    while True:
        if first_pass2981:
            first_pass2981 = False
        else:
            i += 1
        if (not (i < len(res))):
            break
        if (res[i].length == 1 and res[i].char_info.is_latin_letter):
            ch = res[i].term[0]
            if (ch == 'C' or ch == 'A' or ch == 'P'):
                pass
            else:
                continue
            is_rus = False
            # Walk left over tokens that touch each other (no gap) up to
            # the nearest letter token.
            for ii in range(i - 1, -1, -1):
                if ((res[ii].end_char + 1) != res[ii + 1].begin_char):
                    break
                elif (res[ii].char_info.is_letter):
                    is_rus = res[ii].char_info.is_cyrillic_letter
                    break
            if (not is_rus):
                # Same search to the right.
                ii = i + 1
                while ii < len(res):
                    if ((res[ii - 1].end_char + 1) != res[ii].begin_char):
                        break
                    elif (res[ii].char_info.is_letter):
                        is_rus = res[ii].char_info.is_cyrillic_letter
                        break
                    ii += 1
            if (is_rus):
                res[i].term = LanguageHelper.transliteral_correction(
                    res[i].term, None, True)
                res[i].char_info.is_cyrillic_letter = True
                # NOTE(review): is_latin_letter is left True as well -
                # confirm this is intended.
                res[i].char_info.is_latin_letter = True
    # ---- Pass 8: surname variants for capitalised Cyrillic tokens -------
    for r in res:
        if (r.char_info.is_all_upper or r.char_info.is_capital_upper):
            if (r.language.is_cyrillic):
                ok = False
                for wf in r.word_forms:
                    if (wf.class0_.is_proper_surname):
                        ok = True
                        break
                if (not ok):
                    # Copy first: the list may be shared between tokens.
                    r.word_forms = list(r.word_forms)
                    self.__m_engine_ru.process_surname_variants(
                        r.term, r.word_forms)
    # ---- Pass 9: default normal_case to the token term ------------------
    for r in res:
        for mv in r.word_forms:
            if (mv.normal_case is None):
                mv.normal_case = r.term
    # ---- Pass 10: upper-case Latin letter + quote + Latin word ----------
    i = 0
    while i < (len(res) - 2):
        if (res[i].char_info.is_latin_letter
                and res[i].char_info.is_all_upper
                and res[i].length == 1):
            if (twrch[res[i + 1].begin_char].is_quot
                    and res[i + 2].char_info.is_latin_letter
                    and res[i + 2].length > 2):
                # All three tokens must touch each other.
                if ((res[i].end_char + 1) == res[i + 1].begin_char
                        and (res[i + 1].end_char + 1)
                        == res[i + 2].begin_char):
                    wstr = "{0}{1}".format(res[i].term, res[i + 2].term)
                    li = self.__process_one_word0(wstr)
                    if (li is not None):
                        res[i].word_forms = li
                    res[i].end_char = res[i + 2].end_char
                    res[i].term = wstr
                    # Case flags of the merged token follow the right part.
                    if (res[i + 2].char_info.is_all_lower):
                        res[i].char_info.is_all_upper = False
                        res[i].char_info.is_capital_upper = True
                    elif (not res[i + 2].char_info.is_all_upper):
                        res[i].char_info.is_all_upper = False
                    del res[i + 1:i + 1 + 2]
        i += 1
    # ---- Pass 11: merge exactly two adjacent hyphens into one token -----
    i = 0
    first_pass2982 = True
    while True:
        if first_pass2982:
            first_pass2982 = False
        else:
            i += 1
        if (not (i < (len(res) - 1))):
            break
        if (not res[i].char_info.is_letter
                and not res[i + 1].char_info.is_letter
                and (res[i].end_char + 1) == res[i + 1].begin_char):
            if (twrch[res[i].begin_char].is_hiphen
                    and twrch[res[i + 1].begin_char].is_hiphen):
                # Skip when the pair is part of a longer run of hyphens.
                if (i == 0 or not twrch[res[i - 1].begin_char].is_hiphen):
                    pass
                else:
                    continue
                if ((i + 2) == len(res)
                        or not twrch[res[i + 2].begin_char].is_hiphen):
                    pass
                else:
                    continue
                res[i].end_char = res[i + 1].end_char
                del res[i + 1]
    return res