def __getAttr(self, i: int) -> bool:
    """Return attribute bit *i* of this token, computing the bits lazily.

    Bit 0 marks "already computed". Bits set while scanning the gap to the
    previous token: 1 = whitespace before, 3 = newline (CR/LF/FF) before.
    Bits set while scanning the gap to the next token: 2 = whitespace after,
    4 = newline after. A missing neighbour counts as both whitespace and
    newline on that side.
    """
    if ((((self.__m_attrs) & 1)) == 0):
        # Not computed yet: mark as computed, then scan both gaps once.
        self.__m_attrs = (1)
        if (self._m_previous is None):
            # No previous token: treat start-of-text as whitespace + newline.
            self._setAttr(1, True)
            self._setAttr(3, True)
        else:
            # Scan the characters between the previous token and this one.
            j = self._m_previous.end_char + 1
            while j < self.begin_char:
                ch = self.kit.sofa.text[j]
                if (Utils.isWhitespace((ch))):
                    self._setAttr(1, True)
                    # CR, LF or form feed upgrades "whitespace" to "newline".
                    if ((ord(ch)) == 0xD or (ord(ch)) == 0xA or ch == '\f'):
                        self._setAttr(3, True)
                j += 1
        if (self._m_next is None):
            # No next token: treat end-of-text as whitespace + newline.
            self._setAttr(2, True)
            self._setAttr(4, True)
        else:
            # Scan the characters between this token and the next one.
            j = self.end_char + 1
            while j < self._m_next.begin_char:
                ch = self.kit.sofa.text[j]
                if (Utils.isWhitespace(ch)):
                    self._setAttr(2, True)
                    if ((ord(ch)) == 0xD or (ord(ch)) == 0xA or ch == '\f'):
                        self._setAttr(4, True)
                j += 1
    # Extract the requested bit from the cached mask.
    return (((((self.__m_attrs) >> i)) & 1)) != 0
def addAbridge(self, abr: str) -> 'Abridge':
    """Register an abbreviation pattern for this termin and return it.

    The string is split into alphabetic parts; a part followed immediately by
    a non-whitespace character gets ``has_delim`` set. A single leading part
    followed by ``-`` stores the remainder (uppercased) as ``tail``.
    Returns None when *abr* does not start with a letter.

    Note: the created Abridge is appended to ``self.abridges`` before parsing
    finishes, matching the original transpiled behaviour.
    """
    # Removed dead debug leftover: `if (abr == "В/ГОР"): pass` — a no-op
    # breakpoint hook from the original C# source with no effect.
    a = Termin.Abridge()
    if (self.abridges is None):
        self.abridges = list()
    # Measure the leading alphabetic run.
    i = 0
    while i < len(abr):
        if (not str.isalpha(abr[i])):
            break
        i += 1
    if (i == 0):
        return None
    a.parts.append(Termin.AbridgePart._new604(abr[0:0 + i].upper()))
    self.abridges.append(a)
    if (((i + 1) < len(abr)) and abr[i] == '-'):
        # "XX-rest" form: everything after the hyphen is the tail.
        a.tail = abr[i + 1:].upper()
    elif (i < len(abr)):
        # First part is delimited by something other than whitespace.
        if (not Utils.isWhitespace(abr[i])):
            a.parts[0].has_delim = True
        # Collect the remaining alphabetic runs as additional parts.
        while i < len(abr):
            if (str.isalpha(abr[i])):
                j = (i + 1)
                while j < len(abr):
                    if (not str.isalpha(abr[j])):
                        break
                    j += 1
                p = Termin.AbridgePart._new604(abr[i:i + j - i].upper())
                if (j < len(abr)):
                    if (not Utils.isWhitespace(abr[j])):
                        p.has_delim = True
                a.parts.append(p)
                i = j
            i += 1
    return a
def create_number(gr : 'SemGraph', num : 'NumbersWithUnitToken') -> 'SemObject':
    """Build a SemObject noun node in *gr* from a number-with-unit token.

    Returns None when the token yields no referent tokens.
    NOTE(review): assumes the last referent is a MeasureReferent — if
    asObjectOrNull returns None here, mr.to_string would raise; confirm
    against the caller's contract.
    """
    rs = num.create_refenets_tokens_with_register(None, None, False)
    if (rs is None or len(rs) == 0):
        return None
    mr = Utils.asObjectOrNull(rs[len(rs) - 1].referent, MeasureReferent)
    sem = SemObject(gr)
    gr.objects.append(sem)
    sem.tokens.append(num)
    # The measure's textual form becomes the normal-case value.
    sem.morph.normal_case = mr.to_string(True, None, 0)
    sem.morph.normal_full = sem.morph.normal_case
    sem.typ = SemObjectType.NOUN
    sem.measure = mr.kind
    # Transpiled for-loop protocol: first_pass skips the increment once.
    i = 0
    first_pass3438 = True
    while True:
        if first_pass3438: first_pass3438 = False
        else: i += 1
        if (not (i < len(sem.morph.normal_case))): break
        ch = sem.morph.normal_case[i]
        # Skip over the numeric prefix (digits, whitespace, [].+- chars).
        if (str.isdigit(ch) or Utils.isWhitespace(ch) or "[].+-".find(ch) >= 0):
            continue
        # First non-numeric char found: split into quantity + unit name.
        sem.quantity = SemQuantity(sem.morph.normal_case[0:0 + i].strip(), num.begin_token, num.end_token)
        sem.morph.normal_case = sem.morph.normal_case[i:].strip()
        if (len(num.units) == 1 and num.units[0].unit is not None):
            sem.morph.normal_full = num.units[0].unit.fullname_cyr
            if (sem.morph.normal_full == "%"):
                sem.morph.normal_full = "процент"
        break
    sem.concept = (mr)
    return sem
def calcWhitespaceDistanceBetweenPositions(self, pos_from: int, pos_to: int) -> int:
    """Compute the weighted whitespace distance between neighbouring elements.

    Args:
        pos_from: first character position of the gap (inclusive)
        pos_to: last character position of the gap (inclusive)

    Returns:
        0 when the positions are directly adjacent; -1 when the range is
        invalid or contains a non-whitespace character; otherwise the sum
        of per-character weights (CR/LF count 10, TAB counts 5, any other
        whitespace counts 1).
    """
    if pos_from == pos_to + 1:
        return 0
    if pos_from > pos_to or pos_from < 0 or pos_to >= len(self.text):
        return -1
    total = 0
    for pos in range(pos_from, pos_to + 1):
        cur = self.text[pos]
        if not Utils.isWhitespace(cur):
            # Any visible character breaks the "pure whitespace gap" contract.
            return -1
        if cur == '\r' or cur == '\n':
            total += 10
        elif cur == '\t':
            total += 5
        else:
            total += 1
    return total
def __init__(self, t: 'Token') -> None:
    """Classify token *t* as a table control marker and measure its spans.

    Control characters used by the tokenizer: 0x1E = table start,
    0x1F = table end, 0x07 (BEL) = cell end. For a cell end, the preceding
    text decides whether it is actually a row end (preceded by CR/LF) and
    how many columns/rows the cell spans (counted from TAB/FF run before it).
    """
    self.col_span = 0
    self.row_span = 0
    self.typ = TableHelper.TableTypes.UNDEFINED
    self.src = None
    self.src = t
    if (t is None):
        return
    if (t.is_char(chr(0x1E))):
        self.typ = TableHelper.TableTypes.TABLESTART
        return
    if (t.is_char(chr(0x1F))):
        self.typ = TableHelper.TableTypes.TABLEEND
        return
    # Everything below only applies to the cell-end marker (BEL).
    if (not t.is_char(chr(7))):
        return
    txt = t.kit.sofa.text
    self.typ = TableHelper.TableTypes.CELLEND
    p = t.begin_char - 1
    if (p < 0):
        return
    # A newline right before the marker means the row ended here.
    if ((ord(txt[p])) == 0xD or (ord(txt[p])) == 0xA):
        self.typ = TableHelper.TableTypes.ROWEND
        return
    self.row_span = 1
    self.col_span = self.row_span
    # Walk back over the whitespace run: each TAB extends the column span,
    # each form feed extends the row span.
    while p >= 0:
        if (not Utils.isWhitespace(txt[p])):
            break
        elif (txt[p] == '\t'):
            self.col_span += 1
        elif (txt[p] == '\f'):
            self.row_span += 1
        p -= 1
def _initialize() -> None:
    """One-time initialisation of static lookup data.

    Fills the compass/direction termin collection and the two country-code
    maps (the embedded table matches the ISO 3166-1 alpha-2 <TAB> alpha-3
    format — each row "XX\\tXXX").
    """
    # Already initialised?
    if (MiscLocationHelper.__m_nords is not None):
        return
    MiscLocationHelper.__m_nords = TerminCollection()
    # Russian compass/direction words used to recognise geo-name prefixes.
    for s in ["СЕВЕРНЫЙ", "ЮЖНЫЙ", "ЗАПАДНЫЙ", "ВОСТОЧНЫЙ", "ЦЕНТРАЛЬНЫЙ", "БЛИЖНИЙ", "ДАЛЬНИЙ", "СРЕДНИЙ", "СЕВЕР", "ЮГ", "ЗАПАД", "ВОСТОК", "СЕВЕРО", "ЮГО", "ЗАПАДНО", "ВОСТОЧНО", "СЕВЕРОЗАПАДНЫЙ", "СЕВЕРОВОСТОЧНЫЙ", "ЮГОЗАПАДНЫЙ", "ЮГОВОСТОЧНЫЙ"]:
        MiscLocationHelper.__m_nords.add(Termin(s, MorphLang.RU, True))
    # Tab-separated 2-letter -> 3-letter country code table, one pair per line.
    table = "\nAF\tAFG\nAX\tALA\nAL\tALB\nDZ\tDZA\nAS\tASM\nAD\tAND\nAO\tAGO\nAI\tAIA\nAQ\tATA\nAG\tATG\nAR\tARG\nAM\tARM\nAW\tABW\nAU\tAUS\nAT\tAUT\nAZ\tAZE\nBS\tBHS\nBH\tBHR\nBD\tBGD\nBB\tBRB\nBY\tBLR\nBE\tBEL\nBZ\tBLZ\nBJ\tBEN\nBM\tBMU\nBT\tBTN\nBO\tBOL\nBA\tBIH\nBW\tBWA\nBV\tBVT\nBR\tBRA\nVG\tVGB\nIO\tIOT\nBN\tBRN\nBG\tBGR\nBF\tBFA\nBI\tBDI\nKH\tKHM\nCM\tCMR\nCA\tCAN\nCV\tCPV\nKY\tCYM\nCF\tCAF\nTD\tTCD\nCL\tCHL\nCN\tCHN\nHK\tHKG\nMO\tMAC\nCX\tCXR\nCC\tCCK\nCO\tCOL\nKM\tCOM\nCG\tCOG\nCD\tCOD\nCK\tCOK\nCR\tCRI\nCI\tCIV\nHR\tHRV\nCU\tCUB\nCY\tCYP\nCZ\tCZE\nDK\tDNK\nDJ\tDJI\nDM\tDMA\nDO\tDOM\nEC\tECU\nEG\tEGY\nSV\tSLV\nGQ\tGNQ\nER\tERI\nEE\tEST\nET\tETH\nFK\tFLK\nFO\tFRO\nFJ\tFJI\nFI\tFIN\nFR\tFRA\nGF\tGUF\nPF\tPYF\nTF\tATF\nGA\tGAB\nGM\tGMB\nGE\tGEO\nDE\tDEU\nGH\tGHA\nGI\tGIB\nGR\tGRC\nGL\tGRL\nGD\tGRD\nGP\tGLP\nGU\tGUM\nGT\tGTM\nGG\tGGY\nGN\tGIN\nGW\tGNB\nGY\tGUY\nHT\tHTI\nHM\tHMD\nVA\tVAT\nHN\tHND\nHU\tHUN\nIS\tISL\nIN\tIND\nID\tIDN\nIR\tIRN\nIQ\tIRQ\nIE\tIRL\nIM\tIMN\nIL\tISR\nIT\tITA\nJM\tJAM\nJP\tJPN\nJE\tJEY\nJO\tJOR\nKZ\tKAZ\nKE\tKEN\nKI\tKIR\nKP\tPRK\nKR\tKOR\nKW\tKWT\nKG\tKGZ\nLA\tLAO\nLV\tLVA\nLB\tLBN\nLS\tLSO\nLR\tLBR\nLY\tLBY\nLI\tLIE\nLT\tLTU\nLU\tLUX\nMK\tMKD\nMG\tMDG\nMW\tMWI\nMY\tMYS\nMV\tMDV\nML\tMLI\nMT\tMLT\nMH\tMHL\nMQ\tMTQ\nMR\tMRT\nMU\tMUS\nYT\tMYT\nMX\tMEX\nFM\tFSM\nMD\tMDA\nMC\tMCO\nMN\tMNG\nME\tMNE\nMS\tMSR\nMA\tMAR\nMZ\tMOZ\nMM\tMMR\nNA\tNAM\nNR\tNRU\nNP\tNPL\nNL\tNLD\nAN\tANT\nNC\tNCL\nNZ\tNZL\nNI\tNIC\nNE\tNER\nNG\tNGA\nNU\tNIU\nNF\tNFK\nMP\tMNP\nNO\tNOR\nOM\tOMN\nPK\tPAK\nPW\tPLW\nPS\tPSE\nPA\tPAN\nPG\tPNG\nPY\tPRY\nPE\tPER\nPH\tPHL\nPN\tPCN\nPL\tPOL\nPT\tPRT\nPR\tPRI\nQA\tQAT\nRE\tREU\nRO\tROU\nRU\tRUS\nRW\tRWA\nBL\tBLM\nSH\tSHN\nKN\tKNA\nLC\tLCA\nMF\tMAF\nPM\tSPM\nVC\tVCT\nWS\tWSM\nSM\tSMR\nST\tSTP\nSA\tSAU\nSN\tSEN\nRS\tSRB\nSC\tSYC\nSL\tSLE\nSG\tSGP\nSK\tSVK\nSI\tSVN\nSB\tSLB\nSO\tSOM\nZA\tZAF\nGS\tSGS\nSS\tSSD\nES\tESP\nLK\tLKA\nSD\tSDN\nSR\tSUR\nSJ\tSJM\nSZ\tSWZ\nSE\tSWE\nCH\tCHE\nSY\tSYR\nTW\tTWN\nTJ\tTJK\nTZ\tTZA\nTH\tTHA\nTL\tTLS\nTG\tTGO\nTK\tTKL\nTO\tTON\nTT\tTTO\nTN\tTUN\nTR\tTUR\nTM\tTKM\nTC\tTCA\nTV\tTUV\nUG\tUGA\nUA\tUKR\nAE\tARE\nGB\tGBR\nUS\tUSA\nUM\tUMI\nUY\tURY\nUZ\tUZB\nVU\tVUT\nVE\tVEN\nVN\tVNM\nVI\tVIR\nWF\tWLF\nEH\tESH\nYE\tYEM\nZM\tZMB\nZW\tZWE "
    for s in Utils.splitString(table, '\n', False):
        ss = s.strip()
        # Require "XX<tab>..." shape; ss[2] must be the tab separator.
        if ((len(ss) < 6) or not Utils.isWhitespace(ss[2])):
            continue
        cod2 = ss[0:0 + 2]
        cod3 = ss[3:].strip()
        if (len(cod3) != 3):
            continue
        # First occurrence wins in both directions.
        if (not cod2 in MiscLocationHelper._m_alpha2_3):
            MiscLocationHelper._m_alpha2_3[cod2] = cod3
        if (not cod3 in MiscLocationHelper._m_alpha3_2):
            MiscLocationHelper._m_alpha3_2[cod3] = cod2
def is_cyrillic(str0_: str) -> bool:
    """Return True when the string contains only Cyrillic letters, whitespace
    or hyphens; False for None or any other character."""
    if str0_ is None:
        return False
    for c in str0_:
        if LanguageHelper.is_cyrillic_char(c):
            continue
        # Whitespace and '-' are tolerated inside a Cyrillic string.
        if Utils.isWhitespace(c) or c == '-':
            continue
        return False
    return True
def newlines_before_count(self) -> int:
    """Number of line breaks immediately preceding this token.

    Scans backwards over whitespace only. A CR directly followed by LF is
    counted once (the LF is counted, the CR of the pair is skipped); a form
    feed counts as 10 breaks. Stops at the first non-whitespace character.
    """
    text = self.kit.sofa.text
    count = 0
    prev = chr(0)  # character examined on the previous (closer) iteration
    pos = self.begin_char - 1
    while pos >= 0:
        cur = text[pos]
        code = ord(cur)
        if code == 0xA:
            count += 1
        elif code == 0xD and ord(prev) != 0xA:
            # Lone CR — a CR that precedes an LF was already counted via LF.
            count += 1
        elif cur == '\f':
            count += 10
        elif not Utils.isWhitespace(cur):
            break
        prev = cur
        pos -= 1
    return count
def newlines_after_count(self) -> int:
    """Number of line breaks immediately after this token.

    (The original docstring said "before" — a copy-paste slip; the code
    scans forward from end_char.) CR followed by LF counts once; a form
    feed counts as 10; scanning stops at the first non-whitespace char.
    """
    ch0 = chr(0)
    res = 0
    txt = self.kit.sofa.text
    p = self.end_char + 1
    while p < len(txt):
        ch = txt[p]
        if ((ord(ch)) == 0xD):
            res += 1
        elif ((ord(ch)) == 0xA and (ord(ch0)) != 0xD):
            # LF not preceded by CR — the CRLF pair was counted at the CR.
            res += 1
        elif (ch == '\f'):
            res += 10
        elif (not Utils.isWhitespace(ch)):
            break
        ch0 = ch
        p += 1
    return res
def initialize() -> None:
    """Build the UnicodeInfo.ALL_CHARS table for the BMP (0x0000-0xFFFF).

    Each entry carries precomputed character-class flags used by the
    tokenizer (whitespace/digit/letter, Cyrillic/Latin, vowel, case,
    hyphen/quote/apostrophe, combining-accent range).
    """
    if (UnicodeInfo.__m_inited):
        return
    UnicodeInfo.__m_inited = True
    UnicodeInfo.ALL_CHARS = list()
    # Cyrillic vowels (incl. Ukrainian/Belarusian/Kazakh letters), both cases.
    cyrvowel = "АЕЁИОУЮЯЫЭЄІЇЎӘӨҰҮІ"
    cyrvowel += cyrvowel.lower()
    for i in range(0x10000):
        ch = chr(i)
        ui = UnicodeInfo(i)
        if (Utils.isWhitespace(ch)):
            ui.is_whitespace = True
        elif (str.isdigit(ch)):
            ui.is_digit = True
        elif (ch == 'º' or ch == '°'):
            # Degree/ordinal signs are alphabetic for str.isalpha in some
            # builds — deliberately excluded from the letter class.
            pass
        elif (str.isalpha(ch)):
            ui.is_letter = True
            if (i >= 0x400 and (i < 0x500)):
                # Cyrillic block.
                ui.is_cyrillic = True
                if (cyrvowel.find(ch) >= 0):
                    ui.is_vowel = True
            elif (i < 0x200):
                # Basic Latin + Latin supplements/extended-A.
                ui.is_latin = True
                if ("AEIOUYaeiouy".find(ch) >= 0):
                    ui.is_vowel = True
            if (str.isupper(ch)):
                ui.is_upper = True
            if (str.islower(ch)):
                ui.is_lower = True
        else:
            # Non-letter, non-digit: punctuation classes. The chain below
            # lists every dash-like character (some literals repeat — kept
            # as in the original).
            if (((((ch == '-' or ch == '–' or ch == '¬') or ch == '-' or ch == (chr(0x00AD))) or ch == (chr(0x2011)) or ch == '-') or ch == '—' or ch == '–') or ch == '−' or ch == '-'):
                ui.is_hiphen = True
            if ("\"'`“”’".find(ch) >= 0):
                ui.is_quot = True
            if ("'`’".find(ch) >= 0):
                # Apostrophes are also quotes.
                ui.is_apos = True
                ui.is_quot = True
            if (i >= 0x300 and (i < 0x370)):
                # Combining diacritics range — used for stress marks.
                ui.is_udaren = True
        UnicodeInfo.ALL_CHARS.append(ui)
def __getNameWithoutBrackets(begin: 'Token', end: 'Token', normalize_first_noun_group: bool = False, normal_first_group_single: bool = False, ignore_geo_referent: bool = False) -> str:
    """Get the string value between two tokens, excluding quotes and brackets.

    Args:
        begin(Token): first token
        end(Token): last token
        normalize_first_noun_group(bool): normalize the first noun group
            to nominative case
        normal_first_group_single(bool): also force the first noun group
            to singular
        ignore_geo_referent(bool): ignore geographic entities inside

    Returns None when the trimmed result would be empty.
    """
    res = None
    # Strip an enclosing bracket/quote pair, if present.
    if (BracketHelper.canBeStartOfSequence(begin, False, False) and BracketHelper.canBeEndOfSequence(end, False, begin, False)):
        begin = begin.next0_
        end = end.previous
    if (normalize_first_noun_group and not begin.morph.class0_.is_preposition):
        npt = NounPhraseHelper.tryParse(begin, NounPhraseParseAttr.REFERENTCANBENOUN, 0)
        if (npt is not None):
            # Reject a noun phrase whose noun is unknown and has no adjectives.
            if (npt.noun.getMorphClassInDictionary().is_undefined and len(npt.adjectives) == 0):
                npt = (None)
        if (npt is not None and npt.end_token.end_char > end.end_char):
            npt = (None)
        if (npt is not None):
            res = npt.getNormalCaseText(None, normal_first_group_single, MorphGender.UNDEFINED, False)
            te = npt.end_token.next0_
            # Handle a trailing ", <participle>" agreeing with the noun group.
            if (((te is not None and te.next0_ is not None and te.is_comma) and (isinstance(te.next0_, TextToken)) and te.next0_.end_char <= end.end_char) and te.next0_.morph.class0_.is_verb and te.next0_.morph.class0_.is_adjective):
                for it in te.next0_.morph.items:
                    if (it.gender == npt.morph.gender or (((it.gender) & (npt.morph.gender))) != (MorphGender.UNDEFINED)):
                        if (not ((it.case_) & npt.morph.case_).is_undefined):
                            if (it.number == npt.morph.number or (((it.number) & (npt.morph.number))) != (MorphNumber.UNDEFINED)):
                                var = (te.next0_).term
                                if (isinstance(it, MorphWordForm)):
                                    var = (it).normal_case
                                bi = MorphBaseInfo._new549(MorphClass.ADJECTIVE, npt.morph.gender, npt.morph.number, npt.morph.language)
                                # Put the participle into the agreeing form.
                                var = Morphology.getWordform(var, bi)
                                if (var is not None):
                                    res = "{0}, {1}".format(res, var)
                                    te = te.next0_.next0_
                                break
            # Append the remainder of the span verbatim.
            if (te is not None and te.end_char <= end.end_char):
                s = ProperNameHelper.getNameEx(te, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED, MorphGender.UNDEFINED, True, ignore_geo_referent)
                if (not Utils.isNullOrEmpty(s)):
                    if (not str.isalnum(s[0])):
                        res = "{0}{1}".format(res, s)
                    else:
                        res = "{0} {1}".format(res, s)
    elif ((isinstance(begin, TextToken)) and begin.chars.is_cyrillic_letter):
        # No noun-group normalization: normalize just the first word.
        mm = begin.getMorphClassInDictionary()
        if (not mm.is_undefined):
            res = begin.getNormalCaseText(mm, False, MorphGender.UNDEFINED, False)
            if (begin.end_char < end.end_char):
                res = "{0} {1}".format(res, ProperNameHelper.getNameEx(begin.next0_, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED, MorphGender.UNDEFINED, True, False))
    if (res is None):
        res = ProperNameHelper.getNameEx(begin, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED, MorphGender.UNDEFINED, True, ignore_geo_referent)
    # Trim trailing '*' and whitespace; empty after trimming -> None.
    if (not Utils.isNullOrEmpty(res)):
        k = 0
        i = len(res) - 1
        while i >= 0:
            if (res[i] == '*' or Utils.isWhitespace(res[i])):
                pass
            else:
                break
            i -= 1
            k += 1
        if (k > 0):
            if (k == len(res)):
                return None
            res = res[0:0 + len(res) - k]
    return res
def run(self, text: str, only_tokenizing: bool, dlang: 'MorphLang', progress: EventHandler, good_text: bool) -> typing.List['MorphToken']:
    """Perform morphological analysis of a text.

    Args:
        text(str): source text
        only_tokenizing(bool): skip dictionary lookup, only split into tokens
        dlang: default language (if unknown, it is detected from the text)
        progress: optional progress callback
        good_text(bool): text is known-clean; skip OCR-noise repairs

    Returns:
        typing.List[MorphToken]: resulting morpheme sequence (None for empty input)
    """
    if (Utils.isNullOrEmpty(text)):
        return None
    twr = TextWrapper(text, good_text)
    twrch = twr.chars
    res = list()
    uni_lex = dict()
    term0 = None
    # Per-language counters: "pure" = unambiguous words, "tot" = compatible.
    pure_rus_words = 0
    pure_ukr_words = 0
    pure_by_words = 0
    pure_kz_words = 0
    tot_rus_words = 0
    tot_ukr_words = 0
    tot_by_words = 0
    tot_kz_words = 0
    # --- Pass 1: tokenize and collect unique lexemes ---------------------
    i = 0
    first_pass2708 = True
    while True:
        if first_pass2708: first_pass2708 = False
        else: i += 1
        if (not (i < twr.length)): break
        ty = InnerMorphology._getCharTyp(twrch[i])
        if (ty == 0):
            continue
        if (ty > 2):
            # Single-char token (punctuation etc.).
            j = (i + 1)
        else:
            # Extend the run of same-typed chars (letters or digits).
            j = (i + 1)
            while j < twr.length:
                if (InnerMorphology._getCharTyp(twrch[j]) != ty):
                    break
                j += 1
        wstr = text[i:i + j - i]
        term = None
        if (good_text):
            term = wstr
        else:
            # Repair mixed Latin/Cyrillic OCR noise before normalizing.
            trstr = LanguageHelper.transliteralCorrection(wstr, term0, False)
            term = LanguageHelper.correctWord(trstr)
        if (Utils.isNullOrEmpty(term)):
            i = (j - 1)
            continue
        lang = InnerMorphology.__detectLang(twr, i, j - 1, term)
        if (lang == MorphLang.UA):
            pure_ukr_words += 1
        elif (lang == MorphLang.RU):
            pure_rus_words += 1
        elif (lang == MorphLang.BY):
            pure_by_words += 1
        elif (lang == MorphLang.KZ):
            pure_kz_words += 1
        if ((((lang) & MorphLang.RU)) != MorphLang.UNKNOWN):
            tot_rus_words += 1
        if ((((lang) & MorphLang.UA)) != MorphLang.UNKNOWN):
            tot_ukr_words += 1
        if ((((lang) & MorphLang.BY)) != MorphLang.UNKNOWN):
            tot_by_words += 1
        if ((((lang) & MorphLang.KZ)) != MorphLang.UNKNOWN):
            tot_kz_words += 1
        if (ty == 1):
            term0 = term
        lemmas = None
        if (ty == 1 and not only_tokenizing):
            # Share one UniLexWrap per distinct term.
            wraplemmas7 = RefOutArgWrapper(None)
            inoutres8 = Utils.tryGetValue(uni_lex, term, wraplemmas7)
            lemmas = wraplemmas7.value
            if (not inoutres8):
                lemmas = InnerMorphology.UniLexWrap._new6(lang)
                uni_lex[term] = lemmas
        tok = MorphToken()
        tok.term = term
        tok.begin_char = i
        if (i == 733860):
            # Debug hook position (no-op) kept from the original source.
            pass
        tok.end_char = (j - 1)
        tok.tag = (lemmas)
        res.append(tok)
        i = (j - 1)
    # --- Determine the default language from the counters ----------------
    def_lang = MorphLang(dlang)
    if (pure_rus_words > pure_ukr_words and pure_rus_words > pure_by_words and pure_rus_words > pure_kz_words):
        def_lang = MorphLang.RU
    elif (tot_rus_words > tot_ukr_words and tot_rus_words > tot_by_words and tot_rus_words > tot_kz_words):
        def_lang = MorphLang.RU
    elif (pure_ukr_words > pure_rus_words and pure_ukr_words > pure_by_words and pure_ukr_words > pure_kz_words):
        def_lang = MorphLang.UA
    elif (tot_ukr_words > tot_rus_words and tot_ukr_words > tot_by_words and tot_ukr_words > tot_kz_words):
        def_lang = MorphLang.UA
    elif (pure_kz_words > pure_rus_words and pure_kz_words > pure_ukr_words and pure_kz_words > pure_by_words):
        def_lang = MorphLang.KZ
    elif (tot_kz_words > tot_rus_words and tot_kz_words > tot_ukr_words and tot_kz_words > tot_by_words):
        def_lang = MorphLang.KZ
    elif (pure_by_words > pure_rus_words and pure_by_words > pure_ukr_words and pure_by_words > pure_kz_words):
        def_lang = MorphLang.BY
    elif (tot_by_words > tot_rus_words and tot_by_words > tot_ukr_words and tot_by_words > tot_kz_words):
        # Belarusian wins only with a clear margin over Russian.
        if (tot_rus_words > 10 and tot_by_words > (tot_rus_words + 20)):
            def_lang = MorphLang.BY
        elif (tot_rus_words == 0 or tot_by_words >= (tot_rus_words * 2)):
            def_lang = MorphLang.BY
    # Ambiguous case: probe up to 100 Cyrillic lexemes through the engines.
    if (((def_lang.is_undefined or def_lang.is_ua)) and tot_rus_words > 0):
        if (((tot_ukr_words > tot_rus_words and InnerMorphology.M_ENGINE_UA.language.is_ua)) or ((tot_by_words > tot_rus_words and InnerMorphology.M_ENGINE_BY.language.is_by)) or ((tot_kz_words > tot_rus_words and InnerMorphology.M_ENGINE_KZ.language.is_kz))):
            cou0 = 0
            tot_kz_words = 0
            tot_ukr_words = tot_kz_words
            tot_by_words = tot_ukr_words
            tot_rus_words = tot_by_words
            for kp in uni_lex.items():
                lang = MorphLang()
                wraplang9 = RefOutArgWrapper(lang)
                kp[1].word_forms = self.__processOneWord(kp[0], wraplang9)
                lang = wraplang9.value
                if (kp[1].word_forms is not None):
                    for wf in kp[1].word_forms:
                        lang |= wf.language
                kp[1].lang = lang
                if (lang.is_ru):
                    tot_rus_words += 1
                if (lang.is_ua):
                    tot_ukr_words += 1
                if (lang.is_by):
                    tot_by_words += 1
                if (lang.is_kz):
                    tot_kz_words += 1
                if (lang.is_cyrillic):
                    cou0 += 1
                if (cou0 >= 100):
                    break
            if (tot_rus_words > ((math.floor(tot_by_words / 2))) and tot_rus_words > ((math.floor(tot_ukr_words / 2)))):
                def_lang = MorphLang.RU
            elif (tot_ukr_words > ((math.floor(tot_rus_words / 2))) and tot_ukr_words > ((math.floor(tot_by_words / 2)))):
                def_lang = MorphLang.UA
            elif (tot_by_words > ((math.floor(tot_rus_words / 2))) and tot_by_words > ((math.floor(tot_ukr_words / 2)))):
                def_lang = MorphLang.BY
        elif (def_lang.is_undefined):
            def_lang = MorphLang.RU
    # --- Pass 2: morphology lookup for every unique lexeme ---------------
    cou = 0
    tot_kz_words = 0
    tot_ukr_words = tot_kz_words
    tot_by_words = tot_ukr_words
    tot_rus_words = tot_by_words
    for kp in uni_lex.items():
        lang = def_lang
        if (lang.is_undefined):
            if (tot_rus_words > tot_by_words and tot_rus_words > tot_ukr_words and tot_rus_words > tot_kz_words):
                lang = MorphLang.RU
            elif (tot_ukr_words > tot_rus_words and tot_ukr_words > tot_by_words and tot_ukr_words > tot_kz_words):
                lang = MorphLang.UA
            elif (tot_by_words > tot_rus_words and tot_by_words > tot_ukr_words and tot_by_words > tot_kz_words):
                lang = MorphLang.BY
            elif (tot_kz_words > tot_rus_words and tot_kz_words > tot_ukr_words and tot_kz_words > tot_by_words):
                lang = MorphLang.KZ
        wraplang10 = RefOutArgWrapper(lang)
        kp[1].word_forms = self.__processOneWord(kp[0], wraplang10)
        lang = wraplang10.value
        kp[1].lang = lang
        if ((((lang) & MorphLang.RU)) != MorphLang.UNKNOWN):
            tot_rus_words += 1
        if ((((lang) & MorphLang.UA)) != MorphLang.UNKNOWN):
            tot_ukr_words += 1
        if ((((lang) & MorphLang.BY)) != MorphLang.UNKNOWN):
            tot_by_words += 1
        if ((((lang) & MorphLang.KZ)) != MorphLang.UNKNOWN):
            tot_kz_words += 1
        if (progress is not None):
            self.__onProgress(cou, len(uni_lex), progress)
        cou += 1
    # --- Attach word forms to tokens -------------------------------------
    debug_token = None
    empty_list = None
    for r in res:
        uni = Utils.asObjectOrNull(r.tag, InnerMorphology.UniLexWrap)
        r.tag = None
        if (uni is None or uni.word_forms is None or len(uni.word_forms) == 0):
            if (empty_list is None):
                empty_list = list()
            r.word_forms = empty_list
            if (uni is not None):
                r.language = uni.lang
        else:
            r.word_forms = uni.word_forms
        if (r.begin_char == 733860):
            debug_token = r
    # --- OCR-noise repairs (merge tokens split by quotes, digits, hyphens)
    if (not good_text):
        i = 0
        first_pass2709 = True
        while True:
            if first_pass2709: first_pass2709 = False
            else: i += 1
            if (not (i < (len(res) - 2))): break
            ui0 = twrch[res[i].begin_char]
            ui1 = twrch[res[i + 1].begin_char]
            ui2 = twrch[res[i + 2].begin_char]
            if (ui1.is_quot):
                p = res[i + 1].begin_char
                # "об"ект"-style hard sign written as a quote: try Ъ.
                if ((p >= 2 and "БбТт".find(text[p - 1]) >= 0 and ((p + 3) < len(text))) and "ЕеЯяЁё".find(text[p + 1]) >= 0):
                    wstr = LanguageHelper.transliteralCorrection(LanguageHelper.correctWord("{0}Ъ{1}".format(res[i].getSourceText(text), res[i + 2].getSourceText(text))), None, False)
                    li = self.__processOneWord0(wstr)
                    if (li is not None and len(li) > 0 and li[0].is_in_dictionary):
                        res[i].end_char = res[i + 2].end_char
                        res[i].term = wstr
                        res[i].word_forms = li
                        del res[i + 1:i + 1 + 2]
                elif ((ui1.is_apos and p > 0 and str.isalpha(text[p - 1])) and ((p + 1) < len(text)) and str.isalpha(text[p + 1])):
                    # Ukrainian apostrophe inside a word: glue both halves.
                    if (def_lang == MorphLang.UA or (((res[i].language) & MorphLang.UA)) != MorphLang.UNKNOWN or (((res[i + 2].language) & MorphLang.UA)) != MorphLang.UNKNOWN):
                        wstr = LanguageHelper.transliteralCorrection(LanguageHelper.correctWord("{0}{1}".format(res[i].getSourceText(text), res[i + 2].getSourceText(text))), None, False)
                        li = self.__processOneWord0(wstr)
                        okk = True
                        if (okk):
                            res[i].end_char = res[i + 2].end_char
                            res[i].term = wstr
                            if (li is None):
                                li = list()
                            res[i].word_forms = li
                            if (li is not None and len(li) > 0):
                                res[i].language = li[0].language
                            del res[i + 1:i + 1 + 2]
            elif (((ui1.uni_char == '3' or ui1.uni_char == '4')) and res[i + 1].length == 1):
                # Digit 3/4 misrecognized for Cyrillic З/Ч inside a word.
                src = ("З" if ui1.uni_char == '3' else "Ч")
                i0 = i + 1
                if ((res[i].end_char + 1) == res[i + 1].begin_char and ui0.is_cyrillic):
                    i0 -= 1
                    src = (res[i0].getSourceText(text) + src)
                i1 = i + 1
                if ((res[i + 1].end_char + 1) == res[i + 2].begin_char and ui2.is_cyrillic):
                    i1 += 1
                    src += res[i1].getSourceText(text)
                if (len(src) > 2):
                    wstr = LanguageHelper.transliteralCorrection(LanguageHelper.correctWord(src), None, False)
                    li = self.__processOneWord0(wstr)
                    if (li is not None and len(li) > 0 and li[0].is_in_dictionary):
                        res[i0].end_char = res[i1].end_char
                        res[i0].term = wstr
                        res[i0].word_forms = li
                        del res[i0 + 1:i0 + 1 + i1 - i0]
            elif ((ui1.is_hiphen and ui0.is_letter and ui2.is_letter) and res[i].end_char > res[i].begin_char and res[i + 2].end_char > res[i + 2].begin_char):
                # Word hyphenated across a line break: decide whether to join.
                newline = False
                sps = 0
                j = (res[i + 1].end_char + 1)
                while j < res[i + 2].begin_char:
                    if (text[j] == '\r' or text[j] == '\n'):
                        newline = True
                        sps += 1
                    elif (not Utils.isWhitespace(text[j])):
                        break
                    else:
                        sps += 1
                    j += 1
                full_word = LanguageHelper.correctWord(res[i].getSourceText(text) + res[i + 2].getSourceText(text))
                if (not newline):
                    # Heuristics that still force a join without a newline.
                    if (full_word in uni_lex or full_word == "ИЗЗА"):
                        newline = True
                    elif (text[res[i + 1].begin_char] == (chr(0x00AD))):
                        newline = True
                    elif (LanguageHelper.endsWithEx(res[i].getSourceText(text), "О", "о", None, None) and len(res[i + 2].word_forms) > 0 and res[i + 2].word_forms[0].is_in_dictionary):
                        if (text[res[i + 1].begin_char] == '¬'):
                            li = self.__processOneWord0(full_word)
                            if (li is not None and len(li) > 0 and li[0].is_in_dictionary):
                                newline = True
                    elif ((res[i].end_char + 2) == res[i + 2].begin_char):
                        if (not str.isupper(text[res[i + 2].begin_char]) and (sps < 2) and len(full_word) > 4):
                            newline = True
                            if ((i + 3) < len(res)):
                                ui3 = twrch[res[i + 3].begin_char]
                                if (ui3.is_hiphen):
                                    newline = False
                    elif (((res[i].end_char + 1) == res[i + 1].begin_char and sps > 0 and (sps < 3)) and len(full_word) > 4):
                        newline = True
                if (newline):
                    li = self.__processOneWord0(full_word)
                    if (li is not None and len(li) > 0 and ((li[0].is_in_dictionary or full_word in uni_lex))):
                        res[i].end_char = res[i + 2].end_char
                        res[i].term = full_word
                        res[i].word_forms = li
                        del res[i + 1:i + 1 + 2]
                    else:
                        pass
            elif ((ui1.is_letter and ui0.is_letter and res[i].length > 2) and res[i + 1].length > 1):
                # Two word halves split by a raw newline, no hyphen.
                if (ui0.is_upper != ui1.is_upper):
                    continue
                if (not ui0.is_cyrillic or not ui1.is_cyrillic):
                    continue
                newline = False
                j = (res[i].end_char + 1)
                while j < res[i + 1].begin_char:
                    if (twrch[j].code == 0xD or twrch[j].code == 0xA):
                        newline = True
                        break
                    j += 1
                if (not newline):
                    continue
                full_word = LanguageHelper.correctWord(res[i].getSourceText(text) + res[i + 1].getSourceText(text))
                if (not full_word in uni_lex):
                    continue
                li = self.__processOneWord0(full_word)
                if (li is not None and len(li) > 0 and li[0].is_in_dictionary):
                    res[i].end_char = res[i + 1].end_char
                    res[i].term = full_word
                    res[i].word_forms = li
                    del res[i + 1]
    # --- Compute per-token character class flags --------------------------
    i = 0
    first_pass2710 = True
    while True:
        if first_pass2710: first_pass2710 = False
        else: i += 1
        if (not (i < len(res))): break
        mt = res[i]
        mt.char_info = CharsInfo()
        ui0 = twrch[mt.begin_char]
        ui00 = UnicodeInfo.ALL_CHARS[ord((res[i].term[0]))]
        # Advance ui0 to the first letter char of the token, if any.
        j = (mt.begin_char + 1)
        while j <= mt.end_char:
            if (ui0.is_letter):
                break
            ui0 = twrch[j]
            j += 1
        if (ui0.is_letter):
            res[i].char_info.is_letter = True
            if (ui00.is_latin):
                res[i].char_info.is_latin_letter = True
            elif (ui00.is_cyrillic):
                res[i].char_info.is_cyrillic_letter = True
            if (res[i].language == MorphLang.UNKNOWN):
                if (LanguageHelper.isCyrillic(mt.term)):
                    res[i].language = (MorphLang.RU if def_lang.is_undefined else def_lang)
        if (good_text):
            continue
        # Case profile: all upper / all lower / Capitalized / last-lower.
        all_up = True
        all_lo = True
        j = mt.begin_char
        while j <= mt.end_char:
            if (twrch[j].is_upper or twrch[j].is_digit):
                all_lo = False
            else:
                all_up = False
            j += 1
        if (all_up):
            mt.char_info.is_all_upper = True
        elif (all_lo):
            mt.char_info.is_all_lower = True
        elif (((ui0.is_upper or twrch[mt.begin_char].is_digit)) and mt.end_char > mt.begin_char):
            all_lo = True
            j = (mt.begin_char + 1)
            while j <= mt.end_char:
                if (twrch[j].is_upper or twrch[j].is_digit):
                    all_lo = False
                    break
                j += 1
            if (all_lo):
                mt.char_info.is_capital_upper = True
            elif (twrch[mt.end_char].is_lower and (mt.end_char - mt.begin_char) > 1):
                all_up = True
                j = mt.begin_char
                while j < mt.end_char:
                    if (twrch[j].is_lower):
                        all_up = False
                        break
                    j += 1
                if (all_up):
                    mt.char_info.is_last_lower = True
        # "ABCs"-like token: add the stem (without last char) as a noun form.
        if (mt.char_info.is_last_lower and mt.length > 2 and mt.char_info.is_cyrillic_letter):
            pref = text[mt.begin_char:mt.begin_char + mt.end_char - mt.begin_char]
            ok = False
            for wf in mt.word_forms:
                if (wf.normal_case == pref or wf.normal_full == pref):
                    ok = True
                    break
            if (not ok):
                mt.word_forms = list(mt.word_forms)
                mt.word_forms.insert(0, MorphWordForm._new11(pref, MorphClass.NOUN, 1))
    if (good_text or only_tokenizing):
        return res
    # --- Single Latin C/A/P letters inside Cyrillic context → Cyrillic ----
    i = 0
    first_pass2711 = True
    while True:
        if first_pass2711: first_pass2711 = False
        else: i += 1
        if (not (i < len(res))): break
        if (res[i].length == 1 and res[i].char_info.is_latin_letter):
            ch = res[i].term[0]
            if (ch == 'C' or ch == 'A' or ch == 'P'):
                pass
            else:
                continue
            # Look for an adjacent letter token to decide the script.
            is_rus = False
            for ii in range(i - 1, -1, -1):
                if ((res[ii].end_char + 1) != res[ii + 1].begin_char):
                    break
                elif (res[ii].char_info.is_letter):
                    is_rus = res[ii].char_info.is_cyrillic_letter
                    break
            if (not is_rus):
                ii = i + 1
                while ii < len(res):
                    if ((res[ii - 1].end_char + 1) != res[ii].begin_char):
                        break
                    elif (res[ii].char_info.is_letter):
                        is_rus = res[ii].char_info.is_cyrillic_letter
                        break
                    ii += 1
            if (is_rus):
                res[i].term = LanguageHelper.transliteralCorrection(res[i].term, None, True)
                res[i].char_info.is_cyrillic_letter = True
                res[i].char_info.is_latin_letter = True
    # --- Add surname variants for capitalized Cyrillic tokens -------------
    for r in res:
        if (r.char_info.is_all_upper or r.char_info.is_capital_upper):
            if (r.language.is_cyrillic):
                ok = False
                for wf in r.word_forms:
                    if (wf.class0_.is_proper_surname):
                        ok = True
                        break
                if (not ok):
                    r.word_forms = list(r.word_forms)
                    InnerMorphology.M_ENGINE_RU.processSurnameVariants(r.term, r.word_forms)
    # Every word form gets at least the raw term as its normal case.
    for r in res:
        for mv in r.word_forms:
            if (mv.normal_case is None):
                mv.normal_case = r.term
    # --- Merge X"word patterns (single Latin letter + quote + word) -------
    i = 0
    while i < (len(res) - 2):
        if (res[i].char_info.is_latin_letter and res[i].char_info.is_all_upper and res[i].length == 1):
            if (twrch[res[i + 1].begin_char].is_quot and res[i + 2].char_info.is_latin_letter and res[i + 2].length > 2):
                if ((res[i].end_char + 1) == res[i + 1].begin_char and (res[i + 1].end_char + 1) == res[i + 2].begin_char):
                    wstr = "{0}{1}".format(res[i].term, res[i + 2].term)
                    li = self.__processOneWord0(wstr)
                    if (li is not None):
                        res[i].word_forms = li
                        res[i].end_char = res[i + 2].end_char
                        res[i].term = wstr
                        if (res[i + 2].char_info.is_all_lower):
                            res[i].char_info.is_all_upper = False
                            res[i].char_info.is_capital_upper = True
                        elif (not res[i + 2].char_info.is_all_upper):
                            res[i].char_info.is_all_upper = False
                        del res[i + 1:i + 1 + 2]
        i += 1
    # --- Merge adjacent non-letter tokens (hyphen pairs handled specially)
    i = 0
    first_pass2712 = True
    while True:
        if first_pass2712: first_pass2712 = False
        else: i += 1
        if (not (i < (len(res) - 1))): break
        if (not res[i].char_info.is_letter and not res[i + 1].char_info.is_letter and (res[i].end_char + 1) == res[i + 1].begin_char):
            if (twrch[res[i].begin_char].is_hiphen and twrch[res[i + 1].begin_char].is_hiphen):
                # Merge hyphen runs only as exact pairs: must be the start
                # of the run and the run must end after two hyphens.
                if (i == 0 or not twrch[res[i - 1].begin_char].is_hiphen):
                    pass
                else:
                    continue
                if ((i + 2) == len(res) or not twrch[res[i + 2].begin_char].is_hiphen):
                    pass
                else:
                    continue
            res[i].end_char = res[i + 1].end_char
            del res[i + 1]
    return res
def __get_name_without_brackets(begin: 'Token', end: 'Token', normalize_first_noun_group: bool = False, normal_first_group_single: bool = False, ignore_geo_referent: bool = False) -> str:
    """Get the string value between two tokens, excluding quotes and brackets.

    Newer-API counterpart of __getNameWithoutBrackets (snake_case services).

    Args:
        begin(Token): first token
        end(Token): last token
        normalize_first_noun_group(bool): normalize the first noun group
            to nominative case
        normal_first_group_single(bool): also force the first noun group
            to singular
        ignore_geo_referent(bool): ignore geographic entities inside

    Returns None when the trimmed result would be empty.
    """
    res = None
    # Strip an enclosing bracket/quote pair, if present.
    if (BracketHelper.can_be_start_of_sequence(begin, False, False) and BracketHelper.can_be_end_of_sequence(end, False, begin, False)):
        begin = begin.next0_
        end = end.previous
    if (normalize_first_noun_group and not begin.morph.class0_.is_preposition):
        npt = NounPhraseHelper.try_parse(begin, NounPhraseParseAttr.REFERENTCANBENOUN, 0, None)
        if (npt is not None):
            # Reject a noun phrase whose noun is unknown and has no adjectives.
            if (npt.noun.get_morph_class_in_dictionary().is_undefined and len(npt.adjectives) == 0):
                npt = (None)
        if (npt is not None and npt.end_token.end_char > end.end_char):
            npt = (None)
        if (npt is not None):
            res = npt.get_normal_case_text(None, (MorphNumber.SINGULAR if normal_first_group_single else MorphNumber.UNDEFINED), MorphGender.UNDEFINED, False)
            te = npt.end_token.next0_
            # Handle a trailing ", <participle>" agreeing with the noun group.
            if (((te is not None and te.next0_ is not None and te.is_comma) and (isinstance(te.next0_, TextToken)) and te.next0_.end_char <= end.end_char) and te.next0_.morph.class0_.is_verb and te.next0_.morph.class0_.is_adjective):
                for it in te.next0_.morph.items:
                    if (it.gender == npt.morph.gender or ((it.gender) & (npt.morph.gender)) != (MorphGender.UNDEFINED)):
                        if (not ((it.case_) & npt.morph.case_).is_undefined):
                            if (it.number == npt.morph.number or ((it.number) & (npt.morph.number)) != (MorphNumber.UNDEFINED)):
                                var = te.next0_.term
                                if (isinstance(it, MorphWordForm)):
                                    var = it.normal_case
                                bi = MorphBaseInfo._new492(MorphClass.ADJECTIVE, npt.morph.gender, npt.morph.number, npt.morph.language)
                                # Put the participle into the agreeing form.
                                var = MorphologyService.get_wordform(var, bi)
                                if (var is not None):
                                    res = "{0}, {1}".format(res, var)
                                    te = te.next0_.next0_
                                break
            # Append the remainder of the span verbatim.
            if (te is not None and te.end_char <= end.end_char):
                s = ProperNameHelper.get_name_ex(te, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED, MorphGender.UNDEFINED, True, ignore_geo_referent)
                if (not Utils.isNullOrEmpty(s)):
                    if (not str.isalnum(s[0])):
                        res = "{0}{1}".format(res, s)
                    else:
                        res = "{0} {1}".format(res, s)
    elif ((isinstance(begin, TextToken)) and begin.chars.is_cyrillic_letter):
        # No noun-group normalization: normalize just the first word.
        mm = begin.get_morph_class_in_dictionary()
        if (not mm.is_undefined):
            res = begin.get_normal_case_text(mm, MorphNumber.UNDEFINED, MorphGender.UNDEFINED, False)
            if (begin.end_char < end.end_char):
                res = "{0} {1}".format(res, ProperNameHelper.get_name_ex(begin.next0_, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED, MorphGender.UNDEFINED, True, False))
    if (res is None):
        res = ProperNameHelper.get_name_ex(begin, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED, MorphGender.UNDEFINED, True, ignore_geo_referent)
    # Trim trailing '*' and whitespace; empty after trimming -> None.
    if (not Utils.isNullOrEmpty(res)):
        k = 0
        i = len(res) - 1
        while i >= 0:
            if (res[i] == '*' or Utils.isWhitespace(res[i])):
                pass
            else:
                break
            i -= 1
            k += 1
        if (k > 0):
            if (k == len(res)):
                return None
            res = res[0:0 + len(res) - k]
    return res
def __TryParse(t: 'Token', prev: 'TransItemToken', after_conj: bool, attach_high: bool = False) -> 'TransItemToken':
    """Try to recognize one transport-related item starting at token *t*.

    Depending on what is found, returns a TransItemToken of type ORG, ROUTE,
    DATE, NOUN, BRAND, MODEL, NUMBER, NAME or CLASS, or None when nothing
    matches.  *prev* is the previously recognized item (used for context,
    e.g. a NAME is only accepted after a ship/space NOUN); *after_conj*
    relaxes that context after a conjunction; *attach_high* allows doubtful
    ontology matches to be accepted without supporting context.

    Args:
        t: token to start parsing at (may be None).
        prev: previously parsed item or None.
        after_conj: True when parsing right after a conjunction.
        attach_high: accept doubtful terms without a preceding NOUN.

    Returns:
        A TransItemToken, or None.
    """
    if (t is None):
        return None
    t1 = t
    # Skip a leading comma and the verb "belongs to" (RU/UA).
    if (t1.isChar(',')):
        t1 = t1.next0_
    if (t1 is not None and t1.isValue("ПРИНАДЛЕЖАТЬ", "НАЛЕЖАТИ")):
        t1 = t1.next0_
    # An organization referent right here → owner/operator item.
    if (isinstance(t1, ReferentToken)):
        if (t1.getReferent().type_name == "ORGANIZATION"):
            return TransItemToken._new2521(t, t1, TransItemToken.Typs.ORG, t1.getReferent(), t1.morph)
    route = False
    # "follows ..." / "performs ..." (RU/UA) introduces a route.
    if (t1 is not None and ((t1.isValue("СЛЕДОВАТЬ", "СЛІДУВАТИ") or t1.isValue("ВЫПОЛНЯТЬ", "ВИКОНУВАТИ")))):
        t1 = t1.next0_
        route = True
    if (t1 is not None and t1.morph.class0_.is_preposition):
        t1 = t1.next0_
    # "РЕЙС" (flight) / "МАРШРУТ" (route) keywords.
    if (t1 is not None and ((t1.isValue("РЕЙС", None) or t1.isValue("МАРШРУТ", None)))):
        t1 = t1.next0_
        route = True
    if (isinstance(t1, ReferentToken)):
        if (isinstance(t1.getReferent(), GeoReferent)):
            geo_ = Utils.asObjectOrNull(t1.getReferent(), GeoReferent)
            if (geo_.is_state or geo_.is_city):
                # Collect a chain of city/state referents separated by
                # hyphens, prepositions or conjunctions → ROUTE item.
                tit = TransItemToken._new2522(t, t1, TransItemToken.Typs.ROUTE, list())
                tit.route_items.append(geo_)
                t1 = t1.next0_
                # Generated continue-emulating loop: the "first_pass" flag
                # skips the increment on the first iteration.
                first_pass3132 = True
                while True:
                    if first_pass3132: first_pass3132 = False
                    else: t1 = t1.next0_
                    if (not (t1 is not None)): break
                    if (t1.is_hiphen):
                        continue
                    if (t1.morph.class0_.is_preposition or t1.morph.class0_.is_conjunction):
                        continue
                    geo_ = (Utils.asObjectOrNull(t1.getReferent(), GeoReferent))
                    if (geo_ is None):
                        break
                    if (not geo_.is_city and not geo_.is_state):
                        break
                    tit.route_items.append(geo_)
                    tit.end_token = t1
                # A single geo is only a route when a route keyword was seen.
                if (len(tit.route_items) > 1 or route):
                    return tit
        elif ((isinstance(t1.getReferent(), DateReferent)) and (t1.whitespaces_before_count < 3)):
            # A nearby date → DATE item; absorb trailing "в." / "вып(уск)." tokens.
            tit = TransItemToken._new2523(t, t1, TransItemToken.Typs.DATE, t1.getReferent())
            if (t1.next0_ is not None):
                if (t1.next0_.isValue("В", None) and t1.next0_.next0_ is not None and t1.next0_.next0_.isChar('.')):
                    tit.end_token = t1.next0_.next0_
                elif (t1.next0_.isValue("ВЫП", None) or t1.next0_.isValue("ВЫПУСК", None)):
                    tit.end_token = t1.next0_
                    if (t1.next0_.next0_ is not None and t1.next0_.next0_.isChar('.')):
                        tit.end_token = t1.next0_.next0_
            return tit
    if (isinstance(t, TextToken)):
        # Explicit number prefix ("№", etc.) → registration number or model number.
        num = MiscHelper.checkNumberPrefix(t)
        if (num is not None):
            tit = TransItemToken.__attachRusAutoNumber(num)
            if (tit is None):
                tit = TransItemToken._attachNumber(num, False)
            if (tit is not None):
                tit.begin_token = t
                return tit
        # Look the word up in the transport ontology.
        tok = TransItemToken.M_ONTOLOGY.tryParse(t, TerminParseAttr.NO)
        # Retry after a leading "С"/"C"/"ЗА" (note: Cyrillic and Latin "C").
        if (tok is None and ((t.isValue("С", None) or t.isValue("C", None) or t.isValue("ЗА", None)))):
            tok = TransItemToken.M_ONTOLOGY.tryParse(t.next0_, TerminParseAttr.NO)
        # Ontology term inside brackets/quotes.
        if (tok is None and BracketHelper.isBracket(t, True)):
            tok1 = TransItemToken.M_ONTOLOGY.tryParse(t.next0_, TerminParseAttr.NO)
            if (tok1 is not None and BracketHelper.isBracket(tok1.end_token.next0_, True)):
                tok = tok1
                tok.begin_token = t
                tok.end_token = tok.end_token.next0_
                tok.begin_token = t
            elif (tok1 is not None):
                tt = Utils.asObjectOrNull(tok1.termin, TransItemToken.TransTermin)
                # Only a BRAND may be accepted with an unclosed bracket.
                if (tt.typ == TransItemToken.Typs.BRAND):
                    tok = tok1
                    tok.begin_token = t
        # "МАРКА" ("make/brand") + following NAME/BRAND → re-type as BRAND.
        if (tok is None and t.isValue("МАРКА", None)):
            res1 = TransItemToken.__TryParse(t.next0_, prev, after_conj, False)
            if (res1 is not None):
                if (res1.typ == TransItemToken.Typs.NAME or res1.typ == TransItemToken.Typs.BRAND):
                    res1.begin_token = t
                    res1.typ = TransItemToken.Typs.BRAND
                    return res1
        if (tok is not None):
            tt = Utils.asObjectOrNull(tok.termin, TransItemToken.TransTermin)
            if (tt.typ == TransItemToken.Typs.NUMBER):
                # The term itself announces a number; the number must follow.
                tit = TransItemToken.__attachRusAutoNumber(tok.end_token.next0_)
                if (tit is None):
                    tit = TransItemToken._attachNumber(tok.end_token.next0_, False)
                if (tit is not None):
                    tit.begin_token = t
                    return tit
                else:
                    return None
            if (tt.is_doubt and not attach_high):
                # A doubtful term needs a preceding transport NOUN — except a
                # BRAND repeating the previous BRAND verbatim.
                if (prev is None or prev.typ != TransItemToken.Typs.NOUN):
                    if ((prev is not None and prev.typ == TransItemToken.Typs.BRAND and tt.typ == TransItemToken.Typs.BRAND) and Utils.compareStrings(tt.canonic_text, prev.value, True) == 0):
                        pass
                    else:
                        return None
            if (tt.canonic_text == "СУДНО"):
                # Plural "СУДНО" (vessel) is ambiguous with "courts" — require a
                # following quoted name.
                if ((((tok.morph.number) & (MorphNumber.PLURAL))) != (MorphNumber.UNDEFINED)):
                    if (not BracketHelper.canBeStartOfSequence(tok.end_token.next0_, False, False)):
                        return None
            tit = TransItemToken._new2524(tok.begin_token, tok.end_token, tt.kind, tt.typ, tt.is_doubt, tok.chars, tok.morph)
            tit.value = tt.canonic_text
            # Convention: nouns are stored lowercase, everything else uppercase.
            if (tit.typ == TransItemToken.Typs.NOUN):
                tit.value = tit.value.lower()
            else:
                tit.value = tit.value.upper()
            return tit
        # Adjective + transport noun phrase ("морское судно" etc.).
        if (tok is None and t.morph.class0_.is_adjective):
            npt = NounPhraseHelper.tryParse(t, NounPhraseParseAttr.NO, 0)
            if (npt is not None and len(npt.adjectives) > 0):
                state_ = None
                tt = t
                first_pass3133 = True
                while True:
                    if first_pass3133: first_pass3133 = False
                    else: tt = tt.next0_
                    if (not (tt is not None and tt.previous != npt.end_token)): break
                    tok = TransItemToken.M_ONTOLOGY.tryParse(tt, TerminParseAttr.NO)
                    # Remember a GEO referent among the adjectives (e.g. a
                    # country adjective) to attach as the vehicle's state.
                    if (tok is None and state_ is None):
                        state_ = tt.kit.processReferent("GEO", tt)
                    if (tok is not None and tok.end_token == npt.end_token):
                        if ((tok.termin).typ == TransItemToken.Typs.NOUN):
                            tit = TransItemToken._new2524(t, tok.end_token, (tok.termin).kind, TransItemToken.Typs.NOUN, (tok.termin).is_doubt, tok.chars, npt.morph)
                            tit.value = (tok.termin).canonic_text.lower()
                            tit.alt_value = npt.getNormalCaseText(None, False, MorphGender.UNDEFINED, False).lower()
                            # "...суд/суда" could be "court(s)" — require a
                            # following quoted name, else keep scanning.
                            if (LanguageHelper.endsWithEx(tit.alt_value, "суд", "суда", None, None)):
                                if (not BracketHelper.canBeStartOfSequence(tok.end_token.next0_, False, False)):
                                    continue
                            if (state_ is not None):
                                if ((state_.referent).is_state):
                                    tit.state = state_
                            return tit
    # "КЛАСС" + bracketed text → CLASS item.
    if (t is not None and t.isValue("КЛАСС", None) and t.next0_ is not None):
        br = BracketHelper.tryParse(t.next0_, BracketParseAttr.NO, 100)
        if (br is not None):
            return TransItemToken._new2526(t, br.end_token, TransItemToken.Typs.CLASS, MiscHelper.getTextValueOfMetaToken(br, GetTextAttr.NO))
    # A bare number is a model continuation only right after a BRAND.
    nt = Utils.asObjectOrNull(t, NumberToken)
    if (nt is not None):
        if (prev is None or nt.typ != NumberSpellingType.DIGIT):
            return None
        if (prev.typ == TransItemToken.Typs.BRAND):
            return TransItemToken.__attachModel(t, False, prev)
        else:
            return None
    # Russian auto registration number: accept when confident, or when the
    # context (auto NOUN / BRAND / MODEL before) supports a doubtful match.
    res = TransItemToken.__attachRusAutoNumber(t)
    if ((res) is not None):
        if (not res.is_doubt):
            return res
        if (prev is not None and prev.typ == TransItemToken.Typs.NOUN and prev.kind == TransportKind.AUTO):
            return res
        if (prev is not None and ((prev.typ == TransItemToken.Typs.BRAND or prev.typ == TransItemToken.Typs.MODEL))):
            return res
    t1 = t
    if (t.is_hiphen):
        t1 = t.next0_
    # BRAND followed by (possibly hyphenated) model designation.
    if (prev is not None and prev.typ == TransItemToken.Typs.BRAND and t1 is not None):
        tit = TransItemToken.__attachModel(t1, True, prev)
        if (tit is not None):
            tit.begin_token = t
            return tit
    # Quoted text right after a transport NOUN (or after a conjunction):
    # either a nested item, a NAME, or a MODEL, chosen by character content.
    if (prev is not None and ((prev.typ == TransItemToken.Typs.NOUN or after_conj))):
        br = BracketHelper.tryParse(t, BracketParseAttr.NO, 100)
        if (br is not None and br.is_quote_type):
            tit = TransItemToken.tryParse(br.begin_token.next0_, prev, after_conj, False)
            if (tit is not None and tit.end_token.next0_ == br.end_token):
                if (not tit.is_doubt or tit.typ == TransItemToken.Typs.BRAND):
                    tit.begin_token = br.begin_token
                    tit.end_token = br.end_token
                    return tit
            s = MiscHelper.getTextValueOfMetaToken(br, GetTextAttr.NO)
            if (not Utils.isNullOrEmpty(s) and (len(s) < 30)):
                # Classify the quoted string: letters vs digits vs other.
                chars_ = 0
                digs = 0
                un = 0
                for c in s:
                    if (not Utils.isWhitespace(c)):
                        if (str.isalpha(c)):
                            chars_ += 1
                        elif (str.isdigit(c)):
                            digs += 1
                        else:
                            un += 1
                if (((digs == 0 and un == 0 and t.next0_.chars.is_capital_upper)) or prev.kind == TransportKind.SHIP or prev.kind == TransportKind.SPACE):
                    return TransItemToken._new2526(br.begin_token, br.end_token, TransItemToken.Typs.NAME, s)
                if (digs > 0 and (chars_ < 5)):
                    # Mostly digits → model code; spaces removed.
                    return TransItemToken._new2526(br.begin_token, br.end_token, TransItemToken.Typs.MODEL, s.replace(" ", ""))
    # Generic model attachment after NOUN/BRAND/NAME/MODEL.
    if (prev is not None and (((prev.typ == TransItemToken.Typs.NOUN or prev.typ == TransItemToken.Typs.BRAND or prev.typ == TransItemToken.Typs.NAME) or prev.typ == TransItemToken.Typs.MODEL))):
        tit = TransItemToken.__attachModel(t, prev.typ != TransItemToken.Typs.NAME, prev)
        if (tit is not None):
            return tit
    # Capitalized word after an auto NOUN that is not a person → BRAND.
    if (((prev is not None and prev.typ == TransItemToken.Typs.NOUN and prev.kind == TransportKind.AUTO) and (isinstance(t, TextToken)) and t.chars.is_letter) and not t.chars.is_all_lower and (t.whitespaces_before_count < 2)):
        pt = t.kit.processReferent("PERSON", t)
        if (pt is None):
            tit = TransItemToken._new2529(t, t, TransItemToken.Typs.BRAND)
            tit.value = (t).term
            return tit
    # Unquoted proper name after a ship/space NOUN (or after a conjunction).
    if (((prev is not None and prev.typ == TransItemToken.Typs.NOUN and ((prev.kind == TransportKind.SHIP or prev.kind == TransportKind.SPACE)))) or after_conj):
        if (t.chars.is_capital_upper):
            ok = True
            npt = NounPhraseHelper.tryParse(t, NounPhraseParseAttr.NO, 0)
            if (npt is not None and len(npt.adjectives) > 0):
                ok = False
            else:
                # A person mention here is not a vehicle name.
                rt = t.kit.processReferent("PERSON", t)
                if (rt is not None):
                    ok = False
            # A surname not in nominative case is rejected as well.
            if (t.getMorphClassInDictionary().is_proper_surname):
                if (not t.morph.case_.is_nominative):
                    ok = False
            if (ok):
                # Extend the name over adjacent tokens with the same casing,
                # stopping at any token that parses as another item.
                t1 = t
                tt = t.next0_
                while tt is not None:
                    if (tt.whitespaces_before_count > 1):
                        break
                    if (tt.chars != t.chars):
                        break
                    tit = TransItemToken.tryParse(tt, None, False, False)
                    if ((tit) is not None):
                        break
                    t1 = tt
                    tt = tt.next0_
                s = MiscHelper.getTextValue(t, t1, GetTextAttr.NO)
                if (s is not None):
                    res1 = TransItemToken._new2530(t, t1, TransItemToken.Typs.NAME, True, s)
                    # A following bracketed text replaces the value; the plain
                    # text is kept as alt_value.
                    if (not t1.is_newline_after):
                        br = BracketHelper.tryParse(t1.next0_, BracketParseAttr.NO, 100)
                        if (br is not None):
                            res1.end_token = br.end_token
                            res1.alt_value = res1.value
                            res1.value = MiscHelper.getTextValueOfMetaToken(br, GetTextAttr.NO)
                    return res1
    return None
def __doCrLfCorrection(self, txt: str) -> str:
    """Undo hard line-wrapping in force-formatted text.

    (Original docstring, translated: "Analysis of cases of forcibly
    formatted text".)

    First pass measures candidate lines: lines of at least 30 visible
    characters that do not end in '.', ':' or ';' and are not followed by a
    digit.  If at least 4 such lines exist and their average length is in
    [50, 100), the text is assumed hard-wrapped and a second pass replaces
    qualifying CR/LF characters with spaces (counting each replacement in
    self.crlf_corrected_count).

    Args:
        txt(str): input text.

    Returns:
        The corrected text, or *txt* unchanged when the heuristic does not
        trigger.
    """
    cou = 0
    total_len = 0
    i = 0
    # Generated continue-emulating loop: "first_pass" skips the first increment.
    first_pass3166 = True
    while True:
        if first_pass3166: first_pass3166 = False
        else: i += 1
        if (not (i < len(txt))): break
        ch = txt[i]
        # Only start measuring at a CR (0xD) or LF (0xA).
        if ((ord(ch)) != 0xD and (ord(ch)) != 0xA):
            continue
        len0_ = 0
        last_char = ch
        j = (i + 1)
        # Measure the following line up to the next CR/LF; tabs count as 5.
        while j < len(txt):
            ch = txt[j]
            if ((ord(ch)) == 0xD or (ord(ch)) == 0xA):
                break
            elif ((ord(ch)) == 0x9):
                len0_ += 5
            else:
                last_char = ch
                len0_ += 1
            j += 1
        if (j >= len(txt)):
            break
        # Short lines are ignored.
        if (len0_ < 30):
            continue
        if (last_char != '.' and last_char != ':' and last_char != ';'):
            # A digit right after the break suggests a list item — skip.
            next_is_dig = False
            k = j + 1
            while k < len(txt):
                if (not Utils.isWhitespace(txt[k])):
                    if (str.isdigit(txt[k])):
                        next_is_dig = True
                    break
                k += 1
            if (not next_is_dig):
                cou += 1
                total_len += len0_
        i = j
    # Heuristic gate: enough candidate lines with a plausible average width.
    if (cou < 4):
        return txt
    total_len = math.floor(total_len / cou)
    if ((total_len < 50) or total_len > 100):
        return txt
    # Second pass: rewrite qualifying line breaks in a StringIO copy.
    tmp = Utils.newStringIO(txt)
    i = 0
    while i < tmp.tell():
        ch = Utils.getCharAtStringIO(tmp, i)
        len0_ = 0
        last_char = ch
        j = (i + 1)
        # Re-measure the line starting after position i (same rules as pass 1).
        while j < tmp.tell():
            ch = Utils.getCharAtStringIO(tmp, j)
            if ((ord(ch)) == 0xD or (ord(ch)) == 0xA):
                break
            elif ((ord(ch)) == 0x9):
                len0_ += 5
            else:
                last_char = ch
                len0_ += 1
            j += 1
        if (j >= tmp.tell()):
            break
        # Find the last non-whitespace char of the line (for-else: jj = -1
        # when the whole prefix is whitespace).
        for jj in range(j - 1, -1, -1):
            last_char = Utils.getCharAtStringIO(tmp, jj)
            if (not Utils.isWhitespace(last_char)):
                break
        else:
            jj = -1
        # Detect a blank line after the break (another CR/LF before any
        # non-whitespace char) — paragraph boundary, must be kept.
        not_single = False
        jj = (j + 1)
        if ((jj < tmp.tell()) and (ord(Utils.getCharAtStringIO(tmp, j))) == 0xD and (ord(Utils.getCharAtStringIO(tmp, jj))) == 0xA):
            jj += 1
        while jj < tmp.tell():
            ch = Utils.getCharAtStringIO(tmp, jj)
            if (not Utils.isWhitespace(ch)):
                break
            if ((ord(ch)) == 0xD or (ord(ch)) == 0xA):
                not_single = True
                break
            jj += 1
        # Replace the break with a space when: single break, line length is
        # near the average (-20 / +10 window), and no sentence-ending mark.
        if (((not not_single and len0_ > (total_len - 20) and (len0_ < (total_len + 10))) and last_char != '.' and last_char != ':') and last_char != ';'):
            Utils.setCharAtStringIO(tmp, j, ' ')
            self.crlf_corrected_count += 1
            # Blank out the LF of a CRLF pair too.
            if ((j + 1) < tmp.tell()):
                ch = Utils.getCharAtStringIO(tmp, j + 1)
                if ((ord(ch)) == 0xA):
                    Utils.setCharAtStringIO(tmp, j + 1, ' ')
                    j += 1
        # Continue scanning from the end of this line.
        i = (j - 1)
        i += 1
    return Utils.toStringStringIO(tmp)