コード例 #1
0
ファイル: Token.py プロジェクト: MihaJjDa/APCLtask
 def __getAttr(self, i: int) -> bool:
     """Return True when bit ``i`` of the attribute mask is set.

     The mask is filled in lazily: bit 0 of ``self.__m_attrs`` records that
     the computation has already been done. On first use the text gap on
     each side of this token is inspected:

     * attribute 1 - there is no previous token, or the gap before this
       token contains a whitespace character;
     * attribute 3 - there is no previous token, or that gap contains a
       carriage return, line feed or form feed;
     * attributes 2 and 4 - the same two checks for the gap between this
       token and the next one.

     Args:
         i: index of the bit to read.

     Returns:
         bool: whether the requested bit is set.
     """
     if (self.__m_attrs & 1) == 0:
         # Mark the mask as computed before the individual flags are set.
         self.__m_attrs = 1
         prev_token = self._m_previous
         if prev_token is None:
             self._setAttr(1, True)
             self._setAttr(3, True)
         else:
             for pos in range(prev_token.end_char + 1, self.begin_char):
                 ch = self.kit.sofa.text[pos]
                 if not Utils.isWhitespace(ch):
                     continue
                 self._setAttr(1, True)
                 if ch == '\r' or ch == '\n' or ch == '\f':
                     self._setAttr(3, True)
         next_token = self._m_next
         if next_token is None:
             self._setAttr(2, True)
             self._setAttr(4, True)
         else:
             for pos in range(self.end_char + 1, next_token.begin_char):
                 ch = self.kit.sofa.text[pos]
                 if not Utils.isWhitespace(ch):
                     continue
                 self._setAttr(2, True)
                 if ch == '\r' or ch == '\n' or ch == '\f':
                     self._setAttr(4, True)
     return ((self.__m_attrs >> i) & 1) != 0
コード例 #2
0
ファイル: Termin.py プロジェクト: MihaJjDa/APCLtask
 def addAbridge(self, abr: str) -> 'Abridge':
     """Parse an abbreviation string and register it on this termin.

     The string is cut into runs of letters, each stored upper-cased as an
     ``AbridgePart``. A part gets ``has_delim = True`` when the character
     right after it is not whitespace. When the first run is directly
     followed by ``'-'`` and more text, the remainder (upper-cased) is
     stored as ``tail`` and no further parts are parsed.

     Args:
         abr: abbreviation text.

     Returns:
         The created ``Abridge``, or None when ``abr`` does not start with
         a letter (``self.abridges`` is still initialised in that case).
     """
     abridge = Termin.Abridge()
     if self.abridges is None:
         self.abridges = list()
     total = len(abr)
     # Length of the leading run of letters.
     pos = 0
     while pos < total and str.isalpha(abr[pos]):
         pos += 1
     if pos == 0:
         return None
     abridge.parts.append(Termin.AbridgePart._new604(abr[:pos].upper()))
     self.abridges.append(abridge)
     if pos + 1 < total and abr[pos] == '-':
         abridge.tail = abr[pos + 1:].upper()
         return abridge
     if pos >= total:
         return abridge
     if not Utils.isWhitespace(abr[pos]):
         abridge.parts[0].has_delim = True
     while pos < total:
         if str.isalpha(abr[pos]):
             stop = pos + 1
             while stop < total and str.isalpha(abr[stop]):
                 stop += 1
             part = Termin.AbridgePart._new604(abr[pos:stop].upper())
             if stop < total and not Utils.isWhitespace(abr[stop]):
                 part.has_delim = True
             abridge.parts.append(part)
             # The increment below also steps over the delimiter at `stop`.
             pos = stop
         pos += 1
     return abridge
コード例 #3
0
 def create_number(gr : 'SemGraph', num : 'NumbersWithUnitToken') -> 'SemObject':
     """Build a noun ``SemObject`` for a number-with-unit token and add it to ``gr``.

     The textual form of the last produced referent is split at the first
     character that is not a digit, whitespace or one of ``[].+-``: the
     part before it becomes the object's quantity, the part after it the
     normal case. When the token has exactly one unit with unit data, the
     unit's ``fullname_cyr`` is used as the full normal form ("%" is
     replaced by "процент").

     Args:
         gr: graph that receives the new object.
         num: source number token.

     Returns:
         The new ``SemObject``, or None when ``num`` yields no referent tokens.
     """
     refs = num.create_refenets_tokens_with_register(None, None, False)
     if refs is None or len(refs) == 0:
         return None
     measure = Utils.asObjectOrNull(refs[-1].referent, MeasureReferent)
     sem = SemObject(gr)
     gr.objects.append(sem)
     sem.tokens.append(num)
     sem.morph.normal_case = measure.to_string(True, None, 0)
     sem.morph.normal_full = sem.morph.normal_case
     sem.typ = SemObjectType.NOUN
     sem.measure = measure.kind
     for idx, ch in enumerate(sem.morph.normal_case):
         if str.isdigit(ch) or Utils.isWhitespace(ch) or ch in "[].+-":
             continue
         # First character of the unit part: split the text here and stop.
         whole = sem.morph.normal_case
         sem.quantity = SemQuantity(whole[:idx].strip(), num.begin_token, num.end_token)
         sem.morph.normal_case = whole[idx:].strip()
         if len(num.units) == 1 and num.units[0].unit is not None:
             sem.morph.normal_full = num.units[0].unit.fullname_cyr
             if sem.morph.normal_full == "%":
                 sem.morph.normal_full = "процент"
         break
     sem.concept = measure
     return sem
コード例 #4
0
ファイル: SourceOfAnalysis.py プロジェクト: MihaJjDa/APCLtask
 def calcWhitespaceDistanceBetweenPositions(self, pos_from: int,
                                            pos_to: int) -> int:
     """Compute a weighted whitespace distance over ``self.text[pos_from..pos_to]``.

     Args:
         pos_from: index of the first character of the gap.
         pos_to: index of the last character of the gap (inclusive).

     Returns:
         int: 0 when the gap is empty (``pos_from == pos_to + 1``); -1 when
         the range is invalid or contains a non-whitespace character;
         otherwise the sum of 10 per carriage return or line feed, 5 per
         tab and 1 per any other whitespace character.
     """
     if pos_from == pos_to + 1:
         return 0
     if pos_from > pos_to or pos_from < 0 or pos_to >= len(self.text):
         return -1
     distance = 0
     for pos in range(pos_from, pos_to + 1):
         ch = self.text[pos]
         if not Utils.isWhitespace(ch):
             return -1
         if ch in ('\r', '\n'):
             distance += 10
         elif ch == '\t':
             distance += 5
         else:
             distance += 1
     return distance
コード例 #5
0
 def __init__(self, t: 'Token') -> None:
     """Classify token ``t`` as a table control marker and compute its spans.

     * character 0x1E -> ``TABLESTART``;
     * character 0x1F -> ``TABLEEND``;
     * character 0x07 -> ``CELLEND``, or ``ROWEND`` when the character
       right before the token is a carriage return or line feed;
     * anything else (or ``t is None``) leaves the type ``UNDEFINED``.

     For a cell end that is not a row end both spans start at 1; the
     whitespace run before the token is then scanned backwards, adding 1
     to ``col_span`` per tab and 1 to ``row_span`` per form feed.

     Args:
         t: token to inspect; may be None.
     """
     self.col_span = 0
     self.row_span = 0
     self.typ = TableHelper.TableTypes.UNDEFINED
     self.src = t
     if t is None:
         return
     if t.is_char(chr(0x1E)):
         self.typ = TableHelper.TableTypes.TABLESTART
         return
     if t.is_char(chr(0x1F)):
         self.typ = TableHelper.TableTypes.TABLEEND
         return
     if not t.is_char(chr(7)):
         return
     text = t.kit.sofa.text
     self.typ = TableHelper.TableTypes.CELLEND
     pos = t.begin_char - 1
     if pos < 0:
         return
     if text[pos] in ('\r', '\n'):
         self.typ = TableHelper.TableTypes.ROWEND
         return
     self.row_span = 1
     self.col_span = 1
     # Walk back over the whitespace run that precedes the marker.
     while pos >= 0 and Utils.isWhitespace(text[pos]):
         if text[pos] == '\t':
             self.col_span += 1
         elif text[pos] == '\f':
             self.row_span += 1
         pos -= 1
コード例 #6
0
 def _initialize() -> None:
     """Populate the direction-word collection and the 2<->3 letter code maps.

     Does nothing when ``MiscLocationHelper.__m_nords`` is already set.
     Otherwise it creates the ``TerminCollection`` of Russian direction
     words and fills ``_m_alpha2_3`` / ``_m_alpha3_2`` from the embedded
     table (the data looks like ISO 3166-1 alpha-2 / alpha-3 country
     codes - confirm against the data source). Existing map entries are
     never overwritten.
     """
     if (MiscLocationHelper.__m_nords is not None):
         return
     MiscLocationHelper.__m_nords = TerminCollection()
     for s in [
             "СЕВЕРНЫЙ", "ЮЖНЫЙ", "ЗАПАДНЫЙ", "ВОСТОЧНЫЙ", "ЦЕНТРАЛЬНЫЙ",
             "БЛИЖНИЙ", "ДАЛЬНИЙ", "СРЕДНИЙ", "СЕВЕР", "ЮГ", "ЗАПАД",
             "ВОСТОК", "СЕВЕРО", "ЮГО", "ЗАПАДНО", "ВОСТОЧНО",
             "СЕВЕРОЗАПАДНЫЙ", "СЕВЕРОВОСТОЧНЫЙ", "ЮГОЗАПАДНЫЙ",
             "ЮГОВОСТОЧНЫЙ"
     ]:
         MiscLocationHelper.__m_nords.add(Termin(s, MorphLang.RU, True))
     # One "<2-letter>\t<3-letter>" record per "\n"-separated line. Written as
     # adjacent literals so that no literal spans a physical line break
     # (a plain quoted string broken across lines is a syntax error).
     table = (
         "\nAF\tAFG\nAX\tALA\nAL\tALB\nDZ\tDZA\nAS\tASM\nAD\tAND\nAO\tAGO\nAI\tAIA"
         "\nAQ\tATA\nAG\tATG\nAR\tARG\nAM\tARM\nAW\tABW\nAU\tAUS\nAT\tAUT\nAZ\tAZE"
         "\nBS\tBHS\nBH\tBHR\nBD\tBGD\nBB\tBRB\nBY\tBLR\nBE\tBEL\nBZ\tBLZ\nBJ\tBEN"
         "\nBM\tBMU\nBT\tBTN\nBO\tBOL\nBA\tBIH\nBW\tBWA\nBV\tBVT\nBR\tBRA\nVG\tVGB"
         "\nIO\tIOT\nBN\tBRN\nBG\tBGR\nBF\tBFA\nBI\tBDI\nKH\tKHM\nCM\tCMR\nCA\tCAN"
         "\nCV\tCPV\nKY\tCYM\nCF\tCAF\nTD\tTCD\nCL\tCHL\nCN\tCHN\nHK\tHKG\nMO\tMAC"
         "\nCX\tCXR\nCC\tCCK\nCO\tCOL\nKM\tCOM\nCG\tCOG\nCD\tCOD\nCK\tCOK\nCR\tCRI"
         "\nCI\tCIV\nHR\tHRV\nCU\tCUB\nCY\tCYP\nCZ\tCZE\nDK\tDNK\nDJ\tDJI\nDM\tDMA"
         "\nDO\tDOM\nEC\tECU\nEG\tEGY\nSV\tSLV\nGQ\tGNQ\nER\tERI\nEE\tEST\nET\tETH"
         "\nFK\tFLK\nFO\tFRO\nFJ\tFJI\nFI\tFIN\nFR\tFRA\nGF\tGUF\nPF\tPYF\nTF\tATF"
         "\nGA\tGAB\nGM\tGMB\nGE\tGEO\nDE\tDEU\nGH\tGHA\nGI\tGIB\nGR\tGRC\nGL\tGRL"
         "\nGD\tGRD\nGP\tGLP\nGU\tGUM\nGT\tGTM\nGG\tGGY\nGN\tGIN\nGW\tGNB\nGY\tGUY"
         "\nHT\tHTI\nHM\tHMD\nVA\tVAT\nHN\tHND\nHU\tHUN\nIS\tISL\nIN\tIND\nID\tIDN"
         "\nIR\tIRN\nIQ\tIRQ\nIE\tIRL\nIM\tIMN\nIL\tISR\nIT\tITA\nJM\tJAM\nJP\tJPN"
         "\nJE\tJEY\nJO\tJOR\nKZ\tKAZ\nKE\tKEN\nKI\tKIR\nKP\tPRK\nKR\tKOR\nKW\tKWT"
         "\nKG\tKGZ\nLA\tLAO\nLV\tLVA\nLB\tLBN\nLS\tLSO\nLR\tLBR\nLY\tLBY\nLI\tLIE"
         "\nLT\tLTU\nLU\tLUX\nMK\tMKD\nMG\tMDG\nMW\tMWI\nMY\tMYS\nMV\tMDV\nML\tMLI"
         "\nMT\tMLT\nMH\tMHL\nMQ\tMTQ\nMR\tMRT\nMU\tMUS\nYT\tMYT\nMX\tMEX\nFM\tFSM"
         "\nMD\tMDA\nMC\tMCO\nMN\tMNG\nME\tMNE\nMS\tMSR\nMA\tMAR\nMZ\tMOZ\nMM\tMMR"
         "\nNA\tNAM\nNR\tNRU\nNP\tNPL\nNL\tNLD\nAN\tANT\nNC\tNCL\nNZ\tNZL\nNI\tNIC"
         "\nNE\tNER\nNG\tNGA\nNU\tNIU\nNF\tNFK\nMP\tMNP\nNO\tNOR\nOM\tOMN\nPK\tPAK"
         "\nPW\tPLW\nPS\tPSE\nPA\tPAN\nPG\tPNG\nPY\tPRY\nPE\tPER\nPH\tPHL\nPN\tPCN"
         "\nPL\tPOL\nPT\tPRT\nPR\tPRI\nQA\tQAT\nRE\tREU\nRO\tROU\nRU\tRUS\nRW\tRWA"
         "\nBL\tBLM\nSH\tSHN\nKN\tKNA\nLC\tLCA\nMF\tMAF\nPM\tSPM\nVC\tVCT\nWS\tWSM"
         "\nSM\tSMR\nST\tSTP\nSA\tSAU\nSN\tSEN\nRS\tSRB\nSC\tSYC\nSL\tSLE\nSG\tSGP"
         "\nSK\tSVK\nSI\tSVN\nSB\tSLB\nSO\tSOM\nZA\tZAF\nGS\tSGS\nSS\tSSD\nES\tESP"
         "\nLK\tLKA\nSD\tSDN\nSR\tSUR\nSJ\tSJM\nSZ\tSWZ\nSE\tSWE\nCH\tCHE\nSY\tSYR"
         "\nTW\tTWN\nTJ\tTJK\nTZ\tTZA\nTH\tTHA\nTL\tTLS\nTG\tTGO\nTK\tTKL\nTO\tTON"
         "\nTT\tTTO\nTN\tTUN\nTR\tTUR\nTM\tTKM\nTC\tTCA\nTV\tTUV\nUG\tUGA\nUA\tUKR"
         "\nAE\tARE\nGB\tGBR\nUS\tUSA\nUM\tUMI\nUY\tURY\nUZ\tUZB\nVU\tVUT\nVE\tVEN"
         "\nVN\tVNM\nVI\tVIR\nWF\tWLF\nEH\tESH\nYE\tYEM\nZM\tZMB\nZW\tZWE "
     )
     for s in Utils.splitString(table, '\n', False):
         ss = s.strip()
         # A valid record is at least "XX<ws>YYY" with whitespace at index 2.
         if ((len(ss) < 6) or not Utils.isWhitespace(ss[2])):
             continue
         cod2 = ss[0:0 + 2]
         cod3 = ss[3:].strip()
         if (len(cod3) != 3):
             continue
         if (cod2 not in MiscLocationHelper._m_alpha2_3):
             MiscLocationHelper._m_alpha2_3[cod2] = cod3
         if (cod3 not in MiscLocationHelper._m_alpha3_2):
             MiscLocationHelper._m_alpha3_2[cod3] = cod2
コード例 #7
0
 def is_cyrillic(str0_: str) -> bool:
     """Tell whether a string consists only of Cyrillic characters, whitespace and '-'.

     Args:
         str0_: string to check; may be None.

     Returns:
         bool: False for None or when any character is neither accepted by
         ``LanguageHelper.is_cyrillic_char`` nor whitespace nor ``'-'``;
         True otherwise (including for the empty string).
     """
     if str0_ is None:
         return False
     for ch in str0_:
         if LanguageHelper.is_cyrillic_char(ch):
             continue
         if Utils.isWhitespace(ch) or ch == '-':
             continue
         return False
     return True
コード例 #8
0
ファイル: Token.py プロジェクト: MihaJjDa/APCLtask
 def newlines_before_count(self) -> int:
     """Count the line breaks in the whitespace run right before this token.

     The text is scanned backwards from ``begin_char - 1`` until a
     non-whitespace character is met. Each line feed counts as 1, a
     carriage return counts as 1 unless the character after it in the text
     is a line feed (so "\\r\\n" counts once), and a form feed counts as 10.

     Returns:
         int: the accumulated count.
     """
     text = self.kit.sofa.text
     count = 0
     # Character examined on the previous step, i.e. the one that follows
     # the current character in the text.
     following = chr(0)
     pos = self.begin_char - 1
     while pos >= 0:
         ch = text[pos]
         if ch == '\n':
             count += 1
         elif ch == '\r' and following != '\n':
             count += 1
         elif ch == '\f':
             count += 10
         elif not Utils.isWhitespace(ch):
             break
         following = ch
         pos -= 1
     return count
コード例 #9
0
ファイル: Token.py プロジェクト: MihaJjDa/APCLtask
 def newlines_after_count(self) -> int:
     """Count the line breaks in the whitespace run right after this token.

     The text is scanned forwards from ``end_char + 1`` until a
     non-whitespace character or the end of the text is met. Each carriage
     return counts as 1, a line feed counts as 1 unless it directly follows
     a carriage return (so "\\r\\n" counts once), and a form feed counts as 10.

     Returns:
         int: the accumulated count.
     """
     text = self.kit.sofa.text
     count = 0
     preceding = chr(0)
     for pos in range(self.end_char + 1, len(text)):
         ch = text[pos]
         if ch == '\r':
             count += 1
         elif ch == '\n' and preceding != '\r':
             count += 1
         elif ch == '\f':
             count += 10
         elif not Utils.isWhitespace(ch):
             break
         preceding = ch
     return count
コード例 #10
0
 def initialize() -> None:
     """Build ``UnicodeInfo.ALL_CHARS`` with one entry per code point below 0x10000.

     Runs only once (guarded by ``UnicodeInfo.__m_inited``). Each entry is
     classified as whitespace, digit or letter (with Cyrillic/Latin, vowel
     and case flags); any other character gets hyphen, quote and apostrophe
     flags where applicable. The characters 'º' and '°' are deliberately
     left unclassified. Code points in [0x300, 0x370) are additionally
     flagged ``is_udaren``.
     """
     if UnicodeInfo.__m_inited:
         return
     UnicodeInfo.__m_inited = True
     UnicodeInfo.ALL_CHARS = list()
     upper_cyr_vowels = "АЕЁИОУЮЯЫЭЄІЇЎӘӨҰҮІ"
     cyr_vowels = upper_cyr_vowels + upper_cyr_vowels.lower()
     latin_vowels = "AEIOUYaeiouy"
     # Same characters, in the same order, as the original comparison chain.
     hyphen_chars = ('-', '–', '¬', '-', chr(0x00AD), chr(0x2011), '-',
                     '—', '–', '−', '-')
     quote_chars = "\"'`“”’"
     apostrophe_chars = "'`’"
     for code in range(0x10000):
         ch = chr(code)
         info = UnicodeInfo(code)
         if Utils.isWhitespace(ch):
             info.is_whitespace = True
         elif str.isdigit(ch):
             info.is_digit = True
         elif ch in ('º', '°'):
             pass
         elif str.isalpha(ch):
             info.is_letter = True
             if 0x400 <= code < 0x500:
                 info.is_cyrillic = True
                 if ch in cyr_vowels:
                     info.is_vowel = True
             elif code < 0x200:
                 info.is_latin = True
                 if ch in latin_vowels:
                     info.is_vowel = True
             if str.isupper(ch):
                 info.is_upper = True
             if str.islower(ch):
                 info.is_lower = True
         else:
             if ch in hyphen_chars:
                 info.is_hiphen = True
             if ch in quote_chars:
                 info.is_quot = True
             if ch in apostrophe_chars:
                 info.is_apos = True
                 info.is_quot = True
         if 0x300 <= code < 0x370:
             info.is_udaren = True
         UnicodeInfo.ALL_CHARS.append(info)
コード例 #11
0
 def __getNameWithoutBrackets(begin: 'Token',
                              end: 'Token',
                              normalize_first_noun_group: bool = False,
                              normal_first_group_single: bool = False,
                              ignore_geo_referent: bool = False) -> str:
     """ Get the string value between two tokens, excluding quotes and brackets
     
     Args:
         begin(Token): first token
         end(Token): last token
         normalize_first_noun_group(bool): whether to normalize the first noun group (nominative case)
         normal_first_group_single(bool): whether to convert the first noun group to singular
         ignore_geo_referent(bool): ignore geographic entities inside
     
     Returns:
         str: the resulting text with trailing '*' and whitespace characters
         removed, or None when the text consists only of such characters
     """
     res = None
     # When begin/end form a bracket (or quote) pair, step inside it.
     if (BracketHelper.canBeStartOfSequence(begin, False, False) and
             BracketHelper.canBeEndOfSequence(end, False, begin, False)):
         begin = begin.next0_
         end = end.previous
     if (normalize_first_noun_group
             and not begin.morph.class0_.is_preposition):
         npt = NounPhraseHelper.tryParse(
             begin, NounPhraseParseAttr.REFERENTCANBENOUN, 0)
         # Discard a phrase whose noun has an undefined dictionary class and
         # which has no adjectives.
         if (npt is not None):
             if (npt.noun.getMorphClassInDictionary().is_undefined
                     and len(npt.adjectives) == 0):
                 npt = (None)
         # Discard a phrase that extends past the end token.
         if (npt is not None and npt.end_token.end_char > end.end_char):
             npt = (None)
         if (npt is not None):
             res = npt.getNormalCaseText(None, normal_first_group_single,
                                         MorphGender.UNDEFINED, False)
             te = npt.end_token.next0_
             # A comma followed by a text token (still inside the range) whose
             # morph class is both verb and adjective: try to append that
             # word in a form agreeing with the noun phrase.
             if (((te is not None and te.next0_ is not None and te.is_comma)
                  and (isinstance(te.next0_, TextToken))
                  and te.next0_.end_char <= end.end_char)
                     and te.next0_.morph.class0_.is_verb
                     and te.next0_.morph.class0_.is_adjective):
                 for it in te.next0_.morph.items:
                     # Only the first item matching the phrase in gender, case
                     # and number is used (note the break below).
                     if (it.gender == npt.morph.gender
                             or (((it.gender) & (npt.morph.gender))) !=
                         (MorphGender.UNDEFINED)):
                         if (not (
                             (it.case_) & npt.morph.case_).is_undefined):
                             if (it.number == npt.morph.number or
                                 (((it.number) & (npt.morph.number))) !=
                                 (MorphNumber.UNDEFINED)):
                                 var = (te.next0_).term
                                 if (isinstance(it, MorphWordForm)):
                                     var = (it).normal_case
                                 bi = MorphBaseInfo._new549(
                                     MorphClass.ADJECTIVE, npt.morph.gender,
                                     npt.morph.number, npt.morph.language)
                                 var = Morphology.getWordform(var, bi)
                                 if (var is not None):
                                     res = "{0}, {1}".format(res, var)
                                     # Skip the comma and the appended word.
                                     te = te.next0_.next0_
                                 break
             # Append whatever remains after the noun phrase, up to end.
             if (te is not None and te.end_char <= end.end_char):
                 s = ProperNameHelper.getNameEx(te, end,
                                                MorphClass.UNDEFINED,
                                                MorphCase.UNDEFINED,
                                                MorphGender.UNDEFINED, True,
                                                ignore_geo_referent)
                 if (not Utils.isNullOrEmpty(s)):
                     # No separating space when the rest starts with a
                     # non-alphanumeric character.
                     if (not str.isalnum(s[0])):
                         res = "{0}{1}".format(res, s)
                     else:
                         res = "{0} {1}".format(res, s)
         # No usable noun phrase: for a Cyrillic text token with a known
         # dictionary class, normalize just that token and append the rest.
         elif ((isinstance(begin, TextToken))
               and begin.chars.is_cyrillic_letter):
             mm = begin.getMorphClassInDictionary()
             if (not mm.is_undefined):
                 res = begin.getNormalCaseText(mm, False,
                                               MorphGender.UNDEFINED, False)
                 if (begin.end_char < end.end_char):
                     res = "{0} {1}".format(
                         res,
                         ProperNameHelper.getNameEx(begin.next0_, end,
                                                    MorphClass.UNDEFINED,
                                                    MorphCase.UNDEFINED,
                                                    MorphGender.UNDEFINED,
                                                    True, False))
     # Fallback: take the whole range without normalizing the first group.
     if (res is None):
         res = ProperNameHelper.getNameEx(begin, end, MorphClass.UNDEFINED,
                                          MorphCase.UNDEFINED,
                                          MorphGender.UNDEFINED, True,
                                          ignore_geo_referent)
     # Strip trailing '*' and whitespace; k counts the characters to drop.
     if (not Utils.isNullOrEmpty(res)):
         k = 0
         i = len(res) - 1
         while i >= 0:
             if (res[i] == '*' or Utils.isWhitespace(res[i])):
                 pass
             else:
                 break
             i -= 1
             k += 1
         if (k > 0):
             # Nothing but '*' and whitespace: there is no name.
             if (k == len(res)):
                 return None
             res = res[0:0 + len(res) - k]
     return res
コード例 #12
0
ファイル: InnerMorphology.py プロジェクト: MihaJjDa/APCLtask
 def run(self, text: str, only_tokenizing: bool, dlang: 'MorphLang',
         progress: EventHandler,
         good_text: bool) -> typing.List['MorphToken']:
     """ Произвести морфологический анализ текста
     
     Args:
         text(str): исходный текст
         lang: язык (если null, то попробует определить)
     
     Returns:
         typing.List[MorphToken]: последовательность результирующих морфем
     """
     if (Utils.isNullOrEmpty(text)):
         return None
     twr = TextWrapper(text, good_text)
     twrch = twr.chars
     res = list()
     uni_lex = dict()
     term0 = None
     pure_rus_words = 0
     pure_ukr_words = 0
     pure_by_words = 0
     pure_kz_words = 0
     tot_rus_words = 0
     tot_ukr_words = 0
     tot_by_words = 0
     tot_kz_words = 0
     i = 0
     first_pass2708 = True
     while True:
         if first_pass2708: first_pass2708 = False
         else: i += 1
         if (not (i < twr.length)): break
         ty = InnerMorphology._getCharTyp(twrch[i])
         if (ty == 0):
             continue
         if (ty > 2):
             j = (i + 1)
         else:
             j = (i + 1)
             while j < twr.length:
                 if (InnerMorphology._getCharTyp(twrch[j]) != ty):
                     break
                 j += 1
         wstr = text[i:i + j - i]
         term = None
         if (good_text):
             term = wstr
         else:
             trstr = LanguageHelper.transliteralCorrection(
                 wstr, term0, False)
             term = LanguageHelper.correctWord(trstr)
         if (Utils.isNullOrEmpty(term)):
             i = (j - 1)
             continue
         lang = InnerMorphology.__detectLang(twr, i, j - 1, term)
         if (lang == MorphLang.UA):
             pure_ukr_words += 1
         elif (lang == MorphLang.RU):
             pure_rus_words += 1
         elif (lang == MorphLang.BY):
             pure_by_words += 1
         elif (lang == MorphLang.KZ):
             pure_kz_words += 1
         if ((((lang) & MorphLang.RU)) != MorphLang.UNKNOWN):
             tot_rus_words += 1
         if ((((lang) & MorphLang.UA)) != MorphLang.UNKNOWN):
             tot_ukr_words += 1
         if ((((lang) & MorphLang.BY)) != MorphLang.UNKNOWN):
             tot_by_words += 1
         if ((((lang) & MorphLang.KZ)) != MorphLang.UNKNOWN):
             tot_kz_words += 1
         if (ty == 1):
             term0 = term
         lemmas = None
         if (ty == 1 and not only_tokenizing):
             wraplemmas7 = RefOutArgWrapper(None)
             inoutres8 = Utils.tryGetValue(uni_lex, term, wraplemmas7)
             lemmas = wraplemmas7.value
             if (not inoutres8):
                 lemmas = InnerMorphology.UniLexWrap._new6(lang)
                 uni_lex[term] = lemmas
         tok = MorphToken()
         tok.term = term
         tok.begin_char = i
         if (i == 733860):
             pass
         tok.end_char = (j - 1)
         tok.tag = (lemmas)
         res.append(tok)
         i = (j - 1)
     def_lang = MorphLang(dlang)
     if (pure_rus_words > pure_ukr_words and pure_rus_words > pure_by_words
             and pure_rus_words > pure_kz_words):
         def_lang = MorphLang.RU
     elif (tot_rus_words > tot_ukr_words and tot_rus_words > tot_by_words
           and tot_rus_words > tot_kz_words):
         def_lang = MorphLang.RU
     elif (pure_ukr_words > pure_rus_words
           and pure_ukr_words > pure_by_words
           and pure_ukr_words > pure_kz_words):
         def_lang = MorphLang.UA
     elif (tot_ukr_words > tot_rus_words and tot_ukr_words > tot_by_words
           and tot_ukr_words > tot_kz_words):
         def_lang = MorphLang.UA
     elif (pure_kz_words > pure_rus_words and pure_kz_words > pure_ukr_words
           and pure_kz_words > pure_by_words):
         def_lang = MorphLang.KZ
     elif (tot_kz_words > tot_rus_words and tot_kz_words > tot_ukr_words
           and tot_kz_words > tot_by_words):
         def_lang = MorphLang.KZ
     elif (pure_by_words > pure_rus_words and pure_by_words > pure_ukr_words
           and pure_by_words > pure_kz_words):
         def_lang = MorphLang.BY
     elif (tot_by_words > tot_rus_words and tot_by_words > tot_ukr_words
           and tot_by_words > tot_kz_words):
         if (tot_rus_words > 10 and tot_by_words > (tot_rus_words + 20)):
             def_lang = MorphLang.BY
         elif (tot_rus_words == 0 or tot_by_words >= (tot_rus_words * 2)):
             def_lang = MorphLang.BY
     if (((def_lang.is_undefined or def_lang.is_ua)) and tot_rus_words > 0):
         if (((tot_ukr_words > tot_rus_words
               and InnerMorphology.M_ENGINE_UA.language.is_ua))
                 or ((tot_by_words > tot_rus_words
                      and InnerMorphology.M_ENGINE_BY.language.is_by))
                 or ((tot_kz_words > tot_rus_words
                      and InnerMorphology.M_ENGINE_KZ.language.is_kz))):
             cou0 = 0
             tot_kz_words = 0
             tot_ukr_words = tot_kz_words
             tot_by_words = tot_ukr_words
             tot_rus_words = tot_by_words
             for kp in uni_lex.items():
                 lang = MorphLang()
                 wraplang9 = RefOutArgWrapper(lang)
                 kp[1].word_forms = self.__processOneWord(kp[0], wraplang9)
                 lang = wraplang9.value
                 if (kp[1].word_forms is not None):
                     for wf in kp[1].word_forms:
                         lang |= wf.language
                 kp[1].lang = lang
                 if (lang.is_ru):
                     tot_rus_words += 1
                 if (lang.is_ua):
                     tot_ukr_words += 1
                 if (lang.is_by):
                     tot_by_words += 1
                 if (lang.is_kz):
                     tot_kz_words += 1
                 if (lang.is_cyrillic):
                     cou0 += 1
                 if (cou0 >= 100):
                     break
             if (tot_rus_words > ((math.floor(tot_by_words / 2)))
                     and tot_rus_words > ((math.floor(tot_ukr_words / 2)))):
                 def_lang = MorphLang.RU
             elif (tot_ukr_words > ((math.floor(tot_rus_words / 2)))
                   and tot_ukr_words > ((math.floor(tot_by_words / 2)))):
                 def_lang = MorphLang.UA
             elif (tot_by_words > ((math.floor(tot_rus_words / 2)))
                   and tot_by_words > ((math.floor(tot_ukr_words / 2)))):
                 def_lang = MorphLang.BY
         elif (def_lang.is_undefined):
             def_lang = MorphLang.RU
     cou = 0
     tot_kz_words = 0
     tot_ukr_words = tot_kz_words
     tot_by_words = tot_ukr_words
     tot_rus_words = tot_by_words
     for kp in uni_lex.items():
         lang = def_lang
         if (lang.is_undefined):
             if (tot_rus_words > tot_by_words
                     and tot_rus_words > tot_ukr_words
                     and tot_rus_words > tot_kz_words):
                 lang = MorphLang.RU
             elif (tot_ukr_words > tot_rus_words
                   and tot_ukr_words > tot_by_words
                   and tot_ukr_words > tot_kz_words):
                 lang = MorphLang.UA
             elif (tot_by_words > tot_rus_words
                   and tot_by_words > tot_ukr_words
                   and tot_by_words > tot_kz_words):
                 lang = MorphLang.BY
             elif (tot_kz_words > tot_rus_words
                   and tot_kz_words > tot_ukr_words
                   and tot_kz_words > tot_by_words):
                 lang = MorphLang.KZ
         wraplang10 = RefOutArgWrapper(lang)
         kp[1].word_forms = self.__processOneWord(kp[0], wraplang10)
         lang = wraplang10.value
         kp[1].lang = lang
         if ((((lang) & MorphLang.RU)) != MorphLang.UNKNOWN):
             tot_rus_words += 1
         if ((((lang) & MorphLang.UA)) != MorphLang.UNKNOWN):
             tot_ukr_words += 1
         if ((((lang) & MorphLang.BY)) != MorphLang.UNKNOWN):
             tot_by_words += 1
         if ((((lang) & MorphLang.KZ)) != MorphLang.UNKNOWN):
             tot_kz_words += 1
         if (progress is not None):
             self.__onProgress(cou, len(uni_lex), progress)
         cou += 1
     debug_token = None
     empty_list = None
     for r in res:
         uni = Utils.asObjectOrNull(r.tag, InnerMorphology.UniLexWrap)
         r.tag = None
         if (uni is None or uni.word_forms is None
                 or len(uni.word_forms) == 0):
             if (empty_list is None):
                 empty_list = list()
             r.word_forms = empty_list
             if (uni is not None):
                 r.language = uni.lang
         else:
             r.word_forms = uni.word_forms
         if (r.begin_char == 733860):
             debug_token = r
     if (not good_text):
         i = 0
         first_pass2709 = True
         while True:
             if first_pass2709: first_pass2709 = False
             else: i += 1
             if (not (i < (len(res) - 2))): break
             ui0 = twrch[res[i].begin_char]
             ui1 = twrch[res[i + 1].begin_char]
             ui2 = twrch[res[i + 2].begin_char]
             if (ui1.is_quot):
                 p = res[i + 1].begin_char
                 if ((p >= 2 and "БбТт".find(text[p - 1]) >= 0 and
                      ((p + 3) < len(text)))
                         and "ЕеЯяЁё".find(text[p + 1]) >= 0):
                     wstr = LanguageHelper.transliteralCorrection(
                         LanguageHelper.correctWord("{0}Ъ{1}".format(
                             res[i].getSourceText(text),
                             res[i + 2].getSourceText(text))), None, False)
                     li = self.__processOneWord0(wstr)
                     if (li is not None and len(li) > 0
                             and li[0].is_in_dictionary):
                         res[i].end_char = res[i + 2].end_char
                         res[i].term = wstr
                         res[i].word_forms = li
                         del res[i + 1:i + 1 + 2]
                 elif ((ui1.is_apos and p > 0 and str.isalpha(text[p - 1]))
                       and ((p + 1) < len(text))
                       and str.isalpha(text[p + 1])):
                     if (def_lang == MorphLang.UA
                             or (((res[i].language) & MorphLang.UA)) !=
                             MorphLang.UNKNOWN
                             or (((res[i + 2].language) & MorphLang.UA)) !=
                             MorphLang.UNKNOWN):
                         wstr = LanguageHelper.transliteralCorrection(
                             LanguageHelper.correctWord("{0}{1}".format(
                                 res[i].getSourceText(text),
                                 res[i + 2].getSourceText(text))), None,
                             False)
                         li = self.__processOneWord0(wstr)
                         okk = True
                         if (okk):
                             res[i].end_char = res[i + 2].end_char
                             res[i].term = wstr
                             if (li is None):
                                 li = list()
                             res[i].word_forms = li
                             if (li is not None and len(li) > 0):
                                 res[i].language = li[0].language
                             del res[i + 1:i + 1 + 2]
             elif (((ui1.uni_char == '3' or ui1.uni_char == '4'))
                   and res[i + 1].length == 1):
                 src = ("З" if ui1.uni_char == '3' else "Ч")
                 i0 = i + 1
                 if ((res[i].end_char + 1) == res[i + 1].begin_char
                         and ui0.is_cyrillic):
                     i0 -= 1
                     src = (res[i0].getSourceText(text) + src)
                 i1 = i + 1
                 if ((res[i + 1].end_char + 1) == res[i + 2].begin_char
                         and ui2.is_cyrillic):
                     i1 += 1
                     src += res[i1].getSourceText(text)
                 if (len(src) > 2):
                     wstr = LanguageHelper.transliteralCorrection(
                         LanguageHelper.correctWord(src), None, False)
                     li = self.__processOneWord0(wstr)
                     if (li is not None and len(li) > 0
                             and li[0].is_in_dictionary):
                         res[i0].end_char = res[i1].end_char
                         res[i0].term = wstr
                         res[i0].word_forms = li
                         del res[i0 + 1:i0 + 1 + i1 - i0]
             elif ((ui1.is_hiphen and ui0.is_letter and ui2.is_letter)
                   and res[i].end_char > res[i].begin_char
                   and res[i + 2].end_char > res[i + 2].begin_char):
                 newline = False
                 sps = 0
                 j = (res[i + 1].end_char + 1)
                 while j < res[i + 2].begin_char:
                     if (text[j] == '\r' or text[j] == '\n'):
                         newline = True
                         sps += 1
                     elif (not Utils.isWhitespace(text[j])):
                         break
                     else:
                         sps += 1
                     j += 1
                 full_word = LanguageHelper.correctWord(
                     res[i].getSourceText(text) +
                     res[i + 2].getSourceText(text))
                 if (not newline):
                     if (full_word in uni_lex or full_word == "ИЗЗА"):
                         newline = True
                     elif (text[res[i + 1].begin_char] == (chr(0x00AD))):
                         newline = True
                     elif (LanguageHelper.endsWithEx(
                             res[i].getSourceText(text), "О", "о", None,
                             None) and len(res[i + 2].word_forms) > 0
                           and res[i + 2].word_forms[0].is_in_dictionary):
                         if (text[res[i + 1].begin_char] == '¬'):
                             li = self.__processOneWord0(full_word)
                             if (li is not None and len(li) > 0
                                     and li[0].is_in_dictionary):
                                 newline = True
                     elif ((res[i].end_char + 2) == res[i + 2].begin_char):
                         if (not str.isupper(text[res[i + 2].begin_char])
                                 and (sps < 2) and len(full_word) > 4):
                             newline = True
                             if ((i + 3) < len(res)):
                                 ui3 = twrch[res[i + 3].begin_char]
                                 if (ui3.is_hiphen):
                                     newline = False
                     elif (((res[i].end_char + 1) == res[i + 1].begin_char
                            and sps > 0 and (sps < 3))
                           and len(full_word) > 4):
                         newline = True
                 if (newline):
                     li = self.__processOneWord0(full_word)
                     if (li is not None and len(li) > 0
                             and ((li[0].is_in_dictionary
                                   or full_word in uni_lex))):
                         res[i].end_char = res[i + 2].end_char
                         res[i].term = full_word
                         res[i].word_forms = li
                         del res[i + 1:i + 1 + 2]
                 else:
                     pass
             elif ((ui1.is_letter and ui0.is_letter and res[i].length > 2)
                   and res[i + 1].length > 1):
                 if (ui0.is_upper != ui1.is_upper):
                     continue
                 if (not ui0.is_cyrillic or not ui1.is_cyrillic):
                     continue
                 newline = False
                 j = (res[i].end_char + 1)
                 while j < res[i + 1].begin_char:
                     if (twrch[j].code == 0xD or twrch[j].code == 0xA):
                         newline = True
                         break
                     j += 1
                 if (not newline):
                     continue
                 full_word = LanguageHelper.correctWord(
                     res[i].getSourceText(text) +
                     res[i + 1].getSourceText(text))
                 if (not full_word in uni_lex):
                     continue
                 li = self.__processOneWord0(full_word)
                 if (li is not None and len(li) > 0
                         and li[0].is_in_dictionary):
                     res[i].end_char = res[i + 1].end_char
                     res[i].term = full_word
                     res[i].word_forms = li
                     del res[i + 1]
     i = 0
     first_pass2710 = True
     while True:
         if first_pass2710: first_pass2710 = False
         else: i += 1
         if (not (i < len(res))): break
         mt = res[i]
         mt.char_info = CharsInfo()
         ui0 = twrch[mt.begin_char]
         ui00 = UnicodeInfo.ALL_CHARS[ord((res[i].term[0]))]
         j = (mt.begin_char + 1)
         while j <= mt.end_char:
             if (ui0.is_letter):
                 break
             ui0 = twrch[j]
             j += 1
         if (ui0.is_letter):
             res[i].char_info.is_letter = True
             if (ui00.is_latin):
                 res[i].char_info.is_latin_letter = True
             elif (ui00.is_cyrillic):
                 res[i].char_info.is_cyrillic_letter = True
             if (res[i].language == MorphLang.UNKNOWN):
                 if (LanguageHelper.isCyrillic(mt.term)):
                     res[i].language = (MorphLang.RU if
                                        def_lang.is_undefined else def_lang)
             if (good_text):
                 continue
             all_up = True
             all_lo = True
             j = mt.begin_char
             while j <= mt.end_char:
                 if (twrch[j].is_upper or twrch[j].is_digit):
                     all_lo = False
                 else:
                     all_up = False
                 j += 1
             if (all_up):
                 mt.char_info.is_all_upper = True
             elif (all_lo):
                 mt.char_info.is_all_lower = True
             elif (((ui0.is_upper or twrch[mt.begin_char].is_digit))
                   and mt.end_char > mt.begin_char):
                 all_lo = True
                 j = (mt.begin_char + 1)
                 while j <= mt.end_char:
                     if (twrch[j].is_upper or twrch[j].is_digit):
                         all_lo = False
                         break
                     j += 1
                 if (all_lo):
                     mt.char_info.is_capital_upper = True
                 elif (twrch[mt.end_char].is_lower
                       and (mt.end_char - mt.begin_char) > 1):
                     all_up = True
                     j = mt.begin_char
                     while j < mt.end_char:
                         if (twrch[j].is_lower):
                             all_up = False
                             break
                         j += 1
                     if (all_up):
                         mt.char_info.is_last_lower = True
         if (mt.char_info.is_last_lower and mt.length > 2
                 and mt.char_info.is_cyrillic_letter):
             pref = text[mt.begin_char:mt.begin_char + mt.end_char -
                         mt.begin_char]
             ok = False
             for wf in mt.word_forms:
                 if (wf.normal_case == pref or wf.normal_full == pref):
                     ok = True
                     break
             if (not ok):
                 mt.word_forms = list(mt.word_forms)
                 mt.word_forms.insert(
                     0, MorphWordForm._new11(pref, MorphClass.NOUN, 1))
     if (good_text or only_tokenizing):
         return res
     i = 0
     first_pass2711 = True
     while True:
         if first_pass2711: first_pass2711 = False
         else: i += 1
         if (not (i < len(res))): break
         if (res[i].length == 1 and res[i].char_info.is_latin_letter):
             ch = res[i].term[0]
             if (ch == 'C' or ch == 'A' or ch == 'P'):
                 pass
             else:
                 continue
             is_rus = False
             for ii in range(i - 1, -1, -1):
                 if ((res[ii].end_char + 1) != res[ii + 1].begin_char):
                     break
                 elif (res[ii].char_info.is_letter):
                     is_rus = res[ii].char_info.is_cyrillic_letter
                     break
             if (not is_rus):
                 ii = i + 1
                 while ii < len(res):
                     if ((res[ii - 1].end_char + 1) != res[ii].begin_char):
                         break
                     elif (res[ii].char_info.is_letter):
                         is_rus = res[ii].char_info.is_cyrillic_letter
                         break
                     ii += 1
             if (is_rus):
                 res[i].term = LanguageHelper.transliteralCorrection(
                     res[i].term, None, True)
                 res[i].char_info.is_cyrillic_letter = True
                 res[i].char_info.is_latin_letter = True
     for r in res:
         if (r.char_info.is_all_upper or r.char_info.is_capital_upper):
             if (r.language.is_cyrillic):
                 ok = False
                 for wf in r.word_forms:
                     if (wf.class0_.is_proper_surname):
                         ok = True
                         break
                 if (not ok):
                     r.word_forms = list(r.word_forms)
                     InnerMorphology.M_ENGINE_RU.processSurnameVariants(
                         r.term, r.word_forms)
     for r in res:
         for mv in r.word_forms:
             if (mv.normal_case is None):
                 mv.normal_case = r.term
     i = 0
     while i < (len(res) - 2):
         if (res[i].char_info.is_latin_letter
                 and res[i].char_info.is_all_upper and res[i].length == 1):
             if (twrch[res[i + 1].begin_char].is_quot
                     and res[i + 2].char_info.is_latin_letter
                     and res[i + 2].length > 2):
                 if ((res[i].end_char + 1) == res[i + 1].begin_char and
                     (res[i + 1].end_char + 1) == res[i + 2].begin_char):
                     wstr = "{0}{1}".format(res[i].term, res[i + 2].term)
                     li = self.__processOneWord0(wstr)
                     if (li is not None):
                         res[i].word_forms = li
                     res[i].end_char = res[i + 2].end_char
                     res[i].term = wstr
                     if (res[i + 2].char_info.is_all_lower):
                         res[i].char_info.is_all_upper = False
                         res[i].char_info.is_capital_upper = True
                     elif (not res[i + 2].char_info.is_all_upper):
                         res[i].char_info.is_all_upper = False
                     del res[i + 1:i + 1 + 2]
         i += 1
     i = 0
     first_pass2712 = True
     while True:
         if first_pass2712: first_pass2712 = False
         else: i += 1
         if (not (i < (len(res) - 1))): break
         if (not res[i].char_info.is_letter
                 and not res[i + 1].char_info.is_letter
                 and (res[i].end_char + 1) == res[i + 1].begin_char):
             if (twrch[res[i].begin_char].is_hiphen
                     and twrch[res[i + 1].begin_char].is_hiphen):
                 if (i == 0 or not twrch[res[i - 1].begin_char].is_hiphen):
                     pass
                 else:
                     continue
                 if ((i + 2) == len(res)
                         or not twrch[res[i + 2].begin_char].is_hiphen):
                     pass
                 else:
                     continue
                 res[i].end_char = res[i + 1].end_char
                 del res[i + 1]
     return res
コード例 #13
0
 def __get_name_without_brackets(begin: 'Token',
                                 end: 'Token',
                                 normalize_first_noun_group: bool = False,
                                 normal_first_group_single: bool = False,
                                 ignore_geo_referent: bool = False) -> str:
     """Build the name text for the token span ``begin`` .. ``end``.

     When ``BracketHelper`` reports that ``begin`` can open and ``end`` can
     close a bracket sequence, both ends are moved one token inwards before
     any text is produced.

     Args:
         begin: First token of the span.
         end: Last token of the span.
         normalize_first_noun_group: If true (and ``begin`` is not a
             preposition) the leading noun phrase is rendered through
             ``get_normal_case_text`` and the remainder of the span is
             appended via ``ProperNameHelper.get_name_ex``.
         normal_first_group_single: Request ``MorphNumber.SINGULAR`` for the
             leading noun phrase instead of ``MorphNumber.UNDEFINED``.
         ignore_geo_referent: Forwarded to ``ProperNameHelper.get_name_ex``
             (except in the single-Cyrillic-word branch, which passes
             ``False``).

     Returns:
         The resulting text with trailing ``'*'`` and whitespace characters
         removed, or ``None`` when the text consists only of such characters.
     """
     opens = BracketHelper.can_be_start_of_sequence(begin, False, False)
     if opens and BracketHelper.can_be_end_of_sequence(
             end, False, begin, False):
         begin, end = begin.next0_, end.previous

     name = None
     if normalize_first_noun_group and not begin.morph.class0_.is_preposition:
         phrase = NounPhraseHelper.try_parse(
             begin, NounPhraseParseAttr.REFERENTCANBENOUN, 0, None)
         # A bare noun that is unknown to the dictionary is not trusted.
         if (phrase is not None
                 and phrase.noun.get_morph_class_in_dictionary().is_undefined
                 and len(phrase.adjectives) == 0):
             phrase = None
         # The phrase must not run past the end of the span.
         if phrase is not None and phrase.end_token.end_char > end.end_char:
             phrase = None

         if phrase is not None:
             wanted_number = (MorphNumber.SINGULAR if normal_first_group_single
                              else MorphNumber.UNDEFINED)
             name = phrase.get_normal_case_text(None, wanted_number,
                                                MorphGender.UNDEFINED, False)
             rest = phrase.end_token.next0_
             # Pattern "<noun phrase>, <word that is both verb and adjective>":
             # the word after the comma is re-inflected to agree with the
             # phrase and appended after ", ".
             if (rest is not None and rest.next0_ is not None
                     and rest.is_comma
                     and isinstance(rest.next0_, TextToken)
                     and rest.next0_.end_char <= end.end_char
                     and rest.next0_.morph.class0_.is_verb
                     and rest.next0_.morph.class0_.is_adjective):
                 follower = rest.next0_
                 for form in follower.morph.items:
                     gender_fits = (
                         form.gender == phrase.morph.gender
                         or (form.gender & phrase.morph.gender) !=
                         MorphGender.UNDEFINED)
                     if not gender_fits:
                         continue
                     if (form.case_ & phrase.morph.case_).is_undefined:
                         continue
                     number_fits = (
                         form.number == phrase.morph.number
                         or (form.number & phrase.morph.number) !=
                         MorphNumber.UNDEFINED)
                     if not number_fits:
                         continue
                     # Only the first agreeing form is ever used.
                     word = follower.term
                     if isinstance(form, MorphWordForm):
                         word = form.normal_case
                     target = MorphBaseInfo._new492(
                         MorphClass.ADJECTIVE, phrase.morph.gender,
                         phrase.morph.number, phrase.morph.language)
                     word = MorphologyService.get_wordform(word, target)
                     if word is not None:
                         name = "{0}, {1}".format(name, word)
                         rest = follower.next0_
                     break
             if rest is not None and rest.end_char <= end.end_char:
                 remainder = ProperNameHelper.get_name_ex(
                     rest, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED,
                     MorphGender.UNDEFINED, True, ignore_geo_referent)
                 if not Utils.isNullOrEmpty(remainder):
                     # No separating space when the remainder starts with a
                     # non-alphanumeric character.
                     pattern = ("{0} {1}" if str.isalnum(remainder[0])
                                else "{0}{1}")
                     name = pattern.format(name, remainder)
         elif isinstance(begin, TextToken) and begin.chars.is_cyrillic_letter:
             dict_class = begin.get_morph_class_in_dictionary()
             if not dict_class.is_undefined:
                 name = begin.get_normal_case_text(
                     dict_class, MorphNumber.UNDEFINED, MorphGender.UNDEFINED,
                     False)
                 if begin.end_char < end.end_char:
                     remainder = ProperNameHelper.get_name_ex(
                         begin.next0_, end, MorphClass.UNDEFINED,
                         MorphCase.UNDEFINED, MorphGender.UNDEFINED, True,
                         False)
                     name = "{0} {1}".format(name, remainder)

     if name is None:
         # Nothing was normalised: take the whole span as is.
         name = ProperNameHelper.get_name_ex(
             begin, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED,
             MorphGender.UNDEFINED, True, ignore_geo_referent)

     if not Utils.isNullOrEmpty(name):
         # Find how many leading characters survive once trailing '*' and
         # whitespace are cut off.
         keep = len(name)
         while keep > 0 and (name[keep - 1] == '*'
                             or Utils.isWhitespace(name[keep - 1])):
             keep -= 1
         if keep < len(name):
             if keep == 0:
                 return None
             name = name[:keep]
     return name
コード例 #14
0
 def __TryParse(t: 'Token',
                prev: 'TransItemToken',
                after_conj: bool,
                attach_high: bool = False) -> 'TransItemToken':
     """Try to recognise a single transport-description item starting at ``t``.

     A fixed sequence of heuristics is tried and the first one that succeeds
     returns its result.  The item kinds produced here are the values of
     ``TransItemToken.Typs`` used below: ORG, ROUTE, DATE, NUMBER-derived
     items, NOUN/BRAND (from ``M_ONTOLOGY``), CLASS, MODEL and NAME.

     Args:
         t: Token to start from; ``None`` yields ``None``.
         prev: The previously parsed item or ``None``.  Its ``typ`` and
             ``kind`` gate the context-dependent branches (model after a
             brand, quoted name after a noun, ...).
         after_conj: Relaxes the ``prev.typ == NOUN`` requirement of the
             quoted-text branch and enables the capitalised-name branch
             regardless of ``prev``.
         attach_high: When true, ontology terms flagged ``is_doubt`` are
             accepted without supporting context from ``prev``.

     Returns:
         A ``TransItemToken`` or ``None`` when no heuristic matched.
     """
     if (t is None):
         return None
     # t1 is a look-ahead cursor; t itself stays the start of the item.
     t1 = t
     if (t1.isChar(',')):
         t1 = t1.next0_
     # Skip the verb "to belong" (second isValue argument looks like the
     # Ukrainian variant - TODO confirm against isValue's contract).
     if (t1 is not None and t1.isValue("ПРИНАДЛЕЖАТЬ", "НАЛЕЖАТИ")):
         t1 = t1.next0_
     # An organization referent becomes an ORG item spanning t..t1.
     if (isinstance(t1, ReferentToken)):
         if (t1.getReferent().type_name == "ORGANIZATION"):
             return TransItemToken._new2521(t, t1, TransItemToken.Typs.ORG,
                                            t1.getReferent(), t1.morph)
     # `route` records that an explicit route word was seen, which later
     # allows a route consisting of a single geo referent.
     route = False
     if (t1 is not None and ((t1.isValue("СЛЕДОВАТЬ", "СЛІДУВАТИ")
                              or t1.isValue("ВЫПОЛНЯТЬ", "ВИКОНУВАТИ")))):
         t1 = t1.next0_
         route = True
     if (t1 is not None and t1.morph.class0_.is_preposition):
         t1 = t1.next0_
     if (t1 is not None and
         ((t1.isValue("РЕЙС", None) or t1.isValue("МАРШРУТ", None)))):
         t1 = t1.next0_
         route = True
     if (isinstance(t1, ReferentToken)):
         if (isinstance(t1.getReferent(), GeoReferent)):
             geo_ = Utils.asObjectOrNull(t1.getReferent(), GeoReferent)
             if (geo_.is_state or geo_.is_city):
                 # Collect a chain of states/cities into a ROUTE item.
                 tit = TransItemToken._new2522(t, t1,
                                               TransItemToken.Typs.ROUTE,
                                               list())
                 tit.route_items.append(geo_)
                 t1 = t1.next0_
                 # The first_pass flag emulates a C-style for loop: the
                 # advance `t1 = t1.next0_` also runs after `continue`.
                 first_pass3132 = True
                 while True:
                     if first_pass3132: first_pass3132 = False
                     else: t1 = t1.next0_
                     if (not (t1 is not None)): break
                     # Hyphens, prepositions and conjunctions between the
                     # geo referents are skipped.
                     if (t1.is_hiphen):
                         continue
                     if (t1.morph.class0_.is_preposition
                             or t1.morph.class0_.is_conjunction):
                         continue
                     geo_ = (Utils.asObjectOrNull(t1.getReferent(),
                                                  GeoReferent))
                     if (geo_ is None):
                         break
                     if (not geo_.is_city and not geo_.is_state):
                         break
                     tit.route_items.append(geo_)
                     tit.end_token = t1
                 # A lone geo referent is a route only after a route word;
                 # otherwise parsing falls through to the checks below.
                 if (len(tit.route_items) > 1 or route):
                     return tit
         elif ((isinstance(t1.getReferent(), DateReferent))
               and (t1.whitespaces_before_count < 3)):
             tit = TransItemToken._new2523(t, t1, TransItemToken.Typs.DATE,
                                           t1.getReferent())
             # Absorb a trailing "В." or "ВЫП"/"ВЫПУСК" (optionally followed
             # by a dot) into the DATE item.
             if (t1.next0_ is not None):
                 if (t1.next0_.isValue("В", None)
                         and t1.next0_.next0_ is not None
                         and t1.next0_.next0_.isChar('.')):
                     tit.end_token = t1.next0_.next0_
                 elif (t1.next0_.isValue("ВЫП", None)
                       or t1.next0_.isValue("ВЫПУСК", None)):
                     tit.end_token = t1.next0_
                     if (t1.next0_.next0_ is not None
                             and t1.next0_.next0_.isChar('.')):
                         tit.end_token = t1.next0_.next0_
             return tit
     if (isinstance(t, TextToken)):
         # A number prefix recognised by MiscHelper: try a Russian auto
         # number first, then a generic number.
         num = MiscHelper.checkNumberPrefix(t)
         if (num is not None):
             tit = TransItemToken.__attachRusAutoNumber(num)
             if (tit is None):
                 tit = TransItemToken._attachNumber(num, False)
             if (tit is not None):
                 tit.begin_token = t
                 return tit
         # Ontology lookup at t; if that fails, retry after a leading
         # "С" (Cyrillic), "C" (Latin) or "ЗА".
         tok = TransItemToken.M_ONTOLOGY.tryParse(t, TerminParseAttr.NO)
         if (tok is None and ((t.isValue("С", None) or t.isValue("C", None)
                               or t.isValue("ЗА", None)))):
             tok = TransItemToken.M_ONTOLOGY.tryParse(
                 t.next0_, TerminParseAttr.NO)
         # Ontology term right after an opening bracket: accepted when the
         # bracket is closed immediately after it, or when it is a BRAND.
         if (tok is None and BracketHelper.isBracket(t, True)):
             tok1 = TransItemToken.M_ONTOLOGY.tryParse(
                 t.next0_, TerminParseAttr.NO)
             if (tok1 is not None and BracketHelper.isBracket(
                     tok1.end_token.next0_, True)):
                 tok = tok1
                 tok.begin_token = t
                 tok.end_token = tok.end_token.next0_
                 # NOTE(review): begin_token was already set two lines above;
                 # this second assignment is redundant.
                 tok.begin_token = t
             elif (tok1 is not None):
                 # NOTE(review): tt is dereferenced without a None check;
                 # assumes every ontology termin is a TransTermin.
                 tt = Utils.asObjectOrNull(tok1.termin,
                                           TransItemToken.TransTermin)
                 if (tt.typ == TransItemToken.Typs.BRAND):
                     tok = tok1
                     tok.begin_token = t
         # "МАРКА <x>": parse <x> recursively and force the result to BRAND
         # when it came out as NAME or BRAND.
         if (tok is None and t.isValue("МАРКА", None)):
             res1 = TransItemToken.__TryParse(t.next0_, prev, after_conj,
                                              False)
             if (res1 is not None):
                 if (res1.typ == TransItemToken.Typs.NAME
                         or res1.typ == TransItemToken.Typs.BRAND):
                     res1.begin_token = t
                     res1.typ = TransItemToken.Typs.BRAND
                     return res1
         if (tok is not None):
             tt = Utils.asObjectOrNull(tok.termin,
                                       TransItemToken.TransTermin)
             # A NUMBER term is only a keyword: the actual number must
             # follow it, otherwise nothing is returned at all.
             if (tt.typ == TransItemToken.Typs.NUMBER):
                 tit = TransItemToken.__attachRusAutoNumber(
                     tok.end_token.next0_)
                 if (tit is None):
                     tit = TransItemToken._attachNumber(
                         tok.end_token.next0_, False)
                 if (tit is not None):
                     tit.begin_token = t
                     return tit
                 else:
                     return None
             # Doubtful terms need context: either the previous item is a
             # NOUN, or it is a BRAND with the same text as this BRAND term.
             if (tt.is_doubt and not attach_high):
                 if (prev is None or prev.typ != TransItemToken.Typs.NOUN):
                     if ((prev is not None
                          and prev.typ == TransItemToken.Typs.BRAND
                          and tt.typ == TransItemToken.Typs.BRAND)
                             and Utils.compareStrings(
                                 tt.canonic_text, prev.value, True) == 0):
                         pass
                     else:
                         return None
             # Plural "СУДНО" is accepted only when a bracket sequence can
             # start right after it.
             if (tt.canonic_text == "СУДНО"):
                 if ((((tok.morph.number) & (MorphNumber.PLURAL))) !=
                     (MorphNumber.UNDEFINED)):
                     if (not BracketHelper.canBeStartOfSequence(
                             tok.end_token.next0_, False, False)):
                         return None
             tit = TransItemToken._new2524(tok.begin_token, tok.end_token,
                                           tt.kind, tt.typ, tt.is_doubt,
                                           tok.chars, tok.morph)
             tit.value = tt.canonic_text
             # NOUN values are stored lower-case, everything else upper-case.
             if (tit.typ == TransItemToken.Typs.NOUN):
                 tit.value = tit.value.lower()
             else:
                 tit.value = tit.value.upper()
             return tit
         # Adjective(s) + ontology noun: walk the tokens of the noun phrase
         # looking for an ontology NOUN that ends exactly where the phrase
         # ends.
         if (tok is None and t.morph.class0_.is_adjective):
             npt = NounPhraseHelper.tryParse(t, NounPhraseParseAttr.NO, 0)
             if (npt is not None and len(npt.adjectives) > 0):
                 state_ = None
                 tt = t
                 # Same for-loop emulation as above; the loop stops once tt
                 # has moved past npt.end_token.
                 first_pass3133 = True
                 while True:
                     if first_pass3133: first_pass3133 = False
                     else: tt = tt.next0_
                     if (not (tt is not None
                              and tt.previous != npt.end_token)):
                         break
                     tok = TransItemToken.M_ONTOLOGY.tryParse(
                         tt, TerminParseAttr.NO)
                     # While no term matches, try to read a GEO referent at
                     # tt; the first one found is kept for tit.state.
                     if (tok is None and state_ is None):
                         state_ = tt.kit.processReferent("GEO", tt)
                     if (tok is not None
                             and tok.end_token == npt.end_token):
                         if ((tok.termin).typ == TransItemToken.Typs.NOUN):
                             tit = TransItemToken._new2524(
                                 t, tok.end_token, (tok.termin).kind,
                                 TransItemToken.Typs.NOUN,
                                 (tok.termin).is_doubt, tok.chars,
                                 npt.morph)
                             tit.value = (tok.termin).canonic_text.lower()
                             tit.alt_value = npt.getNormalCaseText(
                                 None, False, MorphGender.UNDEFINED,
                                 False).lower()
                             # Phrases ending in "суд"/"суда" are rejected
                             # unless a bracket sequence can follow.
                             if (LanguageHelper.endsWithEx(
                                     tit.alt_value, "суд", "суда", None,
                                     None)):
                                 if (not BracketHelper.canBeStartOfSequence(
                                         tok.end_token.next0_, False,
                                         False)):
                                     continue
                             if (state_ is not None):
                                 if ((state_.referent).is_state):
                                     tit.state = state_
                             return tit
     # "КЛАСС" followed by a bracketed sequence gives a CLASS item whose
     # value is the bracket text.
     if (t is not None and t.isValue("КЛАСС", None)
             and t.next0_ is not None):
         br = BracketHelper.tryParse(t.next0_, BracketParseAttr.NO, 100)
         if (br is not None):
             return TransItemToken._new2526(
                 t, br.end_token, TransItemToken.Typs.CLASS,
                 MiscHelper.getTextValueOfMetaToken(br, GetTextAttr.NO))
     # A number token is only accepted as a model directly after a BRAND,
     # and only when it is written with digits.
     nt = Utils.asObjectOrNull(t, NumberToken)
     if (nt is not None):
         if (prev is None or nt.typ != NumberSpellingType.DIGIT):
             return None
         if (prev.typ == TransItemToken.Typs.BRAND):
             return TransItemToken.__attachModel(t, False, prev)
         else:
             return None
     # Russian auto number: a doubtful result is returned only after an AUTO
     # noun, a BRAND or a MODEL; otherwise parsing continues below.
     res = TransItemToken.__attachRusAutoNumber(t)
     if ((res) is not None):
         if (not res.is_doubt):
             return res
         if (prev is not None and prev.typ == TransItemToken.Typs.NOUN
                 and prev.kind == TransportKind.AUTO):
             return res
         if (prev is not None
                 and ((prev.typ == TransItemToken.Typs.BRAND
                       or prev.typ == TransItemToken.Typs.MODEL))):
             return res
     # Model right after a brand, optionally separated by a hyphen.
     t1 = t
     if (t.is_hiphen):
         t1 = t.next0_
     if (prev is not None and prev.typ == TransItemToken.Typs.BRAND
             and t1 is not None):
         tit = TransItemToken.__attachModel(t1, True, prev)
         if (tit is not None):
             tit.begin_token = t
             return tit
     # Quoted text after a noun (or after a conjunction, given some prev).
     if (prev is not None
             and ((prev.typ == TransItemToken.Typs.NOUN or after_conj))):
         br = BracketHelper.tryParse(t, BracketParseAttr.NO, 100)
         if (br is not None and br.is_quote_type):
             # First see whether the quotes wrap exactly one parseable item.
             tit = TransItemToken.tryParse(br.begin_token.next0_, prev,
                                           after_conj, False)
             if (tit is not None and tit.end_token.next0_ == br.end_token):
                 if (not tit.is_doubt
                         or tit.typ == TransItemToken.Typs.BRAND):
                     tit.begin_token = br.begin_token
                     tit.end_token = br.end_token
                     return tit
             # Otherwise classify the quoted text (shorter than 30 chars) by
             # its character mix: letters, digits, anything else.
             s = MiscHelper.getTextValueOfMetaToken(br, GetTextAttr.NO)
             if (not Utils.isNullOrEmpty(s) and (len(s) < 30)):
                 chars_ = 0
                 digs = 0
                 un = 0
                 for c in s:
                     if (not Utils.isWhitespace(c)):
                         if (str.isalpha(c)):
                             chars_ += 1
                         elif (str.isdigit(c)):
                             digs += 1
                         else:
                             un += 1
                 # Letters only with a capitalised first word, or anything
                 # quoted after a ship/space noun, is a NAME.
                 if (((digs == 0 and un == 0
                       and t.next0_.chars.is_capital_upper))
                         or prev.kind == TransportKind.SHIP
                         or prev.kind == TransportKind.SPACE):
                     return TransItemToken._new2526(
                         br.begin_token, br.end_token,
                         TransItemToken.Typs.NAME, s)
                 # Digits with fewer than 5 letters is a MODEL; spaces are
                 # removed from its value.
                 if (digs > 0 and (chars_ < 5)):
                     return TransItemToken._new2526(
                         br.begin_token, br.end_token,
                         TransItemToken.Typs.MODEL, s.replace(" ", ""))
     # Generic model attachment after NOUN, BRAND, NAME or MODEL; the second
     # argument is False only when prev is a NAME.
     if (prev is not None and (((prev.typ == TransItemToken.Typs.NOUN
                                 or prev.typ == TransItemToken.Typs.BRAND
                                 or prev.typ == TransItemToken.Typs.NAME)
                                or prev.typ == TransItemToken.Typs.MODEL))):
         tit = TransItemToken.__attachModel(
             t, prev.typ != TransItemToken.Typs.NAME, prev)
         if (tit is not None):
             return tit
     # After an AUTO noun, a close-standing non-lower-case word that is not
     # recognised as a PERSON referent is taken as a BRAND.
     if (((prev is not None and prev.typ == TransItemToken.Typs.NOUN
           and prev.kind == TransportKind.AUTO) and
          (isinstance(t, TextToken)) and t.chars.is_letter)
             and not t.chars.is_all_lower
             and (t.whitespaces_before_count < 2)):
         pt = t.kit.processReferent("PERSON", t)
         if (pt is None):
             tit = TransItemToken._new2529(t, t, TransItemToken.Typs.BRAND)
             tit.value = (t).term
             return tit
     # After a SHIP/SPACE noun (or after a conjunction) a capitalised word
     # sequence is taken as a NAME.
     if (((prev is not None and prev.typ == TransItemToken.Typs.NOUN and
           ((prev.kind == TransportKind.SHIP
             or prev.kind == TransportKind.SPACE)))) or after_conj):
         if (t.chars.is_capital_upper):
             # Rejected when t starts a noun phrase with adjectives, is a
             # PERSON referent, or is a dictionary surname not in the
             # nominative case.
             ok = True
             npt = NounPhraseHelper.tryParse(t, NounPhraseParseAttr.NO, 0)
             if (npt is not None and len(npt.adjectives) > 0):
                 ok = False
             else:
                 rt = t.kit.processReferent("PERSON", t)
                 if (rt is not None):
                     ok = False
             if (t.getMorphClassInDictionary().is_proper_surname):
                 if (not t.morph.case_.is_nominative):
                     ok = False
             if (ok):
                 # Extend over following tokens that have the same chars
                 # info and at most one whitespace before them, stopping at
                 # the first token that parses as an item on its own.
                 t1 = t
                 tt = t.next0_
                 while tt is not None:
                     if (tt.whitespaces_before_count > 1):
                         break
                     if (tt.chars != t.chars):
                         break
                     tit = TransItemToken.tryParse(tt, None, False, False)
                     if ((tit) is not None):
                         break
                     t1 = tt
                     tt = tt.next0_
                 s = MiscHelper.getTextValue(t, t1, GetTextAttr.NO)
                 if (s is not None):
                     # The boolean passed to _new2530 is presumably the
                     # is_doubt flag - TODO confirm against the factory.
                     res1 = TransItemToken._new2530(
                         t, t1, TransItemToken.Typs.NAME, True, s)
                     # A bracket sequence on the same line replaces the
                     # value; the plain text is kept as alt_value.
                     if (not t1.is_newline_after):
                         br = BracketHelper.tryParse(
                             t1.next0_, BracketParseAttr.NO, 100)
                         if (br is not None):
                             res1.end_token = br.end_token
                             res1.alt_value = res1.value
                             res1.value = MiscHelper.getTextValueOfMetaToken(
                                 br, GetTextAttr.NO)
                     return res1
     return None
コード例 #15
0
ファイル: SourceOfAnalysis.py プロジェクト: MihaJjDa/APCLtask
 def __doCrLfCorrection(self, txt: str) -> str:
     """Undo hard line wrapping in text that was force-formatted to a width.

     A first scan measures the lines that follow a line break (a tab counts
     as 5 characters).  Lines of at least 30 characters whose last character
     is not ``.``, ``:`` or ``;`` and whose successor does not begin with a
     digit are counted.  Unless at least 4 such lines exist and their mean
     length lies within 50..100, the text is returned unchanged.

     A second scan then replaces a line break by a space when the line before
     it has a length close to that mean (more than mean - 20, less than
     mean + 10), does not end with ``.``, ``:`` or ``;`` and is not followed
     by another line break (only whitespace in between).  For a CR LF pair
     both characters are replaced.  ``self.crlf_corrected_count`` is
     incremented once per replaced break.

     Args:
         txt(str): source text

     Returns:
         The original text or its corrected copy.
     """
     cr, lf, tab = 0xD, 0xA, 0x9
     stoppers = ('.', ':', ';')
     size = len(txt)

     # Pass 1: statistics over the candidate wrapped lines.
     candidates = 0
     length_sum = 0
     pos = 0
     while pos < size:
         if ord(txt[pos]) not in (cr, lf):
             pos += 1
             continue
         width = 0
         tail = txt[pos]
         stop = pos + 1
         while stop < size:
             code = ord(txt[stop])
             if code in (cr, lf):
                 break
             if code == tab:
                 width += 5
             else:
                 tail = txt[stop]
                 width += 1
             stop += 1
         if stop >= size:
             # The last line has no terminating break and is not measured.
             break
         if width < 30:
             pos += 1
             continue
         if tail not in stoppers:
             digit_follows = False
             for k in range(stop + 1, size):
                 if not Utils.isWhitespace(txt[k]):
                     digit_follows = txt[k].isdigit()
                     break
             if not digit_follows:
                 candidates += 1
                 length_sum += width
         # Resume after the break that ended the measured line.
         pos = stop + 1

     if candidates < 4:
         return txt
     mean = math.floor(length_sum / candidates)
     if mean < 50 or mean > 100:
         return txt

     # Pass 2: patch the breaks in a mutable copy; tell() serves as length.
     buf = Utils.newStringIO(txt)
     start = 0
     while start < buf.tell():
         tail = Utils.getCharAtStringIO(buf, start)
         width = 0
         stop = start + 1
         while stop < buf.tell():
             cur = Utils.getCharAtStringIO(buf, stop)
             code = ord(cur)
             if code in (cr, lf):
                 break
             if code == tab:
                 width += 5
             else:
                 tail = cur
                 width += 1
             stop += 1
         if stop >= buf.tell():
             break

         # Last non-whitespace character before the break (or the very first
         # character when everything before the break is whitespace).
         back = stop - 1
         while back >= 0:
             tail = Utils.getCharAtStringIO(buf, back)
             if not Utils.isWhitespace(tail):
                 break
             back -= 1

         # Is the break followed by another break, i.e. a paragraph gap?
         ahead = stop + 1
         if (ahead < buf.tell()
                 and ord(Utils.getCharAtStringIO(buf, stop)) == cr
                 and ord(Utils.getCharAtStringIO(buf, ahead)) == lf):
             ahead += 1
         paragraph_gap = False
         while ahead < buf.tell():
             cur = Utils.getCharAtStringIO(buf, ahead)
             if not Utils.isWhitespace(cur):
                 break
             if ord(cur) in (cr, lf):
                 paragraph_gap = True
                 break
             ahead += 1

         if (not paragraph_gap and mean - 20 < width < mean + 10
                 and tail not in stoppers):
             Utils.setCharAtStringIO(buf, stop, ' ')
             self.crlf_corrected_count += 1
             follower = stop + 1
             if follower < buf.tell():
                 if ord(Utils.getCharAtStringIO(buf, follower)) == lf:
                     Utils.setCharAtStringIO(buf, follower, ' ')
                     stop = follower
         # The next line is measured from the break position onwards.
         start = stop
     return Utils.toStringStringIO(buf)