def __getAttr(self, i: int) -> bool:
    """Return attribute bit *i* of this token, computing the bits lazily.

    Bit 0 marks "already computed". Bits set while scanning the gap to the
    previous token: 1 = whitespace before, 3 = newline (CR/LF/FF) before.
    Bits set while scanning the gap to the next token: 2 = whitespace after,
    4 = newline after. A missing neighbour counts as both whitespace and
    newline on that side.
    """
    if ((((self.__m_attrs) & 1)) == 0):
        # Not computed yet: mark as computed, then scan both gaps once.
        self.__m_attrs = (1)
        if (self._m_previous is None):
            # No previous token: treat start-of-text as whitespace + newline.
            self._setAttr(1, True)
            self._setAttr(3, True)
        else:
            # Scan the characters between the previous token and this one.
            j = self._m_previous.end_char + 1
            while j < self.begin_char:
                ch = self.kit.sofa.text[j]
                if (Utils.isWhitespace((ch))):
                    self._setAttr(1, True)
                    # CR, LF or form feed upgrades "whitespace" to "newline".
                    if ((ord(ch)) == 0xD or (ord(ch)) == 0xA or ch == '\f'):
                        self._setAttr(3, True)
                j += 1
        if (self._m_next is None):
            # No next token: treat end-of-text as whitespace + newline.
            self._setAttr(2, True)
            self._setAttr(4, True)
        else:
            # Scan the characters between this token and the next one.
            j = self.end_char + 1
            while j < self._m_next.begin_char:
                ch = self.kit.sofa.text[j]
                if (Utils.isWhitespace(ch)):
                    self._setAttr(2, True)
                    if ((ord(ch)) == 0xD or (ord(ch)) == 0xA or ch == '\f'):
                        self._setAttr(4, True)
                j += 1
    # Extract the requested bit from the cached mask.
    return (((((self.__m_attrs) >> i)) & 1)) != 0
def addAbridge(self, abr: str) -> 'Abridge':
    """Register an abbreviation pattern for this termin and return it.

    The string is split into alphabetic parts; a part followed immediately by
    a non-whitespace character gets ``has_delim`` set. A single leading part
    followed by ``-`` stores the remainder (uppercased) as ``tail``.
    Returns None when *abr* does not start with a letter.

    Note: the created Abridge is appended to ``self.abridges`` before parsing
    finishes, matching the original transpiled behaviour.
    """
    # Removed dead debug leftover: `if (abr == "В/ГОР"): pass` — a no-op
    # breakpoint hook from the original C# source with no effect.
    a = Termin.Abridge()
    if (self.abridges is None):
        self.abridges = list()
    # Measure the leading alphabetic run.
    i = 0
    while i < len(abr):
        if (not str.isalpha(abr[i])):
            break
        i += 1
    if (i == 0):
        return None
    a.parts.append(Termin.AbridgePart._new604(abr[0:0 + i].upper()))
    self.abridges.append(a)
    if (((i + 1) < len(abr)) and abr[i] == '-'):
        # "XX-rest" form: everything after the hyphen is the tail.
        a.tail = abr[i + 1:].upper()
    elif (i < len(abr)):
        # First part is delimited by something other than whitespace.
        if (not Utils.isWhitespace(abr[i])):
            a.parts[0].has_delim = True
        # Collect the remaining alphabetic runs as additional parts.
        while i < len(abr):
            if (str.isalpha(abr[i])):
                j = (i + 1)
                while j < len(abr):
                    if (not str.isalpha(abr[j])):
                        break
                    j += 1
                p = Termin.AbridgePart._new604(abr[i:i + j - i].upper())
                if (j < len(abr)):
                    if (not Utils.isWhitespace(abr[j])):
                        p.has_delim = True
                a.parts.append(p)
                i = j
            i += 1
    return a
def create_number(gr : 'SemGraph', num : 'NumbersWithUnitToken') -> 'SemObject':
    """Build a SemObject noun node in *gr* from a number-with-unit token.

    Returns None when the token yields no referent tokens.
    NOTE(review): assumes the last referent is a MeasureReferent — if
    asObjectOrNull returns None here, mr.to_string would raise; confirm
    against the caller's contract.
    """
    rs = num.create_refenets_tokens_with_register(None, None, False)
    if (rs is None or len(rs) == 0):
        return None
    mr = Utils.asObjectOrNull(rs[len(rs) - 1].referent, MeasureReferent)
    sem = SemObject(gr)
    gr.objects.append(sem)
    sem.tokens.append(num)
    # The measure's textual form becomes the normal-case value.
    sem.morph.normal_case = mr.to_string(True, None, 0)
    sem.morph.normal_full = sem.morph.normal_case
    sem.typ = SemObjectType.NOUN
    sem.measure = mr.kind
    # Transpiled for-loop protocol: first_pass skips the increment once.
    i = 0
    first_pass3438 = True
    while True:
        if first_pass3438: first_pass3438 = False
        else: i += 1
        if (not (i < len(sem.morph.normal_case))): break
        ch = sem.morph.normal_case[i]
        # Skip over the numeric prefix (digits, whitespace, [].+- chars).
        if (str.isdigit(ch) or Utils.isWhitespace(ch) or "[].+-".find(ch) >= 0):
            continue
        # First non-numeric char found: split into quantity + unit name.
        sem.quantity = SemQuantity(sem.morph.normal_case[0:0 + i].strip(), num.begin_token, num.end_token)
        sem.morph.normal_case = sem.morph.normal_case[i:].strip()
        if (len(num.units) == 1 and num.units[0].unit is not None):
            sem.morph.normal_full = num.units[0].unit.fullname_cyr
            if (sem.morph.normal_full == "%"):
                sem.morph.normal_full = "процент"
        break
    sem.concept = (mr)
    return sem
def calcWhitespaceDistanceBetweenPositions(self, pos_from: int, pos_to: int) -> int:
    """Compute the weighted whitespace distance between neighbouring elements.

    Args:
        pos_from: first character position of the gap (inclusive)
        pos_to: last character position of the gap (inclusive)

    Returns:
        0 when the positions are directly adjacent; -1 when the range is
        invalid or contains a non-whitespace character; otherwise the sum
        of per-character weights (CR/LF count 10, TAB counts 5, any other
        whitespace counts 1).
    """
    if pos_from == pos_to + 1:
        return 0
    if pos_from > pos_to or pos_from < 0 or pos_to >= len(self.text):
        return -1
    total = 0
    for pos in range(pos_from, pos_to + 1):
        cur = self.text[pos]
        if not Utils.isWhitespace(cur):
            # Any visible character breaks the "pure whitespace gap" contract.
            return -1
        if cur == '\r' or cur == '\n':
            total += 10
        elif cur == '\t':
            total += 5
        else:
            total += 1
    return total
def __init__(self, t: 'Token') -> None:
    """Classify token *t* as a table control marker and measure its spans.

    Control characters used by the tokenizer: 0x1E = table start,
    0x1F = table end, 0x07 (BEL) = cell end. For a cell end, the preceding
    text decides whether it is actually a row end (preceded by CR/LF) and
    how many columns/rows the cell spans (counted from TAB/FF run before it).
    """
    self.col_span = 0
    self.row_span = 0
    self.typ = TableHelper.TableTypes.UNDEFINED
    self.src = None
    self.src = t
    if (t is None):
        return
    if (t.is_char(chr(0x1E))):
        self.typ = TableHelper.TableTypes.TABLESTART
        return
    if (t.is_char(chr(0x1F))):
        self.typ = TableHelper.TableTypes.TABLEEND
        return
    # Everything below only applies to the cell-end marker (BEL).
    if (not t.is_char(chr(7))):
        return
    txt = t.kit.sofa.text
    self.typ = TableHelper.TableTypes.CELLEND
    p = t.begin_char - 1
    if (p < 0):
        return
    # A newline right before the marker means the row ended here.
    if ((ord(txt[p])) == 0xD or (ord(txt[p])) == 0xA):
        self.typ = TableHelper.TableTypes.ROWEND
        return
    self.row_span = 1
    self.col_span = self.row_span
    # Walk back over the whitespace run: each TAB extends the column span,
    # each form feed extends the row span.
    while p >= 0:
        if (not Utils.isWhitespace(txt[p])):
            break
        elif (txt[p] == '\t'):
            self.col_span += 1
        elif (txt[p] == '\f'):
            self.row_span += 1
        p -= 1
def _initialize() -> None:
    """One-time initialisation of static lookup data.

    Fills the compass/direction termin collection and the two country-code
    maps (the embedded table matches the ISO 3166-1 alpha-2 <TAB> alpha-3
    format — each row "XX\\tXXX").
    """
    # Already initialised?
    if (MiscLocationHelper.__m_nords is not None):
        return
    MiscLocationHelper.__m_nords = TerminCollection()
    # Russian compass/direction words used to recognise geo-name prefixes.
    for s in ["СЕВЕРНЫЙ", "ЮЖНЫЙ", "ЗАПАДНЫЙ", "ВОСТОЧНЫЙ", "ЦЕНТРАЛЬНЫЙ", "БЛИЖНИЙ", "ДАЛЬНИЙ", "СРЕДНИЙ", "СЕВЕР", "ЮГ", "ЗАПАД", "ВОСТОК", "СЕВЕРО", "ЮГО", "ЗАПАДНО", "ВОСТОЧНО", "СЕВЕРОЗАПАДНЫЙ", "СЕВЕРОВОСТОЧНЫЙ", "ЮГОЗАПАДНЫЙ", "ЮГОВОСТОЧНЫЙ"]:
        MiscLocationHelper.__m_nords.add(Termin(s, MorphLang.RU, True))
    # Tab-separated 2-letter -> 3-letter country code table, one pair per line.
    table = "\nAF\tAFG\nAX\tALA\nAL\tALB\nDZ\tDZA\nAS\tASM\nAD\tAND\nAO\tAGO\nAI\tAIA\nAQ\tATA\nAG\tATG\nAR\tARG\nAM\tARM\nAW\tABW\nAU\tAUS\nAT\tAUT\nAZ\tAZE\nBS\tBHS\nBH\tBHR\nBD\tBGD\nBB\tBRB\nBY\tBLR\nBE\tBEL\nBZ\tBLZ\nBJ\tBEN\nBM\tBMU\nBT\tBTN\nBO\tBOL\nBA\tBIH\nBW\tBWA\nBV\tBVT\nBR\tBRA\nVG\tVGB\nIO\tIOT\nBN\tBRN\nBG\tBGR\nBF\tBFA\nBI\tBDI\nKH\tKHM\nCM\tCMR\nCA\tCAN\nCV\tCPV\nKY\tCYM\nCF\tCAF\nTD\tTCD\nCL\tCHL\nCN\tCHN\nHK\tHKG\nMO\tMAC\nCX\tCXR\nCC\tCCK\nCO\tCOL\nKM\tCOM\nCG\tCOG\nCD\tCOD\nCK\tCOK\nCR\tCRI\nCI\tCIV\nHR\tHRV\nCU\tCUB\nCY\tCYP\nCZ\tCZE\nDK\tDNK\nDJ\tDJI\nDM\tDMA\nDO\tDOM\nEC\tECU\nEG\tEGY\nSV\tSLV\nGQ\tGNQ\nER\tERI\nEE\tEST\nET\tETH\nFK\tFLK\nFO\tFRO\nFJ\tFJI\nFI\tFIN\nFR\tFRA\nGF\tGUF\nPF\tPYF\nTF\tATF\nGA\tGAB\nGM\tGMB\nGE\tGEO\nDE\tDEU\nGH\tGHA\nGI\tGIB\nGR\tGRC\nGL\tGRL\nGD\tGRD\nGP\tGLP\nGU\tGUM\nGT\tGTM\nGG\tGGY\nGN\tGIN\nGW\tGNB\nGY\tGUY\nHT\tHTI\nHM\tHMD\nVA\tVAT\nHN\tHND\nHU\tHUN\nIS\tISL\nIN\tIND\nID\tIDN\nIR\tIRN\nIQ\tIRQ\nIE\tIRL\nIM\tIMN\nIL\tISR\nIT\tITA\nJM\tJAM\nJP\tJPN\nJE\tJEY\nJO\tJOR\nKZ\tKAZ\nKE\tKEN\nKI\tKIR\nKP\tPRK\nKR\tKOR\nKW\tKWT\nKG\tKGZ\nLA\tLAO\nLV\tLVA\nLB\tLBN\nLS\tLSO\nLR\tLBR\nLY\tLBY\nLI\tLIE\nLT\tLTU\nLU\tLUX\nMK\tMKD\nMG\tMDG\nMW\tMWI\nMY\tMYS\nMV\tMDV\nML\tMLI\nMT\tMLT\nMH\tMHL\nMQ\tMTQ\nMR\tMRT\nMU\tMUS\nYT\tMYT\nMX\tMEX\nFM\tFSM\nMD\tMDA\nMC\tMCO\nMN\tMNG\nME\tMNE\nMS\tMSR\nMA\tMAR\nMZ\tMOZ\nMM\tMMR\nNA\tNAM\nNR\tNRU\nNP\tNPL\nNL\tNLD\nAN\tANT\nNC\tNCL\nNZ\tNZL\nNI\tNIC\nNE\tNER\nNG\tNGA\nNU\tNIU\nNF\tNFK\nMP\tMNP\nNO\tNOR\nOM\tOMN\nPK\tPAK\nPW\tPLW\nPS\tPSE\nPA\tPAN\nPG\tPNG\nPY\tPRY\nPE\tPER\nPH\tPHL\nPN\tPCN\nPL\tPOL\nPT\tPRT\nPR\tPRI\nQA\tQAT\nRE\tREU\nRO\tROU\nRU\tRUS\nRW\tRWA\nBL\tBLM\nSH\tSHN\nKN\tKNA\nLC\tLCA\nMF\tMAF\nPM\tSPM\nVC\tVCT\nWS\tWSM\nSM\tSMR\nST\tSTP\nSA\tSAU\nSN\tSEN\nRS\tSRB\nSC\tSYC\nSL\tSLE\nSG\tSGP\nSK\tSVK\nSI\tSVN\nSB\tSLB\nSO\tSOM\nZA\tZAF\nGS\tSGS\nSS\tSSD\nES\tESP\nLK\tLKA\nSD\tSDN\nSR\tSUR\nSJ\tSJM\nSZ\tSWZ\nSE\tSWE\nCH\tCHE\nSY\tSYR\nTW\tTWN\nTJ\tTJK\nTZ\tTZA\nTH\tTHA\nTL\tTLS\nTG\tTGO\nTK\tTKL\nTO\tTON\nTT\tTTO\nTN\tTUN\nTR\tTUR\nTM\tTKM\nTC\tTCA\nTV\tTUV\nUG\tUGA\nUA\tUKR\nAE\tARE\nGB\tGBR\nUS\tUSA\nUM\tUMI\nUY\tURY\nUZ\tUZB\nVU\tVUT\nVE\tVEN\nVN\tVNM\nVI\tVIR\nWF\tWLF\nEH\tESH\nYE\tYEM\nZM\tZMB\nZW\tZWE "
    for s in Utils.splitString(table, '\n', False):
        ss = s.strip()
        # Require "XX<tab>..." shape; ss[2] must be the tab separator.
        if ((len(ss) < 6) or not Utils.isWhitespace(ss[2])):
            continue
        cod2 = ss[0:0 + 2]
        cod3 = ss[3:].strip()
        if (len(cod3) != 3):
            continue
        # First occurrence wins in both directions.
        if (not cod2 in MiscLocationHelper._m_alpha2_3):
            MiscLocationHelper._m_alpha2_3[cod2] = cod3
        if (not cod3 in MiscLocationHelper._m_alpha3_2):
            MiscLocationHelper._m_alpha3_2[cod3] = cod2
def is_cyrillic(str0_: str) -> bool:
    """Return True when the string contains only Cyrillic letters, whitespace
    or hyphens; False for None or any other character."""
    if str0_ is None:
        return False
    for c in str0_:
        if LanguageHelper.is_cyrillic_char(c):
            continue
        # Whitespace and '-' are tolerated inside a Cyrillic string.
        if Utils.isWhitespace(c) or c == '-':
            continue
        return False
    return True
def newlines_before_count(self) -> int:
    """Number of line breaks immediately preceding this token.

    Scans backwards over whitespace only. A CR directly followed by LF is
    counted once (the LF is counted, the CR of the pair is skipped); a form
    feed counts as 10 breaks. Stops at the first non-whitespace character.
    """
    text = self.kit.sofa.text
    count = 0
    prev = chr(0)  # character examined on the previous (closer) iteration
    pos = self.begin_char - 1
    while pos >= 0:
        cur = text[pos]
        code = ord(cur)
        if code == 0xA:
            count += 1
        elif code == 0xD and ord(prev) != 0xA:
            # Lone CR — a CR that precedes an LF was already counted via LF.
            count += 1
        elif cur == '\f':
            count += 10
        elif not Utils.isWhitespace(cur):
            break
        prev = cur
        pos -= 1
    return count
def newlines_after_count(self) -> int:
    """Number of line breaks immediately after this token.

    (The original docstring said "before" — a copy-paste slip; the code
    scans forward from end_char.) CR followed by LF counts once; a form
    feed counts as 10; scanning stops at the first non-whitespace char.
    """
    ch0 = chr(0)
    res = 0
    txt = self.kit.sofa.text
    p = self.end_char + 1
    while p < len(txt):
        ch = txt[p]
        if ((ord(ch)) == 0xD):
            res += 1
        elif ((ord(ch)) == 0xA and (ord(ch0)) != 0xD):
            # LF not preceded by CR — the CRLF pair was counted at the CR.
            res += 1
        elif (ch == '\f'):
            res += 10
        elif (not Utils.isWhitespace(ch)):
            break
        ch0 = ch
        p += 1
    return res
def initialize() -> None:
    """Build the UnicodeInfo.ALL_CHARS table for the BMP (0x0000-0xFFFF).

    Each entry carries precomputed character-class flags used by the
    tokenizer (whitespace/digit/letter, Cyrillic/Latin, vowel, case,
    hyphen/quote/apostrophe, combining-accent range).
    """
    if (UnicodeInfo.__m_inited):
        return
    UnicodeInfo.__m_inited = True
    UnicodeInfo.ALL_CHARS = list()
    # Cyrillic vowels (incl. Ukrainian/Belarusian/Kazakh letters), both cases.
    cyrvowel = "АЕЁИОУЮЯЫЭЄІЇЎӘӨҰҮІ"
    cyrvowel += cyrvowel.lower()
    for i in range(0x10000):
        ch = chr(i)
        ui = UnicodeInfo(i)
        if (Utils.isWhitespace(ch)):
            ui.is_whitespace = True
        elif (str.isdigit(ch)):
            ui.is_digit = True
        elif (ch == 'º' or ch == '°'):
            # Degree/ordinal signs are alphabetic for str.isalpha in some
            # builds — deliberately excluded from the letter class.
            pass
        elif (str.isalpha(ch)):
            ui.is_letter = True
            if (i >= 0x400 and (i < 0x500)):
                # Cyrillic block.
                ui.is_cyrillic = True
                if (cyrvowel.find(ch) >= 0):
                    ui.is_vowel = True
            elif (i < 0x200):
                # Basic Latin + Latin supplements/extended-A.
                ui.is_latin = True
                if ("AEIOUYaeiouy".find(ch) >= 0):
                    ui.is_vowel = True
            if (str.isupper(ch)):
                ui.is_upper = True
            if (str.islower(ch)):
                ui.is_lower = True
        else:
            # Non-letter, non-digit: punctuation classes. The chain below
            # lists every dash-like character (some literals repeat — kept
            # as in the original).
            if (((((ch == '-' or ch == '–' or ch == '¬') or ch == '-' or ch == (chr(0x00AD))) or ch == (chr(0x2011)) or ch == '-') or ch == '—' or ch == '–') or ch == '−' or ch == '-'):
                ui.is_hiphen = True
            if ("\"'`“”’".find(ch) >= 0):
                ui.is_quot = True
            if ("'`’".find(ch) >= 0):
                # Apostrophes are also quotes.
                ui.is_apos = True
                ui.is_quot = True
            if (i >= 0x300 and (i < 0x370)):
                # Combining diacritics range — used for stress marks.
                ui.is_udaren = True
        UnicodeInfo.ALL_CHARS.append(ui)
def __getNameWithoutBrackets(begin: 'Token', end: 'Token', normalize_first_noun_group: bool = False, normal_first_group_single: bool = False, ignore_geo_referent: bool = False) -> str:
    """Get the string value between two tokens, excluding quotes and brackets.

    Args:
        begin(Token): first token
        end(Token): last token
        normalize_first_noun_group(bool): normalize the first noun group
            to nominative case
        normal_first_group_single(bool): also force the first noun group
            to singular
        ignore_geo_referent(bool): ignore geographic entities inside

    Returns None when the trimmed result would be empty.
    """
    res = None
    # Strip an enclosing bracket/quote pair, if present.
    if (BracketHelper.canBeStartOfSequence(begin, False, False) and BracketHelper.canBeEndOfSequence(end, False, begin, False)):
        begin = begin.next0_
        end = end.previous
    if (normalize_first_noun_group and not begin.morph.class0_.is_preposition):
        npt = NounPhraseHelper.tryParse(begin, NounPhraseParseAttr.REFERENTCANBENOUN, 0)
        if (npt is not None):
            # Reject a noun phrase whose noun is unknown and has no adjectives.
            if (npt.noun.getMorphClassInDictionary().is_undefined and len(npt.adjectives) == 0):
                npt = (None)
        if (npt is not None and npt.end_token.end_char > end.end_char):
            npt = (None)
        if (npt is not None):
            res = npt.getNormalCaseText(None, normal_first_group_single, MorphGender.UNDEFINED, False)
            te = npt.end_token.next0_
            # Handle a trailing ", <participle>" agreeing with the noun group.
            if (((te is not None and te.next0_ is not None and te.is_comma) and (isinstance(te.next0_, TextToken)) and te.next0_.end_char <= end.end_char) and te.next0_.morph.class0_.is_verb and te.next0_.morph.class0_.is_adjective):
                for it in te.next0_.morph.items:
                    if (it.gender == npt.morph.gender or (((it.gender) & (npt.morph.gender))) != (MorphGender.UNDEFINED)):
                        if (not ((it.case_) & npt.morph.case_).is_undefined):
                            if (it.number == npt.morph.number or (((it.number) & (npt.morph.number))) != (MorphNumber.UNDEFINED)):
                                var = (te.next0_).term
                                if (isinstance(it, MorphWordForm)):
                                    var = (it).normal_case
                                bi = MorphBaseInfo._new549(MorphClass.ADJECTIVE, npt.morph.gender, npt.morph.number, npt.morph.language)
                                # Put the participle into the agreeing form.
                                var = Morphology.getWordform(var, bi)
                                if (var is not None):
                                    res = "{0}, {1}".format(res, var)
                                    te = te.next0_.next0_
                                break
            # Append the remainder of the span verbatim.
            if (te is not None and te.end_char <= end.end_char):
                s = ProperNameHelper.getNameEx(te, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED, MorphGender.UNDEFINED, True, ignore_geo_referent)
                if (not Utils.isNullOrEmpty(s)):
                    if (not str.isalnum(s[0])):
                        res = "{0}{1}".format(res, s)
                    else:
                        res = "{0} {1}".format(res, s)
    elif ((isinstance(begin, TextToken)) and begin.chars.is_cyrillic_letter):
        # No noun-group normalization: normalize just the first word.
        mm = begin.getMorphClassInDictionary()
        if (not mm.is_undefined):
            res = begin.getNormalCaseText(mm, False, MorphGender.UNDEFINED, False)
            if (begin.end_char < end.end_char):
                res = "{0} {1}".format(res, ProperNameHelper.getNameEx(begin.next0_, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED, MorphGender.UNDEFINED, True, False))
    if (res is None):
        res = ProperNameHelper.getNameEx(begin, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED, MorphGender.UNDEFINED, True, ignore_geo_referent)
    # Trim trailing '*' and whitespace; empty after trimming -> None.
    if (not Utils.isNullOrEmpty(res)):
        k = 0
        i = len(res) - 1
        while i >= 0:
            if (res[i] == '*' or Utils.isWhitespace(res[i])):
                pass
            else:
                break
            i -= 1
            k += 1
        if (k > 0):
            if (k == len(res)):
                return None
            res = res[0:0 + len(res) - k]
    return res
def run(self, text: str, only_tokenizing: bool, dlang: 'MorphLang', progress: EventHandler, good_text: bool) -> typing.List['MorphToken']:
    """Perform morphological analysis of a text.

    Args:
        text(str): source text
        only_tokenizing(bool): skip dictionary lookup, only split into tokens
        dlang: default language (if unknown, it is detected from the text)
        progress: optional progress callback
        good_text(bool): text is known-clean; skip OCR-noise repairs

    Returns:
        typing.List[MorphToken]: resulting morpheme sequence (None for empty input)
    """
    if (Utils.isNullOrEmpty(text)):
        return None
    twr = TextWrapper(text, good_text)
    twrch = twr.chars
    res = list()
    uni_lex = dict()
    term0 = None
    # Per-language counters: "pure" = unambiguous words, "tot" = compatible.
    pure_rus_words = 0
    pure_ukr_words = 0
    pure_by_words = 0
    pure_kz_words = 0
    tot_rus_words = 0
    tot_ukr_words = 0
    tot_by_words = 0
    tot_kz_words = 0
    # --- Pass 1: tokenize and collect unique lexemes ---------------------
    i = 0
    first_pass2708 = True
    while True:
        if first_pass2708: first_pass2708 = False
        else: i += 1
        if (not (i < twr.length)): break
        ty = InnerMorphology._getCharTyp(twrch[i])
        if (ty == 0):
            continue
        if (ty > 2):
            # Single-char token (punctuation etc.).
            j = (i + 1)
        else:
            # Extend the run of same-typed chars (letters or digits).
            j = (i + 1)
            while j < twr.length:
                if (InnerMorphology._getCharTyp(twrch[j]) != ty):
                    break
                j += 1
        wstr = text[i:i + j - i]
        term = None
        if (good_text):
            term = wstr
        else:
            # Repair mixed Latin/Cyrillic OCR noise before normalizing.
            trstr = LanguageHelper.transliteralCorrection(wstr, term0, False)
            term = LanguageHelper.correctWord(trstr)
        if (Utils.isNullOrEmpty(term)):
            i = (j - 1)
            continue
        lang = InnerMorphology.__detectLang(twr, i, j - 1, term)
        if (lang == MorphLang.UA):
            pure_ukr_words += 1
        elif (lang == MorphLang.RU):
            pure_rus_words += 1
        elif (lang == MorphLang.BY):
            pure_by_words += 1
        elif (lang == MorphLang.KZ):
            pure_kz_words += 1
        if ((((lang) & MorphLang.RU)) != MorphLang.UNKNOWN):
            tot_rus_words += 1
        if ((((lang) & MorphLang.UA)) != MorphLang.UNKNOWN):
            tot_ukr_words += 1
        if ((((lang) & MorphLang.BY)) != MorphLang.UNKNOWN):
            tot_by_words += 1
        if ((((lang) & MorphLang.KZ)) != MorphLang.UNKNOWN):
            tot_kz_words += 1
        if (ty == 1):
            term0 = term
        lemmas = None
        if (ty == 1 and not only_tokenizing):
            # Share one UniLexWrap per distinct term.
            wraplemmas7 = RefOutArgWrapper(None)
            inoutres8 = Utils.tryGetValue(uni_lex, term, wraplemmas7)
            lemmas = wraplemmas7.value
            if (not inoutres8):
                lemmas = InnerMorphology.UniLexWrap._new6(lang)
                uni_lex[term] = lemmas
        tok = MorphToken()
        tok.term = term
        tok.begin_char = i
        if (i == 733860):
            # Debug hook position (no-op) kept from the original source.
            pass
        tok.end_char = (j - 1)
        tok.tag = (lemmas)
        res.append(tok)
        i = (j - 1)
    # --- Determine the default language from the counters ----------------
    def_lang = MorphLang(dlang)
    if (pure_rus_words > pure_ukr_words and pure_rus_words > pure_by_words and pure_rus_words > pure_kz_words):
        def_lang = MorphLang.RU
    elif (tot_rus_words > tot_ukr_words and tot_rus_words > tot_by_words and tot_rus_words > tot_kz_words):
        def_lang = MorphLang.RU
    elif (pure_ukr_words > pure_rus_words and pure_ukr_words > pure_by_words and pure_ukr_words > pure_kz_words):
        def_lang = MorphLang.UA
    elif (tot_ukr_words > tot_rus_words and tot_ukr_words > tot_by_words and tot_ukr_words > tot_kz_words):
        def_lang = MorphLang.UA
    elif (pure_kz_words > pure_rus_words and pure_kz_words > pure_ukr_words and pure_kz_words > pure_by_words):
        def_lang = MorphLang.KZ
    elif (tot_kz_words > tot_rus_words and tot_kz_words > tot_ukr_words and tot_kz_words > tot_by_words):
        def_lang = MorphLang.KZ
    elif (pure_by_words > pure_rus_words and pure_by_words > pure_ukr_words and pure_by_words > pure_kz_words):
        def_lang = MorphLang.BY
    elif (tot_by_words > tot_rus_words and tot_by_words > tot_ukr_words and tot_by_words > tot_kz_words):
        # Belarusian wins only with a clear margin over Russian.
        if (tot_rus_words > 10 and tot_by_words > (tot_rus_words + 20)):
            def_lang = MorphLang.BY
        elif (tot_rus_words == 0 or tot_by_words >= (tot_rus_words * 2)):
            def_lang = MorphLang.BY
    # Ambiguous case: probe up to 100 Cyrillic lexemes through the engines.
    if (((def_lang.is_undefined or def_lang.is_ua)) and tot_rus_words > 0):
        if (((tot_ukr_words > tot_rus_words and InnerMorphology.M_ENGINE_UA.language.is_ua)) or ((tot_by_words > tot_rus_words and InnerMorphology.M_ENGINE_BY.language.is_by)) or ((tot_kz_words > tot_rus_words and InnerMorphology.M_ENGINE_KZ.language.is_kz))):
            cou0 = 0
            tot_kz_words = 0
            tot_ukr_words = tot_kz_words
            tot_by_words = tot_ukr_words
            tot_rus_words = tot_by_words
            for kp in uni_lex.items():
                lang = MorphLang()
                wraplang9 = RefOutArgWrapper(lang)
                kp[1].word_forms = self.__processOneWord(kp[0], wraplang9)
                lang = wraplang9.value
                if (kp[1].word_forms is not None):
                    for wf in kp[1].word_forms:
                        lang |= wf.language
                kp[1].lang = lang
                if (lang.is_ru):
                    tot_rus_words += 1
                if (lang.is_ua):
                    tot_ukr_words += 1
                if (lang.is_by):
                    tot_by_words += 1
                if (lang.is_kz):
                    tot_kz_words += 1
                if (lang.is_cyrillic):
                    cou0 += 1
                if (cou0 >= 100):
                    break
            if (tot_rus_words > ((math.floor(tot_by_words / 2))) and tot_rus_words > ((math.floor(tot_ukr_words / 2)))):
                def_lang = MorphLang.RU
            elif (tot_ukr_words > ((math.floor(tot_rus_words / 2))) and tot_ukr_words > ((math.floor(tot_by_words / 2)))):
                def_lang = MorphLang.UA
            elif (tot_by_words > ((math.floor(tot_rus_words / 2))) and tot_by_words > ((math.floor(tot_ukr_words / 2)))):
                def_lang = MorphLang.BY
        elif (def_lang.is_undefined):
            def_lang = MorphLang.RU
    # --- Pass 2: morphology lookup for every unique lexeme ---------------
    cou = 0
    tot_kz_words = 0
    tot_ukr_words = tot_kz_words
    tot_by_words = tot_ukr_words
    tot_rus_words = tot_by_words
    for kp in uni_lex.items():
        lang = def_lang
        if (lang.is_undefined):
            if (tot_rus_words > tot_by_words and tot_rus_words > tot_ukr_words and tot_rus_words > tot_kz_words):
                lang = MorphLang.RU
            elif (tot_ukr_words > tot_rus_words and tot_ukr_words > tot_by_words and tot_ukr_words > tot_kz_words):
                lang = MorphLang.UA
            elif (tot_by_words > tot_rus_words and tot_by_words > tot_ukr_words and tot_by_words > tot_kz_words):
                lang = MorphLang.BY
            elif (tot_kz_words > tot_rus_words and tot_kz_words > tot_ukr_words and tot_kz_words > tot_by_words):
                lang = MorphLang.KZ
        wraplang10 = RefOutArgWrapper(lang)
        kp[1].word_forms = self.__processOneWord(kp[0], wraplang10)
        lang = wraplang10.value
        kp[1].lang = lang
        if ((((lang) & MorphLang.RU)) != MorphLang.UNKNOWN):
            tot_rus_words += 1
        if ((((lang) & MorphLang.UA)) != MorphLang.UNKNOWN):
            tot_ukr_words += 1
        if ((((lang) & MorphLang.BY)) != MorphLang.UNKNOWN):
            tot_by_words += 1
        if ((((lang) & MorphLang.KZ)) != MorphLang.UNKNOWN):
            tot_kz_words += 1
        if (progress is not None):
            self.__onProgress(cou, len(uni_lex), progress)
        cou += 1
    # --- Attach word forms to tokens -------------------------------------
    debug_token = None
    empty_list = None
    for r in res:
        uni = Utils.asObjectOrNull(r.tag, InnerMorphology.UniLexWrap)
        r.tag = None
        if (uni is None or uni.word_forms is None or len(uni.word_forms) == 0):
            if (empty_list is None):
                empty_list = list()
            r.word_forms = empty_list
            if (uni is not None):
                r.language = uni.lang
        else:
            r.word_forms = uni.word_forms
        if (r.begin_char == 733860):
            debug_token = r
    # --- OCR-noise repairs (merge tokens split by quotes, digits, hyphens)
    if (not good_text):
        i = 0
        first_pass2709 = True
        while True:
            if first_pass2709: first_pass2709 = False
            else: i += 1
            if (not (i < (len(res) - 2))): break
            ui0 = twrch[res[i].begin_char]
            ui1 = twrch[res[i + 1].begin_char]
            ui2 = twrch[res[i + 2].begin_char]
            if (ui1.is_quot):
                p = res[i + 1].begin_char
                # "об"ект"-style hard sign written as a quote: try Ъ.
                if ((p >= 2 and "БбТт".find(text[p - 1]) >= 0 and ((p + 3) < len(text))) and "ЕеЯяЁё".find(text[p + 1]) >= 0):
                    wstr = LanguageHelper.transliteralCorrection(LanguageHelper.correctWord("{0}Ъ{1}".format(res[i].getSourceText(text), res[i + 2].getSourceText(text))), None, False)
                    li = self.__processOneWord0(wstr)
                    if (li is not None and len(li) > 0 and li[0].is_in_dictionary):
                        res[i].end_char = res[i + 2].end_char
                        res[i].term = wstr
                        res[i].word_forms = li
                        del res[i + 1:i + 1 + 2]
                elif ((ui1.is_apos and p > 0 and str.isalpha(text[p - 1])) and ((p + 1) < len(text)) and str.isalpha(text[p + 1])):
                    # Ukrainian apostrophe inside a word: glue both halves.
                    if (def_lang == MorphLang.UA or (((res[i].language) & MorphLang.UA)) != MorphLang.UNKNOWN or (((res[i + 2].language) & MorphLang.UA)) != MorphLang.UNKNOWN):
                        wstr = LanguageHelper.transliteralCorrection(LanguageHelper.correctWord("{0}{1}".format(res[i].getSourceText(text), res[i + 2].getSourceText(text))), None, False)
                        li = self.__processOneWord0(wstr)
                        okk = True
                        if (okk):
                            res[i].end_char = res[i + 2].end_char
                            res[i].term = wstr
                            if (li is None):
                                li = list()
                            res[i].word_forms = li
                            if (li is not None and len(li) > 0):
                                res[i].language = li[0].language
                            del res[i + 1:i + 1 + 2]
            elif (((ui1.uni_char == '3' or ui1.uni_char == '4')) and res[i + 1].length == 1):
                # Digit 3/4 misrecognized for Cyrillic З/Ч inside a word.
                src = ("З" if ui1.uni_char == '3' else "Ч")
                i0 = i + 1
                if ((res[i].end_char + 1) == res[i + 1].begin_char and ui0.is_cyrillic):
                    i0 -= 1
                    src = (res[i0].getSourceText(text) + src)
                i1 = i + 1
                if ((res[i + 1].end_char + 1) == res[i + 2].begin_char and ui2.is_cyrillic):
                    i1 += 1
                    src += res[i1].getSourceText(text)
                if (len(src) > 2):
                    wstr = LanguageHelper.transliteralCorrection(LanguageHelper.correctWord(src), None, False)
                    li = self.__processOneWord0(wstr)
                    if (li is not None and len(li) > 0 and li[0].is_in_dictionary):
                        res[i0].end_char = res[i1].end_char
                        res[i0].term = wstr
                        res[i0].word_forms = li
                        del res[i0 + 1:i0 + 1 + i1 - i0]
            elif ((ui1.is_hiphen and ui0.is_letter and ui2.is_letter) and res[i].end_char > res[i].begin_char and res[i + 2].end_char > res[i + 2].begin_char):
                # Word hyphenated across a line break: decide whether to join.
                newline = False
                sps = 0
                j = (res[i + 1].end_char + 1)
                while j < res[i + 2].begin_char:
                    if (text[j] == '\r' or text[j] == '\n'):
                        newline = True
                        sps += 1
                    elif (not Utils.isWhitespace(text[j])):
                        break
                    else:
                        sps += 1
                    j += 1
                full_word = LanguageHelper.correctWord(res[i].getSourceText(text) + res[i + 2].getSourceText(text))
                if (not newline):
                    # Heuristics that still force a join without a newline.
                    if (full_word in uni_lex or full_word == "ИЗЗА"):
                        newline = True
                    elif (text[res[i + 1].begin_char] == (chr(0x00AD))):
                        newline = True
                    elif (LanguageHelper.endsWithEx(res[i].getSourceText(text), "О", "о", None, None) and len(res[i + 2].word_forms) > 0 and res[i + 2].word_forms[0].is_in_dictionary):
                        if (text[res[i + 1].begin_char] == '¬'):
                            li = self.__processOneWord0(full_word)
                            if (li is not None and len(li) > 0 and li[0].is_in_dictionary):
                                newline = True
                    elif ((res[i].end_char + 2) == res[i + 2].begin_char):
                        if (not str.isupper(text[res[i + 2].begin_char]) and (sps < 2) and len(full_word) > 4):
                            newline = True
                            if ((i + 3) < len(res)):
                                ui3 = twrch[res[i + 3].begin_char]
                                if (ui3.is_hiphen):
                                    newline = False
                    elif (((res[i].end_char + 1) == res[i + 1].begin_char and sps > 0 and (sps < 3)) and len(full_word) > 4):
                        newline = True
                if (newline):
                    li = self.__processOneWord0(full_word)
                    if (li is not None and len(li) > 0 and ((li[0].is_in_dictionary or full_word in uni_lex))):
                        res[i].end_char = res[i + 2].end_char
                        res[i].term = full_word
                        res[i].word_forms = li
                        del res[i + 1:i + 1 + 2]
                    else:
                        pass
            elif ((ui1.is_letter and ui0.is_letter and res[i].length > 2) and res[i + 1].length > 1):
                # Two word halves split by a raw newline, no hyphen.
                if (ui0.is_upper != ui1.is_upper):
                    continue
                if (not ui0.is_cyrillic or not ui1.is_cyrillic):
                    continue
                newline = False
                j = (res[i].end_char + 1)
                while j < res[i + 1].begin_char:
                    if (twrch[j].code == 0xD or twrch[j].code == 0xA):
                        newline = True
                        break
                    j += 1
                if (not newline):
                    continue
                full_word = LanguageHelper.correctWord(res[i].getSourceText(text) + res[i + 1].getSourceText(text))
                if (not full_word in uni_lex):
                    continue
                li = self.__processOneWord0(full_word)
                if (li is not None and len(li) > 0 and li[0].is_in_dictionary):
                    res[i].end_char = res[i + 1].end_char
                    res[i].term = full_word
                    res[i].word_forms = li
                    del res[i + 1]
    # --- Compute per-token character class flags --------------------------
    i = 0
    first_pass2710 = True
    while True:
        if first_pass2710: first_pass2710 = False
        else: i += 1
        if (not (i < len(res))): break
        mt = res[i]
        mt.char_info = CharsInfo()
        ui0 = twrch[mt.begin_char]
        ui00 = UnicodeInfo.ALL_CHARS[ord((res[i].term[0]))]
        # Advance ui0 to the first letter char of the token, if any.
        j = (mt.begin_char + 1)
        while j <= mt.end_char:
            if (ui0.is_letter):
                break
            ui0 = twrch[j]
            j += 1
        if (ui0.is_letter):
            res[i].char_info.is_letter = True
            if (ui00.is_latin):
                res[i].char_info.is_latin_letter = True
            elif (ui00.is_cyrillic):
                res[i].char_info.is_cyrillic_letter = True
            if (res[i].language == MorphLang.UNKNOWN):
                if (LanguageHelper.isCyrillic(mt.term)):
                    res[i].language = (MorphLang.RU if def_lang.is_undefined else def_lang)
        if (good_text):
            continue
        # Case profile: all upper / all lower / Capitalized / last-lower.
        all_up = True
        all_lo = True
        j = mt.begin_char
        while j <= mt.end_char:
            if (twrch[j].is_upper or twrch[j].is_digit):
                all_lo = False
            else:
                all_up = False
            j += 1
        if (all_up):
            mt.char_info.is_all_upper = True
        elif (all_lo):
            mt.char_info.is_all_lower = True
        elif (((ui0.is_upper or twrch[mt.begin_char].is_digit)) and mt.end_char > mt.begin_char):
            all_lo = True
            j = (mt.begin_char + 1)
            while j <= mt.end_char:
                if (twrch[j].is_upper or twrch[j].is_digit):
                    all_lo = False
                    break
                j += 1
            if (all_lo):
                mt.char_info.is_capital_upper = True
            elif (twrch[mt.end_char].is_lower and (mt.end_char - mt.begin_char) > 1):
                all_up = True
                j = mt.begin_char
                while j < mt.end_char:
                    if (twrch[j].is_lower):
                        all_up = False
                        break
                    j += 1
                if (all_up):
                    mt.char_info.is_last_lower = True
        # "ABCs"-like token: add the stem (without last char) as a noun form.
        if (mt.char_info.is_last_lower and mt.length > 2 and mt.char_info.is_cyrillic_letter):
            pref = text[mt.begin_char:mt.begin_char + mt.end_char - mt.begin_char]
            ok = False
            for wf in mt.word_forms:
                if (wf.normal_case == pref or wf.normal_full == pref):
                    ok = True
                    break
            if (not ok):
                mt.word_forms = list(mt.word_forms)
                mt.word_forms.insert(0, MorphWordForm._new11(pref, MorphClass.NOUN, 1))
    if (good_text or only_tokenizing):
        return res
    # --- Single Latin C/A/P letters inside Cyrillic context → Cyrillic ----
    i = 0
    first_pass2711 = True
    while True:
        if first_pass2711: first_pass2711 = False
        else: i += 1
        if (not (i < len(res))): break
        if (res[i].length == 1 and res[i].char_info.is_latin_letter):
            ch = res[i].term[0]
            if (ch == 'C' or ch == 'A' or ch == 'P'):
                pass
            else:
                continue
            # Look for an adjacent letter token to decide the script.
            is_rus = False
            for ii in range(i - 1, -1, -1):
                if ((res[ii].end_char + 1) != res[ii + 1].begin_char):
                    break
                elif (res[ii].char_info.is_letter):
                    is_rus = res[ii].char_info.is_cyrillic_letter
                    break
            if (not is_rus):
                ii = i + 1
                while ii < len(res):
                    if ((res[ii - 1].end_char + 1) != res[ii].begin_char):
                        break
                    elif (res[ii].char_info.is_letter):
                        is_rus = res[ii].char_info.is_cyrillic_letter
                        break
                    ii += 1
            if (is_rus):
                res[i].term = LanguageHelper.transliteralCorrection(res[i].term, None, True)
                res[i].char_info.is_cyrillic_letter = True
                res[i].char_info.is_latin_letter = True
    # --- Add surname variants for capitalized Cyrillic tokens -------------
    for r in res:
        if (r.char_info.is_all_upper or r.char_info.is_capital_upper):
            if (r.language.is_cyrillic):
                ok = False
                for wf in r.word_forms:
                    if (wf.class0_.is_proper_surname):
                        ok = True
                        break
                if (not ok):
                    r.word_forms = list(r.word_forms)
                    InnerMorphology.M_ENGINE_RU.processSurnameVariants(r.term, r.word_forms)
    # Every word form gets at least the raw term as its normal case.
    for r in res:
        for mv in r.word_forms:
            if (mv.normal_case is None):
                mv.normal_case = r.term
    # --- Merge X"word patterns (single Latin letter + quote + word) -------
    i = 0
    while i < (len(res) - 2):
        if (res[i].char_info.is_latin_letter and res[i].char_info.is_all_upper and res[i].length == 1):
            if (twrch[res[i + 1].begin_char].is_quot and res[i + 2].char_info.is_latin_letter and res[i + 2].length > 2):
                if ((res[i].end_char + 1) == res[i + 1].begin_char and (res[i + 1].end_char + 1) == res[i + 2].begin_char):
                    wstr = "{0}{1}".format(res[i].term, res[i + 2].term)
                    li = self.__processOneWord0(wstr)
                    if (li is not None):
                        res[i].word_forms = li
                        res[i].end_char = res[i + 2].end_char
                        res[i].term = wstr
                        if (res[i + 2].char_info.is_all_lower):
                            res[i].char_info.is_all_upper = False
                            res[i].char_info.is_capital_upper = True
                        elif (not res[i + 2].char_info.is_all_upper):
                            res[i].char_info.is_all_upper = False
                        del res[i + 1:i + 1 + 2]
        i += 1
    # --- Merge adjacent non-letter tokens (hyphen pairs handled specially)
    i = 0
    first_pass2712 = True
    while True:
        if first_pass2712: first_pass2712 = False
        else: i += 1
        if (not (i < (len(res) - 1))): break
        if (not res[i].char_info.is_letter and not res[i + 1].char_info.is_letter and (res[i].end_char + 1) == res[i + 1].begin_char):
            if (twrch[res[i].begin_char].is_hiphen and twrch[res[i + 1].begin_char].is_hiphen):
                # Merge hyphen runs only as exact pairs: must be the start
                # of the run and the run must end after two hyphens.
                if (i == 0 or not twrch[res[i - 1].begin_char].is_hiphen):
                    pass
                else:
                    continue
                if ((i + 2) == len(res) or not twrch[res[i + 2].begin_char].is_hiphen):
                    pass
                else:
                    continue
            res[i].end_char = res[i + 1].end_char
            del res[i + 1]
    return res
def __get_name_without_brackets(begin: 'Token', end: 'Token', normalize_first_noun_group: bool = False, normal_first_group_single: bool = False, ignore_geo_referent: bool = False) -> str:
    """Get the string value between two tokens, excluding quotes and brackets.

    Newer-API counterpart of __getNameWithoutBrackets (snake_case services).

    Args:
        begin(Token): first token
        end(Token): last token
        normalize_first_noun_group(bool): normalize the first noun group
            to nominative case
        normal_first_group_single(bool): also force the first noun group
            to singular
        ignore_geo_referent(bool): ignore geographic entities inside

    Returns None when the trimmed result would be empty.
    """
    res = None
    # Strip an enclosing bracket/quote pair, if present.
    if (BracketHelper.can_be_start_of_sequence(begin, False, False) and BracketHelper.can_be_end_of_sequence(end, False, begin, False)):
        begin = begin.next0_
        end = end.previous
    if (normalize_first_noun_group and not begin.morph.class0_.is_preposition):
        npt = NounPhraseHelper.try_parse(begin, NounPhraseParseAttr.REFERENTCANBENOUN, 0, None)
        if (npt is not None):
            # Reject a noun phrase whose noun is unknown and has no adjectives.
            if (npt.noun.get_morph_class_in_dictionary().is_undefined and len(npt.adjectives) == 0):
                npt = (None)
        if (npt is not None and npt.end_token.end_char > end.end_char):
            npt = (None)
        if (npt is not None):
            res = npt.get_normal_case_text(None, (MorphNumber.SINGULAR if normal_first_group_single else MorphNumber.UNDEFINED), MorphGender.UNDEFINED, False)
            te = npt.end_token.next0_
            # Handle a trailing ", <participle>" agreeing with the noun group.
            if (((te is not None and te.next0_ is not None and te.is_comma) and (isinstance(te.next0_, TextToken)) and te.next0_.end_char <= end.end_char) and te.next0_.morph.class0_.is_verb and te.next0_.morph.class0_.is_adjective):
                for it in te.next0_.morph.items:
                    if (it.gender == npt.morph.gender or ((it.gender) & (npt.morph.gender)) != (MorphGender.UNDEFINED)):
                        if (not ((it.case_) & npt.morph.case_).is_undefined):
                            if (it.number == npt.morph.number or ((it.number) & (npt.morph.number)) != (MorphNumber.UNDEFINED)):
                                var = te.next0_.term
                                if (isinstance(it, MorphWordForm)):
                                    var = it.normal_case
                                bi = MorphBaseInfo._new492(MorphClass.ADJECTIVE, npt.morph.gender, npt.morph.number, npt.morph.language)
                                # Put the participle into the agreeing form.
                                var = MorphologyService.get_wordform(var, bi)
                                if (var is not None):
                                    res = "{0}, {1}".format(res, var)
                                    te = te.next0_.next0_
                                break
            # Append the remainder of the span verbatim.
            if (te is not None and te.end_char <= end.end_char):
                s = ProperNameHelper.get_name_ex(te, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED, MorphGender.UNDEFINED, True, ignore_geo_referent)
                if (not Utils.isNullOrEmpty(s)):
                    if (not str.isalnum(s[0])):
                        res = "{0}{1}".format(res, s)
                    else:
                        res = "{0} {1}".format(res, s)
    elif ((isinstance(begin, TextToken)) and begin.chars.is_cyrillic_letter):
        # No noun-group normalization: normalize just the first word.
        mm = begin.get_morph_class_in_dictionary()
        if (not mm.is_undefined):
            res = begin.get_normal_case_text(mm, MorphNumber.UNDEFINED, MorphGender.UNDEFINED, False)
            if (begin.end_char < end.end_char):
                res = "{0} {1}".format(res, ProperNameHelper.get_name_ex(begin.next0_, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED, MorphGender.UNDEFINED, True, False))
    if (res is None):
        res = ProperNameHelper.get_name_ex(begin, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED, MorphGender.UNDEFINED, True, ignore_geo_referent)
    # Trim trailing '*' and whitespace; empty after trimming -> None.
    if (not Utils.isNullOrEmpty(res)):
        k = 0
        i = len(res) - 1
        while i >= 0:
            if (res[i] == '*' or Utils.isWhitespace(res[i])):
                pass
            else:
                break
            i -= 1
            k += 1
        if (k > 0):
            if (k == len(res)):
                return None
            res = res[0:0 + len(res) - k]
    return res
def __TryParse(t: 'Token', prev: 'TransItemToken', after_conj: bool, attach_high: bool = False) -> 'TransItemToken':
    """Try to recognize one transport-related item starting at token *t*.

    Depending on what is found, returns a TransItemToken of type ORG, ROUTE,
    DATE, NOUN, BRAND, MODEL, NUMBER, NAME or CLASS, or None when nothing
    matches.  *prev* is the previously recognized item (used for context,
    e.g. a NAME is only accepted after a ship/space NOUN); *after_conj*
    relaxes that context after a conjunction; *attach_high* allows doubtful
    ontology matches to be accepted without supporting context.

    Args:
        t: token to start parsing at (may be None).
        prev: previously parsed item or None.
        after_conj: True when parsing right after a conjunction.
        attach_high: accept doubtful terms without a preceding NOUN.

    Returns:
        A TransItemToken, or None.
    """
    if (t is None):
        return None
    t1 = t
    # Skip a leading comma and the verb "belongs to" (RU/UA).
    if (t1.isChar(',')):
        t1 = t1.next0_
    if (t1 is not None and t1.isValue("ПРИНАДЛЕЖАТЬ", "НАЛЕЖАТИ")):
        t1 = t1.next0_
    # An organization referent right here → owner/operator item.
    if (isinstance(t1, ReferentToken)):
        if (t1.getReferent().type_name == "ORGANIZATION"):
            return TransItemToken._new2521(t, t1, TransItemToken.Typs.ORG, t1.getReferent(), t1.morph)
    route = False
    # "follows ..." / "performs ..." (RU/UA) introduces a route.
    if (t1 is not None and ((t1.isValue("СЛЕДОВАТЬ", "СЛІДУВАТИ") or t1.isValue("ВЫПОЛНЯТЬ", "ВИКОНУВАТИ")))):
        t1 = t1.next0_
        route = True
    if (t1 is not None and t1.morph.class0_.is_preposition):
        t1 = t1.next0_
    # "РЕЙС" (flight) / "МАРШРУТ" (route) keywords.
    if (t1 is not None and ((t1.isValue("РЕЙС", None) or t1.isValue("МАРШРУТ", None)))):
        t1 = t1.next0_
        route = True
    if (isinstance(t1, ReferentToken)):
        if (isinstance(t1.getReferent(), GeoReferent)):
            geo_ = Utils.asObjectOrNull(t1.getReferent(), GeoReferent)
            if (geo_.is_state or geo_.is_city):
                # Collect a chain of city/state referents separated by
                # hyphens, prepositions or conjunctions → ROUTE item.
                tit = TransItemToken._new2522(t, t1, TransItemToken.Typs.ROUTE, list())
                tit.route_items.append(geo_)
                t1 = t1.next0_
                # Generated continue-emulating loop: the "first_pass" flag
                # skips the increment on the first iteration.
                first_pass3132 = True
                while True:
                    if first_pass3132: first_pass3132 = False
                    else: t1 = t1.next0_
                    if (not (t1 is not None)): break
                    if (t1.is_hiphen):
                        continue
                    if (t1.morph.class0_.is_preposition or t1.morph.class0_.is_conjunction):
                        continue
                    geo_ = (Utils.asObjectOrNull(t1.getReferent(), GeoReferent))
                    if (geo_ is None):
                        break
                    if (not geo_.is_city and not geo_.is_state):
                        break
                    tit.route_items.append(geo_)
                    tit.end_token = t1
                # A single geo is only a route when a route keyword was seen.
                if (len(tit.route_items) > 1 or route):
                    return tit
        elif ((isinstance(t1.getReferent(), DateReferent)) and (t1.whitespaces_before_count < 3)):
            # A nearby date → DATE item; absorb trailing "в." / "вып(уск)." tokens.
            tit = TransItemToken._new2523(t, t1, TransItemToken.Typs.DATE, t1.getReferent())
            if (t1.next0_ is not None):
                if (t1.next0_.isValue("В", None) and t1.next0_.next0_ is not None and t1.next0_.next0_.isChar('.')):
                    tit.end_token = t1.next0_.next0_
                elif (t1.next0_.isValue("ВЫП", None) or t1.next0_.isValue("ВЫПУСК", None)):
                    tit.end_token = t1.next0_
                    if (t1.next0_.next0_ is not None and t1.next0_.next0_.isChar('.')):
                        tit.end_token = t1.next0_.next0_
            return tit
    if (isinstance(t, TextToken)):
        # Explicit number prefix ("№", etc.) → registration number or model number.
        num = MiscHelper.checkNumberPrefix(t)
        if (num is not None):
            tit = TransItemToken.__attachRusAutoNumber(num)
            if (tit is None):
                tit = TransItemToken._attachNumber(num, False)
            if (tit is not None):
                tit.begin_token = t
                return tit
        # Look the word up in the transport ontology.
        tok = TransItemToken.M_ONTOLOGY.tryParse(t, TerminParseAttr.NO)
        # Retry after a leading "С"/"C"/"ЗА" (note: Cyrillic and Latin "C").
        if (tok is None and ((t.isValue("С", None) or t.isValue("C", None) or t.isValue("ЗА", None)))):
            tok = TransItemToken.M_ONTOLOGY.tryParse(t.next0_, TerminParseAttr.NO)
        # Ontology term inside brackets/quotes.
        if (tok is None and BracketHelper.isBracket(t, True)):
            tok1 = TransItemToken.M_ONTOLOGY.tryParse(t.next0_, TerminParseAttr.NO)
            if (tok1 is not None and BracketHelper.isBracket(tok1.end_token.next0_, True)):
                tok = tok1
                tok.begin_token = t
                tok.end_token = tok.end_token.next0_
                tok.begin_token = t
            elif (tok1 is not None):
                tt = Utils.asObjectOrNull(tok1.termin, TransItemToken.TransTermin)
                # Only a BRAND may be accepted with an unclosed bracket.
                if (tt.typ == TransItemToken.Typs.BRAND):
                    tok = tok1
                    tok.begin_token = t
        # "МАРКА" ("make/brand") + following NAME/BRAND → re-type as BRAND.
        if (tok is None and t.isValue("МАРКА", None)):
            res1 = TransItemToken.__TryParse(t.next0_, prev, after_conj, False)
            if (res1 is not None):
                if (res1.typ == TransItemToken.Typs.NAME or res1.typ == TransItemToken.Typs.BRAND):
                    res1.begin_token = t
                    res1.typ = TransItemToken.Typs.BRAND
                    return res1
        if (tok is not None):
            tt = Utils.asObjectOrNull(tok.termin, TransItemToken.TransTermin)
            if (tt.typ == TransItemToken.Typs.NUMBER):
                # The term itself announces a number; the number must follow.
                tit = TransItemToken.__attachRusAutoNumber(tok.end_token.next0_)
                if (tit is None):
                    tit = TransItemToken._attachNumber(tok.end_token.next0_, False)
                if (tit is not None):
                    tit.begin_token = t
                    return tit
                else:
                    return None
            if (tt.is_doubt and not attach_high):
                # A doubtful term needs a preceding transport NOUN — except a
                # BRAND repeating the previous BRAND verbatim.
                if (prev is None or prev.typ != TransItemToken.Typs.NOUN):
                    if ((prev is not None and prev.typ == TransItemToken.Typs.BRAND and tt.typ == TransItemToken.Typs.BRAND) and Utils.compareStrings(tt.canonic_text, prev.value, True) == 0):
                        pass
                    else:
                        return None
            if (tt.canonic_text == "СУДНО"):
                # Plural "СУДНО" (vessel) is ambiguous with "courts" — require a
                # following quoted name.
                if ((((tok.morph.number) & (MorphNumber.PLURAL))) != (MorphNumber.UNDEFINED)):
                    if (not BracketHelper.canBeStartOfSequence(tok.end_token.next0_, False, False)):
                        return None
            tit = TransItemToken._new2524(tok.begin_token, tok.end_token, tt.kind, tt.typ, tt.is_doubt, tok.chars, tok.morph)
            tit.value = tt.canonic_text
            # Convention: nouns are stored lowercase, everything else uppercase.
            if (tit.typ == TransItemToken.Typs.NOUN):
                tit.value = tit.value.lower()
            else:
                tit.value = tit.value.upper()
            return tit
        # Adjective + transport noun phrase ("морское судно" etc.).
        if (tok is None and t.morph.class0_.is_adjective):
            npt = NounPhraseHelper.tryParse(t, NounPhraseParseAttr.NO, 0)
            if (npt is not None and len(npt.adjectives) > 0):
                state_ = None
                tt = t
                first_pass3133 = True
                while True:
                    if first_pass3133: first_pass3133 = False
                    else: tt = tt.next0_
                    if (not (tt is not None and tt.previous != npt.end_token)): break
                    tok = TransItemToken.M_ONTOLOGY.tryParse(tt, TerminParseAttr.NO)
                    # Remember a GEO referent among the adjectives (e.g. a
                    # country adjective) to attach as the vehicle's state.
                    if (tok is None and state_ is None):
                        state_ = tt.kit.processReferent("GEO", tt)
                    if (tok is not None and tok.end_token == npt.end_token):
                        if ((tok.termin).typ == TransItemToken.Typs.NOUN):
                            tit = TransItemToken._new2524(t, tok.end_token, (tok.termin).kind, TransItemToken.Typs.NOUN, (tok.termin).is_doubt, tok.chars, npt.morph)
                            tit.value = (tok.termin).canonic_text.lower()
                            tit.alt_value = npt.getNormalCaseText(None, False, MorphGender.UNDEFINED, False).lower()
                            # "...суд/суда" could be "court(s)" — require a
                            # following quoted name, else keep scanning.
                            if (LanguageHelper.endsWithEx(tit.alt_value, "суд", "суда", None, None)):
                                if (not BracketHelper.canBeStartOfSequence(tok.end_token.next0_, False, False)):
                                    continue
                            if (state_ is not None):
                                if ((state_.referent).is_state):
                                    tit.state = state_
                            return tit
    # "КЛАСС" + bracketed text → CLASS item.
    if (t is not None and t.isValue("КЛАСС", None) and t.next0_ is not None):
        br = BracketHelper.tryParse(t.next0_, BracketParseAttr.NO, 100)
        if (br is not None):
            return TransItemToken._new2526(t, br.end_token, TransItemToken.Typs.CLASS, MiscHelper.getTextValueOfMetaToken(br, GetTextAttr.NO))
    # A bare number is a model continuation only right after a BRAND.
    nt = Utils.asObjectOrNull(t, NumberToken)
    if (nt is not None):
        if (prev is None or nt.typ != NumberSpellingType.DIGIT):
            return None
        if (prev.typ == TransItemToken.Typs.BRAND):
            return TransItemToken.__attachModel(t, False, prev)
        else:
            return None
    # Russian auto registration number: accept when confident, or when the
    # context (auto NOUN / BRAND / MODEL before) supports a doubtful match.
    res = TransItemToken.__attachRusAutoNumber(t)
    if ((res) is not None):
        if (not res.is_doubt):
            return res
        if (prev is not None and prev.typ == TransItemToken.Typs.NOUN and prev.kind == TransportKind.AUTO):
            return res
        if (prev is not None and ((prev.typ == TransItemToken.Typs.BRAND or prev.typ == TransItemToken.Typs.MODEL))):
            return res
    t1 = t
    if (t.is_hiphen):
        t1 = t.next0_
    # BRAND followed by (possibly hyphenated) model designation.
    if (prev is not None and prev.typ == TransItemToken.Typs.BRAND and t1 is not None):
        tit = TransItemToken.__attachModel(t1, True, prev)
        if (tit is not None):
            tit.begin_token = t
            return tit
    # Quoted text right after a transport NOUN (or after a conjunction):
    # either a nested item, a NAME, or a MODEL, chosen by character content.
    if (prev is not None and ((prev.typ == TransItemToken.Typs.NOUN or after_conj))):
        br = BracketHelper.tryParse(t, BracketParseAttr.NO, 100)
        if (br is not None and br.is_quote_type):
            tit = TransItemToken.tryParse(br.begin_token.next0_, prev, after_conj, False)
            if (tit is not None and tit.end_token.next0_ == br.end_token):
                if (not tit.is_doubt or tit.typ == TransItemToken.Typs.BRAND):
                    tit.begin_token = br.begin_token
                    tit.end_token = br.end_token
                    return tit
            s = MiscHelper.getTextValueOfMetaToken(br, GetTextAttr.NO)
            if (not Utils.isNullOrEmpty(s) and (len(s) < 30)):
                # Classify the quoted string: letters vs digits vs other.
                chars_ = 0
                digs = 0
                un = 0
                for c in s:
                    if (not Utils.isWhitespace(c)):
                        if (str.isalpha(c)):
                            chars_ += 1
                        elif (str.isdigit(c)):
                            digs += 1
                        else:
                            un += 1
                if (((digs == 0 and un == 0 and t.next0_.chars.is_capital_upper)) or prev.kind == TransportKind.SHIP or prev.kind == TransportKind.SPACE):
                    return TransItemToken._new2526(br.begin_token, br.end_token, TransItemToken.Typs.NAME, s)
                if (digs > 0 and (chars_ < 5)):
                    # Mostly digits → model code; spaces removed.
                    return TransItemToken._new2526(br.begin_token, br.end_token, TransItemToken.Typs.MODEL, s.replace(" ", ""))
    # Generic model attachment after NOUN/BRAND/NAME/MODEL.
    if (prev is not None and (((prev.typ == TransItemToken.Typs.NOUN or prev.typ == TransItemToken.Typs.BRAND or prev.typ == TransItemToken.Typs.NAME) or prev.typ == TransItemToken.Typs.MODEL))):
        tit = TransItemToken.__attachModel(t, prev.typ != TransItemToken.Typs.NAME, prev)
        if (tit is not None):
            return tit
    # Capitalized word after an auto NOUN that is not a person → BRAND.
    if (((prev is not None and prev.typ == TransItemToken.Typs.NOUN and prev.kind == TransportKind.AUTO) and (isinstance(t, TextToken)) and t.chars.is_letter) and not t.chars.is_all_lower and (t.whitespaces_before_count < 2)):
        pt = t.kit.processReferent("PERSON", t)
        if (pt is None):
            tit = TransItemToken._new2529(t, t, TransItemToken.Typs.BRAND)
            tit.value = (t).term
            return tit
    # Unquoted proper name after a ship/space NOUN (or after a conjunction).
    if (((prev is not None and prev.typ == TransItemToken.Typs.NOUN and ((prev.kind == TransportKind.SHIP or prev.kind == TransportKind.SPACE)))) or after_conj):
        if (t.chars.is_capital_upper):
            ok = True
            npt = NounPhraseHelper.tryParse(t, NounPhraseParseAttr.NO, 0)
            if (npt is not None and len(npt.adjectives) > 0):
                ok = False
            else:
                # A person mention here is not a vehicle name.
                rt = t.kit.processReferent("PERSON", t)
                if (rt is not None):
                    ok = False
            # A surname not in nominative case is rejected as well.
            if (t.getMorphClassInDictionary().is_proper_surname):
                if (not t.morph.case_.is_nominative):
                    ok = False
            if (ok):
                # Extend the name over adjacent tokens with the same casing,
                # stopping at any token that parses as another item.
                t1 = t
                tt = t.next0_
                while tt is not None:
                    if (tt.whitespaces_before_count > 1):
                        break
                    if (tt.chars != t.chars):
                        break
                    tit = TransItemToken.tryParse(tt, None, False, False)
                    if ((tit) is not None):
                        break
                    t1 = tt
                    tt = tt.next0_
                s = MiscHelper.getTextValue(t, t1, GetTextAttr.NO)
                if (s is not None):
                    res1 = TransItemToken._new2530(t, t1, TransItemToken.Typs.NAME, True, s)
                    # A following bracketed text replaces the value; the plain
                    # text is kept as alt_value.
                    if (not t1.is_newline_after):
                        br = BracketHelper.tryParse(t1.next0_, BracketParseAttr.NO, 100)
                        if (br is not None):
                            res1.end_token = br.end_token
                            res1.alt_value = res1.value
                            res1.value = MiscHelper.getTextValueOfMetaToken(br, GetTextAttr.NO)
                    return res1
    return None
def __doCrLfCorrection(self, txt: str) -> str:
    """Undo hard line-wrapping in force-formatted text.

    (Original docstring, translated: "Analysis of cases of forcibly
    formatted text".)

    First pass measures candidate lines: lines of at least 30 visible
    characters that do not end in '.', ':' or ';' and are not followed by a
    digit.  If at least 4 such lines exist and their average length is in
    [50, 100), the text is assumed hard-wrapped and a second pass replaces
    qualifying CR/LF characters with spaces (counting each replacement in
    self.crlf_corrected_count).

    Args:
        txt(str): input text.

    Returns:
        The corrected text, or *txt* unchanged when the heuristic does not
        trigger.
    """
    cou = 0
    total_len = 0
    i = 0
    # Generated continue-emulating loop: "first_pass" skips the first increment.
    first_pass3166 = True
    while True:
        if first_pass3166: first_pass3166 = False
        else: i += 1
        if (not (i < len(txt))): break
        ch = txt[i]
        # Only start measuring at a CR (0xD) or LF (0xA).
        if ((ord(ch)) != 0xD and (ord(ch)) != 0xA):
            continue
        len0_ = 0
        last_char = ch
        j = (i + 1)
        # Measure the following line up to the next CR/LF; tabs count as 5.
        while j < len(txt):
            ch = txt[j]
            if ((ord(ch)) == 0xD or (ord(ch)) == 0xA):
                break
            elif ((ord(ch)) == 0x9):
                len0_ += 5
            else:
                last_char = ch
                len0_ += 1
            j += 1
        if (j >= len(txt)):
            break
        # Short lines are ignored.
        if (len0_ < 30):
            continue
        if (last_char != '.' and last_char != ':' and last_char != ';'):
            # A digit right after the break suggests a list item — skip.
            next_is_dig = False
            k = j + 1
            while k < len(txt):
                if (not Utils.isWhitespace(txt[k])):
                    if (str.isdigit(txt[k])):
                        next_is_dig = True
                    break
                k += 1
            if (not next_is_dig):
                cou += 1
                total_len += len0_
        i = j
    # Heuristic gate: enough candidate lines with a plausible average width.
    if (cou < 4):
        return txt
    total_len = math.floor(total_len / cou)
    if ((total_len < 50) or total_len > 100):
        return txt
    # Second pass: rewrite qualifying line breaks in a StringIO copy.
    tmp = Utils.newStringIO(txt)
    i = 0
    while i < tmp.tell():
        ch = Utils.getCharAtStringIO(tmp, i)
        len0_ = 0
        last_char = ch
        j = (i + 1)
        # Re-measure the line starting after position i (same rules as pass 1).
        while j < tmp.tell():
            ch = Utils.getCharAtStringIO(tmp, j)
            if ((ord(ch)) == 0xD or (ord(ch)) == 0xA):
                break
            elif ((ord(ch)) == 0x9):
                len0_ += 5
            else:
                last_char = ch
                len0_ += 1
            j += 1
        if (j >= tmp.tell()):
            break
        # Find the last non-whitespace char of the line (for-else: jj = -1
        # when the whole prefix is whitespace).
        for jj in range(j - 1, -1, -1):
            last_char = Utils.getCharAtStringIO(tmp, jj)
            if (not Utils.isWhitespace(last_char)):
                break
        else:
            jj = -1
        # Detect a blank line after the break (another CR/LF before any
        # non-whitespace char) — paragraph boundary, must be kept.
        not_single = False
        jj = (j + 1)
        if ((jj < tmp.tell()) and (ord(Utils.getCharAtStringIO(tmp, j))) == 0xD and (ord(Utils.getCharAtStringIO(tmp, jj))) == 0xA):
            jj += 1
        while jj < tmp.tell():
            ch = Utils.getCharAtStringIO(tmp, jj)
            if (not Utils.isWhitespace(ch)):
                break
            if ((ord(ch)) == 0xD or (ord(ch)) == 0xA):
                not_single = True
                break
            jj += 1
        # Replace the break with a space when: single break, line length is
        # near the average (-20 / +10 window), and no sentence-ending mark.
        if (((not not_single and len0_ > (total_len - 20) and (len0_ < (total_len + 10))) and last_char != '.' and last_char != ':') and last_char != ';'):
            Utils.setCharAtStringIO(tmp, j, ' ')
            self.crlf_corrected_count += 1
            # Blank out the LF of a CRLF pair too.
            if ((j + 1) < tmp.tell()):
                ch = Utils.getCharAtStringIO(tmp, j + 1)
                if ((ord(ch)) == 0xA):
                    Utils.setCharAtStringIO(tmp, j + 1, ' ')
                    j += 1
        # Continue scanning from the end of this line.
        i = (j - 1)
        i += 1
    return Utils.toStringStringIO(tmp)