Python LanguageHelper.endsWithEx Examples

Programming Language: Python

Namespace/Package Name: pullenti.morph.LanguageHelper

Class/Type: LanguageHelper

Method/Function: endsWithEx

Examples at hotexamples.com: 9

Python LanguageHelper.endsWithEx - 9 examples found. These are the top-rated real-world Python examples of pullenti.morph.LanguageHelper.endsWithEx, extracted from open-source projects. You can rate the examples to help us improve their quality.

Frequently Used Methods

Show Hide

endsWith(21)

ends_with(18)

is_cyrillic_char(15)

isCyrillicChar(12)

endsWithEx(9)

ends_with_ex(8)

isCyrillicVowel(7)

is_cyrillic_vowel(6)

isLatinChar(5)

is_latin_char(4)

get_case_after_preposition(4)

isLatin(3)

is_latin(3)

normalize_preposition(2)

normalizePreposition(2)

is_cyrillic(2)

_get_word_lang(2)

correctWord(2)

isCyrillic(2)

getCaseAfterPreposition(2)

correct_word(2)

isHiphen(1)

isLatinVowel(1)

get_lat_for_cyr(1)

get_cyr_for_lat(1)

is_hiphen(1)

getLatForCyr(1)

getCyrForLat(1)

is_latin_vowel(1)

transliteralCorrection(1)

transliteral_correction(1)

Example #1

Show file

 def _DelSurnameEnd(s: str) -> str:
     """ Strip a trailing ending from a string (a surname, judging by the name).

     The suffixes are compared against upper-case Cyrillic literals:
       * strings shorter than 3 characters are returned untouched;
       * a final "А", "У" or "Е" is dropped;
       * a final "ОМ" or "ЫМ" is dropped;
       * a final "Я" or "Ю" preceded by "Н" or "Л" is replaced with "Ь";
       * anything else is returned unchanged.

     Args:
         s(str): the string to trim

     Returns:
         str: the trimmed string, or ``s`` itself when no rule applies
     """
     size = len(s)
     if size < 3:
         return s
     if LanguageHelper.endsWithEx(s, "А", "У", "Е", None):
         return s[:size - 1]
     if LanguageHelper.endsWith(s, "ОМ") or LanguageHelper.endsWith(s, "ЫМ"):
         return s[:size - 2]
     if not LanguageHelper.endsWithEx(s, "Я", "Ю", None, None):
         return s
     # Only a soft "Н"/"Л" stem gets its soft sign restored.
     penultimate = s[size - 2]
     if penultimate in ("Н", "Л"):
         return s[:size - 1] + "Ь"
     return s

Example #2

Show file

 def canHasRef(self, r: 'Referent') -> bool:
     """ Check that this referent can act as ATTR_REF.

     The decision is driven by ``self.name`` and by the kind of ``r``:
     a ``GeoReferent`` is accepted only for certain names (and, for some
     of them, only for states/regions or cities); a referent whose
     ``type_name`` is "ORGANIZATION" is accepted unless the name is one of
     the excluded ones or a required/forbidden slot check fails.

     Args:
         r(Referent): the referent to test

     Returns:
         bool: True when the reference is allowed; False otherwise,
         including when ``self.name`` or ``r`` is None
     """
     title = self.name
     if title is None or r is None:
         return False

     if isinstance(r, GeoReferent):
         geo = Utils.asObjectOrNull(r, GeoReferent)
         if LanguageHelper.endsWithEx(title, "президент", "губернатор",
                                      None, None):
             return geo.is_state or geo.is_region
         if title in ("мэр", "градоначальник"):
             return geo.is_city
         return title == "глава"

     if r.type_name != "ORGANIZATION":
         return False

     # These names are never attached to an organization.
     if (LanguageHelper.endsWith(title, "губернатор")
             or title in ("мэр", "градоначальник", "президент")):
         return False
     # A name containing "министр" requires the "министерство" slot lookup to succeed.
     if "министр" in title and r.findSlot(None, "министерство", True) is None:
         return False
     # A name ending in "директор" is rejected when the "суд" slot lookup succeeds.
     if title.endswith("директор") and r.findSlot(None, "суд", True) is not None:
         return False
     return True

Example #3

Show file

File: MorphToken.py Project: MihaJjDa/APCLtask

 def lemma(self) -> str:
     """ Lemma (a morphological normalization variant).

     Returns the stored ``__m_lemma`` when it is set. Otherwise a candidate
     is chosen from ``self.word_forms`` (the single form; a proper surname
     ending in "ОВ"/"ЕВ" for tokens that are not all lower-case; or the
     form preferred by ``__compareForms``) and a few suffix corrections are
     applied. When there are no word forms, ``self.term`` is returned, or
     "?" if the term is None.

     Returns:
         str: the lemma text
     """
     cached = self.__m_lemma
     if cached is not None:
         return cached

     forms = self.word_forms
     result = None
     if forms is not None and len(forms) > 0:
         if len(forms) == 1:
             only = forms[0]
             result = Utils.ifNotNull(only.normal_full, only.normal_case)
         if result is None and not self.char_info.is_all_lower:
             for form in forms:
                 if form.class0_.is_proper_surname:
                     candidate = Utils.ifNotNull(
                         form.normal_full,
                         Utils.ifNotNull(form.normal_case, ""))
                     if LanguageHelper.endsWithEx(candidate, "ОВ", "ЕВ",
                                                  None, None):
                         result = candidate
                         break
                 elif form.class0_.is_proper_name and form.is_in_dictionary:
                     # A dictionary proper name wins immediately.
                     return form.normal_case
         if result is None:
             # Keep the form that __compareForms prefers over the current best.
             best = None
             for form in forms:
                 if best is None or self.__compareForms(best, form) > 0:
                     best = form
             result = Utils.ifNotNull(best.normal_full, best.normal_case)

     if result is None:
         return Utils.ifNotNull(self.term, "?")

     if LanguageHelper.endsWithEx(result, "АНЫЙ", "ЕНЫЙ", None, None):
         # "...АНЫЙ"/"...ЕНЫЙ" -> "...АННЫЙ"/"...ЕННЫЙ"
         return result[:-3] + "ННЫЙ"
     if LanguageHelper.endsWith(result, "ЙСЯ"):
         # Drop the trailing "СЯ".
         return result[:-2]
     if LanguageHelper.endsWith(result, "АНИЙ") and result == self.term:
         if any(wf.is_in_dictionary for wf in forms):
             return result
         # No dictionary form: "...АНИЙ" -> "...АНИЕ"
         return result[:-1] + "Е"
     return result

Example #4

Show file

File: MorphEngine.py Project: MihaJjDa/APCLtask

 def process(self, word : str) -> typing.List['MorphWordForm']:
     """ Process a single word.

     The word is first looked up by walking the tree rooted at
     ``self.m_root``; if the result looks insufficient, additional
     candidate forms are taken from the reversed tree
     ``self.m_root_reverce`` (matching the word from its end).

     Args:
         word(str): the word; it must be in upper case

     Returns:
         typing.List[MorphWordForm]: the word forms found, or None when the
         word is null/empty, when a word longer than one character contains
         no Cyrillic or Latin vowel, or when no forms remain
     """
     if (Utils.isNullOrEmpty(word)):
         return None
     res = None
     if (len(word) > 1):
         # A multi-character word without a single vowel is not analysed.
         i = 0
         while i < len(word):
             ch = word[i]
             if (LanguageHelper.isCyrillicVowel(ch) or LanguageHelper.isLatinVowel(ch)):
                 break
             i += 1
         if (i >= len(word)):
             return res
     mvs = [ ]
     # Forward pass: descend the tree one character at a time; at every
     # node that has rules, look up the remaining tail of the word.
     tn = self.m_root
     i = 0
     while i <= len(word):
         if (tn.lazy_pos > 0):
             self.__loadTreeNode(tn)
         if (tn.rules is not None):
             word_begin = None
             word_end = None
             if (i == 0):
                 word_end = word
             elif (i < len(word)):
                 word_end = word[i:]
             else:
                 word_end = ""
             if (res is None):
                 res = list()
             for r in tn.rules:
                 wrapmvs14 = RefOutArgWrapper(None)
                 inoutres15 = Utils.tryGetValue(r.variants, word_end, wrapmvs14)
                 mvs = wrapmvs14.value
                 if (inoutres15):
                     # The head of the word is computed lazily, once per node.
                     if (word_begin is None):
                         if (i == len(word)):
                             word_begin = word
                         elif (i > 0):
                             word_begin = word[0:0+i]
                         else:
                             word_begin = ""
                     r.processResult(res, word_begin, mvs)
         if (tn.nodes is None or i >= len(word)):
             break
         ch = ord(word[i])
         wraptn16 = RefOutArgWrapper(None)
         inoutres17 = Utils.tryGetValue(tn.nodes, ch, wraptn16)
         tn = wraptn16.value
         if (not inoutres17):
             break
         i += 1
     # Decide whether candidate forms from the reversed tree are still needed.
     need_test_unknown_vars = True
     if (res is not None):
         for r in res:
             if ((r.class0_.is_pronoun or r.class0_.is_noun or r.class0_.is_adjective) or (r.class0_.is_misc and r.class0_.is_conjunction) or r.class0_.is_preposition):
                 need_test_unknown_vars = False
             elif (r.class0_.is_adverb and r.normal_case is not None):
                 if (not LanguageHelper.endsWithEx(r.normal_case, "О", "А", None, None)):
                     need_test_unknown_vars = False
                 elif (r.normal_case == "МНОГО"):
                     need_test_unknown_vars = False
             elif (r.class0_.is_verb and len(res) > 1):
                 ok = False
                 for rr in res:
                     if (rr != r and rr.class0_ != r.class0_):
                         ok = True
                         break
                 if (ok and not LanguageHelper.endsWith(word, "ИМ")):
                     need_test_unknown_vars = False
     if (need_test_unknown_vars and LanguageHelper.isCyrillicChar(word[0])):
         # Require at least two vowels and two non-vowels in a Cyrillic word.
         gl = 0
         sog = 0
         j = 0
         while j < len(word):
             if (LanguageHelper.isCyrillicVowel(word[j])):
                 gl += 1
             else:
                 sog += 1
             j += 1
         if ((gl < 2) or (sog < 2)):
             need_test_unknown_vars = False
     if (need_test_unknown_vars and res is not None and len(res) == 1):
         if (res[0].class0_.is_verb):
             if ("н.вр." in res[0].misc.attrs and "нес.в." in res[0].misc.attrs and not "страд.з." in res[0].misc.attrs):
                 need_test_unknown_vars = False
             elif ("б.вр." in res[0].misc.attrs and "сов.в." in res[0].misc.attrs):
                 need_test_unknown_vars = False
             elif (res[0].normal_case is not None and LanguageHelper.endsWith(res[0].normal_case, "СЯ")):
                 need_test_unknown_vars = False
         if (res[0].class0_.is_undefined and "прдктв." in res[0].misc.attrs):
             need_test_unknown_vars = False
     if (need_test_unknown_vars):
         if (self.m_root_reverce is None):
             return res
         # Reverse pass: descend the reversed tree from the last character
         # until a node carrying reverce_variants is reached.
         tn = self.m_root_reverce
         tn0 = None
         for i in range(len(word) - 1, -1, -1):
             if (tn.lazy_pos > 0):
                 self.__loadTreeNode(tn)
             ch = ord(word[i])
             if (tn.nodes is None):
                 break
             wrapnext18 = RefOutArgWrapper(None)
             inoutres19 = Utils.tryGetValue(tn.nodes, ch, wrapnext18)
             next0_ = wrapnext18.value
             if (not inoutres19):
                 break
             tn = next0_
             if (tn.lazy_pos > 0):
                 self.__loadTreeNode(tn)
             if (tn.reverce_variants is not None):
                 tn0 = tn
                 break
         else: i = -1  # loop ran to completion: the whole word was consumed
         if (tn0 is not None):
             # Accept at once when the match stopped within the first four
             # characters; otherwise the unmatched head must contain a vowel.
             glas = i < 4
             while i >= 0:
                 if (LanguageHelper.isCyrillicVowel(word[i]) or LanguageHelper.isLatinVowel(word[i])):
                     glas = True
                     break
                 i -= 1
             if (glas):
                 for mv in tn0.reverce_variants:
                     if (((not mv.class0_.is_verb and not mv.class0_.is_adjective and not mv.class0_.is_noun) and not mv.class0_.is_proper_surname and not mv.class0_.is_proper_geo) and not mv.class0_.is_proper_secname):
                         continue
                     # Skip the variant when a dictionary form already covers it.
                     ok = False
                     # res is still None when no visited node had rules;
                     # iterating None would raise TypeError, so guard it.
                     if (res is not None):
                         for rr in res:
                             if (rr.is_in_dictionary):
                                 if (rr.class0_ == mv.class0_ or rr.class0_.is_noun):
                                     ok = True
                                     break
                                 if (not mv.class0_.is_adjective and rr.class0_.is_verb):
                                     ok = True
                                     break
                     if (ok):
                         continue
                     if (len(mv.tail) > 0 and not LanguageHelper.endsWith(word, mv.tail)):
                         continue
                     r = MorphWordForm(mv, word)
                     if (not MorphWordForm._hasMorphEquals(res, r)):
                         r.undef_coef = mv.coef
                         if (res is None):
                             res = list()
                         res.append(r)
     if (word == "ПРИ" and res is not None):
         # Special case: drop proper-geo forms for the word "ПРИ".
         # Descending indices keep deletion safe.
         for i in range(len(res) - 1, -1, -1):
             if (res[i].class0_.is_proper_geo):
                 del res[i]
     if (res is None or len(res) == 0):
         return None
     MorphEngine.__sort(res, word)
     for v in res:
         if (v.normal_case is None):
             v.normal_case = word
         if (v.class0_.is_verb):
             # "...ТЬСЯ" without a full normal form: use the form without "СЯ".
             if (v.normal_full is None and LanguageHelper.endsWith(v.normal_case, "ТЬСЯ")):
                 v.normal_full = v.normal_case[0:0+len(v.normal_case) - 2]
         v.language = self.language
         if (v.class0_.is_preposition):
             v.normal_case = LanguageHelper.normalizePreposition(v.normal_case)
     # Drop non-dictionary adjectives marked "к.ф." or "неизм." (while other
     # forms remain) and collect the classes of the dictionary forms.
     mc = MorphClass()
     for i in range(len(res) - 1, -1, -1):
         if (not res[i].is_in_dictionary and res[i].class0_.is_adjective and len(res) > 1):
             if ("к.ф." in res[i].misc.attrs or "неизм." in res[i].misc.attrs):
                 del res[i]
                 continue
         if (res[i].is_in_dictionary):
             mc.value |= res[i].class0_.value
     if (mc == MorphClass.VERB and len(res) > 1):
         # Dictionary forms are verbs only: reset large undef_coef on adjectives.
         for r in res:
             if (r.undef_coef > (100) and r.class0_ == MorphClass.ADJECTIVE):
                 r.undef_coef = (0)
     if (len(res) == 0):
         return None
     return res

Example #5

Show file

File: InnerMorphology.py Project: MihaJjDa/APCLtask

 def run(self, text: str, only_tokenizing: bool, dlang: 'MorphLang',
         progress: EventHandler,
         good_text: bool) -> typing.List['MorphToken']:
     """ Произвести морфологический анализ текста
     
     Args:
         text(str): исходный текст
         lang: язык (если null, то попробует определить)
     
     Returns:
         typing.List[MorphToken]: последовательность результирующих морфем
     """
     if (Utils.isNullOrEmpty(text)):
         return None
     twr = TextWrapper(text, good_text)
     twrch = twr.chars
     res = list()
     uni_lex = dict()
     term0 = None
     pure_rus_words = 0
     pure_ukr_words = 0
     pure_by_words = 0
     pure_kz_words = 0
     tot_rus_words = 0
     tot_ukr_words = 0
     tot_by_words = 0
     tot_kz_words = 0
     i = 0
     first_pass2708 = True
     while True:
         if first_pass2708: first_pass2708 = False
         else: i += 1
         if (not (i < twr.length)): break
         ty = InnerMorphology._getCharTyp(twrch[i])
         if (ty == 0):
             continue
         if (ty > 2):
             j = (i + 1)
         else:
             j = (i + 1)
             while j < twr.length:
                 if (InnerMorphology._getCharTyp(twrch[j]) != ty):
                     break
                 j += 1
         wstr = text[i:i + j - i]
         term = None
         if (good_text):
             term = wstr
         else:
             trstr = LanguageHelper.transliteralCorrection(
                 wstr, term0, False)
             term = LanguageHelper.correctWord(trstr)
         if (Utils.isNullOrEmpty(term)):
             i = (j - 1)
             continue
         lang = InnerMorphology.__detectLang(twr, i, j - 1, term)
         if (lang == MorphLang.UA):
             pure_ukr_words += 1
         elif (lang == MorphLang.RU):
             pure_rus_words += 1
         elif (lang == MorphLang.BY):
             pure_by_words += 1
         elif (lang == MorphLang.KZ):
             pure_kz_words += 1
         if ((((lang) & MorphLang.RU)) != MorphLang.UNKNOWN):
             tot_rus_words += 1
         if ((((lang) & MorphLang.UA)) != MorphLang.UNKNOWN):
             tot_ukr_words += 1
         if ((((lang) & MorphLang.BY)) != MorphLang.UNKNOWN):
             tot_by_words += 1
         if ((((lang) & MorphLang.KZ)) != MorphLang.UNKNOWN):
             tot_kz_words += 1
         if (ty == 1):
             term0 = term
         lemmas = None
         if (ty == 1 and not only_tokenizing):
             wraplemmas7 = RefOutArgWrapper(None)
             inoutres8 = Utils.tryGetValue(uni_lex, term, wraplemmas7)
             lemmas = wraplemmas7.value
             if (not inoutres8):
                 lemmas = InnerMorphology.UniLexWrap._new6(lang)
                 uni_lex[term] = lemmas
         tok = MorphToken()
         tok.term = term
         tok.begin_char = i
         if (i == 733860):
             pass
         tok.end_char = (j - 1)
         tok.tag = (lemmas)
         res.append(tok)
         i = (j - 1)
     def_lang = MorphLang(dlang)
     if (pure_rus_words > pure_ukr_words and pure_rus_words > pure_by_words
             and pure_rus_words > pure_kz_words):
         def_lang = MorphLang.RU
     elif (tot_rus_words > tot_ukr_words and tot_rus_words > tot_by_words
           and tot_rus_words > tot_kz_words):
         def_lang = MorphLang.RU
     elif (pure_ukr_words > pure_rus_words
           and pure_ukr_words > pure_by_words
           and pure_ukr_words > pure_kz_words):
         def_lang = MorphLang.UA
     elif (tot_ukr_words > tot_rus_words and tot_ukr_words > tot_by_words
           and tot_ukr_words > tot_kz_words):
         def_lang = MorphLang.UA
     elif (pure_kz_words > pure_rus_words and pure_kz_words > pure_ukr_words
           and pure_kz_words > pure_by_words):
         def_lang = MorphLang.KZ
     elif (tot_kz_words > tot_rus_words and tot_kz_words > tot_ukr_words
           and tot_kz_words > tot_by_words):
         def_lang = MorphLang.KZ
     elif (pure_by_words > pure_rus_words and pure_by_words > pure_ukr_words
           and pure_by_words > pure_kz_words):
         def_lang = MorphLang.BY
     elif (tot_by_words > tot_rus_words and tot_by_words > tot_ukr_words
           and tot_by_words > tot_kz_words):
         if (tot_rus_words > 10 and tot_by_words > (tot_rus_words + 20)):
             def_lang = MorphLang.BY
         elif (tot_rus_words == 0 or tot_by_words >= (tot_rus_words * 2)):
             def_lang = MorphLang.BY
     if (((def_lang.is_undefined or def_lang.is_ua)) and tot_rus_words > 0):
         if (((tot_ukr_words > tot_rus_words
               and InnerMorphology.M_ENGINE_UA.language.is_ua))
                 or ((tot_by_words > tot_rus_words
                      and InnerMorphology.M_ENGINE_BY.language.is_by))
                 or ((tot_kz_words > tot_rus_words
                      and InnerMorphology.M_ENGINE_KZ.language.is_kz))):
             cou0 = 0
             tot_kz_words = 0
             tot_ukr_words = tot_kz_words
             tot_by_words = tot_ukr_words
             tot_rus_words = tot_by_words
             for kp in uni_lex.items():
                 lang = MorphLang()
                 wraplang9 = RefOutArgWrapper(lang)
                 kp[1].word_forms = self.__processOneWord(kp[0], wraplang9)
                 lang = wraplang9.value
                 if (kp[1].word_forms is not None):
                     for wf in kp[1].word_forms:
                         lang |= wf.language
                 kp[1].lang = lang
                 if (lang.is_ru):
                     tot_rus_words += 1
                 if (lang.is_ua):
                     tot_ukr_words += 1
                 if (lang.is_by):
                     tot_by_words += 1
                 if (lang.is_kz):
                     tot_kz_words += 1
                 if (lang.is_cyrillic):
                     cou0 += 1
                 if (cou0 >= 100):
                     break
             if (tot_rus_words > ((math.floor(tot_by_words / 2)))
                     and tot_rus_words > ((math.floor(tot_ukr_words / 2)))):
                 def_lang = MorphLang.RU
             elif (tot_ukr_words > ((math.floor(tot_rus_words / 2)))
                   and tot_ukr_words > ((math.floor(tot_by_words / 2)))):
                 def_lang = MorphLang.UA
             elif (tot_by_words > ((math.floor(tot_rus_words / 2)))
                   and tot_by_words > ((math.floor(tot_ukr_words / 2)))):
                 def_lang = MorphLang.BY
         elif (def_lang.is_undefined):
             def_lang = MorphLang.RU
     cou = 0
     tot_kz_words = 0
     tot_ukr_words = tot_kz_words
     tot_by_words = tot_ukr_words
     tot_rus_words = tot_by_words
     for kp in uni_lex.items():
         lang = def_lang
         if (lang.is_undefined):
             if (tot_rus_words > tot_by_words
                     and tot_rus_words > tot_ukr_words
                     and tot_rus_words > tot_kz_words):
                 lang = MorphLang.RU
             elif (tot_ukr_words > tot_rus_words
                   and tot_ukr_words > tot_by_words
                   and tot_ukr_words > tot_kz_words):
                 lang = MorphLang.UA
             elif (tot_by_words > tot_rus_words
                   and tot_by_words > tot_ukr_words
                   and tot_by_words > tot_kz_words):
                 lang = MorphLang.BY
             elif (tot_kz_words > tot_rus_words
                   and tot_kz_words > tot_ukr_words
                   and tot_kz_words > tot_by_words):
                 lang = MorphLang.KZ
         wraplang10 = RefOutArgWrapper(lang)
         kp[1].word_forms = self.__processOneWord(kp[0], wraplang10)
         lang = wraplang10.value
         kp[1].lang = lang
         if ((((lang) & MorphLang.RU)) != MorphLang.UNKNOWN):
             tot_rus_words += 1
         if ((((lang) & MorphLang.UA)) != MorphLang.UNKNOWN):
             tot_ukr_words += 1
         if ((((lang) & MorphLang.BY)) != MorphLang.UNKNOWN):
             tot_by_words += 1
         if ((((lang) & MorphLang.KZ)) != MorphLang.UNKNOWN):
             tot_kz_words += 1
         if (progress is not None):
             self.__onProgress(cou, len(uni_lex), progress)
         cou += 1
     debug_token = None
     empty_list = None
     for r in res:
         uni = Utils.asObjectOrNull(r.tag, InnerMorphology.UniLexWrap)
         r.tag = None
         if (uni is None or uni.word_forms is None
                 or len(uni.word_forms) == 0):
             if (empty_list is None):
                 empty_list = list()
             r.word_forms = empty_list
             if (uni is not None):
                 r.language = uni.lang
         else:
             r.word_forms = uni.word_forms
         if (r.begin_char == 733860):
             debug_token = r
     if (not good_text):
         i = 0
         first_pass2709 = True
         while True:
             if first_pass2709: first_pass2709 = False
             else: i += 1
             if (not (i < (len(res) - 2))): break
             ui0 = twrch[res[i].begin_char]
             ui1 = twrch[res[i + 1].begin_char]
             ui2 = twrch[res[i + 2].begin_char]
             if (ui1.is_quot):
                 p = res[i + 1].begin_char
                 if ((p >= 2 and "БбТт".find(text[p - 1]) >= 0 and
                      ((p + 3) < len(text)))
                         and "ЕеЯяЁё".find(text[p + 1]) >= 0):
                     wstr = LanguageHelper.transliteralCorrection(
                         LanguageHelper.correctWord("{0}Ъ{1}".format(
                             res[i].getSourceText(text),
                             res[i + 2].getSourceText(text))), None, False)
                     li = self.__processOneWord0(wstr)
                     if (li is not None and len(li) > 0
                             and li[0].is_in_dictionary):
                         res[i].end_char = res[i + 2].end_char
                         res[i].term = wstr
                         res[i].word_forms = li
                         del res[i + 1:i + 1 + 2]
                 elif ((ui1.is_apos and p > 0 and str.isalpha(text[p - 1]))
                       and ((p + 1) < len(text))
                       and str.isalpha(text[p + 1])):
                     if (def_lang == MorphLang.UA
                             or (((res[i].language) & MorphLang.UA)) !=
                             MorphLang.UNKNOWN
                             or (((res[i + 2].language) & MorphLang.UA)) !=
                             MorphLang.UNKNOWN):
                         wstr = LanguageHelper.transliteralCorrection(
                             LanguageHelper.correctWord("{0}{1}".format(
                                 res[i].getSourceText(text),
                                 res[i + 2].getSourceText(text))), None,
                             False)
                         li = self.__processOneWord0(wstr)
                         okk = True
                         if (okk):
                             res[i].end_char = res[i + 2].end_char
                             res[i].term = wstr
                             if (li is None):
                                 li = list()
                             res[i].word_forms = li
                             if (li is not None and len(li) > 0):
                                 res[i].language = li[0].language
                             del res[i + 1:i + 1 + 2]
             elif (((ui1.uni_char == '3' or ui1.uni_char == '4'))
                   and res[i + 1].length == 1):
                 src = ("З" if ui1.uni_char == '3' else "Ч")
                 i0 = i + 1
                 if ((res[i].end_char + 1) == res[i + 1].begin_char
                         and ui0.is_cyrillic):
                     i0 -= 1
                     src = (res[i0].getSourceText(text) + src)
                 i1 = i + 1
                 if ((res[i + 1].end_char + 1) == res[i + 2].begin_char
                         and ui2.is_cyrillic):
                     i1 += 1
                     src += res[i1].getSourceText(text)
                 if (len(src) > 2):
                     wstr = LanguageHelper.transliteralCorrection(
                         LanguageHelper.correctWord(src), None, False)
                     li = self.__processOneWord0(wstr)
                     if (li is not None and len(li) > 0
                             and li[0].is_in_dictionary):
                         res[i0].end_char = res[i1].end_char
                         res[i0].term = wstr
                         res[i0].word_forms = li
                         del res[i0 + 1:i0 + 1 + i1 - i0]
             elif ((ui1.is_hiphen and ui0.is_letter and ui2.is_letter)
                   and res[i].end_char > res[i].begin_char
                   and res[i + 2].end_char > res[i + 2].begin_char):
                 newline = False
                 sps = 0
                 j = (res[i + 1].end_char + 1)
                 while j < res[i + 2].begin_char:
                     if (text[j] == '\r' or text[j] == '\n'):
                         newline = True
                         sps += 1
                     elif (not Utils.isWhitespace(text[j])):
                         break
                     else:
                         sps += 1
                     j += 1
                 full_word = LanguageHelper.correctWord(
                     res[i].getSourceText(text) +
                     res[i + 2].getSourceText(text))
                 if (not newline):
                     if (full_word in uni_lex or full_word == "ИЗЗА"):
                         newline = True
                     elif (text[res[i + 1].begin_char] == (chr(0x00AD))):
                         newline = True
                     elif (LanguageHelper.endsWithEx(
                             res[i].getSourceText(text), "О", "о", None,
                             None) and len(res[i + 2].word_forms) > 0
                           and res[i + 2].word_forms[0].is_in_dictionary):
                         if (text[res[i + 1].begin_char] == '¬'):
                             li = self.__processOneWord0(full_word)
                             if (li is not None and len(li) > 0
                                     and li[0].is_in_dictionary):
                                 newline = True
                     elif ((res[i].end_char + 2) == res[i + 2].begin_char):
                         if (not str.isupper(text[res[i + 2].begin_char])
                                 and (sps < 2) and len(full_word) > 4):
                             newline = True
                             if ((i + 3) < len(res)):
                                 ui3 = twrch[res[i + 3].begin_char]
                                 if (ui3.is_hiphen):
                                     newline = False
                     elif (((res[i].end_char + 1) == res[i + 1].begin_char
                            and sps > 0 and (sps < 3))
                           and len(full_word) > 4):
                         newline = True
                 if (newline):
                     li = self.__processOneWord0(full_word)
                     if (li is not None and len(li) > 0
                             and ((li[0].is_in_dictionary
                                   or full_word in uni_lex))):
                         res[i].end_char = res[i + 2].end_char
                         res[i].term = full_word
                         res[i].word_forms = li
                         del res[i + 1:i + 1 + 2]
                 else:
                     pass
             elif ((ui1.is_letter and ui0.is_letter and res[i].length > 2)
                   and res[i + 1].length > 1):
                 if (ui0.is_upper != ui1.is_upper):
                     continue
                 if (not ui0.is_cyrillic or not ui1.is_cyrillic):
                     continue
                 newline = False
                 j = (res[i].end_char + 1)
                 while j < res[i + 1].begin_char:
                     if (twrch[j].code == 0xD or twrch[j].code == 0xA):
                         newline = True
                         break
                     j += 1
                 if (not newline):
                     continue
                 full_word = LanguageHelper.correctWord(
                     res[i].getSourceText(text) +
                     res[i + 1].getSourceText(text))
                 if (not full_word in uni_lex):
                     continue
                 li = self.__processOneWord0(full_word)
                 if (li is not None and len(li) > 0
                         and li[0].is_in_dictionary):
                     res[i].end_char = res[i + 1].end_char
                     res[i].term = full_word
                     res[i].word_forms = li
                     del res[i + 1]
     i = 0
     first_pass2710 = True
     while True:
         if first_pass2710: first_pass2710 = False
         else: i += 1
         if (not (i < len(res))): break
         mt = res[i]
         mt.char_info = CharsInfo()
         ui0 = twrch[mt.begin_char]
         ui00 = UnicodeInfo.ALL_CHARS[ord((res[i].term[0]))]
         j = (mt.begin_char + 1)
         while j <= mt.end_char:
             if (ui0.is_letter):
                 break
             ui0 = twrch[j]
             j += 1
         if (ui0.is_letter):
             res[i].char_info.is_letter = True
             if (ui00.is_latin):
                 res[i].char_info.is_latin_letter = True
             elif (ui00.is_cyrillic):
                 res[i].char_info.is_cyrillic_letter = True
             if (res[i].language == MorphLang.UNKNOWN):
                 if (LanguageHelper.isCyrillic(mt.term)):
                     res[i].language = (MorphLang.RU if
                                        def_lang.is_undefined else def_lang)
             if (good_text):
                 continue
             all_up = True
             all_lo = True
             j = mt.begin_char
             while j <= mt.end_char:
                 if (twrch[j].is_upper or twrch[j].is_digit):
                     all_lo = False
                 else:
                     all_up = False
                 j += 1
             if (all_up):
                 mt.char_info.is_all_upper = True
             elif (all_lo):
                 mt.char_info.is_all_lower = True
             elif (((ui0.is_upper or twrch[mt.begin_char].is_digit))
                   and mt.end_char > mt.begin_char):
                 all_lo = True
                 j = (mt.begin_char + 1)
                 while j <= mt.end_char:
                     if (twrch[j].is_upper or twrch[j].is_digit):
                         all_lo = False
                         break
                     j += 1
                 if (all_lo):
                     mt.char_info.is_capital_upper = True
                 elif (twrch[mt.end_char].is_lower
                       and (mt.end_char - mt.begin_char) > 1):
                     all_up = True
                     j = mt.begin_char
                     while j < mt.end_char:
                         if (twrch[j].is_lower):
                             all_up = False
                             break
                         j += 1
                     if (all_up):
                         mt.char_info.is_last_lower = True
         if (mt.char_info.is_last_lower and mt.length > 2
                 and mt.char_info.is_cyrillic_letter):
             pref = text[mt.begin_char:mt.begin_char + mt.end_char -
                         mt.begin_char]
             ok = False
             for wf in mt.word_forms:
                 if (wf.normal_case == pref or wf.normal_full == pref):
                     ok = True
                     break
             if (not ok):
                 mt.word_forms = list(mt.word_forms)
                 mt.word_forms.insert(
                     0, MorphWordForm._new11(pref, MorphClass.NOUN, 1))
     if (good_text or only_tokenizing):
         return res
     i = 0
     first_pass2711 = True
     while True:
         if first_pass2711: first_pass2711 = False
         else: i += 1
         if (not (i < len(res))): break
         if (res[i].length == 1 and res[i].char_info.is_latin_letter):
             ch = res[i].term[0]
             if (ch == 'C' or ch == 'A' or ch == 'P'):
                 pass
             else:
                 continue
             is_rus = False
             for ii in range(i - 1, -1, -1):
                 if ((res[ii].end_char + 1) != res[ii + 1].begin_char):
                     break
                 elif (res[ii].char_info.is_letter):
                     is_rus = res[ii].char_info.is_cyrillic_letter
                     break
             if (not is_rus):
                 ii = i + 1
                 while ii < len(res):
                     if ((res[ii - 1].end_char + 1) != res[ii].begin_char):
                         break
                     elif (res[ii].char_info.is_letter):
                         is_rus = res[ii].char_info.is_cyrillic_letter
                         break
                     ii += 1
             if (is_rus):
                 res[i].term = LanguageHelper.transliteralCorrection(
                     res[i].term, None, True)
                 res[i].char_info.is_cyrillic_letter = True
                 res[i].char_info.is_latin_letter = True
     for r in res:
         if (r.char_info.is_all_upper or r.char_info.is_capital_upper):
             if (r.language.is_cyrillic):
                 ok = False
                 for wf in r.word_forms:
                     if (wf.class0_.is_proper_surname):
                         ok = True
                         break
                 if (not ok):
                     r.word_forms = list(r.word_forms)
                     InnerMorphology.M_ENGINE_RU.processSurnameVariants(
                         r.term, r.word_forms)
     for r in res:
         for mv in r.word_forms:
             if (mv.normal_case is None):
                 mv.normal_case = r.term
     i = 0
     while i < (len(res) - 2):
         if (res[i].char_info.is_latin_letter
                 and res[i].char_info.is_all_upper and res[i].length == 1):
             if (twrch[res[i + 1].begin_char].is_quot
                     and res[i + 2].char_info.is_latin_letter
                     and res[i + 2].length > 2):
                 if ((res[i].end_char + 1) == res[i + 1].begin_char and
                     (res[i + 1].end_char + 1) == res[i + 2].begin_char):
                     wstr = "{0}{1}".format(res[i].term, res[i + 2].term)
                     li = self.__processOneWord0(wstr)
                     if (li is not None):
                         res[i].word_forms = li
                     res[i].end_char = res[i + 2].end_char
                     res[i].term = wstr
                     if (res[i + 2].char_info.is_all_lower):
                         res[i].char_info.is_all_upper = False
                         res[i].char_info.is_capital_upper = True
                     elif (not res[i + 2].char_info.is_all_upper):
                         res[i].char_info.is_all_upper = False
                     del res[i + 1:i + 1 + 2]
         i += 1
     i = 0
     first_pass2712 = True
     while True:
         if first_pass2712: first_pass2712 = False
         else: i += 1
         if (not (i < (len(res) - 1))): break
         if (not res[i].char_info.is_letter
                 and not res[i + 1].char_info.is_letter
                 and (res[i].end_char + 1) == res[i + 1].begin_char):
             if (twrch[res[i].begin_char].is_hiphen
                     and twrch[res[i + 1].begin_char].is_hiphen):
                 if (i == 0 or not twrch[res[i - 1].begin_char].is_hiphen):
                     pass
                 else:
                     continue
                 if ((i + 2) == len(res)
                         or not twrch[res[i + 2].begin_char].is_hiphen):
                     pass
                 else:
                     continue
                 res[i].end_char = res[i + 1].end_char
                 del res[i + 1]
     return res

Example #6

Show file

File: CityAttachHelper.py Project: MihaJjDa/APCLtask

 def __try1(li: typing.List['CityItemToken'], oi: 'IntOntologyItem',
            ad: 'AnalyzerDataWithOntology') -> 'ReferentToken':
     """Try to build a city ``ReferentToken`` from the head of ``li``.

     Args:
         li: candidate city items; ``li[0]`` is the name item and ``li[1]``
             may be a type noun. NOTE: the list is mutated - unconsumed
             trailing items are deleted once the match is accepted, and
             ``li[0].value`` may be overwritten with a re-normalised name.
         oi: output holder; ``oi.value`` is first reset to ``None`` and then
             set to ``li[0].onto_item``.
         ad: analyzer data; when not ``None`` its ``local_ontology`` is
             compared with the owner of the ontology item.

     Returns:
         The resulting ``ReferentToken``, or ``None`` when the items are not
         accepted as a city.
     """
     oi.value = None
     if (li is None or (len(li) < 1)):
         return None
     elif (li[0].typ != CityItemToken.ItemType.CITY):
         # A non-CITY head is accepted only as exactly "<proper name> <noun>".
         if (len(li) != 2 or li[0].typ != CityItemToken.ItemType.PROPERNAME
                 or li[1].typ != CityItemToken.ItemType.NOUN):
             return None
     # i counts the leading items of li that have been consumed.
     i = 1
     oi.value = li[0].onto_item
     ok = not li[0].doubtful
     if ((ok and li[0].onto_item is not None
          and li[0].onto_item.misc_attr is None) and ad is not None):
         # Ontology item owned by neither the local nor an external ontology:
         # keep trusting it only when the preceding token is "В".
         if (li[0].onto_item.owner != ad.local_ontology
                 and not li[0].onto_item.owner.is_ext_ontology):
             if (li[0].begin_token.previous is not None
                     and li[0].begin_token.previous.isValue("В", None)):
                 pass
             else:
                 ok = False
     if (len(li) == 1 and li[0].begin_token.morph.class0_.is_adjective):
         # A lone adjective that parses as "<something> <street noun>" is
         # rejected here.
         sits = StreetItemToken.tryParseList(li[0].begin_token, None, 3)
         if (sits is not None and len(sits) == 2
                 and sits[1].typ == StreetItemType.NOUN):
             return None
     typ = None
     alttyp = None
     # The item that supplied ``typ``. It is kept separately because li may
     # be truncated below while its morphology is still needed afterwards.
     noun_item = None
     mc = li[0].morph
     if (i < len(li)):
         if (li[i].typ == CityItemToken.ItemType.NOUN):
             at = None
             if (not li[i].chars.is_all_lower
                     and (li[i].whitespaces_after_count < 2)):
                 # The noun may actually start an address; ``at`` stays set
                 # (blocking its use as a city type) unless what follows the
                 # noun is itself a STREET address item.
                 sit = StreetItemToken.tryParse(li[i].end_token.next0_,
                                                None, False, None, False)
                 if (sit is not None and sit.typ == StreetItemType.NOUN):
                     at = AddressItemToken.tryParse(li[i].begin_token, None,
                                                    False, False, None)
                     if (at is not None):
                         at2 = AddressItemToken.tryParse(
                             li[i].end_token.next0_, None, False, False,
                             None)
                         if (at2 is not None and at2.typ
                                 == AddressItemToken.ItemType.STREET):
                             at = None
             if (at is None):
                 noun_item = li[i]
                 typ = li[i].value
                 alttyp = li[i].alt_value
                 if (li[i].begin_token.isValue("СТ", None)
                         and li[i].begin_token.chars.is_all_upper):
                     return None
                 if ((i + 1) == len(li)):
                     # The noun is the last item: accept and consume it.
                     ok = True
                     if (not li[i].morph.case_.is_undefined):
                         mc = li[i].morph
                     i += 1
                 elif (ok):
                     i += 1
                 else:
                     # Not trusted yet: accept when the token right before
                     # the name is one of the listed head-of-city words.
                     tt0 = li[0].begin_token.previous
                     if ((isinstance(tt0, TextToken))
                             and (tt0.whitespaces_after_count < 3)):
                         if (tt0.isValue("МЭР", "МЕР")
                                 or tt0.isValue("ГЛАВА", None)
                                 or tt0.isValue("ГРАДОНАЧАЛЬНИК", None)):
                             ok = True
                             i += 1
     if (not ok and oi.value is not None
             and (len(oi.value.canonic_text) < 4)):
         return None
     if (not ok and li[0].begin_token.morph.class0_.is_proper_name):
         return None
     if (not ok):
         if (not MiscHelper.isExistsInDictionary(
                 li[0].begin_token, li[0].end_token, (MorphClass.ADJECTIVE)
                 | MorphClass.NOUN | MorphClass.PRONOUN)):
             # Not a dictionary word: rely on neighbouring geo objects.
             ok = (li[0].geo_object_before or li[i - 1].geo_object_after)
             if (ok and li[0].begin_token == li[0].end_token):
                 mcc = li[0].begin_token.getMorphClassInDictionary()
                 if (mcc.is_proper_name or mcc.is_proper_surname):
                     ok = False
                 elif (li[0].geo_object_before
                       and (li[0].whitespaces_after_count < 2)):
                     ad1 = AddressItemToken.tryParse(
                         li[0].begin_token, None, False, False, None)
                     if (ad1 is not None and ad1.typ
                             == AddressItemToken.ItemType.STREET):
                         ad2 = AddressItemToken.tryParse(
                             li[0].end_token.next0_, None, False, False,
                             None)
                         if (ad2 is None or ad2.typ !=
                                 AddressItemToken.ItemType.STREET):
                             ok = False
                     elif (AddressItemToken.tryAttachOrg(li[0].begin_token)
                           is not None):
                         ok = False
         if (ok):
             if (li[0].kit.processReferent("PERSON", li[0].begin_token)
                     is not None):
                 ok = False
     if (not ok):
         ok = CityAttachHelper.checkYearAfter(li[0].end_token.next0_)
     if (not ok and ((not li[0].begin_token.morph.class0_.is_adjective
                      or li[0].begin_token != li[0].end_token))):
         ok = CityAttachHelper.checkCityAfter(li[0].end_token.next0_)
     if (not ok):
         return None
     # Drop the items that were not consumed.
     if (i < len(li)):
         del li[i:]
     rt = None
     if (oi.value is None):
         if (li[0].value is not None and li[0].higher_geo is not None):
             cap = GeoReferent()
             cap._addName(li[0].value)
             cap._addTypCity(li[0].kit.base_language)
             cap.higher = li[0].higher_geo
             if (typ is not None):
                 cap._addTyp(typ)
             if (alttyp is not None):
                 cap._addTyp(alttyp)
             rt = ReferentToken(cap, li[0].begin_token, li[0].end_token)
         else:
             if (li[0].value is None):
                 return None
             if (typ is None):
                 # Without a type noun the name is accepted only as
                 # "<geo referent>-<name>".
                 if ((len(li) == 1
                      and li[0].begin_token.previous is not None
                      and li[0].begin_token.previous.is_hiphen) and
                     (isinstance(li[0].begin_token.previous.previous,
                                 ReferentToken)) and
                     (isinstance(
                         li[0].begin_token.previous.previous.getReferent(),
                         GeoReferent))):
                     pass
                 else:
                     return None
             else:
                 if (not LanguageHelper.endsWithEx(typ, "ПУНКТ",
                                                   "ПОСЕЛЕНИЕ", "ПОСЕЛЕННЯ",
                                                   "ПОСЕЛОК")):
                     if (not LanguageHelper.endsWith(typ, "CITY")):
                         if (typ == "СТАНЦИЯ" and
                             ((MiscLocationHelper.checkGeoObjectBefore(
                                 li[0].begin_token)))):
                             pass
                         elif (len(li) > 1
                               and li[1].typ == CityItemToken.ItemType.NOUN
                               and li[0].typ
                               == CityItemToken.ItemType.CITY):
                             pass
                         else:
                             return None
                 if (li[0].begin_token.morph.class0_.is_adjective):
                     # Agree the adjective name with the type noun. The noun
                     # is read through noun_item rather than li[1]: li[1]
                     # has been deleted above when the noun was not consumed
                     # (i stayed 1), and indexing it raised IndexError.
                     # typ is not None in this branch, so noun_item is set.
                     li[0].value = ProperNameHelper.getNameEx(
                         li[0].begin_token, li[0].end_token,
                         MorphClass.ADJECTIVE, noun_item.morph.case_,
                         noun_item.morph.gender, False, False)
     elif (isinstance(oi.value.referent, GeoReferent)):
         rt = ReferentToken._new719(
             Utils.asObjectOrNull(oi.value.referent, GeoReferent),
             li[0].begin_token, li[len(li) - 1].end_token, mc)
     elif (typ is None):
         typ = oi.value.typ
     if (rt is None):
         city = GeoReferent()
         city._addName(
             (li[0].value if oi.value is None else oi.value.canonic_text))
         if (typ is not None):
             city._addTyp(typ)
         else:
             city._addTypCity(li[0].kit.base_language)
         if (alttyp is not None):
             city._addTyp(alttyp)
         rt = ReferentToken._new719(city, li[0].begin_token,
                                    li[len(li) - 1].end_token, mc)
     if ((isinstance(rt.referent, GeoReferent)) and len(li) == 1
             and (rt.referent).is_city):
         # Absorb an adjacent "Г" / "Г." token before or after the name into
         # the token span.
         if (rt.begin_token.previous is not None
                 and rt.begin_token.previous.isValue("Г", None)):
             rt.begin_token = rt.begin_token.previous
         elif ((rt.begin_token.previous is not None
                and rt.begin_token.previous.isChar('.')
                and rt.begin_token.previous.previous is not None)
               and rt.begin_token.previous.previous.isValue("Г", None)):
             rt.begin_token = rt.begin_token.previous.previous
         elif (rt.end_token.next0_ is not None
               and (rt.whitespaces_after_count < 2)
               and rt.end_token.next0_.isValue("Г", None)):
             rt.end_token = rt.end_token.next0_
             if (rt.end_token.next0_ is not None
                     and rt.end_token.next0_.isChar('.')):
                 rt.end_token = rt.end_token.next0_
     return rt

Example #7

Show file

File: NounPhraseItem.py Project: MihaJjDa/APCLtask

 def tryParse(t: 'Token', items: typing.List['NounPhraseItem'],
              attrs: 'NounPhraseParseAttr') -> 'NounPhraseItem':
     """Try to read one noun-phrase element (adjective and/or noun) at ``t``.

     Args:
         t: token to start from; ``None`` yields ``None``.
         items: elements already collected for the current phrase (may be
             ``None`` or empty); candidate morphological variants are
             checked against them with ``tryAccordVariant``.
         attrs: ``NounPhraseParseAttr`` flags; this function tests
             IGNOREPARTICIPLES, REFERENTCANBENOUN and
             PARSENUMERICASADJECTIVE.

     Returns:
         A ``NounPhraseItem`` whose ``can_be_adj`` / ``can_be_noun`` flags and
         ``adj_morph`` / ``noun_morph`` lists describe the accepted variants
         (both flags may be unset), or ``None`` when the token is rejected.
     """
     if (t is None):
         return None
     t0 = t
     _can_be_surname = False
     _is_doubt_adj = False
     rt = Utils.asObjectOrNull(t, ReferentToken)
     # A referent token that wraps exactly one token: parse the inner token
     # and stretch the result over the referent token itself.
     if (rt is not None and rt.begin_token == rt.end_token):
         res = NounPhraseItem.tryParse(rt.begin_token, items, attrs)
         if (res is not None):
             res.begin_token = res.end_token = t
             return res
     # A referent token after earlier items is taken as a noun; every morph
     # variant gets the referent's string form as its normal value.
     if (rt is not None and items is not None and len(items) > 0):
         res = NounPhraseItem(t, t)
         for m in t.morph.items:
             v = NounPhraseItemTextVar(m, None)
             v.normal_value = str(t.getReferent())
             res.noun_morph.append(v)
         res.can_be_noun = True
         return res
     # No special handling for number tokens at this point (no-op).
     if (isinstance(t, NumberToken)):
         pass
     has_legal_verb = False
     # Text tokens go through a series of rejection filters first.
     if (isinstance(t, TextToken)):
         if (not t.chars.is_letter):
             return None
         str0_ = (t).term
         # Words ending in 'А' / 'О': inspect their dictionary word forms.
         if (str0_[len(str0_) - 1] == 'А' or str0_[len(str0_) - 1] == 'О'):
             for wf in t.morph.items:
                 if ((isinstance(wf, MorphWordForm))
                         and (wf).is_in_dictionary):
                     if (wf.class0_.is_verb):
                         mc = t.getMorphClassInDictionary()
                         # Verb form that is not also a dictionary noun, with
                         # IGNOREPARTICIPLES not set: reject unless the term
                         # ends in "ОГО" / "ЕГО".
                         if (not mc.is_noun and
                             (((attrs) &
                               (NounPhraseParseAttr.IGNOREPARTICIPLES)))
                                 == (NounPhraseParseAttr.NO)):
                             if (not LanguageHelper.endsWithEx(
                                     str0_, "ОГО", "ЕГО", None, None)):
                                 return None
                         has_legal_verb = True
                     if (wf.class0_.is_adverb):
                         # Adverb not followed by a hyphen: reject unless it
                         # is one of the listed exceptions.
                         if (t.next0_ is None or not t.next0_.is_hiphen):
                             if ((str0_ == "ВСЕГО" or str0_ == "ДОМА"
                                  or str0_ == "НЕСКОЛЬКО")
                                     or str0_ == "МНОГО"
                                     or str0_ == "ПОРЯДКА"):
                                 pass
                             else:
                                 return None
                     if (wf.class0_.is_adjective):
                         # "к.ф." presumably marks a short-form adjective -
                         # TODO confirm. Such a form is flagged doubtful
                         # unless the dictionary class is exactly ADJECTIVE.
                         if (wf.containsAttr("к.ф.", None)):
                             if (t.getMorphClassInDictionary() ==
                                     MorphClass.ADJECTIVE):
                                 pass
                             else:
                                 _is_doubt_adj = True
         mc0 = t.morph.class0_
         # Possible surname (not written all in lower case).
         if (mc0.is_proper_surname and not t.chars.is_all_lower):
             for wf in t.morph.items:
                 if (wf.class0_.is_proper_surname
                         and wf.number != MorphNumber.PLURAL):
                     wff = Utils.asObjectOrNull(wf, MorphWordForm)
                     if (wff is None):
                         continue
                     s = Utils.ifNotNull((Utils.ifNotNull(
                         wff.normal_full, wff.normal_case)), "")
                     # Endings "ИН"/"ЕН"/"ЫН": a dictionary surname rejects
                     # the token, a non-dictionary one only sets the flag.
                     if (LanguageHelper.endsWithEx(s, "ИН", "ЕН", "ЫН",
                                                   None)):
                         if (not wff.is_in_dictionary):
                             _can_be_surname = True
                         else:
                             return None
                     if (wff.is_in_dictionary
                             and LanguageHelper.endsWith(s, "ОВ")):
                         _can_be_surname = True
         # Possible personal name (not written all in lower case).
         if (mc0.is_proper_name and not t.chars.is_all_lower):
             for wff in t.morph.items:
                 wf = Utils.asObjectOrNull(wff, MorphWordForm)
                 if (wf is None):
                     continue
                 if (wf.normal_case == "ГОР"):
                     continue
                 if (wf.class0_.is_proper_name and wf.is_in_dictionary):
                     if (wf.normal_case is None
                             or not wf.normal_case.startswith("ЛЮБ")):
                         # A dictionary name is tolerated when the token has
                         # the "неизм." attribute as an adjective, when
                         # REFERENTCANBENOUN is set, or when the phrase
                         # already starts with a standard adjective.
                         if (mc0.is_adjective
                                 and t.morph.containsAttr("неизм.", None)):
                             pass
                         elif (
                             (((attrs) &
                               (NounPhraseParseAttr.REFERENTCANBENOUN))
                              ) == (NounPhraseParseAttr.REFERENTCANBENOUN)):
                             pass
                         else:
                             if (items is None or (len(items) < 1)):
                                 return None
                             if (not items[0].is_std_adjective):
                                 return None
         # Single-variant adjective carrying the "в.ср.ст." attribute
         # (presumably a comparative degree - TODO confirm) is rejected.
         if (mc0.is_adjective and t.morph.items_count == 1):
             if (t.morph.getIndexerItem(0).containsAttr("в.ср.ст.", None)):
                 return None
         mc1 = t.getMorphClassInDictionary()
         # A word whose dictionary class is exactly VERB is rejected.
         if (mc1 == MorphClass.VERB):
             return None
         # With IGNOREPARTICIPLES set, a verb-only token is rejected when a
         # verb form has the "дейст.з." attribute, unless the term ends in
         # "СЯ".
         if (((((attrs) & (NounPhraseParseAttr.IGNOREPARTICIPLES)))
              == (NounPhraseParseAttr.IGNOREPARTICIPLES)
              and t.morph.class0_.is_verb and not t.morph.class0_.is_noun)
                 and not t.morph.class0_.is_proper):
             for wf in t.morph.items:
                 if (wf.class0_.is_verb):
                     if (wf.containsAttr("дейст.з.", None)):
                         if (LanguageHelper.endsWith((t).term, "СЯ")):
                             pass
                         else:
                             return None
     t1 = None
     # Up to two passes. Pass 0 may analyse the part after a hyphen
     # ("t0-t"); pass 1 retries with t0 itself, or with t1 when it was set
     # by the "БИЗНЕС" case at the end of the loop body.
     for k in range(2):
         t = (Utils.ifNotNull(t1, t0))
         if (k == 0):
             # "t0-<next>" written without a space before the hyphen: take
             # the morphology from the word after the hyphen.
             if ((((isinstance(t0, TextToken))) and t0.next0_ is not None
                  and t0.next0_.is_hiphen)
                     and t0.next0_.next0_ is not None):
                 if (not t0.is_whitespace_after
                         and not t0.morph.class0_.is_pronoun):
                     if (not t0.next0_.is_whitespace_after):
                         t = t0.next0_.next0_
                     elif (t0.next0_.next0_.chars.is_all_lower
                           and LanguageHelper.endsWith((t0).term, "О")):
                         t = t0.next0_.next0_
         it = NounPhraseItem._new470(t0, t, _can_be_surname)
         if (t0 == t and (isinstance(t0, ReferentToken))):
             it.can_be_noun = True
             it.morph = MorphCollection(t0.morph)
         can_be_prepos = False
         # Classify every morphological variant of t as adjective-like
         # and/or noun-like.
         for v in t.morph.items:
             wf = Utils.asObjectOrNull(v, MorphWordForm)
             if (v.class0_.is_preposition):
                 can_be_prepos = True
             # Adjective-like: adjective, non-personal pronoun, or a noun
             # variant of a number token.
             if (v.class0_.is_adjective
                     or ((v.class0_.is_pronoun
                          and not v.class0_.is_personal_pronoun)) or
                 ((v.class0_.is_noun and (isinstance(t, NumberToken))))):
                 if (NounPhraseItem.tryAccordVariant(
                         items, (0 if items is None else len(items)), v)):
                     is_doub = False
                     if (v.containsAttr("к.ф.", None)):
                         continue
                     # "собир." variant of a non-number token: a dictionary
                     # form rejects the whole token, otherwise the variant
                     # is skipped.
                     if (v.containsAttr("собир.", None)
                             and not ((isinstance(t, NumberToken)))):
                         if (wf is not None and wf.is_in_dictionary):
                             return None
                         continue
                     if (v.containsAttr("сравн.", None)):
                         continue
                     ok = True
                     if (isinstance(t, TextToken)):
                         s = (t).term
                         # Words never accepted as adjectives here.
                         if (s == "ПРАВО" or s == "ПРАВА"):
                             ok = False
                         elif (LanguageHelper.endsWith(s, "ОВ")
                               and t.getMorphClassInDictionary().is_noun):
                             ok = False
                         elif (wf is not None
                               and ((wf.normal_case == "САМ"
                                     or wf.normal_case == "ТО"))):
                             ok = False
                     elif (isinstance(t, NumberToken)):
                         if (v.class0_.is_noun
                                 and t.morph.class0_.is_adjective):
                             ok = False
                         elif (t.morph.class0_.is_noun and ((
                             (attrs) &
                             (NounPhraseParseAttr.PARSENUMERICASADJECTIVE)))
                               == (NounPhraseParseAttr.NO)):
                             ok = False
                     if (ok):
                         it.adj_morph.append(NounPhraseItemTextVar(v, t))
                         it.can_be_adj = True
                         if (_is_doubt_adj and t0 == t):
                             it.is_doubt_adjective = True
                         if (has_legal_verb and wf is not None
                                 and wf.is_in_dictionary):
                             it.can_be_noun = True
             # Noun-like check for the same variant.
             can_be_noun_ = False
             if (isinstance(t, NumberToken)):
                 pass
             elif (v.class0_.is_noun
                   or ((wf is not None and wf.normal_case == "САМ"))):
                 can_be_noun_ = True
             elif (v.class0_.is_personal_pronoun):
                 if (items is None or len(items) == 0):
                     can_be_noun_ = True
                 else:
                     # A personal pronoun after a verb item rejects the
                     # whole token.
                     for it1 in items:
                         if (it1.is_verb):
                             return None
                     if (len(items) == 1):
                         if (items[0].can_be_adj_for_personal_pronoun):
                             can_be_noun_ = True
             # Selected pronouns may stand as the noun when nothing, or only
             # an adjective allowed before personal pronouns, precedes them.
             elif ((v.class0_.is_pronoun and
                    ((items is None or len(items) == 0 or
                      ((len(items) == 1
                        and items[0].can_be_adj_for_personal_pronoun))))
                    and wf is not None) and
                   ((((wf.normal_case == "ТОТ" or wf.normal_full == "ТО"
                       or wf.normal_case == "ТО") or wf.normal_case == "ЭТО"
                      or wf.normal_case == "ВСЕ") or wf.normal_case == "ЧТО"
                     or wf.normal_case == "КТО"))):
                 if (wf.normal_case == "ВСЕ"):
                     # "ВСЕ" followed by "РАВНО" rejects the token.
                     if (t.next0_ is not None
                             and t.next0_.isValue("РАВНО", None)):
                         return None
                 can_be_noun_ = True
             elif (wf is not None and ((Utils.ifNotNull(
                     wf.normal_full, wf.normal_case))) == "КОТОРЫЙ"):
                 return None
             elif (v.class0_.is_proper and (isinstance(t, TextToken))):
                 if (t.length_char > 4 or v.class0_.is_proper_name):
                     can_be_noun_ = True
             if (can_be_noun_):
                 if (NounPhraseItem.tryAccordVariant(
                         items, (0 if items is None else len(items)), v)):
                     it.noun_morph.append(NounPhraseItemTextVar(v, t))
                     it.can_be_noun = True
         # The variants came from a token other than t0: let each variant
         # account for t0 as a prefix.
         if (t0 != t):
             for v in it.adj_morph:
                 v.correctPrefix(Utils.asObjectOrNull(t0, TextToken), False)
             for v in it.noun_morph:
                 v.correctPrefix(Utils.asObjectOrNull(t0, TextToken), True)
         # Second pass produced a pure noun: extend the item over the
         # following part and join the normal values with a hyphen.
         if (k == 1 and it.can_be_noun and not it.can_be_adj):
             if (t1 is not None):
                 it.end_token = t1
             else:
                 it.end_token = t0.next0_.next0_
             for v in it.noun_morph:
                 if (v.normal_value is not None
                         and (v.normal_value.find('-') < 0)):
                     v.normal_value = "{0}-{1}".format(
                         v.normal_value,
                         it.end_token.getNormalCaseText(
                             None, False, MorphGender.UNDEFINED, False))
         if (it.can_be_adj):
             if (NounPhraseItem.__m_std_adjectives.tryParse(
                     it.begin_token, TerminParseAttr.NO) is not None):
                 it.is_std_adjective = True
         # The token may also be a preposition: reject the noun reading when
         # a noun phrase can be parsed that supports the preposition reading.
         if (can_be_prepos and it.can_be_noun):
             if (items is not None and len(items) > 0):
                 npt1 = NounPhraseHelper.tryParse(
                     t,
                     Utils.valToEnum((NounPhraseParseAttr.PARSEPREPOSITION)
                                     | (NounPhraseParseAttr.PARSEPRONOUNS) |
                                     (NounPhraseParseAttr.PARSEVERBS),
                                     NounPhraseParseAttr), 0)
                 if (npt1 is not None and npt1.end_char > t.end_char):
                     return None
             else:
                 npt1 = NounPhraseHelper.tryParse(
                     t.next0_,
                     Utils.valToEnum((NounPhraseParseAttr.PARSEPRONOUNS) |
                                     (NounPhraseParseAttr.PARSEVERBS),
                                     NounPhraseParseAttr), 0)
                 if (npt1 is not None):
                     # Reject when the following phrase can be in a case
                     # returned for this preposition.
                     mc = LanguageHelper.getCaseAfterPreposition((t).lemma)
                     if (not ((mc) & npt1.morph.case_).is_undefined):
                         return None
         # Something was recognised, or this is the last pass: return.
         if (it.can_be_noun or it.can_be_adj or k == 1):
             if (it.begin_token.morph.class0_.is_pronoun):
                 # Attach a following particle to a pronoun, optionally
                 # across a hyphen written without spaces.
                 tt2 = it.end_token.next0_
                 if ((tt2 is not None and tt2.is_hiphen
                      and not tt2.is_whitespace_after)
                         and not tt2.is_whitespace_before):
                     tt2 = tt2.next0_
                 if (isinstance(tt2, TextToken)):
                     ss = (tt2).term
                     if ((ss == "ЖЕ" or ss == "БЫ" or ss == "ЛИ")
                             or ss == "Ж"):
                         it.end_token = tt2
                     elif (ss == "НИБУДЬ" or ss == "ЛИБО"
                           or (((ss == "ТО" and tt2.previous.is_hiphen))
                               and it.can_be_adj)):
                         it.end_token = tt2
                         # These particles become part of the normal forms.
                         for m in it.adj_morph:
                             m.normal_value = "{0}-{1}".format(
                                 m.normal_value, ss)
                             if (m.single_number_value is not None):
                                 m.single_number_value = "{0}-{1}".format(
                                     m.single_number_value, ss)
             return it
         if (t0 == t):
             # "БИЗНЕС" followed by a token with the same chars info: retry
             # on the second pass using the next token.
             if (t0.isValue("БИЗНЕС", None) and t0.next0_ is not None
                     and t0.next0_.chars == t0.chars):
                 t1 = t0.next0_
                 continue
             return it
     return None

Example #8

Show file

 def __TryParse(t: 'Token',
                prev: 'TransItemToken',
                after_conj: bool,
                attach_high: bool = False) -> 'TransItemToken':
     if (t is None):
         return None
     t1 = t
     if (t1.isChar(',')):
         t1 = t1.next0_
     if (t1 is not None and t1.isValue("ПРИНАДЛЕЖАТЬ", "НАЛЕЖАТИ")):
         t1 = t1.next0_
     if (isinstance(t1, ReferentToken)):
         if (t1.getReferent().type_name == "ORGANIZATION"):
             return TransItemToken._new2521(t, t1, TransItemToken.Typs.ORG,
                                            t1.getReferent(), t1.morph)
     route = False
     if (t1 is not None and ((t1.isValue("СЛЕДОВАТЬ", "СЛІДУВАТИ")
                              or t1.isValue("ВЫПОЛНЯТЬ", "ВИКОНУВАТИ")))):
         t1 = t1.next0_
         route = True
     if (t1 is not None and t1.morph.class0_.is_preposition):
         t1 = t1.next0_
     if (t1 is not None and
         ((t1.isValue("РЕЙС", None) or t1.isValue("МАРШРУТ", None)))):
         t1 = t1.next0_
         route = True
     if (isinstance(t1, ReferentToken)):
         if (isinstance(t1.getReferent(), GeoReferent)):
             geo_ = Utils.asObjectOrNull(t1.getReferent(), GeoReferent)
             if (geo_.is_state or geo_.is_city):
                 tit = TransItemToken._new2522(t, t1,
                                               TransItemToken.Typs.ROUTE,
                                               list())
                 tit.route_items.append(geo_)
                 t1 = t1.next0_
                 first_pass3132 = True
                 while True:
                     if first_pass3132: first_pass3132 = False
                     else: t1 = t1.next0_
                     if (not (t1 is not None)): break
                     if (t1.is_hiphen):
                         continue
                     if (t1.morph.class0_.is_preposition
                             or t1.morph.class0_.is_conjunction):
                         continue
                     geo_ = (Utils.asObjectOrNull(t1.getReferent(),
                                                  GeoReferent))
                     if (geo_ is None):
                         break
                     if (not geo_.is_city and not geo_.is_state):
                         break
                     tit.route_items.append(geo_)
                     tit.end_token = t1
                 if (len(tit.route_items) > 1 or route):
                     return tit
         elif ((isinstance(t1.getReferent(), DateReferent))
               and (t1.whitespaces_before_count < 3)):
             tit = TransItemToken._new2523(t, t1, TransItemToken.Typs.DATE,
                                           t1.getReferent())
             if (t1.next0_ is not None):
                 if (t1.next0_.isValue("В", None)
                         and t1.next0_.next0_ is not None
                         and t1.next0_.next0_.isChar('.')):
                     tit.end_token = t1.next0_.next0_
                 elif (t1.next0_.isValue("ВЫП", None)
                       or t1.next0_.isValue("ВЫПУСК", None)):
                     tit.end_token = t1.next0_
                     if (t1.next0_.next0_ is not None
                             and t1.next0_.next0_.isChar('.')):
                         tit.end_token = t1.next0_.next0_
             return tit
     if (isinstance(t, TextToken)):
         num = MiscHelper.checkNumberPrefix(t)
         if (num is not None):
             tit = TransItemToken.__attachRusAutoNumber(num)
             if (tit is None):
                 tit = TransItemToken._attachNumber(num, False)
             if (tit is not None):
                 tit.begin_token = t
                 return tit
         tok = TransItemToken.M_ONTOLOGY.tryParse(t, TerminParseAttr.NO)
         if (tok is None and ((t.isValue("С", None) or t.isValue("C", None)
                               or t.isValue("ЗА", None)))):
             tok = TransItemToken.M_ONTOLOGY.tryParse(
                 t.next0_, TerminParseAttr.NO)
         if (tok is None and BracketHelper.isBracket(t, True)):
             tok1 = TransItemToken.M_ONTOLOGY.tryParse(
                 t.next0_, TerminParseAttr.NO)
             if (tok1 is not None and BracketHelper.isBracket(
                     tok1.end_token.next0_, True)):
                 tok = tok1
                 tok.begin_token = t
                 tok.end_token = tok.end_token.next0_
                 tok.begin_token = t
             elif (tok1 is not None):
                 tt = Utils.asObjectOrNull(tok1.termin,
                                           TransItemToken.TransTermin)
                 if (tt.typ == TransItemToken.Typs.BRAND):
                     tok = tok1
                     tok.begin_token = t
         if (tok is None and t.isValue("МАРКА", None)):
             res1 = TransItemToken.__TryParse(t.next0_, prev, after_conj,
                                              False)
             if (res1 is not None):
                 if (res1.typ == TransItemToken.Typs.NAME
                         or res1.typ == TransItemToken.Typs.BRAND):
                     res1.begin_token = t
                     res1.typ = TransItemToken.Typs.BRAND
                     return res1
         if (tok is not None):
             tt = Utils.asObjectOrNull(tok.termin,
                                       TransItemToken.TransTermin)
             if (tt.typ == TransItemToken.Typs.NUMBER):
                 tit = TransItemToken.__attachRusAutoNumber(
                     tok.end_token.next0_)
                 if (tit is None):
                     tit = TransItemToken._attachNumber(
                         tok.end_token.next0_, False)
                 if (tit is not None):
                     tit.begin_token = t
                     return tit
                 else:
                     return None
             if (tt.is_doubt and not attach_high):
                 if (prev is None or prev.typ != TransItemToken.Typs.NOUN):
                     if ((prev is not None
                          and prev.typ == TransItemToken.Typs.BRAND
                          and tt.typ == TransItemToken.Typs.BRAND)
                             and Utils.compareStrings(
                                 tt.canonic_text, prev.value, True) == 0):
                         pass
                     else:
                         return None
             if (tt.canonic_text == "СУДНО"):
                 if ((((tok.morph.number) & (MorphNumber.PLURAL))) !=
                     (MorphNumber.UNDEFINED)):
                     if (not BracketHelper.canBeStartOfSequence(
                             tok.end_token.next0_, False, False)):
                         return None
             tit = TransItemToken._new2524(tok.begin_token, tok.end_token,
                                           tt.kind, tt.typ, tt.is_doubt,
                                           tok.chars, tok.morph)
             tit.value = tt.canonic_text
             if (tit.typ == TransItemToken.Typs.NOUN):
                 tit.value = tit.value.lower()
             else:
                 tit.value = tit.value.upper()
             return tit
         if (tok is None and t.morph.class0_.is_adjective):
             npt = NounPhraseHelper.tryParse(t, NounPhraseParseAttr.NO, 0)
             if (npt is not None and len(npt.adjectives) > 0):
                 state_ = None
                 tt = t
                 first_pass3133 = True
                 while True:
                     if first_pass3133: first_pass3133 = False
                     else: tt = tt.next0_
                     if (not (tt is not None
                              and tt.previous != npt.end_token)):
                         break
                     tok = TransItemToken.M_ONTOLOGY.tryParse(
                         tt, TerminParseAttr.NO)
                     if (tok is None and state_ is None):
                         state_ = tt.kit.processReferent("GEO", tt)
                     if (tok is not None
                             and tok.end_token == npt.end_token):
                         if ((tok.termin).typ == TransItemToken.Typs.NOUN):
                             tit = TransItemToken._new2524(
                                 t, tok.end_token, (tok.termin).kind,
                                 TransItemToken.Typs.NOUN,
                                 (tok.termin).is_doubt, tok.chars,
                                 npt.morph)
                             tit.value = (tok.termin).canonic_text.lower()
                             tit.alt_value = npt.getNormalCaseText(
                                 None, False, MorphGender.UNDEFINED,
                                 False).lower()
                             if (LanguageHelper.endsWithEx(
                                     tit.alt_value, "суд", "суда", None,
                                     None)):
                                 if (not BracketHelper.canBeStartOfSequence(
                                         tok.end_token.next0_, False,
                                         False)):
                                     continue
                             if (state_ is not None):
                                 if ((state_.referent).is_state):
                                     tit.state = state_
                             return tit
     if (t is not None and t.isValue("КЛАСС", None)
             and t.next0_ is not None):
         br = BracketHelper.tryParse(t.next0_, BracketParseAttr.NO, 100)
         if (br is not None):
             return TransItemToken._new2526(
                 t, br.end_token, TransItemToken.Typs.CLASS,
                 MiscHelper.getTextValueOfMetaToken(br, GetTextAttr.NO))
     nt = Utils.asObjectOrNull(t, NumberToken)
     if (nt is not None):
         if (prev is None or nt.typ != NumberSpellingType.DIGIT):
             return None
         if (prev.typ == TransItemToken.Typs.BRAND):
             return TransItemToken.__attachModel(t, False, prev)
         else:
             return None
     res = TransItemToken.__attachRusAutoNumber(t)
     if ((res) is not None):
         if (not res.is_doubt):
             return res
         if (prev is not None and prev.typ == TransItemToken.Typs.NOUN
                 and prev.kind == TransportKind.AUTO):
             return res
         if (prev is not None
                 and ((prev.typ == TransItemToken.Typs.BRAND
                       or prev.typ == TransItemToken.Typs.MODEL))):
             return res
     t1 = t
     if (t.is_hiphen):
         t1 = t.next0_
     if (prev is not None and prev.typ == TransItemToken.Typs.BRAND
             and t1 is not None):
         tit = TransItemToken.__attachModel(t1, True, prev)
         if (tit is not None):
             tit.begin_token = t
             return tit
     if (prev is not None
             and ((prev.typ == TransItemToken.Typs.NOUN or after_conj))):
         br = BracketHelper.tryParse(t, BracketParseAttr.NO, 100)
         if (br is not None and br.is_quote_type):
             tit = TransItemToken.tryParse(br.begin_token.next0_, prev,
                                           after_conj, False)
             if (tit is not None and tit.end_token.next0_ == br.end_token):
                 if (not tit.is_doubt
                         or tit.typ == TransItemToken.Typs.BRAND):
                     tit.begin_token = br.begin_token
                     tit.end_token = br.end_token
                     return tit
             s = MiscHelper.getTextValueOfMetaToken(br, GetTextAttr.NO)
             if (not Utils.isNullOrEmpty(s) and (len(s) < 30)):
                 chars_ = 0
                 digs = 0
                 un = 0
                 for c in s:
                     if (not Utils.isWhitespace(c)):
                         if (str.isalpha(c)):
                             chars_ += 1
                         elif (str.isdigit(c)):
                             digs += 1
                         else:
                             un += 1
                 if (((digs == 0 and un == 0
                       and t.next0_.chars.is_capital_upper))
                         or prev.kind == TransportKind.SHIP
                         or prev.kind == TransportKind.SPACE):
                     return TransItemToken._new2526(
                         br.begin_token, br.end_token,
                         TransItemToken.Typs.NAME, s)
                 if (digs > 0 and (chars_ < 5)):
                     return TransItemToken._new2526(
                         br.begin_token, br.end_token,
                         TransItemToken.Typs.MODEL, s.replace(" ", ""))
     if (prev is not None and (((prev.typ == TransItemToken.Typs.NOUN
                                 or prev.typ == TransItemToken.Typs.BRAND
                                 or prev.typ == TransItemToken.Typs.NAME)
                                or prev.typ == TransItemToken.Typs.MODEL))):
         tit = TransItemToken.__attachModel(
             t, prev.typ != TransItemToken.Typs.NAME, prev)
         if (tit is not None):
             return tit
     if (((prev is not None and prev.typ == TransItemToken.Typs.NOUN
           and prev.kind == TransportKind.AUTO) and
          (isinstance(t, TextToken)) and t.chars.is_letter)
             and not t.chars.is_all_lower
             and (t.whitespaces_before_count < 2)):
         pt = t.kit.processReferent("PERSON", t)
         if (pt is None):
             tit = TransItemToken._new2529(t, t, TransItemToken.Typs.BRAND)
             tit.value = (t).term
             return tit
     if (((prev is not None and prev.typ == TransItemToken.Typs.NOUN and
           ((prev.kind == TransportKind.SHIP
             or prev.kind == TransportKind.SPACE)))) or after_conj):
         if (t.chars.is_capital_upper):
             ok = True
             npt = NounPhraseHelper.tryParse(t, NounPhraseParseAttr.NO, 0)
             if (npt is not None and len(npt.adjectives) > 0):
                 ok = False
             else:
                 rt = t.kit.processReferent("PERSON", t)
                 if (rt is not None):
                     ok = False
             if (t.getMorphClassInDictionary().is_proper_surname):
                 if (not t.morph.case_.is_nominative):
                     ok = False
             if (ok):
                 t1 = t
                 tt = t.next0_
                 while tt is not None:
                     if (tt.whitespaces_before_count > 1):
                         break
                     if (tt.chars != t.chars):
                         break
                     tit = TransItemToken.tryParse(tt, None, False, False)
                     if ((tit) is not None):
                         break
                     t1 = tt
                     tt = tt.next0_
                 s = MiscHelper.getTextValue(t, t1, GetTextAttr.NO)
                 if (s is not None):
                     res1 = TransItemToken._new2530(
                         t, t1, TransItemToken.Typs.NAME, True, s)
                     if (not t1.is_newline_after):
                         br = BracketHelper.tryParse(
                             t1.next0_, BracketParseAttr.NO, 100)
                         if (br is not None):
                             res1.end_token = br.end_token
                             res1.alt_value = res1.value
                             res1.value = MiscHelper.getTextValueOfMetaToken(
                                 br, GetTextAttr.NO)
                     return res1
     return None

Example #9

Show file

File: MorphToken.py Project: MihaJjDa/APCLtask

 def __compareForms(self, x: 'MorphWordForm', y: 'MorphWordForm') -> int:
     """ Three-way comparison of two word forms.

     Each form is represented by the text that Utils.ifNotNull picks from
     its normal_full / normal_case pair. Equal texts compare as 0 and an
     empty text yields 1 (for x) or -1 (for y). Otherwise the result is
     decided by a fixed sequence of rules: surname endings, same-class
     tie-breakers, then pairwise class precedence, with text length as the
     final tie-breaker.

     Args:
         x(MorphWordForm): first form
         y(MorphWordForm): second form

     Returns:
         int: -1, 0 or 1 (presumably -1 ranks x ahead of y; confirm
         against the caller that sorts the forms)
     """
     x_text = Utils.ifNotNull(x.normal_full, x.normal_case)
     y_text = Utils.ifNotNull(y.normal_full, y.normal_case)
     if x_text == y_text:
         return 0
     if Utils.isNullOrEmpty(x_text):
         return 1
     if Utils.isNullOrEmpty(y_text):
         return -1
     x_len, y_len = len(x_text), len(y_text)
     x_last, y_last = x_text[-1], y_text[-1]
     # Sign of the length difference: -1 when x is shorter, 1 when longer.
     by_length = (x_len > y_len) - (x_len < y_len)
     x_cls = x.class0_
     y_cls = y.class0_
     # Surname rules apply only when the token is not written all-lowercase.
     if (x_cls.is_proper_surname and not self.char_info.is_all_lower
             and LanguageHelper.endsWithEx(x_text, "ОВ", "ЕВ", "ИН", None)
             and not y_cls.is_proper_surname):
         return -1
     if (y_cls.is_proper_surname and not self.char_info.is_all_lower
             and LanguageHelper.endsWithEx(y_text, "ОВ", "ЕВ", "ИН", None)):
         if not x_cls.is_proper_surname:
             return 1
         # Both are surnames: here the longer text yields -1.
         return -by_length
     if x_cls == y_cls:
         if x_cls.is_adjective:
             x_on_j = x_last == 'Й'
             y_on_j = y_last == 'Й'
             if x_on_j != y_on_j:
                 return -1 if x_on_j else 1
             x_on_oj = LanguageHelper.endsWith(x_text, "ОЙ")
             y_on_oj = LanguageHelper.endsWith(y_text, "ОЙ")
             if y_on_oj and not x_on_oj:
                 return -1
             if x_on_oj and not y_on_oj:
                 return 1
         if x_cls.is_noun:
             if (x.number == MorphNumber.SINGULAR
                     and y.number == MorphNumber.PLURAL
                     and x_len <= y_len + 1):
                 return -1
             if (x.number == MorphNumber.PLURAL
                     and y.number == MorphNumber.SINGULAR
                     and x_len >= y_len - 1):
                 return 1
         return by_length
     # Classes differ: decide by the class of x first, then by the class of y.
     if x_cls.is_adverb:
         return 1
     if x_cls.is_noun and x.is_in_dictionary:
         if (y_cls.is_adjective and y.is_in_dictionary
                 and "к.ф." not in y.misc.attrs):
             return 1
         return -1
     if x_cls.is_adjective:
         y_is_known_noun = (not x.is_in_dictionary and y_cls.is_noun
                            and y.is_in_dictionary)
         return 1 if y_is_known_noun else -1
     if x_cls.is_verb:
         y_outranks = (y_cls.is_noun or y_cls.is_adjective
                       or y_cls.is_preposition)
         return 1 if y_outranks else -1
     if y_cls.is_adverb:
         return -1
     if y_cls.is_noun and y.is_in_dictionary:
         return 1
     if y_cls.is_adjective:
         if ((x_cls.is_noun or x_cls.is_proper_secname)
                 and x.is_in_dictionary):
             return -1
         if x_cls.is_noun and not y.is_in_dictionary and x_len < y_len:
             return -1
         return 1
     if y_cls.is_verb:
         if (x_cls.is_noun or x_cls.is_adjective or x_cls.is_preposition
                 or x_cls.is_proper):
             return -1
         return 1
     return by_length