Python LanguageHelper.ends_with_ex Examples

Programming Language: Python

Namespace/Package Name: pullenti.morph.LanguageHelper

Class/Type: LanguageHelper

Method/Function: ends_with_ex

Examples at hotexamples.com: 8

Python LanguageHelper.ends_with_ex - 8 examples found. These are the top-rated real-world Python examples of pullenti.morph.LanguageHelper.LanguageHelper.ends_with_ex, extracted from open-source projects. You can rate examples to help us improve the quality of examples.

Frequently Used Methods

Show Hide

endsWith(21)

ends_with(18)

is_cyrillic_char(15)

isCyrillicChar(12)

endsWithEx(9)

ends_with_ex(8)

isCyrillicVowel(7)

is_cyrillic_vowel(6)

isLatinChar(5)

is_latin_char(4)

get_case_after_preposition(4)

isLatin(3)

is_latin(3)

normalize_preposition(2)

normalizePreposition(2)

is_cyrillic(2)

_get_word_lang(2)

correctWord(2)

isCyrillic(2)

getCaseAfterPreposition(2)

correct_word(2)

isHiphen(1)

isLatinVowel(1)

get_lat_for_cyr(1)

get_cyr_for_lat(1)

is_hiphen(1)

getLatForCyr(1)

getCyrForLat(1)

is_latin_vowel(1)

transliteralCorrection(1)

transliteral_correction(1)

Example #1

Show file

File: PersonReferent.py Project: pullenti/PullentiPython

 def _del_surname_end(s: str) -> str:
     """Strip a trailing case-like ending from an upper-case string.

     Rules, applied in order (the first match wins):

     * strings shorter than 3 characters are returned unchanged;
     * a final "А", "У" or "Е" is dropped;
     * a final "ОМ" or "ЫМ" is dropped;
     * a final "Я" or "Ю" preceded by "Н" or "Л" is replaced with "Ь";
     * anything else is returned unchanged.

     NOTE(review): judging by the name, the input is a surname in an
     oblique case -- confirm against callers.

     Args:
         s(str): the string to trim

     Returns:
         str: the trimmed string, or ``s`` itself when no rule applies
     """
     size = len(s)
     if size < 3:
         return s
     if LanguageHelper.ends_with_ex(s, "А", "У", "Е", None):
         return s[:size - 1]
     if any(LanguageHelper.ends_with(s, tail) for tail in ("ОМ", "ЫМ")):
         return s[:size - 2]
     # A soft ending is restored to "Ь" only after "Н" / "Л".
     if (LanguageHelper.ends_with_ex(s, "Я", "Ю", None, None)
             and s[size - 2] in ('Н', 'Л')):
         return s[:size - 1] + "Ь"
     return s

Example #2

Show file

 def can_has_ref(self, r: 'Referent') -> bool:
     """Check whether referent ``r`` may act as the ATTR_REF of this object.

     The decision is driven by ``self.name``:

     * no name or no referent -> False;
     * for a GeoReferent: names ending in "президент" / "губернатор"
       require a state or a region, "мэр" / "градоначальник" require a
       city, "глава" is always accepted, anything else is rejected;
     * for a referent whose ``type_name`` is "ORGANIZATION": the
       geo-bound names above are rejected, a name containing "министр"
       needs a "министерство" slot, a name ending in "директор" must not
       have a "суд" slot; otherwise accepted;
     * any other referent -> False.

     Args:
         r(Referent): the candidate referent

     Returns:
         bool: True when the reference is allowed
     """
     title = self.name
     if title is None or r is None:
         return False

     if isinstance(r, GeoReferent):
         geo = Utils.asObjectOrNull(r, GeoReferent)
         if LanguageHelper.ends_with_ex(title, "президент", "губернатор",
                                        None, None):
             return geo.is_state or geo.is_region
         if title in ("мэр", "градоначальник"):
             return geo.is_city
         return title == "глава"

     if r.type_name != "ORGANIZATION":
         return False

     # Territory-bound posts never refer to an organization.
     if (LanguageHelper.ends_with(title, "губернатор")
             or title in ("мэр", "градоначальник", "президент")):
         return False
     if ("министр" in title
             and r.find_slot(None, "министерство", True) is None):
         return False
     if (title.endswith("директор")
             and r.find_slot(None, "суд", True) is not None):
         return False
     return True

Example #3

Show file

 def get_lemma(self) -> str:
     """ Lemma (a variant of morphological normalization).

     Returns the cached ``__m_lemma`` when it is set. Otherwise a lemma
     is chosen from ``word_forms``:

     * a single form gives its ``normal_full`` (or ``normal_case``);
     * for a token that is not all lower case, a proper-surname form
       ending in "ОВ" / "ЕВ" is taken, and a dictionary proper-name form
       makes the method return that form's ``normal_case`` immediately;
     * otherwise the form preferred by ``__compare_forms`` is used.

     The chosen value then gets a few ending corrections. With no word
     forms the result is ``term``, or "?" when ``term`` is None.
     """
     if self.__m_lemma is not None:
         return self.__m_lemma

     forms = self.word_forms
     lemma = None
     if forms is not None and len(forms) > 0:
         if len(forms) == 1:
             only = forms[0]
             lemma = Utils.ifNotNull(only.normal_full, only.normal_case)
         if lemma is None and not self.char_info.is_all_lower:
             for form in forms:
                 if form.class0_.is_proper_surname:
                     candidate = Utils.ifNotNull(
                         form.normal_full,
                         Utils.ifNotNull(form.normal_case, ""))
                     if LanguageHelper.ends_with_ex(
                             candidate, "ОВ", "ЕВ", None, None):
                         lemma = candidate
                         break
                 elif form.class0_.is_proper_name and form.is_in_dictionary:
                     return form.normal_case
         if lemma is None:
             # Keep the current best unless the comparison is positive.
             best = None
             for form in forms:
                 if best is None or self.__compare_forms(best, form) > 0:
                     best = form
             lemma = Utils.ifNotNull(best.normal_full, best.normal_case)

     if lemma is None:
         return Utils.ifNotNull(self.term, "?")

     if LanguageHelper.ends_with_ex(lemma, "АНЫЙ", "ЕНЫЙ", None, None):
         # "...АНЫЙ" / "...ЕНЫЙ" -> "...АННЫЙ" / "...ЕННЫЙ"
         return lemma[:-3] + "ННЫЙ"
     if LanguageHelper.ends_with(lemma, "ЙСЯ"):
         return lemma[:-2]
     if LanguageHelper.ends_with(lemma, "АНИЙ") and lemma == self.term:
         if any(wf.is_in_dictionary for wf in forms):
             return lemma
         return lemma[:-1] + "Е"
     return lemma

Example #4

Show file

File: MorphEngine.py Project: AAA1911/PullentiPython

 def process(self, word: str) -> typing.List['MorphWordForm']:
     """ Process a single word.

     The word is first matched against the forward tree (``m_root``).
     Depending on what was found, unknown-word variants may additionally
     be taken from the reverse tree (``m_root_reverce``), which is walked
     from the end of the word. The collected forms are then sorted and
     normalized.

     Args:
         word(str): the word; it must be in upper case

     Returns:
         the list of found word forms, or None when the word is null or
         empty, or when nothing was found
     """
     if (Utils.isNullOrEmpty(word)):
         return None
     res = None
     # A multi-character word without any Cyrillic or Latin vowel is not
     # analysed at all (res is still None at this point).
     if (len(word) > 1):
         i = 0
         while i < len(word):
             ch = word[i]
             if (LanguageHelper.is_cyrillic_vowel(ch)
                     or LanguageHelper.is_latin_vowel(ch)):
                 break
             i += 1
         if (i >= len(word)):
             return res
     mvs = []
     tn = self.m_root
     i = 0
     # Forward pass: descend the tree one character at a time. At every
     # node that has rules, the not-yet-consumed tail of the word is
     # looked up in each rule's variants.
     while i <= len(word):
         if (tn.lazy_pos > 0):
             self.__load_tree_node(tn)
         if (tn.rules is not None):
             word_begin = None
             word_end = None
             if (i == 0):
                 word_end = word
             elif (i < len(word)):
                 word_end = word[i:]
             else:
                 word_end = ""
             if (res is None):
                 res = list()
             for r in tn.rules:
                 wrapmvs20 = RefOutArgWrapper(None)
                 inoutres21 = Utils.tryGetValue(r.variants, word_end,
                                                wrapmvs20)
                 mvs = wrapmvs20.value
                 if (inoutres21):
                     # The consumed prefix is computed lazily, once per node.
                     if (word_begin is None):
                         if (i == len(word)):
                             word_begin = word
                         elif (i > 0):
                             word_begin = word[0:0 + i]
                         else:
                             word_begin = ""
                     r.process_result(res, word_begin, mvs)
         if (tn.nodes is None or i >= len(word)):
             break
         ch = ord(word[i])
         wraptn22 = RefOutArgWrapper(None)
         inoutres23 = Utils.tryGetValue(tn.nodes, ch, wraptn22)
         tn = wraptn22.value
         if (not inoutres23):
             break
         i += 1
     # Decide whether the reverse tree should be consulted as well.
     need_test_unknown_vars = True
     if (res is not None):
         for r in res:
             if ((r.class0_.is_pronoun or r.class0_.is_noun
                  or r.class0_.is_adjective)
                     or (r.class0_.is_misc and r.class0_.is_conjunction)
                     or r.class0_.is_preposition):
                 need_test_unknown_vars = False
             elif (r.class0_.is_adverb and r.normal_case is not None):
                 if (not LanguageHelper.ends_with_ex(
                         r.normal_case, "О", "А", None, None)):
                     need_test_unknown_vars = False
                 elif (r.normal_case == "МНОГО"):
                     need_test_unknown_vars = False
             elif (r.class0_.is_verb and len(res) > 1):
                 ok = False
                 for rr in res:
                     if (rr != r and rr.class0_ != r.class0_):
                         ok = True
                         break
                 if (ok and not LanguageHelper.ends_with(word, "ИМ")):
                     need_test_unknown_vars = False
     # Cyrillic words with fewer than two vowels or fewer than two other
     # characters are not tested for unknown variants.
     if (need_test_unknown_vars
             and LanguageHelper.is_cyrillic_char(word[0])):
         gl = 0
         sog = 0
         j = 0
         while j < len(word):
             if (LanguageHelper.is_cyrillic_vowel(word[j])):
                 gl += 1
             else:
                 sog += 1
             j += 1
         if ((gl < 2) or (sog < 2)):
             need_test_unknown_vars = False
     # A single result with certain attributes is accepted as is.
     if (need_test_unknown_vars and res is not None and len(res) == 1):
         if (res[0].class0_.is_verb):
             if ("н.вр." in res[0].misc.attrs
                     and "нес.в." in res[0].misc.attrs
                     and not "страд.з." in res[0].misc.attrs):
                 need_test_unknown_vars = False
             elif ("б.вр." in res[0].misc.attrs
                   and "сов.в." in res[0].misc.attrs):
                 need_test_unknown_vars = False
             elif (res[0].normal_case is not None
                   and LanguageHelper.ends_with(res[0].normal_case, "СЯ")):
                 need_test_unknown_vars = False
         if (res[0].class0_.is_undefined
                 and "прдктв." in res[0].misc.attrs):
             need_test_unknown_vars = False
     if (need_test_unknown_vars):
         if (self.m_root_reverce is None):
             return res
         # Reverse pass: walk the word from its last character until a
         # node carrying reverce_variants is reached.
         tn = self.m_root_reverce
         tn0 = None
         for i in range(len(word) - 1, -1, -1):
             if (tn.lazy_pos > 0):
                 self.__load_tree_node(tn)
             ch = ord(word[i])
             if (tn.nodes is None):
                 break
             wrapnext24 = RefOutArgWrapper(None)
             inoutres25 = Utils.tryGetValue(tn.nodes, ch, wrapnext24)
             next0_ = wrapnext24.value
             if (not inoutres25):
                 break
             tn = next0_
             if (tn.lazy_pos > 0):
                 self.__load_tree_node(tn)
             if (tn.reverce_variants is not None):
                 tn0 = tn
                 break
         else:
             i = -1
         if (tn0 is not None):
             # The part of the word up to index i must contain a vowel,
             # unless i is below 4.
             glas = i < 4
             while i >= 0:
                 if (LanguageHelper.is_cyrillic_vowel(word[i])
                         or LanguageHelper.is_latin_vowel(word[i])):
                     glas = True
                     break
                 i -= 1
             if (glas):
                 # res is still None when no node visited by the forward
                 # pass had rules. Create the list up front so the
                 # iteration below cannot fail on None.
                 if (res is None):
                     res = list()
                 for mv in tn0.reverce_variants:
                     if (((not mv.class0_.is_verb
                           and not mv.class0_.is_adjective
                           and not mv.class0_.is_noun)
                          and not mv.class0_.is_proper_surname
                          and not mv.class0_.is_proper_geo)
                             and not mv.class0_.is_proper_secname):
                         continue
                     # Skip the variant when a dictionary form already
                     # covers it.
                     ok = False
                     for rr in res:
                         if (rr.is_in_dictionary):
                             if (rr.class0_ == mv.class0_
                                     or rr.class0_.is_noun):
                                 ok = True
                                 break
                             if (not mv.class0_.is_adjective
                                     and rr.class0_.is_verb):
                                 ok = True
                                 break
                     if (ok):
                         continue
                     if (len(mv.tail) > 0 and
                             not LanguageHelper.ends_with(word, mv.tail)):
                         continue
                     r = MorphWordForm(mv, word)
                     if (not MorphWordForm._has_morph_equals(res, r)):
                         r.undef_coef = mv.coef
                         res.append(r)
     # Special case: "ПРИ" is never kept as a proper geo name.
     if (word == "ПРИ" and res is not None):
         for i in range(len(res) - 1, -1, -1):
             if (res[i].class0_.is_proper_geo):
                 del res[i]
         else:
             i = -1
     if (res is None or len(res) == 0):
         return None
     MorphEngine.__sort(res, word)
     for v in res:
         if (v.normal_case is None):
             v.normal_case = word
         if (v.class0_.is_verb):
             # "...ТЬСЯ" -> "...ТЬ" as the full normal form.
             if (v.normal_full is None
                     and LanguageHelper.ends_with(v.normal_case, "ТЬСЯ")):
                 v.normal_full = v.normal_case[0:0 + len(v.normal_case) - 2]
         v.language = self.language
         if (v.class0_.is_preposition):
             v.normal_case = LanguageHelper.normalize_preposition(
                 v.normal_case)
     # Drop non-dictionary adjectives marked "к.ф." / "неизм." when other
     # results exist, and accumulate the classes of dictionary forms.
     mc = MorphClass()
     for i in range(len(res) - 1, -1, -1):
         if (not res[i].is_in_dictionary and res[i].class0_.is_adjective
                 and len(res) > 1):
             if ("к.ф." in res[i].misc.attrs
                     or "неизм." in res[i].misc.attrs):
                 del res[i]
                 continue
         if (res[i].is_in_dictionary):
             mc.value |= res[i].class0_.value
     else:
         i = -1
     # When every dictionary form is a verb, reset the coefficient of
     # adjective guesses whose undef_coef exceeds 100.
     if (mc == MorphClass.VERB and len(res) > 1):
         for r in res:
             if (r.undef_coef > (100)
                     and r.class0_ == MorphClass.ADJECTIVE):
                 r.undef_coef = (0)
     if (len(res) == 0):
         return None
     return res

Example #5

Show file

 def __compare_forms(self, x: 'MorphWordForm', y: 'MorphWordForm') -> int:
     """Compare two word forms and return -1, 0 or 1.

     The forms are compared through their ``normal_full`` (falling back
     to ``normal_case``) and their morphological classes. Equal values
     give 0; an empty value for ``x`` gives 1, for ``y`` gives -1.

     NOTE(review): presumably a negative result means ``x`` is the
     preferred form -- confirm against callers.

     Args:
         x(MorphWordForm): first form
         y(MorphWordForm): second form

     Returns:
         int: -1, 0 or 1
     """
     vx = Utils.ifNotNull(x.normal_full, x.normal_case)
     vy = Utils.ifNotNull(y.normal_full, y.normal_case)
     if vx == vy:
         return 0
     if Utils.isNullOrEmpty(vx):
         return 1
     if Utils.isNullOrEmpty(vy):
         return -1
     tail_x = vx[-1]
     tail_y = vy[-1]
     size_x = len(vx)
     size_y = len(vy)

     # Surname-like endings, only for tokens that are not all lower case.
     if (x.class0_.is_proper_surname and not self.char_info.is_all_lower
             and LanguageHelper.ends_with_ex(vx, "ОВ", "ЕВ", "ИН", None)
             and not y.class0_.is_proper_surname):
         return -1
     if (y.class0_.is_proper_surname and not self.char_info.is_all_lower
             and LanguageHelper.ends_with_ex(vy, "ОВ", "ЕВ", "ИН", None)):
         if not x.class0_.is_proper_surname:
             return 1
         # Both are surnames: here the longer value compares lower.
         if size_x > size_y:
             return -1
         if size_x < size_y:
             return 1
         return 0

     if x.class0_ == y.class0_:
         if x.class0_.is_adjective:
             if tail_x == 'Й' and tail_y != 'Й':
                 return -1
             if tail_x != 'Й' and tail_y == 'Й':
                 return 1
             x_oj = LanguageHelper.ends_with(vx, "ОЙ")
             y_oj = LanguageHelper.ends_with(vy, "ОЙ")
             if not x_oj and y_oj:
                 return -1
             if x_oj and not y_oj:
                 return 1
         if x.class0_.is_noun:
             if (x.number == MorphNumber.SINGULAR
                     and y.number == MorphNumber.PLURAL
                     and size_x <= size_y + 1):
                 return -1
             if (x.number == MorphNumber.PLURAL
                     and y.number == MorphNumber.SINGULAR
                     and size_x >= size_y - 1):
                 return 1
         if size_x < size_y:
             return -1
         if size_x > size_y:
             return 1
         return 0

     # Different classes: decide by the class of x first ...
     if x.class0_.is_adverb:
         return 1
     if x.class0_.is_noun and x.is_in_dictionary:
         if (y.class0_.is_adjective and y.is_in_dictionary
                 and "к.ф." not in y.misc.attrs):
             return 1
         return -1
     if x.class0_.is_adjective:
         if (not x.is_in_dictionary and y.class0_.is_noun
                 and y.is_in_dictionary):
             return 1
         return -1
     if x.class0_.is_verb:
         if (y.class0_.is_noun or y.class0_.is_adjective
                 or y.class0_.is_preposition):
             return 1
         return -1

     # ... then by the class of y.
     if y.class0_.is_adverb:
         return -1
     if y.class0_.is_noun and y.is_in_dictionary:
         return 1
     if y.class0_.is_adjective:
         if ((x.class0_.is_noun or x.class0_.is_proper_secname)
                 and x.is_in_dictionary):
             return -1
         if (x.class0_.is_noun and not y.is_in_dictionary
                 and size_x < size_y):
             return -1
         return 1
     if y.class0_.is_verb:
         if (x.class0_.is_noun or x.class0_.is_adjective
                 or x.class0_.is_preposition):
             return -1
         if x.class0_.is_proper:
             return -1
         return 1

     # Fallback: the shorter value compares lower.
     if size_x < size_y:
         return -1
     if size_x > size_y:
         return 1
     return 0

Example #6

Show file

 def try_parse(t: 'Token', items: typing.List['NounPhraseItem'],
               attrs: 'NounPhraseParseAttr') -> 'NounPhraseItem':
     if (t is None):
         return None
     t0 = t
     _can_be_surname = False
     _is_doubt_adj = False
     rt = Utils.asObjectOrNull(t, ReferentToken)
     if (rt is not None and rt.begin_token == rt.end_token
             and (isinstance(rt.begin_token, TextToken))):
         res = NounPhraseItem.try_parse(rt.begin_token, items, attrs)
         if (res is not None):
             res.begin_token = res.end_token = t
             res.can_be_noun = True
             return res
     if (rt is not None):
         res = NounPhraseItem(t, t)
         for m in t.morph.items:
             v = NounPhraseItemTextVar(m, None)
             v.normal_value = str(t.get_referent())
             res.noun_morph.append(v)
         res.can_be_noun = True
         return res
     if (isinstance(t, NumberToken)):
         pass
     has_legal_verb = False
     if (isinstance(t, TextToken)):
         if (not t.chars.is_letter):
             return None
         str0_ = t.term
         if (str0_[len(str0_) - 1] == 'А' or str0_[len(str0_) - 1] == 'О'):
             for wf in t.morph.items:
                 if ((isinstance(wf, MorphWordForm))
                         and wf.is_in_dictionary):
                     if (wf.class0_.is_verb):
                         mc = t.get_morph_class_in_dictionary()
                         if (not mc.is_noun and
                             (((attrs) &
                               (NounPhraseParseAttr.IGNOREPARTICIPLES)))
                                 == (NounPhraseParseAttr.NO)):
                             if (not LanguageHelper.ends_with_ex(
                                     str0_, "ОГО", "ЕГО", None, None)):
                                 return None
                         has_legal_verb = True
                     if (wf.class0_.is_adverb):
                         if (t.next0_ is None or not t.next0_.is_hiphen):
                             if ((str0_ == "ВСЕГО" or str0_ == "ДОМА"
                                  or str0_ == "НЕСКОЛЬКО")
                                     or str0_ == "МНОГО"
                                     or str0_ == "ПОРЯДКА"):
                                 pass
                             else:
                                 return None
                     if (wf.class0_.is_adjective):
                         if (wf.contains_attr("к.ф.", None)):
                             if (t.get_morph_class_in_dictionary() ==
                                     MorphClass.ADJECTIVE):
                                 pass
                             else:
                                 _is_doubt_adj = True
         mc0 = t.morph.class0_
         if (mc0.is_proper_surname and not t.chars.is_all_lower):
             for wf in t.morph.items:
                 if (wf.class0_.is_proper_surname
                         and wf.number != MorphNumber.PLURAL):
                     wff = Utils.asObjectOrNull(wf, MorphWordForm)
                     if (wff is None):
                         continue
                     s = Utils.ifNotNull((Utils.ifNotNull(
                         wff.normal_full, wff.normal_case)), "")
                     if (LanguageHelper.ends_with_ex(
                             s, "ИН", "ЕН", "ЫН", None)):
                         if (not wff.is_in_dictionary):
                             _can_be_surname = True
                         else:
                             return None
                     if (wff.is_in_dictionary
                             and LanguageHelper.ends_with(s, "ОВ")):
                         _can_be_surname = True
         if (mc0.is_proper_name and not t.chars.is_all_lower):
             for wff in t.morph.items:
                 wf = Utils.asObjectOrNull(wff, MorphWordForm)
                 if (wf is None):
                     continue
                 if (wf.normal_case == "ГОР"):
                     continue
                 if (wf.class0_.is_proper_name and wf.is_in_dictionary):
                     if (wf.normal_case is None
                             or not wf.normal_case.startswith("ЛЮБ")):
                         if (mc0.is_adjective
                                 and t.morph.contains_attr("неизм.", None)):
                             pass
                         elif (
                             (((attrs) &
                               (NounPhraseParseAttr.REFERENTCANBENOUN))
                              ) == (NounPhraseParseAttr.REFERENTCANBENOUN)):
                             pass
                         else:
                             if (items is None or (len(items) < 1)):
                                 return None
                             if (not items[0].is_std_adjective):
                                 return None
         if (mc0.is_adjective and t.morph.items_count == 1):
             if (t.morph.get_indexer_item(0).contains_attr(
                     "в.ср.ст.", None)):
                 return None
         mc1 = t.get_morph_class_in_dictionary()
         if (mc1 == MorphClass.VERB and t.morph.case_.is_undefined):
             return None
         if (((((attrs) & (NounPhraseParseAttr.IGNOREPARTICIPLES)))
              == (NounPhraseParseAttr.IGNOREPARTICIPLES)
              and t.morph.class0_.is_verb and not t.morph.class0_.is_noun)
                 and not t.morph.class0_.is_proper):
             for wf in t.morph.items:
                 if (wf.class0_.is_verb):
                     if (wf.contains_attr("дейст.з.", None)):
                         if (LanguageHelper.ends_with(t.term, "СЯ")):
                             pass
                         else:
                             return None
     t1 = None
     for k in range(2):
         t = (Utils.ifNotNull(t1, t0))
         if (k == 0):
             if (((isinstance(t0, TextToken)) and t0.next0_ is not None
                  and t0.next0_.is_hiphen)
                     and t0.next0_.next0_ is not None):
                 if (not t0.is_whitespace_after
                         and not t0.morph.class0_.is_pronoun and
                         not (isinstance(t0.next0_.next0_, NumberToken))):
                     if (not t0.next0_.is_whitespace_after):
                         t = t0.next0_.next0_
                     elif (t0.next0_.next0_.chars.is_all_lower
                           and LanguageHelper.ends_with(t0.term, "О")):
                         t = t0.next0_.next0_
         it = NounPhraseItem._new404(t0, t, _can_be_surname)
         if (t0 == t and (isinstance(t0, ReferentToken))):
             it.can_be_noun = True
             it.morph = MorphCollection(t0.morph)
         can_be_prepos = False
         for v in t.morph.items:
             wf = Utils.asObjectOrNull(v, MorphWordForm)
             if (v.class0_.is_verb and not v.case_.is_undefined):
                 it.can_be_adj = True
                 it.adj_morph.append(NounPhraseItemTextVar(v, t))
                 continue
             if (v.class0_.is_preposition):
                 can_be_prepos = True
             if (v.class0_.is_adjective
                     or ((v.class0_.is_pronoun
                          and not v.class0_.is_personal_pronoun
                          and not v.contains_attr("неизм.", None))) or
                 ((v.class0_.is_noun and (isinstance(t, NumberToken))))):
                 if (NounPhraseItem.try_accord_variant(
                         items, (0 if items is None else len(items)), v,
                         False)):
                     is_doub = False
                     if (v.contains_attr("к.ф.", None)):
                         continue
                     if (v.contains_attr("собир.", None)
                             and not (isinstance(t, NumberToken))):
                         if (wf is not None and wf.is_in_dictionary):
                             return None
                         continue
                     if (v.contains_attr("сравн.", None)):
                         continue
                     ok = True
                     if (isinstance(t, TextToken)):
                         s = t.term
                         if (s == "ПРАВО" or s == "ПРАВА"):
                             ok = False
                         elif (LanguageHelper.ends_with(s, "ОВ") and
                               t.get_morph_class_in_dictionary().is_noun):
                             ok = False
                     elif (isinstance(t, NumberToken)):
                         if (v.class0_.is_noun
                                 and t.morph.class0_.is_adjective):
                             ok = False
                         elif (t.morph.class0_.is_noun and ((
                             (attrs) &
                             (NounPhraseParseAttr.PARSENUMERICASADJECTIVE)))
                               == (NounPhraseParseAttr.NO)):
                             ok = False
                     if (ok):
                         it.adj_morph.append(NounPhraseItemTextVar(v, t))
                         it.can_be_adj = True
                         if (_is_doubt_adj and t0 == t):
                             it.is_doubt_adjective = True
                         if (has_legal_verb and wf is not None
                                 and wf.is_in_dictionary):
                             it.can_be_noun = True
                         if (wf is not None and wf.class0_.is_pronoun):
                             it.can_be_noun = True
                             it.noun_morph.append(
                                 NounPhraseItemTextVar(v, t))
             can_be_noun_ = False
             if (isinstance(t, NumberToken)):
                 pass
             elif (v.class0_.is_noun
                   or ((wf is not None and wf.normal_case == "САМ"))):
                 can_be_noun_ = True
             elif (v.class0_.is_personal_pronoun):
                 if (items is None or len(items) == 0):
                     can_be_noun_ = True
                 else:
                     for it1 in items:
                         if (it1.is_verb):
                             if (len(items) == 1
                                     and not v.case_.is_nominative):
                                 can_be_noun_ = True
                             else:
                                 return None
                     if (len(items) == 1):
                         if (items[0].can_be_adj_for_personal_pronoun):
                             can_be_noun_ = True
             elif (
                 (v.class0_.is_pronoun and
                  ((items is None or len(items) == 0 or
                    ((len(items) == 1
                      and items[0].can_be_adj_for_personal_pronoun))))
                  and wf is not None) and
                 (((((wf.normal_case == "ТОТ" or wf.normal_full == "ТО"
                      or wf.normal_case == "ТО") or wf.normal_case == "ЭТО"
                     or wf.normal_case == "ВСЕ") or wf.normal_case == "ЧТО"
                    or wf.normal_case == "КТО") or wf.normal_full
                   == "КОТОРЫЙ" or wf.normal_case == "КОТОРЫЙ"))):
                 if (wf.normal_case == "ВСЕ"):
                     if (t.next0_ is not None
                             and t.next0_.is_value("РАВНО", None)):
                         return None
                 can_be_noun_ = True
             elif (wf is not None and ((Utils.ifNotNull(
                     wf.normal_full, wf.normal_case))) == "КОТОРЫЙ"
                   and (((attrs) & (NounPhraseParseAttr.PARSEPRONOUNS)))
                   == (NounPhraseParseAttr.NO)):
                 return None
             elif (v.class0_.is_proper and (isinstance(t, TextToken))):
                 if (t.length_char > 4 or v.class0_.is_proper_name):
                     can_be_noun_ = True
             if (can_be_noun_):
                 added = False
                 if (items is not None and len(items) > 1 and
                     (((attrs) & (NounPhraseParseAttr.MULTINOUNS))) !=
                     (NounPhraseParseAttr.NO)):
                     ok1 = True
                     ii = 1
                     while ii < len(items):
                         if (not items[ii].conj_before):
                             ok1 = False
                             break
                         ii += 1
                     if (ok1):
                         if (NounPhraseItem.try_accord_variant(
                                 items,
                             (0 if items is None else len(items)), v,
                                 True)):
                             it.noun_morph.append(
                                 NounPhraseItemTextVar(v, t))
                             it.can_be_noun = True
                             it.multi_nouns = True
                             added = True
                 if (not added):
                     if (NounPhraseItem.try_accord_variant(
                             items, (0 if items is None else len(items)), v,
                             False)):
                         it.noun_morph.append(NounPhraseItemTextVar(v, t))
                         it.can_be_noun = True
                         if (v.class0_.is_personal_pronoun
                                 and t.morph.contains_attr("неизм.", None)
                                 and not it.can_be_adj):
                             itt = NounPhraseItemTextVar(v, t)
                             itt.case_ = MorphCase.ALL_CASES
                             itt.number = MorphNumber.UNDEFINED
                             if (itt.normal_value is None):
                                 pass
                             it.adj_morph.append(itt)
                             it.can_be_adj = True
                     elif ((len(items) > 0 and len(items[0].adj_morph) > 0
                            and items[0].adj_morph[0].number
                            == MorphNumber.PLURAL)
                           and not ((items[0].adj_morph[0].case_)
                                    & v.case_).is_undefined
                           and not items[0].adj_morph[0].class0_.is_verb):
                         if (t.next0_ is not None and t.next0_.is_comma_and
                                 and
                             (isinstance(t.next0_.next0_, TextToken))):
                             npt2 = NounPhraseHelper.try_parse(
                                 t.next0_.next0_, attrs, 0, None)
                             if (npt2 is not None
                                     and npt2.preposition is None
                                     and not ((npt2.morph.case_) & v.case_
                                              & items[0].adj_morph[0].case_
                                              ).is_undefined):
                                 it.noun_morph.append(
                                     NounPhraseItemTextVar(v, t))
                                 it.can_be_noun = True
         if (t0 != t):
             for v in it.adj_morph:
                 v.correct_prefix(Utils.asObjectOrNull(t0, TextToken),
                                  False)
             for v in it.noun_morph:
                 v.correct_prefix(Utils.asObjectOrNull(t0, TextToken), True)
         if (k == 1 and it.can_be_noun and not it.can_be_adj):
             if (t1 is not None):
                 it.end_token = t1
             else:
                 it.end_token = t0.next0_.next0_
             for v in it.noun_morph:
                 if (v.normal_value is not None
                         and (v.normal_value.find('-') < 0)):
                     v.normal_value = "{0}-{1}".format(
                         v.normal_value,
                         it.end_token.get_normal_case_text(
                             None, MorphNumber.UNDEFINED,
                             MorphGender.UNDEFINED, False))
         if (it.can_be_adj):
             if (NounPhraseItem.__m_std_adjectives.try_parse(
                     it.begin_token, TerminParseAttr.NO) is not None):
                 it.is_std_adjective = True
         if (can_be_prepos and it.can_be_noun):
             if (items is not None and len(items) > 0):
                 npt1 = NounPhraseHelper.try_parse(
                     t,
                     Utils.valToEnum((NounPhraseParseAttr.PARSEPREPOSITION)
                                     | (NounPhraseParseAttr.PARSEPRONOUNS) |
                                     (NounPhraseParseAttr.PARSEVERBS),
                                     NounPhraseParseAttr), 0, None)
                 if (npt1 is not None and npt1.end_char > t.end_char):
                     return None
             else:
                 npt1 = NounPhraseHelper.try_parse(
                     t.next0_,
                     Utils.valToEnum((NounPhraseParseAttr.PARSEPRONOUNS) |
                                     (NounPhraseParseAttr.PARSEVERBS),
                                     NounPhraseParseAttr), 0, None)
                 if (npt1 is not None):
                     mc = LanguageHelper.get_case_after_preposition(t.lemma)
                     if (not ((mc) & npt1.morph.case_).is_undefined):
                         return None
         if (it.can_be_noun or it.can_be_adj or k == 1):
             if (it.begin_token.morph.class0_.is_pronoun):
                 tt2 = it.end_token.next0_
                 if ((tt2 is not None and tt2.is_hiphen
                      and not tt2.is_whitespace_after)
                         and not tt2.is_whitespace_before):
                     tt2 = tt2.next0_
                 if (isinstance(tt2, TextToken)):
                     ss = tt2.term
                     if ((ss == "ЖЕ" or ss == "БЫ" or ss == "ЛИ")
                             or ss == "Ж"):
                         it.end_token = tt2
                     elif (ss == "НИБУДЬ" or ss == "ЛИБО"
                           or (((ss == "ТО" and tt2.previous.is_hiphen))
                               and it.can_be_adj)):
                         it.end_token = tt2
                         for m in it.adj_morph:
                             m.normal_value = "{0}-{1}".format(
                                 m.normal_value, ss)
                             if (m.single_number_value is not None):
                                 m.single_number_value = "{0}-{1}".format(
                                     m.single_number_value, ss)
             return it
         if (t0 == t):
             if (t0.is_value("БИЗНЕС", None) and t0.next0_ is not None
                     and t0.next0_.chars == t0.chars):
                 t1 = t0.next0_
                 continue
             return it
     return None

Example #7

Show file

File: InnerMorphology.py Project: AAA1911/PullentiPython

 def run(self, text: str, only_tokenizing: bool, dlang: 'MorphLang',
         progress: EventHandler,
         good_text: bool) -> typing.List['MorphToken']:
     """Perform morphological analysis of a text.

     The text is split into tokens, a default language is estimated from
     per-word language statistics, word forms are computed once per unique
     term, and several repair passes glue tokens that were split by quotes,
     digits used instead of letters, hyphenation or line breaks.

     Args:
         text(str): source text
         only_tokenizing(bool): when True no per-term dictionary wrapper is
             attached to tokens (every token gets an empty word-form list)
             and the method returns right after the character-info pass
         dlang(MorphLang): initial default language; it is replaced when
             the word statistics favour RU / UA / KZ / BY
         progress(EventHandler): optional handler forwarded to
             ``__on_progress`` while unique terms are processed
         good_text(bool): forwarded to ``TextWrapper``; when True terms are
             taken from the text as is (no transliteration / word
             correction), the token repair passes and the upper/lower case
             flags are skipped, and the method returns early

     Returns:
         typing.List[MorphToken]: resulting token sequence, or None when
         ``Utils.isNullOrEmpty(text)`` holds
     """
     if (Utils.isNullOrEmpty(text)):
         return None
     # twrch is indexed by text position below and yields per-character info.
     twr = TextWrapper(text, good_text)
     twrch = twr.chars
     # res: tokens in text order; uni_lex: term -> wrapper shared by all
     # tokens having the same term, so each term is analysed only once.
     res = list()
     uni_lex = dict()
     # Term of the previous type-1 token, passed to transliteral_correction.
     term0 = None
     # pure_*: tokens whose detected language equals exactly that language.
     # tot_*: tokens whose detected language mask includes that language.
     pure_rus_words = 0
     pure_ukr_words = 0
     pure_by_words = 0
     pure_kz_words = 0
     tot_rus_words = 0
     tot_ukr_words = 0
     tot_by_words = 0
     tot_kz_words = 0
     # --- Phase 1: tokenizing.
     # The first_pass flag emulates a C-style "for (i = 0; ...; i++)" loop,
     # so that `continue` still increments i.
     i = 0
     first_pass2884 = True
     while True:
         if first_pass2884: first_pass2884 = False
         else: i += 1
         if (not (i < twr.length)): break
         ty = InnerMorphology._get_char_typ(twrch[i])
         # Type 0 characters produce no token at all.
         if (ty == 0):
             continue
         # Types above 2 always give a one-character token; types 1 and 2
         # are extended over the whole run of characters of the same type.
         if (ty > 2):
             j = (i + 1)
         else:
             j = (i + 1)
             while j < twr.length:
                 if (InnerMorphology._get_char_typ(twrch[j]) != ty):
                     break
                 j += 1
         # Equivalent to text[i:j].
         wstr = text[i:i + j - i]
         term = None
         if (good_text):
             term = wstr
         else:
             trstr = LanguageHelper.transliteral_correction(
                 wstr, term0, False)
             term = LanguageHelper.correct_word(trstr)
         if (Utils.isNullOrEmpty(term)):
             # Skip the whole run; the loop increment moves i to j.
             i = (j - 1)
             continue
         lang = InnerMorphology.__detect_lang(twr, i, j - 1, term)
         if (lang == MorphLang.UA):
             pure_ukr_words += 1
         elif (lang == MorphLang.RU):
             pure_rus_words += 1
         elif (lang == MorphLang.BY):
             pure_by_words += 1
         elif (lang == MorphLang.KZ):
             pure_kz_words += 1
         if ((((lang) & MorphLang.RU)) != MorphLang.UNKNOWN):
             tot_rus_words += 1
         if ((((lang) & MorphLang.UA)) != MorphLang.UNKNOWN):
             tot_ukr_words += 1
         if ((((lang) & MorphLang.BY)) != MorphLang.UNKNOWN):
             tot_by_words += 1
         if ((((lang) & MorphLang.KZ)) != MorphLang.UNKNOWN):
             tot_kz_words += 1
         # Only type-1 tokens are treated as dictionary words below
         # (presumably type 1 means letters - TODO confirm).
         if (ty == 1):
             term0 = term
         lemmas = None
         if (ty == 1 and not only_tokenizing):
             # Get or create the shared wrapper for this term.
             wraplemmas12 = RefOutArgWrapper(None)
             inoutres13 = Utils.tryGetValue(uni_lex, term, wraplemmas12)
             lemmas = wraplemmas12.value
             if (not inoutres13):
                 lemmas = InnerMorphology.UniLexWrap._new11(lang)
                 uni_lex[term] = lemmas
         tok = MorphToken()
         tok.term = term
         tok.begin_char = i
         # NOTE(review): debugging leftover (breakpoint anchor), no effect.
         if (i == 733860):
             pass
         tok.end_char = (j - 1)
         # The wrapper is kept in tag temporarily; it is replaced by the
         # real word forms after all unique terms have been processed.
         tok.tag = (lemmas)
         res.append(tok)
         i = (j - 1)
     # --- Phase 2: choosing the default language.
     # Branch order gives the priority RU, UA, KZ, BY; for every language the
     # "pure" counter is checked before the "total" one.
     def_lang = MorphLang(dlang)
     if (pure_rus_words > pure_ukr_words and pure_rus_words > pure_by_words
             and pure_rus_words > pure_kz_words):
         def_lang = MorphLang.RU
     elif (tot_rus_words > tot_ukr_words and tot_rus_words > tot_by_words
           and tot_rus_words > tot_kz_words):
         def_lang = MorphLang.RU
     elif (pure_ukr_words > pure_rus_words
           and pure_ukr_words > pure_by_words
           and pure_ukr_words > pure_kz_words):
         def_lang = MorphLang.UA
     elif (tot_ukr_words > tot_rus_words and tot_ukr_words > tot_by_words
           and tot_ukr_words > tot_kz_words):
         def_lang = MorphLang.UA
     elif (pure_kz_words > pure_rus_words and pure_kz_words > pure_ukr_words
           and pure_kz_words > pure_by_words):
         def_lang = MorphLang.KZ
     elif (tot_kz_words > tot_rus_words and tot_kz_words > tot_ukr_words
           and tot_kz_words > tot_by_words):
         def_lang = MorphLang.KZ
     elif (pure_by_words > pure_rus_words and pure_by_words > pure_ukr_words
           and pure_by_words > pure_kz_words):
         def_lang = MorphLang.BY
     elif (tot_by_words > tot_rus_words and tot_by_words > tot_ukr_words
           and tot_by_words > tot_kz_words):
         # Belarusian by totals only needs a clear margin over Russian.
         if (tot_rus_words > 10 and tot_by_words > (tot_rus_words + 20)):
             def_lang = MorphLang.BY
         elif (tot_rus_words == 0 or tot_by_words >= (tot_rus_words * 2)):
             def_lang = MorphLang.BY
     if (((def_lang.is_undefined or def_lang.is_ua)) and tot_rus_words > 0):
         # Another language outnumbers Russian and its engine reports that
         # language (presumably: its dictionary is loaded - TODO confirm):
         # recount using real dictionary lookups on a sample of terms.
         if (((tot_ukr_words > tot_rus_words
               and InnerMorphology.M_ENGINE_UA.language.is_ua))
                 or ((tot_by_words > tot_rus_words
                      and InnerMorphology.M_ENGINE_BY.language.is_by))
                 or ((tot_kz_words > tot_rus_words
                      and InnerMorphology.M_ENGINE_KZ.language.is_kz))):
             cou0 = 0
             # The chained assignments reset all four totals to zero.
             tot_kz_words = 0
             tot_ukr_words = tot_kz_words
             tot_by_words = tot_ukr_words
             tot_rus_words = tot_by_words
             for kp in uni_lex.items():
                 lang = MorphLang()
                 wraplang14 = RefOutArgWrapper(lang)
                 kp[1].word_forms = self.__process_one_word(
                     kp[0], wraplang14)
                 lang = wraplang14.value
                 # Add the languages of all found word forms.
                 if (kp[1].word_forms is not None):
                     for wf in kp[1].word_forms:
                         lang |= wf.language
                 kp[1].lang = lang
                 if (lang.is_ru):
                     tot_rus_words += 1
                 if (lang.is_ua):
                     tot_ukr_words += 1
                 if (lang.is_by):
                     tot_by_words += 1
                 if (lang.is_kz):
                     tot_kz_words += 1
                 # The sample is limited to 100 Cyrillic terms.
                 if (lang.is_cyrillic):
                     cou0 += 1
                 if (cou0 >= 100):
                     break
             # Kazakh is not considered in this decision.
             if (tot_rus_words > ((math.floor(tot_by_words / 2)))
                     and tot_rus_words > ((math.floor(tot_ukr_words / 2)))):
                 def_lang = MorphLang.RU
             elif (tot_ukr_words > ((math.floor(tot_rus_words / 2)))
                   and tot_ukr_words > ((math.floor(tot_by_words / 2)))):
                 def_lang = MorphLang.UA
             elif (tot_by_words > ((math.floor(tot_rus_words / 2)))
                   and tot_by_words > ((math.floor(tot_ukr_words / 2)))):
                 def_lang = MorphLang.BY
         elif (def_lang.is_undefined):
             def_lang = MorphLang.RU
     # --- Phase 3: word forms for every unique term.
     cou = 0
     # Reset the totals again; here they count already processed terms.
     tot_kz_words = 0
     tot_ukr_words = tot_kz_words
     tot_by_words = tot_ukr_words
     tot_rus_words = tot_by_words
     for kp in uni_lex.items():
         lang = def_lang
         if (lang.is_undefined):
             # No default language: use the language leading among the
             # terms processed so far as the hint.
             if (tot_rus_words > tot_by_words
                     and tot_rus_words > tot_ukr_words
                     and tot_rus_words > tot_kz_words):
                 lang = MorphLang.RU
             elif (tot_ukr_words > tot_rus_words
                   and tot_ukr_words > tot_by_words
                   and tot_ukr_words > tot_kz_words):
                 lang = MorphLang.UA
             elif (tot_by_words > tot_rus_words
                   and tot_by_words > tot_ukr_words
                   and tot_by_words > tot_kz_words):
                 lang = MorphLang.BY
             elif (tot_kz_words > tot_rus_words
                   and tot_kz_words > tot_ukr_words
                   and tot_kz_words > tot_by_words):
                 lang = MorphLang.KZ
         # The wrapper passes lang in and receives the resulting language.
         wraplang15 = RefOutArgWrapper(lang)
         kp[1].word_forms = self.__process_one_word(kp[0], wraplang15)
         lang = wraplang15.value
         kp[1].lang = lang
         if ((((lang) & MorphLang.RU)) != MorphLang.UNKNOWN):
             tot_rus_words += 1
         if ((((lang) & MorphLang.UA)) != MorphLang.UNKNOWN):
             tot_ukr_words += 1
         if ((((lang) & MorphLang.BY)) != MorphLang.UNKNOWN):
             tot_by_words += 1
         if ((((lang) & MorphLang.KZ)) != MorphLang.UNKNOWN):
             tot_kz_words += 1
         if (progress is not None):
             self.__on_progress(cou, len(uni_lex), progress)
         cou += 1
     # NOTE(review): debug_token is assigned below but never read
     # (debugging leftover).
     debug_token = None
     # One empty list instance is shared by all tokens without word forms.
     empty_list = None
     # --- Phase 4: move the word forms from the wrappers to the tokens.
     for r in res:
         uni = Utils.asObjectOrNull(r.tag, InnerMorphology.UniLexWrap)
         r.tag = None
         if (uni is None or uni.word_forms is None
                 or len(uni.word_forms) == 0):
             if (empty_list is None):
                 empty_list = list()
             r.word_forms = empty_list
             if (uni is not None):
                 r.language = uni.lang
         else:
             # The list object is shared by all tokens with the same term.
             r.word_forms = uni.word_forms
         if (r.begin_char == 733860):
             debug_token = r
     # --- Phase 5: token repair passes over triples res[i], res[i+1],
     # res[i+2]; skipped for good_text.
     if (not good_text):
         i = 0
         first_pass2885 = True
         while True:
             if first_pass2885: first_pass2885 = False
             else: i += 1
             # len(res) is re-evaluated because tokens are deleted in place.
             if (not (i < (len(res) - 2))): break
             # Character info of the first character of each of the tokens.
             ui0 = twrch[res[i].begin_char]
             ui1 = twrch[res[i + 1].begin_char]
             ui2 = twrch[res[i + 2].begin_char]
             if (ui1.is_quot):
                 p = res[i + 1].begin_char
                 # Quote between Б/Т and Е/Я/Ё: try it as a hard sign "Ъ".
                 if ((p >= 2 and "БбТт".find(text[p - 1]) >= 0 and
                      ((p + 3) < len(text)))
                         and "ЕеЯяЁё".find(text[p + 1]) >= 0):
                     wstr = LanguageHelper.transliteral_correction(
                         LanguageHelper.correct_word("{0}Ъ{1}".format(
                             res[i].get_source_text(text),
                             res[i + 2].get_source_text(text))), None,
                         False)
                     li = self.__process_one_word0(wstr)
                     # Glue only if the joined word is a dictionary word.
                     if (li is not None and len(li) > 0
                             and li[0].is_in_dictionary):
                         res[i].end_char = res[i + 2].end_char
                         res[i].term = wstr
                         res[i].word_forms = li
                         # Remove the quote token and the second part.
                         del res[i + 1:i + 1 + 2]
                 # Apostrophe between two letters: glued when Ukrainian is
                 # the default language or the language of a neighbour.
                 elif ((ui1.is_apos and p > 0 and str.isalpha(text[p - 1]))
                       and ((p + 1) < len(text))
                       and str.isalpha(text[p + 1])):
                     if (def_lang == MorphLang.UA
                             or (((res[i].language) & MorphLang.UA)) !=
                             MorphLang.UNKNOWN
                             or (((res[i + 2].language) & MorphLang.UA)) !=
                             MorphLang.UNKNOWN):
                         wstr = LanguageHelper.transliteral_correction(
                             LanguageHelper.correct_word("{0}{1}".format(
                                 res[i].get_source_text(text),
                                 res[i + 2].get_source_text(text))), None,
                             False)
                         li = self.__process_one_word0(wstr)
                         # okk is always True: unlike the other branches,
                         # gluing here does not depend on the dictionary.
                         okk = True
                         if (okk):
                             res[i].end_char = res[i + 2].end_char
                             res[i].term = wstr
                             if (li is None):
                                 li = list()
                             res[i].word_forms = li
                             if (li is not None and len(li) > 0):
                                 res[i].language = li[0].language
                             del res[i + 1:i + 1 + 2]
             # A one-character token '3' or '4' is tried as the Cyrillic
             # letter "З" or "Ч" joined with adjacent Cyrillic tokens.
             elif (((ui1.uni_char == '3' or ui1.uni_char == '4'))
                   and res[i + 1].length == 1):
                 src = ("З" if ui1.uni_char == '3' else "Ч")
                 # i0..i1 is the range of tokens forming the candidate word.
                 i0 = i + 1
                 if ((res[i].end_char + 1) == res[i + 1].begin_char
                         and ui0.is_cyrillic):
                     i0 -= 1
                     src = (res[i0].get_source_text(text) + src)
                 i1 = i + 1
                 if ((res[i + 1].end_char + 1) == res[i + 2].begin_char
                         and ui2.is_cyrillic):
                     i1 += 1
                     src += res[i1].get_source_text(text)
                 if (len(src) > 2):
                     wstr = LanguageHelper.transliteral_correction(
                         LanguageHelper.correct_word(src), None, False)
                     li = self.__process_one_word0(wstr)
                     if (li is not None and len(li) > 0
                             and li[0].is_in_dictionary):
                         res[i0].end_char = res[i1].end_char
                         res[i0].term = wstr
                         res[i0].word_forms = li
                         # Equivalent to del res[i0 + 1:i1 + 1].
                         del res[i0 + 1:i0 + 1 + i1 - i0]
             # Hyphen between two letter tokens, each longer than one
             # character: check whether this is a hyphenated word wrap.
             elif ((ui1.is_hiphen and ui0.is_letter and ui2.is_letter)
                   and res[i].end_char > res[i].begin_char
                   and res[i + 2].end_char > res[i + 2].begin_char):
                 # newline: True when the parts should be tried as one word.
                 # sps: whitespace characters (line breaks included) between
                 # the hyphen and the next token.
                 newline = False
                 sps = 0
                 j = (res[i + 1].end_char + 1)
                 while j < res[i + 2].begin_char:
                     if (text[j] == '\r' or text[j] == '\n'):
                         newline = True
                         sps += 1
                     elif (not Utils.isWhitespace(text[j])):
                         break
                     else:
                         sps += 1
                     j += 1
                 # Both parts joined without the hyphen.
                 full_word = LanguageHelper.correct_word(
                     res[i].get_source_text(text) +
                     res[i + 2].get_source_text(text))
                 if (not newline):
                     # No line break: other hints for joining the parts.
                     if (full_word in uni_lex or full_word == "ИЗЗА"):
                         newline = True
                     # U+00AD is the soft hyphen character.
                     elif (text[res[i + 1].begin_char] == (chr(0x00AD))):
                         newline = True
                     # First part ends with "О"/"о" and the second part is
                     # a dictionary word: join only for the '¬' character
                     # and only if the joined word is in the dictionary.
                     elif (LanguageHelper.ends_with_ex(
                             res[i].get_source_text(text), "О", "о", None,
                             None) and len(res[i + 2].word_forms) > 0
                           and res[i + 2].word_forms[0].is_in_dictionary):
                         if (text[res[i + 1].begin_char] == '¬'):
                             li = self.__process_one_word0(full_word)
                             if (li is not None and len(li) > 0
                                     and li[0].is_in_dictionary):
                                 newline = True
                     # Exactly one character between the two parts.
                     elif ((res[i].end_char + 2) == res[i + 2].begin_char):
                         if (not str.isupper(text[res[i + 2].begin_char])
                                 and (sps < 2) and len(full_word) > 4):
                             newline = True
                             # Not joined if one more hyphen follows.
                             if ((i + 3) < len(res)):
                                 ui3 = twrch[res[i + 3].begin_char]
                                 if (ui3.is_hiphen):
                                     newline = False
                     # Hyphen right after the first part, followed by one
                     # or two whitespace characters.
                     elif (((res[i].end_char + 1) == res[i + 1].begin_char
                            and sps > 0 and (sps < 3))
                           and len(full_word) > 4):
                         newline = True
                 if (newline):
                     li = self.__process_one_word0(full_word)
                     # Glue if the joined word is in the dictionary or is
                     # already present in the text as a separate term.
                     if (li is not None and len(li) > 0
                             and ((li[0].is_in_dictionary
                                   or full_word in uni_lex))):
                         res[i].end_char = res[i + 2].end_char
                         res[i].term = full_word
                         res[i].word_forms = li
                         del res[i + 1:i + 1 + 2]
                 else:
                     pass
             # Two consecutive Cyrillic letter tokens with a line break
             # between them: a word broken without a hyphen.
             elif ((ui1.is_letter and ui0.is_letter and res[i].length > 2)
                   and res[i + 1].length > 1):
                 # First characters must be of the same case.
                 if (ui0.is_upper != ui1.is_upper):
                     continue
                 if (not ui0.is_cyrillic or not ui1.is_cyrillic):
                     continue
                 newline = False
                 j = (res[i].end_char + 1)
                 while j < res[i + 1].begin_char:
                     # Codes 0xD and 0xA are CR and LF.
                     if (twrch[j].code == 0xD or twrch[j].code == 0xA):
                         newline = True
                         break
                     j += 1
                 if (not newline):
                     continue
                 full_word = LanguageHelper.correct_word(
                     res[i].get_source_text(text) +
                     res[i + 1].get_source_text(text))
                 # The joined word must occur elsewhere in the text ...
                 if (not full_word in uni_lex):
                     continue
                 li = self.__process_one_word0(full_word)
                 # ... and must be a dictionary word.
                 if (li is not None and len(li) > 0
                         and li[0].is_in_dictionary):
                     res[i].end_char = res[i + 1].end_char
                     res[i].term = full_word
                     res[i].word_forms = li
                     del res[i + 1]
     # --- Phase 6: character information (alphabet and case) of tokens.
     i = 0
     first_pass2886 = True
     while True:
         if first_pass2886: first_pass2886 = False
         else: i += 1
         if (not (i < len(res))): break
         mt = res[i]
         mt.char_info = CharsInfo()
         # ui0: info of a source character; ui00: info of the first
         # character of the term (the term may differ from the source text
         # after transliteration / correction).
         ui0 = twrch[mt.begin_char]
         ui00 = UnicodeInfo.ALL_CHARS[ord((res[i].term[0]))]
         # Move ui0 to the first letter inside the token, if there is one.
         j = (mt.begin_char + 1)
         while j <= mt.end_char:
             if (ui0.is_letter):
                 break
             ui0 = twrch[j]
             j += 1
         if (ui0.is_letter):
             res[i].char_info.is_letter = True
             if (ui00.is_latin):
                 res[i].char_info.is_latin_letter = True
             elif (ui00.is_cyrillic):
                 res[i].char_info.is_cyrillic_letter = True
             # Cyrillic token with unknown language gets the default one.
             if (res[i].language == MorphLang.UNKNOWN):
                 if (LanguageHelper.is_cyrillic(mt.term)):
                     res[i].language = (MorphLang.RU if
                                        def_lang.is_undefined else def_lang)
             # For good_text the case flags below are not computed.
             if (good_text):
                 continue
             # Digits are counted as upper-case characters here.
             all_up = True
             all_lo = True
             j = mt.begin_char
             while j <= mt.end_char:
                 if (twrch[j].is_upper or twrch[j].is_digit):
                     all_lo = False
                 else:
                     all_up = False
                 j += 1
             if (all_up):
                 mt.char_info.is_all_upper = True
             elif (all_lo):
                 mt.char_info.is_all_lower = True
             elif (((ui0.is_upper or twrch[mt.begin_char].is_digit))
                   and mt.end_char > mt.begin_char):
                 # Mixed case: check "first upper, the rest lower".
                 all_lo = True
                 j = (mt.begin_char + 1)
                 while j <= mt.end_char:
                     if (twrch[j].is_upper or twrch[j].is_digit):
                         all_lo = False
                         break
                     j += 1
                 if (all_lo):
                     mt.char_info.is_capital_upper = True
                 elif (twrch[mt.end_char].is_lower
                       and (mt.end_char - mt.begin_char) > 1):
                     # Last character lower and no lower characters before
                     # it.
                     all_up = True
                     j = mt.begin_char
                     while j < mt.end_char:
                         if (twrch[j].is_lower):
                             all_up = False
                             break
                         j += 1
                     if (all_up):
                         mt.char_info.is_last_lower = True
         if (mt.char_info.is_last_lower and mt.length > 2
                 and mt.char_info.is_cyrillic_letter):
             # pref is the source text of the token without its last
             # character (equivalent to text[begin_char:end_char]).
             pref = text[mt.begin_char:mt.begin_char + mt.end_char -
                         mt.begin_char]
             ok = False
             for wf in mt.word_forms:
                 if (wf.normal_case == pref or wf.normal_full == pref):
                     ok = True
                     break
             if (not ok):
                 # Copy first: the list may be shared with other tokens.
                 mt.word_forms = list(mt.word_forms)
                 # Put a noun form with pref as the first variant.
                 mt.word_forms.insert(
                     0, MorphWordForm._new16(pref, MorphClass.NOUN, 1))
     if (good_text or only_tokenizing):
         return res
     # --- Phase 7: single Latin letters inside Cyrillic context.
     i = 0
     first_pass2887 = True
     while True:
         if first_pass2887: first_pass2887 = False
         else: i += 1
         if (not (i < len(res))): break
         if (res[i].length == 1 and res[i].char_info.is_latin_letter):
             ch = res[i].term[0]
             # Only the letters C, A and P are handled (presumably because
             # they look like Cyrillic letters - TODO confirm).
             if (ch == 'C' or ch == 'A' or ch == 'P'):
                 pass
             else:
                 continue
             is_rus = False
             # Nearest letter token to the left within a gap-free chain.
             for ii in range(i - 1, -1, -1):
                 if ((res[ii].end_char + 1) != res[ii + 1].begin_char):
                     break
                 elif (res[ii].char_info.is_letter):
                     is_rus = res[ii].char_info.is_cyrillic_letter
                     break
             if (not is_rus):
                 # The same search to the right.
                 ii = i + 1
                 while ii < len(res):
                     if ((res[ii - 1].end_char + 1) != res[ii].begin_char):
                         break
                     elif (res[ii].char_info.is_letter):
                         is_rus = res[ii].char_info.is_cyrillic_letter
                         break
                     ii += 1
             if (is_rus):
                 res[i].term = LanguageHelper.transliteral_correction(
                     res[i].term, None, True)
                 # Both alphabet flags are set for such a token.
                 res[i].char_info.is_cyrillic_letter = True
                 res[i].char_info.is_latin_letter = True
     # --- Phase 8: surname variants for upper-case or capitalized tokens of
     # a Cyrillic language that have no proper-surname word form yet.
     for r in res:
         if (r.char_info.is_all_upper or r.char_info.is_capital_upper):
             if (r.language.is_cyrillic):
                 ok = False
                 for wf in r.word_forms:
                     if (wf.class0_.is_proper_surname):
                         ok = True
                         break
                 if (not ok):
                     # Copy first: the list may be shared with other tokens.
                     r.word_forms = list(r.word_forms)
                     InnerMorphology.M_ENGINE_RU.process_surname_variants(
                         r.term, r.word_forms)
     # Word forms without a normal form get the token term.
     for r in res:
         for mv in r.word_forms:
             if (mv.normal_case is None):
                 mv.normal_case = r.term
     # --- Phase 9: single upper-case Latin letter + quote + Latin word with
     # no gaps between them are merged into one token.
     i = 0
     while i < (len(res) - 2):
         if (res[i].char_info.is_latin_letter
                 and res[i].char_info.is_all_upper and res[i].length == 1):
             if (twrch[res[i + 1].begin_char].is_quot
                     and res[i + 2].char_info.is_latin_letter
                     and res[i + 2].length > 2):
                 if ((res[i].end_char + 1) == res[i + 1].begin_char and
                     (res[i + 1].end_char + 1) == res[i + 2].begin_char):
                     wstr = "{0}{1}".format(res[i].term, res[i + 2].term)
                     li = self.__process_one_word0(wstr)
                     if (li is not None):
                         res[i].word_forms = li
                     res[i].end_char = res[i + 2].end_char
                     res[i].term = wstr
                     # Adjust the case flags of the merged token.
                     if (res[i + 2].char_info.is_all_lower):
                         res[i].char_info.is_all_upper = False
                         res[i].char_info.is_capital_upper = True
                     elif (not res[i + 2].char_info.is_all_upper):
                         res[i].char_info.is_all_upper = False
                     del res[i + 1:i + 1 + 2]
         i += 1
     # --- Phase 10: exactly two adjacent hyphen tokens are merged into one.
     i = 0
     first_pass2888 = True
     while True:
         if first_pass2888: first_pass2888 = False
         else: i += 1
         if (not (i < (len(res) - 1))): break
         if (not res[i].char_info.is_letter
                 and not res[i + 1].char_info.is_letter
                 and (res[i].end_char + 1) == res[i + 1].begin_char):
             if (twrch[res[i].begin_char].is_hiphen
                     and twrch[res[i + 1].begin_char].is_hiphen):
                 # Skip if a hyphen token also precedes the pair ...
                 if (i == 0 or not twrch[res[i - 1].begin_char].is_hiphen):
                     pass
                 else:
                     continue
                 # ... or follows it.
                 if ((i + 2) == len(res)
                         or not twrch[res[i + 2].begin_char].is_hiphen):
                     pass
                 else:
                     continue
                 res[i].end_char = res[i + 1].end_char
                 del res[i + 1]
     return res

Example #8

Show file

 def __try1(li: typing.List['CityItemToken'], oi: 'IntOntologyItem',
            ad: 'AnalyzerDataWithOntology') -> 'ReferentToken':
     oi.value = (None)
     if (li is None or (len(li) < 1)):
         return None
     elif (li[0].typ != CityItemToken.ItemType.CITY):
         if (len(li) != 2 or li[0].typ != CityItemToken.ItemType.PROPERNAME
                 or li[1].typ != CityItemToken.ItemType.NOUN):
             return None
     i = 1
     oi.value = li[0].onto_item
     ok = not li[0].doubtful
     if ((ok and li[0].onto_item is not None
          and li[0].onto_item.misc_attr is None) and ad is not None):
         if (li[0].onto_item.owner != ad.local_ontology
                 and not li[0].onto_item.owner.is_ext_ontology):
             if (li[0].begin_token.previous is not None
                     and li[0].begin_token.previous.is_value("В", None)):
                 pass
             else:
                 ok = False
     if (len(li) == 1 and li[0].begin_token.morph.class0_.is_adjective):
         sits = StreetItemToken.try_parse_list(li[0].begin_token, None, 3)
         if (sits is not None and len(sits) == 2
                 and sits[1].typ == StreetItemType.NOUN):
             return None
     typ = None
     alttyp = None
     mc = li[0].morph
     if (i < len(li)):
         if (li[i].typ == CityItemToken.ItemType.NOUN):
             at = None
             if (not li[i].chars.is_all_lower
                     and (li[i].whitespaces_after_count < 2)):
                 sit = StreetItemToken.try_parse(li[i].end_token.next0_,
                                                 None, False, None, False)
                 if (sit is not None and sit.typ == StreetItemType.NOUN):
                     at = AddressItemToken.try_parse(
                         li[i].begin_token, None, False, False, None)
                     if (at is not None):
                         at2 = AddressItemToken.try_parse(
                             li[i].end_token.next0_, None, False, False,
                             None)
                         if (at2 is not None and at2.typ
                                 == AddressItemToken.ItemType.STREET):
                             at = (None)
             if (at is None):
                 typ = li[i].value
                 alttyp = li[i].alt_value
                 if (li[i].begin_token.is_value("СТ", None)
                         and li[i].begin_token.chars.is_all_upper):
                     return None
                 if ((i + 1) == len(li)):
                     ok = True
                     if (not li[i].morph.case_.is_undefined):
                         mc = li[i].morph
                     i += 1
                 elif (ok):
                     i += 1
                 else:
                     tt0 = li[0].begin_token.previous
                     if ((isinstance(tt0, TextToken))
                             and (tt0.whitespaces_after_count < 3)):
                         if (tt0.is_value("МЭР", "МЕР")
                                 or tt0.is_value("ГЛАВА", None)
                                 or tt0.is_value("ГРАДОНАЧАЛЬНИК", None)):
                             ok = True
                             i += 1
     if (not ok and oi.value is not None
             and (len(oi.value.canonic_text) < 4)):
         return None
     if (not ok and li[0].begin_token.morph.class0_.is_proper_name):
         return None
     if (not ok):
         if (not MiscHelper.is_exists_in_dictionary(
                 li[0].begin_token, li[0].end_token, (MorphClass.ADJECTIVE)
                 | MorphClass.NOUN | MorphClass.PRONOUN)):
             ok = (li[0].geo_object_before or li[i - 1].geo_object_after)
             if (ok and li[0].begin_token == li[0].end_token):
                 mcc = li[0].begin_token.get_morph_class_in_dictionary()
                 if (mcc.is_proper_name or mcc.is_proper_surname):
                     ok = False
                 elif (li[0].geo_object_before
                       and (li[0].whitespaces_after_count < 2)):
                     ad1 = AddressItemToken.try_parse(
                         li[0].begin_token, None, False, False, None)
                     if (ad1 is not None and ad1.typ
                             == AddressItemToken.ItemType.STREET):
                         ad2 = AddressItemToken.try_parse(
                             li[0].end_token.next0_, None, False, False,
                             None)
                         if (ad2 is None or ad2.typ !=
                                 AddressItemToken.ItemType.STREET):
                             ok = False
                     elif (AddressItemToken.try_attach_org(
                             li[0].begin_token) is not None):
                         ok = False
         if (ok):
             if (li[0].kit.process_referent("PERSON", li[0].begin_token)
                     is not None):
                 ok = False
     if (not ok):
         ok = CityAttachHelper.check_year_after(li[0].end_token.next0_)
     if (not ok and ((not li[0].begin_token.morph.class0_.is_adjective
                      or li[0].begin_token != li[0].end_token))):
         ok = CityAttachHelper.check_city_after(li[0].end_token.next0_)
     if (not ok):
         return None
     if (i < len(li)):
         del li[i:i + len(li) - i]
     rt = None
     if (oi.value is None):
         if (li[0].value is not None and li[0].higher_geo is not None):
             cap = GeoReferent()
             cap._add_name(li[0].value)
             cap._add_typ_city(li[0].kit.base_language)
             cap.higher = li[0].higher_geo
             if (typ is not None):
                 cap._add_typ(typ)
             if (alttyp is not None):
                 cap._add_typ(alttyp)
             rt = ReferentToken(cap, li[0].begin_token, li[0].end_token)
         else:
             if (li[0].value is None):
                 return None
             if (typ is None):
                 if ((len(li) == 1
                      and li[0].begin_token.previous is not None
                      and li[0].begin_token.previous.is_hiphen) and
                     (isinstance(li[0].begin_token.previous.previous,
                                 ReferentToken)) and
                     (isinstance(
                         li[0].begin_token.previous.previous.get_referent(),
                         GeoReferent))):
                     pass
                 else:
                     return None
             else:
                 if (not LanguageHelper.ends_with_ex(
                         typ, "ПУНКТ", "ПОСЕЛЕНИЕ", "ПОСЕЛЕННЯ",
                         "ПОСЕЛОК")):
                     if (not LanguageHelper.ends_with(typ, "CITY")):
                         if (typ == "СТАНЦИЯ" and
                             ((MiscLocationHelper.check_geo_object_before(
                                 li[0].begin_token)))):
                             pass
                         elif (len(li) > 1
                               and li[1].typ == CityItemToken.ItemType.NOUN
                               and li[0].typ
                               == CityItemToken.ItemType.CITY):
                             pass
                         elif ((len(li) == 2 and li[1].typ
                                == CityItemToken.ItemType.NOUN and li[0].typ
                                == CityItemToken.ItemType.PROPERNAME)
                               and ((li[0].geo_object_before
                                     or li[1].geo_object_after))):
                             pass
                         else:
                             return None
                 if (li[0].begin_token.morph.class0_.is_adjective):
                     li[0].value = ProperNameHelper.get_name_ex(
                         li[0].begin_token, li[0].end_token,
                         MorphClass.ADJECTIVE, li[1].morph.case_,
                         li[1].morph.gender, False, False)
     elif (isinstance(oi.value.referent, GeoReferent)):
         city = Utils.asObjectOrNull(oi.value.referent.clone(), GeoReferent)
         city.occurrence.clear()
         rt = ReferentToken._new734(city, li[0].begin_token,
                                    li[len(li) - 1].end_token, mc)
     elif (typ is None):
         typ = oi.value.typ
     if (rt is None):
         city = GeoReferent()
         city._add_name(
             (li[0].value if oi.value is None else oi.value.canonic_text))
         if (typ is not None):
             city._add_typ(typ)
         else:
             city._add_typ_city(li[0].kit.base_language)
         if (alttyp is not None):
             city._add_typ(alttyp)
         rt = ReferentToken._new734(city, li[0].begin_token,
                                    li[len(li) - 1].end_token, mc)
     if ((isinstance(rt.referent, GeoReferent)) and len(li) == 1
             and rt.referent.is_city):
         if (rt.begin_token.previous is not None
                 and rt.begin_token.previous.is_value("Г", None)):
             rt.begin_token = rt.begin_token.previous
         elif ((rt.begin_token.previous is not None
                and rt.begin_token.previous.is_char('.')
                and rt.begin_token.previous.previous is not None)
               and rt.begin_token.previous.previous.is_value("Г", None)):
             rt.begin_token = rt.begin_token.previous.previous
         elif (rt.end_token.next0_ is not None
               and (rt.whitespaces_after_count < 2)
               and rt.end_token.next0_.is_value("Г", None)):
             rt.end_token = rt.end_token.next0_
             if (rt.end_token.next0_ is not None
                     and rt.end_token.next0_.is_char('.')):
                 rt.end_token = rt.end_token.next0_
     return rt