def add(self, val: str, shortval: str, gen: 'MorphGender', add_other_gender_var: bool = False) -> None: if (val is None): return if (self.head is None): if (len(val) > 3): self.head = val[0:0 + 3] else: self.head = val if (gen == MorphGender.MASCULINE or gen == MorphGender.FEMINIE): for it in self.items: if (it.value == val and it.gender == gen): return self.items.append( PersonMorphCollection.PersonMorphVariant._new2434( val, gen, shortval)) if (add_other_gender_var): g0 = (MorphGender.MASCULINE if gen == MorphGender.FEMINIE else MorphGender.FEMINIE) v = Morphology.getWordform( val, MorphBaseInfo._new210(MorphClass._new2415(True), g0)) if (v is not None): self.items.append( PersonMorphCollection.PersonMorphVariant._new2434( v, g0, shortval)) else: self.add(val, shortval, MorphGender.MASCULINE, False) self.add(val, shortval, MorphGender.FEMINIE, False)
def initialize(lang: 'MorphLang' = None) -> None: """ Инициализация сервиса. Внимание! После этого нужно инициализровать анализаторы (см. документацию) <param name="lang">необходимые языки (по умолчанию, русский и английский)</param> """ from pullenti.ner.core.internal.NumberExHelper import NumberExHelper from pullenti.ner.core.internal.NounPhraseItem import NounPhraseItem if (ProcessorService.__m_inited): return ProcessorService.__m_inited = True Morphology.initialize(lang) Explanatory.initialize(lang) Termin.ASSIGN_ALL_TEXTS_AS_NORMAL = True NounPhraseItem._initialize() NumberHelper._initialize() NumberExHelper._initialize() BlockLine.initialize() Termin.ASSIGN_ALL_TEXTS_AS_NORMAL = False
def __mergeLetters(self) -> None: before_word = False tmp = io.StringIO() t = self.first_token first_pass2800 = True while True: if first_pass2800: first_pass2800 = False else: t = t.next0_ if (not (t is not None)): break tt = Utils.asObjectOrNull(t, TextToken) if (not tt.chars.is_letter or tt.length_char != 1): before_word = False continue i = t.whitespaces_before_count if (i > 2 or ((i == 2 and before_word))): pass else: before_word = False continue i = 0 Utils.setLengthStringIO(tmp, 0) print(tt.getSourceText(), end="", file=tmp) t1 = t while t1.next0_ is not None: tt = (Utils.asObjectOrNull(t1.next0_, TextToken)) if (tt.length_char != 1 or tt.whitespaces_before_count != 1): break i += 1 print(tt.getSourceText(), end="", file=tmp) t1 = t1.next0_ if (i > 3 or ((i > 1 and before_word))): pass else: before_word = False continue before_word = False mt = Morphology.process(Utils.toStringStringIO(tmp), None, None) if (mt is None or len(mt) != 1): t = t1 continue for wf in mt[0].word_forms: if (wf.is_in_dictionary): before_word = True break if (not before_word): t = t1 continue tt = TextToken(mt[0], self) if (t == self.first_token): self.first_token = (tt) else: tt.previous = t.previous tt.next0_ = t1.next0_ tt.begin_char = t.begin_char tt.end_char = t1.end_char t = (tt)
def getMorphVariant(self, cas: 'MorphCase', plural: bool) -> str: """ Сгенерировать текст именной группы в нужном падеже и числе Args: cas(MorphCase): plural(bool): """ mi = MorphBaseInfo._new551(cas, MorphLang.RU) if (plural): mi.number = MorphNumber.PLURAL else: mi.number = MorphNumber.SINGULAR res = None for a in self.adjectives: tt = MiscHelper.getTextValueOfMetaToken(a, GetTextAttr.NO) if (a.begin_token != a.end_token or not ((isinstance(a.begin_token, TextToken)))): pass else: tt2 = Morphology.getWordform(tt, mi) if (tt2 is not None): tt = tt2 if (res is None): res = tt else: res = "{0} {1}".format(res, tt) if (self.noun is not None): tt = MiscHelper.getTextValueOfMetaToken(self.noun, GetTextAttr.NO) if (self.noun.begin_token != self.noun.end_token or not ((isinstance(self.noun.begin_token, TextToken)))): pass else: tt2 = Morphology.getWordform(tt, mi) if (tt2 is not None): tt = tt2 if (res is None): res = tt else: res = "{0} {1}".format(res, tt) return res
def __correctWordsByMerging(self, lang : 'MorphLang') -> None: t = self.first_token first_pass2798 = True while True: if first_pass2798: first_pass2798 = False else: t = t.next0_ if (not (t is not None and t.next0_ is not None)): break if (not t.chars.is_letter or (t.length_char < 2)): continue mc0 = t.getMorphClassInDictionary() if (t.morph.containsAttr("прдктв.", None)): continue t1 = t.next0_ if (t1.is_hiphen and t1.next0_ is not None and not t1.is_newline_after): t1 = t1.next0_ if (t1.length_char == 1): continue if (not t1.chars.is_letter or not t.chars.is_letter or t1.chars.is_latin_letter != t.chars.is_latin_letter): continue if (t1.chars.is_all_upper and not t.chars.is_all_upper): continue elif (not t1.chars.is_all_lower): continue elif (t.chars.is_all_upper): continue if (t1.morph.containsAttr("прдктв.", None)): continue mc1 = t1.getMorphClassInDictionary() if (not mc1.is_undefined and not mc0.is_undefined): continue if ((len((t).term) + len((t1).term)) < 6): continue corw = (t).term + (t1).term ccc = Morphology.process(corw, lang, None) if (ccc is None or len(ccc) != 1): continue if (corw == "ПОСТ" or corw == "ВРЕД"): continue tt = TextToken(ccc[0], self) if (tt.getMorphClassInDictionary().is_undefined): continue tt.begin_char = t.begin_char tt.end_char = t1.end_char tt.chars = t.chars if (t == self.first_token): self.first_token = (tt) else: t.previous.next0_ = tt if (t1.next0_ is not None): tt.next0_ = t1.next0_ t = (tt)
def __correctWordsByMorph(self, lang : 'MorphLang') -> None: tt = self.first_token first_pass2799 = True while True: if first_pass2799: first_pass2799 = False else: tt = tt.next0_ if (not (tt is not None)): break if (not ((isinstance(tt, TextToken)))): continue if (tt.morph.containsAttr("прдктв.", None)): continue dd = tt.getMorphClassInDictionary() if (not dd.is_undefined or (tt.length_char < 4)): continue if (tt.morph.class0_.is_proper_surname and not tt.chars.is_all_lower): continue if (tt.chars.is_all_upper): continue corw = Morphology.correctWord((tt).term, (lang if tt.morph.language.is_undefined else tt.morph.language)) if (corw is None): continue ccc = Morphology.process(corw, lang, None) if (ccc is None or len(ccc) != 1): continue tt1 = TextToken._new541(ccc[0], self, tt.chars, tt.begin_char, tt.end_char, (tt).term) mc = tt1.getMorphClassInDictionary() if (mc.is_proper_surname): continue if (tt == self.first_token): self.first_token = (tt1) else: tt.previous.next0_ = tt1 tt1.next0_ = tt.next0_ tt = (tt1) if (self.corrected_tokens is None): self.corrected_tokens = dict() self.corrected_tokens[tt] = tt.getSourceText()
def initByNormalText(self, text: str, lang_: 'MorphLang' = None) -> None: """ Быстрая инициализация без морф.вариантов, производится только токенизация текста. Используется для ускорения работы со словарём в случае, когда изначально известно, что на входе уже нормализованные строки Args: text(str): исходно нормализованный текст lang_(MorphLang): возможный язык """ if (Utils.isNullOrEmpty(text)): return text = text.upper() if (text.find('\'') >= 0): text = text.replace("'", "") tok = False sp = False for ch in text: if (not str.isalpha(ch)): if (ch == ' '): sp = True else: tok = True break if (not tok and not sp): tt = TextToken(None, None) tt.term = text self.terms.append(Termin.Term(tt, False)) elif (not tok and sp): wrds = Utils.splitString(text, ' ', False) i = 0 first_pass2811 = True while True: if first_pass2811: first_pass2811 = False else: i += 1 if (not (i < len(wrds))): break if (Utils.isNullOrEmpty(wrds[i])): continue tt = TextToken(None, None) tt.term = wrds[i] self.terms.append(Termin.Term(tt, False)) else: toks = Morphology.tokenize(text) if (toks is not None): i = 0 while i < len(toks): tt = TextToken(toks[i], None) self.terms.append(Termin.Term(tt, False)) i += 1 self.lang = MorphLang(lang_)
def is_verb_adjective(self) -> bool: """ Причастие """ if (self.__m_is_verb_adjective >= 0): return self.__m_is_verb_adjective > 0 for f in self.morph.items: if (f.class0_.is_adjective and (isinstance(f, MorphWordForm)) and not "к.ф." in (f).misc.attrs): return True self.__m_is_verb_adjective = 0 tt = Utils.asObjectOrNull(self.end_token, TextToken) if (tt is not None and tt.term.endswith("СЯ")): mb = Morphology.getWordBaseInfo(tt.term[0:0 + len(tt.term) - 2], None, False, False) if (mb is not None): if (mb.class0_.is_adjective): self.__m_is_verb_adjective = 1 return self.__m_is_verb_adjective > 0
def __init__(self, source: str = None, lang_: 'MorphLang' = None, source_is_normal: bool = False) -> None: """ Создать термин из строки с добавлением всех морфологических вариантов написания Args: source(str): строка lang_(MorphLang): возможный язык source_is_normal(bool): при true морфварианты не добавляются (эквивалентно вызову InitByNormalText) """ self.terms = list() self.additional_vars = None self.__m_canonic_text = None self.ignore_terms_order = False self.acronym = None self.acronym_smart = None self.acronym_can_be_lower = False self.abridges = None self.lang = MorphLang() self.tag = None self.tag2 = None if (source is None): return if (source_is_normal or Termin.ASSIGN_ALL_TEXTS_AS_NORMAL): self.initByNormalText(source, lang_) return toks = Morphology.process(source, lang_, None) if (toks is not None): i = 0 while i < len(toks): tt = TextToken(toks[i], None) self.terms.append(Termin.Term(tt, not source_is_normal)) i += 1 self.lang = MorphLang(lang_)
def __getNameWithoutBrackets(begin: 'Token', end: 'Token', normalize_first_noun_group: bool = False, normal_first_group_single: bool = False, ignore_geo_referent: bool = False) -> str: """ Получить строковое значение между токенами, при этом исключая кавычки и скобки Args: begin(Token): начальный токен end(Token): конечный токен normalize_first_noun_group(bool): нормализовывать ли первую именную группу (именит. падеж) normal_first_group_single(bool): приводить ли к единственному числу первую именную группу ignore_geo_referent(bool): игнорировать внутри географические сущности """ res = None if (BracketHelper.canBeStartOfSequence(begin, False, False) and BracketHelper.canBeEndOfSequence(end, False, begin, False)): begin = begin.next0_ end = end.previous if (normalize_first_noun_group and not begin.morph.class0_.is_preposition): npt = NounPhraseHelper.tryParse( begin, NounPhraseParseAttr.REFERENTCANBENOUN, 0) if (npt is not None): if (npt.noun.getMorphClassInDictionary().is_undefined and len(npt.adjectives) == 0): npt = (None) if (npt is not None and npt.end_token.end_char > end.end_char): npt = (None) if (npt is not None): res = npt.getNormalCaseText(None, normal_first_group_single, MorphGender.UNDEFINED, False) te = npt.end_token.next0_ if (((te is not None and te.next0_ is not None and te.is_comma) and (isinstance(te.next0_, TextToken)) and te.next0_.end_char <= end.end_char) and te.next0_.morph.class0_.is_verb and te.next0_.morph.class0_.is_adjective): for it in te.next0_.morph.items: if (it.gender == npt.morph.gender or (((it.gender) & (npt.morph.gender))) != (MorphGender.UNDEFINED)): if (not ( (it.case_) & npt.morph.case_).is_undefined): if (it.number == npt.morph.number or (((it.number) & (npt.morph.number))) != (MorphNumber.UNDEFINED)): var = (te.next0_).term if (isinstance(it, MorphWordForm)): var = (it).normal_case bi = MorphBaseInfo._new549( MorphClass.ADJECTIVE, npt.morph.gender, npt.morph.number, npt.morph.language) var = Morphology.getWordform(var, bi) if (var is not None): res = "{0}, {1}".format(res, var) te = te.next0_.next0_ break if (te is not None and te.end_char <= end.end_char): s = ProperNameHelper.getNameEx(te, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED, MorphGender.UNDEFINED, True, ignore_geo_referent) if (not Utils.isNullOrEmpty(s)): if (not str.isalnum(s[0])): res = "{0}{1}".format(res, s) else: res = "{0} {1}".format(res, s) elif ((isinstance(begin, TextToken)) and begin.chars.is_cyrillic_letter): mm = begin.getMorphClassInDictionary() if (not mm.is_undefined): res = begin.getNormalCaseText(mm, False, MorphGender.UNDEFINED, False) if (begin.end_char < end.end_char): res = "{0} {1}".format( res, ProperNameHelper.getNameEx(begin.next0_, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED, MorphGender.UNDEFINED, True, False)) if (res is None): res = ProperNameHelper.getNameEx(begin, end, MorphClass.UNDEFINED, MorphCase.UNDEFINED, MorphGender.UNDEFINED, True, ignore_geo_referent) if (not Utils.isNullOrEmpty(res)): k = 0 i = len(res) - 1 while i >= 0: if (res[i] == '*' or Utils.isWhitespace(res[i])): pass else: break i -= 1 k += 1 if (k > 0): if (k == len(res)): return None res = res[0:0 + len(res) - k] return res
def load_langs(langs): raw = langs_to_raw(langs) Morphology.load_languages(raw) Explanatory.load_languages(raw)
def loaded_langs(): raw = Morphology.get_loaded_languages() return raw_to_langs(raw)
def __init__(self, sofa_ : 'SourceOfAnalysis'=None, only_tokenizing : bool=False, lang : 'MorphLang'=None, progress : EventHandler=None) -> None: self._start_date = datetime.datetime(1, 1, 1, 0, 0, 0) self.corrected_tokens = None self.first_token = None; self.__m_entities = list() self.ontology = None; self.base_language = MorphLang() self.__m_sofa = None; self.statistics = None; self.__m_datas = dict() self.misc_data = dict() self.processor = None; self.recurse_level = 0 self._m_analyzer_stack = list() if (sofa_ is None): return self.__m_sofa = sofa_ self._start_date = datetime.datetime.now() tokens = Morphology.process(sofa_.text, lang, None) t0 = None if (tokens is not None): ii = 0 while ii < len(tokens): mt = tokens[ii] if (mt.begin_char == 733860): pass tt = TextToken(mt, self) if (sofa_.correction_dict is not None): wrapcorw539 = RefOutArgWrapper(None) inoutres540 = Utils.tryGetValue(sofa_.correction_dict, mt.term, wrapcorw539) corw = wrapcorw539.value if (inoutres540): ccc = Morphology.process(corw, lang, None) if (ccc is not None and len(ccc) == 1): tt1 = TextToken._new538(ccc[0], self, tt.term) tt1.begin_char = tt.begin_char tt1.end_char = tt.end_char tt1.chars = tt.chars tt = tt1 if (self.corrected_tokens is None): self.corrected_tokens = dict() self.corrected_tokens[tt] = tt.getSourceText() if (t0 is None): self.first_token = (tt) else: t0.next0_ = tt t0 = (tt) ii += 1 if (sofa_.clear_dust): self.__clearDust() if (sofa_.do_words_merging_by_morph): self.__correctWordsByMerging(lang) if (sofa_.do_word_correction_by_morph): self.__correctWordsByMorph(lang) self.__mergeLetters() self.__defineBaseLanguage() t = self.first_token first_pass2794 = True while True: if first_pass2794: first_pass2794 = False else: t = t.next0_ if (not (t is not None)): break nt = NumberHelper._tryParseNumber(t) if (nt is None): continue self.embedToken(nt) t = (nt) if (only_tokenizing): return t = self.first_token first_pass2795 = True while True: if first_pass2795: first_pass2795 = False else: t = t.next0_ if (not (t is not None)): break if (t.morph.class0_.is_preposition): continue mc = t.getMorphClassInDictionary() if (mc.is_undefined and t.chars.is_cyrillic_letter and t.length_char > 4): tail = sofa_.text[t.end_char - 1:t.end_char - 1+2] tte = None tt = t.previous if (tt is not None and ((tt.is_comma_and or tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction))): tt = tt.previous if ((tt is not None and not tt.getMorphClassInDictionary().is_undefined and (((tt.morph.class0_.value) & (t.morph.class0_.value))) != 0) and tt.length_char > 4): tail2 = sofa_.text[tt.end_char - 1:tt.end_char - 1+2] if (tail2 == tail): tte = tt if (tte is None): tt = t.next0_ if (tt is not None and ((tt.is_comma_and or tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction))): tt = tt.next0_ if ((tt is not None and not tt.getMorphClassInDictionary().is_undefined and (((tt.morph.class0_.value) & (t.morph.class0_.value))) != 0) and tt.length_char > 4): tail2 = sofa_.text[tt.end_char - 1:tt.end_char - 1+2] if (tail2 == tail): tte = tt if (tte is not None): t.morph.removeItemsEx(tte.morph, tte.getMorphClassInDictionary()) continue self.__createStatistics()
def getNormalCaseText(self, mc: 'MorphClass' = None, single_number: bool = False, gender: 'MorphGender' = MorphGender.UNDEFINED, keep_chars: bool = False) -> str: if ((isinstance(self.begin_token, ReferentToken)) and self.begin_token == self.end_token): return self.begin_token.getNormalCaseText(mc, single_number, gender, keep_chars) res = None max_coef = 0 def_coef = -1 for it in self.morph.items: v = Utils.asObjectOrNull(it, NounPhraseItemTextVar) if (v.undef_coef > 0 and (((v.undef_coef < max_coef) or def_coef >= 0))): continue if (single_number and v.single_number_value is not None): if (mc is not None and ((gender == MorphGender.NEUTER or gender == MorphGender.FEMINIE)) and mc.is_adjective): bi = MorphBaseInfo._new467(MorphClass(mc), gender, MorphNumber.SINGULAR, MorphCase.NOMINATIVE, self.morph.language) str0_ = Morphology.getWordform(v.single_number_value, bi) if (str0_ is not None): res = str0_ else: res = v.single_number_value if (v.undef_coef == 0): break max_coef = v.undef_coef continue if (Utils.isNullOrEmpty(v.normal_value)): continue if (str.isdigit(v.normal_value[0]) and mc is not None and mc.is_adjective): wrapval468 = RefOutArgWrapper(0) inoutres469 = Utils.tryParseInt(v.normal_value, wrapval468) val = wrapval468.value if (inoutres469): str0_ = NumberHelper.getNumberAdjective( val, gender, (MorphNumber.SINGULAR if single_number or val == 1 else MorphNumber.PLURAL)) if (str0_ is not None): res = str0_ if (v.undef_coef == 0): break max_coef = v.undef_coef continue res1 = (it).normal_value if (single_number): if (res1 == "ДЕТИ"): res1 = "РЕБЕНОК" elif (res1 == "ЛЮДИ"): res1 = "ЧЕЛОВЕК" max_coef = v.undef_coef if (v.undef_coef > 0): res = res1 continue def_co = 0 if (mc is not None and mc.is_adjective and v.undef_coef == 0): pass elif (((isinstance(self.begin_token, TextToken)) and res1 == (self.begin_token).term and it.case_.is_nominative) and it.number == MorphNumber.SINGULAR): def_co = 1 if (res is None or def_co > def_coef): res = res1 def_coef = def_co if (def_co > 0): break if (res is not None): return self.__corrChars(res, keep_chars) if (res is None and self.begin_token == self.end_token): res = self.begin_token.getNormalCaseText(mc, single_number, gender, keep_chars) return Utils.ifNotNull(res, "?")
def _tryParseStreet(sli : typing.List['StreetItemToken'], ext_onto_regim : bool=False, for_metro : bool=False) -> 'AddressItemToken': if (sli is None or len(sli) == 0): return None i = 0 while i < len(sli): if (i == 0 and sli[i].typ == StreetItemType.FIX and ((len(sli) == 1 or sli[1].typ != StreetItemType.NOUN))): return StreetDefineHelper.__tryParseFix(sli) elif (sli[i].typ == StreetItemType.NOUN): if ((i == 0 and sli[i].termin.canonic_text == "УЛИЦА" and ((i + 2) < len(sli))) and sli[i + 1].typ == StreetItemType.NOUN and sli[i + 1].termin.canonic_text == "МИКРОРАЙОН"): sli[i + 1].begin_token = sli[i].begin_token del sli[i] if (sli[i].termin.canonic_text == "МЕТРО"): if ((i + 1) < len(sli)): sli1 = list() ii = i + 1 while ii < len(sli): sli1.append(sli[ii]) ii += 1 str1 = StreetDefineHelper._tryParseStreet(sli1, ext_onto_regim, True) if (str1 is not None): str1.begin_token = sli[i].begin_token str1.is_doubt = sli[i].is_abridge if (sli[i + 1].is_in_brackets): str1.is_doubt = False return str1 elif (i == 1 and sli[0].typ == StreetItemType.NAME): for_metro = True break if (i == 0 and len(sli) > 0): for_metro = True break return None if (i == 0 and (i + 1) >= len(sli) and ((sli[i].termin.canonic_text == "ВОЕННЫЙ ГОРОДОК" or sli[i].termin.canonic_text == "ПРОМЗОНА"))): stri0 = StreetReferent() stri0.addSlot(StreetReferent.ATTR_TYP, "микрорайон", False, 0) stri0.addSlot(StreetReferent.ATTR_NAME, sli[i].termin.canonic_text, False, 0) return AddressItemToken._new85(AddressItemToken.ItemType.STREET, sli[0].begin_token, sli[0].end_token, stri0, True) if (i == 0 and (i + 1) >= len(sli) and sli[i].termin.canonic_text == "МИКРОРАЙОН"): stri0 = StreetReferent() stri0.addSlot(StreetReferent.ATTR_TYP, sli[i].termin.canonic_text.lower(), False, 0) return AddressItemToken._new85(AddressItemToken.ItemType.STREET, sli[0].begin_token, sli[0].end_token, stri0, True) if (sli[i].termin.canonic_text == "ПЛОЩАДЬ" or sli[i].termin.canonic_text == "ПЛОЩА"): tt = sli[i].end_token.next0_ if (tt is not None and ((tt.is_hiphen or tt.isChar(':')))): tt = tt.next0_ nex = NumberHelper.tryParseNumberWithPostfix(tt) if (nex is not None): return None break i += 1 if (i >= len(sli)): return StreetDefineHelper.__tryDetectNonNoun(sli, ext_onto_regim, for_metro) name = None number = None age = None adj = None noun = sli[i] alt_noun = None is_micro_raion = (noun.termin.canonic_text == "МИКРОРАЙОН" or noun.termin.canonic_text == "МІКРОРАЙОН" or noun.termin.canonic_text == "КВАРТАЛ") or LanguageHelper.endsWith(noun.termin.canonic_text, "ГОРОДОК") before = 0 after = 0 j = 0 while j < i: if ((sli[j].typ == StreetItemType.NAME or sli[j].typ == StreetItemType.STDNAME or sli[j].typ == StreetItemType.FIX) or sli[j].typ == StreetItemType.STDADJECTIVE or sli[j].typ == StreetItemType.STDPARTOFNAME): before += 1 elif (sli[j].typ == StreetItemType.NUMBER): if (sli[j].is_newline_after): return None if (sli[j].number.morph.class0_.is_adjective): before += 1 elif (is_micro_raion): before += 1 elif (sli[i].number_has_prefix): before += 1 else: before += 1 j += 1 j = (i + 1) while j < len(sli): if ((sli[j].typ == StreetItemType.NAME or sli[j].typ == StreetItemType.STDNAME or sli[j].typ == StreetItemType.FIX) or sli[j].typ == StreetItemType.STDADJECTIVE or sli[j].typ == StreetItemType.STDPARTOFNAME): after += 1 elif (sli[j].typ == StreetItemType.NUMBER): if (sli[j].number is not None and sli[j].number.morph.class0_.is_adjective): after += 1 elif (is_micro_raion): after += 1 elif (sli[j].number_has_prefix): after += 1 elif (ext_onto_regim): after += 1 elif (sli[j].typ == StreetItemType.NOUN): break else: after += 1 j += 1 rli = list() if (before > after): if (noun.termin.canonic_text == "МЕТРО"): return None tt = sli[0].begin_token if (tt == sli[0].end_token and noun.begin_token == sli[0].end_token.next0_): if (not tt.morph.class0_.is_adjective and not ((isinstance(tt, NumberToken)))): if ((sli[0].is_newline_before or not MiscLocationHelper.checkGeoObjectBefore(sli[0].begin_token) or noun.morph.case_.is_genitive) or noun.morph.case_.is_instrumental): ok = False if (AddressItemToken.checkHouseAfter(noun.end_token.next0_, False, True)): ok = True elif (noun.end_token.next0_ is None): ok = True elif (noun.is_newline_after and MiscLocationHelper.checkGeoObjectBefore(sli[0].begin_token)): ok = True if (not ok): if ((noun.chars.is_latin_letter and noun.chars.is_capital_upper and sli[0].chars.is_latin_letter) and sli[0].chars.is_capital_upper): ok = True if (not ok): return None n0 = 0 n1 = (i - 1) elif (i == 1 and sli[0].typ == StreetItemType.NUMBER): if (not sli[0].is_whitespace_after): return None number = (sli[0].value if sli[0].number is None else str(sli[0].number.int_value)) if (sli[0].is_number_km): number += "км" n0 = (i + 1) n1 = (len(sli) - 1) rli.append(sli[0]) rli.append(sli[i]) elif (after > before): n0 = (i + 1) n1 = (len(sli) - 1) rli.append(sli[i]) elif (after == 0): return None elif ((len(sli) > 2 and ((sli[0].typ == StreetItemType.NAME or sli[0].typ == StreetItemType.STDADJECTIVE or sli[0].typ == StreetItemType.STDNAME)) and sli[1].typ == StreetItemType.NOUN) and sli[2].typ == StreetItemType.NUMBER): n0 = 0 n1 = 0 num = False tt2 = sli[2].end_token.next0_ if (sli[2].is_number_km): num = True elif (sli[0].begin_token.previous is not None and sli[0].begin_token.previous.isValue("КИЛОМЕТР", None)): sli[2].is_number_km = True num = True elif (sli[2].begin_token.previous.is_comma): pass elif (sli[2].begin_token != sli[2].end_token): num = True elif (AddressItemToken.checkHouseAfter(sli[2].end_token.next0_, False, True)): num = True elif (sli[2].morph.class0_.is_adjective and (sli[2].whitespaces_before_count < 2)): if (sli[2].end_token.next0_ is None or sli[2].end_token.is_comma or sli[2].is_newline_after): num = True if (num): number = (sli[2].value if sli[2].number is None else str(sli[2].number.int_value)) if (sli[2].is_number_km): number += "км" rli.append(sli[2]) else: del sli[2:2+len(sli) - 2] else: return None sec_number = None j = n0 first_pass2732 = True while True: if first_pass2732: first_pass2732 = False else: j += 1 if (not (j <= n1)): break if (sli[j].typ == StreetItemType.NUMBER): if (age is not None or ((sli[j].is_newline_before and j > 0))): break if (number is not None): if (name is not None and name.typ == StreetItemType.STDNAME): sec_number = (sli[j].value if sli[j].number is None else str(sli[j].number.int_value)) if (sli[j].is_number_km): sec_number += "км" rli.append(sli[j]) continue if (((j + 1) < len(sli)) and sli[j + 1].typ == StreetItemType.STDNAME): sec_number = (sli[j].value if sli[j].number is None else str(sli[j].number.int_value)) if (sli[j].is_number_km): sec_number += "км" rli.append(sli[j]) continue break if (sli[j].number is not None and sli[j].number.typ == NumberSpellingType.DIGIT and not sli[j].number.morph.class0_.is_adjective): if (sli[j].whitespaces_before_count > 2 and j > 0): break if (sli[j].number is not None and sli[j].number.int_value > 20): if (j > n0): if (((j + 1) < len(sli)) and sli[j + 1].typ == StreetItemType.NOUN): pass else: break if (j == n0 and n0 > 0): pass elif (j == n0 and n0 == 0 and sli[j].whitespaces_after_count == 1): pass elif (sli[j].number_has_prefix): pass elif (j == n1 and ((n1 + 1) < len(sli)) and sli[n1 + 1].typ == StreetItemType.NOUN): pass else: break number = (sli[j].value if sli[j].number is None else str(sli[j].number.int_value)) if (sli[j].is_number_km): number += "км" rli.append(sli[j]) elif (sli[j].typ == StreetItemType.AGE): if (number is not None or age is not None): break age = str(sli[j].number.int_value) rli.append(sli[j]) elif (sli[j].typ == StreetItemType.STDADJECTIVE): if (adj is not None): return None adj = sli[j] rli.append(sli[j]) elif (sli[j].typ == StreetItemType.NAME or sli[j].typ == StreetItemType.STDNAME or sli[j].typ == StreetItemType.FIX): if (name is not None): if (j > 1 and sli[j - 2].typ == StreetItemType.NOUN): break elif (i < j): break else: return None name = sli[j] rli.append(sli[j]) elif (sli[j].typ == StreetItemType.STDPARTOFNAME and j == n1): if (name is not None): break name = sli[j] rli.append(sli[j]) elif (sli[j].typ == StreetItemType.NOUN): if ((sli[0] == noun and ((noun.termin.canonic_text == "УЛИЦА" or noun.termin.canonic_text == "ВУЛИЦЯ")) and j > 0) and name is None): alt_noun = noun noun = sli[j] rli.append(sli[j]) else: break if (((n1 < i) and number is None and ((i + 1) < len(sli))) and sli[i + 1].typ == StreetItemType.NUMBER and sli[i + 1].number_has_prefix): number = (sli[i + 1].value if sli[i + 1].number is None else str(sli[i + 1].number.int_value)) rli.append(sli[i + 1]) elif ((((i < n0) and ((name is not None or adj is not None)) and (j < len(sli))) and sli[j].typ == StreetItemType.NOUN and ((noun.termin.canonic_text == "УЛИЦА" or noun.termin.canonic_text == "ВУЛИЦЯ"))) and (((sli[j].termin.canonic_text == "ПЛОЩАДЬ" or sli[j].termin.canonic_text == "БУЛЬВАР" or sli[j].termin.canonic_text == "ПЛОЩА") or sli[j].termin.canonic_text == "МАЙДАН" or (j + 1) == len(sli)))): alt_noun = noun noun = sli[j] rli.append(sli[j]) if (name is None): if (number is None and adj is None): return None if (noun.is_abridge): if (is_micro_raion): pass elif (noun.termin is not None and ((noun.termin.canonic_text == "ПРОЕЗД" or noun.termin.canonic_text == "ПРОЇЗД"))): pass elif (adj is None or adj.is_abridge): return None if (adj is not None and adj.is_abridge): return None if (not sli[i] in rli): rli.append(sli[i]) street = StreetReferent() if (not for_metro): street.addSlot(StreetReferent.ATTR_TYP, noun.termin.canonic_text.lower(), False, 0) if (noun.alt_termin is not None): if (noun.alt_termin.canonic_text == "ПРОСПЕКТ" and number is not None): pass else: street.addSlot(StreetReferent.ATTR_TYP, noun.alt_termin.canonic_text.lower(), False, 0) else: street.addSlot(StreetReferent.ATTR_TYP, "метро", False, 0) res = AddressItemToken._new82(AddressItemToken.ItemType.STREET, rli[0].begin_token, rli[0].end_token, street) for r in rli: if (res.begin_char > r.begin_char): res.begin_token = r.begin_token if (res.end_char < r.end_char): res.end_token = r.end_token if (for_metro and noun in rli and noun.termin.canonic_text == "МЕТРО"): rli.remove(noun) if (noun.is_abridge and (noun.length_char < 4)): res.is_doubt = True elif (noun.noun_is_doubt_coef > 0): res.is_doubt = True if ((name is not None and name.end_char > noun.end_char and noun.chars.is_all_lower) and not name.chars.is_all_lower and not ((isinstance(name.begin_token, ReferentToken)))): npt2 = NounPhraseHelper.tryParse(name.begin_token, NounPhraseParseAttr.NO, 0) if (npt2 is not None and npt2.end_char > name.end_char): pass elif (AddressItemToken.checkHouseAfter(res.end_token.next0_, False, False)): res.is_doubt = False elif (name.chars.is_capital_upper and noun.noun_is_doubt_coef == 1): res.is_doubt = False name_base = io.StringIO() name_alt = io.StringIO() name_alt2 = None gen = noun.termin.gender adj_gen = MorphGender.UNDEFINED if (number is not None): street.number = number if (sec_number is not None): street.sec_number = sec_number if (age is not None): if (street.number is None): street.number = age else: street.sec_number = age if (name is not None and name.value is not None): if (street.kind == StreetKind.ROAD): for r in rli: if (r.typ == StreetItemType.NAME and r != name): print(r.value, end="", file=name_alt) break if (name.alt_value is not None and name_alt.tell() == 0): print("{0} {1}".format(Utils.toStringStringIO(name_base), name.alt_value), end="", file=name_alt, flush=True) print(" {0}".format(name.value), end="", file=name_base, flush=True) elif (name is not None): is_adj = False if (isinstance(name.end_token, TextToken)): for wf in name.end_token.morph.items: if ((isinstance(wf, MorphWordForm)) and (wf).is_in_dictionary): is_adj = (wf.class0_.is_adjective | wf.class0_.is_proper_geo) adj_gen = wf.gender break elif (wf.class0_.is_adjective | wf.class0_.is_proper_geo): is_adj = True if (is_adj): tmp = io.StringIO() vars0_ = list() t = name.begin_token while t is not None: tt = Utils.asObjectOrNull(t, TextToken) if (tt is None): break if (tmp.tell() > 0): print(' ', end="", file=tmp) if (t == name.end_token): is_padez = False if (not noun.is_abridge): if (not noun.morph.case_.is_undefined and not noun.morph.case_.is_nominative): is_padez = True elif (noun.termin.canonic_text == "ШОССЕ" or noun.termin.canonic_text == "ШОСЕ"): is_padez = True if (res.begin_token.previous is not None and res.begin_token.previous.morph.class0_.is_preposition): is_padez = True if (not is_padez): print(tt.term, end="", file=tmp) break for wf in tt.morph.items: if (((wf.class0_.is_adjective or wf.class0_.is_proper_geo)) and (((wf.gender) & (gen))) != (MorphGender.UNDEFINED)): if (noun.morph.case_.is_undefined or not ((wf.case_) & noun.morph.case_).is_undefined): wff = Utils.asObjectOrNull(wf, MorphWordForm) if (wff is None): continue if (gen == MorphGender.MASCULINE and "ОЙ" in wff.normal_case): continue if (not wff.normal_case in vars0_): vars0_.append(wff.normal_case) if (not tt.term in vars0_ and Utils.indexOfList(sli, name, 0) > Utils.indexOfList(sli, noun, 0)): vars0_.append(tt.term) if (len(vars0_) == 0): vars0_.append(tt.term) break if (not tt.is_hiphen): print(tt.term, end="", file=tmp) t = t.next0_ if (len(vars0_) == 0): print(" {0}".format(Utils.toStringStringIO(tmp)), end="", file=name_base, flush=True) else: head = Utils.toStringStringIO(name_base) print(" {0}{1}".format(Utils.toStringStringIO(tmp), vars0_[0]), end="", file=name_base, flush=True) if (len(vars0_) > 1): Utils.setLengthStringIO(name_alt, 0) print("{0} {1}{2}".format(head, Utils.toStringStringIO(tmp), vars0_[1]), end="", file=name_alt, flush=True) if (len(vars0_) > 2): name_alt2 = "{0} {1}{2}".format(head, Utils.toStringStringIO(tmp), vars0_[2]) else: str_nam = None nits = list() has_adj = False has_proper_name = False t = name.begin_token while t is not None: if (t.morph.class0_.is_adjective or t.morph.class0_.is_conjunction): has_adj = True if ((isinstance(t, TextToken)) and not t.is_hiphen): if (name.termin is not None): nits.append(name.termin.canonic_text) break elif (not t.chars.is_letter and len(nits) > 0): nits[len(nits) - 1] += (t).term else: nits.append((t).term) if (t == name.begin_token and t.getMorphClassInDictionary().is_proper_name): has_proper_name = True elif ((isinstance(t, ReferentToken)) and name.termin is None): nits.append(t.getSourceText().upper()) if (t == name.end_token): break t = t.next0_ if (not has_adj and not has_proper_name): nits.sort() str_nam = Utils.joinStrings(" ", list(nits)) if (has_proper_name and len(nits) == 2): Utils.setLengthStringIO(name_alt, 0) print("{0} {1}".format(Utils.toStringStringIO(name_base), nits[1]), end="", file=name_alt, flush=True) print(" {0}".format(str_nam), end="", file=name_base, flush=True) adj_str = None adj_can_be_initial = False if (adj is not None): if (adj_gen == MorphGender.UNDEFINED and name is not None and (((name.morph.number) & (MorphNumber.PLURAL))) == (MorphNumber.UNDEFINED)): if (name.morph.gender == MorphGender.FEMINIE or name.morph.gender == MorphGender.MASCULINE or name.morph.gender == MorphGender.NEUTER): adj_gen = name.morph.gender if (name is not None and (((name.morph.number) & (MorphNumber.PLURAL))) != (MorphNumber.UNDEFINED)): s = Morphology.getWordform(adj.termin.canonic_text, MorphBaseInfo._new209(MorphClass.ADJECTIVE, MorphNumber.PLURAL)) elif (adj_gen != MorphGender.UNDEFINED): s = Morphology.getWordform(adj.termin.canonic_text, MorphBaseInfo._new210(MorphClass.ADJECTIVE, adj_gen)) elif ((((adj.morph.gender) & (gen))) == (MorphGender.UNDEFINED)): s = Morphology.getWordform(adj.termin.canonic_text, MorphBaseInfo._new210(MorphClass.ADJECTIVE, adj.morph.gender)) else: s = Morphology.getWordform(adj.termin.canonic_text, MorphBaseInfo._new210(MorphClass.ADJECTIVE, gen)) adj_str = s if (name is not None and (Utils.indexOfList(sli, adj, 0) < Utils.indexOfList(sli, name, 0))): if (adj.end_token.isChar('.') and adj.length_char <= 3 and not adj.begin_token.chars.is_all_lower): adj_can_be_initial = True s1 = Utils.toStringStringIO(name_base).strip() s2 = Utils.toStringStringIO(name_alt).strip() if (len(s1) < 3): if (street.number is not None): if (adj_str is not None): if (adj.is_abridge): return None street.addSlot(StreetReferent.ATTR_NAME, adj_str, False, 0) elif (adj_str is None): if (len(s1) < 1): return None if (is_micro_raion): street.addSlot(StreetReferent.ATTR_NAME, s1, False, 0) if (not Utils.isNullOrEmpty(s2)): street.addSlot(StreetReferent.ATTR_NAME, s2, False, 0) else: return None else: if (adj.is_abridge): return None street.addSlot(StreetReferent.ATTR_NAME, adj_str, False, 0) elif (adj_can_be_initial): street.addSlot(StreetReferent.ATTR_NAME, s1, False, 0) street.addSlot(StreetReferent.ATTR_NAME, MiscHelper.getTextValue(adj.begin_token, name.end_token, GetTextAttr.NO), False, 0) street.addSlot(StreetReferent.ATTR_NAME, "{0} {1}".format(adj_str, s1), False, 0) elif (adj_str is None): street.addSlot(StreetReferent.ATTR_NAME, s1, False, 0) else: street.addSlot(StreetReferent.ATTR_NAME, "{0} {1}".format(adj_str, s1), False, 0) if (name_alt.tell() > 0): s1 = Utils.toStringStringIO(name_alt).strip() if (adj_str is None): street.addSlot(StreetReferent.ATTR_NAME, s1, False, 0) else: street.addSlot(StreetReferent.ATTR_NAME, "{0} {1}".format(adj_str, s1), False, 0) if (name_alt2 is not None): if (adj_str is None): if (for_metro and noun is not None): street.addSlot(StreetReferent.ATTR_NAME, "{0} {1}".format(alt_noun.termin.canonic_text, name_alt2.strip()), False, 0) else: street.addSlot(StreetReferent.ATTR_NAME, name_alt2.strip(), False, 0) else: street.addSlot(StreetReferent.ATTR_NAME, "{0} {1}".format(adj_str, name_alt2.strip()), False, 0) if (name is not None and name.alt_value2 is not None): street.addSlot(StreetReferent.ATTR_NAME, name.alt_value2, False, 0) if ((name is not None and adj is None and name.exist_street is not None) and not for_metro): for n in name.exist_street.names: street.addSlot(StreetReferent.ATTR_NAME, n, False, 0) if (alt_noun is not None and not for_metro): street.addSlot(StreetReferent.ATTR_TYP, alt_noun.termin.canonic_text.lower(), False, 0) if (noun.termin.canonic_text == "ПЛОЩАДЬ" or noun.termin.canonic_text == "КВАРТАЛ" or noun.termin.canonic_text == "ПЛОЩА"): res.is_doubt = True if (name is not None and name.is_in_dictionary): res.is_doubt = False elif (alt_noun is not None or for_metro): res.is_doubt = False elif (res.begin_token.previous is None or MiscLocationHelper.checkGeoObjectBefore(res.begin_token.previous)): if (res.end_token.next0_ is None or AddressItemToken.checkHouseAfter(res.end_token.next0_, False, True)): res.is_doubt = False if (LanguageHelper.endsWith(noun.termin.canonic_text, "ГОРОДОК")): for s in street.slots: if (s.type_name == StreetReferent.ATTR_TYP): street.uploadSlot(s, "микрорайон") elif (s.type_name == StreetReferent.ATTR_NAME): street.uploadSlot(s, "{0} {1}".format(noun.termin.canonic_text, s.value)) if (street.findSlot(StreetReferent.ATTR_NAME, None, True) is None): street.addSlot(StreetReferent.ATTR_NAME, noun.termin.canonic_text, False, 0) t1 = res.end_token.next0_ if (t1 is not None and t1.is_comma): t1 = t1.next0_ non = StreetItemToken.tryParse(t1, None, False, None, False) if (non is not None and non.typ == StreetItemType.NOUN and len(street.typs) > 0): if (AddressItemToken.checkHouseAfter(non.end_token.next0_, False, True)): street._correct() nams = street.names for t in street.typs: for n in nams: street.addSlot(StreetReferent.ATTR_NAME, "{0} {1}".format(t.upper(), n), False, 0) street.addSlot(StreetReferent.ATTR_TYP, non.termin.canonic_text.lower(), False, 0) res.end_token = non.end_token if (res.is_doubt): if (noun.is_road): if (street.number is not None and Utils.endsWithString(street.number, "КМ", True)): res.is_doubt = False elif (AddressItemToken.checkKmAfter(res.end_token.next0_)): res.is_doubt = False elif (AddressItemToken.checkKmBefore(res.begin_token.previous)): res.is_doubt = False elif (noun.termin.canonic_text == "ПРОЕЗД" and street.findSlot(StreetReferent.ATTR_NAME, "ПРОЕКТИРУЕМЫЙ", True) is not None): res.is_doubt = False tt0 = res.begin_token.previous first_pass2733 = True while True: if first_pass2733: first_pass2733 = False else: tt0 = tt0.previous if (not (tt0 is not None)): break if (tt0.isCharOf(",,") or tt0.is_comma_and): continue str0 = Utils.asObjectOrNull(tt0.getReferent(), StreetReferent) if (str0 is not None): res.is_doubt = False break if (noun.termin.canonic_text == "КВАРТАЛ" and (res.whitespaces_after_count < 2) and number is None): ait = AddressItemToken.tryParse(res.end_token.next0_, None, False, True, None) if (ait is not None and ait.typ == AddressItemToken.ItemType.NUMBER and ait.value is not None): street.addSlot(StreetReferent.ATTR_NUMBER, ait.value, False, 0) res.end_token = ait.end_token return res
def getNormalCaseText(self, mc: 'MorphClass' = None, single_number: bool = False, gender: 'MorphGender' = MorphGender.UNDEFINED, keep_chars: bool = False) -> str: from pullenti.ner.core.MiscHelper import MiscHelper empty = True if (mc is not None and mc.is_preposition): return LanguageHelper.normalizePreposition(self.term) for it in self.morph.items: if (mc is not None and not mc.is_undefined): cc = (it.class0_.value) & (mc.value) if (cc == 0): continue if (MorphClass.isMiscInt(cc) and not MorphClass.isProperInt(cc) and mc.value != it.class0_.value): continue wf = Utils.asObjectOrNull(it, MorphWordForm) normal_full = False if (gender != MorphGender.UNDEFINED): if ((((it.gender) & (gender))) == (MorphGender.UNDEFINED)): if ((gender == MorphGender.MASCULINE and ((it.gender != MorphGender.UNDEFINED or it.number == MorphNumber.PLURAL)) and wf is not None) and wf.normal_full is not None): normal_full = True elif (gender == MorphGender.MASCULINE and it.class0_.is_personal_pronoun): pass else: continue if (not it.case_.is_undefined): empty = False if (wf is not None): if (single_number and it.number == MorphNumber.PLURAL and wf.normal_full is not None): le = len(wf.normal_case) if ((le == (len(wf.normal_full) + 2) and le > 4 and wf.normal_case[le - 2] == 'С') and wf.normal_case[le - 1] == 'Я'): res = wf.normal_case else: res = (wf.normal_full if normal_full else wf.normal_full) else: res = (wf.normal_full if normal_full else (Utils.ifNotNull(wf.normal_case, self.term))) if (single_number and mc is not None and mc == MorphClass.NOUN): if (res == "ДЕТИ"): res = "РЕБЕНОК" if (keep_chars): if (self.chars.is_all_lower): res = res.lower() elif (self.chars.is_capital_upper): res = MiscHelper.convertFirstCharUpperAndOtherLower( res) return res if (not empty): return None te = None if (single_number and mc is not None): bi = MorphBaseInfo._new549(MorphClass(mc), gender, MorphNumber.SINGULAR, self.morph.language) vars0_ = Morphology.getWordform(self.term, bi) if (vars0_ is not None): te = vars0_ if (self.chars.is_cyrillic_letter and te is None and len(self.term) > 3): ch0 = self.term[len(self.term) - 1] ch1 = self.term[len(self.term) - 2] if (ch0 == 'М' and ((ch1 == 'О' or ch1 == 'А'))): te = self.term[0:0 + len(self.term) - 2] elif (not LanguageHelper.isCyrillicVowel(ch1) and LanguageHelper.isCyrillicVowel(ch0)): te = self.term[0:0 + len(self.term) - 1] if (te is None): te = self.term if (keep_chars): if (self.chars.is_all_lower): return te.lower() elif (self.chars.is_capital_upper): return MiscHelper.convertFirstCharUpperAndOtherLower(te) return te