def morph(self) -> 'MorphCollection':
    """Morphological information of this token (lazily instantiated)."""
    # Create the collection on first access so a token never exposes None here.
    if self.__m_morph is None:
        self.__m_morph = MorphCollection()
    return self.__m_morph
def _serialize(self, stream: io.IOBase) -> None:
    """Write this token's state to *stream*.

    Field order (must match _deserialize): begin_char, end_char,
    attribute bitfield, chars value, then the morphology collection.
    """
    from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
    write_int = SerializerHelper.serializeInt
    write_int(stream, self.begin_char)
    write_int(stream, self.end_char)
    write_int(stream, self.__m_attrs)
    write_int(stream, self.chars.value)
    # Make sure a morphology collection exists before delegating to it.
    if self.__m_morph is None:
        self.__m_morph = MorphCollection()
    self.__m_morph._serialize(stream)
def __init__(self, source: 'MorphToken', kit_: 'AnalysisKit', bchar: int = -1, echar: int = -1) -> None:
    """Build a token from a MorphToken produced by the morphology engine.

    Copies the term, lemma and char info from *source*, gathers its word
    forms into ``self.morph`` and computes:

    * ``max_length_of_morph_vars`` - the longest of the term and all
      normal forms (``normal_case`` / ``normal_full``);
    * ``invariant_prefix_length_of_morph_vars`` - length of the prefix
      shared by the term and every word form's normal forms.

    :param source: morphology token to copy from (may be None - then only
        the defaults are initialized)
    :param kit_: owning analysis kit, forwarded to the base constructor
    :param bchar: span start override; values < 0 mean "take from source"
    :param echar: span end override; values < 0 mean "take from source"
    """
    super().__init__(kit_, (bchar if bchar >= 0 else (0 if source is None else source.begin_char)), (echar if echar >= 0 else (0 if source is None else source.end_char)))
    self.term = None
    self.lemma = None
    self.term0 = None
    self.invariant_prefix_length_of_morph_vars = 0
    self.max_length_of_morph_vars = 0
    if source is None:
        return
    self.chars = source.char_info
    self.term = source.term
    self.lemma = Utils.ifNotNull(source.get_lemma(), self.term)
    self.max_length_of_morph_vars = len(self.term)
    self.morph = MorphCollection()
    if source.word_forms is not None:
        for wf in source.word_forms:
            self.morph.add_item(wf)
            # Track the longest normal form seen so far.
            if wf.normal_case is not None and self.max_length_of_morph_vars < len(wf.normal_case):
                self.max_length_of_morph_vars = len(wf.normal_case)
            if wf.normal_full is not None and self.max_length_of_morph_vars < len(wf.normal_full):
                self.max_length_of_morph_vars = len(wf.normal_full)
    i = 0
    while i < len(self.term):
        ch = self.term[i]
        # Fix: the original generated code initialized j twice in a row
        # ("j = 0 j = 0") - the dead store is removed here.
        j = 0
        while j < self.morph.items_count:
            wf = Utils.asObjectOrNull(self.morph.get_indexer_item(j), MorphWordForm)
            if wf.normal_case is not None:
                if i >= len(wf.normal_case):
                    break
                if wf.normal_case[i] != ch:
                    break
            if wf.normal_full is not None:
                if i >= len(wf.normal_full):
                    break
                if wf.normal_full[i] != ch:
                    break
            j += 1
        if j < self.morph.items_count:
            # Some variant diverges at position i - the invariant prefix ends here.
            break
        self.invariant_prefix_length_of_morph_vars = i + 1
        i += 1
    # Inherit the source language when this token's morph language is undefined.
    if self.morph.language.is_undefined and not source.language.is_undefined:
        self.morph.language = source.language
def _deserialize(self, stream: io.IOBase, kit_: 'AnalysisKit', vers: int) -> None:
    """Restore this token's state from *stream* (mirror of _serialize).

    Field order: begin_char, end_char, attribute bitfield, chars value,
    then the morphology collection. *vers* is accepted for signature
    compatibility but not used in this method.
    """
    from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
    read_int = SerializerHelper.deserializeInt
    self.kit = kit_
    self.begin_char = read_int(stream)
    self.end_char = read_int(stream)
    self.__m_attrs = read_int(stream)
    self.chars = CharsInfo._new2656(read_int(stream))
    restored = MorphCollection()
    restored._deserialize(stream)
    self.__m_morph = restored
def __tryParseRu(t: 'Token') -> 'VerbPhraseToken':
    """Try to parse a Russian verb phrase starting at *t*.

    Walks consecutive TextTokens, collecting verbs (ty == 1) and adverbs
    (ty == 2) as VerbPhraseItemTokens. A preceding "НЕ" is folded into the
    next item as its negation. Returns None unless at least one true verb
    was collected; the result's morphology is taken from the first verb item.
    """
    res = None
    t0 = t
    not0_ = None
    has_verb = False
    # Generated loop shape: first_pass emulates C# "for(;;t=t.next0_)".
    first_pass2814 = True
    while True:
        if first_pass2814:
            first_pass2814 = False
        else:
            t = t.next0_
        if (not (t is not None)):
            break
        if (not ((isinstance(t, TextToken)))):
            break
        tt = Utils.asObjectOrNull(t, TextToken)
        if (tt.term == "НЕ"):
            # remember the negation; it attaches to the next item
            not0_ = t
            continue
        # ty: 0 = stop, 1 = verb, 2 = adverb
        ty = 0
        mc = tt.getMorphClassInDictionary()
        if (tt.term == "НЕТ"):
            ty = 1
        elif (mc.is_adverb):
            ty = 2
        elif (tt.is_pure_verb or tt.is_verb_be):
            ty = 1
        elif (mc.is_verb):
            if (mc.is_preposition or mc.is_misc):
                pass
            elif (mc.is_noun):
                # verb/noun homonyms: accept "СТАЛИ" and capitalized mid-sentence forms
                if (tt.term == "СТАЛИ"):
                    ty = 1
                elif (not tt.chars.is_all_lower and not MiscHelper.canBeStartOfSentence(tt)):
                    ty = 1
            elif (mc.is_proper):
                if (tt.chars.is_all_lower):
                    ty = 1
            else:
                ty = 1
        if (ty == 0):
            break
        if (res is None):
            res = VerbPhraseToken(t0, t)
        res.end_token = t
        it = VerbPhraseItemToken._new638(t, t, MorphCollection(t.morph))
        if (not0_ is not None):
            # attach the pending "НЕ" to this item
            it.begin_token = not0_
            it.not0_ = True
            not0_ = (None)
        it.is_adverb = ty == 2
        it.normal = t.getNormalCaseText((MorphClass.ADVERB if ty == 2 else MorphClass.VERB), False, MorphGender.UNDEFINED, False)
        res.items.append(it)
        if (not has_verb and ty == 1):
            # first true verb defines the phrase morphology
            res.morph = it.morph
            has_verb = True
    if (not has_verb):
        return None
    return res
def __init__(self, entity: 'Referent', begin: 'Token', end: 'Token', kit_: 'AnalysisKit' = None) -> None:
    """Token that binds a recognized entity (Referent) to the span [begin..end].

    :param entity: the referent represented by this span (stored in self.referent)
    :param begin: first token of the span
    :param end: last token of the span
    :param kit_: owning analysis kit (optional; forwarded to the base class)
    """
    super().__init__(begin, end, kit_)
    # Fix: assign the entity directly - the original first set referent to
    # None and immediately overwrote it (dead store).
    self.referent = entity
    self.data = None
    self.misc_attrs = 0
    # The base constructor may leave morph unset; guarantee a collection.
    if self.morph is None:
        self.morph = MorphCollection()
def __try_parse_ru(first: 'Token', typ: 'NounPhraseParseAttr', max_char_pos: int, def_noun: 'NounPhraseItem' = None) -> 'NounPhraseToken':
    """Core Russian noun-phrase parser (internal).

    Scans tokens from *first*, collecting adjective/noun candidates via
    NounPhraseItem.try_parse while handling prepositions, the
    "КАК ... ТАК И ..." construction, commas/conjunctions, bracketed
    inserts, adverbs (PARSEADVERBS), participle clauses (PARSEVERBS) and
    newline breaks (MULTILINES). It then picks the noun, accords adjective
    and noun morphology variants, and disambiguates adjective forms with a
    last-letter frequency statistic.

    :param first: token to start from
    :param typ: NounPhraseParseAttr flags controlling the parse
    :param max_char_pos: when > 0, tokens beyond this char position stop the scan
    :param def_noun: externally supplied noun item appended to the candidates
    :return: the parsed NounPhraseToken, or None when no acceptable noun
        phrase starts at *first*

    NOTE(review): order-sensitive generated code - the statements below are
    kept token-identical to the original; only formatting/comments added.
    """
    if (first is None):
        return None
    items = None
    adverbs = None
    prep = None
    kak = False
    t0 = first
    # Optional leading "КАК" + preposition (only with PARSEPREPOSITION).
    if ((((typ) & (NounPhraseParseAttr.PARSEPREPOSITION))) != (NounPhraseParseAttr.NO) and t0.is_value("КАК", None)):
        t0 = t0.next0_
        prep = PrepositionHelper.try_parse(t0)
        if (prep is not None):
            t0 = prep.end_token.next0_
        kak = True
    internal_noun_prase = None
    conj_before = False
    t = t0
    # --- main collection loop over candidate tokens ---
    first_pass3041 = True
    while True:
        if first_pass3041:
            first_pass3041 = False
        else:
            t = t.next0_
        if (not (t is not None)):
            break
        if (max_char_pos > 0 and t.begin_char > max_char_pos):
            break
        if ((t.morph.class0_.is_conjunction and not t.morph.class0_.is_adjective and not t.morph.class0_.is_pronoun) and not t.morph.class0_.is_noun):
            if (conj_before):
                break
            if ((((typ) & (NounPhraseParseAttr.CANNOTHASCOMMAAND))) != (NounPhraseParseAttr.NO)):
                break
            if (items is not None and ((t.is_and or t.is_or))):
                conj_before = True
                # "и/или" and "и (или)" variants
                if ((t.next0_ is not None and t.next0_.is_char_of("\\/") and t.next0_.next0_ is not None) and t.next0_.next0_.is_or):
                    t = t.next0_.next0_
                if (((t.next0_ is not None and t.next0_.is_char('(') and t.next0_.next0_ is not None) and t.next0_.next0_.is_or and t.next0_.next0_.next0_ is not None) and t.next0_.next0_.next0_.is_char(')')):
                    t = t.next0_.next0_.next0_
                continue
            break
        elif (t.is_comma):
            if (conj_before or items is None):
                break
            if ((((typ) & (NounPhraseParseAttr.CANNOTHASCOMMAAND))) != (NounPhraseParseAttr.NO)):
                break
            mc = t.previous.get_morph_class_in_dictionary()
            if (mc.is_proper_surname or mc.is_proper_secname):
                break
            conj_before = True
            # second half of "КАК ..., ТАК И ..."
            if (kak and t.next0_ is not None and t.next0_.is_value("ТАК", None)):
                t = t.next0_
                if (t.next0_ is not None and t.next0_.is_and):
                    t = t.next0_
                pr = PrepositionHelper.try_parse(t.next0_)
                if (pr is not None):
                    t = pr.end_token
            if (items[len(items) - 1].can_be_noun and items[len(items) - 1].end_token.morph.class0_.is_pronoun):
                break
            continue
        elif (t.is_char('(')):
            if (items is None):
                return None
            brr = BracketHelper.try_parse(t, BracketParseAttr.NO, 100)
            if (brr is None):
                break
            if (brr.length_char > 100):
                break
            t = brr.end_token
            continue
        if (isinstance(t, ReferentToken)):
            if ((((typ) & (NounPhraseParseAttr.REFERENTCANBENOUN))) == (NounPhraseParseAttr.NO)):
                break
        elif (t.chars.is_latin_letter):
            break
        it = NounPhraseItem.try_parse(t, items, typ)
        if (it is None or ((not it.can_be_adj and not it.can_be_noun))):
            # capitalized unknown word after adjectives may still close the phrase
            if (((it is not None and items is not None and t.chars.is_capital_upper) and (t.whitespaces_before_count < 3) and t.length_char > 3) and not t.get_morph_class_in_dictionary().is_noun and not t.get_morph_class_in_dictionary().is_adjective):
                it.can_be_noun = True
                items.append(it)
                break
            if ((((typ) & (NounPhraseParseAttr.PARSEADVERBS))) != (NounPhraseParseAttr.NO) and (isinstance(t, TextToken)) and t.morph.class0_.is_adverb):
                if (adverbs is None):
                    adverbs = list()
                adverbs.append(Utils.asObjectOrNull(t, TextToken))
                continue
            break
        it.conj_before = conj_before
        conj_before = False
        if (not it.can_be_adj and not it.can_be_noun):
            break
        if (t.is_newline_before and t != first):
            if ((((typ) & (NounPhraseParseAttr.MULTILINES))) != (NounPhraseParseAttr.NO)):
                pass
            elif (items is not None and t.chars != items[len(items) - 1].chars):
                if (t.chars.is_all_lower and items[len(items) - 1].chars.is_capital_upper):
                    pass
                else:
                    break
        if (items is None):
            items = list()
        else:
            it0 = items[len(items) - 1]
            if (it0.can_be_noun and it0.is_personal_pronoun):
                if (it.is_pronoun):
                    break
                if ((it0.begin_token.previous is not None and it0.begin_token.previous.get_morph_class_in_dictionary().is_verb and not it0.begin_token.previous.get_morph_class_in_dictionary().is_adjective) and not it0.begin_token.previous.get_morph_class_in_dictionary().is_preposition):
                    if (t.morph.case_.is_nominative or t.morph.case_.is_accusative):
                        pass
                    else:
                        break
            if (it.can_be_noun and it.is_verb):
                if (it0.previous is None):
                    pass
                elif ((isinstance(it0.previous, TextToken)) and not it0.previous.chars.is_letter):
                    pass
                else:
                    break
        items.append(it)
        t = it.end_token
        if (t.is_newline_after and not t.chars.is_all_lower):
            mc = t.get_morph_class_in_dictionary()
            if (mc.is_proper_surname):
                break
            if (t.morph.class0_.is_proper_surname and mc.is_undefined):
                break
    if (items is None):
        return None
    tt1 = None
    # --- single adjective followed by "и/или ..." : try to merge with the
    #     following noun phrase ("X и Y noun") ---
    if (len(items) == 1 and items[0].can_be_adj):
        and0_ = False
        tt1 = items[0].end_token.next0_
        first_pass3042 = True
        while True:
            if first_pass3042:
                first_pass3042 = False
            else:
                tt1 = tt1.next0_
            if (not (tt1 is not None)):
                break
            if (tt1.is_and or tt1.is_or):
                and0_ = True
                break
            if (tt1.is_comma or tt1.is_value("НО", None) or tt1.is_value("ТАК", None)):
                continue
            break
        if (and0_):
            if (items[0].can_be_noun and items[0].is_personal_pronoun):
                and0_ = False
        if (and0_):
            tt2 = tt1.next0_
            if (tt2 is not None and tt2.morph.class0_.is_preposition):
                tt2 = tt2.next0_
            npt1 = _NounPraseHelperInt.__try_parse_ru(tt2, typ, max_char_pos, None)
            if (npt1 is not None and len(npt1.adjectives) > 0):
                ok1 = False
                for av in items[0].adj_morph:
                    for v in npt1.noun.noun_morph:
                        if (v.check_accord(av, False, False)):
                            items[0].morph.add_item(av)
                            ok1 = True
                if (ok1):
                    npt1.begin_token = items[0].begin_token
                    npt1.end_token = tt1.previous
                    npt1.adjectives.clear()
                    npt1.adjectives.append(items[0])
                    return npt1
    if (def_noun is not None):
        items.append(def_noun)
    last1 = items[len(items) - 1]
    check = True
    for it in items:
        if (not it.can_be_adj):
            check = False
            break
        elif (it.can_be_noun and it.is_personal_pronoun):
            check = False
            break
    tt1 = last1.end_token.next0_
    # --- all items could be adjectives and a preposition/instrumental
    #     follows: try an internal phrase ("managed by X" pattern) ---
    if ((tt1 is not None and check and ((tt1.morph.class0_.is_preposition or tt1.morph.case_.is_instrumental))) and (tt1.whitespaces_before_count < 2)):
        inp = NounPhraseHelper.try_parse(tt1, Utils.valToEnum((typ) | (NounPhraseParseAttr.PARSEPREPOSITION), NounPhraseParseAttr), max_char_pos, None)
        if (inp is not None):
            tt1 = inp.end_token.next0_
            npt1 = _NounPraseHelperInt.__try_parse_ru(tt1, typ, max_char_pos, None)
            if (npt1 is not None):
                ok = True
                ii = 0
                first_pass3043 = True
                while True:
                    if first_pass3043:
                        first_pass3043 = False
                    else:
                        ii += 1
                    if (not (ii < len(items))):
                        break
                    it = items[ii]
                    if (NounPhraseItem.try_accord_adj_and_noun(it, Utils.asObjectOrNull(npt1.noun, NounPhraseItem))):
                        continue
                    if (ii > 0):
                        inp2 = NounPhraseHelper.try_parse(it.begin_token, typ, max_char_pos, None)
                        if (inp2 is not None and inp2.end_token == inp.end_token):
                            del items[ii:ii + len(items) - ii]
                            inp = inp2
                            break
                    ok = False
                    break
                if (ok):
                    if (npt1.morph.case_.is_genitive and not inp.morph.case_.is_instrumental):
                        ok = False
                if (ok):
                    i = 0
                    while i < len(items):
                        npt1.adjectives.insert(i, items[i])
                        i += 1
                    npt1.internal_noun = inp
                    mmm = MorphCollection(npt1.morph)
                    for it in items:
                        mmm.remove_items(it.adj_morph[0], False)
                    if (mmm.gender != MorphGender.UNDEFINED or mmm.number != MorphNumber.UNDEFINED or not mmm.case_.is_undefined):
                        npt1.morph = mmm
                    if (adverbs is not None):
                        if (npt1.adverbs is None):
                            npt1.adverbs = adverbs
                        else:
                            npt1.adverbs[0:0] = adverbs
                    npt1.begin_token = first
                    return npt1
            if (tt1 is not None and tt1.morph.class0_.is_noun and not tt1.morph.case_.is_genitive):
                it = NounPhraseItem.try_parse(tt1, items, typ)
                if (it is not None and it.can_be_noun):
                    internal_noun_prase = inp
                    inp.begin_token = items[0].end_token.next0_
                    items.append(it)
    # --- participle clauses inside the phrase (PARSEVERBS) ---
    i = 0
    first_pass3044 = True
    while True:
        if first_pass3044:
            first_pass3044 = False
        else:
            i += 1
        if (not (i < len(items))):
            break
        if (items[i].can_be_adj and items[i].begin_token.morph.class0_.is_verb):
            it = items[i].begin_token
            if (not it.get_morph_class_in_dictionary().is_verb):
                continue
            if (it.is_value("УПОЛНОМОЧЕННЫЙ", None)):
                continue
            if ((((typ) & (NounPhraseParseAttr.PARSEVERBS))) == (NounPhraseParseAttr.NO)):
                continue
            inp = _NounPraseHelperInt.__try_parse_ru(items[i].end_token.next0_, NounPhraseParseAttr.NO, max_char_pos, None)
            if (inp is None):
                continue
            if (inp.anafor is not None and i == (len(items) - 1) and NounPhraseItem.try_accord_adj_and_noun(items[i], Utils.asObjectOrNull(inp.noun, NounPhraseItem))):
                inp.begin_token = first
                ii = 0
                while ii < len(items):
                    inp.adjectives.insert(ii, items[ii])
                    ii += 1
                return inp
            if (inp.end_token.whitespaces_after_count > 3):
                continue
            npt1 = _NounPraseHelperInt.__try_parse_ru(inp.end_token.next0_, NounPhraseParseAttr.NO, max_char_pos, None)
            if (npt1 is None):
                continue
            ok = True
            j = 0
            while j <= i:
                if (not NounPhraseItem.try_accord_adj_and_noun(items[j], Utils.asObjectOrNull(npt1.noun, NounPhraseItem))):
                    ok = False
                    break
                j += 1
            if (not ok):
                continue
            verb = VerbPhraseHelper.try_parse(it, True, False, False)
            if (verb is None):
                continue
            vlinks = SemanticHelper.try_create_links(verb, inp, None)
            nlinks = SemanticHelper.try_create_links(inp, npt1, None)
            if (len(vlinks) == 0 and len(nlinks) > 0):
                continue
            j = 0
            while j <= i:
                npt1.adjectives.insert(j, items[j])
                j += 1
            items[i].end_token = inp.end_token
            mmm = MorphCollection(npt1.morph)
            bil = list()
            j = 0
            while j <= i:
                bil.clear()
                for m in items[j].adj_morph:
                    bil.append(m)
                mmm.remove_items_list_cla(bil, None)
                j += 1
            if (mmm.gender != MorphGender.UNDEFINED or mmm.number != MorphNumber.UNDEFINED or not mmm.case_.is_undefined):
                npt1.morph = mmm
            if (adverbs is not None):
                if (npt1.adverbs is None):
                    npt1.adverbs = adverbs
                else:
                    npt1.adverbs[0:0] = adverbs
            npt1.begin_token = first
            return npt1
    # --- ADJECTIVECANBELAST: allow "noun adjective" order for one item ---
    ok2 = False
    if ((len(items) == 1 and (((typ) & (NounPhraseParseAttr.ADJECTIVECANBELAST))) != (NounPhraseParseAttr.NO) and (items[0].whitespaces_after_count < 3)) and not items[0].is_adverb):
        if (not items[0].can_be_adj):
            ok2 = True
        elif (items[0].is_personal_pronoun and items[0].can_be_noun):
            ok2 = True
    if (ok2):
        it = NounPhraseItem.try_parse(items[0].end_token.next0_, None, typ)
        if (it is not None and it.can_be_adj and it.begin_token.chars.is_all_lower):
            ok2 = True
            if (it.is_adverb or it.is_verb):
                ok2 = False
            if (it.is_pronoun and items[0].is_pronoun):
                ok2 = False
            if (it.can_be_adj_for_personal_pronoun and items[0].is_personal_pronoun):
                ok2 = True
            if (ok2 and NounPhraseItem.try_accord_adj_and_noun(it, items[0])):
                npt1 = _NounPraseHelperInt.__try_parse_ru(it.begin_token, typ, max_char_pos, None)
                if (npt1 is not None and ((npt1.end_char > it.end_char or len(npt1.adjectives) > 0))):
                    pass
                else:
                    items.insert(0, it)
    # --- choose the noun: rightmost item that can be a noun ---
    noun = None
    adj_after = None
    for i in range(len(items) - 1, -1, -1):
        if (items[i].can_be_noun):
            if (items[i].conj_before):
                continue
            if (i > 0 and not items[i - 1].can_be_adj):
                continue
            if (i > 0 and items[i - 1].can_be_noun):
                if (items[i - 1].is_doubt_adjective):
                    continue
                if (items[i - 1].is_pronoun and items[i].is_pronoun):
                    if (items[i].is_pronoun and items[i - 1].can_be_adj_for_personal_pronoun):
                        pass
                    else:
                        continue
            noun = items[i]
            del items[i:i + len(items) - i]
            if (adj_after is not None):
                items.append(adj_after)
            elif (len(items) > 0 and items[0].can_be_noun and not items[0].can_be_adj):
                noun = items[0]
                items.clear()
            break
    if (noun is None):
        return None
    res = NounPhraseToken._new466(first, noun.end_token, prep)
    if (adverbs is not None):
        for a in adverbs:
            if (a.begin_char < noun.begin_char):
                if (len(items) == 0 and prep is None):
                    return None
                if (res.adverbs is None):
                    res.adverbs = list()
                res.adverbs.append(a)
    res.noun = (noun)
    res.multi_nouns = noun.multi_nouns
    if (kak):
        res.multi_nouns = True
    res.internal_noun = internal_noun_prase
    for v in noun.noun_morph:
        noun.morph.add_item(v)
    res.morph = noun.morph
    # a nominative after a preposition cannot really be nominative
    if (res.morph.case_.is_nominative and first.previous is not None and first.previous.morph.class0_.is_preposition):
        res.morph.case_ = (res.morph.case_) ^ MorphCase.NOMINATIVE
    if ((((typ) & (NounPhraseParseAttr.PARSEPRONOUNS))) == (NounPhraseParseAttr.NO) and ((res.morph.class0_.is_pronoun or res.morph.class0_.is_personal_pronoun))):
        return None
    # stat: last-letter frequency of accorded adjective normal forms,
    # used below to reorder ambiguous adjective variants
    stat = None
    if (len(items) > 1):
        stat = dict()
    need_update_morph = False
    # --- accord noun morphology variants with every adjective ---
    if (len(items) > 0):
        ok_list = list()
        is_num_not = False
        for vv in noun.noun_morph:
            i = 0
            v = vv
            i = 0
            while i < len(items):
                ok = False
                for av in items[i].adj_morph:
                    if (v.check_accord(av, False, False)):
                        ok = True
                        if (not ((av.case_) & v.case_).is_undefined and av.case_ != v.case_):
                            v.case_ = av.case_ = (av.case_) & v.case_
                        break
                if (not ok):
                    if (items[i].can_be_numeric_adj and items[i].try_accord_var(v, False)):
                        ok = True
                        v1 = NounPhraseItemTextVar()
                        v1.copy_from_item(v)
                        v1.number = MorphNumber.PLURAL
                        is_num_not = True
                        v1.case_ = MorphCase()
                        for a in items[i].adj_morph:
                            v1.case_ = (v1.case_) | a.case_
                        v = v1
                    else:
                        break
                i += 1
            if (i >= len(items)):
                ok_list.append(v)
        if (len(ok_list) > 0 and (((len(ok_list) < res.morph.items_count) or is_num_not))):
            res.morph = MorphCollection()
            for v in ok_list:
                res.morph.add_item(v)
            if (not is_num_not):
                noun.morph = res.morph
    # --- attach adjectives to the result, narrowing cases where possible ---
    i = 0
    first_pass3045 = True
    while True:
        if first_pass3045:
            first_pass3045 = False
        else:
            i += 1
        if (not (i < len(items))):
            break
        for av in items[i].adj_morph:
            for v in noun.noun_morph:
                if (v.check_accord(av, False, False)):
                    if (not ((av.case_) & v.case_).is_undefined and av.case_ != v.case_):
                        v.case_ = av.case_ = (av.case_) & v.case_
                        need_update_morph = True
                    items[i].morph.add_item(av)
                    if (stat is not None and av.normal_value is not None and len(av.normal_value) > 1):
                        last = av.normal_value[len(av.normal_value) - 1]
                        if (not last in stat):
                            stat[last] = 1
                        else:
                            stat[last] += 1
        if (items[i].is_pronoun or items[i].is_personal_pronoun):
            res.anafor = items[i].begin_token
            if ((((typ) & (NounPhraseParseAttr.PARSEPRONOUNS))) == (NounPhraseParseAttr.NO)):
                continue
        tt = Utils.asObjectOrNull(items[i].begin_token, TextToken)
        if (tt is not None and not tt.term.startswith("ВЫСШ")):
            err = False
            for wf in tt.morph.items:
                if (wf.class0_.is_adjective):
                    if (wf.contains_attr("прев.", None)):
                        if ((((typ) & (NounPhraseParseAttr.IGNOREADJBEST))) != (NounPhraseParseAttr.NO)):
                            err = True
                    if (wf.contains_attr("к.ф.", None) and tt.morph.class0_.is_personal_pronoun):
                        return None
            if (err):
                continue
        if (res.morph.case_.is_nominative):
            v = MiscHelper.get_text_value_of_meta_token(items[i], GetTextAttr.KEEPQUOTES)
            if (not Utils.isNullOrEmpty(v)):
                if (items[i].get_normal_case_text(None, MorphNumber.UNDEFINED, MorphGender.UNDEFINED, False) != v):
                    wf = NounPhraseItemTextVar(items[i].morph, None)
                    wf.normal_value = v
                    wf.class0_ = MorphClass.ADJECTIVE
                    wf.case_ = res.morph.case_
                    if (res.morph.case_.is_prepositional or res.morph.gender == MorphGender.NEUTER or res.morph.gender == MorphGender.FEMINIE):
                        items[i].morph.add_item(wf)
                    else:
                        items[i].morph.insert_item(0, wf)
        res.adjectives.append(items[i])
        if (items[i].end_char > res.end_char):
            res.end_token = items[i].end_token
    # --- sanity checks on whitespace/casing between adjacent adjectives ---
    i = 0
    first_pass3046 = True
    while True:
        if first_pass3046:
            first_pass3046 = False
        else:
            i += 1
        if (not (i < (len(res.adjectives) - 1))):
            break
        if (res.adjectives[i].whitespaces_after_count > 5):
            if (res.adjectives[i].chars != res.adjectives[i + 1].chars):
                if (not res.adjectives[i + 1].chars.is_all_lower):
                    return None
                if (res.adjectives[i].chars.is_all_upper and res.adjectives[i + 1].chars.is_capital_upper):
                    return None
                if (res.adjectives[i].chars.is_capital_upper and res.adjectives[i + 1].chars.is_all_upper):
                    return None
            if (res.adjectives[i].whitespaces_after_count > 10):
                if (res.adjectives[i].newlines_after_count == 1):
                    if (res.adjectives[i].chars.is_capital_upper and i == 0 and res.adjectives[i + 1].chars.is_all_lower):
                        continue
                    if (res.adjectives[i].chars == res.adjectives[i + 1].chars):
                        continue
                return None
    if (need_update_morph):
        noun.morph = MorphCollection()
        for v in noun.noun_morph:
            noun.morph.add_item(v)
        res.morph = noun.morph
    # --- comma/conjunction balance between adjectives ---
    if (len(res.adjectives) > 0):
        if (noun.begin_token.previous is not None):
            if (noun.begin_token.previous.is_comma_and):
                if (res.adjectives[0].begin_char > noun.begin_char):
                    pass
                else:
                    return None
        zap = 0
        and0_ = 0
        cou = 0
        last_and = False
        i = 0
        while i < (len(res.adjectives) - 1):
            te = res.adjectives[i].end_token.next0_
            if (te is None):
                return None
            if (te.is_char('(')):
                pass
            elif (te.is_comma):
                zap += 1
                last_and = False
            elif (te.is_and or te.is_or):
                and0_ += 1
                last_and = True
            if (not res.adjectives[i].begin_token.morph.class0_.is_pronoun):
                cou += 1
            i += 1
        if ((zap + and0_) > 0):
            if (and0_ > 1):
                return None
            elif (and0_ == 1 and not last_and):
                return None
            if ((zap + and0_) != cou):
                if (and0_ == 1):
                    pass
                else:
                    return None
            last = Utils.asObjectOrNull(res.adjectives[len(res.adjectives) - 1], NounPhraseItem)
            if (last.is_pronoun and not last_and):
                return None
    # --- reorder ambiguous adjective variants by the last-letter statistic ---
    if (stat is not None):
        for adj in items:
            if (adj.morph.items_count > 1):
                w1 = Utils.asObjectOrNull(adj.morph.get_indexer_item(0), NounPhraseItemTextVar)
                w2 = Utils.asObjectOrNull(adj.morph.get_indexer_item(1), NounPhraseItemTextVar)
                if ((len(w1.normal_value) < 2) or (len(w2.normal_value) < 2)):
                    break
                l1 = w1.normal_value[len(w1.normal_value) - 1]
                l2 = w2.normal_value[len(w2.normal_value) - 1]
                i1 = 0
                i2 = 0
                wrapi1468 = RefOutArgWrapper(0)
                Utils.tryGetValue(stat, l1, wrapi1468)
                i1 = wrapi1468.value
                wrapi2467 = RefOutArgWrapper(0)
                Utils.tryGetValue(stat, l2, wrapi2467)
                i2 = wrapi2467.value
                if (i1 < i2):
                    adj.morph.remove_item(1)
                    adj.morph.insert_item(0, w2)
    # --- reject phrases that actually start a verb clause ---
    if (res.begin_token.get_morph_class_in_dictionary().is_verb and len(items) > 0):
        if (not res.begin_token.chars.is_all_lower or res.begin_token.previous is None):
            pass
        elif (res.begin_token.previous.morph.class0_.is_preposition):
            pass
        else:
            comma = False
            tt = res.begin_token.previous
            first_pass3047 = True
            while True:
                if first_pass3047:
                    first_pass3047 = False
                else:
                    tt = tt.previous
                if (not (tt is not None and tt.end_char <= res.end_char)):
                    break
                if (tt.morph.class0_.is_adverb):
                    continue
                if (tt.is_char_of(".;")):
                    break
                if (tt.is_comma):
                    comma = True
                    continue
                if (tt.is_value("НЕ", None)):
                    continue
                if (((tt.morph.class0_.is_noun or tt.morph.class0_.is_proper)) and comma):
                    for it in res.begin_token.morph.items:
                        if (it.class0_.is_verb and (isinstance(it, MorphWordForm))):
                            if (tt.morph.check_accord(it, False, False)):
                                if (res.morph.case_.is_instrumental):
                                    return None
                    break
    # --- single-token result: adverbs are acceptable only in narrow cases ---
    if (res.begin_token == res.end_token):
        mc = res.begin_token.get_morph_class_in_dictionary()
        if (mc.is_adverb):
            if (res.begin_token.previous is not None and res.begin_token.previous.morph.class0_.is_preposition):
                pass
            elif (mc.is_noun and not mc.is_preposition and not mc.is_conjunction):
                pass
            elif (res.begin_token.is_value("ВЕСЬ", None)):
                pass
            else:
                return None
    if (def_noun is not None and def_noun.end_token == res.end_token and len(res.adjectives) > 0):
        res.end_token = res.adjectives[len(res.adjectives) - 1].end_token
    return res
def __try_parse_en(first: 'Token', typ: 'NounPhraseParseAttr', max_char_pos: int) -> 'NounPhraseToken':
    """English noun-phrase parser (internal).

    Collects Latin-letter tokens from *first* - an optional leading
    article, then adjective/noun candidates classified by their dictionary
    morph class. The last collected item becomes the noun and the preceding
    ones its adjectives. An article forces singular number in the resulting
    morphology. Returns None when no candidate sequence is found.

    NOTE(review): order-sensitive generated code - statements kept
    token-identical to the original; only formatting/comments added.
    """
    if (first is None):
        return None
    items = None
    has_article = False
    has_prop = False
    has_misc = False
    # a preposition right before the phrase relaxes the class checks below
    if (first.previous is not None and first.previous.morph.class0_.is_preposition and (first.whitespaces_before_count < 3)):
        has_prop = True
    t = first
    first_pass3048 = True
    while True:
        if first_pass3048:
            first_pass3048 = False
        else:
            t = t.next0_
        if (not (t is not None)):
            break
        if (max_char_pos > 0 and t.begin_char > max_char_pos):
            break
        if (not t.chars.is_latin_letter):
            break
        if (t != first and t.whitespaces_before_count > 2):
            if ((((typ) & (NounPhraseParseAttr.MULTILINES))) != (NounPhraseParseAttr.NO)):
                pass
            elif (MiscHelper.is_eng_article(t.previous)):
                pass
            else:
                break
        tt = Utils.asObjectOrNull(t, TextToken)
        if (t == first and tt is not None):
            if (MiscHelper.is_eng_article(tt)):
                has_article = True
                continue
        if (isinstance(t, ReferentToken)):
            if ((((typ) & (NounPhraseParseAttr.REFERENTCANBENOUN))) == (NounPhraseParseAttr.NO)):
                break
        elif (tt is None):
            break
        # "SO-CALLED ..." is swallowed as part of the phrase
        if ((t.is_value("SO", None) and t.next0_ is not None and t.next0_.is_hiphen) and t.next0_.next0_ is not None):
            if (t.next0_.next0_.is_value("CALL", None)):
                t = t.next0_.next0_
                continue
        mc = t.get_morph_class_in_dictionary()
        if (mc.is_conjunction or mc.is_preposition):
            break
        if (mc.is_pronoun or mc.is_personal_pronoun):
            if ((((typ) & (NounPhraseParseAttr.PARSEPRONOUNS))) == (NounPhraseParseAttr.NO)):
                break
        elif (mc.is_misc):
            if (t.is_value("THIS", None) or t.is_value("THAT", None)):
                has_misc = True
                if ((((typ) & (NounPhraseParseAttr.PARSEPRONOUNS))) == (NounPhraseParseAttr.NO)):
                    break
        is_adj = False
        if (((has_article or has_prop or has_misc)) and items is None):
            pass
        elif (isinstance(t, ReferentToken)):
            pass
        else:
            if (not mc.is_noun and not mc.is_adjective):
                if (mc.is_undefined and has_article):
                    pass
                elif (items is None and mc.is_undefined and t.chars.is_capital_upper):
                    pass
                elif (mc.is_pronoun):
                    pass
                elif (tt.term.endswith("EAN")):
                    is_adj = True
                elif (MiscHelper.is_eng_adj_suffix(tt.next0_)):
                    pass
                else:
                    break
            if (mc.is_verb):
                if (t.next0_ is not None and t.next0_.morph.class0_.is_verb and (t.whitespaces_after_count < 2)):
                    pass
                elif (t.chars.is_capital_upper and not MiscHelper.can_be_start_of_sentence(t)):
                    pass
                elif ((t.chars.is_capital_upper and mc.is_noun and (isinstance(t.next0_, TextToken))) and t.next0_.chars.is_capital_upper):
                    pass
                elif (isinstance(t, ReferentToken)):
                    pass
                else:
                    break
        if (items is None):
            items = list()
        it = NounPhraseItem(t, t)
        if (mc.is_noun):
            it.can_be_noun = True
        if (mc.is_adjective or mc.is_pronoun or is_adj):
            it.can_be_adj = True
        items.append(it)
        t = it.end_token
        # "...'s"-style adjective suffix turns the first item into an adjective
        if (len(items) == 1):
            if (MiscHelper.is_eng_adj_suffix(t.next0_)):
                mc.is_noun = False
                mc.is_adjective = True
                t = t.next0_.next0_
    if (items is None):
        return None
    # last item is the noun; the rest become adjectives
    noun = items[len(items) - 1]
    res = NounPhraseToken(first, noun.end_token)
    res.noun = (noun)
    res.morph = MorphCollection()
    for v in noun.end_token.morph.items:
        if (v.class0_.is_verb):
            continue
        if (v.class0_.is_proper and noun.begin_token.chars.is_all_lower):
            continue
        if (isinstance(v, MorphWordForm)):
            wf = MorphWordForm()
            wf.copy_from_word_form(Utils.asObjectOrNull(v, MorphWordForm))
            if (has_article and v.number != MorphNumber.SINGULAR):
                wf.number = MorphNumber.SINGULAR
            res.morph.add_item(wf)
        else:
            bi = MorphBaseInfo()
            bi.copy_from(v)
            if (has_article and v.number != MorphNumber.SINGULAR):
                bi.number = MorphNumber.SINGULAR
            res.morph.add_item(bi)
    # an article with no surviving variants still implies a singular noun
    if (res.morph.items_count == 0 and has_article):
        res.morph.add_item(MorphBaseInfo._new192(MorphClass.NOUN, MorphNumber.SINGULAR))
    i = 0
    while i < (len(items) - 1):
        res.adjectives.append(items[i])
        i += 1
    return res
def try_parse(t: 'Token', items: typing.List['NounPhraseItem'], attrs: 'NounPhraseParseAttr') -> 'NounPhraseItem':
    """Try to parse one noun-phrase item (adjective or noun candidate) at `t`.

    Collects the adjective readings (adj_morph) and noun readings (noun_morph)
    of the token that agree morphologically with the already-collected `items`.

    Args:
        t: token to analyze (may be None).
        items: items already collected for the current phrase (may be None).
        attrs: NounPhraseParseAttr bit flags.

    Returns:
        NounPhraseItem or None if the token cannot be part of a phrase.
    """
    if (t is None):
        return None
    t0 = t
    _can_be_surname = False
    _is_doubt_adj = False
    rt = Utils.asObjectOrNull(t, ReferentToken)
    # a single-token ReferentToken: parse the inner token, then re-point the
    # result at the wrapper
    if (rt is not None and rt.begin_token == rt.end_token and (isinstance(rt.begin_token, TextToken))):
        res = NounPhraseItem.try_parse(rt.begin_token, items, attrs)
        if (res is not None):
            res.begin_token = res.end_token = t
            res.can_be_noun = True
            return res
    if (rt is not None):
        # multi-token entity: it can only be a noun; normal value is its string
        res = NounPhraseItem(t, t)
        for m in t.morph.items:
            v = NounPhraseItemTextVar(m, None)
            v.normal_value = str(t.get_referent())
            res.noun_morph.append(v)
        res.can_be_noun = True
        return res
    if (isinstance(t, NumberToken)):
        pass  # NOTE(review): intentional no-op — numbers handled below
    has_legal_verb = False
    if (isinstance(t, TextToken)):
        if (not t.chars.is_letter):
            return None
        str0_ = t.term
        # words ending in А/О may be verb or adverb forms — filter those out
        if (str0_[len(str0_) - 1] == 'А' or str0_[len(str0_) - 1] == 'О'):
            for wf in t.morph.items:
                if ((isinstance(wf, MorphWordForm)) and wf.is_in_dictionary):
                    if (wf.class0_.is_verb):
                        mc = t.get_morph_class_in_dictionary()
                        if (not mc.is_noun and (((attrs) & (NounPhraseParseAttr.IGNOREPARTICIPLES))) == (NounPhraseParseAttr.NO)):
                            if (not LanguageHelper.ends_with_ex(str0_, "ОГО", "ЕГО", None, None)):
                                return None
                        has_legal_verb = True
                    if (wf.class0_.is_adverb):
                        if (t.next0_ is None or not t.next0_.is_hiphen):
                            # whitelist of adverbs that may still open a phrase
                            if ((str0_ == "ВСЕГО" or str0_ == "ДОМА" or str0_ == "НЕСКОЛЬКО") or str0_ == "МНОГО" or str0_ == "ПОРЯДКА"):
                                pass
                            else:
                                return None
                    if (wf.class0_.is_adjective):
                        if (wf.contains_attr("к.ф.", None)):
                            # short-form adjective ("к.ф.") is a doubtful reading
                            if (t.get_morph_class_in_dictionary() == MorphClass.ADJECTIVE):
                                pass
                            else:
                                _is_doubt_adj = True
        mc0 = t.morph.class0_
        # proper-surname forms: -ИН/-ЕН/-ЫН out of dictionary may be surnames
        if (mc0.is_proper_surname and not t.chars.is_all_lower):
            for wf in t.morph.items:
                if (wf.class0_.is_proper_surname and wf.number != MorphNumber.PLURAL):
                    wff = Utils.asObjectOrNull(wf, MorphWordForm)
                    if (wff is None):
                        continue
                    s = Utils.ifNotNull((Utils.ifNotNull(wff.normal_full, wff.normal_case)), "")
                    if (LanguageHelper.ends_with_ex(s, "ИН", "ЕН", "ЫН", None)):
                        if (not wff.is_in_dictionary):
                            _can_be_surname = True
                        else:
                            return None
                    if (wff.is_in_dictionary and LanguageHelper.ends_with(s, "ОВ")):
                        _can_be_surname = True
        # proper-name dictionary forms generally block the phrase reading
        if (mc0.is_proper_name and not t.chars.is_all_lower):
            for wff in t.morph.items:
                wf = Utils.asObjectOrNull(wff, MorphWordForm)
                if (wf is None):
                    continue
                if (wf.normal_case == "ГОР"):
                    continue
                if (wf.class0_.is_proper_name and wf.is_in_dictionary):
                    if (wf.normal_case is None or not wf.normal_case.startswith("ЛЮБ")):
                        if (mc0.is_adjective and t.morph.contains_attr("неизм.", None)):
                            pass
                        elif ((((attrs) & (NounPhraseParseAttr.REFERENTCANBENOUN))) == (NounPhraseParseAttr.REFERENTCANBENOUN)):
                            pass
                        else:
                            if (items is None or (len(items) < 1)):
                                return None
                            if (not items[0].is_std_adjective):
                                return None
        # a lone comparative-degree adjective reading is rejected
        if (mc0.is_adjective and t.morph.items_count == 1):
            if (t.morph.get_indexer_item(0).contains_attr("в.ср.ст.", None)):
                return None
        mc1 = t.get_morph_class_in_dictionary()
        if (mc1 == MorphClass.VERB and t.morph.case_.is_undefined):
            return None
        # IGNOREPARTICIPLES: drop active-voice participles unless reflexive (-СЯ)
        if (((((attrs) & (NounPhraseParseAttr.IGNOREPARTICIPLES))) == (NounPhraseParseAttr.IGNOREPARTICIPLES) and t.morph.class0_.is_verb and not t.morph.class0_.is_noun) and not t.morph.class0_.is_proper):
            for wf in t.morph.items:
                if (wf.class0_.is_verb):
                    if (wf.contains_attr("дейст.з.", None)):
                        if (LanguageHelper.ends_with(t.term, "СЯ")):
                            pass
                        else:
                            return None
    t1 = None
    # two passes: k == 0 for the token itself (possibly extended over a
    # hyphenated pair), k == 1 retries with an extended end token (t1)
    for k in range(2):
        t = (Utils.ifNotNull(t1, t0))
        if (k == 0):
            # hyphenated compound (X-Y): analyze the second part as the head
            if (((isinstance(t0, TextToken)) and t0.next0_ is not None and t0.next0_.is_hiphen) and t0.next0_.next0_ is not None):
                if (not t0.is_whitespace_after and not t0.morph.class0_.is_pronoun and not (isinstance(t0.next0_.next0_, NumberToken))):
                    if (not t0.next0_.is_whitespace_after):
                        t = t0.next0_.next0_
                    elif (t0.next0_.next0_.chars.is_all_lower and LanguageHelper.ends_with(t0.term, "О")):
                        t = t0.next0_.next0_
        it = NounPhraseItem._new404(t0, t, _can_be_surname)
        if (t0 == t and (isinstance(t0, ReferentToken))):
            it.can_be_noun = True
            it.morph = MorphCollection(t0.morph)
        can_be_prepos = False
        for v in t.morph.items:
            wf = Utils.asObjectOrNull(v, MorphWordForm)
            # cased verb forms (participles) act as adjectives
            if (v.class0_.is_verb and not v.case_.is_undefined):
                it.can_be_adj = True
                it.adj_morph.append(NounPhraseItemTextVar(v, t))
                continue
            if (v.class0_.is_preposition):
                can_be_prepos = True
            if (v.class0_.is_adjective or ((v.class0_.is_pronoun and not v.class0_.is_personal_pronoun and not v.contains_attr("неизм.", None))) or ((v.class0_.is_noun and (isinstance(t, NumberToken))))):
                # accept the adjective reading only if it agrees with prior items
                if (NounPhraseItem.try_accord_variant(items, (0 if items is None else len(items)), v, False)):
                    is_doub = False  # NOTE(review): assigned but never used
                    if (v.contains_attr("к.ф.", None)):
                        continue
                    if (v.contains_attr("собир.", None) and not (isinstance(t, NumberToken))):
                        if (wf is not None and wf.is_in_dictionary):
                            return None
                        continue
                    if (v.contains_attr("сравн.", None)):
                        continue
                    ok = True
                    if (isinstance(t, TextToken)):
                        s = t.term
                        if (s == "ПРАВО" or s == "ПРАВА"):
                            ok = False
                        elif (LanguageHelper.ends_with(s, "ОВ") and t.get_morph_class_in_dictionary().is_noun):
                            ok = False
                    elif (isinstance(t, NumberToken)):
                        if (v.class0_.is_noun and t.morph.class0_.is_adjective):
                            ok = False
                        elif (t.morph.class0_.is_noun and (((attrs) & (NounPhraseParseAttr.PARSENUMERICASADJECTIVE))) == (NounPhraseParseAttr.NO)):
                            ok = False
                    if (ok):
                        it.adj_morph.append(NounPhraseItemTextVar(v, t))
                        it.can_be_adj = True
                        if (_is_doubt_adj and t0 == t):
                            it.is_doubt_adjective = True
                        if (has_legal_verb and wf is not None and wf.is_in_dictionary):
                            it.can_be_noun = True
                        if (wf is not None and wf.class0_.is_pronoun):
                            it.can_be_noun = True
                            it.noun_morph.append(NounPhraseItemTextVar(v, t))
            can_be_noun_ = False
            if (isinstance(t, NumberToken)):
                pass
            elif (v.class0_.is_noun or ((wf is not None and wf.normal_case == "САМ"))):
                can_be_noun_ = True
            elif (v.class0_.is_personal_pronoun):
                if (items is None or len(items) == 0):
                    can_be_noun_ = True
                else:
                    for it1 in items:
                        if (it1.is_verb):
                            if (len(items) == 1 and not v.case_.is_nominative):
                                can_be_noun_ = True
                            else:
                                return None
                    if (len(items) == 1):
                        if (items[0].can_be_adj_for_personal_pronoun):
                            can_be_noun_ = True
            elif ((v.class0_.is_pronoun and ((items is None or len(items) == 0 or ((len(items) == 1 and items[0].can_be_adj_for_personal_pronoun)))) and wf is not None) and (((((wf.normal_case == "ТОТ" or wf.normal_full == "ТО" or wf.normal_case == "ТО") or wf.normal_case == "ЭТО" or wf.normal_case == "ВСЕ") or wf.normal_case == "ЧТО" or wf.normal_case == "КТО") or wf.normal_full == "КОТОРЫЙ" or wf.normal_case == "КОТОРЫЙ")):
                if (wf.normal_case == "ВСЕ"):
                    # "ВСЕ РАВНО" is an idiom, not a noun phrase
                    if (t.next0_ is not None and t.next0_.is_value("РАВНО", None)):
                        return None
                can_be_noun_ = True
            elif (wf is not None and ((Utils.ifNotNull(wf.normal_full, wf.normal_case))) == "КОТОРЫЙ" and (((attrs) & (NounPhraseParseAttr.PARSEPRONOUNS))) == (NounPhraseParseAttr.NO)):
                return None
            elif (v.class0_.is_proper and (isinstance(t, TextToken))):
                if (t.length_char > 4 or v.class0_.is_proper_name):
                    can_be_noun_ = True
            if (can_be_noun_):
                added = False
                # MULTINOUNS: allow "X, Y and Z" — all previous items joined by
                # conjunctions/commas may share the noun role
                if (items is not None and len(items) > 1 and (((attrs) & (NounPhraseParseAttr.MULTINOUNS))) != (NounPhraseParseAttr.NO)):
                    ok1 = True
                    ii = 1
                    while ii < len(items):
                        if (not items[ii].conj_before):
                            ok1 = False
                            break
                        ii += 1
                    if (ok1):
                        if (NounPhraseItem.try_accord_variant(items, (0 if items is None else len(items)), v, True)):
                            it.noun_morph.append(NounPhraseItemTextVar(v, t))
                            it.can_be_noun = True
                            it.multi_nouns = True
                            added = True
                if (not added):
                    if (NounPhraseItem.try_accord_variant(items, (0 if items is None else len(items)), v, False)):
                        it.noun_morph.append(NounPhraseItemTextVar(v, t))
                        it.can_be_noun = True
                        # invariant personal pronouns also get an any-case
                        # adjective reading
                        if (v.class0_.is_personal_pronoun and t.morph.contains_attr("неизм.", None) and not it.can_be_adj):
                            itt = NounPhraseItemTextVar(v, t)
                            itt.case_ = MorphCase.ALL_CASES
                            itt.number = MorphNumber.UNDEFINED
                            if (itt.normal_value is None):
                                pass
                            it.adj_morph.append(itt)
                            it.can_be_adj = True
                    elif ((len(items) > 0 and len(items[0].adj_morph) > 0 and items[0].adj_morph[0].number == MorphNumber.PLURAL) and not ((items[0].adj_morph[0].case_) & v.case_).is_undefined and not items[0].adj_morph[0].class0_.is_verb):
                        # plural adjective before "X, Y" — look ahead past the
                        # comma/conjunction to validate case agreement
                        if (t.next0_ is not None and t.next0_.is_comma_and and (isinstance(t.next0_.next0_, TextToken))):
                            npt2 = NounPhraseHelper.try_parse(t.next0_.next0_, attrs, 0, None)
                            if (npt2 is not None and npt2.preposition is None and not ((npt2.morph.case_) & v.case_ & items[0].adj_morph[0].case_).is_undefined):
                                it.noun_morph.append(NounPhraseItemTextVar(v, t))
                                it.can_be_noun = True
        if (t0 != t):
            # compound head: prepend the first part to the normal forms
            for v in it.adj_morph:
                v.correct_prefix(Utils.asObjectOrNull(t0, TextToken), False)
            for v in it.noun_morph:
                v.correct_prefix(Utils.asObjectOrNull(t0, TextToken), True)
        if (k == 1 and it.can_be_noun and not it.can_be_adj):
            if (t1 is not None):
                it.end_token = t1
            else:
                it.end_token = t0.next0_.next0_
            for v in it.noun_morph:
                if (v.normal_value is not None and (v.normal_value.find('-') < 0)):
                    v.normal_value = "{0}-{1}".format(v.normal_value, it.end_token.get_normal_case_text(None, MorphNumber.UNDEFINED, MorphGender.UNDEFINED, False))
        if (it.can_be_adj):
            if (NounPhraseItem.__m_std_adjectives.try_parse(it.begin_token, TerminParseAttr.NO) is not None):
                it.is_std_adjective = True
        # ambiguous word that is also a preposition: reject if a preposition
        # reading yields a longer/valid phrase
        if (can_be_prepos and it.can_be_noun):
            if (items is not None and len(items) > 0):
                npt1 = NounPhraseHelper.try_parse(t, Utils.valToEnum((NounPhraseParseAttr.PARSEPREPOSITION) | (NounPhraseParseAttr.PARSEPRONOUNS) | (NounPhraseParseAttr.PARSEVERBS), NounPhraseParseAttr), 0, None)
                if (npt1 is not None and npt1.end_char > t.end_char):
                    return None
            else:
                npt1 = NounPhraseHelper.try_parse(t.next0_, Utils.valToEnum((NounPhraseParseAttr.PARSEPRONOUNS) | (NounPhraseParseAttr.PARSEVERBS), NounPhraseParseAttr), 0, None)
                if (npt1 is not None):
                    mc = LanguageHelper.get_case_after_preposition(t.lemma)
                    if (not ((mc) & npt1.morph.case_).is_undefined):
                        return None
        if (it.can_be_noun or it.can_be_adj or k == 1):
            # absorb pronoun particles: -ЖЕ/-БЫ/-ЛИ/-Ж, -НИБУДЬ/-ЛИБО/-ТО
            if (it.begin_token.morph.class0_.is_pronoun):
                tt2 = it.end_token.next0_
                if ((tt2 is not None and tt2.is_hiphen and not tt2.is_whitespace_after) and not tt2.is_whitespace_before):
                    tt2 = tt2.next0_
                if (isinstance(tt2, TextToken)):
                    ss = tt2.term
                    if ((ss == "ЖЕ" or ss == "БЫ" or ss == "ЛИ") or ss == "Ж"):
                        it.end_token = tt2
                    elif (ss == "НИБУДЬ" or ss == "ЛИБО" or (((ss == "ТО" and tt2.previous.is_hiphen)) and it.can_be_adj)):
                        it.end_token = tt2
                        for m in it.adj_morph:
                            m.normal_value = "{0}-{1}".format(m.normal_value, ss)
                            if (m.single_number_value is not None):
                                m.single_number_value = "{0}-{1}".format(m.single_number_value, ss)
            return it
        if (t0 == t):
            # "БИЗНЕС <X>" compound: retry the loop with the extended end token
            if (t0.is_value("БИЗНЕС", None) and t0.next0_ is not None and t0.next0_.chars == t0.chars):
                t1 = t0.next0_
                continue
        return it
    return None
class TextToken(Token):
    """Input token produced by morphological analysis (a single word form)."""

    def __init__(self, source: 'MorphToken', kit_: 'AnalysisKit') -> None:
        """Build a TextToken from a MorphToken, copying term, lemma and the
        morphological word forms, and computing the invariant prefix length
        shared by the term and all its normal forms."""
        super().__init__(kit_, (0 if source is None else source.begin_char), (0 if source is None else source.end_char))
        self.term = None                      # upper-cased source word
        self.lemma = None                     # dictionary lemma (falls back to term)
        self.term0 = None                     # original-case term (set by factories)
        self.invariant_prefix_length = 0      # chars common to term and all normal forms
        self.max_length = 0                   # longest of term and its normal forms
        if (source is None):
            return
        self.chars = source.char_info
        self.term = source.term
        self.lemma = (Utils.ifNotNull(source.lemma, self.term))
        self.max_length = (len(self.term))
        self.morph = MorphCollection()
        if (source.word_forms is not None):
            for wf in source.word_forms:
                self.morph.addItem(wf)
                if (wf.normal_case is not None and (self.max_length < len(wf.normal_case))):
                    self.max_length = (len(wf.normal_case))
                if (wf.normal_full is not None and (self.max_length < len(wf.normal_full))):
                    self.max_length = (len(wf.normal_full))
        # grow the invariant prefix while every word form agrees with term[i]
        i = 0
        while i < len(self.term):
            ch = self.term[i]
            j = 0
            while j < self.morph.items_count:
                wf = Utils.asObjectOrNull(self.morph.getIndexerItem(j), MorphWordForm)
                if (wf.normal_case is not None):
                    if (i >= len(wf.normal_case)):
                        break
                    if (wf.normal_case[i] != ch):
                        break
                if (wf.normal_full is not None):
                    if (i >= len(wf.normal_full)):
                        break
                    if (wf.normal_full[i] != ch):
                        break
                j += 1
            if (j < self.morph.items_count):
                break
            self.invariant_prefix_length = ((i + 1))
            i += 1
        if (self.morph.language.is_undefined and not source.language.is_undefined):
            self.morph.language = source.language

    def getLemma(self) -> str:
        """Get the lemma (deprecated, use `lemma` instead)."""
        return self.lemma

    def __str__(self) -> str:
        # term followed by all its morphological readings
        res = Utils.newStringIO(self.term)
        for l_ in self.morph.items:
            print(", {0}".format(str(l_)), end="", file=res, flush=True)
        return Utils.toStringStringIO(res)

    def checkValue(self, dict0_: typing.List[tuple]) -> object:
        """Look the token up in a dictionary, first by term, then by each
        word form's normal_case and normal_full.

        Args:
            dict0_(typing.List[tuple]): dictionary to bind against.

        Returns:
            the bound value or None.
        """
        if (dict0_ is None):
            return None
        wrapres2699 = RefOutArgWrapper(None)
        inoutres2700 = Utils.tryGetValue(dict0_, self.term, wrapres2699)
        res = wrapres2699.value
        if (inoutres2700):
            return res
        if (self.morph is not None):
            for it in self.morph.items:
                mf = Utils.asObjectOrNull(it, MorphWordForm)
                if (mf is not None):
                    if (mf.normal_case is not None):
                        wrapres2695 = RefOutArgWrapper(None)
                        inoutres2696 = Utils.tryGetValue(dict0_, mf.normal_case, wrapres2695)
                        res = wrapres2695.value
                        if (inoutres2696):
                            return res
                    if (mf.normal_full is not None and mf.normal_case != mf.normal_full):
                        wrapres2697 = RefOutArgWrapper(None)
                        inoutres2698 = Utils.tryGetValue(dict0_, mf.normal_full, wrapres2697)
                        res = wrapres2697.value
                        if (inoutres2698):
                            return res
        return None

    def getSourceText(self) -> str:
        return super().getSourceText()

    def isValue(self, term_: str, termua: str = None) -> bool:
        """Check whether the token matches `term_` (or `termua` for Ukrainian)
        either literally or via one of its normal forms."""
        if (termua is not None and self.morph.language.is_ua):
            if (self.isValue(termua, None)):
                return True
        if (term_ is None):
            return False
        # cheap rejections by invariant prefix and maximal form length
        if (self.invariant_prefix_length > len(term_)):
            return False
        if (self.max_length >= len(self.term) and (self.max_length < len(term_))):
            return False
        if (term_ == self.term):
            return True
        for wf in self.morph.items:
            if ((wf).normal_case == term_ or (wf).normal_full == term_):
                return True
        return False

    @property
    def is_and(self) -> bool:
        """Is this the coordinating conjunction AND (any language)?"""
        if (not self.morph.class0_.is_conjunction):
            # a lone '&' also counts
            if (self.length_char == 1 and self.isChar('&')):
                return True
            return False
        val = self.term
        if (val == "И" or val == "AND" or val == "UND"):
            return True
        if (self.morph.language.is_ua):
            if (val == "І" or val == "ТА"):
                return True
        return False

    @property
    def is_or(self) -> bool:
        """Is this the coordinating conjunction OR (any language)?"""
        if (not self.morph.class0_.is_conjunction):
            return False
        val = self.term
        if (val == "ИЛИ" or val == "OR"):
            return True
        if (self.morph.language.is_ua):
            if (val == "АБО"):
                return True
        return False

    @property
    def is_letters(self) -> bool:
        # true if the first character of the term is alphabetic
        return str.isalpha(self.term[0])

    def getMorphClassInDictionary(self) -> 'MorphClass':
        """OR together the classes of all dictionary-confirmed word forms."""
        res = MorphClass()
        for wf in self.morph.items:
            if ((isinstance(wf, MorphWordForm)) and (wf).is_in_dictionary):
                res |= wf.class0_
        return res

    def getNormalCaseText(self, mc: 'MorphClass' = None, single_number: bool = False, gender: 'MorphGender' = MorphGender.UNDEFINED, keep_chars: bool = False) -> str:
        """Return the token text in nominative case.

        Args:
            mc: desired part of speech (None = any).
            single_number: force singular number.
            gender: desired gender.
            keep_chars: preserve the original character case.
        """
        from pullenti.ner.core.MiscHelper import MiscHelper
        empty = True
        if (mc is not None and mc.is_preposition):
            return LanguageHelper.normalizePreposition(self.term)
        for it in self.morph.items:
            if (mc is not None and not mc.is_undefined):
                # the reading's class must intersect the requested class
                cc = (it.class0_.value) & (mc.value)
                if (cc == 0):
                    continue
                if (MorphClass.isMiscInt(cc) and not MorphClass.isProperInt(cc) and mc.value != it.class0_.value):
                    continue
            wf = Utils.asObjectOrNull(it, MorphWordForm)
            normal_full = False
            if (gender != MorphGender.UNDEFINED):
                if ((((it.gender) & (gender))) == (MorphGender.UNDEFINED)):
                    if ((gender == MorphGender.MASCULINE and ((it.gender != MorphGender.UNDEFINED or it.number == MorphNumber.PLURAL)) and wf is not None) and wf.normal_full is not None):
                        normal_full = True
                    elif (gender == MorphGender.MASCULINE and it.class0_.is_personal_pronoun):
                        pass
                    else:
                        continue
            if (not it.case_.is_undefined):
                empty = False
            if (wf is not None):
                if (single_number and it.number == MorphNumber.PLURAL and wf.normal_full is not None):
                    le = len(wf.normal_case)
                    # reflexive -СЯ nouns keep their plural-case form
                    if ((le == (len(wf.normal_full) + 2) and le > 4 and wf.normal_case[le - 2] == 'С') and wf.normal_case[le - 1] == 'Я'):
                        res = wf.normal_case
                    else:
                        # NOTE(review): both branches are wf.normal_full — the
                        # conditional is redundant; presumably the else-branch
                        # was meant to differ (cf. the branch below). Confirm
                        # against the C# original before changing.
                        res = (wf.normal_full if normal_full else wf.normal_full)
                else:
                    res = (wf.normal_full if normal_full else (Utils.ifNotNull(wf.normal_case, self.term)))
                if (single_number and mc is not None and mc == MorphClass.NOUN):
                    # suppletive singular: ДЕТИ -> РЕБЕНОК
                    if (res == "ДЕТИ"):
                        res = "РЕБЕНОК"
                if (keep_chars):
                    if (self.chars.is_all_lower):
                        res = res.lower()
                    elif (self.chars.is_capital_upper):
                        res = MiscHelper.convertFirstCharUpperAndOtherLower(res)
                return res
        if (not empty):
            return None
        te = None
        if (single_number and mc is not None):
            # no suitable reading: ask the morphology engine directly
            bi = MorphBaseInfo._new549(MorphClass(mc), gender, MorphNumber.SINGULAR, self.morph.language)
            vars0_ = Morphology.getWordform(self.term, bi)
            if (vars0_ is not None):
                te = vars0_
        if (self.chars.is_cyrillic_letter and te is None and len(self.term) > 3):
            # heuristic de-inflection: strip -ОМ/-АМ or a trailing vowel
            ch0 = self.term[len(self.term) - 1]
            ch1 = self.term[len(self.term) - 2]
            if (ch0 == 'М' and ((ch1 == 'О' or ch1 == 'А'))):
                te = self.term[0:0 + len(self.term) - 2]
            elif (not LanguageHelper.isCyrillicVowel(ch1) and LanguageHelper.isCyrillicVowel(ch0)):
                te = self.term[0:0 + len(self.term) - 1]
        if (te is None):
            te = self.term
        if (keep_chars):
            if (self.chars.is_all_lower):
                return te.lower()
            elif (self.chars.is_capital_upper):
                return MiscHelper.convertFirstCharUpperAndOtherLower(te)
        return te

    @staticmethod
    def getSourceTextTokens(begin: 'Token', end: 'Token') -> typing.List['TextToken']:
        """Collect all TextTokens between `begin` and `end` inclusive,
        recursing into MetaTokens."""
        from pullenti.ner.MetaToken import MetaToken
        res = list()
        t = begin
        while t is not None and t != end.next0_ and t.end_char <= end.end_char:
            if (isinstance(t, TextToken)):
                res.append(Utils.asObjectOrNull(t, TextToken))
            elif (isinstance(t, MetaToken)):
                res.extend(TextToken.getSourceTextTokens((t).begin_token, (t).end_token))
            t = t.next0_
        return res

    @property
    def is_pure_verb(self) -> bool:
        """True if this token is a pure verb (no competing non-verb reading)."""
        ret = False
        if ((self.isValue("МОЖНО", None) or self.isValue("МОЖЕТ", None) or self.isValue("ДОЛЖНЫЙ", None)) or self.isValue("НУЖНО", None)):
            return True
        for it in self.morph.items:
            if ((isinstance(it, MorphWordForm)) and (it).is_in_dictionary):
                if (it.class0_.is_verb and it.case_.is_undefined):
                    ret = True
                elif (not it.class0_.is_verb):
                    # short-form adjectives do not disqualify the verb reading
                    if (it.class0_.is_adjective and it.containsAttr("к.ф.", None)):
                        pass
                    else:
                        return False
        return ret

    @property
    def is_verb_be(self) -> bool:
        """True for copular verbs: БЫТЬ, ЯВЛЯТЬСЯ, BE, IS, WAS, etc."""
        if ((self.isValue("БЫТЬ", None) or self.isValue("ЕСТЬ", None) or self.isValue("ЯВЛЯТЬ", None)) or self.isValue("BE", None)):
            return True
        if (self.term == "IS" or self.term == "WAS" or self.term == "BECAME"):
            return True
        if (self.term == "Є"):
            return True
        return False

    def _serialize(self, stream: io.IOBase) -> None:
        # base fields first, then the TextToken-specific ones
        from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
        super()._serialize(stream)
        SerializerHelper.serializeString(stream, self.term)
        SerializerHelper.serializeString(stream, self.lemma)
        SerializerHelper.serializeShort(stream, self.invariant_prefix_length)
        SerializerHelper.serializeShort(stream, self.max_length)

    def _deserialize(self, stream: io.IOBase, kit_: 'AnalysisKit', vers: int) -> None:
        # mirror of _serialize
        from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
        super()._deserialize(stream, kit_, vers)
        self.term = SerializerHelper.deserializeString(stream)
        self.lemma = SerializerHelper.deserializeString(stream)
        self.invariant_prefix_length = SerializerHelper.deserializeShort(stream)
        self.max_length = SerializerHelper.deserializeShort(stream)

    @staticmethod
    def _new538(_arg1: 'MorphToken', _arg2: 'AnalysisKit', _arg3: str) -> 'TextToken':
        # generated factory: TextToken with an explicit original-case term
        res = TextToken(_arg1, _arg2)
        res.term0 = _arg3
        return res

    @staticmethod
    def _new541(_arg1: 'MorphToken', _arg2: 'AnalysisKit', _arg3: 'CharsInfo', _arg4: int, _arg5: int, _arg6: str) -> 'TextToken':
        # generated factory: TextToken with overridden chars, span and term0
        res = TextToken(_arg1, _arg2)
        res.chars = _arg3
        res.begin_char = _arg4
        res.end_char = _arg5
        res.term0 = _arg6
        return res
def tryParse(self, t0: 'Token', pars: 'TerminParseAttr' = TerminParseAttr.NO) -> 'TerminToken':
    """Try to bind this term (Termin) to the token stream starting at `t0`.

    Matching is attempted in order: smart acronym, dotted acronym
    (A.B.C.), plain acronym, the term word sequence (optionally in any
    order), and finally registered abbreviations.

    Args:
        t0(Token): first token to match against.
        pars: TerminParseAttr bit flags (FULLWORDSONLY, IGNOREBRACKETS,
            IGNORESTOPWORDS, CANBEGEOOBJECT, ...).

    Returns:
        TerminToken covering the matched span, or None.
    """
    from pullenti.ner.core.MiscHelper import MiscHelper
    from pullenti.ner.core.BracketHelper import BracketHelper
    if (t0 is None):
        return None
    term = None
    if (isinstance(t0, TextToken)):
        term = (t0).term
    # 1) smart acronym: exact term match, optionally swallowing a trailing dot
    if (self.acronym_smart is not None and (((pars) & (TerminParseAttr.FULLWORDSONLY))) == (TerminParseAttr.NO) and term is not None):
        if (self.acronym_smart == term):
            if (t0.next0_ is not None and t0.next0_.isChar('.') and not t0.is_whitespace_after):
                return TerminToken._new606(t0, t0.next0_, self)
            else:
                return TerminToken._new606(t0, t0, self)
    # 2) dotted acronym: single letters each followed by '.', no whitespace
    t1 = Utils.asObjectOrNull(t0, TextToken)
    tt = Utils.asObjectOrNull(t0, TextToken)
    i = 0
    while i < len(self.acronym):
        if (tt is None):
            break
        term1 = tt.term
        if (len(term1) != 1 or tt.is_whitespace_after):
            break
        if (i > 0 and tt.is_whitespace_before):
            break
        if (term1[0] != self.acronym[i]):
            break
        if (tt.next0_ is None or not tt.next0_.isChar('.')):
            break
        t1 = (Utils.asObjectOrNull(tt.next0_, TextToken))
        tt = (Utils.asObjectOrNull(tt.next0_.next0_, TextToken))
        i += 1
    if (i >= len(self.acronym)):
        return TerminToken._new606(t0, t1, self)
    # 3) plain acronym match (case/length restrictions avoid false hits)
    if (self.acronym is not None and term is not None and self.acronym == term):
        if (t0.chars.is_all_upper or self.acronym_can_be_lower or ((not t0.chars.is_all_lower and len(term) >= 3))):
            return TerminToken._new606(t0, t0, self)
    if (self.acronym is not None and t0.chars.is_last_lower and t0.length_char > 3):
        if (t0.isValue(self.acronym, None)):
            return TerminToken._new606(t0, t0, self)
    # count non-hyphen term parts; single-part terms may skip order handling
    cou = 0
    i = 0
    while i < len(self.terms):
        if (self.terms[i].is_hiphen):
            cou -= 1
        else:
            cou += 1
        i += 1
    # 4) ordered word-sequence match
    if (len(self.terms) > 0 and ((not self.ignore_terms_order or cou == 1))):
        t1 = t0
        tt = t0
        e0_ = None   # end of a ReferentToken we descended into
        eup = None   # the ReferentToken itself, to resurface after its end
        ok = True
        mc = None
        dont_change_mc = False
        i = 0
        # first_pass2812 emulates a C# do/continue loop: i += 1 is skipped
        # only on the first iteration
        first_pass2812 = True
        while True:
            if first_pass2812:
                first_pass2812 = False
            else:
                i += 1
            if (not (i < len(self.terms))):
                break
            if (self.terms[i].is_hiphen):
                continue
            if (tt is not None and tt.is_hiphen and i > 0):
                tt = tt.next0_
            if (i > 0 and tt is not None):
                if ((((pars) & (TerminParseAttr.IGNOREBRACKETS))) != (TerminParseAttr.NO) and not tt.chars.is_letter and BracketHelper.isBracket(tt, False)):
                    tt = tt.next0_
            if (((((pars) & (TerminParseAttr.CANBEGEOOBJECT))) != (TerminParseAttr.NO) and i > 0 and (isinstance(tt, ReferentToken))) and tt.getReferent().type_name == "GEO"):
                tt = tt.next0_
            # descend into a ReferentToken and match against its inner tokens
            if ((isinstance(tt, ReferentToken)) and e0_ is None):
                eup = tt
                e0_ = (tt).end_token
                tt = (tt).begin_token
            if (tt is None):
                ok = False
                break
            if (not self.terms[i].checkByToken(tt)):
                if (tt.next0_ is not None and tt.isChar('.') and self.terms[i].checkByToken(tt.next0_)):
                    tt = tt.next0_
                elif (((i > 0 and tt.next0_ is not None and (isinstance(tt, TextToken))) and ((tt.morph.class0_.is_preposition or MiscHelper.isEngArticle(tt))) and self.terms[i].checkByToken(tt.next0_)) and not self.terms[i - 1].is_pattern_any):
                    # skip an interior preposition/article
                    tt = tt.next0_
                else:
                    ok = False
                    # hyphenated pair matched as one token: X-Y vs terms i, i+2
                    if (((i + 2) < len(self.terms)) and self.terms[i + 1].is_hiphen and self.terms[i + 2].checkByPrefToken(self.terms[i], Utils.asObjectOrNull(tt, TextToken))):
                        i += 2
                        ok = True
                    elif (((not tt.is_whitespace_after and tt.next0_ is not None and (isinstance(tt, TextToken))) and (tt).length_char == 1 and tt.next0_.isCharOf("\"'`’“”")) and not tt.next0_.is_whitespace_after and (isinstance(tt.next0_.next0_, TextToken))):
                        # letter + quote + rest (e.g. D'ARTAGNAN-style splits)
                        if (self.terms[i].checkByStrPrefToken((tt).term, Utils.asObjectOrNull(tt.next0_.next0_, TextToken))):
                            ok = True
                            tt = tt.next0_.next0_
                    if (not ok):
                        # IGNORESTOPWORDS: skip punctuation, conjunctions,
                        # prepositions and numbers, retrying the same term part
                        if (i > 0 and (((pars) & (TerminParseAttr.IGNORESTOPWORDS))) != (TerminParseAttr.NO)):
                            if (isinstance(tt, TextToken)):
                                if (not tt.chars.is_letter):
                                    tt = tt.next0_
                                    i -= 1
                                    continue
                                mc1 = tt.getMorphClassInDictionary()
                                if (mc1.is_conjunction or mc1.is_preposition):
                                    tt = tt.next0_
                                    i -= 1
                                    continue
                            if (isinstance(tt, NumberToken)):
                                tt = tt.next0_
                                i -= 1
                                continue
                        break
            # remember the morphology of the head word; once a noun/verb part
            # is seen, stop updating (dont_change_mc)
            if (tt.morph.items_count > 0 and not dont_change_mc):
                mc = MorphCollection(tt.morph)
                if (((mc.class0_.is_noun or mc.class0_.is_verb)) and not mc.class0_.is_adjective):
                    if (((i + 1) < len(self.terms)) and self.terms[i + 1].is_hiphen):
                        pass
                    else:
                        dont_change_mc = True
            if (tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction):
                dont_change_mc = True
            # resurface from the ReferentToken once its inner end is reached
            if (tt == e0_):
                tt = eup
                eup = (None)
                e0_ = (None)
            if (e0_ is None):
                t1 = tt
            tt = tt.next0_
        if (ok and i >= len(self.terms)):
            # optional trailing dot via a registered abbreviation
            if (t1.next0_ is not None and t1.next0_.isChar('.') and self.abridges is not None):
                for a in self.abridges:
                    if (a.tryAttach(t0) is not None):
                        t1 = t1.next0_
                        break
            # prefer the noun-phrase morphology when the match starts with
            # an adjective
            if (t0 != t1 and t0.morph.class0_.is_adjective):
                npt = NounPhraseHelper.tryParse(t0, NounPhraseParseAttr.NO, 0)
                if (npt is not None and npt.end_char <= t1.end_char):
                    mc = npt.morph
            return TerminToken._new611(t0, t1, mc)
    # 5) unordered word-sequence match: cross off term parts as found
    if (len(self.terms) > 1 and self.ignore_terms_order):
        terms_ = list(self.terms)
        t1 = t0
        tt = t0
        while len(terms_) > 0:
            if (tt != t0 and tt is not None and tt.is_hiphen):
                tt = tt.next0_
            if (tt is None):
                break
            j = 0
            while j < len(terms_):
                if (terms_[j].checkByToken(tt)):
                    break
                j += 1
            if (j >= len(terms_)):
                # no part matches this token; optionally skip stop words
                if (tt != t0 and (((pars) & (TerminParseAttr.IGNORESTOPWORDS))) != (TerminParseAttr.NO)):
                    if (isinstance(tt, TextToken)):
                        if (not tt.chars.is_letter):
                            tt = tt.next0_
                            continue
                        mc1 = tt.getMorphClassInDictionary()
                        if (mc1.is_conjunction or mc1.is_preposition):
                            tt = tt.next0_
                            continue
                    if (isinstance(tt, NumberToken)):
                        tt = tt.next0_
                        continue
                break
            del terms_[j]
            t1 = tt
            tt = tt.next0_
        # leftover hyphen parts do not block the match
        for i in range(len(terms_) - 1, -1, -1):
            if (terms_[i].is_hiphen):
                del terms_[i]
        if (len(terms_) == 0):
            return TerminToken(t0, t1)
    # 6) abbreviations: pick the longest attachable one
    if (self.abridges is not None and (((pars) & (TerminParseAttr.FULLWORDSONLY))) == (TerminParseAttr.NO)):
        res = None
        for a in self.abridges:
            r = a.tryAttach(t0)
            if (r is None):
                continue
            if (r.abridge_without_point and len(self.terms) > 0):
                if (not ((isinstance(t0, TextToken)))):
                    continue
                if (a.parts[0].value != (t0).term):
                    continue
            if (res is None or (res.length_char < r.length_char)):
                res = r
        if (res is not None):
            return res
    return None
class Token:
    """Base class for all tokens.

    Subclasses: TextToken (a terminal word form) and MetaToken (a coherent
    span of other tokens).
    """

    def __init__(self, kit_: 'AnalysisKit', begin: int, end: int) -> None:
        self.kit = None                 # owning AnalysisKit
        self.__m_begin_char = 0
        self.__m_end_char = 0
        self.tag = None                 # free-form user data
        self._m_previous = None
        self._m_next = None
        self.__m_morph = None           # lazily created MorphCollection
        self.chars = None               # CharsInfo of the covered text
        self.__m_attrs = 0              # bit cache for whitespace/newline flags
        self.kit = kit_
        self.__m_begin_char = begin
        self.__m_end_char = end

    @property
    def begin_char(self) -> int:
        """Text position of the first character."""
        return self.__m_begin_char

    @property
    def end_char(self) -> int:
        """Text position of the last character (inclusive)."""
        return self.__m_end_char

    @property
    def length_char(self) -> int:
        """Length in characters."""
        return (self.end_char - self.begin_char) + 1

    @property
    def previous(self) -> 'Token':
        """Previous token in the chain."""
        return self._m_previous

    @previous.setter
    def previous(self, value) -> 'Token':
        self._m_previous = value
        if (value is not None):
            value._m_next = self
        # relinking invalidates the cached whitespace flags
        self.__m_attrs = (0)
        return value

    @property
    def next0_(self) -> 'Token':
        """Next token in the chain."""
        return self._m_next

    @next0_.setter
    def next0_(self, value) -> 'Token':
        self._m_next = value
        if (value is not None):
            value._m_previous = self
        # relinking invalidates the cached whitespace flags
        self.__m_attrs = (0)
        return value

    @property
    def morph(self) -> 'MorphCollection':
        """Morphological information (created on first access)."""
        if (self.__m_morph is None):
            self.__m_morph = MorphCollection()
        return self.__m_morph

    @morph.setter
    def morph(self, value) -> 'MorphCollection':
        self.__m_morph = value
        return value

    def __str__(self) -> str:
        # the raw source text covered by this token
        return self.kit.sofa.text[self.begin_char:self.begin_char + (self.end_char + 1) - self.begin_char]

    def __get_attr(self, i: int) -> bool:
        # Lazily compute and cache the whitespace/newline flags (bits 1-4) by
        # scanning the gaps to the previous and next tokens; bit 0 marks the
        # cache as filled.
        ch = '\x00'
        if ((((self.__m_attrs) & 1)) == 0):
            self.__m_attrs = (1)
            if (self._m_previous is None):
                self._set_attr(1, True)
                self._set_attr(3, True)
            else:
                j = self._m_previous.end_char + 1
                while j < self.begin_char:
                    ch = self.kit.sofa.text[j]
                    if (Utils.isWhitespace((ch))):
                        self._set_attr(1, True)
                        if ((ord(ch)) == 0xD or (ord(ch)) == 0xA or ch == '\f'):
                            self._set_attr(3, True)
                    j += 1
            if (self._m_next is None):
                self._set_attr(2, True)
                self._set_attr(4, True)
            else:
                j = self.end_char + 1
                while j < self._m_next.begin_char:
                    ch = self.kit.sofa.text[j]
                    if (Utils.isWhitespace(ch)):
                        self._set_attr(2, True)
                        if ((ord(ch)) == 0xD or (ord(ch)) == 0xA or ch == '\f'):
                            self._set_attr(4, True)
                    j += 1
        return (((((self.__m_attrs) >> i)) & 1)) != 0

    def _set_attr(self, i: int, val: bool) -> None:
        # set or clear bit i of the attribute cache
        if (val):
            self.__m_attrs |= ((1 << i))
        else:
            self.__m_attrs &= (~((1 << i)))

    @property
    def is_whitespace_before(self) -> bool:
        """Whitespace characters immediately before this token."""
        return self.__get_attr(1)

    @is_whitespace_before.setter
    def is_whitespace_before(self, value) -> bool:
        self._set_attr(1, value)
        return value

    @property
    def is_whitespace_after(self) -> bool:
        """Whitespace characters immediately after this token."""
        return self.__get_attr(2)

    @is_whitespace_after.setter
    def is_whitespace_after(self, value) -> bool:
        self._set_attr(2, value)
        return value

    @property
    def is_newline_before(self) -> bool:
        """Token starts on a new line. Always true for the first token."""
        return self.__get_attr(3)

    @is_newline_before.setter
    def is_newline_before(self, value) -> bool:
        self._set_attr(3, value)
        return value

    @property
    def is_newline_after(self) -> bool:
        """Token ends its line. Always true for the last token."""
        return self.__get_attr(4)

    @is_newline_after.setter
    def is_newline_after(self, value) -> bool:
        self._set_attr(4, value)
        return value

    @property
    def inner_bool(self) -> bool:
        # internal use only
        return self.__get_attr(5)

    @inner_bool.setter
    def inner_bool(self, value) -> bool:
        self._set_attr(5, value)
        return value

    @property
    def not_noun_phrase(self) -> bool:
        # internal use only: marks that no noun phrase starts here, so the
        # noun-phrase parser does not retry this position
        return self.__get_attr(6)

    @not_noun_phrase.setter
    def not_noun_phrase(self, value) -> bool:
        self._set_attr(6, value)
        return value

    @property
    def whitespaces_before_count(self) -> int:
        """Weighted count of whitespace before: newline = 10, tab = 5."""
        if (self.previous is None):
            return 100
        if ((self.previous.end_char + 1) == self.begin_char):
            return 0
        return self.__calc_whitespaces(self.previous.end_char + 1, self.begin_char - 1)

    @property
    def newlines_before_count(self) -> int:
        """Number of line breaks before this token."""
        ch0 = chr(0)
        res = 0
        txt = self.kit.sofa.text
        for p in range(self.begin_char - 1, -1, -1):
            ch = txt[p]
            if ((ord(ch)) == 0xA):
                res += 1
            elif ((ord(ch)) == 0xD and (ord(ch0)) != 0xA):
                # lone CR (not part of CRLF) counts as one break
                res += 1
            elif (ch == '\f'):
                res += 10
            elif (not Utils.isWhitespace(ch)):
                break
            ch0 = ch
        return res

    @property
    def newlines_after_count(self) -> int:
        """Number of line breaks after this token."""
        ch0 = chr(0)
        res = 0
        txt = self.kit.sofa.text
        p = self.end_char + 1
        while p < len(txt):
            ch = txt[p]
            if ((ord(ch)) == 0xD):
                res += 1
            elif ((ord(ch)) == 0xA and (ord(ch0)) != 0xD):
                # lone LF (not part of CRLF) counts as one break
                res += 1
            elif (ch == '\f'):
                res += 10
            elif (not Utils.isWhitespace(ch)):
                break
            ch0 = ch
            p += 1
        return res

    @property
    def whitespaces_after_count(self) -> int:
        """Weighted count of whitespace after: newline = 10, tab = 5."""
        if (self.next0_ is None):
            return 100
        if ((self.end_char + 1) == self.next0_.begin_char):
            return 0
        return self.__calc_whitespaces(self.end_char + 1, self.next0_.begin_char - 1)

    def __calc_whitespaces(self, p0: int, p1: int) -> int:
        # weighted whitespace count in [p0, p1]: CRLF pair = 10, tab = 5,
        # BEL/FF = 100, anything else = 1; -1 on an invalid range
        if ((p0 < 0) or p0 > p1 or p1 >= len(self.kit.sofa.text)):
            return -1
        res = 0
        i = p0
        while i <= p1:
            ch = self.kit.get_text_character(i)
            if (ch == '\r' or ch == '\n'):
                res += 10
                ch1 = self.kit.get_text_character(i + 1)
                if (ch != ch1 and ((ch1 == '\r' or ch1 == '\n'))):
                    i += 1
            elif (ch == '\t'):
                res += 5
            elif (ch == '\u0007'):
                res += 100
            elif (ch == '\f'):
                res += 100
            else:
                res += 1
            i += 1
        return res

    @property
    def is_hiphen(self) -> bool:
        """True if this token is a hyphen character."""
        ch = self.kit.sofa.text[self.begin_char]
        return LanguageHelper.is_hiphen(ch)

    @property
    def is_table_control_char(self) -> bool:
        """True for table control characters (07h, 1Eh, 1Fh)."""
        ch = self.kit.sofa.text[self.begin_char]
        return (ord(ch)) == 7 or (ord(ch)) == 0x1F or (ord(ch)) == 0x1E

    @property
    def is_and(self) -> bool:
        """Is this the conjunction AND (any language)? Overridden in TextToken."""
        return False

    @property
    def is_or(self) -> bool:
        """Is this the conjunction OR (any language)? Overridden in TextToken."""
        return False

    @property
    def is_comma(self) -> bool:
        """True if this token is a comma."""
        return self.is_char(',')

    @property
    def is_comma_and(self) -> bool:
        """True if this token is a comma or the conjunction AND."""
        return self.is_comma or self.is_and

    def is_char(self, ch: 'char') -> bool:
        """True if the token consists of exactly the given character.

        Args:
            ch('char'): character to test.
        """
        if (self.begin_char != self.end_char):
            return False
        return self.kit.sofa.text[self.begin_char] == ch

    def is_char_of(self, chars_: str) -> bool:
        """True if the token is a single character contained in `chars_`.

        Args:
            chars_(str): string of acceptable characters.
        """
        if (self.begin_char != self.end_char):
            return False
        return chars_.find(self.kit.sofa.text[self.begin_char]) >= 0

    def is_value(self, term: str, termua: str = None) -> bool:
        """Check the token against a specific word value.

        Args:
            term(str): word to test (compared with TextToken.term).
            termua(str): Ukrainian variant to test.

        Returns:
            bool: match or not (always False here; overridden in TextToken).
        """
        return False

    @property
    def is_letters(self) -> bool:
        """True if this is an alphabetic text token (TextToken)."""
        return False

    def get_referent(self) -> 'Referent':
        """Get the linked entity (non-None only for ReferentToken)."""
        return None

    def get_referents(self) -> typing.List['Referent']:
        """Get all entities hidden under this token; entities may nest
        (e.g. an address covers a city)."""
        return None

    def get_normal_case_text(self, mc: 'MorphClass' = None, num: 'MorphNumber' = MorphNumber.UNDEFINED, gender: 'MorphGender' = MorphGender.UNDEFINED, keep_chars: bool = False) -> str:
        """Get the token text in nominative case.

        Args:
            mc(MorphClass): desired part of speech.
            num(MorphNumber): desired number.
            gender(MorphGender): desired gender.
            keep_chars(bool): preserve character case (default: all upper).

        Returns:
            str: the text.
        """
        return str(self)

    def get_source_text(self) -> str:
        """Get the source text fragment covered by this token.

        Returns:
            str: fragment of the source text, or None for an invalid span.
        """
        len0_ = (self.end_char + 1) - self.begin_char
        if ((len0_ < 1) or (self.begin_char < 0)):
            return None
        if ((self.begin_char + len0_) > len(self.kit.sofa.text)):
            return None
        return self.kit.sofa.text[self.begin_char:self.begin_char + len0_]

    def get_morph_class_in_dictionary(self) -> 'MorphClass':
        """Check that the word exists in the language dictionary.

        Returns:
            MorphClass: parts of speech; IsUndefined when out of dictionary.
        """
        return self.morph.class0_

    def _serialize(self, stream: Stream) -> None:
        # binary layout: begin, end, attrs, chars, morph
        from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
        SerializerHelper.serialize_int(stream, self.begin_char)
        SerializerHelper.serialize_int(stream, self.end_char)
        SerializerHelper.serialize_int(stream, self.__m_attrs)
        SerializerHelper.serialize_int(stream, self.chars.value)
        if (self.__m_morph is None):
            self.__m_morph = MorphCollection()
        self.__m_morph._serialize(stream)

    def _deserialize(self, stream: Stream, kit_: 'AnalysisKit', vers: int) -> None:
        # mirror of _serialize
        from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
        self.kit = kit_
        self.__m_begin_char = SerializerHelper.deserialize_int(stream)
        self.__m_end_char = SerializerHelper.deserialize_int(stream)
        self.__m_attrs = (SerializerHelper.deserialize_int(stream))
        self.chars = CharsInfo._new2561(SerializerHelper.deserialize_int(stream))
        self.__m_morph = MorphCollection()
        self.__m_morph._deserialize(stream)
# TextToken: "input token after morphological analysis — a textual token".
# __init__ copies term/lemma/char-info from the source MorphToken, collects
# its word forms into self.morph, tracks the longest normal form
# (max_length_of_morph_vars) and computes invariant_prefix_length_of_morph_vars
# — the length of the common prefix shared by term and every normal_case /
# normal_full variant (used by is_value as a cheap rejection filter).
# NOTE(review): inside get_normal_case_text the expression
# "(wf.normal_full if normal_full else wf.normal_full)" picks the same value
# in both branches — this matches the upstream generated source, so it is
# reproduced untouched; presumably the else-branch was meant to be
# wf.normal_case — confirm against the C# original before changing.
# check_value: look up term, then each form's normal_case / normal_full, in
# the given dict; is_value: match against term or any normal form (with an
# optional Ukrainian-language variant tried first).
class TextToken(Token): """ Входной токен (после морфанализа) Текстовой токен """ def __init__(self, source: 'MorphToken', kit_: 'AnalysisKit', bchar: int = -1, echar: int = -1) -> None: super().__init__(kit_, (bchar if bchar >= 0 else (0 if source is None else source.begin_char)), (echar if echar >= 0 else (0 if source is None else source.end_char))) self.term = None self.lemma = None self.term0 = None self.invariant_prefix_length_of_morph_vars = 0 self.max_length_of_morph_vars = 0 if (source is None): return self.chars = source.char_info self.term = source.term self.lemma = (Utils.ifNotNull(source.get_lemma(), self.term)) self.max_length_of_morph_vars = (len(self.term)) self.morph = MorphCollection() if (source.word_forms is not None): for wf in source.word_forms: self.morph.add_item(wf) if (wf.normal_case is not None and (self.max_length_of_morph_vars < len(wf.normal_case))): self.max_length_of_morph_vars = (len(wf.normal_case)) if (wf.normal_full is not None and (self.max_length_of_morph_vars < len(wf.normal_full))): self.max_length_of_morph_vars = (len(wf.normal_full)) i = 0 while i < len(self.term): ch = self.term[i] j = 0 j = 0 while j < self.morph.items_count: wf = Utils.asObjectOrNull(self.morph.get_indexer_item(j), MorphWordForm) if (wf.normal_case is not None): if (i >= len(wf.normal_case)): break if (wf.normal_case[i] != ch): break if (wf.normal_full is not None): if (i >= len(wf.normal_full)): break if (wf.normal_full[i] != ch): break j += 1 if (j < self.morph.items_count): break self.invariant_prefix_length_of_morph_vars = ((i + 1)) i += 1 if (self.morph.language.is_undefined and not source.language.is_undefined): self.morph.language = source.language def __str__(self) -> str: res = Utils.newStringIO(self.term) for l_ in self.morph.items: print(", {0}".format(str(l_)), end="", file=res, flush=True) return Utils.toStringStringIO(res) def check_value(self, dict0_: typing.List[tuple]) -> object: """ Попробовать привязать словарь Args: 
dict0_(typing.List[tuple]): """ if (dict0_ is None): return None res = None wrapres2868 = RefOutArgWrapper(None) inoutres2869 = Utils.tryGetValue(dict0_, self.term, wrapres2868) res = wrapres2868.value if (inoutres2869): return res if (self.morph is not None): for it in self.morph.items: mf = Utils.asObjectOrNull(it, MorphWordForm) if (mf is not None): if (mf.normal_case is not None): wrapres2864 = RefOutArgWrapper(None) inoutres2865 = Utils.tryGetValue( dict0_, mf.normal_case, wrapres2864) res = wrapres2864.value if (inoutres2865): return res if (mf.normal_full is not None and mf.normal_case != mf.normal_full): wrapres2866 = RefOutArgWrapper(None) inoutres2867 = Utils.tryGetValue( dict0_, mf.normal_full, wrapres2866) res = wrapres2866.value if (inoutres2867): return res return None def get_source_text(self) -> str: return super().get_source_text() def is_value(self, term_: str, termua: str = None) -> bool: if (termua is not None and self.morph.language.is_ua): if (self.is_value(termua, None)): return True if (term_ is None): return False if (self.invariant_prefix_length_of_morph_vars > len(term_)): return False if (self.max_length_of_morph_vars >= len(self.term) and (self.max_length_of_morph_vars < len(term_))): return False if (term_ == self.term): return True for wf in self.morph.items: if ((isinstance(wf, MorphWordForm)) and ((wf.normal_case == term_ or wf.normal_full == term_))): return True return False @property def is_and(self) -> bool: """ Это соединительный союз И (на всех языках) """ if (not self.morph.class0_.is_conjunction): if (self.length_char == 1 and self.is_char('&')): return True return False val = self.term if (val == "И" or val == "AND" or val == "UND"): return True if (self.morph.language.is_ua): if (val == "І" or val == "ТА"): return True return False @property def is_or(self) -> bool: """ Это соединительный союз ИЛИ (на всех языках) """ if (not self.morph.class0_.is_conjunction): return False val = self.term if (val == "ИЛИ" or val == "ЛИБО" 
# (condition continues: is_or also accepts "OR" and Ukrainian "АБО".)
# is_letters: true when the first char of term is alphabetic.
# get_morph_class_in_dictionary: union of classes of in-dictionary forms.
# get_normal_case_text: pick the normal form best matching the requested
# part of speech / number / gender; falls back to MorphologyService wordform
# generation, optionally preserving the original capitalization (keep_chars).
or val == "OR"): return True if (self.morph.language.is_ua): if (val == "АБО"): return True return False @property def is_letters(self) -> bool: return str.isalpha(self.term[0]) def get_morph_class_in_dictionary(self) -> 'MorphClass': res = MorphClass() for wf in self.morph.items: if ((isinstance(wf, MorphWordForm)) and wf.is_in_dictionary): res |= wf.class0_ return res def get_normal_case_text(self, mc: 'MorphClass' = None, num: 'MorphNumber' = MorphNumber.UNDEFINED, gender: 'MorphGender' = MorphGender.UNDEFINED, keep_chars: bool = False) -> str: from pullenti.ner.core.MiscHelper import MiscHelper empty = True if (mc is not None and mc.is_preposition): return LanguageHelper.normalize_preposition(self.term) for it in self.morph.items: if (mc is not None and not mc.is_undefined): cc = (it.class0_) & mc if (cc.is_undefined): continue if (cc.is_misc and not cc.is_proper and mc != it.class0_): continue wf = Utils.asObjectOrNull(it, MorphWordForm) normal_full = False if (gender != MorphGender.UNDEFINED): if (((it.gender) & (gender)) == (MorphGender.UNDEFINED)): if ((gender == MorphGender.MASCULINE and ((it.gender != MorphGender.UNDEFINED or it.number == MorphNumber.PLURAL)) and wf is not None) and wf.normal_full is not None): normal_full = True elif (gender == MorphGender.MASCULINE and it.class0_.is_personal_pronoun): pass else: continue if (not it.case_.is_undefined): empty = False if (wf is not None): res = None if (num == MorphNumber.SINGULAR and it.number == MorphNumber.PLURAL and wf.normal_full is not None): le = len(wf.normal_case) if ((le == (len(wf.normal_full) + 2) and le > 4 and wf.normal_case[le - 2] == 'С') and wf.normal_case[le - 1] == 'Я'): res = wf.normal_case else: res = (wf.normal_full if normal_full else wf.normal_full) else: res = (wf.normal_full if normal_full else (Utils.ifNotNull(wf.normal_case, self.term))) if (num == MorphNumber.SINGULAR and mc is not None and mc == MorphClass.NOUN): if (res == "ДЕТИ"): res = "РЕБЕНОК" if (keep_chars): if 
(self.chars.is_all_lower): res = res.lower() elif (self.chars.is_capital_upper): res = MiscHelper.convert_first_char_upper_and_other_lower( res) return res if (not empty): return None te = None if (num == MorphNumber.SINGULAR and mc is not None): bi = MorphBaseInfo._new492(MorphClass._new53(mc.value), gender, MorphNumber.SINGULAR, self.morph.language) vars0_ = MorphologyService.get_wordform(self.term, bi) if (vars0_ is not None): te = vars0_ if (te is None): te = self.term if (keep_chars): if (self.chars.is_all_lower): return te.lower() elif (self.chars.is_capital_upper): return MiscHelper.convert_first_char_upper_and_other_lower(te) return te @staticmethod def get_source_text_tokens(begin: 'Token', end: 'Token') -> typing.List['TextToken']: from pullenti.ner.MetaToken import MetaToken res = list() t = begin while t is not None and t != end.next0_ and t.end_char <= end.end_char: if (isinstance(t, TextToken)): res.append(Utils.asObjectOrNull(t, TextToken)) elif (isinstance(t, MetaToken)): res.extend( TextToken.get_source_text_tokens(t.begin_token, t.end_token)) t = t.next0_ return res @property def is_pure_verb(self) -> bool: """ Признак того, что это чистый глагол """ ret = False if ((self.is_value("МОЖНО", None) or self.is_value("МОЖЕТ", None) or self.is_value("ДОЛЖНЫЙ", None)) or self.is_value("НУЖНО", None)): return True for it in self.morph.items: if ((isinstance(it, MorphWordForm)) and it.is_in_dictionary): if (it.class0_.is_verb and it.case_.is_undefined): ret = True elif (not it.class0_.is_verb): if (it.class0_.is_adjective and it.contains_attr("к.ф.", None)): pass else: return False return ret @property def is_verb_be(self) -> bool: """ Проверка, что это глагол типа БЫТЬ, ЯВЛЯТЬСЯ и т.п. 
""" if ((self.is_value("БЫТЬ", None) or self.is_value("ЕСТЬ", None) or self.is_value("ЯВЛЯТЬ", None)) or self.is_value("BE", None)): return True if (self.term == "IS" or self.term == "WAS" or self.term == "BECAME"): return True if (self.term == "Є"): return True return False def _serialize(self, stream: Stream) -> None: from pullenti.ner.core.internal.SerializerHelper import SerializerHelper super()._serialize(stream) SerializerHelper.serialize_string(stream, self.term) SerializerHelper.serialize_string(stream, self.lemma) SerializerHelper.serialize_short( stream, self.invariant_prefix_length_of_morph_vars) SerializerHelper.serialize_short(stream, self.max_length_of_morph_vars) def _deserialize(self, stream: Stream, kit_: 'AnalysisKit', vers: int) -> None: from pullenti.ner.core.internal.SerializerHelper import SerializerHelper super()._deserialize(stream, kit_, vers) self.term = SerializerHelper.deserialize_string(stream) self.lemma = SerializerHelper.deserialize_string(stream) self.invariant_prefix_length_of_morph_vars = SerializerHelper.deserialize_short( stream) self.max_length_of_morph_vars = SerializerHelper.deserialize_short( stream) @staticmethod def _new470(_arg1: 'MorphToken', _arg2: 'AnalysisKit', _arg3: int, _arg4: int, _arg5: str) -> 'TextToken': res = TextToken(_arg1, _arg2, _arg3, _arg4) res.term0 = _arg5 return res @staticmethod def _new473(_arg1: 'MorphToken', _arg2: 'AnalysisKit', _arg3: int, _arg4: int, _arg5: 'CharsInfo', _arg6: str) -> 'TextToken': res = TextToken(_arg1, _arg2, _arg3, _arg4) res.chars = _arg5 res.term0 = _arg6 return res
# __tryParseRu: core Russian noun-phrase parser (method of _NounPraseHelperInt;
# note the older camelCase API — isCharOf/tryParse/getMorphClassInDictionary —
# this chunk comes from a different generation of the library than the tokens
# above).  Walks tokens from `first`, accumulating NounPhraseItem candidates
# (adjectives and noun candidates), optional adverbs, commas/conjunctions and
# bracketed asides, honoring the NounPhraseParseAttr flags (CANNOTHASCOMMAAND,
# IGNOREBRACKETS, PARSEADVERBS, MULTILINES, REFERENTCANBENOUN,
# ADJECTIVECANBELAST, PARSEPRONOUNS, IGNOREADJBEST, PARSEPREPOSITION) and the
# max_char_pos bound.  It then selects the noun head (scanning items right to
# left), builds a NounPhraseToken, reconciles the morphology of adjectives with
# the noun's variants (checkAccord, case intersection), handles internal
# prepositional sub-phrases, "and"-coordinated adjectives, newline/character
# -case sanity checks between adjectives, a letter-frequency `stat` heuristic
# for ordering ambiguous adjective forms, and a final verb/transitivity filter
# via Explanatory.findDerivates.  Returns the parsed NounPhraseToken or None.
# NOTE(review): logic kept byte-identical — behavior is order-sensitive
# machine-generated code; only comments were added, at syntactically safe
# points.
def __tryParseRu(first: 'Token', typ: 'NounPhraseParseAttr', max_char_pos: int) -> 'NounPhraseToken': if (first is None): return None items = None adverbs = None internal_noun_prase = None conj_before = False t = first first_pass2788 = True while True: if first_pass2788: first_pass2788 = False else: t = t.next0_ if (not (t is not None)): break if (max_char_pos > 0 and t.begin_char > max_char_pos): break if ((t.morph.class0_.is_conjunction and not t.morph.class0_.is_adjective and not t.morph.class0_.is_pronoun) and not t.morph.class0_.is_noun): if (conj_before): break if ((((typ) & (NounPhraseParseAttr.CANNOTHASCOMMAAND))) != (NounPhraseParseAttr.NO)): break if (items is not None and t.is_and): conj_before = True if ((t.next0_ is not None and t.next0_.isCharOf("\\/") and t.next0_.next0_ is not None) and t.next0_.next0_.is_or): t = t.next0_.next0_ continue break elif (t.is_comma): if (conj_before or items is None): break if ((((typ) & (NounPhraseParseAttr.CANNOTHASCOMMAAND))) != (NounPhraseParseAttr.NO)): break mc = t.previous.getMorphClassInDictionary() if (mc.is_proper_surname or mc.is_proper_secname): break conj_before = True continue elif (t.isChar('(')): if (items is None): return None if ((((typ) & (NounPhraseParseAttr.IGNOREBRACKETS))) != (NounPhraseParseAttr.IGNOREBRACKETS)): break brr = BracketHelper.tryParse(t, BracketParseAttr.NO, 100) if (brr is None): break if (brr.length_char > 100): break t = brr.end_token continue if (isinstance(t, ReferentToken)): if ((((typ) & (NounPhraseParseAttr.REFERENTCANBENOUN))) == ( NounPhraseParseAttr.NO)): break elif (t.chars.is_latin_letter): break it = NounPhraseItem.tryParse(t, items, typ) if (it is None or ((not it.can_be_adj and not it.can_be_noun))): if ((((typ) & (NounPhraseParseAttr.PARSEADVERBS))) != (NounPhraseParseAttr.NO) and (isinstance(t, TextToken)) and t.morph.class0_.is_adverb): if (items is None): if (t.previous is not None and t.previous.morph.class0_.is_preposition): pass else: return None if (adverbs is 
# (adverb collection continues; then per-item accumulation with newline /
# pronoun / verb sanity breaks, and the single-adjective "X and Y <noun>"
# coordination rescue that re-parses after the conjunction.)
None): adverbs = list() adverbs.append(Utils.asObjectOrNull(t, TextToken)) continue break it.conj_before = conj_before conj_before = False if (not it.can_be_adj and not it.can_be_noun): break if (t.is_newline_before and t != first): if ((((typ) & (NounPhraseParseAttr.MULTILINES))) != (NounPhraseParseAttr.NO)): pass elif (items is not None and t.chars != items[len(items) - 1].chars): if (t.chars.is_all_lower and items[len(items) - 1].chars.is_capital_upper): pass else: break if (items is None): items = list() else: it0 = items[len(items) - 1] if (it0.can_be_noun and it0.is_personal_pronoun): if (it.is_pronoun): break if ((it0.begin_token.previous is not None and it0.begin_token.previous.getMorphClassInDictionary( ).is_verb and not it0.begin_token.previous. getMorphClassInDictionary().is_adjective) and not it0.begin_token.previous. getMorphClassInDictionary().is_preposition): if (t.morph.case_.is_nominative or t.morph.case_.is_accusative): pass else: break if (it.can_be_noun and it.is_verb): break items.append(it) t = it.end_token if (t.is_newline_after and not t.chars.is_all_lower): mc = t.getMorphClassInDictionary() if (mc.is_proper_surname): break if (t.morph.class0_.is_proper_surname and mc.is_undefined): break if (items is None): return None if (len(items) == 1 and items[0].can_be_adj): and0_ = False tt1 = items[0].end_token.next0_ first_pass2789 = True while True: if first_pass2789: first_pass2789 = False else: tt1 = tt1.next0_ if (not (tt1 is not None)): break if (tt1.is_and or tt1.is_or): and0_ = True break if (tt1.is_comma or tt1.isValue("НО", None) or tt1.isValue("ТАК", None)): continue break if (and0_): if (items[0].can_be_noun and items[0].is_personal_pronoun): and0_ = False if (and0_): tt2 = tt1.next0_ if (tt2 is not None and tt2.morph.class0_.is_preposition): tt2 = tt2.next0_ npt1 = _NounPraseHelperInt.__tryParseRu(tt2, typ, max_char_pos) if (npt1 is not None and len(npt1.adjectives) > 0): ok1 = False for av in items[0].adj_morph: for v in 
(npt1.noun).noun_morph: if (v.checkAccord(av, False)): items[0].morph.addItem(av) ok1 = True if (ok1): npt1.begin_token = items[0].begin_token npt1.end_token = tt1.previous npt1.adjectives.clear() npt1.adjectives.append(items[0]) return npt1 last1 = items[len(items) - 1] check = True for it in items: if (not it.can_be_adj): check = False break elif (it.can_be_noun and it.is_personal_pronoun): check = False break tt1 = last1.end_token.next0_ if ((tt1 is not None and check and ((tt1.morph.class0_.is_preposition or tt1.morph.case_.is_instrumental))) and (tt1.whitespaces_before_count < 2)): inp = NounPhraseHelper.tryParse( tt1, Utils.valToEnum((typ) | (NounPhraseParseAttr.PARSEPREPOSITION), NounPhraseParseAttr), max_char_pos) if (inp is not None): tt1 = inp.end_token.next0_ npt1 = _NounPraseHelperInt.__tryParseRu(tt1, typ, max_char_pos) if (npt1 is not None): ok = True for it in items: if (not NounPhraseItem.tryAccordAdjAndNoun( it, Utils.asObjectOrNull(npt1.noun, NounPhraseItem))): ok = False break if (ok): i = 0 while i < len(items): npt1.adjectives.insert(i, items[i]) i += 1 npt1.internal_noun = inp mmm = MorphCollection(npt1.morph) for it in items: mmm.removeItems(it.adj_morph[0], False) if (mmm.gender != MorphGender.UNDEFINED or mmm.number != MorphNumber.UNDEFINED or not mmm.case_.is_undefined): npt1.morph = mmm if (adverbs is not None): if (npt1.adverbs is None): npt1.adverbs = adverbs else: npt1.adverbs[0:0] = adverbs return npt1 if (tt1 is not None and tt1.morph.class0_.is_noun): it = NounPhraseItem.tryParse(tt1, items, typ) if (it is not None and it.can_be_noun): internal_noun_prase = inp inp.begin_token = items[0].end_token.next0_ items.append(it) ok2 = False if ((len(items) == 1 and (((typ) & (NounPhraseParseAttr.ADJECTIVECANBELAST))) != (NounPhraseParseAttr.NO) and (items[0].whitespaces_after_count < 3)) and not items[0].is_adverb): if (not items[0].can_be_adj): ok2 = True elif (items[0].is_personal_pronoun and items[0].can_be_noun): ok2 = True if (ok2): it 
= NounPhraseItem.tryParse(items[0].end_token.next0_, None, typ) if (it is not None and it.can_be_adj and it.begin_token.chars.is_all_lower): ok2 = True if (it.is_adverb or it.is_verb): ok2 = False if (it.is_pronoun and items[0].is_pronoun): ok2 = False if (it.can_be_adj_for_personal_pronoun and items[0].is_personal_pronoun): ok2 = True if (ok2 and NounPhraseItem.tryAccordAdjAndNoun(it, items[0])): npt1 = _NounPraseHelperInt.__tryParseRu( it.begin_token, typ, max_char_pos) if (npt1 is not None and ((npt1.end_char > it.end_char or len(npt1.adjectives) > 0))): pass else: items.insert(0, it) noun = None adj_after = None for i in range(len(items) - 1, -1, -1): if (items[i].can_be_noun): if (items[i].conj_before): continue if (i > 0 and not items[i - 1].can_be_adj): continue if (i > 0 and items[i - 1].can_be_noun): if (items[i - 1].is_doubt_adjective): continue if (items[i - 1].is_pronoun and items[i].is_pronoun): if (items[i].is_pronoun and items[i - 1].can_be_adj_for_personal_pronoun): pass else: continue noun = items[i] del items[i:i + len(items) - i] if (adj_after is not None): items.append(adj_after) break if (noun is None): return None res = NounPhraseToken(first, noun.end_token) if (adverbs is not None): for a in adverbs: if (a.begin_char < noun.begin_char): if (res.adverbs is None): res.adverbs = list() res.adverbs.append(a) res.noun = (noun) res.internal_noun = internal_noun_prase for v in noun.noun_morph: noun.morph.addItem(v) res.morph = noun.morph if (res.morph.case_.is_nominative and first.previous is not None and first.previous.morph.class0_.is_preposition): res.morph.case_ = (res.morph.case_) ^ MorphCase.NOMINATIVE if ((((typ) & (NounPhraseParseAttr.PARSEPRONOUNS))) == (NounPhraseParseAttr.NO) and ((res.morph.class0_.is_pronoun or res.morph.class0_.is_personal_pronoun))): return None stat = None if (len(items) > 1): stat = dict() need_update_morph = False if (len(items) > 0): ok_list = list() is_num_not = False for vv in noun.noun_morph: v = vv i = 0 while 
i < len(items): ok = False for av in items[i].adj_morph: if (v.checkAccord(av, False)): ok = True if (not ((av.case_) & v.case_).is_undefined and av.case_ != v.case_): v.case_ = av.case_ = (av.case_) & v.case_ break if (not ok): if (items[i].can_be_numeric_adj and items[i].tryAccordVar(v)): ok = True v = (Utils.asObjectOrNull(v.clone(), NounPhraseItemTextVar)) v.number = MorphNumber.PLURAL is_num_not = True v.case_ = MorphCase() for a in items[i].adj_morph: v.case_ = (v.case_) | a.case_ else: break i += 1 if (i >= len(items)): ok_list.append(v) if (len(ok_list) > 0 and (((len(ok_list) < res.morph.items_count) or is_num_not))): res.morph = MorphCollection() for v in ok_list: res.morph.addItem(v) if (not is_num_not): noun.morph = res.morph i = 0 first_pass2790 = True while True: if first_pass2790: first_pass2790 = False else: i += 1 if (not (i < len(items))): break for av in items[i].adj_morph: for v in noun.noun_morph: if (v.checkAccord(av, False)): if (not ((av.case_) & v.case_).is_undefined and av.case_ != v.case_): v.case_ = av.case_ = (av.case_) & v.case_ need_update_morph = True items[i].morph.addItem(av) if (stat is not None and len(av.normal_value) > 1): last = av.normal_value[len(av.normal_value) - 1] if (not last in stat): stat[last] = 1 else: stat[last] += 1 if (items[i].is_pronoun or items[i].is_personal_pronoun): res.anafor = items[i].begin_token if ((((typ) & (NounPhraseParseAttr.PARSEPRONOUNS))) == ( NounPhraseParseAttr.NO)): continue tt = Utils.asObjectOrNull(items[i].begin_token, TextToken) if (tt is not None and not tt.term.startswith("ВЫСШ")): err = False for wf in tt.morph.items: if (wf.class0_.is_adjective): if (wf.containsAttr("прев.", None)): if ((((typ) & (NounPhraseParseAttr.IGNOREADJBEST))) != (NounPhraseParseAttr.NO)): err = True if (wf.containsAttr("к.ф.", None) and tt.morph.class0_.is_personal_pronoun): return None if (err): continue if (res.morph.case_.is_nominative): v = MiscHelper.getTextValueOfMetaToken(items[i], 
# (nominative-case adjectives get an extra synthesized text variant;
# then inter-adjective whitespace/newline/case checks, final morph rebuild,
# comma/conjunction count validation and the stat-based variant reordering.)
GetTextAttr.KEEPQUOTES) if (not Utils.isNullOrEmpty(v)): if (items[i].getNormalCaseText( None, False, MorphGender.UNDEFINED, False) != v): wf = NounPhraseItemTextVar(items[i].morph, None) wf.normal_value = v wf.class0_ = MorphClass.ADJECTIVE wf.case_ = res.morph.case_ if (res.morph.case_.is_prepositional or res.morph.gender == MorphGender.NEUTER or res.morph.gender == MorphGender.FEMINIE): items[i].morph.addItem(wf) else: items[i].morph.insertItem(0, wf) res.adjectives.append(items[i]) if (items[i].end_char > res.end_char): res.end_token = items[i].end_token i = 0 first_pass2791 = True while True: if first_pass2791: first_pass2791 = False else: i += 1 if (not (i < (len(res.adjectives) - 1))): break if (res.adjectives[i].whitespaces_after_count > 5): if (res.adjectives[i].chars != res.adjectives[i + 1].chars): if (not res.adjectives[i + 1].chars.is_all_lower): return None if (res.adjectives[i].chars.is_all_upper and res.adjectives[i + 1].chars.is_capital_upper): return None if (res.adjectives[i].chars.is_capital_upper and res.adjectives[i + 1].chars.is_all_upper): return None if (res.adjectives[i].whitespaces_after_count > 10): if (res.adjectives[i].newlines_after_count == 1): if (res.adjectives[i].chars.is_capital_upper and i == 0 and res.adjectives[i + 1].chars.is_all_lower): continue if (res.adjectives[i].chars == res.adjectives[ i + 1].chars): continue return None if (need_update_morph): noun.morph = MorphCollection() for v in noun.noun_morph: noun.morph.addItem(v) res.morph = noun.morph if (len(res.adjectives) > 0): if (noun.begin_token.previous is not None): if (noun.begin_token.previous.is_comma_and): if (res.adjectives[0].begin_char > noun.begin_char): pass else: return None zap = 0 and0_ = 0 cou = 0 last_and = False i = 0 while i < (len(res.adjectives) - 1): te = res.adjectives[i].end_token.next0_ if (te is None): return None if (te.isChar('(')): pass elif (te.is_comma): zap += 1 elif (te.is_and): and0_ += 1 if (i == (len(res.adjectives) - 2)): last_and = 
True if (not res.adjectives[i].begin_token.morph.class0_.is_pronoun ): cou += 1 i += 1 if ((zap + and0_) > 0): if (and0_ > 1): return None elif (and0_ == 1 and not last_and): return None if ((zap + and0_) != cou): if (and0_ == 1): pass else: return None last = Utils.asObjectOrNull( res.adjectives[len(res.adjectives) - 1], NounPhraseItem) if (last.is_pronoun and not last_and): return None if (stat is not None): for adj in items: if (adj.morph.items_count > 1): w1 = Utils.asObjectOrNull(adj.morph.getIndexerItem(0), NounPhraseItemTextVar) w2 = Utils.asObjectOrNull(adj.morph.getIndexerItem(1), NounPhraseItemTextVar) if ((len(w1.normal_value) < 2) or (len(w2.normal_value) < 2)): break l1 = w1.normal_value[len(w1.normal_value) - 1] l2 = w2.normal_value[len(w2.normal_value) - 1] i1 = 0 i2 = 0 wrapi1534 = RefOutArgWrapper(0) Utils.tryGetValue(stat, l1, wrapi1534) i1 = wrapi1534.value wrapi2533 = RefOutArgWrapper(0) Utils.tryGetValue(stat, l2, wrapi2533) i2 = wrapi2533.value if (i1 < i2): adj.morph.removeItem(1) adj.morph.insertItem(0, w2) if (res.begin_token.getMorphClassInDictionary().is_verb and len(items) > 0): if (not res.begin_token.chars.is_all_lower or res.begin_token.previous is None): pass elif (res.begin_token.previous.morph.class0_.is_preposition): pass else: comma = False tt = res.begin_token.previous first_pass2792 = True while True: if first_pass2792: first_pass2792 = False else: tt = tt.previous if (not (tt is not None)): break if (tt.morph.class0_.is_adverb): continue if (tt.isCharOf(".;")): break if (tt.is_comma): comma = True continue if (tt.isValue("НЕ", None)): continue if (((tt.morph.class0_.is_noun or tt.morph.class0_.is_proper)) and comma): for it in res.begin_token.morph.items: if (it.class0_.is_verb and (isinstance(it, MorphWordForm))): if (tt.morph.checkAccord(it, False)): if (res.morph.case_.is_instrumental): return None ews = Explanatory.findDerivates( (it).normal_case, True, tt.morph.language) if (ews is not None): for ew in ews: if 
(ew.transitive > 0): if (res.morph.case_.is_genitive ): return None if (ew.nexts is not None): wrapcm535 = RefOutArgWrapper( None) inoutres536 = Utils.tryGetValue( ew.nexts, "", wrapcm535) cm = wrapcm535.value if (inoutres536): if (not ( (cm) & res.morph.case_ ).is_undefined): return None break if (res.begin_token == res.end_token): mc = res.begin_token.getMorphClassInDictionary() if (mc.is_adverb): if (res.begin_token.previous is not None and res.begin_token.previous.morph.class0_.is_preposition): pass elif (mc.is_noun and not mc.is_preposition and not mc.is_conjunction): pass elif (res.begin_token.isValue("ВЕСЬ", None)): pass else: return None return res
# parse_near_items: semantic sentence-item tokenizer (belongs to SentItem —
# it uses SentItem.__m_npt_attrs / __parse_subsent / __parse_participles).
# Starting at token `t` (bounded by `t1` and recursion depth `lev` <= 100),
# it tries, in order: an already-built ReferentToken; a delimiter; a
# conjunction; a preposition followed by a number-with-units
# (NumbersWithUnitToken) — where a bare number plus a following noun phrase
# becomes a quantified SentItem, with special handling for "1 ... ОДИН"
# (ONEOF attribute), "РАЗ" (FORMULA type), genitive plural agreement after
# 2/3/4, and a trailing "НА <more/less adverb>"; then an adverb, a noun
# phrase (NounPhraseHelper) and a verb phrase (VerbPhraseHelper), including
# the "КОТОРЫЙ" subordinate-clause branch, the "СКОЛЬКО (ВСЕГО)" quantity
# question, multi-variant dictionary verbs (one SentItem per in-dictionary
# MorphWordForm, imperative form reordered first), participle continuation,
# and the OTHER/ONEOF adverb attribute propagation from previous items.
# Returns a list of SentItem, or None when nothing matches.
# NOTE(review): in the adverb OTHER branch, SemAttributeEx._new2945(num, a)
# references `num`, which is None on that path unless a number parse occurred
# earlier — matches the generated upstream source, left untouched; confirm
# against the original C# before changing.
def parse_near_items(t : 'Token', t1 : 'Token', lev : int, prev : typing.List['SentItem']) -> typing.List['SentItem']: if (lev > 100): return None if (t is None or t.begin_char > t1.end_char): return None res = list() if (isinstance(t, ReferentToken)): res.append(SentItem(Utils.asObjectOrNull(t, MetaToken))) return res delim = DelimToken.try_parse(t) if (delim is not None): res.append(SentItem(delim)) return res conj = ConjunctionHelper.try_parse(t) if (conj is not None): res.append(SentItem(conj)) return res prep_ = PrepositionHelper.try_parse(t) t111 = (t if prep_ is None else prep_.end_token.next0_) if ((isinstance(t111, NumberToken)) and ((t111.morph.class0_.is_adjective and not t111.morph.class0_.is_noun))): t111 = (None) num = (None if t111 is None else NumbersWithUnitToken.try_parse(t111, None, False, False, False, False)) if (num is not None): if (len(num.units) == 0): npt1 = NounPhraseHelper.try_parse(num.end_token.next0_, SentItem.__m_npt_attrs, 0, None) if (npt1 is None and num.end_token.next0_ is not None and num.end_token.next0_.is_value("РАЗ", None)): npt1 = NounPhraseToken(num.end_token.next0_, num.end_token.next0_) npt1.noun = MetaToken(num.end_token.next0_, num.end_token.next0_) if (npt1 is not None and prep_ is not None): if (npt1.noun.end_token.is_value("РАЗ", None)): npt1.morph.remove_items(prep_.next_case, False) elif (((npt1.morph.case_) & prep_.next_case).is_undefined): npt1 = (None) else: npt1.morph.remove_items(prep_.next_case, False) if ((npt1 is not None and npt1.end_token.is_value("ОНИ", None) and npt1.preposition is not None) and npt1.preposition.normal == "ИЗ"): npt1.morph = MorphCollection(num.end_token.morph) npt1.preposition = (None) nn = str(num) si1 = SentItem(npt1) if (nn == "1" and (isinstance(num.end_token, NumberToken)) and num.end_token.end_token.is_value("ОДИН", None)): a = SemAttribute._new2946(SemAttributeType.ONEOF, num.end_token.end_token.get_normal_case_text(None, MorphNumber.SINGULAR, MorphGender.UNDEFINED, False)) aex 
= SemAttributeEx._new2945(num, a) si1.attrs = list() si1.attrs.append(aex) else: si1.quant = SemQuantity(nn, num.begin_token, num.end_token) if (prep_ is not None): si1.prep = prep_.normal res.append(si1) return res if (npt1 is not None): si1 = SentItem._new2948(npt1, SemQuantity(str(num), num.begin_token, num.end_token)) if (prep_ is not None): si1.prep = prep_.normal if (npt1.end_token.is_value("РАЗ", None)): si1.typ = SentItemType.FORMULA if (((npt1.morph.number) & (MorphNumber.PLURAL)) == (MorphNumber.UNDEFINED) and si1.quant.spelling != "1"): ok = False if (si1.quant.spelling.endswith("1")): ok = True elif (si1.typ == SentItemType.FORMULA): ok = True elif (si1.quant.spelling.endswith("2") and npt1.morph.case_.is_genitive): ok = True elif (si1.quant.spelling.endswith("3") and npt1.morph.case_.is_genitive): ok = True elif (si1.quant.spelling.endswith("4") and npt1.morph.case_.is_genitive): ok = True if (ok): npt1.morph = MorphCollection() npt1.morph.number = MorphNumber.PLURAL res.append(si1) return res num.begin_token = t num.morph = MorphCollection(num.end_token.morph) si = SentItem(num) if (prep_ is not None): si.prep = prep_.normal res.append(si) if (si.prep == "НА"): aa = AdverbToken.try_parse(si.end_token.next0_) if (aa is not None and ((aa.typ == SemAttributeType.LESS or aa.typ == SemAttributeType.GREAT))): si.add_attr(aa) si.end_token = aa.end_token return res mc = t.get_morph_class_in_dictionary() adv = AdverbToken.try_parse(t) npt = NounPhraseHelper.try_parse(t, SentItem.__m_npt_attrs, 0, None) if (npt is not None and (isinstance(npt.end_token, TextToken)) and npt.end_token.term == "БЫЛИ"): npt = (None) if (npt is not None and adv is not None): if (adv.end_char > npt.end_char): npt = (None) elif (adv.end_char == npt.end_char): res.append(SentItem(npt)) res.append(SentItem(adv)) return res if (npt is not None and len(npt.adjectives) == 0): if (npt.end_token.is_value("КОТОРЫЙ", None) and t.previous is not None and t.previous.is_comma_and): res1 = 
SentItem.__parse_subsent(npt, t1, lev + 1, prev) if (res1 is not None): return res1 if (npt.end_token.is_value("СКОЛЬКО", None)): tt1 = npt.end_token.next0_ if (tt1 is not None and tt1.is_value("ВСЕГО", None)): tt1 = tt1.next0_ npt1 = NounPhraseHelper.try_parse(tt1, NounPhraseParseAttr.NO, 0, None) if (npt1 is not None and not npt1.morph.case_.is_undefined and prep_ is not None): if (((prep_.next_case) & npt1.morph.case_).is_undefined): npt1 = (None) else: npt1.morph.remove_items(prep_.next_case, False) if (npt1 is not None): npt1.begin_token = npt.begin_token npt1.preposition = npt.preposition npt1.adjectives.append(MetaToken(npt.end_token, npt.end_token)) npt = npt1 if (npt.end_token.morph.class0_.is_adjective): if (VerbPhraseHelper.try_parse(t, True, False, False) is not None): npt = (None) vrb = None if (npt is not None and len(npt.adjectives) > 0): vrb = VerbPhraseHelper.try_parse(t, True, False, False) if (vrb is not None and vrb.first_verb.is_participle): npt = (None) elif (adv is None or npt is not None): vrb = VerbPhraseHelper.try_parse(t, True, False, False) if (npt is not None): res.append(SentItem(npt)) if (vrb is not None and not vrb.first_verb.is_participle and not vrb.first_verb.is_dee_participle): vars0_ = list() for wf in vrb.first_verb.morph.items: if (wf.class0_.is_verb and (isinstance(wf, MorphWordForm)) and wf.is_in_dictionary): vars0_.append(Utils.asObjectOrNull(wf, MorphWordForm)) if (len(vars0_) < 2): res.append(SentItem(vrb)) else: vrb.first_verb.verb_morph = vars0_[0] res.append(SentItem(vrb)) i = 1 while i < len(vars0_): vrb = VerbPhraseHelper.try_parse(t, False, False, False) if (vrb is None): break vrb.first_verb.verb_morph = vars0_[i] res.append(SentItem(vrb)) i += 1 if (vars0_[0].misc.mood == MorphMood.IMPERATIVE and vars0_[1].misc.mood != MorphMood.IMPERATIVE): rr = res[0] res[0] = res[1] res[1] = rr return res if (vrb is not None): res1 = SentItem.__parse_participles(vrb, t1, lev + 1) if (res1 is not None): res.extend(res1) if 
(len(res) > 0): return res if (adv is not None): if (adv.typ == SemAttributeType.OTHER): npt1 = NounPhraseHelper.try_parse(adv.end_token.next0_, SentItem.__m_npt_attrs, 0, None) if (npt1 is not None and npt1.end_token.is_value("ОНИ", None) and npt1.preposition is not None): si1 = SentItem(npt1) a = SemAttribute._new2946(SemAttributeType.OTHER, adv.end_token.get_normal_case_text(None, MorphNumber.UNDEFINED, MorphGender.UNDEFINED, False)) aex = SemAttributeEx._new2945(num, a) si1.attrs = list() si1.attrs.append(aex) if (prep_ is not None): si1.prep = prep_.normal res.append(si1) return res for i in range(len(prev) - 1, -1, -1): if (prev[i].attrs is not None): for a in prev[i].attrs: if (a.attr.typ == SemAttributeType.ONEOF): si1 = SentItem(prev[i].source) aa = SemAttribute._new2946(SemAttributeType.OTHER, adv.end_token.get_normal_case_text(None, MorphNumber.UNDEFINED, MorphGender.UNDEFINED, False)) aex = SemAttributeEx._new2945(adv, aa) si1.attrs = list() si1.attrs.append(aex) if (prep_ is not None): si1.prep = prep_.normal si1.begin_token = adv.begin_token si1.end_token = adv.end_token res.append(si1) return res res.append(SentItem(adv)) return res if (mc.is_adjective): npt = NounPhraseToken._new2953(t, t, MorphCollection(t.morph)) npt.noun = MetaToken(t, t) res.append(SentItem(npt)) return res return None
def try_attach_territory(li : typing.List['TerrItemToken'], ad : 'AnalyzerData', attach_always : bool=False, cits : typing.List['CityItemToken']=None, exists : typing.List['GeoReferent']=None) -> 'ReferentToken':
    """Try to build a territory (GeoReferent) from a sequence of TerrItemToken.

    NOTE(review): this body was recovered from a whitespace-mangled source;
    the block nesting below is reconstructed from syntax — verify against the
    upstream Pullenti sources before relying on exact branch structure.

    Args:
        li: parsed territory items (nouns like "область", adjectives, onto hits).
        ad: analyzer data (local ontology state).
        attach_always: relax validation and attach even doubtful variants.
        cits: optional city items parsed just before, used as extra evidence.
        exists: already extracted GeoReferent-s, used to confirm doubtful names.
    Returns:
        ReferentToken wrapping the new/merged GeoReferent, or None.
    """
    if (li is None or len(li) == 0):
        return None
    ex_obj = None       # item matched against the ontology (known object)
    new_name = None     # item supplying a brand-new name
    adj_list = list()   # adjective items preceding the noun
    noun = None         # the territory noun item ("район", "область", ...)
    add_noun = None     # trailing extra noun (e.g. after "МО")
    # special cases handled by dedicated helpers
    rt = TerrAttachHelper.__try_attach_moscowao(li, ad)
    if (rt is not None):
        return rt
    if (li[0].termin_item is not None and li[0].termin_item.canonic_text == "ТЕРРИТОРИЯ"):
        res2 = TerrAttachHelper.__try_attach_pure_terr(li, ad)
        return res2
    if (len(li) == 2):
        # railway (РЖД) direction pairs in either order
        if (li[0].rzd is not None and li[1].rzd_dir is not None):
            rzd = GeoReferent()
            rzd._add_name(li[1].rzd_dir)
            rzd._add_typ_ter(li[0].kit.base_language)
            rzd.add_slot(GeoReferent.ATTR_REF, li[0].rzd.referent, False, 0)
            rzd.add_ext_referent(li[0].rzd)
            return ReferentToken(rzd, li[0].begin_token, li[1].end_token)
        if (li[1].rzd is not None and li[0].rzd_dir is not None):
            rzd = GeoReferent()
            rzd._add_name(li[0].rzd_dir)
            rzd._add_typ_ter(li[0].kit.base_language)
            rzd.add_slot(GeoReferent.ATTR_REF, li[1].rzd.referent, False, 0)
            rzd.add_ext_referent(li[1].rzd)
            return ReferentToken(rzd, li[0].begin_token, li[1].end_token)
    can_be_city_before = False
    adj_terr_before = False
    if (cits is not None):
        if (cits[0].typ == CityItemToken.ItemType.CITY):
            can_be_city_before = True
        elif (cits[0].typ == CityItemToken.ItemType.NOUN and len(cits) > 1):
            can_be_city_before = True
    # --- first pass: classify items into ex_obj / noun / adjectives / new_name ---
    k = 0
    k = 0
    while k < len(li):
        if (li[k].onto_item is not None):
            if (ex_obj is not None or new_name is not None):
                break
            if (noun is not None):
                if (k == 1):
                    if (noun.termin_item.canonic_text == "РАЙОН" or noun.termin_item.canonic_text == "ОБЛАСТЬ" or noun.termin_item.canonic_text == "СОЮЗ"):
                        if (isinstance(li[k].onto_item.referent, GeoReferent)):
                            if (li[k].onto_item.referent.is_state):
                                break
                        # known object right after a generic noun: require extra evidence
                        ok = False
                        tt = li[k].end_token.next0_
                        if (tt is None):
                            ok = True
                        elif (tt.is_char_of(",.")):
                            ok = True
                        if (not ok):
                            ok = MiscLocationHelper.check_geo_object_before(li[0].begin_token)
                        if (not ok):
                            adr = AddressItemToken.try_parse(tt, None, False, False, None)
                            if (adr is not None):
                                if (adr.typ == AddressItemToken.ItemType.STREET):
                                    ok = True
                        if (not ok):
                            break
                    if (li[k].onto_item is not None):
                        # ambiguous acronyms МО/ЛО cannot precede a known object
                        if (noun.begin_token.is_value("МО", None) or noun.begin_token.is_value("ЛО", None)):
                            return None
            ex_obj = li[k]
        elif (li[k].termin_item is not None):
            if (noun is not None):
                break
            if (li[k].termin_item.is_always_prefix and k > 0):
                break
            if (k > 0 and li[k].is_doubt):
                if (li[k].begin_token == li[k].end_token and li[k].begin_token.is_value("ЗАО", None)):
                    break
            if (li[k].termin_item.is_adjective or li[k].is_geo_in_dictionary):
                adj_list.append(li[k])
            else:
                if (ex_obj is not None):
                    geo_ = Utils.asObjectOrNull(ex_obj.onto_item.referent, GeoReferent)
                    if (geo_ is None):
                        break
                    if (ex_obj.is_adjective and ((li[k].termin_item.canonic_text == "СОЮЗ" or li[k].termin_item.canonic_text == "ФЕДЕРАЦИЯ"))):
                        str0_ = str(ex_obj.onto_item)
                        if (not li[k].termin_item.canonic_text in str0_):
                            return None
                    if (li[k].termin_item.canonic_text == "РАЙОН" or li[k].termin_item.canonic_text == "ОКРУГ" or li[k].termin_item.canonic_text == "КРАЙ"):
                        # noun type must agree with the types of the known object
                        tmp = io.StringIO()
                        for s in geo_.slots:
                            if (s.type_name == GeoReferent.ATTR_TYPE):
                                print("{0};".format(s.value), end="", file=tmp, flush=True)
                        if (not li[k].termin_item.canonic_text in Utils.toStringStringIO(tmp).upper()):
                            if (k != 1 or new_name is not None):
                                break
                            # reinterpret the onto hit as a plain adjective name
                            new_name = li[0]
                            new_name.is_adjective = True
                            new_name.onto_item = (None)
                            ex_obj = (None)
                noun = li[k]
                if (k == 0):
                    tt = TerrItemToken.try_parse(li[k].begin_token.previous, None, True, False, None)
                    if (tt is not None and tt.morph.class0_.is_adjective):
                        adj_terr_before = True
        else:
            if (ex_obj is not None):
                break
            if (new_name is not None):
                break
            new_name = li[k]
        k += 1
    name = None
    alt_name = None
    full_name = None
    morph_ = None
    if (ex_obj is not None):
        # --- variant 1: the territory is a known ontology object ---
        if (ex_obj.is_adjective and not ex_obj.morph.language.is_en and noun is None):
            if (attach_always and ex_obj.end_token.next0_ is not None):
                npt = NounPhraseHelper.try_parse(ex_obj.begin_token, NounPhraseParseAttr.NO, 0, None)
                if (ex_obj.end_token.next0_.is_comma_and):
                    pass
                elif (npt is None):
                    pass
                else:
                    str0_ = StreetItemToken.try_parse(ex_obj.end_token.next0_, None, False, None, False)
                    if (str0_ is not None):
                        if (str0_.typ == StreetItemType.NOUN and str0_.end_token == npt.end_token):
                            return None
            else:
                # a bare adjective must be followed by a city item to qualify
                cit = CityItemToken.try_parse(ex_obj.end_token.next0_, None, False, None)
                if (cit is not None and ((cit.typ == CityItemToken.ItemType.NOUN or cit.typ == CityItemToken.ItemType.CITY))):
                    npt = NounPhraseHelper.try_parse(ex_obj.begin_token, NounPhraseParseAttr.NO, 0, None)
                    if (npt is not None and npt.end_token == cit.end_token):
                        pass
                    else:
                        return None
                elif (ex_obj.begin_token.is_value("ПОДНЕБЕСНЫЙ", None)):
                    pass
                else:
                    return None
        if (noun is None and ex_obj.can_be_city):
            cit0 = CityItemToken.try_parse_back(ex_obj.begin_token.previous)
            if (cit0 is not None and cit0.typ != CityItemToken.ItemType.PROPERNAME):
                return None
        if (ex_obj.is_doubt and noun is None):
            # doubtful hit without a noun: look around for supporting context
            ok2 = False
            if (TerrAttachHelper.__can_be_geo_after(ex_obj.end_token.next0_)):
                ok2 = True
            elif (not ex_obj.can_be_surname and not ex_obj.can_be_city):
                if ((ex_obj.end_token.next0_ is not None and ex_obj.end_token.next0_.is_char(')') and ex_obj.begin_token.previous is not None) and ex_obj.begin_token.previous.is_char('(')):
                    ok2 = True
            elif (ex_obj.chars.is_latin_letter and ex_obj.begin_token.previous is not None):
                if (ex_obj.begin_token.previous.is_value("IN", None)):
                    ok2 = True
                elif (ex_obj.begin_token.previous.is_value("THE", None) and ex_obj.begin_token.previous.previous is not None and ex_obj.begin_token.previous.previous.is_value("IN", None)):
                    ok2 = True
            if (not ok2):
                cit0 = CityItemToken.try_parse_back(ex_obj.begin_token.previous)
                if (cit0 is not None and cit0.typ != CityItemToken.ItemType.PROPERNAME):
                    pass
                elif (MiscLocationHelper.check_geo_object_before(ex_obj.begin_token.previous)):
                    pass
                else:
                    return None
        name = ex_obj.onto_item.canonic_text
        morph_ = ex_obj.morph
    elif (new_name is not None):
        # --- variant 2: a new name next to a territory noun ---
        if (noun is None):
            return None
        j = 1
        while j < k:
            if (li[j].is_newline_before and not li[0].is_newline_before):
                if (BracketHelper.can_be_start_of_sequence(li[j].begin_token, False, False)):
                    pass
                else:
                    return None
            j += 1
        morph_ = noun.morph
        if (new_name.is_adjective):
            if (noun.termin_item.acronym == "АО"):
                if (noun.begin_token != noun.end_token):
                    return None
                if (new_name.morph.gender != MorphGender.FEMINIE):
                    return None
        geo_before = None
        tt0 = li[0].begin_token.previous
        if (tt0 is not None and tt0.is_comma_and):
            tt0 = tt0.previous
        if (not li[0].is_newline_before and tt0 is not None):
            geo_before = (Utils.asObjectOrNull(tt0.get_referent(), GeoReferent))
        if (Utils.indexOfList(li, noun, 0) < Utils.indexOfList(li, new_name, 0)):
            # noun precedes the name ("район Строгино")
            if (noun.termin_item.is_state):
                return None
            if (new_name.can_be_surname and geo_before is None):
                if (((noun.morph.case_) & new_name.morph.case_).is_undefined):
                    return None
            if (MiscHelper.is_exists_in_dictionary(new_name.begin_token, new_name.end_token, (MorphClass.ADJECTIVE) | MorphClass.PRONOUN | MorphClass.VERB)):
                if (noun.begin_token != new_name.begin_token):
                    if (geo_before is None):
                        if (len(li) == 2 and TerrAttachHelper.__can_be_geo_after(li[1].end_token.next0_)):
                            pass
                        elif (len(li) == 3 and li[2].termin_item is not None and TerrAttachHelper.__can_be_geo_after(li[2].end_token.next0_)):
                            pass
                        elif (new_name.is_geo_in_dictionary):
                            pass
                        elif (new_name.end_token.is_newline_after):
                            pass
                        else:
                            return None
            npt = NounPhraseHelper.try_parse(new_name.end_token, NounPhraseParseAttr.PARSEPRONOUNS, 0, None)
            if (npt is not None and npt.end_token != new_name.end_token):
                if (len(li) >= 3 and li[2].termin_item is not None and npt.end_token == li[2].end_token):
                    add_noun = li[2]
                else:
                    return None
            rtp = new_name.kit.process_referent("PERSON", new_name.begin_token)
            if (rtp is not None):
                return None
            name = ProperNameHelper.get_name_ex(new_name.begin_token, new_name.end_token, MorphClass.ADJECTIVE, MorphCase.UNDEFINED, noun.termin_item.gender, False, False)
        else:
            # name precedes the noun ("Московская область")
            ok = False
            if (((k + 1) < len(li)) and li[k].termin_item is None and li[k + 1].termin_item is not None):
                ok = True
            elif ((k < len(li)) and li[k].onto_item is not None):
                ok = True
            elif (k == len(li) and not new_name.is_adj_in_dictionary):
                ok = True
            elif (MiscLocationHelper.check_geo_object_before(li[0].begin_token) or can_be_city_before):
                ok = True
            elif (MiscLocationHelper.check_geo_object_after(li[k - 1].end_token, False)):
                ok = True
            elif (len(li) == 3 and k == 2):
                cit = CityItemToken.try_parse(li[2].begin_token, None, False, None)
                if (cit is not None):
                    if (cit.typ == CityItemToken.ItemType.CITY or cit.typ == CityItemToken.ItemType.NOUN):
                        ok = True
            elif (len(li) == 2):
                ok = TerrAttachHelper.__can_be_geo_after(li[len(li) - 1].end_token.next0_)
            if (not ok and not li[0].is_newline_before and not li[0].chars.is_all_lower):
                rt00 = li[0].kit.process_referent("PERSONPROPERTY", li[0].begin_token.previous)
                if (rt00 is not None):
                    ok = True
            if (noun.termin_item is not None and noun.termin_item.is_strong and new_name.is_adjective):
                ok = True
            if (noun.is_doubt and len(adj_list) == 0 and geo_before is None):
                return None
            name = ProperNameHelper.get_name_ex(new_name.begin_token, new_name.end_token, MorphClass.ADJECTIVE, MorphCase.UNDEFINED, noun.termin_item.gender, False, False)
            if (not ok and not attach_always):
                # dictionary word used as a name: confirm against already extracted objects
                if (MiscHelper.is_exists_in_dictionary(new_name.begin_token, new_name.end_token, (MorphClass.ADJECTIVE) | MorphClass.PRONOUN | MorphClass.VERB)):
                    if (exists is not None):
                        for e0_ in exists:
                            if (e0_.find_slot(GeoReferent.ATTR_NAME, name, True) is not None):
                                ok = True
                                break
                    if (not ok):
                        return None
            full_name = "{0} {1}".format(ProperNameHelper.get_name_ex(li[0].begin_token, noun.begin_token.previous, MorphClass.ADJECTIVE, MorphCase.UNDEFINED, noun.termin_item.gender, False, False), noun.termin_item.canonic_text)
    else:
        # --- variant 3: only a noun was found; try to bind to a nearby GeoReferent ---
        if (not attach_always or ((noun.termin_item is not None and noun.termin_item.canonic_text == "ФЕДЕРАЦИЯ"))):
            is_latin = noun.chars.is_latin_letter and new_name.chars.is_latin_letter
            if (Utils.indexOfList(li, noun, 0) > Utils.indexOfList(li, new_name, 0)):
                if (not is_latin):
                    return None
            if (not new_name.is_district_name and not BracketHelper.can_be_start_of_sequence(new_name.begin_token, False, False)):
                if (len(adj_list) == 0 and MiscHelper.is_exists_in_dictionary(new_name.begin_token, new_name.end_token, (MorphClass.NOUN) | MorphClass.PRONOUN)):
                    if (len(li) == 2 and noun.is_city_region and (noun.whitespaces_after_count < 2)):
                        pass
                    else:
                        return None
            if (not is_latin):
                if ((noun.termin_item.is_region and not attach_always and ((not adj_terr_before or new_name.is_doubt))) and not noun.is_city_region and not noun.termin_item.is_specific_prefix):
                    if (not MiscLocationHelper.check_geo_object_before(noun.begin_token)):
                        if (not noun.is_doubt and noun.begin_token != noun.end_token):
                            pass
                        elif ((noun.termin_item.is_always_prefix and len(li) == 2 and li[0] == noun) and li[1] == new_name):
                            pass
                        else:
                            return None
                if (noun.is_doubt and len(adj_list) == 0):
                    if (noun.termin_item.acronym == "МО" or noun.termin_item.acronym == "ЛО"):
                        if (k == (len(li) - 1) and li[k].termin_item is not None):
                            add_noun = li[k]
                            k += 1
                        elif (len(li) == 2 and noun == li[0] and str(new_name).endswith("совет")):
                            pass
                        else:
                            return None
                    else:
                        return None
                pers = new_name.kit.process_referent("PERSON", new_name.begin_token)
                if (pers is not None):
                    return None
        name = MiscHelper.get_text_value(new_name.begin_token, new_name.end_token, GetTextAttr.NO)
        if (new_name.begin_token != new_name.end_token):
            # strip a trailing repetition of the noun from the name text
            ttt = new_name.begin_token.next0_
            while ttt is not None and ttt.end_char <= new_name.end_char:
                if (ttt.chars.is_letter):
                    ty = TerrItemToken.try_parse(ttt, None, False, False, None)
                    if ((ty is not None and ty.termin_item is not None and noun is not None) and ((noun.termin_item.canonic_text in ty.termin_item.canonic_text or ty.termin_item.canonic_text in noun.termin_item.canonic_text))):
                        name = MiscHelper.get_text_value(new_name.begin_token, ttt.previous, GetTextAttr.NO)
                        break
                ttt = ttt.next0_
        if (len(adj_list) > 0):
            npt = NounPhraseHelper.try_parse(adj_list[0].begin_token, NounPhraseParseAttr.NO, 0, None)
            if (npt is not None and npt.end_token == noun.end_token):
                alt_name = "{0} {1}".format(npt.get_normal_case_text(None, MorphNumber.UNDEFINED, MorphGender.UNDEFINED, False), name)
        if ((len(li) == 1 and noun is not None and noun.end_token.next0_ is not None) and (isinstance(noun.end_token.next0_.get_referent(), GeoReferent))):
            g = Utils.asObjectOrNull(noun.end_token.next0_.get_referent(), GeoReferent)
            if (noun.termin_item is not None):
                tyy = noun.termin_item.canonic_text.lower()
                ooo = False
                if (g.find_slot(GeoReferent.ATTR_TYPE, tyy, True) is not None):
                    ooo = True
                elif (tyy.endswith("район") and g.find_slot(GeoReferent.ATTR_TYPE, "район", True) is not None):
                    ooo = True
                if (ooo):
                    return ReferentToken._new734(g, noun.begin_token, noun.end_token.next0_, noun.begin_token.morph)
        if ((len(li) == 1 and noun == li[0] and li[0].termin_item is not None) and TerrItemToken.try_parse(li[0].end_token.next0_, None, True, False, None) is None and TerrItemToken.try_parse(li[0].begin_token.previous, None, True, False, None) is None):
            if (li[0].morph.number == MorphNumber.PLURAL):
                return None
            # scan backwards (bounded) for a GeoReferent with a matching type
            cou = 0
            str0_ = li[0].termin_item.canonic_text.lower()
            tt = li[0].begin_token.previous
            first_pass3158 = True
            while True:
                if first_pass3158: first_pass3158 = False
                else: tt = tt.previous
                if (not (tt is not None)): break
                if (tt.is_newline_after):
                    cou += 10
                else:
                    cou += 1
                if (cou > 500):
                    break
                g = Utils.asObjectOrNull(tt.get_referent(), GeoReferent)
                if (g is None):
                    continue
                # make sure no further territory item follows (bounded forward scan)
                ok = True
                cou = 0
                tt = li[0].end_token.next0_
                first_pass3159 = True
                while True:
                    if first_pass3159: first_pass3159 = False
                    else: tt = tt.next0_
                    if (not (tt is not None)): break
                    if (tt.is_newline_before):
                        cou += 10
                    else:
                        cou += 1
                    if (cou > 500):
                        break
                    tee = TerrItemToken.try_parse(tt, None, True, False, None)
                    if (tee is None):
                        continue
                    ok = False
                    break
                if (ok):
                    ii = 0
                    while g is not None and (ii < 3):
                        if (g.find_slot(GeoReferent.ATTR_TYPE, str0_, True) is not None):
                            return ReferentToken._new734(g, li[0].begin_token, li[0].end_token, noun.begin_token.morph)
                        g = g.higher
                        ii += 1
                break
        return None
    # --- assemble the resulting GeoReferent ---
    ter = None
    if (ex_obj is not None and (isinstance(ex_obj.tag, GeoReferent))):
        ter = (Utils.asObjectOrNull(ex_obj.tag, GeoReferent))
    else:
        ter = GeoReferent()
    if (ex_obj is not None):
        geo_ = Utils.asObjectOrNull(ex_obj.onto_item.referent, GeoReferent)
        if (geo_ is not None and not geo_.is_city):
            ter._merge_slots2(geo_, li[0].kit.base_language)
        else:
            ter._add_name(name)
        if (noun is None and ex_obj.can_be_city):
            ter._add_typ_city(li[0].kit.base_language)
        else:
            pass
    elif (new_name is not None):
        ter._add_name(name)
        if (alt_name is not None):
            ter._add_name(alt_name)
    if (noun is not None):
        if (noun.termin_item.canonic_text == "АО"):
            ter._add_typ(("АВТОНОМНИЙ ОКРУГ" if li[0].kit.base_language.is_ua else "АВТОНОМНЫЙ ОКРУГ"))
        elif (noun.termin_item.canonic_text == "МУНИЦИПАЛЬНОЕ СОБРАНИЕ" or noun.termin_item.canonic_text == "МУНІЦИПАЛЬНЕ ЗБОРИ"):
            ter._add_typ(("МУНІЦИПАЛЬНЕ УТВОРЕННЯ" if li[0].kit.base_language.is_ua else "МУНИЦИПАЛЬНОЕ ОБРАЗОВАНИЕ"))
        elif (noun.termin_item.acronym == "МО" and add_noun is not None):
            ter._add_typ(add_noun.termin_item.canonic_text)
        else:
            if (noun.termin_item.canonic_text == "СОЮЗ" and ex_obj is not None and ex_obj.end_char > noun.end_char):
                return ReferentToken._new734(ter, ex_obj.begin_token, ex_obj.end_token, ex_obj.morph)
            ter._add_typ(noun.termin_item.canonic_text)
            if (noun.termin_item.is_region and ter.is_state):
                ter._add_typ_reg(li[0].kit.base_language)
    if (ter.is_state and ter.is_region):
        for a in adj_list:
            if (a.termin_item.is_region):
                ter._add_typ_reg(li[0].kit.base_language)
                break
    if (ter.is_state):
        if (full_name is not None):
            ter._add_name(full_name)
    res = ReferentToken(ter, li[0].begin_token, li[k - 1].end_token)
    if (noun is not None and noun.morph.class0_.is_noun):
        res.morph = noun.morph
    else:
        # merge morphology of all consumed items, mapping adjectives to nouns
        res.morph = MorphCollection()
        ii = 0
        while ii < k:
            for v in li[ii].morph.items:
                bi = MorphBaseInfo()
                bi.copy_from(v)
                if (noun is not None):
                    if (bi.class0_.is_adjective):
                        bi.class0_ = MorphClass.NOUN
                res.morph.add_item(bi)
            ii += 1
    if (li[0].termin_item is not None and li[0].termin_item.is_specific_prefix):
        res.begin_token = li[0].end_token.next0_
    if (add_noun is not None and add_noun.end_char > res.end_char):
        res.end_token = add_noun.end_token
    if ((isinstance(res.begin_token.previous, TextToken)) and (res.whitespaces_before_count < 2)):
        # absorb a leading "АР" abbreviation before a republic
        tt = Utils.asObjectOrNull(res.begin_token.previous, TextToken)
        if (tt.term == "АР"):
            for ty in ter.typs:
                if ("республика" in ty or "республіка" in ty):
                    res.begin_token = tt
                    break
    return res
def tryAttach(self, t0 : 'Token') -> 'TerminToken':
    """Try to match this abbreviation (self.parts, optional self.tail) at token t0.

    NOTE(review): body recovered from a whitespace-mangled source; the
    `te`/`tt`/`point` bookkeeping below is order-sensitive — verify nesting
    against the upstream Pullenti sources.

    Args:
        t0: the first token of the candidate match (must be a TextToken).
    Returns:
        TerminToken covering the matched span, or None when no match.
    """
    from pullenti.ner.Token import Token
    from pullenti.ner.TextToken import TextToken
    from pullenti.ner.MetaToken import MetaToken
    from pullenti.ner.MorphCollection import MorphCollection
    from pullenti.ner.core.TerminToken import TerminToken
    t1 = Utils.asObjectOrNull(t0, TextToken)
    if (t1 is None):
        return None
    # the first part must match t0 exactly (or as a dictionary value for 1-part terms)
    if (t1.term != self.parts[0].value):
        if (len(self.parts) != 1 or not t1.isValue(self.parts[0].value, None)):
            return None
    if (self.tail is None):
        te = t1         # last token consumed so far
        point = False   # a delimiter ('.', '/', '\\', '-') was just consumed
        if (te.next0_ is not None):
            if (te.next0_.isChar('.')):
                te = te.next0_
                point = True
            elif (len(self.parts) > 1):
                # consume a run of delimiters between abbreviation parts
                while te.next0_ is not None:
                    if (te.next0_.isCharOf("\\/.") or te.next0_.is_hiphen):
                        te = te.next0_
                        point = True
                    else:
                        break
        if (te is None):
            return None
        tt = te.next0_
        # match the remaining parts, each optionally separated by a delimiter
        i = 1
        while i < len(self.parts):
            if (tt is not None and tt.whitespaces_before_count > 2):
                return None
            if (tt is not None and ((tt.is_hiphen or tt.isCharOf("\\/.")))):
                tt = tt.next0_
            elif (not point and self.parts[i - 1].has_delim):
                # a mandatory delimiter after the previous part is missing
                return None
            if (tt is None):
                return None
            if (isinstance(tt, TextToken)):
                tet = Utils.asObjectOrNull(tt, TextToken)
                if (tet.term != self.parts[i].value):
                    if (not tet.isValue(self.parts[i].value, None)):
                        return None
            elif (isinstance(tt, MetaToken)):
                # a MetaToken may stand in only if it wraps a single matching token
                mt = Utils.asObjectOrNull(tt, MetaToken)
                if (mt.begin_token != mt.end_token):
                    return None
                if (not mt.begin_token.isValue(self.parts[i].value, None)):
                    return None
            te = tt
            if (tt.next0_ is not None and ((tt.next0_.isCharOf(".\\/") or tt.next0_.is_hiphen))):
                tt = tt.next0_
                point = True
                if (tt is not None):
                    te = tt
            else:
                point = False
            tt = tt.next0_
            i += 1
        res = TerminToken._new603(t0, te, t0 == te)
        if (point):
            # a trailing delimiter leaves no usable morphology
            res.morph = MorphCollection()
        return res
    # tail variant: "<part> <delim> <tail-initial>"
    t1 = (Utils.asObjectOrNull(t1.next0_, TextToken))
    if (t1 is None or not t1.isCharOf("-\\/")):
        return None
    t1 = (Utils.asObjectOrNull(t1.next0_, TextToken))
    if (t1 is None):
        return None
    if (t1.term[0] != self.tail[0]):
        return None
    return TerminToken(t0, t1)
class Token:
    """Base class for all tokens produced by the analysis pipeline.

    A token covers the inclusive character span [begin_char, end_char] of the
    source text and is linked into a doubly-linked chain via ``previous`` /
    ``next0_``.  Whitespace/newline context flags are computed lazily and
    cached in a private bit mask.
    """

    def __init__(self, kit_ : 'AnalysisKit', begin : int, end : int) -> None:
        self.kit = kit_               # owning AnalysisKit (provides the source text)
        self.begin_char = begin       # first covered character (inclusive)
        self.end_char = end           # last covered character (inclusive)
        self.tag = None               # free-form user payload
        self._m_previous = None       # backing field of `previous`
        self._m_next = None           # backing field of `next0_`
        self.__m_morph = None         # lazily created MorphCollection
        self.chars = None             # CharsInfo of the token
        self.__m_attrs = 0            # cached layout flags; bit 0 = "computed"

    @property
    def length_char(self) -> int:
        """Length of the token in source characters."""
        return self.end_char - self.begin_char + 1

    @property
    def previous(self) -> 'Token':
        """Preceding token in the chain (None for the first token)."""
        return self._m_previous

    @previous.setter
    def previous(self, value) -> 'Token':
        self._m_previous = value
        if value is not None:
            value._m_next = self
        # relinking invalidates the cached whitespace/newline flags
        self.__m_attrs = 0
        return value

    @property
    def next0_(self) -> 'Token':
        """Following token in the chain (None for the last token)."""
        return self._m_next

    @next0_.setter
    def next0_(self, value) -> 'Token':
        self._m_next = value
        if value is not None:
            value._m_previous = self
        self.__m_attrs = 0
        return value

    @property
    def morph(self) -> 'MorphCollection':
        """Morphological information (created lazily on first access)."""
        if self.__m_morph is None:
            self.__m_morph = MorphCollection()
        return self.__m_morph

    @morph.setter
    def morph(self, value) -> 'MorphCollection':
        self.__m_morph = value
        return value

    def __str__(self) -> str:
        # inclusive span, hence end_char + 1
        return self.kit.sofa.text[self.begin_char:self.end_char + 1]

    def __getAttr(self, i : int) -> bool:
        # Bit 0 marks that bits 1..4 (whitespace/newline before/after) are
        # already computed; fill them in on first access.
        if (self.__m_attrs & 1) == 0:
            self.__m_attrs = 1
            if self._m_previous is None:
                # the first token acts as if preceded by a line break
                self._setAttr(1, True)
                self._setAttr(3, True)
            else:
                for j in range(self._m_previous.end_char + 1, self.begin_char):
                    ch = self.kit.sofa.text[j]
                    if Utils.isWhitespace(ch):
                        self._setAttr(1, True)
                        if ord(ch) == 0xD or ord(ch) == 0xA or ch == '\f':
                            self._setAttr(3, True)
            if self._m_next is None:
                # the last token acts as if followed by a line break
                self._setAttr(2, True)
                self._setAttr(4, True)
            else:
                for j in range(self.end_char + 1, self._m_next.begin_char):
                    ch = self.kit.sofa.text[j]
                    if Utils.isWhitespace(ch):
                        self._setAttr(2, True)
                        if ord(ch) == 0xD or ord(ch) == 0xA or ch == '\f':
                            self._setAttr(4, True)
        return ((self.__m_attrs >> i) & 1) != 0

    def _setAttr(self, i : int, val : bool) -> None:
        """Set or clear bit *i* of the cached attribute mask."""
        mask = 1 << i
        if val:
            self.__m_attrs |= mask
        else:
            self.__m_attrs &= ~mask

    @property
    def is_whitespace_before(self) -> bool:
        """True when whitespace precedes the token."""
        return self.__getAttr(1)

    @is_whitespace_before.setter
    def is_whitespace_before(self, value) -> bool:
        self._setAttr(1, value)
        return value

    @property
    def is_whitespace_after(self) -> bool:
        """True when whitespace follows the token."""
        return self.__getAttr(2)

    @is_whitespace_after.setter
    def is_whitespace_after(self, value) -> bool:
        self._setAttr(2, value)
        return value

    @property
    def is_newline_before(self) -> bool:
        """True when the token starts a new line (always true for the first token)."""
        return self.__getAttr(3)

    @is_newline_before.setter
    def is_newline_before(self, value) -> bool:
        self._setAttr(3, value)
        return value

    @property
    def is_newline_after(self) -> bool:
        """True when the token ends its line (always true for the last token)."""
        return self.__getAttr(4)

    @is_newline_after.setter
    def is_newline_after(self, value) -> bool:
        self._setAttr(4, value)
        return value

    @property
    def inner_bool(self) -> bool:
        """Scratch flag used internally by the engine."""
        return self.__getAttr(5)

    @inner_bool.setter
    def inner_bool(self, value) -> bool:
        self._setAttr(5, value)
        return value

    @property
    def not_noun_phrase(self) -> bool:
        """Internal marker: no noun phrase starts here (avoids re-parsing)."""
        return self.__getAttr(6)

    @not_noun_phrase.setter
    def not_noun_phrase(self, value) -> bool:
        self._setAttr(6, value)
        return value

    @property
    def whitespaces_before_count(self) -> int:
        """Weighted whitespace before the token: newline=10, tab=5, space=1.
        Returns 100 for the very first token."""
        prev = self.previous
        if prev is None:
            return 100
        if prev.end_char + 1 == self.begin_char:
            return 0
        return self.__calcWhitespaces(prev.end_char + 1, self.begin_char - 1)

    @property
    def newlines_before_count(self) -> int:
        """Number of line breaks immediately before the token."""
        last_seen = chr(0)
        res = 0
        txt = self.kit.sofa.text
        for p in range(self.begin_char - 1, -1, -1):
            ch = txt[p]
            if ord(ch) == 0xA:
                res += 1
            elif ord(ch) == 0xD and ord(last_seen) != 0xA:
                # count a lone CR; the CR of a CRLF pair was counted via its LF
                res += 1
            elif ch == '\f':
                res += 10
            elif not Utils.isWhitespace(ch):
                break
            last_seen = ch
        return res

    @property
    def newlines_after_count(self) -> int:
        """Number of line breaks immediately after the token."""
        last_seen = chr(0)
        res = 0
        txt = self.kit.sofa.text
        for p in range(self.end_char + 1, len(txt)):
            ch = txt[p]
            if ord(ch) == 0xD:
                res += 1
            elif ord(ch) == 0xA and ord(last_seen) != 0xD:
                # count a lone LF; the LF of a CRLF pair was counted via its CR
                res += 1
            elif ch == '\f':
                res += 10
            elif not Utils.isWhitespace(ch):
                break
            last_seen = ch
        return res

    @property
    def whitespaces_after_count(self) -> int:
        """Weighted whitespace after the token: newline=10, tab=5, space=1.
        Returns 100 for the very last token."""
        nxt = self.next0_
        if nxt is None:
            return 100
        if self.end_char + 1 == nxt.begin_char:
            return 0
        return self.__calcWhitespaces(self.end_char + 1, nxt.begin_char - 1)

    def __calcWhitespaces(self, p0 : int, p1 : int) -> int:
        """Weighted whitespace count over [p0, p1]; -1 for an invalid range."""
        if p0 < 0 or p0 > p1 or p1 >= len(self.kit.sofa.text):
            return -1
        res = 0
        pos = p0
        while pos <= p1:
            ch = self.kit.getTextCharacter(pos)
            if ch == '\r' or ch == '\n':
                res += 10
                follower = self.kit.getTextCharacter(pos + 1)
                # a CRLF (or LFCR) pair counts as one line break
                if ch != follower and (follower == '\r' or follower == '\n'):
                    pos += 1
            elif ch == '\t':
                res += 5
            elif ch == '\u0007' or ch == '\f':
                res += 100
            else:
                res += 1
            pos += 1
        return res

    @property
    def is_hiphen(self) -> bool:
        """True when the token is a hyphen character."""
        return LanguageHelper.isHiphen(self.kit.sofa.text[self.begin_char])

    @property
    def is_table_control_char(self) -> bool:
        """True for table-layout control characters (07h, 1Eh, 1Fh)."""
        code = ord(self.kit.sofa.text[self.begin_char])
        return code == 7 or code == 0x1F or code == 0x1E

    @property
    def is_and(self) -> bool:
        """True for the conjunction AND (any language); overridden in subclasses."""
        return False

    @property
    def is_or(self) -> bool:
        """True for the conjunction OR (any language); overridden in subclasses."""
        return False

    @property
    def is_comma(self) -> bool:
        """True when the token is a comma."""
        return self.isChar(',')

    @property
    def is_comma_and(self) -> bool:
        """True when the token is a comma or the conjunction AND."""
        return self.is_comma or self.is_and

    def isChar(self, ch : 'char') -> bool:
        """True when the token consists of exactly the single character *ch*.

        Args:
            ch('char'): the character to test against.
        """
        if self.begin_char != self.end_char:
            return False
        return self.kit.sofa.text[self.begin_char] == ch

    def isCharOf(self, chars_ : str) -> bool:
        """True when the token is a single character contained in *chars_*.

        Args:
            chars_(str): the set of acceptable characters.
        """
        if self.begin_char != self.end_char:
            return False
        return self.kit.sofa.text[self.begin_char] in chars_

    def isValue(self, term : str, termua : str=None) -> bool:
        """Match against a normalized term; the base implementation never matches."""
        return False

    @property
    def is_letters(self) -> bool:
        """True for an alphabetic text token (TextToken); overridden in subclasses."""
        return False

    @property
    def is_number(self) -> bool:
        """True for a number in any of its representations; overridden in subclasses."""
        return False

    @property
    def is_referent(self) -> bool:
        """True when the token wraps an entity (Referent); overridden in subclasses."""
        return False

    def getReferent(self) -> 'Referent':
        """Entity attached to the token (meaningful for ReferentToken only)."""
        return None

    def getReferents(self) -> typing.List['Referent']:
        """All entities hidden under this token (an absorbing entity, e.g. an
        address, may contain others, e.g. a city)."""
        return None

    def getNormalCaseText(self, mc : 'MorphClass'=None, single_number : bool=False, gender : 'MorphGender'=MorphGender.UNDEFINED, keep_chars : bool=False) -> str:
        """Token text converted to nominative case.

        Args:
            mc(MorphClass): preferred morphological class.
            single_number(bool): also convert to singular.
        The base implementation simply returns the raw text.
        """
        return str(self)

    def getSourceText(self) -> str:
        """Exact fragment of the source text, or None for an invalid span."""
        span = self.end_char + 1 - self.begin_char
        if span < 1 or self.begin_char < 0:
            return None
        if self.begin_char + span > len(self.kit.sofa.text):
            return None
        return self.kit.sofa.text[self.begin_char:self.begin_char + span]

    def getMorphClassInDictionary(self) -> 'MorphClass':
        """Morphological class as found in the dictionary (refined in TextToken)."""
        return self.morph.class0_

    def _serialize(self, stream : io.IOBase) -> None:
        from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
        SerializerHelper.serializeInt(stream, self.begin_char)
        SerializerHelper.serializeInt(stream, self.end_char)
        SerializerHelper.serializeInt(stream, self.__m_attrs)
        SerializerHelper.serializeInt(stream, self.chars.value)
        # make sure a morph collection exists so the stream layout is stable
        if self.__m_morph is None:
            self.__m_morph = MorphCollection()
        self.__m_morph._serialize(stream)

    def _deserialize(self, stream : io.IOBase, kit_ : 'AnalysisKit', vers : int) -> None:
        from pullenti.ner.core.internal.SerializerHelper import SerializerHelper
        self.kit = kit_
        self.begin_char = SerializerHelper.deserializeInt(stream)
        self.end_char = SerializerHelper.deserializeInt(stream)
        self.__m_attrs = SerializerHelper.deserializeInt(stream)
        self.chars = CharsInfo._new2656(SerializerHelper.deserializeInt(stream))
        self.__m_morph = MorphCollection()
        self.__m_morph._deserialize(stream)
    def _createReferentToken(p : 'PersonReferent', begin : 'Token', end : 'Token', morph_ : 'MorphCollection', attrs : typing.List['PersonAttrToken'], ad : 'PersonAnalyzerData', for_attribute : bool, after_be_predicate : bool) -> 'ReferentToken':
        """Finalize a person match into a ReferentToken.

        Absorbs the attribute tokens found before the name into slots of *p*,
        normalizes the match morphology to singular, then scans the context to
        the right of *end* for more attributes, a parenthesized surname,
        contacts (phone/URI/address), identity documents and employer
        organizations, widening [begin..end] accordingly.

        Args:
            p(PersonReferent): person being built; None yields None.
            begin(Token): start of the match (may be moved left by attributes).
            end(Token): end of the match (may be moved right by the scans below).
            morph_(MorphCollection): match morphology; rebuilt to singular here.
            attrs(typing.List[PersonAttrToken]): pre-name attribute tokens, may be None.
            ad(PersonAnalyzerData): analyzer state (overflow guard, local ontology); may be None.
            for_attribute(bool): person is referenced as "named after ..." — return early.
            after_be_predicate(bool): not referenced in this body.
        """
        from pullenti.ner.person.internal.PersonIdentityToken import PersonIdentityToken
        if (p is None):
            return None
        has_prefix = False
        # --- Fold the pre-name attribute tokens into slots of p. ---
        if (attrs is not None):
            for a in attrs:
                if (a.typ == PersonAttrTerminType.BESTREGARDS):
                    has_prefix = True
                else:
                    if (a.begin_char < begin.begin_char):
                        # Attribute precedes the name — widen the span left.
                        begin = a.begin_token
                    if (a.typ != PersonAttrTerminType.PREFIX):
                        if (a.age is not None):
                            p.addSlot(PersonReferent.ATTR_AGE, a.age, False, 0)
                        if (a.prop_ref is None):
                            p.addSlot(PersonReferent.ATTR_ATTR, a.value, False, 0)
                        else:
                            p.addSlot(PersonReferent.ATTR_ATTR, a, False, 0)
                    elif (a.gender == MorphGender.FEMINIE and not p.is_female):
                        # Prefix attributes only contribute gender information.
                        p.is_female = True
                    elif (a.gender == MorphGender.MASCULINE and not p.is_male):
                        p.is_male = True
        elif ((isinstance(begin.previous, TextToken)) and (begin.whitespaces_before_count < 3)):
            # Special case: a preceding "ИП" token marks a sole proprietor.
            if ((begin.previous).term == "ИП"):
                a = PersonAttrToken(begin.previous, begin.previous)
                a.prop_ref = PersonPropertyReferent()
                a.prop_ref.name = "индивидуальный предприниматель"
                p.addSlot(PersonReferent.ATTR_ATTR, a, False, 0)
                begin = begin.previous
        # --- Rebuild morphology: force singular, fill gender from p when known. ---
        m0 = MorphCollection()
        for it in morph_.items:
            bi = MorphBaseInfo(it)
            bi.number = MorphNumber.SINGULAR
            if (bi.gender == MorphGender.UNDEFINED):
                if (p.is_male and not p.is_female):
                    bi.gender = MorphGender.MASCULINE
                if (not p.is_male and p.is_female):
                    bi.gender = MorphGender.FEMINIE
            m0.addItem(bi)
        morph_ = m0
        # Borrow case/number from the first attribute when the match has none.
        if ((attrs is not None and len(attrs) > 0 and not attrs[0].morph.case_.is_undefined) and morph_.case_.is_undefined):
            morph_.case_ = attrs[0].morph.case_
            if (attrs[0].morph.number == MorphNumber.SINGULAR):
                morph_.number = MorphNumber.SINGULAR
            if (p.is_male and not p.is_female):
                morph_.gender = MorphGender.MASCULINE
            elif (p.is_female):
                morph_.gender = MorphGender.FEMINIE
        # --- "named after ..." detection: ИМЕНИ / ИМ. just before the name. ---
        if (begin.previous is not None):
            ttt = begin.previous
            if (ttt.isValue("ИМЕНИ", "ІМЕНІ")):
                for_attribute = True
            else:
                if (ttt.isChar('.') and ttt.previous is not None):
                    ttt = ttt.previous
                if (ttt.whitespaces_after_count < 3):
                    if (ttt.isValue("ИМ", "ІМ")):
                        for_attribute = True
        if (for_attribute):
            return ReferentToken._new2329(p, begin, end, morph_, p._m_person_identity_typ)
        # --- Enumeration "X, Y and Z": copy a shared property from the list head. ---
        if ((begin.previous is not None and begin.previous.is_comma_and and (isinstance(begin.previous.previous, ReferentToken))) and (isinstance(begin.previous.previous.getReferent(), PersonReferent))):
            rt00 = Utils.asObjectOrNull(begin.previous.previous, ReferentToken)
            ttt = rt00
            # Walk left over "person, person, ..." to the first item of the enumeration.
            while ttt is not None:
                if (ttt.previous is None or not ((isinstance(ttt.previous.previous, ReferentToken)))):
                    break
                if (not ttt.previous.is_comma_and or not ((isinstance(ttt.previous.previous.getReferent(), PersonReferent)))):
                    break
                rt00 = (Utils.asObjectOrNull(ttt.previous.previous, ReferentToken))
                ttt = (rt00)
            if (isinstance(rt00.begin_token.getReferent(), PersonPropertyReferent)):
                ok = False
                if ((rt00.begin_token).end_token.next0_ is not None and (rt00.begin_token).end_token.next0_.isChar(':')):
                    ok = True
                elif (rt00.begin_token.morph.number == MorphNumber.PLURAL):
                    ok = True
                if (ok):
                    p.addSlot(PersonReferent.ATTR_ATTR, rt00.begin_token.getReferent(), False, 0)
        # Recursion/overflow guard shared through the analyzer data.
        if (ad is not None):
            if (ad.overflow_level > 10):
                return ReferentToken._new2329(p, begin, end, morph_, p._m_person_identity_typ)
            ad.overflow_level += 1
        attrs1 = None
        has_position = False
        open_br = False
        t = end.next0_
        # --- Scan right context for trailing attributes (emulated do-while). ---
        first_pass3095 = True
        while True:
            if first_pass3095:
                first_pass3095 = False
            else:
                t = t.next0_  # advance on every pass but the first
            if (not (t is not None)):
                break
            if (t.is_table_control_char):
                break
            if (t.is_newline_before):
                if (t.newlines_before_count > 2):
                    break
                if (attrs1 is not None and len(attrs1) > 0):
                    break
                ml = MailLine.parse(t, 0)
                if (ml is not None and ml.typ == MailLine.Types.FROM):
                    break
                if (t.chars.is_capital_upper):
                    # A capitalized new line may still continue the attribute list.
                    attr1 = PersonAttrToken.tryAttach(t, (None if ad is None else ad.local_ontology), PersonAttrToken.PersonAttrAttachAttrs.NO)
                    ok1 = False
                    if (attr1 is not None):
                        if (has_prefix or attr1.is_newline_after or ((attr1.end_token.next0_ is not None and attr1.end_token.next0_.is_table_control_char))):
                            ok1 = True
                        else:
                            tt2 = t.next0_
                            while tt2 is not None and tt2.end_char <= attr1.end_char:
                                if (tt2.is_whitespace_before):
                                    ok1 = True
                                tt2 = tt2.next0_
                    else:
                        ttt = PersonHelper.__correctTailAttributes(p, t)
                        if (ttt is not None and ttt != t):
                            t = ttt
                            end = t
                            continue
                    if (not ok1):
                        break
            if (t.is_hiphen or t.isCharOf("_>|")):
                continue
            if (t.isValue("МОДЕЛЬ", None)):
                break
            tt = PersonHelper.__correctTailAttributes(p, t)
            if (tt != t and tt is not None):
                t = tt
                end = t
                continue
            is_be = False
            if (t.isChar('(') and t == end.next0_):
                # "(Surname)" right after the match — try to attach it as a lastname.
                open_br = True
                t = t.next0_
                if (t is None):
                    break
                pit1 = PersonItemToken.tryAttach(t, None, PersonItemToken.ParseAttr.NO, None)
                if ((pit1 is not None and t.chars.is_capital_upper and pit1.end_token.next0_ is not None) and (isinstance(t, TextToken)) and pit1.end_token.next0_.isChar(')')):
                    if (pit1.lastname is not None):
                        inf = MorphBaseInfo._new2321(MorphCase.NOMINATIVE)
                        if (p.is_male):
                            inf.gender = Utils.valToEnum((inf.gender) | (MorphGender.MASCULINE), MorphGender)
                        if (p.is_female):
                            inf.gender = Utils.valToEnum((inf.gender) | (MorphGender.FEMINIE), MorphGender)
                        sur = PersonIdentityToken.createLastname(pit1, inf)
                        if (sur is not None):
                            p._addFioIdentity(sur, None, None)
                            t = pit1.end_token.next0_
                            end = t
                            continue
            elif (t.is_comma):
                t = t.next0_
                if ((isinstance(t, TextToken)) and (t).isValue("WHO", None)):
                    continue
            elif ((isinstance(t, TextToken)) and (t).is_verb_be):
                t = t.next0_
            elif (t.is_and and t.is_whitespace_after and not t.is_newline_after):
                if (t == end.next0_):
                    break
                t = t.next0_
            elif (t.is_hiphen and t == end.next0_):
                t = t.next0_
            elif (t.isChar('.') and t == end.next0_ and has_prefix):
                t = t.next0_
            ttt2 = PersonHelper.createNickname(p, t)
            if (ttt2 is not None):
                end = ttt2
                t = end
                continue
            if (t is None):
                break
            attr = None
            attr = PersonAttrToken.tryAttach(t, (None if ad is None else ad.local_ontology), PersonAttrToken.PersonAttrAttachAttrs.NO)
            if (attr is None):
                # No attribute token here; pick up a few terminal special cases, then stop.
                if ((t is not None and t.getReferent() is not None and t.getReferent().type_name == "GEO") and attrs1 is not None and open_br):
                    continue
                if ((t.chars.is_capital_upper and open_br and t.next0_ is not None) and t.next0_.isChar(')')):
                    if (p.findSlot(PersonReferent.ATTR_LASTNAME, None, True) is None):
                        p.addSlot(PersonReferent.ATTR_LASTNAME, t.getSourceText().upper(), False, 0)
                        t = t.next0_
                        end = t
                if (t is not None and t.isValue("КОТОРЫЙ", None) and t.morph.number == MorphNumber.SINGULAR):
                    # "который/которая" reveals the person's gender.
                    if (not p.is_female and t.morph.gender == MorphGender.FEMINIE):
                        p.is_female = True
                        p._correctData()
                    elif (not p.is_male and t.morph.gender == MorphGender.MASCULINE):
                        p.is_male = True
                        p._correctData()
                break
            if (attr.morph.number == MorphNumber.PLURAL):
                break
            if (attr.typ == PersonAttrTerminType.BESTREGARDS):
                break
            if (attr.is_doubt):
                if (has_prefix):
                    pass
                elif (t.is_newline_before and attr.is_newline_after):
                    pass
                elif (t.previous is not None and ((t.previous.is_hiphen or t.previous.isChar(':')))):
                    pass
                else:
                    break
            # Case agreement between match and candidate attribute.
            if (not morph_.case_.is_undefined and not attr.morph.case_.is_undefined):
                if (((morph_.case_) & attr.morph.case_).is_undefined and not is_be):
                    break
            if (open_br):
                if (PersonAnalyzer._tryAttachPerson(t, ad, False, 0, True) is not None):
                    break
            if (attrs1 is None):
                if (t.previous.is_comma and t.previous == end.next0_):
                    ttt = attr.end_token.next0_
                    if (ttt is not None):
                        if (ttt.morph.class0_.is_verb):
                            if (MiscHelper.canBeStartOfSentence(begin)):
                                pass
                            else:
                                break
                attrs1 = list()
            attrs1.append(attr)
            if (attr.typ == PersonAttrTerminType.POSITION or attr.typ == PersonAttrTerminType.KING):
                if (not is_be):
                    has_position = True
            elif (attr.typ != PersonAttrTerminType.PREFIX):
                if (attr.typ == PersonAttrTerminType.OTHER and attr.age is not None):
                    pass
                else:
                    # Unsupported trailing attribute kind — discard the whole tail.
                    attrs1 = (None)
                    break
            t = attr.end_token
        # --- Sanity checks: do the collected trailing attributes really belong to p? ---
        if (attrs1 is not None and has_position and attrs is not None):
            te1 = attrs[len(attrs) - 1].end_token.next0_
            te2 = attrs1[0].begin_token
            if (te1.whitespaces_after_count > te2.whitespaces_before_count and (te2.whitespaces_before_count < 2)):
                pass
            elif (attrs1[0].age is not None):
                pass
            elif (((te1.is_hiphen or te1.isChar(':'))) and not attrs1[0].is_newline_before and ((te2.previous.is_comma or te2.previous == end))):
                pass
            else:
                for a in attrs:
                    if (a.typ == PersonAttrTerminType.POSITION):
                        te = attrs1[len(attrs1) - 1].end_token
                        if (te.next0_ is not None):
                            if (not te.next0_.isChar('.')):
                                attrs1 = (None)
                                break
        if (attrs1 is not None and not has_prefix):
            attr = attrs1[len(attrs1) - 1]
            ok = False
            if (attr.end_token.next0_ is not None and attr.end_token.next0_.chars.is_capital_upper):
                ok = True
            else:
                rt = PersonAnalyzer._tryAttachPerson(attr.begin_token, ad, False, -1, False)
                if (rt is not None and (isinstance(rt.referent, PersonReferent))):
                    ok = True
            if (ok):
                # The last attribute may actually start the NEXT person — drop the tail.
                if (attr.begin_token.whitespaces_before_count > attr.end_token.whitespaces_after_count):
                    attrs1 = (None)
                elif (attr.begin_token.whitespaces_before_count == attr.end_token.whitespaces_after_count):
                    rt1 = PersonAnalyzer._tryAttachPerson(attr.begin_token, ad, False, -1, False)
                    if (rt1 is not None):
                        attrs1 = (None)
        # --- Commit the surviving trailing attributes into slots. ---
        if (attrs1 is not None):
            for a in attrs1:
                if (a.typ != PersonAttrTerminType.PREFIX):
                    if (a.age is not None):
                        p.addSlot(PersonReferent.ATTR_AGE, a.age, True, 0)
                    elif (a.prop_ref is None):
                        p.addSlot(PersonReferent.ATTR_ATTR, a.value, False, 0)
                    else:
                        p.addSlot(PersonReferent.ATTR_ATTR, a, False, 0)
                    end = a.end_token
                    if (a.gender != MorphGender.UNDEFINED and not p.is_female and not p.is_male):
                        if (a.gender == MorphGender.MASCULINE and not p.is_male):
                            p.is_male = True
                            p._correctData()
                        elif (a.gender == MorphGender.FEMINIE and not p.is_female):
                            p.is_female = True
                            p._correctData()
        if (open_br):
            if (end.next0_ is not None and end.next0_.isChar(')')):
                end = end.next0_
        # --- Second scan: contacts, identity documents, employer organizations. ---
        crlf_cou = 0
        t = end.next0_
        first_pass3096 = True
        while True:
            if first_pass3096:
                first_pass3096 = False
            else:
                t = t.next0_  # advance on every pass but the first
            if (not (t is not None)):
                break
            if (t.is_table_control_char):
                break
            if (t.is_newline_before):
                ml = MailLine.parse(t, 0)
                if (ml is not None and ml.typ == MailLine.Types.FROM):
                    break
                crlf_cou += 1
            if (t.isCharOf(":,(") or t.is_hiphen):
                continue
            if (t.isChar('.') and t == end.next0_):
                continue
            r = t.getReferent()
            if (r is not None):
                if (r.type_name == "PHONE" or r.type_name == "URI" or r.type_name == "ADDRESS"):
                    ty = r.getStringValue("SCHEME")
                    if (r.type_name == "URI"):
                        # Only these URI schemes count as personal contacts.
                        if ((ty != "mailto" and ty != "skype" and ty != "ICQ") and ty != "http"):
                            break
                    p._addContact(r)
                    end = t
                    crlf_cou = 0
                    continue
            if (isinstance(r, PersonIdentityReferent)):
                p.addSlot(PersonReferent.ATTR_IDDOC, r, False, 0)
                end = t
                crlf_cou = 0
                continue
            if (r is not None and r.type_name == "ORGANIZATION"):
                if (t.next0_ is not None and t.next0_.morph.class0_.is_verb):
                    break
                if (begin.previous is not None and begin.previous.morph.class0_.is_verb):
                    break
                if (t.whitespaces_after_count == 1):
                    break
                # Avoid duplicating an organization already referenced by a property slot.
                exist = False
                for s in p.slots:
                    if (s.type_name == PersonReferent.ATTR_ATTR and (isinstance(s.value, PersonPropertyReferent))):
                        pr = Utils.asObjectOrNull(s.value, PersonPropertyReferent)
                        if (pr.findSlot(PersonPropertyReferent.ATTR_REF, r, True) is not None):
                            exist = True
                            break
                    elif (s.type_name == PersonReferent.ATTR_ATTR and (isinstance(s.value, PersonAttrToken))):
                        pr = Utils.asObjectOrNull(s.value, PersonAttrToken)
                        if (pr.referent.findSlot(PersonPropertyReferent.ATTR_REF, r, True) is not None):
                            exist = True
                            break
                if (not exist):
                    # Record the organization as an "employee of ..." property.
                    pat = PersonAttrToken(t, t)
                    pat.prop_ref = PersonPropertyReferent._new2291("сотрудник")
                    pat.prop_ref.addSlot(PersonPropertyReferent.ATTR_REF, r, False, 0)
                    p.addSlot(PersonReferent.ATTR_ATTR, pat, False, 0)
                continue
            if (r is not None):
                break
            if (not has_prefix or crlf_cou >= 2):
                break
            rt = t.kit.processReferent("PERSON", t)
            if (rt is not None):
                break
        if (ad is not None):
            ad.overflow_level -= 1
        return ReferentToken._new2329(p, begin, end, morph_, p._m_person_identity_typ)
    def __try_parse_ru(t: 'Token', can_be_partition: bool, can_be_adj_partition: bool, force_parse: bool) -> 'VerbPhraseToken':
        """Try to parse a Russian verb phrase starting at token *t*.

        Consumes consecutive tokens classified by the local code ``ty``
        (1 = verb, 2 = adverb, 3 = adjectival participle — inferred from how
        each value is used below), handling "НЕ" negation and an optional
        leading preposition. Returns a VerbPhraseToken, or None when no
        actual verb (ty 1/3) was found.
        """
        res = None
        t0 = t
        not0_ = None
        has_verb = False
        verb_be_before = False
        prep = None
        # Emulated do-while: advance t on every pass but the first.
        first_pass3070 = True
        while True:
            if first_pass3070:
                first_pass3070 = False
            else:
                t = t.next0_
            if (not (t is not None)):
                break
            if (not (isinstance(t, TextToken))):
                break
            tt = Utils.asObjectOrNull(t, TextToken)
            is_participle = False
            if (tt.term == "НЕ"):
                # Remember the negation; it is attached to the next phrase item.
                not0_ = t
                continue
            ty = 0
            norm = None
            mc = tt.get_morph_class_in_dictionary()
            # --- Classify the current token into ty (0 means "stop here"). ---
            if (tt.term == "НЕТ"):
                if (has_verb):
                    break
                ty = 1
            elif (tt.term == "ДОПУСТИМО"):
                ty = 3
            elif (mc.is_adverb and not mc.is_verb):
                ty = 2
            elif (tt.is_pure_verb or tt.is_verb_be):
                ty = 1
                if (has_verb):
                    # A second finite verb ends the phrase unless it is an
                    # infinitive or follows a form of "to be".
                    if (not tt.morph.contains_attr("инф.", None)):
                        if (verb_be_before):
                            pass
                        else:
                            break
            elif (mc.is_verb):
                if (mc.is_preposition or mc.is_misc or mc.is_pronoun):
                    pass
                elif (mc.is_noun):
                    # Verb/noun homonyms: accept only in specific situations.
                    if (tt.term == "СТАЛИ" or tt.term == "СТЕКЛО" or tt.term == "БЫЛИ"):
                        ty = 1
                    elif (not tt.chars.is_all_lower and not MiscHelper.can_be_start_of_sentence(tt)):
                        ty = 1
                    elif (mc.is_adjective and can_be_partition):
                        ty = 1
                    elif (force_parse):
                        ty = 1
                elif (mc.is_proper):
                    if (tt.chars.is_all_lower):
                        ty = 1
                else:
                    ty = 1
                if (mc.is_adjective):
                    is_participle = True
                if (not tt.morph.case_.is_undefined):
                    is_participle = True
                if (not can_be_partition and is_participle):
                    break
                if (has_verb):
                    if (tt.morph.contains_attr("инф.", None)):
                        pass
                    elif (not is_participle):
                        pass
                    else:
                        break
            elif ((mc.is_adjective and tt.morph.contains_attr("к.ф.", None) and tt.term.endswith("О")) and NounPhraseHelper.try_parse(tt, NounPhraseParseAttr.NO, 0, None) is None):
                # Short-form adjective ending in -О used adverbially.
                ty = 2
            elif (mc.is_adjective and ((can_be_partition or can_be_adj_partition))):
                if (tt.morph.contains_attr("к.ф.", None) and not can_be_adj_partition):
                    break
                norm = tt.get_normal_case_text(MorphClass.ADJECTIVE, MorphNumber.SINGULAR, MorphGender.MASCULINE, False)
                if (norm.endswith("ЙШИЙ")):
                    pass
                else:
                    # Check the derivation dictionary: the adjective qualifies as a
                    # participle (ty=3) only if its group also contains a verb.
                    grs = DerivateService.find_derivates(norm, True, None)
                    if (grs is not None and len(grs) > 0):
                        hverb = False
                        hpart = False
                        for gr in grs:
                            for w in gr.words:
                                if (w.class0_.is_adjective and w.class0_.is_verb):
                                    if (w.spelling == norm):
                                        hpart = True
                                elif (w.class0_.is_verb):
                                    hverb = True
                        if (hpart and hverb):
                            ty = 3
                        elif (can_be_adj_partition):
                            ty = 3
                        if (ty != 3 and not Utils.isNullOrEmpty(grs[0].prefix) and norm.startswith(grs[0].prefix)):
                            # Retry the lookup with the derivational prefix stripped.
                            hverb = False
                            hpart = False
                            norm1 = norm[len(grs[0].prefix):]
                            grs = DerivateService.find_derivates(norm1, True, None)
                            if (grs is not None and len(grs) > 0):
                                for gr in grs:
                                    for w in gr.words:
                                        if (w.class0_.is_adjective and w.class0_.is_verb):
                                            if (w.spelling == norm1):
                                                hpart = True
                                        elif (w.class0_.is_verb):
                                            hverb = True
                                if (hpart and hverb):
                                    ty = 3
            if (ty == 0 and t == t0 and can_be_partition):
                # A leading preposition is allowed before a participial phrase.
                prep = PrepositionHelper.try_parse(t)
                if (prep is not None):
                    t = prep.end_token
                    continue
            if (ty == 0):
                break
            # --- Append the classified token as a phrase item. ---
            if (res is None):
                res = VerbPhraseToken(t0, t)
            res.end_token = t
            it = VerbPhraseItemToken._new603(t, t, MorphCollection(t.morph))
            if (not0_ is not None):
                it.begin_token = not0_
                it.not0_ = True
                not0_ = (None)
            it.is_adverb = ty == 2
            if (prep is not None and not t.morph.case_.is_undefined and len(res.items) == 0):
                # The preposition must agree in case with the first item.
                if (((prep.next_case) & t.morph.case_).is_undefined):
                    return None
                it.morph.remove_items(prep.next_case, False)
                res.preposition = prep
            if (norm is None):
                norm = t.get_normal_case_text((MorphClass.ADJECTIVE if ty == 3 else (MorphClass.ADVERB if ty == 2 else MorphClass.VERB)), MorphNumber.SINGULAR, MorphGender.MASCULINE, False)
                if (ty == 1 and not tt.morph.case_.is_undefined):
                    # Participle-like verb form: normalize via a synthesized
                    # wordform ("КК" prefix trick), then strip the prefix back off.
                    mi = MorphWordForm._new604(MorphCase.NOMINATIVE, MorphNumber.SINGULAR, MorphGender.MASCULINE)
                    for mit in tt.morph.items:
                        if (isinstance(mit, MorphWordForm)):
                            mi.misc = mit.misc
                            break
                    nnn = MorphologyService.get_wordform("КК" + t.term, mi)
                    if (nnn is not None):
                        norm = nnn[2:]
            it.normal = norm
            res.items.append(it)
            if (not has_verb and ((ty == 1 or ty == 3))):
                # First real verb fixes the phrase morphology.
                res.morph = it.morph
                has_verb = True
            if (ty == 1 or ty == 3):
                if (ty == 1 and tt.is_verb_be):
                    verb_be_before = True
                else:
                    verb_be_before = False
        if (not has_verb):
            return None
        # Trim trailing adverbs: they belong to whatever follows the phrase.
        for i in range(len(res.items) - 1, 0, -1):
            if (res.items[i].is_adverb):
                del res.items[i]
                res.end_token = res.items[i - 1].end_token
            else:
                break
        return res