def initialize() -> None:
    if (ShortNameHelper.M_INITED):
        return
    ShortNameHelper.M_INITED = True
    obj = PullentiNerPersonInternalResourceHelper.get_string("ShortNames.txt")
    if (obj is not None):
        kit = AnalysisKit(SourceOfAnalysis(obj))
        t = kit.first_token
        while t is not None:
            if (t.is_newline_before):
                g = (MorphGender.FEMINIE if t.is_value("F", None) else MorphGender.MASCULINE)
                t = t.next0_
                nam = t.term
                shos = list()
                t = t.next0_
                while t is not None:
                    if (t.is_newline_before):
                        break
                    else:
                        shos.append(t.term)
                    t = t.next0_
                for s in shos:
                    li = None
                    wrapli2599 = RefOutArgWrapper(None)
                    inoutres2600 = Utils.tryGetValue(ShortNameHelper.M_SHORTS_NAMES, s, wrapli2599)
                    li = wrapli2599.value
                    if (not inoutres2600):
                        li = list()
                        ShortNameHelper.M_SHORTS_NAMES[s] = li
                    li.append(ShortNameHelper.ShortnameVar._new2598(nam, g))
                if (t is None):
                    break
                t = t.previous
            t = t.next0_
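# A sketch of the "ShortNames.txt" layout that the loop above implies (the
# actual resource file is not shown here, so this layout is an inference, not
# a quote): each line starts with a gender-marker token ("F" means feminine,
# anything else is treated as masculine), followed by the full name and then
# its short variants:
#
#     F ALEXANDRA SASHA SHURA
#     M ALEXANDER SASHA SHURA
#
# For every short variant the loop appends a ShortnameVar (full name plus
# gender) to M_SHORTS_NAMES, keyed by the variant, so a later lookup like
# ShortNameHelper.M_SHORTS_NAMES.get("SASHA") yields all full-name candidates.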
def initialize() -> None:
    if (OrgGlobal.GLOBAL_ORGS is not None):
        return
    OrgGlobal.GLOBAL_ORGS = IntOntologyCollection()
    org0_ = None
    oi = None
    with ProcessorService.create_empty_processor() as geo_proc:
        geo_proc.add_analyzer(GeoAnalyzer())
        geos = dict()
        for k in range(3):
            lang = (MorphLang.RU if k == 0 else (MorphLang.EN if k == 1 else MorphLang.UA))
            name = ("Orgs_ru.dat" if k == 0 else ("Orgs_en.dat" if k == 1 else "Orgs_ua.dat"))
            dat = PullentiNerOrgInternalResourceHelper.get_bytes(name)
            if (dat is None):
                raise Utils.newException("Can't find resource file {0} in Organization analyzer".format(name), None)
            with MemoryStream(OrgItemTypeToken._deflate(dat)) as tmp:
                tmp.position = 0
                xml0_ = None  # new XmlDocument
                xml0_ = Utils.parseXmlFromStream(tmp)
                for x in xml0_.getroot():
                    org0_ = OrganizationReferent()
                    abbr = None
                    for xx in x:
                        if (Utils.getXmlLocalName(xx) == "typ"):
                            org0_.add_slot(OrganizationReferent.ATTR_TYPE, Utils.getXmlInnerText(xx), False, 0)
                        elif (Utils.getXmlLocalName(xx) == "nam"):
                            org0_.add_slot(OrganizationReferent.ATTR_NAME, Utils.getXmlInnerText(xx), False, 0)
                        elif (Utils.getXmlLocalName(xx) == "epo"):
                            org0_.add_slot(OrganizationReferent.ATTR_EPONYM, Utils.getXmlInnerText(xx), False, 0)
                        elif (Utils.getXmlLocalName(xx) == "prof"):
                            org0_.add_slot(OrganizationReferent.ATTR_PROFILE, Utils.getXmlInnerText(xx), False, 0)
                        elif (Utils.getXmlLocalName(xx) == "abbr"):
                            abbr = Utils.getXmlInnerText(xx)
                        elif (Utils.getXmlLocalName(xx) == "geo"):
                            geo_ = None
                            wrapgeo1767 = RefOutArgWrapper(None)
                            inoutres1768 = Utils.tryGetValue(geos, Utils.getXmlInnerText(xx), wrapgeo1767)
                            geo_ = wrapgeo1767.value
                            if (not inoutres1768):
                                ar = geo_proc.process(SourceOfAnalysis(Utils.getXmlInnerText(xx)), None, lang)
                                if (ar is not None and len(ar.entities) == 1 and (isinstance(ar.entities[0], GeoReferent))):
                                    geo_ = (Utils.asObjectOrNull(ar.entities[0], GeoReferent))
                                    geos[Utils.getXmlInnerText(xx)] = geo_
                                else:
                                    pass
                            if (geo_ is not None):
                                org0_.add_slot(OrganizationReferent.ATTR_GEO, geo_, False, 0)
                    oi = org0_.create_ontology_item_ex(2, True, True)
                    if (oi is None):
                        continue
                    if (abbr is not None):
                        oi.termins.append(Termin(abbr, None, True))
                    if (k == 2):
                        OrgGlobal.GLOBAL_ORGS_UA.add_item(oi)
                    else:
                        OrgGlobal.GLOBAL_ORGS.add_item(oi)
    return
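# The XML shape the loader above walks over, inferred from the element names
# used in the parsing code (the real Orgs_*.dat payloads are deflate-compressed,
# and the root/record element names here are assumptions):
#
#     <orgs>
#       <org>
#         <typ>университет</typ>
#         <nam>Московский государственный университет</nam>
#         <abbr>МГУ</abbr>
#         <geo>Москва</geo>
#       </org>
#     </orgs>
#
# "typ", "nam", "epo", "prof" and "geo" become slots of an OrganizationReferent;
# each <geo> value is resolved once through a GeoAnalyzer-only processor and
# cached in the local `geos` dict, so repeated place names are not re-analyzed.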
def __create_referent(self, type_name: str, definition_: str) -> typing.List['Referent']:
    analyzer = None
    wrapanalyzer2809 = RefOutArgWrapper(None)
    inoutres2810 = Utils.tryGetValue(self.__m_anal_by_type, type_name, wrapanalyzer2809)
    analyzer = wrapanalyzer2809.value
    if (not inoutres2810):
        return None
    sf = SourceOfAnalysis(definition_)
    ar = self.__m_processor._process(sf, True, True, None, None)
    if (ar is None or ar.first_token is None):
        return None
    r0 = ar.first_token.get_referent()
    t = None
    if (r0 is not None):
        if (r0.type_name != type_name):
            r0 = (None)
    if (r0 is not None):
        t = ar.first_token
    else:
        rt = analyzer.process_ontology_item(ar.first_token)
        if (rt is None):
            return None
        r0 = rt.referent
        t = rt.end_token
    t = t.next0_
    first_pass3432 = True
    while True:
        if first_pass3432:
            first_pass3432 = False
        else:
            t = t.next0_
        if (not (t is not None)):
            break
        if (t.is_char(';') and t.next0_ is not None):
            r1 = t.next0_.get_referent()
            if (r1 is None):
                rt = analyzer.process_ontology_item(t.next0_)
                if (rt is None):
                    continue
                t = rt.end_token
                r1 = rt.referent
            if (r1.type_name == type_name):
                r0.merge_slots(r1, True)
                r1.tag = r0
    if (r0 is None):
        return None
    r0.tag = r0
    r0 = analyzer._persist_analizer_data.register_referent(r0)
    self.__m_processor._create_res(ar.first_token.kit, ar, None, True)
    res = list()
    res.append(r0)
    for e0_ in ar.entities:
        if (e0_.tag is None):
            res.append(e0_)
    return res
def __createReferent(self, type_name: str, definition_: str) -> 'Referent':
    analyzer = None
    wrapanalyzer2643 = RefOutArgWrapper(None)
    inoutres2644 = Utils.tryGetValue(self.__m_anal_by_type, type_name, wrapanalyzer2643)
    analyzer = wrapanalyzer2643.value
    if (not inoutres2644):
        return None
    sf = SourceOfAnalysis(definition_)
    ar = self.__m_processor._process(sf, True, True, None, None)
    if (ar is None or ar.first_token is None):
        return None
    r0 = ar.first_token.getReferent()
    t = None
    if (r0 is not None):
        if (r0.type_name != type_name):
            r0 = (None)
    if (r0 is not None):
        t = ar.first_token
    else:
        rt = analyzer.processOntologyItem(ar.first_token)
        if (rt is None):
            return None
        r0 = rt.referent
        t = rt.end_token
    t = t.next0_
    first_pass3162 = True
    while True:
        if first_pass3162:
            first_pass3162 = False
        else:
            t = t.next0_
        if (not (t is not None)):
            break
        if (t.isChar(';') and t.next0_ is not None):
            r1 = t.next0_.getReferent()
            if (r1 is None):
                rt = analyzer.processOntologyItem(t.next0_)
                if (rt is None):
                    continue
                t = rt.end_token
                r1 = rt.referent
            if (r1.type_name == type_name):
                r0.mergeSlots(r1, True)
    if (r0 is not None):
        r0 = analyzer._persist_analizer_data.registerReferent(r0)
    return r0
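# Both variants above implement the same contract: `definition_` may hold
# several descriptions of one entity separated by ';', and every fragment that
# yields a referent of the requested `type_name` is merged into the first one
# (mergeSlots/merge_slots). Internally the call looks like this (the argument
# values are illustrative, not taken from the library):
#
#     self.__create_referent("ORGANIZATION", "ООО Рога и Копыта; Рога и Копыта")
#
# The newer variant additionally registers the merged referent in the
# analyzer's persistent data and returns the secondary entities alongside it.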
def main(args: typing.List[str]) -> None:
    sw = Stopwatch()
    # initialization - must be done once, before any texts are processed
    print("Initializing ... ", end="", flush=True)
    # initializes the engine and all available analyzers
    Sdk.initialize((MorphLang.RU) | MorphLang.EN)
    sw.stop()
    print("OK (by {0} ms), version {1}".format(
        sw.elapsedMilliseconds, ProcessorService.getVersion()), flush=True)
    # the text to analyze
    txt = "Единственным конкурентом «Трансмаша» на этом сомнительном тендере было ООО «Плассер Алека Рейл Сервис», основным владельцем которого является австрийская компания «СТЦ-Холдинг ГМБХ». До конца 2011 г. эта же фирма была совладельцем «Трансмаша» вместе с «Тако» Краснова. Зато совладельцем «Плассера», также до конца 2011 г., был тот самый Карл Контрус, который имеет четверть акций «Трансмаша». "
    print("Text: {0}".format(txt), flush=True)
    # run processing on an empty processor (without NER analyzers)
    are = ProcessorService.getEmptyProcessor().process(
        SourceOfAnalysis(txt), None, None)
    print("Noun groups: ", end="", flush=True)
    t = are.first_token
    # iterate over the tokens
    first_pass2703 = True
    while True:
        if first_pass2703:
            first_pass2703 = False
        else:
            t = t.next0_
        if (not (t is not None)):
            break
        # try to extract a noun phrase starting at the current token
        npt = NounPhraseHelper.tryParse(t, NounPhraseParseAttr.NO, 0)
        # no luck
        if (npt is None):
            continue
        # got one - print it in normalized form
        print("[{0}=>{1}] ".format(
            npt.getSourceText(),
            npt.getNormalCaseText(None, True, MorphGender.UNDEFINED, False)),
            end="", flush=True)
        # move the pointer to the last token of the noun phrase
        t = npt.end_token
    with ProcessorService.createProcessor() as proc:
        # analyze the text
        ar = proc.process(SourceOfAnalysis(txt), None, None)
        # resulting entities
        print("\r\n==========================================\r\nEntities: ", flush=True)
        for e0_ in ar.entities:
            print("{0}: {1}".format(e0_.type_name, str(e0_)), flush=True)
            for s in e0_.slots:
                print("   {0}: {1}".format(s.type_name, s.value), flush=True)
        # noun phrase extraction example
        print("\r\n==========================================\r\nNoun groups: ", flush=True)
        t = ar.first_token
        first_pass2704 = True
        while True:
            if first_pass2704:
                first_pass2704 = False
            else:
                t = t.next0_
            if (not (t is not None)):
                break
            # skip tokens that carry entities
            if (t.getReferent() is not None):
                continue
            # try to build a noun phrase
            npt = NounPhraseHelper.tryParse(
                t, NounPhraseParseAttr.ADJECTIVECANBELAST, 0)
            # no luck
            if (npt is None):
                continue
            print(npt, flush=True)
            # move the pointer to the last token of the group
            t = npt.end_token
    with ProcessorService.createSpecificProcessor(
            KeywordAnalyzer.ANALYZER_NAME) as proc:
        ar = proc.process(SourceOfAnalysis(txt), None, None)
        print("\r\n==========================================\r\nKeywords1: ", flush=True)
        for e0_ in ar.entities:
            if (isinstance(e0_, KeywordReferent)):
                print(e0_, flush=True)
        print("\r\n==========================================\r\nKeywords2: ", flush=True)
        t = ar.first_token
        first_pass2705 = True
        while True:
            if first_pass2705:
                first_pass2705 = False
            else:
                t = t.next0_
            if (not (t is not None)):
                break
            if (isinstance(t, ReferentToken)):
                kw = Utils.asObjectOrNull(t.getReferent(), KeywordReferent)
                if (kw is None):
                    continue
                kwstr = MiscHelper.getTextValueOfMetaToken(
                    Utils.asObjectOrNull(t, ReferentToken),
                    Utils.valToEnum(
                        (GetTextAttr.FIRSTNOUNGROUPTONOMINATIVESINGLE) | (GetTextAttr.KEEPREGISTER),
                        GetTextAttr))
                print("{0} = {1}".format(kwstr, kw), flush=True)
    print("Over!", flush=True)
def __call__(self, text):
    text = preprocess(text)
    sofa = SourceOfAnalysis(text)
    raw = self.raw.process(sofa)
    return convert_result(text, raw)
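# A usage sketch for the wrapper above, assuming it is the __call__ of a
# processor-wrapper class (the class name is hypothetical; `preprocess` and
# `convert_result` are helpers defined elsewhere in the same project, and
# `self.raw` is expected to hold a pullenti Processor whose process() accepts
# a single SourceOfAnalysis):
#
#     extractor = Extractor()                   # hypothetical constructor
#     matches = extractor("Текст для анализа")  # preprocess -> process -> convert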
class AnalysisKit:
    """ Internal analytic data container """

    def __init__(self, sofa_: 'SourceOfAnalysis' = None,
                 only_tokenizing: bool = False,
                 lang: 'MorphLang' = None,
                 progress: EventHandler = None) -> None:
        self._start_date = datetime.datetime(1, 1, 1, 0, 0, 0)
        self.corrected_tokens = None
        self.first_token = None
        self.__m_entities = list()
        self.ontology = None
        self.base_language = MorphLang()
        self.__m_sofa = None
        self.statistics = None
        self.__m_datas = dict()
        self.misc_data = dict()
        self.processor = None
        self.recurse_level = 0
        self._m_analyzer_stack = list()
        if (sofa_ is None):
            return
        self.__m_sofa = sofa_
        self._start_date = datetime.datetime.now()
        tokens = Morphology.process(sofa_.text, lang, None)
        t0 = None
        if (tokens is not None):
            ii = 0
            while ii < len(tokens):
                mt = tokens[ii]
                if (mt.begin_char == 733860):
                    pass
                tt = TextToken(mt, self)
                if (sofa_.correction_dict is not None):
                    wrapcorw539 = RefOutArgWrapper(None)
                    inoutres540 = Utils.tryGetValue(sofa_.correction_dict, mt.term, wrapcorw539)
                    corw = wrapcorw539.value
                    if (inoutres540):
                        ccc = Morphology.process(corw, lang, None)
                        if (ccc is not None and len(ccc) == 1):
                            tt1 = TextToken._new538(ccc[0], self, tt.term)
                            tt1.begin_char = tt.begin_char
                            tt1.end_char = tt.end_char
                            tt1.chars = tt.chars
                            tt = tt1
                            if (self.corrected_tokens is None):
                                self.corrected_tokens = dict()
                            self.corrected_tokens[tt] = tt.getSourceText()
                if (t0 is None):
                    self.first_token = (tt)
                else:
                    t0.next0_ = tt
                t0 = (tt)
                ii += 1
        if (sofa_.clear_dust):
            self.__clearDust()
        if (sofa_.do_words_merging_by_morph):
            self.__correctWordsByMerging(lang)
        if (sofa_.do_word_correction_by_morph):
            self.__correctWordsByMorph(lang)
        self.__mergeLetters()
        self.__defineBaseLanguage()
        t = self.first_token
        first_pass2794 = True
        while True:
            if first_pass2794:
                first_pass2794 = False
            else:
                t = t.next0_
            if (not (t is not None)):
                break
            nt = NumberHelper._tryParseNumber(t)
            if (nt is None):
                continue
            self.embedToken(nt)
            t = (nt)
        if (only_tokenizing):
            return
        t = self.first_token
        first_pass2795 = True
        while True:
            if first_pass2795:
                first_pass2795 = False
            else:
                t = t.next0_
            if (not (t is not None)):
                break
            if (t.morph.class0_.is_preposition):
                continue
            mc = t.getMorphClassInDictionary()
            if (mc.is_undefined and t.chars.is_cyrillic_letter and t.length_char > 4):
                tail = sofa_.text[t.end_char - 1:t.end_char - 1 + 2]
                tte = None
                tt = t.previous
                if (tt is not None and ((tt.is_comma_and or tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction))):
                    tt = tt.previous
                if ((tt is not None and not tt.getMorphClassInDictionary().is_undefined and (((tt.morph.class0_.value) & (t.morph.class0_.value))) != 0) and tt.length_char > 4):
                    tail2 = sofa_.text[tt.end_char - 1:tt.end_char - 1 + 2]
                    if (tail2 == tail):
                        tte = tt
                if (tte is None):
                    tt = t.next0_
                    if (tt is not None and ((tt.is_comma_and or tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction))):
                        tt = tt.next0_
                    if ((tt is not None and not tt.getMorphClassInDictionary().is_undefined and (((tt.morph.class0_.value) & (t.morph.class0_.value))) != 0) and tt.length_char > 4):
                        tail2 = sofa_.text[tt.end_char - 1:tt.end_char - 1 + 2]
                        if (tail2 == tail):
                            tte = tt
                if (tte is not None):
                    t.morph.removeItemsEx(tte.morph, tte.getMorphClassInDictionary())
                continue
        self.__createStatistics()

    def _initFrom(self, ar: 'AnalysisResult') -> None:
        self.__m_sofa = ar.sofas[0]
        self.first_token = ar.first_token
        self.base_language = ar.base_language
        self.__createStatistics()

    def __clearDust(self) -> None:
        t = self.first_token
        first_pass2796 = True
        while True:
            if first_pass2796:
                first_pass2796 = False
            else:
                t = t.next0_
            if (not (t is not None)):
                break
            cou = AnalysisKit.__calcAbnormalCoef(t)
            norm = 0
            if (cou < 1):
                continue
            t1 = t
            tt = t
            first_pass2797 = True
            while True:
                if first_pass2797:
                    first_pass2797 = False
                else:
                    tt = tt.next0_
                if (not (tt is not None)):
                    break
                co = AnalysisKit.__calcAbnormalCoef(tt)
                if (co == 0):
                    continue
                if (co < 0):
                    norm += 1
                    if (norm > 1):
                        break
                else:
                    norm = 0
                cou += co
                t1 = tt
            len0_ = t1.end_char - t.begin_char
            if (cou > 20 and len0_ > 500):
                p = t.begin_char
                while p < t1.end_char:
                    if (self.sofa.text[p] == self.sofa.text[p + 1]):
                        len0_ -= 1
                    p += 1
                if (len0_ > 500):
                    if (t.previous is not None):
                        t.previous.next0_ = t1.next0_
                    else:
                        self.first_token = t1.next0_
                    t = t1
                else:
                    t = t1
            else:
                t = t1

    @staticmethod
    def __calcAbnormalCoef(t: 'Token') -> int:
        if (isinstance(t, NumberToken)):
            return 0
        tt = Utils.asObjectOrNull(t, TextToken)
        if (tt is None):
            return 0
        if (not tt.chars.is_letter):
            return 0
        if (not tt.chars.is_latin_letter and not tt.chars.is_cyrillic_letter):
            return 2
        if (tt.length_char < 4):
            return 0
        for wf in tt.morph.items:
            if ((wf).is_in_dictionary):
                return -1
        if (tt.length_char > 15):
            return 2
        return 1

    def __correctWordsByMerging(self, lang: 'MorphLang') -> None:
        t = self.first_token
        first_pass2798 = True
        while True:
            if first_pass2798:
                first_pass2798 = False
            else:
                t = t.next0_
            if (not (t is not None and t.next0_ is not None)):
                break
            if (not t.chars.is_letter or (t.length_char < 2)):
                continue
            mc0 = t.getMorphClassInDictionary()
            if (t.morph.containsAttr("прдктв.", None)):
                continue
            t1 = t.next0_
            if (t1.is_hiphen and t1.next0_ is not None and not t1.is_newline_after):
                t1 = t1.next0_
                if (t1.length_char == 1):
                    continue
            if (not t1.chars.is_letter or not t.chars.is_letter or t1.chars.is_latin_letter != t.chars.is_latin_letter):
                continue
            if (t1.chars.is_all_upper and not t.chars.is_all_upper):
                continue
            elif (not t1.chars.is_all_lower):
                continue
            elif (t.chars.is_all_upper):
                continue
            if (t1.morph.containsAttr("прдктв.", None)):
                continue
            mc1 = t1.getMorphClassInDictionary()
            if (not mc1.is_undefined and not mc0.is_undefined):
                continue
            if ((len((t).term) + len((t1).term)) < 6):
                continue
            corw = (t).term + (t1).term
            ccc = Morphology.process(corw, lang, None)
            if (ccc is None or len(ccc) != 1):
                continue
            if (corw == "ПОСТ" or corw == "ВРЕД"):
                continue
            tt = TextToken(ccc[0], self)
            if (tt.getMorphClassInDictionary().is_undefined):
                continue
            tt.begin_char = t.begin_char
            tt.end_char = t1.end_char
            tt.chars = t.chars
            if (t == self.first_token):
                self.first_token = (tt)
            else:
                t.previous.next0_ = tt
            if (t1.next0_ is not None):
                tt.next0_ = t1.next0_
            t = (tt)

    def __correctWordsByMorph(self, lang: 'MorphLang') -> None:
        tt = self.first_token
        first_pass2799 = True
        while True:
            if first_pass2799:
                first_pass2799 = False
            else:
                tt = tt.next0_
            if (not (tt is not None)):
                break
            if (not ((isinstance(tt, TextToken)))):
                continue
            if (tt.morph.containsAttr("прдктв.", None)):
                continue
            dd = tt.getMorphClassInDictionary()
            if (not dd.is_undefined or (tt.length_char < 4)):
                continue
            if (tt.morph.class0_.is_proper_surname and not tt.chars.is_all_lower):
                continue
            if (tt.chars.is_all_upper):
                continue
            corw = Morphology.correctWord((tt).term, (lang if tt.morph.language.is_undefined else tt.morph.language))
            if (corw is None):
                continue
            ccc = Morphology.process(corw, lang, None)
            if (ccc is None or len(ccc) != 1):
                continue
            tt1 = TextToken._new541(ccc[0], self, tt.chars, tt.begin_char, tt.end_char, (tt).term)
            mc = tt1.getMorphClassInDictionary()
            if (mc.is_proper_surname):
                continue
            if (tt == self.first_token):
                self.first_token = (tt1)
            else:
                tt.previous.next0_ = tt1
            tt1.next0_ = tt.next0_
            tt = (tt1)
            if (self.corrected_tokens is None):
                self.corrected_tokens = dict()
            self.corrected_tokens[tt] = tt.getSourceText()

    def __mergeLetters(self) -> None:
        before_word = False
        tmp = io.StringIO()
        t = self.first_token
        first_pass2800 = True
        while True:
            if first_pass2800:
                first_pass2800 = False
            else:
                t = t.next0_
            if (not (t is not None)):
                break
            tt = Utils.asObjectOrNull(t, TextToken)
            if (not tt.chars.is_letter or tt.length_char != 1):
                before_word = False
                continue
            i = t.whitespaces_before_count
            if (i > 2 or ((i == 2 and before_word))):
                pass
            else:
                before_word = False
                continue
            i = 0
            Utils.setLengthStringIO(tmp, 0)
            print(tt.getSourceText(), end="", file=tmp)
            t1 = t
            while t1.next0_ is not None:
                tt = (Utils.asObjectOrNull(t1.next0_, TextToken))
                if (tt.length_char != 1 or tt.whitespaces_before_count != 1):
                    break
                i += 1
                print(tt.getSourceText(), end="", file=tmp)
                t1 = t1.next0_
            if (i > 3 or ((i > 1 and before_word))):
                pass
            else:
                before_word = False
                continue
            before_word = False
            mt = Morphology.process(Utils.toStringStringIO(tmp), None, None)
            if (mt is None or len(mt) != 1):
                t = t1
                continue
            for wf in mt[0].word_forms:
                if (wf.is_in_dictionary):
                    before_word = True
                    break
            if (not before_word):
                t = t1
                continue
            tt = TextToken(mt[0], self)
            if (t == self.first_token):
                self.first_token = (tt)
            else:
                tt.previous = t.previous
            tt.next0_ = t1.next0_
            tt.begin_char = t.begin_char
            tt.end_char = t1.end_char
            t = (tt)

    def embedToken(self, mt: 'MetaToken') -> None:
        """ Embed a token into the main token chain
        Args:
            mt(MetaToken):
        """
        if (mt is None):
            return
        if (mt.begin_char > mt.end_char):
            bg = mt.begin_token
            mt.begin_token = mt.end_token
            mt.end_token = bg
        if (mt.begin_char > mt.end_char):
            return
        if (mt.begin_token == self.first_token):
            self.first_token = (mt)
        else:
            tp = mt.begin_token.previous
            mt.previous = tp
        tn = mt.end_token.next0_
        mt.next0_ = tn
        if (isinstance(mt, ReferentToken)):
            if ((mt).referent is not None):
                (mt).referent.addOccurence(TextAnnotation._new542(self.sofa, mt.begin_char, mt.end_char))

    def debedToken(self, t: 'Token') -> 'Token':
        """ Remove a metatoken from the chain, restoring the original state
        Args:
            t(Token):
        Returns:
            Token: the first token of the removed metatoken
        """
        r = t.getReferent()
        if (r is not None):
            for o in r.occurrence:
                if (o.begin_char == t.begin_char and o.end_char == t.end_char):
                    r.occurrence.remove(o)
                    break
        mt = Utils.asObjectOrNull(t, MetaToken)
        if (mt is None):
            return t
        if (t.next0_ is not None):
            t.next0_.previous = mt.end_token
        if (t.previous is not None):
            t.previous.next0_ = mt.begin_token
        if (mt == self.first_token):
            self.first_token = mt.begin_token
        if (r is not None and len(r.occurrence) == 0):
            for d in self.__m_datas.values():
                if (r in d.referents):
                    d.removeReferent(r)
                    break
        return mt.begin_token

    @property
    def entities(self) -> typing.List['Referent']:
        """ The list of entities extracted during analysis """
        return self.__m_entities

    @property
    def sofa(self) -> 'SourceOfAnalysis':
        """ Reference to the source text """
        if (self.__m_sofa is None):
            self.__m_sofa = SourceOfAnalysis("")
        return self.__m_sofa

    def getTextCharacter(self, position: int) -> 'char':
        """ Get a character from the source text
        Args:
            position(int): position
        Returns:
            'char': the character (0 if out of range)
        """
        if ((position < 0) or position >= len(self.__m_sofa.text)):
            return chr(0)
        return self.__m_sofa.text[position]

    def getAnalyzerDataByAnalyzerName(self, analyzer_name: str) -> 'AnalyzerData':
        a = self.processor.findAnalyzer(analyzer_name)
        if (a is None):
            return None
        return self.getAnalyzerData(a)

    def getAnalyzerData(self, analyzer: 'Analyzer') -> 'AnalyzerData':
        """ Work with the analyzers' local data
        Args:
            analyzer(Analyzer):
        """
        if (analyzer is None or analyzer.name is None):
            return None
        wrapd543 = RefOutArgWrapper(None)
        inoutres544 = Utils.tryGetValue(self.__m_datas, analyzer.name, wrapd543)
        d = wrapd543.value
        if (inoutres544):
            d.kit = self
            return d
        default_data = analyzer.createAnalyzerData()
        if (default_data is None):
            return None
        if (analyzer._persist_referents_regim):
            if (analyzer._persist_analizer_data is None):
                analyzer._persist_analizer_data = default_data
            else:
                default_data = analyzer._persist_analizer_data
        self.__m_datas[analyzer.name] = default_data
        default_data.kit = self
        return default_data

    def __createStatistics(self) -> None:
        from pullenti.ner.core.StatisticCollection import StatisticCollection
        self.statistics = StatisticCollection()
        self.statistics.prepare(self.first_token)

    def __defineBaseLanguage(self) -> None:
        stat = dict()
        total = 0
        t = self.first_token
        first_pass2801 = True
        while True:
            if first_pass2801:
                first_pass2801 = False
            else:
                t = t.next0_
            if (not (t is not None)):
                break
            tt = Utils.asObjectOrNull(t, TextToken)
            if (tt is None):
                continue
            if (tt.morph.language.is_undefined):
                continue
            if (not tt.morph.language.value in stat):
                stat[tt.morph.language.value] = 1
            else:
                stat[tt.morph.language.value] += 1
            total += 1
        val = 0
        for kp in stat.items():
            if (kp[1] > (math.floor(total / 2))):
                val |= kp[0]
        self.base_language.value = val

    def replaceReferent(self, old_referent: 'Referent', new_referent: 'Referent') -> None:
        """ Replace the old entity with the new one everywhere possible
        (used when merging entities)
        Args:
            old_referent(Referent):
            new_referent(Referent):
        """
        t = self.first_token
        while t is not None:
            if (isinstance(t, ReferentToken)):
                (t)._replaceReferent(old_referent, new_referent)
            t = t.next0_
        for d in self.__m_datas.values():
            for r in d.referents:
                for s in r.slots:
                    if (s.value == old_referent):
                        r.uploadSlot(s, new_referent)
            if (old_referent in d.referents):
                d.referents.remove(old_referent)

    def processReferent(self, analyzer_name: str, t: 'Token') -> 'ReferentToken':
        if (self.processor is None):
            return None
        if (analyzer_name in self._m_analyzer_stack):
            return None
        if (self.is_recurce_overflow):
            return None
        a = self.processor.findAnalyzer(analyzer_name)
        if (a is None):
            return None
        self.recurse_level += 1
        self._m_analyzer_stack.append(analyzer_name)
        res = a._processReferent(t, None)
        self._m_analyzer_stack.remove(analyzer_name)
        self.recurse_level -= 1
        return res

    def createReferent(self, type_name: str) -> 'Referent':
        if (self.processor is None):
            return None
        else:
            for a in self.processor.analyzers:
                res = a.createReferent(type_name)
                if (res is not None):
                    return res
        return None

    def refreshGenerals(self) -> None:
        GeneralRelationHelper.refreshGenerals(self.processor, self)

    @property
    def is_recurce_overflow(self) -> bool:
        return self.recurse_level > 5

    def serialize(self, stream: io.IOBase) -> None:
        Utils.writeByteIO(stream, 0xAA)
        Utils.writeByteIO(stream, 1)
        self.__m_sofa.serialize(stream)
        SerializerHelper.serializeInt(stream, self.base_language.value)
        if (len(self.__m_entities) == 0):
            for d in self.__m_datas.items():
                self.__m_entities.extend(d[1].referents)
        SerializerHelper.serializeInt(stream, len(self.__m_entities))
        i = 0
        while i < len(self.__m_entities):
            self.__m_entities[i].tag = i + 1
            SerializerHelper.serializeString(stream, self.__m_entities[i].type_name)
            i += 1
        for e0_ in self.__m_entities:
            e0_.serialize(stream)
        SerializerHelper.serializeTokens(stream, self.first_token, 0)

    def deserialize(self, stream: io.IOBase) -> bool:
        vers = 0
        b = Utils.readByteIO(stream)
        if (b == (0xAA)):
            b = (Utils.readByteIO(stream))
            vers = (b)
        else:
            stream.seek(stream.tell() - (1), io.SEEK_SET)
        self.__m_sofa = SourceOfAnalysis(None)
        self.__m_sofa.deserialize(stream)
        self.base_language = MorphLang._new5(SerializerHelper.deserializeInt(stream))
        self.__m_entities = list()
        cou = SerializerHelper.deserializeInt(stream)
        i = 0
        while i < cou:
            typ = SerializerHelper.deserializeString(stream)
            r = ProcessorService.createReferent(typ)
            if (r is None):
                r = Referent("UNDEFINED")
            self.__m_entities.append(r)
            i += 1
        i = 0
        while i < cou:
            self.__m_entities[i].deserialize(stream, self.__m_entities, self.__m_sofa)
            i += 1
        self.first_token = SerializerHelper.deserializeTokens(stream, self, vers)
        self.__createStatistics()
        return True

    @staticmethod
    def _new2668(_arg1: 'Processor', _arg2: 'ExtOntology') -> 'AnalysisKit':
        res = AnalysisKit()
        res.processor = _arg1
        res.ontology = _arg2
        return res

    @staticmethod
    def _new2669(_arg1: 'SourceOfAnalysis', _arg2: bool, _arg3: 'MorphLang', _arg4: EventHandler, _arg5: 'ExtOntology', _arg6: 'Processor') -> 'AnalysisKit':
        res = AnalysisKit(_arg1, _arg2, _arg3, _arg4)
        res.ontology = _arg5
        res.processor = _arg6
        return res
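# A round-trip sketch for the serialize()/deserialize() pair above (an
# illustration, assuming `kit` is an AnalysisKit built from a processed text;
# this older API reads and writes ordinary io streams, so an in-memory buffer
# suffices):
#
#     buf = io.BytesIO()
#     kit.serialize(buf)       # 0xAA marker, version byte, sofa, entities, tokens
#     buf.seek(0)
#     kit2 = AnalysisKit()     # empty container (sofa_ is None, returns early)
#     kit2.deserialize(buf)    # version 0 is assumed when the marker is absent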
class AnalysisKit:
    """ Internal analytic data container. Created automatically when
    Processor.Process(...) is called. Every Token references, through its Kit
    field, the container associated with the text being processed. """

    def __init__(self, sofa_: 'SourceOfAnalysis' = None,
                 only_tokenizing: bool = False,
                 lang: 'MorphLang' = None,
                 progress: EventHandler = None) -> None:
        self._start_date = datetime.datetime(1, 1, 1, 0, 0, 0)
        self.corrected_tokens = None
        self.first_token = None
        self.__m_entities = list()
        self.ontology = None
        self.base_language = MorphLang()
        self.__m_sofa = None
        self.statistics = None
        self.__m_datas = dict()
        self.misc_data = dict()
        self.processor = None
        self.recurse_level = 0
        self._m_analyzer_stack = list()
        self.onto_regime = False
        if (sofa_ is None):
            return
        self.__m_sofa = sofa_
        self._start_date = datetime.datetime.now()
        tokens = MorphologyService.process(sofa_.text, lang, None)
        t0 = None
        if (tokens is not None):
            ii = 0
            while ii < len(tokens):
                mt = tokens[ii]
                if (mt.begin_char == 733860):
                    pass
                tt = TextToken(mt, self)
                if (sofa_.correction_dict is not None):
                    corw = None
                    wrapcorw471 = RefOutArgWrapper(None)
                    inoutres472 = Utils.tryGetValue(sofa_.correction_dict, mt.term, wrapcorw471)
                    corw = wrapcorw471.value
                    if (inoutres472):
                        ccc = MorphologyService.process(corw, lang, None)
                        if (ccc is not None and len(ccc) == 1):
                            tt1 = TextToken._new470(ccc[0], self, tt.begin_char, tt.end_char, tt.term)
                            tt1.chars = tt.chars
                            tt = tt1
                            if (self.corrected_tokens is None):
                                self.corrected_tokens = dict()
                            self.corrected_tokens[tt] = tt.get_source_text()
                if (t0 is None):
                    self.first_token = (tt)
                else:
                    t0.next0_ = tt
                t0 = (tt)
                ii += 1
        if (sofa_.clear_dust):
            self.__clear_dust()
        if (sofa_.do_words_merging_by_morph):
            self.__correct_words_by_merging(lang)
        if (sofa_.do_word_correction_by_morph):
            self.__correct_words_by_morph(lang)
        self.__merge_letters()
        self.__define_base_language()
        if (sofa_.create_number_tokens):
            t = self.first_token
            first_pass3049 = True
            while True:
                if first_pass3049:
                    first_pass3049 = False
                else:
                    t = t.next0_
                if (not (t is not None)):
                    break
                nt = NumberHelper._try_parse_number(t)
                if (nt is None):
                    continue
                self.embed_token(nt)
                t = (nt)
        if (only_tokenizing):
            return
        t = self.first_token
        first_pass3050 = True
        while True:
            if first_pass3050:
                first_pass3050 = False
            else:
                t = t.next0_
            if (not (t is not None)):
                break
            if (t.morph.class0_.is_preposition):
                continue
            mc = t.get_morph_class_in_dictionary()
            if (mc.is_undefined and t.chars.is_cyrillic_letter and t.length_char > 4):
                tail = sofa_.text[t.end_char - 1:t.end_char - 1 + 2]
                tte = None
                tt = t.previous
                if (tt is not None and ((tt.is_comma_and or tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction))):
                    tt = tt.previous
                if ((tt is not None and not tt.get_morph_class_in_dictionary().is_undefined and (((tt.morph.class0_.value) & (t.morph.class0_.value))) != 0) and tt.length_char > 4):
                    tail2 = sofa_.text[tt.end_char - 1:tt.end_char - 1 + 2]
                    if (tail2 == tail):
                        tte = tt
                if (tte is None):
                    tt = t.next0_
                    if (tt is not None and ((tt.is_comma_and or tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction))):
                        tt = tt.next0_
                    if ((tt is not None and not tt.get_morph_class_in_dictionary().is_undefined and (((tt.morph.class0_.value) & (t.morph.class0_.value))) != 0) and tt.length_char > 4):
                        tail2 = sofa_.text[tt.end_char - 1:tt.end_char - 1 + 2]
                        if (tail2 == tail):
                            tte = tt
                if (tte is not None):
                    t.morph.remove_items_ex(tte.morph, tte.get_morph_class_in_dictionary())
                continue
        self.__create_statistics()

    def _init_from(self, ar: 'AnalysisResult') -> None:
        self.__m_sofa = ar.sofa
        self.first_token = ar.first_token
        self.base_language = ar.base_language
        self.__create_statistics()

    def __clear_dust(self) -> None:
        t = self.first_token
        first_pass3051 = True
        while True:
            if first_pass3051:
                first_pass3051 = False
            else:
                t = t.next0_
            if (not (t is not None)):
                break
            cou = AnalysisKit.__calc_abnormal_coef(t)
            norm = 0
            if (cou < 1):
                continue
            t1 = t
            tt = t
            first_pass3052 = True
            while True:
                if first_pass3052:
                    first_pass3052 = False
                else:
                    tt = tt.next0_
                if (not (tt is not None)):
                    break
                co = AnalysisKit.__calc_abnormal_coef(tt)
                if (co == 0):
                    continue
                if (co < 0):
                    norm += 1
                    if (norm > 1):
                        break
                else:
                    norm = 0
                cou += co
                t1 = tt
            len0_ = t1.end_char - t.begin_char
            if (cou > 20 and len0_ > 500):
                p = t.begin_char
                while p < t1.end_char:
                    if (self.sofa.text[p] == self.sofa.text[p + 1]):
                        len0_ -= 1
                    p += 1
                if (len0_ > 500):
                    if (t.previous is not None):
                        t.previous.next0_ = t1.next0_
                    else:
                        self.first_token = t1.next0_
                    t = t1
                else:
                    t = t1
            else:
                t = t1

    @staticmethod
    def __calc_abnormal_coef(t: 'Token') -> int:
        if (isinstance(t, NumberToken)):
            return 0
        tt = Utils.asObjectOrNull(t, TextToken)
        if (tt is None):
            return 0
        if (not tt.chars.is_letter):
            return 0
        if (not tt.chars.is_latin_letter and not tt.chars.is_cyrillic_letter):
            return 2
        if (tt.length_char < 4):
            return 0
        for wf in tt.morph.items:
            if (wf.is_in_dictionary):
                return -1
        if (tt.length_char > 15):
            return 2
        return 1

    def __correct_words_by_merging(self, lang: 'MorphLang') -> None:
        t = self.first_token
        first_pass3053 = True
        while True:
            if first_pass3053:
                first_pass3053 = False
            else:
                t = t.next0_
            if (not (t is not None and t.next0_ is not None)):
                break
            if (not t.chars.is_letter or (t.length_char < 2)):
                continue
            mc0 = t.get_morph_class_in_dictionary()
            if (t.morph.contains_attr("прдктв.", None)):
                continue
            t1 = t.next0_
            if (t1.is_hiphen and t1.next0_ is not None and not t1.is_newline_after):
                t1 = t1.next0_
                if (t1.length_char == 1):
                    continue
            if (not t1.chars.is_letter or not t.chars.is_letter or t1.chars.is_latin_letter != t.chars.is_latin_letter):
                continue
            if (t1.chars.is_all_upper and not t.chars.is_all_upper):
                continue
            elif (not t1.chars.is_all_lower):
                continue
            elif (t.chars.is_all_upper):
                continue
            if (t1.morph.contains_attr("прдктв.", None)):
                continue
            mc1 = t1.get_morph_class_in_dictionary()
            if (not mc1.is_undefined and not mc0.is_undefined):
                continue
            if ((len(t.term) + len(t1.term)) < 6):
                continue
            corw = t.term + t1.term
            ccc = MorphologyService.process(corw, lang, None)
            if (ccc is None or len(ccc) != 1):
                continue
            if (corw == "ПОСТ" or corw == "ВРЕД"):
                continue
            tt = TextToken(ccc[0], self, t.begin_char, t1.end_char)
            if (tt.get_morph_class_in_dictionary().is_undefined):
                continue
            tt.chars = t.chars
            if (t == self.first_token):
                self.first_token = (tt)
            else:
                t.previous.next0_ = tt
            if (t1.next0_ is not None):
                tt.next0_ = t1.next0_
            t = (tt)

    def __correct_words_by_morph(self, lang: 'MorphLang') -> None:
        tt = self.first_token
        first_pass3054 = True
        while True:
            if first_pass3054:
                first_pass3054 = False
            else:
                tt = tt.next0_
            if (not (tt is not None)):
                break
            if (not (isinstance(tt, TextToken))):
                continue
            if (tt.morph.contains_attr("прдктв.", None)):
                continue
            dd = tt.get_morph_class_in_dictionary()
            if (not dd.is_undefined or (tt.length_char < 4)):
                continue
            if (tt.morph.class0_.is_proper_surname and not tt.chars.is_all_lower):
                continue
            if (tt.chars.is_all_upper):
                continue
            corw = MorphologyService.correct_word(
                tt.term, (lang if tt.morph.language.is_undefined else tt.morph.language))
            if (corw is None):
                continue
            ccc = MorphologyService.process(corw, lang, None)
            if (ccc is None or len(ccc) != 1):
                continue
            tt1 = TextToken._new473(ccc[0], self, tt.begin_char, tt.end_char, tt.chars, tt.term)
            mc = tt1.get_morph_class_in_dictionary()
            if (mc.is_proper_surname):
                continue
            if (tt == self.first_token):
                self.first_token = (tt1)
            else:
                tt.previous.next0_ = tt1
            tt1.next0_ = tt.next0_
            tt = (tt1)
            if (self.corrected_tokens is None):
                self.corrected_tokens = dict()
            self.corrected_tokens[tt] = tt.get_source_text()

    def __merge_letters(self) -> None:
        before_word = False
        tmp = io.StringIO()
        t = self.first_token
        first_pass3055 = True
        while True:
            if first_pass3055:
                first_pass3055 = False
            else:
                t = t.next0_
            if (not (t is not None)):
                break
            tt = Utils.asObjectOrNull(t, TextToken)
            if (not tt.chars.is_letter or tt.length_char != 1):
                before_word = False
                continue
            i = t.whitespaces_before_count
            if (i > 2 or ((i == 2 and before_word))):
                pass
            else:
                before_word = False
                continue
            i = 0
            t1 = None
            Utils.setLengthStringIO(tmp, 0)
            print(tt.get_source_text(), end="", file=tmp)
            t1 = t
            while t1.next0_ is not None:
                tt = (Utils.asObjectOrNull(t1.next0_, TextToken))
                if (tt.length_char != 1 or tt.whitespaces_before_count != 1):
                    break
                i += 1
                print(tt.get_source_text(), end="", file=tmp)
                t1 = t1.next0_
            if (i > 3 or ((i > 1 and before_word))):
                pass
            else:
                before_word = False
                continue
            before_word = False
            mt = MorphologyService.process(Utils.toStringStringIO(tmp), None, None)
            if (mt is None or len(mt) != 1):
                t = t1
                continue
            for wf in mt[0].word_forms:
                if (wf.is_in_dictionary):
                    before_word = True
                    break
            if (not before_word):
                t = t1
                continue
            tt = TextToken(mt[0], self, t.begin_char, t1.end_char)
            if (t == self.first_token):
                self.first_token = (tt)
            else:
                tt.previous = t.previous
            tt.next0_ = t1.next0_
            t = (tt)

    def embed_token(self, mt: 'MetaToken') -> None:
        """ Embed a token into the main token chain
        Args:
            mt(MetaToken): the metatoken to embed
        """
        if (mt is None):
            return
        if (mt.begin_char > mt.end_char):
            bg = mt.begin_token
            mt.begin_token = mt.end_token
            mt.end_token = bg
        if (mt.begin_char > mt.end_char):
            return
        if (mt.begin_token == self.first_token):
            self.first_token = (mt)
        else:
            tp = mt.begin_token.previous
            mt.previous = tp
        tn = mt.end_token.next0_
        mt.next0_ = tn
        if (isinstance(mt, ReferentToken)):
            if (mt.referent is not None):
                mt.referent.add_occurence(
                    TextAnnotation._new474(self.sofa, mt.begin_char, mt.end_char))

    def debed_token(self, t: 'Token') -> 'Token':
        """ Remove a metatoken from the chain, restoring the original state
        Args:
            t(Token): the metatoken to remove from the chain
        Returns:
            Token: the first token of the removed metatoken
        """
        r = t.get_referent()
        if (r is not None):
            for o in r.occurrence:
                if (o.begin_char == t.begin_char and o.end_char == t.end_char):
                    r.occurrence.remove(o)
                    break
        mt = Utils.asObjectOrNull(t, MetaToken)
        if (mt is None):
            return t
        if (t.next0_ is not None):
            t.next0_.previous = mt.end_token
        if (t.previous is not None):
            t.previous.next0_ = mt.begin_token
        if (mt == self.first_token):
            self.first_token = mt.begin_token
        if (r is not None and len(r.occurrence) == 0):
            for d in self.__m_datas.values():
                if (r in d.referents):
                    d.remove_referent(r)
                    break
        return mt.begin_token

    @property
    def entities(self) -> typing.List['Referent']:
        """ The list of Referent entities extracted during analysis """
        return self.__m_entities

    @property
    def sofa(self) -> 'SourceOfAnalysis':
        """ Reference to the source text """
        if (self.__m_sofa is None):
            self.__m_sofa = SourceOfAnalysis("")
        return self.__m_sofa

    def get_text_character(self, position: int) -> 'char':
        """ Get a character from the source text
        Args:
            position(int): position
        Returns:
            'char': the character (0 if out of range)
        """
        if ((position < 0) or position >= len(self.__m_sofa.text)):
            return chr(0)
        return self.__m_sofa.text[position]

    def get_analyzer_data_by_analyzer_name(self, analyzer_name: str) -> 'AnalyzerData':
        """ Get the data accumulated so far by a specific analyzer
        Args:
            analyzer_name(str): analyzer name
        Returns:
            AnalyzerData: the data associated with it
        """
        a = self.processor.find_analyzer(analyzer_name)
        if (a is None):
            return None
        return self.get_analyzer_data(a)

    def get_analyzer_data(self, analyzer: 'Analyzer') -> 'AnalyzerData':
        # Get the data accumulated so far by a specific analyzer
        if (analyzer is None or analyzer.name is None):
            return None
        d = None
        wrapd475 = RefOutArgWrapper(None)
        inoutres476 = Utils.tryGetValue(self.__m_datas, analyzer.name, wrapd475)
        d = wrapd475.value
        if (inoutres476):
            d.kit = self
            return d
        default_data = analyzer.create_analyzer_data()
        if (default_data is None):
            return None
        if (analyzer._persist_referents_regim):
            if (analyzer._persist_analizer_data is None):
                analyzer._persist_analizer_data = default_data
            else:
                default_data = analyzer._persist_analizer_data
        self.__m_datas[analyzer.name] = default_data
        default_data.kit = self
        return default_data

    def __create_statistics(self) -> None:
        from pullenti.ner.core.StatisticCollection import StatisticCollection
        self.statistics = StatisticCollection()
        self.statistics._prepare(self.first_token)

    def __define_base_language(self) -> None:
        stat = dict()
        total = 0
        t = self.first_token
        first_pass3056 = True
        while True:
            if first_pass3056:
                first_pass3056 = False
            else:
                t = t.next0_
            if (not (t is not None)):
                break
            tt = Utils.asObjectOrNull(t, TextToken)
            if (tt is None):
                continue
            if (tt.morph.language.is_undefined):
                continue
            if (not tt.morph.language.value in stat):
                stat[tt.morph.language.value] = 1
            else:
                stat[tt.morph.language.value] += 1
            total += 1
        val = 0
        for kp in stat.items():
            if (kp[1] > (math.floor(total / 2))):
                val |= kp[0]
        self.base_language.value = val

    def replace_referent(self, old_referent: 'Referent', new_referent: 'Referent') -> None:
        # Replace the old entity with the new one everywhere possible
        # (used when merging entities)
        t = self.first_token
        while t is not None:
            if (isinstance(t, ReferentToken)):
                t._replace_referent(old_referent, new_referent)
            t = t.next0_
        for d in self.__m_datas.values():
            for r in d.referents:
                for s in r.slots:
                    if (s.value == old_referent):
                        r.upload_slot(s, new_referent)
            if (old_referent in d.referents):
                d.referents.remove(old_referent)

    def process_referent(self, analyzer_name: str, t: 'Token') -> 'ReferentToken':
        """ Try to extract an entity starting at the given token with the
        specified analyzer. Useful when you need to "look ahead" and test the
        hypothesis that an entity of a specific type starts here.
        Args:
            analyzer_name(str): analyzer name
            t(Token): the token to start the extraction from
        Returns:
            ReferentToken: a metatoken with the ReferentToken entity, or null.
            Note that the entity is not saved and the resulting metatoken is
            not embedded anywhere.
        """
        if (self.processor is None):
            return None
        if (analyzer_name in self._m_analyzer_stack):
            return None
        if (self.is_recurce_overflow):
            return None
        a = self.processor.find_analyzer(analyzer_name)
        if (a is None):
            return None
        self.recurse_level += 1
        self._m_analyzer_stack.append(analyzer_name)
        res = a.process_referent(t, None)
        self._m_analyzer_stack.remove(analyzer_name)
        self.recurse_level -= 1
        return res

    def create_referent(self, type_name: str) -> 'Referent':
        """ Create an entity instance of the given type
        Args:
            type_name(str): entity type name
        Returns:
            Referent: an instance of a class derived from Referent, or null
        """
        if (self.processor is None):
            return None
        else:
            for a in self.processor.analyzers:
                res = a.create_referent(type_name)
                if (res is not None):
                    return res
        return None

    def refresh_generals(self) -> None:
        GeneralRelationHelper.refresh_generals(self.processor, self)

    @property
    def is_recurce_overflow(self) -> bool:
        return self.recurse_level > 5

    def serialize(self, stream: Stream) -> None:
        stream.writebyte(0xAA)
        stream.writebyte(1)
        self.__m_sofa.serialize(stream)
        SerializerHelper.serialize_int(stream, self.base_language.value)
        if (len(self.__m_entities) == 0):
            for d in self.__m_datas.items():
                self.__m_entities.extend(d[1].referents)
        SerializerHelper.serialize_int(stream, len(self.__m_entities))
        i = 0
        while i < len(self.__m_entities):
            self.__m_entities[i].tag = i + 1
            SerializerHelper.serialize_string(stream, self.__m_entities[i].type_name)
            i += 1
        for e0_ in self.__m_entities:
            e0_.serialize(stream)
        SerializerHelper.serialize_tokens(stream, self.first_token, 0)

    def deserialize(self, stream: Stream) -> bool:
        vers = 0
        b = stream.readbyte()
        if (b == (0xAA)):
            b = (stream.readbyte())
            vers = (b)
        else:
            stream.position = stream.position - (1)
        self.__m_sofa = SourceOfAnalysis(None)
        self.__m_sofa.deserialize(stream)
        self.base_language = MorphLang._new56(SerializerHelper.deserialize_int(stream))
        self.__m_entities = list()
        cou = SerializerHelper.deserialize_int(stream)
        i = 0
        while i < cou:
            typ = SerializerHelper.deserialize_string(stream)
            r = ProcessorService.create_referent(typ)
            if (r is None):
                r = Referent("UNDEFINED")
            self.__m_entities.append(r)
            i += 1
        i = 0
        while i < cou:
            self.__m_entities[i].deserialize(stream, self.__m_entities, self.__m_sofa)
            i += 1
        self.first_token = SerializerHelper.deserialize_tokens(stream, self, vers)
        self.__create_statistics()
        return True

    @staticmethod
    def _new2840(_arg1: 'Processor', _arg2: 'ExtOntology') -> 'AnalysisKit':
        res = AnalysisKit()
        res.processor = _arg1
        res.ontology = _arg2
        return res

    @staticmethod
    def _new2841(_arg1: 'SourceOfAnalysis', _arg2: bool, _arg3: 'MorphLang', _arg4: EventHandler, _arg5: 'ExtOntology', _arg6: 'Processor', _arg7: bool) -> 'AnalysisKit':
        res = AnalysisKit(_arg1, _arg2, _arg3, _arg4)
        res.ontology = _arg5
        res.processor = _arg6
        res.onto_regime = _arg7
        return res
def main(args: typing.List[str]) -> None:
    sw = Stopwatch()
    # initialization - must be done once, before any texts are processed
    print("Initializing SDK Pullenti ver {0} ({1}) ... ".format(
        Sdk.get_version(), Sdk.get_version_date()), end="", flush=True)
    # initializes the engine and all available analyzers
    Sdk.initialize_all()
    sw.stop()
    print("OK (by {0} ms), version {1}".format(
        sw.elapsedMilliseconds, ProcessorService.get_version()), flush=True)
    # list the available analyzers
    for a in ProcessorService.get_analyzers():
        print("   {0} {1} \"{2}\"".format(
            ("Specific analyzer" if a.is_specific else "Common analyzer"),
            a.name, a.caption), flush=True)
    # the text to analyze
    txt = "Система разрабатывается с 2011 года российским программистом Михаилом Жуковым, проживающим в Москве на Красной площади в доме номер один на втором этаже. Конкурентов у него много: Abbyy, Yandex, ООО \"Russian Context Optimizer\" (RCO) и другие компании. Он планирует продать SDK за 1.120.000.001,99 (миллиард сто двадцать миллионов один рубль 99 копеек) рублей, без НДС."
    print("Text: {0}".format(txt), flush=True)
    # run processing on an empty processor (without NER analyzers)
    are = ProcessorService.get_empty_processor().process(
        SourceOfAnalysis(txt), None, None)
    print("Noun groups: ", end="", flush=True)
    t = are.first_token
    # iterate over the tokens
    first_pass2974 = True
    while True:
        if first_pass2974:
            first_pass2974 = False
        else:
            t = t.next0_
        if (not (t is not None)):
            break
        # try to extract a noun phrase starting at the current token
        npt = NounPhraseHelper.try_parse(t, NounPhraseParseAttr.NO, 0, None)
        # no luck
        if (npt is None):
            continue
        # got one - print it in normalized form
        print("[{0}=>{1}] ".format(
            npt.get_source_text(),
            npt.get_normal_case_text(None, MorphNumber.SINGULAR, MorphGender.UNDEFINED, False)),
            end="", flush=True)
        # move the pointer to the last token of the noun phrase
        t = npt.end_token
    with ProcessorService.create_processor() as proc:
        # analyze the text
        ar = proc.process(SourceOfAnalysis(txt), None, None)
        # resulting entities
        print("\r\n==========================================\r\nEntities: ", flush=True)
        for e0_ in ar.entities:
            print("{0}: {1}".format(e0_.type_name, str(e0_)), flush=True)
            for s in e0_.slots:
                print("   {0}: {1}".format(s.type_name, s.value), flush=True)
        # noun phrase extraction example
        print("\r\n==========================================\r\nNoun groups: ", flush=True)
        t = ar.first_token
        first_pass2975 = True
        while True:
            if first_pass2975:
                first_pass2975 = False
            else:
                t = t.next0_
            if (not (t is not None)):
                break
            # skip tokens that carry entities
            if (t.get_referent() is not None):
                continue
            # try to build a noun phrase
            npt = NounPhraseHelper.try_parse(
                t, NounPhraseParseAttr.ADJECTIVECANBELAST, 0, None)
            # no luck
            if (npt is None):
                continue
            print(npt, flush=True)
            # move the pointer to the last token of the group
            t = npt.end_token
    with ProcessorService.create_specific_processor(
            KeywordAnalyzer.ANALYZER_NAME) as proc:
        ar = proc.process(SourceOfAnalysis(txt), None, None)
        print("\r\n==========================================\r\nKeywords1: ", flush=True)
        for e0_ in ar.entities:
            if (isinstance(e0_, KeywordReferent)):
                print(e0_, flush=True)
        print("\r\n==========================================\r\nKeywords2: ", flush=True)
        t = ar.first_token
        first_pass2976 = True
        while True:
            if first_pass2976:
                first_pass2976 = False
            else:
                t = t.next0_
            if (not (t is not None)):
                break
            if (isinstance(t, ReferentToken)):
                kw = Utils.asObjectOrNull(t.get_referent(), KeywordReferent)
                if (kw is None):
                    continue
                kwstr = MiscHelper.get_text_value_of_meta_token(
                    Utils.asObjectOrNull(t, ReferentToken),
                    Utils.valToEnum(
                        (GetTextAttr.FIRSTNOUNGROUPTONOMINATIVESINGLE) | (GetTextAttr.KEEPREGISTER),
                        GetTextAttr))
                print("{0} = {1}".format(kwstr, kw), flush=True)
    print("Over!", flush=True)