Code example #1
 def deserialize(self, stream : io.IOBase) -> bool:
     vers = 0
     b = Utils.readByteIO(stream)
     if (b == (0xAA)): 
         b = (Utils.readByteIO(stream))
         vers = (b)
     else: 
         stream.seek(stream.tell() - (1), io.SEEK_SET)
     self.__m_sofa = SourceOfAnalysis(None)
     self.__m_sofa.deserialize(stream)
     self.base_language = MorphLang._new5(SerializerHelper.deserializeInt(stream))
     self.__m_entities = list()
     cou = SerializerHelper.deserializeInt(stream)
     i = 0
     while i < cou: 
         typ = SerializerHelper.deserializeString(stream)
         r = ProcessorService.createReferent(typ)
         if (r is None): 
             r = Referent("UNDEFINED")
         self.__m_entities.append(r)
         i += 1
     i = 0
     while i < cou: 
         self.__m_entities[i].deserialize(stream, self.__m_entities, self.__m_sofa)
         i += 1
     self.first_token = SerializerHelper.deserializeTokens(stream, self, vers)
     self.__createStatistics()
     return True
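The deserialize above peeks one byte for the 0xAA format marker and only then reads a version byte, rewinding the stream when the marker is absent. A minimal, self-contained sketch of that lookahead on a standard io stream (read_optional_version is a hypothetical helper name, not a pullenti API):

 import io

 def read_optional_version(stream: io.IOBase) -> int:
     # Peek one byte: if it is the 0xAA marker, the next byte is the
     # format version; otherwise rewind so later readers see the byte.
     vers = 0
     b = stream.read(1)
     if b == b"\xaa":
         nxt = stream.read(1)
         vers = nxt[0] if nxt else 0
     elif b:
         stream.seek(-1, io.SEEK_CUR)
     return vers

 assert read_optional_version(io.BytesIO(b"\xaa\x07rest")) == 7
 assert read_optional_version(io.BytesIO(b"\x01rest")) == 0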
Code example #2
 def deserialize(self, stream: Stream) -> bool:
     vers = 0
     b = stream.readbyte()
     if (b == (0xAA)):
         b = (stream.readbyte())
         vers = (b)
     else:
         stream.position = stream.position - (1)
     self.__m_sofa = SourceOfAnalysis(None)
     self.__m_sofa.deserialize(stream)
     self.base_language = MorphLang._new56(
         SerializerHelper.deserialize_int(stream))
     self.__m_entities = list()
     cou = SerializerHelper.deserialize_int(stream)
     i = 0
     while i < cou:
         typ = SerializerHelper.deserialize_string(stream)
         r = ProcessorService.create_referent(typ)
         if (r is None):
             r = Referent("UNDEFINED")
         self.__m_entities.append(r)
         i += 1
     i = 0
     while i < cou:
         self.__m_entities[i].deserialize(stream, self.__m_entities,
                                          self.__m_sofa)
         i += 1
     self.first_token = SerializerHelper.deserialize_tokens(
         stream, self, vers)
     self.__create_statistics()
     return True
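Example #2 is the same method ported to pullenti's unisharp Stream wrapper: readbyte() instead of Utils.readByteIO, and a settable position instead of seek/tell. A hypothetical adapter exposing that interface over io.BytesIO shows how little the two variants differ:

 import io

 class ByteStreamAdapter:
     # Hypothetical shim with the readbyte()/position interface assumed
     # by code example #2, backed by a standard io.BytesIO buffer.
     def __init__(self, data: bytes) -> None:
         self._buf = io.BytesIO(data)

     def readbyte(self) -> int:
         b = self._buf.read(1)
         return b[0] if b else -1

     @property
     def position(self) -> int:
         return self._buf.tell()

     @position.setter
     def position(self, value: int) -> None:
         self._buf.seek(value, io.SEEK_SET)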
Code example #3
 def initialize() -> None:
     if (ShortNameHelper.M_INITED): 
         return
     ShortNameHelper.M_INITED = True
     obj = PullentiNerPersonInternalResourceHelper.get_string("ShortNames.txt")
     if (obj is not None): 
         kit = AnalysisKit(SourceOfAnalysis(obj))
         t = kit.first_token
         while t is not None: 
             if (t.is_newline_before): 
                 g = (MorphGender.FEMINIE if t.is_value("F", None) else MorphGender.MASCULINE)
                 t = t.next0_
                 nam = t.term
                 shos = list()
                 t = t.next0_
                 while t is not None: 
                     if (t.is_newline_before): 
                         break
                     else: 
                         shos.append(t.term)
                     t = t.next0_
                 for s in shos: 
                     li = None
                     wrapli2599 = RefOutArgWrapper(None)
                     inoutres2600 = Utils.tryGetValue(ShortNameHelper.M_SHORTS_NAMES, s, wrapli2599)
                     li = wrapli2599.value
                     if (not inoutres2600): 
                         li = list()
                         ShortNameHelper.M_SHORTS_NAMES[s] = li
                     li.append(ShortNameHelper.ShortnameVar._new2598(nam, g))
                 if (t is None): 
                     break
                 t = t.previous
             t = t.next0_
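Reading the token walk above back into a file format: each line of ShortNames.txt appears to start with a gender flag ("F" for feminine, anything else masculine), followed by the full name and then its short forms, and M_SHORTS_NAMES indexes the variants by short form. A plain-Python sketch of the same parse, assuming exactly that layout:

 from collections import defaultdict

 def parse_short_names(text: str) -> dict:
     # Assumed line layout: "<F|M> FULLNAME SHORT1 SHORT2 ...".
     # Returns: short form -> list of (full name, gender) pairs,
     # mirroring what M_SHORTS_NAMES holds above.
     shorts = defaultdict(list)
     for line in text.splitlines():
         parts = line.split()
         if len(parts) < 2:
             continue
         gender = "F" if parts[0] == "F" else "M"
         full, short_forms = parts[1], parts[2:]
         for s in short_forms:
             shorts[s].append((full, gender))
     return dict(shorts)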
Code example #4
File: OrgGlobal.py  Project: pullenti/PullentiPython
 def initialize() -> None:
     if (OrgGlobal.GLOBAL_ORGS is not None): 
         return
     OrgGlobal.GLOBAL_ORGS = IntOntologyCollection()
     org0_ = None
     oi = None
     with ProcessorService.create_empty_processor() as geo_proc: 
         geo_proc.add_analyzer(GeoAnalyzer())
         geos = dict()
         for k in range(3):
             lang = (MorphLang.RU if k == 0 else (MorphLang.EN if k == 1 else MorphLang.UA))
             name = ("Orgs_ru.dat" if k == 0 else ("Orgs_en.dat" if k == 1 else "Orgs_ua.dat"))
             dat = PullentiNerOrgInternalResourceHelper.get_bytes(name)
             if (dat is None): 
                 raise Utils.newException("Can't file resource file {0} in Organization analyzer".format(name), None)
             with MemoryStream(OrgItemTypeToken._deflate(dat)) as tmp: 
                 tmp.position = 0
                 xml0_ = None # new XmlDocument
                 xml0_ = Utils.parseXmlFromStream(tmp)
                 for x in xml0_.getroot(): 
                     org0_ = OrganizationReferent()
                     abbr = None
                     for xx in x: 
                         if (Utils.getXmlLocalName(xx) == "typ"): 
                             org0_.add_slot(OrganizationReferent.ATTR_TYPE, Utils.getXmlInnerText(xx), False, 0)
                         elif (Utils.getXmlLocalName(xx) == "nam"): 
                             org0_.add_slot(OrganizationReferent.ATTR_NAME, Utils.getXmlInnerText(xx), False, 0)
                         elif (Utils.getXmlLocalName(xx) == "epo"): 
                             org0_.add_slot(OrganizationReferent.ATTR_EPONYM, Utils.getXmlInnerText(xx), False, 0)
                         elif (Utils.getXmlLocalName(xx) == "prof"): 
                             org0_.add_slot(OrganizationReferent.ATTR_PROFILE, Utils.getXmlInnerText(xx), False, 0)
                         elif (Utils.getXmlLocalName(xx) == "abbr"): 
                             abbr = Utils.getXmlInnerText(xx)
                         elif (Utils.getXmlLocalName(xx) == "geo"): 
                             geo_ = None
                             wrapgeo1767 = RefOutArgWrapper(None)
                             inoutres1768 = Utils.tryGetValue(geos, Utils.getXmlInnerText(xx), wrapgeo1767)
                             geo_ = wrapgeo1767.value
                             if (not inoutres1768): 
                                 ar = geo_proc.process(SourceOfAnalysis(Utils.getXmlInnerText(xx)), None, lang)
                                 if (ar is not None and len(ar.entities) == 1 and (isinstance(ar.entities[0], GeoReferent))): 
                                     geo_ = (Utils.asObjectOrNull(ar.entities[0], GeoReferent))
                                     geos[Utils.getXmlInnerText(xx)] = geo_
                                 else: 
                                     pass
                             if (geo_ is not None): 
                                 org0_.add_slot(OrganizationReferent.ATTR_GEO, geo_, False, 0)
                     oi = org0_.create_ontology_item_ex(2, True, True)
                     if (oi is None): 
                         continue
                     if (abbr is not None): 
                         oi.termins.append(Termin(abbr, None, True))
                     if (k == 2): 
                         OrgGlobal.GLOBAL_ORGS_UA.add_item(oi)
                     else: 
                         OrgGlobal.GLOBAL_ORGS.add_item(oi)
     return
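The loader above inflates per-language Orgs_*.dat resources into XML and walks elements named typ, nam, epo, prof, abbr and geo under each organization node. A hypothetical fragment in that shape, plus the bare ElementTree walk, illustrates what the inner loop consumes:

 import xml.etree.ElementTree as ET

 # Hypothetical sample in the element shape the loader walks; the real
 # resource files ship inside the pullenti org analyzer package.
 SAMPLE = """
 <orgs>
   <org>
     <typ>bank</typ>
     <nam>EXAMPLE BANK</nam>
     <abbr>EB</abbr>
     <geo>Россия</geo>
   </org>
 </orgs>
 """

 root = ET.fromstring(SAMPLE)
 for org in root:
     for field in org:
         print(field.tag, field.text)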
Code example #5
 def __create_referent(self, type_name: str,
                       definition_: str) -> typing.List['Referent']:
     analyzer = None
     wrapanalyzer2809 = RefOutArgWrapper(None)
     inoutres2810 = Utils.tryGetValue(self.__m_anal_by_type, type_name,
                                      wrapanalyzer2809)
     analyzer = wrapanalyzer2809.value
     if (not inoutres2810):
         return None
     sf = SourceOfAnalysis(definition_)
     ar = self.__m_processor._process(sf, True, True, None, None)
     if (ar is None or ar.first_token is None):
         return None
     r0 = ar.first_token.get_referent()
     t = None
     if (r0 is not None):
         if (r0.type_name != type_name):
             r0 = (None)
     if (r0 is not None):
         t = ar.first_token
     else:
         rt = analyzer.process_ontology_item(ar.first_token)
         if (rt is None):
             return None
         r0 = rt.referent
         t = rt.end_token
     t = t.next0_
     first_pass3432 = True
     while True:
         if first_pass3432: first_pass3432 = False
         else: t = t.next0_
         if (not (t is not None)): break
         if (t.is_char(';') and t.next0_ is not None):
             r1 = t.next0_.get_referent()
             if (r1 is None):
                 rt = analyzer.process_ontology_item(t.next0_)
                 if (rt is None):
                     continue
                 t = rt.end_token
                 r1 = rt.referent
             if (r1.type_name == type_name):
                 r0.merge_slots(r1, True)
                 r1.tag = r0
     if (r0 is None):
         return None
     r0.tag = r0
     r0 = analyzer._persist_analizer_data.register_referent(r0)
     self.__m_processor._create_res(ar.first_token.kit, ar, None, True)
     res = list()
     res.append(r0)
     for e0_ in ar.entities:
         if (e0_.tag is None):
             res.append(e0_)
     return res
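The definition string handled by __create_referent can carry several variants separated by ';': the first parsed entity becomes r0, and every later entity of the same type is merged into it through merge_slots. A hedged usage sketch (the definition text is made up; the method is private to its owner class):

 # Hypothetical call from inside the owning class:
 #
 #   refs = self.__create_referent("GEO", "Российская Федерация; РФ")
 #
 # Expected result: one registered referent whose slots merge both
 # ';'-separated variants, plus any secondary entities from the parse.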
Code example #6
 def __createReferent(self, type_name : str, definition_ : str) -> 'Referent':
     analyzer = None
     wrapanalyzer2643 = RefOutArgWrapper(None)
     inoutres2644 = Utils.tryGetValue(self.__m_anal_by_type, type_name, wrapanalyzer2643)
     analyzer = wrapanalyzer2643.value
     if (not inoutres2644): 
         return None
     sf = SourceOfAnalysis(definition_)
     ar = self.__m_processor._process(sf, True, True, None, None)
     if (ar is None or ar.first_token is None): 
         return None
     r0 = ar.first_token.getReferent()
     t = None
     if (r0 is not None): 
         if (r0.type_name != type_name): 
             r0 = (None)
     if (r0 is not None): 
         t = ar.first_token
     else: 
         rt = analyzer.processOntologyItem(ar.first_token)
         if (rt is None): 
             return None
         r0 = rt.referent
         t = rt.end_token
     t = t.next0_
     first_pass3162 = True
     while True:
         if first_pass3162: first_pass3162 = False
         else: t = t.next0_
         if (not (t is not None)): break
         if (t.isChar(';') and t.next0_ is not None): 
             r1 = t.next0_.getReferent()
             if (r1 is None): 
                 rt = analyzer.processOntologyItem(t.next0_)
                 if (rt is None): 
                     continue
                 t = rt.end_token
                 r1 = rt.referent
             if (r1.type_name == type_name): 
                 r0.mergeSlots(r1, True)
     if (r0 is not None): 
         r0 = analyzer._persist_analizer_data.registerReferent(r0)
     return r0
Code example #7
File: Program.py  Project: MihaJjDa/APCLtask
 def main(args: typing.List[str]) -> None:
     sw = Stopwatch()
     # initialization - must be performed once, before any texts are processed
     print("Initializing ... ", end="", flush=True)
     # the engine and all available analyzers are initialized
     Sdk.initialize((MorphLang.RU) | MorphLang.EN)
     sw.stop()
     print("OK (by {0} ms), version {1}".format(
         sw.elapsedMilliseconds, ProcessorService.getVersion()),
           flush=True)
     # the text to analyze
     txt = "Единственным конкурентом «Трансмаша» на этом сомнительном тендере было ООО «Плассер Алека Рейл Сервис», основным владельцем которого является австрийская компания «СТЦ-Холдинг ГМБХ». До конца 2011 г. эта же фирма была совладельцем «Трансмаша» вместе с «Тако» Краснова. Зато совладельцем «Плассера», также до конца 2011 г., был тот самый Карл Контрус, который имеет четверть акций «Трансмаша». "
     print("Text: {0}".format(txt), flush=True)
     # run the processing on an empty processor (without NER analyzers)
     are = ProcessorService.getEmptyProcessor().process(
         SourceOfAnalysis(txt), None, None)
     print("Noun groups: ", end="", flush=True)
     t = are.first_token
     # iterate over the tokens
     first_pass2703 = True
     while True:
         if first_pass2703: first_pass2703 = False
         else: t = t.next0_
         if (not (t is not None)): break
         # extract a noun phrase starting at the current token
         npt = NounPhraseHelper.tryParse(t, NounPhraseParseAttr.NO, 0)
         # nothing found
         if (npt is None):
             continue
         # found one; print it in normalized form
         print("[{0}=>{1}] ".format(
             npt.getSourceText(),
             npt.getNormalCaseText(None, True, MorphGender.UNDEFINED,
                                   False)),
               end="",
               flush=True)
         # pointer to the last token of the noun phrase
         t = npt.end_token
     with ProcessorService.createProcessor() as proc:
         # analyze the text
         ar = proc.process(SourceOfAnalysis(txt), None, None)
         # the resulting entities
         print(
             "\r\n==========================================\r\nEntities: ",
             flush=True)
         for e0_ in ar.entities:
             print("{0}: {1}".format(e0_.type_name, str(e0_)), flush=True)
             for s in e0_.slots:
                 print("   {0}: {1}".format(s.type_name, s.value),
                       flush=True)
         # an example of extracting noun phrases
         print(
             "\r\n==========================================\r\nNoun groups: ",
             flush=True)
         t = ar.first_token
         first_pass2704 = True
         while True:
             if first_pass2704: first_pass2704 = False
             else: t = t.next0_
             if (not (t is not None)): break
             # skip tokens that carry entities
             if (t.getReferent() is not None):
                 continue
             # try to build a noun phrase
             npt = NounPhraseHelper.tryParse(
                 t, NounPhraseParseAttr.ADJECTIVECANBELAST, 0)
             # nothing found
             if (npt is None):
                 continue
             print(npt, flush=True)
             # move the pointer to the last token of the group
             t = npt.end_token
     with ProcessorService.createSpecificProcessor(
             KeywordAnalyzer.ANALYZER_NAME) as proc:
         ar = proc.process(SourceOfAnalysis(txt), None, None)
         print(
             "\r\n==========================================\r\nKeywords1: ",
             flush=True)
         for e0_ in ar.entities:
             if (isinstance(e0_, KeywordReferent)):
                 print(e0_, flush=True)
         print(
             "\r\n==========================================\r\nKeywords2: ",
             flush=True)
         t = ar.first_token
         first_pass2705 = True
         while True:
             if first_pass2705: first_pass2705 = False
             else: t = t.next0_
             if (not (t is not None)): break
             if (isinstance(t, ReferentToken)):
                 kw = Utils.asObjectOrNull(t.getReferent(), KeywordReferent)
                 if (kw is None):
                     continue
                 kwstr = MiscHelper.getTextValueOfMetaToken(
                     Utils.asObjectOrNull(t, ReferentToken),
                     Utils.valToEnum(
                         (GetTextAttr.FIRSTNOUNGROUPTONOMINATIVESINGLE) |
                         (GetTextAttr.KEEPREGISTER), GetTextAttr))
                 print("{0} = {1}".format(kwstr, kw), flush=True)
     print("Over!", flush=True)
Code example #8
 def __call__(self, text):
     text = preprocess(text)
     sofa = SourceOfAnalysis(text)
     raw = self.raw.process(sofa)
     return convert_result(text, raw)
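This __call__ belongs to a thin integration wrapper: preprocess and convert_result are the host project's own helpers, not pullenti APIs. A self-contained sketch of the same pattern, with trivial stand-ins for those helpers (the SourceOfAnalysis import path is the one used by recent pullenti releases):

 from pullenti.ner.SourceOfAnalysis import SourceOfAnalysis

 class PullentiWrapper:
     # Sketch of the wrapper pattern above. `processor` is a pullenti
     # Processor; the real preprocess/convert_result are project-specific,
     # so trivial stand-ins are used here.
     def __init__(self, processor):
         self.raw = processor

     @staticmethod
     def preprocess(text):
         return text.strip()

     @staticmethod
     def convert_result(text, raw):
         return {"text": text, "analysis": raw}

     def __call__(self, text):
         text = self.preprocess(text)
         sofa = SourceOfAnalysis(text)
         return self.convert_result(text, self.raw.process(sofa))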
Code example #9
 def sofa(self) -> 'SourceOfAnalysis':
     """ Ссылка на исходный текст """
     if (self.__m_sofa is None): 
         self.__m_sofa = SourceOfAnalysis("")
     return self.__m_sofa
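The property lazily substitutes an empty SourceOfAnalysis so callers never have to handle None. The same null-object idea in isolation:

 class SofaHolder:
     # Minimal sketch of the lazy null-object pattern used by the `sofa`
     # property: materialize an empty placeholder on first access.
     def __init__(self):
         self._sofa = None

     @property
     def sofa(self):
         if self._sofa is None:
             self._sofa = ""   # stands in for SourceOfAnalysis("")
         return self._sofa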
Code example #10
class AnalysisKit:
    """ Внутренний аналитический контейнер данных """
    
    def __init__(self, sofa_ : 'SourceOfAnalysis'=None, only_tokenizing : bool=False, lang : 'MorphLang'=None, progress : EventHandler=None) -> None:
        self._start_date = datetime.datetime(1, 1, 1, 0, 0, 0)
        self.corrected_tokens = None
        self.first_token = None;
        self.__m_entities = list()
        self.ontology = None;
        self.base_language = MorphLang()
        self.__m_sofa = None;
        self.statistics = None;
        self.__m_datas = dict()
        self.misc_data = dict()
        self.processor = None;
        self.recurse_level = 0
        self._m_analyzer_stack = list()
        if (sofa_ is None): 
            return
        self.__m_sofa = sofa_
        self._start_date = datetime.datetime.now()
        tokens = Morphology.process(sofa_.text, lang, None)
        t0 = None
        if (tokens is not None): 
            ii = 0
            while ii < len(tokens): 
                mt = tokens[ii]
                if (mt.begin_char == 733860): 
                    pass
                tt = TextToken(mt, self)
                if (sofa_.correction_dict is not None): 
                    wrapcorw539 = RefOutArgWrapper(None)
                    inoutres540 = Utils.tryGetValue(sofa_.correction_dict, mt.term, wrapcorw539)
                    corw = wrapcorw539.value
                    if (inoutres540): 
                        ccc = Morphology.process(corw, lang, None)
                        if (ccc is not None and len(ccc) == 1): 
                            tt1 = TextToken._new538(ccc[0], self, tt.term)
                            tt1.begin_char = tt.begin_char
                            tt1.end_char = tt.end_char
                            tt1.chars = tt.chars
                            tt = tt1
                            if (self.corrected_tokens is None): 
                                self.corrected_tokens = dict()
                            self.corrected_tokens[tt] = tt.getSourceText()
                if (t0 is None): 
                    self.first_token = (tt)
                else: 
                    t0.next0_ = tt
                t0 = (tt)
                ii += 1
        if (sofa_.clear_dust): 
            self.__clearDust()
        if (sofa_.do_words_merging_by_morph): 
            self.__correctWordsByMerging(lang)
        if (sofa_.do_word_correction_by_morph): 
            self.__correctWordsByMorph(lang)
        self.__mergeLetters()
        self.__defineBaseLanguage()
        t = self.first_token
        first_pass2794 = True
        while True:
            if first_pass2794: first_pass2794 = False
            else: t = t.next0_
            if (not (t is not None)): break
            nt = NumberHelper._tryParseNumber(t)
            if (nt is None): 
                continue
            self.embedToken(nt)
            t = (nt)
        if (only_tokenizing): 
            return
        t = self.first_token
        first_pass2795 = True
        while True:
            if first_pass2795: first_pass2795 = False
            else: t = t.next0_
            if (not (t is not None)): break
            if (t.morph.class0_.is_preposition): 
                continue
            mc = t.getMorphClassInDictionary()
            if (mc.is_undefined and t.chars.is_cyrillic_letter and t.length_char > 4): 
                tail = sofa_.text[t.end_char - 1:t.end_char - 1+2]
                tte = None
                tt = t.previous
                if (tt is not None and ((tt.is_comma_and or tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction))): 
                    tt = tt.previous
                if ((tt is not None and not tt.getMorphClassInDictionary().is_undefined and (((tt.morph.class0_.value) & (t.morph.class0_.value))) != 0) and tt.length_char > 4): 
                    tail2 = sofa_.text[tt.end_char - 1:tt.end_char - 1+2]
                    if (tail2 == tail): 
                        tte = tt
                if (tte is None): 
                    tt = t.next0_
                    if (tt is not None and ((tt.is_comma_and or tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction))): 
                        tt = tt.next0_
                    if ((tt is not None and not tt.getMorphClassInDictionary().is_undefined and (((tt.morph.class0_.value) & (t.morph.class0_.value))) != 0) and tt.length_char > 4): 
                        tail2 = sofa_.text[tt.end_char - 1:tt.end_char - 1+2]
                        if (tail2 == tail): 
                            tte = tt
                if (tte is not None): 
                    t.morph.removeItemsEx(tte.morph, tte.getMorphClassInDictionary())
            continue
        self.__createStatistics()
    
    def _initFrom(self, ar : 'AnalysisResult') -> None:
        self.__m_sofa = ar.sofas[0]
        self.first_token = ar.first_token
        self.base_language = ar.base_language
        self.__createStatistics()
    
    def __clearDust(self) -> None:
        t = self.first_token
        first_pass2796 = True
        while True:
            if first_pass2796: first_pass2796 = False
            else: t = t.next0_
            if (not (t is not None)): break
            cou = AnalysisKit.__calcAbnormalCoef(t)
            norm = 0
            if (cou < 1): 
                continue
            t1 = t
            tt = t
            first_pass2797 = True
            while True:
                if first_pass2797: first_pass2797 = False
                else: tt = tt.next0_
                if (not (tt is not None)): break
                co = AnalysisKit.__calcAbnormalCoef(tt)
                if (co == 0): 
                    continue
                if (co < 0): 
                    norm += 1
                    if (norm > 1): 
                        break
                else: 
                    norm = 0
                    cou += co
                    t1 = tt
            len0_ = t1.end_char - t.begin_char
            if (cou > 20 and len0_ > 500): 
                p = t.begin_char
                while p < t1.end_char: 
                    if (self.sofa.text[p] == self.sofa.text[p + 1]): 
                        len0_ -= 1
                    p += 1
                if (len0_ > 500): 
                    if (t.previous is not None): 
                        t.previous.next0_ = t1.next0_
                    else: 
                        self.first_token = t1.next0_
                    t = t1
                else: 
                    t = t1
            else: 
                t = t1
    
    @staticmethod
    def __calcAbnormalCoef(t : 'Token') -> int:
        if (isinstance(t, NumberToken)): 
            return 0
        tt = Utils.asObjectOrNull(t, TextToken)
        if (tt is None): 
            return 0
        if (not tt.chars.is_letter): 
            return 0
        if (not tt.chars.is_latin_letter and not tt.chars.is_cyrillic_letter): 
            return 2
        if (tt.length_char < 4): 
            return 0
        for wf in tt.morph.items: 
            if ((wf).is_in_dictionary): 
                return -1
        if (tt.length_char > 15): 
            return 2
        return 1
    
    def __correctWordsByMerging(self, lang : 'MorphLang') -> None:
        t = self.first_token
        first_pass2798 = True
        while True:
            if first_pass2798: first_pass2798 = False
            else: t = t.next0_
            if (not (t is not None and t.next0_ is not None)): break
            if (not t.chars.is_letter or (t.length_char < 2)): 
                continue
            mc0 = t.getMorphClassInDictionary()
            if (t.morph.containsAttr("прдктв.", None)): 
                continue
            t1 = t.next0_
            if (t1.is_hiphen and t1.next0_ is not None and not t1.is_newline_after): 
                t1 = t1.next0_
            if (t1.length_char == 1): 
                continue
            if (not t1.chars.is_letter or not t.chars.is_letter or t1.chars.is_latin_letter != t.chars.is_latin_letter): 
                continue
            if (t1.chars.is_all_upper and not t.chars.is_all_upper): 
                continue
            elif (not t1.chars.is_all_lower): 
                continue
            elif (t.chars.is_all_upper): 
                continue
            if (t1.morph.containsAttr("прдктв.", None)): 
                continue
            mc1 = t1.getMorphClassInDictionary()
            if (not mc1.is_undefined and not mc0.is_undefined): 
                continue
            if ((len((t).term) + len((t1).term)) < 6): 
                continue
            corw = (t).term + (t1).term
            ccc = Morphology.process(corw, lang, None)
            if (ccc is None or len(ccc) != 1): 
                continue
            if (corw == "ПОСТ" or corw == "ВРЕД"): 
                continue
            tt = TextToken(ccc[0], self)
            if (tt.getMorphClassInDictionary().is_undefined): 
                continue
            tt.begin_char = t.begin_char
            tt.end_char = t1.end_char
            tt.chars = t.chars
            if (t == self.first_token): 
                self.first_token = (tt)
            else: 
                t.previous.next0_ = tt
            if (t1.next0_ is not None): 
                tt.next0_ = t1.next0_
            t = (tt)
    
    def __correctWordsByMorph(self, lang : 'MorphLang') -> None:
        tt = self.first_token
        first_pass2799 = True
        while True:
            if first_pass2799: first_pass2799 = False
            else: tt = tt.next0_
            if (not (tt is not None)): break
            if (not ((isinstance(tt, TextToken)))): 
                continue
            if (tt.morph.containsAttr("прдктв.", None)): 
                continue
            dd = tt.getMorphClassInDictionary()
            if (not dd.is_undefined or (tt.length_char < 4)): 
                continue
            if (tt.morph.class0_.is_proper_surname and not tt.chars.is_all_lower): 
                continue
            if (tt.chars.is_all_upper): 
                continue
            corw = Morphology.correctWord((tt).term, (lang if tt.morph.language.is_undefined else tt.morph.language))
            if (corw is None): 
                continue
            ccc = Morphology.process(corw, lang, None)
            if (ccc is None or len(ccc) != 1): 
                continue
            tt1 = TextToken._new541(ccc[0], self, tt.chars, tt.begin_char, tt.end_char, (tt).term)
            mc = tt1.getMorphClassInDictionary()
            if (mc.is_proper_surname): 
                continue
            if (tt == self.first_token): 
                self.first_token = (tt1)
            else: 
                tt.previous.next0_ = tt1
            tt1.next0_ = tt.next0_
            tt = (tt1)
            if (self.corrected_tokens is None): 
                self.corrected_tokens = dict()
            self.corrected_tokens[tt] = tt.getSourceText()
    
    def __mergeLetters(self) -> None:
        before_word = False
        tmp = io.StringIO()
        t = self.first_token
        first_pass2800 = True
        while True:
            if first_pass2800: first_pass2800 = False
            else: t = t.next0_
            if (not (t is not None)): break
            tt = Utils.asObjectOrNull(t, TextToken)
            if (not tt.chars.is_letter or tt.length_char != 1): 
                before_word = False
                continue
            i = t.whitespaces_before_count
            if (i > 2 or ((i == 2 and before_word))): 
                pass
            else: 
                before_word = False
                continue
            i = 0
            Utils.setLengthStringIO(tmp, 0)
            print(tt.getSourceText(), end="", file=tmp)
            t1 = t
            while t1.next0_ is not None: 
                tt = (Utils.asObjectOrNull(t1.next0_, TextToken))
                if (tt.length_char != 1 or tt.whitespaces_before_count != 1): 
                    break
                i += 1
                print(tt.getSourceText(), end="", file=tmp)
                t1 = t1.next0_
            if (i > 3 or ((i > 1 and before_word))): 
                pass
            else: 
                before_word = False
                continue
            before_word = False
            mt = Morphology.process(Utils.toStringStringIO(tmp), None, None)
            if (mt is None or len(mt) != 1): 
                t = t1
                continue
            for wf in mt[0].word_forms: 
                if (wf.is_in_dictionary): 
                    before_word = True
                    break
            if (not before_word): 
                t = t1
                continue
            tt = TextToken(mt[0], self)
            if (t == self.first_token): 
                self.first_token = (tt)
            else: 
                tt.previous = t.previous
            tt.next0_ = t1.next0_
            tt.begin_char = t.begin_char
            tt.end_char = t1.end_char
            t = (tt)
    
    def embedToken(self, mt : 'MetaToken') -> None:
        """ Встроить токен в основную цепочку токенов
        
        Args:
            mt(MetaToken): 
        """
        if (mt is None): 
            return
        if (mt.begin_char > mt.end_char): 
            bg = mt.begin_token
            mt.begin_token = mt.end_token
            mt.end_token = bg
        if (mt.begin_char > mt.end_char): 
            return
        if (mt.begin_token == self.first_token): 
            self.first_token = (mt)
        else: 
            tp = mt.begin_token.previous
            mt.previous = tp
        tn = mt.end_token.next0_
        mt.next0_ = tn
        if (isinstance(mt, ReferentToken)): 
            if ((mt).referent is not None): 
                (mt).referent.addOccurence(TextAnnotation._new542(self.sofa, mt.begin_char, mt.end_char))
    
    def debedToken(self, t : 'Token') -> 'Token':
        """ Убрать метатокен из цепочки, восстановив исходное
        
        Args:
            t(Token): 
        
        Returns:
            Token: the first token of the removed metatoken
        """
        r = t.getReferent()
        if (r is not None): 
            for o in r.occurrence: 
                if (o.begin_char == t.begin_char and o.end_char == t.end_char): 
                    r.occurrence.remove(o)
                    break
        mt = Utils.asObjectOrNull(t, MetaToken)
        if (mt is None): 
            return t
        if (t.next0_ is not None): 
            t.next0_.previous = mt.end_token
        if (t.previous is not None): 
            t.previous.next0_ = mt.begin_token
        if (mt == self.first_token): 
            self.first_token = mt.begin_token
        if (r is not None and len(r.occurrence) == 0): 
            for d in self.__m_datas.values(): 
                if (r in d.referents): 
                    d.removeReferent(r)
                    break
        return mt.begin_token
    
    @property
    def entities(self) -> typing.List['Referent']:
        """ Список сущностей, выделенных в ходе анализа """
        return self.__m_entities
    
    @property
    def sofa(self) -> 'SourceOfAnalysis':
        """ Ссылка на исходный текст """
        if (self.__m_sofa is None): 
            self.__m_sofa = SourceOfAnalysis("")
        return self.__m_sofa
    
    def getTextCharacter(self, position : int) -> 'char':
        """ Получить символ из исходного текста
        
        Args:
            position(int): позиция
        
        Returns:
            'char': символ (0, если выход за границу)
        """
        if ((position < 0) or position >= len(self.__m_sofa.text)): 
            return chr(0)
        return self.__m_sofa.text[position]
    
    def getAnalyzerDataByAnalyzerName(self, analyzer_name : str) -> 'AnalyzerData':
        a = self.processor.findAnalyzer(analyzer_name)
        if (a is None): 
            return None
        return self.getAnalyzerData(a)
    
    def getAnalyzerData(self, analyzer : 'Analyzer') -> 'AnalyzerData':
        """ Работа с локальными данными анализаторов
        
        Args:
            analyzer(Analyzer): 
        
        """
        if (analyzer is None or analyzer.name is None): 
            return None
        wrapd543 = RefOutArgWrapper(None)
        inoutres544 = Utils.tryGetValue(self.__m_datas, analyzer.name, wrapd543)
        d = wrapd543.value
        if (inoutres544): 
            d.kit = self
            return d
        default_data = analyzer.createAnalyzerData()
        if (default_data is None): 
            return None
        if (analyzer._persist_referents_regim): 
            if (analyzer._persist_analizer_data is None): 
                analyzer._persist_analizer_data = default_data
            else: 
                default_data = analyzer._persist_analizer_data
        self.__m_datas[analyzer.name] = default_data
        default_data.kit = self
        return default_data
    
    def __createStatistics(self) -> None:
        from pullenti.ner.core.StatisticCollection import StatisticCollection
        self.statistics = StatisticCollection()
        self.statistics.prepare(self.first_token)
    
    def __defineBaseLanguage(self) -> None:
        stat = dict()
        total = 0
        t = self.first_token
        first_pass2801 = True
        while True:
            if first_pass2801: first_pass2801 = False
            else: t = t.next0_
            if (not (t is not None)): break
            tt = Utils.asObjectOrNull(t, TextToken)
            if (tt is None): 
                continue
            if (tt.morph.language.is_undefined): 
                continue
            if (not tt.morph.language.value in stat): 
                stat[tt.morph.language.value] = 1
            else: 
                stat[tt.morph.language.value] += 1
            total += 1
        val = 0
        for kp in stat.items(): 
            if (kp[1] > (math.floor(total / 2))): 
                val |= kp[0]
        self.base_language.value = val
    
    def replaceReferent(self, old_referent : 'Referent', new_referent : 'Referent') -> None:
        """ Заменить везде где только возможно старую сущность на новую (используется при объединении сущностей)
        
        Args:
            old_referent(Referent): 
            new_referent(Referent): 
        """
        t = self.first_token
        while t is not None: 
            if (isinstance(t, ReferentToken)): 
                (t)._replaceReferent(old_referent, new_referent)
            t = t.next0_
        for d in self.__m_datas.values(): 
            for r in d.referents: 
                for s in r.slots: 
                    if (s.value == old_referent): 
                        r.uploadSlot(s, new_referent)
            if (old_referent in d.referents): 
                d.referents.remove(old_referent)
    
    def processReferent(self, analyzer_name : str, t : 'Token') -> 'ReferentToken':
        if (self.processor is None): 
            return None
        if (analyzer_name in self._m_analyzer_stack): 
            return None
        if (self.is_recurce_overflow): 
            return None
        a = self.processor.findAnalyzer(analyzer_name)
        if (a is None): 
            return None
        self.recurse_level += 1
        self._m_analyzer_stack.append(analyzer_name)
        res = a._processReferent(t, None)
        self._m_analyzer_stack.remove(analyzer_name)
        self.recurse_level -= 1
        return res
    
    def createReferent(self, type_name : str) -> 'Referent':
        if (self.processor is None): 
            return None
        else: 
            for a in self.processor.analyzers: 
                res = a.createReferent(type_name)
                if (res is not None): 
                    return res
        return None
    
    def refreshGenerals(self) -> None:
        GeneralRelationHelper.refreshGenerals(self.processor, self)
    
    @property
    def is_recurce_overflow(self) -> bool:
        return self.recurse_level > 5
    
    def serialize(self, stream : io.IOBase) -> None:
        Utils.writeByteIO(stream, 0xAA)
        Utils.writeByteIO(stream, 1)
        self.__m_sofa.serialize(stream)
        SerializerHelper.serializeInt(stream, self.base_language.value)
        if (len(self.__m_entities) == 0): 
            for d in self.__m_datas.items(): 
                self.__m_entities.extend(d[1].referents)
        SerializerHelper.serializeInt(stream, len(self.__m_entities))
        i = 0
        while i < len(self.__m_entities): 
            self.__m_entities[i].tag = i + 1
            SerializerHelper.serializeString(stream, self.__m_entities[i].type_name)
            i += 1
        for e0_ in self.__m_entities: 
            e0_.serialize(stream)
        SerializerHelper.serializeTokens(stream, self.first_token, 0)
    
    def deserialize(self, stream : io.IOBase) -> bool:
        vers = 0
        b = Utils.readByteIO(stream)
        if (b == (0xAA)): 
            b = (Utils.readByteIO(stream))
            vers = (b)
        else: 
            stream.seek(stream.tell() - (1), io.SEEK_SET)
        self.__m_sofa = SourceOfAnalysis(None)
        self.__m_sofa.deserialize(stream)
        self.base_language = MorphLang._new5(SerializerHelper.deserializeInt(stream))
        self.__m_entities = list()
        cou = SerializerHelper.deserializeInt(stream)
        i = 0
        while i < cou: 
            typ = SerializerHelper.deserializeString(stream)
            r = ProcessorService.createReferent(typ)
            if (r is None): 
                r = Referent("UNDEFINED")
            self.__m_entities.append(r)
            i += 1
        i = 0
        while i < cou: 
            self.__m_entities[i].deserialize(stream, self.__m_entities, self.__m_sofa)
            i += 1
        self.first_token = SerializerHelper.deserializeTokens(stream, self, vers)
        self.__createStatistics()
        return True
    
    @staticmethod
    def _new2668(_arg1 : 'Processor', _arg2 : 'ExtOntology') -> 'AnalysisKit':
        res = AnalysisKit()
        res.processor = _arg1
        res.ontology = _arg2
        return res
    
    @staticmethod
    def _new2669(_arg1 : 'SourceOfAnalysis', _arg2 : bool, _arg3 : 'MorphLang', _arg4 : EventHandler, _arg5 : 'ExtOntology', _arg6 : 'Processor') -> 'AnalysisKit':
        res = AnalysisKit(_arg1, _arg2, _arg3, _arg4)
        res.ontology = _arg5
        res.processor = _arg6
        return res
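Taken together, serialize and deserialize in this class fix a simple container layout. Sketched as a byte-map (framing per SerializerHelper; as the deserialize above shows, the marker/version prefix is optional on read):

 # Layout written by AnalysisKit.serialize (sketch):
 #
 #   0xAA           marker byte (optional when reading back)
 #   0x01           format version
 #   <sofa>         SourceOfAnalysis.serialize(...)
 #   int            base_language.value
 #   int            entity count N
 #   N x string     entity type names (Referent.type_name)
 #   N x <entity>   entity bodies (Referent.serialize)
 #   <tokens>       SerializerHelper.serializeTokens(first_token, 0)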
Code example #11
class AnalysisKit:
    """ Внутренний аналитический контейнер данных. Создаётся автоматически внутри при вызове Processor.Process(...).
    Все токены Token ссылаются через поле Kit на экземпляр контейнера, связанного с обрабатываемым текстом.
    
    Контейнер данных
    """
    def __init__(self,
                 sofa_: 'SourceOfAnalysis' = None,
                 only_tokenizing: bool = False,
                 lang: 'MorphLang' = None,
                 progress: EventHandler = None) -> None:
        self._start_date = datetime.datetime(1, 1, 1, 0, 0, 0)
        self.corrected_tokens = None
        self.first_token = None
        self.__m_entities = list()
        self.ontology = None
        self.base_language = MorphLang()
        self.__m_sofa = None
        self.statistics = None
        self.__m_datas = dict()
        self.misc_data = dict()
        self.processor = None
        self.recurse_level = 0
        self._m_analyzer_stack = list()
        self.onto_regime = False
        if (sofa_ is None):
            return
        self.__m_sofa = sofa_
        self._start_date = datetime.datetime.now()
        tokens = MorphologyService.process(sofa_.text, lang, None)
        t0 = None
        if (tokens is not None):
            ii = 0
            while ii < len(tokens):
                mt = tokens[ii]
                if (mt.begin_char == 733860):
                    pass
                tt = TextToken(mt, self)
                if (sofa_.correction_dict is not None):
                    corw = None
                    wrapcorw471 = RefOutArgWrapper(None)
                    inoutres472 = Utils.tryGetValue(sofa_.correction_dict,
                                                    mt.term, wrapcorw471)
                    corw = wrapcorw471.value
                    if (inoutres472):
                        ccc = MorphologyService.process(corw, lang, None)
                        if (ccc is not None and len(ccc) == 1):
                            tt1 = TextToken._new470(ccc[0], self,
                                                    tt.begin_char, tt.end_char,
                                                    tt.term)
                            tt1.chars = tt.chars
                            tt = tt1
                            if (self.corrected_tokens is None):
                                self.corrected_tokens = dict()
                            self.corrected_tokens[tt] = tt.get_source_text()
                if (t0 is None):
                    self.first_token = (tt)
                else:
                    t0.next0_ = tt
                t0 = (tt)
                ii += 1
        if (sofa_.clear_dust):
            self.__clear_dust()
        if (sofa_.do_words_merging_by_morph):
            self.__correct_words_by_merging(lang)
        if (sofa_.do_word_correction_by_morph):
            self.__correct_words_by_morph(lang)
        self.__merge_letters()
        self.__define_base_language()
        if (sofa_.create_number_tokens):
            t = self.first_token
            first_pass3049 = True
            while True:
                if first_pass3049: first_pass3049 = False
                else: t = t.next0_
                if (not (t is not None)): break
                nt = NumberHelper._try_parse_number(t)
                if (nt is None):
                    continue
                self.embed_token(nt)
                t = (nt)
        if (only_tokenizing):
            return
        t = self.first_token
        first_pass3050 = True
        while True:
            if first_pass3050: first_pass3050 = False
            else: t = t.next0_
            if (not (t is not None)): break
            if (t.morph.class0_.is_preposition):
                continue
            mc = t.get_morph_class_in_dictionary()
            if (mc.is_undefined and t.chars.is_cyrillic_letter
                    and t.length_char > 4):
                tail = sofa_.text[t.end_char - 1:t.end_char - 1 + 2]
                tte = None
                tt = t.previous
                if (tt is not None and
                    ((tt.is_comma_and or tt.morph.class0_.is_preposition
                      or tt.morph.class0_.is_conjunction))):
                    tt = tt.previous
                if ((tt is not None
                     and not tt.get_morph_class_in_dictionary().is_undefined
                     and (((tt.morph.class0_.value) &
                           (t.morph.class0_.value))) != 0)
                        and tt.length_char > 4):
                    tail2 = sofa_.text[tt.end_char - 1:tt.end_char - 1 + 2]
                    if (tail2 == tail):
                        tte = tt
                if (tte is None):
                    tt = t.next0_
                    if (tt is not None and
                        ((tt.is_comma_and or tt.morph.class0_.is_preposition
                          or tt.morph.class0_.is_conjunction))):
                        tt = tt.next0_
                    if ((tt is not None and
                         not tt.get_morph_class_in_dictionary().is_undefined
                         and (((tt.morph.class0_.value) &
                               (t.morph.class0_.value))) != 0)
                            and tt.length_char > 4):
                        tail2 = sofa_.text[tt.end_char - 1:tt.end_char - 1 + 2]
                        if (tail2 == tail):
                            tte = tt
                if (tte is not None):
                    t.morph.remove_items_ex(
                        tte.morph, tte.get_morph_class_in_dictionary())
            continue
        self.__create_statistics()

    def _init_from(self, ar: 'AnalysisResult') -> None:
        self.__m_sofa = ar.sofa
        self.first_token = ar.first_token
        self.base_language = ar.base_language
        self.__create_statistics()

    def __clear_dust(self) -> None:
        t = self.first_token
        first_pass3051 = True
        while True:
            if first_pass3051: first_pass3051 = False
            else: t = t.next0_
            if (not (t is not None)): break
            cou = AnalysisKit.__calc_abnormal_coef(t)
            norm = 0
            if (cou < 1):
                continue
            t1 = t
            tt = t
            first_pass3052 = True
            while True:
                if first_pass3052: first_pass3052 = False
                else: tt = tt.next0_
                if (not (tt is not None)): break
                co = AnalysisKit.__calc_abnormal_coef(tt)
                if (co == 0):
                    continue
                if (co < 0):
                    norm += 1
                    if (norm > 1):
                        break
                else:
                    norm = 0
                    cou += co
                    t1 = tt
            len0_ = t1.end_char - t.begin_char
            if (cou > 20 and len0_ > 500):
                p = t.begin_char
                while p < t1.end_char:
                    if (self.sofa.text[p] == self.sofa.text[p + 1]):
                        len0_ -= 1
                    p += 1
                if (len0_ > 500):
                    if (t.previous is not None):
                        t.previous.next0_ = t1.next0_
                    else:
                        self.first_token = t1.next0_
                    t = t1
                else:
                    t = t1
            else:
                t = t1

    @staticmethod
    def __calc_abnormal_coef(t: 'Token') -> int:
        if (isinstance(t, NumberToken)):
            return 0
        tt = Utils.asObjectOrNull(t, TextToken)
        if (tt is None):
            return 0
        if (not tt.chars.is_letter):
            return 0
        if (not tt.chars.is_latin_letter and not tt.chars.is_cyrillic_letter):
            return 2
        if (tt.length_char < 4):
            return 0
        for wf in tt.morph.items:
            if (wf.is_in_dictionary):
                return -1
        if (tt.length_char > 15):
            return 2
        return 1

    def __correct_words_by_merging(self, lang: 'MorphLang') -> None:
        t = self.first_token
        first_pass3053 = True
        while True:
            if first_pass3053: first_pass3053 = False
            else: t = t.next0_
            if (not (t is not None and t.next0_ is not None)): break
            if (not t.chars.is_letter or (t.length_char < 2)):
                continue
            mc0 = t.get_morph_class_in_dictionary()
            if (t.morph.contains_attr("прдктв.", None)):
                continue
            t1 = t.next0_
            if (t1.is_hiphen and t1.next0_ is not None
                    and not t1.is_newline_after):
                t1 = t1.next0_
            if (t1.length_char == 1):
                continue
            if (not t1.chars.is_letter or not t.chars.is_letter
                    or t1.chars.is_latin_letter != t.chars.is_latin_letter):
                continue
            if (t1.chars.is_all_upper and not t.chars.is_all_upper):
                continue
            elif (not t1.chars.is_all_lower):
                continue
            elif (t.chars.is_all_upper):
                continue
            if (t1.morph.contains_attr("прдктв.", None)):
                continue
            mc1 = t1.get_morph_class_in_dictionary()
            if (not mc1.is_undefined and not mc0.is_undefined):
                continue
            if ((len(t.term) + len(t1.term)) < 6):
                continue
            corw = t.term + t1.term
            ccc = MorphologyService.process(corw, lang, None)
            if (ccc is None or len(ccc) != 1):
                continue
            if (corw == "ПОСТ" or corw == "ВРЕД"):
                continue
            tt = TextToken(ccc[0], self, t.begin_char, t1.end_char)
            if (tt.get_morph_class_in_dictionary().is_undefined):
                continue
            tt.chars = t.chars
            if (t == self.first_token):
                self.first_token = (tt)
            else:
                t.previous.next0_ = tt
            if (t1.next0_ is not None):
                tt.next0_ = t1.next0_
            t = (tt)

    def __correct_words_by_morph(self, lang: 'MorphLang') -> None:
        tt = self.first_token
        first_pass3054 = True
        while True:
            if first_pass3054: first_pass3054 = False
            else: tt = tt.next0_
            if (not (tt is not None)): break
            if (not (isinstance(tt, TextToken))):
                continue
            if (tt.morph.contains_attr("прдктв.", None)):
                continue
            dd = tt.get_morph_class_in_dictionary()
            if (not dd.is_undefined or (tt.length_char < 4)):
                continue
            if (tt.morph.class0_.is_proper_surname
                    and not tt.chars.is_all_lower):
                continue
            if (tt.chars.is_all_upper):
                continue
            corw = MorphologyService.correct_word(
                tt.term, (lang if tt.morph.language.is_undefined else
                          tt.morph.language))
            if (corw is None):
                continue
            ccc = MorphologyService.process(corw, lang, None)
            if (ccc is None or len(ccc) != 1):
                continue
            tt1 = TextToken._new473(ccc[0], self, tt.begin_char, tt.end_char,
                                    tt.chars, tt.term)
            mc = tt1.get_morph_class_in_dictionary()
            if (mc.is_proper_surname):
                continue
            if (tt == self.first_token):
                self.first_token = (tt1)
            else:
                tt.previous.next0_ = tt1
            tt1.next0_ = tt.next0_
            tt = (tt1)
            if (self.corrected_tokens is None):
                self.corrected_tokens = dict()
            self.corrected_tokens[tt] = tt.get_source_text()

    def __merge_letters(self) -> None:
        before_word = False
        tmp = io.StringIO()
        t = self.first_token
        first_pass3055 = True
        while True:
            if first_pass3055: first_pass3055 = False
            else: t = t.next0_
            if (not (t is not None)): break
            tt = Utils.asObjectOrNull(t, TextToken)
            if (not tt.chars.is_letter or tt.length_char != 1):
                before_word = False
                continue
            i = t.whitespaces_before_count
            if (i > 2 or ((i == 2 and before_word))):
                pass
            else:
                before_word = False
                continue
            i = 0
            t1 = None
            Utils.setLengthStringIO(tmp, 0)
            print(tt.get_source_text(), end="", file=tmp)
            t1 = t
            while t1.next0_ is not None:
                tt = (Utils.asObjectOrNull(t1.next0_, TextToken))
                if (tt.length_char != 1 or tt.whitespaces_before_count != 1):
                    break
                i += 1
                print(tt.get_source_text(), end="", file=tmp)
                t1 = t1.next0_
            if (i > 3 or ((i > 1 and before_word))):
                pass
            else:
                before_word = False
                continue
            before_word = False
            mt = MorphologyService.process(Utils.toStringStringIO(tmp), None,
                                           None)
            if (mt is None or len(mt) != 1):
                t = t1
                continue
            for wf in mt[0].word_forms:
                if (wf.is_in_dictionary):
                    before_word = True
                    break
            if (not before_word):
                t = t1
                continue
            tt = TextToken(mt[0], self, t.begin_char, t1.end_char)
            if (t == self.first_token):
                self.first_token = (tt)
            else:
                tt.previous = t.previous
            tt.next0_ = t1.next0_
            t = (tt)

    def embed_token(self, mt: 'MetaToken') -> None:
        """ Встроить токен в основную цепочку токенов
        
        Args:
            mt(MetaToken): встраиваемый метатокен
        
        """
        if (mt is None):
            return
        if (mt.begin_char > mt.end_char):
            bg = mt.begin_token
            mt.begin_token = mt.end_token
            mt.end_token = bg
        if (mt.begin_char > mt.end_char):
            return
        if (mt.begin_token == self.first_token):
            self.first_token = (mt)
        else:
            tp = mt.begin_token.previous
            mt.previous = tp
        tn = mt.end_token.next0_
        mt.next0_ = tn
        if (isinstance(mt, ReferentToken)):
            if (mt.referent is not None):
                mt.referent.add_occurence(
                    TextAnnotation._new474(self.sofa, mt.begin_char,
                                           mt.end_char))

    def debed_token(self, t: 'Token') -> 'Token':
        """ Убрать метатокен из цепочки, восстановив исходное
        
        Args:
            t(Token): удаляемый из цепочки метатокен
        
        Returns:
            Token: первый токен удалённого метатокена
        
        """
        r = t.get_referent()
        if (r is not None):
            for o in r.occurrence:
                if (o.begin_char == t.begin_char and o.end_char == t.end_char):
                    r.occurrence.remove(o)
                    break
        mt = Utils.asObjectOrNull(t, MetaToken)
        if (mt is None):
            return t
        if (t.next0_ is not None):
            t.next0_.previous = mt.end_token
        if (t.previous is not None):
            t.previous.next0_ = mt.begin_token
        if (mt == self.first_token):
            self.first_token = mt.begin_token
        if (r is not None and len(r.occurrence) == 0):
            for d in self.__m_datas.values():
                if (r in d.referents):
                    d.remove_referent(r)
                    break
        return mt.begin_token

    @property
    def entities(self) -> typing.List['Referent']:
        """ Список сущностей Referent, выделенных в ходе анализа """
        return self.__m_entities

    @property
    def sofa(self) -> 'SourceOfAnalysis':
        """ Ссылка на исходный текст """
        if (self.__m_sofa is None):
            self.__m_sofa = SourceOfAnalysis("")
        return self.__m_sofa

    def get_text_character(self, position: int) -> 'char':
        """ Получить символ из исходного текста
        
        Args:
            position(int): позиция
        
        Returns:
            'char': символ (0, если выход за границу)
        """
        if ((position < 0) or position >= len(self.__m_sofa.text)):
            return chr(0)
        return self.__m_sofa.text[position]

    def get_analyzer_data_by_analyzer_name(
            self, analyzer_name: str) -> 'AnalyzerData':
        """ Получить данные, полученные в настоящий момент конкретным анализатором
        
        Args:
            analyzer_name(str): имя анализатора
        
        Returns:
            AnalyzerData: связанные с ним данные
        """
        a = self.processor.find_analyzer(analyzer_name)
        if (a is None):
            return None
        return self.get_analyzer_data(a)

    def get_analyzer_data(self, analyzer: 'Analyzer') -> 'AnalyzerData':
        # Get the data accumulated so far by a specific analyzer
        if (analyzer is None or analyzer.name is None):
            return None
        d = None
        wrapd475 = RefOutArgWrapper(None)
        inoutres476 = Utils.tryGetValue(self.__m_datas, analyzer.name,
                                        wrapd475)
        d = wrapd475.value
        if (inoutres476):
            d.kit = self
            return d
        default_data = analyzer.create_analyzer_data()
        if (default_data is None):
            return None
        if (analyzer._persist_referents_regim):
            if (analyzer._persist_analizer_data is None):
                analyzer._persist_analizer_data = default_data
            else:
                default_data = analyzer._persist_analizer_data
        self.__m_datas[analyzer.name] = default_data
        default_data.kit = self
        return default_data

    def __create_statistics(self) -> None:
        from pullenti.ner.core.StatisticCollection import StatisticCollection
        self.statistics = StatisticCollection()
        self.statistics._prepare(self.first_token)

    def __define_base_language(self) -> None:
        stat = dict()
        total = 0
        t = self.first_token
        first_pass3056 = True
        while True:
            if first_pass3056: first_pass3056 = False
            else: t = t.next0_
            if (not (t is not None)): break
            tt = Utils.asObjectOrNull(t, TextToken)
            if (tt is None):
                continue
            if (tt.morph.language.is_undefined):
                continue
            if (tt.morph.language.value not in stat):
                stat[tt.morph.language.value] = 1
            else:
                stat[tt.morph.language.value] += 1
            total += 1
        val = 0
        for kp in stat.items():
            if (kp[1] > (math.floor(total / 2))):
                val |= kp[0]
        self.base_language.value = val
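    # Note: the base language is a majority vote - a language bit is OR-ed
    # into the result only when it occurs on more than half of the TextToken
    # items that carry a defined morphological language.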

    def replace_referent(self, old_referent: 'Referent',
                         new_referent: 'Referent') -> None:
        # Replace the old entity with the new one wherever possible (used when merging entities)
        t = self.first_token
        while t is not None:
            if (isinstance(t, ReferentToken)):
                t._replace_referent(old_referent, new_referent)
            t = t.next0_
        for d in self.__m_datas.values():
            for r in d.referents:
                for s in r.slots:
                    if (s.value == old_referent):
                        r.upload_slot(s, new_referent)
            if (old_referent in d.referents):
                d.referents.remove(old_referent)

    def process_referent(self, analyzer_name: str,
                         t: 'Token') -> 'ReferentToken':
        """ Попытаться выделить с заданного токена сущность указанным анализатором.
        Используется, если нужно "забежать вперёд" и проверить гипотезу, есть ли тут сущность конкретного типа или нет.
        
        Args:
            analyzer_name(str): имя анализатора
            t(Token): токен, с которого попробовать выделение
        
        Returns:
            ReferentToken: метатокен с сущностью ReferentToken или null. Отметим, что сущность не сохранена и полученный метатокен никуда не встроен.
        
        """
        if (self.processor is None):
            return None
        if (analyzer_name in self._m_analyzer_stack):
            return None
        if (self.is_recurce_overflow):
            return None
        a = self.processor.find_analyzer(analyzer_name)
        if (a is None):
            return None
        self.recurse_level += 1
        self._m_analyzer_stack.append(analyzer_name)
        res = a.process_referent(t, None)
        self._m_analyzer_stack.remove(analyzer_name)
        self.recurse_level -= 1
        return res
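    # Note: two guards protect against runaway recursion - an analyzer whose
    # name is already on _m_analyzer_stack is never re-entered, and
    # is_recurce_overflow caps the nesting depth at recurse_level > 5.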

    def create_referent(self, type_name: str) -> 'Referent':
        """ Создать экземпляр сущности заданного типа
        
        Args:
            type_name(str): имя типа сущности
        
        Returns:
            Referent: экземпляр класса, наследного от Referent, или null
        """
        if (self.processor is None):
            return None
        else:
            for a in self.processor.analyzers:
                res = a.create_referent(type_name)
                if (res is not None):
                    return res
        return None

    def refresh_generals(self) -> None:
        GeneralRelationHelper.refresh_generals(self.processor, self)

    @property
    def is_recurce_overflow(self) -> bool:
        return self.recurse_level > 5

    def serialize(self, stream: Stream) -> None:
        stream.writebyte(0xAA)
        stream.writebyte(1)
        self.__m_sofa.serialize(stream)
        SerializerHelper.serialize_int(stream, self.base_language.value)
        if (len(self.__m_entities) == 0):
            for d in self.__m_datas.items():
                self.__m_entities.extend(d[1].referents)
        SerializerHelper.serialize_int(stream, len(self.__m_entities))
        i = 0
        while i < len(self.__m_entities):
            self.__m_entities[i].tag = i + 1
            SerializerHelper.serialize_string(stream,
                                              self.__m_entities[i].type_name)
            i += 1
        for e0_ in self.__m_entities:
            e0_.serialize(stream)
        SerializerHelper.serialize_tokens(stream, self.first_token, 0)
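    # Wire layout written above (and read back by deserialize below):
    # 0xAA marker, version byte (currently 1), the SourceOfAnalysis, the base
    # language id, the entity count, one type name per entity, each entity's
    # payload, and finally the serialized token chain.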

    def deserialize(self, stream: Stream) -> bool:
        vers = 0
        b = stream.readbyte()
        if (b == 0xAA):
            b = stream.readbyte()
            vers = b
        else:
            stream.position = stream.position - 1
        self.__m_sofa = SourceOfAnalysis(None)
        self.__m_sofa.deserialize(stream)
        self.base_language = MorphLang._new56(
            SerializerHelper.deserialize_int(stream))
        self.__m_entities = list()
        cou = SerializerHelper.deserialize_int(stream)
        i = 0
        while i < cou:
            typ = SerializerHelper.deserialize_string(stream)
            r = ProcessorService.create_referent(typ)
            if (r is None):
                r = Referent("UNDEFINED")
            self.__m_entities.append(r)
            i += 1
        i = 0
        while i < cou:
            self.__m_entities[i].deserialize(stream, self.__m_entities,
                                             self.__m_sofa)
            i += 1
        self.first_token = SerializerHelper.deserialize_tokens(
            stream, self, vers)
        self.__create_statistics()
        return True

    @staticmethod
    def _new2840(_arg1: 'Processor', _arg2: 'ExtOntology') -> 'AnalysisKit':
        res = AnalysisKit()
        res.processor = _arg1
        res.ontology = _arg2
        return res

    @staticmethod
    def _new2841(_arg1: 'SourceOfAnalysis', _arg2: bool, _arg3: 'MorphLang',
                 _arg4: EventHandler, _arg5: 'ExtOntology', _arg6: 'Processor',
                 _arg7: bool) -> 'AnalysisKit':
        res = AnalysisKit(_arg1, _arg2, _arg3, _arg4)
        res.ontology = _arg5
        res.processor = _arg6
        res.onto_regime = _arg7
        return res
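The serialize/deserialize pair above frames the stream with an optional one-byte 0xAA marker followed by a version byte; when the marker is absent, the reader rewinds by one byte and proceeds as version 0. Below is a minimal standalone sketch of that framing pattern using only io.BytesIO - the helper names are illustrative and not part of the Pullenti API, which uses its own Stream wrapper.

 import io

 MARKER = 0xAA  # same magic byte as AnalysisKit.serialize above

 def write_framed(stream: io.BytesIO, payload: bytes, version: int = 1) -> None:
     # write the marker and version byte, then the payload
     stream.write(bytes([MARKER, version]))
     stream.write(payload)

 def read_framed(stream: io.BytesIO) -> tuple:
     # read one byte: if it is the marker, the next byte is the version;
     # otherwise rewind and treat the data as version 0
     vers = 0
     b = stream.read(1)
     if (b and b[0] == MARKER):
         vers = stream.read(1)[0]
     else:
         stream.seek(stream.tell() - len(b), io.SEEK_SET)
     return vers, stream.read()

 buf = io.BytesIO()
 write_framed(buf, b"payload")
 buf.seek(0)
 print(read_framed(buf))  # -> (1, b'payload')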
Code example #12
0
File: Program.py Project: pullenti/PullentiPython
 def main(args: typing.List[str]) -> None:
     sw = Stopwatch()
     # initialization - must be done once, before any texts are processed
     print("Initializing SDK Pullenti ver {0} ({1}) ... ".format(
         Sdk.get_version(), Sdk.get_version_date()),
           end="",
           flush=True)
     # initializes the engine and all available analyzers
     Sdk.initialize_all()
     sw.stop()
     print("OK (by {0} ms), version {1}".format(
         sw.elapsedMilliseconds, ProcessorService.get_version()),
           flush=True)
     # list the analyzers that are available
     for a in ProcessorService.get_analyzers():
         print("   {0} {1} \"{2}\"".format(
             ("Specific analyzer" if a.is_specific else "Common analyzer"),
             a.name, a.caption),
               flush=True)
     # the text to analyze
     txt = "Система разрабатывается с 2011 года российским программистом Михаилом Жуковым, проживающим в Москве на Красной площади в доме номер один на втором этаже. Конкурентов у него много: Abbyy, Yandex, ООО \"Russian Context Optimizer\" (RCO) и другие компании. Он планирует продать SDK за 1.120.000.001,99 (миллиард сто двадцать миллионов один рубль 99 копеек) рублей, без НДС."
     print("Text: {0}".format(txt), flush=True)
     # run processing on an empty processor (no NER analyzers)
     are = ProcessorService.get_empty_processor().process(
         SourceOfAnalysis(txt), None, None)
     print("Noun groups: ", end="", flush=True)
     t = are.first_token
     # iterate over the tokens
     first_pass2974 = True
     while True:
         if first_pass2974: first_pass2974 = False
         else: t = t.next0_
         if (t is None): break
         # try to extract a noun phrase starting at the current token
         npt = NounPhraseHelper.try_parse(t, NounPhraseParseAttr.NO, 0,
                                          None)
         # no noun phrase here
         if (npt is None):
             continue
         # got one - print it in normalized form
         print("[{0}=>{1}] ".format(
             npt.get_source_text(),
             npt.get_normal_case_text(None, MorphNumber.SINGULAR,
                                      MorphGender.UNDEFINED, False)),
               end="",
               flush=True)
         # move the pointer to the last token of the noun phrase
         t = npt.end_token
     with ProcessorService.create_processor() as proc:
         # analyze the text
         ar = proc.process(SourceOfAnalysis(txt), None, None)
         # the resulting entities
         print(
             "\r\n==========================================\r\nEntities: ",
             flush=True)
         for e0_ in ar.entities:
             print("{0}: {1}".format(e0_.type_name, str(e0_)), flush=True)
             for s in e0_.slots:
                 print("   {0}: {1}".format(s.type_name, s.value),
                       flush=True)
         # an example of extracting noun phrases
         print(
             "\r\n==========================================\r\nNoun groups: ",
             flush=True)
         t = ar.first_token
         first_pass2975 = True
         while True:
             if first_pass2975: first_pass2975 = False
             else: t = t.next0_
             if (t is None): break
             # ignore tokens that carry entities
             if (t.get_referent() is not None):
                 continue
             # try to build a noun phrase
             npt = NounPhraseHelper.try_parse(
                 t, NounPhraseParseAttr.ADJECTIVECANBELAST, 0, None)
             # nothing parsed here
             if (npt is None):
                 continue
             print(npt, flush=True)
             # move the pointer to the last token of the group
             t = npt.end_token
     with ProcessorService.create_specific_processor(
             KeywordAnalyzer.ANALYZER_NAME) as proc:
         ar = proc.process(SourceOfAnalysis(txt), None, None)
         print(
             "\r\n==========================================\r\nKeywords1: ",
             flush=True)
         for e0_ in ar.entities:
             if (isinstance(e0_, KeywordReferent)):
                 print(e0_, flush=True)
         print(
             "\r\n==========================================\r\nKeywords2: ",
             flush=True)
         t = ar.first_token
         first_pass2976 = True
         while True:
             if first_pass2976: first_pass2976 = False
             else: t = t.next0_
             if (t is None): break
             if (isinstance(t, ReferentToken)):
                 kw = Utils.asObjectOrNull(t.get_referent(),
                                           KeywordReferent)
                 if (kw is None):
                     continue
                 kwstr = MiscHelper.get_text_value_of_meta_token(
                     Utils.asObjectOrNull(t, ReferentToken),
                     Utils.valToEnum(
                         (GetTextAttr.FIRSTNOUNGROUPTONOMINATIVESINGLE) |
                         (GetTextAttr.KEEPREGISTER), GetTextAttr))
                 print("{0} = {1}".format(kwstr, kw), flush=True)
     print("Over!", flush=True)