Code example #1
 def __merge_letters(self) -> None:
     # Glue runs of single letters separated by spaces back into one word
     # when morphology recognizes the result as a dictionary form.
     before_word = False
     tmp = io.StringIO()
     t = self.first_token
     first_pass3055 = True
     while True:
         if first_pass3055: first_pass3055 = False
         else: t = t.next0_
         if (not (t is not None)): break
         tt = Utils.asObjectOrNull(t, TextToken)
         # Guard against non-text tokens: asObjectOrNull may return None.
         if (tt is None or not tt.chars.is_letter or tt.length_char != 1):
             before_word = False
             continue
         i = t.whitespaces_before_count
         if (i > 2 or ((i == 2 and before_word))):
             pass
         else:
             before_word = False
             continue
         i = 0
         t1 = None
         Utils.setLengthStringIO(tmp, 0)
         print(tt.get_source_text(), end="", file=tmp)
         t1 = t
         while t1.next0_ is not None:
             tt = (Utils.asObjectOrNull(t1.next0_, TextToken))
             if (tt.length_char != 1 or tt.whitespaces_before_count != 1):
                 break
             i += 1
             print(tt.get_source_text(), end="", file=tmp)
             t1 = t1.next0_
         if (i > 3 or ((i > 1 and before_word))):
             pass
         else:
             before_word = False
             continue
         before_word = False
         mt = MorphologyService.process(Utils.toStringStringIO(tmp), None,
                                        None)
         if (mt is None or len(mt) != 1):
             t = t1
             continue
         for wf in mt[0].word_forms:
             if (wf.is_in_dictionary):
                 before_word = True
                 break
         if (not before_word):
             t = t1
             continue
         tt = TextToken(mt[0], self, t.begin_char, t1.end_char)
         if (t == self.first_token):
             self.first_token = (tt)
         else:
             tt.previous = t.previous
         tt.next0_ = t1.next0_
         t = (tt)
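
The `first_pass3055` flag and the `while True` loop above are artifacts of automatic C#-to-Python conversion: they emulate a C# `for` loop whose advance step must still run after a `continue`. A hand-written traversal of the same `next0_`-linked token chain could use a small helper generator instead; the sketch below is illustrative only, and `iter_tokens` is not part of the pullenti API.

 def iter_tokens(first):
     # Walk the singly linked token chain; the advance step lives here,
     # so a plain `continue` in the calling loop cannot skip it.
     t = first
     while t is not None:
         yield t
         t = t.next0_

 # for t in iter_tokens(self.first_token):
 #     ...  # loop body may `continue` freely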
Code example #2
 def __correct_words_by_merging(self, lang: 'MorphLang') -> None:
     # Merge a word that was accidentally split in two (possibly across a
     # hyphen) when the concatenation yields exactly one dictionary form.
     t = self.first_token
     first_pass3053 = True
     while True:
         if first_pass3053: first_pass3053 = False
         else: t = t.next0_
         if (not (t is not None and t.next0_ is not None)): break
         if (not t.chars.is_letter or (t.length_char < 2)):
             continue
         mc0 = t.get_morph_class_in_dictionary()
         if (t.morph.contains_attr("прдктв.", None)):
             continue
         t1 = t.next0_
         if (t1.is_hiphen and t1.next0_ is not None
                 and not t1.is_newline_after):
             t1 = t1.next0_
         if (t1.length_char == 1):
             continue
         if (not t1.chars.is_letter or not t.chars.is_letter
                 or t1.chars.is_latin_letter != t.chars.is_latin_letter):
             continue
         if (t1.chars.is_all_upper and not t.chars.is_all_upper):
             continue
         elif (not t1.chars.is_all_lower):
             continue
         elif (t.chars.is_all_upper):
             continue
         if (t1.morph.contains_attr("прдктв.", None)):
             continue
         mc1 = t1.get_morph_class_in_dictionary()
         if (not mc1.is_undefined and not mc0.is_undefined):
             continue
         if ((len(t.term) + len(t1.term)) < 6):
             continue
         corw = t.term + t1.term
         ccc = MorphologyService.process(corw, lang, None)
         if (ccc is None or len(ccc) != 1):
             continue
         if (corw == "ПОСТ" or corw == "ВРЕД"):
             continue
         tt = TextToken(ccc[0], self, t.begin_char, t1.end_char)
         if (tt.get_morph_class_in_dictionary().is_undefined):
             continue
         tt.chars = t.chars
         if (t == self.first_token):
             self.first_token = (tt)
         else:
             t.previous.next0_ = tt
         if (t1.next0_ is not None):
             tt.next0_ = t1.next0_
         t = (tt)
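
The method above concatenates a token pair (optionally separated by a hyphen) and keeps the merge only if the joined string is long enough and yields exactly one morphological analysis. A simplified sketch of that acceptance test, assuming the `MorphologyService.process(word, lang, None)` call shown above; the final dictionary check on the resulting `TextToken` is omitted here, and `merge_is_acceptable` is a hypothetical helper.

 def merge_is_acceptable(left_term: str, right_term: str, lang) -> bool:
     corw = left_term + right_term
     # Too-short concatenations are rejected, as in the original code.
     if len(corw) < 6:
         return False
     ccc = MorphologyService.process(corw, lang, None)
     # Require a single, unambiguous analysis of the merged word.
     return ccc is not None and len(ccc) == 1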
Code example #3
File: Termin.py Project: MihaJjDa/APCLtask
 def initByNormalText(self, text: str, lang_: 'MorphLang' = None) -> None:
     """ Быстрая инициализация без морф.вариантов, производится только
      токенизация текста. Используется для ускорения работы со словарём в случае,
      когда изначально известно, что на входе уже нормализованные строки
     
     Args:
         text(str): исходно нормализованный текст
         lang_(MorphLang): возможный язык
     """
     if (Utils.isNullOrEmpty(text)):
         return
     text = text.upper()
     if (text.find('\'') >= 0):
         text = text.replace("'", "")
     tok = False
     sp = False
     for ch in text:
         if (not str.isalpha(ch)):
             if (ch == ' '):
                 sp = True
             else:
                 tok = True
                 break
     if (not tok and not sp):
         tt = TextToken(None, None)
         tt.term = text
         self.terms.append(Termin.Term(tt, False))
     elif (not tok and sp):
         wrds = Utils.splitString(text, ' ', False)
         i = 0
         first_pass2811 = True
         while True:
             if first_pass2811: first_pass2811 = False
             else: i += 1
             if (not (i < len(wrds))): break
             if (Utils.isNullOrEmpty(wrds[i])):
                 continue
             tt = TextToken(None, None)
             tt.term = wrds[i]
             self.terms.append(Termin.Term(tt, False))
     else:
         toks = Morphology.tokenize(text)
         if (toks is not None):
             i = 0
             while i < len(toks):
                 tt = TextToken(toks[i], None)
                 self.terms.append(Termin.Term(tt, False))
                 i += 1
     self.lang = MorphLang(lang_)
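
Putting examples #3 and #5 together: a dictionary entry whose text is already normalized can skip generating morphological variants. A hedged usage sketch; the import path follows the usual pullenti layout and is an assumption, since this listing only shows Termin.py from the MihaJjDa/APCLtask project.

 from pullenti.ner.core.Termin import Termin  # import path assumed

 t1 = Termin("нефтяная компания")                         # full init, all morph variants
 t2 = Termin("НЕФТЯНАЯ КОМПАНИЯ", source_is_normal=True)  # fast path, tokenization only
 t3 = Termin()
 t3.initByNormalText("НЕФТЯНАЯ КОМПАНИЯ")                 # same fast path, via example #3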
Code example #4
 def __correct_words_by_morph(self, lang: 'MorphLang') -> None:
     # Spell-correct tokens missing from the morphology dictionary and
     # remember the original spelling in self.corrected_tokens.
     tt = self.first_token
     first_pass3054 = True
     while True:
         if first_pass3054: first_pass3054 = False
         else: tt = tt.next0_
         if (not (tt is not None)): break
         if (not (isinstance(tt, TextToken))):
             continue
         if (tt.morph.contains_attr("прдктв.", None)):
             continue
         dd = tt.get_morph_class_in_dictionary()
         if (not dd.is_undefined or (tt.length_char < 4)):
             continue
         if (tt.morph.class0_.is_proper_surname
                 and not tt.chars.is_all_lower):
             continue
         if (tt.chars.is_all_upper):
             continue
         corw = MorphologyService.correct_word(
             tt.term, (lang if tt.morph.language.is_undefined else
                       tt.morph.language))
         if (corw is None):
             continue
         ccc = MorphologyService.process(corw, lang, None)
         if (ccc is None or len(ccc) != 1):
             continue
         tt1 = TextToken._new473(ccc[0], self, tt.begin_char, tt.end_char,
                                 tt.chars, tt.term)
         mc = tt1.get_morph_class_in_dictionary()
         if (mc.is_proper_surname):
             continue
         if (tt == self.first_token):
             self.first_token = (tt1)
         else:
             tt.previous.next0_ = tt1
         tt1.next0_ = tt.next0_
         tt = (tt1)
         if (self.corrected_tokens is None):
             self.corrected_tokens = dict()
         self.corrected_tokens[tt] = tt.get_source_text()
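
Example #4 swaps an out-of-dictionary token for a spell-corrected one and records the original spelling in `corrected_tokens`. The core decision, shown in isolation below, is a sketch that assumes the `MorphologyService.correct_word(term, lang)` and `process(word, lang, None)` signatures used above; `try_correct` itself is a hypothetical helper.

 def try_correct(term: str, lang):
     # Ask the morphology engine for a corrected spelling of an unknown word.
     corw = MorphologyService.correct_word(term, lang)
     if corw is None:
         return None
     # Keep the correction only if it parses to a single analysis.
     ccc = MorphologyService.process(corw, lang, None)
     if ccc is None or len(ccc) != 1:
         return None
     return corw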
Code example #5
File: Termin.py Project: MihaJjDa/APCLtask
 def __init__(self,
              source: str = None,
              lang_: 'MorphLang' = None,
              source_is_normal: bool = False) -> None:
     """ Создать термин из строки с добавлением всех морфологических вариантов написания
     
     Args:
         source(str): строка
         lang_(MorphLang): возможный язык
         source_is_normal(bool): при true морфварианты не добавляются 
      (эквивалентно вызову InitByNormalText)
     """
     self.terms = list()
     self.additional_vars = None
     self.__m_canonic_text = None
     self.ignore_terms_order = False
     self.acronym = None
     self.acronym_smart = None
     self.acronym_can_be_lower = False
     self.abridges = None
     self.lang = MorphLang()
     self.tag = None
     self.tag2 = None
     if (source is None):
         return
     if (source_is_normal or Termin.ASSIGN_ALL_TEXTS_AS_NORMAL):
         self.initByNormalText(source, lang_)
         return
     toks = Morphology.process(source, lang_, None)
     if (toks is not None):
         i = 0
         while i < len(toks):
             tt = TextToken(toks[i], None)
             self.terms.append(Termin.Term(tt, not source_is_normal))
             i += 1
     self.lang = MorphLang(lang_)
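
The constructor also consults the class-level switch `Termin.ASSIGN_ALL_TEXTS_AS_NORMAL`, which forces every new instance onto the fast `initByNormalText` path. A sketch of how a loader of an already-normalized word list might use it; the file name is hypothetical.

 Termin.ASSIGN_ALL_TEXTS_AS_NORMAL = True   # every entry below is pre-normalized
 try:
     with open("terms.txt", encoding="utf-8") as f:   # hypothetical file
         termins = [Termin(line.strip()) for line in f if line.strip()]
 finally:
     Termin.ASSIGN_ALL_TEXTS_AS_NORMAL = False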
Code example #6
 def __deserialize_token(stream : Stream, kit : 'AnalysisKit', vers : int) -> 'Token':
     # Read a type code from the stream and rebuild the matching token subclass.
     from pullenti.ner.MetaToken import MetaToken
     from pullenti.ner.ReferentToken import ReferentToken
     typ = SerializerHelper.deserialize_short(stream)
     if (typ == (0)): 
         return None
     t = None
     if (typ == (1)): 
         t = (TextToken(None, kit))
     elif (typ == (2)): 
         t = (NumberToken(None, None, None, NumberSpellingType.DIGIT, kit))
     elif (typ == (3)): 
         t = (ReferentToken(None, None, None, kit))
     else: 
         t = (MetaToken(None, None, kit))
     t._deserialize(stream, kit, vers)
     if (isinstance(t, MetaToken)): 
         tt = SerializerHelper.deserialize_tokens(stream, kit, vers)
         if (tt is not None): 
             t._m_begin_token = tt
             while tt is not None: 
                 t._m_end_token = tt
                 tt = tt.next0_
     return t
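
`__deserialize_token` dispatches on a small type code: 0 ends the stream, 1 builds a TextToken, 2 a NumberToken, 3 a ReferentToken, and anything else falls back to a MetaToken. The same dispatch can be written as a lookup table; the sketch below is illustrative only and reuses just the constructor calls visible above.

 _TOKEN_FACTORIES = {
     1: lambda kit: TextToken(None, kit),
     2: lambda kit: NumberToken(None, None, None, NumberSpellingType.DIGIT, kit),
     3: lambda kit: ReferentToken(None, None, None, kit),
 }

 def _make_token(typ: int, kit):
     if typ == 0:
         return None            # end-of-stream marker
     factory = _TOKEN_FACTORIES.get(typ, lambda k: MetaToken(None, None, k))
     return factory(kit)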
Code example #7
 def __init__(self, sofa_ : 'SourceOfAnalysis'=None, only_tokenizing : bool=False,
              lang : 'MorphLang'=None, progress : EventHandler=None) -> None:
     # Tokenize the source text, apply the configured correction and merging
     # passes, and build the linked token chain starting at self.first_token.
     self._start_date = datetime.datetime(1, 1, 1, 0, 0, 0)
     self.corrected_tokens = None
     self.first_token = None
     self.__m_entities = list()
     self.ontology = None
     self.base_language = MorphLang()
     self.__m_sofa = None
     self.statistics = None
     self.__m_datas = dict()
     self.misc_data = dict()
     self.processor = None
     self.recurse_level = 0
     self._m_analyzer_stack = list()
     if (sofa_ is None): 
         return
     self.__m_sofa = sofa_
     self._start_date = datetime.datetime.now()
     tokens = Morphology.process(sofa_.text, lang, None)
     t0 = None
     if (tokens is not None): 
         ii = 0
         while ii < len(tokens): 
             mt = tokens[ii]
             if (mt.begin_char == 733860): 
                 # leftover debug breakpoint hook from the original source
                 pass
             tt = TextToken(mt, self)
             if (sofa_.correction_dict is not None): 
                 wrapcorw539 = RefOutArgWrapper(None)
                 inoutres540 = Utils.tryGetValue(sofa_.correction_dict, mt.term, wrapcorw539)
                 corw = wrapcorw539.value
                 if (inoutres540): 
                     ccc = Morphology.process(corw, lang, None)
                     if (ccc is not None and len(ccc) == 1): 
                         tt1 = TextToken._new538(ccc[0], self, tt.term)
                         tt1.begin_char = tt.begin_char
                         tt1.end_char = tt.end_char
                         tt1.chars = tt.chars
                         tt = tt1
                         if (self.corrected_tokens is None): 
                             self.corrected_tokens = dict()
                         self.corrected_tokens[tt] = tt.getSourceText()
             if (t0 is None): 
                 self.first_token = (tt)
             else: 
                 t0.next0_ = tt
             t0 = (tt)
             ii += 1
     if (sofa_.clear_dust): 
         self.__clearDust()
     if (sofa_.do_words_merging_by_morph): 
         self.__correctWordsByMerging(lang)
     if (sofa_.do_word_correction_by_morph): 
         self.__correctWordsByMorph(lang)
     self.__mergeLetters()
     self.__defineBaseLanguage()
     t = self.first_token
     first_pass2794 = True
     while True:
         if first_pass2794: first_pass2794 = False
         else: t = t.next0_
         if (not (t is not None)): break
         nt = NumberHelper._tryParseNumber(t)
         if (nt is None): 
             continue
         self.embedToken(nt)
         t = (nt)
     if (only_tokenizing): 
         return
     t = self.first_token
     first_pass2795 = True
     while True:
         if first_pass2795: first_pass2795 = False
         else: t = t.next0_
         if (not (t is not None)): break
         if (t.morph.class0_.is_preposition): 
             continue
         mc = t.getMorphClassInDictionary()
         if (mc.is_undefined and t.chars.is_cyrillic_letter and t.length_char > 4): 
             tail = sofa_.text[t.end_char - 1:t.end_char - 1+2]
             tte = None
             tt = t.previous
             if (tt is not None and ((tt.is_comma_and or tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction))): 
                 tt = tt.previous
             if ((tt is not None and not tt.getMorphClassInDictionary().is_undefined and (((tt.morph.class0_.value) & (t.morph.class0_.value))) != 0) and tt.length_char > 4): 
                 tail2 = sofa_.text[tt.end_char - 1:tt.end_char - 1+2]
                 if (tail2 == tail): 
                     tte = tt
             if (tte is None): 
                 tt = t.next0_
                 if (tt is not None and ((tt.is_comma_and or tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction))): 
                     tt = tt.next0_
                 if ((tt is not None and not tt.getMorphClassInDictionary().is_undefined and (((tt.morph.class0_.value) & (t.morph.class0_.value))) != 0) and tt.length_char > 4): 
                     tail2 = sofa_.text[tt.end_char - 1:tt.end_char - 1+2]
                     if (tail2 == tail): 
                         tte = tt
             if (tte is not None): 
                 t.morph.removeItemsEx(tte.morph, tte.getMorphClassInDictionary())
         continue
     self.__createStatistics()
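
Example #7 shows which `SourceOfAnalysis` switches drive the optional clean-up passes (`clear_dust`, `do_words_merging_by_morph`, `do_word_correction_by_morph`) and how `correction_dict` injects manual spelling fixes keyed by the uppercase token term. A hedged configuration sketch: the attribute names come from the code above, while the `SourceOfAnalysis(text)` constructor and the `AnalysisKit` class name are assumptions.

 sofa = SourceOfAnalysis("исходный тектс для анализа")   # constructor signature assumed
 sofa.clear_dust = True
 sofa.do_words_merging_by_morph = True
 sofa.do_word_correction_by_morph = True
 sofa.correction_dict = {"ТЕКТС": "ТЕКСТ"}               # original term -> corrected form
 kit = AnalysisKit(sofa, only_tokenizing=True)           # class name assumed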