Beispiel #1
0
 def __mergeLetters(self) -> None:
     """Collapse a run of space-separated single letters into one word token.

     Walks the token chain starting at ``self.first_token``.  A run starts at
     a one-letter text token preceded by more than two whitespaces (or by
     exactly two when the previous run was just merged into a dictionary
     word) and is extended over following one-character tokens that are each
     preceded by exactly one whitespace.  A run is only considered when it has
     more than three continuation tokens (more than one right after a merged
     word).  The joined text is passed to ``Morphology.process``; when that
     yields exactly one morph token with at least one dictionary word form,
     the whole run is replaced in the chain by a single new ``TextToken``
     spanning ``begin_char`` of the first letter to ``end_char`` of the last.

     Tokens that are not ``TextToken`` instances neither start nor extend a
     run; previously they raised ``AttributeError`` because the ``None``
     returned by ``Utils.asObjectOrNull`` was dereferenced.
     """
     before_word = False
     tmp = io.StringIO()
     t = self.first_token
     first_pass = True
     while True:
         # Emulates a C-style ``for`` loop: ``continue`` must still advance
         # to the next token, including after ``t`` was re-pointed below.
         if first_pass:
             first_pass = False
         else:
             t = t.next0_
         if t is None:
             break
         tt = Utils.asObjectOrNull(t, TextToken)
         if tt is None or not tt.chars.is_letter or tt.length_char != 1:
             before_word = False
             continue
         gap = t.whitespaces_before_count
         if not (gap > 2 or (gap == 2 and before_word)):
             before_word = False
             continue
         # Reuse one buffer for every candidate run.
         Utils.setLengthStringIO(tmp, 0)
         print(tt.getSourceText(), end="", file=tmp)
         extra = 0  # number of tokens appended after the first letter
         t1 = t
         while t1.next0_ is not None:
             tt = Utils.asObjectOrNull(t1.next0_, TextToken)
             if tt is None or tt.length_char != 1 or tt.whitespaces_before_count != 1:
                 break
             extra += 1
             print(tt.getSourceText(), end="", file=tmp)
             t1 = t1.next0_
         if not (extra > 3 or (extra > 1 and before_word)):
             before_word = False
             continue
         before_word = False
         mt = Morphology.process(Utils.toStringStringIO(tmp), None, None)
         if mt is None or len(mt) != 1:
             # Not a single word: resume scanning after the run.
             t = t1
             continue
         before_word = any(wf.is_in_dictionary for wf in mt[0].word_forms)
         if not before_word:
             t = t1
             continue
         # Splice the merged token in place of the tokens t .. t1.
         tt = TextToken(mt[0], self)
         if t == self.first_token:
             self.first_token = tt
         else:
             tt.previous = t.previous
         tt.next0_ = t1.next0_
         tt.begin_char = t.begin_char
         tt.end_char = t1.end_char
         t = tt
Beispiel #2
0
 def __correctWordsByMerging(self, lang : 'MorphLang') -> None:
     """Join two neighbouring word tokens whose concatenation is a dictionary word.

     For every token of at least two letters the following token is taken
     (stepping over one hyphen that is not followed by a line break).  The
     pair is merged when both are letter tokens of the same alphabet kind
     (Latin or not), the second one is all lower case and the first is not
     all upper case, neither carries the "прдктв." morphology attribute, at
     least one of them has an undefined dictionary class, their terms total
     six characters or more, and ``Morphology.process`` turns the
     concatenation into exactly one morph token with a defined dictionary
     class.  The new token replaces the pair in the chain.

     Args:
         lang(MorphLang): language handed to ``Morphology.process``
     """
     left = self.first_token
     advance = False
     while True:
         # Transpiled ``for`` loop: every ``continue`` has to step forward.
         if advance:
             left = left.next0_
         else:
             advance = True
         if left is None or left.next0_ is None:
             break
         if not left.chars.is_letter or left.length_char < 2:
             continue
         left_class = left.getMorphClassInDictionary()
         if left.morph.containsAttr("прдктв.", None):
             continue
         right = left.next0_
         if right.is_hiphen and right.next0_ is not None and not right.is_newline_after:
             right = right.next0_
         if right.length_char == 1:
             continue
         if not right.chars.is_letter or not left.chars.is_letter:
             continue
         if right.chars.is_latin_letter != left.chars.is_latin_letter:
             continue
         # Case filter: the second part must be all lower case and the first
         # part must not be all upper case.
         if right.chars.is_all_upper and not left.chars.is_all_upper:
             continue
         if not right.chars.is_all_lower:
             continue
         if left.chars.is_all_upper:
             continue
         if right.morph.containsAttr("прдктв.", None):
             continue
         right_class = right.getMorphClassInDictionary()
         # Two words that are both known to the dictionary are left alone.
         if not (right_class.is_undefined or left_class.is_undefined):
             continue
         if (len(left.term) + len(right.term)) < 6:
             continue
         joined = left.term + right.term
         parsed = Morphology.process(joined, lang, None)
         if parsed is None or len(parsed) != 1:
             continue
         # NOTE(review): both excluded words are four characters long, while
         # the guard above already requires at least six, so this comparison
         # looks unreachable - confirm the intended minimum length.
         if joined in ("ПОСТ", "ВРЕД"):
             continue
         merged = TextToken(parsed[0], self)
         if merged.getMorphClassInDictionary().is_undefined:
             continue
         merged.begin_char = left.begin_char
         merged.end_char = right.end_char
         merged.chars = left.chars
         # Re-link the chain around the merged token.
         if left == self.first_token:
             self.first_token = merged
         else:
             left.previous.next0_ = merged
         if right.next0_ is not None:
             merged.next0_ = right.next0_
         left = merged
Beispiel #3
0
 def __correctWordsByMorph(self, lang : 'MorphLang') -> None:
     """Replace unknown words by the spelling suggested by ``Morphology.correctWord``.

     Only ``TextToken`` items are considered, and only when they have an
     undefined dictionary class, are at least four characters long, are not
     all upper case, do not carry the "прдктв." attribute and are not a
     non-lower-case proper surname.  The suggested word is analysed with
     ``Morphology.process``; if that gives exactly one morph token which is
     not a proper surname, a new token takes the place of the original one
     and is recorded in ``self.corrected_tokens`` together with its source
     text.

     Args:
         lang(MorphLang): fallback language for the correction and the
             language used for re-analysis
     """
     cur = self.first_token
     started = False
     while True:
         # Transpiled ``for`` loop: every ``continue`` has to step forward.
         if started:
             cur = cur.next0_
         else:
             started = True
         if cur is None:
             break
         if not isinstance(cur, TextToken):
             continue
         if cur.morph.containsAttr("прдктв.", None):
             continue
         dict_class = cur.getMorphClassInDictionary()
         if not dict_class.is_undefined or cur.length_char < 4:
             continue
         if cur.morph.class0_.is_proper_surname and not cur.chars.is_all_lower:
             continue
         if cur.chars.is_all_upper:
             continue
         word = cur.term
         # Prefer the language detected for the token itself.
         if cur.morph.language.is_undefined:
             word_lang = lang
         else:
             word_lang = cur.morph.language
         fixed = Morphology.correctWord(word, word_lang)
         if fixed is None:
             continue
         parsed = Morphology.process(fixed, lang, None)
         if parsed is None or len(parsed) != 1:
             continue
         # The last argument is the uncorrected term (presumably kept so the
         # original spelling stays available - confirm against TextToken).
         replacement = TextToken._new541(parsed[0], self, cur.chars, cur.begin_char, cur.end_char, cur.term)
         if replacement.getMorphClassInDictionary().is_proper_surname:
             continue
         # Swap the token inside the chain.
         if cur == self.first_token:
             self.first_token = replacement
         else:
             cur.previous.next0_ = replacement
         replacement.next0_ = cur.next0_
         cur = replacement
         if self.corrected_tokens is None:
             self.corrected_tokens = {}
         self.corrected_tokens[cur] = cur.getSourceText()
Beispiel #4
0
 def __init__(self,
              source: str = None,
              lang_: 'MorphLang' = None,
              source_is_normal: bool = False) -> None:
     """Create a term from a string, adding all morphological spelling variants.

     Args:
         source(str): the string; when None only the default field values
             are set
         lang_(MorphLang): possible language
         source_is_normal(bool): when true no morphological variants are
             added (equivalent to calling initByNormalText)
     """
     # Default state of a freshly created term.
     self.terms = []
     self.additional_vars = None
     self.__m_canonic_text = None
     self.ignore_terms_order = False
     self.acronym = None
     self.acronym_smart = None
     self.acronym_can_be_lower = False
     self.abridges = None
     self.lang = MorphLang()
     self.tag = None
     self.tag2 = None
     if source is None:
         return
     if source_is_normal or Termin.ASSIGN_ALL_TEXTS_AS_NORMAL:
         # The text is already in normal form: delegate and stop here.
         self.initByNormalText(source, lang_)
         return
     morph_tokens = Morphology.process(source, lang_, None)
     if morph_tokens is not None:
         for morph_token in morph_tokens:
             text_token = TextToken(morph_token, None)
             self.terms.append(Termin.Term(text_token, not source_is_normal))
     self.lang = MorphLang(lang_)
Beispiel #5
0
 def __init__(self, sofa_ : 'SourceOfAnalysis'=None, only_tokenizing : bool=False, lang : 'MorphLang'=None, progress : EventHandler=None) -> None:
     """Tokenize the source text and build the initial token chain.

     Steps, in order: morphological tokenization of ``sofa_.text``; optional
     per-word replacement through ``sofa_.correction_dict``; the optional
     clean-up passes selected by the ``sofa_`` flags; letter merging; base
     language detection; embedding of parsed numbers.  Unless
     ``only_tokenizing`` is set, the morphology of unknown Cyrillic words is
     then narrowed using an agreeing neighbour and statistics are created.

     Args:
         sofa_(SourceOfAnalysis): the text source; when None only the fields
             are initialised
         only_tokenizing(bool): stop right after number embedding
         lang(MorphLang): language passed to ``Morphology.process`` and to
             the correction passes
         progress(EventHandler): not referenced by this constructor
     """
     self._start_date = datetime.datetime(1, 1, 1, 0, 0, 0)
     self.corrected_tokens = None
     self.first_token = None
     self.__m_entities = list()
     self.ontology = None
     self.base_language = MorphLang()
     self.__m_sofa = None
     self.statistics = None
     self.__m_datas = dict()
     self.misc_data = dict()
     self.processor = None
     self.recurse_level = 0
     self._m_analyzer_stack = list()
     if sofa_ is None:
         return
     self.__m_sofa = sofa_
     self._start_date = datetime.datetime.now()

     # Build the singly linked chain of text tokens.
     tokens = Morphology.process(sofa_.text, lang, None)
     t0 = None  # last token appended so far
     if tokens is not None:
         for mt in tokens:
             tt = TextToken(mt, self)
             if sofa_.correction_dict is not None:
                 corw_ref = RefOutArgWrapper(None)
                 found = Utils.tryGetValue(sofa_.correction_dict, mt.term, corw_ref)
                 corw = corw_ref.value
                 if found:
                     ccc = Morphology.process(corw, lang, None)
                     if ccc is not None and len(ccc) == 1:
                         # Keep the position and casing of the original
                         # token, but use the corrected morphology.
                         tt1 = TextToken._new538(ccc[0], self, tt.term)
                         tt1.begin_char = tt.begin_char
                         tt1.end_char = tt.end_char
                         tt1.chars = tt.chars
                         tt = tt1
                         if self.corrected_tokens is None:
                             self.corrected_tokens = dict()
                         self.corrected_tokens[tt] = tt.getSourceText()
             if t0 is None:
                 self.first_token = tt
             else:
                 t0.next0_ = tt
             t0 = tt

     if sofa_.clear_dust:
         self.__clearDust()
     if sofa_.do_words_merging_by_morph:
         self.__correctWordsByMerging(lang)
     if sofa_.do_word_correction_by_morph:
         self.__correctWordsByMorph(lang)
     self.__mergeLetters()
     self.__defineBaseLanguage()

     # Replace token sequences that parse as numbers by number tokens.
     t = self.first_token
     while t is not None:
         nt = NumberHelper._tryParseNumber(t)
         if nt is not None:
             self.embedToken(nt)
             t = nt  # continue after the embedded number token
         t = t.next0_
     if only_tokenizing:
         return

     # Narrow the morphology of unknown long Cyrillic words with the help of
     # a dictionary neighbour that shares a morph class bit and the same
     # two-character tail.
     t = self.first_token
     while t is not None:
         if not t.morph.class0_.is_preposition:
             mc = t.getMorphClassInDictionary()
             if mc.is_undefined and t.chars.is_cyrillic_letter and t.length_char > 4:
                 # Characters at end_char-1 and end_char (the word ending,
                 # assuming end_char indexes the last character - confirm).
                 tail = sofa_.text[t.end_char - 1:t.end_char + 1]
                 tte = None
                 # Look backwards first, then forwards.
                 for direction in ("previous", "next0_"):
                     tt = getattr(t, direction)
                     # Step over one connector (comma/"and", preposition or
                     # conjunction) between the two words.
                     if tt is not None and (tt.is_comma_and or tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction):
                         tt = getattr(tt, direction)
                     if (tt is not None
                             and not tt.getMorphClassInDictionary().is_undefined
                             and ((tt.morph.class0_.value) & (t.morph.class0_.value)) != 0
                             and tt.length_char > 4):
                         if sofa_.text[tt.end_char - 1:tt.end_char + 1] == tail:
                             tte = tt
                             break
                 if tte is not None:
                     t.morph.removeItemsEx(tte.morph, tte.getMorphClassInDictionary())
         t = t.next0_
     self.__createStatistics()