Esempio n. 1
0
 def __merge_letters(self) -> None:
     """Merge runs of spaced-out single characters into one word token.

     Walks the token chain from ``first_token``. Every token is offered to
     ``__merge_letter_run``; when a run is replaced by a merged token the
     scan resumes after that merged token, otherwise after whatever token
     the helper reports.
     """
     # True only for the iteration that directly follows a successful merge;
     # it relaxes the gap and run-length thresholds for the next candidate.
     before_word = False
     t = self.first_token
     while t is not None:
         t, before_word = self.__merge_letter_run(t, before_word)
         t = t.next0_

 def __merge_letter_run(self, t, before_word: bool):
     """Try to replace the spaced-letter run starting at ``t`` by one token.

     A run starts at a one-character letter token preceded by more than two
     whitespaces (or exactly two when ``before_word`` is set) and extends
     over the following one-character tokens that are each preceded by
     exactly one whitespace. The run is merged only when it has more than
     three following characters (more than one when ``before_word`` is set),
     ``MorphologyService.process`` returns exactly one morph token for the
     concatenated text, and at least one of its word forms is flagged
     ``is_in_dictionary``.

     Returns:
         A ``(resume, merged)`` pair: the token after which the caller
         continues scanning, and whether a merge took place.
     """
     head = Utils.asObjectOrNull(t, TextToken)
     # The cast yields None for non-text tokens; treat them as "no run here"
     # instead of failing on attribute access.
     if head is None or not head.chars.is_letter or head.length_char != 1:
         return t, False
     gap = t.whitespaces_before_count
     if not (gap > 2 or (gap == 2 and before_word)):
         return t, False

     buf = io.StringIO()
     print(head.get_source_text(), end="", file=buf)
     extra = 0  # number of characters appended after the first one
     last = t
     while last.next0_ is not None:
         nxt = Utils.asObjectOrNull(last.next0_, TextToken)
         if (nxt is None or nxt.length_char != 1
                 or nxt.whitespaces_before_count != 1):
             break
         extra += 1
         print(nxt.get_source_text(), end="", file=buf)
         last = last.next0_
     if not (extra > 3 or (extra > 1 and before_word)):
         # Too short: resume right after ``t`` so later letters are retried.
         return t, False

     mt = MorphologyService.process(Utils.toStringStringIO(buf), None, None)
     if mt is None or len(mt) != 1:
         # Not a single word: skip the whole run.
         return last, False
     if not any(wf.is_in_dictionary for wf in mt[0].word_forms):
         return last, False

     merged = TextToken(mt[0], self, t.begin_char, last.end_char)
     if t == self.first_token:
         self.first_token = merged
     else:
         merged.previous = t.previous
     merged.next0_ = last.next0_
     return merged, True
Esempio n. 2
0
 def __correct_words_by_merging(self, lang: 'MorphLang') -> None:
     """Rejoin words that were split across two neighbouring tokens.

     Walks the chain for as long as the current token has a successor. Each
     token is offered to ``__glue_with_following``; when that returns a
     merged token, scanning resumes after the merged token.

     Args:
         lang: language passed through to ``MorphologyService.process``.
     """
     cur = self.first_token
     while cur is not None and cur.next0_ is not None:
         glued = self.__glue_with_following(cur, lang)
         if glued is not None:
             cur = glued
         cur = cur.next0_

 def __glue_with_following(self, left, lang: 'MorphLang'):
     """Merge ``left`` with the token after it when the pair forms one word.

     The partner is the next token, or the one after a hyphen when that
     hyphen has a successor and ``is_newline_after`` is false for it.

     Returns:
         The new ``TextToken`` already linked into the chain, or ``None``
         when any of the checks below rejects the pair.
     """
     if not left.chars.is_letter or left.length_char < 2:
         return None
     left_class = left.get_morph_class_in_dictionary()
     if left.morph.contains_attr("прдктв.", None):
         return None

     right = left.next0_
     if (right.is_hiphen and right.next0_ is not None
             and not right.is_newline_after):
         right = right.next0_
     if right.length_char == 1:
         return None
     # Both halves must be letters of the same (Latin / non-Latin) kind.
     if (not right.chars.is_letter or not left.chars.is_letter
             or right.chars.is_latin_letter != left.chars.is_latin_letter):
         return None
     # Only an all-lower right half after a not-all-upper left half passes.
     if ((right.chars.is_all_upper and not left.chars.is_all_upper)
             or not right.chars.is_all_lower
             or left.chars.is_all_upper):
         return None
     if right.morph.contains_attr("прдктв.", None):
         return None

     right_class = right.get_morph_class_in_dictionary()
     # At least one half must be unknown to the dictionary.
     if not (right_class.is_undefined or left_class.is_undefined):
         return None
     if len(left.term) + len(right.term) < 6:
         return None

     joined = left.term + right.term
     parsed = MorphologyService.process(joined, lang, None)
     if parsed is None or len(parsed) != 1:
         return None
     # NOTE(review): both stop words are 4 characters long, so the
     # length gate above already excludes them; kept for parity.
     if joined in ("ПОСТ", "ВРЕД"):
         return None

     merged = TextToken(parsed[0], self, left.begin_char, right.end_char)
     if merged.get_morph_class_in_dictionary().is_undefined:
         return None
     merged.chars = left.chars

     if left == self.first_token:
         self.first_token = merged
     else:
         left.previous.next0_ = merged
     following = right.next0_
     if following is not None:
         merged.next0_ = following
     return merged
Esempio n. 3
0
 def __correct_words_by_morph(self, lang: 'MorphLang') -> None:
     """Replace unknown words by the spelling the morphology service suggests.

     Walks the whole chain. Each token is offered to
     ``__respell_unknown_word``; a returned replacement becomes the current
     token and is recorded in ``corrected_tokens`` (created on first use)
     together with its source text.

     Args:
         lang: fallback language for the correction and the language used
             to re-analyse the corrected word.
     """
     cur = self.first_token
     while cur is not None:
         fixed = self.__respell_unknown_word(cur, lang)
         if fixed is not None:
             cur = fixed
             if self.corrected_tokens is None:
                 self.corrected_tokens = dict()
             self.corrected_tokens[cur] = cur.get_source_text()
         cur = cur.next0_

 def __respell_unknown_word(self, token, lang: 'MorphLang'):
     """Build and link a corrected replacement for ``token`` if one applies.

     Returns:
         The replacement ``TextToken`` (already spliced into the chain in
         place of ``token``), or ``None`` when the token is left as is.
     """
     if not isinstance(token, TextToken):
         return None
     if token.morph.contains_attr("прдктв.", None):
         return None
     in_dictionary = token.get_morph_class_in_dictionary()
     # Only dictionary-unknown words of at least four characters qualify.
     if not in_dictionary.is_undefined or token.length_char < 4:
         return None
     if (token.morph.class0_.is_proper_surname
             and not token.chars.is_all_lower):
         return None
     if token.chars.is_all_upper:
         return None

     # Prefer the token's own language; fall back to the caller's.
     if token.morph.language.is_undefined:
         word_lang = lang
     else:
         word_lang = token.morph.language
     suggestion = MorphologyService.correct_word(token.term, word_lang)
     if suggestion is None:
         return None
     parsed = MorphologyService.process(suggestion, lang, None)
     if parsed is None or len(parsed) != 1:
         return None

     fixed = TextToken._new473(parsed[0], self, token.begin_char,
                               token.end_char, token.chars, token.term)
     if fixed.get_morph_class_in_dictionary().is_proper_surname:
         return None

     if token == self.first_token:
         self.first_token = fixed
     else:
         token.previous.next0_ = fixed
     fixed.next0_ = token.next0_
     return fixed
Esempio n. 4
0
 def __init__(self,
              sofa_: 'SourceOfAnalysis' = None,
              only_tokenizing: bool = False,
              lang: 'MorphLang' = None,
              progress: EventHandler = None) -> None:
     """Reset the kit and, when a source is given, tokenize and pre-process it.

     All attributes are first set to empty defaults. With ``sofa_`` equal to
     ``None`` nothing else happens. Otherwise the text is tokenized into a
     ``TextToken`` chain and the following passes run in this order:
     ``__clear_dust`` (if ``sofa_.clear_dust``),
     ``__correct_words_by_merging`` (if ``sofa_.do_words_merging_by_morph``),
     ``__correct_words_by_morph`` (if ``sofa_.do_word_correction_by_morph``),
     ``__merge_letters``, ``__define_base_language`` and number-token
     embedding (if ``sofa_.create_number_tokens``). Unless
     ``only_tokenizing`` is set, the morphology of unknown Cyrillic words is
     then refined from neighbouring words and ``__create_statistics`` runs.

     Args:
         sofa_: source of analysis; ``None`` leaves the kit empty.
         only_tokenizing: stop after tokenization and the token-level
             passes, skipping morphology refinement and statistics.
         lang: language passed to ``MorphologyService.process`` and to the
             correction passes.
         progress: not read by this constructor.
     """
     self._start_date = datetime.datetime(1, 1, 1, 0, 0, 0)
     self.corrected_tokens = None
     self.first_token = None
     self.__m_entities = list()
     self.ontology = None
     self.base_language = MorphLang()
     self.__m_sofa = None
     self.statistics = None
     self.__m_datas = dict()
     self.misc_data = dict()
     self.processor = None
     self.recurse_level = 0
     self._m_analyzer_stack = list()
     self.onto_regime = False
     if sofa_ is None:
         return

     self.__m_sofa = sofa_
     self._start_date = datetime.datetime.now()
     self.__build_token_chain(sofa_, lang)

     if sofa_.clear_dust:
         self.__clear_dust()
     if sofa_.do_words_merging_by_morph:
         self.__correct_words_by_merging(lang)
     if sofa_.do_word_correction_by_morph:
         self.__correct_words_by_morph(lang)
     self.__merge_letters()
     self.__define_base_language()
     if sofa_.create_number_tokens:
         self.__embed_number_tokens()
     if only_tokenizing:
         return

     t = self.first_token
     while t is not None:
         self.__narrow_unknown_word_morph(t, sofa_)
         t = t.next0_
     self.__create_statistics()

 def __build_token_chain(self, sofa_, lang) -> None:
     """Tokenize ``sofa_.text`` and link the tokens from ``first_token``.

     When ``sofa_.correction_dict`` holds an entry for a token's term and
     the replacement analyses to exactly one morph token, the corrected
     token (same span and chars, original term kept) is used instead and
     recorded in ``corrected_tokens``.
     """
     tokens = MorphologyService.process(sofa_.text, lang, None)
     if tokens is None:
         return
     tail = None  # last token linked so far
     for mt in tokens:
         tt = TextToken(mt, self)
         if sofa_.correction_dict is not None:
             found = RefOutArgWrapper(None)
             if Utils.tryGetValue(sofa_.correction_dict, mt.term, found):
                 ccc = MorphologyService.process(found.value, lang, None)
                 if ccc is not None and len(ccc) == 1:
                     fixed = TextToken._new470(ccc[0], self,
                                               tt.begin_char, tt.end_char,
                                               tt.term)
                     fixed.chars = tt.chars
                     tt = fixed
                     if self.corrected_tokens is None:
                         self.corrected_tokens = dict()
                     self.corrected_tokens[tt] = tt.get_source_text()
         if tail is None:
             self.first_token = tt
         else:
             tail.next0_ = tt
         tail = tt

 def __embed_number_tokens(self) -> None:
     """Embed every number token ``NumberHelper`` can parse from the chain."""
     t = self.first_token
     while t is not None:
         nt = NumberHelper._try_parse_number(t)
         if nt is not None:
             self.embed_token(nt)
             # Continue after the embedded token, not inside it.
             t = nt
         t = t.next0_

 def __narrow_unknown_word_morph(self, t, sofa_) -> None:
     """Refine the morph of an unknown Cyrillic word from a neighbour.

     Applies to non-preposition tokens longer than four characters whose
     dictionary class is undefined. The previous side is tried first, then
     the next side; a matching neighbour's morph and dictionary class are
     handed to ``t.morph.remove_items_ex``.
     """
     if t.morph.class0_.is_preposition:
         return
     mc = t.get_morph_class_in_dictionary()
     if not (mc.is_undefined and t.chars.is_cyrillic_letter
             and t.length_char > 4):
         return
     # Two-character slice starting one position before ``end_char``.
     tail = sofa_.text[t.end_char - 1:t.end_char - 1 + 2]
     model = self.__find_same_ending_neighbour(t, tail, sofa_, False)
     if model is None:
         model = self.__find_same_ending_neighbour(t, tail, sofa_, True)
     if model is not None:
         t.morph.remove_items_ex(
             model.morph, model.get_morph_class_in_dictionary())

 def __find_same_ending_neighbour(self, t, tail, sofa_, forward: bool):
     """Return a known neighbour of ``t`` whose two-character ending is ``tail``.

     Looks at the next (``forward``) or previous token, stepping once more
     over a token that is ``is_comma_and``, a preposition or a conjunction.
     The candidate must be known to the dictionary, share at least one
     class bit with ``t`` and be longer than four characters; otherwise
     ``None`` is returned.
     """
     nb = t.next0_ if forward else t.previous
     if (nb is not None and
             (nb.is_comma_and or nb.morph.class0_.is_preposition
              or nb.morph.class0_.is_conjunction)):
         nb = nb.next0_ if forward else nb.previous
     if (nb is None
             or nb.get_morph_class_in_dictionary().is_undefined
             or (nb.morph.class0_.value & t.morph.class0_.value) == 0
             or nb.length_char <= 4):
         return None
     if sofa_.text[nb.end_char - 1:nb.end_char - 1 + 2] != tail:
         return None
     return nb