def __correctWordsByMerging(self, lang : 'MorphLang') -> None:
    """Glue a word that was split into two adjacent tokens back into one token.

    Walks the token chain starting at ``self.first_token``. For every token
    that has a successor, the token's term and the term of the following
    token (or of the token after an intermediate hyphen) are concatenated.
    When ``Morphology.process`` returns exactly one token for the glued text
    and that token has a defined dictionary class, the pair is replaced in
    the chain by a single new ``TextToken`` spanning both originals.

    Args:
        lang: language value passed through to ``Morphology.process``.
    """

    def glue_with_next(head):
        """Return ``(merged, tail)`` if ``head`` can be glued, else ``None``."""
        if not head.chars.is_letter or head.length_char < 2:
            return None
        head_class = head.getMorphClassInDictionary()
        if head.morph.containsAttr("прдктв.", None):
            return None

        tail = head.next0_
        # A hyphen between the two halves is stepped over, unless it is the
        # last token or a line break follows it.
        if tail.is_hiphen and tail.next0_ is not None and not tail.is_newline_after:
            tail = tail.next0_
        if tail.length_char == 1:
            return None
        if not tail.chars.is_letter or not head.chars.is_letter:
            return None
        if tail.chars.is_latin_letter != head.chars.is_latin_letter:
            return None

        # Case filter: only pairs whose tail is entirely lower case and whose
        # head is not entirely upper case get through.
        if tail.chars.is_all_upper and not head.chars.is_all_upper:
            return None
        if not tail.chars.is_all_lower or head.chars.is_all_upper:
            return None

        if tail.morph.containsAttr("прдктв.", None):
            return None
        tail_class = tail.getMorphClassInDictionary()
        # Pairs in which both halves have a defined dictionary class are
        # left alone.
        if not (tail_class.is_undefined or head_class.is_undefined):
            return None
        if len(head.term) + len(tail.term) < 6:
            return None

        glued = head.term + tail.term
        variants = Morphology.process(glued, lang, None)
        if variants is None or len(variants) != 1:
            return None
        # NOTE(review): ``glued`` is at least 6 characters long at this point,
        # so these 4-letter comparisons cannot match; kept to preserve the
        # original logic.
        if glued == "ПОСТ" or glued == "ВРЕД":
            return None

        merged = TextToken(variants[0], self)
        if merged.getMorphClassInDictionary().is_undefined:
            return None
        return merged, tail

    cur = self.first_token
    while cur is not None and cur.next0_ is not None:
        glued_pair = glue_with_next(cur)
        if glued_pair is not None:
            merged, tail = glued_pair
            # The new token covers the text from the start of the first
            # half to the end of the second half.
            merged.begin_char = cur.begin_char
            merged.end_char = tail.end_char
            merged.chars = cur.chars
            # Splice the merged token into the chain in place of the pair.
            if cur == self.first_token:
                self.first_token = merged
            else:
                cur.previous.next0_ = merged
            if tail.next0_ is not None:
                merged.next0_ = tail.next0_
            cur = merged
        cur = cur.next0_
def __init__(self, sofa_ : 'SourceOfAnalysis'=None, only_tokenizing : bool=False, lang : 'MorphLang'=None, progress : EventHandler=None) -> None:
    """Initialise the kit and, if a source is given, build its token chain.

    All instance fields are reset first. When ``sofa_`` is ``None`` the
    constructor stops there. Otherwise it:

    1. runs ``Morphology.process`` over ``sofa_.text`` and links one
       ``TextToken`` per morph token, applying ``sofa_.correction_dict``
       replacements where the replacement yields exactly one morph token;
    2. runs the optional clean-up passes selected by the ``sofa_`` flags
       (``clear_dust``, ``do_words_merging_by_morph``,
       ``do_word_correction_by_morph``), then letter merging and base
       language detection;
    3. embeds every number token recognised by
       ``NumberHelper._tryParseNumber``;
    4. unless ``only_tokenizing`` is set, adjusts the morphology of
       tokens with an undefined dictionary class using a neighbouring
       token with the same ending, and finally builds the statistics.

    Args:
        sofa_: source of analysis; ``None`` produces an empty kit.
        only_tokenizing: when true, return right after number embedding.
        lang: language value passed through to ``Morphology.process``.
        progress: accepted for interface compatibility; not used in this
            constructor body.
    """
    self._start_date = datetime.datetime(1, 1, 1, 0, 0, 0)
    self.corrected_tokens = None
    self.first_token = None
    self.__m_entities = []
    self.ontology = None
    self.base_language = MorphLang()
    self.__m_sofa = None
    self.statistics = None
    self.__m_datas = {}
    self.misc_data = {}
    self.processor = None
    self.recurse_level = 0
    self._m_analyzer_stack = []
    if sofa_ is None:
        return

    self.__m_sofa = sofa_
    self._start_date = datetime.datetime.now()

    # Build the linked chain of text tokens from the morphology output.
    morph_tokens = Morphology.process(sofa_.text, lang, None)
    last = None
    if morph_tokens is not None:
        for morph_token in morph_tokens:
            token = TextToken(morph_token, self)
            if sofa_.correction_dict is not None:
                out_arg = RefOutArgWrapper(None)
                found = Utils.tryGetValue(sofa_.correction_dict, morph_token.term, out_arg)
                replacement = out_arg.value
                if found:
                    variants = Morphology.process(replacement, lang, None)
                    if variants is not None and len(variants) == 1:
                        # The corrected token keeps the position and
                        # character info of the token it replaces.
                        fixed = TextToken._new538(variants[0], self, token.term)
                        fixed.begin_char = token.begin_char
                        fixed.end_char = token.end_char
                        fixed.chars = token.chars
                        token = fixed
                        if self.corrected_tokens is None:
                            self.corrected_tokens = {}
                        self.corrected_tokens[token] = token.getSourceText()
            if last is None:
                self.first_token = token
            else:
                last.next0_ = token
            last = token

    # Optional clean-up passes requested by the source, then the
    # unconditional ones.
    if sofa_.clear_dust:
        self.__clearDust()
    if sofa_.do_words_merging_by_morph:
        self.__correctWordsByMerging(lang)
    if sofa_.do_word_correction_by_morph:
        self.__correctWordsByMorph(lang)
    self.__mergeLetters()
    self.__defineBaseLanguage()

    # Embed recognised numbers; scanning resumes after the embedded token.
    t = self.first_token
    while t is not None:
        number = NumberHelper._tryParseNumber(t)
        if number is not None:
            self.embedToken(number)
            t = number
        t = t.next0_

    if only_tokenizing:
        return

    def ending_of(token):
        """Return the source characters at ``end_char - 1`` and ``end_char``."""
        return sofa_.text[token.end_char - 1:token.end_char + 1]

    def agreeing_neighbour(unknown, ending, link):
        """Find a neighbour of ``unknown`` along ``link`` with the same ending.

        ``link`` is the attribute name used to step (``"previous"`` or
        ``"next0_"``). One token flagged ``is_comma_and``, or classified as
        a preposition or conjunction, is stepped over. The neighbour must
        have a defined dictionary class, share at least one morph class
        bit with ``unknown``, be longer than 4 characters and end with
        ``ending``; otherwise ``None`` is returned.
        """
        cand = getattr(unknown, link)
        if cand is not None and (cand.is_comma_and or cand.morph.class0_.is_preposition or cand.morph.class0_.is_conjunction):
            cand = getattr(cand, link)
        if cand is None:
            return None
        if cand.getMorphClassInDictionary().is_undefined:
            return None
        if (cand.morph.class0_.value & unknown.morph.class0_.value) == 0:
            return None
        if cand.length_char <= 4:
            return None
        if ending_of(cand) != ending:
            return None
        return cand

    # For Cyrillic tokens longer than 4 characters whose dictionary class
    # is undefined, look for a matching neighbour (previous first, then
    # next) and pass its morphology to removeItemsEx.
    t = self.first_token
    while t is not None:
        if not t.morph.class0_.is_preposition:
            dict_class = t.getMorphClassInDictionary()
            if dict_class.is_undefined and t.chars.is_cyrillic_letter and t.length_char > 4:
                ending = ending_of(t)
                model = agreeing_neighbour(t, ending, "previous")
                if model is None:
                    model = agreeing_neighbour(t, ending, "next0_")
                if model is not None:
                    t.morph.removeItemsEx(model.morph, model.getMorphClassInDictionary())
        t = t.next0_

    self.__createStatistics()