def __merge_letters(self) -> None:
    # Glue runs of single spaced-out letters (e.g. "П Р И К А З") back into
    # one word token when morphology recognizes the merged word.
    before_word = False
    tmp = io.StringIO()
    t = self.first_token
    first_pass3055 = True
    # Emulates a C#-style for loop over the token chain: the advance step
    # runs at the top of every iteration except the first, so 'continue'
    # still advances t.
    while True:
        if first_pass3055: first_pass3055 = False
        else: t = t.next0_
        if (not (t is not None)): break
        tt = Utils.asObjectOrNull(t, TextToken)
        # NOTE(review): no None check after the cast — if t is not a
        # TextToken this raises AttributeError; presumably all tokens are
        # still TextToken at this stage of initialization — confirm.
        if (not tt.chars.is_letter or tt.length_char != 1):
            before_word = False
            continue
        i = t.whitespaces_before_count
        # A candidate run must start after a wide gap (>2 spaces), or a
        # 2-space gap when the previous run merged into a dictionary word.
        if (i > 2 or ((i == 2 and before_word))):
            pass
        else:
            before_word = False
            continue
        i = 0
        t1 = None
        Utils.setLengthStringIO(tmp, 0)
        print(tt.get_source_text(), end="", file=tmp)
        t1 = t
        # Collect the run of single letters separated by exactly one space;
        # i counts the letters after the first, t1 tracks the run's last token.
        while t1.next0_ is not None:
            tt = (Utils.asObjectOrNull(t1.next0_, TextToken))
            if (tt.length_char != 1 or tt.whitespaces_before_count != 1):
                break
            i += 1
            print(tt.get_source_text(), end="", file=tmp)
            t1 = t1.next0_
        # Require at least 5 letters, or 3 when the previous run merged OK.
        if (i > 3 or ((i > 1 and before_word))):
            pass
        else:
            before_word = False
            continue
        before_word = False
        # Ask morphology whether the concatenation is a single known word.
        mt = MorphologyService.process(Utils.toStringStringIO(tmp), None, None)
        if (mt is None or len(mt) != 1):
            t = t1
            continue
        for wf in mt[0].word_forms:
            if (wf.is_in_dictionary):
                before_word = True
                break
        if (not before_word):
            t = t1
            continue
        # Replace the letter run [t..t1] with a single merged TextToken
        # spliced into the token chain.
        tt = TextToken(mt[0], self, t.begin_char, t1.end_char)
        if (t == self.first_token):
            self.first_token = (tt)
        else:
            tt.previous = t.previous
        tt.next0_ = t1.next0_
        t = (tt)
def __correct_words_by_merging(self, lang: 'MorphLang') -> None:
    # Merge an accidentally split word ("при вет", "при- вет") into a single
    # token when the concatenation is a dictionary word and at least one of
    # the parts is not.
    t = self.first_token
    first_pass3053 = True
    # C#-style for-loop emulation: the advance runs at loop top except on
    # the first pass, so 'continue' still moves to the next token.
    while True:
        if first_pass3053: first_pass3053 = False
        else: t = t.next0_
        if (not (t is not None and t.next0_ is not None)): break
        if (not t.chars.is_letter or (t.length_char < 2)): continue
        mc0 = t.get_morph_class_in_dictionary()
        # "прдктв." marks productively derived word forms — skip them.
        if (t.morph.contains_attr("прдктв.", None)): continue
        t1 = t.next0_
        # Allow an intervening hyphen (but not one that ends a line).
        if (t1.is_hiphen and t1.next0_ is not None and not t1.is_newline_after):
            t1 = t1.next0_
        if (t1.length_char == 1): continue
        # Both parts must be letters of the same alphabet (both Latin or
        # both Cyrillic).
        if (not t1.chars.is_letter or not t.chars.is_letter or t1.chars.is_latin_letter != t.chars.is_latin_letter): continue
        # Case compatibility between the two parts.
        if (t1.chars.is_all_upper and not t.chars.is_all_upper): continue
        elif (not t1.chars.is_all_lower): continue
        elif (t.chars.is_all_upper): continue
        if (t1.morph.contains_attr("прдктв.", None)): continue
        mc1 = t1.get_morph_class_in_dictionary()
        # Merge only if at least one part is unknown to the dictionary.
        if (not mc1.is_undefined and not mc0.is_undefined): continue
        if ((len(t.term) + len(t1.term)) < 6): continue
        corw = t.term + t1.term
        ccc = MorphologyService.process(corw, lang, None)
        if (ccc is None or len(ccc) != 1): continue
        # Known false positives — never merge into these words.
        if (corw == "ПОСТ" or corw == "ВРЕД"): continue
        tt = TextToken(ccc[0], self, t.begin_char, t1.end_char)
        # The merged word itself must be in the dictionary.
        if (tt.get_morph_class_in_dictionary().is_undefined): continue
        tt.chars = t.chars
        # Splice the merged token into the chain in place of t..t1.
        if (t == self.first_token):
            self.first_token = (tt)
        else:
            t.previous.next0_ = tt
        if (t1.next0_ is not None):
            tt.next0_ = t1.next0_
        t = (tt)
def initByNormalText(self, text: str, lang_: 'MorphLang'=None) -> None:
    """ Fast initialization without morphological variants: the text is only
    tokenized. Used to speed up dictionary work when the input strings are
    already known to be normalized.

    Args:
        text(str): source text, already normalized
        lang_(MorphLang): possible language
    """
    if (Utils.isNullOrEmpty(text)):
        return
    text = text.upper()
    if ("'" in text):
        text = text.replace("'", "")
    # Classify the content: letters only, letters + spaces, or anything else.
    has_other = False
    has_space = False
    for ch in text:
        if str.isalpha(ch):
            continue
        if (ch == ' '):
            has_space = True
        else:
            has_other = True
            break
    if (has_other):
        # Mixed content: fall back to the full tokenizer.
        toks = Morphology.tokenize(text)
        if (toks is not None):
            for tok_ in toks:
                self.terms.append(Termin.Term(TextToken(tok_, None), False))
    elif (has_space):
        # Letters and spaces only: split into words by hand.
        for w in Utils.splitString(text, ' ', False):
            if (Utils.isNullOrEmpty(w)):
                continue
            tt = TextToken(None, None)
            tt.term = w
            self.terms.append(Termin.Term(tt, False))
    else:
        # A single pure-letter word: one term, no tokenization needed.
        tt = TextToken(None, None)
        tt.term = text
        self.terms.append(Termin.Term(tt, False))
    self.lang = MorphLang(lang_)
def __correct_words_by_morph(self, lang: 'MorphLang') -> None:
    # Try to fix probable typos: for words absent from the morphology
    # dictionary, ask the service for a corrected spelling and replace the
    # token when exactly one candidate comes back.
    tt = self.first_token
    first_pass3054 = True
    # C#-style for-loop emulation: advance at loop top except on first pass.
    while True:
        if first_pass3054: first_pass3054 = False
        else: tt = tt.next0_
        if (not (tt is not None)): break
        if (not (isinstance(tt, TextToken))): continue
        # "прдктв." marks productively derived forms — not typos.
        if (tt.morph.contains_attr("прдктв.", None)): continue
        dd = tt.get_morph_class_in_dictionary()
        # Only unknown words of 4+ characters are correction candidates.
        if (not dd.is_undefined or (tt.length_char < 4)): continue
        if (tt.morph.class0_.is_proper_surname and not tt.chars.is_all_lower): continue
        # All-caps words are likely acronyms — leave them alone.
        if (tt.chars.is_all_upper): continue
        corw = MorphologyService.correct_word(
            tt.term, (lang if tt.morph.language.is_undefined else tt.morph.language))
        if (corw is None): continue
        # Accept only an unambiguous (single-token) correction.
        ccc = MorphologyService.process(corw, lang, None)
        if (ccc is None or len(ccc) != 1): continue
        # Corrected token keeps the original position, chars and term.
        tt1 = TextToken._new473(ccc[0], self, tt.begin_char, tt.end_char, tt.chars, tt.term)
        mc = tt1.get_morph_class_in_dictionary()
        # Don't "correct" a word into a surname — too error-prone.
        if (mc.is_proper_surname): continue
        # Splice the corrected token into the chain in place of tt.
        if (tt == self.first_token):
            self.first_token = (tt1)
        else:
            tt.previous.next0_ = tt1
        tt1.next0_ = tt.next0_
        tt = (tt1)
        # Remember the original source text of every corrected token.
        if (self.corrected_tokens is None):
            self.corrected_tokens = dict()
        self.corrected_tokens[tt] = tt.get_source_text()
def __init__(self, source: str=None, lang_: 'MorphLang'=None, source_is_normal: bool=False) -> None:
    """ Create a term from a string, adding all morphological spelling variants.

    Args:
        source(str): the source string
        lang_(MorphLang): possible language
        source_is_normal(bool): when True, morph variants are not added
            (equivalent to calling initByNormalText)
    """
    # Scalar attributes.
    self.additional_vars = None
    self.__m_canonic_text = None
    self.ignore_terms_order = False
    self.acronym = None
    self.acronym_smart = None
    self.acronym_can_be_lower = False
    self.abridges = None
    self.tag = None
    self.tag2 = None
    # Containers and defaults.
    self.terms = list()
    self.lang = MorphLang()
    if (source is None):
        return
    if (source_is_normal or Termin.ASSIGN_ALL_TEXTS_AS_NORMAL):
        # Fast path: tokenize only, skip morph variants.
        self.initByNormalText(source, lang_)
        return
    toks = Morphology.process(source, lang_, None)
    if (toks is not None):
        for tok_ in toks:
            self.terms.append(Termin.Term(TextToken(tok_, None), not source_is_normal))
    self.lang = MorphLang(lang_)
def __deserialize_token(stream : Stream, kit : 'AnalysisKit', vers : int) -> 'Token':
    """ Read one token (plus its inner chain for MetaToken) from the stream. """
    # Local imports break a circular dependency at module load time.
    from pullenti.ner.MetaToken import MetaToken
    from pullenti.ner.ReferentToken import ReferentToken
    typ = SerializerHelper.deserialize_short(stream)
    if (typ == 0):
        return None
    # Type tag -> concrete token class.
    if (typ == 1):
        tok = TextToken(None, kit)
    elif (typ == 2):
        tok = NumberToken(None, None, None, NumberSpellingType.DIGIT, kit)
    elif (typ == 3):
        tok = ReferentToken(None, None, None, kit)
    else:
        tok = MetaToken(None, None, kit)
    tok._deserialize(stream, kit, vers)
    if (isinstance(tok, MetaToken)):
        # A MetaToken also carries its internal token chain.
        inner = SerializerHelper.deserialize_tokens(stream, kit, vers)
        if (inner is not None):
            tok._m_begin_token = inner
            # Walk to the end of the chain so _m_end_token is the last one.
            while inner is not None:
                tok._m_end_token = inner
                inner = inner.next0_
    return tok
def __init__(self, sofa_ : 'SourceOfAnalysis'=None, only_tokenizing : bool=False, lang : 'MorphLang'=None, progress : EventHandler=None) -> None:
    # Build the analysis kit: tokenize the source text, apply optional
    # cleanup passes (dust clearing, word merging/correction, letter
    # merging), parse numbers, and — unless only_tokenizing — run a morph
    # narrowing heuristic and collect statistics.
    self._start_date = datetime.datetime(1, 1, 1, 0, 0, 0)
    self.corrected_tokens = None
    self.first_token = None;
    self.__m_entities = list()
    self.ontology = None;
    self.base_language = MorphLang()
    self.__m_sofa = None;
    self.statistics = None;
    self.__m_datas = dict()
    self.misc_data = dict()
    self.processor = None;
    self.recurse_level = 0
    self._m_analyzer_stack = list()
    if (sofa_ is None):
        return
    self.__m_sofa = sofa_
    self._start_date = datetime.datetime.now()
    tokens = Morphology.process(sofa_.text, lang, None)
    t0 = None
    if (tokens is not None):
        ii = 0
        while ii < len(tokens):
            mt = tokens[ii]
            # NOTE(review): looks like a debugging leftover (breakpoint
            # anchor for a specific character position); no runtime effect.
            if (mt.begin_char == 733860):
                pass
            tt = TextToken(mt, self)
            if (sofa_.correction_dict is not None):
                # Apply a caller-supplied spelling correction for this term.
                wrapcorw539 = RefOutArgWrapper(None)
                inoutres540 = Utils.tryGetValue(sofa_.correction_dict, mt.term, wrapcorw539)
                corw = wrapcorw539.value
                if (inoutres540):
                    ccc = Morphology.process(corw, lang, None)
                    if (ccc is not None and len(ccc) == 1):
                        # Replace the token but keep original position/chars.
                        tt1 = TextToken._new538(ccc[0], self, tt.term)
                        tt1.begin_char = tt.begin_char
                        tt1.end_char = tt.end_char
                        tt1.chars = tt.chars
                        tt = tt1
                        if (self.corrected_tokens is None):
                            self.corrected_tokens = dict()
                        self.corrected_tokens[tt] = tt.getSourceText()
            # Append to the token chain.
            if (t0 is None):
                self.first_token = (tt)
            else:
                t0.next0_ = tt
            t0 = (tt)
            ii += 1
    if (sofa_.clear_dust):
        self.__clearDust()
    if (sofa_.do_words_merging_by_morph):
        self.__correctWordsByMerging(lang)
    if (sofa_.do_word_correction_by_morph):
        self.__correctWordsByMorph(lang)
    self.__mergeLetters()
    self.__defineBaseLanguage()
    # Replace digit runs with NumberTokens.
    t = self.first_token
    first_pass2794 = True
    while True:
        if first_pass2794: first_pass2794 = False
        else: t = t.next0_
        if (not (t is not None)): break
        nt = NumberHelper._tryParseNumber(t)
        if (nt is None):
            continue
        self.embedToken(nt)
        t = (nt)
    if (only_tokenizing):
        return
    # Heuristic: for unknown Cyrillic words, narrow the morph variants using
    # a neighbouring dictionary word that shares the same 2-character tail
    # (i.e. presumably the same inflection) — TODO confirm intent.
    t = self.first_token
    first_pass2795 = True
    while True:
        if first_pass2795: first_pass2795 = False
        else: t = t.next0_
        if (not (t is not None)): break
        if (t.morph.class0_.is_preposition):
            continue
        mc = t.getMorphClassInDictionary()
        if (mc.is_undefined and t.chars.is_cyrillic_letter and t.length_char > 4):
            # Last two characters of the word (end_char is inclusive).
            tail = sofa_.text[t.end_char - 1:t.end_char - 1+2]
            tte = None
            # First look backwards, skipping one comma/and/preposition/conjunction.
            tt = t.previous
            if (tt is not None and ((tt.is_comma_and or tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction))):
                tt = tt.previous
            if ((tt is not None and not tt.getMorphClassInDictionary().is_undefined and (((tt.morph.class0_.value) & (t.morph.class0_.value))) != 0) and tt.length_char > 4):
                tail2 = sofa_.text[tt.end_char - 1:tt.end_char - 1+2]
                if (tail2 == tail):
                    tte = tt
            if (tte is None):
                # Otherwise look forwards with the same rules.
                tt = t.next0_
                if (tt is not None and ((tt.is_comma_and or tt.morph.class0_.is_preposition or tt.morph.class0_.is_conjunction))):
                    tt = tt.next0_
                if ((tt is not None and not tt.getMorphClassInDictionary().is_undefined and (((tt.morph.class0_.value) & (t.morph.class0_.value))) != 0) and tt.length_char > 4):
                    tail2 = sofa_.text[tt.end_char - 1:tt.end_char - 1+2]
                    if (tail2 == tail):
                        tte = tt
            # Drop morph variants incompatible with the matched neighbour.
            if (tte is not None):
                t.morph.removeItemsEx(tte.morph, tte.getMorphClassInDictionary())
        continue
    self.__createStatistics()