def correct_word(w: str) -> str:
    """Correct a word (upper-case it and replace some letters, e.g. Ё->Е).

    Args:
        w(str): source word; ``None`` is passed through unchanged

    Returns:
        str: the corrected word, or ``None`` when ``w`` is ``None``
    """
    if w is None:
        return None
    w = w.upper()
    # A character found in __m_rus0 is replaced by the character at the same
    # index of __m_rus1; every other character is kept as is.
    mapped = []
    for ch in w:
        pos = LanguageHelper.__m_rus0.find(ch)
        mapped.append(LanguageHelper.__m_rus1[pos] if pos >= 0 else ch)
    w = "".join(mapped)
    # U+00AD is replaced by an ordinary '-' (no-op when it is absent).
    w = w.replace(chr(0x00AD), '-')
    if w.startswith("АГЕНС"):
        w = "АГЕНТС" + w[5:]
    return w
def correct_word_by_morph(self, word: str) -> str:
    """Try to repair a word by a single-character edit.

    Three kinds of candidates are tried in order, and a later kind is only
    tried when the earlier ones produced nothing:

    1. a character (never the first one) is replaced by ``'*'``;
    2. ``'*'`` is inserted before a character (never before the first one);
    3. a character (neither the first nor the last one) is removed.

    Every candidate is passed to ``self.__check_corr_var`` together with
    ``self.m_root`` and ``0``; non-``None`` results are collected without
    duplicates. ``'*'`` is presumably treated as a wildcard by that helper
    (its implementation is not visible here).

    Args:
        word(str): the word to repair

    Returns:
        str: the repaired variant when exactly one distinct variant was
        found, otherwise ``None`` (also for ``None`` or empty input)
    """
    if not word:
        return None
    size = len(word)
    candidate_passes = (
        (word[:pos] + '*' + word[pos + 1:] for pos in range(1, size)),
        (word[:pos] + '*' + word[pos:] for pos in range(1, size)),
        (word[:pos] + word[pos + 1:] for pos in range(1, size - 1)),
    )
    variants = []
    for candidates in candidate_passes:
        for candidate in candidates:
            var = self.__check_corr_var(candidate, self.m_root, 0)
            # A list with a membership test keeps first-seen order and does
            # not require the results to be hashable.
            if var is not None and var not in variants:
                variants.append(var)
        if variants:
            break
    # An ambiguous correction (several variants) is rejected.
    if len(variants) != 1:
        return None
    return variants[0]
def correct_word(w: str) -> str:
    """Return the upper-cased word with a fixed set of letters replaced.

    Args:
        w(str): source word; ``None`` is passed through unchanged

    Returns:
        str: the corrected word, or ``None`` when ``w`` is ``None``
    """
    if w is None:
        return None
    pieces = []
    for ch in w.upper():
        # __m_rus0 and __m_rus1 are used as parallel strings: a character at
        # index j of the first is replaced by the character at index j of
        # the second.
        j = LanguageHelper.__m_rus0.find(ch)
        pieces.append(ch if j < 0 else LanguageHelper.__m_rus1[j])
    res = "".join(pieces).replace(chr(0x00AD), '-')
    if res.startswith("АГЕНС"):
        res = "АГЕНТС" + res[5:]
    return res
def transliteral_correction(value: str, prev_value: str, always: bool = False) -> str:
    """Transliteral correction of a word that mixes Cyrillic and Latin letters.

    Letters are classified with ``UnicodeInfo.ALL_CHARS``. A letter listed in
    ``_m_cyr_chars`` / ``_m_lat_chars`` counts as "questionable" (the two
    strings are used as parallel tables, presumably of look-alike letters);
    any other Cyrillic or Latin letter counts as "pure". The word is
    converted towards the alphabet of its pure letters, or, when it has only
    questionable letters of both alphabets, towards the alphabet of the first
    character of ``prev_value``, falling back to the majority alphabet.

    Independently of the direction, ``'Ь'`` followed by ``'I'`` is merged
    into ``'Ы'`` and characters in U+0300..U+036F are removed.

    Args:
        value(str): the word to correct
        prev_value(str): the preceding word, used only as a tie-breaker
            (may be ``None`` or empty)
        always(bool): convert questionable letters even when the word has no
            pure letter of the other alphabet

    Returns:
        str: the corrected word, or ``value`` unchanged when it contains
        pure letters of both alphabets, a character that is not a letter /
        stress mark / apostrophe, or nothing to correct
    """
    pure_cyr = 0
    pure_lat = 0
    ques_cyr = 0
    ques_lat = 0
    udar_cyr = 0
    soft_sign_before_i = False
    udaren = False
    size = len(value)
    for i, ch in enumerate(value):
        ui = UnicodeInfo.ALL_CHARS[ord(ch)]
        if not ui.is_letter:
            if ui.is_udaren:
                udaren = True
                continue
            if ui.is_apos and size > 2:
                # Drop every occurrence of this apostrophe and start over.
                return LanguageHelper.transliteral_correction(
                    value.replace(ch, ""), prev_value, False)
            return value
        if ui.is_cyrillic:
            if ch in LanguageHelper._m_cyr_chars:
                ques_cyr += 1
            else:
                pure_cyr += 1
        elif ui.is_latin:
            if ch in LanguageHelper._m_lat_chars:
                ques_lat += 1
            else:
                pure_lat += 1
        elif ch in LanguageHelper.__m_udar_chars:
            udar_cyr += 1
        else:
            return value
        if ch == 'Ь' and i + 1 < size and value[i + 1] == 'I':
            soft_sign_before_i = True

    if pure_lat > 0 and pure_cyr > 0:
        return value

    to_rus = False
    to_lat = False
    if (pure_lat > 0 or always) and ques_cyr > 0:
        to_lat = True
    elif (pure_cyr > 0 or always) and ques_lat > 0:
        to_rus = True
    elif pure_cyr == 0 and pure_lat == 0:
        if ques_cyr > 0 and ques_lat > 0:
            if not Utils.isNullOrEmpty(prev_value):
                if LanguageHelper.is_cyrillic_char(prev_value[0]):
                    to_rus = True
                elif LanguageHelper.is_latin_char(prev_value[0]):
                    to_lat = True
            if not to_lat and not to_rus:
                if ques_cyr > ques_lat:
                    to_rus = True
                elif ques_cyr < ques_lat:
                    to_lat = True
    if not to_rus and not to_lat:
        if not soft_sign_before_i and not udaren and udar_cyr == 0:
            return value

    if to_rus:
        source_chars = LanguageHelper._m_lat_chars
        target_chars = LanguageHelper._m_cyr_chars
    elif to_lat:
        source_chars = LanguageHelper._m_cyr_chars
        target_chars = LanguageHelper._m_lat_chars
    else:
        source_chars = None
        target_chars = None

    chars = list(value)
    i = 0
    while i < len(chars):
        ch = chars[i]
        if ch == 'Ь' and i + 1 < len(chars) and chars[i + 1] == 'I':
            chars[i] = 'Ы'
            del chars[i + 1]
            i += 1
            continue
        if 0x300 <= ord(ch) < 0x370:
            del chars[i]
            # Do not advance: the following character has moved to index i.
            # Advancing here would leave it unprocessed and would keep the
            # second of two consecutive marks in the result.
            continue
        if source_chars is not None:
            pos = source_chars.find(ch)
            if pos >= 0:
                chars[i] = target_chars[pos]
            else:
                # Letters from __m_udar_chars map to __m_udar_cyr_chars in
                # both conversion directions.
                pos = LanguageHelper.__m_udar_chars.find(ch)
                if pos >= 0:
                    chars[i] = LanguageHelper.__m_udar_cyr_chars[pos]
        i += 1
    return "".join(chars)
def __doCrLfCorrection(self, txt: str) -> str:
    """Analyse forcibly formatted (hard-wrapped) text and join wrapped lines.

    Pass 1 collects statistics: lines of at least 30 positions (a tab counts
    as 5) that do not end with ``.``, ``:`` or ``;`` and are not followed by
    a digit. Nothing is changed unless at least 4 such lines exist and their
    average length lies in 50..100.

    Pass 2 replaces a line break by a space when the line length is within
    ``(average - 20, average + 10)``, the last non-whitespace character
    before the break is not ``.``, ``:`` or ``;``, and the break is not
    followed by another line break (only whitespace in between). A ``\\n``
    directly after the replaced character is replaced as well.
    ``self.crlf_corrected_count`` is incremented once per joined line.

    Args:
        txt(str): the source text

    Returns:
        str: the corrected text, or ``txt`` itself when the heuristic does
        not apply
    """

    def scan_line(buf, origin):
        """Measure the line that starts right after index ``origin``.

        Returns ``(stop, width, last)``: the index of the next CR/LF (or
        ``len(buf)``), the width with tabs counted as 5, and the last
        non-tab character seen (``buf[origin]`` for an empty line).
        """
        width = 0
        last = buf[origin]
        stop = origin + 1
        while stop < len(buf):
            c = buf[stop]
            if c in "\r\n":
                break
            if c == "\t":
                width += 5
            else:
                last = c
                width += 1
            stop += 1
        return stop, width, last

    size = len(txt)

    # Pass 1: statistics over lines that look hard-wrapped.
    cou = 0
    total_len = 0
    i = 0
    while i < size:
        if txt[i] not in "\r\n":
            i += 1
            continue
        j, width, last_char = scan_line(txt, i)
        if j >= size:
            break
        if width < 30:
            i += 1
            continue
        if last_char not in ".:;":
            k = j + 1
            while k < size and Utils.isWhitespace(txt[k]):
                k += 1
            next_is_dig = k < size and txt[k].isdigit()
            if not next_is_dig:
                cou += 1
                total_len += width
        # NOTE(review): scanning resumes after the break at j, so when line
        # breaks are a single character the line starting at j is not
        # measured after a 30+ line. Pass 2 resumes at j itself. Kept as in
        # the original; confirm whether this is intended.
        i = j + 1
    if cou < 4:
        return txt
    total_len = total_len // cou
    if total_len < 50 or total_len > 100:
        return txt

    # Pass 2: join the wrapped lines. Only same-length replacements are
    # made, so indices stay valid.
    tmp = list(txt)
    i = 0
    while i < size:
        j, width, last_char = scan_line(tmp, i)
        if j >= size:
            break
        # Last non-whitespace character before the break (the first
        # character of the text if everything before is whitespace).
        for jj in range(j - 1, -1, -1):
            last_char = tmp[jj]
            if not Utils.isWhitespace(last_char):
                break
        # A break followed by another break is left alone.
        not_single = False
        jj = j + 1
        if jj < size and tmp[j] == "\r" and tmp[jj] == "\n":
            jj += 1
        while jj < size:
            ch = tmp[jj]
            if not Utils.isWhitespace(ch):
                break
            if ch in "\r\n":
                not_single = True
                break
            jj += 1
        if (not not_single and total_len - 20 < width < total_len + 10
                and last_char not in ".:;"):
            tmp[j] = ' '
            self.crlf_corrected_count += 1
            if j + 1 < size and tmp[j + 1] == "\n":
                tmp[j + 1] = ' '
                j += 1
        i = j
    return "".join(tmp)
def __doTransliteralCorrection(txt: io.StringIO, info: io.StringIO) -> int:
    """Perform transliteral correction of the text in place.

    The text is split into runs of alphabetic characters. Within a run,
    characters in U+0400..U+04FF count as Cyrillic, characters listed in
    ``SourceOfAnalysis.__m_lat_chars`` as "unknown", all others as pure
    Latin. A run is corrected when it has unknown characters and either also
    has Cyrillic ones, or has no pure Latin ones and the previous run was
    Cyrillic or was itself corrected. Correction replaces every unknown
    character by the character at the same index of
    ``SourceOfAnalysis.__m_rus_chars``.

    Args:
        txt(io.StringIO): the text to correct; modified in place. As in the
            original code, the current stream position is taken as the text
            length
        info(io.StringIO): receives a log of the replacements, one corrected
            word per line (may be ``None``)

    Returns:
        int: the number of replaced characters
    """
    text = txt.getvalue()
    # Replacements never change the length and only touch the run that has
    # already been scanned, so one snapshot is enough for reading.
    size = min(txt.tell(), len(text))
    stat = 0
    pref_rus_word = False
    i = 0
    while i < size:
        if not text[i].isalpha():
            i += 1
            continue
        rus = 0
        pure_lat = 0
        unknown = 0
        j = i
        while j < size and text[j].isalpha():
            ch = text[j]
            if 0x400 <= ord(ch) < 0x500:
                rus += 1
            elif ch in SourceOfAnalysis.__m_lat_chars:
                unknown += 1
            else:
                pure_lat += 1
            j += 1
        if unknown > 0 and (rus > 0 or (pure_lat == 0 and pref_rus_word)):
            if info is not None:
                if info.tell() > 0:
                    info.write("\r\n")
                info.write(text[i:j] + ": ")
            for k in range(i, j):
                ii = SourceOfAnalysis.__m_lat_chars.find(text[k])
                if ii < 0:
                    continue
                replacement = SourceOfAnalysis.__m_rus_chars[ii]
                if info is not None:
                    info.write("{0}->{1} ".format(text[k], replacement))
                Utils.setCharAtStringIO(txt, k, replacement)
            stat += unknown
            pref_rus_word = True
        else:
            pref_rus_word = rus > 0
        # text[j] is not alphabetic (or j is the end), so it can be skipped.
        i = j + 1
    return stat