コード例 #1
0
 def correct_word(w: str) -> str:
     """Normalise a word: upper-case it and substitute selected characters.

     The word is upper-cased; every character that occurs in
     ``LanguageHelper.__m_rus0`` is replaced by the character at the same
     index of ``LanguageHelper.__m_rus1`` (the original author's note gives
     Ё -> Е as an example); soft hyphens (U+00AD) become ``'-'``; and a
     leading "АГЕНС" is rewritten to "АГЕНТС".

     Args:
         w(str): source word (may be None)

     Returns:
         str: the corrected word, or None when ``w`` is None
     """
     if w is None:
         return None
     upper = w.upper()
     # Rebuild the string only when at least one character needs mapping.
     if any(LanguageHelper.__m_rus0.find(c) >= 0 for c in upper):
         mapped = []
         for c in upper:
             idx = LanguageHelper.__m_rus0.find(c)
             mapped.append(LanguageHelper.__m_rus1[idx] if idx >= 0 else c)
         upper = "".join(mapped)
     # str.replace is a no-op when the soft hyphen is absent.
     upper = upper.replace(chr(0x00AD), '-')
     if upper.startswith("АГЕНС"):
         upper = "АГЕНТС" + upper[5:]
     return upper
コード例 #2
0
ファイル: MorphEngine.py プロジェクト: AAA1911/PullentiPython
 def correct_word_by_morph(self, word: str) -> str:
     """Try to repair a word using single-character edits.

     Three kinds of edit are tried in order, each at every position from
     index 1 onwards (the first character is never touched):

     1. replace one character with ``'*'``;
     2. insert ``'*'`` before a character;
     3. delete one character (the last character is never deleted).

     Every edited string is passed to ``self.__check_corr_var`` together
     with ``self.m_root``; distinct non-None results are collected. A later
     kind of edit is tried only when the earlier ones produced nothing.

     Args:
         word(str): the word to correct

     Returns:
         str: the single candidate found, or None when there are no
         candidates or more than one
     """
     size = len(word)
     # (positions to edit, function building the edited string), in the
     # order in which the edit kinds are attempted.
     edit_passes = (
         (range(1, size), lambda p: word[:p] + '*' + word[p + 1:]),
         (range(1, size), lambda p: word[:p] + '*' + word[p:]),
         (range(1, size - 1), lambda p: word[:p] + word[p + 1:]),
     )
     found = []
     for positions, make_pattern in edit_passes:
         for pos in positions:
             hit = self.__check_corr_var(make_pattern(pos), self.m_root, 0)
             if hit is not None and hit not in found:
                 found.append(hit)
         if found:
             # An earlier edit kind succeeded; do not try the next one.
             break
     return found[0] if len(found) == 1 else None
コード例 #3
0
 def correct_word(w: str) -> str:
     """Return the upper-cased word with selected characters substituted.

     Characters listed in ``LanguageHelper.__m_rus0`` are swapped for the
     character at the same index in ``LanguageHelper.__m_rus1``, soft
     hyphens (U+00AD) are turned into ``'-'``, and a leading "АГЕНС" is
     rewritten to "АГЕНТС".

     Args:
         w(str): source word (may be None)

     Returns:
         str: the corrected word, or None when ``w`` is None
     """
     if w is None:
         return None

     def remap(symbol):
         # Swap a character listed in __m_rus0 for its __m_rus1 counterpart;
         # any other character passes through untouched.
         pos = LanguageHelper.__m_rus0.find(symbol)
         return LanguageHelper.__m_rus1[pos] if pos >= 0 else symbol

     text = "".join(remap(symbol) for symbol in w.upper())
     # str.replace leaves the text as is when there is no soft hyphen.
     text = text.replace(chr(0x00AD), '-')
     if text.startswith("АГЕНС"):
         return "АГЕНТС" + text[5:]
     return text
コード例 #4
0
 def transliteral_correction(value: str,
                             prev_value: str,
                             always: bool = False) -> str:
     """Transliteration correction of a word mixing Cyrillic and Latin.

     The word is first classified character by character: Cyrillic and
     Latin letters are counted separately depending on whether they occur
     in ``LanguageHelper._m_cyr_chars`` / ``_m_lat_chars`` (parallel tables,
     presumably of look-alike letters) or not. A non-letter ends the
     analysis: the word is returned unchanged, except that characters
     flagged ``is_udaren`` are skipped and a character flagged ``is_apos``
     (in a word longer than 2) is stripped and the result re-processed.
     Depending on the counts the listed letters are then converted to
     Cyrillic or to Latin; in addition "ЬI" is collapsed to "Ы", code
     points 0x300-0x36F are removed and characters from ``__m_udar_chars``
     are mapped through ``__m_udar_cyr_chars`` (except when converting to
     Latin).

     Args:
         value(str): word to correct
         prev_value(str): preceding word; its first character breaks the
             tie when ``value`` consists only of ambiguous letters of both
             alphabets
         always(bool): allow conversion even when the word has no
             unambiguous letter of the target alphabet

     Returns:
         str: the corrected word, or ``value`` itself when nothing applies
     """
     sure_cyr = sure_lat = 0
     amb_cyr = amb_lat = 0
     stressed_letters = 0
     soft_sign_i = False
     has_stress_mark = False
     for idx, sym in enumerate(value):
         info = UnicodeInfo.ALL_CHARS[ord(sym)]
         if not info.is_letter:
             if info.is_udaren:
                 has_stress_mark = True
                 continue
             if info.is_apos and len(value) > 2:
                 # Drop every occurrence of this character and start over.
                 return LanguageHelper.transliteral_correction(
                     value.replace("{0}".format(sym), ""), prev_value, False)
             return value
         if info.is_cyrillic:
             if LanguageHelper._m_cyr_chars.find(sym) >= 0:
                 amb_cyr += 1
             else:
                 sure_cyr += 1
         elif info.is_latin:
             if LanguageHelper._m_lat_chars.find(sym) >= 0:
                 amb_lat += 1
             else:
                 sure_lat += 1
         elif LanguageHelper.__m_udar_chars.find(sym) >= 0:
             stressed_letters += 1
         else:
             return value
         if sym == 'Ь' and idx + 1 < len(value) and value[idx + 1] == 'I':
             soft_sign_i = True

     # A word with unambiguous letters of both alphabets is left alone.
     if sure_lat > 0 and sure_cyr > 0:
         return value

     to_rus = False
     to_lat = False
     if (sure_lat > 0 or always) and amb_cyr > 0:
         to_lat = True
     elif (sure_cyr > 0 or always) and amb_lat > 0:
         to_rus = True
     elif sure_cyr == 0 and sure_lat == 0 and amb_cyr > 0 and amb_lat > 0:
         # Only ambiguous letters: look at the previous word first, then
         # fall back to whichever alphabet has the majority.
         if not Utils.isNullOrEmpty(prev_value):
             if LanguageHelper.is_cyrillic_char(prev_value[0]):
                 to_rus = True
             elif LanguageHelper.is_latin_char(prev_value[0]):
                 to_lat = True
         if not (to_lat or to_rus):
             if amb_cyr > amb_lat:
                 to_rus = True
             elif amb_cyr < amb_lat:
                 to_lat = True

     nothing_to_fix = (not soft_sign_i and not has_stress_mark
                       and stressed_letters == 0)
     if not (to_rus or to_lat) and nothing_to_fix:
         return value

     buf = Utils.newStringIO(value)
     pos = 0
     # The buffer may shrink while it is being scanned, so its length is
     # re-read on every step. The index advances after a removal as well,
     # exactly as in the original loop.
     while pos < buf.tell():
         cur = Utils.getCharAtStringIO(buf, pos)
         if (cur == 'Ь' and pos + 1 < buf.tell()
                 and Utils.getCharAtStringIO(buf, pos + 1) == 'I'):
             Utils.setCharAtStringIO(buf, pos, 'Ы')
             Utils.removeStringIO(buf, pos + 1, 1)
         elif 0x300 <= ord(cur) < 0x370:
             Utils.removeStringIO(buf, pos, 1)
         elif to_rus:
             k = LanguageHelper._m_lat_chars.find(cur)
             if k >= 0:
                 Utils.setCharAtStringIO(buf, pos,
                                         LanguageHelper._m_cyr_chars[k])
             else:
                 k = LanguageHelper.__m_udar_chars.find(cur)
                 if k >= 0:
                     Utils.setCharAtStringIO(
                         buf, pos, LanguageHelper.__m_udar_cyr_chars[k])
         elif to_lat:
             k = LanguageHelper._m_cyr_chars.find(cur)
             if k >= 0:
                 Utils.setCharAtStringIO(buf, pos,
                                         LanguageHelper._m_lat_chars[k])
         else:
             k = LanguageHelper.__m_udar_chars.find(cur)
             if k >= 0:
                 Utils.setCharAtStringIO(
                     buf, pos, LanguageHelper.__m_udar_cyr_chars[k])
         pos += 1
     return Utils.toStringStringIO(buf)
コード例 #5
0
ファイル: SourceOfAnalysis.py プロジェクト: MihaJjDa/APCLtask
 def __doCrLfCorrection(self, txt: str) -> str:
     """ Analyse hard-wrapped (forcibly formatted) text and join wrapped lines.

     First pass (statistics): for runs of text between two line-break
     characters that are at least 30 characters long (a tab counts as 5),
     whose last non-tab character is not '.', ':' or ';' and where the
     first non-whitespace character after the closing line break is not a
     digit, count the runs and sum their lengths. If fewer than 4 runs
     qualify, or their average length is below 50 or above 100, ``txt`` is
     returned unchanged.

     Second pass (rewrite): in a working buffer, a line break is replaced
     by a space (both characters of a CR LF pair) when the line before it
     is longer than average - 20 and shorter than average + 10, the last
     non-whitespace character before the break is not '.', ':' or ';', and
     no further line break occurs before the next non-whitespace
     character. Every replacement increments ``self.crlf_corrected_count``.

     Args:
         txt(str): source text

     Returns:
         str: ``txt`` itself when the statistics do not qualify, otherwise
         the content of the working buffer
     """
     cou = 0
     total_len = 0
     i = 0
     # Emulates a for-loop whose `continue` still increments i.
     first_pass3166 = True
     while True:
         if first_pass3166: first_pass3166 = False
         else: i += 1
         if (not (i < len(txt))): break
         ch = txt[i]
         # Only positions holding CR (0xD) or LF (0xA) start a measurement.
         if ((ord(ch)) != 0xD and (ord(ch)) != 0xA):
             continue
         len0_ = 0
         last_char = ch
         j = (i + 1)
         # Measure up to the next CR/LF; a tab (0x9) counts as 5 characters
         # and does not update last_char.
         while j < len(txt):
             ch = txt[j]
             if ((ord(ch)) == 0xD or (ord(ch)) == 0xA):
                 break
             elif ((ord(ch)) == 0x9):
                 len0_ += 5
             else:
                 last_char = ch
                 len0_ += 1
             j += 1
         # The trailing run has no closing line break: stop the analysis.
         if (j >= len(txt)):
             break
         # Short runs are ignored; i is not moved to j here, so the scan
         # resumes at i + 1.
         if (len0_ < 30):
             continue
         if (last_char != '.' and last_char != ':' and last_char != ';'):
             # Check whether the first non-whitespace character after the
             # line break at j is a digit.
             next_is_dig = False
             k = j + 1
             while k < len(txt):
                 if (not Utils.isWhitespace(txt[k])):
                     if (str.isdigit(txt[k])):
                         next_is_dig = True
                     break
                 k += 1
             if (not next_is_dig):
                 cou += 1
                 total_len += len0_
         # Jump to the closing line break; the loop header then moves to j + 1.
         i = j
     # Too few candidate lines: leave the text as is.
     if (cou < 4):
         return txt
     # From here on total_len holds the average candidate line length.
     total_len = math.floor(total_len / cou)
     if ((total_len < 50) or total_len > 100):
         return txt
     tmp = Utils.newStringIO(txt)
     i = 0
     while i < tmp.tell():
         ch = Utils.getCharAtStringIO(tmp, i)
         len0_ = 0
         last_char = ch
         j = (i + 1)
         # Measure from i + 1 up to the next CR/LF, same rules as above.
         while j < tmp.tell():
             ch = Utils.getCharAtStringIO(tmp, j)
             if ((ord(ch)) == 0xD or (ord(ch)) == 0xA):
                 break
             elif ((ord(ch)) == 0x9):
                 len0_ += 5
             else:
                 last_char = ch
                 len0_ += 1
             j += 1
         if (j >= tmp.tell()):
             break
         # Walk backwards from j - 1 to find the last non-whitespace
         # character; the search is not limited to the current line.
         for jj in range(j - 1, -1, -1):
             last_char = Utils.getCharAtStringIO(tmp, jj)
             if (not Utils.isWhitespace(last_char)):
                 break
         else:
             jj = -1
         not_single = False
         # jj is reused below as a forward index starting after the break.
         jj = (j + 1)
         # Treat CR LF as a single line break.
         if ((jj < tmp.tell())
                 and (ord(Utils.getCharAtStringIO(tmp, j))) == 0xD
                 and (ord(Utils.getCharAtStringIO(tmp, jj))) == 0xA):
             jj += 1
         # A second line break before the next non-whitespace character
         # marks this break as "not single".
         while jj < tmp.tell():
             ch = Utils.getCharAtStringIO(tmp, jj)
             if (not Utils.isWhitespace(ch)):
                 break
             if ((ord(ch)) == 0xD or (ord(ch)) == 0xA):
                 not_single = True
                 break
             jj += 1
         # Join only lines of near-average length that do not end in
         # '.', ':' or ';'.
         if (((not not_single and len0_ >
               (total_len - 20) and (len0_ < (total_len + 10)))
              and last_char != '.' and last_char != ':')
                 and last_char != ';'):
             Utils.setCharAtStringIO(tmp, j, ' ')
             self.crlf_corrected_count += 1
             # Blank out a following LF as well and step over it.
             if ((j + 1) < tmp.tell()):
                 ch = Utils.getCharAtStringIO(tmp, j + 1)
                 if ((ord(ch)) == 0xA):
                     Utils.setCharAtStringIO(tmp, j + 1, ' ')
                     j += 1
         # Net effect of the two statements: continue scanning from j.
         i = (j - 1)
         i += 1
     return Utils.toStringStringIO(tmp)
コード例 #6
0
ファイル: SourceOfAnalysis.py プロジェクト: MihaJjDa/APCLtask
 def __doTransliteralCorrection(txt: io.StringIO, info: io.StringIO) -> int:
     """Perform transliteration correction on a text buffer, in place.

     The buffer is scanned for runs of alphabetic characters. Inside a run
     each character is counted as Cyrillic-block (code point 0x400-0x4FF),
     ambiguous (listed in ``SourceOfAnalysis.__m_lat_chars``) or other. A
     run is corrected when it contains ambiguous characters and either
     also contains Cyrillic-block characters, or contains no other letters
     and the previous run was treated as Russian. Correcting a run
     overwrites every ambiguous character with the entry at the same index
     of ``SourceOfAnalysis.__m_rus_chars``.

     Args:
         txt(io.StringIO): text being corrected (modified in place)
         info(io.StringIO): receives a log of the replacements (may be None)

     Returns:
         int: number of replacements
     """
     replaced = 0
     prev_was_rus = False
     start = 0
     while start < txt.tell():
         if not str.isalpha(Utils.getCharAtStringIO(txt, start)):
             start += 1
             continue
         # Classify the letters of the run beginning at `start`.
         cyr = other = ambiguous = 0
         stop = start
         while stop < txt.tell():
             sym = Utils.getCharAtStringIO(txt, stop)
             if not str.isalpha(sym):
                 break
             if 0x400 <= ord(sym) < 0x500:
                 cyr += 1
             elif SourceOfAnalysis.__m_lat_chars.find(sym) >= 0:
                 ambiguous += 1
             else:
                 other += 1
             stop += 1
         if ambiguous > 0 and (cyr > 0 or (other == 0 and prev_was_rus)):
             if info is not None:
                 # Log entries are separated by CR LF; each one starts
                 # with the original run followed by ": ".
                 if info.tell() > 0:
                     print("\r\n", end="", file=info)
                 for k in range(start, stop):
                     print(Utils.getCharAtStringIO(txt, k), end="", file=info)
                 print(": ", end="", file=info)
             for k in range(start, stop):
                 sym = Utils.getCharAtStringIO(txt, k)
                 hit = SourceOfAnalysis.__m_lat_chars.find(sym)
                 if hit < 0:
                     continue
                 repl = SourceOfAnalysis.__m_rus_chars[hit]
                 if info is not None:
                     print("{0}->{1} ".format(sym, repl),
                           end="",
                           file=info,
                           flush=True)
                 Utils.setCharAtStringIO(txt, k, repl)
             replaced += ambiguous
             prev_was_rus = True
         else:
             prev_was_rus = cyr > 0
         # Resume after the character that ended the run.
         start = stop + 1
     return replaced