Example #1
0
    def correct(self, sentence=''):
        """
        Correct a sentence.

        :param sentence: sentence text
        :return: (corrected sentence, detail list of
                  [wrong, right, begin_idx, end_idx, error_type])
        """
        detail = []
        maybe_errors = self.detect(sentence)
        for item, begin_idx, end_idx, err_type in maybe_errors:
            # Handle each suspected error one by one.
            before_sent = sentence[:begin_idx]
            after_sent = sentence[end_idx:]

            if err_type == ErrorType.char:
                # Skip non-Chinese tokens.
                if not is_chinese_string(item):
                    continue
                if not self.check_vocab_has_all_token(sentence):
                    continue
                # Predict candidates for the masked character.
                corrected_item = self.predict_mask_token(
                    sentence, begin_idx, end_idx)
            elif err_type == ErrorType.word:
                corrected_item = item
            else:
                # Unknown error type: skip this item instead of falling
                # through with `corrected_item` unbound (previously a
                # NameError / stale-value hazard).
                print('not standard error_type')
                continue
            # Record only actual changes.
            if corrected_item != item:
                sentence = before_sent + corrected_item + after_sent
                detail_word = [
                    item, corrected_item, begin_idx, end_idx, ErrorType.char
                ]
                detail.append(detail_word)
        detail = sorted(detail, key=operator.itemgetter(2))
        return sentence, detail
Example #2
0
 def generate_bertScore_sound_shape_file(self, text, right_sentence='', id_lists=None):
     """
     Write bert_score / sound_score / shape_score records to file.

     :param text: input sentence text (possibly containing errors)
     :param right_sentence: reference (gold) sentence aligned with text
     :param id_lists: optional id list forwarded to write2scorefile
     :return: (new text, detail list)
     """
     # Avoid the shared mutable-default-argument pitfall.
     if id_lists is None:
         id_lists = []
     text_new = ''
     details = []
     self.check_corrector_initialized()
     # Normalize encoding: utf-8 -> unicode
     text = convert_to_unicode(text)
     # Split the long text into short blocks
     blocks = self.split_2_short_text(text, include_symbol=True)
     try:
         # Merge each pair of adjacent blocks (symbol + text).
         blocks = [(blocks[i][0] + blocks[i + 1][0], blocks[i][1]) for i in range(0, len(blocks), 2)]
     except IndexError:
         # Odd number of blocks: keep the original split (best effort).
         pass
     punc = '。'
     if self.is_char_error_detect:
         for blk, start_idx in blocks:
             # Prepend the previous block's trailing punctuation as context.
             blk = punc + blk
             blk_new = ''
             for idx, s in enumerate(blk):
                 # Only handle Chinese characters.
                 if is_chinese_string(s):
                     # Skip positions already covered by a recorded error.
                     maybe_err = [s, idx, idx + 1, ErrorType.char]
                     if not self._check_contain_details_error(maybe_err, details):
                         # Mask the char; blk_new holds the already-corrected prefix.
                         sentence_lst = list(blk_new + blk[idx:])
                         sentence_lst[idx] = self.mask
                         sentence_new = ''.join(sentence_lst)
                         predicts = self.model(sentence_new)
                         top_tokens = []
                         ssc_s = self._getSSC(s)
                         for p in predicts:
                             token_id = p.get('token', 0)
                             token_score = p.get('score', 0)
                             token_str = self.model.tokenizer.convert_ids_to_tokens(token_id)
                             ssc_token = self._getSSC(token_str)
                             # Sound code: first 4 SSC digits; shape code: the rest.
                             soundSimi = computeSoundCodeSimilarity(ssc_s[:4], ssc_token[:4])
                             shapeSimi = computeShapeCodeSimilarity(ssc_s[4:], ssc_token[4:])
                             ssc_similarity = computeSSCSimilarity(ssc_s, ssc_token)
                             top_tokens.append({'bert_score': token_score,
                                                'token_str': token_str,
                                                'ssc_similar': ssc_similarity,
                                                'sound_similar': soundSimi,
                                                'shape_similar': shapeSimi})

                         # Only act when the original char is not among the
                         # model's top predictions.
                         if top_tokens and (s not in [token.get('token_str') for token in top_tokens]):
                             # start_idx + idx - 1 compensates for the prepended
                             # punctuation character.
                             self.write2scorefile(s, top_tokens, start_idx + idx - 1, id_lists, right_sentence[start_idx + idx - 1])
                             # Take the gold character from the reference sentence.
                             correct_item = right_sentence[start_idx + idx - 1]
                             if correct_item != s:
                                 details.append([s, correct_item, idx + start_idx, idx + start_idx + 1, ErrorType.char])
                                 s = correct_item
                 blk_new += s
             text_new += blk_new
             # Carry the last char (punctuation) into the next block.
             punc = blk_new[-1]
     details = sorted(details, key=operator.itemgetter(2))
     return text_new, details
Example #3
0
    def bert_correct_ssc_origin(self, text):
        """
        Correct a sentence char-by-char using BERT fill-mask predictions
        re-ranked by SSC (sound-shape code) similarity.

        :param text: sentence text
        :return: (corrected text, list[list] of
                  [error_word, correct_word, begin_pos, end_pos, error_type])
        """
        text_new = ''
        details = []
        self.check_corrector_initialized()
        # Normalize encoding: utf-8 -> unicode
        text = convert_to_unicode(text)
        if self.is_char_error_detect:
            text_new = ""
            for idx, s in enumerate(text):
                # Only handle Chinese characters.
                if is_chinese_string(s):
                    # Skip positions already covered by a recorded error.
                    maybe_err = [s, idx, idx + 1, ErrorType.char]
                    if not self._check_contain_details_error(maybe_err, details):
                        # Mask the current char; text_new holds the already-
                        # corrected prefix, so later predictions see earlier fixes.
                        sentence_lst = list(text_new + text[idx:])
                        sentence_lst[idx] = self.mask
                        sentence_new = ''.join(sentence_lst)
                        predicts = self.model(sentence_new)
                        top_tokens = []
                        ssc_s = self._getSSC(s)
                        for p in predicts:
                            token_id = p.get('token', 0)
                            token_score = p.get('score', 0)
                            token_str = self.model.tokenizer.convert_ids_to_tokens(token_id)
                            ssc_token = self._getSSC(token_str)
                            # Sound code: first 4 SSC chars; shape code: the rest.
                            soundSimi=computeSoundCodeSimilarity(ssc_s[:4], ssc_token[:4])
                            shapeSimi=computeShapeCodeSimilarity(ssc_s[4:], ssc_token[4:])
                            ssc_similarity = computeSSCSimilarity(ssc_s, ssc_token)
                            top_tokens.append({'bert_score': token_score, 'token_str': token_str, \
                                'ssc_similar': ssc_similarity, 'sound_similar': soundSimi, 'shape_similar': shapeSimi})

                        # Only rewrite when the original char is not among
                        # the model's top predictions.
                        if top_tokens and (s not in [token.get('token_str') for token in top_tokens]):
                            correct_item = self.neural_ssc_correct_item(s, top_tokens)
                            if correct_item != s:
                                details.append([s, correct_item, idx, idx + 1, ErrorType.char])
                            # NOTE(review): unlike sibling methods, this
                            # assignment sits outside the `if` above; harmless,
                            # since it is a no-op when correct_item == s.
                            s = correct_item
                text_new += s

        details = sorted(details, key=operator.itemgetter(2))
        return text_new, details
Example #4
0
    def bert_correct_ssc(self, text):
        """
        Correct a sentence block-by-block using BERT fill-mask predictions
        re-ranked by SSC (sound-shape code) similarity.

        :param text: sentence text
        :return: (corrected text, list[list] of
                  [error_word, correct_word, begin_pos, end_pos, error_type])
        """
        text_new = ''
        details = []
        self.check_corrector_initialized()
        # Normalize encoding: utf-8 -> unicode
        text = convert_to_unicode(text)
        # Split the long text into short blocks (keeping symbols).
        blocks = self.split_2_short_text(text, include_symbol=True)
        if self.is_word_error_detect:
            # Word-level detection is not implemented in this variant.
            pass

        if self.is_char_error_detect:
            for blk, start_idx in blocks:
                blk_new = ''
                for idx, s in enumerate(blk):
                    # Only handle Chinese characters.
                    if is_chinese_string(s):
                        # Mask the current char; blk_new holds the already-
                        # corrected prefix, so later predictions see earlier fixes.
                        sentence_lst = list(blk_new + blk[idx:])
                        sentence_lst[idx] = self.mask
                        sentence_new = ''.join(sentence_lst)
                        predicts = self.model(sentence_new)
                        top_tokens = []
                        ssc_s = self._getSSC(s)
                        for p in predicts:
                            token_id = p.get('token', 0)
                            token_score = p.get('score', 0)
                            token_str = self.model.tokenizer.convert_ids_to_tokens(token_id)
                            ssc_token = self._getSSC(token_str)
                            # Sound code: first 4 SSC chars; shape code: the rest.
                            soundSimi=computeSoundCodeSimilarity(ssc_s[:4], ssc_token[:4])
                            shapeSimi=computeShapeCodeSimilarity(ssc_s[4:], ssc_token[4:])
                            ssc_similarity = computeSSCSimilarity(ssc_s, ssc_token)
                            top_tokens.append({'bert_score': token_score, 'token_str': token_str, \
                                'ssc_similar': ssc_similarity, 'sound_similar': soundSimi, 'shape_similar': shapeSimi})

                        # Only rewrite when the original char is not among
                        # the model's top predictions.
                        if top_tokens and (s not in [token.get('token_str') for token in top_tokens]):
                            correct_item = self.neural_ssc_correct_item(s, top_tokens)
                            if correct_item != s:
                                # start_idx globalizes the block-local index.
                                details.append([s, correct_item, idx + start_idx, idx + start_idx + 1, ErrorType.char])
                                s = correct_item
                    blk_new += s
                text_new += blk_new

        details = sorted(details, key=operator.itemgetter(2))
        return text_new, details
Example #5
0
    def electra_correct(self, text):
        """
        Correct a sentence with ELECTRA detection plus generator fill-mask.

        :param text: sentence text
        :return: (corrected_text, list[list] of
                  [error_word, correct_word, begin_pos, end_pos])
        """
        text_new = ''
        details = []
        # Normalize encoding: utf-8 -> unicode
        text = convert_to_unicode(text)
        # Split the long text into short blocks (keeping symbols).
        blocks = self.split_2_short_text(text, include_symbol=True)
        for blk, start_idx in blocks:
            # The discriminator flags suspicious character positions.
            error_ids = self.electra_detect(blk)
            sentence_lst = list(blk)
            for idx in error_ids:
                s = sentence_lst[idx]
                if is_chinese_string(s):
                    # Mask the suspicious Chinese char in place.
                    sentence_lst[idx] = self.mask
                    sentence_new = ''.join(sentence_lst)
                    # Generator fill-mask predicts [mask] (top-5 by default).
                    predicts = self.g_model(sentence_new)
                    top_tokens = []
                    for p in predicts:
                        token_id = p.get('token', 0)
                        token_str = self.g_model.tokenizer.convert_ids_to_tokens(
                            token_id)
                        top_tokens.append(token_str)

                    if top_tokens and (s not in top_tokens):
                        # Accept the first prediction that also appears in
                        # the confusion-derived candidate set.
                        candidates = self.generate_items(s)
                        if candidates:
                            for token_str in top_tokens:
                                if token_str in candidates:
                                    details.append([
                                        s, token_str, start_idx + idx,
                                        start_idx + idx + 1
                                    ])
                                    sentence_lst[idx] = token_str
                                    break
                    # Restore the original char if no replacement was chosen.
                    if sentence_lst[idx] == self.mask:
                        sentence_lst[idx] = s

            blk_new = ''.join(sentence_lst)
            text_new += blk_new
        details = sorted(details, key=operator.itemgetter(2))
        return text_new, details
 def generate_items_for_word(self, word, fraction=1):
     """
     Generate correction candidates for a (multi-char) word.

     Candidates come from same-pinyin whole words, the custom confusion
     set, and same-pinyin substitutions on the first/last character.

     :param word: word to generate candidates for
     :param fraction: keep roughly 1/fraction of the ranked candidates
     :return: candidate words sorted by frequency, highest first
     """
     candidates = []
     # same pinyin word
     candidates.extend(self._confusion_word_set(word))
     # custom confusion word
     candidates.extend(self._confusion_custom_set(word))
     # The original duplicated identical code for len(word) == 2 and
     # len(word) > 2 into separate lists that were unioned anyway;
     # the branches are merged here (behavior unchanged).
     if len(word) >= 2:
         # same first char pinyin
         candidates.extend(
             i + word[1:] for i in self._confusion_char_set(word[0]) if i)
         # same last char pinyin
         candidates.extend(
             word[:-1] + i for i in self._confusion_char_set(word[-1]) if i)
     # Deduplicate and keep only fully-Chinese candidates.
     confusion_word_list = [
         item for item in set(candidates) if is_chinese_string(item)
     ]
     confusion_sorted = sorted(confusion_word_list,
                               key=lambda k: self.word_frequency(k),
                               reverse=True)
     return confusion_sorted[:len(confusion_word_list) // fraction + 1]
    def correct(self, sentence, reverse=True):
        """
        Correct a sentence.

        :param sentence: sentence text
        :param reverse: process detected errors right-to-left when True
        :return: (corrected sentence, detail list of
                  [wrong, right, begin_idx, end_idx, error_type])
        """
        detail = []
        self.check_corrector_initialized()
        maybe_errors = self.detect(sentence)
        # Trick (as in translation models): handle errors right-to-left so
        # earlier replacements do not shift indices of later errors.
        maybe_errors = sorted(maybe_errors,
                              key=operator.itemgetter(2),
                              reverse=reverse)
        for item, begin_idx, end_idx, err_type in maybe_errors:
            # Handle each suspected error one by one.
            before_sent = sentence[:begin_idx]
            after_sent = sentence[end_idx:]

            # Skip non-Chinese tokens.
            if not is_chinese_string(item):
                continue
            # Custom confusion dict: direct lookup gives the fix.
            if err_type == ErrorType.confusion:
                corrected_item = (self.custom_confusion[item],
                                  ErrorType.confusion)
            # Fragmented, uncommon single chars: maybe missing/extra chars.
            elif err_type == ErrorType.word_char:
                maybe_right_items = self.generate_items_word_char(
                    item, before_sent, after_sent, begin_idx, end_idx)
                corrected_item = self.lm_correct_item(item, maybe_right_items,
                                                      before_sent, after_sent)
            # Redundant (extra) character: the only candidate is deletion.
            elif err_type == ErrorType.redundancy:
                maybe_right_items = ['']
                corrected_item = self.lm_correct_item(item, maybe_right_items,
                                                      before_sent, after_sent)
            elif err_type == ErrorType.word:
                # Generate all plausible word candidates.
                maybe_right_items = self.generate_items(item)
                if not maybe_right_items:
                    continue
                maybe_right_items = [(cand, ErrorType.word)
                                     for cand in maybe_right_items]
                corrected_item = self.lm_correct_item(item, maybe_right_items,
                                                      before_sent, after_sent)
            else:
                # err_type == ErrorType.char
                # Generate all plausible character candidates.
                maybe_right_items = self.generate_items(item)
                if not maybe_right_items:
                    continue
                maybe_right_items = [(cand, ErrorType.char)
                                     for cand in maybe_right_items]
                # Pick the most likely character via the language model.
                corrected_item = self.lm_correct_item(item, maybe_right_items,
                                                      before_sent, after_sent)
            # Record only actual changes.
            if corrected_item[0] != item:
                sentence = before_sent + corrected_item[0] + after_sent
                detail_word = [
                    item, corrected_item[0], begin_idx, end_idx,
                    corrected_item[1]
                ]
                detail.append(detail_word)
        detail = sorted(detail, key=operator.itemgetter(2))
        return sentence, detail
    def generate_items(self, word, fraction=1):
        """
        Build the correction candidate set for *word*.

        :param word: input word
        :param fraction: keep roughly 1/fraction of the ranked candidates
        :return: candidates sorted by word frequency, highest first
        """
        self.check_corrector_initialized()
        pool = []
        # Whole-word same-pinyin confusions plus custom confusion entries.
        pool.extend(self._confusion_word_set(word))
        pool.extend(self._confusion_custom_set(word))
        n = len(word)
        if n == 1:
            # Single char: substitute same-pinyin characters directly.
            pool.extend(c for c in self._confusion_char_set(word[0]) if c)
        elif n == 2:
            # Swap the first char, keep the rest.
            pool.extend(c + word[1:]
                        for c in self._confusion_char_set(word[0]) if c)
            # Swap the last char, keep the rest.
            pool.extend(word[:-1] + c
                        for c in self._confusion_char_set(word[-1]) if c)
        elif n > 2:
            # Swap the second (middle) character.
            pool.extend(word[0] + c + word[2:]
                        for c in self._confusion_char_set(word[1]))
            # Replace everything but the last char with a same-pinyin word.
            pool.extend(w + word[-1]
                        for w in self._confusion_word_set(word[:-1]))
            # Replace everything but the first char with a same-pinyin word.
            pool.extend(word[0] + w
                        for w in self._confusion_word_set(word[1:]))
        # Deduplicate, keep Chinese-only strings, rank by frequency.
        chinese_only = [cand for cand in set(pool) if is_chinese_string(cand)]
        ranked = sorted(chinese_only, key=self.word_frequency, reverse=True)
        return ranked[:len(chinese_only) // fraction + 1]
    def correct(self, sentence, reverse=True):
        """
        Correct a sentence.

        :param sentence: sentence text
        :param reverse: process detected errors right-to-left when True
        :return: (corrected sentence,
                  detail list [wrong, right, begin_idx, end_idx, error_type],
                  '/'-joined tokens,
                  detected errors sorted ascending by position)
        """
        detail = []
        self.check_corrector_initialized()
        maybe_errors = self.detect(sentence)
        # Trick (as in translation models): handle errors right-to-left so
        # earlier replacements do not shift indices of later errors.
        maybe_errors = sorted(maybe_errors,
                              key=operator.itemgetter(2),
                              reverse=reverse)
        for cur_item, begin_idx, end_idx, err_type in maybe_errors:
            # Handle each suspected error one by one.
            before_sent = sentence[:begin_idx]
            after_sent = sentence[end_idx:]
            # Skip non-Chinese tokens.
            if not is_chinese_string(cur_item):
                continue
            # Custom confusion dict: direct lookup gives the fix.
            if err_type == ErrorType.confusion:
                corrected_item = (self.custom_confusion[cur_item],
                                  ErrorType.confusion)
            # Fragmented, uncommon single chars: maybe missing/extra chars.
            elif err_type == ErrorType.word_char:
                maybe_right_items = self.generate_items_word_char(
                    cur_item, before_sent, after_sent, begin_idx, end_idx)
                corrected_item = self.lm_correct_item(cur_item,
                                                      maybe_right_items,
                                                      before_sent, after_sent)
            # Redundant (extra) character: the only candidate is deletion.
            elif err_type == ErrorType.redundancy:
                maybe_right_items = [('', ErrorType.redundancy)]
                corrected_item = self.lm_correct_item(cur_item,
                                                      maybe_right_items,
                                                      before_sent, after_sent)
            elif err_type == ErrorType.word:
                # Generate all plausible word candidates.
                candidates = self.generate_items(cur_item)
                if not candidates:
                    continue
                candidates = [(item, ErrorType.word) for item in candidates]
                corrected_item = self.lm_correct_item(cur_item, candidates,
                                                      before_sent, after_sent)
                # Second pass for ErrorType.word: if the pick is a long word
                # unknown to the frequency dict, re-rank its alternatives.
                if len(corrected_item[0]
                       ) > 2 and corrected_item[0] not in self.word_freq:
                    candidates = self.generate_items_for_word(
                        corrected_item[0])
                    if not candidates:
                        continue
                    candidates = [(item, ErrorType.word)
                                  for item in candidates]
                    corrected_item = self.lm_correct_item(
                        corrected_item[0], candidates, before_sent, after_sent)
            else:
                # err_type == ErrorType.char
                # Generate all plausible character candidates.
                candidates = self.generate_items(cur_item)
                if not candidates:
                    continue
                # Pick the most likely character via masked prediction.
                corrected_item = self.predict_mask_token(
                    cur_item, sentence, candidates, begin_idx, end_idx)
                corrected_item = (corrected_item, ErrorType.char)
            # Record only actual changes.
            if corrected_item[0] != cur_item:
                sentence = before_sent + corrected_item[0] + after_sent
                detail_word = [
                    cur_item, corrected_item[0], begin_idx, end_idx,
                    corrected_item[1]
                ]
                detail.append(detail_word)

        detail = sorted(detail, key=operator.itemgetter(2))
        # Re-sort errors ascending for the caller-facing return value.
        maybe_errors = sorted(maybe_errors,
                              key=operator.itemgetter(2),
                              reverse=False)
        return sentence, detail, '/'.join(self.tokens), maybe_errors
Example #10
0
    def correct_short(self, text, start_idx=0):
        """
        Correct a short text block.

        :param text: short text; its first char is treated as leading
                     context and stripped from the returned text
        :param start_idx: offset of this block within the full text, used
                          to globalize error positions in the details
        :return: (corrected text without its first char, detail list)
        """
        text_new = ''
        details = []
        self.check_corrector_initialized()
        # Normalize encoding: utf-8 -> unicode
        text = convert_to_unicode(text)
        if self.is_word_error_detect:
            maybe_errors = self.detect(text)
            # Trick: handle errors right-to-left so replacements do not
            # shift indices of errors not yet processed.
            maybe_errors = sorted(maybe_errors, key=operator.itemgetter(2), reverse=True)
            for cur_item, begin_idx, end_idx, err_type in maybe_errors:
                # Handle each suspected error one by one.
                before_sent = text[:begin_idx]
                after_sent = text[end_idx:]

                # Skip non-Chinese tokens.
                if not is_chinese_string(cur_item):
                    continue
                # Custom confusion dict: direct lookup gives the fix.
                if err_type == ErrorType.confusion:
                    corrected_item = (self.custom_confusion[cur_item], ErrorType.confusion)
                # Fragmented, uncommon single chars: maybe missing/extra chars.
                elif err_type == ErrorType.word_char:
                    maybe_right_items = self.generate_items_word_char(cur_item, before_sent, after_sent, begin_idx, end_idx)
                    corrected_item = self.lm_correct_item(cur_item, maybe_right_items, before_sent, after_sent)
                # Redundant (extra) character: the only candidate is deletion.
                elif err_type == ErrorType.redundancy:
                    maybe_right_items = [('', ErrorType.redundancy)]
                    corrected_item = self.lm_correct_item(cur_item, maybe_right_items, before_sent, after_sent)
                else:
                    # Other error types (word/char) are handled by the char
                    # loop below. Previously this fell through with
                    # `corrected_item` unbound (NameError) or stale.
                    continue

                # Record only actual changes.
                if corrected_item[0] != cur_item:
                    text = before_sent + corrected_item[0] + after_sent
                    detail_word = [cur_item, corrected_item[0], start_idx + begin_idx, start_idx + end_idx, corrected_item[1]]
                    details.append(detail_word)

        if self.is_char_error_detect:
            for idx, s in enumerate(text):
                # Only handle Chinese characters.
                if is_chinese_string(s):
                    # Mask the char; text_new holds the already-corrected prefix.
                    sentence_lst = list(text_new + text[idx:])
                    sentence_lst[idx] = self.mask
                    sentence_new = ''.join(sentence_lst)
                    predicts = self.model(sentence_new)
                    top_tokens = []
                    ssc_s = self._getSSC(s)
                    for p in predicts:
                        token_id = p.get('token', 0)
                        token_score = p.get('score', 0)
                        token_str = self.model.tokenizer.convert_ids_to_tokens(token_id)
                        ssc_token = self._getSSC(token_str)
                        # Sound code: first 4 SSC chars; shape code: the rest.
                        soundSimi = computeSoundCodeSimilarity(ssc_s[:4], ssc_token[:4])
                        shapeSimi = computeShapeCodeSimilarity(ssc_s[4:], ssc_token[4:])
                        ssc_similarity = computeSSCSimilarity(ssc_s, ssc_token)
                        top_tokens.append({'bert_score': token_score,
                                           'token_str': token_str,
                                           'ssc_similar': ssc_similarity,
                                           'sound_similar': soundSimi,
                                           'shape_similar': shapeSimi})

                    # Only rewrite when the original char is not among the
                    # model's top predictions.
                    if top_tokens and (s not in [token.get('token_str') for token in top_tokens]):
                        correct_item = self.neural_ssc_correct_item(s, top_tokens)
                        if correct_item != s:
                            # The -1 compensates for the leading context char
                            # stripped from the returned text.
                            details.append([s, correct_item, idx + start_idx - 1, idx + start_idx, ErrorType.char])
                            s = correct_item
                text_new += s
        details = sorted(details, key=operator.itemgetter(2))
        # Drop the leading context char.
        return text_new[1:], details
Example #11
0
    def bert_correct(self, text):
        """
        Correct a sentence with rule/LM word-level correction followed by
        BERT fill-mask character-level correction.

        :param text: sentence text
        :return: (corrected text, list[list] of
                  [error_word, correct_word, begin_pos, end_pos, error_type])
        """
        text_new = ''
        details = []
        self.check_corrector_initialized()
        # Normalize encoding: utf-8 -> unicode
        text = convert_to_unicode(text)
        if self.is_word_error_detect:
            maybe_errors = self.detect(text)
            # Trick: handle errors right-to-left so replacements do not
            # shift indices of errors not yet processed.
            maybe_errors = sorted(maybe_errors, key=operator.itemgetter(2), reverse=True)
            for cur_item, begin_idx, end_idx, err_type in maybe_errors:
                # Handle each suspected error one by one.
                before_sent = text[:begin_idx]
                after_sent = text[end_idx:]

                # Skip non-Chinese tokens.
                if not is_chinese_string(cur_item):
                    continue
                # Custom confusion dict: direct lookup gives the fix.
                if err_type == ErrorType.confusion:
                    corrected_item = (self.custom_confusion[cur_item], ErrorType.confusion)
                # Fragmented, uncommon single chars: maybe missing/extra chars.
                elif err_type == ErrorType.word_char:
                    maybe_right_items = self.generate_items_word_char(cur_item, before_sent, after_sent, begin_idx, end_idx)
                    corrected_item = self.lm_correct_item(cur_item, maybe_right_items, before_sent, after_sent)
                # Redundant (extra) character: the only candidate is deletion.
                elif err_type == ErrorType.redundancy:
                    maybe_right_items = [('', ErrorType.redundancy)]
                    corrected_item = self.lm_correct_item(cur_item, maybe_right_items, before_sent, after_sent)
                elif err_type == ErrorType.word:
                    # Generate all plausible word candidates.
                    candidates = self.generate_items(cur_item)
                    if not candidates:
                        continue
                    candidates = [(item, ErrorType.word) for item in candidates]
                    corrected_item = self.lm_correct_item(cur_item, candidates, before_sent, after_sent)
                    # Second pass for ErrorType.word: if the pick is a long
                    # word unknown to the frequency dict, re-rank alternatives.
                    if len(corrected_item[0]) > 2 and corrected_item[0] not in self.word_freq:
                        candidates = self.generate_items_for_word(corrected_item[0])
                        if not candidates:
                            continue
                        candidates = [(item, ErrorType.word) for item in candidates]
                        corrected_item = self.lm_correct_item(corrected_item[0], candidates, before_sent, after_sent)
                else:
                    # Remaining types (e.g. ErrorType.char) are handled by
                    # the fill-mask loop below. Previously this fell through
                    # with `corrected_item` unbound (NameError) or stale.
                    continue

                # Record only actual changes.
                if corrected_item[0] != cur_item:
                    text = before_sent + corrected_item[0] + after_sent
                    detail_word = [cur_item, corrected_item[0], begin_idx, end_idx, corrected_item[1]]
                    details.append(detail_word)

        if self.is_char_error_detect:
            text_new = ""
            for idx, s in enumerate(text):
                # Only handle Chinese characters.
                if is_chinese_string(s):
                    # Skip positions already covered by a recorded error.
                    maybe_err = [s, idx, idx + 1, ErrorType.char]
                    if not self._check_contain_details_error(maybe_err, details):
                        # Mask the char; text_new holds the already-corrected prefix.
                        sentence_lst = list(text_new + text[idx:])
                        sentence_lst[idx] = self.mask
                        sentence_new = ''.join(sentence_lst)
                        predicts = self.model(sentence_new)
                        top_tokens = []
                        for p in predicts:
                            token_id = p.get('token', 0)
                            token_str = self.model.tokenizer.convert_ids_to_tokens(token_id)
                            top_tokens.append(token_str)

                        if top_tokens and (s not in top_tokens):
                            # Accept the first prediction that also appears
                            # in the confusion-derived candidate set.
                            candidates = self.generate_items(s)
                            if candidates:
                                for token_str in top_tokens:
                                    if token_str in candidates:
                                        details.append([s, token_str, idx, idx + 1, ErrorType.char])
                                        s = token_str
                                        break
                text_new += s

        details = sorted(details, key=operator.itemgetter(2))
        return text_new, details