def multi_threads_correct(text):
    """Correct a long text by splitting it into blocks and correcting each in a worker thread.

    Each block is prefixed with the final character of the previous block
    (initially '。') so the model sees one char of left context; the worker
    (MyThread) is expected to account for that prefix in its result.

    :param text: input text
    :return: (corrected_text, details) where details is a list of
             [error_word, correct_word, begin_pos, end_pos, error_type]
    """
    threads_list = []
    text_new = ''
    details = []
    # Normalize encoding: utf-8 -> unicode.
    # BUG FIX: the original assigned the converted text to an unused variable
    # `test` and kept splitting the raw input, making the conversion a no-op.
    text = convert_to_unicode(text)
    blocks = bertCorrector.split_2_short_text(text, include_symbol=True)
    try:
        # Merge each (text, trailing symbol) pair into one block: (text+symbol, start_idx).
        blocks = [(blocks[i][0] + blocks[i + 1][0], blocks[i][1])
                  for i in range(0, len(blocks), 2)]
    except Exception:
        # Odd number of pieces: fall back to the original split (best effort).
        pass
    punc = '。'
    for blk, start_idx in blocks:
        # Prefix each block with the previous block's final punctuation char.
        threads_list.append(MyThread(punc + blk, start_idx))
        punc = blk[-1]
    for thread in threads_list:
        thread.start()
    for thread in threads_list:
        thread.join()
        pred_text, pred_details = thread.get_result()
        text_new += pred_text
        details.extend(pred_details)
    return text_new, details
def generate_bertScore_sound_shape_file(self, text, right_sentence='', id_lists=None):
    """Generate bert_score / sound_score / shape_score file entries for *text*.

    Every Chinese char is masked in turn; the MLM's top predictions, together
    with their sound/shape-code similarities, are written via write2scorefile.
    The correction itself is taken from *right_sentence* (the gold text),
    offset by -1 because each block is prefixed with one punctuation char.

    :param text: sentence text
    :param right_sentence: gold (correct) sentence aligned with *text*
    :param id_lists: optional id list forwarded to write2scorefile
    :return: (corrected_text, details)
    """
    # BUG FIX: avoid the shared mutable default argument (`id_lists=[]`).
    id_lists = [] if id_lists is None else id_lists
    text_new = ''
    details = []
    self.check_corrector_initialized()
    # Normalize encoding: utf-8 -> unicode.
    text = convert_to_unicode(text)
    # Split the long text into short blocks (symbols kept as separate pieces).
    blocks = self.split_2_short_text(text, include_symbol=True)
    try:
        # Merge each (text, trailing symbol) pair into one block.
        blocks = [(blocks[i][0] + blocks[i + 1][0], blocks[i][1])
                  for i in range(0, len(blocks), 2)]
    except Exception:
        # Odd number of pieces: fall back to the original split.
        pass
    punc = '。'
    if self.is_char_error_detect:
        for blk, start_idx in blocks:
            # Prefix with the previous block's final punctuation for left context.
            blk = punc + blk
            blk_new = ''
            for idx, s in enumerate(blk):
                # Only Chinese characters are candidates for correction.
                if is_chinese_string(s):
                    # Skip positions already covered by a recorded error.
                    maybe_err = [s, idx, idx + 1, ErrorType.char]
                    if not self._check_contain_details_error(maybe_err, details):
                        sentence_lst = list(blk_new + blk[idx:])
                        sentence_lst[idx] = self.mask
                        sentence_new = ''.join(sentence_lst)
                        predicts = self.model(sentence_new)
                        top_tokens = []
                        ssc_s = self._getSSC(s)
                        for p in predicts:
                            token_id = p.get('token', 0)
                            token_score = p.get('score', 0)
                            token_str = self.model.tokenizer.convert_ids_to_tokens(token_id)
                            ssc_token = self._getSSC(token_str)
                            soundSimi = computeSoundCodeSimilarity(ssc_s[:4], ssc_token[:4])
                            shapeSimi = computeShapeCodeSimilarity(ssc_s[4:], ssc_token[4:])
                            ssc_similarity = computeSSCSimilarity(ssc_s, ssc_token)
                            top_tokens.append({'bert_score': token_score,
                                               'token_str': token_str,
                                               'ssc_similar': ssc_similarity,
                                               'sound_similar': soundSimi,
                                               'shape_similar': shapeSimi})
                        if top_tokens and (s not in [token.get('token_str') for token in top_tokens]):
                            # -1 offset: blk carries one extra leading punctuation char.
                            self.write2scorefile(s, top_tokens, start_idx + idx - 1,
                                                 id_lists, right_sentence[start_idx + idx - 1])
                            # Correction comes from the gold sentence, not the model.
                            correct_item = right_sentence[start_idx + idx - 1]
                            if correct_item != s:
                                details.append([s, correct_item, idx + start_idx,
                                                idx + start_idx + 1, ErrorType.char])
                                s = correct_item
                blk_new += s
            text_new += blk_new
            punc = blk_new[-1]
    details = sorted(details, key=operator.itemgetter(2))
    return text_new, details
def bert_correct_ssc_origin(self, text):
    """Correct a sentence character-by-character using sound-shape codes (SSC).

    :param text: sentence text
    :return: (corrected_text, details) with details entries
             [error_word, correct_word, begin_pos, end_pos, error_type]
    """
    corrected = ''
    details = []
    self.check_corrector_initialized()
    # Normalize encoding: utf-8 -> unicode.
    text = convert_to_unicode(text)
    if self.is_char_error_detect:
        corrected = ""
        for pos, ch in enumerate(text):
            # Non-Chinese characters pass through untouched.
            if not is_chinese_string(ch):
                corrected += ch
                continue
            # Skip positions already covered by a recorded error.
            candidate_err = [ch, pos, pos + 1, ErrorType.char]
            if self._check_contain_details_error(candidate_err, details):
                corrected += ch
                continue
            # Mask the current char (using already-corrected left context).
            chars = list(corrected + text[pos:])
            chars[pos] = self.mask
            masked_sentence = ''.join(chars)
            predictions = self.model(masked_sentence)
            scored_tokens = []
            ssc_orig = self._getSSC(ch)
            for pred in predictions:
                tok_id = pred.get('token', 0)
                tok_score = pred.get('score', 0)
                tok_str = self.model.tokenizer.convert_ids_to_tokens(tok_id)
                ssc_pred = self._getSSC(tok_str)
                sound_sim = computeSoundCodeSimilarity(ssc_orig[:4], ssc_pred[:4])
                shape_sim = computeShapeCodeSimilarity(ssc_orig[4:], ssc_pred[4:])
                scored_tokens.append({
                    'bert_score': tok_score,
                    'token_str': tok_str,
                    'ssc_similar': computeSSCSimilarity(ssc_orig, ssc_pred),
                    'sound_similar': sound_sim,
                    'shape_similar': shape_sim,
                })
            predicted_strs = [t.get('token_str') for t in scored_tokens]
            # Only consider replacing when the model did NOT predict the char itself.
            if scored_tokens and ch not in predicted_strs:
                replacement = self.neural_ssc_correct_item(ch, scored_tokens)
                if replacement != ch:
                    details.append([ch, replacement, pos, pos + 1, ErrorType.char])
                    ch = replacement
            corrected += ch
    details = sorted(details, key=operator.itemgetter(2))
    return corrected, details
def detect(self, sentence):
    """Detect suspicious errors in a sentence: [word, begin_pos, end_pos, error_type].

    :param sentence: input text
    :return: list[list] of [error_word, begin_pos, end_pos, error_type],
             sorted by begin_pos ascending
    """
    maybe_errors = []
    if not sentence.strip():
        return maybe_errors
    # Lazy initialization of detector resources.
    self.check_detector_initialized()
    # Normalize encoding: utf-8 -> unicode.
    sentence = convert_to_unicode(sentence)
    # Text normalization (see uniform()).
    sentence = uniform(sentence)
    # Tokenize: yields (word, begin_idx, end_idx).
    tokens = self.tokenizer.tokenize(sentence)
    # Custom confusion set: direct substring hits become 'confusion' errors.
    for confuse in self.custom_confusion:
        idx = sentence.find(confuse)
        if idx > -1:
            maybe_err = [confuse, idx, idx + len(confuse), ErrorType.confusion]
            self._add_maybe_error_item(maybe_err, maybe_errors)
    if self.is_word_error_detect:
        for word, begin_idx, end_idx in tokens:
            # Skip filtered tokens (punctuation, digits, etc. — see is_filter_token).
            if self.is_filter_token(word):
                continue
            # Token is in the vocabulary: only single-char heuristics apply.
            if word in self.word_freq:
                # Uncommon single char (freq < 10000): maybe wrong/extra/missing char.
                if len(word) == 1 and word in self.char_freq and self.char_freq.get(word) < 10000:
                    maybe_err = [word, begin_idx, end_idx, ErrorType.word_char]
                    self._add_maybe_error_item(maybe_err, maybe_errors)
                    continue
                # Repeated char: possible redundancy (extra char).
                # BUG FIX: require begin_idx > 0 — sentence[-1] would otherwise
                # compare the FIRST char against the LAST one (negative index),
                # yielding a false redundancy when they happen to match.
                if len(word) == 1 and begin_idx > 0 and sentence[begin_idx - 1] == word:
                    maybe_err = [word, begin_idx, end_idx, ErrorType.redundancy]
                    self._add_maybe_error_item(maybe_err, maybe_errors)
                    continue
                # Any other fragment single char: maybe wrong/extra/missing char.
                if len(word) == 1:
                    maybe_err = [word, begin_idx, end_idx, ErrorType.word_char]
                    self._add_maybe_error_item(maybe_err, maybe_errors)
                    continue
            # NOTE: word-level (OOV) detection is currently disabled.
            # maybe_err = [word, begin_idx, end_idx, ErrorType.word]
            # self._add_maybe_error_item(maybe_err, maybe_errors)
    return sorted(maybe_errors, key=lambda k: k[1], reverse=False)
def bert_correct_ssc(self, text):
    """Correct a sentence block-by-block using sound-shape codes (SSC).

    :param text: sentence text
    :return: (corrected_text, details) with details entries
             [error_word, correct_word, begin_pos, end_pos, error_type]
    """
    corrected_text = ''
    details = []
    self.check_corrector_initialized()
    # Normalize encoding: utf-8 -> unicode.
    text = convert_to_unicode(text)
    # Split the long text into short blocks (symbols kept).
    blocks = self.split_2_short_text(text, include_symbol=True)
    if self.is_word_error_detect:
        # Word-level detection is intentionally not performed here.
        pass
    if self.is_char_error_detect:
        for blk, start_idx in blocks:
            fixed_blk = ''
            for offset, ch in enumerate(blk):
                # Only Chinese characters are candidates for correction.
                if is_chinese_string(ch):
                    # Mask the char, using already-corrected left context.
                    chars = list(fixed_blk + blk[offset:])
                    chars[offset] = self.mask
                    masked = ''.join(chars)
                    predictions = self.model(masked)
                    scored = []
                    ssc_orig = self._getSSC(ch)
                    for pred in predictions:
                        tok_id = pred.get('token', 0)
                        tok_score = pred.get('score', 0)
                        tok_str = self.model.tokenizer.convert_ids_to_tokens(tok_id)
                        ssc_pred = self._getSSC(tok_str)
                        scored.append({
                            'bert_score': tok_score,
                            'token_str': tok_str,
                            'ssc_similar': computeSSCSimilarity(ssc_orig, ssc_pred),
                            'sound_similar': computeSoundCodeSimilarity(ssc_orig[:4], ssc_pred[:4]),
                            'shape_similar': computeShapeCodeSimilarity(ssc_orig[4:], ssc_pred[4:]),
                        })
                    # Only replace when the model did NOT predict the char itself.
                    if scored and ch not in [t.get('token_str') for t in scored]:
                        replacement = self.neural_ssc_correct_item(ch, scored)
                        if replacement != ch:
                            details.append([ch, replacement, offset + start_idx,
                                            offset + start_idx + 1, ErrorType.char])
                            ch = replacement
                fixed_blk += ch
            corrected_text += fixed_blk
    details = sorted(details, key=operator.itemgetter(2))
    return corrected_text, details
def electra_correct(self, text):
    """Correct a sentence using an ELECTRA discriminator + fill-mask generator.

    :param text: sentence text
    :return: (corrected_text, details) with details entries
             [error_word, correct_word, begin_pos, end_pos]
    """
    corrected_text = ''
    details = []
    # Normalize encoding: utf-8 -> unicode.
    text = convert_to_unicode(text)
    # Split the long text into short blocks (symbols kept).
    for blk, start_idx in self.split_2_short_text(text, include_symbol=True):
        # The discriminator flags suspicious character positions.
        suspect_ids = self.electra_detect(blk)
        chars = list(blk)
        for pos in suspect_ids:
            original_char = chars[pos]
            # Only Chinese characters are corrected.
            if is_chinese_string(original_char):
                # Mask the suspect position; the generator fills it (top-5 by default).
                chars[pos] = self.mask
                masked = ''.join(chars)
                predictions = self.g_model(masked)
                model_candidates = []
                for pred in predictions:
                    tok_id = pred.get('token', 0)
                    model_candidates.append(
                        self.g_model.tokenizer.convert_ids_to_tokens(tok_id))
                if model_candidates and original_char not in model_candidates:
                    # Keep only predictions that are plausible confusions of the char.
                    plausible = self.generate_items(original_char)
                    if plausible:
                        for cand in model_candidates:
                            if cand in plausible:
                                details.append([original_char, cand,
                                                start_idx + pos, start_idx + pos + 1])
                                chars[pos] = cand
                                break
                # Restore the original char if nothing replaced the mask.
                if chars[pos] == self.mask:
                    chars[pos] = original_char
        corrected_text += ''.join(chars)
    details = sorted(details, key=operator.itemgetter(2))
    return corrected_text, details
def correct_short(self, text, start_idx=0):
    """Correct one short text block (word-level pass, then char-level MLM pass).

    The multi-thread caller prefixes *text* with one punctuation char for left
    context — hence char-level detail positions subtract 1 and the returned
    text drops its first character.

    :param text: short text, expected to carry a 1-char left-context prefix
    :param start_idx: offset of the (unprefixed) block within the full text
    :return: (corrected_text_without_prefix, details)
    """
    text_new = ''
    details = []
    self.check_corrector_initialized()
    # Normalize encoding: utf-8 -> unicode.
    text = convert_to_unicode(text)
    if self.is_word_error_detect:
        maybe_errors = self.detect(text)
        # trick: like a translation model, process in reverse order so earlier
        # replacements don't shift the indices of later ones.
        maybe_errors = sorted(maybe_errors, key=operator.itemgetter(2), reverse=True)
        for cur_item, begin_idx, end_idx, err_type in maybe_errors:
            # Correct item by item.
            before_sent = text[:begin_idx]
            after_sent = text[end_idx:]
            # Non-Chinese fragments are not corrected.
            if not is_chinese_string(cur_item):
                continue
            if err_type == ErrorType.confusion:
                # Custom confusion set: take the mapped result directly.
                corrected_item = (self.custom_confusion[cur_item], ErrorType.confusion)
            elif err_type == ErrorType.word_char:
                # Fragment / uncommon single char: maybe extra or missing char.
                maybe_right_items = self.generate_items_word_char(
                    cur_item, before_sent, after_sent, begin_idx, end_idx)
                corrected_item = self.lm_correct_item(
                    cur_item, maybe_right_items, before_sent, after_sent)
            elif err_type == ErrorType.redundancy:
                # Redundant char: candidate correction is deletion.
                maybe_right_items = [('', ErrorType.redundancy)]
                corrected_item = self.lm_correct_item(
                    cur_item, maybe_right_items, before_sent, after_sent)
            else:
                # BUG FIX: any other error type previously left corrected_item
                # unbound and raised NameError on the check below.
                continue
            # output
            if corrected_item[0] != cur_item:
                text = before_sent + corrected_item[0] + after_sent
                # NOTE(review): unlike the char branch below, these positions are
                # not shifted by -1 for the prefix char — confirm against callers.
                detail_word = [cur_item, corrected_item[0],
                               start_idx + begin_idx, start_idx + end_idx,
                               corrected_item[1]]
                details.append(detail_word)
    if self.is_char_error_detect:
        for idx, s in enumerate(text):
            # Non-Chinese characters are not corrected.
            if is_chinese_string(s):
                sentence_lst = list(text_new + text[idx:])
                sentence_lst[idx] = self.mask
                sentence_new = ''.join(sentence_lst)
                predicts = self.model(sentence_new)
                top_tokens = []
                ssc_s = self._getSSC(s)
                for p in predicts:
                    token_id = p.get('token', 0)
                    token_score = p.get('score', 0)
                    token_str = self.model.tokenizer.convert_ids_to_tokens(token_id)
                    ssc_token = self._getSSC(token_str)
                    soundSimi = computeSoundCodeSimilarity(ssc_s[:4], ssc_token[:4])
                    shapeSimi = computeShapeCodeSimilarity(ssc_s[4:], ssc_token[4:])
                    ssc_similarity = computeSSCSimilarity(ssc_s, ssc_token)
                    top_tokens.append({'bert_score': token_score,
                                       'token_str': token_str,
                                       'ssc_similar': ssc_similarity,
                                       'sound_similar': soundSimi,
                                       'shape_similar': shapeSimi})
                if top_tokens and (s not in [token.get('token_str') for token in top_tokens]):
                    correct_item = self.neural_ssc_correct_item(s, top_tokens)
                    if correct_item != s:
                        # -1 offset compensates for the 1-char left-context prefix.
                        details.append([s, correct_item, idx + start_idx - 1,
                                        idx + start_idx, ErrorType.char])
                        s = correct_item
            text_new += s
    details = sorted(details, key=operator.itemgetter(2))
    # Drop the left-context prefix char before returning.
    return text_new[1:], details
def bert_correct(self, text):
    """Correct a sentence: word-level LM pass, then char-level MLM pass.

    :param text: sentence text
    :return: (corrected_text, details) with details entries
             [error_word, correct_word, begin_pos, end_pos, error_type]
    """
    text_new = ''
    details = []
    self.check_corrector_initialized()
    # Normalize encoding: utf-8 -> unicode.
    text = convert_to_unicode(text)
    if self.is_word_error_detect:
        maybe_errors = self.detect(text)
        # trick: like a translation model, process in reverse order so earlier
        # replacements don't shift the indices of later ones.
        maybe_errors = sorted(maybe_errors, key=operator.itemgetter(2), reverse=True)
        for cur_item, begin_idx, end_idx, err_type in maybe_errors:
            # Correct item by item.
            before_sent = text[:begin_idx]
            after_sent = text[end_idx:]
            # Non-Chinese fragments are not corrected.
            if not is_chinese_string(cur_item):
                continue
            # Custom confusion set: take the mapped result directly.
            if err_type == ErrorType.confusion:
                corrected_item = (self.custom_confusion[cur_item], ErrorType.confusion)
            # Fragment / uncommon single char: maybe extra, missing or wrong char.
            elif err_type == ErrorType.word_char:
                maybe_right_items = self.generate_items_word_char(cur_item, before_sent, after_sent, begin_idx, end_idx)
                corrected_item = self.lm_correct_item(cur_item, maybe_right_items, before_sent, after_sent)
            # Redundant char: candidate correction is deletion.
            elif err_type == ErrorType.redundancy:
                maybe_right_items = [('',ErrorType.redundancy)]
                corrected_item = self.lm_correct_item(cur_item, maybe_right_items, before_sent, after_sent)
            elif err_type == ErrorType.word:
                # Collect all plausible replacement words.
                candidates = self.generate_items(cur_item)
                if not candidates:
                    continue
                candidates=[(item,ErrorType.word) for item in candidates]
                corrected_item = self.lm_correct_item(cur_item, candidates, before_sent, after_sent)
                # Second-layer check for ErrorType.word results:
                # re-correct multi-char results that are still out-of-vocabulary.
                if len(corrected_item[0]) > 2 and corrected_item[0] not in self.word_freq:
                    candidates = self.generate_items_for_word(corrected_item[0])
                    if not candidates:
                        continue
                    candidates=[(item,ErrorType.word) for item in candidates]
                    corrected_item = self.lm_correct_item(corrected_item[0], candidates, before_sent, after_sent)
            # output
            if corrected_item[0] != cur_item:
                text = before_sent + corrected_item[0] + after_sent
                detail_word = [cur_item, corrected_item[0], begin_idx, end_idx, corrected_item[1]]
                details.append(detail_word)
    if self.is_char_error_detect:
        text_new = ""
        for idx, s in enumerate(text):
            # Non-Chinese characters are not corrected.
            if is_chinese_string(s):
                # Skip positions already covered by a recorded error.
                maybe_err = [s, idx, idx + 1, ErrorType.char]
                if not self._check_contain_details_error(maybe_err, details):
                    sentence_lst = list(text_new + text[idx:])
                    sentence_lst[idx] = self.mask
                    sentence_new = ''.join(sentence_lst)
                    predicts = self.model(sentence_new)
                    top_tokens = []
                    for p in predicts:
                        token_id = p.get('token', 0)
                        token_str = self.model.tokenizer.convert_ids_to_tokens(token_id)
                        top_tokens.append(token_str)
                    # Only consider replacing when the model did NOT predict the char itself.
                    if top_tokens and (s not in top_tokens):
                        # Collect all plausible replacement chars.
                        candidates = self.generate_items(s)
                        if candidates:
                            for token_str in top_tokens:
                                if token_str in candidates:
                                    details.append([s, token_str, idx, idx + 1,ErrorType.char])
                                    s = token_str
                                    break
            text_new += s
    details = sorted(details, key=operator.itemgetter(2))
    return text_new, details