def _correct_item(sentence, idx, item):
    """Correct one suspected-error word inside the sentence, scored by LM perplexity.

    :param sentence: full sentence text
    :param idx: index string, e.g. "3,5" — comma-separated begin/end positions
    :param item: the suspected wrong word
    :return: (corrected sentence, detail) where detail is a list of
             (wrong, right, begin_idx, end_idx) tuples (empty if unchanged)
    """
    corrected_sent = sentence
    # non-Chinese items are left untouched
    if not is_chinese_string(item):
        return corrected_sent, []
    # collect every candidate that might be the correct word
    maybe_error_items = _generate_items(item)
    if not maybe_error_items:
        return corrected_sent, []
    ids = idx.split(',')
    begin_id = int(ids[0])
    # single index means a one-char span: end = begin + 1
    end_id = int(ids[-1]) if len(ids) > 1 else int(ids[0]) + 1
    before = sentence[:begin_id]
    after = sentence[end_id:]
    # pick the candidate whose substitution minimizes character-trigram perplexity
    corrected_item = min(maybe_error_items,
                         key=lambda k: get_ppl_score(list(before + k + after),
                                                     mode=trigram_char))
    wrongs, rights, begin_idx, end_idx = [], [], [], []
    if corrected_item != item:
        corrected_sent = before + corrected_item + after
        # default_logger.debug('pred:', item, '=>', corrected_item)
        wrongs.append(item)
        rights.append(corrected_item)
        begin_idx.append(begin_id)
        end_idx.append(end_id)
    detail = list(zip(wrongs, rights, begin_idx, end_idx))
    return corrected_sent, detail
def correct(self, sentence=''):
    """Correct a sentence.

    :param sentence: sentence text
    :return: (corrected sentence, detail) where detail is a sorted list of
             [wrong, right, begin_idx, end_idx]
    """
    detail = []
    maybe_errors = self.detect(sentence)
    for item, begin_idx, end_idx, err_type in maybe_errors:
        # correct each detected error one by one
        before_sent = sentence[:begin_idx]
        after_sent = sentence[end_idx:]
        if err_type == ErrorType.char:
            # skip non-Chinese wrong chars
            if not is_chinese_string(item):
                continue
            if not self.check_vocab_has_all_token(sentence):
                continue
            # predict the most likely replacement char with the masked LM
            corrected_item = self.predict_mask_token(
                sentence, begin_idx, end_idx)
        elif err_type == ErrorType.word:
            corrected_item = item
        else:
            print('not strand error_type')
            # BUGFIX: skip unknown error types — previously execution fell
            # through and used `corrected_item`, which is unbound on the
            # first iteration (NameError) or stale from a prior one.
            continue
        # output
        if corrected_item != item:
            sentence = before_sent + corrected_item + after_sent
            detail_word = [item, corrected_item, begin_idx, end_idx]
            detail.append(detail_word)
    detail = sorted(detail, key=operator.itemgetter(2))
    return sentence, detail
def _correct_item(sentence, idx, item):
    """Fix one suspected-error word in ``sentence``.

    :param sentence: full sentence text
    :param idx: "begin,end" index string (single number means one char)
    :param item: the suspected wrong word
    :return: (corrected sentence, list of (wrong, right, begin, end) tuples)
    """
    # leave non-Chinese items alone
    if not is_chinese_string(item):
        return sentence, []
    candidates = _generate_items(item)
    if not candidates:
        return sentence, []
    parts = idx.split(',')
    begin_id = int(parts[0])
    if len(parts) > 1:
        end_id = int(parts[-1])
    else:
        end_id = begin_id + 1
    head = sentence[:begin_id]
    tail = sentence[end_id:]
    # choose the candidate with the lowest character-trigram perplexity
    best = min(candidates,
               key=lambda cand: get_ppl_score(list(head + cand + tail),
                                              mode=trigram_char))
    if best == item:
        return sentence, []
    return head + best + tail, [(item, best, begin_id, end_id)]
def generate_items(self, word, fraction=1):
    """Generate the candidate-correction set for ``word``.

    :param word: suspected wrong word
    :param fraction: keep roughly 1/fraction of the ranked candidates
    :return: candidates sorted by word frequency, highest first
    """
    candidates_1_order = []
    candidates_2_order = []
    candidates_3_order = []
    # same pinyin word
    candidates_1_order.extend(self._confusion_word_set(word))
    # custom confusion word
    candidates_1_order.extend(self._confusion_custom_set(word))
    # same pinyin char
    if len(word) == 1:
        # same one char pinyin
        confusion = [i for i in self._confusion_char_set(word[0]) if i]
        candidates_2_order.extend(confusion)
    if len(word) == 2:
        # same first char pinyin
        confusion = [
            i + word[1:] for i in self._confusion_char_set(word[0]) if i
        ]
        candidates_2_order.extend(confusion)
        # same last char pinyin
        confusion = [
            word[:-1] + i for i in self._confusion_char_set(word[-1]) if i
        ]
        candidates_2_order.extend(confusion)
    if len(word) > 2:
        # same mid char pinyin
        confusion = [
            word[0] + i + word[2:] for i in self._confusion_char_set(word[1])
        ]
        candidates_3_order.extend(confusion)
        # same first word pinyin
        confusion_word = [
            i + word[-1] for i in self._confusion_word_set(word[:-1])
        ]
        candidates_3_order.extend(confusion_word)
        # same last word pinyin
        confusion_word = [
            word[0] + i for i in self._confusion_word_set(word[1:])
        ]
        candidates_3_order.extend(confusion_word)
    # merge every confusion candidate, keep Chinese-only strings
    confusion_word_set = set(candidates_1_order + candidates_2_order +
                             candidates_3_order)
    confusion_word_list = [
        item for item in confusion_word_set if is_chinese_string(item)
    ]
    # rank by corpus word frequency, most frequent first
    confusion_sorted = sorted(confusion_word_list,
                              key=lambda k: self.word_frequency(k),
                              reverse=True)
    return confusion_sorted[:len(confusion_word_list) // fraction + 1]
def _correct_item(self, sentence, item, begin_idx, end_idx):
    """Correct a single word/char error inside ``sentence``.

    :param sentence: full sentence text
    :param item: suspected wrong word
    :param begin_idx: start position of the item
    :param end_idx: end position of the item
    :return: (corrected sentence, detail) — detail is
             [wrong, right, begin_idx, end_idx] or [] if unchanged
    """
    # non-Chinese items are not corrected
    if not is_chinese_string(item):
        return sentence, []
    candidates = self._generate_items(item)
    if not candidates:
        return sentence, []
    prefix = sentence[:begin_idx]
    suffix = sentence[end_idx:]
    # the best candidate minimizes language-model perplexity in context
    best = min(candidates,
               key=lambda cand: self.ppl_score(list(prefix + cand + suffix)))
    if best == item:
        return sentence, []
    # default_logger.debug('predict:' + item + '=>' + best)
    return prefix + best + suffix, [item, best, begin_idx, end_idx]
def tokenize_words(text):
    """Word segmentation: jieba for Chinese runs, whitespace split otherwise.

    :param text: input text
    :return: list of word tokens
    """
    # FIX: hoist the import out of the loop — the original executed
    # `import jieba` on every Chinese sentence, paying the sys.modules
    # lookup per iteration for no benefit.
    import jieba
    output = []
    sentences = split_2_short_text(text, include_symbol=True)
    for sentence, _idx in sentences:
        if is_chinese_string(sentence):
            output.extend(jieba.lcut(sentence))
        else:
            output.extend(whitespace_tokenize(sentence))
    return output
def _generate_items(word, fraction=1):
    """Build the pinyin-confusion candidate set for ``word``.

    :param word: suspected wrong word
    :param fraction: keep roughly 1/fraction of the ranked candidates
    :return: candidates sorted by frequency, highest first
    """
    candidates_1_order = []
    candidates_2_order = []
    candidates_3_order = []
    # same pinyin word
    candidates_1_order.extend(get_confusion_word_set(word))
    # same pinyin char
    if len(word) == 1:
        # same pinyin
        confusion = [i for i in get_confusion_char_set(word[0]) if i]
        candidates_2_order.extend(confusion)
    if len(word) > 1:
        # same first pinyin
        confusion = [
            i + word[1:] for i in get_confusion_char_set(word[0]) if i
        ]
        candidates_2_order.extend(confusion)
        # same last pinyin
        confusion = [
            word[:-1] + i for i in get_confusion_char_set(word[-1]) if i
        ]
        candidates_2_order.extend(confusion)
    if len(word) > 2:
        # same mid char pinyin
        confusion = [
            word[0] + i + word[2:] for i in get_confusion_char_set(word[1])
        ]
        candidates_3_order.extend(confusion)
        # same first word pinyin
        confusion_word = [
            i + word[-1] for i in get_confusion_word_set(word[:-1])
        ]
        candidates_1_order.extend(confusion_word)
        # same last word pinyin
        confusion_word = [
            word[0] + i for i in get_confusion_word_set(word[1:])
        ]
        candidates_1_order.extend(confusion_word)
    # merge all confusion candidates, keep Chinese-only strings
    confusion_word_set = set(candidates_1_order + candidates_2_order +
                             candidates_3_order)
    confusion_word_list = [
        item for item in confusion_word_set if is_chinese_string(item)
    ]
    # rank by corpus frequency, most frequent first
    confusion_sorted = sorted(confusion_word_list, key=lambda k: \
        get_frequency(k), reverse=True)
    return confusion_sorted[:len(confusion_word_list) // fraction + 1]
def electra_correct(self, text):
    """Sentence correction with an ELECTRA detector + generator fill-mask.

    :param text: sentence text
    :return: (corrected_text, details) — details is a sorted list of
             [error_word, correct_word, begin_pos, end_pos]
    """
    text_new = ''
    details = []
    # normalize encoding: utf-8 -> unicode
    text = convert_to_unicode(text)
    # split long text into short blocks
    blocks = self.split_2_short_text(text, include_symbol=True)
    for blk, start_idx in blocks:
        # detector returns char positions suspected to be wrong
        error_ids = self.electra_detect(blk)
        sentence_lst = list(blk)
        for idx in error_ids:
            s = sentence_lst[idx]
            if is_chinese_string(s):
                # handle Chinese error: mask the char and let the
                # generator fill-mask model predict it (top5 by default)
                sentence_lst[idx] = self.mask
                sentence_new = ''.join(sentence_lst)
                predicts = self.g_model(sentence_new)
                top_tokens = []
                for p in predicts:
                    token_id = p.get('token', 0)
                    token_str = self.g_model.tokenizer.convert_ids_to_tokens(
                        token_id)
                    top_tokens.append(token_str)
                if top_tokens and (s not in top_tokens):
                    # gather all possibly-correct candidate words
                    candidates = self.generate_items(s)
                    if candidates:
                        # accept the first prediction that is also a candidate
                        for token_str in top_tokens:
                            if token_str in candidates:
                                details.append([
                                    s, token_str, start_idx + idx,
                                    start_idx + idx + 1
                                ])
                                sentence_lst[idx] = token_str
                                break
                # restore the original char if nothing replaced the mask
                if sentence_lst[idx] == self.mask:
                    sentence_lst[idx] = s
        blk_new = ''.join(sentence_lst)
        text_new += blk_new
    details = sorted(details, key=operator.itemgetter(2))
    return text_new, details
def generate_items(self, word, fragment=1):
    """Build a ranked candidate-correction set for ``word``.

    :param word: suspected wrong word
    :param fragment: keep roughly 1/fragment of the ranked candidates
    :return: candidates sorted by word frequency, highest first
    """
    self.check_corrector_initialized()
    one_char, two_char, multi_char = [], [], []
    char_set = self._confusion_char_set
    word_set = self._confusion_word_set
    # words sharing the same pinyin
    one_char += word_set(word)
    # user-defined confusion entries
    one_char += self._confusion_custom_set(word)
    n = len(word)
    if n == 1:
        # single char: same-pinyin chars
        one_char += [c for c in char_set(word[0]) if c]
    if n == 2:
        # replace first char, then last char, by same-pinyin chars
        two_char += [c + word[1:] for c in char_set(word[0]) if c]
        two_char += [word[:-1] + c for c in char_set(word[-1]) if c]
    if n > 2:
        # replace the middle char by same-pinyin chars
        multi_char += [word[0] + c + word[2:] for c in char_set(word[1])]
        # replace the leading / trailing sub-word by same-pinyin words
        multi_char += [c + word[-1] for c in word_set(word[:-1])]
        multi_char += [word[0] + c for c in word_set(word[1:])]
    # merge, keep Chinese-only strings, rank by corpus frequency
    pool = set(one_char + two_char + multi_char)
    chinese_only = [w for w in pool if is_chinese_string(w)]
    ranked = sorted(chinese_only, key=self.word_frequency, reverse=True)
    return ranked[:len(chinese_only) // fragment + 1]
def is_filter_token(token):
    """Return True when ``token`` should be skipped by the corrector.

    Filtered tokens: blank/whitespace-only, pure digits, pure ASCII
    letters, and anything that is not an all-Chinese string.
    """
    # blank / whitespace-only
    if not token.strip():
        return True
    # pure number
    if token.isdigit():
        return True
    # pure alphabetic token
    if is_alphabet_string(token.lower()):
        return True
    # anything not entirely Chinese
    if not is_chinese_string(token):
        return True
    return False
def bert_correct(self, text):
    """Sentence correction via BERT masked-LM prediction, char by char.

    :param text: sentence text
    :return: (corrected_text, details) — details is a sorted list of
             [error_word, correct_word, begin_pos, end_pos]
    """
    text_new = ''
    details = []
    self.check_corrector_initialized()
    # normalize encoding: utf-8 -> unicode
    text = convert_to_unicode(text)
    # split long text into blocks of at most 128 chars
    blocks = self.split_text_by_maxlen(text, maxlen=128)
    for blk, start_idx in blocks:
        blk_new = ''
        for idx, s in enumerate(blk):
            # only Chinese chars are checked
            if is_chinese_string(s):
                # rebuild the block with already-corrected prefix, mask
                # the current char, and ask the model for predictions
                sentence_lst = list(blk_new + blk[idx:])
                sentence_lst[idx] = self.mask
                sentence_new = ''.join(sentence_lst)
                # fill-mask prediction, top5 by default
                predicts = self.model(sentence_new)
                top_tokens = []
                for p in predicts:
                    token_id = p.get('token', 0)
                    token_str = self.model.tokenizer.convert_ids_to_tokens(
                        token_id)
                    top_tokens.append(token_str)
                if top_tokens and (s not in top_tokens):
                    # gather all possibly-correct candidate words
                    candidates = self.generate_items(s)
                    if candidates:
                        # accept the first prediction that is a candidate
                        for token_str in top_tokens:
                            if token_str in candidates:
                                details.append([
                                    s, token_str, start_idx + idx,
                                    start_idx + idx + 1
                                ])
                                s = token_str
                                break
            blk_new += s
        text_new += blk_new
    details = sorted(details, key=operator.itemgetter(2))
    return text_new, details
def correct(self, sentence):
    """Correct a sentence (debug-verbose variant).

    :param sentence: sentence text
    :return: (corrected sentence, detail) — detail is a sorted list of
             [wrong, right, begin_idx, end_idx]
    """
    detail = []
    self.check_corrector_initialized()
    # split long sentences into short ones
    # sentences = re.split(r";|,|。|\?\s|;\s|,\s", sentence)
    maybe_errors = self.detect(sentence)
    # trick: like a translation model, process in reverse order —
    # sort by end_idx descending so earlier edits don't shift later spans
    maybe_errors = sorted(maybe_errors,
                          key=operator.itemgetter(2),
                          reverse=True)
    # NOTE(review): `pprint` here takes (label, obj) — presumably a
    # project-local debug helper, not stdlib pprint.pprint; confirm.
    pprint('ordered maybe errors', maybe_errors)
    print('---generate items:')
    # corrections are applied one item at a time
    for item, begin_idx, end_idx, err_type in maybe_errors:
        before_sent = sentence[:begin_idx]
        after_sent = sentence[end_idx:]
        pprint('item,err_type', [item, err_type])
        # words listed in the custom confusion dict map directly to the fix
        if err_type == error_type["confusion"]:
            corrected_item = self.custom_confusion[item]
        else:
            # non-Chinese wrong items are not handled
            if not is_chinese_string(item):
                continue
            # gather all possibly-correct candidate words
            maybe_right_items = self.generate_items(item)
            pprint('maybe_right_items', maybe_right_items)
            if not maybe_right_items:
                continue
            corrected_item = self.lm_correct_item(item, maybe_right_items,
                                                  before_sent, after_sent)
            pprint('corrected_item', corrected_item)
        # output
        if corrected_item != item:
            sentence = before_sent + corrected_item + after_sent
            # logger.debug('predict:' + item + '=>' + corrected_item)
            detail_word = [item, corrected_item, begin_idx, end_idx]
            detail.append(detail_word)
    detail = sorted(detail, key=operator.itemgetter(2))
    return sentence, detail
def ernie_correct(self, text, ernie_cut_type='char'):
    """Sentence correction via ERNIE masked-LM prediction.

    :param text: sentence text
    :param ernie_cut_type: segmentation granularity ('char'/'word')
    :return: (corrected_text, details) — details is a sorted list of
             [error_word, correct_word, begin_pos, end_pos]
    """
    text_new = ''
    details = []
    self.check_corrector_initialized()
    # normalize encoding: utf-8 -> unicode
    text = convert_to_unicode(text)
    # split long text into blocks of at most 512 chars
    blocks = self.split_text_by_maxlen(text, maxlen=512)
    for blk, start_idx in blocks:
        blk_new = ''
        blk = segment(blk, cut_type=ernie_cut_type, pos=False)
        for idx, s in enumerate(blk):
            # only Chinese tokens are checked
            if is_chinese_string(s):
                # shallow copy of the token list (slice-concat copy)
                sentence_lst = blk[:idx] + blk[idx:]
                # mask the token with one mask symbol per char
                sentence_lst[idx] = self.mask_token * len(s)
                sentence_new = ' '.join(sentence_lst)
                # fill-mask prediction, top5 by default
                predicts = self.predict_mask(sentence_new)
                top_tokens = []
                for p in predicts:
                    top_tokens.append(p.get('token', ''))
                if top_tokens and (s not in top_tokens):
                    # gather all possibly-correct candidate words
                    candidates = self.generate_items(s)
                    if candidates:
                        # accept the first prediction that is a candidate
                        for token_str in top_tokens:
                            if token_str in candidates:
                                details.append([
                                    s, token_str, start_idx + idx,
                                    start_idx + idx + 1
                                ])
                                s = token_str
                                break
            blk_new += s
        text_new += blk_new
    details = sorted(details, key=operator.itemgetter(2))
    return text_new, details
def correct(self, sentence):
    """Correct a sentence.

    :param sentence: sentence text
    :return: (corrected sentence, detail) — detail is a sorted list of
             [wrong, right, begin_idx, end_idx]
    """
    detail = []
    self.check_corrector_initialized()
    maybe_errors = self.detect(sentence)
    # sort ascending by end index
    maybe_errors = sorted(maybe_errors,
                          key=operator.itemgetter(2),
                          reverse=False)
    # correct each detected error one by one
    for cur_item, begin_idx, end_idx, err_type in maybe_errors:
        before_sent = sentence[:begin_idx]
        after_sent = sentence[end_idx:]
        # words listed in the custom confusion dict map directly to the fix
        if err_type == ErrorType.confusion:
            corrected_item = self.custom_confusion[cur_item]
        else:
            # non-Chinese wrong items are not handled
            if not is_chinese_string(cur_item):
                continue
            # gather all possibly-correct candidate words
            candidates = self.generate_items(cur_item)
            if not candidates:
                continue
            corrected_item = self.get_lm_correct_item(cur_item, candidates,
                                                      before_sent,
                                                      after_sent,
                                                      n=5,
                                                      threshold=50)
        # output
        if corrected_item != cur_item:
            sentence = before_sent + corrected_item + after_sent
            detail_word = [cur_item, corrected_item, begin_idx, end_idx]
            detail.append(detail_word)
    detail = sorted(detail, key=operator.itemgetter(2))
    return sentence, detail
def correct(self, sentence=''):
    """Correct a sentence.

    :param sentence: sentence text
    :return: (corrected sentence, detail) — detail is a sorted list of
             [wrong, right, begin_idx, end_idx]
    """
    detail = []
    maybe_errors = self.detect(sentence)
    maybe_errors = sorted(maybe_errors,
                          key=operator.itemgetter(2),
                          reverse=False)
    for item, begin_idx, end_idx, err_type in maybe_errors:
        # correct each detected error one by one
        before_sent = sentence[:begin_idx]
        after_sent = sentence[end_idx:]
        # words listed in the custom confusion dict map directly to the fix
        if err_type == error_type["confusion"]:
            corrected_item = self.custom_confusion[item]
        elif err_type == error_type["char"]:
            # skip non-Chinese wrong chars
            if not is_chinese_string(item):
                continue
            if not self.check_vocab_has_all_token(sentence):
                continue
            # predict the most likely replacement char with the BERT LM
            corrected_item = self.bert_lm_infer(sentence,
                                                error_begin_idx=begin_idx,
                                                error_end_idx=end_idx)
        elif err_type == error_type["word"]:
            corrected_item = item
        else:
            print('not strand error_type')
            # BUGFIX: skip unknown error types — previously execution fell
            # through and used `corrected_item`, which is unbound on the
            # first iteration (NameError) or stale from a prior one.
            continue
        # output
        if corrected_item != item:
            sentence = before_sent + corrected_item + after_sent
            detail_word = [item, corrected_item, begin_idx, end_idx]
            detail.append(detail_word)
    detail = sorted(detail, key=operator.itemgetter(2))
    return sentence, detail
def _correct_item(self, sentence, item, begin_idx, end_idx, err_type):
    """Correct a single word/char error.

    :param sentence: full sentence text
    :param item: suspected wrong word
    :param begin_idx: start position of the item
    :param end_idx: end position of the item
    :param err_type: error category (confusion/char/word)
    :return: (corrected sentence, detail) — detail is
             [wrong, right, begin_idx, end_idx] or [] if unchanged
    """
    corrected_sent = sentence
    detail = []
    before_sent = sentence[:begin_idx]
    after_sent = sentence[end_idx:]
    # words listed in the custom confusion dict map directly to the fix
    if err_type == error_type["confusion"]:
        corrected_item = self.custom_confusion[item]
    else:
        # non-Chinese wrong items are not handled
        if not is_chinese_string(item):
            return corrected_sent, detail
        # gather all possibly-correct candidate words
        maybe_right_items = self._generate_items(item)
        if not maybe_right_items:
            return corrected_sent, detail
        # keep the original item in the pool so "no change" can win
        if item not in maybe_right_items:
            maybe_right_items.append(item)
        # pick the candidate minimizing LM perplexity in context
        corrected_item = min(maybe_right_items,
                             key=lambda k: self.ppl_score(
                                 list(before_sent + k + after_sent)))
    # output
    if corrected_item != item:
        corrected_sent = before_sent + corrected_item + after_sent
        # default_logger.debug('predict:' + item + '=>' + corrected_item)
        detail = [item, corrected_item, begin_idx, end_idx]
    return corrected_sent, detail
def correct(self, sentence):
    """Correct a sentence.

    :param sentence: sentence text
    :return: (corrected sentence, detail) — detail is a sorted list of
             [wrong, right, begin_idx, end_idx]
    """
    detail = []
    self.check_corrector_initialized()
    maybe_errors = self.detect(sentence)
    # process in reverse order (sort by end_idx descending) so earlier
    # replacements don't shift later spans
    maybe_errors = sorted(maybe_errors,
                          key=operator.itemgetter(2),
                          reverse=True)
    for item, begin_idx, end_idx, err_type in maybe_errors:
        # correct each detected error one by one
        before_sent = sentence[:begin_idx]
        after_sent = sentence[end_idx:]
        # words listed in the custom confusion dict map directly to the fix
        if err_type == ErrorType.confusion:
            corrected_item = self.custom_confusion[item]
        else:
            # non-Chinese wrong items are not handled
            if not is_chinese_string(item):
                continue
            # gather all possibly-correct candidate words
            maybe_right_items = self.generate_items(item)
            if not maybe_right_items:
                continue
            corrected_item = self.lm_correct_item(item, maybe_right_items,
                                                  before_sent, after_sent)
        # output
        if corrected_item != item:
            sentence = before_sent + corrected_item + after_sent
            # logger.debug('predict:' + item + '=>' + corrected_item)
            detail_word = [item, corrected_item, begin_idx, end_idx]
            detail.append(detail_word)
    detail = sorted(detail, key=operator.itemgetter(2))
    return sentence, detail
def predict(self, text, **kwargs):
    """Correct ``text`` char by char with masked-token prediction.

    :param text: input text
    :param kwargs: unused extra options
    :return: (corrected_text, details) — details is a sorted list of
             (error_word, correct_word, begin_pos, end_pos) tuples
    """
    details = []
    text_new = ''
    self.check_corrector_initialized()
    # normalize encoding: utf-8 -> unicode
    text = convert_to_unicode(text)
    # split long text into blocks of at most 128 chars
    blocks = split_text_by_maxlen(text, maxlen=128)
    for blk, start_idx in blocks:
        blk_new = ''
        for idx, s in enumerate(blk):
            # only Chinese chars are checked
            if is_chinese_string(s):
                # rebuild the block with the already-corrected prefix and
                # mask the current char
                sentence_lst = list(blk_new + blk[idx:])
                sentence_lst[idx] = self.mask
                # masked-token prediction, top10 by default
                predict_words = self.predict_mask_token(sentence_lst, idx,
                                                        k=10)
                top_tokens = []
                for w, _ in predict_words:
                    top_tokens.append(w)
                if top_tokens and (s not in top_tokens):
                    # gather all possibly-correct candidate words
                    candidates = self.generate_items(s)
                    if candidates:
                        # accept the first prediction that is a candidate
                        for token_str in top_tokens:
                            if token_str in candidates:
                                details.append(
                                    (s, token_str, start_idx + idx,
                                     start_idx + idx + 1))
                                s = token_str
                                break
            blk_new += s
        text_new += blk_new
    details = sorted(details, key=operator.itemgetter(2))
    return text_new, details
def _generate_items(word, fraction=1):
    """Build the pinyin-confusion candidate set for ``word``.

    :param word: suspected wrong word
    :param fraction: keep roughly 1/fraction of the ranked candidates
    :return: candidates sorted by frequency, highest first
    """
    first_order, second_order, third_order = [], [], []
    # words sharing the same pinyin
    first_order += get_confusion_word_set(word)
    n = len(word)
    if n == 1:
        # single char: same-pinyin chars
        second_order += [c for c in get_confusion_char_set(word[0]) if c]
    if n > 1:
        # replace first char, then last char, by same-pinyin chars
        second_order += [c + word[1:]
                         for c in get_confusion_char_set(word[0]) if c]
        second_order += [word[:-1] + c
                         for c in get_confusion_char_set(word[-1]) if c]
    if n > 2:
        # replace the middle char by same-pinyin chars
        third_order += [word[0] + c + word[2:]
                        for c in get_confusion_char_set(word[1])]
        # replace the leading / trailing sub-word by same-pinyin words
        first_order += [c + word[-1]
                        for c in get_confusion_word_set(word[:-1])]
        first_order += [word[0] + c
                        for c in get_confusion_word_set(word[1:])]
    # merge, keep Chinese-only strings, rank by corpus frequency
    pool = set(first_order + second_order + third_order)
    chinese_only = [w for w in pool if is_chinese_string(w)]
    ranked = sorted(chinese_only, key=get_frequency, reverse=True)
    return ranked[:len(chinese_only) // fraction + 1]
def generate_items(self, word, fragment=1):
    """Generate the candidate-correction set for ``word``.

    Candidate construction (the original carried this as a long inline
    note): custom-confusion hits are fixed directly elsewhere; for
    char/word errors the candidates come from char-level confusion sets
    (same-pinyin / similar-shape chars), word-level confusion sets
    (edit-distance + common-word filter + same-pinyin), and the custom
    confusion set. Candidates are built per word length (1 char: replace
    the char; 2 chars: replace either char; longer: also replace
    sub-words), merged, filtered to Chinese-only strings, and the
    top-frequency slice is returned.

    :param word: suspected wrong word
    :param fragment: keep roughly 1/fragment of the ranked candidates
    :return: candidates sorted by word frequency, highest first
    """
    self.check_corrector_initialized()
    one_char, two_char, multi_char = [], [], []
    char_set = self._confusion_char_set
    word_set = self._confusion_word_set
    # same-pinyin words
    one_char += word_set(word)
    # custom confusion entries
    one_char += self._confusion_custom_set(word)
    n = len(word)
    if n == 1:
        # single char: same-pinyin chars
        one_char += [c for c in char_set(word[0]) if c]
    if n == 2:
        # replace first char, then last char, by same-pinyin chars
        two_char += [c + word[1:] for c in char_set(word[0]) if c]
        two_char += [word[:-1] + c for c in char_set(word[-1]) if c]
    if n > 2:
        # replace the middle char by same-pinyin chars
        multi_char += [word[0] + c + word[2:] for c in char_set(word[1])]
        # replace the leading / trailing sub-word by same-pinyin words
        multi_char += [c + word[-1] for c in word_set(word[:-1])]
        multi_char += [word[0] + c for c in word_set(word[1:])]
    # merge, keep Chinese-only strings, rank by corpus frequency
    pool = set(one_char + two_char + multi_char)
    chinese_only = [w for w in pool if is_chinese_string(w)]
    ranked = sorted(chinese_only, key=self.word_frequency, reverse=True)
    return ranked[:len(chinese_only) // fragment + 1]
def _generate_items(sentence, idx, word, fraction=1):
    """Build a candidate set for ``word`` via bounded edit-distance search.

    :param sentence: full sentence text (gives context for bigram filtering)
    :param idx: "begin,end" index string locating ``word`` in ``sentence``
    :param word: suspected wrong word
    :param fraction: keep roughly 1/fraction of the ranked candidates
    :return: candidates sorted by frequency, highest first
    """
    if len(word) == 1:
        # single char: same-pinyin/shape confusion chars only
        confusion_word_set = set([i for i in get_confusion_char_set(word[0]) if i])
    if len(word) > 1:
        def combine_two_confusion_char(sentence, idx, word):
            # assumes only two chars need changing — per the original
            # author's note, "definitely not the final version, need to
            # be fixed!!!!"
            result = set()
            for i in range(len(word) - 1):
                for j in range(i + 1, len(word)):
                    result |= set([word[: i] + i_word + word[i + 1: j] + j_word + word[j + 1:] \
                                   for i_word in get_confusion_char_set(word[i]) if i_word \
                                   for j_word in get_confusion_char_set(word[j]) if j_word])
            return result

        def confusion_set(sentence, idx, word):
            # BFS over prefixes: at most `edit_distance` chars may change
            edit_distance = 2
            # queue entries: [candidate prefix, changes used so far]
            cands_tmp = [['', 0]]
            result = set()
            # absolute sentence positions covered by `word`
            ids = list(range(int(idx.split(',')[0]), int(idx.split(',')[1])))
            # change individual chars
            while cands_tmp:
                if len(cands_tmp[0][0]) == len(word):
                    # full-length candidate: accept
                    result.add(cands_tmp[0][0])
                elif cands_tmp[0][1] == edit_distance:
                    # budget exhausted: keep the rest of `word` unchanged
                    result.add(cands_tmp[0][0] + word[len(cands_tmp[0][0]):])
                else:
                    target_idx = ids[len(cands_tmp[0][0])]
                    for char_cand in get_confusion_char_set(sentence[target_idx]):
                        # prune candidates whose new bigram with a neighbor
                        # never occurs in `two_char_dict`
                        if target_idx == 0:
                            if char_cand + sentence[target_idx + 1] not in two_char_dict:
                                continue
                        elif target_idx == len(sentence) - 1:
                            if sentence[target_idx - 1] + char_cand not in two_char_dict:
                                continue
                        elif char_cand + sentence[target_idx + 1] not in two_char_dict and \
                                sentence[target_idx - 1] + char_cand not in two_char_dict:
                            continue
                        if char_cand == sentence[target_idx]:
                            # same char: no edit consumed
                            cands_tmp.append([cands_tmp[0][0] + char_cand, cands_tmp[0][1]])
                        else:
                            cands_tmp.append([cands_tmp[0][0] + char_cand, cands_tmp[0][1] + 1])
                cands_tmp.pop(0)
            # change two adjacent chars together
            for i in range(len(word) - 1):
                for char_i in get_confusion_char_set(word[i]):
                    for char_j in get_confusion_char_set(word[i + 1]):
                        if char_i + char_j in two_char_dict:
                            result.add(word[:i] + char_i + char_j + word[i + 2:])
            return result

        confusion_word_set = confusion_set(sentence, idx, word)
    # keep Chinese-only strings, rank by corpus frequency
    confusion_word_list = [item for item in confusion_word_set if is_chinese_string(item)]
    confusion_sorted = sorted(confusion_word_list, key=lambda k: get_frequency(k), reverse=True)
    return confusion_sorted[:len(confusion_word_list) // fraction + 1]
sys.path.append("../")
from pypinyin import lazy_pinyin
from pycorrector.utils.text_utils import traditional2simplified, simplified2traditional
from pycorrector.utils.text_utils import get_homophones_by_char, get_homophones_by_pinyin

# Demo: Traditional -> Simplified Chinese conversion
traditional_sentence = '憂郁的臺灣烏龜'
simplified_sentence = traditional2simplified(traditional_sentence)
print(simplified_sentence)

# Demo: Simplified -> Traditional Chinese conversion
simplified_sentence = '忧郁的台湾乌龟'
traditional_sentence = simplified2traditional(simplified_sentence)
print(traditional_sentence)

print(lazy_pinyin('中心'))  # pinyin without tone marks

# Demo: homophone lookup by char and by pinyin
pron = get_homophones_by_char('长')
print('get_homophones_by_char:', pron)
pron = get_homophones_by_pinyin('zha1ng')
print('get_homophones_by_pinyin:', pron)

from pycorrector.utils.text_utils import is_chinese, is_chinese_string

# Demo: Chinese-character detection on a mixed string
s = """现在 银色的K2P是MTK还是博通啊?李雯雯……“00后”选手 啥123kMk.23?? ''"’ """
print(s, is_chinese_string(s))
for i in s:
    print(i, is_chinese(i))