def do_docx(file_bytes, file_name, username, process_method):
    """Highlight suspicious runs of a .docx document in red and save it.

    Every run in the body paragraphs and in table cells that contains
    Chinese text (per ``contain_zh``) is checked; when the checker returns
    a truthy result the run's font colour is set to red.

    :param file_bytes: raw bytes of the .docx file.
    :param file_name: file name used when saving under the user's folder.
    :param username: sub-folder name below ``BASE_DIR/utils``.
    :param process_method: ``'kenlm'`` uses ``pycorrector.correct``; any
        other value uses ``baidu_correct``.
    :return: the directory the document was saved into.
    """
    doc = docx.Document(io.BytesIO(file_bytes))

    for paragraph in doc.paragraphs:
        _mark_suspect_runs(paragraph.runs, process_method)

    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for para in cell.paragraphs:
                    _mark_suspect_runs(para.runs, process_method)

    # NOTE(review): username and file_name are joined into the path
    # unchecked; confirm that callers sanitise them.
    out_dir = os.path.join(BASE_DIR, 'utils/{}'.format(username))
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(out_dir, exist_ok=True)
    doc.save(os.path.join(out_dir, file_name))
    return out_dir


def _mark_suspect_runs(runs, process_method):
    """Colour red every run in *runs* for which the checker reports something."""
    for run in runs:
        if not contain_zh(run.text):
            continue
        if process_method == 'kenlm':
            # Index 1 of the pycorrector result is the detail list.
            flagged = pycorrector.correct(run.text)[1]
        else:
            flagged = baidu_correct(run.text)
        if flagged:
            run.font.color.rgb = RGBColor(255, 0, 0)
def test_brand():
    """Demo: brand-name correction after ``enable_char_error(enable=False)``.

    Prints each sample next to the value returned by ``pycorrector.correct``.
    """
    pycorrector.enable_char_error(enable=False)
    samples = (
        '买衣服就到拼哆哆',  # intended brand: 拼多多
        '这个特仑素牛奶喝起来还不错吧',  # intended brand: 特仑苏
    )
    template = "original sentence:{} => correct sentence:{}"
    for sentence in samples:
        outcome = pycorrector.correct(sentence)
        print(template.format(sentence, outcome))
def text1():
    """Print one corrected sample, then the types of the span indices.

    The second call unpacks each detail item into four fields and prints
    the type of the start and end index.
    """
    sample = '机七学习是人工智能领遇最能体现智能的一个分知'
    outcome = correct(sample)
    print("original sentence:{} => correct sentence:{}".format(sample, outcome))

    _fixed, spans = correct('我们应该让坐')
    for _wrong, _right, begin, stop in spans:
        print(type(begin))
        print(type(stop))
def test_disease():
    """Demo: drug/disease-name correction after ``enable_char_error(enable=False)``.

    Prints each sample next to the value returned by ``pycorrector.correct``.
    """
    pycorrector.enable_char_error(enable=False)
    samples = (
        '这个新药奥美砂坦脂片能治疗心绞痛,效果还可以',  # intended: 奥美沙坦酯片
        '有个药名叫硫酸氢录吡各雷片能治疗高血压',  # intended: 硫酸氢氯吡格雷片
    )
    template = "original sentence:{} => correct sentence:{}"
    for sentence in samples:
        outcome = pycorrector.correct(sentence)
        print(template.format(sentence, outcome))
def test_person_name():
    """Demo: person-name handling; prints jieba POS tags and correction results."""
    import jieba.posseg

    template = "original sentence:{} => correct sentence:{}"

    # Known false positive noted by the author: 萧华 -> 肖
    names_sentence = '发行人共同实际控制人萧华、霍荣铨、邓啟棠、张旗康分别'
    print(jieba.posseg.lcut(names_sentence))
    print(template.format(names_sentence, pycorrector.correct(names_sentence)))

    # Known false positive noted by the author: 系 -> 及
    pledge_sentence = '上述承诺内容系本人真实意思表示'
    print(template.format(pledge_sentence, pycorrector.correct(pledge_sentence)))
def test_suyu():
    """Demo: common-saying correction after ``enable_char_error(enable=False)``.

    Prints each sample next to the value returned by ``pycorrector.correct``.
    """
    pycorrector.enable_char_error(enable=False)
    samples = (
        '这衣服买给她吧,也是肥水步流外人田',  # intended: 肥水不流外人田
        '这么多字让他写也是赶鸭子打架',  # intended: 赶鸭子上架
    )
    template = "original sentence:{} => correct sentence:{}"
    for sentence in samples:
        outcome = pycorrector.correct(sentence)
        print(template.format(sentence, outcome))
def typo(strRceive):
    """Print the corrected text with each corrected span wrapped in parentheses.

    :param strRceive: text handed to ``pycorrector.correct``.
    :return: None; the marked-up corrected sentence is printed.
    """
    corrected_sent, detail = pycorrector.correct(strRceive)
    marked = list(corrected_sent)
    shift = 0  # number of bracket characters inserted so far
    # The running shift is only valid when spans are handled left to
    # right, so order them by start index before inserting brackets.
    for item in sorted(detail, key=lambda d: d[2]):
        begin, end = item[2], item[3]
        marked.insert(begin + shift, '(')
        shift += 1
        marked.insert(end + shift, ')')
        shift += 1
    print("".join(marked))
def test(path, result_path):
    """Run the corrector over a SIGHAN-style file and write the results.

    Each input line is split on single spaces; the first field is the
    index token and the second the sentence. One tab-separated row
    ``index, corrected sentence, detail`` is written per processed line.
    Lines with fewer than two fields (e.g. blank lines) are skipped
    instead of raising ``IndexError``.

    :param path: UTF-8 input file.
    :param result_path: UTF-8 output file (overwritten).
    """
    with open(path, 'r', encoding='utf-8') as file, \
            open(result_path, 'w', encoding='utf-8') as wfile:
        for line in file:
            fields = line.strip().split(' ')
            if len(fields) < 2:
                continue
            index, origin_string = fields[0], fields[1]
            # Take the text after '=' and drop the closing ')'.
            idx = index.strip().split('=')[1].strip(')')
            # NOTE(review): this two-argument, three-result call presumably
            # targets a locally modified pycorrector.correct - confirm.
            idx, corr_string, detail = pycorrector.correct(idx, origin_string)
            wfile.write('{}\t{}\t{}\n'.format(index, corr_string, detail))
def correct_sentence(_sentence):
    """Print the correction, its detail and the detected error indices.

    :param _sentence: text passed to ``pycorrector.correct`` and then to
        ``pycorrector.detect``.
    """
    fixed, spans = pycorrector.correct(_sentence)
    for label, value in ((">>> corrected_sent:", fixed), (">>> detail:", spans)):
        print(label, value)
    print(">>> index of errors:", pycorrector.detect(_sentence))
def test_text4():
    """Run ``correct`` over a batch of sample sentences and print each result."""
    samples = (
        '我喜欢打监球,你呢?足球吗',
        '老师工作非常幸苦,我们要遵敬老师',
        ' 我兴高彩列地去公园游玩',
        '老师的生体不好,可她艰持给我们上课',
        '我们要宝护它们',
        '讲台上放着一只漂亮的刚笔',
        '春暖花开之时我们躯车到了海滨渡假村',
        '按照上级布署安排',
        '冬冬今天戴来了一本好看的童话书',
        '少先队员因该为老人让坐',
        '服装店里的衣服各试各样',
        '一只小鱼船浮在平净的河面上',
        '我的家乡是有明的渔米之乡',
        ' _ ,',
        # intended: 出租的事
        '我对于宠物出租得事非常认同,因为其实很多人喜欢宠物',
        # intended: 题高 => 提高, 专业人氏 => 专业人士
        '有了宠物出租地方另一方面还可以题高人类对动物的了解,因为那些专业人氏可以指导我们对于动物的习惯。',
        # suspicious character: 凑
        '三个凑皮匠胜过一个诸葛亮也有道理。',
        '还有广告业是只要桌子前面坐者工作未必产生出来好的成果。',
        '还有我要看他们的个性,如果跟同时合不来受到压力的话,无法专心地工作。',
    )
    template = "original sentence:{} => correct sentence:{}"
    for sentence in samples:
        outcome = correct(sentence)
        print(template.format(sentence, outcome))
def eval_sighan_corpus(pkl_path, verbose=False):
    """Return the share of annotated sentences corrected exactly right.

    Samples without annotated details are skipped. The gold sentence is
    built by applying *every* annotated detail; previously only the last
    detail was applied, and as a global ``str.replace``.

    :param pkl_path: pickle file read with ``load_pkl``; yields
        ``(error_sentence, details)`` pairs.
    :param verbose: print truth and prediction for each miss.
    :return: ``right_count / total_count`` or ``0.0`` when nothing was scored.
    """
    sighan_data = load_pkl(pkl_path)
    total_count = 0
    right_count = 0
    for error_sentence, details in sighan_data:
        if not details:
            continue
        correct_sentence = _apply_sighan_details(error_sentence, details)
        # pred_detail: list(wrong, right, begin_idx, end_idx)
        pred_sentence, pred_detail = pycorrector.correct(error_sentence)
        if pred_sentence == correct_sentence:
            right_count += 1
        elif verbose:
            print('truth:', correct_sentence, details)
            print('predict:', pred_sentence, pred_detail)
        total_count += 1
    if total_count > 0:
        return right_count / total_count
    return 0.0


def _apply_sighan_details(error_sentence, details):
    """Build the gold sentence by applying each (idx, wrong, right) detail."""
    fixed = error_sentence
    # Work right to left so earlier offsets stay valid even if a
    # replacement changes the length.
    for detail in sorted(details, key=lambda d: d[0], reverse=True):
        # idx is treated as 1-based, following the original ``idx - 1``.
        begin = detail[0] - 1
        error_word, right_word = detail[1], detail[2]
        end = begin + len(error_word)
        if begin >= 0 and fixed[begin:end] == error_word:
            fixed = fixed[:begin] + right_word + fixed[end:]
        else:
            # Position does not match the text: fall back to the original
            # global replacement for this detail.
            fixed = fixed.replace(error_word, right_word)
    return fixed
def main(**kwargs):
    """
    Command-line entry point: correct a text file line by line.

    :param kwargs: options read by this function:
        ``input`` - path of the UTF-8 file to read, one sentence per line;
        ``output`` - path of the UTF-8 file to write;
        ``no_char`` (optional) - when truthy, calls
        ``pycorrector.enable_char_error(enable=False)``;
        ``detail`` (optional) - when truthy, appends the correction detail
        as an extra tab-separated column.
    :return: None
    """
    if kwargs.get('no_char', False):
        pycorrector.enable_char_error(enable=False)
        print('disable char error detect.')
    want_detail = kwargs.get('detail', False)

    written = 0
    with open(kwargs['input'], 'r', encoding='utf-8') as fr, \
            open(kwargs['output'], 'w', encoding='utf-8') as fw:
        for raw in fr:
            sentence = raw.strip()
            corrected_sent, info = pycorrector.correct(sentence)
            written += 1
            columns = [sentence, corrected_sent]
            if want_detail:
                columns.append(str(info))
            fw.write('\t'.join(columns) + '\n')
    print('{} lines in output'.format(written))
def eval_bcmi_data(data_path, verbose=False):
    """Score ``pycorrector.correct`` against a BCMI-format file.

    :param data_path: UTF-8 file; each stripped line is parsed by
        ``get_bcmi_corpus`` into (error sentence, right sentence, right detail).
    :param verbose: print every sample and the final counts.
    :return: ``(right_rate, right_result, wrong_result)``; the two dicts map
        each error sentence to ``[right_sentence, pred_sentence]``.
    """
    n_seen = 0
    n_right = 0
    hits = {}
    misses = {}
    with open(data_path, 'r', encoding='utf-8') as corpus:
        for raw in corpus:
            error_sentence, right_sentence, right_detail = get_bcmi_corpus(raw.strip())
            if not error_sentence:
                continue
            pred_sentence, pred_detail = pycorrector.correct(error_sentence)
            if verbose:
                print('input sentence:', error_sentence)
                print('pred sentence:', pred_sentence, pred_detail)
                print('right sentence:', right_sentence, right_detail)
            n_seen += 1
            pair = [right_sentence, pred_sentence]
            if right_sentence == pred_sentence:
                n_right += 1
                hits[error_sentence] = pair
            else:
                misses[error_sentence] = pair
    if verbose:
        print('right count:', n_right, ';total_count:', n_seen)
    rate = n_right / n_seen if n_seen > 0 else 0.0
    return rate, hits, misses
def eval_bcmi_data(data_path, verbose=False):
    """Score ``correct`` against a BCMI-format file.

    The sentence counter now starts at 0 (it used to start at 1 to dodge a
    division by zero, which skewed the rate); an empty corpus yields 0.0.

    :param data_path: UTF-8 file; each stripped line is parsed by
        ``get_bcmi_corpus`` into (error sentence, right sentence).
    :param verbose: print every sample and the final counts.
    :return: ``(right_rate, right_result, wrong_result)``; the two dicts map
        each error sentence to ``[right_sentence, pred_sentence]``.
    """
    sentence_size = 0
    right_count = 0
    right_result = dict()
    wrong_result = dict()
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            error_sentence, right_sentence = get_bcmi_corpus(line)
            if not error_sentence:
                continue
            pred_sentence, pred_detail = correct(error_sentence)
            if verbose:
                print('input sentence:', error_sentence)
                print('pred sentence:', pred_sentence, pred_detail)
                print('right sentence:', right_sentence)
            sentence_size += 1
            if right_sentence == pred_sentence:
                right_count += 1
                right_result[error_sentence] = [right_sentence, pred_sentence]
            else:
                wrong_result[error_sentence] = [right_sentence, pred_sentence]
    if verbose:
        print('right count:', right_count, ';sentence size:', sentence_size)
    right_rate = right_count / sentence_size if sentence_size > 0 else 0.0
    return right_rate, right_result, wrong_result
def test_char_correct_wrong():
    """Print ``detect`` and ``correct`` output for a batch of sample sentences."""
    samples = (
        '她知难而上,沤心沥血,一心扑在舞台上',
        '还有你们看看清除哈',
        '我国人民义愤填鹰',
        '权利的游戏第八季',
        '2周岁22斤宝宝用多大的啊',
        '这个到底有多辣?',
        '所以先救挨饿的人,然后治疗病人。',
        '现在,常常会到听男女平等这个词。',
        '我的喉咙发炎了要买点阿莫细林吃',
        '做的最倒霉的一件事就帮尼哥檫脚。',
        '战士微笑著轻轻拍了拍少年的肩膀。',
        '差点拌到自己的脚。',
        '面对着熙熙嚷嚷的城市。',
        '你等我和老大商却一下。',
        '报应接中迩来。',
        '我心理不由有些忌妒。',
        '他们不需要怕他门没有钱。',
        '全球的产龄妇女总生育率只生下一半,根据调查很有可能一直到2050年产龄妇女总生育率还是减少的趋势。',
        '但现代的妇女所担任的责任已家重,除了家务以外,仍需出外工作补贴家',
        '加上父母亲自己的看法,想原封不动地、完完全全地全部传给子女们',
        '叶子的绿色与本身枝干的颜色都会变为偏较暗的颜色。',
    )
    for sentence in samples:
        for check in (pycorrector.detect, pycorrector.correct):
            print(sentence, check(sentence))
def e2r(word):
    """
    Chinese text correction.

    :param word: text passed to ``pycorrector.correct``.
    :return: tuple ``(corrected text, detail)``.
    """
    fixed, spans = pycorrector.correct(word)
    result = (fixed, spans)
    return result
def eval_sighan_2015_by_rule(sighan_path=sighan_2015_path, verbose=True, num_limit_lines=1000):
    """Evaluate ``pycorrector.correct`` on a SIGHAN-2015 style file and print metrics.

    Each usable line holds two whitespace-separated fields: source and
    target sentence. Lines starting with ``#`` or without exactly two
    fields are ignored.

    :param sighan_path: UTF-8 evaluation file.
    :param verbose: print every sample with its outcome.
    :param num_limit_lines: maximum number of pairs to score; a value
        ``<= 0`` means no limit. Exactly this many pairs are scored (the
        previous check let one extra pair through and kept reading the
        rest of the file).
    :return: None; the summary is printed.
    """
    total_count = 0
    right_count = 0
    right_rate = 0.0
    recall_rate = 0.0
    recall_right_count = 0
    recall_total_count = 0
    start_time = time.time()
    with open(sighan_path, 'r', encoding='utf-8') as f:
        for line in f:
            if 0 < num_limit_lines <= total_count:
                break
            line = line.strip()
            if line.startswith('#'):
                continue
            parts = line.split()
            if len(parts) != 2:
                continue
            src, trg = parts
            pred, pred_detail = pycorrector.correct(src)
            has_error = src != trg
            is_right = pred == trg
            if has_error:
                recall_total_count += 1
            if is_right:
                right_count += 1
                if has_error:
                    recall_right_count += 1
            if verbose:
                print("\nright:" if is_right else "\nwrong:")
                print(
                    f'input : {src}\ntruth : {trg}\npredict: {pred} pred_detail: {pred_detail}'
                )
            total_count += 1
    spend_time = time.time() - start_time
    if total_count > 0:
        right_rate = right_count / total_count
    if recall_total_count > 0:
        recall_rate = recall_right_count / recall_total_count
    print(
        'right_rate:{}, right_count:{}, total_count:{};\n'
        'recall_rate:{}, recall_right_count:{}, recall_total_count:{}, spend_time:{} s'
        .format(right_rate, right_count, total_count, recall_rate,
                recall_right_count, recall_total_count, spend_time))
def test_chengyu():
    """Demo: idiom correction, first with char-error detection off, then on.

    Two samples are corrected and printed; a second batch is then run
    through ``detect`` and ``correct`` twice, before and after
    ``enable_char_error(enable=True)``.
    """
    pycorrector.enable_char_error(enable=False)
    template = "original sentence:{} => correct sentence:{}"
    for sentence in (
        '这块名表带带相传',  # intended: 代代相传
        '他贰话不说把牛奶喝完了',  # intended: 二话不说
    ):
        print(template.format(sentence, pycorrector.correct(sentence)))

    # Reference sentences; the character in double parentheses is the
    # intended one:
    # 这家伙还蛮格((恪))尽职守的。
    # 报应接中迩((而))来。
    # 人群穿((川))流不息。
    # 这个消息不径((胫))而走。
    # 眼前的场景美仑((轮))美幻简直超出了人类的想象。
    # 看着这两个人谈笑风声((生))我心理((里))不由有些忌妒。
    # 有了这一番旁证((征))博引。
    batch = [
        '这家伙还蛮格尽职守的',
        '报应接中迩来',  # intended: 接踵而来
        '人群穿流不息',
        '这个消息不径而走',
        '这个消息不胫儿走',
        '眼前的场景美仑美幻简直超出了人类的想象',
        '看着这两个人谈笑风声我心理不由有些忌妒',
        '有了这一番旁证博引',
        '有了这一番旁针博引',
    ]

    def report():
        for sentence in batch:
            print(sentence, pycorrector.detect(sentence))
            print(sentence, pycorrector.correct(sentence))

    report()
    pycorrector.enable_char_error(enable=True)
    print("-" * 42)
    report()
def eval_corpus500_by_rule(input_eval_path=eval_data_path, output_eval_path='', verbose=True):
    """Evaluate ``pycorrector.correct`` on a JSON corpus and print metrics.

    :param input_eval_path: JSON file read with ``load_json``; each item is
        a dict with ``text``, ``correction`` and ``errors`` keys.
    :param output_eval_path: when non-empty, the mis-predicted samples are
        written there with ``save_json``.
    :param verbose: print every mis-predicted sample.
    :return: None; the summary is printed.
    """
    corpus = load_json(input_eval_path)
    failures = []
    n_total = 0
    n_right = 0
    n_recall_total = 0
    n_recall_right = 0
    started = time.time()
    for sample in corpus:
        text = sample.get('text', '')
        correction = sample.get('correction', '')
        errors = sample.get('errors', [])
        # pred_detail: list(wrong, right, begin_idx, end_idx)
        pred_sentence, pred_detail = pycorrector.correct(text)
        matched = correction == pred_sentence
        n_total += 1

        # Recall: only samples that carry annotated errors count.
        if errors:
            n_recall_total += 1
            if pred_detail and matched:
                n_recall_right += 1

        # Precision: exact match of the whole sentence.
        if matched:
            n_right += 1
            continue
        failed = copy.deepcopy(sample)
        failed['pred_sentence'] = pred_sentence
        failed['pred_errors'] = str(pred_detail)
        failures.append(failed)
        if verbose:
            print("\nwrong:")
            print('input :', text)
            print('truth :', correction, errors)
            print('predict:', pred_sentence, pred_detail)

    spend_time = time.time() - started
    right_rate = n_right / n_total if n_total > 0 else 0.0
    recall_rate = n_recall_right / n_recall_total if n_recall_total > 0 else 0.0
    summary = ('right_rate:{}, right_count:{}, total_count:{};\n'
               'recall_rate:{}, recall_right_count:{}, recall_total_count:{}, spend_time:{} s')
    print(summary.format(right_rate, n_right, n_total, recall_rate,
                         n_recall_right, n_recall_total, spend_time))
    if output_eval_path:
        save_json(failures, output_eval_path)
def test_confusion_dict(self):
    """Check correction details before and after loading a custom confusion dict."""
    sents = [
        '买iphonex,要多少钱',
        '共同实际控制人萧华、霍荣铨、张旗康',
    ]

    def collect_details():
        found = []
        for sentence in sents:
            _fixed, detail = pycorrector.correct(sentence)
            print(detail)
            found.append(detail)
        return found

    before = collect_details()
    self.assertEqual(before[0], [])
    self.assertEqual(before[1], [('张旗康', '张启康', 14, 17)])

    pycorrector.set_custom_confusion_dict('../examples/my_custom_confusion.txt')

    after = collect_details()
    self.assertEqual(after[0], [('iphonex', 'iphoneX', 1, 8)])
    self.assertEqual(after[1], [])
def main():
    """Correct one sample sentence and print the result with its detail."""
    samples = [
        '这件事情针让人想象难以',
        '这周末我要去配副眼睛',
        '感帽了',
        '你儿字今年几岁了',
        '少先队员因该为老人让坐',
    ]
    # Only the last sample is actually run.
    fixed, spans = pycorrector.correct(samples[-1])
    print(fixed, spans)
def test_single_pinyin(self):
    """Sentences with one embedded pinyin word are expected to yield no corrections."""
    sents = [
        '我的宝贝万一zhuan钱了呢',
        '我已经zuo了一遍工作',
    ]
    details = []
    for sentence in sents:
        fixed, detail = pycorrector.correct(sentence)
        print(fixed, detail)
        details.append(detail)
    # Disabled assertions kept for reference:
    # self.assertEqual(res[0], [('zhuan', '赚', 6, 12)])
    # self.assertEqual(res[1], [('zuo', '做', 3, 7)])
    for position in (0, 1):
        self.assertEqual(details[position], [])
def test_full_pinyin(self):
    """Sentences with a multi-syllable pinyin word are expected to yield no corrections."""
    sents = [
        '你们要很xingfu才可以',
        '智能手机中最好的是pingguo手机',
    ]
    details = []
    for sentence in sents:
        fixed, detail = pycorrector.correct(sentence)
        print(fixed, detail)
        details.append(detail)
    # Disabled assertions kept for reference:
    # self.assertEqual(res[0], [('xingfu', '幸福', 4, 11)])
    # self.assertEqual(res[1], [('pingguo', '苹果', 9, 17)])
    for position in (0, 1):
        self.assertEqual(details[position], [])
def eval_sighan_corpus(pkl_path, verbose=False):
    """Return the share of samples whose predicted detail count matches the gold one.

    A sample counts as right when ``len(right_detail) == len(pred_detail)``.
    Every sample now adds to the denominator; previously the total started
    at 1 and grew only on misses, so the result was right / (1 + wrong)
    rather than a rate.

    :param pkl_path: pickle file read with ``load_pkl``; yields
        ``(error_sentence, right_detail)`` pairs.
    :param verbose: print every sample with its prediction.
    :return: ``right_count / total_count`` or ``0.0`` for an empty corpus.
    """
    sighan_data = load_pkl(pkl_path)
    total_count = 0
    right_count = 0
    for error_sentence, right_detail in sighan_data:
        # pred_detail: list(wrong, right, begin_idx, end_idx)
        pred_sentence, pred_detail = correct(error_sentence)
        if verbose:
            print('input sentence:', error_sentence, right_detail)
            print('pred sentence:', pred_sentence, pred_detail)
        total_count += 1
        if len(right_detail) == len(pred_detail):
            right_count += 1
    if total_count == 0:
        return 0.0
    return right_count / total_count
def do_txt(file_bytes, file_name, username, process_method):
    """Correct a plain-text upload line by line and save the result.

    Lines that are non-empty and contain Chinese text (per ``contain_zh``)
    are replaced by the checker's output; all other lines are copied
    unchanged. Each line keeps its original line ending.

    Fixes over the previous version: the output directory is created when
    missing, line breaks are no longer dropped on write, and the file is
    written as UTF-8 rather than in the platform default encoding.

    :param file_bytes: raw bytes of the text file (decoded with the
        default ``bytes.decode()`` codec, UTF-8).
    :param file_name: file name used when saving under the user's folder.
    :param username: sub-folder name below ``BASE_DIR/utils``.
    :param process_method: ``'kenlm'`` uses ``pycorrector.correct``; any
        other value uses ``baidu_correct``.
    :return: the directory the file was saved into.
    """
    # NOTE(review): username and file_name are joined into the path
    # unchecked; confirm that callers sanitise them.
    out_dir = os.path.join(BASE_DIR, 'utils/{}'.format(username))
    os.makedirs(out_dir, exist_ok=True)

    # Original author's note: step 1 was meant to split the text into
    # segments, with segments over 511 bytes cut by jieba. That step is
    # not implemented here.
    file_str = file_bytes.decode()

    # newline='' stops text mode from translating the endings we copy over.
    with open(os.path.join(out_dir, file_name), 'w',
              encoding='utf-8', newline='') as f:
        for raw_line in file_str.splitlines(keepends=True):
            # Split content from its line ending using the same boundary
            # rules as splitlines().
            text = raw_line.splitlines()[0]
            ending = raw_line[len(text):]
            if text and contain_zh(text):
                if process_method != 'kenlm':
                    query = baidu_correct(text)
                else:
                    query = pycorrector.correct(text)[0]
            else:
                query = text
            f.write(query + ending)
    return out_dir
def wrong_word_recognition(word_list):
    """
    Correct each word of a list, routing English and non-English words separately.

    Words for which ``distinguish_english`` returns ``True`` are kept when
    present in ``English_dictionary`` and otherwise passed through
    ``english_word_correct``; every other word is replaced by the first
    element of ``pycorrector.correct(word)``.

    :param word_list: iterable of words.
    :return: list of corrected words, in input order.
    """
    # ``is 0`` compared identity with an int literal; compare by value.
    if len(English_dictionary) == 0:
        load_english_dict()
    corr_word_list = []
    for word in word_list:
        if distinguish_english(word) is True:
            if word in English_dictionary:
                corr_word_list.append(word)
            else:
                corr_word_list.append(english_word_correct(word))
        else:
            corr_word_list.append(pycorrector.correct(word)[0])
    return corr_word_list
def test_char_correct_right():
    """Print ``pycorrector.correct`` output for a batch of sample sentences."""
    samples = (
        '少先队员因该为老人让坐',
        '服装店里的衣服各试各样',
        '那天花板上的钻石可比鸡弹还大啊',
        '才般进装修好没多久的新宫殿里。',
        '一但死去,以前花费的心血都会归零。',
        '这家伙还蛮格尽职守的。',
        '玩家取明“什么”已被占用。',
        '人群穿流不息。',
        '这个消息不径而走。',
        '眼前的场景美仑美幻简直超出了人类的想象。',
        '看着这两个人谈笑风声',
        '有老怪坐阵难怪他们高枕无忧了。',
        '有了这一番旁证博引。',
    )
    for sentence in samples:
        outcome = pycorrector.correct(sentence)
        print(sentence, outcome)
def eval_corpus(eval_error_path, verbose=True):
    """Evaluate ``pycorrector.correct`` on the corpus at ``eval_data_path``.

    :param eval_error_path: destination passed to ``save_json`` for the
        mis-predicted samples.
    :param verbose: print every mis-predicted sample.
    :return: None; the summary is printed.
    """
    corpus = load_json(eval_data_path)
    failures = []
    n_total = 0
    n_right = 0
    n_recall_total = 0
    n_recall_right = 0
    for sample in corpus:
        text = sample.get('text', '')
        correction = sample.get('correction', '')
        errors = sample.get('errors', [])
        # pred_detail: list(wrong, right, begin_idx, end_idx)
        pred_sentence, pred_detail = pycorrector.correct(text)
        matched = correction == pred_sentence
        n_total += 1

        # Recall: only samples that carry annotated errors count.
        if errors:
            n_recall_total += 1
            if pred_detail and matched:
                n_recall_right += 1

        # Precision: exact match of the whole sentence.
        if matched:
            n_right += 1
            continue
        failed = copy.deepcopy(sample)
        failed['pred_sentence'] = pred_sentence
        failed['pred_errors'] = str(pred_detail)
        failures.append(failed)
        if verbose:
            print('truth:', text, errors)
            print('predict:', pred_sentence, pred_detail)

    right_rate = n_right / n_total if n_total > 0 else 0.0
    recall_rate = n_recall_right / n_recall_total if n_recall_total > 0 else 0.0
    summary = ('right_rate:{}, right_count:{}, total_count:{};\n'
               'recall_rate:{},recall_right_count:{},recall_total_count:{}')
    print(summary.format(right_rate, n_right, n_total, recall_rate,
                         n_recall_right, n_recall_total))
    save_json(failures, eval_error_path)
def test_trace():
    """Print the top tracemalloc differences around importing and calling pycorrector.

    A snapshot is taken before ``import pycorrector`` and another after one
    ``correct`` call; the three largest differences are printed grouped by
    line and then by traceback.
    """
    import tracemalloc

    tracemalloc.start(10)
    before = tracemalloc.take_snapshot()

    import pycorrector
    outcome = pycorrector.correct('少先队员因该为老人让坐')
    print(outcome)

    after = tracemalloc.take_snapshot()
    divider = '*' * 32

    by_line = after.compare_to(before, 'lineno')
    print(divider)
    for entry in by_line[:3]:
        print(entry)

    by_trace = after.compare_to(before, 'traceback')
    print(divider)
    for entry in by_trace[:3]:
        print(entry.traceback.format())
def test_base_demos(self):
    """Check the correction detail returned for a set of basic sample sentences."""
    sents = [
        '少先队员因该为老人让坐',
        '今天心情很好',
        '真麻烦你了。希望你们好好的跳无',
        '机七学习是人工智能领遇最能体现智能的一个分知',
        '一只小鱼船浮在平净的河面上',
        '我的家乡是有明的渔米之乡',
    ]
    expected = [
        [('因该', '应该', 4, 6), ('坐', '座', 10, 11)],
        [],
        [('无', '舞', 14, 15)],
        [('机七', '机器', 0, 2), ('领遇', '领域', 9, 11)],
        [('平净', '平静', 7, 9)],
        [('有明', '有名', 5, 7)],
    ]
    # Collect (and print) every detail first, then assert in order.
    details = []
    for sentence in sents:
        _fixed, detail = pycorrector.correct(sentence)
        print(detail)
        details.append(detail)
    for position, wanted in enumerate(expected):
        self.assertEqual(details[position], wanted)