def eval_bcmi_data(data_path, verbose=False):
    """Evaluate corrector accuracy on a BCMI-format corpus file.

    Args:
        data_path: path to a UTF-8 text file, one BCMI-annotated line each.
        verbose: print each input/predicted/reference sentence when True.

    Returns:
        (accuracy, right_result, wrong_result): accuracy is the ratio of
        exactly-matched predictions; the two dicts map the erroneous input
        sentence to [reference_sentence, predicted_sentence].
    """
    sentence_size = 0
    right_count = 0
    right_result = dict()
    wrong_result = dict()
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # get_bcmi_corpus yields (error, reference); lines without an
            # annotation come back falsy and are skipped.
            error_sentence, right_sentence = get_bcmi_corpus(line)
            if not error_sentence:
                continue
            pred_sentence, pred_detail = correct(error_sentence)
            if verbose:
                print('input sentence:', error_sentence)
                print('pred sentence:', pred_sentence)
                print('right sentence:', right_sentence)
            sentence_size += 1
            if right_sentence == pred_sentence:
                right_count += 1
                right_result[error_sentence] = [right_sentence, pred_sentence]
            else:
                wrong_result[error_sentence] = [right_sentence, pred_sentence]
    if verbose:
        print('right count:', right_count, ';sentence size:', sentence_size)
    # The original seeded sentence_size at 1 to dodge ZeroDivisionError,
    # which silently deflated the accuracy; count from 0 and guard the
    # empty-corpus case explicitly instead.
    accuracy = right_count / sentence_size if sentence_size else 0.0
    return accuracy, right_result, wrong_result
def test_text4():
    """Run the corrector over a fixed batch of noisy sentences and print each result."""
    error_sentences = [
        '我喜欢打监球,你呢?足球吗',
        '老师工作非常幸苦,我们要遵敬老师',
        ' 我兴高彩列地去公园游玩',
        '老师的生体不好,可她艰持给我们上课',
        '我们要宝护它们',
        '讲台上放着一只漂亮的刚笔',
        '春暖花开之时我们躯车到了海滨渡假村',
        '按照上级布署安排',
        '冬冬今天戴来了一本好看的童话书',
        '少先队员因该为老人让坐',
        '服装店里的衣服各试各样',
        '一只小鱼船浮在平净的河面上',
        '我的家乡是有明的渔米之乡',
        ' _ ,',
        '我对于宠物出租得事非常认同,因为其实很多人喜欢宠物',  # expected: 出租的事
        '有了宠物出租地方另一方面还可以题高人类对动物的了解,因为那些专业人氏可以指导我们对于动物的习惯。',  # 题高 => 提高, 专业人氏 => 专业人士
        '三个凑皮匠胜过一个诸葛亮也有道理。',  # 凑 => 臭
        '还有广告业是只要桌子前面坐者工作未必产生出来好的成果。',
        '还有我要看他们的个性,如果跟同时合不来受到压力的话,无法专心地工作。',
    ]
    template = "original sentence:{} => correct sentence:{}"
    for sentence in error_sentences:
        print(template.format(sentence, correct(sentence)))
def eval_sighan(input_path, output_path, param_ec, param_gd, verbose=False):
    '''
    Correct the SIGHAN-15 test sentences and write predictions to a file.

    Input:
        input_path: file of original sentences
                    form: (pid)\terror_sentence
        output_path: path of predicted sentences
                    form: (pid)\tcorrected_sentence
        param_ec, param_gd: tuning parameters forwarded to correct()
        verbose: print the error and corrected sentences during running or not
    '''
    sys.stderr.write('sighan15_test : start correcting sentences......\n')
    sys.stderr.write('error_sentences_path : ' + input_path + '\n')
    sys.stderr.write('corrected_sentences_path : ' + output_path + '\n')
    # `open` here is codecs.open (see file imports); text mode plus an
    # encoding is equivalent to the original 'rb' (codecs.open reopens in
    # binary whenever an encoding is given), and the with-block guarantees
    # both handles are closed even if correct() raises — the original leaked
    # them on any exception. The duplicated verbose/non-verbose loops are
    # merged into one.
    with open(input_path, 'r', encoding='utf-8') as sighan_data, \
            open(output_path, 'w+', encoding='utf-8') as corr_file:
        lines = sighan_data.readlines()
        for line in (lines if verbose else tqdm(lines)):
            pid, sentence = line.split('\t')
            pred_sent, pred_detail = correct(sentence.strip(), param_ec, param_gd)
            if verbose:
                sys.stderr.write('input sentence : ' + sentence + '\n')
                sys.stderr.write('pred sentence : ' + pred_sent + '\n')
                sys.stderr.write('predict change : ' + ', '.join(
                    [i[0][0] + '-->' + i[0][1] for i in pred_detail if i]) + '\n')
            corr_file.write(pid + '\t' + pred_sent + '\n')
    sys.stderr.write('sighan15_test : finishing correcting sentences\n')
def eval_sighan_corpus(pkl_path, verbose=False):
    """Walk a pickled SIGHAN corpus through correct() (scoring stubbed out).

    The comparison logic is commented out, so ``right_count`` never
    increments: the returned ratio is always 0.0 and both result dicts
    come back empty.

    NOTE(review): this file defines ``eval_sighan_corpus`` three times;
    only the last definition survives at import time.

    Args:
        pkl_path: pickle of (error_sentence, right_detail) pairs; the
            unpacking below implies right_detail holds
            (location, wrong_word, right_word) triples — confirm against
            the code that produced the pickle.
        verbose: print per-sentence diagnostics when True.

    Returns:
        (ratio, right_result, wrong_result) — currently (0.0, {}, {}).
    """
    sighan_data = load_pkl(pkl_path)
    total_count = 1  # seeded at 1 so the final division can never hit zero
    right_count = 0
    right_result = dict()
    wrong_result = dict()
    for error_sentence, right_detail in sighan_data:
        pred_sentence, pred_detail = correct(error_sentence)
        if verbose:
            print('input sentence:', error_sentence)
            print('pred sentence:', pred_sentence)
        for (right_loc, right_w, right_r) in right_detail:
            total_count += 1
            # if right_r == pred_r:
            #     right_count += 1
            #     right_result[error_sentence] = [right_r, pred_r]
            # else:
            #     wrong_result[error_sentence] = [right_r, pred_r]
            if verbose:
                print('right: {} => {} , index: {}'.format(right_w, right_r, right_loc))
    # if verbose:
    #     print('right count:', right_count, ';total count:', total_count)
    return right_count / total_count, right_result, wrong_result
def eval_sighan_corpus(pkl_path, verbose=False):
    """Run the corrector over a pickled SIGHAN corpus.

    Scoring is not implemented in this variant: right_count stays 0, so
    the returned ratio is always 0.0 and both result dicts stay empty.
    """
    sighan_data = load_pkl(pkl_path)
    total_count = 1  # seeded at 1 so the division below can never hit zero
    right_count = 0
    right_result = {}
    wrong_result = {}
    for error_sentence, right_detail in sighan_data:
        pred_sentence, pred_detail = correct(error_sentence)
        if verbose:
            print('input sentence:', error_sentence)
            print('pred sentence:', pred_sentence)
        for right_loc, right_w, right_r in right_detail:
            total_count += 1
            if verbose:
                print('right: {} => {} , index: {}'.format(
                    right_w, right_r, right_loc))
    return right_count / total_count, right_result, wrong_result
def eval_sighan_corpus(pkl_path, verbose=False):
    """Evaluate the corrector on a pickled SIGHAN corpus.

    A gold correction counts as a hit when it overlaps (substring match in
    either direction) any remaining predicted correction; matched
    predictions are consumed so they are not counted twice.

    Args:
        pkl_path: pickle of (error_sentence, right_detail) pairs, where
            right_detail is a list of (location, wrong_word, right_word)
            and each pred_detail element is [(wrong, right, begin, end)].
        verbose: print per-sentence diagnostics when True.

    Returns:
        (ratio, right_result, wrong_result): hit ratio plus a dict of the
        matched corrections; wrong_result is currently never populated.
    """
    sighan_data = load_pkl(pkl_path)
    total_count = 1  # seeded at 1 so the division below can never hit zero
    right_count = 0
    right_result = dict()
    wrong_result = dict()
    for error_sentence, right_detail in sighan_data:
        pred_sentence, pred_detail = correct(error_sentence)
        if verbose:
            print('input sentence:', error_sentence)
            print('pred sentence :', pred_sentence)
        for (right_loc, right_w, right_r) in right_detail:
            total_count += 1
            if pred_detail:
                # Iterate over a snapshot: the original removed items from
                # pred_detail while looping over it, which silently skipped
                # the element following every removal.
                for item in list(pred_detail):
                    [(pred_w, pred_r, pred_beg, pred_end)] = item
                    if right_r in pred_r or pred_r in right_r:
                        right_count += 1
                        right_result[error_sentence] = [right_r, pred_r]
                        pred_detail.remove(item)
            # The original's `elif not right_detail: right_count += 1` was
            # unreachable (this loop only runs when right_detail is
            # non-empty) and has been dropped.
            if verbose:
                print('right: {} => {} , index: {}'.format(
                    right_w, right_r, right_loc))
    return right_count / total_count, right_result, wrong_result
# -*- coding: utf-8 -*-
#!/usr/bin/env python
# Correct every sentence in source.txt and write predictions to prediction.txt.
import os
import sys

sys.path.append("../")
import re
from codecs import open
from pycorrector.corrector import correct
from pycorrector.utils.io_utils import load_pkl
from tqdm import tqdm
import pdb

pwd_path = os.path.abspath(os.path.dirname(__file__))
data_path = os.path.join(pwd_path, '../pycorrector/data/test/source.txt')
pred_path = os.path.join(pwd_path, '../pycorrector/data/test/prediction.txt')

# with-blocks ensure both handles close even if correct() raises — the
# original leaked them on any exception. Text mode 'r' with an encoding is
# equivalent to the original 'rb' under codecs.open, which reopens in
# binary whenever an encoding is supplied.
with open(data_path, 'r', encoding='utf-8') as input_file, \
        open(pred_path, 'w', encoding='utf-8') as output_file:
    for err_sent in tqdm(input_file.readlines()):
        # err_sent keeps its trailing newline; correct() is assumed to
        # preserve it in pred_sent — TODO confirm, otherwise append '\n'.
        pred_sent, pred_detail = correct(err_sent)
        output_file.write(pred_sent)
# '第一位京第二位,他说:第二位利好。', # '我准备一些面包给他吃,我也从冰箱拿出来了埤酒', # '所以我很高心', # '请我座在沙发上', # '美食美事皆不可辜负,这场盛会你一定期待已久', # '点击咨询痣疮是什么原因?咨询医师痣疮原因', # '附睾焱的症状?要引起注意!', # '外阴尖锐涅疣怎样治疗?-济群解析', # '洛阳大华雅思 30天突破雅思7分', # '男人不育少靖子症如何治疗?专业男科,烟台京城医院', # '疝気医院那好 疝気专科百科问答', # '成都医院治扁平苔鲜贵吗_国家2甲医院', # '少先队员因该为老人让坐', # '服装店里的衣服各试各样', # '一只小鱼船浮在平净的河面上', # '我的家乡是有明的渔米之乡', # ' _ ,', # '我对于宠物出租得事非常认同,因为其实很多人喜欢宠物', # 出租的事 # '有了宠物出租地方另一方面还可以题高人类对动物的了解,因为那些专业人氏可以指导我们对于动物的习惯。', # 题高 => 提高 专业人氏 => 专业人士 # '三个凑皮匠胜过一个诸葛亮也有道理。', # 凑 # '还有广告业是只要桌子前面坐者工作未必产生出来好的成果。', ] for line in error_sentences: print("starting correction...") correct_sent = corrector.correct(line) print("original sentence:{} => correct sentence:{}".format(line, correct_sent))
def reader(in_file):
    """Collect the first tab-separated field of every line in in_file."""
    collected = []
    with open(in_file, 'r', encoding='utf-8') as handle:
        for raw in handle:
            collected.append(raw.strip().split("\t")[0])
    print("in file: %s, cout: %d" % (in_file, len(collected)))
    return collected


def saver(out_file, lines):
    """Write each stripped entry of lines to out_file, one per line."""
    written = 0
    with open(out_file, 'w', encoding='utf-8') as handle:
        for entry in lines:
            handle.write(entry.strip() + '\n')
            written += 1
    print("save file: %s, cout: %d" % (out_file, written))


# Driver: correct each input line and save "input\tcorrected\tdetail" rows.
# (in_file / out_file are defined elsewhere in the original script.)
input_lines = reader(in_file)
correct_lines = []
for line in input_lines:
    correct_sent, error_detail = corrector.correct(line)
    print("{}\t{}\t{}".format(line, correct_sent, error_detail))
    correct_lines.append(line + '\t' + correct_sent + '\t' + str(error_detail))
saver(out_file, correct_lines)
def correct(): line = '少先队员因该为老人让坐' # line = '机七学习是人工智能领遇最能体现智能的' print('input sentence is:', line) print(correct(line))
def test_text3():
    """Correct a sentence with quoted text and punctuation-name typos."""
    sample = '我们现今所"使用"的大部分舒学符号,你们用的什么婊点符号'
    fixed = correct(sample)
    print("original sentence:{} => correct sentence:{}".format(sample, fixed))
def test_text2():
    """Correct a sentence containing several homophone typos."""
    sample = '杭洲是中国的八大古都之一,因风景锈丽,享有“人间天棠”的美誉!'
    fixed = correct(sample)
    print("original sentence:{} => correct sentence:{}".format(sample, fixed))
def test_text1():
    """Correct a machine-learning sentence seeded with typos."""
    sample = '机七学习是人工智能领遇最能体现智能的一个分知'
    fixed = correct(sample)
    print("original sentence:{} => correct sentence:{}".format(sample, fixed))
# Brief: feed a batch of noisy sentences through pycorrector and print each result.
from pycorrector import corrector

error_sentences = [
    '汽车新式在这条路上',
    '中国人工只能布局很不错',
    '想不想在来一次比赛',
    '你不觉的高兴吗',
    '权利的游戏第八季',
    '美食美事皆不可辜负,这场盛会你一定期待已久',
    '点击咨询痣疮是什么原因?咨询医师痣疮原因',
    '附睾焱的症状?要引起注意!',
    '外阴尖锐涅疣怎样治疗?-济群解析',
    '洛阳大华雅思 30天突破雅思7分',
    '男人不育少靖子症如何治疗?专业男科,烟台京城医院',
    '疝気医院那好 疝気专科百科问答',
    '成都医院治扁平苔鲜贵吗_国家2甲医院',
    '少先队员因该为老人让坐',
    '服装店里的衣服各试各样',
    '一只小鱼船浮在平净的河面上',
    '我的家乡是有明的渔米之乡',
    ' _ ,',
    '我对于宠物出租得事非常认同,因为其实很多人喜欢宠物',  # expected: 出租的事
    '有了宠物出租地方另一方面还可以题高人类对动物的了解,因为那些专业人氏可以指导我们对于动物的习惯。',  # 题高 => 提高, 专业人氏 => 专业人士
    '三个凑皮匠胜过一个诸葛亮也有道理。',  # 凑 => 臭
    '还有广告业是只要桌子前面坐者工作未必产生出来好的成果。',
]

template = "original sentence:{} => correct sentence:{}"
for sent in error_sentences:
    print(template.format(sent, corrector.correct(sent)))
# NOTE(review): incomplete fragment — this first run of statements is the
# body of reader(in_file) with its `def reader(in_file):` header missing
# from this chunk (hence the bare top-level `return lines` below); the
# code is not syntactically valid until the header is restored.
lines = list()
cout = 0
with open(in_file, 'r', encoding='utf-8') as f:
    for line in f:
        line = line.strip()
        text = line.split("\t")[0]
        lines.append(text)
        cout += 1
print("in file: %s, cout: %d" % (in_file, cout))
return lines


def saver(out_file, lines):
    # Write each stripped entry of lines to out_file, one per line, and
    # report how many were written.
    cout = 0
    with open(out_file, 'w', encoding='utf-8') as f:
        for line in lines:
            line = line.strip()
            f.write(line + '\n')
            cout += 1
    print("save file: %s, cout: %d" % (out_file, cout))


# Driver: correct each input line and save "input\tcorrected\tdetail" rows.
# (in_file / out_file are presumably defined earlier in the original
# script — not visible in this chunk.)
input_lines = reader(in_file)
correct_lines = list()
for line in input_lines:
    correct_sent, error_detail = corrector.correct(line)
    print("{}\t{}\t{}".format(
        line, correct_sent, error_detail))
    correct_lines.append(line + '\t' + correct_sent + '\t' + str(error_detail))
saver(out_file, correct_lines)