def test_build_confusion_dict():
    """Collect (error_word, right_word) confusion pairs from the CLP and SIGHAN pickles."""
    confusions = []
    for pkl_path in (clp_path, sighan_path):
        sighan_data = load_pkl(pkl_path)
        for error_sentence, right_detail in sighan_data:
            if right_detail:
                # right_detail[0] is (index, error_word, right_word); drop the index
                pair = right_detail[0][1:]
                if pair not in confusions:
                    confusions.append(pair)
    with open('a.txt', 'w', encoding='utf-8') as f:
        for error_word, right_word in confusions:
            f.write(error_word + '\t' + right_word + '\n')
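# A minimal sketch of reading the confusion pairs back, assuming the
# tab-separated 'a.txt' written above; load_confusion_dict is a hypothetical
# helper, not part of the original code.
def load_confusion_dict(path='a.txt'):
    confusion = {}
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if len(parts) == 2:
                confusion[parts[0]] = parts[1]
    return confusion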
def eval_sighan_corpus(pkl_path, verbose=False):
    sighan_data = load_pkl(pkl_path)
    total_count = 0
    right_count = 0
    for error_sentence, details in sighan_data:
        if not details:
            continue
        # apply every gold correction cumulatively, not just the last one
        correct_sentence = error_sentence
        for detail in details:
            # detail: (index, error_word, right_word)
            error_word = detail[1]
            right_word = detail[2]
            correct_sentence = correct_sentence.replace(error_word, right_word)
        # pred_detail: list(wrong, right, begin_idx, end_idx)
        pred_sentence, pred_detail = pycorrector.correct(error_sentence)
        if pred_sentence == correct_sentence:
            right_count += 1
        elif verbose:
            print('truth:', correct_sentence, details)
            print('predict:', pred_sentence, pred_detail)
        total_count += 1
    right_rate = right_count / total_count if total_count > 0 else 0.0
    return right_rate
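# Hedged usage sketch: './sighan15.pkl' is a placeholder path to a pickled
# list of (error_sentence, details) pairs, as consumed above.
acc = eval_sighan_corpus('./sighan15.pkl', verbose=True)
print('sentence-level accuracy: {:.4f}'.format(acc))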
def build_sighan_corpus(data_path, output_path):
    corpus = []
    sighan_data = load_pkl(data_path)
    for error_sentence, error_details in sighan_data:
        if not error_details:
            continue
        ids = []
        correct_sentence = error_sentence
        for detail in error_details:
            # detail: (index, error_word, right_word); indices are 1-based
            idx = detail[0]
            error_word = detail[1]
            right_word = detail[2]
            begin_idx = idx - 1
            ids.append(begin_idx)
            # apply every correction cumulatively, not just the last one
            correct_sentence = correct_sentence.replace(error_word, right_word)
        details = [[error_sentence[i], correct_sentence[i], i, i + 1] for i in ids]
        line_dict = {
            "text": error_sentence,
            "correction": correct_sentence,
            "errors": details,
        }
        corpus.append(line_dict)
    save_json(corpus, output_path)
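# Shape of one record written by build_sighan_corpus (illustrative values,
# not drawn from the real SIGHAN data):
# {
#     "text":       "我今天吃平果",         # sentence containing the error
#     "correction": "我今天吃苹果",         # gold-corrected sentence
#     "errors":     [["平", "苹", 4, 5]]    # [wrong_char, right_char, begin, end]
# }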
def eval_sighan_corpus(pkl_path, verbose=False):
    sighan_data = load_pkl(pkl_path)
    total_count = 1
    right_count = 0
    for error_sentence, right_detail in sighan_data:
        # pred_detail: list(wrong, right, begin_idx, end_idx)
        pred_sentence, pred_detail = correct(error_sentence)
        if verbose:
            print('input sentence:', error_sentence, right_detail)
            print('pred sentence:', pred_sentence, pred_detail)
        if len(right_detail) != len(pred_detail):
            total_count += 1
        else:
            right_count += 1
    return right_count / total_count
def eval_sighan_corpus(pkl_path, verbose=False):
    sighan_data = load_pkl(pkl_path)
    total_count = 1  # start at 1 to avoid division by zero
    right_count = 0
    right_result = dict()
    wrong_result = dict()
    for error_sentence, right_detail in sighan_data:
        pred_sentence, pred_detail = correct(error_sentence)
        if verbose:
            print('input sentence:', error_sentence)
            print('pred sentence:', pred_sentence)
        for (right_loc, right_w, right_r) in right_detail:
            total_count += 1
            # NOTE: the comparison of right_r against the prediction was left
            # commented out in this version, so right_count stays 0 and
            # right_result / wrong_result stay empty; see the variant below
            # for the implemented matching.
            if verbose:
                print('right: {} => {} , index: {}'.format(right_w, right_r, right_loc))
    return right_count / total_count, right_result, wrong_result
def eval_sighan_corpus(pkl_path, verbose=False):
    sighan_data = load_pkl(pkl_path)
    total_count = 1  # start at 1 to avoid division by zero
    right_count = 0
    right_result = dict()
    wrong_result = dict()
    for error_sentence, right_detail in sighan_data:
        pred_sentence, pred_detail = correct(error_sentence)
        if verbose:
            print('input sentence:', error_sentence)
            print('pred sentence :', pred_sentence)
        # sentence with no gold errors: correct if nothing was predicted.
        # (The original checked this inside the loop below, where it could
        # never fire because the loop only runs when right_detail is non-empty.)
        if not right_detail:
            total_count += 1
            if not pred_detail:
                right_count += 1
            continue
        for (right_loc, right_w, right_r) in right_detail:
            total_count += 1
            # pred_detail items are (wrong, right, begin_idx, end_idx);
            # iterate over a copy so the matched item can be removed safely
            for pred_w, pred_r, pred_beg, pred_end in list(pred_detail):
                if right_r in pred_r or pred_r in right_r:
                    right_count += 1
                    right_result[error_sentence] = [right_r, pred_r]
                    pred_detail.remove((pred_w, pred_r, pred_beg, pred_end))
                    break  # each gold error consumes at most one prediction
            if verbose:
                print('right: {} => {} , index: {}'.format(right_w, right_r, right_loc))
    return right_count / total_count, right_result, wrong_result
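# Hedged usage sketch: the pickle path is a placeholder; right_result maps
# each sentence to [gold_correction, predicted_correction].
rate, right_result, wrong_result = eval_sighan_corpus('./sighan15.pkl')
print('correction hit rate: {:.4f}'.format(rate))
for sent, (gold, pred) in list(right_result.items())[:5]:
    print(sent, '->', gold, pred)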
from collections import defaultdict


# The opening lines of this loader were missing from the fragment; the def
# line, file-reading loop, and defaultdict(set) initialisation are an
# assumption, restored to mirror the parallel confusion-set loader further
# down. The name and single-path signature follow the call site below.
def load_2char_dict(path, sep='\t'):
    result = defaultdict(set)
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            parts = line.strip().split(sep)
            if parts and len(parts) > 1:
                for c in parts:
                    # map every character in the group to the whole group
                    result[c] |= set(parts)
    return result


char_dict_path = os.path.join(pwd_path, config.char_dict_path)
cn_char_set = load_char_dict(char_dict_path)
two_char_dict = load_2char_dict(pwd_path + '/data/char_two_set.txt')

# word dictionary
word_dict_text_path = os.path.join(pwd_path, config.word_dict_path)
word_dict_model_path = os.path.join(pwd_path, config.word_dict_model_path)
if os.path.exists(word_dict_model_path):
    cn_word_set = load_pkl(word_dict_model_path)
else:
    default_logger.debug('load word dict from text file: %s' % word_dict_text_path)
    cn_word_set = load_word_dict(word_dict_text_path)
    dump_pkl(cn_word_set, word_dict_model_path)

# similar pronunciation
same_pinyin_text_path = os.path.join(pwd_path, config.same_pinyin_text_path)
same_pinyin_model_path = os.path.join(pwd_path, config.same_pinyin_model_path)
if os.path.exists(same_pinyin_model_path):
    same_pinyin = load_pkl(same_pinyin_model_path)
else:
    default_logger.debug('load same pinyin from text file: %s' % same_pinyin_text_path)
    same_pinyin = load_same_pinyin(same_pinyin_text_path)
    dump_pkl(same_pinyin, same_pinyin_model_path)
def load_word_freq_dict(path):
    word_freq = {}
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            info = line.split()
            word = info[0]
            freq = int(info[1])
            word_freq[word] = freq
    return word_freq


# word frequency statistics
word_dict_path = os.path.join(pwd_path, config.word_dict_path)
word_dict_model_path = os.path.join(pwd_path, config.word_dict_model_path)
if os.path.exists(word_dict_model_path):
    word_freq = load_pkl(word_dict_model_path)
else:
    default_logger.debug('load word freq from text file: %s' % word_dict_path)
    word_freq = load_word_freq_dict(word_dict_path)
    dump_pkl(word_freq, word_dict_model_path)


def get_ngram_score(chars, mode=trigram_char):
    """
    Get the n-gram language-model score.
    :param chars: list, tokenized by word or character
    :param mode: kenlm model used for scoring
    :return: log10 probability of the sequence
    """
    return mode.score(' '.join(chars), bos=False, eos=False)
def load_word_freq_dict(path):
    word_freq = {}
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            info = line.split()
            word = info[0]
            freq = int(info[1])
            word_freq[word] = freq
    return word_freq


# word frequency statistics
word_freq_path = os.path.join(pwd_path, config.word_freq_path)
word_freq_model_path = os.path.join(pwd_path, config.word_freq_model_path)
if os.path.exists(word_freq_model_path):
    word_freq = load_pkl(word_freq_model_path)
else:
    default_logger.debug('load word freq from text file: %s' % word_freq_path)
    word_freq = load_word_freq_dict(word_freq_path)
    dump_pkl(word_freq, word_freq_model_path)


def get_ngram_score(chars, mode=trigram_char):
    """
    Get the n-gram language-model score.
    :param chars: list, tokenized by word or character
    :param mode: kenlm model used for scoring
    :return: log10 probability of the sequence
    """
    return mode.score(' '.join(chars), bos=False, eos=False)
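# Hedged usage sketch: assumes trigram_char is a character-level kenlm model,
# as the default argument suggests; kenlm's Model.score returns a log10
# probability, so a higher (less negative) value means a more fluent sequence.
sentence = '机器学习是人工智能领域'
print(get_ngram_score(list(sentence)))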
    # Fragment: body of a confusion-set loader whose def line is missing
    # (it matches how load_same_pinyin / load_same_stroke are used below);
    # each input line holds a group of confusable characters joined by `sep`.
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = traditional2simplified(line.strip())
            parts = line.strip().split(sep)
            if parts and len(parts) > 1:
                for i, c in enumerate(parts):
                    # every other character in the group is confusable with c
                    result[c] = set(parts[:i] + parts[i + 1:])
    return result


cn_char_set = load_word_dict(char_file_path)

# same-pinyin (homophone) characters
same_pinyin_text_path = os.path.join(pwd_path, config.same_pinyin_text_path)
same_pinyin_model_path = os.path.join(pwd_path, config.same_pinyin_model_path)
if os.path.exists(same_pinyin_model_path):
    same_pinyin = load_pkl(same_pinyin_model_path)
else:
    default_logger.debug('load same pinyin from text file: %s' % same_pinyin_text_path)
    same_pinyin = load_same_pinyin(same_pinyin_text_path)
    dump_pkl(same_pinyin, same_pinyin_model_path)

# visually similar (same-stroke) characters
same_stroke_text_path = os.path.join(pwd_path, config.same_stroke_text_path)
same_stroke_model_path = os.path.join(pwd_path, config.same_stroke_model_path)
if os.path.exists(same_stroke_model_path):
    same_stroke = load_pkl(same_stroke_model_path)
else:
    default_logger.debug('load same stroke from text file: %s' % same_stroke_text_path)
    same_stroke = load_same_stroke(same_stroke_text_path)
    dump_pkl(same_stroke, same_stroke_model_path)
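# Hedged lookup sketch: same_pinyin and same_stroke map a character to a set
# of confusable characters; '长' is only an illustrative input.
ch = '长'
print('same pinyin:', same_pinyin.get(ch, set()))
print('same stroke:', same_stroke.get(ch, set()))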