Example 1
def test_build_confusion_dict():
    # collect unique (wrong, right) pairs from both corpora
    confusions = []
    for path in (clp_path, sighan_path):
        sighan_data = load_pkl(path)
        for error_sentence, right_detail in sighan_data:
            if right_detail and right_detail[0][1:] not in confusions:
                confusions.append(right_detail[0][1:])
    with open('a.txt', 'w', encoding='utf-8') as f:
        for wrong, right in confusions:
            f.write(wrong + '\t' + right + '\n')
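
Each pickled record is assumed, from the indexing above, to be a pair (error_sentence, details) whose details are (1-based index, wrong_char, right_char) triples; right_detail[0][1:] therefore keeps just the (wrong, right) pair of the first error. A minimal sketch with a made-up record, not real SIGHAN data:

# hypothetical record shape, inferred from the indexing above
record = ('我真的很喜欢这个礼勿', [(10, '勿', '物')])
error_sentence, right_detail = record
pair = right_detail[0][1:]       # ('勿', '物'): drop the 1-based index
print(pair[0] + '\t' + pair[1])  # the "wrong<TAB>right" line written to a.txt
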
Example 2
def eval_sighan_corpus(pkl_path, verbose=False):
    sighan_data = load_pkl(pkl_path)
    total_count = 0
    right_count = 0
    for error_sentence, details in sighan_data:
        if not details:
            continue
        # apply every annotated correction, not only the last one;
        # each detail is (1-based index, wrong_word, right_word)
        correct_sentence = error_sentence
        for detail in details:
            error_word, right_word = detail[1], detail[2]
            correct_sentence = correct_sentence.replace(error_word, right_word)
        #  pred_detail: list(wrong, right, begin_idx, end_idx)
        pred_sentence, pred_detail = pycorrector.correct(error_sentence)
        if pred_sentence == correct_sentence:
            right_count += 1
        elif verbose:
            print('truth:', correct_sentence, details)
            print('predict:', pred_sentence, pred_detail)
        total_count += 1
    return right_count / total_count if total_count > 0 else 0.0
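
A minimal usage sketch; the pickle path below is a placeholder, not the project's real path, and pycorrector is assumed to be imported at module level:

if __name__ == '__main__':
    # 'output/sighan15.pkl' is a hypothetical path
    rate = eval_sighan_corpus('output/sighan15.pkl', verbose=True)
    print('sentence-level accuracy: {:.4f}'.format(rate))
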
Example 3
def build_sighan_corpus(data_path, output_path):
    corpus = []
    sighan_data = load_pkl(data_path)
    for error_sentence, error_details in sighan_data:
        if not error_details:
            continue
        ids = []
        correct_sentence = error_sentence
        # apply every annotated correction; indexes in the data are 1-based
        for detail in error_details:
            idx, error_word, right_word = detail[0], detail[1], detail[2]
            correct_sentence = correct_sentence.replace(error_word, right_word)
            ids.append(idx - 1)
        # errors: list of [wrong_char, right_char, begin_idx, end_idx]
        details = [[error_sentence[i], correct_sentence[i], i, i + 1] for i in ids]
        line_dict = {
            "text": error_sentence,
            "correction": correct_sentence,
            "errors": details
        }
        corpus.append(line_dict)
    save_json(corpus, output_path)
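
For the hypothetical record used earlier, one emitted corpus entry would look like this; offsets in errors are 0-based character positions [wrong, right, begin_idx, end_idx]:

# illustrative output record (made-up values)
{
    "text": "我真的很喜欢这个礼勿",
    "correction": "我真的很喜欢这个礼物",
    "errors": [["勿", "物", 9, 10]]
}
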
Example 4
def eval_sighan_corpus(pkl_path, verbose=False):
    sighan_data = load_pkl(pkl_path)
    total_count = 0
    right_count = 0
    for error_sentence, right_detail in sighan_data:
        #  pred_detail: list(wrong, right, begin_idx, end_idx)
        pred_sentence, pred_detail = correct(error_sentence)
        if verbose:
            print('input sentence:', error_sentence, right_detail)
            print('pred sentence:', pred_sentence, pred_detail)
        total_count += 1
        # a sentence counts as right when the number of predicted
        # corrections equals the number of gold corrections
        if len(right_detail) == len(pred_detail):
            right_count += 1
    return right_count / total_count if total_count > 0 else 0.0
Example 5
def eval_sighan_corpus(pkl_path, verbose=False):
    sighan_data = load_pkl(pkl_path)
    total_count = 0
    right_count = 0
    right_result = dict()
    wrong_result = dict()
    for error_sentence, right_detail in sighan_data:
        pred_sentence, pred_detail = correct(error_sentence)
        if verbose:
            print('input sentence:', error_sentence)
            print('pred sentence:', pred_sentence)
        for (right_loc, right_w, right_r) in right_detail:
            total_count += 1
            # count the gold edit as right when some prediction proposes
            # exactly the same correction; record the pairing either way
            pred_rs = [p[1] for p in pred_detail]
            if right_r in pred_rs:
                right_count += 1
                right_result[error_sentence] = [right_r, right_r]
            else:
                wrong_result[error_sentence] = [right_r, pred_rs]
            if verbose:
                print('right: {} => {} , index: {}'.format(right_w, right_r, right_loc))
    rate = right_count / total_count if total_count > 0 else 0.0
    return rate, right_result, wrong_result
Example 6
def eval_sighan_corpus(pkl_path, verbose=False):
    sighan_data = load_pkl(pkl_path)
    total_count = 0
    right_count = 0
    right_result = dict()
    wrong_result = dict()
    for error_sentence, right_detail in sighan_data:
        pred_sentence, pred_detail = correct(error_sentence)
        if verbose:
            print('input sentence:', error_sentence)
            print('pred sentence :', pred_sentence)
        for (right_loc, right_w, right_r) in right_detail:
            total_count += 1
            # look for a prediction whose correction overlaps the gold one;
            # iterate over a copy so matched predictions can be consumed,
            # preventing one prediction from satisfying two gold edits
            matched = False
            for pred in list(pred_detail):
                pred_w, pred_r = pred[0], pred[1]
                if right_r in pred_r or pred_r in right_r:
                    right_count += 1
                    right_result[error_sentence] = [right_r, pred_r]
                    pred_detail.remove(pred)
                    matched = True
                    break
            if not matched:
                wrong_result[error_sentence] = [right_r, [p[1] for p in pred_detail]]
            if verbose:
                print('right: {} => {} , index: {}'.format(
                    right_w, right_r, right_loc))
    rate = right_count / total_count if total_count > 0 else 0.0
    return rate, right_result, wrong_result
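
A toy check of the substring-overlap rule above, with made-up values rather than SIGHAN data:

# gold correction '应' is contained in the predicted correction '应该',
# so this pair counts as right under the overlap rule
pred_detail = [('因该', '应该', 4, 6)]
right_r = '应'
print(any(right_r in p[1] or p[1] in right_r for p in pred_detail))  # True
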
Example 7
def eval_sighan_corpus(pkl_path, verbose=False):
    sighan_data = load_pkl(pkl_path)
    total_count = 0
    right_count = 0
    right_result = dict()
    wrong_result = dict()
    for error_sentence, right_detail in sighan_data:
        pred_sentence, pred_detail = correct(error_sentence)
        if verbose:
            print('input sentence:', error_sentence)
            print('pred sentence:', pred_sentence)
        for (right_loc, right_w, right_r) in right_detail:
            total_count += 1
            # count the gold edit as right when some prediction proposes
            # exactly the same correction; record the pairing either way
            pred_rs = [p[1] for p in pred_detail]
            if right_r in pred_rs:
                right_count += 1
                right_result[error_sentence] = [right_r, right_r]
            else:
                wrong_result[error_sentence] = [right_r, pred_rs]
            if verbose:
                print('right: {} => {} , index: {}'.format(
                    right_w, right_r, right_loc))
    rate = right_count / total_count if total_count > 0 else 0.0
    return rate, right_result, wrong_result
Example 8
            if parts and len(parts) > 1:
                for i, c in enumerate(parts):
                    # result[c].add(c)
                    # result[c] |= set(list(parts[:i] + parts[i + 1:]))
                    result[c] |= set(parts)
    return result

char_dict_path = os.path.join(pwd_path, config.char_dict_path)
cn_char_set = load_char_dict(char_dict_path)
two_char_dict = load_2char_dict(pwd_path + '/data/char_two_set.txt')

# word dictionary
word_dict_text_path = os.path.join(pwd_path, config.word_dict_path)
word_dict_model_path = os.path.join(pwd_path, config.word_dict_model_path)
if os.path.exists(word_dict_model_path):
    cn_word_set = load_pkl(word_dict_model_path)
else:
    default_logger.debug('load word dict from text file: %s' % word_dict_text_path)
    cn_word_set = load_word_dict(word_dict_text_path)
    dump_pkl(cn_word_set, word_dict_model_path)

# similar pronunciation
same_pinyin_text_path = os.path.join(pwd_path, config.same_pinyin_text_path)
same_pinyin_model_path = os.path.join(pwd_path, config.same_pinyin_model_path)
if os.path.exists(same_pinyin_model_path):
    same_pinyin = load_pkl(same_pinyin_model_path)
else:
    default_logger.debug('load same pinyin from text file: %s' % same_pinyin_text_path)
    same_pinyin = load_same_pinyin(same_pinyin_text_path)
    dump_pkl(same_pinyin, same_pinyin_model_path)
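
The load-pickle-else-build-and-cache pattern repeats for every resource above; a small helper could factor it out. This is only a sketch built on the module's own load_pkl/dump_pkl/default_logger; load_or_build is not part of the original module, and load_fn stands in for any of the text loaders:

def load_or_build(model_path, text_path, load_fn):
    """Return the cached pickle if present, else build from text and cache it."""
    if os.path.exists(model_path):
        return load_pkl(model_path)
    default_logger.debug('load from text file: %s' % text_path)
    obj = load_fn(text_path)
    dump_pkl(obj, model_path)
    return obj

# e.g. same_pinyin = load_or_build(same_pinyin_model_path,
#                                  same_pinyin_text_path, load_same_pinyin)
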
Example 9
def load_word_freq_dict(path):
    word_freq = {}
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            info = line.split()
            # skip blank or malformed lines (need at least word and count)
            if len(info) < 2:
                continue
            word_freq[info[0]] = int(info[1])
    return word_freq
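
The loader expects one whitespace-separated word/count pair per line. A quick self-check with a temporary file (illustrative values, not shipped with the project):

import os
import tempfile

with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False,
                                 encoding='utf-8') as tmp:
    tmp.write('的 727749\n我们 64570\n')
    tmp_path = tmp.name
print(load_word_freq_dict(tmp_path))  # {'的': 727749, '我们': 64570}
os.remove(tmp_path)
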


# word frequency statistics
word_dict_path = os.path.join(pwd_path, config.word_dict_path)
word_dict_model_path = os.path.join(pwd_path, config.word_dict_model_path)
if os.path.exists(word_dict_model_path):
    word_freq = load_pkl(word_dict_model_path)
else:
    default_logger.debug('load word freq from text file: %s' % word_dict_path)
    word_freq = load_word_freq_dict(word_dict_path)
    dump_pkl(word_freq, word_dict_model_path)


def get_ngram_score(chars, mode=trigram_char):
    """
    Get the n-gram score.
    :param chars: list, segmented by word or by character
    :param mode: language model used for scoring
    :return: score from the language model
    """
    return mode.score(' '.join(chars), bos=False, eos=False)
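
trigram_char is presumably a kenlm model (kenlm's Model.score takes a space-joined sentence plus bos/eos flags and returns a log10 probability). A hedged sketch comparing two candidate sentences character by character; the exact scores depend on the trained model:

# assuming trigram_char is a loaded kenlm.Model
good = get_ngram_score(list('少先队员应该为老人让座'))
bad = get_ngram_score(list('少先队员因该为老人让座'))
print(good > bad)  # expected True if the model prefers the correct sentence
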
Example 10
def load_word_freq_dict(path):
    word_freq = {}
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            info = line.split()
            # skip blank or malformed lines (need at least word and count)
            if len(info) < 2:
                continue
            word_freq[info[0]] = int(info[1])
    return word_freq


# word frequency statistics
word_freq_path = os.path.join(pwd_path, config.word_freq_path)
word_freq_model_path = os.path.join(pwd_path, config.word_freq_model_path)
if os.path.exists(word_freq_model_path):
    word_freq = load_pkl(word_freq_model_path)
else:
    default_logger.debug('load word freq from text file: %s' % word_freq_path)
    word_freq = load_word_freq_dict(word_freq_path)
    dump_pkl(word_freq, word_freq_model_path)


def get_ngram_score(chars, mode=trigram_char):
    """
    Get the n-gram score.
    :param chars: list, segmented by word or by character
    :param mode: language model used for scoring
    :return: score from the language model
    """
    return mode.score(' '.join(chars), bos=False, eos=False)
Example 11
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = traditional2simplified(line.strip())
            parts = line.strip().split(sep)
            if parts and len(parts) > 1:
                for i, c in enumerate(parts):
                    result[c] = set(list(parts[:i] + parts[i + 1:]))
    return result


cn_char_set = load_word_dict(char_file_path)
same_pinyin_text_path = os.path.join(pwd_path, config.same_pinyin_text_path)
same_pinyin_model_path = os.path.join(pwd_path, config.same_pinyin_model_path)
# homophones (characters with the same pinyin)
if os.path.exists(same_pinyin_model_path):
    same_pinyin = load_pkl(same_pinyin_model_path)
else:
    default_logger.debug('load same pinyin from text file: %s' % same_pinyin_text_path)
    same_pinyin = load_same_pinyin(same_pinyin_text_path)
    dump_pkl(same_pinyin, same_pinyin_model_path)

# visually similar characters (similar strokes)
same_stroke_text_path = os.path.join(pwd_path, config.same_stroke_text_path)
same_stroke_model_path = os.path.join(pwd_path, config.same_stroke_model_path)
if os.path.exists(same_stroke_model_path):
    same_stroke = load_pkl(same_stroke_model_path)
else:
    default_logger.debug('load same stroke from text file: %s' % same_stroke_text_path)
    same_stroke = load_same_stroke(same_stroke_text_path)
    dump_pkl(same_stroke, same_stroke_model_path)
Example 12
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            line = traditional2simplified(line.strip())
            parts = line.strip().split(sep)
            if parts and len(parts) > 1:
                for i, c in enumerate(parts):
                    result[c] = set(list(parts[:i] + parts[i + 1:]))
    return result


cn_char_set = load_word_dict(char_file_path)
same_pinyin_text_path = os.path.join(pwd_path, config.same_pinyin_text_path)
same_pinyin_model_path = os.path.join(pwd_path, config.same_pinyin_model_path)
# homophones (characters with the same pinyin)
if os.path.exists(same_pinyin_model_path):
    same_pinyin = load_pkl(same_pinyin_model_path)
else:
    default_logger.debug('load same pinyin from text file: %s' % same_pinyin_text_path)
    same_pinyin = load_same_pinyin(same_pinyin_text_path)
    dump_pkl(same_pinyin, same_pinyin_model_path)

# visually similar characters (similar strokes)
same_stroke_text_path = os.path.join(pwd_path, config.same_stroke_text_path)
same_stroke_model_path = os.path.join(pwd_path, config.same_stroke_model_path)
if os.path.exists(same_stroke_model_path):
    same_stroke = load_pkl(same_stroke_model_path)
else:
    default_logger.debug('load same stroke from text file: %s' % same_stroke_text_path)
    same_stroke = load_same_stroke(same_stroke_text_path)
    dump_pkl(same_stroke, same_stroke_model_path)
Example 13
def load_word_freq_dict(path):
    word_freq = {}
    with codecs.open(path, 'r', encoding='utf-8') as f:
        for line in f:
            info = line.split()
            # skip blank or malformed lines (need at least word and count)
            if len(info) < 2:
                continue
            word_freq[info[0]] = int(info[1])
    return word_freq


# word frequency statistics
word_freq_path = os.path.join(pwd_path, config.word_freq_path)
word_freq_model_path = os.path.join(pwd_path, config.word_freq_model_path)
if os.path.exists(word_freq_model_path):
    word_freq = load_pkl(word_freq_model_path)
else:
    default_logger.debug('load word freq from text file: %s' % word_freq_path)
    word_freq = load_word_freq_dict(word_freq_path)
    dump_pkl(word_freq, word_freq_model_path)


def get_ngram_score(chars, mode=trigram_char):
    """
    Get the n-gram score.
    :param chars: list, segmented by word or by character
    :param mode: language model used for scoring
    :return: score from the language model
    """
    return mode.score(' '.join(chars), bos=False, eos=False)