Esempio n. 1
0
def build_eval_corpus():
    bcmi_path = os.path.join(pwd_path, '../data/cn/bcmi.txt')
    clp_path = os.path.join(pwd_path, '../data/cn/clp14_C1.pkl')
    sighan_path = os.path.join(pwd_path, '../data/cn/sighan15_A2.pkl')
    cged_path = os.path.join(pwd_path, '../data/cn/CGED/CGED16_HSK_TrainingSet.xml')

    char_error_path = os.path.join(pwd_path, './bcmi_corpus.json')
    build_bcmi_corpus(bcmi_path, char_error_path)
    char_errors = load_json(char_error_path)

    word_error_path = os.path.join(pwd_path, './sighan_corpus.json')
    build_sighan_corpus(sighan_path, word_error_path)
    word_errors = load_json(word_error_path)

    grammar_error_path = os.path.join(pwd_path, './clp_corpus.json')
    build_sighan_corpus(clp_path, grammar_error_path)
    grammar_errors = load_json(grammar_error_path)

    no_error_path = os.path.join(pwd_path, './noerror_corpus.json')
    build_cged_no_error_corpus(cged_path, no_error_path)
    no_errors = load_json(no_error_path)

    corpus = sample(char_errors, 100) + sample(word_errors, 100) + sample(grammar_errors, 100) + sample(no_errors, 200)
    save_json(corpus, eval_data_path)
    print("save eval corpus done", eval_data_path)
    os.remove(char_error_path)
    os.remove(word_error_path)
    os.remove(grammar_error_path)
    os.remove(no_error_path)
Esempio n. 2
0
def eval_corpus500_by_macbert(input_eval_path=eval_data_path, output_eval_path='', verbose=True):
    from pycorrector.macbert.macbert_corrector import MacBertCorrector
    model = MacBertCorrector()
    res = []
    corpus = load_json(input_eval_path)
    total_count = 0
    right_count = 0
    right_rate = 0.0
    recall_rate = 0.0
    recall_right_count = 0
    recall_total_count = 0
    start_time = time.time()
    for data_dict in corpus:
        text = data_dict.get('text', '')
        correction = data_dict.get('correction', '')
        errors = data_dict.get('errors', [])

        #  pred_detail: list(wrong, right, begin_idx, end_idx)
        pred_sentence, pred_detail = model.macbert_correct(text)
        # compute recall
        if errors:
            recall_total_count += 1
            if errors and pred_detail and correction == pred_sentence:
                recall_right_count += 1

        # compute precision
        if correction == pred_sentence:
            right_count += 1
            print("\nright:")
            print('truth  :', text, errors)
            print('predict:', pred_sentence, pred_detail)
        else:
            err_data_dict = copy.deepcopy(data_dict)
            err_data_dict['pred_sentence'] = pred_sentence
            err_data_dict['pred_errors'] = str(pred_detail)
            res.append(err_data_dict)
            if verbose:
                print("\nwrong:")
                print('input  :', text)
                print('truth  :', correction, errors)
                print('predict:', pred_sentence, pred_detail)
        total_count += 1
    spend_time = time.time() - start_time
    if total_count > 0:
        right_rate = right_count / total_count
    if recall_total_count > 0:
        recall_rate = recall_right_count / recall_total_count
    print('right_rate:{}, right_count:{}, total_count:{};\n'
          'recall_rate:{}, recall_right_count:{}, recall_total_count:{}, spend_time:{} s'.format(right_rate,
                                                                                                 right_count,
                                                                                                 total_count,
                                                                                                 recall_rate,
                                                                                                 recall_right_count,
                                                                                                 recall_total_count,
                                                                                                 spend_time))
    if output_eval_path:
        save_json(res, output_eval_path)
Esempio n. 3
0
def build_eval_corpus(output_eval_path=eval_data_path):
    """
    生成评估样本集,抽样分布可修改
    当前已经生成评估集,可以修改代码生成自己的样本分布
    :param output_eval_path:
    :return: json file
    """
    bcmi_path = os.path.join(pwd_path, '../data/cn/bcmi.txt')
    clp_path = os.path.join(pwd_path, '../data/cn/clp14_C1.pkl')
    sighan_path = os.path.join(pwd_path, '../data/cn/sighan15_A2.pkl')
    cged_path = os.path.join(pwd_path,
                             '../data/cn/CGED/CGED16_HSK_TrainingSet.xml')

    char_error_path = os.path.join(pwd_path, './bcmi_corpus.json')
    build_bcmi_corpus(bcmi_path, char_error_path)
    char_errors = load_json(char_error_path)

    word_error_path = os.path.join(pwd_path, './sighan_corpus.json')
    build_sighan_corpus(sighan_path, word_error_path)
    word_errors = load_json(word_error_path)

    grammar_error_path = os.path.join(pwd_path, './clp_corpus.json')
    build_sighan_corpus(clp_path, grammar_error_path)
    grammar_errors = load_json(grammar_error_path)

    no_error_path = os.path.join(pwd_path, './noerror_corpus.json')
    build_cged_no_error_corpus(cged_path, no_error_path)
    no_errors = load_json(no_error_path)

    corpus = sample(char_errors, 100) + sample(word_errors, 100) + sample(
        grammar_errors, 100) + sample(no_errors, 200)
    save_json(corpus, output_eval_path)
    print("save eval corpus done", output_eval_path)
    os.remove(char_error_path)
    os.remove(word_error_path)
    os.remove(grammar_error_path)
    os.remove(no_error_path)
Esempio n. 4
0
def eval_corpus_by_bert(input_eval_path=eval_data_path,
                        output_eval_path=output_eval_error_path,
                        verbose=True):
    from pycorrector.bert.bert_corrector import BertCorrector
    model = BertCorrector()
    res = []
    corpus = load_json(input_eval_path)
    total_count = 0
    right_count = 0
    right_rate = 0.0
    recall_rate = 0.0
    recall_right_count = 0
    recall_total_count = 0
    for data_dict in corpus:
        text = data_dict.get('text', '')
        correction = data_dict.get('correction', '')
        errors = data_dict.get('errors', [])

        #  pred_detail: list(wrong, right, begin_idx, end_idx)
        pred_sentence, pred_detail = model.bert_correct(text)
        # compute recall
        if errors:
            recall_total_count += 1
            if errors and pred_detail and correction == pred_sentence:
                recall_right_count += 1

        # compute precision
        if correction == pred_sentence:
            right_count += 1
        else:
            err_data_dict = copy.deepcopy(data_dict)
            err_data_dict['pred_sentence'] = pred_sentence
            err_data_dict['pred_errors'] = str(pred_detail)
            res.append(err_data_dict)
            if verbose:
                print('truth:', text, errors)
                print('predict:', pred_sentence, pred_detail)
        total_count += 1

    if total_count > 0:
        right_rate = right_count / total_count
    if recall_total_count > 0:
        recall_rate = recall_right_count / recall_total_count
    print('right_rate:{}, right_count:{}, total_count:{};\n'
          'recall_rate:{},recall_right_count:{},recall_total_count:{}'.format(
              right_rate, right_count, total_count, recall_rate,
              recall_right_count, recall_total_count))
    save_json(res, output_eval_path)
Esempio n. 5
0
def eval_corpus500_by_model(correct_fn,
                            input_eval_path=eval_data_path,
                            verbose=True):
    """
    句级评估结果,设定需要纠错为正样本,无需纠错为负样本
    Args:
        correct_fn:
        input_eval_path:
        output_eval_path:
        verbose:

    Returns:
        Acc, Recall, F1
    """
    corpus = load_json(input_eval_path)
    TP = 0.0
    FP = 0.0
    FN = 0.0
    TN = 0.0
    total_num = 0
    start_time = time.time()
    for data_dict in corpus:
        src = data_dict.get('text', '')
        tgt = data_dict.get('correction', '')
        errors = data_dict.get('errors', [])

        #  pred_detail: list(wrong, right, begin_idx, end_idx)
        tgt_pred, pred_detail = correct_fn(src)
        if verbose:
            print()
            print('input  :', src)
            print('truth  :', tgt, errors)
            print('predict:', tgt_pred, pred_detail)

        # 负样本
        if src == tgt:
            # 预测也为负
            if tgt == tgt_pred:
                TN += 1
                print('right')
            # 预测为正
            else:
                FP += 1
                print('wrong')
        # 正样本
        else:
            # 预测也为正
            if tgt == tgt_pred:
                TP += 1
                print('right')
            # 预测为负
            else:
                FN += 1
                print('wrong')
        total_num += 1
    spend_time = time.time() - start_time
    acc = (TP + TN) / total_num
    precision = TP / (TP + FP) if TP > 0 else 0.0
    recall = TP / (TP + FN) if TP > 0 else 0.0
    f1 = 2 * precision * recall / (precision +
                                   recall) if precision + recall != 0 else 0
    print(
        f'Sentence Level: acc:{acc:.4f}, precision:{precision:.4f}, recall:{recall:.4f}, f1:{f1:.4f}, cost time:{spend_time:.2f} s'
    )
    return acc, precision, recall, f1