Example #1
def build_eval_corpus():
    bcmi_path = os.path.join(pwd_path, '../data/cn/bcmi.txt')
    clp_path = os.path.join(pwd_path, '../data/cn/clp14_C1.pkl')
    sighan_path = os.path.join(pwd_path, '../data/cn/sighan15_A2.pkl')
    cged_path = os.path.join(pwd_path, '../data/cn/CGED/CGED16_HSK_TrainingSet.xml')

    char_error_path = os.path.join(pwd_path, './bcmi_corpus.json')
    build_bcmi_corpus(bcmi_path, char_error_path)
    char_errors = load_json(char_error_path)

    word_error_path = os.path.join(pwd_path, './sighan_corpus.json')
    build_sighan_corpus(sighan_path, word_error_path)
    word_errors = load_json(word_error_path)

    # CLP14 shares the SIGHAN pickle format, so the same builder is reused
    grammar_error_path = os.path.join(pwd_path, './clp_corpus.json')
    build_sighan_corpus(clp_path, grammar_error_path)
    grammar_errors = load_json(grammar_error_path)

    no_error_path = os.path.join(pwd_path, './noerror_corpus.json')
    build_cged_no_error_corpus(cged_path, no_error_path)
    no_errors = load_json(no_error_path)

    # 100 + 100 + 100 + 200 = 500 evaluation sentences
    corpus = sample(char_errors, 100) + sample(word_errors, 100) + sample(grammar_errors, 100) + sample(no_errors, 200)
    save_json(corpus, eval_data_path)
    print('saved eval corpus:', eval_data_path)
    # clean up the intermediate corpus files
    os.remove(char_error_path)
    os.remove(word_error_path)
    os.remove(grammar_error_path)
    os.remove(no_error_path)
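
All of the examples on this page lean on module-level helpers such as pwd_path, eval_data_path, load_json, save_json, load_pkl and sample (random.sample), which are defined elsewhere in the project. As a point of reference, a minimal sketch of the two JSON helpers might look like the following; this is an assumed stand-in, not the project's actual implementation.

# Illustrative stand-ins for the JSON helpers used throughout these examples.
import json

def load_json(path, encoding='utf-8'):
    # Read a JSON file into a Python object.
    with open(path, 'r', encoding=encoding) as f:
        return json.load(f)

def save_json(data, path, encoding='utf-8'):
    # Write a Python object as JSON, keeping non-ASCII characters readable.
    with open(path, 'w', encoding=encoding) as f:
        json.dump(data, f, ensure_ascii=False, indent=2)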
Example #2
def build_cged_no_error_corpus(data_path, output_path, limit_size=500):
    corpus = []
    print('Parsing data from %s' % data_path)
    dom_tree = minidom.parse(data_path)
    docs = dom_tree.documentElement.getElementsByTagName('DOC')
    count = 0
    for doc in docs:
        # Read the original text
        text = doc.getElementsByTagName('TEXT')[0]. \
            childNodes[0].data.strip()
        # Read the corrected text
        correction = doc.getElementsByTagName('CORRECTION')[0]. \
            childNodes[0].data.strip()

        if correction:
            count += 1
            # use the correction as both input and target so the sample
            # carries no errors
            line_dict = {
                "text": correction,
                "correction": correction,
                "errors": []
            }
            corpus.append(line_dict)
            # '>=' keeps the corpus at exactly limit_size entries
            if count >= limit_size:
                break
    save_json(corpus, output_path)
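
build_cged_no_error_corpus only reads DOC, TEXT and CORRECTION nodes, so its expectations can be checked against a tiny in-memory document. The sample XML below is hypothetical, reconstructed from those tag names; real CGED files carry more markup (for example the ERROR elements read by parse_cged_file further down).

from xml.dom import minidom

# Hypothetical document in the shape build_cged_no_error_corpus expects.
sample = """<ROOT>
  <DOC>
    <TEXT id="doc-1">this sentence has an eror</TEXT>
    <CORRECTION>this sentence has an error</CORRECTION>
  </DOC>
</ROOT>"""

dom_tree = minidom.parseString(sample)
for doc in dom_tree.documentElement.getElementsByTagName('DOC'):
    correction = doc.getElementsByTagName('CORRECTION')[0].childNodes[0].data.strip()
    print(correction)  # this sentence has an error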
Example #3
def build_sighan_corpus(data_path, output_path):
    corpus = []
    sighan_data = load_pkl(data_path)
    for error_sentence, error_details in sighan_data:
        if not error_details:
            continue
        ids = []
        correct_sentence = error_sentence
        for detail in error_details:
            # detail: (1-based index, wrong word, right word)
            idx = detail[0]
            error_word = detail[1]
            right_word = detail[2]
            ids.append(idx - 1)  # convert to 0-based
            # apply every correction, not only the last one
            correct_sentence = correct_sentence.replace(error_word, right_word)
        details = []
        for i in ids:
            # (wrong char, right char, begin_idx, end_idx); assumes the
            # correction keeps the sentence length unchanged
            details.append([error_sentence[i], correct_sentence[i], i, i + 1])
        line_dict = {
            "text": error_sentence,
            "correction": correct_sentence,
            "errors": details
        }
        corpus.append(line_dict)
    save_json(corpus, output_path)
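
The pickle consumed by build_sighan_corpus is expected to hold (error_sentence, error_details) pairs, where each detail carries a 1-based character index plus the wrong and right text. A toy input in that assumed shape can be pickled and fed straight to the function (file names here are arbitrary):

import os
import pickle
import tempfile

# One sentence with a single character error at 1-based position 7 ('i' -> 'e').
sighan_like = [
    ("turn lift at the corner", [(7, "i", "e")]),
]

tmp_dir = tempfile.mkdtemp()
pkl_path = os.path.join(tmp_dir, 'toy.pkl')
with open(pkl_path, 'wb') as f:
    pickle.dump(sighan_like, f)

# build_sighan_corpus(pkl_path, os.path.join(tmp_dir, 'toy.json'))
# would emit: [{"text": "turn lift at the corner",
#               "correction": "turn left at the corner",
#               "errors": [["i", "e", 6, 7]]}]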
Example #4
def eval_corpus500_by_macbert(input_eval_path=eval_data_path, output_eval_path='', verbose=True):
    from pycorrector.macbert.macbert_corrector import MacBertCorrector
    model = MacBertCorrector()
    res = []
    corpus = load_json(input_eval_path)
    total_count = 0
    right_count = 0
    right_rate = 0.0
    recall_rate = 0.0
    recall_right_count = 0
    recall_total_count = 0
    start_time = time.time()
    for data_dict in corpus:
        text = data_dict.get('text', '')
        correction = data_dict.get('correction', '')
        errors = data_dict.get('errors', [])

        # pred_detail: list of (wrong, right, begin_idx, end_idx)
        pred_sentence, pred_detail = model.macbert_correct(text)
        # sentence-level recall: fraction of erroneous sentences fully corrected
        if errors:
            recall_total_count += 1
            if pred_detail and correction == pred_sentence:
                recall_right_count += 1

        # sentence-level accuracy: exact match over all sentences
        if correction == pred_sentence:
            right_count += 1
            print("\nright:")
            print('input  :', text, errors)
            print('predict:', pred_sentence, pred_detail)
        else:
            err_data_dict = copy.deepcopy(data_dict)
            err_data_dict['pred_sentence'] = pred_sentence
            err_data_dict['pred_errors'] = str(pred_detail)
            res.append(err_data_dict)
            if verbose:
                print("\nwrong:")
                print('input  :', text)
                print('truth  :', correction, errors)
                print('predict:', pred_sentence, pred_detail)
        total_count += 1
    spend_time = time.time() - start_time
    if total_count > 0:
        right_rate = right_count / total_count
    if recall_total_count > 0:
        recall_rate = recall_right_count / recall_total_count
    print('right_rate:{}, right_count:{}, total_count:{};\n'
          'recall_rate:{}, recall_right_count:{}, recall_total_count:{}, spend_time:{} s'.format(
              right_rate, right_count, total_count,
              recall_rate, recall_right_count, recall_total_count, spend_time))
    if output_eval_path:
        save_json(res, output_eval_path)
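
Both evaluators report sentence-level numbers: right_rate is exact-match accuracy over all sentences, while recall_rate is the share of error-carrying sentences that were fully corrected. The bookkeeping can be illustrated without a model by substituting precomputed predictions (toy data, hypothetical values; the pred_detail check is omitted for brevity):

# Toy version of the metric bookkeeping above; predictions are hard-coded.
samples = [
    # (gold correction, had_errors, predicted sentence)
    ("good text", True, "good text"),    # erroneous input, fully fixed
    ("also fine", False, "also fine"),   # clean input, left untouched
    ("hard case", True, "hard casa"),    # erroneous input, missed
]
right_count = sum(1 for gold, _, pred in samples if gold == pred)
with_errors = [(g, e, p) for g, e, p in samples if e]
recall_right = sum(1 for gold, _, pred in with_errors if gold == pred)
print('right_rate :', right_count / len(samples))       # 2/3 ~ 0.667
print('recall_rate:', recall_right / len(with_errors))  # 1/2 = 0.5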
Example #5
def build_bcmi_corpus(data_path, output_path):
    corpus = []
    with open(data_path, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            # get_bcmi_corpus parses one annotated line into the erroneous
            # sentence, its correction and the error details; lines that
            # yield no error sentence are skipped
            error_sentence, correct_sentence, details = get_bcmi_corpus(line)
            if not error_sentence:
                continue
            line_dict = {"text": error_sentence, "correction": correct_sentence, "errors": details}
            corpus.append(line_dict)
    save_json(corpus, output_path)
Example #6
def eval_corpus_by_bert(input_eval_path=eval_data_path,
                        output_eval_path=output_eval_error_path,
                        verbose=True):
    from pycorrector.bert.bert_corrector import BertCorrector
    model = BertCorrector()
    res = []
    corpus = load_json(input_eval_path)
    total_count = 0
    right_count = 0
    right_rate = 0.0
    recall_rate = 0.0
    recall_right_count = 0
    recall_total_count = 0
    for data_dict in corpus:
        text = data_dict.get('text', '')
        correction = data_dict.get('correction', '')
        errors = data_dict.get('errors', [])

        # pred_detail: list of (wrong, right, begin_idx, end_idx)
        pred_sentence, pred_detail = model.bert_correct(text)
        # sentence-level recall: fraction of erroneous sentences fully corrected
        if errors:
            recall_total_count += 1
            if pred_detail and correction == pred_sentence:
                recall_right_count += 1

        # sentence-level accuracy: exact match over all sentences
        if correction == pred_sentence:
            right_count += 1
        else:
            err_data_dict = copy.deepcopy(data_dict)
            err_data_dict['pred_sentence'] = pred_sentence
            err_data_dict['pred_errors'] = str(pred_detail)
            res.append(err_data_dict)
            if verbose:
                print('input  :', text)
                print('truth  :', correction, errors)
                print('predict:', pred_sentence, pred_detail)
        total_count += 1

    if total_count > 0:
        right_rate = right_count / total_count
    if recall_total_count > 0:
        recall_rate = recall_right_count / recall_total_count
    print('right_rate:{}, right_count:{}, total_count:{};\n'
          'recall_rate:{}, recall_right_count:{}, recall_total_count:{}'.format(
              right_rate, right_count, total_count, recall_rate,
              recall_right_count, recall_total_count))
    save_json(res, output_eval_path)
Example #7
def parse_cged_file(file_dir):
    rst = []
    seen_pairs = set()
    for fn in os.listdir(file_dir):
        if not fn.endswith('.xml'):
            continue
        path = os.path.join(file_dir, fn)
        print('Parsing data from %s' % path)

        dom_tree = minidom.parse(path)
        docs = dom_tree.documentElement.getElementsByTagName('DOC')
        for doc in docs:
            doc_id = ''
            text = ''
            for i in doc.getElementsByTagName('TEXT'):
                doc_id = i.getAttribute('id')
                # Read the original text
                text = i.childNodes[0].data.strip()
            # Read the corrected text
            correction = doc.getElementsByTagName('CORRECTION')[0]. \
                childNodes[0].data.strip()
            # Collect every 1-based offset covered by an ERROR span
            wrong_ids = []
            for error in doc.getElementsByTagName('ERROR'):
                start_off = error.getAttribute('start_off')
                end_off = error.getAttribute('end_off')
                if start_off and end_off:
                    wrong_ids.extend(range(int(start_off), int(end_off) + 1))
            source = text
            target = correction

            # rst holds dicts, so deduplicate on the (source, target) pair
            # itself rather than testing membership against rst
            pair = (source, target)
            if pair not in seen_pairs:
                seen_pairs.add(pair)
                rst.append({
                    'id': doc_id,
                    'original_text': source,
                    'wrong_ids': wrong_ids,
                    'correct_text': target
                })
    save_json(rst, os.path.join(pwd_path, 'output/cged.json'))
    return rst
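
Each record emitted by parse_cged_file pairs a document's raw text with its correction and lists every 1-based character offset covered by an ERROR span. For a single error with start_off=3 and end_off=4, a record would take this shape (all field values below are illustrative, not real CGED data):

# Shape of one parse_cged_file record (values are hypothetical):
record = {
    'id': 'doc-1',                     # the TEXT element's id attribute
    'original_text': 'text as written by the learner',
    'wrong_ids': [3, 4],               # 1-based offsets, start_off..end_off inclusive
    'correct_text': 'text as corrected by the annotator',
}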
Example #8
def build_eval_corpus(output_eval_path=eval_data_path):
    """
    生成评估样本集,抽样分布可修改
    当前已经生成评估集,可以修改代码生成自己的样本分布
    :param output_eval_path:
    :return: json file
    """
    bcmi_path = os.path.join(pwd_path, '../data/cn/bcmi.txt')
    clp_path = os.path.join(pwd_path, '../data/cn/clp14_C1.pkl')
    sighan_path = os.path.join(pwd_path, '../data/cn/sighan15_A2.pkl')
    cged_path = os.path.join(pwd_path,
                             '../data/cn/CGED/CGED16_HSK_TrainingSet.xml')

    char_error_path = os.path.join(pwd_path, './bcmi_corpus.json')
    build_bcmi_corpus(bcmi_path, char_error_path)
    char_errors = load_json(char_error_path)

    word_error_path = os.path.join(pwd_path, './sighan_corpus.json')
    build_sighan_corpus(sighan_path, word_error_path)
    word_errors = load_json(word_error_path)

    # CLP14 shares the SIGHAN pickle format, so the same builder is reused
    grammar_error_path = os.path.join(pwd_path, './clp_corpus.json')
    build_sighan_corpus(clp_path, grammar_error_path)
    grammar_errors = load_json(grammar_error_path)

    no_error_path = os.path.join(pwd_path, './noerror_corpus.json')
    build_cged_no_error_corpus(cged_path, no_error_path)
    no_errors = load_json(no_error_path)

    # 100 + 100 + 100 + 200 = 500 evaluation sentences
    corpus = sample(char_errors, 100) + sample(word_errors, 100) + sample(
        grammar_errors, 100) + sample(no_errors, 200)
    save_json(corpus, output_eval_path)
    print('saved eval corpus:', output_eval_path)
    # clean up the intermediate corpus files
    os.remove(char_error_path)
    os.remove(word_error_path)
    os.remove(grammar_error_path)
    os.remove(no_error_path)
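
Sampling 100 character-error, 100 word-error, 100 grammar-error and 200 error-free sentences yields the 500-sentence set referenced by eval_corpus500_by_macbert. A typical run would chain the two steps; the output file name below is arbitrary:

# Build the 500-sentence evaluation set, then score it with MacBert.
build_eval_corpus(output_eval_path=eval_data_path)
eval_corpus500_by_macbert(input_eval_path=eval_data_path,
                          output_eval_path='macbert_wrong_cases.json')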
Example #9
def main():
    # Note: the training data is very small; use it only to smoke-test models
    # parse_cged_file(os.path.join(pwd_path, '../data/cn/CGED/'))
    sighan15_dir = os.path.join(pwd_path, '../data/cn/sighan_2015/')
    rst_items = []
    test_lst = proc_test_set(sighan15_dir)
    for item in read_data(sighan15_dir):
        rst_items += proc_item(item)

    # Split the remaining items into train and dev sets
    print('data_size:', len(rst_items))
    train_lst, dev_lst = train_test_split(rst_items, test_size=0.1, random_state=42)
    save_json(train_lst, os.path.join(pwd_path, 'output/train.json'))
    save_json(dev_lst, os.path.join(pwd_path, 'output/dev.json'))
    save_json(test_lst, os.path.join(pwd_path, 'output/test.json'))
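
train_test_split here is scikit-learn's; with test_size=0.1 and a fixed random_state the 90/10 train/dev split is reproducible across runs, while the official SIGHAN test set is kept separate via proc_test_set. A minimal illustration of the reproducible split:

from sklearn.model_selection import train_test_split

items = list(range(10))
train, dev = train_test_split(items, test_size=0.1, random_state=42)
print(len(train), len(dev))  # 9 1
# Re-running with the same random_state returns the identical split.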