Example #1
0
def get_bcmi_corpus(line, left_symbol='((', right_symbol='))'):
    """
    Split one annotated BCMI line into an erroneous and a corrected sentence.

    In the annotation, the correct text is wrapped in ``left_symbol`` /
    ``right_symbol`` and placed directly after the wrong text it replaces.
    The wrong text is taken to be as long as the correction.

    :param line: annotated text, e.g.
        "王老师心((性))格温和,态度和爱((蔼)),教学有方,得到了许多人的好平((评))。"
    :param left_symbol: marker that opens a correction
    :param right_symbol: marker that closes a correction
    :return: tuple ``(error_sentence, correct_sentence, details)``. For the
        example above the sentences are
        "王老师心格温和,态度和爱,教学有方,得到了许多人的好平。" and
        "王老师性格温和,态度和蔼,教学有方,得到了许多人的好评。".
        ``details`` holds one ``[error_item, correct_item, begin_idx, end_idx]``
        list per correction, where the indices locate ``correct_item`` inside
        ``correct_sentence``. ``('', '', [])`` is returned when either marker
        is missing or the numbers of opening and closing markers differ.
    """
    if left_symbol not in line or right_symbol not in line:
        return '', '', []

    left_ids = find_all_idx(line, left_symbol)
    right_ids = find_all_idx(line, right_symbol)
    if len(left_ids) != len(right_ids):
        return '', '', []

    left_len = len(left_symbol)
    right_len = len(right_symbol)
    error_parts = []
    correct_parts = []
    details = []
    begin = 0        # start of the not-yet-consumed part of ``line``
    correct_pos = 0  # length of the corrected sentence built so far
    for left, right in zip(left_ids, right_ids):
        correct_len = right - left - left_len
        correct_word = line[left + left_len:right]
        # The wrong text sits right before the marker. Clamp its start to
        # ``begin`` so a correction longer than the preceding text cannot
        # produce a negative slice end that wraps around the line.
        unchanged_end = max(begin, left - correct_len)
        unchanged = line[begin:unchanged_end]

        error_parts.append(line[begin:left])
        correct_parts.append(unchanged)
        correct_parts.append(correct_word)

        # Record the position while building the sentence; searching for the
        # word afterwards would hit an earlier identical occurrence.
        idx = correct_pos + len(unchanged)
        end_idx = idx + len(correct_word)
        details.append([line[unchanged_end:left], correct_word, idx, end_idx])

        correct_pos = end_idx
        begin = right + right_len

    tail = line[begin:]
    error_parts.append(tail)
    correct_parts.append(tail)
    return ''.join(error_parts), ''.join(correct_parts), details
Example #2
0
def get_bcmi_corpus(line, left_symbol='((', right_symbol='))'):
    """
    Split one annotated BCMI line into an erroneous and a corrected sentence.

    In the annotation, the correct text is wrapped in ``left_symbol`` /
    ``right_symbol`` and placed directly after the wrong text it replaces.
    The wrong text is taken to be as long as the correction.

    :param line: annotated text, e.g.
        "王老师心((性))格温和,态度和爱((蔼)),教学有方,得到了许多人的好平((评))。"
    :param left_symbol: marker that opens a correction
    :param right_symbol: marker that closes a correction
    :return: tuple ``(error_sentence, correct_sentence)``. For the example
        above this is
        ("王老师心格温和,态度和爱,教学有方,得到了许多人的好平。",
        "王老师性格温和,态度和蔼,教学有方,得到了许多人的好评。").
        ``('', '')`` is returned when either marker is missing or the numbers
        of opening and closing markers differ.
    """
    if left_symbol not in line or right_symbol not in line:
        return '', ''

    left_ids = find_all_idx(line, left_symbol)
    right_ids = find_all_idx(line, right_symbol)
    if len(left_ids) != len(right_ids):
        return '', ''

    left_len = len(left_symbol)
    right_len = len(right_symbol)
    error_parts = []
    correct_parts = []
    begin = 0  # start of the not-yet-consumed part of ``line``
    for left, right in zip(left_ids, right_ids):
        correct_len = right - left - left_len
        # Clamp to ``begin`` so a correction longer than the preceding text
        # cannot produce a negative slice end that wraps around the line.
        unchanged_end = max(begin, left - correct_len)

        error_parts.append(line[begin:left])
        correct_parts.append(line[begin:unchanged_end])
        correct_parts.append(line[left + left_len:right])
        begin = right + right_len

    tail = line[begin:]
    error_parts.append(tail)
    correct_parts.append(tail)
    return ''.join(error_parts), ''.join(correct_parts)