def get_bcmi_corpus(line, left_symbol='((', right_symbol='))'):
    """
    Split an annotated line into an erroneous sentence, its corrected
    sentence, and per-correction details.

    Each correction is wrapped in ``left_symbol`` / ``right_symbol`` and is
    placed directly after the wrong characters it replaces. The wrong span is
    taken to have the same length as the correction.

    Example input line:
        王老师心((性))格温和,态度和爱((蔼)),教学有方,得到了许多人的好平((评))。
    Result:
        error_sentence   = "王老师心格温和,态度和爱,教学有方,得到了许多人的好平。"
        correct_sentence = "王老师性格温和,态度和蔼,教学有方,得到了许多人的好评。"
        details          = [['心', '性', 3, 4], ['爱', '蔼', 11, 12],
                            ['平', '评', 26, 27]]

    NOTE(review): this file appears to contain a later definition with the
    same name that returns only two values; if so it shadows this one at
    import time -- confirm which variant callers expect.

    :param line: annotated text containing marked corrections
    :param left_symbol: marker opening a correction
    :param right_symbol: marker closing a correction
    :return: tuple ``(error_sentence, correct_sentence, details)`` where each
        detail is ``[error_item, correct_item, begin_idx, end_idx]`` and the
        indices refer to ``correct_sentence``. ``('', '', [])`` is returned
        when the line has no markers, the marker counts differ, or the
        markers are misordered/overlapping.
    """
    if left_symbol not in line or right_symbol not in line:
        return '', '', []
    left_ids = find_all_idx(line, left_symbol)
    right_ids = find_all_idx(line, right_symbol)
    if len(left_ids) != len(right_ids):
        return '', '', []

    left_len = len(left_symbol)
    right_len = len(right_symbol)
    error_parts = []
    correct_parts = []
    details = []
    correct_pos = 0  # length of the corrected sentence built so far
    begin = 0
    for left, right in zip(left_ids, right_ids):
        # A closing marker before its opening marker (or markers overlapping
        # the previous pair) cannot be interpreted; treat the line as
        # malformed, like the count-mismatch case above.
        if left < begin or right < left + left_len:
            return '', '', []
        correct_word = line[left + left_len:right]
        # Clamp at `begin`: a correction longer than the preceding text would
        # otherwise give a negative slice stop, which Python interprets as an
        # offset from the end of the string.
        wrong_start = max(begin, left - len(correct_word))
        kept = line[begin:wrong_start]

        error_parts.append(line[begin:left])
        correct_parts.append(kept)
        correct_parts.append(correct_word)

        # Record offsets while building instead of searching afterwards, so
        # repeated or earlier-occurring words still map to the right span.
        idx = correct_pos + len(kept)
        end_idx = idx + len(correct_word)
        details.append([line[wrong_start:left], correct_word, idx, end_idx])

        correct_pos = end_idx
        begin = right + right_len

    tail = line[begin:]
    error_parts.append(tail)
    correct_parts.append(tail)
    return ''.join(error_parts), ''.join(correct_parts), details
def get_bcmi_corpus(line, left_symbol='((', right_symbol='))'):
    """
    Split an annotated line into an erroneous sentence and its corrected
    sentence (an encoder/decoder pair).

    Each correction is wrapped in ``left_symbol`` / ``right_symbol`` and is
    placed directly after the wrong characters it replaces. The wrong span is
    taken to have the same length as the correction.

    Example input line:
        王老师心((性))格温和,态度和爱((蔼)),教学有方,得到了许多人的好平((评))。
    Result:
        error_sentence   = "王老师心格温和,态度和爱,教学有方,得到了许多人的好平。"
        correct_sentence = "王老师性格温和,态度和蔼,教学有方,得到了许多人的好评。"

    NOTE(review): this file appears to contain an earlier definition with the
    same name that also returns a ``details`` list; this later definition
    would shadow it at import time -- confirm which variant callers expect.

    :param line: annotated text containing marked corrections
    :param left_symbol: marker opening a correction
    :param right_symbol: marker closing a correction
    :return: tuple ``(error_sentence, correct_sentence)``; ``('', '')`` when
        the line has no markers, the marker counts differ, or the markers are
        misordered/overlapping.
    """
    if left_symbol not in line or right_symbol not in line:
        return '', ''
    left_ids = find_all_idx(line, left_symbol)
    right_ids = find_all_idx(line, right_symbol)
    if len(left_ids) != len(right_ids):
        return '', ''

    left_len = len(left_symbol)
    right_len = len(right_symbol)
    error_parts = []
    correct_parts = []
    begin = 0
    for left, right in zip(left_ids, right_ids):
        # A closing marker before its opening marker (or markers overlapping
        # the previous pair) cannot be interpreted; treat the line as
        # malformed, like the count-mismatch case above.
        if left < begin or right < left + left_len:
            return '', ''
        correct_word = line[left + left_len:right]
        # Clamp at `begin`: a correction longer than the preceding text would
        # otherwise give a negative slice stop, which Python interprets as an
        # offset from the end of the string.
        wrong_start = max(begin, left - len(correct_word))

        error_parts.append(line[begin:left])
        correct_parts.append(line[begin:wrong_start])
        correct_parts.append(correct_word)
        begin = right + right_len

    tail = line[begin:]
    error_parts.append(tail)
    correct_parts.append(tail)
    return ''.join(error_parts), ''.join(correct_parts)