def classify_pair_corpus(bert_model):
    """Load the sentence-pair corpus at ``path_webank_sim`` and split it 90/10.

    The file is read with ``txtRead`` using GBK encoding. The first row is
    skipped (presumably a header -- confirm against the data file) and every
    remaining non-blank row is split on commas:

    * the first field is question 1;
    * the last field is the integer label;
    * everything in between is concatenated (without the commas) into
      question 2.

    Both questions go through ``text_preprocess``. A label of 1 becomes the
    one-hot row ``[0, 1]``; any other integer becomes ``[1, 0]``. Samples are
    shuffled with the stdlib ``random`` module before the split, so seeding
    ``random`` makes the split reproducible.

    Args:
        bert_model: Object exposing ``process_pair(pairs)``, which must return
            three values (unpacked here as input ids, input masks and input
            type ids).

    Returns:
        A 10-tuple ``(train_x, train_y, test_x, test_y, input_ids,
        input_masks, input_type_ids, input_ids2, input_masks2,
        input_type_ids2)``. The first three ``input_*`` values come from
        ``process_pair(train_x)``; the ``*2`` values come from
        ``process_pair(test_x)``.

    Raises:
        ValueError: If the last field of a non-blank row is not an integer.
    """
    # Data preprocessing. Imports stay function-local, as in the original.
    import random

    from conf.path_config import path_webank_sim
    from utils.text_tools import text_preprocess, txtRead

    rows = txtRead(path_webank_sim, encodeType='gbk')

    questions = []
    labels = []
    for row in rows[1:]:
        # A blank row (e.g. a trailing empty line) would otherwise reach
        # int('') below and abort the whole load.
        if not row.strip():
            continue
        fields = row.split(',')
        q_1 = fields[0]
        # Commas inside the second question split it into several fields;
        # glue them back together (the commas themselves are dropped).
        q_2 = "".join(fields[1:-1])
        questions.append([text_preprocess(q_1), text_preprocess(q_2)])
        # int() tolerates surrounding whitespace such as a trailing newline.
        labels.append([0, 1] if int(fields[-1]) == 1 else [1, 0])

    questions = np.array(questions)
    labels = np.array(labels)

    # Apply one shared permutation so questions and labels stay aligned.
    index = list(range(len(labels)))
    random.shuffle(index)
    questions = questions[index]
    labels = labels[index]

    len_train = int(len(labels) * 0.9)
    train_x, train_y = questions[:len_train], labels[:len_train]
    test_x, test_y = questions[len_train:], labels[len_train:]

    input_ids, input_masks, input_type_ids = bert_model.process_pair(train_x)
    input_ids2, input_masks2, input_type_ids2 = bert_model.process_pair(test_x)

    return (train_x, train_y, test_x, test_y,
            input_ids, input_masks, input_type_ids,
            input_ids2, input_masks2, input_type_ids2)
def classify_pair_corpus_webank(bert_model, path_webank):
    """Load a sentence-pair corpus from ``path_webank`` without splitting it.

    The file is read with ``txtRead`` using UTF-8 encoding. The first row is
    skipped (presumably a header -- confirm against the data file) and every
    remaining non-blank row is split on commas:

    * the first field is question 1;
    * the last field is the integer label;
    * everything in between is concatenated (without the commas) into
      question 2.

    Both questions go through ``text_preprocess``. A label of 1 becomes the
    one-hot row ``[0, 1]``; any other integer becomes ``[1, 0]``. Row order is
    preserved (no shuffling).

    Args:
        bert_model: Object exposing ``process_pair(pairs)``, which must return
            three values (unpacked here as input ids, input masks and input
            type ids).
        path_webank: Path of the corpus file, passed straight to ``txtRead``.

    Returns:
        A 5-tuple ``(questions, labels, input_ids, input_masks,
        input_type_ids)`` where ``questions`` and ``labels`` are NumPy arrays
        and the last three values are the result of
        ``bert_model.process_pair(questions)``.

    Raises:
        ValueError: If the last field of a non-blank row is not an integer.
    """
    # Data preprocessing. Import stays function-local, as in the original.
    from utils.text_tools import text_preprocess, txtRead

    rows = txtRead(path_webank, encodeType='utf-8')

    questions = []
    labels = []
    for row in rows[1:]:
        # A blank row (e.g. a trailing empty line) would otherwise reach
        # int('') below and abort the whole load.
        if not row.strip():
            continue
        fields = row.split(',')
        q_1 = fields[0]
        # Commas inside the second question split it into several fields;
        # glue them back together (the commas themselves are dropped).
        q_2 = "".join(fields[1:-1])
        questions.append([text_preprocess(q_1), text_preprocess(q_2)])
        # int() tolerates surrounding whitespace such as a trailing newline.
        labels.append([0, 1] if int(fields[-1]) == 1 else [1, 0])

    questions = np.array(questions)
    labels = np.array(labels)

    input_ids, input_masks, input_type_ids = bert_model.process_pair(questions)
    return questions, labels, input_ids, input_masks, input_type_ids