def sentence2idx(self, text):
    """Convert a sentence to a fixed-length list of token indices.

    The input is reduced to its Chinese characters, split according to
    ``self.level_type`` ('char', 'word' or 'ngram'), then mapped to ids
    via ``self.token2idx`` with '[UNK]' as the out-of-vocabulary fallback,
    and finally right-padded with '[PAD]' (or truncated) to ``self.len_max``.

    :param text: raw input sentence (any type; coerced to str)
    :return: list[int] of length ``self.len_max``
    :raises RuntimeError: if ``self.level_type`` is not one of the three
        supported values
    """
    text = extract_chinese(str(text).upper())
    if self.level_type == 'char':
        text = list(text)
    elif self.level_type == 'word':
        text = list(jieba.cut(text, cut_all=False, HMM=False))
    elif self.level_type == 'ngram':
        text = get_ngram(text, ns=self.ngram_ns)
    else:
        # BUG FIX: the old message omitted 'ngram', which the branch
        # above clearly supports.
        raise RuntimeError(
            "your input level_type is wrong, it must be 'word', 'char' or 'ngram'")
    unk = self.token2idx['[UNK]']
    pad = self.token2idx['[PAD]']
    # Truncate first, then pad — equivalent to the original len_leave logic.
    text_index = [self.token2idx.get(token, unk) for token in text[:self.len_max]]
    text_index += [pad] * (self.len_max - len(text_index))
    return text_index
def sentence2idx(self, text, second_text=None):
    """Encode one sentence (optionally a sentence pair) with the tokenizer.

    :param text: first sentence; reduced to its Chinese characters and
        upper-cased before encoding.
    :param second_text: optional second sentence for pair tasks, passed
        through unchanged.  # NOTE(review): not cleaned like `text` — confirm intended
    :return: [input_id, input_type_id], each padded/truncated to
        ``self.len_max`` by the tokenizer
    """
    text = extract_chinese(str(text).upper())
    # Removed a redundant `text = str(text).upper()` re-assignment that
    # followed: the value is already an upper-cased str here (a no-op).
    input_id, input_type_id = self.tokenizer.encode(
        first=text, second=second_text, max_len=self.len_max)
    return [input_id, input_type_id]
def sentence2idx(self, text):
    """Encode a single sentence into BERT-style inputs.

    :param text: raw sentence; only its Chinese characters are kept and
        the text is upper-cased before encoding.
    :return: [input_id, input_type_id], each padded/truncated to
        ``self.len_max`` by the tokenizer
    """
    cleaned = extract_chinese(str(text).upper())
    ids, type_ids = self.tokenizer.encode(first=cleaned, max_len=self.len_max)
    return [ids, type_ids]
def sentence2idx(self, text):
    """Encode a sentence into XLNet-style model inputs.

    :param text: raw sentence; reduced to its Chinese characters and
        upper-cased before encoding.
    :return: [token_input, segment_input, memory_length_input] where
        token_input has shape (1, target_len), segment_input is an
        all-zero array of the same shape, and memory_length_input is
        a (1, 1) zero array.
    """
    cleaned = extract_chinese(str(text).upper())
    token_ids = self.tokenizer.encode(cleaned)
    # Truncate to target_len, then right-pad with zeros to exactly that length.
    token_ids = token_ids[:self.target_len]
    token_ids = token_ids + [0] * (self.target_len - len(token_ids))
    token_input = np.expand_dims(np.array(token_ids), axis=0)
    segment_input = np.zeros_like(token_input)
    memory_length_input = np.zeros((1, 1))
    return [token_input, segment_input, memory_length_input]
def sentence2idx(self, text):
    """Encode a sentence into XLNet-style inputs, with a mask when trainable.

    :param text: raw sentence; reduced to its Chinese characters and
        upper-cased before encoding.
    :return: [token_input, segment_input, memory_length_input] plus, when
        ``self.trainable`` is True, a mask_input of shape (1, target_len)
        with 1 at real-token positions and 0 at padding positions.
    """
    text = extract_chinese(str(text).upper())
    tokens = self.tokenizer.encode(text)
    # Number of real (non-pad) tokens after truncation. This must be
    # measured BEFORE padding below.
    seq_len = min(len(tokens), self.target_len)
    tokens = tokens + [0] * (self.target_len - len(tokens)) \
        if len(tokens) < self.target_len \
        else tokens[0:self.target_len]
    token_input = np.expand_dims(np.array(tokens), axis=0)
    segment_input = np.zeros_like(token_input)
    memory_length_input = np.zeros(
        (1, 1))  # np.array([[self.memory_len]])
    # BUG FIX: the old code built the mask from len(tokens) AFTER
    # padding/truncation, at which point len(tokens) == target_len, so
    # the mask was always all-ones and padding was treated as real input.
    masks = [1] * seq_len + [0] * (self.target_len - seq_len)
    mask_input = np.expand_dims(np.array(masks), axis=0)
    if self.trainable:
        return [
            token_input, segment_input, memory_length_input, mask_input
        ]
    else:
        return [token_input, segment_input, memory_length_input]
def pred_tet(path_hyper_parameter=path_hyper_parameters, path_test=None, rate=1.0):
    """Evaluate the trained sentence-similarity model on a test set.

    Loads the hyper-parameters and trained graph, predicts a label for
    each sentence pair in the test CSV, and prints a classification report.

    :param path_hyper_parameter: json path of the saved hyper-parameters
    :param path_test: str, path of test data; overrides the one in the
        hyper-parameters when given
    :param rate: float, fraction of the corpus to evaluate (from the front)
    :return: None
    """
    hyper_parameters = load_json(path_hyper_parameter)
    if path_test:  # test-data path supplied by the caller
        hyper_parameters['data']['test_data'] = path_test
    time_start = time.time()
    # graph init
    graph = Graph(hyper_parameters)
    print("graph init ok!")
    graph.load_model()
    print("graph load ok!")
    ra_ed = graph.word_embedding
    # data preprocessing
    pt = PreprocessSim(path_model_dir)
    data = pd.read_csv(hyper_parameters['data']['test_data'])
    sentence_1 = data["sentence1"].values.tolist()
    sentence_2 = data["sentence2"].values.tolist()
    labels = data["label"].values.tolist()
    sentence_1 = [extract_chinese(str(line1).upper()) for line1 in sentence_1]
    sentence_2 = [extract_chinese(str(line2).upper()) for line2 in sentence_2]
    labels = [extract_chinese(str(line3).upper()) for line3 in labels]
    # keep only the first `rate` fraction of the corpus
    len_rate = int(len(labels) * rate)
    sentence_1 = sentence_1[0:len_rate]
    sentence_2 = sentence_2[0:len_rate]
    labels = labels[0:len_rate]
    y_pred = []
    count = 0
    for i in range(len_rate):
        count += 1
        ques_embed = ra_ed.sentence2idx(text=sentence_1[i],
                                        second_text=sentence_2[i])
        # print(hyper_parameters['embedding_type'])
        if hyper_parameters['embedding_type'] in ['bert', 'albert']:
            # bert-style inputs: token ids + segment ids
            x_val_1 = np.array([ques_embed[0]])
            x_val_2 = np.array([ques_embed[1]])
            x_val = [x_val_1, x_val_2]
        # NOTE(review): for embedding types other than bert/albert,
        # x_val is never assigned and the predict below would raise
        # NameError — confirm only bert/albert are expected here.
        # prediction
        pred = graph.predict(x_val)
        pre = pt.prereocess_idx(pred[0])
        label_pred = pre[0][0][0]
        if count % 1000 == 0:
            print(label_pred)
        y_pred.append(label_pred)
    print("data pred ok!")
    # map gold labels and predictions to int indices
    index_y = [pt.l2i_i2l['l2i'][i] for i in labels]
    index_pred = [pt.l2i_i2l['l2i'][i] for i in y_pred]
    target_names = [
        pt.l2i_i2l['i2l'][str(i)] for i in list(set((index_pred + index_y)))
    ]
    # evaluation
    report_predict = classification_report(index_y,
                                           index_pred,
                                           target_names=target_names,
                                           digits=9)
    print(report_predict)
    print("耗时:" + str(time.time() - time_start))