Code example #1
 def sentence2idx(self, text):
     # normalize: uppercase, keep only Chinese characters
     text = extract_chinese(str(text).upper())
     if self.level_type == 'char':
         text = list(text)
     elif self.level_type == 'word':
         text = list(jieba.cut(text, cut_all=False, HMM=False))
     elif self.level_type == 'ngram':
         text = get_ngram(text, ns=self.ngram_ns)
     else:
         raise RuntimeError(
             "your input level_type is wrong, it must be 'char', 'word' or 'ngram'")
     # look up token ids ([UNK] for out-of-vocabulary), then pad with [PAD]
     # or truncate so the result has exactly len_max entries
     len_leave = self.len_max - len(text)
     if len_leave >= 0:
         text_index = [
             self.token2idx[text_char]
             if text_char in self.token2idx else self.token2idx['[UNK]']
             for text_char in text
         ] + [self.token2idx['[PAD]'] for _ in range(len_leave)]
     else:
         text_index = [
             self.token2idx[text_char]
             if text_char in self.token2idx else self.token2idx['[UNK]']
             for text_char in text[0:self.len_max]
         ]
     return text_index
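The three branches above differ only in how the text is tokenized; the indexing contract is the same: look each token up, fall back to [UNK], then pad with [PAD] or truncate to a fixed length. A minimal standalone sketch of that contract, with a toy vocabulary standing in for self.token2idx and self.len_max:

# toy vocabulary and length; stand-ins for self.token2idx / self.len_max
token2idx = {'[PAD]': 0, '[UNK]': 1, '我': 2, '爱': 3}
len_max = 6

def to_fixed_ids(tokens):
    ids = [token2idx.get(t, token2idx['[UNK]']) for t in tokens]
    ids = ids[:len_max]                                 # truncate if too long
    ids += [token2idx['[PAD]']] * (len_max - len(ids))  # pad if too short
    return ids

print(to_fixed_ids(list('我爱你')))  # [2, 3, 1, 0, 0, 0] ('你' maps to [UNK])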
Code example #2
 def sentence2idx(self, text, second_text=None):
     text = extract_chinese(str(text).upper())
     # the tokenizer builds token ids and segment ids for the sentence pair
     input_id, input_type_id = self.tokenizer.encode(first=text,
                                                     second=second_text,
                                                     max_len=self.len_max)
     return [input_id, input_type_id]
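Example #2 handles sentence pairs. Assuming a keras-bert style tokenizer (which the encode(first=..., second=..., max_len=...) signature suggests, though the exact class is not shown here), the pair is wrapped as [CLS] first [SEP] second [SEP], padded or truncated to max_len, and returned as token ids plus segment ids (0 for the first sentence, 1 for the second). A hedged sketch with a toy vocabulary:

# assumes keras-bert's Tokenizer; the vocabulary here is a toy stand-in
from keras_bert import Tokenizer

token_dict = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3, '我': 4, '爱': 5}
tokenizer = Tokenizer(token_dict)

input_id, input_type_id = tokenizer.encode(first='我爱', second='我', max_len=10)
print(input_id)       # [CLS] 我 爱 [SEP] 我 [SEP] plus padding, as ids
print(input_type_id)  # segment ids: 0 for the first sentence, 1 for the second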
Code example #3
 def sentence2idx(self, text):
     text = extract_chinese(str(text).upper())
     # single-sentence encoding: no second segment
     input_id, input_type_id = self.tokenizer.encode(first=text,
                                                     max_len=self.len_max)
     # input_mask = [0 if ids == 0 else 1 for ids in input_id]
     # return input_id, input_type_id, input_mask
     return [input_id, input_type_id]
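The commented-out lines in example #3 hint at how an attention mask can be recovered from the ids alone, under the assumption that the pad id is 0. A one-line check of that derivation (note it would also mask any real token whose id happened to be 0):

# derive a 0/1 mask from token ids, assuming pad id 0
input_id = [2, 4, 5, 3, 0, 0]
input_mask = [0 if ids == 0 else 1 for ids in input_id]
print(input_mask)  # [1, 1, 1, 1, 0, 0]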
Code example #4
 def sentence2idx(self, text):
     text = extract_chinese(str(text).upper())
     tokens = self.tokenizer.encode(text)
     # pad with 0 or truncate so len(tokens) == target_len
     if len(tokens) < self.target_len:
         tokens = tokens + [0] * (self.target_len - len(tokens))
     else:
         tokens = tokens[0:self.target_len]
     token_input = np.expand_dims(np.array(tokens), axis=0)
     segment_input = np.zeros_like(token_input)
     memory_length_input = np.zeros((1, 1))
     return [token_input, segment_input, memory_length_input]
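Unlike examples #1 to #3, this variant returns batched numpy arrays: a (1, target_len) row of token ids, a same-shaped all-zero segment array, and a (1, 1) memory length, which matches an XLNet-style Keras input. A quick shape check with target_len as a stand-in for self.target_len:

import numpy as np

target_len = 8
tokens = [4, 5, 6]  # toy token ids
tokens = tokens + [0] * (target_len - len(tokens))

token_input = np.expand_dims(np.array(tokens), axis=0)
segment_input = np.zeros_like(token_input)
memory_length_input = np.zeros((1, 1))
print(token_input.shape, segment_input.shape, memory_length_input.shape)
# (1, 8) (1, 8) (1, 1)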
Code example #5
 def sentence2idx(self, text):
     text = extract_chinese(str(text).upper())
     tokens = self.tokenizer.encode(text)
     # record the real token count before padding so the mask stays correct;
     # computed after padding, the mask would mark every position as 1
     len_tokens = min(len(tokens), self.target_len)
     # pad with 0 or truncate so len(tokens) == target_len
     if len(tokens) < self.target_len:
         tokens = tokens + [0] * (self.target_len - len(tokens))
     else:
         tokens = tokens[0:self.target_len]
     token_input = np.expand_dims(np.array(tokens), axis=0)
     segment_input = np.zeros_like(token_input)
     memory_length_input = np.zeros(
         (1, 1))  # np.array([[self.memory_len]]) # np.zeros((1, 1))
     # 1 for real tokens, 0 for padding
     masks = [1] * len_tokens + [0] * (self.target_len - len_tokens)
     mask_input = np.expand_dims(np.array(masks), axis=0)
     if self.trainable:
         return [
             token_input, segment_input, memory_length_input, mask_input
         ]
     else:
         return [token_input, segment_input, memory_length_input]
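The mask in example #5 is only meaningful if the real token count is captured before padding; otherwise every position, pads included, reads 1. A minimal check of the mask logic, with target_len as a stand-in:

# mask check: 1 for real tokens, 0 for pads
target_len = 8
tokens = [4, 5, 6]
len_tokens = min(len(tokens), target_len)
masks = [1] * len_tokens + [0] * (target_len - len_tokens)
print(masks)  # [1, 1, 1, 0, 0, 0, 0, 0]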
Code example #6
def pred_tet(path_hyper_parameter=path_hyper_parameters,
             path_test=None,
             rate=1.0):
    """
        测试集测试与模型评估
    :param hyper_parameters: json, 超参数
    :param path_test:str, path of test data, 测试集
    :param rate: 比率, 抽出rate比率语料取训练
    :return: None
    """
    hyper_parameters = load_json(path_hyper_parameter)
    if path_test:  # override the test-data path from outside
        hyper_parameters['data']['test_data'] = path_test
    time_start = time.time()
    # initialize the graph
    graph = Graph(hyper_parameters)
    print("graph init ok!")
    graph.load_model()
    print("graph load ok!")
    ra_ed = graph.word_embedding
    # data preprocessing
    pt = PreprocessSim(path_model_dir)

    data = pd.read_csv(hyper_parameters['data']['test_data'])
    sentence_1 = data["sentence1"].values.tolist()
    sentence_2 = data["sentence2"].values.tolist()
    labels = data["label"].values.tolist()
    sentence_1 = [extract_chinese(str(line1).upper()) for line1 in sentence_1]
    sentence_2 = [extract_chinese(str(line2).upper()) for line2 in sentence_2]
    labels = [extract_chinese(str(line3).upper()) for line3 in labels]

    # test on a fraction (rate) of the dataset
    len_rate = int(len(labels) * rate)
    sentence_1 = sentence_1[0:len_rate]
    sentence_2 = sentence_2[0:len_rate]
    labels = labels[0:len_rate]
    y_pred = []
    count = 0
    for i in range(len_rate):
        count += 1
        ques_embed = ra_ed.sentence2idx(text=sentence_1[i],
                                        second_text=sentence_2[i])
        # print(hyper_parameters['embedding_type'])
        # only 'bert' / 'albert' embeddings are handled here (token + segment ids)
        if hyper_parameters['embedding_type'] in ['bert', 'albert']:
            x_val_1 = np.array([ques_embed[0]])
            x_val_2 = np.array([ques_embed[1]])
            x_val = [x_val_1, x_val_2]
            # predict
            pred = graph.predict(x_val)
            pre = pt.prereocess_idx(pred[0])
            label_pred = pre[0][0][0]
            if count % 1000 == 0:
                print(label_pred)
            y_pred.append(label_pred)

    print("data pred ok!")
    # map gold and predicted labels to integer ids
    index_y = [pt.l2i_i2l['l2i'][i] for i in labels]
    index_pred = [pt.l2i_i2l['l2i'][i] for i in y_pred]
    target_names = [
        pt.l2i_i2l['i2l'][str(i)] for i in list(set((index_pred + index_y)))
    ]
    # evaluation
    report_predict = classification_report(index_y,
                                           index_pred,
                                           target_names=target_names,
                                           digits=9)
    print(report_predict)
    print("耗时:" + str(time.time() - time_start))