def sim_two_question():
    """Interactively compare two questions by cosine similarity of their BERT vectors.

    Reads question pairs from stdin in an endless loop, encodes each with
    KerasBertVector, and prints the cosine similarity. Returns nothing;
    side effects are stdin reads and stdout prints only.
    """
    from FeatureProject.bert.extract_keras_bert_feature import KerasBertVector
    import numpy as np

    def cosine_distance(v1, v2):
        """Cosine similarity of two vectors; 0 for zero-norm input.

        Fix: the previous guard `v1.all() and v2.all()` returned 0 whenever
        ANY single component was zero — a zero component is perfectly valid.
        The real degenerate case is a zero NORM (division by zero), so we
        test the norm product instead.
        """
        v1 = np.asarray(v1, dtype=float)
        v2 = np.asarray(v2, dtype=float)
        norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
        if norm_product == 0:
            return 0
        return np.dot(v1, v2) / norm_product

    bert_vector = KerasBertVector()
    print("bert start ok!")
    while True:
        print("input ques-1: ")
        ques_1 = input()
        print("input ques_2: ")
        ques_2 = input()
        # bert_encode takes a list of sentences and returns a list of vectors;
        # we encode one sentence at a time here.
        vector_1 = bert_vector.bert_encode([ques_1])
        vector_2 = bert_vector.bert_encode([ques_2])
        sim = cosine_distance(vector_1[0], vector_2[0])
        print(sim)
def calculate_count(n_runs=10):
    """Measure the average latency of a BERT sentence-encode call.

    Encodes one fixed sentence ``n_runs`` times and prints the last vector
    plus the mean seconds per call. (The original docstring claimed 1000
    samples while the code ran 10; the count is now an explicit parameter
    with the original default of 10.)

    :param n_runs: number of encode calls to average over; must be >= 1
        for a meaningful result (default 10, matching original behavior).
    :return: None — results are printed.
    """
    from FeatureProject.bert.extract_keras_bert_feature import KerasBertVector
    import time

    bert_vector = KerasBertVector()
    print("bert start ok!")
    time_start = time.time()
    vector = None  # keep the last result visible after the loop even if n_runs == 0
    for _ in range(n_runs):
        vector = bert_vector.bert_encode(["jy,你知道吗,我一直都很喜欢你呀,在一起在一起在一起,哈哈哈哈"])
    time_end = time.time()
    # Single divisor derived from the same parameter as the loop bound,
    # so the two can no longer drift apart.
    time_avg = (time_end - time_start) / n_runs
    print(vector)
    print(time_avg)
def chatbot_sentence_vec_by_bert_own():
    """Top-k retrieval of standard questions by BERT sentence-vector similarity.

    Loads the first 100 standard questions, encodes them once with
    KerasBertVector, saves the matrix to disk, answers one warm-up query,
    then serves queries from stdin in an endless loop. Returns nothing;
    side effects are a saved vector file plus stdin/stdout traffic.
    """
    from FeatureProject.bert.extract_keras_bert_feature import KerasBertVector
    from conf.path_config import chicken_and_gossip_path
    from utils.text_tools import txtRead
    import numpy as np

    # Parameters: only the first 100 standard questions are indexed here.
    topk = 5
    matrix_ques_save_path = "doc_vecs_chicken_and_gossip"
    questions = txtRead(chicken_and_gossip_path, encodeType='utf-8')
    ques = [q.split('\t')[0] for q in questions][0:100]

    # Encode the standard questions once; persist so production code can
    # np.loadtxt the matrix instead of re-encoding.
    bert_vector = KerasBertVector()
    ques_basic_vecs = bert_vector.bert_encode(ques)
    np.savetxt(matrix_ques_save_path, ques_basic_vecs)
    # matrix_ques = np.loadtxt(matrix_ques_save_path)

    def _print_topk(query_bert_vec):
        """Score one query vector against all basic vectors and print the top-k.

        This was duplicated verbatim for the warm-up query and the serving
        loop; extracted so the scoring logic exists in exactly one place.
        NOTE(review): the denominator omits the query norm, so scores are
        not true cosine similarities — the RANKING is unaffected because
        the query norm is constant across candidates, but confirm before
        comparing scores across different queries.
        """
        qq_score = np.sum(query_bert_vec * ques_basic_vecs, axis=1) / np.linalg.norm(ques_basic_vecs, axis=1)
        topk_idx = np.argsort(qq_score)[::-1][:topk]
        for idx in topk_idx:
            print('小姜机器人回答检索: %s\t%s' % (qq_score[idx], questions[idx]))

    # Warm-up query (original behavior prints its raw vector too).
    query_bert_vec = np.array(bert_vector.bert_encode(["小姜机器人是什么"])[0])
    print(query_bert_vec)
    _print_topk(query_bert_vec)

    while True:
        print("你的问题:")
        query = input()
        query_bert_vec = np.array(bert_vector.bert_encode([query])[0])
        _print_topk(query_bert_vec)