Example #1
def sim_two_question():
    """测试一下两个问题的相似句子"""
    from FeatureProject.bert.extract_keras_bert_feature import KerasBertVector
    from sklearn import preprocessing
    from math import pi
    import numpy as np
    import time
    import math

    def cosine_distance(v1, v2):  # cosine similarity between two vectors
        if isinstance(v1, list):
            v1 = np.array(v1)
        if isinstance(v2, list):
            v2 = np.array(v2)

        # Guard on the norms rather than element values, so vectors that merely contain zeros are handled correctly
        norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
        if norm_product == 0:
            return 0
        return np.dot(v1, v2) / norm_product

    def scale_zoom(rate):  # sigmoid-style rescaling of the similarity score
        zoom = (1 + np.exp(-float(rate))) / 2
        return zoom

    def scale_triangle(rate):  # sine-based rescaling of the similarity score
        triangle = math.sin(rate * pi / 2 - pi / 2)
        return triangle

    bert_vector = KerasBertVector()
    print("bert start ok!")
    while True:
        print("input ques-1: ")
        ques_1 = input()
        print("input ques_2: ")
        ques_2 = input()
        vector_1 = bert_vector.bert_encode([ques_1])
        vector_2 = bert_vector.bert_encode([ques_2])
        sim = cosine_distance(vector_1[0], vector_2[0])
        # sim_list = [sim, 0, 0.2, 0.4, 0.6, 0.8, 1.0]
        # sim = preprocessing.scale(sim_list)[0]
        # sim = preprocessing.MinMaxScaler(feature_range=(0, 1)).fit_transform(sim_list)[0]
        # sim_1 = preprocessing.normalize(sim_list, norm='l1')[0]
        # sim_2 = preprocessing.normalize(sim_list, norm='l2')[0]
        # sim = scale_zoom(sim)
        # sim = scale_triangle(sim)
        # print(sim_1)
        # print(sim_2)
        print(sim)
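The commented-out lines above are experiments with rescaling the raw cosine score into [0, 1]. As a minimal, NumPy-only sketch of that idea (the vectors below are toy stand-ins for BERT sentence vectors, and the linear map (sim + 1) / 2 is just one possible rescaling, not the one used above):

import numpy as np

def cosine_sim(v1, v2):
    # plain cosine similarity, guarding against zero vectors
    denom = np.linalg.norm(v1) * np.linalg.norm(v2)
    return float(np.dot(v1, v2) / denom) if denom else 0.0

def to_unit_interval(sim):
    # linear map from [-1, 1] to [0, 1]
    return (sim + 1.0) / 2.0

v1 = np.array([0.2, 0.1, 0.7])   # toy stand-ins for BERT sentence vectors
v2 = np.array([0.3, 0.0, 0.6])
print(to_unit_interval(cosine_sim(v1, v2)))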
Example #2
def calculate_count():
    """
      统计一下1000条测试数据的平均耗时
    :return: 
    """
    from FeatureProject.bert.extract_keras_bert_feature import KerasBertVector
    import time

    bert_vector = KerasBertVector()
    print("bert start ok!")
    time_start = time.time()
    for i in range(10):
        vector = bert_vector.bert_encode(["jy,你知道吗,我一直都很喜欢你呀,在一起在一起在一起,哈哈哈哈"])

    time_end = time.time()
    time_avg = (time_end - time_start) / 10
    print(vector)
    print(time_avg)
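For a steadier latency estimate it can help to add a warm-up call and use time.perf_counter(), so that model and graph initialization are not counted in the average. A minimal sketch along those lines (KerasBertVector.bert_encode is taken from the example above; the helper name benchmark_encode is made up for illustration):

from FeatureProject.bert.extract_keras_bert_feature import KerasBertVector
import time

def benchmark_encode(sentence, n_runs=10):
    bert_vector = KerasBertVector()
    bert_vector.bert_encode([sentence])      # warm-up call, excluded from timing
    start = time.perf_counter()
    for _ in range(n_runs):
        bert_vector.bert_encode([sentence])
    return (time.perf_counter() - start) / n_runs

print(benchmark_encode("jy,你知道吗,我一直都很喜欢你呀"))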
def chatbot_sentence_vec_by_bert_own():
    """Retrieval demo: BERT sentence encoding written by myself."""
    from FeatureProject.bert.extract_keras_bert_feature import KerasBertVector
    from conf.path_config import chicken_and_gossip_path
    from utils.text_tools import txtRead
    import numpy as np

    # Read the data and a few parameters; only the first 100 standard questions are used here
    topk = 5
    matrix_ques_save_path = "doc_vecs_chicken_and_gossip"
    questions = txtRead(chicken_and_gossip_path, encodeType='utf-8')
    ques = [question.split('\t')[0] for question in questions][0:100]

    # Generate BERT sentence vectors for the standard questions
    bert_vector = KerasBertVector()
    ques_basic_vecs = bert_vector.bert_encode(ques)

    # In production you can precompute and save these vectors, then simply load them at startup
    np.savetxt(matrix_ques_save_path, ques_basic_vecs)
    # matrix_ques = np.loadtxt(matrix_ques_save_path)

    query_bert_vec = bert_vector.bert_encode(["小姜机器人是什么"])[0]
    query_bert_vec = np.array(query_bert_vec)
    print(query_bert_vec)
    # A matrix dot product is fast; tools such as Annoy can make the lookup even faster
    # (dividing by both norms turns the raw dot product into a true cosine score)
    qq_score = np.sum(query_bert_vec * ques_basic_vecs, axis=1) \
        / (np.linalg.norm(query_bert_vec) * np.linalg.norm(ques_basic_vecs, axis=1))
    topk_idx = np.argsort(qq_score)[::-1][:topk]
    for idx in topk_idx:
        print('小姜机器人回答检索: %s\t%s' % (qq_score[idx], questions[idx]))


    while True:
        print("你的问题:")
        query = input()
        query_bert_vec = bert_vector.bert_encode([query])[0]
        query_bert_vec = np.array(query_bert_vec)
        # A matrix dot product is fast; tools such as Annoy can make the lookup even faster
        qq_score = np.sum(query_bert_vec * ques_basic_vecs, axis=1) \
            / (np.linalg.norm(query_bert_vec) * np.linalg.norm(ques_basic_vecs, axis=1))
        topk_idx = np.argsort(qq_score)[::-1][:topk]
        for idx in topk_idx:
            print('小姜机器人回答检索: %s\t%s' % (qq_score[idx], questions[idx]))
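The comments above mention Annoy as a faster alternative to the brute-force dot product. A minimal sketch of that approach (assuming the annoy package is installed and reusing the precomputed ques_basic_vecs matrix from the function above; Annoy's 'angular' metric corresponds to cosine distance, so smaller distances mean more similar questions):

from annoy import AnnoyIndex  # pip install annoy
import numpy as np

def build_annoy_index(ques_basic_vecs, n_trees=10):
    # Index the precomputed question vectors with the angular (cosine) metric
    dim = np.array(ques_basic_vecs).shape[1]
    index = AnnoyIndex(dim, 'angular')
    for i, vec in enumerate(ques_basic_vecs):
        index.add_item(i, vec)
    index.build(n_trees)
    return index

def annoy_topk(index, query_vec, topk=5):
    # Returns (indices, angular distances); smaller distance means more similar
    return index.get_nns_by_vector(query_vec, topk, include_distances=True)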