Example #1
def prepare_dict_model():

    # Read the raw corpus and split every line into characters.
    with open(config.sort_all_file_path, "r") as f:
        lines = [cut_sentence_by_character(line) for line in f.readlines()]

    # Feed each tokenized line to the Word2Sequence vocabulary builder.
    ws = Word2Sequence()
    for line in lines:
        ws.fit(line)

    ws.build_vocab()

    # Persist the fitted word-to-index model for the sort/rank stage.
    with open(config.sort_ws_model_path, "wb") as f:
        pickle.dump(ws, f)
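Word2Sequence itself is not shown in these snippets; a minimal sketch of the interface prepare_dict_model relies on (fit accumulates counts, build_vocab freezes the token-to-index mapping, transform maps tokens to indexes) might look like the following. The special tokens, index values, and min_count filtering are assumptions:

class Word2Sequence:
    UNK_TAG, PAD_TAG = "<UNK>", "<PAD>"   # assumed special tokens
    UNK, PAD = 0, 1

    def __init__(self):
        self.count = {}  # raw token frequencies
        self.dict = {self.UNK_TAG: self.UNK, self.PAD_TAG: self.PAD}

    def fit(self, tokens):
        # Accumulate token counts from one tokenized sentence.
        for token in tokens:
            self.count[token] = self.count.get(token, 0) + 1

    def build_vocab(self, min_count=1):
        # Assign an index to every token seen at least min_count times.
        for token, freq in self.count.items():
            if freq >= min_count:
                self.dict[token] = len(self.dict)

    def transform(self, tokens, max_len=None):
        # Map tokens to indexes, padding/truncating to max_len if given.
        if max_len is not None:
            tokens = tokens[:max_len] + [self.PAD_TAG] * (max_len - len(tokens))
        return [self.dict.get(token, self.UNK) for token in tokens]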
Example #2
def collate_fn(batch):
    '''
    batch is the list of samples returned by the dataset's __getitem__
    :param batch: list of (input1, input2, label, input1_len, input2_len)
    :return: the same fields, batched as LongTensors
    '''
    # Sort by input1 sentence length, in descending order.
    batch = sorted(batch, key=lambda x: x[-2], reverse=True)

    input1s, input2s, labels, input1_lens, input2_lens = zip(*batch)

    # Convert input1/input2 to index vectors; convert the rest to LongTensor.
    input1s = torch.LongTensor([lib.word_sequence_model.transform(cut_sentence_by_character(input1)) for input1 in input1s])
    input2s = torch.LongTensor([lib.word_sequence_model.transform(cut_sentence_by_character(input2)) for input2 in input2s])

    labels = torch.LongTensor([int(float(label.strip())) for label in labels])

    input1_lens = torch.LongTensor(input1_lens)
    input2_lens = torch.LongTensor(input2_lens)

    return input1s, input2s, labels, input1_lens, input2_lens
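A collate_fn like this is meant to be passed to a PyTorch DataLoader, which hands it the raw list of samples for each batch. A minimal usage sketch; SortDataset and the batch size are hypothetical stand-ins, not from the source:

from torch.utils.data import DataLoader

# Hypothetical dataset whose __getitem__ returns
# (input1, input2, label, input1_len, input2_len).
loader = DataLoader(SortDataset(), batch_size=128, shuffle=True,
                    collate_fn=collate_fn)

for input1s, input2s, labels, input1_lens, input2_lens in loader:
    pass  # feed the batch to the ranking model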
Example #3
    def search_memory(self, question_sentence, fasttext_model):
        # Pipeline: question -> vector -> cp index -> memory candidates.

        question_word_list = sentence_process.cut_sentence_by_character(question_sentence)

        # Use fastText to turn the user question into a sentence vector.
        question_sequence = fasttext_model.get_sentence_vector(" ".join(question_word_list))

        cp = self.get_pysprnn_model(fasttext_model)

        # k: number of candidates to return; num_indexes: how many of the
        # cluster indexes to query.
        candidates = cp.search(question_sequence, k=4, num_indexes=3)

        return candidates[0]
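Calling search_memory requires a loaded fastText model; a minimal sketch, assuming the enclosing class is named Memory and using a hypothetical model path:

import fasttext

fasttext_model = fasttext.load_model("data/fasttext_recall.bin")  # hypothetical path

memory = Memory()
# candidates[0] is a list of (distance, question) pairs for this query.
for distance, question in memory.search_memory("如何安装python", fasttext_model):
    print(distance, question)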
Example #4
    def get_pysprnn_model(self, fasttext):

        # Reuse the cached pysparnn index if one has already been built.
        if os.path.exists(config.recall_pysparnn_cp_model_path):
            return pickle.load(open(config.recall_pysparnn_cp_model_path, "rb"))
        else:
            with open(config.recall_merged_q_path, "r") as f:
                lines = [line.strip() for line in f.readlines()]

            questions_string_cut = [" ".join(sentence_process.cut_sentence_by_character(sentence)) for sentence in lines]

            # Embed every candidate question with fastText.
            questions_vectors = [fasttext.get_sentence_vector(question_string_cut) for question_string_cut in questions_string_cut]

            # Build the pysparnn multi-cluster index over the question vectors.
            cp = ci.MultiClusterIndex(questions_vectors, questions_string_cut, num_indexes=3)

            pickle.dump(cp, open(config.recall_pysparnn_cp_model_path, "wb"))

            return cp
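pysparnn's MultiClusterIndex builds num_indexes independent cluster indexes over the same records, trading memory for recall. A toy sketch mirroring how this snippet feeds dense fastText vectors in; the random vectors stand in for real embeddings:

import numpy as np
import pysparnn.cluster_index as ci

# Stand-ins for fastText sentence vectors: 100 questions, 100 dims.
vectors = np.random.rand(100, 100)
records = ["question %d" % i for i in range(100)]

cp = ci.MultiClusterIndex(vectors, records, num_indexes=3)

# Each query row yields a list of (distance, record) pairs.
print(cp.search(vectors[:1], k=4, num_indexes=3)[0])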
Example #5
def test_dict_model():
    # Sanity-check the saved Word2Sequence model on a sample question.
    sentence = "如何在linux下安装storm"
    ws = pickle.load(open(config.sort_ws_model_path, "rb"))
    sequence = ws.transform(cut_sentence_by_character(sentence))
    print(cut_sentence_by_character(sentence))
    print(sequence)
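cut_sentence_by_character is not shown either; a plausible sketch that splits Chinese text into single characters while keeping ASCII words like "linux" and "storm" whole. The regex is an assumption about how the helper behaves:

import re

def cut_sentence_by_character(sentence):
    # Keep runs of ASCII letters/digits together; split everything
    # else (e.g. Chinese characters) one by one.
    return re.findall(r"[a-zA-Z0-9]+|[^a-zA-Z0-9\s]", sentence)

print(cut_sentence_by_character("如何在linux下安装storm"))
# ['如', '何', '在', 'linux', '下', '安', '装', 'storm']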
Example #6
    def if_ask_question(self, sentence):
        # Use the fastText classifier to decide whether the sentence is a
        # question, returning the top label and its score.
        word_list = sentence_process.cut_sentence_by_character(sentence)
        label, scores = self.model.predict(" ".join(word_list))
        return label[0], scores[0]
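self.model here is presumably a fastText supervised classifier. A hedged sketch of training and loading such a model; the file paths, label format, and hyperparameters are assumptions:

import fasttext

# Training data format: "__label__1 如 何 安 装 python" (one line per sample).
model = fasttext.train_supervised(input="data/classify_train.txt", epoch=20)
model.save_model("data/classify.bin")

model = fasttext.load_model("data/classify.bin")
# predict returns a tuple of labels and a parallel array of probabilities.
labels, scores = model.predict("如 何 安 装 python")
print(labels[0], scores[0])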
Example #7
# -*- coding: utf-8 -*-
# @Time    : 2020/12/22 10:08 PM
# @Author  : Kevin
from utils import sentence_process

if __name__ == '__main__':
    new_merged_q_path = "/Users/kevin/Downloads/work/pycharm/Ying/data/search/new_merged_q.txt"
    question_cut_by_character_path = "/Users/kevin/Downloads/work/pycharm/Ying/data/search/question_cut_by_character.txt"

    # Split every merged question into characters and write one
    # space-separated line per question.
    with open(new_merged_q_path, "r") as new_merged_q_file, \
            open(question_cut_by_character_path, "w") as question_cut_by_character_file:

        lines = new_merged_q_file.readlines()

        questions = [" ".join(sentence_process.cut_sentence_by_character(line.strip())) for line in lines]

        for question in questions:
            question_cut_by_character_file.write(question + "\n")
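The space-separated character file written here is a natural training corpus for the fastText model used by the recall snippets above; a hedged sketch, where the skipgram choice, hyperparameters, and output path are assumptions:

import fasttext

model = fasttext.train_unsupervised(
    input="/Users/kevin/Downloads/work/pycharm/Ying/data/search/question_cut_by_character.txt",
    model="skipgram", epoch=20, minCount=1)
model.save_model("fasttext_recall.bin")  # hypothetical output path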