def create_dialog(self):
    # Greet the user and read the initial question
    GlobalVariable.get_value('OUTPUT')("您好,有什么能帮助您的?")
    init_sentence = GlobalVariable.get_value('INPUT')()
    # Create a new dialog branch and attach the initial sentence to it
    self.dialog_tree.add_dialog_branch(init_sentence, 'FAQ')
    # Hand the sentence to the dialog policy to start the decision process
    self.dialog_policy.input_query(init_sentence)
def __init__(self):
    # Load the BERT arguments and the pre-trained sequence-classification model
    self.args = GlobalVariable.get_value('BERT_ARGS')
    self.model = BertForSequenceClassification.from_pretrained(
        self.args.get('bert_model'),
        cache_dir=PYTORCH_PRETRAINED_BERT_CACHE / 'distributed_{}'.format(
            self.args.get('local_rank')))
    # Data processor and label set
    self.processors = {'SentencePro': sentencePro}
    self.processor = self.processors['SentencePro']()
    self.label_list = self.processor.get_labels()
    # Device selection and tokenizer
    self.device = torch.device(
        "cuda" if torch.cuda.is_available() and not self.args.get('no_cuda') else "cpu")
    self.tokenizer = BertTokenizer.from_pretrained(self.args.get('bert_model'))
    # Restore the fine-tuned weights
    self.model.load_state_dict(GlobalVariable.get_value('NEW_STATE_DICT'))
def type_provide(self, query, answer_id):
    # Load the FAQ data
    faq_dict = GlobalVariable.get_value('FAQ_DATA')
    # Look up the question type of the matched answer and ask the user to confirm it
    query_type = faq_dict[answer_id]["专业"]
    GlobalVariable.get_value('OUTPUT')('您想询问的是否是[%s]相关问题?(是/否)' % query_type)
    if GlobalVariable.get_value('INPUT')() == '是':
        self.correct(query, query_type)
    else:
        self.error(query)
def candidate_selecting(self, query, query_type):
    user_query_list = [query]
    # Run the model to retrieve the top-5 candidate answer IDs for the given question type
    answer_id_list = dssm_model_infer(user_query_list, model_name='AttentionDSSM',
                                      top_k=5, query_type=query_type)
    # Load the FAQ data
    faq_dict = GlobalVariable.get_value('FAQ_DATA')
    # Show the candidate questions
    for index in range(len(answer_id_list[0])):
        GlobalVariable.get_value('OUTPUT')(
            str(index + 1) + '.' + faq_dict[answer_id_list[0][index]]["问题"])
    # Let the user pick one of the candidates
    GlobalVariable.get_value('OUTPUT')(
        '上述问题是否包含您想问的问题,如果是,请返回相应问题序号,如果不是,请回[否]')
    respond = GlobalVariable.get_value('INPUT')()
    if respond.isdigit():
        GlobalVariable.get_value('OUTPUT')(
            faq_dict[answer_id_list[0][int(respond) - 1]]["答案"])
        self.selecting()
    else:
        self.error(query)
def dssm_model_infer(queries, model_name='MultiGruDSSM', top_k=1, threshold=0., query_type='所有'):
    """
    DSSM inference: takes a list of questions, loads the candidate data for the
    given question type, retrieves the top-k candidate answers and filters them
    by the similarity threshold.
    :param queries: list of questions
    :param model_name: name of the model to call
    :param top_k: number of candidate answers
    :param threshold: similarity threshold
    :param query_type: question type to match against
    :return: 2-D list of answer IDs
    """
    # Convert each question into a list of characters
    query_set = []
    for query in queries:
        query_set.append(list(query))
    # Retrieve the candidate indices for the given question type
    index_list = GlobalVariable.get_value('FAQ_INDEX')[query_type]
    # Retrieve the pre-computed feature vectors of the candidates
    t_set = []
    faq_data = GlobalVariable.get_value('FAQ_DATA')
    for index in index_list:
        t_set.append(faq_data[index]['embedding'])
    # Run the model
    dssm = GlobalVariable.get_value('MODEL')['DSSM'][model_name + '_INFER']
    dssm.q_set = query_set
    dssm.t_set = t_set
    dssm.init_model_parameters()
    dssm.generate_data_set()
    result_prob_list, result_id_list = dssm.inference(top_k)
    # Map the model-internal indices back to FAQ answer IDs, dropping results whose
    # similarity falls at or below the threshold
    answer_id_list = []
    for i in range(len(result_id_list)):
        answer_id = []
        for j in range(len(result_id_list[i])):
            if result_prob_list[i][j] <= threshold:
                break
            answer_id.append(index_list[result_id_list[i][j]])
        answer_id_list.append(answer_id)
    return answer_id_list
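# Usage sketch (assumption, not part of the original code): dssm_model_infer expects the
# global registry to be populated first; the model name, sample question and threshold
# below are illustrative only.
#
#     GlobalVariable._init()  # assumed to load FAQ_DATA, FAQ_INDEX and MODEL
#     answer_ids = dssm_model_infer(['如何查询成绩'], model_name='AttentionDSSM',
#                                   top_k=5, threshold=0.5, query_type='所有')
#     faq_data = GlobalVariable.get_value('FAQ_DATA')
#     for answer_id in answer_ids[0]:
#         print(faq_data[answer_id]['答案'])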
def dssm_model_extract_t_pre(model_name='MultiGruDSSM'):
    # Collect candidate questions grouped by type
    query_dict = {'Domain': [], 'Encyclopedia': [], 'Gossip': []}
    faq_dict = GlobalVariable.get_value('FAQ_DATA')
    for key in faq_dict:
        if faq_dict[key]['专业'] == '百科':
            query_dict['Encyclopedia'].append(list(faq_dict[key]['问题']))
        elif faq_dict[key]['专业'] == '闲聊':
            query_dict['Gossip'].append(list(faq_dict[key]['问题']))
        else:
            query_dict['Domain'].append(list(faq_dict[key]['问题']))
    # Build the character-embedding dictionary
    embedding_dict = GlobalVariable.get_value('Word2Vec_CHARACTER_EMBEDDING')
    word_dict = {}
    vec_set = []
    i = 0
    for key in embedding_dict:
        word_dict[key] = i
        vec_set.append(embedding_dict[key][0])
        i += 1
    for key in query_dict:
        # Run the model in extraction mode
        dssm = dssm_model[model_name](t_set=query_dict[key], dict_set=word_dict,
                                      vec_set=vec_set, is_extract=True)
        dssm.init_model_parameters()
        dssm.generate_data_set()
        dssm.build_graph()
        t_state = dssm.extract_t_pre()
        # Persist the candidate feature vectors for this question type
        t_pre_dict = {}
        for i in range(len(t_state)):
            t_pre_dict[i] = list(map(float, list(t_state[i])))
        with open('./KnowledgeMemory/Embedding/DSSM/' + model_name + '/' + key + 'Embedding.json',
                  'w', encoding='utf-8') as file_object:
            json.dump(t_pre_dict, file_object, ensure_ascii=False, indent=2)
def dssm_model_train(model_name='MultiGruDSSM'):
    """
    Train the DSSM model; loads the training data from the configured paths.
    :return: None
    """
    # Collect the training data (question / answer character sequences)
    query_set = []
    answer_set = []
    faq_dict = GlobalVariable.get_value('FAQ_DATA')
    for key in faq_dict:
        query_set.append(list(faq_dict[key]['问题']))
        answer_set.append(list(faq_dict[key]['答案']))
    # Optional augmentation: reverse each question so that two questions point to
    # the same answer
    # for key in faq_dict:
    #     query_set.append(list(faq_dict[key]['问题'])[::-1])
    #     answer_set.append(list(faq_dict[key]['答案']))
    # Build the character-embedding dictionary
    embedding_dict = GlobalVariable.get_value('Word2Vec_CHARACTER_EMBEDDING')
    word_dict = {}
    vec_set = []
    i = 0
    for key in embedding_dict:
        word_dict[key] = i
        vec_set.append(embedding_dict[key][0])
        i += 1
    # Build and train the model
    dssm = dssm_model[model_name](q_set=query_set, t_set=answer_set, dict_set=word_dict,
                                  vec_set=vec_set, batch_size=len(query_set) // 2)
    dssm.init_model_parameters()
    dssm.generate_data_set()
    dssm.build_graph()
    dssm.train()
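# Usage sketch (assumption): a plausible offline pipeline is to train first and then
# refresh the stored candidate embeddings via dssm_model_extract_t_pre, so that
# dssm_model_infer reads vectors produced by the new weights; the model name is illustrative.
#
#     GlobalVariable._init()  # assumed to load FAQ_DATA and Word2Vec_CHARACTER_EMBEDDING
#     dssm_model_train(model_name='AttentionDSSM')
#     dssm_model_extract_t_pre(model_name='AttentionDSSM')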
def types_selecting(self, query):
    # Collect the distinct question types (FAQ_DATA is a dict keyed by answer ID,
    # consistent with the other functions)
    query_types = []
    faq_data = GlobalVariable.get_value('FAQ_DATA')
    for key in faq_data:
        if faq_data[key]['专业'] not in query_types:
            query_types.append(faq_data[key]['专业'])
    # Show the available question types
    GlobalVariable.get_value('OUTPUT')('目前已有的问题类型有:')
    for index in range(len(query_types)):
        GlobalVariable.get_value('OUTPUT')(str(index + 1) + '.' + query_types[index])
    # Let the user pick a type
    GlobalVariable.get_value('OUTPUT')(
        '上述类型是否包含您想问的问题类型,如果是,请返回相应类型序号,如果不是,请回[否]')
    respond = GlobalVariable.get_value('INPUT')()
    if respond.isdigit():
        query_type = query_types[int(respond) - 1]
        self.selecting(query, query_type)
    else:
        self.error(query)
def get_answer(queries, model_name='MultiGruModel', top_k=1, threshold=0.):
    """
    Single-turn lookup (no multi-turn dialog): for each input question, retrieve the
    top-k matched questions and answers from the specified model.
    :param queries: list of questions
    :param model_name: name of the model to call
    :param top_k: number of answers to return per question
    :param threshold: similarity threshold
    :return: 2-D lists of matched questions and matched answers
    """
    print('get answer---------')
    # Run the model to get the top-k answer IDs for every question
    answer_id_list = dssm_model_infer(queries, model_name=model_name,
                                      top_k=top_k, threshold=threshold)
    # Load the FAQ data
    faq_dict = GlobalVariable.get_value('FAQ_DATA')
    # Look up the matched questions by ID
    query_set = []
    for id_list in answer_id_list:
        query_list = []
        for answer_id in id_list:
            query_list.append(faq_dict[answer_id]["问题"])
        query_set.append(query_list)
    # Look up the matched answers by ID
    answer_set = []
    for id_list in answer_id_list:
        answer_list = []
        for answer_id in id_list:
            answer_list.append(faq_dict[answer_id]["答案"])
        answer_set.append(answer_list)
    return query_set, answer_set
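# Usage sketch (assumption): get_answer is the single-turn entry point; the question,
# model name and threshold below are illustrative only.
#
#     matched_questions, matched_answers = get_answer(['如何办理入学手续'],
#                                                     model_name='AttentionDSSM',
#                                                     top_k=3, threshold=0.5)
#     for q, a in zip(matched_questions[0], matched_answers[0]):
#         print(q, '->', a)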
def answer_matching(self, query):
    user_query_list = [query]
    # Run the model to get the best-matching answer ID
    answer_id_list = dssm_model_infer(user_query_list, model_name='AttentionDSSM', top_k=1)
    # Load the FAQ data
    faq_dict = GlobalVariable.get_value('FAQ_DATA')
    # Show the matched answer
    GlobalVariable.get_value('OUTPUT')(
        faq_dict[answer_id_list[0][0]]["答案"])
    # Ask the user to confirm whether the answer is correct
    GlobalVariable.get_value('OUTPUT')('是否是正确答案?(是/否)')
    if GlobalVariable.get_value('INPUT')() == '是':
        self.correct()
    else:
        self.error(query, answer_id_list[0][0])
# @Function :
from KnowledgeExtraction.QuestionClassificationBert.Args import BertArgs
from KnowledgeExtraction.QuestionClassificationBert.TrainClassificationModel import BertForClassification
from UtilArea import GlobalVariable

if __name__ == '__main__':
    do_train = False
    if do_train:
        # 1. Train the model
        # BERT arguments for training mode
        train_args = BertArgs(do_train=True, do_eval=True, no_cuda=False)
        # Build the training wrapper
        train_classification = BertForClassification(train_args)
        # Train
        print('---------Start Training-------------')
        train_classification.train()
        print('---------Finish Training------------')
    # 2. Load the trained model for prediction
    # TODO: (1) switch to intent recognition (2) standardise the interface
    # Classify a single input question (or recognise its intent):
    GlobalVariable._init()
    PredictModel = GlobalVariable.get_value('QUESTION_CLASSIFICATION_MODEL')
    input_sentence = input('请输入问题:')
    res = PredictModel.test(input_sentence)
    # test(model, processor, args, label_list, tokenizer, device, input_sentence)
def end_process(self):
    # Closing message at the end of the dialog
    GlobalVariable.get_value('OUTPUT')('感谢为您解答!')