def __init__(self, corpus_file, word2id):
    """
    Load the dialogue corpus and build the TF-IDF similarity index.

    :param corpus_file: corpus file handed to ``load_corpus_file``
    :param word2id: vocabulary mapping handed to ``load_corpus_file``
    """
    time_s = time.time()
    # NOTE(review): size=50000 presumably caps how many samples are loaded -
    # confirm against load_corpus_file.
    self.contexts, self.responses = load_corpus_file(corpus_file, word2id, size=50000)

    # _train_model() is expected to populate self.tfidf_model and self.corpus,
    # both of which are read right below.
    self._train_model()
    self.corpus_mm = self.tfidf_model[self.corpus]
    self.index = similarities.MatrixSimilarity(self.corpus_mm)

    # "%.2f" (two decimals) rather than "%2.f", which means "width 2, zero
    # decimals" and therefore rounded the elapsed time to a whole number.
    logger.debug("Time to build tfidf model by %s: %.2f seconds." % (corpus_file, time.time() - time_s))
def search(self, sqls):
    """
    Run the Cypher queries and return the formatted answers.

    :param sqls: list of dicts, each carrying a ``'question_type'`` label and
        a ``'sql'`` list of query strings
    :return: list with the non-empty formatted answer of each entry
    """
    logger.debug("input sqls: %s" % sqls)
    final_answers = []
    for entry in sqls:
        question_type = entry['question_type']
        # Pool the rows of every query that belongs to this question type.
        rows = []
        for statement in entry['sql']:
            rows.extend(self.g.run(statement).data())
        pretty = self.answer_prettify(question_type, rows)
        if pretty:
            final_answers.append(pretty)
    logger.debug("cypher result: %s" % final_answers)
    return final_answers
def answer(self, query):
    """
    Produce a reply for ``query`` with the model held in ``self.m``.

    The query and the reply are both appended, in that order, to
    ``self.last_txt``.

    :param query: user input passed to ``self.m.predict``
    :return: whatever ``self.m.predict`` returned
    """
    self.last_txt.append(query)
    logger.debug('-' * 20)
    logger.debug("init_query=%s" % query)

    reply = self.m.predict(query)
    logger.debug("seq2seq_response=%s" % reply)

    self.last_txt.append(reply)
    return reply
def search_bing(query):
    """
    Look up an answer for ``query`` on Bing.

    The Bing knowledge-graph box is tried first. When the page has no such
    box, the regular result list is scanned for a Bing Wangdian
    (encyclopedia) entry and that entry's description is used.

    :param query: question text; it is URL-quoted before being sent
    :return: ``(answer, left_text)`` - ``answer`` is a list holding the answer
        string when one was found (empty otherwise); ``left_text`` is the text
        of the result list when it was scanned without finding an answer
    """
    answer = []
    left_text = ''
    soup_bing = html_crawler.get_html_bing(bing_url_prefix + quote(query))
    # Same guard as search_baidu: nothing to parse when the fetch failed.
    if not soup_bing:
        return answer, left_text

    # Knowledge-graph box: the answer is the first <li> of the second list.
    kg_box = soup_bing.find(class_="bm_box")
    if kg_box:
        v_lists = kg_box.find_all(class_="b_vList")
        if v_lists and len(v_lists) > 1:
            first_item = v_lists[1].find("li")
            text = first_item.get_text().strip() if first_item else ''
            if text:
                answer.append(text)
                logger.debug("Bing知识图谱找到答案")
        return answer, left_text

    results = soup_bing.find(id="b_results")
    if not results:
        return answer, left_text

    # The result container keeps its own name: its text is needed after the
    # loop, so the loop must not rebind it to a tag, a string or None.
    for entry in results.find_all('li'):
        if " - 必应网典" not in entry.get_text():
            continue
        logger.debug("查找Bing网典")
        heading = entry.find("h2")
        link = heading.find("a") if heading else None
        url = link.get('href') if link else None
        if not url:
            continue
        bingwd_soup = html_crawler.get_html_bingwd(url)
        card = bingwd_soup.find(class_='bk_card_desc') if bingwd_soup else None
        paragraph = card.find("p") if card else None
        if not paragraph:
            continue
        text = paragraph.get_text().replace("\n", "").strip()
        if text:
            logger.debug("Bing网典找到答案")
            answer.append(text)
            return answer, left_text

    left_text += results.get_text()
    return answer, left_text
def search_baidu(self, query):
    """
    Look up an answer for ``query`` on Baidu.

    Result blocks are fetched by numeric ``id`` for ``range(1, self.topk)``.
    The first block is probed for Baidu's direct-answer cards (knowledge
    graph, poetry, calendar, calculator, weather, Zhidao best answer). The
    first two blocks are also probed for links to Baidu Zhidao and Baidu
    Baike. The text of every block that yields no answer is appended to
    ``left_text``.

    :param query: question text; it is URL-quoted before being sent
    :return: ``(answer, left_text)`` - ``answer`` is a list holding the answer
        string when one was found (empty otherwise); ``left_text`` is the
        accumulated text of the blocks visited without finding an answer
    """

    def _zhidao_answer(url):
        """Return the best-answer text of the Zhidao page at ``url``, or None."""
        zhidao_soup = html_crawler.get_html_zhidao(url)
        box = zhidao_soup.find(class_='bd answer') if zhidao_soup else None
        if not box:
            return None
        best = box.find('pre')
        if not best:
            # Alternative page layout: the answer sits in a nested container.
            content = box.find(class_='line content')
            best = content.find(class_="best-text mb-10") if content else None
        if not best:
            return None
        return best.get_text().strip().replace("展开全部", "").strip()

    answer = []
    left_text = ''
    # Fetch the Baidu result page for the query.
    soup_baidu = html_crawler.get_html_baidu(baidu_url_prefix + quote(query))
    if not soup_baidu:
        return answer, left_text

    for i in range(1, self.topk):
        items = soup_baidu.find(id=i)
        if not items:
            logger.debug("百度找不到答案")
            break

        attrs = items.attrs
        is_first = i == 1
        # Direct-answer cards are only looked for in the first block, and
        # most of them are identified through its 'mu' attribute.
        has_mu = is_first and 'mu' in attrs
        mu = attrs['mu'] if has_mu else ''

        if has_mu:
            # Knowledge graph
            r = items.find(class_='op_exactqa_s_answer')
            if r:
                logger.debug("百度知识图谱找到答案")
                answer.append(r.get_text().strip())
                return answer, left_text

            # Classical poetry
            r = items.find(class_="op_exactqa_detail_s_answer")
            if r:
                logger.debug("百度诗词找到答案")
                answer.append(r.get_text().strip())
                return answer, left_text

            # Calendar / date
            if calendar_url in mu:
                r = items.find(class_="op-calendar-content")
                if r:
                    logger.debug("百度万年历找到答案")
                    answer.append(r.get_text().strip().replace("\n", "").replace(" ", ""))
                    return answer, left_text

        # New-style calendar card: the answer is carried by the 'fk' attribute.
        if is_first and 'calendar_new' in attrs.get('tpl', ''):
            r = attrs.get('fk', '').replace("6018_", "")
            logger.debug(r)
            if r:
                logger.debug("百度万年历新版找到答案")
                answer.append(r)
                return answer, left_text

        if has_mu:
            # Calculator
            if calculator_url in mu:
                r = items.find(class_="op_new_val_screen_result")
                if r:
                    logger.debug("计算器找到答案")
                    answer.append(r.get_text().strip())
                    return answer, left_text

            # Weather
            if weather_url in mu:
                r = items.find(class_="op_weather4_twoicon_today")
                if r:
                    logger.debug("天气找到答案")
                    answer.append(r.get_text().replace('\n', '').strip())
                    return answer, left_text

            # Zhidao best-answer card
            r = items.find(class_='op_best_answer_question_link')
            url = r.get('href') if r else None
            if url:
                text = _zhidao_answer(url)
                if text is not None:
                    logger.debug("百度知道找到答案")
                    answer.append(text)
                    return answer, left_text

        # Ordinary results that link to Zhidao or Baike (first two blocks only).
        heading = items.find("h3")
        link = heading.find("a") if heading else None
        if link is not None and i in (1, 2):
            title = link.get_text()
            url = link.get('href')

            # Zhidao
            if url and "百度知道" in title:
                text = _zhidao_answer(url)
                if text is not None:
                    logger.debug("百度知道找到答案")
                    answer.append(text)
                    return answer, left_text

            # Baike
            if url and "百度百科" in title:
                baike_soup = html_crawler.get_html_baike(url)
                summary = baike_soup.find(class_='lemma-summary') if baike_soup else None
                if summary:
                    logger.debug("百度百科找到答案")
                    answer.append(summary.get_text().replace("\n", "").strip())
                    return answer, left_text

        left_text += items.get_text()
    return answer, left_text
# -*- coding: utf-8 -*-
"""
@author:XuMing([email protected])
@description: Demo run of the internet search engine: sends a few questions and logs the results.
"""
import sys

sys.path.append("..")
from dialogbot.searchdialog.internet.search_engine import Engine
from dialogbot.utils.logger import logger

if __name__ == '__main__':
    engine = Engine()
    # Questions are sent to the same engine instance, in this order.
    questions = (
        "北京今天天气如何?",
        "上海呢?",
        "武汉呢?",
        "武汉明天呢?",
        "貂蝉是谁",
        "西施是谁",
        "你知道我是谁",
    )
    for question in questions:
        logger.debug(engine.search(question))

    # Show the engine's ``contents`` attribute after the searches.
    print(engine.contents)
def __init__(self, corpus_file, word2id):
    """
    Load the dialogue corpus used by the one-hot model.

    :param corpus_file: corpus file handed to ``load_corpus_file``
    :param word2id: vocabulary mapping handed to ``load_corpus_file``
    """
    time_s = time.time()
    self.contexts, self.responses = load_corpus_file(corpus_file, word2id)
    # "%.2f" (two decimals) rather than "%2.f", which means "width 2, zero
    # decimals" and therefore rounded the elapsed time to a whole number.
    logger.debug("Time to build onehot model by %s : %.2f seconds." % (corpus_file, time.time() - time_s))
def answer_prettify(self, question_type, answers):
    """
    Render query rows into a reply sentence using the template of ``question_type``.

    Values are de-duplicated in first-seen order and capped at
    ``self.num_limit`` entries, so the same rows always give the same reply.

    :param question_type: question type label, e.g. ``'disease_symptom'``
    :param answers: list of result rows, each a dict keyed by column name
        (``'m.name'``, ``'n.name'``, ...)
    :return: the formatted answer; ``""`` when ``answers`` is empty or
        ``question_type`` has no template
    """
    final_answer = ""
    if not answers:
        return final_answer
    logger.debug("original answer: %s" % answers)

    def joined(values):
        """Drop None, de-duplicate keeping order, cap at num_limit and join."""
        unique = list(dict.fromkeys(v for v in values if v is not None))
        return ';'.join(unique[:self.num_limit])

    # question_type -> (value column, subject column, template, values_first)
    # values_first marks templates whose {0} is the value list, not the subject.
    templates = {
        'disease_symptom': ('n.name', 'm.name', '{0}的症状包括:{1}', False),
        'symptom_disease': ('m.name', 'n.name', '症状{0}可能染上的疾病有:{1}', False),
        'disease_cause': ('m.cause', 'm.name', '{0}可能的成因有:{1}', False),
        'disease_prevent': ('m.prevent', 'm.name', '{0}的预防措施包括:{1}', False),
        'disease_lasttime': ('m.cure_lasttime', 'm.name', '{0}治疗可能持续的周期为:{1}', False),
        'disease_cureprob': ('m.cured_prob', 'm.name', '{0}治愈的概率为(仅供参考):{1}', False),
        'disease_easyget': ('m.easy_get', 'm.name', '{0}的易感人群包括:{1}', False),
        'disease_desc': ('m.desc', 'm.name', '{0},熟悉一下:{1}', False),
        'disease_not_food': ('n.name', 'm.name', '{0}忌食的食物包括有:{1}', False),
        'food_not_disease': ('m.name', 'n.name', '患有{0}的人最好不要吃{1}', True),
        'food_do_disease': ('m.name', 'n.name', '患有{0}的人建议多试试{1}', True),
        'disease_drug': ('n.name', 'm.name', '{0}通常的使用的药品包括:{1}', False),
        'drug_disease': ('m.name', 'n.name', '{0}主治的疾病有{1},可以试试', False),
        'disease_check': ('n.name', 'm.name', '{0}通常可以通过以下方式检查出来:{1}', False),
        'check_disease': ('m.name', 'n.name', '通常可以通过{0}检查出来的疾病有{1}', False),
    }

    if question_type in templates:
        value_key, subject_key, template, values_first = templates[question_type]
        values = joined(row[value_key] for row in answers)
        subject = answers[0][subject_key]
        if values_first:
            final_answer = template.format(values, subject)
        else:
            final_answer = template.format(subject, values)

    elif question_type == 'disease_cureway':
        # Each row carries a list of treatments; rows without any are skipped.
        ways = [';'.join(row['m.cure_way']) for row in answers if row['m.cure_way']]
        final_answer = '{0}可以尝试如下治疗:{1}'.format(answers[0]['m.name'], joined(ways))

    elif question_type == 'disease_acompany':
        # The relation is read in both directions, so the subject itself can
        # show up among the names and has to be filtered out.
        subject = answers[0]['m.name']
        names = [row['n.name'] for row in answers] + [row['m.name'] for row in answers]
        # Wording fixed: this template used to reuse the "symptoms include"
        # text of disease_symptom although it lists accompanying diseases.
        final_answer = '{0}的并发症包括:{1}'.format(
            subject, joined(name for name in names if name != subject))

    elif question_type == 'disease_do_food':
        # Rows are split by relation name: suitable food vs. recommended recipes.
        do_desc = [row['n.name'] for row in answers if row['r.name'] == '宜吃']
        recommand_desc = [row['n.name'] for row in answers if row['r.name'] == '推荐食谱']
        final_answer = '{0}宜食的食物包括有:{1}\n推荐食谱包括有:{2}'.format(
            answers[0]['m.name'], joined(do_desc), joined(recommand_desc))

    logger.debug("apply template answer: %s" % final_answer)
    return final_answer
def parser(self, res_classify):
    """
    Turn a classification result into the queries to run.

    :param res_classify: dict with ``'args'`` (passed to ``build_entitydict``)
        and ``'question_types'`` (list of question type labels)
    :return: list of ``{'question_type': ..., 'sql': ...}`` dicts; question
        types that are unknown or yield no query are left out
    """
    # Entity category whose values parameterise the query of each question type.
    entity_for_type = {
        'disease_symptom': 'disease',
        'symptom_disease': 'symptom',
        'disease_cause': 'disease',
        'disease_acompany': 'disease',
        'disease_not_food': 'disease',
        'disease_do_food': 'disease',
        'food_not_disease': 'food',
        'food_do_disease': 'food',
        'disease_drug': 'disease',
        'drug_disease': 'drug',
        'disease_check': 'disease',
        'check_disease': 'check',
        'disease_prevent': 'disease',
        'disease_lasttime': 'disease',
        'disease_cureway': 'disease',
        'disease_cureprob': 'disease',
        'disease_easyget': 'disease',
        'disease_desc': 'disease',
    }

    entity_dict = self.build_entitydict(res_classify['args'])
    sqls = []
    for question_type in res_classify['question_types']:
        entity_key = entity_for_type.get(question_type)
        if entity_key is None:
            continue
        sql = self.sql_transfer(question_type, entity_dict.get(entity_key))
        if not sql:
            continue
        sql_dict = {'question_type': question_type, 'sql': sql}
        logger.debug("sql dict: %s" % sql_dict)
        sqls.append(sql_dict)
    return sqls
def classify(self, question):
    """
    Work out which question types a question is asking about.

    :param question: question text
    :return: ``{}`` when ``check_medical`` finds nothing in the question;
        otherwise a dict with ``'args'`` (the ``check_medical`` result) and
        ``'question_types'`` (list of question type labels)
    """
    medical_dict = self.check_medical(question)
    logger.debug("medical dict: %s" % medical_dict)
    if not medical_dict:
        return {}

    # Collect the entity types mentioned in the question (each value is a list).
    types = []
    for entity_types in medical_dict.values():
        types.extend(entity_types)

    # (question keywords, required entity type, label) in evaluation order.
    # A tuple label is (label when a deny word is present, label otherwise).
    rules = [
        (self.symptom_qwds, 'disease', 'disease_symptom'),
        (self.symptom_qwds, 'symptom', 'symptom_disease'),
        (self.cause_qwds, 'disease', 'disease_cause'),
        (self.acompany_qwds, 'disease', 'disease_acompany'),
        (self.food_qwds, 'disease', ('disease_not_food', 'disease_do_food')),
        (self.food_qwds + self.cure_qwds, 'food', ('food_not_disease', 'food_do_disease')),
        (self.drug_qwds, 'disease', 'disease_drug'),
        (self.cure_qwds, 'drug', 'drug_disease'),
        (self.check_qwds, 'disease', 'disease_check'),
        (self.check_qwds + self.cure_qwds, 'check', 'check_disease'),
        (self.prevent_qwds, 'disease', 'disease_prevent'),
        (self.lasttime_qwds, 'disease', 'disease_lasttime'),
        (self.cureway_qwds, 'disease', 'disease_cureway'),
        (self.cureprob_qwds, 'disease', 'disease_cureprob'),
        (self.easyget_qwds, 'disease', 'disease_easyget'),
    ]

    question_types = []
    for keywords, entity_type, label in rules:
        if not (self.check_words(keywords, question) and entity_type in types):
            continue
        if isinstance(label, tuple):
            denied_label, plain_label = label
            label = denied_label if self.check_words(self.deny_words, question) else plain_label
        question_types.append(label)

    # Nothing specific was asked: fall back to describing the disease, or to
    # listing the diseases behind the symptom.
    if not question_types:
        if 'disease' in types:
            question_types = ['disease_desc']
        elif 'symptom' in types:
            question_types = ['symptom_disease']

    # Bundle the matched entities and the classification into one dict.
    data = {'args': medical_dict, 'question_types': question_types}
    logger.debug("data info: %s" % data)
    return data