def query(entity, attr):
    """Look up *attr* for *entity* in the Baidu Baike infobox.

    Returns the attribute value on success, otherwise ``attr + "::找不到"``
    ("not found").  Python 2 code (``dict.has_key`` / ``str.decode``).
    """
    # Fetch the Baike page for the entity and locate its infobox table.
    soup = To.get_html_baidu("http://baike.baidu.com/item/" + entity)
    basicInfo_block = soup.find(class_='basic-info cmn-clearfix')
    if basicInfo_block == None:
        # print 'info None'
        return attr + "::找不到"
    else:
        info = get_info(basicInfo_block)
        # for i in info:
        #     print i
        #     print info[i]
        #     print '-----------'
        if info.has_key(attr.decode('utf8')):
            # print 'has key'+attr.decode('utf8')
            return info[attr.decode('utf8')]
        else:
            # Attribute not present verbatim: map it through the synonym
            # dictionary and retry once.
            attr_list = T.load_baikeattr_name(
                os.path.dirname(os.path.split(os.path.realpath(__file__))[0])
                + '/resources/Attribute_name.txt')
            attr = T.load_synonyms_word_inattr(
                attr,
                os.path.dirname(os.path.split(os.path.realpath(__file__))[0])
                + '/resources/SynonDic.txt', attr_list)
            if info.has_key(attr.decode('utf8')):
                return info[attr.decode('utf8')]
            else:
                return attr + "::找不到"
def query(entity, attr):
    """Query the Baidu Baike infobox of *entity* for *attr*.

    Returns the attribute value when found.  Otherwise returns a log
    string ending in "-找不到" describing what could not be resolved.
    When the literal attribute name misses, it is mapped through the
    synonym dictionary and the lookup is retried once.
    """
    entity_uri = 'http://baike.baidu.com/item/' + entity
    result = '查询百科列表实体:' + entity_uri + '\n'

    page = To.get_html_baidu(entity_uri)
    infobox = page.find(class_='basic-info cmn-clearfix')
    if infobox == None:
        # The page carries no infobox at all.
        return result + entity + "-找不到\n"

    info = get_info(infobox)
    if attr in info:
        return info[attr]

    # Literal attribute missing: translate it via the synonym dictionary.
    result += '属性' + attr + '-找不到\n'
    resource_root = os.path.dirname(
        os.path.split(os.path.realpath(__file__))[0])
    known_attrs = T.load_baikeattr_name(
        resource_root + '/resources/Attribute_name.txt')
    attr = T.load_synonyms_word_inattr(
        attr, resource_root + '/resources/SynonDic.txt', known_attrs)
    if attr in info:
        return info[attr]
    return result + '同义属性' + attr + '-找不到\n'
def _get_key_sentence(self, contents, query_cut): """ 获得关键语句作为答案 :param contents: 句子集合 :param query_cut: 问句提取关键词 :return: """ # 一个句子内有更高的高频词,说明句子的重要性更棒棒 split_result = [] # 分词结果 TF = {} IDF = {} TF_IDF = {} for s in contents: word_list = TextProcess.cut(s) word_list = list( set([word for word in word_list if word not in self.stop_word])) split_result.append(word_list) for word in word_list: TF[word] = TF.get(word, 0) + 1 for word in set(word_list): IDF[word] = IDF.get(word, 0) + 1 # 含该词的句子数,而不是出现的次数 for k in TF: TF[k] = TF[k] / len(TF) IDF[k] = math.log(len(contents) / IDF[k]) TF_IDF[k] = TF[k] * IDF[k] topic_word = sorted(TF_IDF, key=lambda k: TF_IDF[k], reverse=True) topic_word = topic_word[:self.topic] # print("Query:", query_cut) # print("Topic:", topic_word) # 得分 词的重要性是(用tf或tf-idf衡量)/句子长度 score = [] for i, word_list in enumerate(split_result): s = 0. if len(word_list) <= 1 or (len(word_list) == 2 and word_list[1] == " "): # 只有一个词或者一个词加空格不太可能是答案 continue # print("sentence:{}\nwortcut:{}".format(contents[i], word_list)) for word in word_list: w = 0 if word in query_cut: # print("Word {} in query".format(word)) w += 0.5 if word in topic_word: # print("Word {} in topic".format(word)) w += 0.5 s += TF_IDF[word] * w # s = s / len(word_list) score.append((i, s)) # print("Score:{:.5f}".format(s)) # print("-------------------------------------") score = sorted(score, key=lambda x: x[1], reverse=True) result = [] if len(score) > self.n: score = score[:self.n] for pair in score: result.append(contents[pair[0]]) return result
def _get_key_sentence(self, contents):
    """Pick the key sentences out of *contents* as the answer.

    Sentences are scored by the summed TF-IDF of their words, then
    weighted by the share of pure-English tokens (answers here are mostly
    code and commands); sentences with no English token are dropped.
    Returns up to ``self.n`` sentences, best first.

    :param contents: candidate sentences
    :return: list of the highest-scoring sentences
    """
    split_result = []  # per-sentence word lists (stop words removed)
    TF = {}
    IDF = {}
    for s in contents:
        word_list = TextProcess.cut(s)
        word_list = [
            word for word in word_list if word not in self.stop_word
        ]
        split_result.append(word_list)
        for word in word_list:
            TF[word] = TF.get(word, 0) + 1
        for word in set(word_list):
            # Number of sentences containing the word, not occurrence count.
            IDF[word] = IDF.get(word, 0) + 1
    for k in TF:
        # NOTE(review): TF is normalised by vocabulary size, not token count.
        TF[k] = TF[k] / len(TF)
        IDF[k] = math.log(len(contents) / IDF[k])
    score = []
    for i, word_list in enumerate(split_result):
        s = 0.
        if len(word_list) <= 1 or (len(word_list) == 2
                                   and word_list[1] == " "):
            # A lone word (or word plus a space) is unlikely to be an answer.
            continue
        # Count pure-English tokens: answers are mostly code/commands, so a
        # higher English share raises the sentence's weight.
        alpha_num = 0.
        for word in word_list:
            if self._judge_pure_english(word):
                alpha_num += 1
            s += TF[word] * IDF[word]
        if alpha_num == 0:
            s = 0  # no English token at all — certainly not wanted
        else:
            s = s / len(word_list)
            ratio = alpha_num / len(word_list)  # hoisted common subexpression
            if ratio > 0.5:
                s = s * (1. + ratio)
            else:
                s = s * ratio
        score.append((i, s))
    score = sorted(score, key=lambda x: x[1], reverse=True)
    result = []
    if len(score) > self.n:
        score = score[:self.n]
    for pair in score:
        # FIX: removed leftover debug print of each selected sentence.
        result.append(contents[pair[0]])
    return result
def _similarity(self, t1, t2):
    """Embedding-based similarity of two texts, computed locally because
    the Baidu similarity API is rate-limited (QPS).

    :param t1: first text
    :param t2: second text
    :return: ``(1, score)`` with the cosine score mapped into [0, 1]
    """
    def tokenize(text):
        # Cut the text and drop stop words before embedding.
        return [w for w in TextProcess.cut(text) if w not in self.stop_word]

    vec_a = self.sentence_emb(tokenize(t1))
    vec_b = self.sentence_emb(tokenize(t2))
    raw = self.cos(vec_a, vec_b)
    # score = self.vector_similarity(t1_list, t2_list)
    # Map cosine from [-1, 1] into [0, 1].
    normalised = raw * 0.5 + 0.5
    return 1, normalised
def qa(question):
    """Answer *question*: AIML template match first, then Baike infobox
    lookup, then generic Baidu/Bing summary search.

    Results are printed (and logged); nothing is returned.
    """
    # Initialise the jieba tokenizer.
    T.jieba_initialize()
    # Corpus working directory (the chdir is disabled).
    mybot_path = './'
    # os.chdir(mybot_path)
    # Load the AIML rule files relative to this module.
    mybot = aiml.Kernel()
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/std-startup.xml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/bye.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/tools.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/bad.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/funny.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/OrdinaryQuestion.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/Common conversation.aiml")
    # mybot.respond('Load Doc Snake')
    # (Baike attribute list is loaded elsewhere.)
    input_message = question
    # Reject over-long (>60 chars) and empty questions.
    if len(input_message) > 60:
        print(mybot.respond("句子长度过长"))
    elif input_message.strip() == '':
        print(mybot.respond("无"))
    print(input_message)
    message = T.wordSegment(input_message)  # strips punctuation
    print('word Seg:' + message)
    print('词性:')
    words = T.postag(input_message)
    if message == 'q':
        exit()
    else:
        # AIML template matching first.
        response = mybot.respond(message)
        print("=======")
        print(response)
        print("=======")
        if response == "":
            ans = mybot.respond('找不到答案')
            # print(robot_id + ":" + ans)
            print("{0}:{1}".format(robot_id, ans))
        # A '#'-prefixed response is a directive, not a literal answer.
        elif response[0] == '#':
            # Baike entity/attribute lookup.
            if response.__contains__("searchbaike"):
                print("search from baike")
                print(response)
                res = response.split(':')
                # entity
                entity = str(res[1]).replace(" ", "")
                # attribute
                attr = str(res[2]).replace(" ", "")
                print(entity + '<---->' + attr)
                ans = baike.query(entity, attr)
                # A list result means the infobox lookup hit an answer.
                if type(ans) == list:
                    print("{0}:{1}".format(robot_id, QAT.ptranswer(ans, False)))
                elif ans.decode('utf-8').__contains__(u'::找不到'):
                    # Fall back to Baidu + Bing summary search.
                    print("通用搜索")
                    # NOTE(review): `log` here vs `logs` below — confirm
                    # which logger object is actually intended.
                    log.info("通用搜索")
                    ans = search_summary.kwquery(input_message)
            # No template matched: generic query.
            elif response.__contains__("NoMatchingTemplate"):
                print("NoMatchingTemplate")
                ans = search_summary.kwquery(input_message)
            # NOTE(review): `ans` may be unbound here if neither directive
            # branch above matched — confirm every '#' response is covered.
            if len(ans) == 0:
                ans = mybot.respond('找不到答案')
                logs.info("{0}:{1}".format(robot_id, ans))
            elif len(ans) > 1:
                logs.info(sys.exc_info())
                logs.info("不确定候选答案")
                logs.info("[{0}][func:{1}][line:{2}]:不确定候选答案".format(
                    sys._getframe().f_code.co_filename,
                    sys._getframe().f_code.co_name,
                    sys._getframe().f_lineno))
                print(robot_id + ': ')
                for a in ans:
                    print(a)
                    # print(a.encode("utf8"))
            else:
                print('{0}:{1}'.format(robot_id, ans[0]))
        # Plain template match: echo the template's answer.
        else:
            print("{}: {}".format(robot_id, response))
def main():
    """Interactive console loop for the AIML chatbot (Python 2:
    ``raw_input`` and byte-string ``.decode``)."""
    # Initialise the jieba tokenizer.
    T.jieba_initialize()
    # Switch to the corpus working directory.
    mybot_path = './'
    os.chdir(mybot_path)
    # Load the AIML rule files.
    mybot = aiml.Kernel()
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/std-startup.xml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/abc.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/bot_profile.aiml")
    # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/general.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/infor.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/main.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/new07281.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/salutations.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/virus0727.aiml")
    # (Several more .aiml loads — zextra_weibao, bye, tools, bad, funny,
    # OrdinaryQuestion, Common conversation — are disabled here, as is
    # `mybot.respond('Load Doc Snake')` which loaded the Baike attribute
    # list, and a decorative ASCII-art startup banner.  See VCS history.)
    print('泰康小康:你好,我是小康。╭(╯^╰)╮')
    # Question-handling loop.
    while True:
        input_message = raw_input("您想问什么 >> ")  # Python 2 only
        # Reject over-long (>60 chars) and empty questions.
        if len(input_message) > 60:
            print(mybot.respond("句子长度过长"))
            continue
        elif input_message.strip() == '':
            print(mybot.respond("无"))
            continue
        # Pre-process the question with the Tools helpers.
        print(input_message)
        message = T.wordSegment(input_message)  # strips punctuation
        print('word Seg:' + message)
        # print('词性:')
        words = T.postag(input_message)
        # 'q' quits the loop.
        if message == 'q':
            exit()
        else:
            # AIML template matching comes first.
            response = mybot.respond(message)
            print("=======")
            print(response)
            # Python 2: response is a UTF-8 byte string.
            print(len(response.decode('utf8')))
            print("=======")
def kwquery(query):
    """Keyword search for *query* over Baidu, with Bing as fallback.

    Extracts noun keywords, scrapes the first 10 Baidu result snippets
    (knowledge graph, poetry, calendar, calculator, Zhidao, Baike), then
    tries Bing's knowledge box / Bing Wangdian.

    :return: ``(answer, log)`` — *answer* is a list of answer strings, or
        the raw snippet texts when nothing matched; *log* is a readable
        trace of every step taken.
    """
    # Tokenize, drop stop words, keep nouns as keywords.
    log = '提取关键词:'
    keywords = []
    words = T.postag(query)
    for k in words:
        # Nouns only.
        if k.flag.__contains__("n"):
            keywords.append(k.word)
            log += k.word
    log += '#' * 50 + '\n'
    answer = []
    text = []
    # flag becomes 1 as soon as an answer is found.
    flag = 0
    # Scrape the first 10 Baidu result snippets.
    log += '百度前10条的摘要'
    url = 'https://www.baidu.com/s?wd=' + quote(query)
    log += url + '#' * 50 + '\n'
    soup_baidu = To.get_html_baidu(url)
    for i in range(1, 11):
        if soup_baidu == None:
            break
        results = soup_baidu.find(id=i)
        if results == None:
            log += '百度摘要找不到答案' + '#' * 50 + '\n'
            break
        log += '第' + str(i) + '条摘要:\n'
        log += clean_str(results.get_text()) + '#' * 50 + '\n'
        # A 'mu' attribute marks a knowledge-graph snippet: direct hit.
        # if 'mu' in results.attrs and i == 1:
        if 'mu' in results.attrs and results.find(
                class_='op_exactqa_s_answer') != None:
            r = results.find(class_='op_exactqa_s_answer')
            log += '第一条百度摘要为百度搜索根据知识图谱直接匹配出的内容,优先查找\n'
            log += '百度知识图谱找到答案' + '#' * 50 + '\n'
            answer.append(r.get_text().strip())
            flag = 1
            break
        # Classical-poetry box.
        if 'mu' in results.attrs and i == 1 and results.find(
                class_="op_exactqa_detail_s_answer") != None:
            r = results.find(class_="op_exactqa_detail_s_answer")
            log += '百度诗词找到答案' + '#' * 50 + '\n'
            answer.append(r.get_text().strip())
            flag = 1
            break
        # Calendar & date boxes.
        if 'mu' in results.attrs and i == 1 and results.attrs[
                'mu'].__contains__('http://open.baidu.com/calendar'):
            r = results.find(class_="op-calendar-content")
            if r != None:
                log += '百度万年历找到答案' + '#' * 50 + '\n'
                answer.append(r.get_text().strip().replace("\n", "").replace(
                    " ", ""))
                flag = 1
                break
        if 'tpl' in results.attrs and i == 1 and results.attrs[
                'tpl'].__contains__('calendar_new'):
            r = results.attrs['fk'].replace("6018_", "")
            if r != None:
                log += '百度万年历新版找到答案' + '#' * 50 + '\n'
                answer.append(r)
                flag = 1
                break
        # Calculator box.
        if 'mu' in results.attrs and i == 1 and results.attrs[
                'mu'].__contains__(
                    'http://open.baidu.com/static/calculator/calculator.html'):
            r = results.find(class_="op_new_val_screen_result")
            if r != None:
                log += '计算器找到答案' + '#' * 50 + '\n'
                answer.append(r.get_text().strip())
                flag = 1
                break
        # Baidu Zhidao "best answer" box.
        # if 'mu' in results.attrs and i == 1:
        if 'mu' in results.attrs and results.find(
                class_='op_best_answer_question_link') != None:
            r = results.find(class_='op_best_answer_question_link')
            url = r['href']
            zhidao_soup = To.get_html_zhidao(url)
            r = zhidao_soup.find(class_='bd answer').find('pre')
            if r == None:
                r = zhidao_soup.find(class_='bd answer').find(
                    class_='line content')
            log += '百度知道best answer找到答案' + '#' * 50 + '\n'
            answer.append(r.get_text())
            flag = 1
            break
        if results.find("h3") != None:
            # Ordinary result linking into Baidu Zhidao.
            if results.find("h3").find("a").get_text().__contains__(u"百度知道"):
                url = results.find("h3").find("a")['href']
                if url == None:
                    log += '百度知道找不到答案' + '#' * 50 + '\n'
                    continue
                else:
                    log += '百度知道找到答案' + '#' * 50 + '\n'
                    zhidao_soup = To.get_html_zhidao(url)
                    r = zhidao_soup.find(class_='bd answer')
                    if r == None:
                        continue
                    else:
                        r = r.find('pre')
                        if r == None:
                            r = zhidao_soup.find(class_='bd answer').find(
                                class_='line content')
                        answer.append(r.get_text().strip())
                    flag = 1
                    break
            # Ordinary result linking into Baidu Baike.
            if results.find("h3").find("a").get_text().__contains__(u"百度百科"):
                url = results.find("h3").find("a")['href']
                if url == None:
                    log += '百度百科找不到答案' + '#' * 50 + '\n'
                    continue
                else:
                    log += '百度百科找到答案' + '#' * 50 + '\n'
                    baike_soup = To.get_html_baike(url)
                    r = baike_soup.find(class_='lemma-summary')
                    if r == None:
                        continue
                    else:
                        r = r.get_text().replace("\n", "").strip()
                        answer.append(r)
                    flag = 1
                    break
        # Keep the snippet text (with its link) as a last-resort answer.
        text.append(
            clean_str(results.get_text()) + "(" + results.find("a")['href'] +
            ")")
    if flag == 1:
        return (answer, log)
    log += '百度前十条摘要不包含百度知道、百度百科内容,或相关内容中找不到答案' + '#' * 50 + '\n'
    # Fall back to Bing.
    log += '通过Bing查找\n'
    url = 'https://www.bing.com/search?q=' + quote(query)
    log += url + '\n'
    log += '#' * 50
    soup_bing = To.get_html_bing(url)
    # Is the answer in Bing's knowledge box?
    # bingbaike = soup_bing.find(class_="b_xlText b_emphText")
    bingbaike = soup_bing.find(class_="bm_box")
    if bingbaike != None:
        if bingbaike.find_all(class_="b_vList")[1] != None:
            if bingbaike.find_all(class_="b_vList")[1].find("li") != None:
                log += 'Bing百科找到答案' + '#' * 50 + '\n'
                flag = 1
                answer.append(bingbaike.get_text())
                return (answer, log)
    else:
        log += 'Bing百科找不到答案' + '#' * 50 + '\n'
        results = soup_bing.find(id="b_results")
        bing_list = results.find_all('li')
        for bl in bing_list:
            temp = bl.get_text()
            # Bing Wangdian (必应网典) results.
            if temp.__contains__(u" - 必应网典"):
                url = bl.find("h2").find("a")['href']
                if url == None:
                    log += 'Bing网典找不到答案' + '#' * 50 + '\n'
                    continue
                else:
                    log += 'Bing网典找到答案' + '#' * 50 + '\n'
                    bingwd_soup = To.get_html_bingwd(url)
                    r = bingwd_soup.find(class_='bk_card_desc').find("p")
                    if r == None:
                        continue
                    else:
                        r = r.get_text().replace("\n", "").strip()
                        answer.append(r)
                        flag = 1
                        break
        if flag == 1:
            return (answer, log)
    log += '没有找到答案,返回百度前十条摘要内容\n'
    # (A large commented-out block lived here: sentence splitting on
    # punctuation, keyword filtering of sentences, person-name ("nr" tag)
    # frequency ranking, and returning the top-3 candidates.  It is not on
    # the active path; see version history for the full text.)
    answer = text
    return (answer, log)
def main():
    """Screenshot-quiz answering loop.

    Each round: OCR the question from the current screen, rank the
    candidate answers by Baidu hit counts, then run the question through
    the chatbot pipeline (AIML template -> Baike infobox -> summary
    search).  Loops until the user enters ESC.
    """
    args = parse_args()
    timeout = args.timeout
    # Initialise the jieba tokenizer.
    T.jieba_initialize()
    # Switch to the corpus working directory.
    mybot_path = './'
    os.chdir(mybot_path)
    mybot = Kernel()
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/std-startup.xml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/bye.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/tools.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/bad.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/funny.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/OrdinaryQuestion.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/Common conversation.aiml")
    if enable_chrome:
        # Shared question string plus a daemon browser process that
        # searches whatever question is stored in it.
        question_obj = Value(ctypes.c_char_p, "".encode("utf-8"))
        browser_daemon = multiprocessing.Process(target=run_browser,
                                                 args=(question_obj, ))
        browser_daemon.daemon = True
        browser_daemon.start()

    def __inner_job():
        # One round: screenshot -> OCR -> count ranking -> chatbot.
        start = time.time()
        text_binary = analyze_current_screen_text(
            directory=data_directory, compress_level=image_compress_level[0])
        keywords = get_text_from_image(image_data=text_binary, )
        if not keywords:
            print("text not recognize")
            return
        true_flag, question, answers = parse_question_and_answer(keywords)
        # questions=question.decode('unicode-escape')
        # new_ans=[]
        # for ans in answers:
        #     new_ans.append(ans.decode('unicode-escape'))
        print('-' * 72)
        print(question)
        print('-' * 72)
        print("\n".join(answers))
        # Notify the helper browser of the new question.
        if enable_chrome:
            with question_obj.get_lock():
                question_obj.value = question
            keyboard.press("space")
        # Rank candidate answers by Baidu result counts.
        search_question = pre_process_question(question)
        summary = baidu_count(search_question, answers, timeout=timeout)
        summary_li = sorted(summary.items(),
                            key=operator.itemgetter(1),
                            reverse=True)
        data = [("选项", "同比")]
        for a, w in summary_li:
            data.append((a, w))
        table = AsciiTable(data)
        print(table.table)
        print("*" * 72)
        # Negatively phrased questions invert the recommendation.
        if true_flag:
            print("肯定回答(**): ", summary_li[0][0])
            print("否定回答( ): ", summary_li[-1][0])
        else:
            print("肯定回答( ): ", summary_li[0][0])
            print("否定回答(**): ", summary_li[-1][0])
        print("*" * 72)
        ##############################################################
        # Chatbot pipeline on the recognised question.
        input_message = question
        if len(input_message) > 60:
            print(mybot.respond("句子长度过长"))
        elif input_message.strip() == '':
            print(mybot.respond("无"))
        # print(input_message)
        message = T.wordSegment(input_message)  # strips punctuation
        # print('word Seg:' + message)
        # print('词性:')
        words = T.postag(input_message)
        if message == 'q':
            exit()
        else:
            response = mybot.respond(message)
            # print("=======")
            # print(response)
            # print("=======")
            if response == "":
                ans = mybot.respond('找不到答案')
                print('Eric:' + ans)
            # Baike lookup directives.
            elif response[0] == '#':
                # Entity/attribute template matched.
                if response.__contains__("searchbaike"):
                    # print("searchbaike")
                    # print(response)
                    res = response.split(':')
                    # entity
                    entity = str(res[1]).replace(" ", "")
                    # attribute
                    attr = str(res[2]).replace(" ", "")
                    # print(entity + '<---->' + attr)
                    ans = baike.query(entity, attr)
                    # A list result means the infobox hit an answer.
                    if type(ans) == list:
                        print('Eric:' + QAT.ptranswer(ans, False))
                    elif ans.decode('utf-8').__contains__(u'::找不到'):
                        # Baidu + Bing summary fallback.
                        print("通用搜索")
                        ans = search_summary.kwquery(input_message)
                # No template matched: generic query over the answers.
                elif response.__contains__("NoMatchingTemplate"):
                    # print("NoMatchingTemplate")
                    ans = search_summary.kwquery(input_message, answers)
                if len(ans) == 0:
                    print('Eric:' + '找不到答案')
                elif len(ans) > 1:
                    print("不确定候选答案")
                    print('Eric: ')
                    for a in ans:
                        print(a)
                else:
                    print('Eric:' + ans[0])
            # Plain template answer.
            else:
                print('Eric:' + response)
        end = time.time()
        print("use {0} 秒".format(end - start))
        save_screen(directory=data_directory)

    while True:
        print("""
    请在答题开始前就运行程序,
    答题开始的时候按Enter预测答案
        """)
        enter = input("按Enter键开始,按ESC键退出...")
        print(enter)
        if enter == chr(27):
            break
        try:
            __inner_job()
        except Exception as e:
            print(str(e))
    print("欢迎下次使用")
def __inner_job():
    """One quiz round: OCR the on-screen question, rank the candidate
    answers by Baidu hit counts, then run the question through the
    chatbot pipeline (AIML -> Baike -> summary search).

    Relies on module-level state: ``data_directory``,
    ``image_compress_level``, ``enable_chrome``/``question_obj``,
    ``timeout`` and ``mybot``.
    """
    start = time.time()
    text_binary = analyze_current_screen_text(
        directory=data_directory, compress_level=image_compress_level[0])
    keywords = get_text_from_image(image_data=text_binary, )
    if not keywords:
        print("text not recognize")
        return
    true_flag, question, answers = parse_question_and_answer(keywords)
    # questions=question.decode('unicode-escape')
    # new_ans=[]
    # for ans in answers:
    #     new_ans.append(ans.decode('unicode-escape'))
    print('-' * 72)
    print(question)
    print('-' * 72)
    print("\n".join(answers))
    # Notify the helper browser of the new question.
    if enable_chrome:
        with question_obj.get_lock():
            question_obj.value = question
        keyboard.press("space")
    # Rank candidate answers by Baidu result counts.
    search_question = pre_process_question(question)
    summary = baidu_count(search_question, answers, timeout=timeout)
    summary_li = sorted(summary.items(),
                        key=operator.itemgetter(1),
                        reverse=True)
    data = [("选项", "同比")]
    for a, w in summary_li:
        data.append((a, w))
    table = AsciiTable(data)
    print(table.table)
    print("*" * 72)
    # Negatively phrased questions invert the recommendation.
    if true_flag:
        print("肯定回答(**): ", summary_li[0][0])
        print("否定回答( ): ", summary_li[-1][0])
    else:
        print("肯定回答( ): ", summary_li[0][0])
        print("否定回答(**): ", summary_li[-1][0])
    print("*" * 72)
    ##############################################################
    # Chatbot pipeline on the recognised question.
    input_message = question
    if len(input_message) > 60:
        print(mybot.respond("句子长度过长"))
    elif input_message.strip() == '':
        print(mybot.respond("无"))
    # print(input_message)
    message = T.wordSegment(input_message)  # strips punctuation
    # print('word Seg:' + message)
    # print('词性:')
    words = T.postag(input_message)
    if message == 'q':
        exit()
    else:
        response = mybot.respond(message)
        # print("=======")
        # print(response)
        # print("=======")
        if response == "":
            ans = mybot.respond('找不到答案')
            print('Eric:' + ans)
        # Baike lookup directives.
        elif response[0] == '#':
            # Entity/attribute template matched.
            if response.__contains__("searchbaike"):
                # print("searchbaike")
                # print(response)
                res = response.split(':')
                # entity
                entity = str(res[1]).replace(" ", "")
                # attribute
                attr = str(res[2]).replace(" ", "")
                # print(entity + '<---->' + attr)
                ans = baike.query(entity, attr)
                # A list result means the infobox hit an answer.
                if type(ans) == list:
                    print('Eric:' + QAT.ptranswer(ans, False))
                elif ans.decode('utf-8').__contains__(u'::找不到'):
                    # Baidu + Bing summary fallback.
                    print("通用搜索")
                    ans = search_summary.kwquery(input_message)
            # No template matched: generic query over the answers.
            elif response.__contains__("NoMatchingTemplate"):
                # print("NoMatchingTemplate")
                ans = search_summary.kwquery(input_message, answers)
            if len(ans) == 0:
                print('Eric:' + '找不到答案')
            elif len(ans) > 1:
                print("不确定候选答案")
                print('Eric: ')
                for a in ans:
                    print(a)
            else:
                print('Eric:' + ans[0])
        # Plain template answer.
        else:
            print('Eric:' + response)
    end = time.time()
    print("use {0} 秒".format(end - start))
    save_screen(directory=data_directory)
import os    # FIX: used below (os.path...) but was never imported
import time

import aiml  # FIX: used below (aiml.Kernel) but was never imported

from Tools import TextProcess as T
from QuestionParser import aiml_parse
from AnswerGeneration import aiml_generate

# Module-level AIML kernel, loaded once with every rule file under
# resources/ next to this module.
mybot = aiml.Kernel()
mybot.learn(os.path.split(os.path.realpath(__file__))[0] +
            "/resources/std-startup.xml")
mybot.learn(os.path.split(os.path.realpath(__file__))[0] +
            "/resources/bye.aiml")
mybot.learn(os.path.split(os.path.realpath(__file__))[0] +
            "/resources/tools.aiml")
mybot.learn(os.path.split(os.path.realpath(__file__))[0] +
            "/resources/bad.aiml")
mybot.learn(os.path.split(os.path.realpath(__file__))[0] +
            "/resources/funny.aiml")
mybot.learn(os.path.split(os.path.realpath(__file__))[0] +
            "/resources/OrdinaryQuestion.aiml")
mybot.learn(os.path.split(os.path.realpath(__file__))[0] +
            "/resources/Common conversation.aiml")
T.jieba_initialize()


def qa(question, T, mybot, QAT):
    """Parse *question* with the AIML rules and generate an answer.

    :param question: raw user question
    :param T: text-processing toolbox (shadows the module-level ``T``)
    :param mybot: an ``aiml.Kernel`` with the rules loaded
    :param QAT: answer-formatting helper passed through to generation
    :return: the generated answer
    """
    q_parsed = aiml_parse.aiml_question_parsing(question, T, mybot)
    ans = aiml_generate.aiml_answer_generate(q_parsed, mybot, QAT, question)
    return ans


def code_format(s):
    """Best-effort UTF-8 encode of *s*; return it unchanged if encoding
    is not possible (e.g. already bytes)."""
    try:
        s = s.encode('utf8')
    except Exception:
        # FIX: was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt.
        s = s
    return s
def find_ans(question=''):
    """Find an answer for *question*, trying sources in priority order:
    local knowledge base, AIML template / Baike infobox, then Baidu/Bing
    summary search.

    :param question: user question (rejected if empty or >60 chars)
    :return: ``(answer, log)`` — the answer text and a numbered trace of
        the sources consulted
    """
    # `raw` is a module-level re-entrancy guard for the "X是什么/什么是X"
    # rewrite recursion below.
    global raw
    log = '答案来源:'
    cnt = 0
    input_message = question
    if len(input_message) > 60:
        return (mybot.respond("句子长度过长"), log)
    elif input_message.strip() == '':
        return (mybot.respond("无"), log)
    # 1) Local knowledge base (code in the QA1 folder).
    cnt += 1
    log += str(cnt) + ':检索本地知识库\n'
    old_client = Client()
    ans, log_tmp = old_client.qa_find_ans(input_message)
    log += log_tmp
    if ans != "不知道~":
        return (clean_str(ans), log)
    log += '本地知识库找不到答案或答案不确定\n'
    # 2) AIML template matching.
    message = T.wordSegment(input_message)
    response = mybot.respond(message)
    # log += 'AIML模板返回内容:' + response + '\n'
    if response == "":
        ans = mybot.respond('不知道~')
        return (ans, log)
    elif response[0] == '#':
        # A '#'-prefixed response is a directive, not a literal answer.
        cnt += 1
        log += str(cnt) + ':匹配不到问句模板\n'
        if response.__contains__("searchbaike"):
            # Entity/attribute template: try the Baike infobox.
            res = response.split(':')
            entity = str(res[1]).replace(" ","")
            attr = str(res[2]).replace(" ","")
            cnt += 1
            log += str(cnt) + ':匹配到实体属性模板,' + '实体:' + entity + ' 属性:' + attr + '\n'
            ans = baike.query(entity, attr)
            if type(ans) == list:
                cnt += 1
                log += str(cnt) + ':来自百科Infobox\n'
                return (QAT.ptranswer(ans,False), log)
            elif '-找不到' in ans:
                # Infobox miss: fall through to summary search.
                cnt += 1
                log += str(cnt) + ':百科Infobox查询不到:' + ans + '\n'
                cnt += 1
                log += str(cnt) + '来自搜索\n'
                (ans, tmplog) = search_summary.kwquery(input_message)
                log += tmplog
        elif response.__contains__("NoMatchingTemplate"):
            # No entity/relation template matched: summary search.
            cnt += 1
            log += str(cnt) + ':匹配不到实体关系模板\n'
            cnt += 1
            log += str(cnt) + ':来自搜索\n'
            (ans,tmplog) = search_summary.kwquery(input_message)
            log += tmplog
        if len(ans) == 0:
            cnt += 1
            log += str(cnt) + ':未查询到答案\n'
            return (mybot.respond('不知道~'), log)
        elif len(ans) >1:
            # Multiple candidates: return snippets as "news", or for
            # definition questions recurse once with rewritten phrasing.
            cnt += 1
            log += str(cnt) + ':返回百度摘要\n'
            if raw == False and ('什么是' in question or '是什么' in question):
                result = "给你找到几篇新闻:"
                for a in ans:
                    result += a + '\n'
                return (result, log)
            else:
                # NOTE(review): sets the global guard, then retries both
                # "X是什么" and "什么是X" phrasings — confirm intended order.
                raw = False
                question = question.replace("是什么", "").replace("什么是", "")
                ans2, log2 = find_ans(question + "是什么")
                ans1, log1 = find_ans("什么是" + question)
                if "给你找到几篇新闻" not in ans1:
                    return (ans1, log1)
                else:
                    return (ans2, log2)
        else:
            return (clean_str(ans[0]), log)
    # 3) Direct template answer.
    else:
        cnt += 1
        log += str(cnt) + ':匹配问句模板\n'
        return (clean_str(response), log)
def respond(self, text):
    """Answer *text* from the local QA-pair database.

    Noun keywords (verbs as fallback) are extracted from the question,
    candidate rows containing every keyword are fetched, and candidates
    whose semantic similarity to *text* exceeds 0.9 are returned —
    including all ties for the best score.

    :param text: the user's question
    :return: list of ``(matched_question, answer)`` tuples; ``[]`` when
        no keyword or no close-enough match exists
    :raises Exception: when the similarity backend reports failure
    """
    question = list(TextProcess.postag(text))  # (word, tag) pairs
    keywords = []
    # print(question)
    logger.global_logger.info("query: {} cut:{}".format(text, question))
    for word, tag in question:
        # Drop stop words.
        if word in self.stop_words:
            continue
        if 'n' not in tag or "un" == tag:  # and 'v' not in tag:
            # Keep nouns only so keywords stay close to the described
            # object; semantic matching refines the result later.
            continue
        keywords.append(word)
    if len(keywords) == 0:
        # No noun at all: fall back to verbs.
        for word, tag in question:
            if word in self.stop_words:
                continue
            if 'v' not in tag:
                continue
            keywords.append(word)
    # FIX: build a parameterised query instead of interpolating keywords
    # into the SQL text (SQL injection + quoting bugs on user input).
    # instr() is faster here than LIKE '%…%'.
    condition = [" instr(QUESTION, ?) > 0 " for _ in keywords]
    if len(condition) == 0:
        return []
    sql = "select QUESTION ,ANSWER from qa_pair where {}".format(
        "and".join(condition))
    logger.global_logger.info("going to execute this sql: {}".format(sql))
    result = self.cursor.execute(sql, tuple(keywords))  # rows of (q, a)
    res = []
    # Score every candidate question against the query.
    for row in result:
        q = row[0]
        a = row[1]
        state, sim = self._similarity(text, q)
        # logger.global_logger.info("text:{} query:{} score:{}".format(text, q, sim))
        if state == 0:
            raise Exception("similarity Api Error.")
        elif sim > 0.9:
            res.append((q, a, sim))
    # Keep the best-scoring pair(s), including ties for first place.
    finall = []
    if len(res) > 0:
        ans = sorted(res, key=lambda x: x[2], reverse=True)
        score = -1
        for a in ans:
            if a[2] > score:
                score = a[2]
                logger.global_logger.info(
                    "[MATCH RESULT]{} match:{} score:{}".format(
                        text, a[0], score))
                finall.append((a[0], a[1]))
            elif a[2] == score:
                finall.append((a[0], a[1]))
    # else:
    #     finall.append(("", ""))
    return finall
def kwquery(query, answers):
    """Search Baidu (then Bing) for *query* and rank *answers* by how
    often each candidate appears in keyword-bearing snippet sentences.

    Python 2 code (print statements).

    :param query: the question text
    :param answers: candidate answer strings to rank
    :return: list of answer strings (direct hits, or the top-3 candidates
        by snippet frequency)
    """
    # Tokenize, drop stop words, keep nouns as keywords.
    keywords = []
    words = T.postag(query)
    for k in words:
        # Nouns only.
        if k.flag.__contains__("n"):
            # print k.flag
            # print k.word
            keywords.append(k.word)
    answer = []
    text = ''
    # flag becomes 1 once an answer is found.
    flag = 0
    # Scrape the first Baidu result snippets.
    soup_baidu = To.get_html_baidu('https://www.baidu.com/s?wd=' +
                                   parse.quote(query))
    for i in range(1, 10):
        if soup_baidu == None:
            break
        results = soup_baidu.find(id=i)
        if results == None:
            print "百度摘要找不到答案"
            break
        # print '============='
        # print results.attrs
        # print type(results.attrs)
        # print results['class']
        # A 'mu' attribute on the first result marks a knowledge-graph hit.
        if 'mu' in results.attrs and i == 1:
            # print results.attrs["mu"]
            r = results.find(class_='op_exactqa_s_answer')
            if r == None:
                print "百度知识图谱找不到答案"
            else:
                # print r.get_text()
                print "百度知识图谱找到答案"
                answer.append(r.get_text().strip())
                flag = 1
                break
        # Classical-poetry box.
        if 'mu' in results.attrs and i == 1:
            r = results.find(class_="op_exactqa_detail_s_answer")
            if r == None:
                print "百度诗词找不到答案"
            else:
                # print r.get_text()
                print "百度诗词找到答案"
                answer.append(r.get_text().strip())
                flag = 1
                break
        # Calendar & date boxes.
        if 'mu' in results.attrs and i == 1 and results.attrs[
                'mu'].__contains__('http://open.baidu.com/calendar'):
            r = results.find(class_="op-calendar-content")
            if r == None:
                print "百度万年历找不到答案"
            else:
                # print r.get_text()
                print "百度万年历找到答案"
                answer.append(r.get_text().strip().replace("\n", "").replace(
                    " ", ""))
                flag = 1
                break
        if 'tpl' in results.attrs and i == 1 and results.attrs[
                'tpl'].__contains__('calendar_new'):
            r = results.attrs['fk'].replace("6018_", "")
            print r
            if r == None:
                print "百度万年历新版找不到答案"
                # continue
            else:
                # print r.get_text()
                print "百度万年历新版找到答案"
                answer.append(r)
                flag = 1
                break
        # Calculator box.
        if 'mu' in results.attrs and i == 1 and results.attrs[
                'mu'].__contains__(
                    'http://open.baidu.com/static/calculator/calculator.html'):
            r = results.find(class_="op_new_val_screen_result")
            if r == None:
                print "计算器找不到答案"
                # continue
            else:
                # print r.get_text()
                print "计算器找到答案"
                answer.append(r.get_text().strip())
                flag = 1
                break
        # Baidu Zhidao best-answer box.
        if 'mu' in results.attrs and i == 1:
            r = results.find(class_='op_best_answer_question_link')
            if r == None:
                print "百度知道图谱找不到答案"
            else:
                print "百度知道图谱找到答案"
                url = r['href']
                zhidao_soup = To.get_html_zhidao(url)
                r = zhidao_soup.find(class_='bd answer').find('pre')
                if r == None:
                    r = zhidao_soup.find(class_='bd answer').find(
                        class_='line content')
                answer.append(r.get_text())
                flag = 1
                break
        if results.find("h3") != None:
            # Ordinary result linking into Baidu Zhidao (first two only).
            if results.find("h3").find("a").get_text().__contains__(
                    u"百度知道") and (i == 1 or i == 2):
                url = results.find("h3").find("a")['href']
                if url == None:
                    print "百度知道图谱找不到答案"
                    continue
                else:
                    print "百度知道图谱找到答案"
                    zhidao_soup = To.get_html_zhidao(url)
                    r = zhidao_soup.find(class_='bd answer')
                    if r == None:
                        continue
                    else:
                        r = r.find('pre')
                        if r == None:
                            r = zhidao_soup.find(class_='bd answer').find(
                                class_='line content')
                        answer.append(r.get_text().strip())
                    flag = 1
                    break
            # Ordinary result linking into Baidu Baike (first two only).
            if results.find("h3").find("a").get_text().__contains__(
                    u"百度百科") and (i == 1 or i == 2):
                url = results.find("h3").find("a")['href']
                if url == None:
                    print "百度百科找不到答案"
                    continue
                else:
                    print "百度百科找到答案"
                    baike_soup = To.get_html_baike(url)
                    r = baike_soup.find(class_='lemma-summary')
                    if r == None:
                        continue
                    else:
                        r = r.get_text().replace("\n", "").strip()
                        answer.append(r)
                    flag = 1
                    break
        # Accumulate raw snippet text for the frequency analysis below.
        text += results.get_text()
    if flag == 1:
        return answer
    # Fall back to Bing.
    soup_bing = To.get_html_bing('https://www.bing.com/search?q=' +
                                 parse.quote(query))
    # Is the answer in Bing's knowledge box?
    # bingbaike = soup_bing.find(class_="b_xlText b_emphText")
    bingbaike = soup_bing.find(class_="bm_box")
    if bingbaike != None:
        if bingbaike.find_all(class_="b_vList")[1] != None:
            if bingbaike.find_all(class_="b_vList")[1].find("li") != None:
                print "Bing知识图谱找到答案"
                flag = 1
                answer.append(bingbaike.get_text())
                # print "====="
                # print answer
                # print "====="
                return answer
    else:
        print "Bing知识图谱找不到答案"
        results = soup_bing.find(id="b_results")
        bing_list = results.find_all('li')
        for bl in bing_list:
            temp = bl.get_text()
            # Bing Wangdian (必应网典) results.
            if temp.__contains__(u" - 必应网典"):
                print "查找Bing网典"
                url = bl.find("h2").find("a")['href']
                if url == None:
                    print "Bing网典找不到答案"
                    continue
                else:
                    print "Bing网典找到答案"
                    bingwd_soup = To.get_html_bingwd(url)
                    r = bingwd_soup.find(class_='bk_card_desc').find("p")
                    if r == None:
                        continue
                    else:
                        r = r.get_text().replace("\n", "").strip()
                        answer.append(r)
                        flag = 1
                        break
        if flag == 1:
            return answer
        text += results.get_text()
    # print text
    # Neither knowledge graph answered: analyse the collected snippets.
    if flag == 0:
        # Split the snippet text into sentences on punctuation.
        cutlist = [u"。", u"?", u".", u"_", u"-", u":", u"!", u"?"]
        temp = ''
        sentences = []
        for i in range(0, len(text)):
            if text[i] in cutlist:
                if temp == '':
                    continue
                else:
                    # print temp
                    sentences.append(temp)
                    temp = ''
            else:
                temp += text[i]
        # Keep only sentences containing at least one keyword.
        key_sentences = {}
        for s in sentences:
            for k in keywords:
                if k in s:
                    key_sentences[s] = 1
        # Count how often each candidate answer appears in those sentences.
        target_list = {}
        for Hans in answers:
            for sentence in key_sentences:
                if Hans in sentence:
                    if Hans in target_list:
                        target_list[Hans] += 1
                    else:
                        target_list[Hans] = 1
        print(target_list)
        # Sort by frequency, highest first.
        ##print(target_list.items())
        sorted_lists = sorted(target_list.items(),
                              key=lambda item: item[1],
                              reverse=True)
        # print len(target_list)
        # Drop question keywords unless they are also candidate answers.
        sorted_lists2 = []  # candidate queue
        for i, st in enumerate(sorted_lists):
            # print st[0]
            if st[0] in keywords and st[0] not in answers:
                continue
            else:
                sorted_lists2.append(st)
        print "返回前n个词频"
        # Return the top-3 candidates by frequency.
        answer = []
        for i, st in enumerate(sorted_lists2):
            # print st[0]
            # print st[1]
            if i < 3:
                answer.append(st[0])
        # print answer
        return answer
def respond(self, query):
    """Answer a broad/open question by summarising Baidu search results.

    Fetches the first Baidu result page for *query*, follows each organic
    result link, extracts candidate sentences from the linked pages and
    returns the key sentences selected by ``self._get_key_sentence``.

    :param query: the user's question (plain text).
    :return: list of key sentences (possibly empty).
    """
    # Fetch the Baidu result page for the query.
    url = 'https://www.baidu.com/s?wd=' + quote(query)
    soup_baidu = Html_Tools.get_html_baidu(url)

    # Keep only the nouns of the question (minus stop words) as query
    # keywords; TextProcess.postag yields (word, tag) pairs.
    key_word = [
        word for word, tag in TextProcess.postag(query)
        if word not in self.stop_word and "n" in tag
    ]

    contents = []
    # Baidu numbers each organic result div with id=1..; scan the first 9.
    for result_id in range(1, 10):
        if soup_baidu is None:
            break
        results = soup_baidu.find(id=result_id)
        if results is None:
            continue
        for heading in results.find_all('h3'):
            anchor = heading.find("a")
            if anchor is None:
                continue
            href = anchor['href']
            # Only follow Baidu redirect links (real result URLs).
            if "www.baidu.com/link" not in href:
                continue
            try:
                sub_soup = Html_Tools.get_html(href)
                # Sentence-level filtering: drop interrogative sentences
                # (full- or half-width question mark) — they cannot be
                # answers.
                for sentence in self._extract(sub_soup):
                    if "?" in sentence or "?" in sentence:
                        continue
                    contents.append(sentence)
            except Exception:
                # Best effort: one unreachable/broken page must not abort
                # the whole query. (Was a bare `except:`, which also
                # swallowed KeyboardInterrupt/SystemExit.)
                pass

    if contents:
        key_sentence = self._get_key_sentence(list(set(contents)), key_word)
    else:
        key_sentence = []
    return key_sentence
def kwquery(query):
    """Answer *query* by scraping Baidu, with Bing as a fallback.

    Walks the first ~9 Baidu result blocks looking for "exact answer"
    cards (knowledge graph, movies, weather, classical poetry, calendar,
    calculator, Baike summary). If none hit, falls back to Bing's
    knowledge panel and Bing Wangdian (必应网典).

    :param query: the user's question (plain text).
    :return: list of answer strings (may be empty).
    """
    # Segment the question; keep only nouns as keywords.
    keywords = []
    words = T.postag(query)
    for k in words:
        # Keep nouns only (POS tag contains "n").
        if k.flag.__contains__("n"):
            keywords.append(k.word)

    answer = []
    text = ''
    # flag is set to 1 as soon as an answer has been found.
    flag = 0

    # Fetch the first Baidu result page (top ~10 snippets).
    url = 'https://www.baidu.com/s?wd=' + quote(query)
    soup_baidu = To.get_html_baidu(url)
    for i in range(1, 10):
        if soup_baidu == None:
            break
        results = soup_baidu.find(id=i)
        if results == None:
            print("Id{}找不到".format(i))
            continue

        # Accumulate snippet text (no longer consumed downstream; kept
        # from the legacy summary-analysis path).
        text += results.get_text()

        # A 'mu' attribute marks a Baidu knowledge-graph card; the exact
        # answer is usually within the first three results.
        if 'mu' in results.attrs:
            r = results.find(class_='op_exactqa_s_answer')
            if r == None:
                pass
            else:
                answer.append(r.get_text().strip().replace(" ", ""))
                flag = 1
                break

        # Movie card (mu points at nourl.baidu.com).
        if 'mu' in results.attrs and results.attrs['mu'].__contains__(
                "http://nourl.baidu.com/"):
            if results.find(class_="c-gap-top-small") is not None:
                items = results.find_all(class_="c-gap-top-small")
                for item in items:
                    if item.find("a") is not None:
                        answer.append(item.find("a").get_text())
                        flag = 1
                # NOTE(review): this break exits the outer results loop
                # after collecting every movie entry — confirm intended.
                break
            else:
                pass

        # Weather card: today's forecast first...
        weather_list = results.find_all(
            class_="op_weather4_twoicon_today OP_LOG_LINK")
        if len(weather_list) > 0:
            weather_info = weather_list[0]
            date = weather_info.find(
                class_="op_weather4_twoicon_date").get_text().strip()
            C = weather_info.find(
                class_="op_weather4_twoicon_temp").get_text().strip()
            rain_or_not = weather_info.find(
                class_="op_weather4_twoicon_weath").get_text().strip()
            wind = weather_info.find(
                class_="op_weather4_twoicon_wind").get_text().strip()
            ans = "{}\t{}\t{}\t{}".format(date, C, rain_or_not, wind)
            answer.append(ans)
            # ...then the forecast for the following days.
            weather_list = results.find_all(
                class_="op_weather4_twoicon_day OP_LOG_LINK")
            for weather_info in weather_list:
                date = weather_info.find(
                    class_="op_weather4_twoicon_date_day").get_text().strip()
                C = weather_info.find(
                    class_="op_weather4_twoicon_temp").get_text().strip()
                rain_or_not = weather_info.find(
                    class_="op_weather4_twoicon_weath").get_text().strip()
                wind = weather_info.find(
                    class_="op_weather4_twoicon_wind").get_text().strip()
                ans = "{}\t{}\t{}\t{}".format(date, C, rain_or_not, wind)
                answer.append(ans)
            flag = 1
            break
        else:
            # No weather card in this result block.
            pass

        # Classical-poetry card.
        if 'mu' in results.attrs:
            r = results.find(class_="op_exactqa_detail_s_answer")
            if r == None:
                pass
            else:
                answer.append(r.get_text().strip())
                flag = 1
                break

        # Calendar / date card (only checked on the first result).
        if 'mu' in results.attrs and i == 1 and results.attrs[
                'mu'].__contains__('http://open.baidu.com/calendar'):
            r = results.find(class_="op-calendar-content")
            if r == None:
                pass
            else:
                answer.append(r.get_text().strip().replace("\n", "").replace(
                    " ", ""))
                flag = 1
                break

        # (A handler for the new-style calendar card was fully
        #  commented out here; removed as dead code.)

        # Calculator card.
        if 'mu' in results.attrs and results.attrs['mu'].__contains__(
                'http://open.baidu.com/static/calculator/calculator.html'):
            r = results.find(class_="op_new_val_screen_result")
            if r == None:
                pass
            else:
                answer.append(r.get_text().strip())
                flag = 1
                break

        # (Legacy Baidu-Zhidao answer scraping was fully commented out
        #  here; removed as dead code.)

        # Baidu Baike (encyclopedia) summary.
        # NOTE(review): results.find("h3") may be None, which would raise
        # AttributeError — confirm every result block has an <h3>.
        link = results.find("h3").find("a")
        if link is not None and link.get_text().__contains__("百度百科"):
            url = results.find("h3").find("a")['href']
            if url == None:
                continue
            else:
                baike_soup = To.get_html_baike(url)
                r = baike_soup.find(class_='lemma-summary')
                if r == None:
                    continue
                else:
                    r = r.get_text().replace("\n", "").strip()
                    answer.append(r)
                    flag = 1
                    break

        # NOTE(review): snippet text is appended a second time per
        # iteration (see the += near the top of the loop) — looks
        # unintentional; harmless since `text` is unused below.
        text += results.get_text()

    if flag == 1:
        return answer

    # Fall back to Bing: check Bing's knowledge panel first.
    soup_bing = To.get_html_bing('https://www.bing.com/search?q=' +
                                 quote(query))
    bingbaike = soup_bing.find(class_="bm_box")
    if bingbaike != None:
        if bingbaike.find_all(class_="b_vList")[1] != None:
            if bingbaike.find_all(class_="b_vList")[1].find("li") != None:
                flag = 1
                answer.append(bingbaike.get_text())
                return answer
    else:
        # No knowledge panel: look for a Bing Wangdian (必应网典) result.
        results = soup_bing.find(id="b_results")
        bing_list = results.find_all('li')
        for bl in bing_list:
            temp = bl.get_text()
            if temp.__contains__(" - 必应网典"):
                print("查找Bing网典")
                url = bl.find("h2").find("a")['href']
                if url == None:
                    continue
                else:
                    bingwd_soup = To.get_html_bingwd(url)
                    r = bingwd_soup.find(class_='bk_card_desc').find("p")
                    if r == None:
                        continue
                    else:
                        r = r.get_text().replace("\n", "").strip()
                        answer.append(r)
                        flag = 1
                        break
        if flag == 1:
            return answer

    # Neither search engine's knowledge graph produced an answer.
    return answer