def work(schoolname, req):
    """Scrape the employment-situation section for *schoolname* from chsi.com.cn.

    Returns the stripped section text, or "" on any scraping failure
    (the failure is printed, not raised).
    """
    try:
        page = To.get_html_baidu_selenium1(schoolname, req)
        nav = page.find(class_="nav-container clearfix")
        tabs = nav.find("span").find("ul").find_all("li")
        # The 4th nav tab links to the relevant detail page — TODO confirm
        # against the live chsi.com.cn layout.
        detail_url = "http://gaokao.chsi.com.cn" + tabs[3].a["href"]
        detail = To.get_html_baidu(detail_url, req)
        section = detail.find(class_="yxk-detail-con")
        return section.text.strip()
    except Exception as ex:
        print(ex)
        return ""
def canteen(schoolname, req):
    """Scrape the board-and-lodging (食宿条件) section for *schoolname* from chsi.com.cn.

    Returns the section text, or "" when the link is missing or any step fails.
    """
    try:
        page = To.get_html_baidu_selenium1(schoolname, req)
        links = page.find(class_="nav-container clearfix").find_all("a")
        # Bug fix: `url` was previously unbound when no link matched, which
        # raised NameError (silently swallowed below). Initialize it and bail
        # out cleanly, consistent with phonenumber().
        url = ""
        for link in links:
            if link.text == u"食宿条件":
                url = link["href"]
                break
        if url == "":
            return ""
        detail = To.get_html_baidu("http://gaokao.chsi.com.cn" + url, req)
        sections = detail.find_all(class_="yxk-detail-con")
        # The second detail block holds the board-and-lodging text.
        return sections[1].text
    except Exception as ex:
        print(ex)
        return ""
def major(schoolname, req):
    """Scrape the major-introduction (专业介绍) list for *schoolname*.

    Returns a comma-joined UTF-8 byte string of major names, or "" when the
    link is absent or scraping fails.
    """
    try:
        nav_links = To.get_html_baidu_selenium2(schoolname, req).find(
            class_="sm_nav bk").find_all("a")
        for link in nav_links:
            if link.text != u"专业介绍":
                continue
            spans = To.get_html_baidu(link["href"], req).find(
                class_="plan_con").find_all("span")
            names = [s.text.replace('|', '').strip().encode("utf-8") for s in spans]
            # strip(',') mirrors the original trailing/leading-comma cleanup.
            return ','.join(names).strip(',')
        return ""
    except Exception as ex:
        print(ex)
        return ""
def query(entity, attr):
    """Look up *attr* for *entity* in the Baidu Baike infobox (Python 2).

    *attr* is a UTF-8 byte string; infobox keys are unicode, hence the
    .decode('utf8') before lookup. Falls back to a synonym dictionary when
    the attribute is not found directly. Returns the value, or
    attr + "::找不到" when nothing matches.
    """
    soup = To.get_html_baidu("http://baike.baidu.com/item/" + entity)
    basicInfo_block = soup.find(class_='basic-info cmn-clearfix')
    if basicInfo_block == None:
        return attr + "::找不到"
    info = get_info(basicInfo_block)
    key = attr.decode('utf8')
    if key in info:
        return info[key]
    # Direct lookup failed — map the attribute through the synonym table
    # and retry once.
    res_dir = os.path.dirname(os.path.split(os.path.realpath(__file__))[0]) + '/resources'
    attr_list = T.load_baikeattr_name(res_dir + '/Attribute_name.txt')
    attr = T.load_synonyms_word_inattr(attr, res_dir + '/SynonDic.txt', attr_list)
    key = attr.decode('utf8')
    if key in info:
        return info[key]
    return attr + "::找不到"
def website(schoolname, req):
    """Return the official-website link text for *schoolname*, or "" on failure."""
    try:
        page = To.get_html_baidu_selenium1(schoolname, req)
        anchor = page.find(class_="mid").find(class_="msg").find("span").a
        return anchor.text
    except Exception as ex:
        print(ex)
        return ""
def zipcode(schoolname, req):
    """Query Baidu for "<schoolname>邮编" and return the postal code, or "" on failure."""
    try:
        result = To.get_html_baidu(
            "https://www.baidu.com/s?wd=" + schoolname + "邮编", req)
        box = result.find(class_="op_post_content")
        # Chinese postal codes are 6 characters; keep only that prefix.
        return box.text.strip()[:6]
    except Exception as ex:
        print(ex)
        return ""
def area(schoolname, req):
    """Query Baidu for "<schoolname>占地" (campus area); return the UTF-8 answer or ""."""
    try:
        result = To.get_html_baidu(
            "https://www.baidu.com/s?wd=" + schoolname + "占地", req)
        box = result.find(class_="op_exactqa_s_answer")
        return box.text.strip().encode("utf-8")
    except Exception as ex:
        print(ex)
        return ""
def phonenumber(schoolname, req):
    """Scrape the admissions office phone number section from chsi.com.cn.

    Returns the UTF-8 text of the last detail block on the 学校简介 page,
    or "" when it is empty or scraping fails.
    """
    try:
        url = ""
        page = To.get_html_baidu_selenium1(schoolname, req)
        links = page.find(class_="nav-container clearfix").find_all("a")
        for link in links:
            if link.text == u"学校简介":
                url = link["href"]
                break
        detail = To.get_html_baidu("http://gaokao.chsi.com.cn" + url, req)
        sections = detail.find(class_="container").find_all(class_="yxk-detail-con")
        last = sections[-1]
        if last.text == "":
            return ""
        return last.text.encode("utf-8")
    except Exception as ex:
        print(ex)
        return ""
def email(schoolname, req):
    """Scrape the contact e-mail for *schoolname*; returns "" on failure."""
    try:
        page = To.get_html_baidu_selenium2(schoolname, req)
        contact = page.find(class_="college_msg bk").find(class_="left contact")
        fields = contact.text.split()
        # The third whitespace-separated field carries the address after a
        # 5-character label — presumably u"电子邮箱:"; TODO confirm on the page.
        return fields[2][5:]
    except Exception as ex:
        print(ex)
        return ""
def intro(schoolname, req):
    """Return the Baidu Baike lemma summary for *schoolname*, or "" on failure."""
    try:
        page = To.get_html_baidu("https://baike.baidu.com/item/" + schoolname, req)
        summary = page.find(class_="lemma-summary").find("div").text
        # Truncate at the first citation marker such as "[1]".
        if '[' in summary and ']' in summary:
            summary = summary[:summary.index('[')]
        return summary
    except Exception as ex:
        print(ex)
        return ""
def schoolproperty(schoolname, req):
    """Return the 属性 (property) value from the Baike infobox.

    Falls through (returning None) when the label is absent, matching the
    original behavior; returns "" on scraping errors.
    """
    try:
        page = To.get_html_baidu("https://baike.baidu.com/item/" + schoolname, req)
        infobox = page.find(class_="basic-info cmn-clearfix")
        labels = infobox.find_all("dt")
        values = infobox.find_all("dd")
        # dt/dd lists are positionally parallel in the infobox markup.
        for pos, label in enumerate(labels):
            if label.text == u"属性":
                return values[pos].text
    except Exception as ex:
        print(ex)
        return ""
def college(schoolname, req):
    """Return the 院系/学院 (faculties) section text from Baidu Baike, or ""."""
    try:
        page = To.get_html_baidu("https://baike.baidu.com/item/" + schoolname, req)
        for heading in page.find_all(class_="para-title level-3"):
            if u"院系" not in heading.text and u"学院" not in heading.text:
                continue
            # The section body sits two siblings past its heading.
            body = heading.nextSibling.nextSibling.text
            # Drop trailing citation markers such as "[1]".
            if u'[' in body and u']' in body:
                body = body[:body.index('[')]
            return body
        return ""
    except Exception as ex:
        print(ex)
        return ""
def scoreline(schoolname, req):
    """Build a sentence describing the 本科一批 admission score line.

    Reads the first matching row of the #lqfs table; returns "" when any of
    its first four cells is empty or scraping fails.
    """
    try:
        page = To.get_html_baidu_selenium1(schoolname, req)
        rows = page.find(id="lqfs").find_all("tr")
        for row in rows:
            if u"本科一批" not in row.text:
                continue
            cells = row.find_all("td")
            texts = [c.text for c in cells[:4]]
            if any(t == "" for t in texts):
                return ""
            t0, t1, t2, t3 = [t.encode("utf8") for t in texts]
            return (schoolname + t0 + "年" + t1 + "的省市分数线是" + t2 +
                    ",录取平均分是" + t3)
        return ""
    except Exception as ex:
        print(ex)
        return ""
def team(schoolname, req):
    """Return the 师资 (faculty strength) section text from Baidu Baike, or ""."""
    try:
        page = To.get_html_baidu("https://baike.baidu.com/item/" + schoolname, req)
        for heading in page.find_all(class_="para-title level-3"):
            if u"师资" not in heading.text:
                continue
            node = heading.nextSibling.nextSibling
            # Strip any embedded div (e.g. a figure) before taking the text.
            if node.find("div") != None:
                node.div.clear()
            body = node.text
            # Drop trailing citation markers such as "[1]".
            if '[' in body and ']' in body:
                body = body[:body.index('[')]
            # Original drops the final character of the section.
            return body[:-1]
        return ""
    except Exception as ex:
        print(ex)
        return ""
def query(entity, attr):
    """Look up *attr* for *entity* in the Baidu Baike infobox.

    Falls back to the synonym dictionary when the attribute key is not found
    directly. Returns the stripped value, or '找不到' when nothing matches.
    """
    soup = To.get_html_baike("http://baike.baidu.com/item/" + entity)
    basicInfo_block = soup.find(class_='basic-info cmn-clearfix')
    if not basicInfo_block:
        return '找不到'
    info = get_info(basicInfo_block)
    value = info.get(attr)
    if value:
        return value.strip()
    # Retry once after mapping the attribute through the synonym table.
    res_dir = os.path.dirname(os.path.split(os.path.realpath(__file__))[0]) + '/resources'
    attr_list = T.load_baikeattr_name(res_dir + '/Attribute_name.txt')
    attr = T.load_synonyms_word_inattr(attr, res_dir + '/SynonDic.txt', attr_list)
    value = info.get(attr)
    if value:
        return value.strip()
    return '找不到'
def kwquery(query):
    """Answer *query* by scraping Baidu and Bing result pages (Python 2).

    Strategy: (1) keep only noun keywords from the question; (2) scan the
    first Baidu results for knowledge-graph / poetry / calendar / calculator /
    Zhidao / Baike answer boxes; (3) fall back to Bing's knowledge panel and
    必应网典; (4) if everything fails, split the accumulated snippet text into
    sentences and return the most frequent person names (flag "nr") not
    already in the question. Returns a list of answer strings.

    NOTE(review): reconstructed from a whitespace-collapsed source; the exact
    original indentation of the Bing section is a best-effort reading.
    """
    # Segment the question, keep noun keywords only.
    keywords = []
    words = T.postag(query)
    for k in words:
        # Keep nouns only (flags containing "n").
        if k.flag.__contains__("n"):
            # print k.flag
            # print k.word
            keywords.append(k.word)
    answer = []
    text = ''
    # flag is set to 1 as soon as an answer is found.
    flag = 0
    # Scan the first ~10 Baidu result blocks.
    soup_baidu = To.get_html_baidu('https://www.baidu.com/s?wd='+quote(query))
    for i in range(1,10):
        if soup_baidu == None:
            break
        results = soup_baidu.find(id=i)
        if results == None:
            print "百度摘要找不到答案"
            break
        # print '============='
        # print results.attrs
        # print type(results.attrs)
        # print results['class']
        # If the first result carries a 'mu' attribute it is a Baidu
        # knowledge-graph card — take its exact answer directly.
        if results.attrs.has_key('mu') and i == 1:
            # print results.attrs["mu"]
            r = results.find(class_='op_exactqa_s_answer')
            if r == None:
                print "百度知识图谱找不到答案"
            else:
                # print r.get_text()
                print "百度知识图谱找到答案"
                answer.append(r.get_text().strip())
                flag = 1
                break
        # Classical-poetry answer card.
        if results.attrs.has_key('mu') and i == 1:
            r = results.find(class_="op_exactqa_detail_s_answer")
            if r == None:
                print "百度诗词找不到答案"
            else:
                # print r.get_text()
                print "百度诗词找到答案"
                answer.append(r.get_text().strip())
                flag = 1
                break
        # Perpetual calendar / date card.
        if results.attrs.has_key('mu') and i == 1 and results.attrs['mu'].__contains__('http://open.baidu.com/calendar'):
            r = results.find(class_="op-calendar-content")
            if r == None:
                print "百度万年历找不到答案"
            else:
                # print r.get_text()
                print "百度万年历找到答案"
                answer.append(r.get_text().strip().replace("\n","").replace(" ",""))
                flag = 1
                break
        # New-style calendar template: the answer is encoded in the 'fk' attr.
        if results.attrs.has_key('tpl') and i == 1 and results.attrs['tpl'].__contains__('calendar_new'):
            r = results.attrs['fk'].replace("6018_","")
            print r
            if r == None:
                print "百度万年历新版找不到答案"
                # continue
            else:
                # print r.get_text()
                print "百度万年历新版找到答案"
                answer.append(r)
                flag = 1
                break
        # Calculator card.
        if results.attrs.has_key('mu') and i == 1 and results.attrs['mu'].__contains__('http://open.baidu.com/static/calculator/calculator.html'):
            r = results.find('div').find_all('td')[1].find_all('div')[1]
            if r == None:
                print "计算器找不到答案"
                # continue
            else:
                # print r.get_text()
                print "计算器找到答案"
                answer.append(r.get_text().strip())
                flag = 1
                break
        # Baidu Zhidao best-answer card: follow the link and scrape the answer body.
        if results.attrs.has_key('mu') and i == 1:
            r = results.find(class_='op_best_answer_question_link')
            if r == None:
                print "百度知道图谱找不到答案"
            else:
                print "百度知道图谱找到答案"
                url = r['href']
                zhidao_soup = To.get_html_zhidao(url)
                r = zhidao_soup.find(class_='bd answer').find('pre')
                if r == None:
                    r = zhidao_soup.find(class_='bd answer').find(class_='line content')
                answer.append(r.get_text())
                flag = 1
                break
        if results.find("h3") != None:
            # Ordinary result whose title links to Baidu Zhidao.
            if results.find("h3").find("a").get_text().__contains__(u"百度知道") and (i == 1 or i ==2):
                url = results.find("h3").find("a")['href']
                if url == None:
                    print "百度知道图谱找不到答案"
                    continue
                else:
                    print "百度知道图谱找到答案"
                    zhidao_soup = To.get_html_zhidao(url)
                    r = zhidao_soup.find(class_='bd answer')
                    if r == None:
                        continue
                    else:
                        r = r.find('pre')
                        if r == None :
                            r = zhidao_soup.find(class_='bd answer').find(class_='line content')
                        answer.append(r.get_text().strip())
                        flag = 1
                        break
            # Ordinary result whose title links to Baidu Baike — use the lemma summary.
            if results.find("h3").find("a").get_text().__contains__(u"百度百科") and (i == 1 or i ==2):
                url = results.find("h3").find("a")['href']
                if url == None:
                    print "百度百科找不到答案"
                    continue
                else:
                    print "百度百科找到答案"
                    baike_soup = To.get_html_baike(url)
                    r = baike_soup.find(class_='lemma-summary')
                    if r == None:
                        continue
                    else:
                        r = r.get_text().replace("\n","").strip()
                        answer.append(r)
                        flag = 1
                        break
        # Accumulate snippet text for the frequency fallback below.
        text += results.get_text()
    if flag == 1:
        return answer
    # Fall back to Bing.
    soup_bing = To.get_html_bing('https://www.bing.com/search?q='+quote(query))
    # Check Bing's knowledge panel.
    # bingbaike = soup_bing.find(class_="b_xlText b_emphText")
    bingbaike = soup_bing.find(class_="bm_box")
    if bingbaike != None:
        if bingbaike.find_all(class_="b_vList")[1] != None:
            if bingbaike.find_all(class_="b_vList")[1].find("li") != None:
                print "Bing知识图谱找到答案"
                flag = 1
                answer.append(bingbaike.get_text())
                # print "====="
                # print answer
                # print "====="
                return answer
    else:
        print "Bing知识图谱找不到答案"
    # Scan ordinary Bing results for 必应网典 entries.
    results = soup_bing.find(id="b_results")
    bing_list = results.find_all('li')
    for bl in bing_list:
        temp = bl.get_text()
        if temp.__contains__(u" - 必应网典"):
            print "查找Bing网典"
            url = bl.find("h2").find("a")['href']
            if url == None:
                print "Bing网典找不到答案"
                continue
            else:
                print "Bing网典找到答案"
                bingwd_soup = To.get_html_bingwd(url)
                r = bingwd_soup.find(class_='bk_card_desc').find("p")
                if r == None:
                    continue
                else:
                    r = r.get_text().replace("\n","").strip()
                    answer.append(r)
                    flag = 1
                    break
    if flag == 1:
        return answer
    text += results.get_text()
    # print text
    # Neither engine's knowledge graph answered — analyze the snippets instead.
    if flag == 0:
        # Split the accumulated text into sentences.
        cutlist = [u"。",u"?",u".", u"_", u"-",u":",u"!",u"?"]
        temp = ''
        sentences = []
        for i in range(0,len(text)):
            if text[i] in cutlist:
                if temp == '':
                    continue
                else:
                    # print temp
                    sentences.append(temp)
                    temp = ''
            else:
                temp += text[i]
        # Keep only sentences containing at least one question keyword.
        key_sentences = {}
        for s in sentences:
            for k in keywords:
                if k in s:
                    key_sentences[s]=1
        # Heuristic: count person names ("nr" POS flag) in those sentences.
        target_list = {}
        for ks in key_sentences:
            # print ks
            words = T.postag(ks)
            for w in words:
                # print "====="
                # print w.word
                if w.flag == ("nr"):
                    if target_list.has_key(w.word):
                        target_list[w.word] += 1
                    else:
                        target_list[w.word] = 1
        # Sort candidates by frequency (Python 2 cmp-style sort).
        sorted_lists = sorted(target_list.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)
        # print len(target_list)
        # Remove words that already appear in the question.
        sorted_lists2 = []
        # Candidate queue.
        for i, st in enumerate(sorted_lists):
            # print st[0]
            if st[0] in keywords:
                continue
            else:
                sorted_lists2.append(st)
        print "返回前n个词频"
        answer = []
        for i,st in enumerate(sorted_lists2):
            # print st[0]
            # print st[1]
            if i< 3:
                # print st[0]
                # print st[1]
                answer.append(st[0])
        # print answer
        return answer
def qa(question):
    """Answer *question* via AIML templates, falling back to Baike / web search.

    Pipeline: load the AIML brain, segment the question, ask the AIML kernel;
    a response starting with '#' encodes a command — "searchbaike:entity:attr"
    triggers a Baike infobox lookup (then generic search on miss), while
    "NoMatchingTemplate" goes straight to the generic search. Results are
    printed, not returned (Python 2 print statements).

    NOTE(review): reconstructed from whitespace-collapsed source; the ASCII
    banner's original line layout was lost in the collapse.
    """
    # Initialize the jieba segmenter.
    T.jieba_initialize()
    # Working directory of the corpus (chdir currently disabled).
    mybot_path = './'
    # os.chdir(mybot_path)
    mybot = aiml.Kernel()
    mybot.learn(os.path.split(os.path.realpath(__file__))[0]+"/resources/std-startup.xml")
    mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/bye.aiml")
    mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/tools.aiml")
    mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/bad.aiml")
    mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/funny.aiml")
    mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/OrdinaryQuestion.aiml")
    mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/Common conversation.aiml")
    # mybot.respond('Load Doc Snake')
    # Print the startup banner.
    print ''' .----------------. .-----------------. .----------------. .----------------. .----------------. | .--------------. || .--------------. || .--------------. || .--------------. || .--------------. | | | _______ | || | ____ _____ | || | __ | || | ___ ____ | || | _________ | | | | / ___ | | || ||_ \|_ _| | || | / \ | || | |_ ||_ _| | || | |_ ___ | | | | | | (__ \_| | || | | \ | | | || | / /\ \ | || | | |_/ / | || | | |_ \_| | | | | '.___`-. | || | | |\ \| | | || | / /__\ \ | || | | __'. | || | | _| _ | | | | |`\____) | | || | _| |_\ |_ | || | _/ / \ \_ | || | _| | \ \_ | || | _| |___/ | | | | | |_______.'
| || ||_____|\____| | || ||____| |____|| || | |____||____| | || | |_________| | | | | | || | | || | | || | | || | | | | '--------------' || '--------------' || '--------------' || '--------------' || '--------------' | '----------------' '----------------' '----------------' '----------------' '----------------' Eric:你好,我是Eric。╭(╯^╰)╮ '''
    input_message = question
    if len(input_message) > 60:
        print mybot.respond("句子长度过长")
    elif input_message.strip() == '':
        print mybot.respond("无")
    print input_message
    # Segment and strip punctuation.
    message = T.wordSegment(input_message)
    print 'word Seg:'+ message
    print '词性:'
    words = T.postag(input_message)
    if message == 'q':
        exit()
    else:
        response = mybot.respond(message)
        print "======="
        print response
        print "======="
        if response == "":
            ans = mybot.respond('找不到答案')
            print 'Eric:' + ans
        # '#'-prefixed responses are command templates, not answers.
        elif response[0] == '#':
            # Baike lookup command: "searchbaike:entity:attr".
            if response.__contains__("searchbaike"):
                print "searchbaike"
                print response
                res = response.split(':')
                # Entity name.
                entity = str(res[1]).replace(" ","")
                # Attribute name.
                attr = str(res[2]).replace(" ","")
                print entity+'<---->'+attr
                ans = baike.query(entity, attr)
                # A list result means the infobox lookup hit.
                if type(ans) == list:
                    print 'Eric:' + QAT.ptranswer(ans,False)
                elif ans.decode('utf-8').__contains__(u'::找不到'):
                    # Fall back to Baidu + Bing summaries.
                    print "通用搜索"
                    ans = search_summary.kwquery(input_message)
            # No template matched — generic search.
            elif response.__contains__("NoMatchingTemplate"):
                print "NoMatchingTemplate"
                ans = search_summary.kwquery(input_message)
            if len(ans) == 0:
                ans = mybot.respond('找不到答案')
                print 'Eric:' + ans
            elif len(ans) >1:
                print "不确定候选答案"
                print 'Eric: '
                for a in ans:
                    print a.encode("utf8")
            else:
                print 'Eric:' + ans[0].encode("utf8")
        # Plain template answer.
        else:
            print 'Eric:' + response
# NOTE(review): orphaned fragment — a top-level `elif` containing a `continue`,
# apparently a stray duplicate of the '#'-command branch inside qa()/run()'s
# original REPL loop. It cannot compile standalone (`elif` without `if`,
# `continue` outside a loop) and is unreachable; confirm it can be deleted.
elif response[0] == '#':
    # Baike lookup command.
    if response.__contains__("searchbaike"):
        print "searchbaike"
        print response
        res = response.split(':')
        # Entity name.
        entity = str(res[1]).replace(" ","")
        # Attribute name.
        attr = str(res[2]).replace(" ","")
        print entity+'<---->'+attr
        ans = baike.query(entity, attr)
        # A list result means the infobox lookup hit.
        if type(ans) == list:
            print 'Eric:' + QAT.ptranswer(ans,False)
            continue
        elif ans.decode('utf-8').__contains__(u'::找不到'):
            # Fall back to Baidu + Bing summaries.
            print "通用搜索"
            ans = search_summary.kwquery(input_message)
    # No template matched — generic search.
    elif response.__contains__("NoMatchingTemplate"):
        print "NoMatchingTemplate"
        ans = search_summary.kwquery(input_message)
    if len(ans) == 0:
        ans = mybot.respond('找不到答案')
        print 'Eric:' + ans
def run(question):
    """Answer *question*, returning a string (unlike qa(), which only prints).

    Loads (or bootstraps and caches) the AIML brain "bot_brain.brn", asks the
    kernel, and returns either the template answer directly or a
    '回答:'-prefixed result from the Baike / generic-search fallbacks.
    """
    # Initialize the jieba segmenter.
    T.jieba_initialize()
    # Switch to the corpus working directory.
    mybot_path = './'
    os.chdir(mybot_path)
    mybot = aiml.Kernel()
    # Reuse the cached brain when present; otherwise learn all AIML files
    # and save the brain for next time.
    if os.path.isfile("bot_brain.brn"):
        mybot.bootstrap(brainFile="bot_brain.brn")
    else:
        mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/std-startup.xml")
        mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/tuling.xml")
        mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/bye.aiml")
        mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/tools.aiml")
        mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/bad.aiml")
        mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/funny.aiml")
        mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/OrdinaryQuestion.aiml")
        mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/Common conversation.aiml")
        #mybot.bootstrap(learnFiles="std-startup.xml", commands="load aiml b")
        mybot.saveBrain("bot_brain.brn")
    # Startup banner.
    print ''' Eric:你好,我是问答机器人。╭(╯^╰)╮ '''
    input_message = question
    if len(input_message) > 60:
        return mybot.respond("句子长度过长")
        #continue
    elif input_message.strip() == '':
        return mybot.respond("无话可说")
        #continue
    # print input_message
    # Segment and strip punctuation.
    message = T.wordSegment(input_message)
    # print 'word Seg:'+ message
    # print '词性:'
    words = T.postag(input_message)
    if message == 'q':
        exit()
    else:
        # Look for an answer in the AIML dataset first.
        response = mybot.respond(message)
        print "======="
        if response[0] == '#':
            print response + 'mark'
            pass
        else:
            # A non-command response is a direct template answer.
            return response
        print "======="
        if response == "":
            ans = mybot.respond('找不到答案')
            print 'Eric:' + ans
        # '#'-prefixed responses are command templates.
        elif response[0] == '#' or len(response) < 1:
            # Baike lookup command: "searchbaike:entity:attr".
            if response.__contains__("searchbaike"):
                print "searchbaike"
                print response
                res = response.split(':')
                # Entity name.
                entity = str(res[1]).replace(" ", "")
                # Attribute name.
                attr = str(res[2]).replace(" ", "")
                print entity + '<---->' + attr
                ans = baike.query(entity, attr)
                # A list result means the infobox lookup hit.
                if type(ans) == list:
                    return '回答:' + QAT.ptranswer(ans, False)
                    #continue
                elif ans.decode('utf-8').__contains__(u'::找不到'):
                    # Fall back to Baidu + Bing summaries.
                    print "通用搜索"
                    ans = search_summary.kwquery(input_message)
            # No template matched — generic search.
            elif response.__contains__("NoMatchingTemplate"):
                print "NoMatchingTemplate"
                ans = search_summary.kwquery(input_message)
            if len(ans) == 0:
                ans = mybot.respond('找不到答案')
                return '回答:' + ans
            elif len(ans) > 1:
                print "不确定候选答案"
                return ans
                # NOTE(review): the prints below are unreachable after the return.
                print 'Eric: '
                for a in ans:
                    print a.encode("utf-8")
            else:
                return '回答:' + ans[0].encode("utf-8")
        # Plain template answer.
        else:
            return '回答:' + response
def kwquery(query):
    """Answer *query* by scraping Baidu and Bing (second variant of kwquery).

    Same pipeline as the other kwquery(query): noun keywords → Baidu answer
    cards (knowledge graph, poetry, calculator, Zhidao, Baike) → Bing panel
    and 必应网典 → frequency analysis of person names in the collected
    snippets. Returns a list of answer strings.

    NOTE(review): near-duplicate of the other kwquery(query); reconstructed
    from whitespace-collapsed source.
    """
    # Segment the question, keep noun keywords only.
    keywords = []
    words = T.postag(query)
    for k in words:
        # Keep nouns only (flags containing "n").
        if k.flag.__contains__("n"):
            # print k.flag
            # print k.word
            keywords.append(k.word)
    answer = []
    text = ''
    # flag is set to 1 as soon as a Baike/graph answer is found.
    flag = 0
    # Scan the first ~10 Baidu result blocks.
    soup_baidu = To.get_html_baidu('https://www.baidu.com/s?wd=' + quote(query))
    for i in range(1, 10):
        if soup_baidu == None:
            break
        results = soup_baidu.find(id=i)
        if results == None:
            print "百度摘要找不到答案"
            break
        # print '============='
        # print results.attrs
        # print type(results.attrs)
        # print results['class']
        # If the first result carries a 'mu' attribute it is a Baidu
        # knowledge-graph card — take its exact answer directly.
        if results.attrs.has_key('mu') and i == 1:
            # print results.attrs["mu"]
            r = results.find(class_='op_exactqa_s_answer')
            # print r
            if r == None:
                print "百度知识图谱找不到答案"
                # continue
            else:
                # print r.get_text()
                print "百度知识图谱找到答案"
                answer.append(r.get_text().strip())
                flag = 1
                break
        # Classical-poetry answer card.
        if results.attrs.has_key('mu') and i == 1:
            r = results.find(class_="op_exactqa_detail_s_answer")
            if r == None:
                print "百度诗词找不到答案"
                # continue
            else:
                # print r.get_text()
                print "百度诗词找到答案"
                answer.append(r.get_text().strip())
                flag = 1
                break
        # Calculator card.
        if results.attrs.has_key(
                'mu') and i == 1 and results.attrs['mu'].__contains__(
                'http://open.baidu.com/static/calculator/calculator.html'):
            r = results.find('div').find_all('td')[1].find_all('div')[1]
            if r == None:
                print "计算器找不到答案"
                # continue
            else:
                # print r.get_text()
                print "计算器找到答案"
                answer.append(r.get_text().strip())
                flag = 1
                break
        # Baidu Zhidao best-answer card.
        if results.attrs.has_key('mu') and i == 1:
            r = results.find(class_='op_best_answer_question_link')
            if r == None:
                print "百度知道图谱找不到答案"
            else:
                print "百度知道图谱找到答案"
                url = r['href']
                zhidao_soup = To.get_html_zhidao(url)
                r = zhidao_soup.find(class_='bd answer').find('pre')
                answer.append(r.get_text())
                flag = 1
                break
        if results.find("h3") != None:
            # Ordinary result whose title links to Baidu Zhidao.
            if results.find("h3").find("a").get_text().__contains__(
                    u"百度知道") and i == 1:
                url = results.find("h3").find("a")['href']
                if url == None:
                    print "百度知道图谱找不到答案"
                    continue
                else:
                    print "百度知道图谱找到答案"
                    zhidao_soup = To.get_html_zhidao(url)
                    r = zhidao_soup.find(class_='bd answer')
                    if r == None:
                        continue
                    else:
                        r = r.find('pre')
                        answer.append(r.get_text().strip())
                        flag = 1
                        break
            # Ordinary result whose title links to Baidu Baike.
            if results.find("h3").find("a").get_text().__contains__(
                    u"百度百科") and i == 1:
                url = results.find("h3").find("a")['href']
                if url == None:
                    print "百度百科找不到答案"
                    continue
                else:
                    print "百度百科找到答案"
                    baike_soup = To.get_html_baike(url)
                    r = baike_soup.find(class_='lemma-summary')
                    if r == None:
                        continue
                    else:
                        r = r.get_text().replace("\n", "").strip()
                        answer.append(r)
                        flag = 1
                        break
        # Accumulate snippet text for the frequency fallback below.
        text += results.get_text()
    if flag == 1:
        return answer
    # Fall back to Bing.
    soup_bing = To.get_html_bing('https://www.bing.com/search?q=' + quote(query))
    # Check Bing's knowledge panel.
    # bingbaike = soup_bing.find(class_="b_xlText b_emphText")
    bingbaike = soup_bing.find(class_="bm_box")
    if bingbaike != None:
        if bingbaike.find_all(class_="b_vList")[1] != None:
            if bingbaike.find_all(class_="b_vList")[1].find("li") != None:
                print "Bing知识图谱找到答案"
                flag = 1
                answer.append(bingbaike.get_text())
                # print "====="
                # print answer
                # print "====="
                return answer
    else:
        print "Bing知识图谱找不到答案"
    # Scan ordinary Bing results for 必应网典 entries.
    results = soup_bing.find(id="b_results")
    bing_list = results.find_all('li')
    for bl in bing_list:
        temp = bl.get_text()
        if temp.__contains__(u" - 必应网典"):
            print "查找Bing网典"
            url = bl.find("h2").find("a")['href']
            if url == None:
                print "Bing网典找不到答案"
                continue
            else:
                print "Bing网典找到答案"
                bingwd_soup = To.get_html_bingwd(url)
                r = bingwd_soup.find(class_='bk_card_desc').find("p")
                if r == None:
                    continue
                else:
                    r = r.get_text().replace("\n", "").strip()
                    answer.append(r)
                    flag = 1
                    break
    if flag == 1:
        return answer
    text += results.get_text()
    # print text
    # Neither engine's knowledge graph answered — analyze the snippets instead.
    if flag == 0:
        # Split the accumulated text into sentences.
        cutlist = [u"。", u"?", u".", u"_", u"-", u":", u"!", u"?"]
        temp = ''
        sentences = []
        for i in range(0, len(text)):
            if text[i] in cutlist:
                if temp == '':
                    continue
                else:
                    # print temp
                    sentences.append(temp)
                    temp = ''
            else:
                temp += text[i]
        # Keep only sentences containing at least one question keyword.
        key_sentences = {}
        for s in sentences:
            for k in keywords:
                if k in s:
                    key_sentences[s] = 1
        # Heuristic: count person names ("nr" POS flag) in those sentences.
        target_list = {}
        for ks in key_sentences:
            # print ks
            words = T.postag(ks)
            for w in words:
                # print "====="
                # print w.word
                if w.flag == ("nr"):
                    if target_list.has_key(w.word):
                        target_list[w.word] += 1
                    else:
                        target_list[w.word] = 1
        # Sort candidates by frequency (Python 2 cmp-style sort).
        sorted_lists = sorted(target_list.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)
        # print len(target_list)
        # Remove words that already appear in the question.
        sorted_lists2 = []
        # Candidate queue.
        for i, st in enumerate(sorted_lists):
            # print st[0]
            if st[0] in keywords:
                continue
            else:
                sorted_lists2.append(st)
        print "返回前n个词频"
        answer = []
        for i, st in enumerate(sorted_lists2):
            # print st[0]
            # print st[1]
            if i < 3:
                # print st[0]
                # print st[1]
                answer.append(st[0])
        # print answer
        return answer
# NOTE(review): orphaned fragment — another stray duplicate of the
# '#'-command branch from the original REPL loop. It is a top-level `elif`
# with a `continue` and ends on a dangling `elif len(ans) > 1:` with no body,
# so it cannot compile; confirm it can be deleted.
elif response[0] == '#':
    # Baike lookup command.
    if response.__contains__("searchbaike"):
        print "searchbaike"
        print response
        res = response.split(':')
        # Entity name.
        entity = str(res[1]).replace(" ", "")
        # Attribute name.
        attr = str(res[2]).replace(" ", "")
        print entity + '<---->' + attr
        ans = baike.query(entity, attr)
        # A list result means the infobox lookup hit.
        if type(ans) == list:
            print 'Eric:' + QAT.ptranswer(ans, False)
            continue
        elif ans.decode('utf-8').__contains__(u'::找不到'):
            # Fall back to Baidu + Bing summaries.
            print "通用搜索"
            ans = search_summary.kwquery(input_message)
    # No template matched — generic search.
    elif response.__contains__("NoMatchingTemplate"):
        print "NoMatchingTemplate"
        ans = search_summary.kwquery(input_message)
    if len(ans) == 0:
        ans = mybot.respond('找不到答案')
        print 'Eric:' + ans
    elif len(ans) > 1:
def answer(question):
    """Answer *question* and return a list of answer strings.

    Uses the module-level `mybot` AIML kernel (defined elsewhere in this
    file). Raises Exception("Too Long") / ("No Input") / ("No Answer") on the
    corresponding failure modes instead of returning sentinels.
    """
    if len(question) > 600:
        print mybot.respond("句子长度过长")
        raise Exception("Too Long")
    elif question.strip() == '':
        print mybot.respond("无")
        raise Exception("No Input")
    print question
    # Segment and strip punctuation.
    message = T.wordSegment(question)
    print 'word Seg:'+ message
    print '词性:'
    words = T.postag(question)
    if message == 'q':
        exit()
    else:
        response = mybot.respond(message)
        print response
        if response == "":
            raise Exception("No Answer")
        # '#'-prefixed responses are command templates, not answers.
        elif response[0] == '#':
            # Baike lookup command: "searchbaike:entity:attr".
            if response.__contains__("searchbaike"):
                print "searchbaike"
                print response
                res = response.split(':')
                # Entity name.
                entity = str(res[1]).replace(" ","")
                # Attribute name.
                attr = str(res[2]).replace(" ","")
                print entity+'<---->'+attr
                ans = baike.query(entity, attr)
                # A list result means the infobox lookup hit.
                if type(ans) == list:
                    print 'Eric:' + QAT.ptranswer(ans,False)
                    return [QAT.ptranswer(ans,False)]
                elif ans.decode('utf-8').__contains__(u'::找不到'):
                    # Fall back to Baidu + Bing summaries.
                    print "通用搜索"
                    ans = search_summary.kwquery(question)
            # No template matched — generic search.
            elif response.__contains__("NoMatchingTemplate"):
                print "NoMatchingTemplate"
                ans = search_summary.kwquery(question)
            if len(ans) == 0:
                raise Exception("No Answer")
            elif len(ans) >1:
                print "不确定候选答案"
                print 'Eric: '
                for a in ans:
                    print a.encode("utf8")
                return [a.encode("utf8") for a in ans]
            else:
                print 'Eric:' + ans[0].encode("utf8")
                return [ans[0].encode("utf8")]
        # Plain template answer.
        else:
            print 'Eric:' + response
            return [response]
def kwquery(query, intention, schoolname):
    """Answer a school-related *query* classified as (*intention*, *schoolname*).

    Pipeline: (1) look up the pre-stored answer table (问答预存) in the
    database and pick one of the top rows with weighted randomness; (2) on a
    miss, dispatch to the matching scraper in `dic` and cache the result via
    pj.InitSchool; (3) otherwise scan Baidu answer cards (knowledge graph,
    Baike, poetry, calendars, almanac, calculator, Zhidao); (4) as a last
    resort, collect paragraph text from the first result pages. Returns a
    dict with keys "schoolname", "intention", "index", "answer" (list).

    NOTE(review): reconstructed from whitespace-collapsed source; some branch
    nesting (e.g. the final `del req`) is a best-effort reading.
    """
    # Keyword extraction is currently disabled (pynlpir / postag experiments).
    keywords = []
    # try:
    #     pynlpir.open()
    #     keywords = pynlpir.get_key_words(query, weighted=True)
    #     print "关键词:"
    #     for key_word in keywords:
    #         print key_word[0], '\t', key_word[1]
    #     pynlpir.close()
    # except Exception as ex:
    #     print ex
    # words = T.postag(query)
    # for k in words:
    #     # 只保留名词
    #     if k.flag.__contains__("n"):
    #         # print k.flag
    #         # print k.word
    #         keywords.append(k.word)
    # To === html_tools.py
    # Fresh scraping session for all requests below.
    req = To.Session()
    # req.session.cookies.clear()
    answer = []
    answerdict = {}
    answerdict["schoolname"] = ""
    answerdict["intention"] = ""
    answerdict["index"] = -1
    text = ''
    # flag is set to 1 as soon as an answer is found.
    flag = 0
    # Intention → scraper dispatch table (acts as a switch statement).
    dic = {
        '分数线': scoreline,
        '食堂': canteen,
        '宿舍': dormitory,
        '官网': website,
        '英文名': englishname,
        '专业': major,
        '学院': college,
        '收费': charge,
        '地址': location,
        '邮编': zipcode,
        '占地': area,
        '招办电话': phonenumber,
        '学校性质': schoolproperty,
        '邮箱': email,
        '硕士点': master,
        '博士点': doctor,
        '校庆日': celebration,
        '知名校友': alumnus,
        '就业情况': work,
        '创办时间': establish_time,
        '学校代码': schoolcode,
        '师资力量': team,
        '学校简介': intro,
    }
    if intention != "" and schoolname != "":
        pachong = False
        try:
            # Query the pre-stored answer table first.
            sql = u"SELECT `内容`,`序号` FROM `问答预存` WHERE `学校名`='" + schoolname + "' AND `意图`='" + intention + "'ORDER BY `均分` DESC"
            SQLresults = pj.EXSQL(sql)
            # No cached row yet — mark for targeted scraping below.
            if len(SQLresults) == 0:
                answerdict['index'] = 1
                pachong = True
                print u'数据库中没有此记录'
            else:
                # Cached rows exist — pick one with weighted randomness
                # (1-4→rank1, 5-7→rank2, 8→rank3, 9-10→interactive slot 4).
                print u'数据库有记录表'
                dangwei = random.randint(1, 10)
                if (dangwei <= 4):
                    dangwei = 1
                elif (dangwei >= 9):
                    dangwei = 4
                elif (dangwei >= 5 and dangwei <= 7):
                    dangwei = 2
                else:
                    dangwei = 3
                # Log the chosen tier.
                print("档位:" + dangwei.__str__())
                # Tier 4 only sets the interaction index (no stored answer used).
                if (dangwei == 4):
                    answerdict['index'] = 4
                if (dangwei <= 3):
                    answerdict['index'] = SQLresults[dangwei - 1][1]
                    answer.append(SQLresults[dangwei - 1][0])
                    flag = 1
        except Exception as e:
            print(e)
        if intention != "其他":
            print u'非其他'
            if pachong:
                # Targeted scrape via the dispatch table, then cache the result.
                yitupaqu = dic[intention](schoolname, req).strip()
                if yitupaqu != "":
                    print u'调用爬虫后的数据', yitupaqu
                    answer.append(yitupaqu)
                    pj.InitSchool(schoolname, intention, yitupaqu)
                    flag = 1
    if flag == 1:
        print("before search")
    # Scan the first ~10 Baidu result blocks (skipped immediately when an
    # answer was already found — see the `if flag == 1: break` below).
    soup_baidu = To.get_html_baidu(
        'https://www.baidu.com/s?wd=' + quote(query), req)
    # There may be two result blocks sharing the same id, hence find_all.
    for i in range(1, 10):
        if flag == 1:
            break
        if soup_baidu == None:
            break
        resultsS = soup_baidu.find_all(id=i)
        for results in resultsS:
            if results == None:
                # print "百度摘要找不到答案"
                continue
            # print '============='
            # print results.attrs
            # print type(results.attrs)
            # print results['class']
            # A 'mu' attribute on the first result marks a Baidu
            # knowledge-graph card — take its exact answer directly.
            if results.attrs.has_key('mu') and i == 1:
                # print results.attrs["mu"]
                r = results.find_all(class_='op_best_answer_content')
                if r == None or len(r) < 2:
                    pass
                    # print "百度知识图谱找不到答案"
                else:
                    # print r.get_text()
                    # print "百度知识图谱找到答案"
                    r = r[1]
                    r = r.get_text().strip()
                    if r != "":
                        answer.append(r)
                        flag = 1
                        break
                r = results.find(class_='op_exactqa_s_answer')
                if r == None:
                    pass
                    # print "百度知识图谱找不到答案"
                else:
                    # print r.get_text()
                    # print "百度知识图谱找到答案"
                    r = r.get_text().strip()
                    if r != "":
                        answer.append(r)
                        flag = 1
                        break
            # Result title linking to Baidu Baike — use the lemma summary.
            # if results.find("h3").find("a").get_text().__contains__(u"百度百科") and (i == 1 or i ==2 or i==3):
            if results.find("h3") != None:
                if results.find("h3").find("a").get_text().__contains__(
                        u"_百度百科"):
                    url = results.find("h3").find("a")['href']
                    if url == None:
                        # print "百度百科找不到答案"
                        continue
                    else:
                        # print "百度百科找到答案"
                        baike_soup = To.get_html_baike(url, req)
                        r = baike_soup.find(class_='lemma-summary')
                        if r == None:
                            continue
                        else:
                            r = r.get_text().replace("\n", "").strip()
                            if r != "":
                                answer.append(r)
                                flag = 1
                                break
            # Classical-poetry answer card.
            if results.attrs.has_key('mu') and i == 1:
                r = results.find(class_="op_exactqa_detail_s_answer")
                if r == None:
                    pass
                    # print "百度诗词找不到答案"
                else:
                    # print r.get_text()
                    # print "百度诗词找到答案"
                    r = r.get_text().strip()
                    if r != "":
                        answer.append(r)
                        flag = 1
                        break
            # Perpetual calendar / date card.
            if results.attrs.has_key('mu') and i == 1 and results.attrs[
                    'mu'].__contains__('http://open.baidu.com/calendar'):
                r = results.find(class_="op-calendar-content")
                if r == None:
                    pass
                    # print "百度万年历找不到答案"
                else:
                    # print r.get_text()
                    # print "百度万年历找到答案"
                    r = r.get_text().strip().replace("\n", "").replace(" ", "")
                    if r != "":
                        answer.append(r)
                        flag = 1
                        break
            # New-style calendar template: parse the selectDate JS payload.
            if results.attrs.has_key('tpl') and i == 1 and results.attrs[
                    'tpl'].__contains__('calendar_new'):
                # r = results.attrs['fk'].replace("6018_","")
                if results.find(attrs={"data-compress": "off"}):
                    r = results.find(attrs={"data-compress": "off"}).get_text()
                    r = r[r.find('selectDate'):]
                    r = r[r.find('[') + 1:r.find(']')]
                    r = r.replace("\"", "")
                    r = r.split(',')
                    # r=results.find(class_="op-calendar-new-right-date")
                    # print r
                    if r == None:
                        pass
                        # print "百度万年历新版找不到答案"
                        # continue
                    else:
                        # print r.get_text()
                        # print "百度万年历新版找到答案"
                        r = r[0] + "年" + r[1] + "月" + r[2] + "日"
                        answer.append(r)
                        flag = 1
                        break
            # Almanac (黄历) exact-answer template.
            if results.attrs.has_key('tpl') and i <= 2 and results.attrs[
                    'tpl'].__contains__('exactqa'):
                # r = results.attrs['fk'].replace("6018_","")
                r = results.find(class_="op_exactqa_s_prop c-gap-bottom-small")
                # print r.a
                if r == None:
                    pass
                    # print "百度黄历找不到答案"
                    # continue
                else:
                    r = r.a
                    # print "百度黄历找到答案"
                    answer.append(r.get_text())
                    flag = 1
                    break
            # Calculator card.
            if results.attrs.has_key(
                    'mu'
            ) and i == 1 and results.attrs['mu'].__contains__(
                    'http://open.baidu.com/static/calculator/calculator.html'):
                # r = results.find('div').find_all('td')[1].find_all('div')[1]
                r = results.find(class_='op_new_val_screen_result')
                if r == None:
                    pass
                    # print "计算器找不到答案"
                    # continue
                else:
                    # print r.get_text()
                    # print "计算器找到答案"
                    r = r.get_text().strip()
                    if r != "":
                        answer.append(r)
                        flag = 1
                        break
            # Baidu Zhidao best-answer card: follow the link and scrape it.
            if results.attrs.has_key('mu') and i == 1:
                # print results.attrs["mu"]
                r = results.find(class_='op_best_answer_question_link')
                if r == None:
                    r = results.find(class_='op_generalqa_answer_title')
                    if r == None:
                        pass
                        # print "百度知道图谱找不到答案"
                    else:
                        r = r.a
                        # print "百度知道图谱找到答案"
                        url = r['href']
                        zhidao_soup = To.get_html_zhidao(url, req)
                        r = zhidao_soup.find(class_='bd answer').find('pre')
                        if r == None:
                            continue
                        answer.append(r.get_text())
                        flag = 1
                        break
                else:
                    # print "百度知道图谱找到答案"
                    url = r['href']
                    zhidao_soup = To.get_html_zhidao(url, req)
                    r = zhidao_soup.find(class_='bd answer')
                    if r == None:
                        continue
                    r = r.find('pre')
                    if r == None:
                        continue
                    answer.append(r.get_text())
                    flag = 1
                    break
            if results.find("h3") != None:
                # Ordinary result whose title links to Baidu Zhidao.
                if results.find("h3").find("a").get_text().__contains__(
                        u"百度知道") and (i <= 5):
                    url = results.find("h3").find("a")['href']
                    if url == None:
                        # print "百度知道找不到答案"
                        continue
                    else:
                        # print "百度知道找到答案"
                        zhidao_soup = To.get_html_zhidao(url, req)
                        r = zhidao_soup.find(class_='bd answer')
                        if r == None:
                            continue
                        r = r.find(class_='best-text mb-10')
                        if r == None:
                            continue
                        # Strip the "best answer" overlay element if present.
                        ex = r.find(class_='wgt-best-mask')
                        if ex != None:
                            ex.extract()
                        r = r.get_text().strip()
                        if r != "":
                            answer.append(r)
                            flag = 1
                            break
            # Accumulate snippet text (kept for parity; not used below).
            text += results.get_text()
    # DB lookup / targeted scrape / Baidu cards are done — package the result.
    answerdict['schoolname'] = schoolname
    answerdict['intention'] = intention
    if flag == 1:
        answerdict['answer'] = answer
        return answerdict
    else:
        # Last resort: pull paragraph text from the first result pages.
        for i in range(1, 10):
            results = soup_baidu.find(id=i)
            if (results == None):
                answer.append(u"很抱歉,网络可能出现异常!")
                break
            ########
            url = results.find("h3").find("a")['href']
            if url == None:
                # print "百度知道找不到答案"
                continue
            else:
                # print "百度知道找到答案"
                zhidao_soup = To.get_html_baidufirst(url, req)
                r = zhidao_soup.findAll("p")
                if r == None:
                    continue
                else:
                    for txt in r:
                        t = txt.get_text().strip()
                        if t != "":
                            answer.append(txt.get_text().strip())
                        if answer.__len__() >= 20:
                            break
                    if answer.__len__() >= 3:
                        break
            #########
            # r = results.find(class_ = "c-abstract")
            # if r==None:
            #     continue
            # else:
            #     [s.extract() for s in r(['span'])]
            #     answer.append(r.get_text())
            #     break
        del req
        answerdict['answer'] = answer
        return answerdict
def ws():
    """WebSocket chat endpoint: receive a question, compute an answer, reply.

    Pulls the WebSocket object off the WSGI environ (gevent-websocket style),
    then loops forever: decode the incoming layim-style JSON message, run the
    text through the AIML / baike / summary-search pipeline, post-process the
    answer, and push back a layim-compatible message dict as JSON.

    NOTE(review): the source arrived with all newlines collapsed; the nesting
    below is a best-effort reconstruction of the original indentation.
    """
    user_socket = request.environ.get('wsgi.websocket')  # type: WebSocket
    while 1:
        msg = user_socket.receive()
        question = json.loads(msg)
        # layim message layout: data.mine.content carries the user's text.
        q = question['data']['mine']['content']
        msg = q
        input_message = str(msg).encode('utf-8')
        # Guard rails: over-long or empty input gets a canned AIML response.
        if len(input_message) > 60:
            answer = mybot.respond("句子长度过长")
            # continue
        elif input_message.strip() == '':
            answer = mybot.respond("无话可说")
            # continue
        # print input_message
        message = T.wordSegment(input_message)  # word segmentation; strips punctuation
        # print 'word Seg:'+ message
        # POS-tag the raw input (result is currently unused in this handler).
        words = T.postag(input_message)
        if message == 'q':
            exit()
        else:
            response = mybot.respond(message)  # look up an answer in the AIML data set
            print "======="
            # NOTE(review): response[0] raises IndexError when response == "",
            # which would make the empty-string branch below unreachable —
            # confirm that mybot.respond never returns "".
            if response[0] == '#':
                print response + 'mark'
            else:
                answer = response
            print "======="
            if response == "":
                ans = mybot.respond('找不到答案')
                answer = ans
            # '#'-prefixed responses are command templates, not literal answers.
            elif response[0] == '#' or len(response) < 1:
                # Baike (encyclopedia) template: "#searchbaike:entity:attr"
                if response.__contains__("searchbaike"):
                    print "searchbaike"
                    print response
                    res = response.split(':')
                    # entity name
                    entity = str(res[1]).replace(" ", "")
                    # attribute name
                    attr = str(res[2]).replace(" ", "")
                    print entity + '<---->' + attr
                    ans = baike.query(entity, attr)
                    # A list result means the attribute lookup succeeded.
                    print type(ans)
                    if type(ans) == list:
                        answer = '回答:' + QAT.ptranswer(ans, False)
                        # continue
                    elif ans.decode('utf-8').__contains__(u'::找不到'):
                        # Fall back to Baidu/Bing abstract search.
                        print "通用搜索"
                        answer = search_summary.kwquery(input_message)
                # No template matched: generic keyword query.
                elif response.__contains__("NoMatchingTemplate"):
                    print "NoMatchingTemplate"
                    ans = search_summary.kwquery(input_message)
                    print type(ans)
                    if len(ans) == 0:
                        ans = mybot.respond('找不到答案')
                        answer = '回答:' + ans
                    elif len(ans) > 1:
                        print "不确定候选答案"
                        answer = ans[0]
                        print 'Eric: '
                        for a in ans:
                            print a.encode("utf-8")
                    else:
                        answer = '回答:' + ans[0].encode("utf-8")
            # Plain template hit: echo the template's answer.
            else:
                answer = '回答:' + response
        # Post-processing: "don't know" answers and weather questions are
        # delegated to geta(); otherwise Baidu-Zhidao boilerplate is stripped.
        s = '展开全部'
        print type(answer).__name__
        if (type(answer).__name__ == 'list' and len(answer) > 0) or '唔... 怎么回答...' in answer or '天气' in msg:
            answer = geta(msg)
            answer = answer
            print 'wocao'
        else:
            # print answer
            if s in str(answer):
                print answer
                # Drop newlines and "展开全部" boilerplate; cut at "已赞过".
                answer = str(answer).replace('\n', '').replace('展开全部', "").split('已赞过')[0]
        print 'OS' + answer
        print user_socket, msg
        res = answer
        # layim-compatible reply payload.
        a = {
            "username": "******",
            "avatar": "https://robot.rszhang.top/images/icon/nv/0.jpg",
            "id": "-2",  # source id (user id for private chat, group id for group chat)
            "type": "friend",  # chat-window source type, taken from the sent message's "to"
            "content": res,  # message body
            "cid": 0,  # message id; optional unless the message must be operated on (e.g. recall)
            "mine": True,  # whether I sent the message; True renders it on the right
            "fromid": "100000",  # sender id (e.g. inside a group); helps with multi-window browsers
            "timestamp": 1467475443306,  # server timestamp in ms; a plain unix timestamp must be *1000
        }
        user_socket.send(json.dumps(a))
def handle(self):
    """Socket-server request handler: answer one question per connection.

    Presumably a socketserver-style handler (TODO confirm the enclosing
    class). Sends a greeting, reads one payload from the TCP connection,
    runs it through the AIML / baike / summary-search pipeline, sends exactly
    one reply via conn.sendall, and ends the session by setting Flag = False.

    NOTE(review): the source arrived with all newlines collapsed; the nesting
    below is a best-effort reconstruction of the original indentation.
    """
    conn = self.request
    conn.sendall('欢迎访问智能百科问答系统')
    Flag = True
    data = conn.recv(4096)
    while Flag:
        input_message = data
        print "input_message====="
        print input_message
        print "=========="
        reply = ''
        # Guard rails: over-long or "无" input gets a canned AIML response.
        if len(input_message) > 60:
            print mybot.respond("句子长度过长")
            reply = mybot.respond("句子长度过长")
            conn.sendall(reply)
            Flag = False
            continue
        elif input_message.strip() == '无':
            print mybot.respond("无")
            reply = mybot.respond("无")
            conn.sendall(reply)
            Flag = False
            continue
        print input_message
        message = T.wordSegment(input_message)  # word segmentation; strips punctuation
        print 'word Seg:' + message
        print '词性:'
        words = T.postag(input_message)
        if message == 'q':
            exit()
        else:
            response = mybot.respond(message)
            print "======="
            print response
            print "======="
            if response == "":
                ans = mybot.respond('找不到答案')
                print 'Eric:' + ans
                reply = mybot.respond('找不到答案')
                conn.sendall(reply)
                Flag = False
            # '#'-prefixed responses are command templates, not literal answers.
            elif response[0] == '#':
                # Baike (encyclopedia) template: "#searchbaike:entity:attr"
                if response.__contains__("searchbaike"):
                    print "searchbaike"
                    print response
                    res = response.split(':')
                    # entity name
                    entity = str(res[1]).replace(" ", "")
                    # attribute name
                    attr = str(res[2]).replace(" ", "")
                    print entity + '<---->' + attr
                    ans = baike.query(entity, attr)
                    # A list result means the attribute lookup succeeded.
                    if type(ans) == list:
                        print 'Eric:' + QAT.ptranswer(ans, False)
                        reply = QAT.ptranswer(ans, False)
                        conn.sendall(reply)
                        Flag = False
                        continue
                    elif ans.decode('utf-8').__contains__(u'::找不到'):
                        # Fall back to Baidu/Bing abstract search.
                        print "通用搜索"
                        ans = search_summary.kwquery(input_message)
                # No template matched: generic keyword query.
                elif response.__contains__("NoMatchingTemplate"):
                    print "NoMatchingTemplate"
                    ans = search_summary.kwquery(input_message)
                # NOTE(review): if the '#' template contains neither
                # "searchbaike" nor "NoMatchingTemplate", `ans` is unbound
                # here and len(ans) raises NameError — confirm templates.
                if len(ans) == 0:
                    ans = mybot.respond('找不到答案')
                    print 'Eric:' + ans
                    reply = ans
                    conn.sendall(reply)
                    Flag = False
                elif len(ans) > 1:
                    print "不确定候选答案"
                    print 'Eric: '
                    for a in ans:
                        print a.encode("utf8")
                        reply += a.encode("utf8") + '\n'
                    conn.sendall(reply)
                    Flag = False
                else:
                    print 'Eric:' + ans[0].encode("utf8")
                    reply = ans[0].encode("utf8")
                    conn.sendall(reply)
                    Flag = False
            # Plain template hit: echo the template's answer.
            else:
                print 'Eric:' + response
                reply = response
                conn.sendall(reply)
                Flag = False
def qa():
    """Interactive console loop for the QA robot.

    Initializes the jieba segmenter, loads the AIML corpora, prints the
    banner, then reads questions via raw_input forever, answering through
    the AIML / baike / summary-search pipeline. Entering a message that
    segments to 'q' exits the process.

    NOTE(review): the source arrived with all newlines collapsed; the nesting
    below — and the exact spacing of the ASCII banner — is a best-effort
    reconstruction.
    """
    # Initialize the jieba word segmenter.
    T.jieba_initialize()
    # Working directory that holds the corpora.
    mybot_path = './'
    # os.chdir(mybot_path)
    mybot = aiml.Kernel()
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/std-startup.xml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/Common conversation.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/bye.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/tools.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/bad.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/funny.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/OrdinaryQuestion.aiml")
    # mybot.respond('Load Doc Snake')
    # Load the Baike attribute list.
    print '''
 .----------------.  .-----------------. .----------------.  .----------------.  .----------------. 
| .--------------. || .--------------. || .--------------. || .--------------. || .--------------. |
| |    _______   | || |  ____  _____ | || |      __      | || |  ___  ____   | || |  _________   | |
| |   /  ___  |  | || | |_   \|_   _| | || |     /  \     | || | |_  ||_  _|  | || | |_   ___  |  | |
| |  |  (__ \_|  | || |   |   \ | |  | || |    / /\ \    | || |   | |_/ /    | || |   | |_  \_|  | |
| |   '.___`-.   | || |   | |\ \| |  | || |   / /__\ \   | || |   |  __'.    | || |   |  _|  _   | |
| |  |`\____) |  | || |  _| |_\   |_ | || | _/ /    \ \_ | || |  _| |  \ \_  | || |  _| |___/ |  | |
| |  |_______.'  | || ||_____|\____| | || ||____|  |____|| || | |____||____| | || | |_________|  | |
| |              | || |              | || |              | || |              | || |              | |
| '--------------' || '--------------' || '--------------' || '--------------' || '--------------' |
 '----------------'  '----------------'  '----------------'  '----------------'  '----------------' 

Eric:你好,我是Eric。╭(╯^╰)╮
'''
    while True:
        input_message = raw_input("Enter your message >> ")
        # Guard rails: over-long or empty input gets a canned AIML response.
        if len(input_message) > 60:
            print mybot.respond("句子长度过长")
            continue
        elif input_message.strip() == '':
            print mybot.respond("无")
            continue
        print input_message
        message = T.wordSegment(input_message)  # word segmentation; strips punctuation
        print 'word Seg:' + message
        print '词性:'
        words = T.postag(input_message)
        if message == 'q':
            exit()
        else:
            response = mybot.respond(message)
            print "======="
            print response
            print "======="
            if response == "":
                ans = mybot.respond('找不到答案')
                print 'Eric:' + ans
            # '#'-prefixed responses are command templates, not literal answers.
            elif response[0] == '#':
                # Baike (encyclopedia) template: "#searchbaike:entity:attr"
                if response.__contains__("searchbaike"):
                    print "searchbaike"
                    print response
                    res = response.split(':')
                    # entity name
                    entity = str(res[1]).replace(" ", "")
                    # attribute name
                    attr = str(res[2]).replace(" ", "")
                    print entity + '<---->' + attr
                    ans = baike.query(entity, attr)
                    # A list result means the attribute lookup succeeded.
                    if type(ans) == list:
                        print 'Eric:' + QAT.ptranswer(ans, False)
                        continue
                    elif ans.decode('utf-8').__contains__(u'::找不到'):
                        # Fall back to Baidu/Bing abstract search.
                        print "通用搜索"
                        ans = search_summary.kwquery(input_message)
                # No template matched: generic keyword query.
                elif response.__contains__("NoMatchingTemplate"):
                    print "NoMatchingTemplate"
                    ans = search_summary.kwquery(input_message)
                # NOTE(review): `ans` is unbound here if the '#' template
                # matched neither branch above — confirm template set.
                if len(ans) == 0:
                    ans = mybot.respond('找不到答案')
                    print 'Eric:' + ans
                elif len(ans) > 1:
                    print "不确定候选答案"
                    print 'Eric: '
                    for a in ans:
                        print a.encode("utf8")
                else:
                    print 'Eric:' + ans[0].encode("utf8")
            # Plain template hit: echo the template's answer.
            else:
                print 'Eric:' + response
def kwquery(query):
    """Keyword-based QA: answer *query* by scraping Baidu search results.

    Pipeline: POS-tag the query and keep noun keywords; scan the first ~9
    Baidu result blocks for an exact-answer card, a knowledge-graph hit, or
    a Baike (encyclopedia) lemma summary; failing all of those, split the
    accumulated snippet text into sentences, keep those containing a keyword,
    count person-name tokens (POS flag "nr"), and return up to the three most
    frequent names that are not themselves query keywords.

    Returns a list of candidate answer strings (possibly empty).

    NOTE(review): the source arrived with all newlines collapsed; the nesting
    below is a best-effort reconstruction of the original indentation.
    """
    # Segment and tag the query; keep only nouns as keywords.
    keywords = []
    words = T.postag(query)
    for k in words:
        # keep nouns only (POS flag contains "n")
        if k.flag.__contains__("n"):
            keywords.append(k.word)
    answer = []
    text = ''
    # flag flips to 1 as soon as an answer has been found
    flag = 0
    # Fetch the Baidu result page for the query (first ~10 abstracts).
    soup_baidu = To.get_html_baidu('https://www.baidu.com/s?wd=' + quote(query))
    for i in range(1, 10):
        if soup_baidu == None:
            break
        results = soup_baidu.find(id=i)
        if results == None:
            # no more result blocks on the page
            break
        # Exact-answer card (checked only on the first result block).
        if results.attrs.get('mu') and i == 1:
            r = results.find(class_='op_exactqa_s_answer')
            if r == None:
                pass
            else:
                answer.append(r.get_text().strip())
                flag = 1
                break
        # Knowledge-graph lookup on the first two keywords.
        # NOTE(review): while unanswered this re-runs every iteration with
        # the same arguments (loop-invariant) — kept as-is to preserve
        # behavior; a candidate for hoisting.
        if flag == 0 and len(keywords) > 1:
            ans_lst = list(kgquery_entity(keywords[0], keywords[1]))
            if len(ans_lst) != 0:
                answer = ans_lst
                flag = 1
        # Baike lemma summary from one of the first two organic results.
        if results.find("h3") != None and flag == 0:
            if results.find("h3").find("a").get_text().__contains__(
                    u"百度百科") and (i == 1 or i == 2):
                url = results.find("h3").find("a")['href']
                if url == None:
                    continue
                else:
                    baike_soup = To.get_html_baike(url)
                    r = baike_soup.find(class_='lemma-summary')
                    if r == None:
                        continue
                    else:
                        r = r.get_text().replace("\n", "").strip()
                        answer.append(r)
                        flag = 1
                        break
        # Accumulate snippet text for the fallback sentence analysis.
        text += results.get_text().strip()
    if flag == 0:
        # Split the accumulated text into sentences on these delimiters.
        cutlist = [u"。", u"?", u".", u"_", u"-", u":", u"!", u"?"]
        temp = ''
        sentences = []
        for i in range(0, len(text)):
            if text[i] in cutlist:
                if temp == '':
                    continue
                else:
                    sentences.append(temp)
                    temp = ''
            else:
                temp += text[i]
        # Keep only sentences containing at least one query keyword.
        key_sentences = {}
        for s in sentences:
            for k in keywords:
                if k in s:
                    key_sentences[s] = 1
        # Count person names (POS flag "nr") in the kept sentences.
        target_list = {}
        for ks in key_sentences:
            words = T.postag(ks)
            for w in words:
                if w.flag == ("nr"):
                    if target_list.get(w.word):
                        target_list[w.word] += 1
                    else:
                        target_list[w.word] = 1
        # Sort candidate names by frequency, highest first.
        sorted_lists = sorted(target_list.items(),
                              key=operator.itemgetter(1),
                              reverse=True)
        sorted_lists2 = []  # candidate queue, excluding the keywords themselves
        for i, st in enumerate(sorted_lists):
            if st[0] in keywords:
                continue
            else:
                sorted_lists2.append(st)
        # Return at most the top 3 most frequent names.
        answer = []
        for i, st in enumerate(sorted_lists2):
            if i < 3:
                answer.append(st[0])
    return answer