Beispiel #1
0
def work(schoolname, req):
    """Return the detail text behind the fourth navigation entry of a school page.

    The school's page is loaded through ``To.get_html_baidu_selenium1``; the
    link inside the fourth ``<li>`` of the navigation bar is then followed on
    gaokao.chsi.com.cn and the text of its ``yxk-detail-con`` element returned.
    (Judging by the function name this is presumably the employment section;
    confirm against the live site.)

    Args:
        schoolname: School name handed to the project page loader.
        req: Opaque request/session object passed through to the ``To`` helpers.

    Returns:
        The stripped section text, or "" if anything goes wrong (the exception
        is printed).
    """
    site_root = "http://gaokao.chsi.com.cn"
    try:
        landing = To.get_html_baidu_selenium1(schoolname, req)
        nav_bar = landing.find(class_="nav-container clearfix")
        nav_items = nav_bar.find("span").find("ul").find_all("li")
        target_href = nav_items[3].a["href"]
        detail_page = To.get_html_baidu(site_root + target_href, req)
        return detail_page.find(class_="yxk-detail-con").text.strip()
    except Exception as err:
        print(err)
        return ""
def canteen(schoolname, req):
    """Return the text of the second detail section of a school's "食宿条件" page.

    The navigation links of the school page are searched for the one labelled
    "食宿条件"; that page is fetched from gaokao.chsi.com.cn and the text of
    its second ``yxk-detail-con`` element is returned.

    Args:
        schoolname: School name handed to the project page loader.
        req: Opaque request/session object passed through to the ``To`` helpers.

    Returns:
        The section text, or "" when the link is missing or any step fails
        (failures are printed).
    """
    try:
        landing = To.get_html_baidu_selenium1(schoolname, req)
        links = landing.find(class_="nav-container clearfix").find_all("a")
        # Explicit "not found" instead of relying on an unbound local variable
        # to raise and be swallowed by the except clause below.
        url = next((a["href"] for a in links if a.text == u"食宿条件"), None)
        if url is None:
            return ""
        page = To.get_html_baidu("http://gaokao.chsi.com.cn" + url, req)
        sections = page.find_all(class_="yxk-detail-con")
        return sections[1].text
    except Exception as ex:
        print(ex)
        return ""
Beispiel #3
0
def major(schoolname, req):
    """Return the comma-separated entries of a school's "专业介绍" page.

    The ``sm_nav bk`` navigation links are scanned for "专业介绍"; the linked
    page is fetched and the text of every ``<span>`` inside ``plan_con`` is
    cleaned ('|' removed, whitespace stripped, UTF-8 encoded) and joined.

    Args:
        schoolname: School name handed to the project page loader.
        req: Opaque request/session object passed through to the ``To`` helpers.

    Returns:
        The joined list with leading/trailing commas removed, or "" when the
        link is absent or any step fails (failures are printed).
    """
    try:
        landing = To.get_html_baidu_selenium2(schoolname, req)
        for link in landing.find(class_="sm_nav bk").find_all("a"):
            if link.text != u"专业介绍":
                continue
            detail = To.get_html_baidu(link["href"], req)
            spans = detail.find(class_="plan_con").find_all("span")
            pieces = [
                span.text.replace('|', '').strip().encode("utf-8")
                for span in spans
            ]
            return ','.join(pieces).strip(',')
        return ""
    except Exception as err:
        print(err)
        return ""
Beispiel #4
0
def query(entity, attr):
    """Look up attribute ``attr`` in the Baidu Baike info box of ``entity``.

    The info box is parsed with ``get_info``.  If ``attr`` is not a key of the
    result, it is mapped through the project synonym tables
    (``resources/Attribute_name.txt`` and ``resources/SynonDic.txt``, located
    one directory above this file's directory) and looked up again.

    Args:
        entity: Entry name appended to the Baike item URL.
        attr: Attribute name; a byte string is decoded as UTF-8 for the lookup.

    Returns:
        The info-box value for the attribute, or ``attr + "::找不到"`` when the
        page has no info box or the attribute cannot be resolved.
    """
    def _as_text(value):
        # Byte strings need decoding to match the keys; text passes through
        # untouched (calling .decode on unicode would trigger an implicit
        # ASCII encode and fail for Chinese attribute names).
        return value.decode('utf8') if isinstance(value, bytes) else value

    soup = To.get_html_baidu("http://baike.baidu.com/item/" + entity)
    basicInfo_block = soup.find(class_='basic-info cmn-clearfix')
    if basicInfo_block is None:
        return attr + "::找不到"

    info = get_info(basicInfo_block)
    key = _as_text(attr)
    if key in info:
        return info[key]

    # No direct hit: resolve the attribute through the synonym dictionaries.
    base_dir = os.path.dirname(os.path.split(os.path.realpath(__file__))[0])
    attr_list = T.load_baikeattr_name(base_dir + '/resources/Attribute_name.txt')
    attr = T.load_synonyms_word_inattr(
        attr, base_dir + '/resources/SynonDic.txt', attr_list)
    key = _as_text(attr)
    if key in info:
        return info[key]
    return attr + "::找不到"
def website(schoolname, req):
    """Return the text of the first link in the school page's message box.

    The element is located as ``.mid`` -> ``.msg`` -> first ``<span>`` ->
    its ``<a>``.  (Presumably this holds the school's website address; the
    code only shows that the anchor text is returned.)

    Args:
        schoolname: School name handed to the project page loader.
        req: Opaque request/session object passed through to the ``To`` helper.

    Returns:
        The anchor text, or "" if anything goes wrong (the exception is printed).
    """
    try:
        page = To.get_html_baidu_selenium1(schoolname, req)
        message_box = page.find(class_="mid").find(class_="msg")
        anchor = message_box.find("span").a
        return anchor.text
    except Exception as err:
        print(err)
        return ""
Beispiel #6
0
def zipcode(schoolname, req):
    """Return the first six characters of Baidu's postcode widget for a school.

    Searches Baidu for ``schoolname + "邮编"`` and reads the
    ``op_post_content`` element of the result page.

    Args:
        schoolname: School name used to build the search query.
        req: Opaque request/session object passed through to the ``To`` helper.

    Returns:
        The first six characters of the stripped widget text, or "" if the
        widget is missing or the request fails (the exception is printed).
    """
    search_url = "https://www.baidu.com/s?wd="
    try:
        page = To.get_html_baidu(search_url + schoolname + "邮编", req)
        widget_text = page.find(class_="op_post_content").text
        return widget_text.strip()[:6]
    except Exception as err:
        print(err)
        return ""
Beispiel #7
0
def area(schoolname, req):
    """Return Baidu's direct answer for the query ``schoolname + "占地"``.

    Args:
        schoolname: School name used to build the search query.
        req: Opaque request/session object passed through to the ``To`` helper.

    Returns:
        The stripped, UTF-8 encoded text of the ``op_exactqa_s_answer``
        element, or "" if it is missing or the request fails (the exception
        is printed).
    """
    search_url = "https://www.baidu.com/s?wd="
    try:
        page = To.get_html_baidu(search_url + schoolname + "占地", req)
        answer_box = page.find(class_="op_exactqa_s_answer")
        return answer_box.text.strip().encode("utf-8")
    except Exception as err:
        print(err)
        return ""
def phonenumber(schoolname, req):
    """Return the last detail section of a school's "学校简介" page.

    The navigation links are scanned for "学校简介"; if none matches, the bare
    site root is requested instead.  The text of the last ``yxk-detail-con``
    element inside ``container`` is returned.  (Presumably that section holds
    the contact phone number; the code only shows which element is read.)

    Args:
        schoolname: School name handed to the project page loader.
        req: Opaque request/session object passed through to the ``To`` helpers.

    Returns:
        The UTF-8 encoded section text, or "" when the section is empty or any
        step fails (failures are printed).
    """
    try:
        landing = To.get_html_baidu_selenium1(schoolname, req)
        href = ""
        for link in landing.find(class_="nav-container clearfix").find_all("a"):
            if link.text == u"学校简介":
                href = link["href"]
                break
        detail = To.get_html_baidu("http://gaokao.chsi.com.cn" + href, req)
        sections = detail.find(class_="container").find_all(class_="yxk-detail-con")
        last_text = sections[-1].text
        if last_text == "":
            return ""
        return last_text.encode("utf-8")
    except Exception as err:
        print(err)
        return ""
def email(schoolname, req):
    """Extract a value from the contact box of a school page.

    The text of ``college_msg bk`` -> ``left contact`` is split on whitespace;
    the third token with its first five characters removed is returned.
    (Presumably those five characters are a label preceding the e-mail
    address; verify against the page layout.)

    Args:
        schoolname: School name handed to the project page loader.
        req: Opaque request/session object passed through to the ``To`` helper.

    Returns:
        The extracted text, or "" if anything goes wrong (the exception is
        printed).
    """
    try:
        page = To.get_html_baidu_selenium2(schoolname, req)
        contact_box = page.find(class_="college_msg bk").find(class_="left contact")
        tokens = contact_box.text.split()
        return tokens[2][5:]
    except Exception as err:
        print(err)
        return ""
Beispiel #10
0
def intro(schoolname, req):
    """Return the opening paragraph of a school's Baidu Baike summary.

    Reads the first ``<div>`` inside ``lemma-summary``.  When the text contains
    both '[' and ']', everything from the first '[' onwards is cut off.

    Args:
        schoolname: Entry name appended to the Baike item URL.
        req: Opaque request/session object passed through to the ``To`` helper.

    Returns:
        The summary text, or "" if anything goes wrong (the exception is
        printed).
    """
    try:
        page = To.get_html_baidu("https://baike.baidu.com/item/" + schoolname, req)
        summary = page.find(class_="lemma-summary").find("div").text
        has_brackets = '[' in summary and ']' in summary
        return summary[:summary.index('[')] if has_brackets else summary
    except Exception as err:
        print(err)
        return ""
Beispiel #11
0
def schoolproperty(schoolname, req):
    """Return the "属性" entry of a school's Baidu Baike info box.

    The ``<dt>`` (name) and ``<dd>`` (value) elements of the
    ``basic-info cmn-clearfix`` block are paired by position and the value
    whose name is exactly "属性" is returned.

    Args:
        schoolname: Entry name appended to the Baike item URL.
        req: Opaque request/session object passed through to the ``To`` helper.

    Returns:
        The value text, or "" when the entry does not exist or any step fails
        (failures are printed).
    """
    try:
        page = To.get_html_baidu("https://baike.baidu.com/item/" + schoolname, req)
        info_box = page.find(class_="basic-info cmn-clearfix")
        names = info_box.find_all("dt")
        values = info_box.find_all("dd")
        # Pair by position; list.index() would match by tag equality and could
        # pick an earlier, identical-looking <dt>.
        for name, value in zip(names, values):
            if name.text == u"属性":
                return value.text
        # Previously the function fell off the end here and returned None.
        return ""
    except Exception as ex:
        print(ex)
        return ""
Beispiel #12
0
def college(schoolname, req):
    """Return the text following the first Baike heading about departments.

    Scans the ``para-title level-3`` headings for one containing "院系" or
    "学院" and returns the text of the node two siblings after it.  When that
    text contains both '[' and ']', it is cut at the first '['.

    Args:
        schoolname: Entry name appended to the Baike item URL.
        req: Opaque request/session object passed through to the ``To`` helper.

    Returns:
        The section text, or "" when no heading matches or any step fails
        (failures are printed).
    """
    try:
        page = To.get_html_baidu("https://baike.baidu.com/item/" + schoolname, req)
        for heading in page.find_all(class_="para-title level-3"):
            title = heading.text
            if not (u"院系" in title or u"学院" in title):
                continue
            body = heading.nextSibling.nextSibling.text
            if u'[' in body and u']' in body:
                body = body[:body.index('[')]
            return body
        return ""
    except Exception as err:
        print(err)
        return ""
Beispiel #13
0
def scoreline(schoolname, req):
    """Build a sentence describing a school's "本科一批" admission scores.

    The first row of the ``lqfs`` table whose text contains "本科一批" is used;
    its first four cells are read as year, province, score line and average
    admission score (as implied by the sentence they are inserted into).

    Args:
        schoolname: School name, also used as the start of the sentence.
        req: Opaque request/session object passed through to the ``To`` helper.

    Returns:
        The assembled sentence, or "" when no row matches, one of the four
        cells is empty, or any step fails (failures are printed).
    """
    try:
        page = To.get_html_baidu_selenium1(schoolname, req)
        for row in page.find(id="lqfs").find_all("tr"):
            if u"本科一批" not in row.text:
                continue
            cells = row.find_all("td")
            if any(cells[k].text == "" for k in range(4)):
                return ""
            year, province, line, average = [
                cells[k].text.encode("utf8") for k in range(4)
            ]
            return (schoolname + year + "年" + province +
                    "的省市分数线是" + line + ",录取平均分是" + average)
        return ""
    except Exception as err:
        print(err)
        return ""
Beispiel #14
0
def team(schoolname, req):
    """Return the text following the first Baike heading containing "师资".

    The node two siblings after the heading is taken; if it contains a
    ``<div>``, the first one is emptied before the text is read.  When the text
    contains both '[' and ']', it is cut at the first '['.  The last character
    of the result is always dropped.

    Args:
        schoolname: Entry name appended to the Baike item URL.
        req: Opaque request/session object passed through to the ``To`` helper.

    Returns:
        The section text, or "" when no heading matches or any step fails
        (failures are printed).
    """
    try:
        page = To.get_html_baidu("https://baike.baidu.com/item/" + schoolname, req)
        for heading in page.find_all(class_="para-title level-3"):
            if u"师资" not in heading.text:
                continue
            section = heading.nextSibling.nextSibling
            if section.find("div") is not None:
                section.div.clear()
            body = section.text
            if '[' in body and ']' in body:
                body = body[:body.index('[')]
            return body[:-1]
        return ""
    except Exception as err:
        print(err)
        return ""
Beispiel #15
0
def query(entity, attr):
    """Look up attribute ``attr`` in the Baidu Baike info box of ``entity``.

    If ``attr`` has no non-empty value in the parsed info box, it is mapped
    through the project synonym tables (``resources/Attribute_name.txt`` and
    ``resources/SynonDic.txt``, one directory above this file's directory) and
    looked up once more.

    Args:
        entity: Entry name appended to the Baike item URL.
        attr: Attribute name to look up.

    Returns:
        The stripped attribute value, or '找不到' when the page has no info box
        or the attribute cannot be resolved.
    """
    not_found = '找不到'
    page = To.get_html_baike("http://baike.baidu.com/item/" + entity)
    info_block = page.find(class_='basic-info cmn-clearfix')
    if not info_block:
        return not_found

    info = get_info(info_block)
    if info.get(attr):
        return info[attr].strip()

    # Fall back to the synonym dictionaries.
    project_root = os.path.dirname(os.path.split(os.path.realpath(__file__))[0])
    known_names = T.load_baikeattr_name(project_root + '/resources/Attribute_name.txt')
    attr = T.load_synonyms_word_inattr(
        attr, project_root + '/resources/SynonDic.txt', known_names)
    if info.get(attr):
        return info[attr].strip()
    return not_found
Beispiel #16
0
def kwquery(query):
    """Look up an answer to ``query`` in Baidu and Bing search results.

    Baidu result blocks with ids 1-9 are scanned.  The first block is checked
    for direct-answer widgets (exact answer, poetry, calendar, calculator,
    best Zhidao answer); the first two blocks are checked for Baidu Zhidao and
    Baidu Baike links.  If nothing is found, Bing's ``bm_box`` knowledge box
    and its "必应网典" entries are tried.  As a last resort the collected
    snippet text is split into sentences, sentences containing a question
    keyword are kept, and the most frequent tokens tagged ``nr`` by
    ``T.postag`` (person names, according to the original comments) are
    returned.

    Missing page elements no longer raise; the affected source is skipped.

    Args:
        query: The question text; it is URL-quoted before being sent.

    Returns:
        A list of answer strings: a single element when a direct answer was
        found, otherwise up to three candidate words (possibly empty).
    """
    def _zhidao_answer(url):
        # Return the answer element of a Zhidao page (<pre>, else the
        # 'line content' element), or None when the page lacks one.
        zhidao_soup = To.get_html_zhidao(url)
        box = zhidao_soup.find(class_='bd answer') if zhidao_soup is not None else None
        if box is None:
            return None
        pre = box.find('pre')
        return pre if pre is not None else box.find(class_='line content')

    # Keywords: tokens whose part-of-speech flag contains "n" (nouns).
    keywords = [k.word for k in T.postag(query) if "n" in k.flag]

    text = ''

    # Scan the Baidu result blocks.
    soup_baidu = To.get_html_baidu('https://www.baidu.com/s?wd=' + quote(query))

    for i in range(1, 10):
        if soup_baidu is None:
            break
        results = soup_baidu.find(id=i)
        if results is None:
            print("百度摘要找不到答案")
            break

        attrs = results.attrs
        # Direct-answer widgets are only looked for in the first block, and
        # only when it carries a 'mu' attribute.
        first_with_mu = i == 1 and 'mu' in attrs
        mu = attrs['mu'] if first_with_mu else ''

        if first_with_mu:
            # Knowledge-graph exact answer.
            r = results.find(class_='op_exactqa_s_answer')
            if r is None:
                print("百度知识图谱找不到答案")
            else:
                print("百度知识图谱找到答案")
                return [r.get_text().strip()]

            # Classical poetry answer.
            r = results.find(class_="op_exactqa_detail_s_answer")
            if r is None:
                print("百度诗词找不到答案")
            else:
                print("百度诗词找到答案")
                return [r.get_text().strip()]

        # Calendar / date widget.
        if first_with_mu and 'http://open.baidu.com/calendar' in mu:
            r = results.find(class_="op-calendar-content")
            if r is None:
                print("百度万年历找不到答案")
            else:
                print("百度万年历找到答案")
                return [r.get_text().strip().replace("\n", "").replace(" ", "")]

        # New-style calendar widget: the answer is carried in the 'fk' attribute.
        if i == 1 and 'calendar_new' in attrs.get('tpl', ''):
            fk = attrs.get('fk')
            if fk is None:
                print("百度万年历新版找不到答案")
            else:
                r = fk.replace("6018_", "")
                print(r)
                print("百度万年历新版找到答案")
                return [r]

        # Calculator widget.
        if first_with_mu and 'http://open.baidu.com/static/calculator/calculator.html' in mu:
            try:
                r = results.find('div').find_all('td')[1].find_all('div')[1]
            except (AttributeError, IndexError):
                # Layout differs from what is expected: treat as "no answer".
                r = None
            if r is None:
                print("计算器找不到答案")
            else:
                print("计算器找到答案")
                return [r.get_text().strip()]

        # Best answer promoted from Baidu Zhidao.
        if first_with_mu:
            r = results.find(class_='op_best_answer_question_link')
            if r is None:
                print("百度知道图谱找不到答案")
            else:
                print("百度知道图谱找到答案")
                url = r.get('href')
                found = _zhidao_answer(url) if url is not None else None
                if found is not None:
                    return [found.get_text()]

        # Zhidao / Baike links are only followed for the first two blocks.
        h3 = results.find("h3")
        link = h3.find("a") if h3 is not None else None
        if link is not None and i in (1, 2):
            title = link.get_text()
            if u"百度知道" in title:
                url = link.get('href')
                if url is None:
                    print("百度知道图谱找不到答案")
                    continue
                print("百度知道图谱找到答案")
                r = _zhidao_answer(url)
                if r is None:
                    continue
                return [r.get_text().strip()]

            if u"百度百科" in title:
                url = link.get('href')
                if url is None:
                    print("百度百科找不到答案")
                    continue
                print("百度百科找到答案")
                baike_soup = To.get_html_baike(url)
                r = baike_soup.find(class_='lemma-summary') if baike_soup is not None else None
                if r is None:
                    continue
                return [r.get_text().replace("\n", "").strip()]

        text += results.get_text()

    # Bing: knowledge box first, then "必应网典" entries.
    soup_bing = To.get_html_bing('https://www.bing.com/search?q=' + quote(query))
    bingbaike = soup_bing.find(class_="bm_box") if soup_bing is not None else None

    if bingbaike is not None:
        v_lists = bingbaike.find_all(class_="b_vList")
        # The second b_vList must exist and contain an <li>.
        if len(v_lists) > 1 and v_lists[1].find("li") is not None:
            print("Bing知识图谱找到答案")
            return [bingbaike.get_text()]
    else:
        print("Bing知识图谱找不到答案")
        results = soup_bing.find(id="b_results") if soup_bing is not None else None
        if results is not None:
            for bl in results.find_all('li'):
                if u" - 必应网典" not in bl.get_text():
                    continue
                print("查找Bing网典")
                h2 = bl.find("h2")
                link = h2.find("a") if h2 is not None else None
                url = link.get('href') if link is not None else None
                if url is None:
                    print("Bing网典找不到答案")
                    continue
                print("Bing网典找到答案")
                bingwd_soup = To.get_html_bingwd(url)
                card = bingwd_soup.find(class_='bk_card_desc') if bingwd_soup is not None else None
                r = card.find("p") if card is not None else None
                if r is None:
                    continue
                return [r.get_text().replace("\n", "").strip()]

            text += results.get_text()

    # Neither engine produced a direct answer: analyse the snippet text.
    # Split into sentences on the delimiter characters.
    cutlist = (u"。", u"?", u".", u"_", u"-", u":", u"!")
    sentences = []
    temp = ''
    for ch in text:
        if ch in cutlist:
            if temp:
                sentences.append(temp)
                temp = ''
        else:
            temp += ch
    if temp:
        # Keep the trailing sentence, which has no closing delimiter.
        sentences.append(temp)

    # Keep each distinct sentence that mentions at least one keyword.
    key_sentences = []
    seen = set()
    for s in sentences:
        if s not in seen and any(k in s for k in keywords):
            seen.add(s)
            key_sentences.append(s)

    # Count the tokens tagged "nr" in those sentences.
    target_list = {}
    for ks in key_sentences:
        for w in T.postag(ks):
            if w.flag == "nr":
                target_list[w.word] = target_list.get(w.word, 0) + 1

    # Most frequent first; words already present in the question are dropped.
    sorted_lists = sorted(target_list.items(), key=lambda item: item[1], reverse=True)
    candidates = [word for word, _count in sorted_lists if word not in keywords]

    print("返回前n个词频")
    return candidates[:3]
Beispiel #17
0
def qa(question):
    """Answer one question on stdout using the AIML bot plus web look-ups.

    A fresh ``aiml.Kernel`` is loaded from the ``resources`` directory next to
    this file, a banner is printed, and the segmented question is sent to the
    bot.  A reply starting with '#' is a directive: "searchbaike" triggers a
    Baike attribute query (falling back to ``search_summary.kwquery``), and
    "NoMatchingTemplate" goes straight to ``search_summary.kwquery``.  Any
    other non-empty reply is printed as the answer.

    Over-long (more than 60 characters) or blank questions are only
    acknowledged with the bot's canned reply and are not processed further.

    Args:
        question: The user's question.

    Returns:
        None; all output is printed.  Calls ``exit()`` when the segmented
        message equals 'q'.
    """
    # Initialise the jieba tokenizer.
    T.jieba_initialize()

    resource_dir = os.path.split(os.path.realpath(__file__))[0] + "/resources/"
    mybot = aiml.Kernel()
    for name in ("std-startup.xml", "bye.aiml", "tools.aiml", "bad.aiml",
                 "funny.aiml", "OrdinaryQuestion.aiml",
                 "Common conversation.aiml"):
        mybot.learn(resource_dir + name)

    # Raw string: the banner's backslashes are literal characters.
    print(r'''
.----------------.  .-----------------. .----------------.  .----------------.  .----------------.
| .--------------. || .--------------. || .--------------. || .--------------. || .--------------. |
| |    _______   | || | ____  _____  | || |      __      | || |  ___  ____   | || |  _________   | |
| |   /  ___  |  | || ||_   \|_   _| | || |     /  \     | || | |_  ||_  _|  | || | |_   ___  |  | |
| |  |  (__ \_|  | || |  |   \ | |   | || |    / /\ \    | || |   | |_/ /    | || |   | |_  \_|  | |
| |   '.___`-.   | || |  | |\ \| |   | || |   / /__\ \   | || |   |  __'.    | || |   |  _|  _   | |
| |  |`\____) |  | || | _| |_\   |_  | || | _/ /    \ \_ | || |  _| |  \ \_  | || |  _| |___/ |  | |
| |  |_______.'  | || ||_____|\____| | || ||____|  |____|| || | |____||____| | || | |_________|  | |
| |              | || |              | || |              | || |              | || |              | |
| '--------------' || '--------------' || '--------------' || '--------------' || '--------------' |
 '----------------'  '----------------'  '----------------'  '----------------'  '----------------'
 Eric:你好,我是Eric。╭(╯^╰)╮
    ''')

    input_message = question

    # Reject unusable input; previously processing carried on after the
    # canned reply was printed.
    if len(input_message) > 60:
        print(mybot.respond("句子长度过长"))
        return
    if input_message.strip() == '':
        print(mybot.respond("无"))
        return

    print(input_message)
    message = T.wordSegment(input_message)
    print('word Seg:' + message)
    print('词性:')
    # Result unused; the call is kept in case it prints the tags announced
    # above (T.postag is defined elsewhere; confirm).
    T.postag(input_message)

    if message == 'q':
        exit()

    response = mybot.respond(message)

    print("=======")
    print(response)
    print("=======")

    if response == "":
        ans = mybot.respond('找不到答案')
        print('Eric:' + ans)
    elif response[0] == '#':
        # Directive from the AIML templates.  Default to "no answer" so an
        # unrecognised directive cannot leave ``ans`` unbound.
        ans = []
        if "searchbaike" in response:
            print("searchbaike")
            print(response)
            res = response.split(':')
            entity = str(res[1]).replace(" ", "")
            attr = str(res[2]).replace(" ", "")
            print(entity + '<---->' + attr)

            ans = baike.query(entity, attr)
            if isinstance(ans, list):
                # Direct Baike hit: report it once and stop.
                print('Eric:' + QAT.ptranswer(ans, False))
                return
            elif u'::找不到' in ans.decode('utf-8'):
                # Fall back to the Baidu/Bing summary search.
                print("通用搜索")
                ans = search_summary.kwquery(input_message)

        elif "NoMatchingTemplate" in response:
            # No template matched: general search.
            print("NoMatchingTemplate")
            ans = search_summary.kwquery(input_message)

        if len(ans) == 0:
            ans = mybot.respond('找不到答案')
            print('Eric:' + ans)
        elif len(ans) > 1:
            print("不确定候选答案")
            print('Eric: ')
            for a in ans:
                print(a.encode("utf8"))
        else:
            print('Eric:' + ans[0].encode("utf8"))
    else:
        # A template matched directly.
        print('Eric:' + response)
Beispiel #18
0
            elif response[0] == '#':
                # 匹配百科
                if response.__contains__("searchbaike"):
                    print "searchbaike"
                    print response
                    res = response.split(':')
                    #实体
                    entity = str(res[1]).replace(" ","")
                    #属性
                    attr = str(res[2]).replace(" ","")
                    print entity+'<---->'+attr

                    ans = baike.query(entity, attr)
                    # 如果命中答案
                    if type(ans) == list:
                        print 'Eric:' + QAT.ptranswer(ans,False)
                        continue
                    elif ans.decode('utf-8').__contains__(u'::找不到'):
                        #百度摘要+Bing摘要
                        print "通用搜索"
                        ans = search_summary.kwquery(input_message)

                # 匹配不到模版,通用查询
                elif response.__contains__("NoMatchingTemplate"):
                    print "NoMatchingTemplate"
                    ans = search_summary.kwquery(input_message)


                if len(ans) == 0:
                    ans = mybot.respond('找不到答案')
                    print 'Eric:' + ans
Beispiel #19
0
def run(question):
    """Answer one question and return the reply instead of printing it.

    The AIML kernel is restored from ``bot_brain.brn`` in the current working
    directory when that file exists; otherwise the rule files in the
    ``resources`` directory next to this file are learned and the brain is
    saved.  A bot reply starting with '#' is a directive ("searchbaike" or
    "NoMatchingTemplate") that triggers a Baike query and/or
    ``search_summary.kwquery``; any other non-empty reply is returned as is.

    Args:
        question: The user's question.

    Returns:
        A string reply, or the list of candidate answers when the summary
        search yields more than one.  Calls ``exit()`` when the segmented
        message equals 'q'.
    """
    # Initialise the jieba tokenizer.
    T.jieba_initialize()

    # Switch to the working directory that holds the corpus.
    mybot_path = './'
    os.chdir(mybot_path)

    mybot = aiml.Kernel()
    if os.path.isfile("bot_brain.brn"):
        mybot.bootstrap(brainFile="bot_brain.brn")
    else:
        resource_dir = os.path.split(os.path.realpath(__file__))[0] + "/resources/"
        for name in ("std-startup.xml", "tuling.xml", "bye.aiml", "tools.aiml",
                     "bad.aiml", "funny.aiml", "OrdinaryQuestion.aiml",
                     "Common conversation.aiml"):
            mybot.learn(resource_dir + name)
        mybot.saveBrain("bot_brain.brn")

    print('''
    Eric:你好,我是问答机器人。╭(╯^╰)╮
       ''')

    input_message = question

    if len(input_message) > 60:
        return mybot.respond("句子长度过长")
    if input_message.strip() == '':
        return mybot.respond("无话可说")

    message = T.wordSegment(input_message)
    # Result unused; call kept in case T.postag has side effects (defined
    # elsewhere; confirm).
    T.postag(input_message)

    if message == 'q':
        exit()

    # Look for an answer in the AIML rule set.
    response = mybot.respond(message)

    print("=======")
    if response == "":
        # Must be tested before indexing response[0], which raised IndexError
        # on an empty reply.
        ans = mybot.respond('找不到答案')
        print('Eric:' + ans)
        return '回答:' + ans
    if response[0] != '#':
        # A template matched directly.
        return response

    print(response + 'mark')
    print("=======")

    # Directive from the AIML templates.  Default to "no answer" so an
    # unrecognised directive cannot leave ``ans`` unbound.
    ans = []
    if "searchbaike" in response:
        print("searchbaike")
        print(response)
        res = response.split(':')
        entity = str(res[1]).replace(" ", "")
        attr = str(res[2]).replace(" ", "")
        print(entity + '<---->' + attr)

        ans = baike.query(entity, attr)

        if isinstance(ans, list):
            # Direct Baike hit.
            return '回答:' + QAT.ptranswer(ans, False)
        elif u'::找不到' in ans.decode('utf-8'):
            # Fall back to the Baidu/Bing summary search.
            print("通用搜索")
            ans = search_summary.kwquery(input_message)

    elif "NoMatchingTemplate" in response:
        # No template matched: general search.
        print("NoMatchingTemplate")
        ans = search_summary.kwquery(input_message)

    if len(ans) == 0:
        ans = mybot.respond('找不到答案')
        return '回答:' + ans
    if len(ans) > 1:
        # Several candidates: hand the whole list back to the caller.
        print("不确定候选答案")
        return ans
    return '回答:' + ans[0].encode("utf-8")
Beispiel #20
0
def kwquery(query):
    """Look up an answer to ``query`` in Baidu and Bing search results.

    Baidu result blocks with ids 1-9 are scanned; only the first block is
    checked for direct-answer widgets (exact answer, poetry, calculator, best
    Zhidao answer) and for Baidu Zhidao / Baidu Baike links.  If nothing is
    found, Bing's ``bm_box`` knowledge box and its "必应网典" entries are
    tried.  As a last resort the collected snippet text is split into
    sentences, sentences containing a question keyword are kept, and the most
    frequent tokens tagged ``nr`` by ``T.postag`` (person names, according to
    the original comments) are returned.

    Missing page elements no longer raise; the affected source is skipped.

    Args:
        query: The question text; it is URL-quoted before being sent.

    Returns:
        A list of answer strings: a single element when a direct answer was
        found, otherwise up to three candidate words (possibly empty).
    """
    def _zhidao_pre(url):
        # Return the <pre> answer element of a Zhidao page, or None.
        zhidao_soup = To.get_html_zhidao(url)
        box = zhidao_soup.find(class_='bd answer') if zhidao_soup is not None else None
        return box.find('pre') if box is not None else None

    # Keywords: tokens whose part-of-speech flag contains "n" (nouns).
    keywords = [k.word for k in T.postag(query) if "n" in k.flag]

    text = ''

    # Scan the Baidu result blocks.
    soup_baidu = To.get_html_baidu('https://www.baidu.com/s?wd=' +
                                   quote(query))

    for i in range(1, 10):
        if soup_baidu is None:
            break
        results = soup_baidu.find(id=i)
        if results is None:
            print("百度摘要找不到答案")
            break

        attrs = results.attrs
        # Direct-answer widgets are only looked for in the first block, and
        # only when it carries a 'mu' attribute.
        first_with_mu = i == 1 and 'mu' in attrs
        mu = attrs['mu'] if first_with_mu else ''

        if first_with_mu:
            # Knowledge-graph exact answer.
            r = results.find(class_='op_exactqa_s_answer')
            if r is None:
                print("百度知识图谱找不到答案")
            else:
                print("百度知识图谱找到答案")
                return [r.get_text().strip()]

            # Classical poetry answer.
            r = results.find(class_="op_exactqa_detail_s_answer")
            if r is None:
                print("百度诗词找不到答案")
            else:
                print("百度诗词找到答案")
                return [r.get_text().strip()]

            # Calculator widget.
            if 'http://open.baidu.com/static/calculator/calculator.html' in mu:
                try:
                    r = results.find('div').find_all('td')[1].find_all('div')[1]
                except (AttributeError, IndexError):
                    # Layout differs from what is expected: "no answer".
                    r = None
                if r is None:
                    print("计算器找不到答案")
                else:
                    print("计算器找到答案")
                    return [r.get_text().strip()]

            # Best answer promoted from Baidu Zhidao.
            r = results.find(class_='op_best_answer_question_link')
            if r is None:
                print("百度知道图谱找不到答案")
            else:
                print("百度知道图谱找到答案")
                url = r.get('href')
                found = _zhidao_pre(url) if url is not None else None
                if found is not None:
                    return [found.get_text()]

        # Zhidao / Baike links are only followed for the first block.
        h3 = results.find("h3")
        link = h3.find("a") if h3 is not None else None
        if link is not None and i == 1:
            title = link.get_text()
            if u"百度知道" in title:
                url = link.get('href')
                if url is None:
                    print("百度知道图谱找不到答案")
                    continue
                print("百度知道图谱找到答案")
                r = _zhidao_pre(url)
                if r is None:
                    continue
                return [r.get_text().strip()]

            if u"百度百科" in title:
                url = link.get('href')
                if url is None:
                    print("百度百科找不到答案")
                    continue
                print("百度百科找到答案")
                baike_soup = To.get_html_baike(url)
                r = baike_soup.find(class_='lemma-summary') if baike_soup is not None else None
                if r is None:
                    continue
                return [r.get_text().replace("\n", "").strip()]

        text += results.get_text()

    # Bing: knowledge box first, then "必应网典" entries.
    soup_bing = To.get_html_bing('https://www.bing.com/search?q=' +
                                 quote(query))
    bingbaike = soup_bing.find(class_="bm_box") if soup_bing is not None else None

    if bingbaike is not None:
        v_lists = bingbaike.find_all(class_="b_vList")
        # The second b_vList must exist and contain an <li>.
        if len(v_lists) > 1 and v_lists[1].find("li") is not None:
            print("Bing知识图谱找到答案")
            return [bingbaike.get_text()]
    else:
        print("Bing知识图谱找不到答案")
        results = soup_bing.find(id="b_results") if soup_bing is not None else None
        if results is not None:
            for bl in results.find_all('li'):
                if u" - 必应网典" not in bl.get_text():
                    continue
                print("查找Bing网典")
                h2 = bl.find("h2")
                link = h2.find("a") if h2 is not None else None
                url = link.get('href') if link is not None else None
                if url is None:
                    print("Bing网典找不到答案")
                    continue
                print("Bing网典找到答案")
                bingwd_soup = To.get_html_bingwd(url)
                card = bingwd_soup.find(class_='bk_card_desc') if bingwd_soup is not None else None
                r = card.find("p") if card is not None else None
                if r is None:
                    continue
                return [r.get_text().replace("\n", "").strip()]

            text += results.get_text()

    # Neither engine produced a direct answer: analyse the snippet text.
    # Split into sentences on the delimiter characters.
    cutlist = (u"。", u"?", u".", u"_", u"-", u":", u"!")
    sentences = []
    temp = ''
    for ch in text:
        if ch in cutlist:
            if temp:
                sentences.append(temp)
                temp = ''
        else:
            temp += ch
    if temp:
        # Keep the trailing sentence, which has no closing delimiter.
        sentences.append(temp)

    # Keep each distinct sentence that mentions at least one keyword.
    key_sentences = []
    seen = set()
    for s in sentences:
        if s not in seen and any(k in s for k in keywords):
            seen.add(s)
            key_sentences.append(s)

    # Count the tokens tagged "nr" in those sentences.
    target_list = {}
    for ks in key_sentences:
        for w in T.postag(ks):
            if w.flag == "nr":
                target_list[w.word] = target_list.get(w.word, 0) + 1

    # Most frequent first; words already present in the question are dropped.
    sorted_lists = sorted(target_list.items(),
                          key=lambda item: item[1],
                          reverse=True)
    candidates = [word for word, _count in sorted_lists if word not in keywords]

    print("返回前n个词频")
    return candidates[:3]
Beispiel #21
0
            elif response[0] == '#':
                # 匹配百科
                if response.__contains__("searchbaike"):
                    print "searchbaike"
                    print response
                    res = response.split(':')
                    #实体
                    entity = str(res[1]).replace(" ", "")
                    #属性
                    attr = str(res[2]).replace(" ", "")
                    print entity + '<---->' + attr

                    ans = baike.query(entity, attr)
                    # 如果命中答案
                    if type(ans) == list:
                        print 'Eric:' + QAT.ptranswer(ans, False)
                        continue
                    elif ans.decode('utf-8').__contains__(u'::找不到'):
                        #百度摘要+Bing摘要
                        print "通用搜索"
                        ans = search_summary.kwquery(input_message)

                # 匹配不到模版,通用查询
                elif response.__contains__("NoMatchingTemplate"):
                    print "NoMatchingTemplate"
                    ans = search_summary.kwquery(input_message)

                if len(ans) == 0:
                    ans = mybot.respond('找不到答案')
                    print 'Eric:' + ans
                elif len(ans) > 1:
Beispiel #22
0
def answer(question):
  if len(question) > 600:
      print mybot.respond("句子长度过长")
      raise Exception("Too Long")
  elif question.strip() == '':
      print mybot.respond("无")
      raise Exception("No Input")

  print question
  message = T.wordSegment(question)
  # 去标点
  print 'word Seg:'+ message
  print '词性:'
  words = T.postag(question)


  if message == 'q':
      exit()
  else:
    response = mybot.respond(message)

    print response

    if response == "":
        raise Exception("No Answer")
    # 百科搜索
    elif response[0] == '#':
      # 匹配百科
      if response.__contains__("searchbaike"):
        print "searchbaike"
        print response
        res = response.split(':')
        #实体
        entity = str(res[1]).replace(" ","")
        #属性
        attr = str(res[2]).replace(" ","")
        print entity+'<---->'+attr

        ans = baike.query(entity, attr)
        # 如果命中答案
        if type(ans) == list:
          print 'Eric:' + QAT.ptranswer(ans,False)
          return [QAT.ptranswer(ans,False)]
        elif ans.decode('utf-8').__contains__(u'::找不到'):
          #百度摘要+Bing摘要
          print "通用搜索"
          ans = search_summary.kwquery(question)

      # 匹配不到模版,通用查询
      elif response.__contains__("NoMatchingTemplate"):
        print "NoMatchingTemplate"
        ans = search_summary.kwquery(question)


      if len(ans) == 0:
        raise Exception("No Answer")
      elif len(ans) >1:
        print "不确定候选答案"
        print 'Eric: '
        for a in ans:
          print a.encode("utf8")
        return [a.encode("utf8") for a in ans]
      else:
        print 'Eric:' + ans[0].encode("utf8")
        return [ans[0].encode("utf8")]

    # 匹配模版
    else:
      print 'Eric:' + response
      return [response]
def kwquery(query, intention, schoolname):
    #分词 去停用词 抽取关键词
    keywords = []
    # try:
    #     pynlpir.open()
    #     keywords = pynlpir.get_key_words(query, weighted=True)
    #     print "关键词:"
    #     for key_word in keywords:
    #         print key_word[0], '\t', key_word[1]
    #     pynlpir.close()
    # except Exception as ex:
    #     print ex

    # words = T.postag(query)
    # for k in words:
    #     # 只保留名词
    #     if k.flag.__contains__("n"):
    #         # print k.flag
    #         # print k.word
    #         keywords.append(k.word)
    # to === html_tools.py
    req = To.Session()
    #初始化session对话 为爬虫做准备
    # req.session.cookies.clear()
    answer = []
    answerdict = {}
    answerdict["schoolname"] = ""
    answerdict["intention"] = ""
    answerdict["index"] = -1

    text = ''
    # 找到答案就置1
    flag = 0

    dic = {
        '分数线': scoreline,
        '食堂': canteen,
        '宿舍': dormitory,
        '官网': website,
        '英文名': englishname,
        '专业': major,
        '学院': college,
        '收费': charge,
        '地址': location,
        '邮编': zipcode,
        '占地': area,
        '招办电话': phonenumber,
        '学校性质': schoolproperty,
        '邮箱': email,
        '硕士点': master,
        '博士点': doctor,
        '校庆日': celebration,
        '知名校友': alumnus,
        '就业情况': work,
        '创办时间': establish_time,
        '学校代码': schoolcode,
        '师资力量': team,
        '学校简介': intro,
    }
    #switch语句
    if intention != "" and schoolname != "":

        pachong = False
        try:
            # 先进行查询
            sql = u"SELECT `内容`,`序号` FROM `问答预存` WHERE `学校名`='" + schoolname + "' AND `意图`='" + intention + "'ORDER BY `均分` DESC"
            SQLresults = pj.EXSQL(sql)
            #没有进行定向爬取
            if len(SQLresults) == 0:
                answerdict['index'] = 1
                pachong = True
                print u'数据库中没有此记录'
            else:
                #如有 则进行概率选择
                print u'数据库有记录表'
                dangwei = random.randint(1, 10)
                if (dangwei <= 4):
                    dangwei = 1
                elif (dangwei >= 9):
                    dangwei = 4
                elif (dangwei >= 5 and dangwei <= 7):
                    dangwei = 2
                else:
                    dangwei = 3
                print("档位:" + dangwei.__str__())  #调用__str__方法,输出挡位值
                #添加交互信息
                if (dangwei == 4):
                    answerdict['index'] = 4
                if (dangwei <= 3):
                    answerdict['index'] = SQLresults[dangwei - 1][1]
                    answer.append(SQLresults[dangwei - 1][0])
                    flag = 1
        except Exception as e:
            print(e)

        if intention != "其他":
            print u'非其他'
            if pachong:
                yitupaqu = dic[intention](schoolname, req).strip()
                if yitupaqu != "":
                    print u'调用爬虫后的数据', yitupaqu
                    answer.append(yitupaqu)
                    pj.InitSchool(schoolname, intention, yitupaqu)
                    flag = 1
    if flag == 1:
        print("before search")

    # 抓取百度前10条的摘要
    soup_baidu = To.get_html_baidu(
        'https://www.baidu.com/s?wd=' + quote(query), req)
    #判断是否有两个id为1的页面
    for i in range(1, 10):
        if flag == 1:
            break
        if soup_baidu == None:
            break
        resultsS = soup_baidu.find_all(id=i)
        for results in resultsS:
            if results == None:
                # print "百度摘要找不到答案"
                continue
                # print '============='
                # print results.attrs
                # print type(results.attrs)
                # print results['class']
                # 判断是否有mu,如果第一个是百度知识图谱的 就直接命中答案
            if results.attrs.has_key('mu') and i == 1:
                # print results.attrs["mu"]
                r = results.find_all(class_='op_best_answer_content')
                if r == None or len(r) < 2:
                    pass  # print "百度知识图谱找不到答案"
                else:
                    # print r.get_text()
                    # print "百度知识图谱找到答案"
                    r = r[1]
                    r = r.get_text().strip()
                    if r != "":
                        answer.append(r)
                        flag = 1
                        break

                r = results.find(class_='op_exactqa_s_answer')
                if r == None:
                    pass  # print "百度知识图谱找不到答案"
                else:
                    # print r.get_text()
                    # print "百度知识图谱找到答案"
                    r = r.get_text().strip()
                    if r != "":
                        answer.append(r)
                        flag = 1
                        break

            # 百度百科
            # if results.find("h3").find("a").get_text().__contains__(u"百度百科") and (i == 1 or i ==2 or i==3):
            if results.find("h3") != None:
                if results.find("h3").find("a").get_text().__contains__(
                        u"_百度百科"):
                    url = results.find("h3").find("a")['href']
                    if url == None:
                        # print "百度百科找不到答案"
                        continue
                    else:
                        # print "百度百科找到答案"
                        baike_soup = To.get_html_baike(url, req)

                        r = baike_soup.find(class_='lemma-summary')
                        if r == None:
                            continue
                        else:
                            r = r.get_text().replace("\n", "").strip()
                            if r != "":
                                answer.append(r)
                                flag = 1
                                break

            # 古诗词判断
            if results.attrs.has_key('mu') and i == 1:
                r = results.find(class_="op_exactqa_detail_s_answer")
                if r == None:
                    pass  # print "百度诗词找不到答案"
                else:
                    # print r.get_text()
                    # print "百度诗词找到答案"
                    r = r.get_text().strip()
                    if r != "":
                        answer.append(r)
                        flag = 1
                        break

            # 万年历 & 日期
            if results.attrs.has_key('mu') and i == 1 and results.attrs[
                    'mu'].__contains__('http://open.baidu.com/calendar'):
                r = results.find(class_="op-calendar-content")
                if r == None:
                    pass  # print "百度万年历找不到答案"
                else:
                    # print r.get_text()
                    # print "百度万年历找到答案"
                    r = r.get_text().strip().replace("\n", "").replace(" ", "")
                    if r != "":
                        answer.append(r)
                        flag = 1
                        break

            if results.attrs.has_key('tpl') and i == 1 and results.attrs[
                    'tpl'].__contains__('calendar_new'):
                # r = results.attrs['fk'].replace("6018_","")
                if results.find(attrs={"data-compress": "off"}):
                    r = results.find(attrs={"data-compress": "off"}).get_text()
                    r = r[r.find('selectDate'):]
                    r = r[r.find('[') + 1:r.find(']')]
                    r = r.replace("\"", "")
                    r = r.split(',')

                # r=results.find(class_="op-calendar-new-right-date")
                # print r
                if r == None:
                    pass  # print "百度万年历新版找不到答案"
                    # continue
                else:
                    # print r.get_text()
                    # print "百度万年历新版找到答案"
                    r = r[0] + "年" + r[1] + "月" + r[2] + "日"
                    answer.append(r)
                    flag = 1
                    break

            if results.attrs.has_key('tpl') and i <= 2 and results.attrs[
                    'tpl'].__contains__('exactqa'):
                # r = results.attrs['fk'].replace("6018_","")
                r = results.find(class_="op_exactqa_s_prop c-gap-bottom-small")
                # print r.a

                if r == None:
                    pass  # print "百度黄历找不到答案"
                    # continue
                else:
                    r = r.a
                    # print "百度黄历找到答案"
                    answer.append(r.get_text())
                    flag = 1
                    break

            # 计算器
            if results.attrs.has_key(
                    'mu'
            ) and i == 1 and results.attrs['mu'].__contains__(
                    'http://open.baidu.com/static/calculator/calculator.html'):
                # r = results.find('div').find_all('td')[1].find_all('div')[1]
                r = results.find(class_='op_new_val_screen_result')
                if r == None:
                    pass  # print "计算器找不到答案"
                    # continue
                else:
                    # print r.get_text()
                    # print "计算器找到答案"
                    r = r.get_text().strip()
                    if r != "":
                        answer.append(r)
                        flag = 1
                        break

            # 百度知道答案
            if results.attrs.has_key('mu') and i == 1:
                # print results.attrs["mu"]
                r = results.find(class_='op_best_answer_question_link')
                if r == None:
                    r = results.find(class_='op_generalqa_answer_title')
                    if r == None:
                        pass  # print "百度知道图谱找不到答案"
                    else:
                        r = r.a
                        # print "百度知道图谱找到答案"
                        url = r['href']
                        zhidao_soup = To.get_html_zhidao(url, req)
                        r = zhidao_soup.find(class_='bd answer').find('pre')
                        if r == None:
                            continue
                        answer.append(r.get_text())
                        flag = 1
                        break
                else:
                    # print "百度知道图谱找到答案"
                    url = r['href']
                    zhidao_soup = To.get_html_zhidao(url, req)
                    r = zhidao_soup.find(class_='bd answer')
                    if r == None:
                        continue
                    r = r.find('pre')
                    if r == None:
                        continue
                    answer.append(r.get_text())
                    flag = 1
                    break

            if results.find("h3") != None:
                # 百度知道
                if results.find("h3").find("a").get_text().__contains__(
                        u"百度知道") and (i <= 5):
                    url = results.find("h3").find("a")['href']
                    if url == None:
                        # print "百度知道找不到答案"
                        continue
                    else:
                        # print "百度知道找到答案"
                        zhidao_soup = To.get_html_zhidao(url, req)

                        r = zhidao_soup.find(class_='bd answer')
                        if r == None:
                            continue
                        r = r.find(class_='best-text mb-10')
                        if r == None:
                            continue
                        ex = r.find(class_='wgt-best-mask')
                        if ex != None:
                            ex.extract()
                        r = r.get_text().strip()
                        if r != "":
                            answer.append(r)
                            flag = 1
                            break

            text += results.get_text()

    #经过上述去数据库查询,去百度搜索关键词并写入数据库或者针对其他目的进行爬虫搜索获得ans
    answerdict['schoolname'] = schoolname
    answerdict['intention'] = intention
    #如果有答案
    if flag == 1:
        answerdict['answer'] = answer
        return answerdict
    else:
        for i in range(1, 10):
            results = soup_baidu.find(id=i)
            if (results == None):
                answer.append(u"很抱歉,网络可能出现异常!")
                break

            ########
            url = results.find("h3").find("a")['href']
            if url == None:
                # print "百度知道找不到答案"
                continue
            else:
                # print "百度知道找到答案"
                zhidao_soup = To.get_html_baidufirst(url, req)

                r = zhidao_soup.findAll("p")
                if r == None:
                    continue
                else:
                    for txt in r:
                        t = txt.get_text().strip()
                        if t != "":
                            answer.append(txt.get_text().strip())
                            if answer.__len__() >= 20:
                                break
                    if answer.__len__() >= 3:
                        break
            #########

            # r = results.find(class_ = "c-abstract")
            # if r==None:
            #     continue
            # else:
            #     [s.extract() for s in r(['span'])]
            #     answer.append(r.get_text())
            #     break

    del req

    answerdict['answer'] = answer
    return answerdict
Beispiel #24
0
def ws():
    user_socket = request.environ.get('wsgi.websocket')  # type:WebSocket
    while 1:
        msg =user_socket.receive()
        question = json.loads(msg)
        q = question['data']['mine']['content']
        msg =q
        input_message = str(msg).encode('utf-8')
        if len(input_message) > 60:
            answer =  mybot.respond("句子长度过长")
            # continue
        elif input_message.strip() == '':
            answer = mybot.respond("无话可说")
            # continue

        # print input_message
        message = T.wordSegment(input_message)
        # 去标点
        # print 'word Seg:'+ message
        # print '词性:'
        words = T.postag(input_message)

        if message == 'q':
            exit()
        else:
            response = mybot.respond(message)  # 在AIML数据集里寻找答案

            print "======="
            if response[0] == '#':
                print response + 'mark'

            else:
               answer =  response

            print "======="

            if response == "":
                ans = mybot.respond('找不到答案')
                answer = ans
            # 百科搜索
            elif response[0] == '#' or len(response) < 1:
                # 匹配百科
                if response.__contains__("searchbaike"):
                    print "searchbaike"
                    print response
                    res = response.split(':')
                    # 实体
                    entity = str(res[1]).replace(" ", "")
                    # 属性
                    attr = str(res[2]).replace(" ", "")
                    print entity + '<---->' + attr

                    ans = baike.query(entity, attr)

                    # 如果命中答案
                    print type(ans)
                    if type(ans) == list:
                        answer = '回答:' + QAT.ptranswer(ans, False)
                        # continue
                    elif ans.decode('utf-8').__contains__(u'::找不到'):
                        # 百度摘要+Bing摘要
                        print "通用搜索"
                        answer = search_summary.kwquery(input_message)

                # 匹配不到模版,通用查询
                elif response.__contains__("NoMatchingTemplate"):
                    print "NoMatchingTemplate"
                    ans = search_summary.kwquery(input_message)
                    print type(ans)

                if len(ans) == 0:
                    ans = mybot.respond('找不到答案')
                    answer = '回答:' + ans
                elif len(ans) > 1:
                    print "不确定候选答案"
                    answer = ans[0]
                    print 'Eric: '
                    for a in ans:
                        print a.encode("utf-8")
                else:
                    answer = '回答:' + ans[0].encode("utf-8")



            # 匹配模版
            else:
                answer = '回答:' + response

        s = '展开全部'
        print type(answer).__name__

        if (type(answer).__name__ == 'list' and len(answer)>0) or '唔... 怎么回答...'in answer or '天气' in msg:
            answer = geta(msg)
            answer = answer
            print 'wocao'
        else:
            #print answer
            if s in str(answer):
                print answer
                answer = str(answer).replace('\n', '').replace('展开全部', "").split('已赞过')[0]
                print  'OS' +answer
        print user_socket,msg

        res =answer
        a = {
            "username": "******",
            "avatar": "https://robot.rszhang.top/images/icon/nv/0.jpg",
            "id": "-2",  # //消息的来源ID(如果是私聊,则是用户id,如果是群聊,则是群组id)
            "type": "friend",  # //聊天窗口来源类型,从发送消息传递的to里面获取
            "content": res,  # //消息内容
            "cid": 0,  # //消息id,可不传。除非你要对消息进行一些操作(如撤回)
            "mine": True,  # //是否我发送的消息,如果为true,则会显示在右方
            "fromid": "100000",  # /消息的发送者id(比如群组中的某个消息发送者),可用于自动解决浏览器多窗口时的一些问题
            "timestamp": 1467475443306,  # //服务端时间戳毫秒数。注意:如果你返回的是标准的 unix 时间戳,记得要 *1000
        }

        user_socket.send(json.dumps(a))
Beispiel #25
0
    def handle(self):
        conn = self.request
        conn.sendall('欢迎访问智能百科问答系统')
        Flag =True
        data = conn.recv(4096)
        while Flag:
            input_message = data

            print "input_message====="
            print input_message
            print "=========="

            reply = ''

            if len(input_message) > 60:
                print mybot.respond("句子长度过长")
                reply = mybot.respond("句子长度过长")
                conn.sendall(reply)
                Flag =False
                continue
            elif input_message.strip() == '无':
                print mybot.respond("无")
                reply = mybot.respond("无")
                conn.sendall(reply)
                Flag = False
                continue

            print input_message
            message = T.wordSegment(input_message)
            # 去标点
            print 'word Seg:' + message
            print '词性:'
            words = T.postag(input_message)

            if message == 'q':
                exit()
            else:
                response = mybot.respond(message)

                print "======="
                print response
                print "======="

                if response == "":
                    ans = mybot.respond('找不到答案')
                    print 'Eric:' + ans
                    reply = mybot.respond('找不到答案')
                    conn.sendall(reply)
                    Flag = False

                # 百科搜索
                elif response[0] == '#':
                    # 匹配百科
                    if response.__contains__("searchbaike"):
                        print "searchbaike"
                        print response
                        res = response.split(':')
                        # 实体
                        entity = str(res[1]).replace(" ", "")
                        # 属性
                        attr = str(res[2]).replace(" ", "")
                        print entity + '<---->' + attr

                        ans = baike.query(entity, attr)
                        # 如果命中答案
                        if type(ans) == list:
                            print 'Eric:' + QAT.ptranswer(ans, False)
                            reply = QAT.ptranswer(ans, False)
                            conn.sendall(reply)
                            Flag = False
                            continue
                        elif ans.decode('utf-8').__contains__(u'::找不到'):
                            # 百度摘要+Bing摘要
                            print "通用搜索"
                            ans = search_summary.kwquery(input_message)

                    # 匹配不到模版,通用查询
                    elif response.__contains__("NoMatchingTemplate"):
                        print "NoMatchingTemplate"
                        ans = search_summary.kwquery(input_message)

                    if len(ans) == 0:
                        ans = mybot.respond('找不到答案')
                        print 'Eric:' + ans
                        reply = ans
                        conn.sendall(reply)
                        Flag = False

                    elif len(ans) > 1:
                        print "不确定候选答案"
                        print 'Eric: '
                        for a in ans:
                            print a.encode("utf8")
                            reply += a.encode("utf8") + '\n'
                        conn.sendall(reply)
                        Flag = False
                    else:
                        print 'Eric:' + ans[0].encode("utf8")
                        reply = ans[0].encode("utf8")
                        conn.sendall(reply)
                        Flag = False

                # 匹配模版
                else:
                    print 'Eric:' + response
                    reply = response
                    conn.sendall(reply)
                    Flag = False
Beispiel #26
0
def qa():

    #初始化jb分词器
    T.jieba_initialize()

    #切换到语料库所在工作目录
    mybot_path = './'
    # os.chdir(mybot_path)

    mybot = aiml.Kernel()
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/std-startup.xml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/Common conversation.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/bye.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/tools.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/bad.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/funny.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/OrdinaryQuestion.aiml")
    # mybot.respond('Load Doc Snake')
    #载入百科属性列表

    print '''
.----------------.  .-----------------. .----------------.  .----------------.  .----------------.
| .--------------. || .--------------. || .--------------. || .--------------. || .--------------. |
| |    _______   | || | ____  _____  | || |      __      | || |  ___  ____   | || |  _________   | |
| |   /  ___  |  | || ||_   \|_   _| | || |     /  \     | || | |_  ||_  _|  | || | |_   ___  |  | |
| |  |  (__ \_|  | || |  |   \ | |   | || |    / /\ \    | || |   | |_/ /    | || |   | |_  \_|  | |
| |   '.___`-.   | || |  | |\ \| |   | || |   / /__\ \   | || |   |  __'.    | || |   |  _|  _   | |
| |  |`\____) |  | || | _| |_\   |_  | || | _/ /    \ \_ | || |  _| |  \ \_  | || |  _| |___/ |  | |
| |  |_______.'  | || ||_____|\____| | || ||____|  |____|| || | |____||____| | || | |_________|  | |
| |              | || |              | || |              | || |              | || |              | |
| '--------------' || '--------------' || '--------------' || '--------------' || '--------------' |
 '----------------'  '----------------'  '----------------'  '----------------'  '----------------'
 Eric:你好,我是Eric。╭(╯^╰)╮
    '''

    while True:
        input_message = raw_input("Enter your message >> ")

        if len(input_message) > 60:
            print mybot.respond("句子长度过长")
            continue
        elif input_message.strip() == '':
            print mybot.respond("无")
            continue

        print input_message
        message = T.wordSegment(input_message)
        # 去标点
        print 'word Seg:' + message
        print '词性:'
        words = T.postag(input_message)

        if message == 'q':
            exit()
        else:
            response = mybot.respond(message)

            print "======="
            print response
            print "======="

            if response == "":
                ans = mybot.respond('找不到答案')
                print 'Eric:' + ans
            # 百科搜索
            elif response[0] == '#':
                # 匹配百科
                if response.__contains__("searchbaike"):
                    print "searchbaike"
                    print response
                    res = response.split(':')
                    #实体
                    entity = str(res[1]).replace(" ", "")
                    #属性
                    attr = str(res[2]).replace(" ", "")
                    print entity + '<---->' + attr

                    ans = baike.query(entity, attr)
                    # 如果命中答案
                    if type(ans) == list:
                        print 'Eric:' + QAT.ptranswer(ans, False)
                        continue
                    elif ans.decode('utf-8').__contains__(u'::找不到'):
                        #百度摘要+Bing摘要
                        print "通用搜索"
                        ans = search_summary.kwquery(input_message)

                # 匹配不到模版,通用查询
                elif response.__contains__("NoMatchingTemplate"):
                    print "NoMatchingTemplate"
                    ans = search_summary.kwquery(input_message)

                if len(ans) == 0:
                    ans = mybot.respond('找不到答案')
                    print 'Eric:' + ans
                elif len(ans) > 1:
                    print "不确定候选答案"
                    print 'Eric: '
                    for a in ans:
                        print a.encode("utf8")
                else:
                    print 'Eric:' + ans[0].encode("utf8")

            # 匹配模版
            else:
                print 'Eric:' + response
Beispiel #27
0
def kwquery(query):
    """Look up an answer to ``query`` on Baidu and in the knowledge graph.

    Sources are tried in this order while walking the first Baidu results:
    the exact-answer widget of the first result, one knowledge-graph lookup
    (``kgquery_entity``) on the first two noun keywords, and the summary of
    a Baidu Baike entry among the first two results.  When none of them
    answers, the text of the results is mined for person names.

    Args:
        query: The question text.

    Returns:
        A list of answers: the direct answer(s) when one was found,
        otherwise up to three person names (POS flag "nr") that occur most
        often in the result sentences containing a keyword, excluding the
        keywords themselves.  The list may be empty.
    """
    # Keywords: the words whose POS flag contains "n" (nouns).
    keywords = [k.word for k in T.postag(query) if "n" in k.flag]

    soup_baidu = To.get_html_baidu('https://www.baidu.com/s?wd=' +
                                   quote(query))

    text = ''
    kg_tried = False
    for i in range(1, 10):
        if soup_baidu is None:
            break
        results = soup_baidu.find(id=i)
        if results is None:
            break

        # Exact-answer widget on the first result.
        if i == 1 and results.attrs.get('mu'):
            exact = results.find(class_='op_exactqa_s_answer')
            if exact is not None:
                return [exact.get_text().strip()]

        # Knowledge-graph lookup.  Its arguments never change, so it is
        # asked once instead of once per result.
        if not kg_tried and len(keywords) > 1:
            kg_tried = True
            kg_answers = list(kgquery_entity(keywords[0], keywords[1]))
            if kg_answers:
                return kg_answers

        # Baidu Baike entry among the first two results.
        heading = results.find("h3")
        link = heading.find("a") if heading is not None else None
        if link is not None and i <= 2 and u"百度百科" in link.get_text():
            url = link.get('href')
            if url is None:
                continue
            baike_soup = To.get_html_baike(url)
            if baike_soup is None:
                continue
            summary = baike_soup.find(class_='lemma-summary')
            if summary is None:
                continue
            return [summary.get_text().replace("\n", "").strip()]

        text += results.get_text().strip()

    # No direct answer: keep the sentences that mention a keyword ...
    key_sentences = {}
    for sentence in _split_sentences(text):
        if any(k in sentence for k in keywords):
            key_sentences[sentence] = 1

    # ... and count the person names in them.
    name_counts = {}
    for sentence in key_sentences:
        for w in T.postag(sentence):
            if w.flag == "nr":
                name_counts[w.word] = name_counts.get(w.word, 0) + 1

    ranked = sorted(name_counts.items(),
                    key=operator.itemgetter(1),
                    reverse=True)
    # The three most frequent names that are not keywords themselves.
    return [word for word, _ in ranked if word not in keywords][:3]


def _split_sentences(text):
    """Split text on sentence delimiters, dropping empty pieces."""
    delimiters = (u"。", u"?", u".", u"_", u"-", u":", u"!")
    sentences = []
    current = ''
    for ch in text:
        if ch in delimiters:
            if current != '':
                sentences.append(current)
                current = ''
        else:
            current += ch
    # Keep the trailing sentence when the text does not end in a delimiter.
    if current != '':
        sentences.append(current)
    return sentences