Example 1
def qa(question):

    # Initialize the jieba tokenizer
    T.jieba_initialize()

    # Switch to the working directory that holds the corpus
    mybot_path = './'
    # os.chdir(mybot_path)

    mybot = aiml.Kernel()
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/std-startup.xml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/bye.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/tools.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/bad.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/funny.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/OrdinaryQuestion.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/Common conversation.aiml")
    # mybot.respond('Load Doc Snake')
    # Load the encyclopedia attribute list

    input_message = question

    if len(input_message) > 60:
        print(mybot.respond("句子长度过长"))
    elif input_message.strip() == '':
        print(mybot.respond("无"))

    print(input_message)
    message = T.wordSegment(input_message)
    # Strip punctuation
    print('word Seg:' + message)
    print('词性:')
    words = T.postag(input_message)
    if message == 'q':
        exit()
    else:
        response = mybot.respond(message)

        print("=======")
        print(response)
        print("=======")

        if response == "":
            ans = mybot.respond('找不到答案')
            # print(robot_id + ":" + ans)
            print("{0}:{1}".format(robot_id, ans))
        # Encyclopedia search
        elif response[0] == '#':
            # Matched an encyclopedia template
            if response.__contains__("searchbaike"):
                print("search from baike")
                print(response)
                res = response.split(':')
                # Entity
                entity = str(res[1]).replace(" ", "")
                # Attribute
                attr = str(res[2]).replace(" ", "")
                print(entity + '<---->' + attr)

                ans = baike.query(entity, attr)
                # If an answer was hit
                if type(ans) == list:
                    print("{0}:{1}".format(robot_id, QAT.ptranswer(ans,
                                                                   False)))
                elif ans.decode('utf-8').__contains__(u'::找不到'):
                    # Baidu snippets + Bing snippets
                    print("通用搜索")
                    log.info("通用搜索")
                    ans = search_summary.kwquery(input_message)

            # No matching template: fall back to generic search
            elif response.__contains__("NoMatchingTemplate"):
                print("NoMatchingTemplate")
                ans = search_summary.kwquery(input_message)

            if len(ans) == 0:
                ans = mybot.respond('找不到答案')
                logs.info("{0}:{1}".format(robot_id, ans))
            elif len(ans) > 1:
                logs.info(sys.exc_info())
                logs.info("不确定候选答案")
                logs.info("[{0}][func:{1}][line:{2}]:不确定候选答案".format(
                    sys._getframe().f_code.co_filename,
                    sys._getframe().f_code.co_name,
                    sys._getframe().f_lineno))
                print(robot_id + ': ')
                for a in ans:
                    print(a)
                    # print(a.encode("utf8"))
            else:
                print('{0}:{1}'.format(robot_id, ans[0]))

        # Template matched
        else:
            print("{}: {}".format(robot_id, response))
Example 2
def kwquery(query):
    # Tokenize, drop stop words, extract keywords
    log = '提取关键词:'
    keywords = []
    words = T.postag(query)
    for k in words:
        # Keep nouns only
        if k.flag.__contains__("n"):
            keywords.append(k.word)
            log += k.word
    log += '#' * 50 + '\n'
    answer = []
    text = []
    # Set to 1 once an answer is found
    flag = 0

    # Fetch the snippets of the top 10 Baidu results
    log += '百度前10条的摘要'
    url = 'https://www.baidu.com/s?wd=' + quote(query)
    log += url + '#' * 50 + '\n'
    soup_baidu = To.get_html_baidu(url)

    for i in range(1, 11):
        if soup_baidu == None:
            break
        results = soup_baidu.find(id=i)

        if results == None:
            log += '百度摘要找不到答案' + '#' * 50 + '\n'
            break
        log += '第' + str(i) + '条摘要:\n'
        log += clean_str(results.get_text()) + '#' * 50 + '\n'
        # Check the 'mu' attribute: if the result comes from the Baidu knowledge graph, take its answer directly
        #if 'mu' in results.attrs and i == 1:
        if 'mu' in results.attrs and results.find(
                class_='op_exactqa_s_answer') != None:
            r = results.find(class_='op_exactqa_s_answer')
            log += '第一条百度摘要为百度搜索根据知识图谱直接匹配出的内容,优先查找\n'
            log += '百度知识图谱找到答案' + '#' * 50 + '\n'
            answer.append(r.get_text().strip())
            flag = 1
            break

        # Classical poetry card
        if 'mu' in results.attrs and i == 1 and results.find(
                class_="op_exactqa_detail_s_answer") != None:
            r = results.find(class_="op_exactqa_detail_s_answer")
            log += '百度诗词找到答案' + '#' * 50 + '\n'
            answer.append(r.get_text().strip())
            flag = 1
            break

        # Perpetual calendar & dates
        if 'mu' in results.attrs and i == 1 and results.attrs[
                'mu'].__contains__('http://open.baidu.com/calendar'):
            r = results.find(class_="op-calendar-content")
            if r != None:
                log += '百度万年历找到答案' + '#' * 50 + '\n'
                answer.append(r.get_text().strip().replace("\n", "").replace(
                    " ", ""))
                flag = 1
                break

        if 'tpl' in results.attrs and i == 1 and results.attrs[
                'tpl'].__contains__('calendar_new'):
            r = results.attrs['fk'].replace("6018_", "")
            if r != None:
                log += '百度万年历新版找到答案' + '#' * 50 + '\n'
                answer.append(r)
                flag = 1
                break

        # Calculator card
        if 'mu' in results.attrs and i == 1 and results.attrs[
                'mu'].__contains__(
                    'http://open.baidu.com/static/calculator/calculator.html'):
            r = results.find(class_="op_new_val_screen_result")
            if r != None:
                log += '计算器找到答案' + '#' * 50 + '\n'
                answer.append(r.get_text().strip())
                flag = 1
                break

        # Baidu Zhidao best answer
        #if 'mu' in results.attrs and i == 1:
        if 'mu' in results.attrs and results.find(
                class_='op_best_answer_question_link') != None:
            r = results.find(class_='op_best_answer_question_link')
            url = r['href']
            zhidao_soup = To.get_html_zhidao(url)
            r = zhidao_soup.find(class_='bd answer').find('pre')
            if r == None:
                r = zhidao_soup.find(class_='bd answer').find(
                    class_='line content')
            log += '百度知道best answer找到答案' + '#' * 50 + '\n'
            answer.append(r.get_text())
            flag = 1
            break

        if results.find("h3") != None:
            # Baidu Zhidao result
            if results.find("h3").find("a").get_text().__contains__(u"百度知道"):
                url = results.find("h3").find("a")['href']
                if url == None:
                    log += '百度知道找不到答案' + '#' * 50 + '\n'
                    continue
                else:
                    log += '百度知道找到答案' + '#' * 50 + '\n'
                    zhidao_soup = To.get_html_zhidao(url)
                    r = zhidao_soup.find(class_='bd answer')
                    if r == None:
                        continue
                    else:
                        r = r.find('pre')
                        if r == None:
                            r = zhidao_soup.find(class_='bd answer').find(
                                class_='line content')
                    answer.append(r.get_text().strip())
                    flag = 1
                    break

            # Baidu Baike result
            if results.find("h3").find("a").get_text().__contains__(u"百度百科"):
                url = results.find("h3").find("a")['href']
                if url == None:
                    log += '百度百科找不到答案' + '#' * 50 + '\n'
                    continue
                else:
                    log += '百度百科找到答案' + '#' * 50 + '\n'
                    baike_soup = To.get_html_baike(url)

                    r = baike_soup.find(class_='lemma-summary')
                    if r == None:
                        continue
                    else:
                        r = r.get_text().replace("\n", "").strip()
                    answer.append(r)
                    flag = 1
                    break
        text.append(
            clean_str(results.get_text()) + "(" + results.find("a")['href'] +
            ")")

    if flag == 1:
        return (answer, log)
    log += '百度前十条摘要不包含百度知道、百度百科内容,或相关内容中找不到答案' + '#' * 50 + '\n'

    # Fetch the Bing snippets
    log += '通过Bing查找\n'
    url = 'https://www.bing.com/search?q=' + quote(query)
    log += url + '\n'
    log += '#' * 50
    soup_bing = To.get_html_bing(url)
    # Check whether the query hits Bing's knowledge panel
    # bingbaike = soup_bing.find(class_="b_xlText b_emphText")
    bingbaike = soup_bing.find(class_="bm_box")

    if bingbaike != None:
        if bingbaike.find_all(class_="b_vList")[1] != None:
            if bingbaike.find_all(class_="b_vList")[1].find("li") != None:
                log += 'Bing百科找到答案' + '#' * 50 + '\n'
                flag = 1
                answer.append(bingbaike.get_text())
                return (answer, log)
    else:
        log += 'Bing百科找不到答案' + '#' * 50 + '\n'
        results = soup_bing.find(id="b_results")
        bing_list = results.find_all('li')
        for bl in bing_list:
            temp = bl.get_text()
            if temp.__contains__(u" - 必应网典"):
                url = bl.find("h2").find("a")['href']
                if url == None:
                    log += 'Bing网典找不到答案' + '#' * 50 + '\n'
                    continue
                else:
                    log += 'Bing网典找到答案' + '#' * 50 + '\n'
                    bingwd_soup = To.get_html_bingwd(url)

                    r = bingwd_soup.find(class_='bk_card_desc').find("p")
                    if r == None:
                        continue
                    else:
                        r = r.get_text().replace("\n", "").strip()
                    answer.append(r)
                    flag = 1
                    break

        if flag == 1:
            return (answer, log)

    log += '没有找到答案,返回百度前十条摘要内容\n'
    #if flag == 0:
    # Split into sentences
    #log += ''
    #cutlist = ["。", "?", ".", "_", "-", ":", "!", "?"]
    #temp = ''
    #sentences = []
    #for i in range(0,len(text)):
    #    if text[i] in cutlist:
    #         if temp == '':
    #             continue
    #         else:
    #             sentences.append(temp)
    #         temp = ''
    #     else:
    #         temp += text[i]
    #
    # # Keep sentences that contain a keyword, drop the rest
    # key_sentences = {}
    # for s in sentences:
    #     for k in keywords:
    #         if k in s:
    #             key_sentences[s]=1

    # Apply question-specific rules
    # Recognize person names
    #target_list = {}
    #for ks in key_sentences:
    #    # print ks
    #    words = T.postag(ks)
    #    for w in words:
    #        if w.flag == ("nr"):
    #            if w.word in target_list:
    #                target_list[w.word] += 1
    #            else:
    #                target_list[w.word] = 1

    ## Find the highest term frequency
    #sorted_lists = sorted(target_list.items(), key=lambda x: x[1], reverse=True)
    # Drop keywords that already appear in the question
    #sorted_lists2 = []
    # Candidate queue
    #for i, st in enumerate(sorted_lists):
    #    if st[0] in keywords:
    #        continue
    #    else:
    #        sorted_lists2.append(st)
    ##log += ' '.join(sorted_lists2)
    ##print ("返回前n个词频")
    #answer = []
    #for i,st in enumerate(sorted_lists2):
    #    if i< 3:
    #        answer.append(st[0])
    #for ks in key_sentences:
    #    answer += ks + '\n'
    answer = text

    return (answer, log)
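
T.postag is not shown in this listing; assuming it wraps jieba's part-of-speech tokenizer (jieba.posseg), the noun-keyword filter at the top of kwquery() boils down to something like this self-contained sketch:

import jieba.posseg as pseg  # pip install jieba

def extract_noun_keywords(query):
    """Keep the noun-like tokens of a query, mirroring the filter in kwquery()."""
    keywords = []
    for pair in pseg.cut(query):    # each pair has .word and .flag (POS tag)
        if "n" in pair.flag:        # noun tags: n, nr, ns, nz, ...
            keywords.append(pair.word)
    return keywords

print(extract_noun_keywords("珠穆朗玛峰的海拔是多少"))
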
Example 3
def main():
    # Initialize the jieba tokenizer
    T.jieba_initialize()

    # Switch to the working directory that holds the corpus
    mybot_path = './'
    os.chdir(mybot_path)

    # Load the AIML rule files
    mybot = aiml.Kernel()
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/std-startup.xml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/abc.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/bot_profile.aiml")
    # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/general.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/infor.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] + "/resources/main.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/new07281.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/salutations.aiml")
    mybot.learn(
        os.path.split(os.path.realpath(__file__))[0] +
        "/resources/virus0727.aiml")
    # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/zextra_weibao.aiml")
    # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/bye.aiml")
    # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/tools.aiml")
    # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/bad.aiml")
    # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/funny.aiml")
    # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/OrdinaryQuestion.aiml")
    # mybot.learn(os.path.split(os.path.realpath(__file__))[0] + "/resources/Common conversation.aiml")

    # mybot.respond('Load Doc Snake')
    # Load the encyclopedia attribute list

    #     print '''
    # .----------------.  .-----------------. .----------------.  .----------------.  .----------------.
    # | .--------------. || .--------------. || .--------------. || .--------------. || .--------------. |
    # | |    _______   | || | ____  _____  | || |      __      | || |  ___  ____   | || |  _________   | |
    # | |   /  ___  |  | || ||_   \|_   _| | || |     /  \     | || | |_  ||_  _|  | || | |_   ___  |  | |
    # | |  |  (__ \_|  | || |  |   \ | |   | || |    / /\ \    | || |   | |_/ /    | || |   | |_  \_|  | |
    # | |   '.___`-.   | || |  | |\ \| |   | || |   / /__\ \   | || |   |  __'.    | || |   |  _|  _   | |
    # | |  |`\____) |  | || | _| |_\   |_  | || | _/ /    \ \_ | || |  _| |  \ \_  | || |  _| |___/ |  | |
    # | |  |_______.'  | || ||_____|\____| | || ||____|  |____|| || | |____||____| | || | |_________|  | |
    # | |              | || |              | || |              | || |              | || |              | |
    # | '--------------' || '--------------' || '--------------' || '--------------' || '--------------' |
    #  '----------------'  '----------------'  '----------------'  '----------------'  '----------------'
    #  Eric:你好,我是Eric。╭(╯^╰)╮
    #     '''

    print('泰康小康:你好,我是小康。╭(╯^╰)╮')

    # Question-handling loop
    while True:
        # Read the question
        input_message = raw_input("您想问什么 >> ")

        # Basic sanity checks on the input:
        # ignore questions that are too long (over 60 characters)
        # ignore empty questions
        if len(input_message) > 60:
            print(mybot.respond("句子长度过长"))
            continue
        elif input_message.strip() == '':
            print(mybot.respond("无"))
            continue

        # Preprocess the question with the Tools module
        print(input_message)
        message = T.wordSegment(input_message)
        # Strip punctuation
        print('word Seg:' + message)
        # print('词性:')
        words = T.postag(input_message)

        # Quit
        if message == 'q':
            exit()
        # Priority order for producing a reply
        else:
            # First try AIML template matching
            response = mybot.respond(message)

            print("=======")
            print(response)
            print(len(response.decode('utf8')))
            print("=======")
Example 4
    def __inner_job():
        start = time.time()
        text_binary = analyze_current_screen_text(
            directory=data_directory, compress_level=image_compress_level[0])

        keywords = get_text_from_image(image_data=text_binary, )
        if not keywords:
            print("text not recognize")
            return

        true_flag, question, answers = parse_question_and_answer(keywords)
        #questions=question.decode('unicode-escape')
        #new_ans=[]
        #for ans in answers:
        # new_ans.append(ans.decode('unicode-escape'))

        print('-' * 72)
        print(question)
        print('-' * 72)
        print("\n".join(answers))

        # notice browser
        if enable_chrome:
            with question_obj.get_lock():
                question_obj.value = question
                keyboard.press("space")

        search_question = pre_process_question(question)
        summary = baidu_count(search_question, answers, timeout=timeout)
        summary_li = sorted(summary.items(),
                            key=operator.itemgetter(1),
                            reverse=True)
        data = [("选项", "同比")]
        for a, w in summary_li:
            data.append((a, w))
        table = AsciiTable(data)
        print(table.table)

        print("*" * 72)
        if true_flag:
            print("肯定回答(**): ", summary_li[0][0])
            print("否定回答(  ): ", summary_li[-1][0])
        else:
            print("肯定回答(  ): ", summary_li[0][0])
            print("否定回答(**): ", summary_li[-1][0])
        print("*" * 72)

        ##############################################################
        input_message = question

        if len(input_message) > 60:
            print(mybot.respond("句子长度过长"))
        elif input_message.strip() == '':
            print(mybot.respond("无"))

        #print(input_message)
        message = T.wordSegment(input_message)
        # Strip punctuation
        #print('word Seg:' + message)
        #print('词性:')
        words = T.postag(input_message)

        if message == 'q':
            exit()
        else:
            response = mybot.respond(message)

            #print("=======")
            #print(response)
            #print("=======")

            if response == "":
                ans = mybot.respond('找不到答案')
                print('Eric:' + ans)
            # Encyclopedia search
            elif response[0] == '#':
                # Matched an encyclopedia template
                if response.__contains__("searchbaike"):
                    #print("searchbaike")
                    #print(response)
                    res = response.split(':')
                    # Entity
                    entity = str(res[1]).replace(" ", "")
                    # Attribute
                    attr = str(res[2]).replace(" ", "")
                    #print(entity + '<---->' + attr)

                    ans = baike.query(entity, attr)
                    # If an answer was hit
                    if type(ans) == list:
                        print('Eric:' + QAT.ptranswer(ans, False))

                    elif ans.decode('utf-8').__contains__(u'::找不到'):
                        # Baidu snippets + Bing snippets
                        print("通用搜索")
                        ans = search_summary.kwquery(input_message)

                # No matching template: fall back to generic search
                elif response.__contains__("NoMatchingTemplate"):
                    #print("NoMatchingTemplate")
                    ans = search_summary.kwquery(input_message, answers)

                if len(ans) == 0:
                    print('Eric:' + '找不到答案')
                elif len(ans) > 1:
                    print("不确定候选答案")
                    print('Eric: ')
                    for a in ans:
                        print(a)
                else:
                    print('Eric:' + ans[0])

            # Template matched
            else:
                print('Eric:' + response)

        end = time.time()
        print("use {0} 秒".format(end - start))
        save_screen(directory=data_directory)
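
The option ranking printed by __inner_job is a plain dictionary of hit counts, sorted by value and rendered with terminaltables. A self-contained version of just that step (the counts below are made up for illustration):

import operator
from terminaltables import AsciiTable  # pip install terminaltables

summary = {"选项A": 120, "选项B": 48, "选项C": 9}  # hypothetical baidu_count() output
summary_li = sorted(summary.items(), key=operator.itemgetter(1), reverse=True)

data = [["选项", "同比"]] + [[option, str(count)] for option, count in summary_li]
print(AsciiTable(data).table)
print("most mentioned :", summary_li[0][0])
print("least mentioned:", summary_li[-1][0])
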
Example 5
    def respond(self, text):

        question = list(TextProcess.postag(text))  # tokenize and POS-tag the query string
        keywords = []
        # print(question)
        logger.global_logger.info("query: {}  cut:{}".format(text, question))
        for word, tag in question:  # drop stop words
            if word in self.stop_words:
                continue
            if 'n' not in tag or "un" == tag:  # and 'v' not in tag:
                # Keep nouns in the keywords so the object description stays close; semantic matching picks up the rest later
                continue
            keywords.append(word)
        if len(keywords) == 0:
            # If there is no noun at all, fall back to verbs
            for word, tag in question:  # drop stop words
                if word in self.stop_words:
                    continue
                if 'v' not in tag:  # and 'v' not in tag:
                    # Same rationale: keep the word so the object description stays close; semantic matching picks up the rest later
                    continue
                keywords.append(word)

        # Match keywords against stored questions
        # condition = [" QUESTION like \'%{}%\'".format(keyword) for keyword in keywords] # slow
        condition = [
            " instr(QUESTION, '{}') > 0 ".format(keyword)
            for keyword in keywords
        ]  # fast
        if len(condition) == 0:
            return []
        sql = "select QUESTION ,ANSWER from qa_pair where {}".format(
            "and".join(condition))
        logger.global_logger.info("going to execute this sql: {}".format(sql))
        result = self.cursor.execute(sql)  # rows of (QUESTION, ANSWER)
        res = []
        # Compute the similarity between the query and every candidate question, then rank them
        for row in result:
            q = row[0]
            a = row[1]
            state, sim = self._similarity(text, q)
            # logger.global_logger.info("text:{}  query:{} score:{}".format(text, q, sim))
            if state == 0:
                raise Exception("similarity Api Error.")
            elif sim > 0.9:
                res.append((q, a, sim))
        # Pick the top-scoring matches to return (ties allowed)
        finall = []
        if len(res) > 0:
            ans = sorted(res, key=lambda x: x[2], reverse=True)
            score = -1
            for a in ans:
                if a[2] > score:
                    score = a[2]
                    logger.global_logger.info(
                        "[MATCH RESULT]{} match:{} score:{}".format(
                            text, a[0], score))
                    finall.append((a[0], a[1]))
                elif a[2] == score:
                    finall.append((a[0], a[1]))
        # else:
        #     finall.append(("", ""))
        return finall
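
The instr(QUESTION, '...') > 0 predicate is simply SQLite's substring test, used as a cheap prefilter before the similarity scoring. A minimal, self-contained illustration against an in-memory database; the table layout copies the qa_pair schema implied above, the row is invented, and parameter binding replaces the string formatting of the original:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.execute("create table qa_pair (QUESTION text, ANSWER text)")
conn.execute("insert into qa_pair values ('北京的面积是多少', '约1.64万平方公里')")

keywords = ["北京", "面积"]
condition = " and ".join("instr(QUESTION, ?) > 0" for _ in keywords)
sql = "select QUESTION, ANSWER from qa_pair where " + condition
for q, a in conn.execute(sql, keywords):
    print(q, "->", a)
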
Example 6
def kwquery(query, answers):
    # Tokenize, drop stop words, extract keywords
    keywords = []
    words = T.postag(query)
    for k in words:
        # Keep nouns only
        if k.flag.__contains__("n"):
            # print k.flag
            # print k.word
            keywords.append(k.word)

    answer = []
    text = ''
    # Set to 1 once an answer is found
    flag = 0

    # Fetch the snippets of the top 10 Baidu results
    soup_baidu = To.get_html_baidu('https://www.baidu.com/s?wd=' +
                                   parse.quote(query))

    for i in range(1, 10):
        if soup_baidu == None:
            break
        results = soup_baidu.find(id=i)

        if results == None:
            print("百度摘要找不到答案")
            break
        # print '============='
        # print results.attrs
        # print type(results.attrs)
        # print results['class']
        # Check the 'mu' attribute: if the first result comes from the Baidu knowledge graph, take its answer directly
        if 'mu' in results.attrs and i == 1:
            # print results.attrs["mu"]
            r = results.find(class_='op_exactqa_s_answer')
            if r == None:
                print("百度知识图谱找不到答案")
            else:
                # print r.get_text()
                print("百度知识图谱找到答案")
                answer.append(r.get_text().strip())
                flag = 1
                break

        # Classical poetry card
        if 'mu' in results.attrs and i == 1:
            r = results.find(class_="op_exactqa_detail_s_answer")
            if r == None:
                print("百度诗词找不到答案")
            else:
                # print r.get_text()
                print("百度诗词找到答案")
                answer.append(r.get_text().strip())
                flag = 1
                break

        # Perpetual calendar & dates
        if 'mu' in results.attrs and i == 1 and results.attrs[
                'mu'].__contains__('http://open.baidu.com/calendar'):
            r = results.find(class_="op-calendar-content")
            if r == None:
                print("百度万年历找不到答案")
            else:
                # print r.get_text()
                print("百度万年历找到答案")
                answer.append(r.get_text().strip().replace("\n", "").replace(
                    " ", ""))
                flag = 1
                break

        if 'tpl' in results.attrs and i == 1 and results.attrs[
                'tpl'].__contains__('calendar_new'):
            r = results.attrs['fk'].replace("6018_", "")
            print(r)

            if r == None:
                print("百度万年历新版找不到答案")
                # continue
            else:
                # print r.get_text()
                print("百度万年历新版找到答案")
                answer.append(r)
                flag = 1
                break

        # Calculator card
        if 'mu' in results.attrs and i == 1 and results.attrs[
                'mu'].__contains__(
                    'http://open.baidu.com/static/calculator/calculator.html'):
            r = results.find(class_="op_new_val_screen_result")
            if r == None:
                print("计算器找不到答案")
                # continue
            else:
                # print r.get_text()
                print("计算器找到答案")
                answer.append(r.get_text().strip())
                flag = 1
                break

        # Baidu Zhidao best answer
        if 'mu' in results.attrs and i == 1:
            r = results.find(class_='op_best_answer_question_link')
            if r == None:
                print("百度知道图谱找不到答案")
            else:
                print("百度知道图谱找到答案")
                url = r['href']
                zhidao_soup = To.get_html_zhidao(url)
                r = zhidao_soup.find(class_='bd answer').find('pre')
                if r == None:
                    r = zhidao_soup.find(class_='bd answer').find(
                        class_='line content')

                answer.append(r.get_text())
                flag = 1
                break

        if results.find("h3") != None:
            # Baidu Zhidao result
            if results.find("h3").find("a").get_text().__contains__(
                    u"百度知道") and (i == 1 or i == 2):
                url = results.find("h3").find("a")['href']
                if url == None:
                    print("百度知道图谱找不到答案")
                    continue
                else:
                    print("百度知道图谱找到答案")
                    zhidao_soup = To.get_html_zhidao(url)

                    r = zhidao_soup.find(class_='bd answer')
                    if r == None:
                        continue
                    else:
                        r = r.find('pre')
                        if r == None:
                            r = zhidao_soup.find(class_='bd answer').find(
                                class_='line content')
                    answer.append(r.get_text().strip())
                    flag = 1
                    break

            # Baidu Baike result
            if results.find("h3").find("a").get_text().__contains__(
                    u"百度百科") and (i == 1 or i == 2):
                url = results.find("h3").find("a")['href']
                if url == None:
                    print("百度百科找不到答案")
                    continue
                else:
                    print("百度百科找到答案")
                    baike_soup = To.get_html_baike(url)

                    r = baike_soup.find(class_='lemma-summary')
                    if r == None:
                        continue
                    else:
                        r = r.get_text().replace("\n", "").strip()
                    answer.append(r)
                    flag = 1
                    break
        text += results.get_text()

    if flag == 1:
        return answer

    # Fetch the Bing snippets
    soup_bing = To.get_html_bing('https://www.bing.com/search?q=' +
                                 parse.quote(query))
    # Check whether the query hits Bing's knowledge panel
    # bingbaike = soup_bing.find(class_="b_xlText b_emphText")
    bingbaike = soup_bing.find(class_="bm_box")

    if bingbaike != None:
        if bingbaike.find_all(class_="b_vList")[1] != None:
            if bingbaike.find_all(class_="b_vList")[1].find("li") != None:
                print("Bing知识图谱找到答案")
                flag = 1
                answer.append(bingbaike.get_text())
                # print "====="
                # print answer
                # print "====="
                return answer
    else:
        print("Bing知识图谱找不到答案")
        results = soup_bing.find(id="b_results")
        bing_list = results.find_all('li')
        for bl in bing_list:
            temp = bl.get_text()
            if temp.__contains__(u" - 必应网典"):
                print("查找Bing网典")
                url = bl.find("h2").find("a")['href']
                if url == None:
                    print("Bing网典找不到答案")
                    continue
                else:
                    print("Bing网典找到答案")
                    bingwd_soup = To.get_html_bingwd(url)

                    r = bingwd_soup.find(class_='bk_card_desc').find("p")
                    if r == None:
                        continue
                    else:
                        r = r.get_text().replace("\n", "").strip()
                    answer.append(r)
                    flag = 1
                    break

        if flag == 1:
            return answer

        text += results.get_text()

    # print text

    # If neither search engine's knowledge graph produced an answer, analyse the snippets instead
    if flag == 0:
        # Split into sentences
        cutlist = [u"。", u"?", u".", u"_", u"-", u":", u"!", u"?"]
        temp = ''
        sentences = []
        for i in range(0, len(text)):
            if text[i] in cutlist:
                if temp == '':
                    continue
                else:
                    # print temp
                    sentences.append(temp)
                temp = ''
            else:
                temp += text[i]

        # Keep sentences that contain a keyword, drop the rest
        key_sentences = {}
        for s in sentences:
            for k in keywords:
                if k in s:
                    key_sentences[s] = 1

        # Apply question-specific rules

        target_list = {}
        for Hans in answers:
            for sentence in key_sentences:
                if Hans in sentence:
                    if Hans in target_list:
                        target_list[Hans] += 1
                    else:
                        target_list[Hans] = 1
        print(target_list)

        # Find the highest term frequency
        ##print(target_list.items())
        sorted_lists = sorted(target_list.items(),
                              key=lambda item: item[1],
                              reverse=True)
        # print len(target_list)
        # Drop keywords that already appear in the question
        sorted_lists2 = []
        # Candidate queue
        for i, st in enumerate(sorted_lists):
            # print st[0]
            if st[0] in keywords and st[0] not in answers:
                continue
            else:
                sorted_lists2.append(st)

        print("返回前n个词频")
        answer = []
        for i, st in enumerate(sorted_lists2):
            # print st[0]
            # print st[1]
            if i < 3:
                # print st[0]
                # print st[1]
                answer.append(st[0])
                # print answer

    return answer
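
The closing block of this variant is a simple voting step: each candidate answer is scored by how many keyword-bearing sentences mention it, and the most frequent candidates win. The same idea in compact, self-contained form (sentences and candidates below are invented):

def vote_answers(key_sentences, candidates):
    """Rank candidate answers by how many relevant sentences contain them."""
    counts = {}
    for cand in candidates:
        counts[cand] = sum(1 for s in key_sentences if cand in s)
    return sorted(counts.items(), key=lambda item: item[1], reverse=True)

sentences = ["李白是唐代诗人", "李白写了静夜思", "杜甫也是唐代诗人"]
print(vote_answers(sentences, ["李白", "杜甫", "白居易"]))
# -> [('李白', 2), ('杜甫', 1), ('白居易', 0)]
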
Example 7
    def respond(self, query):
        """
        Use text summarization and related techniques to answer broader, more open-ended questions.
        :param query:
        :return:
        """
        # Query Baidu
        url = 'https://www.baidu.com/s?wd=' + quote(query)
        # print(url)
        t1 = time.time()
        soup_baidu = Html_Tools.get_html_baidu(url)
        # print("Query Baidu:{}".format(time.time() - t1))
        contents = []
        key_word = list(TextProcess.postag(query))  # (word tag)
        key_word = [
            word for word, tag in key_word
            if (word not in self.stop_word and "n" in tag)
        ]
        t1 = time.time()
        for i in range(1, 10):
            # print("content -- {}".format(i))
            if soup_baidu == None:
                break

            results = soup_baidu.find(id=i)

            if results == None:
                # ("Id{}找不到".format(i))
                continue
            infos = results.find_all('h3')

            for info in infos:

                tag = info.find("a")
                if tag is None:
                    continue
                else:
                    href = tag['href']
                    if "www.baidu.com/link" not in href:
                        continue
                    try:
                        sub_soup = Html_Tools.get_html(href)
                        info_list = self._extract(sub_soup)
                        # Sentence-level filtering
                        for info in info_list:
                            # Skip interrogative sentences
                            if any(["?" in info, "?" in info]):
                                continue
                            else:
                                contents.append(info)
                    except:
                        pass
        # print("For :{}".format((time.time() - t1) / 10))
        if len(contents) > 0:
            t1 = time.time()
            key_sentence = self._get_key_sentence(list(set(contents)),
                                                  key_word)
            # print("Key Sentence:{}".format(time.time() - t1))
        else:
            key_sentence = []
        # print()
        return key_sentence
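
This respond() walks the Baidu result blocks id=1..9, takes the <a> under each h3, and only follows redirect links under www.baidu.com/link before extracting sentences from the target pages. A stripped-down sketch of that link harvesting, assuming the page has already been parsed into a BeautifulSoup object the way Html_Tools.get_html_baidu presumably returns one:

from bs4 import BeautifulSoup  # pip install beautifulsoup4

def harvest_result_links(soup, max_results=9):
    """Collect outbound redirect links from the first few Baidu result blocks."""
    links = []
    for i in range(1, max_results + 1):
        block = soup.find(id=str(i))  # result blocks carry numeric ids
        if block is None:
            continue
        for h3 in block.find_all("h3"):
            a = h3.find("a")
            if a is not None and "www.baidu.com/link" in a.get("href", ""):
                links.append(a["href"])
    return links
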
Example 8
def kwquery(query):
    # Tokenize, drop stop words, extract keywords
    keywords = []
    words = T.postag(query)
    for k in words:
        # Keep nouns only
        if k.flag.__contains__("n"):
            # print k.flag
            # print k.word
            keywords.append(k.word)

    answer = []
    text = ''
    # Set to 1 once an answer is found
    flag = 0

    # Fetch the snippets of the top 10 Baidu results
    url = 'https://www.baidu.com/s?wd=' + quote(query)
    # print(url)
    soup_baidu = To.get_html_baidu(url)
    for i in range(1, 10):
        # print("content -- {}".format(i))
        if soup_baidu == None:
            break

        results = soup_baidu.find(id=i)

        if results == None:
            print("Id{}找不到".format(i))
            continue

        # Check the 'mu' attribute: if the result comes from the Baidu knowledge graph, take its answer directly
        text += results.get_text()
        if 'mu' in results.attrs:  # usually among the first three results
            # print results.attrs["mu"]
            r = results.find(class_='op_exactqa_s_answer')
            if r == None:
                # print("百度知识图谱找不到答案")
                pass
            else:
                # print r.get_text()
                # print("百度知识图谱找到答案")
                answer.append(r.get_text().strip().replace("  ", ""))
                flag = 1
                break

        # Movie card
        if 'mu' in results.attrs and results.attrs['mu'].__contains__(
                "http://nourl.baidu.com/"):
            if results.find(class_="c-gap-top-small") is not None:
                items = results.find_all(class_="c-gap-top-small")
                for item in items:
                    if item.find("a") is not None:
                        answer.append(item.find("a").get_text())
                flag = 1
                break
            else:
                pass

        # Weather check
        weather_list = results.find_all(
            class_="op_weather4_twoicon_today OP_LOG_LINK")  # today's weather
        if len(weather_list) > 0:
            # print("百度天气找到了")
            weather_info = weather_list[0]
            date = weather_info.find(
                class_="op_weather4_twoicon_date").get_text().strip()
            C = weather_info.find(
                class_="op_weather4_twoicon_temp").get_text().strip()
            rain_or_not = weather_info.find(
                class_="op_weather4_twoicon_weath").get_text().strip()
            wind = weather_info.find(
                class_="op_weather4_twoicon_wind").get_text().strip()
            ans = "{}\t{}\t{}\t{}".format(date, C, rain_or_not, wind)
            answer.append(ans)
            # Get the forecast for the coming days
            weather_list = results.find_all(
                class_="op_weather4_twoicon_day OP_LOG_LINK")  # upcoming days
            for weather_info in weather_list:
                # print(weather_info)
                date = weather_info.find(
                    class_="op_weather4_twoicon_date_day").get_text().strip()
                C = weather_info.find(
                    class_="op_weather4_twoicon_temp").get_text().strip()
                rain_or_not = weather_info.find(
                    class_="op_weather4_twoicon_weath").get_text().strip()
                wind = weather_info.find(
                    class_="op_weather4_twoicon_wind").get_text().strip()
                ans = "{}\t{}\t{}\t{}".format(date, C, rain_or_not, wind)
                answer.append(ans)
            flag = 1
            break
        else:
            # print("百度天气找不到")
            pass

        # Classical poetry card
        if 'mu' in results.attrs:
            r = results.find(class_="op_exactqa_detail_s_answer")
            if r == None:
                # print("百度诗词找不到答案")
                pass
            else:
                # print r.get_text()
                # print("百度诗词找到答案")
                answer.append(r.get_text().strip())
                flag = 1
                break

        # Perpetual calendar & dates
        if 'mu' in results.attrs and i == 1 and results.attrs[
                'mu'].__contains__('http://open.baidu.com/calendar'):
            r = results.find(class_="op-calendar-content")
            if r == None:
                # print("百度万年历找不到答案")
                pass
            else:
                # print r.get_text()
                # print("百度万年历找到答案")
                answer.append(r.get_text().strip().replace("\n", "").replace(
                    " ", ""))
                flag = 1
                break

        # if 'tpl' in results.attrs and i == 1 and results.attrs['tpl'].__contains__('calendar_new'):
        #     # print(results)
        #     r = None  # results.attrs['fk'].replace("6018_", "")
        #     print(r)
        #
        #     if r == None:
        #         print("百度万年历新版找不到答案")
        #         # continue
        #     else:
        #         # print r.get_text()
        #         print("百度万年历新版找到答案")
        #         answer.append(r)
        #         flag = 1
        #         break

        # Calculator card
        if 'mu' in results.attrs and results.attrs['mu'].__contains__(
                'http://open.baidu.com/static/calculator/calculator.html'):
            # r = results.find('div').find_all('td')[1].find_all('div')[1]
            r = results.find(class_="op_new_val_screen_result")
            if r == None:
                # print("计算器找不到答案")
                pass
                # continue
            else:
                # print r.get_text()
                # print("计算器找到答案")
                answer.append(r.get_text().strip())
                flag = 1
                break

        # Baidu Zhidao best answer
        # if 'mu' in results.attrs:
        #     r = results.find(class_='op_best_answer_question_link')
        #     if r == None:
        #         print("百度知道图谱找不到答案")
        #     else:
        #         print("百度知道图谱找到答案")
        #         url = r['href']
        #         zhidao_soup = To.get_html_zhidao(url)
        #         r = zhidao_soup.find(class_='bd answer').find('pre')
        #         if r == None:
        #             r = zhidao_soup.find(class_='bd answer').find(class_='line content')
        #
        #         answer.append(r.get_text())
        #         flag = 1
        #         break
        #
        if results.find("h3") != None:
            # Baidu Zhidao
            # if results.find("h3").find("a").get_text().__contains__("百度知道") and (i == 1 or i == 2):
            #     url = results.find("h3").find("a")['href']
            #     if url == None:
            #         # print("百度知道图谱找不到答案")
            #         continue  # 当前id只会存在一个h3,没有答案则进入下一个id找
            #     else:
            #         # print("百度知道图谱找到答案")
            #         zhidao_soup = To.get_html_zhidao(url)
            #         r = zhidao_soup.find(class_='bd answer')
            #         if r == None:
            #             continue
            #         else:
            #             r = r.find('pre')
            #             if r == None:
            #                 r = zhidao_soup.find(class_='bd answer').find(class_='line content')
            #         text = r.get_text().strip()
            #         answer.append(text)
            #         flag = 1
            #         break

            # Baidu Baike
            link = results.find("h3").find("a")
            if link is not None and link.get_text().__contains__("百度百科"):
                url = results.find("h3").find("a")['href']
                if url == None:
                    # print("百度百科找不到答案")
                    continue
                else:
                    # print("百度百科找到答案")
                    baike_soup = To.get_html_baike(url)

                    r = baike_soup.find(class_='lemma-summary')
                    if r == None:
                        continue
                    else:
                        r = r.get_text().replace("\n", "").strip()
                    answer.append(r)
                    flag = 1
                    break

        text += results.get_text()

    if flag == 1:
        return answer

    # Fetch the Bing snippets
    soup_bing = To.get_html_bing('https://www.bing.com/search?q=' +
                                 quote(query))
    # Check whether the query hits Bing's knowledge panel
    # bingbaike = soup_bing.find(class_="b_xlText b_emphText")
    bingbaike = soup_bing.find(class_="bm_box")

    if bingbaike != None:
        if bingbaike.find_all(class_="b_vList")[1] != None:
            if bingbaike.find_all(class_="b_vList")[1].find("li") != None:
                # print("Bing知识图谱找到答案")
                flag = 1
                answer.append(bingbaike.get_text())
                # print "====="
                # print answer
                # print "====="
                return answer
    else:
        # print("Bing知识图谱找不到答案")
        results = soup_bing.find(id="b_results")
        bing_list = results.find_all('li')
        for bl in bing_list:
            temp = bl.get_text()
            if temp.__contains__(" - 必应网典"):
                print("查找Bing网典")
                url = bl.find("h2").find("a")['href']
                if url == None:
                    # print("Bing网典找不到答案")
                    continue
                else:
                    # print("Bing网典找到答案")
                    bingwd_soup = To.get_html_bingwd(url)

                    r = bingwd_soup.find(class_='bk_card_desc').find("p")
                    if r == None:
                        continue
                    else:
                        r = r.get_text().replace("\n", "").strip()
                    answer.append(r)
                    flag = 1
                    break

        if flag == 1:
            return answer

            # text += results.get_text()
    # If neither search engine's knowledge graph produced an answer,
    # answer.append("")
    return answer
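
The weather branch of this last variant reads four labelled spans out of Baidu's weather card. The same field extraction against a locally supplied HTML fragment, so it runs without a live request; the class names are copied from the code above, while the HTML itself is an invented stand-in for a real results page:

from bs4 import BeautifulSoup  # pip install beautifulsoup4

html = """
<div class="op_weather4_twoicon_today OP_LOG_LINK">
  <span class="op_weather4_twoicon_date">周一 5月20日</span>
  <span class="op_weather4_twoicon_temp">20~28℃</span>
  <span class="op_weather4_twoicon_weath">多云</span>
  <span class="op_weather4_twoicon_wind">南风3级</span>
</div>
"""
today = BeautifulSoup(html, "html.parser").find(
    class_="op_weather4_twoicon_today OP_LOG_LINK")
fields = ["op_weather4_twoicon_date", "op_weather4_twoicon_temp",
          "op_weather4_twoicon_weath", "op_weather4_twoicon_wind"]
print("\t".join(today.find(class_=cls).get_text().strip() for cls in fields))
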