Ejemplo n.º 1
0
    def __init__(self, corpus_file, word2index):
        """Build a BM25 retrieval model from a corpus file.

        Args:
            corpus_file: path of the corpus file to load.
            word2index: word -> vocabulary-index mapping handed to
                ``load_corpus_file`` to encode the corpus.
        """
        time_s = time.time()
        self.contexts, self.responses = load_corpus_file(
            corpus_file, word2index)
        self.bm2_inst = BM25Sort(self.contexts)
        # BUG FIX: "%2.f" is width-2/precision-0 and drops the decimals;
        # "%.2f" logs the elapsed time with two decimal places as intended.
        logger.debug("Time to build bm25_model by %s : %.2f seconds." %
                     (corpus_file, time.time() - time_s))
    def __init__(self, corpus_file, word2index):
        """Build a TF-IDF similarity index over a corpus file.

        Args:
            corpus_file: path of the corpus file to load.
            word2index: word -> vocabulary-index mapping handed to
                ``load_corpus_file``; only the first 50000 entries of the
                corpus are kept (``size=50000``).
        """
        time_s = time.time()
        self.contexts, self.responses = load_corpus_file(corpus_file,
                                                         word2index,
                                                         size=50000)

        # Trains and sets self.tfidf_model (presumably it also sets
        # self.corpus, which is read below — not visible in this file).
        self._train_tfidf_model()
        # Vectorize the corpus with the trained TF-IDF model.
        self.corpus_mm = self.tfidf_model[self.corpus]
        # self.index[docno] = similarity vector (a full similarity matrix).
        # NOTE: MatrixSimilarity() keeps the whole index in memory.
        self.index = similarities.MatrixSimilarity(self.corpus_mm)

        # BUG FIX: "%2.f" is width-2/precision-0 and drops the decimals;
        # "%.2f" logs the elapsed time with two decimal places as intended.
        logger.debug("Time to build tfidf model by %s: %.2f seconds." %
                     (corpus_file, time.time() - time_s))
Ejemplo n.º 3
0
    def search_bing(query):
        """Search Microsoft Bing for an answer to *query*.

        Checks the Bing knowledge-graph card first, otherwise scans the
        organic results for a Bing Wangdian (必应网典) entry and extracts
        its summary paragraph.

        Returns:
            (answer, left_text): ``answer`` is a list holding at most one
            answer string; ``left_text`` accumulates the text of the
            result list when no answer was extracted from it.
        """
        answer, left_text = [], ''

        # Fetch the Bing result page for the query.
        soup_bing = html_crawler.get_html_bing(bing_url_prefix + quote(query))

        # Knowledge-graph card present?
        card = soup_bing.find(class_="bm_box")
        if card:
            vlists = card.find_all(class_="b_vList")
            if vlists and len(vlists) > 1:
                text = vlists[1].find("li").get_text().strip()
                if text:
                    answer.append(text)
                    logger.debug("Bing知识图谱找到答案")
                    return answer, left_text
        else:
            # Look for a Bing Wangdian entry among the organic results.
            results = soup_bing.find(id="b_results")
            if results:
                for item in results.find_all('li'):
                    if " - 必应网典" in item.get_text():
                        logger.debug("查找Bing网典")
                        # Link to the Wangdian page.
                        url = item.find("h2").find("a")['href']
                        if url:
                            bingwd_soup = html_crawler.get_html_bingwd(url)
                            desc = bingwd_soup.find(
                                class_='bk_card_desc').find("p")
                            if desc:
                                text = desc.get_text().replace("\n",
                                                               "").strip()
                                if text:
                                    logger.debug("Bing网典找到答案")
                                    answer.append(text)
                                    return answer, left_text
                # BUG FIX: the original did ``left_text += r.get_text()``
                # unconditionally; at that point ``r`` could be None (no
                # results at all) or a str (reassigned by get_text() above),
                # raising AttributeError. Accumulate from the results tag
                # itself, and only when it exists.
                left_text += results.get_text()

        return answer, left_text
Ejemplo n.º 4
0
 def __init__(self, corpus_file, word2index):
     """Build a one-hot retrieval model from a corpus file.

     Args:
         corpus_file: path of the corpus file to load.
         word2index: word -> vocabulary-index mapping handed to
             ``load_corpus_file`` to encode the corpus.
     """
     time_s = time.time()
     self.contexts, self.responses = load_corpus_file(
         corpus_file, word2index)
     # BUG FIX: "%2.f" is width-2/precision-0 and drops the decimals;
     # "%.2f" logs the elapsed time with two decimal places as intended.
     logger.debug("Time to build onehot model by %s : %.2f seconds." %
                  (corpus_file, time.time() - time_s))
Ejemplo n.º 5
0
    def search_baidu(self, query):
        """Search Baidu for an answer to *query*.

        Scans result snippets 1..self.topk-1 for Baidu's vertical answer
        cards (knowledge graph, classical poetry, calendar, calculator,
        weather) and for linked Baidu Zhidao / Baidu Baike pages.

        Returns:
            (answer, left_text): ``answer`` is a list holding at most one
            answer string; ``left_text`` accumulates the text of snippets
            that matched no vertical card.
        """
        answer, left_text = [], ''

        # Fetch the Baidu result page (top snippets) for the query.
        soup_baidu = html_crawler.get_html_baidu(baidu_url_prefix +
                                                 quote(query))

        if not soup_baidu:
            return answer, left_text

        for i in range(1, self.topk):
            # BUG FIX: the original used ``find(id=1)``, re-processing the
            # first snippet on every iteration (and appending its text to
            # left_text repeatedly); use the loop index instead.
            items = soup_baidu.find(id=i)

            if not items:
                logger.debug("百度找不到答案")
                break

            # Exact-answer cards carry a "mu" attribute; only the first
            # snippet can be such a card.
            # Baidu knowledge graph
            if ("mu" in items.attrs) and i == 1:
                r = items.find(class_='op_exactqa_s_answer')
                if r:
                    logger.debug("百度知识图谱中找到答案")
                    answer.append(r.get_text().strip())
                    return answer, left_text

            # Baidu classical poetry
            if ("mu" in items.attrs) and i == 1:
                r = items.find(class_='op_exactqa_detail_s_answer')
                if r:
                    # BUG FIX: original logged the knowledge-graph message
                    # here; this branch is the poetry card.
                    logger.debug("百度古诗词中找到答案")
                    answer.append(r.get_text().strip())
                    return answer, left_text

            # Baidu calendar & date
            if ('mu' in items.attrs) and i == 1 and \
                    calendar_url in items.attrs['mu']:
                r = items.find(class_="op-calendar-content")
                if r:
                    logger.debug("百度万年历找到答案")
                    answer.append(r.get_text().strip().replace(
                        "\n", "").replace(" ", ""))
                    return answer, left_text

            # New-style calendar card: the answer is encoded in the "fk"
            # attribute, prefixed with "6018_".
            if ('tpl' in items.attrs) and i == 1 and \
                    'calendar_new' in items.attrs['tpl']:
                r = items.attrs['fk'].replace("6018_", "")
                logger.debug(r)
                if r:
                    logger.debug("百度万年历新版找到答案")
                    answer.append(r)
                    return answer, left_text

            # Baidu calculator
            if ('mu' in items.attrs) and i == 1 and \
                    calculator_url in items.attrs['mu']:
                r = items.find(class_="op_new_val_screen_result")
                if r:
                    logger.debug("计算器找到答案")
                    answer.append(r.get_text().strip())
                    return answer, left_text

            # Baidu weather
            if ('mu' in items.attrs) and i == 1 and \
                    weather_url in items.attrs['mu']:
                r = items.find(class_="op_weather4_twoicon_today")
                if r:
                    logger.debug("天气找到答案")
                    answer.append(r.get_text().replace('\n', '').strip())
                    return answer, left_text

            # Baidu Zhidao best-answer card embedded in the snippet.
            if ('mu' in items.attrs) and i == 1:
                r = items.find(class_='op_best_answer_question_link')
                if r:
                    zhidao_soup = html_crawler.get_html_zhidao(r['href'])
                    r = zhidao_soup.find(class_='bd answer').find('pre')
                    if not r:
                        r = zhidao_soup.find(class_='bd answer').find(
                            class_='line content').find(
                                class_="best-text mb-10")
                    if r:
                        logger.debug("百度知道找到答案")
                        answer.append(r.get_text().strip().replace(
                            "展开全部", "").strip())
                        return answer, left_text

            # Ordinary results whose title links to Zhidao or Baike.
            if items.find("h3"):
                title = items.find("h3").find("a").get_text()

                # Baidu Zhidao, linked form
                if "百度知道" in title and (i == 1 or i == 2):
                    url = items.find("h3").find("a")['href']
                    if url:
                        zhidao_soup = html_crawler.get_html_zhidao(url)
                        r = zhidao_soup.find(class_='bd answer')
                        if r:
                            r = r.find('pre')
                            if not r:
                                r = zhidao_soup.find(class_='bd answer').find(
                                    class_='line content').find(
                                        class_="best-text mb-10")
                            if r:
                                logger.debug("百度知道找到答案")
                                answer.append(r.get_text().strip().replace(
                                    "展开全部", "").strip())
                                return answer, left_text

                # Baidu Baike
                if "百度百科" in title and (i == 1 or i == 2):
                    url = items.find("h3").find("a")['href']
                    if url:
                        logger.debug("百度百科找到答案")
                        baike_soup = html_crawler.get_html_baike(url)

                        r = baike_soup.find(class_='lemma-summary')
                        if r:
                            answer.append(r.get_text().replace("\n",
                                                               "").strip())
                            return answer, left_text

            # No vertical card matched this snippet: keep its text.
            left_text += items.get_text()

        return answer, left_text