Example #1
0
def _load_define_dict(file_path):
    """Load user-defined words from a text file into ``_self_define_words``.

    The file is read line by line. Lines starting with ``#`` are treated as
    comments and skipped; every other line is converted with
    ``code.any2unicode``, stripped of surrounding whitespace and added to the
    module-level ``_self_define_words`` set. Blank lines are not filtered, so
    they contribute an empty string to the set.

    :param file_path: path of the dictionary file (read as UTF-8 on Python 3)
    :return: None; the size of the set is logged when loading finishes
    """
    global _self_define_words
    # Python 2's built-in open() has no ``encoding`` keyword, hence the branch.
    if sys.version_info[0] < 3:
        handle = open(file_path, 'r')
    else:
        handle = open(file_path, 'r', encoding='utf-8')
    # The context manager guarantees the handle is closed even if a line
    # fails to convert part-way through the file.
    with handle:
        for line in handle:
            if line.startswith("#"):
                continue
            _self_define_words.add(code.any2unicode(line).strip())
    logger.info("self_define_words = %d", len(_self_define_words))
def key_words_to_synonyms(keywords):
    """Replace each keyword by its entry in the configured synonym dictionary.

    The mapping is read from ``config`` under the key ``"synonyms_dict"``.
    Keywords without an entry are kept unchanged.

    :param keywords: iterable of keyword strings
    :return: set of keywords after synonym replacement; when the mapping is
        missing or empty, the keywords themselves as a set
    """
    mapping = config.get_value("synonyms_dict", None)
    if mapping is None or len(mapping) == 0:
        logger.info("load synonyms dict error!")
        return set(keywords)

    logger.info("in rule based qq similarity, len(synonyms_dict) = %s",
                len(mapping))
    return set(mapping.get(keyword, keyword) for keyword in keywords)
Example #3
0
    def __init__(self):
        """Set up the FAQ list, default match types and synonym table.

        Reads ``faq_list`` from ``config``, loads the default match types,
        times the synonym initialisation and logs the result. Unless the
        module-level ``is_test`` flag is set, a ``RepeatingTimer`` is started
        that calls ``self.update_dict`` every ``refresh_interval``.
        """
        # Per the original note: keyed by id, values are docs -- TODO confirm.
        self.faqs = config.get_value("faq_list")
        self.synonym = {}
        self.defualt_match_type = None
        self.load_defualt_match_type()

        # Only the synonym initialisation is timed.
        started_at = time.time()
        self.init_synonym()
        elapsed = time.time() - started_at

        logger.info("load faq segment used %d second, total Len faq = %d",
                    elapsed, len(self.faqs))
        logger.info("load synonym successful, synonym len = %d",
                    len(self.synonym))

        if is_test:
            # No background refresh while running under test.
            return
        self.repeatingTimer = RepeatingTimer(refresh_interval,
                                             self.update_dict)
        self.repeatingTimer.start()
def find_answer_by_rule(query, filted_faq):
    """Collect answers from the keyword rules in ``_rules`` that match a query.

    A rule matches when it has at least one keyword and every keyword occurs
    as a substring of the query words concatenated without a separator. For
    each matching rule, every answer id that can be resolved through
    ``find_answer_string_in_filted_faqs`` yields one result entry.

    Rules are no longer considered once three answers have been collected.
    The limit is checked per rule, not per answer, so the last matching rule
    may push the total above three.

    :param query: iterable of query word strings
    :param filted_faq: candidate FAQ entries used to resolve answer ids
    :return: list of ``[answer_id, (score, answer_string, rule_id,
        '#'-joined keywords)]`` entries; empty when no rule matches
    """
    logger.info('filted faq length in find answer by rule = %d',
                len(filted_faq))
    max_hits = 3
    # The concatenated query is identical for every rule: build it once.
    query_text = ''.join(query)
    hit_count = 0
    answers = []
    for rule_id, key_words, answer_ids, score in _rules:
        if hit_count >= max_hits:
            # No later rule can contribute any more, so stop scanning.
            break
        if not key_words:
            continue
        if not all(keyword in query_text for keyword in key_words):
            continue
        rule_label = '#'.join(key_words)
        for answer_id in answer_ids:
            answer_string = find_answer_string_in_filted_faqs(
                answer_id, filted_faq)
            if answer_string is None:
                # The answer is not among the recalled candidates.
                continue
            answers.append(
                [answer_id, (score, answer_string, rule_id, rule_label)])
            hit_count += 1
    return answers
def old_find_answer_by_rule(query, filted_faq):
    """Legacy rule matcher returning the first FAQ entry that satisfies a rule.

    Rules from ``_rules`` are tried in order. A rule applies when all of its
    keywords are present among the query words. For an applicable rule, the
    first FAQ entry whose words also contain every keyword is returned.

    Debug details go through ``logger.debug`` rather than ``print``, so the
    function parses on both Python 2 and Python 3 and nothing is written to
    stdout.

    :param query: iterable of query words
    :param filted_faq: iterable of ``(key, words, ori_query, faq_type)``
        tuples
    :return: ``Answer(key, score, ori_query)`` for the first match, or
        ``None`` when no rule/FAQ pair matches
    """
    logger.info('filted faq length in find answer by rule = %d',
                len(filted_faq))
    query_words = set(query)
    query_label = '#'.join(query_words)
    for rule_id, key_words, answer_id, score in _rules:
        if not query_words.issuperset(key_words):
            continue
        rule_label = '#'.join(key_words)
        logger.debug('query = %s (%d), key words = %s (%d)', query_label,
                     len(query_label), rule_label, len(rule_label))
        for key, words, ori_query, faq_type in filted_faq:
            answer_words = set(words)
            faq_label = '#'.join(answer_words)
            logger.debug('filted faq = %s (%d)', faq_label, len(faq_label))
            if not answer_words.issuperset(key_words):
                continue
            # %s rather than %d: rule ids may be stored as strings.
            logger.info('match rule, rule id = %s, rule = %s', rule_id,
                        rule_label)
            logger.info('match query = %s, origin query = %s', faq_label,
                        ori_query)
            return Answer(
                key,
                score,
                ori_query,
            )
    return None
Example #6
0
    def top_n_similariy_questions(self, query, topN, req_faq_type):
        """Return the ``topN`` best-scoring FAQ matches for ``query``.

        The query is rewritten and segmented, candidate FAQs are recalled,
        and each candidate is scored twice: by the keyword rules
        (``find_answer_by_rule``) and by ``compare`` against the synonym
        table. Per FAQ key the higher of the two scores is kept.

        :param query: raw user query
        :param topN: maximum number of answers to return
        :param req_faq_type: requested FAQ type. ``None`` or ``''`` restricts
            candidates to ``self.defualt_match_type``; ``'test'`` disables
            type filtering; any other value must be contained in a
            candidate's ``faq_type``. Compared in lower case.
        :return: list of ``Answer`` objects sorted by descending score
        """
        # No explicit type requested -> fall back to the default match types.
        defualt_match_type_on = req_faq_type is None or req_faq_type == ''
        rewrite_query = preprocess.rewrite(query)
        try:
            req_faq_type = req_faq_type.lower()
        except AttributeError:
            # None (or another non-string) has no .lower(); only that case is
            # handled, so unrelated errors are no longer swallowed.
            req_faq_type = 'passport'

        logger.info("len faqs = %d, req_faq_type = %s", len(self.faqs),
                    req_faq_type)
        query_words = WordSeger.get_wordseg(rewrite_query)
        logger.info("query seged words = %s", '#'.join(query_words))
        score_dict = {}
        logger.info("query words = %s", ' '.join(query_words))
        filted_faqs = questionRecall.recall(self.faqs, query_words)
        logger.info("filted faqs length = %d", len(filted_faqs))

        # Seed the score table with rule hits; malformed entries are ignored.
        for rule_answer in find_answer_by_rule(query_words, filted_faqs):
            if len(rule_answer) != 2:
                continue
            rule_key, scored_rule_answer = rule_answer
            score_dict[rule_key] = scored_rule_answer

        for key, words, ori_query, faq_type in filted_faqs:
            if req_faq_type != "test":
                if defualt_match_type_on:
                    if faq_type not in self.defualt_match_type:
                        continue
                elif req_faq_type not in faq_type:
                    continue
            qq_score = compare(query_words, words, self.synonym)
            logger.info("score = %f, compare %s && %s", qq_score,
                        '#'.join(query_words), '#'.join(words))
            # A similarity score replaces a rule score only when it is higher.
            if key not in score_dict or qq_score > score_dict[key][0]:
                score_dict[key] = (qq_score, ori_query, '-1', None)
        logger.info("compare finished")

        # Rank every scored candidate and keep the best topN.
        res = sorted(score_dict.items(), key=lambda x: x[1][0],
                     reverse=True)[:topN]
        answers = []
        logger.info("res length = %d", len(res))
        for key, scored_answer in res:
            score, similar_query, match_type, rule = scored_answer
            logger.info("top n result: key = %s, score = %f, query = %s", key,
                        score, similar_query)
            answers.append(
                Answer(AnswerId=key,
                       Score=score,
                       SimilarityQuery=similar_query,
                       RewriteQuery=rewrite_query,
                       CleanWords='#'.join(query_words),
                       MatchType=match_type,
                       Rule=rule))
        return answers
            if len(items) != 3:
                raise Exception("导入规则错误!, rule id = %d" % (rule_id))
            keywords = key_words_to_synonyms(
                map(lambda x: x.strip(), items[0].split('#')))
            answer_ids = map(lambda x: int(x.strip()),
                             items[1].strip().split('|'))
            score = float(items[2].strip())
            if algorithm_type == 'word2vec':
                score = 1.0
            rule = [str(rule_id), keywords, answer_ids, score]
            _rules.append(rule)
            rule_id += 1


# Runs at import time: load the rule file at _rule_path (presumably filling
# the module-level _rules list -- confirm against load_rules) and log how
# many rules are available afterwards.
load_rules(_rule_path)
logger.info(">>>>load rule sucess, len rule = %d", len(_rules))


def find_answer_by_rule(query, filted_faq):
    """
    :param query:
    :param filted_faq:
    :return: 匹配一条规则后返回答案
    """
    def is_keywords_in_query(query, key_words):
        if len(key_words) != 0:
            flag = True
            query_words_join = ''.join(query)
            for keyword in key_words:
                if keyword not in query_words_join:
                    flag = False