def _load_define_dict(file_path):
    """
    Load user-defined words from a text file into ``_self_define_words``.

    Lines starting with ``#`` are skipped. Every other line is converted
    with ``code.any2unicode``, stripped of surrounding whitespace and added
    to the module-level set.

    :param file_path: path of the word file (opened as UTF-8 on Python 3)
    :return: None; the number of loaded words is logged
    """
    global _self_define_words
    # Python 2's builtin open() does not accept an ``encoding`` argument.
    if sys.version_info[0] < 3:
        word_file = open(file_path, 'r')
    else:
        word_file = open(file_path, 'r', encoding='utf-8')
    # The context manager closes the handle even when a line fails to
    # convert; previously the file object was never closed.
    with word_file:
        for line in word_file:
            if line.startswith("#"):
                continue
            _self_define_words.add(code.any2unicode(line).strip())
    logger.info("self_define_words = %d", len(_self_define_words))
def key_words_to_synonyms(keywords):
    """
    Replace each keyword by its entry in the configured synonym dictionary.

    :param keywords: iterable of keywords
    :return: set of keywords after synonym replacement; keywords without an
        entry are kept unchanged. When the ``synonyms_dict`` config value is
        missing or empty, the keywords are returned as a plain set.
    """
    synonyms_dict = config.get_value("synonyms_dict", None)

    # Guard clause: nothing to substitute with.
    if synonyms_dict is None or len(synonyms_dict) == 0:
        logger.info("load synonyms dict error!")
        return set(keywords)

    logger.info("in rule based qq similarity, len(synonyms_dict) = %s",
                len(synonyms_dict))
    return {synonyms_dict.get(word, word) for word in keywords}
def __init__(self):
    """
    Read the FAQ list from config, build the synonym table and, unless
    ``is_test`` is set, start the repeating refresh timer.
    """
    self.faqs = config.get_value("faq_list")
    # Original note here read "key: id, value: doc" -- it is not clear from
    # this method which attribute it describes; TODO confirm.
    self.synonym = {}
    self.defualt_match_type = None
    self.load_defualt_match_type()

    # Only init_synonym() is covered by the timing below.
    began = time.time()
    self.init_synonym()
    elapsed = time.time() - began

    logger.info("load faq segment used %d second, total Len faq = %d",
                elapsed, len(self.faqs))
    logger.info("load synonym successful, synonym len = %d",
                len(self.synonym))

    if is_test:
        return
    # presumably invokes self.update_dict every refresh_interval -- verify
    # against RepeatingTimer
    self.repeatingTimer = RepeatingTimer(refresh_interval, self.update_dict)
    self.repeatingTimer.start()
def find_answer_by_rule(query, filted_faq, max_hits=3):
    """
    Collect answers from the keyword rules that match the query.

    A rule from ``_rules`` matches when it has at least one keyword and
    every keyword is a substring of the query words joined together.
    For each matching rule, every answer id that
    ``find_answer_string_in_filted_faqs`` can resolve in ``filted_faq``
    contributes one answer.

    :param query: sequence of query word strings
    :param filted_faq: recalled FAQ entries, handed to
        ``find_answer_string_in_filted_faqs`` unchanged
    :param max_hits: once this many answers have been collected, no further
        rule is applied (defaults to 3, the previously hard-coded limit)
    :return: list of ``[answer_id, (score, answer_string, rule_id,
        '#'-joined keywords)]``; empty when no rule matches
    """
    logger.info('filted faq length in find answer by rule = %d',
                len(filted_faq))

    # The joined query is identical for every rule, so build it once
    # instead of once per rule.
    joined_query = ''.join(query)

    answers = []
    hit_count = 0
    for rule_id, key_words, answer_ids, score in _rules:
        # Nothing more can be added once the limit is reached; stop scanning.
        if hit_count >= max_hits:
            break
        # A rule without keywords never matches.
        if len(key_words) == 0:
            continue
        if not all(keyword in joined_query for keyword in key_words):
            continue

        joined_keywords = '#'.join(key_words)
        for answer_id in answer_ids:
            answer_string = find_answer_string_in_filted_faqs(
                answer_id, filted_faq)
            if answer_string is None:
                continue
            answers.append(
                [answer_id, (score, answer_string, rule_id, joined_keywords)])
            # NOTE(review): the counter tracks appended answers, so the rule
            # that crosses the limit still contributes all of its answers --
            # confirm this is the intended meaning of the limit.
            hit_count += 1
    return answers
def old_find_answer_by_rule(query, filted_faq):
    """
    Legacy rule matcher: return the first FAQ entry that satisfies a rule.

    A rule from ``_rules`` is considered when all of its keywords are among
    the query words. The first entry of ``filted_faq`` whose words also
    contain all of the rule's keywords is returned.

    :param query: iterable of query words
    :param filted_faq: iterable of ``(key, words, ori_query, faq_type)``
    :return: ``Answer(key, score, ori_query)`` for the first match, or
        ``None`` when nothing matches
    """
    logger.info('filted faq length in find answer by rule = %d',
                len(filted_faq))
    query_words = set(query)
    for rule_id, key_words, _answer_id, score in _rules:
        # Skip rules whose keywords are not all present in the query.
        if not query_words.issuperset(key_words):
            continue

        joined_keywords = '#'.join(key_words)
        # Diagnostics go through the logger; the former ``print`` statements
        # were Python 2 only syntax and wrote straight to stdout.
        logger.debug('query = %s, key words = %s',
                     '#'.join(query_words), joined_keywords)

        for key, words, ori_query, _faq_type in filted_faq:
            answer_words = set(words)
            logger.debug('filted faq = %s', '#'.join(answer_words))
            if answer_words.issuperset(key_words):
                # %s rather than %d: the rule loader stores the rule id as
                # a string.
                logger.info('match rule, rule id = %s, rule = %s',
                            rule_id, joined_keywords)
                logger.info('match query = %s, origin query = %s',
                            '#'.join(answer_words), ori_query)
                return Answer(key, score, ori_query)
    return None
def top_n_similariy_questions(self, query, topN, req_faq_type):
    '''
    Return the ``topN`` best matching FAQ answers for ``query``.

    The query is rewritten and segmented, candidate FAQs are recalled, and
    each candidate gets a score from the keyword rules and/or ``compare``.
    For a candidate hit by both, the rule result is kept unless the
    ``compare`` score is strictly higher.

    :param query: raw query text
    :param topN: maximum number of answers to return
    :param req_faq_type: requested FAQ type. ``None`` or ``''`` restricts
        candidates to ``self.defualt_match_type``; ``"test"`` (after
        lower-casing) disables type filtering; any other value must be
        contained in the candidate's ``faq_type``
    :return: list of ``Answer`` objects ordered by descending score
    '''
    rewrite_query = preprocess.rewrite(query)

    defualt_match_type_on = req_faq_type is None or req_faq_type == ''
    try:
        req_faq_type = req_faq_type.lower()
    except AttributeError:
        # None (or any value without ``lower``) falls back to this type.
        # Only AttributeError is caught so unrelated errors and interrupts
        # are no longer swallowed.
        req_faq_type = 'passport'

    logger.info("len faqs = %d, req_faq_type = %s", len(self.faqs),
                req_faq_type)
    query_words = WordSeger.get_wordseg(rewrite_query)
    logger.info("query seged words = %s", '#'.join(query_words))
    logger.info("query words = %s", ' '.join(query_words))

    filted_faqs = questionRecall.recall(self.faqs, query_words)
    logger.info("filted faqs length = %d", len(filted_faqs))

    # key -> (score, similar query, match type, rule)
    score_dict = {}

    # Seed the scores with rule based answers; malformed entries are skipped.
    for rule_answer in find_answer_by_rule(query_words, filted_faqs):
        if len(rule_answer) != 2:
            continue
        score_dict[rule_answer[0]] = rule_answer[1]

    for key, words, ori_query, faq_type in filted_faqs:
        # "test" bypasses FAQ type filtering entirely.
        if req_faq_type != "test":
            if defualt_match_type_on:
                if faq_type not in self.defualt_match_type:
                    continue
            elif req_faq_type not in faq_type:
                continue

        qq_score = compare(query_words, words, self.synonym)
        logger.info("score = %f, compare %s && %s", qq_score,
                    '#'.join(query_words), '#'.join(words))

        # Keep an existing (rule based) entry unless this score beats it.
        if key not in score_dict or qq_score > score_dict[key][0]:
            score_dict[key] = (qq_score, ori_query, '-1', None)
    logger.info("compare finished")

    # Sort all results by score, best first, and keep the top N.
    res = sorted(score_dict.items(), key=lambda x: x[1][0],
                 reverse=True)[:topN]
    logger.info("res length = %d", len(res))

    answers = []
    for key, scored_answer in res:
        score, answer, match_type, rule = scored_answer
        logger.info("top n result: key = %s, score = %f, query = %s", key,
                    score, answer)
        answers.append(
            Answer(AnswerId=key,
                   Score=score,
                   SimilarityQuery=answer,
                   RewriteQuery=rewrite_query,
                   CleanWords='#'.join(query_words),
                   MatchType=match_type,
                   Rule=rule))
    return answers
if len(items) != 3: raise Exception("导入规则错误!, rule id = %d" % (rule_id)) keywords = key_words_to_synonyms( map(lambda x: x.strip(), items[0].split('#'))) answer_ids = map(lambda x: int(x.strip()), items[1].strip().split('|')) score = float(items[2].strip()) if algorithm_type == 'word2vec': score = 1.0 rule = [str(rule_id), keywords, answer_ids, score] _rules.append(rule) rule_id += 1 load_rules(_rule_path) logger.info(">>>>load rule sucess, len rule = %d", len(_rules)) def find_answer_by_rule(query, filted_faq): """ :param query: :param filted_faq: :return: 匹配一条规则后返回答案 """ def is_keywords_in_query(query, key_words): if len(key_words) != 0: flag = True query_words_join = ''.join(query) for keyword in key_words: if keyword not in query_words_join: flag = False