def cal_user_portrait(self, user):
    """Analyze a user's portrait (keyword weights) and persist it.

    Pulls the user's 50 most recent history records, groups them by theme,
    asks each ThemeBot for its keyword weights, merges the weights and
    stores the serialized result in the 'user_portrait' collection.

    :param user: user name whose history is analyzed
    :return: dict mapping keyword -> accumulated weight
    """
    colname = 'history'
    col_portrait_name = 'user_portrait'
    col = connect_mongodb_col(ChatBot.dbname, colname)
    history = [item for item in
               col.find({'user': user}, {'_id': 0, 'user': 0}).sort('time', -1).limit(50)]
    # Group history records by theme. Ignore records whose theme is unknown
    # instead of raising KeyError (same policy as similar_recommanded).
    record_dict = {theme: [] for theme in self.themes}
    theme_set = set()
    for item in history:
        try:
            record_dict[item['theme']].append(item)
            theme_set.add(item['theme'])
        except KeyError:
            continue
    # Merge keyword weights from every theme bot the user touched.
    keys = {}
    for theme in theme_set:
        portrait = self.themebots[theme].get_user_portrait(user)
        for word in portrait:
            keys[word] = keys.get(word, 0) + portrait[word]
    print(keys)
    # Serialize as "word::weight--word::weight--..." (parsed by get_user_portrait).
    key_str = "".join(w + "::" + str(keys[w]) + "--" for w in keys)
    col = connect_mongodb_col(ChatBot.dbname, col_portrait_name)
    col.insert({'user': user, 'portrait': key_str})
    return keys
def get_theme_docs(theme):
    """Collect text fragments for a theme from its docs and questions.

    For every document/question: the title (minus a leading 'Q:' marker),
    its type, and up to ten 20-character slices of the body text are
    gathered; duplicates are removed before returning.

    :param theme: theme name, used to derive the collection names
    :return: de-duplicated list of text fragments
    """
    chunk = 20
    dbname = 'chatbotdb'
    fragments = []

    def slices(text):
        # At most ten leading chunks of `chunk` characters each.
        count = min(len(text) // chunk, 10)
        return [text[k * chunk:k * chunk + chunk] for k in range(count)]

    doc_col = connect_mongodb_col(dbname, theme + '_doc')
    for rec in doc_col.find({}, {'_id': 0}):
        fragments.append(rec['title'].replace('Q:', ''))
        fragments.append(rec['type'])
        fragments.extend(slices(rec['content']))

    ques_col = connect_mongodb_col(dbname, theme + '_ques')
    for rec in ques_col.find({}, {'_id': 0}):
        fragments.append(rec['question'].replace('Q:', ''))
        fragments.append(rec['type'])
        fragments.extend(slices(rec['answer']))

    return list(set(fragments))
def set_types():
    """Gather every 'type_s' value and store the joined string on the linux theme bot."""
    type_col = connect_mongodb_col('chatbotdb', 'document_type')
    # Trailing ',' is kept deliberately: consumers split on ',' and tolerate it.
    types = ''.join(rec['type_s'] + ',' for rec in type_col.find({}, {'_id': 0}))
    bot_col = connect_mongodb_col('chatbotdb', 'themebot')
    bot_col.update({'theme': 'linux'}, {"$set": {"types": types}})
    for entry in types.split(','):
        print(entry)
def simulated_pattern_historical_record(theme='弹性计算', kind_num=10):
    """Simulate patterned user history records and bulk-insert them.

    Creates `kind_num` user groups; every user in a group browses a shared
    sampled "pattern" document set plus some random leftover documents, so
    the groups form recognizable behavior patterns.

    :param theme: theme whose '<theme>_doc' collection supplies titles
    :param kind_num: number of user groups to generate
    :return: None (side effect: bulk insert into the 'history' collection)
    """
    def _preference():
        # 5% dissatisfied (0), 80% neutral (1), 15% satisfied (2).
        per = random.randint(0, 100)
        if per < 5:
            return 0
        elif per < 85:
            return 1
        return 2

    dbname = 'chatbotdb'
    colname = 'history'
    document_items = [item['title'] for item in
                      connect_mongodb_col(dbname, theme + '_doc').find({}, {'title': 1})]
    # Random group sizes between 15 and 30 users.
    kind_list = [random.randint(15, 30) for _ in range(kind_num)]
    user_id = 0  # running user counter (avoid shadowing builtin `id`)
    type_num = 0
    col = connect_mongodb_col(dbname, colname)
    insert_list = []  # batched rows for a single insert_many
    for kind in kind_list:
        type_num += 1
        p_num = random.randint(10, 20)
        pattern_items = random.sample(document_items, p_num)
        left_document = list(set(document_items) - set(pattern_items))
        for _ in range(kind):
            user_id += 1
            user_name = 'user_' + str(type_num) + '_' + str(user_id)
            print(user_name)
            # Pattern records shared by the group, plus user-specific noise.
            pattern_record = random_records(pattern_items, random.randint(5, p_num + 5))
            ran_records = random_records(left_document, random.randint(5, 15))
            for record in (pattern_record, ran_records):
                for r in record.keys():
                    insert_list.append(
                        {'user': user_name, 'time': r, 'theme': theme,
                         'title': record[r], 'type': 'documentation',
                         'preference': _preference()})
    col.insert_many(insert_list)
def initialize_linux_doc():
    """Copy every record from 'document' into 'linux_doc'.

    :return: None (side effect: bulk insert into 'linux_doc')
    """
    doc_col = connect_mongodb_col('chatbotdb', 'document')
    linux_doc_col = connect_mongodb_col('chatbotdb', 'linux_doc')
    # Renamed from `list` to avoid shadowing the builtin.
    docs = [{'title': item['title'], 'type': item['type_s'], 'path': item['path'],
             'content': item['content'], 'access': item['access'],
             'docID': item['docID']}
            for item in doc_col.find({}, {'_id': 0})]
    # insert_many raises InvalidOperation on an empty list; skip in that case.
    if docs:
        linux_doc_col.insert_many(docs)
def MRR_2(theme, k=1.0, train=False, num=5):
    """Evaluate retrieval quality for a theme bot over the test set.

    Computes Mean Reciprocal Rank, average recall, average precision and
    mean average precision for `bot.get_similar_documents` against the
    questions stored in 'linux_test_2'.

    :param theme: theme name used to build the ThemeBot on first call
    :param k: weighting factor forwarded to bot.start()
    :param train: whether the bot should (re)train its models on start
    :param num: number of documents requested per query
    :return: (MRR, AR, AP, AMAP, testing_set, distribution, notfound)
    """
    global bot
    if bot is None:
        bot = theme_bot.ThemeBot(theme=theme)  # create the theme bot lazily
        bot.start(train=train, k=k)
    db_name = 'chatbotdb'
    col_name = 'linux_test_2'
    col = connect_mongodb_col(db_name, col_name)
    # testing_set: question -> list of relevant document titles
    testing_set = get_testing_set()
    SMRR = 0  # sum of reciprocal first-hit ranks
    SR = 0    # sum of per-query recall
    SP = 0    # sum of per-query precision
    SMAP = 0  # sum of per-query average precision
    distribution = {}  # rank position -> number of hits at that rank
    notfound = []      # queries with no relevant document returned
    for i in range(num + 1):
        distribution[i] = 0
    for item in testing_set.keys():
        rank = 0
        ranklist = []  # 1-based ranks of every relevant hit for this query
        TP = 0
        sim_docs = bot.get_similar_documents(item, num)
        if sim_docs is None:
            print(sim_docs)
        if sim_docs is not None:
            # Record the rank of every relevant document in the result list.
            for i in range(len(sim_docs)):
                if sim_docs[i][0] in testing_set[item]:
                    rank = i + 1
                    ranklist.append(rank)
                    distribution[rank] += 1
            if len(ranklist) > 0:
                SMRR += 1 / ranklist[0]  # reciprocal rank of the first hit
            else:
                notfound.append(item)
            # Accumulate TP (number of relevant documents retrieved).
            for i in range(len(sim_docs)):
                if sim_docs[i][0] in testing_set[item]:
                    TP += 1
            SR += TP / len(testing_set[item])
            SP += TP / num
            """
            if TP / len(testing_set[item]) < 0.4:
                print(item, print(TP / len(testing_set[item])))
                col.delete_one({'question': item})
            """
            # Average precision contribution of this query.
            for i in range(len(ranklist)):
                SMAP += (i + 1) / (ranklist[i] * len(ranklist))
    # NOTE(review): raises ZeroDivisionError if the test set is empty — confirm
    # callers guarantee a non-empty 'linux_test_2' collection.
    MRR = SMRR / len(testing_set)
    AR = SR / len(testing_set)
    AP = SP / len(testing_set)
    AMAP = SMAP / len(testing_set)
    return MRR, AR, AP, AMAP, testing_set, distribution, notfound
def load_english_dict():
    """Fill the module-level English_dictionary with idf values of English words."""
    from db_connect import connect_mongodb_col
    idf_col = connect_mongodb_col('chatbotdb', 'idf_dict')
    for entry in idf_col.find({}, {'_id': 0}):
        word = entry['word']
        # Only keep words that the English detector accepts.
        if distinguish_english(word):
            English_dictionary[word] = entry['idf']
def __init__(self):
    """Load the jieba user dictionary, read the theme list and prepare bot registries."""
    jieba.load_userdict(ChatBot.userdict_path)
    themebot_col = connect_mongodb_col(ChatBot.dbname, ChatBot.themebot_col_name)
    self.themebots = {}      # theme name -> ThemeBot, filled lazily elsewhere
    self.themequesbots = {}  # theme name -> ThemeQuesBot, filled lazily elsewhere
    self.themes = [item['theme'] for item in themebot_col.find({}, {'_id': 0})]
    load_addwordlist()
def extract_tf_idf(row_corpus, db_name='chatbotdb', col_name='idf_dict'):
    """Build an idf dictionary from a corpus, write it to disk and MongoDB.

    Each corpus entry is cleaned, split on whitespace and segmented with
    jieba; document frequencies are counted once per document and turned
    into idf = log10(total / (df + 1)) for every term seen in more than
    one document.

    :param row_corpus: iterable of raw document strings
    :param db_name: target database name
    :param col_name: target collection name for the idf records
    :return: None (writes idf_dict_path and the MongoDB collection)
    """
    import math

    stopwords = stopwordslist(stopword_path)  # load the stop-word list
    total = len(row_corpus)  # number of documents in the corpus
    all_dict = {}  # term -> document frequency
    idf_dict = {}  # term -> idf value formatted as a 10-decimal string
    for c in row_corpus:
        # Strip literal and escaped newlines before segmentation.
        content = c.replace("\r", "").replace("\n", "").replace(
            "\\r", "").replace("\\n", "")
        content_seg = []
        for word in content.split():  # split English tokens on whitespace
            for cut_word in jieba.cut(word):  # Chinese word segmentation
                content_seg.append(cut_word)
        # Count each surviving term once per document.
        temp_dict = {}
        for seg in content_seg:
            if seg not in stopwords:
                temp_dict[seg] = 1
        for key in temp_dict.keys():
            all_dict[key] = all_dict.get(key, 0) + 1
    # Compute idf, skipping hapax terms (df == 1).
    for key in all_dict.keys():
        if all_dict[key] != 1:
            idf_dict[key] = '%.10f' % (math.log10(total / (all_dict[key] + 1)))
    idf_list = []
    if os.path.exists(corpus_directory) is False:
        os.mkdir(corpus_directory)
    # 'with' guarantees the file is closed even if a write fails mid-way.
    with open(idf_dict_path, 'w', encoding='utf-8') as fw:
        for k in idf_dict:
            if k != '\n':
                fw.write(k + ' ' + idf_dict[k] + '\n')
                idf_list.append({"word": k, "idf": idf_dict[k]})
    # Store the idf records in MongoDB as well.
    idf_col = db_connect.connect_mongodb_col(db_name, col_name)
    # insert_many raises on an empty list; skip when nothing qualified.
    if idf_list:
        idf_col.insert_many(idf_list)
    print("语料库生成完成")
def get_hot_doc(self):
    """Return the hot-document fallback recommendations as a single-group list."""
    hot_col = connect_mongodb_col(ChatBot.dbname, 'hot_doc')
    # Weight 1 for every hot document; wrapped in an outer list to match
    # the per-theme grouping returned by similar_recommanded.
    docs = [(rec['title'], rec['theme'], 1) for rec in hot_col.find({}, {'_id': 0})]
    return [docs]
def read_corpus_from_mongo(cls):
    """Read interrogative-classifier training data from MongoDB.

    :return: list of {'content': ..., 'label': ...} training samples
    """
    col = connect_mongodb_col('chatbotdb', 'ques_classify')
    # Renamed from `list` to avoid shadowing the builtin.
    samples = [{'content': item['content'], 'label': item['type']}
               for item in col.find({}, {'_id': 0})]
    print(samples)
    return samples
def get_dict():
    """Parse the first stored user portrait into a {word: weight} dict and print it.

    Diagnostic helper: results are only printed, nothing is returned.
    """
    col = connect_mongodb_col('chatbotdb', 'user_portrait')
    records = [item for item in col.find({}, {'_id': 0})]
    # Guard against an empty collection instead of raising IndexError.
    if not records:
        return
    portrait = records[0]
    print(portrait)
    # Portrait format: "word::weight--word::weight--..."
    weights = {}
    for part in portrait['portrait'].split("--"):
        pair = part.split("::")
        if len(pair) == 2:
            weights[pair[0]] = float(pair[1])
    print(weights)
def extract_addword():
    """Extract frequent multi-character words (idf < 1) into corpus/addword.txt.

    :return: None (writes corpus/addword.txt)
    """
    col = db_connect.connect_mongodb_col('chatbotdb', 'idf_dict')
    # 'with' closes the file even if iteration raises mid-way.
    with open('corpus/addword.txt', 'w', encoding='utf-8') as f:
        for item in col.find({}, {'_id': 0}):
            # idf < 1 means the word occurs in many documents.
            if float(item['idf']) < 1 and len(item['word']) > 1:
                f.write(item['word'] + '\n')
def get_title_testing_list(theme='linux'):
    """Build the baseline test list where each document title queries itself.

    :param theme: theme whose '<theme>_doc' collection is read
    :return: list of {'query': title, 'docs': [title]} entries
    """
    doc_col = connect_mongodb_col('chatbotdb', theme + '_doc')
    return [{'query': rec['title'], 'docs': [rec['title']]}
            for rec in doc_col.find({}, {'title': 1})]
def get_testing_set():
    """Load the retrieval test set.

    :return: dict mapping question text -> comma-split list of answer titles
    """
    col = connect_mongodb_col('chatbotdb', 'linux_test_2')
    return {rec['question']: rec['answer'].split(',')
            for rec in col.find({}, {'_id': 0})}
def extract_addword(self):
    """Build the add-word list from every theme's name and type string.

    Reads all theme-bot records, concatenates theme + types for each, and
    feeds the result to produce_addwordlist, which stores the words in the
    tf-idf dictionary.
    """
    themebot_col = connect_mongodb_col(ChatBot.dbname, ChatBot.themebot_col_name)
    type_strings = [rec['theme'] + rec['types']
                    for rec in themebot_col.find({}, {'_id': 0})]
    produce_addwordlist(type_strings)
    print('附加词生成成功!')
def reset_theme_table(theme):
    """Populate random interaction statistics for every document of a theme.

    :param theme: theme whose '<theme>_doc' collection is updated in place
    """
    col = connect_mongodb_col('chatbotdb', theme + '_doc', False)
    for doc in col.find({}, {'_id': 1, 'answer': 1}):
        positive = random.randint(20, 100)
        negative = random.randint(10, 100)
        # Access count is at least the total vote count.
        access = random.randint(positive + negative, 500)
        recommended = random.randint(access * 2, access * 10)
        col.update({'_id': doc['_id']},
                   {"$set": {"positive": positive, "negative": negative,
                             "access": access, 'recommended': recommended,
                             'update_time': random_time()}})
def extract_dict(self):
    """Segment every theme's document corpus and compute the tf-idf dictionary."""
    print('开始分词并计算tf-idf值')
    corpus = []
    for theme in self.themes:
        theme_col = connect_mongodb_col(ChatBot.dbname, theme + '_doc')
        # Lower-cased title + content form one corpus entry per document.
        corpus.extend(rec['title'].lower() + rec['content'].lower()
                      for rec in theme_col.find({}, {'title': 1, 'content': 1}))
    extract_tf_idf(corpus)
    print('成功获得字典')
def read_themes():
    """Read every theme name and number them from 1 in sorted order.

    :return: dict mapping theme name -> 1-based index
    """
    col = connect_mongodb_col('chatbotdb', 'themebot')
    cursor = col.find({}, {'theme': 1}).sort('theme')
    return {rec['theme']: idx for idx, rec in enumerate(cursor, start=1)}
def tan_xing(theme):
    """Import raw Q&A records of a theme into '<theme>_doc' and register the theme bot.

    Skips records with an empty body or a duplicate question title, collects
    the 'application-catalog' type strings, and creates the themebot entry
    if it does not exist yet.

    :param theme: source collection name (target is '<theme>_doc')
    :return: None
    """
    doc_col = connect_mongodb_col('chatbotdb', theme)
    zhuanyou_doc_col = connect_mongodb_col('chatbotdb', theme + '_doc')
    docs = []  # renamed from `list` to avoid shadowing the builtin
    type_list = []
    title_set = set()
    for item in doc_col.find({}, {'_id': 0}):
        # Skip empty bodies and duplicate questions.
        if len(item['body']) == 0 or item['question'] in title_set:
            continue
        doc_type = item['application'] + '-' + item['catalog']
        docs.append({'title': item['question'], 'type': doc_type, 'path': 'null',
                     'content': item['body'], 'access': 0})
        title_set.add(item['question'])
        type_list.append(doc_type)
    # insert_many raises on an empty list; skip when everything was filtered out.
    if docs:
        zhuanyou_doc_col.insert_many(docs)
    # Trailing ',' kept: consumers split on ',' and tolerate it.
    s = ''
    for doc_type in set(type_list):
        s += doc_type + ','
    theme_col = connect_mongodb_col('chatbotdb', 'themebot')
    if theme_col.find_one({'theme': theme}) is None:
        theme_col.insert({'theme': theme, 'types': s,
                          'document_table': theme + '_doc',
                          'document_mat': theme + '_mat'})
def simulated_historical_record(user='******', theme='弹性计算'):
    """Simulate a random browsing history for one user and bulk-insert it.

    :param user: user name written into the records
    :param theme: theme whose '<theme>_doc' titles are sampled
    :return: None (side effect: bulk insert into the 'history' collection)
    """
    doc_titles = [rec['title'] for rec in
                  connect_mongodb_col('chatbotdb', theme + '_doc').find({}, {'title': 1})]
    records = random_records(doc_titles, random.randint(5, 25))
    history_col = connect_mongodb_col('chatbotdb', 'history')
    rows = []
    for stamp, title in records.items():
        roll = random.randint(0, 100)
        # 5% dissatisfied (0), 80% neutral (1), 15% satisfied (2).
        if roll < 5:
            preference = 0
        elif roll < 85:
            preference = 1
        else:
            preference = 2
        rows.append({'user': user, 'time': stamp, 'theme': theme,
                     'title': title, 'type': 'documentation',
                     'preference': preference})
    history_col.insert_many(rows)
def similar_recommanded(self, user, recommended_num=20):
    """Recommend documents similar to a user's recent history.

    Reads the 50 most recent history records, weights each theme by an
    exponentially decayed recency score, then asks each theme's bot for a
    share of recommendations proportional to that theme's weight.

    :param user: user name whose history is analyzed
    :param recommended_num: total number of recommendations to distribute
    :return: list of per-theme recommendation lists (hot docs as fallback)
    """
    colname = 'history'
    col = connect_mongodb_col(ChatBot.dbname, colname)
    history = [item for item in
               col.find({'user': user}, {'_id': 0, 'user': 0}).sort('time', -1).limit(50)]
    # No history at all: fall back to the hot documents.
    if len(history) == 0:
        return self.get_hot_doc()
    weight_dict = {theme: 0 for theme in self.themes}  # theme -> decayed weight
    record_dict = {theme: [] for theme in self.themes}  # theme -> its records
    now = time.time()
    day_value = 86400  # seconds per day
    for item in history:
        time_array = time.strptime(item['time'], "%Y-%m-%d %H:%M:%S")
        time_stamp = int(time.mktime(time_array))
        diff_days = math.floor((now - time_stamp) / day_value)
        # Older records contribute exponentially less weight.
        item['decay'] = exponential_decay(diff_days)
        try:
            weight_dict[item['theme']] += exponential_decay(diff_days)
            record_dict[item['theme']].append(item)
        except KeyError:
            # History rows with an unknown theme are ignored.
            continue
    theme_list = sorted(weight_dict, key=weight_dict.get, reverse=True)
    weight_sum = sum(weight_dict[theme] for theme in theme_list)
    # Every record had an unknown theme: fall back to hot docs instead of
    # dividing by zero below.
    if weight_sum == 0:
        return self.get_hot_doc()
    recommended_list = []
    for theme in theme_list:
        # Each theme gets a share proportional to its weight.
        record_num = math.floor(recommended_num * weight_dict[theme] / weight_sum)
        if record_num > 0:
            recommended_list.append(
                self.themebots[theme].historical_recommanded(record_dict[theme], record_num))
    return recommended_list
def get_user_portrait(self, user):
    """Return a user's portrait, computing and caching it on first access.

    :param user: user name
    :return: dict mapping keyword -> weight
    """
    col_portrait_name = 'user_portrait'
    col = connect_mongodb_col(ChatBot.dbname, col_portrait_name)
    result = [item for item in col.find({'user': user}, {'_id': 0})]
    print(result)
    if len(result) == 0:
        # Not cached yet: analyze the history and store the portrait.
        return self.cal_user_portrait(user)
    # Renamed from `dict` to avoid shadowing the builtin.
    weights = {}
    # Portrait format: "word::weight--word::weight--..."
    for part in result[0]['portrait'].split("--"):
        pair = part.split("::")
        if len(pair) == 2:
            weights[pair[0]] = float(pair[1])
    return weights
def read_question(self, reload=False, num=0):
    """Return cached question records, reloading from MongoDB when needed.

    :param reload: force a re-read from the database
    :param num: maximum number of records to return; 0 means all
    :return: list of question records sorted by question text
    """
    # Re-read when nothing is cached yet or the caller forces a reload.
    if self.questions is None or reload is True:
        col = connect_mongodb_col(ThemeQuesBot.db_name, self.question_table)
        cursor = col.find({}, {'question': 1, 'answer': 1}).sort('question')
        self.questions = list(cursor)
    return self.questions if num == 0 else self.questions[:num]
def load_properties(self):
    """Load ThemeQuesBot base properties (types, keywords, model paths) from MongoDB."""
    # Guard clause: a theme name is required to locate the record.
    if self.theme is None:
        print('Theme 属性值为空,载入属性失败!')
        return
    data = connect_mongodb_col('chatbotdb', 'themebot').find_one(
        {'theme': self.theme}, {'_id': 0})
    self.types = data['types'].split(',')
    # Keywords come from segmenting the theme name plus its type string.
    self.key_words = extract_key_words(self.theme.lower() + data['types'].lower())
    self.question_table = self.theme + '_ques'
    prefix = ThemeQuesBot.root_directory + '/' + self.theme + '/' + self.theme
    self.tfidf_location = prefix + '_tfidf'
    self.lsi_location = prefix + '_lsi'
    self.index_location = prefix + '_index'
    self.dictionary_location = prefix + '_dict'
    print('ThemeQuesBot属性载入成功')
def get_dict_id2idf(self, dictionary):
    """Map every dictionary token to its idf value, keyed by word and by id.

    :param dictionary: gensim dictionary; its token2id supplies the vocabulary
    :return: (idf_dict, id_idf) where idf_dict maps word -> idf and id_idf
             maps the word's position in token2id iteration order -> idf;
             words missing from the idf collection default to 1.0
    """
    word_id_dict = dictionary.token2id
    col = connect_mongodb_col(ThemeQuesBot.db_name, ThemeQuesBot.idf_col_name)
    # One $in query instead of a find_one round-trip per vocabulary word.
    words = list(word_id_dict.keys())
    stored = {rec['word']: float(rec['idf'])
              for rec in col.find({'word': {'$in': words}},
                                  {'_id': 0, 'word': 1, 'idf': 1})}
    idf_dict = {}
    id_idf = {}
    # NOTE(review): the numeric keys are sequential iteration positions, not
    # the gensim token ids from token2id — confirm callers expect this.
    for pos, w in enumerate(words):
        idf = stored.get(w, 1.0)
        idf_dict[w] = idf
        id_idf[pos] = idf
    return idf_dict, id_idf
def get_themes():
    """Return every theme name registered in the 'themebot' collection."""
    theme_col = connect_mongodb_col('chatbotdb', 'themebot')
    return [record['theme'] for record in theme_col.find({}, {'theme': 1})]