Esempio n. 1
0
 def cal_user_portrait(self, user):
     """
     Compute the portrait of a user and store it in the database.

     The distinct themes found in the user's 50 most recent ``history``
     records are collected; the theme bot of each theme is asked for its
     keyword weights, and the weights are summed per keyword.

     :param user: user name used to filter the ``history`` collection
     :return: dict mapping keyword -> accumulated weight
     :raises KeyError: if a history record has no ``theme`` field or the
         theme has no entry in ``self.themebots``
     """
     history_col = connect_mongodb_col(ChatBot.dbname, 'history')
     recent = history_col.find({'user': user}, {'_id': 0, 'user': 0}).sort('time', -1).limit(50)
     # Only the distinct themes are needed; the records themselves are not used.
     theme_set = {item['theme'] for item in recent}
     keys = {}
     for theme in theme_set:
         theme_weights = self.themebots[theme].get_user_portrait(user)
         for word in theme_weights:
             keys[word] = keys.get(word, 0) + theme_weights[word]
     print(keys)
     # Serialised as "word::weight--word::weight--" (each pair ends with "--").
     key_str = "".join(w + "::" + str(keys[w]) + "--" for w in keys)
     portrait_col = connect_mongodb_col(ChatBot.dbname, 'user_portrait')
     # insert_one replaces the deprecated Collection.insert.
     portrait_col.insert_one({'user': user, 'portrait': key_str})
     return keys
def get_theme_docs(theme):
    """
    Collect the distinct text fragments stored for a theme.

    For every record of ``<theme>_doc`` and ``<theme>_ques`` the heading
    (with ``Q:`` removed), the type, and up to ten 20-character slices of
    the body text are gathered.

    :param theme: theme name used as the collection-name prefix
    :return: list of unique fragments (order is not defined)
    """
    chunk_size = 20
    max_chunks = 10
    database = 'chatbotdb'

    def fragments(record, heading_key, body_key):
        # Heading, type, then only full-size slices of the body (capped).
        parts = [record[heading_key].replace('Q:', ''), record['type']]
        body = record[body_key]
        count = min(math.floor(len(body) / chunk_size), max_chunks)
        parts.extend(body[idx * chunk_size:idx * chunk_size + chunk_size]
                     for idx in range(count))
        return parts

    collected = []
    sources = (('_doc', 'title', 'content'), ('_ques', 'question', 'answer'))
    for suffix, heading_key, body_key in sources:
        for record in connect_mongodb_col(database, theme + suffix).find({}, {'_id': 0}):
            collected.extend(fragments(record, heading_key, body_key))
    return list(set(collected))
Esempio n. 3
0
def set_types(theme='linux'):
    """
    Rebuild the ``types`` string of a theme from the ``document_type`` collection.

    Every ``type_s`` value is followed by a comma (so the stored string ends
    with a trailing comma), written to the matching ``themebot`` record and
    then printed piece by piece.

    :param theme: value of the ``theme`` field of the record to update;
        defaults to ``'linux'``, the value that used to be hard-coded
    :return: None
    """
    type_col = connect_mongodb_col('chatbotdb', 'document_type')
    types = ''.join(item['type_s'] + ',' for item in type_col.find({}, {'_id': 0}))
    # update_one replaces the deprecated Collection.update (single-document update).
    connect_mongodb_col('chatbotdb', 'themebot').update_one(
        {'theme': theme}, {"$set": {"types": types}})
    for name in types.split(','):
        print(name)
Esempio n. 4
0
def simulated_pattern_historical_record(theme='弹性计算', kind_num=10):
    """
    Simulate pattern-based user history and insert it into the ``history`` collection.

    ``kind_num`` user groups of 15-30 users each are generated. Every group
    shares 10-20 randomly sampled "pattern" titles; each user gets records
    drawn from the pattern titles and from the remaining titles via
    ``random_records``.

    :param theme: theme whose ``<theme>_doc`` collection supplies the titles
    :param kind_num: number of user groups to simulate
    :return: None
    :raises ValueError: from ``random.sample`` when the collection holds fewer
        titles than the sampled pattern size
    """
    dbname = 'chatbotdb'
    colname = 'history'

    def random_preference():
        # per is uniform on 0..100: <5 -> 0 (unsatisfied), <85 -> 1 (neutral), else 2 (satisfied)
        per = random.randint(0, 100)
        if per < 5:
            return 0
        if per < 85:
            return 1
        return 2

    def to_rows(user_name, records):
        # records maps a time value to a title
        return [{'user': user_name, 'time': when, 'theme': theme, 'title': title,
                 'type': 'documentation', 'preference': random_preference()}
                for when, title in records.items()]

    document_items = [item['title'] for item in
                      connect_mongodb_col(dbname, theme + '_doc').find({}, {'title': 1})]
    kind_list = [random.randint(15, 30) for _ in range(kind_num)]
    user_id = 0  # running counter across all groups
    col = connect_mongodb_col(dbname, colname)
    insert_list = []
    for type_num, kind in enumerate(kind_list, start=1):
        p_num = random.randint(10, 20)
        pattern_items = random.sample(document_items, p_num)
        left_document = list(set(document_items) - set(pattern_items))
        for _ in range(kind):
            user_id += 1
            user_name = 'user_' + str(type_num) + '_' + str(user_id)
            print(user_name)
            # Records drawn from the group's pattern titles
            pattern_record = random_records(pattern_items, random.randint(5, p_num + 5))
            # Records drawn from the remaining titles
            ran_records = random_records(left_document, random.randint(5, 15))
            insert_list.extend(to_rows(user_name, pattern_record))
            insert_list.extend(to_rows(user_name, ran_records))
    # insert_many rejects an empty list, e.g. when kind_num is 0.
    if insert_list:
        col.insert_many(insert_list)
Esempio n. 5
0
def initialize_linux_doc():
    """
    Fill the ``linux_doc`` collection from the ``document`` collection.

    Each source record is copied with its ``type_s`` field renamed to ``type``.

    :return: None
    """
    source_col = connect_mongodb_col('chatbotdb', 'document')
    target_col = connect_mongodb_col('chatbotdb', 'linux_doc')
    rows = [
        {
            'title': src['title'],
            'type': src['type_s'],
            'path': src['path'],
            'content': src['content'],
            'access': src['access'],
            'docID': src['docID'],
        }
        for src in source_col.find({}, {'_id': 0})
    ]
    target_col.insert_many(rows)
Esempio n. 6
0
def MRR_2(theme, k=1.0, train=False, num=5):
    """
    Evaluate the retrieval quality of the theme bot on the testing set.

    For every question of ``get_testing_set()`` the bot returns up to ``num``
    similar documents; a document is a hit when its first element is listed
    in the expected answers of the question.

    :param theme: theme used to create the bot when the global ``bot`` is None
    :param k: forwarded to ``bot.start``
    :param train: forwarded to ``bot.start``
    :param num: number of documents requested per question
    :return: tuple ``(MRR, AR, AP, AMAP, testing_set, distribution, notfound)``:
        mean reciprocal rank of the first hit, mean of hits / expected answers,
        mean of hits / ``num``, mean average precision, the testing set itself,
        a dict rank -> number of hits at that rank, and the questions without
        any hit. All four metrics are 0.0 for an empty testing set.
    """
    global bot
    # NOTE(review): the bot is cached globally, so a later call with a
    # different theme reuses the first bot -- confirm this is intended.
    if bot is None:
        bot = theme_bot.ThemeBot(theme=theme)
        bot.start(train=train, k=k)
    testing_set = get_testing_set()
    distribution = {i: 0 for i in range(num + 1)}
    notfound = []
    if not testing_set:
        return 0.0, 0.0, 0.0, 0.0, testing_set, distribution, notfound

    SMRR = 0
    SR = 0
    SP = 0
    SMAP = 0
    for question, expected in testing_set.items():
        sim_docs = bot.get_similar_documents(question, num)
        if sim_docs is None:
            print(sim_docs)
            # Treat "no result" as an empty result instead of crashing below.
            sim_docs = []
        # 1-based ranks of the returned documents that are expected answers
        ranklist = [rank for rank, doc in enumerate(sim_docs, start=1)
                    if doc[0] in expected]
        for rank in ranklist:
            # .get tolerates a bot that returns more than num documents
            distribution[rank] = distribution.get(rank, 0) + 1
        if ranklist:
            SMRR += 1 / ranklist[0]
        else:
            notfound.append(question)
        TP = len(ranklist)
        SR += TP / len(expected)
        SP += TP / num
        for i, rank in enumerate(ranklist, start=1):
            SMAP += i / (rank * len(ranklist))

    total = len(testing_set)
    MRR = SMRR / total
    AR = SR / total
    AP = SP / total
    AMAP = SMAP / total
    return MRR, AR, AP, AMAP, testing_set, distribution, notfound
def load_english_dict():
    """
    Copy the IDF values of English words into ``English_dictionary``.

    Every record of the ``idf_dict`` collection whose word is accepted by
    ``distinguish_english`` is stored as ``word -> idf``.

    :return: None
    """
    from db_connect import connect_mongodb_col
    idf_col = connect_mongodb_col('chatbotdb', 'idf_dict')
    for entry in idf_col.find({}, {'_id': 0}):
        word = entry['word']
        if not distinguish_english(word):
            continue
        English_dictionary[word] = entry['idf']
Esempio n. 8
0
 def __init__(self):
     """
     Load the jieba user dictionary, read the theme names from the themebot
     collection and load the additional-word list.

     ``themebots`` and ``themequesbots`` start out empty.
     """
     jieba.load_userdict(ChatBot.userdict_path)
     theme_records = connect_mongodb_col(ChatBot.dbname, ChatBot.themebot_col_name)
     self.themebots = {}
     self.themequesbots = {}
     self.themes = [record['theme'] for record in theme_records.find({}, {'_id': 0})]
     load_addwordlist()
def extract_tf_idf(row_corpus, db_name='chatbotdb', col_name='idf_dict'):
    """
    Tokenise a corpus and build its IDF dictionary.

    Each document is stripped of line breaks, split on whitespace and cut with
    jieba. For every token outside the stop words the number of documents
    containing it is counted; tokens found in exactly one document are
    dropped. The IDF is ``log10(total / (document_count + 1))`` formatted with
    ten decimals (stored as a string). The result is written to
    ``idf_dict_path`` and inserted into MongoDB.

    :param row_corpus: sequence of document strings
    :param db_name: MongoDB database name
    :param col_name: MongoDB collection receiving ``{"word", "idf"}`` records
    :return: None
    """
    import math

    stopwords = stopwordslist(stopword_path)
    total = len(row_corpus)
    # token -> number of documents containing the token
    doc_freq = {}
    for c in row_corpus:
        # Remove real and escaped line breaks
        content = (c.replace("\r", "").replace("\n", "")
                   .replace("\\r", "").replace("\\n", ""))
        # dict.fromkeys de-duplicates within the document, keeping first-seen order
        doc_terms = dict.fromkeys(
            cut_word
            for word in content.split()   # whitespace separates English words
            for cut_word in jieba.cut(word)
            if cut_word not in stopwords
        )
        for term in doc_terms:
            doc_freq[term] = doc_freq.get(term, 0) + 1

    idf_dict = {term: '%.10f' % math.log10(total / (count + 1))
                for term, count in doc_freq.items() if count != 1}

    idf_list = []
    os.makedirs(corpus_directory, exist_ok=True)
    # The context manager closes the file even if a write fails.
    with open(idf_dict_path, 'w', encoding='utf-8') as fw:
        for term, idf in idf_dict.items():
            if term != '\n':
                fw.write(term + ' ' + idf + '\n')
                idf_list.append({"word": term, "idf": idf})

    # Existing records are not removed first, so repeated runs append to the collection.
    idf_col = db_connect.connect_mongodb_col(db_name, col_name)
    # insert_many rejects an empty list (e.g. an empty corpus).
    if idf_list:
        idf_col.insert_many(idf_list)
    print("语料库生成完成")
Esempio n. 10
0
 def get_hot_doc(self):
     """
     Return the hot documents.

     :return: a one-element list holding the list of
         ``(title, theme, 1)`` tuples read from the ``hot_doc`` collection
     """
     hot_col = connect_mongodb_col(ChatBot.dbname, 'hot_doc')
     ranked = [(entry['title'], entry['theme'], 1)
               for entry in hot_col.find({}, {'_id': 0})]
     return [ranked]
Esempio n. 11
0
 def read_corpus_from_mongo(cls):
     """
     Read the question-classification training data from MongoDB.

     :return: list of ``{'content': ..., 'label': ...}`` dicts, where the
         label is the ``type`` field of the ``ques_classify`` record
     """
     records = connect_mongodb_col('chatbotdb', 'ques_classify').find({}, {'_id': 0})
     samples = [{'content': record['content'], 'label': record['type']}
                for record in records]
     print(samples)
     return samples
Esempio n. 12
0
def get_dict():
    """
    Decode the first stored user portrait into a dict.

    The ``portrait`` field has the form ``word::weight--word::weight--``;
    chunks that do not split into exactly two parts are ignored.

    :return: dict mapping keyword -> float weight; an empty dict when the
        ``user_portrait`` collection has no documents
    """
    col = connect_mongodb_col('chatbotdb', 'user_portrait')
    # find_one fetches only the first document instead of loading them all.
    record = col.find_one({}, {'_id': 0})
    print(record)
    if record is None:
        return {}
    portrait = {}
    for chunk in record['portrait'].split("--"):
        pair = chunk.split("::")
        if len(pair) == 2:
            portrait[pair[0]] = float(pair[1])
    print(portrait)
    return portrait
def extract_addword():
    """
    Export the additional-word list to ``corpus/addword.txt``.

    Every word of the ``idf_dict`` collection whose IDF is below 1 and whose
    length exceeds one character is written on its own line.

    :return: None
    """
    col = db_connect.connect_mongodb_col('chatbotdb', 'idf_dict')
    # The context manager closes the file even if reading a record fails.
    with open('corpus/addword.txt', 'w', encoding='utf-8') as f:
        for item in col.find({}, {'_id': 0}):
            if float(item['idf']) < 1 and len(item['word']) > 1:
                f.write(item['word'] + '\n')
Esempio n. 14
0
def get_title_testing_list(theme='linux'):
    """
    Build the basic testing list in which every title is its own expected result.

    :param theme: theme whose ``<theme>_doc`` collection is read
    :return: list of ``{'query': title, 'docs': [title]}`` dicts
    """
    documents = connect_mongodb_col('chatbotdb', theme + '_doc').find({}, {'title': 1})
    return [{'query': doc['title'], 'docs': [doc['title']]} for doc in documents]
Esempio n. 15
0
def get_testing_set():
    """
    Load the testing set from the ``linux_test_2`` collection.

    :return: dict mapping each question to the list obtained by splitting
        its ``answer`` field on commas
    """
    records = connect_mongodb_col('chatbotdb', 'linux_test_2').find({}, {'_id': 0})
    return {record['question']: record['answer'].split(',') for record in records}
Esempio n. 16
0
 def extract_addword(self):
     """
     Generate the additional-word list from all themes.

     For every themebot record the theme name concatenated with its
     ``types`` string is passed on to ``produce_addwordlist``.

     :return: None
     """
     records = connect_mongodb_col(ChatBot.dbname, ChatBot.themebot_col_name).find({}, {'_id': 0})
     theme_texts = [record['theme'] + record['types'] for record in records]
     produce_addwordlist(theme_texts)
     print('附加词生成成功!')
Esempio n. 17
0
def reset_theme_table(theme):
    """
    Overwrite the statistics of every document of ``<theme>_doc`` with random values.

    Per document: ``positive`` in 20..100, ``negative`` in 10..100,
    ``access`` in (positive + negative)..500, ``recommended`` in
    (2 * access)..(10 * access), and ``update_time`` from ``random_time()``.

    :param theme: theme name used as the collection-name prefix
    :return: None
    """
    # The third argument (False) is passed straight to connect_mongodb_col;
    # its meaning is defined by that helper.
    col = connect_mongodb_col('chatbotdb', theme + '_doc', False)
    # Only the _id is needed to address each document.
    for item in col.find({}, {'_id': 1}):
        pos = random.randint(20, 100)
        neg = random.randint(10, 100)
        acc = random.randint(pos + neg, 500)
        com = random.randint(acc * 2, acc * 10)
        # update_one replaces the deprecated Collection.update.
        col.update_one({'_id': item['_id']},
                       {"$set": {"positive": pos, "negative": neg,
                                 "access": acc, 'recommended': com,
                                 'update_time': random_time()}})
Esempio n. 18
0
 def extract_dict(self):
     """
     Read the documents of every theme and compute the tf-idf dictionary.

     The lower-cased title and content of each document are concatenated
     into one corpus entry, and the corpus is handed to ``extract_tf_idf``.

     :return: None
     """
     print('开始分词并计算tf-idf值')
     row_corpus = []
     for theme in self.themes:
         documents = connect_mongodb_col(ChatBot.dbname, theme + '_doc').find(
             {}, {'title': 1, 'content': 1})
         row_corpus.extend(doc['title'].lower() + doc['content'].lower()
                           for doc in documents)
     extract_tf_idf(row_corpus)
     print('成功获得字典')
Esempio n. 19
0
def read_themes():
    """
    Read all themes from the database.

    :return: dict mapping each theme name to its 1-based position in the
        result sorted by ``theme``
    """
    cursor = connect_mongodb_col('chatbotdb', 'themebot').find({}, {'theme': 1}).sort('theme')
    return {record['theme']: position
            for position, record in enumerate(cursor, start=1)}
Esempio n. 20
0
def tan_xing(theme):
    """
    Import the raw ``<theme>`` collection into ``<theme>_doc`` and register the theme.

    Records with an empty ``body`` or an already seen ``question`` are
    skipped. If the ``themebot`` collection has no record for the theme, one
    is inserted with the comma-terminated list of the document types.

    :param theme: name of the raw collection and prefix of the target collection
    :return: None
    """
    raw_col = connect_mongodb_col('chatbotdb', theme)
    doc_col = connect_mongodb_col('chatbotdb', theme + '_doc')
    documents = []
    type_set = set()
    title_set = set()
    for item in raw_col.find({}, {'_id': 0}):
        if len(item['body']) == 0 or item['question'] in title_set:
            continue
        doc_type = item['application'] + '-' + item['catalog']
        documents.append(
            {'title': item['question'], 'type': doc_type, 'path': 'null',
             'content': item['body'], 'access': 0})
        title_set.add(item['question'])
        type_set.add(doc_type)
    # insert_many rejects an empty list.
    if documents:
        doc_col.insert_many(documents)
    # Sorted so the stored string is reproducible; every type keeps its trailing comma.
    types = ''.join(doc_type + ',' for doc_type in sorted(type_set))
    theme_col = connect_mongodb_col('chatbotdb', 'themebot')
    if theme_col.find_one({'theme': theme}) is None:
        # insert_one replaces the deprecated Collection.insert.
        theme_col.insert_one({'theme': theme, 'types': types,
                              'document_table': theme + '_doc',
                              'document_mat': theme + '_mat'})
Esempio n. 21
0
def simulated_historical_record(user='******', theme='弹性计算'):
    """
    Simulate random history records for one user and insert them into the database.

    :param user: user name stored in every generated record
    :param theme: theme whose ``<theme>_doc`` collection supplies the titles
    :return: None
    """
    database = 'chatbotdb'
    titles = [doc['title'] for doc in
              connect_mongodb_col(database, theme + '_doc').find({}, {'title': 1})]
    record = random_records(titles, random.randint(5, 25))
    history_col = connect_mongodb_col(database, 'history')
    rows = []  # documents inserted in one batch
    for stamp in record.keys():
        roll = random.randint(0, 100)
        # <5 -> 0 (unsatisfied), <85 -> 1 (neutral), otherwise 2 (satisfied)
        preference = 0 if roll < 5 else (1 if roll < 85 else 2)
        rows.append({
            'user': user,
            'time': stamp,
            'theme': theme,
            'title': record[stamp],
            'type': 'documentation',
            'preference': preference,
        })
    history_col.insert_many(rows)
Esempio n. 22
0
 def similar_recommanded(self, user, recommended_num=20):
     """
     Recommend documents similar to what the user viewed recently.

     The user's 50 most recent history records are weighted with
     ``exponential_decay`` of their age in days and summed per theme. Each
     theme receives a share of ``recommended_num`` proportional to its
     weight, and the theme's bot produces that many recommendations from
     the theme's records.

     :param user: user name used to filter the ``history`` collection
     :param recommended_num: total number of recommendations to distribute
     :return: list with one ``historical_recommanded`` result per
         contributing theme; the hot documents when the user has no history
         or no record belongs to a known theme
     """
     col = connect_mongodb_col(ChatBot.dbname, 'history')
     history = list(col.find({'user': user}, {'_id': 0, 'user': 0}).sort('time', -1).limit(50))
     # Without any history fall back to the hot documents.
     if not history:
         return self.get_hot_doc()
     # Accumulated decay weight and matching records, per known theme
     weight_dict = {theme: 0 for theme in self.themes}
     record_dict = {theme: [] for theme in self.themes}
     now = time.time()
     day_value = 86400  # seconds per day
     for item in history:
         timeArray = time.strptime(item['time'], "%Y-%m-%d %H:%M:%S")
         timeStamp = int(time.mktime(timeArray))
         diff_value = math.floor((now - timeStamp) / day_value)
         decay = exponential_decay(diff_value)
         item['decay'] = decay
         theme = item.get('theme')
         # Records without a theme or with an unknown theme are skipped.
         if theme not in weight_dict:
             continue
         weight_dict[theme] += decay
         record_dict[theme].append(item)
     theme_list = sorted(weight_dict, key=weight_dict.get, reverse=True)
     for theme in record_dict:
         print(record_dict[theme])
     weight_sum = sum(weight_dict[theme] for theme in theme_list)
     # No usable record: avoid dividing by zero and use the same fallback as above.
     if not weight_sum:
         return self.get_hot_doc()
     recommended_list = []
     for theme in theme_list:
         record_num = math.floor(recommended_num * weight_dict[theme] / weight_sum)
         if record_num > 0:
             recommended_list.append(
                 self.themebots[theme].historical_recommanded(record_dict[theme], record_num))
     return recommended_list
Esempio n. 23
0
 def get_user_portrait(self, user):
     """
     Return the portrait of a user.

     When no portrait is stored for the user it is computed with
     ``cal_user_portrait``; otherwise the stored
     ``word::weight--word::weight--`` string of the first record is decoded.

     :param user: user name to look up in the ``user_portrait`` collection
     :return: dict mapping keyword -> weight
     """
     portrait_col = connect_mongodb_col(ChatBot.dbname, 'user_portrait')
     result = list(portrait_col.find({'user': user}, {'_id': 0}))
     print(result)
     if not result:
         return self.cal_user_portrait(user)
     # Chunks that do not split into exactly two parts are ignored.
     pairs = (chunk.split("::") for chunk in result[0]['portrait'].split("--"))
     return {pair[0]: float(pair[1]) for pair in pairs if len(pair) == 2}
Esempio n. 24
0
 def read_question(self, reload=False, num=0):
     """
     Return the question records, loading them from the database on demand.

     :param reload: when True the records are read from the database again
     :param num: number of records to return; 0 returns all of them
     :return: list of records sorted by ``question``
     """
     # Load on first use or on explicit request.
     if self.questions is None or reload is True:
         cursor = connect_mongodb_col(ThemeQuesBot.db_name, self.question_table).find(
             {}, {'question': 1, 'answer': 1}).sort('question')
         self.questions = list(cursor)
     return self.questions if num == 0 else self.questions[:num]
Esempio n. 25
0
 def load_properties(self):
     """
     Read the basic properties of the ThemeQuesBot from the ``themebot`` collection.

     Sets ``types``, ``key_words``, ``question_table`` and the tfidf / lsi /
     index / dictionary file locations below
     ``<root_directory>/<theme>/``. A message is printed and nothing is set
     when ``self.theme`` is None or no record exists for the theme.

     :return: None
     """
     if self.theme is None:
         print('Theme 属性值为空,载入属性失败!')
         return
     data = connect_mongodb_col('chatbotdb', 'themebot').find_one(
         {'theme': self.theme}, {'_id': 0})
     # find_one returns None when the theme has no record.
     if data is None:
         print('未找到 Theme 对应的记录,载入属性失败!')
         return
     self.types = data['types'].split(',')
     # Key words come from the theme name concatenated with its types.
     self.key_words = extract_key_words(self.theme.lower() +
                                        data['types'].lower())
     self.question_table = self.theme + '_ques'
     path = ThemeQuesBot.root_directory + '/' + self.theme + '/'
     self.tfidf_location = path + self.theme + '_tfidf'
     self.lsi_location = path + self.theme + '_lsi'
     self.index_location = path + self.theme + '_index'
     self.dictionary_location = path + self.theme + '_dict'
     print('ThemeQuesBot属性载入成功')
Esempio n. 26
0
 def get_dict_id2idf(self, dictionary):
     """
     Look up the IDF value of every token of a gensim dictionary.

     Tokens missing from the IDF collection get the value 1.0.

     :param dictionary: object exposing ``token2id`` (token -> token id)
     :return: tuple ``(idf_dict, id_idf)``: token -> idf and token id -> idf
     """
     col = connect_mongodb_col(ThemeQuesBot.db_name,
                               ThemeQuesBot.idf_col_name)
     idf_dict = {}
     id_idf = {}
     # Key id_idf by the real token id rather than by a running counter,
     # which is only correct when the ids follow the iteration order.
     for word, token_id in dictionary.token2id.items():
         found = col.find_one({"word": word}, {"_id": 0, "word": 1, "idf": 1})
         idf = float(1) if found is None else float(found["idf"])
         idf_dict[word] = idf
         id_idf[token_id] = idf
     return idf_dict, id_idf
Esempio n. 27
0
def get_themes():
    """
    Return the theme names stored in the ``themebot`` collection.

    :return: list of theme names in the order the collection yields them
    """
    names = []
    for record in connect_mongodb_col('chatbotdb', 'themebot').find({}, {'theme': 1}):
        names.append(record['theme'])
    return names