def run_offline_paper():
    """Precompute item-to-item recommendations: for every paper, search Milvus
    for the nearest doc vectors and store the neighbour ids in offline_paper."""
    client = Milvus(host=milvus_ip, port='19530')
    cur.execute("SELECT ID, doc_vector FROM paper")
    papers = cur.fetchall()
    for i in papers:
        try:
            id = i[0]
            vec = [float(j) for j in i[1].split(",")]
            # top_k=51 because the nearest hit is the paper itself.
            res = client.search(collection_name='ideaman', query_records=[vec], top_k=51)
            status = res[0].code
            if status == 0:
                topKqueryResult = [str(j) for j in res[-1]._id_array[0]]
                paper_vecs = ",".join(topKqueryResult[1:])  # drop the paper itself
                sql = 'INSERT INTO offline_paper(paper_id, recs) VALUES({}, "{}")'.format(id, paper_vecs)
                cur.execute(sql)
                try:
                    conn.commit()
                except Exception:
                    conn.rollback()
        except Exception:
            pass  # skip papers with malformed vectors or failed searches
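# The functions in this module rely on module-level globals that this section
# does not define. A minimal sketch of the assumed setup follows; the hosts,
# credentials, and interest_tags pool are placeholders, not the project's
# actual configuration.
import random

import pymysql
from milvus import Milvus  # pymilvus 1.x style client
from transformers import BertConfig, BertModel, BertTokenizer

milvus_ip = '127.0.0.1'  # assumed Milvus host
conn = pymysql.connect(host='127.0.0.1', user='root', password='root',
                       database='ideaman', charset='utf8mb4')  # assumed MySQL setup
cur = conn.cursor()
interest_tags = [str(i) for i in range(1, 21)]  # assumed pool of tag ids
id2tags = {}  # user id -> interest tags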
def get_user(u_id):
    """
    Generate a user from u_id. If the intersection of all the user's interest
    tags matches few or no papers, regenerate the user.
    :param u_id: user id
    :return: name: username
    :return: tags: tags the user is interested in, as a list
    :return: tag: tags the user is interested in, as a string such as "1,2,3"
    :return: id: the user's id in the `user` table
    :return: SQL_2: query that fetches papers matching the user's interest tags
    """
    # Create the user.
    name = "tmp_%05d" % u_id
    tags = random.sample(interest_tags, random.randint(1, 3))
    tag = ",".join(tags)
    id = u_id + 7015  # offset into the `user` table's id range
    id2tags[id] = tag
    # 2. Each user clicks a random 2-8 papers that match all of the user's tags.
    conditions = []
    for t_id in tags:
        conditions.append('FIND_IN_SET("{}",tags)'.format(t_id))
    condition = " AND ".join(conditions)
    SQL_2 = ('SELECT id FROM paper WHERE ' + condition +
             ' ORDER BY RAND() LIMIT {}'.format(random.randint(2, 8)))
    cur.execute(SQL_2)
    if len(cur.fetchall()) <= 4:
        return get_user(u_id)  # too few matching papers: regenerate
    else:
        return name, tags, tag, id, SQL_2
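# Illustration only: for tags = ["1", "3"] the intersection query built in
# get_user renders as
#     SELECT id FROM paper
#     WHERE FIND_IN_SET("1",tags) AND FIND_IN_SET("3",tags)
#     ORDER BY RAND() LIMIT 5
# (the LIMIT value is random in [2, 8]). A paper matches only if it carries
# every sampled tag, which is why users whose tag combination rarely
# co-occurs in the paper table are regenerated.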
def get_total_recall():
    """Print and return the precomputed recall lists of the first 20 users."""
    sql = "SELECT user_id, total_recall FROM user_rec LIMIT 20;"
    cur.execute(sql)
    dic = {}
    res = cur.fetchall()
    for i in res:
        dic[i[0]] = i[1]
    for i in dic:
        print(i)
    return dic
def click(user_id, paper_id, flag=1):
    """
    Write a click event to MySQL.
    :param user_id: user id
    :param paper_id: id of the clicked paper
    :param flag: event type, 1 = positive click, 0 = negative sample
    :return: None
    """
    SQL = 'INSERT INTO click_log(u_id,item_id,event_type) VALUES("{}","{}","{}")'.format(user_id, paper_id, flag)
    try:
        cur.execute(SQL)
        conn.commit()
    except Exception:
        conn.rollback()
def gen_UIR():
    # Write positive samples as user,item,rating,timestamp rows.
    cur.execute("SELECT u_id,item_id,event_type,add_time FROM click_log WHERE event_type = 1")
    res = cur.fetchall()
    file = open("dataset/UIR.csv", "w", encoding='utf-8')
    for item in res:
        uid = item[0]
        itemid = item[1]
        rating = item[2]
        timestamp = item[3].timestamp()
        file.write(",".join([str(i) for i in [uid, itemid, rating, timestamp]]) + "\n")
    file.close()
    # Write negative samples: one line per user, item ids separated by spaces.
    cur.execute("""SELECT u_id,
                   GROUP_CONCAT(DISTINCT item_id ORDER BY item_id ASC SEPARATOR ' ') AS neg
                   FROM click_log WHERE event_type = 0
                   GROUP BY u_id ORDER BY LENGTH(neg) DESC""")
    res = cur.fetchall()
    file = open("./dataset/UIR_negative.csv", "w", encoding='utf-8')
    for item in res:
        uid = item[0]
        negs = item[1]
        file.write(" ".join([str(i) for i in [uid, negs]]) + "\n")
    file.close()
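# A minimal sketch of consuming the generated files, assuming pandas is
# available; load_UIR and its column names are illustrative, not part of
# the project.
def load_UIR():
    import pandas as pd
    ratings = pd.read_csv('dataset/UIR.csv', header=None,
                          names=['user', 'item', 'rating', 'timestamp'])
    negatives = {}
    with open('dataset/UIR_negative.csv', encoding='utf-8') as f:
        for row in f:
            uid, negs = row.rstrip('\n').split(' ', 1)
            negatives[int(uid)] = [int(i) for i in negs.split()]
    return ratings, negatives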
def main(user_number=8000):
    """
    Generate user click data for papers:
    1. Create the given number of users, each with 1-3 random interest tags,
       and set the deleted flag to 1.
    2. For each user, click a random 2-8 papers that match the user's tags.
    3. Pick users at random and simulate user_number * 10 clicks (about 10 per
       user): 55% positive clicks on papers matching an interest tag, 35%
       negative samples on papers outside an interest tag, and 10% random
       clicks on any paper.
    """
    # 1. Create the users with interest tags; deleted flag set to 1.
    for u_id in range(1, user_number + 1):
        name, tags, tag, id, SQL_2 = get_user(u_id)
        cur.execute(SQL_2)
        paper_list = list(cur.fetchall())
        SQL_1 = 'INSERT INTO `user`(username,deleted,interest_tags) VALUES("{}", 1,"{}")'.format(name, tag)
        cur.execute(SQL_1)
        conn.commit()
        for paper_id in paper_list:
            click(id, paper_id[0])
        if u_id % 100 == 0:
            print("Users created:", u_id)
    SQL_5 = "SELECT id,interest_tags FROM `user` WHERE id >= 10;"
    cur.execute(SQL_5)
    for u_item in cur.fetchall():
        id2tags[u_item[0]] = u_item[1].split(",")
    # 3. Simulate clicks by randomly chosen users.
    for num in range(user_number * 10):
        id = random.sample(list(id2tags.keys()), 1)[0]
        rand_num = random.random()
        if rand_num < 0.55:
            # 55%: positive sample from one of the user's interest tags.
            t_id = random.sample(id2tags.get(id), 1)[0]
            SQL_3 = 'SELECT id FROM paper WHERE FIND_IN_SET("{}",tags) ORDER BY RAND() LIMIT {}'.format(t_id, 1)
            cur.execute(SQL_3)
            paper_item = cur.fetchall()[0]
            paper_id = paper_item[0]
            click(id, paper_id)
        elif 0.55 <= rand_num <= 0.9:
            # 35%: negative sample from outside the chosen interest tag.
            t_id = random.sample(id2tags.get(id), 1)[0]
            SQL_3 = 'SELECT id FROM paper WHERE NOT FIND_IN_SET("{}",tags) ORDER BY RAND() LIMIT {}'.format(t_id, 1)
            cur.execute(SQL_3)
            paper_item = cur.fetchall()[0]
            paper_id = paper_item[0]
            click(id, paper_id, 0)
        else:
            # 10%: random click on any paper.
            SQL_4 = 'SELECT id FROM paper ORDER BY RAND() LIMIT 1'
            cur.execute(SQL_4)
            paper_item = cur.fetchall()[0]
            paper_id = paper_item[0]
            click(id, paper_id)
        if num % 1000 == 0:
            print("Clicks generated: %d" % num)
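# Assumed entry point for the data-generation script; not shown in the
# original section.
if __name__ == '__main__':
    main(user_number=8000)
    conn.close()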
def getPredictData(user_id: int, paper_id: int):
    # Padding lengths for each feature field.
    tags_pad, author_pad, doc2vec_pad = 16, 100, 160
    user_interest_tags_pad = 10
    model_name = 'bert-base-uncased'
    MODEL_PATH = './bert-base-uncased/'
    # Load the tokenizer from the vocabulary.
    tokenizer = BertTokenizer.from_pretrained(model_name)
    # Load the model configuration.
    model_config = BertConfig.from_pretrained(model_name)
    # Load the model from the local path with that configuration.
    bert_model = BertModel.from_pretrained(MODEL_PATH, config=model_config)
    # Fetch the user's data.
    cur.execute("SELECT id, interest_tags FROM `user` WHERE id = %d" % (user_id,))
    res = cur.fetchone()
    userid = [str(res[0])]
    user_interest_tags = str(res[1]).split(",")
    user_interest_tags += ['0'] * (user_interest_tags_pad - len(user_interest_tags))
    # Fetch the paper's data.
    cur.execute("""
        SELECT id, title, description, tags, AUTHORS, doc_vector
        FROM paper
        WHERE pwc_tasks <> ''
          AND doc_vector IS NOT NULL
          AND `authors` NOT LIKE '%one%'
          AND tags NOT LIKE '%one%'
          AND id = {}
        """.format(paper_id))
    item = cur.fetchone()
    paper_id = [str(item[0])]
    title = item[1]
    description = item[2]
    tags = item[3]
    authors = item[4]
    doc2vec = item[5]
    # Encode the title with BERT (pooler output).
    encoded_input = tokenizer(title, return_tensors='pt')
    title = bert_model(**encoded_input)['pooler_output'].tolist()[0]
    title = [str(i) for i in title]
    # Encode the description with BERT.
    encoded_input = tokenizer(description, return_tensors='pt')
    description = bert_model(**encoded_input)['pooler_output'].tolist()[0]
    description = [str(i) for i in description]
    # Map tags to indices and pad.
    tags = [add2Map(tags2idx, i) for i in tags.split(",")]
    tags += ['0'] * (tags_pad - len(tags))
    # Map authors to indices and pad.
    authors = [add2Map(authors2idx, i) for i in authors.split(",")]
    authors += ['0'] * (author_pad - len(authors))
    doc2vec = doc2vec.split(",")
    doc2vec += ['0'] * (doc2vec_pad - len(doc2vec))
    line = userid + user_interest_tags + paper_id + title + description + tags + authors + doc2vec
    line = [float(i) for i in line]  # every field is a numeric string
    return line
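# A minimal sketch of feeding the feature line to a ranking model, assuming
# PyTorch; predict_click and the model interface are placeholders, not the
# project's actual serving code.
def predict_click(model, user_id: int, paper_id: int):
    import torch
    # getPredictData returns one flat numeric feature vector for a
    # (user, paper) pair; wrap it in a batch dimension of 1.
    features = torch.tensor([getPredictData(user_id, paper_id)],
                            dtype=torch.float32)
    return model(features)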
def gen_Tensor():
    """
    Step 1: download the data.
    1. Pull data from MySQL on the neighbouring host, in the format:
       userid, user_interest_tags, paper_id, title, description, tags,
       authors, doc_vector, add_time, label.
    2. Commas inside the raw text have been replaced with |.
    3. Map authors to integer indices and pad.
    4. Map tags to integer indices and pad.
    5. Map pwc_tasks to integer indices and pad.
    """
    model_name = 'bert-base-uncased'
    MODEL_PATH = './bert-base-uncased/'
    # Load the tokenizer, the configuration, and the model.
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model_config = BertConfig.from_pretrained(model_name)
    bert_model = BertModel.from_pretrained(MODEL_PATH, config=model_config)
    sql = """SELECT log.u_id AS userid,
                    `user`.interest_tags AS user_interest_tags,
                    paper.id AS paperid,
                    paper.title AS title,
                    paper.description AS description,
                    paper.tags AS tags,
                    paper.`AUTHORS` AS `authors`,
                    paper.doc_vector AS doc_vector,
                    log.add_time AS add_time,
                    log.event_type AS label
             FROM (SELECT u_id, item_id, event_type, add_time
                   FROM click_log WHERE u_id <= 10000) AS log
             INNER JOIN (SELECT id, interest_tags FROM `user`) AS `user`
             INNER JOIN (SELECT id, title, description, tags, AUTHORS, doc_vector
                         FROM paper
                         WHERE pwc_tasks <> ''
                           AND doc_vector IS NOT NULL
                           AND `authors` NOT LIKE '%one%'
                           AND tags NOT LIKE '%one%') AS paper
             ON log.u_id = `user`.id AND log.item_id = paper.id"""
    cur.execute(sql)
    res = cur.fetchall()
    tags2idx, authors2idx, pwc_tasks2idx = {}, {}, {}
    tags_pad, author_pad, doc2vec_pad = 16, 100, 160
    user_interest_tags_pad = 10
    f = open('dataset/tensor.csv', 'w', encoding='utf-8')
    for index, item in enumerate(res):
        userid = [str(item[0])]
        user_interest_tags = str(item[1]).split(",")
        user_interest_tags += ['0'] * (user_interest_tags_pad - len(user_interest_tags))
        paper_id = [str(item[2])]
        title = item[3]
        description = item[4]
        tags = item[5]
        authors = item[6]
        doc2vec = item[7]
        label = [str(item[9])]
        # Encode the title with BERT (pooler output).
        encoded_input = tokenizer(title, return_tensors='pt')
        title = bert_model(**encoded_input)['pooler_output'].tolist()[0]
        title = [str(i) for i in title]
        # Encode the description with BERT.
        encoded_input = tokenizer(description, return_tensors='pt')
        description = bert_model(**encoded_input)['pooler_output'].tolist()[0]
        description = [str(i) for i in description]
        # Map tags to indices and pad.
        tags = [add2Map(tags2idx, i) for i in tags.split(",")]
        tags += ['0'] * (tags_pad - len(tags))
        # Map authors to indices and pad.
        authors = [add2Map(authors2idx, i) for i in authors.split(",")]
        authors += ['0'] * (author_pad - len(authors))
        doc2vec = doc2vec.split(",")
        doc2vec += ['0'] * (doc2vec_pad - len(doc2vec))
        line = ",".join(userid + user_interest_tags + paper_id + title +
                        description + tags + authors + doc2vec + label)
        f.write(line + '\n')
    f.close()
    # Persist the index maps for reuse at prediction time.
    map2file(authors2idx, 'dataset/authors2idx')
    map2file(tags2idx, 'dataset/tags2idx')
    map2file(pwc_tasks2idx, 'dataset/pwc_task2idx')
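# add2Map and map2file are referenced above but not defined in this section.
# A minimal sketch of their assumed behaviour: add2Map assigns the next free
# index to an unseen key and returns it as a string ('0' stays reserved for
# padding), and map2file persists a mapping so getPredictData can reuse the
# same indices at prediction time.
def add2Map(mapping: dict, key: str) -> str:
    if key not in mapping:
        mapping[key] = str(len(mapping) + 1)  # index 0 is reserved for padding
    return mapping[key]


def map2file(mapping: dict, path: str):
    with open(path, 'w', encoding='utf-8') as f:
        for key, idx in mapping.items():
            f.write('{},{}\n'.format(key, idx))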