def run_Ap_single_algrothm(self):
    """Run affinity-propagation clustering for single-track recommendation.

    Reads the user index and the combined (common + artist) tag index,
    builds the input matrix with ``self.user_tag_matrix_fun``, clusters it
    with ``self.AP_algrothm`` and stores the three results with ``np.save``
    under ``self.media_ap_centers``, ``self.media_ap_labels`` and
    ``self.media_ap_indices``.

    :return: cluster_centers_, labels_, cluster_centers_indices
    """
    settings = ReadProperties("data/app.properties")
    users_path = settings.get("userId_index_path")
    # Index of common + artist tags.
    tags_path = settings.get("common_artist_tag_index_path")
    # User/tag scores that include artist tags (single-track recommendation).
    scores_path = settings.get("user_common_artist_tag_score_path")

    users = self.file_read_fun(users_path, [])
    tags = self.file_read_fun(tags_path, [])
    matrix = self.user_tag_matrix_fun(users, tags, scores_path)

    centers, labels, center_indices = self.AP_algrothm(matrix)
    self.logger.log().info("ap cluster centers nums %s" % centers.shape[0])

    # Persist every result next to its configured target path.
    for target, value in ((self.media_ap_centers, centers),
                          (self.media_ap_labels, labels),
                          (self.media_ap_indices, center_indices)):
        np.save(target, value)
    return centers, labels, center_indices
def run_media_tag_algrothm(self):
    """Cluster the media/tag matrix and persist the centres and labels.

    Reads the media index and the common-tag index, builds the input matrix
    with ``self.media_tag_matrix_fun`` and clusters it through
    ``self.k_Means_algrothm``. Only the centres and the labels are saved
    (to ``self.media_relation_ap_centers`` / ``self.media_relation_ap_labels``);
    the centre indices are returned but not written to disk.

    :return: cluster_centers_, labels_, cluster_centers_indices
    """
    settings = ReadProperties("data/app.properties")
    media_index_path = settings.get("all_mediaId_index_path")
    tag_index_path = settings.get("commonTag_index_path")
    relation_path = settings.get("mediaId_common_tag_path")

    medias = self.file_read_fun(media_index_path, [])
    tags = self.file_read_fun(tag_index_path, [])
    matrix = self.media_tag_matrix_fun(medias, tags, relation_path)

    centers, labels, center_indices = self.k_Means_algrothm(matrix)
    # The log text still says "ap" although k-means is used; kept verbatim.
    self.logger.log().info("ap cluster centers nums %s" % centers.shape[0])

    np.save(self.media_relation_ap_centers, centers)
    np.save(self.media_relation_ap_labels, labels)
    return centers, labels, center_indices
def __init__(self):
    """Load the application properties, create the logger and cache DB settings.

    The database values are stored exactly as returned by the properties
    reader (the port is not converted to ``int`` here).
    """
    settings = ReadProperties("data/app.properties")
    self.prop = settings
    self.logger = LoggingUtil("data/logs/")
    # attribute name -> property key
    for attribute, key in (("host", "db_host"),
                           ("port", "db_port"),
                           ("db", "db"),
                           ("user_name", "user_name"),
                           ("password", "password")):
        setattr(self, attribute, settings.get(key))
def run_Ap_songList_algrothm(self):
    """Run affinity-propagation clustering for song-list / topic recommendation.

    Reads the user index and the common-tag index (no artist tags), builds
    the input matrix with ``self.user_tag_matrix_fun``, clusters it with
    ``self.AP_algrothm`` and stores the three results with ``np.save`` under
    ``self.mediaList_ap_centers``, ``self.mediaList_ap_labels`` and
    ``self.mediaList_ap_indices``.

    :return: cluster_centers_, labels_, cluster_centers_indices
    """
    settings = ReadProperties("data/app.properties")
    users_path = settings.get("userId_index_path")
    common_tags_path = settings.get("commonTag_index_path")
    # User/tag scores without artist tags (song-list and topic recommendation).
    scores_path = settings.get("user_common_tag_score_path")

    users = self.file_read_fun(users_path, [])
    common_tags = self.file_read_fun(common_tags_path, [])
    matrix = self.user_tag_matrix_fun(users, common_tags, scores_path)

    log = self.logger.log
    log().info("song_list ap start...")
    centers, labels, center_indices = self.AP_algrothm(matrix)
    log().info("song_list ap finished!!!")
    log().info("ap cluster centers nums: %s " % centers.shape[0])
    log().info("ap finished!!!")

    np.save(self.mediaList_ap_centers, centers)
    np.save(self.mediaList_ap_labels, labels)
    np.save(self.mediaList_ap_indices, center_indices)

    # NOTE: a disabled matplotlib scatter plot of the clusters (members drawn
    # around their exemplar) used to live here for visual inspection.
    return centers, labels, center_indices
def k_Means_algrothm(self, array):
    """Cluster ``array`` by delegating to ``self.k_Means_cluster_func``.

    The properties file is no longer loaded here: the previous version read
    it into a local that was never used, costing a file read on every call.

    :param array: matrix to cluster, passed through unchanged as ``data``
    :return: cluster_centers_, labels_, cluster_centers_indices
    """
    cluster_centers_, labels_, cluster_centers_indices = self.k_Means_cluster_func(
        data=array)
    return cluster_centers_, labels_, cluster_centers_indices
def __init__(self):
    """Load configuration and initialise the empty item-based CF state."""
    settings = ReadProperties("data/app.properties")
    self.prop = settings
    self.logger = LoggingUtil("data/logs/")

    # Database connection parameters (the port is converted to int).
    self.host = settings.get("db_host")
    self.port = int(settings.get("db_port"))
    self.db = settings.get("db")
    self.user = settings.get("user_name")
    self.password = settings.get("password")

    # Train / test splits.
    self.trainset, self.testset = {}, {}

    # n_sim_media: size of the similar-media neighbourhood,
    # n_rec_media: number of recommendations per user.
    self.n_sim_media, self.n_rec_media = 20, 10

    # media_sim_mat: media-to-media similarity,
    # media_popular: occurrence count per media,
    # media_count: number of distinct medias.
    self.media_sim_mat, self.media_popular = {}, {}
    self.media_count = 0

    self.rec_media_dict, self.all_rec_medias = {}, {}

    log = self.logger.log()
    log.info('Similar media number = %d' % self.n_sim_media)
    self.logger.log().info('Recommended media number = %d' % self.n_rec_media)
def AP_algrothm(self, array):
    """Cluster ``array`` with ``self.affinityPropagation``.

    The damping factor, iteration limits and affinity are read from
    ``data/app.properties`` on every call; ``ap_copy`` is fixed to ``True``,
    ``ap_preference`` to ``None`` and ``ap_verbose`` to ``False``.

    :param array: matrix to cluster, passed as ``data``
    :return: cluster_centers_, labels_, cluster_centers_indices
    """
    settings = ReadProperties("data/app.properties")
    damping = float(settings.get("ap_damping"))
    max_iter = int(settings.get("ap_max_iter"))
    convergence_iter = int(settings.get("ap_convergence_iter"))
    affinity = settings.get("ap_affinity")

    centers, labels, center_indices = self.affinityPropagation(
        data=array,
        ap_damping=damping,
        ap_max_iter=max_iter,
        ap_convergence_iter=convergence_iter,
        ap_copy=True,
        ap_preference=None,
        ap_affinity=affinity,
        ap_verbose=False)
    return centers, labels, center_indices
def __init__(self, media_relation_ap_centers, media_relation_ap_labels):
    """Remember the output targets and set up logging and configuration.

    :param media_relation_ap_centers: target handed to ``np.save`` for the
        cluster centres
    :param media_relation_ap_labels: target handed to ``np.save`` for the
        cluster labels
    """
    (self.media_relation_ap_centers,
     self.media_relation_ap_labels) = (media_relation_ap_centers,
                                       media_relation_ap_labels)
    self.logger = LoggingUtil("data/logs/")
    self.prop = ReadProperties("data/app.properties")
self.logger.log().info("read column_index_file start...") column_index = self.file_read_fun(column_file, column_index) self.logger.log().info("read column_index_file finished!!!") array = self.generate_relation_mat(row_index, column_index, row_coloum_relation_file) self.logger.log().info("calculate similarity matrix start...") similarity_matrix_media = self.cluster_media_list_fun( cluster_centers, array) self.logger.log().info("calculate similarity matrix finished !!!") np.savetxt(mat_out_file, similarity_matrix_media) return similarity_matrix_media if __name__ == '__main__': sim_mat = Similarity_Matrix() prop = ReadProperties("data/app.properties") # 单曲相似度计算(应该是全部歌曲) sim_mat.cal_similarity_func( row_file=prop.get("all_mediaId_index_path"), column_file=prop.get("common_artist_tag_index_path"), row_coloum_relation_file=prop.get("mediaId_tag_path"), cluster_centers=np.load(prop.get("media_ap_centers") + ".npy"), mat_out_file=prop.get("media_similarity_matrix")) # 歌单相似度计算 sim_mat.cal_similarity_func( row_file=prop.get("mediaList_index_path"), column_file=prop.get("commonTag_index_path"), row_coloum_relation_file=prop.get("mediaList_tag_path"), cluster_centers=np.load(prop.get("media_list_ap_centers") + ".npy"), mat_out_file=prop.get("media_list_similarity_matrix"))
prop = ReadProperties("data/app.properties") # 用户Index media_index_file = prop.get("all_mediaId_index_path") # (commonTag artistTag)标签 tag_index_file = prop.get("commonTag_index_path") # 单曲推荐包含艺术家的用户标签评分 media_tag_relation_file = prop.get("mediaId_common_tag_path") media_list = [] tag_list = [] media_list = self.file_read_fun(media_index_file, media_list) # tag_list 包含所有的标签(commonTag artistTag):用于单曲推荐 tag_list = self.file_read_fun(tag_index_file, tag_list) # array 为单曲推荐的矩阵(commonTag artistTag) array = self.media_tag_matrix_fun(media_list, tag_list, media_tag_relation_file) # media tag score ap聚类 cluster_centers_, labels_, cluster_centers_indices = self.k_Means_algrothm( array) self.logger.log().info("ap cluster centers nums %s" % cluster_centers_.shape[0]) np.save(self.media_relation_ap_centers, cluster_centers_) np.save(self.media_relation_ap_labels, labels_) return cluster_centers_, labels_, cluster_centers_indices if __name__ == "__main__": prop = ReadProperties("data/app.properties") mediaTagAP = MediaLabelAP(prop.get("media_relation_ap_centers"), prop.get("media_relation_ap_labels")) mediaTagAP.run_media_tag_algrothm()
def main(self):
    """Create the working directories and download every input file.

    ``downloads`` lists, in execution order, the property naming the local
    destination, the property naming the HDFS source and an optional suffix
    appended to the HDFS path (``None`` means the configured path is used
    unchanged). Each entry is fetched with ``self.file_down``.
    """
    prop = ReadProperties("data/app.properties")
    date_path = datetime.today().strftime("%Y%m%d")

    # Log, input and output folders.
    for directory in ("data/logs/", "data/input/", "data/output"):
        self.makeDir(directory)

    downloads = (
        # tag index (common + artist tags)
        ("common_artist_tag_index_path", "fs_common_artist_tag_index", date_path),
        # user index
        ("userId_index_path", "fs_user_index", date_path),
        # song-list tags
        ("mediaList_tag_path", "fs_media_list_tag", None),
        # topic tags
        ("subject_tag_path", "fs_subject_tag", None),
        # common-tag index
        ("commonTag_index_path", "fs_common_tag_index", None),
        # user common-tag scores (song-list and topic recommendation)
        ("user_common_tag_score_path", "fs_user_common_tag_score", date_path),
        # user tag scores (single-track recommendation)
        ("user_common_artist_tag_score_path", "fs_user_common_artist_tag_score", date_path),
        # song-list index (index, mediaList_id)
        ("mediaList_index_path", "fs_media_list_index", None),
        # play scores per user (userId, mediaId, score)
        ("media_play_score_path", "fs_user_play_score", date_path + "/"),
        # topic index
        ("subject_index_path", "fs_subject_index", None),
        # mediaId index
        ("mediaId_index_path", "fs_mediaId_index", date_path),
        # mediaId tags
        ("mediaId_tag_path", "fs_mediaId_tag", date_path),
        # user/common-tag scores from plays only
        ("user_play_tag_score_path", "fs_user_play_tag_score_path", date_path),
        # user/common-tag scores from collections only
        ("user_collect_tag_score_path", "fs_user_collect_tag_score_path", date_path),
        # discretised user/common-tag play scores
        ("user_play_tag_disperse_score_path",
         "fs_user_play_tag_disperse_score_mat_list_subject_path", date_path),
        # index of currently active users (no historical collections)
        ("user_now_play_index_path", "fs_user_now_play_index_path", date_path),
        # user/(common + artist)-tag play scores for single tracks
        ("user_play_tag_score_single_mat_path",
         "fs_user_play_tag_score_single_mat_path", date_path),
        # index of all medias
        ("all_mediaId_index_path", "fs_all_mediaId_index_path", None),
        # common tags of all medias
        ("mediaId_common_tag_path", "fs_mediaId_common_tag_path", None),
        # one month of media play scores
        ("media_play_score_one_month_path", "fs_media_play_score_one_month_path", date_path),
    )

    for local_key, hdfs_key, suffix in downloads:
        local_path = prop.get(local_key)
        hdfs_path = prop.get(hdfs_key)
        if suffix is not None:
            hdfs_path = hdfs_path + suffix
        self.file_down(local_path=local_path, hdfs_path=hdfs_path)
class UserCenterTagFn(object):
    """Link every user cluster centre to its exemplar user's tag scores.

    ``main`` builds ``self.user_tag_dict`` and ``saveToSql`` writes it to the
    ``x_media__user_userCenter_tag`` table.
    """

    _INSERT_SQL = (
        "INSERT INTO x_media__user_userCenter_tag "
        "(user_center_id,user_id,tag,score,create_time,update_time) "
        "VALUES (%s,%s,%s,%s,%s,%s)"
    )

    def __init__(self):
        self.prop = ReadProperties("data/app.properties")
        self.logger = LoggingUtil("data/logs/")
        # {center_id: {user_id: [{tag: score, ...}]}}; filled by main().
        self.user_tag_dict = {}

    def main(self):
        """Build ``self.user_tag_dict`` from the saved cluster-centre indices.

        The ``media_ap_indices`` array holds, per cluster centre, a position
        into the user list read from ``user_now_play_index_path``. The user
        found there is looked up in the tag-score file; users without tag
        scores keep an empty list.
        """
        indices_file = self.prop.get("media_ap_indices") + ".npy"
        center_user_indices = np.load(indices_file)

        user_ids = self.file_read_fun_list(
            fileName=self.prop.get("user_now_play_index_path"), data=[])
        user_tags = self.file_read_fun_dict(
            fileName=self.prop.get("user_play_tag_score_single_mat_path"), data={})

        user_tag_dict = {}
        for center_id, user_index in enumerate(center_user_indices):
            user_id = user_ids[user_index]
            tag_maps = user_tag_dict.setdefault(center_id, {}).setdefault(user_id, [])
            if user_id in user_tags:
                tag_maps.append(user_tags[user_id])
        self.user_tag_dict = user_tag_dict

    def _build_rows(self):
        """Flatten ``self.user_tag_dict`` into (center_id, user_id, tag, score) tuples."""
        rows = []
        for center_id, per_user in self.user_tag_dict.items():
            for user_id, tag_maps in per_user.items():
                for tag_map in tag_maps:
                    for tag, score in tag_map.items():
                        rows.append((center_id, int(user_id), tag, float(score)))
        return rows

    def saveToSql(self):
        """Replace the table content with the rows derived from ``user_tag_dict``.

        Rows are taken straight from the dictionaries and sent as query
        parameters. The previous implementation converted the dicts to text,
        ``eval``-ed them back and split on commas, which prefixed every tag
        after the first with a space and broke on tags containing commas,
        quotes or brackets. Errors are logged, not raised (best effort, as
        before).
        """
        conn = None
        cur = None
        try:
            rows = self._build_rows()
            conn = pymysql.connect(host=self.prop.get("db_host"),
                                   port=int(self.prop.get("db_port")),
                                   db=self.prop.get("db"),
                                   user=self.prop.get("user_name"),
                                   password=self.prop.get("password"),
                                   charset='utf8')
            cur = conn.cursor()
            cur.execute("TRUNCATE x_media__user_userCenter_tag")
            if rows:
                now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                conn.ping(reconnect=True)
                cur.executemany(self._INSERT_SQL,
                                [row + (now, now) for row in rows])
                conn.commit()
            else:
                # Nothing to insert: avoid sending an INSERT without values.
                self.logger.log().info("no user center tag rows to insert")
        except Exception as e:
            # conn stays None when the connection attempt itself failed.
            if conn is not None:
                conn.rollback()
            print(e)
            self.logger.log().error(" insert error message : %s" % e)
        finally:
            self.logger.log().info("close streaming finished!!!")
            if cur is not None:
                cur.close()
            if conn is not None:
                conn.close()

    def file_read_fun_dict(self, fileName, data):
        """Read a tab-separated ``key<TAB>sub_key<TAB>score`` file.

        Scores of repeated (key, sub_key) pairs are summed. Blank lines are
        skipped.

        :param fileName: path of the file to read
        :param data: returned unchanged when the file does not exist
        :return: ``{key: {sub_key: summed_score}}`` built from the file, or
            ``data`` when the file is missing
        """
        if not os.path.exists(fileName):
            self.logger.log().error("%s is not exists" % fileName)
            return data

        scores = {}
        with open(fileName, encoding="utf-8") as f:
            for line in f:
                line = line.rstrip("\n")
                if not line:
                    continue
                row = line.split("\t")
                per_key = scores.setdefault(row[0], {})
                per_key[row[1]] = per_key.get(row[1], 0) + float(row[2])
        return scores

    def file_read_fun_list(self, fileName, data):
        """Append the second tab-separated column of every line to ``data``.

        Blank lines are skipped.

        :param fileName: path of the file to read
        :param data: list that receives the values (mutated in place)
        :return: ``data``
        """
        if not os.path.exists(fileName):
            self.logger.log().error("%s is not exists" % fileName)
            return data

        with open(fileName, encoding="utf-8") as f:
            for line in f:
                line = line.rstrip("\n")
                if not line:
                    continue
                data.append(line.split("\t")[1])
        return data

    def dict_key_value(self, data):
        """Return the keys and values of ``data`` as two flattened strings.

        Kept for backward compatibility; ``saveToSql`` no longer uses it
        because the textual form loses information (brackets, parentheses and,
        for keys, quotes are stripped, and items are joined with ``", "``).

        :param data: dictionary to flatten
        :return: (keys_text, values_text)
        """
        brackets = str.maketrans("", "", "()[]")
        k = str(data.keys()).replace("dict_keys", "").translate(brackets).replace("'", '')
        v = str(data.values()).replace("dict_values", "").translate(brackets)
        return k, v
def __init__(self):
    """Load the application properties, create the logger and start with no data."""
    self.prop, self.logger = (ReadProperties("data/app.properties"),
                              LoggingUtil("data/logs/"))
    # Starts empty; nothing in this initialiser populates it.
    self.user_tag_dict = dict()
conn.ping(reconnect=True) cur.execute(sql.rstrip(",")) conn.commit() else: break except Exception as e: self.logger.log().error("insert error %s" % e) finally: conn.close() cur.close() self.logger.log().info("close streaming...") if __name__ == '__main__': # ratingfile = 'data/16.RecommenderSystems/ml-1m/ratings.dat' prop = ReadProperties("data/app.properties") ratingfile = prop.get("media_play_score_one_month_path") # 创建ItemCF对象 itemcf = ItemBasedCF() # 将数据按照 7:3的比例,拆分成:训练集和测试集,存储在usercf的trainset和testset中 itemcf.generate_dataset(ratingfile, pivot=0.7) # 计算用户之间的相似度 itemcf.calc_media_sim() # 评估推荐效果 itemcf.evaluate() # itemcf.insert_to_mysql() # 查看推荐结果用户 user = "******" print("推荐结果", itemcf.recommend(user)) print("---", itemcf.testset.get(user, {}))
""" 基于 DBUtils 和 py 结合的简便操作数据库的类. """ __author__ = "wanglang" import logging import time import pymysql from DBUtils import PooledDB from ai_recommend_py.rs_recommend.readProperties import ReadProperties logging.basicConfig(level=logging.INFO, datefmt='%Y-%m-%d %H:%M:%S', format='%(asctime)s [%(levelname)s] %(message)s') prop = ReadProperties("data/app.properties") def get_time(fmt=None): ''' 获取当前时间 - @param: fmt 时间格式化字符串 ''' fmt = fmt or "%Y-%m-%d %H:%M:%S" return time.strftime(fmt, time.localtime()) def stitch_sequence(seq=None, is_field=True, suf=None): ''' 序列拼接方法, 用于将序列拼接成字符串 - :seq: 拼接序列
class ItemBasedCF():
    """TopN recommendation - item-based collaborative filtering."""

    _INSERT_SQL = (
        "insert into x_cf_item_recommend"
        "(user_id,media_id,score,create_time,update_time) "
        "values (%s,%s,%s,%s,%s)"
    )
    # Rows sent per INSERT batch and recommendations stored per user.
    _INSERT_BATCH = 2000
    _TOP_PER_USER = 5

    def __init__(self):
        self.prop = ReadProperties("data/app.properties")
        self.logger = LoggingUtil("data/logs/")
        self.host = self.prop.get("db_host")
        self.port = int(self.prop.get("db_port"))
        self.db = self.prop.get("db")
        self.user = self.prop.get("user_name")
        self.password = self.prop.get("password")

        self.trainset = {}
        self.testset = {}

        # n_sim_media: neighbours considered per listened media,
        # n_rec_media: recommendations returned per user.
        self.n_sim_media = 20
        self.n_rec_media = 10

        # media_sim_mat: media-to-media similarity,
        # media_popular: number of users per media,
        # media_count: number of distinct medias.
        self.media_sim_mat = {}
        self.media_popular = {}
        self.media_count = 0
        self.rec_media_dict = {}
        # {user: {media: score}}, filled by evaluate().
        self.all_rec_medias = {}

        self.logger.log().info('Similar media number = %d' % self.n_sim_media)
        self.logger.log().info('Recommended media number = %d' % self.n_rec_media)

    @staticmethod
    def loadfile(filename):
        """Yield the lines of ``filename`` without their line terminators.

        The file is closed even when the generator is not exhausted.

        Args:
            filename: path of the file to read
        """
        with open(filename, 'r') as fp:
            for line in fp:
                yield line.strip('\r\n')

    def generate_dataset(self, filename, pivot=0.7):
        """Randomly split the records of ``filename`` into train and test sets.

        Every non-empty line must hold five tab-separated fields: user, media,
        rating, count, percentage_count. The last three are stored re-joined
        with tabs.

        Args:
            filename: path of the rating file
            pivot: probability that a record goes to the training set
        """
        trainset_len = 0
        testset_len = 0
        for line in self.loadfile(filename):
            if not line:
                continue  # tolerate blank lines such as a trailing newline
            user, media, rating, count, percentage_count = line.split('\t')
            value = rating + "\t" + count + "\t" + percentage_count
            if random.random() < pivot:
                self.trainset.setdefault(user, {})[media] = value
                trainset_len += 1
            else:
                self.testset.setdefault(user, {})[media] = value
                testset_len += 1

        self.logger.log().info('分离训练集和测试集成功')
        self.logger.log().info('train set = %s' % trainset_len)
        self.logger.log().info('test set = %s' % testset_len)

    def calc_media_sim(self):
        """Fill ``self.media_sim_mat`` with media-to-media similarities.

        Counts, per media, the training users that have it, counts per media
        pair the users that have both, then normalises each pair count by the
        square root of the product of the two popularities.
        """
        self.logger.log().info('counting medias number and popularity...')
        for medias in self.trainset.values():
            for media in medias:
                self.media_popular[media] = self.media_popular.get(media, 0) + 1
        self.logger.log().info('count medias number and popularity success')

        self.media_count = len(self.media_popular)
        self.logger.log().info('total media number = %d' % self.media_count)

        item_sim_mat = self.media_sim_mat
        self.logger.log().info('building co-rated users matrix...')
        for medias in self.trainset.values():
            for m1 in medias:
                for m2 in medias:
                    if m1 == m2:
                        continue
                    related = item_sim_mat.setdefault(m1, {})
                    related[m2] = related.get(m2, 0) + 1
        self.logger.log().info('build co-rated users matrix success')

        self.logger.log().info('calculating media similarity matrix...')
        simfactor_count = 0
        for m1, related_medias in item_sim_mat.items():
            for m2, count in related_medias.items():
                # Only values are replaced, so iterating the dict stays safe.
                related_medias[m2] = count / math.sqrt(
                    self.media_popular[m1] * self.media_popular[m2])
                simfactor_count += 1

        self.logger.log().info('calculate media similarity matrix(similarity factor) success')
        self.logger.log().info('total similarity factor number = %d' % simfactor_count)

    def recommend(self, user):
        """Return the top ``n_rec_media`` recommendations for ``user``.

        For every media the user has in the training set, the ``n_sim_media``
        most similar medias are taken; each not yet listened candidate
        accumulates ``similarity * rating``. K and N were previously declared
        but never applied, so every candidate was returned.

        Args:
            user: user id as stored in the training set

        Returns:
            list of (media, score) tuples sorted by descending score; empty
            for a user that is not in the training set.
        """
        K = self.n_sim_media
        N = self.n_rec_media
        rank = {}
        listened_media = self.trainset.get(user, {})

        for media, rating in listened_media.items():
            neighbours = sorted(self.media_sim_mat.get(media, {}).items(),
                                key=itemgetter(1), reverse=True)[:K]
            if not neighbours:
                continue
            # The stored value is "rating\tcount\tpercentage"; only rating is used.
            score = float(str(rating).split("\t")[0])
            for related_media, w in neighbours:
                if related_media in listened_media:
                    continue
                rank[related_media] = rank.get(related_media, 0) + w * score

        return sorted(rank.items(), key=itemgetter(1), reverse=True)[:N]

    @staticmethod
    def _ratio(numerator, denominator):
        """Return numerator / denominator, or 0.0 when the denominator is zero."""
        return numerator / (1.0 * denominator) if denominator else 0.0

    def evaluate(self):
        """Compute and log precision, recall, coverage and popularity.

        Also fills ``self.all_rec_medias`` ({user: {media: score}}), which
        ``insert_to_mysql`` consumes.

        Returns:
            (precision, recall, coverage, popularity); each metric is 0.0
            when its denominator is zero.
        """
        self.logger.log().info('Evaluation start...')
        N = self.n_rec_media
        hit = 0
        rec_count = 0
        test_count = 0
        popular_sum = 0
        # Distinct recommended medias; coverage used the user count before.
        recommended_medias = set()

        for i, user in enumerate(self.trainset):
            if i > 0 and i % 500 == 0:
                self.logger.log().info('recommended for %d users' % i)
            test_medias = self.testset.get(user, {})
            for media, score in self.recommend(user):
                if media in test_medias:
                    hit += 1
                self.all_rec_medias.setdefault(user, {})[media] = float(score)
                recommended_medias.add(media)
                popular_sum += math.log(
                    1 + self.media_popular[str(media).split("\t")[0]])
            rec_count += N
            test_count += len(test_medias)

        precision = self._ratio(hit, rec_count)
        recall = self._ratio(hit, test_count)
        coverage = self._ratio(len(recommended_medias), self.media_count)
        popularity = self._ratio(popular_sum, rec_count)

        self.logger.log().info('precision=%.4f \t recall=%.4f \t coverage=%.4f \t popularity=%.4f' % (
            precision, recall, coverage, popularity))
        return precision, recall, coverage, popularity

    def _recommendation_rows(self, timestamp):
        """Return one insert tuple per stored recommendation (first five per user)."""
        rows = []
        for user, media_score in self.all_rec_medias.items():
            for media, score in list(media_score.items())[:self._TOP_PER_USER]:
                rows.append((int(user), str(media).split("\t")[0], float(score),
                             timestamp, timestamp))
        return rows

    def insert_to_mysql(self):
        """Replace ``x_cf_item_recommend`` with the first five recommendations per user.

        Rows are sent as query parameters in batches of 2000. Errors are
        logged, not raised (best effort, as before).
        """
        conn = None
        cur = None
        try:
            self.logger.log().info("connect to mysql start ....")
            conn = pymysql.connect(host=self.host, port=self.port, db=self.db,
                                   user=self.user, password=self.password,
                                   charset='utf8')
            self.logger.log().info("connect to mysql success !!!")
            cur = conn.cursor()
            cur.execute('truncate x_cf_item_recommend')
            self.logger.log().info("truncate table:x_cf_item_recommend success !!!")

            now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            rows = self._recommendation_rows(now)
            for part, start in enumerate(range(0, len(rows), self._INSERT_BATCH), 1):
                conn.ping(reconnect=True)
                cur.executemany(self._INSERT_SQL,
                                rows[start:start + self._INSERT_BATCH])
                conn.commit()
                self.logger.log().info("insert to mysql part-%s success !!!" % part)
        except Exception as e:
            self.logger.log().error("insert error %s" % e)
        finally:
            # Either handle is still None when connecting failed.
            if cur is not None:
                cur.close()
            if conn is not None:
                conn.close()
            self.logger.log().info("close streaming...")
hdfs_path=prop.get("fs_user_collect_tag_score_path") + date_path) # userId tag(commonTag) score(disperse) 单纯的播放的用户标签离散化得分 歌单专题推荐 self.file_down(local_path=prop.get("user_play_tag_disperse_score_path"), hdfs_path=prop.get("fs_user_play_tag_disperse_score_mat_list_subject_path") + date_path) # index userId (now) 当前用户index 不包含历史收藏 self.file_down(local_path=prop.get("user_now_play_index_path"), hdfs_path=prop.get("fs_user_now_play_index_path") + date_path) # userId tag(common & artist tag) score 单纯播放的用户标签评分 单曲推荐 self.file_down(local_path=prop.get("user_play_tag_score_single_mat_path"), hdfs_path=prop.get("fs_user_play_tag_score_single_mat_path") + date_path) # 所有歌曲 index mediaId self.file_down(local_path=prop.get("all_mediaId_index_path"), hdfs_path=prop.get("fs_all_mediaId_index_path")) # all mediaId common_tag self.file_down(local_path=prop.get("mediaId_common_tag_path"), hdfs_path=prop.get("fs_mediaId_common_tag_path")) # one month media play score data self.file_down(local_path=prop.get("media_play_score_one_month_path"), hdfs_path=prop.get("fs_media_play_score_one_month_path") + date_path) if __name__ == '__main__': prop = ReadProperties("data/app.properties") loadData = DownLoadData(host=prop.get("host"), user_name=prop.get("user_name")) loadData.main()
# from itertools import cycle # plt.figure('AP') # plt.subplots(facecolor=(0.5, 0.5, 0.5)) # colors = cycle('rgbcmykw') # for k, col in zip(range(cluster_centers_.shape[0]), colors): # # labels == k 使用k与labels数组中的每个值进行比较 # # 如labels = [1,0],k=0,则‘labels==k’的结果为[False, True] # class_members = labels_ == k # cluster_center = array[cluster_centers_indices[k]] # 聚类中心的坐标 # plt.plot(array[class_members, 0], array[class_members, 1], col + '.') # plt.plot(cluster_center[0], cluster_center[1], markerfacecolor=col, # markeredgecolor='k', markersize=14) # for x in array[class_members]: # plt.plot([cluster_center[0], x[0]], [cluster_center[1], x[1]], col) # plt.xticks(fontsize=10, color="darkorange") # plt.yticks(fontsize=10, color="darkorange") # plt.show() return cluster_centers_, labels_, cluster_centers_indices if __name__ == "__main__": prop = ReadProperties("data/app.properties") userTagAp = UserLabelAP(prop.get("media_ap_centers"), prop.get("media_ap_labels"), prop.get("media_ap_indices"), prop.get("media_list_ap_centers"), prop.get("media_list_ap_labels"), prop.get("media_list_ap_indices")) userTagAp.run_Ap_songList_algrothm() userTagAp.run_Ap_single_algrothm()
class ItemBaseMediaCF:
    """Tag-based media-to-media similarity, stored in ``x_cf_base_media_recommend``."""

    _INSERT_SQL = (
        "INSERT INTO `x_cf_base_media_recommend`"
        "(mediaId_1,mediaId_2,score,create_time,update_time) "
        "VALUES (%s,%s,%s,%s,%s)"
    )
    # Source medias processed per INSERT batch and neighbours kept per media.
    _FLUSH_EVERY = 200
    _TOP_N = 10

    def __init__(self):
        self.prop = ReadProperties("data/app.properties")
        self.logger = LoggingUtil("data/logs/")
        # Replaced by the list of media ids in main().
        self.media_list = {}
        self.host = self.prop.get("db_host")
        self.port = self.prop.get("db_port")
        self.db = self.prop.get("db")
        self.user_name = self.prop.get("user_name")
        self.password = self.prop.get("password")

    def is_exist(self, file_name):
        """Return whether ``file_name`` exists.

        :param file_name: path to check
        :return: boolean
        """
        return os.path.exists(file_name)

    def file_read_fun(self, file_name, data):
        """Append the second tab-separated column of every line to ``data``.

        :param file_name: path of the file to read
        :param data: list that receives the values (mutated in place)
        :return: ``data`` (unchanged when the file does not exist)
        """
        if not self.is_exist(file_name):
            self.logger.log().error("%s is not exists" % file_name)
            return data

        with open(file_name, encoding="utf-8") as f:
            for line in f:
                data.append(line.split("\t")[1].rstrip("\n"))
        return data

    @staticmethod
    def _first_positions(items):
        """Map each item to the index of its first occurrence (like ``list.index``)."""
        positions = {}
        for index, item in enumerate(items):
            positions.setdefault(item, index)
        return positions

    def media_tag_matrix_fun(self, media_id_list, tag_list, media_tag_file_name):
        """Build the 0/1 media x tag matrix from a ``media<TAB>tag`` file.

        Row and column positions are resolved through dictionaries instead of
        a ``list.index`` scan per input line. Lines whose tag is not in
        ``tag_list`` are ignored, as before; lines whose media id is not in
        ``media_id_list`` are now skipped (they used to raise ``ValueError``).

        :param media_id_list: media ids; position = matrix row
        :param tag_list: tags; position = matrix column
        :param media_tag_file_name: path of the relation file
        :return: ``numpy`` array of shape (len(media_id_list), len(tag_list));
            all zeros when the file does not exist
        """
        self.logger.log().info("media tag matrix building...")
        array = np.zeros((len(media_id_list), len(tag_list)))

        if self.is_exist(media_tag_file_name):
            media_pos = self._first_positions(media_id_list)
            tag_pos = self._first_positions(tag_list)
            unknown_media = 0
            with open(media_tag_file_name, encoding="utf-8") as f:
                for line in f:
                    row = line.split("\t")
                    if len(row) < 2:
                        continue  # blank or malformed line
                    column = tag_pos.get(row[1].strip())
                    if column is None:
                        continue
                    row_index = media_pos.get(row[0].strip())
                    if row_index is None:
                        unknown_media += 1
                        continue
                    array[row_index][column] = 1.0
            if unknown_media:
                self.logger.log().error(
                    "%s relation lines reference unknown media ids" % unknown_media)
        else:
            self.logger.log().error("%s is not exists" % media_tag_file_name)

        self.logger.log().info("media tag matrix finished!!!")
        return array

    def cal_similarity_func(self, array):
        """Score every media against all others and store the top 10 per media.

        The score is the absolute value of
        ``Similarity_Matrix().similarity_func(row_i, row_j)``; a media's score
        against itself is fixed to 0.0 and still takes part in the ranking.
        Rows are inserted every 200 source medias and once more at the end.
        Errors are logged, not raised.

        :param array: matrix whose row ``i`` belongs to ``self.media_list[i]``
        """
        sim_mat = Similarity_Matrix()
        len_row = array.shape[0]
        self.logger.log().info("cal similarity_mat start...")
        conn = None
        cur = None
        try:
            conn = pymysql.connect(host=self.host, port=int(self.port), db=self.db,
                                   user=self.user_name, password=self.password,
                                   charset='utf8')
            self.logger.log().info("connect to Mysql success!!!")
            cur = conn.cursor()
            # NOTE: the table is not truncated here; the TRUNCATE statement
            # was already disabled in the previous version.
            pending = []
            for i in range(len_row):
                source_id = self.media_list[i]
                scores = {}
                for j in range(len_row):
                    if i == j:
                        scores[self.media_list[j]] = 0.0
                    else:
                        scores[self.media_list[j]] = float(math.fabs(
                            sim_mat.similarity_func(array[i], array[j])))

                top = sorted(scores.items(), key=lambda x: x[1],
                             reverse=True)[:self._TOP_N]
                now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                pending.extend(
                    (int(source_id), int(other_id), float(score), now, now)
                    for other_id, score in top)

                is_last = (i + 1) == len_row
                if is_last or (i + 1) % self._FLUSH_EVERY == 0:
                    if pending:
                        conn.ping(reconnect=True)
                        cur.executemany(self._INSERT_SQL, pending)
                        conn.commit()
                        pending = []
                    if is_last:
                        self.logger.log().info(
                            "cal media cosine similarity completed !!!")
                    else:
                        self.logger.log().info(
                            "cal media cosine similarity %s" % str(i + 1))
        except Exception as e:
            # conn stays None when the connection attempt itself failed.
            if conn is not None:
                conn.rollback()
            self.logger.log().error("error message:%s" % e)
        finally:
            self.logger.log().info("stream closed")
            if cur is not None:
                cur.close()
            if conn is not None:
                conn.close()

    def main(self):
        """Read the media and tag indexes, build the matrix and store similarities."""
        media_index_file = self.prop.get("all_mediaId_index_path")
        tag_index_file = self.prop.get("commonTag_index_path")
        media_tag_file = self.prop.get("mediaId_common_tag_path")

        media_ids = self.file_read_fun(media_index_file, [])
        self.media_list = media_ids
        tags = self.file_read_fun(tag_index_file, [])

        array = self.media_tag_matrix_fun(media_ids, tags, media_tag_file)
        self.cal_similarity_func(array=array)
class ConnectToMysql:
    """Bulk-load clustering labels and similarity matrices from disk into MySQL.

    Connection settings (``db_host``, ``db_port``, ``db``, ``user_name``,
    ``password``) and all input file paths are read from
    ``data/app.properties`` through ``ReadProperties``.

    Every ``save*`` / ``insert*`` method follows the same pattern: connect,
    run the caller-supplied TRUNCATE statement, then send multi-row INSERT
    statements built from the caller-supplied ``insert_sql`` prefix.  Errors
    are logged and swallowed (best effort); they are not re-raised.
    """

    # Maximum number of rows placed in one INSERT by saveUserCenterToMysql.
    BATCH_SIZE = 2000

    def __init__(self):
        self.prop = ReadProperties("data/app.properties")
        self.logger = LoggingUtil("data/logs/")
        self.host = self.prop.get("db_host")
        self.port = self.prop.get("db_port")
        self.db = self.prop.get("db")
        self.user_name = self.prop.get("user_name")
        self.password = self.prop.get("password")

    def file_read_fun(self, fileName, data):
        """Append the second tab-separated column of every line to ``data``.

        :param fileName: path of a UTF-8 text file with tab-separated columns
        :param data: list that receives the values (mutated in place)
        :return: ``data``; returned unchanged when the file does not exist
        :raises IndexError: if a non-blank line has no second column
        """
        if not self.isExists(fileName):
            return data
        with open(fileName, encoding="utf-8") as f:
            for line in f:
                # A blank (e.g. trailing) line carries no record; skip it
                # instead of failing on the missing second column.
                if not line.rstrip("\r\n"):
                    continue
                row = line.split("\t")
                data.append(row[1].rstrip("\n"))
        return data

    def isExists(self, fileName):
        """Return whether ``fileName`` exists on disk.

        :param fileName: path to check
        :return: result of ``os.path.exists``
        """
        return os.path.exists(fileName)

    def _connect(self):
        """Open a new pymysql connection using the configured settings."""
        return pymysql.connect(host=self.host, port=int(self.port),
                               db=self.db, user=self.user_name,
                               password=self.password, charset='utf8')

    def _release(self, conn, cur):
        """Close cursor and connection, tolerating either being ``None``."""
        self.logger.log().info("stream closed")
        # Close the cursor before the connection it belongs to.
        if cur is not None:
            cur.close()
        if conn is not None:
            conn.close()

    @staticmethod
    def _timestamp_values():
        """Return the SQL fragment for the create_time,update_time columns."""
        now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
        fragment = "str_to_date('%s','%%Y-%%m-%%d %%H:%%i:%%s')" % now
        return fragment + "," + fragment

    @staticmethod
    def _execute_insert(conn, cur, insert_sql, rows):
        """Send one multi-row INSERT and commit; return False if ``rows`` is empty."""
        if not rows:
            # "insert ... values" without any tuple is a SQL syntax error.
            return False
        conn.ping(reconnect=True)
        cur.execute(insert_sql + ",".join(rows))
        conn.commit()
        return True

    def saveUserCenterToMysql(self, truncate_sql, user_id_list_file,
                              user_labels_file, insert_sql):
        """Store the user -> cluster-centre assignment in MySQL.

        The i-th user id read from ``user_id_list_file`` is paired with the
        i-th entry of the array loaded from ``user_labels_file``.  Rows are
        inserted in groups of at most ``BATCH_SIZE``.

        :param truncate_sql: statement executed before inserting
        :param user_id_list_file: tab-separated file; user ids in column two
        :param user_labels_file: ``.npy`` file holding one label per user
        :param insert_sql: INSERT prefix ending in ``values``; row tuples are
            appended to it
        :return: None
        """
        conn = None
        cur = None
        try:
            conn = self._connect()
            self.logger.log().info("connect to mysql success !!!")
            user_id_list = self.file_read_fun(user_id_list_file, [])
            labels = np.load(user_labels_file)
            nums = len(labels)
            # Check the inputs before truncating, so inconsistent files do
            # not leave the table emptied or half filled.
            if len(user_id_list) < nums:
                raise ValueError(
                    "user id list has %d entries but %d labels were loaded"
                    % (len(user_id_list), nums))
            cur = conn.cursor()
            cur.execute(truncate_sql)
            self.logger.log().info("truncate success table !!!")
            for start in range(0, nums, self.BATCH_SIZE):
                stop = min(start + self.BATCH_SIZE, nums)
                rows = [
                    "(%d,%d,%s)" % (int(user_id_list[i]), labels[i],
                                    self._timestamp_values())
                    for i in range(start, stop)
                ]
                self._execute_insert(conn, cur, insert_sql, rows)
            self.logger.log().info("insert success !!!")
        except Exception as e:
            # conn is still None when the connection attempt itself failed.
            if conn is not None:
                conn.rollback()
            print(e)
            self.logger.log().error(" insert error message : %s" % e)
        finally:
            self._release(conn, cur)

    def saveSimilarityMatrixToMysql(self, truncate_sql, similiarity_mat,
                                    media_file, insert_sql):
        """Store a (cluster centre x item) score matrix in MySQL.

        One INSERT is sent per matrix row.  Cells equal to 0 are skipped, and
        a row with no non-zero cell sends nothing.  The row index is written
        as the centre id; the item id is taken from ``media_file`` by column
        index.

        :param truncate_sql: statement executed before inserting
        :param similiarity_mat: text file readable by ``np.loadtxt``
        :param media_file: tab-separated file; item ids in column two
        :param insert_sql: INSERT prefix ending in ``values``
        :return: None
        """
        conn = None
        cur = None
        try:
            conn = self._connect()
            self.logger.log().info("connect to mysql success !!!")
            # ndmin=2 keeps a single-row file two-dimensional.
            similiarity_matrix = np.loadtxt(similiarity_mat, ndmin=2)
            media_list_id = self.file_read_fun(media_file, [])
            cur = conn.cursor()
            cur.execute(truncate_sql)
            for i, scores in enumerate(similiarity_matrix):
                rows = [
                    "(%d,%d,%f,%s)" % (i, int(media_list_id[j]), score,
                                       self._timestamp_values())
                    for j, score in enumerate(scores)
                    if score != 0
                ]
                self._execute_insert(conn, cur, insert_sql, rows)
        except Exception as e:
            if conn is not None:
                conn.rollback()
            print(e)
            self.logger.log().error("error message:%s" % e)
        finally:
            self._release(conn, cur)

    def insertMediaRelationMatToMysql(self, truncate_sql, similarite_mat,
                                      insert_sql):
        """Store an item-to-item similarity matrix in MySQL.

        One INSERT is sent per matrix row; the diagonal (i == j) is skipped
        and both ids are written 1-based (index + 1).  Each (i, j) pair is
        emitted exactly once.

        :param truncate_sql: statement executed before inserting
        :param similarite_mat: text file readable by ``np.loadtxt``
        :param insert_sql: INSERT prefix ending in ``VALUES``
        :return: None
        """
        conn = None
        cur = None
        self.logger.log().info("load file start...")
        # Loaded outside the try block: a load failure propagates to the caller.
        media_relation_similarity_mat = np.loadtxt(similarite_mat, ndmin=2)
        self.logger.log().info("load file finished")
        try:
            conn = self._connect()
            self.logger.log().info("connect to Mysql success!!!")
            cur = conn.cursor()
            cur.execute(truncate_sql)
            for i, scores in enumerate(media_relation_similarity_mat):
                rows = [
                    "(%s,%s,%f,%s)" % (i + 1, j + 1, score,
                                       self._timestamp_values())
                    for j, score in enumerate(scores)
                    if i != j
                ]
                if self._execute_insert(conn, cur, insert_sql, rows):
                    self.logger.log().info(
                        "insert success part_%s !!!" % (i + 1))
        except Exception as e:
            if conn is not None:
                conn.rollback()
            self.logger.log().error("error message:%s" % e)
        finally:
            self._release(conn, cur)

    def main(self):
        """Load every result set (tracks, playlists, subjects, relations) into MySQL."""
        user_id_list_file = self.prop.get("user_now_play_index_path")
        # The same label file is used for the playlist and the subject tables.
        user_mediaListSubject_labels_file = self.prop.get("media_list_ap_labels") + ".npy"

        # Single tracks
        truncate_media = "TRUNCATE `x_user__user_center`"
        insert_media_sql = "insert into x_user__user_center(user_id,user_center_id,create_time,update_time) values "
        user_media_labels_file = self.prop.get("media_ap_labels") + ".npy"
        truncate_mat_media = "TRUNCATE `x_user_center__rank`"
        similiarity_mat_file = self.prop.get("media_similarity_matrix")
        media_mat_file = self.prop.get("all_mediaId_index_path")
        insert_mat_media_sql = "insert into x_user_center__rank(user_center_id,media_id,score,create_time,update_time) values "
        self.saveUserCenterToMysql(truncate_sql=truncate_media,
                                   user_id_list_file=user_id_list_file,
                                   user_labels_file=user_media_labels_file,
                                   insert_sql=insert_media_sql)
        self.saveSimilarityMatrixToMysql(truncate_sql=truncate_mat_media,
                                         similiarity_mat=similiarity_mat_file,
                                         media_file=media_mat_file,
                                         insert_sql=insert_mat_media_sql)

        # Playlists
        truncate_mediaList = "TRUNCATE `x_user__user_center_list`"
        insert_mediaList_sql = "insert into x_user__user_center_list(user_id,user_center_id,create_time,update_time) values "
        truncate_mat_mediaList = "TRUNCATE `x_user_center__rank_list`"
        similiarity_mat_file_mediaList = self.prop.get("media_list_similarity_matrix")
        mediaList_mat_file = self.prop.get("mediaList_index_path")
        insert_mat_mediaList_sql = "insert into x_user_center__rank_list(user_center_id,media_list_id,score,create_time,update_time) values "
        self.saveUserCenterToMysql(truncate_sql=truncate_mediaList,
                                   user_id_list_file=user_id_list_file,
                                   user_labels_file=user_mediaListSubject_labels_file,
                                   insert_sql=insert_mediaList_sql)
        self.saveSimilarityMatrixToMysql(truncate_sql=truncate_mat_mediaList,
                                         similiarity_mat=similiarity_mat_file_mediaList,
                                         media_file=mediaList_mat_file,
                                         insert_sql=insert_mat_mediaList_sql)

        # Subjects
        truncate_mediaSubject = "TRUNCATE `x_user__user_center_subject`"
        insert_mediaSubject_sql = "insert into x_user__user_center_subject(user_id,user_center_id,create_time,update_time) values "
        truncate_mat_mediaSubject = "TRUNCATE `x_user_center__rank_subject`"
        similiarity_mat_file_mediaSubject = self.prop.get("media_subject_similarity_matrix")
        mediaSubject_mat_file = self.prop.get("subject_index_path")
        insert_mat_mediaSubject_sql = "insert into x_user_center__rank_subject(user_center_id,subject_id,score,create_time,update_time) values "
        self.saveUserCenterToMysql(truncate_sql=truncate_mediaSubject,
                                   user_id_list_file=user_id_list_file,
                                   user_labels_file=user_mediaListSubject_labels_file,
                                   insert_sql=insert_mediaSubject_sql)
        self.saveSimilarityMatrixToMysql(truncate_sql=truncate_mat_mediaSubject,
                                         similiarity_mat=similiarity_mat_file_mediaSubject,
                                         media_file=mediaSubject_mat_file,
                                         insert_sql=insert_mat_mediaSubject_sql)

        # Track-to-track similarity (same-kind recommendation)
        truncate_media_relation_sql = "TRUNCATE `x_cf_base_media_recommend`"
        insert_media_relation_sql = "INSERT INTO `x_cf_base_media_recommend`(mediaId_1,mediaId_2,score,create_time,update_time) VALUES "
        media_relation_similarity_mat_file = self.prop.get("media_relation_matrix_path")
        self.insertMediaRelationMatToMysql(truncate_sql=truncate_media_relation_sql,
                                           similarite_mat=media_relation_similarity_mat_file,
                                           insert_sql=insert_media_relation_sql)