def main(): # 数据集按均匀分布划分为M份,M-1份均为训练集,剩下1份为测试集 M = 8 k = 0 seed = 42 # 随机数种子 data = [ tuple(line.split('::')[:2]) for line in open( 'G:/master/python/PycharmProjects/RecommendationSystem/UserCF/MovieLens/data/ml-1m/ratings.dat' ).readlines() ] # win10上的ml-1m数据集 # data = [tuple(line.split(',')[:2]) for line in open('G:/Recommend/User-CF/MovieLens/ml-latest-small/ratings_test.csv').readlines()] # ml-latest-small数据集 train, test = SplitData(data, M, k, seed) # 基于训练集计算用户兴趣相似度 W = UserCF.UserSimilarity(train) # 离线指标计算 precision, recall = PrecisionRecall(train, test, W, 30, 90) coverage = Coverge(train, W, 30, 90) novelty = Novelty(train, W, 30, 90) F1 = 2 * precision * recall / (precision + recall) print( f'precision:{precision}\trecall:{recall}\tcoverage:{coverage}\tpopularity:{novelty}\tF1:{F1}' ) '''
def PrecisionRecall(train, test, W, N, K):
    """Precision and recall of Top-N UserCF-IIF recommendations.

    N is the length of each user's recommendation list; K is the number
    of most-similar users consulted per target user.  Returns the pair
    (precision, recall).
    """
    hits = 0
    pred_total = 0
    truth_total = 0
    for user in train.keys():
        # Items the user liked in the test set (a dict); empty for users
        # absent from the test fold.
        liked = test.get(user, {})
        # Since |R(u)| = N is fixed by design, users with an empty test
        # set still receive (and are charged for) N recommendations.
        scores = UserCF.UserCF_IIFRecommend(user, train, W, K)
        # Keep the N items with the highest predicted interest.
        top_n = sorted(scores.items(), key=itemgetter(1), reverse=True)[:N]
        hits += sum(1 for item, _ in top_n if item in liked)
        pred_total += N
        truth_total += len(liked)
    return hits / pred_total, hits / truth_total
def TestUserCFMult():
    """Evaluate UserCF for several K values, one worker process per K.

    Writer processes run ``Evaluation`` and push results onto a queue;
    a single reader process (``WriteIntoD``) drains it into DataFrame d.
    """
    start = time.time()
    file_path = '~/file/rs/dataset/ml-1m/ratings.dat'
    d_file = eva.readData(file_path, '::')
    M = 8   # number of folds
    N = 10  # recommendation list length
    K = [5, 10, 20, 40, 80, 120]
    # K = [40, 80]
    # seeds = np.arange(M)
    seeds = [0]
    columns_list = ['Precision', 'Recall', 'Coverage', 'Popularity']
    d = pd.DataFrame(np.zeros([len(K), len(columns_list)]),
                     index=K,
                     columns=columns_list)
    for index, seed in enumerate(seeds):
        train, test = eva.SplitData(d_file, M, seed)
        item_popularity = eva.ItemsPopularity(d_file, M, seed)
        W_user = UserCF.UserSimilarityVersion3(train)
        q = Queue()
        # BUG FIX: the original rebound `pw` on every loop iteration and
        # joined only the last writer, so earlier workers could still be
        # running when the total time was printed.  Track and join all.
        writers = []
        for k in K:
            pw = Process(target=Evaluation,
                         args=(q, k, train, test, item_popularity, W_user, N))
            pw.start()  # start a writer
            writers.append(pw)
        pr = Process(target=WriteIntoD, args=(q, d))
        pr.start()  # start the reader
        for pw in writers:
            pw.join()  # wait for every writer to finish
        end = time.time()
        print('Total Time: %.2fs' % (end - start))
        pr.join()  # wait for the reader to drain the queue
def TestUserCFIIF():
    """Compare UserCF similarity versions II and III across K values,
    save the metrics to Excel and plot them side by side."""
    file_path = '~/file/rs/dataset/ml-1m/ratings.dat'
    d_file = eva.readData(file_path, '::')
    M = 8   # number of folds
    N = 10  # recommendation list length
    # K = [5, 10, 20]
    K = [[5, 40, 120], [10, 20, 80]]
    # K = [5, 10, 20, 40, 80, 120, 160]
    # seeds = np.arange(M)
    seeds = [0]
    columns_list = [
        'Precision-I', 'Precision-II', 'Recall-I', 'Recall-II',
        'Coverage-I', 'Coverage-II', 'Popularity-I', 'Popularity-II'
    ]
    I_columns = ['Precision-I', 'Recall-I', 'Coverage-I', 'Popularity-I']
    II_columns = ['Precision-II', 'Recall-II', 'Coverage-II', 'Popularity-II']
    # NOTE(review): K is a list of LISTS here, so `index=K` and
    # `d.loc[k, ...]` address several rows at once -- confirm this is
    # the intended indexing scheme.
    d = pd.DataFrame(np.zeros([len(K), len(columns_list)]),
                     index=K,
                     columns=columns_list)
    for index, seed in enumerate(seeds):
        train, test = eva.SplitData(d_file, M, seed)
        item_popularity = eva.ItemsPopularity(d_file, M, seed)
        W_userI = UserCF.UserSimilarityVersion2(train)
        W_userII = UserCF.UserSimilarityVersion3(train)
        for k in K:
            metrics_I = eva.PrecisionAndRecallAndCoverageAndPopularity(
                train, test, item_popularity, k, W_userI, N, 1)
            d.loc[k, I_columns] += list(metrics_I)
            metrics_II = eva.PrecisionAndRecallAndCoverageAndPopularity(
                train, test, item_popularity, k, W_userII, N, 1)
            d.loc[k, II_columns] += list(metrics_II)
            # Running average over the seeds processed so far.
            d.loc[k] /= (index + 1)
    d.to_excel('Result-UserCF-I-II.xlsx', 'UserCF-I-II-K')
    fig, axes = plt.subplots(2, 2)
    # Pairs of columns (version I vs II) share one subplot each.
    panels = [('Precision', 0), ('Recall', 2), ('Coverage', 4), ('Popularity', 6)]
    for pos, (title, col) in enumerate(panels):
        ax = axes[pos // 2][pos % 2]
        ax.set_title(title)
        d.iloc[:, col:col + 2].plot(ax=ax, style=['o-', 'o-'])
    plt.show()
def PrecisionAndRecallAndCoverageAndPopularity(train, test, item_popularity, K, W, N, method=1):
    """Compute four offline metrics in one pass over the test users.

    method=1 evaluates ``UserCF.Recommend`` (rank is an iterable of item
    ids); method=2 evaluates ``ItemCF.Recommend`` (rank is an iterable of
    (item, score) pairs).  Returns the tuple
    (precision, recall, coverage, average log-popularity).
    """
    hit = 0
    num_rank = 0
    num_tu = 0
    recommend_items = set()
    all_items = set()
    popularity = 0.0
    if method == 1:
        for user in train:
            # Only users present in the test fold are evaluated.
            if user not in test:
                continue
            all_items |= set(train[user][0])
            tu = test[user][0]
            rank = UserCF.Recommend(user, train, W, K, N)
            recommend_items |= set(rank)
            for item in rank:
                if item in tu:
                    hit += 1
                # Accumulate log-popularity of every recommended item.
                popularity += math.log(1 + item_popularity[item])
            num_rank += len(rank)
            num_tu += len(tu)
        return (hit / (num_rank * 1.0),
                hit / (num_tu * 1.0),
                len(recommend_items) / (len(all_items) * 1.0),
                popularity / (num_rank * 1.0))
    elif method == 2:
        for user in train:
            if user not in test:
                continue
            all_items |= set(train[user][0])
            tu = test[user][0]
            rank = ItemCF.Recommend(user, train, W, K, N)
            for item, _ in rank:
                if item in tu:
                    hit += 1
                popularity += math.log(1 + item_popularity[item])
                recommend_items.add(item)
            num_rank += len(rank)
            num_tu += len(tu)
        return (hit / (num_rank * 1.0),
                hit / (num_tu * 1.0),
                len(recommend_items) / (len(all_items) * 1.0),
                popularity / (num_rank * 1.0))
def Coverge(train, W, N, K):
    """Coverage: fraction of all training items that appear in at least
    one user's Top-N recommendation list."""
    recommend_items = set()
    all_items = set()
    for user in train.keys():
        # Every item the user has interacted with counts toward the universe.
        all_items.update(train[user].keys())
        scores = UserCF.UserCF_IIFRecommend(user, train, W, K)
        top_n = sorted(scores.items(), key=itemgetter(1), reverse=True)[:N]
        recommend_items.update(item for item, _ in top_n)
    return len(recommend_items) / len(all_items)
def Novelty(train, W, N, K):
    """Average log-popularity of recommended items (lower = more novel)."""
    item_popularity = PopularityNums(train)
    total = 0
    count = 0  # count = number of users * N (total items recommended)
    for user in train.keys():
        scores = UserCF.UserCF_IIFRecommend(user, train, W, K)
        top_n = sorted(scores.items(), key=itemgetter(1), reverse=True)[:N]
        for item, _ in top_n:
            total += math.log(1 + item_popularity[item])
            count += 1
    return total / count
def TestUserCF():
    """Evaluate UserCF (similarity version 3) over a range of K values,
    save the metrics to Excel and plot them."""
    # Load the ratings dataset.
    file_path = '~/file/rs/dataset/ml-1m/ratings.dat'
    d_file = eva.readData(file_path, '::')
    M = 8   # number of folds
    N = 10  # recommendation list length
    # K = [5, 10]
    K = [5, 10, 20, 40, 80, 120, 160]
    # seeds = np.arange(M)
    seeds = [0]
    columns_list = ['Precision', 'Recall', 'Coverage', 'Popularity']
    userCF_columns = ['Precision', 'Recall', 'Coverage', 'Popularity']
    d = pd.DataFrame(np.zeros([len(K), len(columns_list)]),
                     index=K,
                     columns=columns_list)
    for index, seed in enumerate(seeds):
        train, test = eva.SplitData(d_file, M, seed)
        item_popularity = eva.ItemsPopularity(d_file, M, seed)
        W_user = UserCF.UserSimilarityVersion3(train)
        for k in K:
            print(k)
            metrics = eva.PrecisionAndRecallAndCoverageAndPopularity(
                train, test, item_popularity, k, W_user, N, 1)
            d.loc[k, userCF_columns] += list(metrics)
            # Running average over the seeds processed so far.
            d.loc[k] /= (index + 1)
    d.to_excel('Result-UserCF-K.xlsx', 'UserCF-K')
    fig, axes = plt.subplots(2, 2)
    # One metric per subplot, same column order as columns_list.
    panels = [('Precision', 'precision'), ('Recall', 'recall'),
              ('Coverage', 'coverage'), ('Popularity', 'popularity')]
    for col, (title, label) in enumerate(panels):
        ax = axes[col // 2][col % 2]
        ax.set_title(title)
        ax.plot(d.iloc[:, col], 'o-', label=label)
    plt.legend()
    plt.show()
continue # for i in UserBehaviorWeight.keys(): # print(f'用户{i}的电影评分') # for j in UserBehaviorWeight[i].keys(): # print(f'电影{j},评分{UserBehaviorWeight[i][j]}') return UserBehaviorWeight # 测试小例子 # User = {'1': ['a', 'b', 'd'], '2': ['b', 'c', 'e'], '3': ['c', 'd'], '4': ['b', 'c', 'd'], '5': ['a', 'd']} # Userlist = ['a', 'b', 'c', 'd', 'e'] # Userbehavoirating = {'1': {'a': '5', 'b': '4', 'd': '2'}, '2': {'b': '2', 'c': '4', 'e': '1'}, '3': {'c': '3', 'd': '4'}, # '4': {'b': '1', 'c': '3', 'd': '4'}, '5': {'a': '1', 'd': '5'}} # testIteMapId = dict() # for i in range(0, 5): # testIteMapId[Userlist[i]] = i # Matrix = ItemSimilarity(User, testIteMapId) # Recommend('1', 3, Matrix[0], Matrix[1], User, testIteMapId, Userbehavoirating) filePath = 'E:\\迅雷下载\\ml-latest-small\\ratings.csv' movie_path = 'E:\\迅雷下载\\ml-latest-small\\movies.csv' data = loadItemData(movie_path) IdMapingName = data[0] userbehavior_rating = loadUserWeight(filePath) userbehavior = UserCF.loadData(filePath) IteMatrix = ItemSimilarity(userbehavior, data[1]) SortList = Recommend('190', 10, IteMatrix[0], IteMatrix[1], userbehavior, data[1], userbehavior_rating) PrintRecommendList(SortList[0], SortList[1], IdMapingName, 20)
ret[user][item] = rating return ret if __name__ == '__main__': data = readData() numFlod = 5 precision =0 recall = 0 coverage = 0 popularity =0 for i in range(0,numFlod): [oriTrain,oriTest] = SplitData(data,numFlod,i,0) train = transform(oriTrain) test = transform(oriTest) W = UserCF.UserSimilarity(train) rank = UserCF.Recommend('1',train,W) result = UserCF.Recommendation(test.keys(), train, W) # W = UserCF_IIF.UserSimilarity(train) # rank = UserCF_IIF.Recommend('1',train,W) # result = UserCF_IIF.Recommendation(test.keys(), train, W) # W = ItemCF.ItemSimilarity(train) # rank = ItemCF.Recommend('1',train,W) # result = ItemCF_IUF.Recommendation(test.keys(),train, W) # W = ItemCF_IUF.ItemSimilarity(train) # rank = ItemCF_IUF.Recommend('1',train,W) # result = ItemCF_IUF.Recommendation(test.keys(),train, W)