def generate_matrix(with_rating=False):
    """Build the item-item similarity matrix from the module-level ``train`` set.

    :param with_rating: whether the training set carries ratings. True means
        ratings range over 1~5; False means binary 0/1 feedback (the flag only
        matters for some of the similarity algorithms).
    """
    # Alternative similarity measures, kept for experimentation:
    # UserCF.user_similarity_cosine(train, iif=False)           # with/without rating
    # UserCF.user_similarity_cosine(train, iif=True)            # with/without rating
    # UserCF.user_similarity_jaccard(train, iif=False)          # with/without rating
    # UserCF.user_similarity_jaccard(train, iif=True)           # with/without rating
    # UserCF.user_similarity_pearson(train, iif=False)          # with rating
    # UserCF.user_similarity_pearson(train, iif=True)           # with rating
    # UserCF.user_similarity_adjusted_cosine(train, iif=False)  # with rating
    # UserCF.user_similarity_adjusted_cosine(train, iif=True)   # with rating
    # UserCF.user_similarity_log_likelihood(train)              # without rating
    ItemCF.item_similarity_cosine(train, norm=False, iuf=False, with_rating=with_rating)
def PrecisionAndRecallAndCoverageAndPopularity(train, test, item_popularity, K, W, N, method):
    """Compute precision, recall, coverage and average popularity for ItemCF.

    Args:
        train: training set; ``train[user][0]`` is the user's item collection.
        test: test set with the same layout as ``train``.
        item_popularity: mapping item -> popularity count.
        K: number of most-similar neighbours used by the recommender.
        W: similarity matrix.
        N: length of the top-N recommendation list.
        method: 1 -> ``ItemCF.Recommend1``, 2 -> ``ItemCF.Recommend2``.

    Returns:
        (precision, recall, coverage, popularity)

    Raises:
        ValueError: if ``method`` is not 1 or 2.  (The old code fell through
            with ``rank`` unbound, raising NameError on the first user.)
    """
    if method not in (1, 2):
        raise ValueError('method must be 1 or 2, got %r' % (method,))
    hit = 0
    num_rank = 0
    num_tu = 0
    recommend_items = set()
    all_items = set()
    popularity = 0.0
    for user in train:
        # Only users present in the test split contribute to the metrics.
        if user not in test:
            continue
        all_items |= set(train[user][0])
        tu = test[user][0]
        if method == 1:
            rank = ItemCF.Recommend1(user, train, W, K, N)
        else:
            rank = ItemCF.Recommend2(user, train, W, K, N)
        for item, _ in rank:
            if item in tu:
                hit += 1
            # Popularity is accumulated for every recommended item, hits or not.
            popularity += math.log(1 + item_popularity[item])
            recommend_items.add(item)
        num_rank += len(rank)
        num_tu += len(tu)
    # Compute each ratio once (the old code evaluated all four twice:
    # once for the print and once for the return).
    precision = hit / (num_rank * 1.0)
    recall = hit / (num_tu * 1.0)
    coverage = len(recommend_items) / (len(all_items) * 1.0)
    avg_popularity = popularity / (num_rank * 1.0)
    print(K, ': ', precision, recall, coverage, avg_popularity)
    return precision, recall, coverage, avg_popularity
def PrecisionRecall(train, test, W, N, K):
    """Compute (precision, recall) of top-N ItemCF recommendations.

    Args:
        train: training set, user -> {item: rating}.
        test: test set, user -> {item: rating}; users absent from test count
            an empty liked-set.
        W: item similarity matrix.
        N: number of items recommended per user (|R(u)| is fixed at N).
        K: number of similar items consulted by the recommender.

    Returns:
        (precision, recall); (0.0, 0.0) when there is nothing to evaluate
        (the old code raised ZeroDivisionError on an empty train/test overlap).
    """
    hit = 0
    p_all = 0
    r_all = 0
    for user in train.keys():
        tu = test.get(user, {})  # items the user liked in the test split
        # NOTE: no early skip for empty tu — since |R(u)| = N is fixed by
        # design, every training user contributes N to the precision base.
        rank_all = ItemCF.ItemCF_IUFNormRecommend(train, user, W, K)
        # Top-N items with the highest predicted interest for this user.
        rank_TopN = sorted(rank_all.items(), key=itemgetter(1), reverse=True)[0:N]
        for item, pui in rank_TopN:
            if item in tu:
                hit += 1
        p_all += N
        r_all += len(tu)
    if p_all == 0 or r_all == 0:
        # Empty training set, or no test user overlaps: metrics are undefined.
        return 0.0, 0.0
    return hit / p_all, hit / r_all
def evaluate(train_set, test_set):
    """Run the KG-enhanced group recommendation evaluation loop.

    Loads a knowledge-graph similarity file, builds user/movie similarity
    matrices, then for every (sim_score, sim_kg) mix repeatedly generates
    groups and evaluates both the least-misery and the average aggregation
    strategies via ``ev_detail_kg``.

    Args:
        train_set: training ratings (format defined by ``gg`` / ``ic``).
        test_set: held-out ratings passed through to ``ev_detail_kg``.
    """
    print('Evaluating start ...')
    kg_sim = {}
    # File format per line: "<movie_id>::{<id>:<score>,<id>:<score>,...}"
    for line in lf.load_file_all("./data/kg_sim1.txt"):
        temp = line.split("::")
        details = temp[1].replace("{", "").replace("}", "").split(",")
        # Bug fix: a FRESH dict per line. The old code reused one shared
        # kg_sim_detail dict, so every movie's entry accumulated all entries
        # seen so far; it also keyed kg_sim with the raw string temp[0] while
        # setdefault used int(temp[0]), leaving the int entry forever empty.
        kg_sim_detail = {}
        for items in details:
            kv = items.split(":")
            kg_sim_detail[int(kv[0])] = kv[1]  # value kept as str, as before
        kg_sim[int(temp[0])] = kg_sim_detail
    user_sim_matrix = gg.calc_user_sim(train_set)
    movie_sim_matrix = ic.calc_movie_sim(train_set)
    all_rec_movies = set()
    for i in sim_mix:
        sim_score = i[0]
        sim_kg = i[1]
        for j in range(loop_count):
            # Generate a group of users, then score movies for the group.
            group_mem = gg.generateGroup(user_sim_matrix, train_set, group_count, group_sim)
            group_rank_kg = ic.kg_ic_predict(movie_sim_matrix, train_set, group_mem,
                                             kg_sim, sim_score, sim_kg)
            # Least-misery aggregation; ev_detail_kg also updates the mix weights.
            g_rating_kg_lm = gsp.least_strategy(group_rank_kg, N)
            precision_kg_lm, recall_kg_lm, sim_score, sim_kg = ev_detail_kg(
                group_mem, test_set, g_rating_kg_lm, all_rec_movies, sim_score, sim_kg)
            # Average aggregation.
            g_rating_kg_avg = gsp.avg_strategy(group_rank_kg, N)
            precision_kg_avg, recall_kg_avg, sim_score, sim_kg = ev_detail_kg(
                group_mem, test_set, g_rating_kg_avg, all_rec_movies, sim_score, sim_kg)
    print("执行完毕!")
def PrecisionAndRecallAndCoverageAndPopularity(train, test, item_popularity, K, W, N, method=1):
    """Compute precision, recall, coverage and average popularity.

    Args:
        train: training set; ``train[user][0]`` is the user's item collection.
        test: test set with the same layout as ``train``.
        item_popularity: mapping item -> popularity count.
        K: number of most-similar neighbours used by the recommender.
        W: similarity matrix.
        N: length of the top-N recommendation list.
        method: 1 -> UserCF (rank is a list of item ids),
                2 -> ItemCF (rank is a list of (item, score) pairs).

    Returns:
        (precision, recall, coverage, popularity)

    Raises:
        ValueError: if ``method`` is not 1 or 2.  (The old code silently
            returned ``None`` for any other value.)
    """
    if method not in (1, 2):
        raise ValueError('method must be 1 or 2, got %r' % (method,))
    hit = 0
    num_rank = 0
    num_tu = 0
    recommend_items = set()
    all_items = set()
    popularity = 0.0
    for user in train:
        if user not in test:
            continue
        all_items |= set(train[user][0])
        tu = test[user][0]
        if method == 1:
            # UserCF returns a plain list of item ids.
            rank = UserCF.Recommend(user, train, W, K, N)
            recommend_items |= set(rank)
            for item in rank:
                if item in tu:
                    hit += 1
                popularity += math.log(1 + item_popularity[item])
        else:
            # ItemCF returns (item, score) pairs.
            rank = ItemCF.Recommend(user, train, W, K, N)
            for item, _ in rank:
                if item in tu:
                    hit += 1
                popularity += math.log(1 + item_popularity[item])
                recommend_items.add(item)
        num_rank += len(rank)
        num_tu += len(tu)
    # Shared tail (the old code duplicated this whole computation per branch).
    return (hit / (num_rank * 1.0),
            hit / (num_tu * 1.0),
            len(recommend_items) / (len(all_items) * 1.0),
            popularity / (num_rank * 1.0))
def main():
    """Evaluate UserCF vs ItemCF on ml-1m over several K, save and plot results."""
    # Load the ratings file.
    file_path = '~/file/rs/dataset/ml-1m/ratings.dat'
    d_file = eva.readData(file_path, '::')
    M = 8   # number of data splits
    N = 10  # recommendation list length
    K = [5, 10, 20, 40, 80, 120, 160]
    # seeds = np.arange(M)
    seeds = [0]
    columns_list = [
        'PrecisionUserCF', 'PrecisionItemCF', 'RecallUserCF', 'RecallItemCF',
        'CoverageUserCF', 'CoverageItemCF', 'PopularityUserCF', 'PopularityItemCF'
    ]
    userCF_columns = [
        'PrecisionUserCF', 'RecallUserCF', 'CoverageUserCF', 'PopularityUserCF'
    ]
    itemCF_columns = [
        'PrecisionItemCF', 'RecallItemCF', 'CoverageItemCF', 'PopularityItemCF'
    ]
    d = pd.DataFrame(np.zeros([len(K), len(columns_list)]),
                     index=K, columns=columns_list)
    for seed in seeds:
        train, test = eva.SplitData(d_file, M, seed)
        item_popularity = eva.ItemsPopularity(d_file, M, seed)
        W_user = UserCF.UserSimilarityVersion3(train)
        W_item = ItemCF.ItemSimilarityVersion2(train)
        for k in K:
            precision, recall, coverage, popularity = eva.PrecisionAndRecallAndCoverageAndPopularity(
                train, test, item_popularity, k, W_user, N, 1)
            d.loc[k, userCF_columns] += [precision, recall, coverage, popularity]
            precision, recall, coverage, popularity = eva.PrecisionAndRecallAndCoverageAndPopularity(
                train, test, item_popularity, k, W_item, N, 2)
            d.loc[k, itemCF_columns] += [precision, recall, coverage, popularity]
    # Bug fix: average ONCE over all seeds.  The old code executed
    # ``d.loc[k] /= (index + 1)`` inside the seed loop, repeatedly re-dividing
    # sums that earlier seeds had already contributed (only harmless because
    # seeds == [0]).
    d /= len(seeds)
    d.to_excel('Result-UserCF-ItemCF-K.xlsx', 'UserCF-ItemCF-K')
    fig, axes = plt.subplots(2, 2)
    axes[0][0].set_title('Precision')
    d.iloc[:, 0:2].plot(ax=axes[0][0], style=['o-', 'o-'])
    axes[0][1].set_title('Recall')
    d.iloc[:, 2:4].plot(ax=axes[0][1], style=['o-', 'o-'])
    axes[1][0].set_title('Coverage')
    d.iloc[:, 4:6].plot(ax=axes[1][0], style=['o-', 'o-'])
    axes[1][1].set_title('Popularity')
    d.iloc[:, 6:8].plot(ax=axes[1][1], style=['o-', 'o-'])
    plt.show()
def TestItemCF():
    """Evaluate the ItemCF algorithm for several K in parallel, then plot.

    Desc:
        Splits the test ratings file, builds the item-similarity matrix, and
        farms one ``Evaluation`` call per K out to a 4-worker process pool.
    """
    t0 = time.time()
    path = '~/file/rs/dataset/ml-1m/ratingsTest.dat'
    ratings = pd.read_csv(path, sep='::', usecols=[0, 1, 2])
    fold_count = 8   # number of data splits
    top_n = 10       # recommendation list length
    k_values = [5, 10, 20, 40, 80, 120, 160]
    train, test, train_items_list, item_popularity, items_user = SplitData(
        ratings, fold_count, 1)  # last arg: seed
    sim_matrix = ItemCF.ItemSimilarityVersion1(train, item_popularity, items_user)
    metric_names = ['precision', 'recall', 'coverage', 'popularity']
    table = pd.DataFrame(np.zeros([len(k_values), len(metric_names)]),
                         index=k_values, columns=metric_names)
    pool = Pool(4)
    pending = {k: pool.apply_async(Evaluation,
                                   args=(k, train, test, item_popularity,
                                         sim_matrix, top_n, 2))
               for k in k_values}
    pool.close()
    pool.join()  # wait for every worker to finish
    for k, async_result in pending.items():
        table.loc[k, metric_names] += async_result.get()
    print('total time: %.2fs' % (time.time() - t0))
    table.to_excel('Result-ItemCF-K.xlsx', 'ItemCF-K')
    fig, axes = plt.subplots(2, 2)
    panels = [('Precision', 0), ('Recall', 1), ('Coverage', 2), ('Popularity', 3)]
    for ax, (title, col) in zip(axes.flat, panels):
        ax.set_title(title)
        ax.plot(table.iloc[:, col], 'o-', label=title.lower())
    plt.legend()
    plt.show()
def Coverge(train, W, N, K):
    """Coverage: fraction of catalogue items that appear in some top-N list.

    (Function name spelling kept as-is — other code in this file calls it.)

    Args:
        train: training set, user -> {item: rating}.
        W: item similarity matrix.
        N: recommendation list length.
        K: number of similar items used by the recommender.

    Returns:
        len(recommended items) / len(all training items).
    """
    recommended = set()
    catalogue = set()
    for user, user_items in train.items():
        catalogue.update(user_items.keys())
        scores = ItemCF.ItemCF_IUFNormRecommend(train, user, W, K)
        top_n = sorted(scores.items(), key=itemgetter(1), reverse=True)[:N]
        recommended.update(item for item, _ in top_n)
    return len(recommended) / len(catalogue)
def Novelty(train, W, N, K):
    """Mean log-popularity of recommended items (lower means more novel).

    Args:
        train: training set, user -> {item: rating}.
        W: item similarity matrix.
        N: recommendation list length.
        K: number of similar items used by the recommender.

    Returns:
        Sum of log(1 + popularity) over all recommended items, divided by the
        total number of recommendations (= #users * N).
    """
    popularity = PopularityNums(train)
    total = 0.0
    count = 0
    for user in train.keys():
        scores = ItemCF.ItemCF_IUFNormRecommend(train, user, W, K)
        top_n = sorted(scores.items(), key=itemgetter(1), reverse=True)[:N]
        for item, _ in top_n:
            total += math.log(1 + popularity[item])
            count += 1
    return total / count
def main(arg, f): # 数据集按均匀分布划分为M份,M-1份均为训练集,剩下1份为测试集 M = 8 k = 0 seed = 42 # 随机数种子 data = [ tuple(line.split('::')[:2]) for line in open( 'G:/master/python/PycharmProjects/RecommendationSystem/ItemCF/MovieLens/data/ml-1m/ratings.dat' ).readlines() ] # win10上的ml-1m数据集 # data = [tuple(line.split(',')[:2]) for line in open('G:/Recommend/User-CF/MovieLens/ml-latest-small/ratings_test.csv').readlines()] # ml-latest-small数据集 train, test = SplitData(data, M, k, seed) # 基于训练集计算物品相似度 W = ItemCF.ItemSimilarity(train) # 离线指标计算 precision, recall = PrecisionRecall(train, test, W, arg[1], arg[0]) f.write(str(precision)) f.write(',') f.write(str(recall)) f.write(',') coverage = Coverge(train, W, arg[1], arg[0]) f.write(str(coverage)) f.write(',') novelty = Novelty(train, W, arg[1], arg[0]) f.write(str(novelty)) f.write(',') F1 = 2 * precision * recall / (precision + recall) f.write(str(F1)) f.write(',') # print(f'precision:{precision}\trecall:{recall}\tcoverage:{coverage}\tpopularity:{novelty}\tF1:{F1}') '''
def get_recommendation_with_rating(user):
    """Return rating-aware recommendations for *user* from the global ``train``.

    Delegates to ItemCF; the UserCF variant is kept for reference.
    """
    # return UserCF.recommend_with_rating(user, train)
    return ItemCF.recommend_with_rating(user, train)
def get_recommendation(user):
    """Return top-``_n`` recommendations for *user* (ItemCF, ``_item_k`` neighbours).

    Reads the module-level ``train``, ``_n`` and ``_item_k``; the UserCF
    variant is kept for reference.
    """
    # return UserCF.recommend(user, train, _n, _user_k)
    return ItemCF.recommend(user, train, _n, _item_k)
def test_recommend():
    """Compare the original (method 1) vs improved (method 2) recommender.

    Desc:
        Evaluates both recommendation variants over several K values using a
        4-worker process pool, saves the metrics to Excel and plots them.
    """
    file_path = '~/file/rs/dataset/ml-1m/ratings.dat'
    ratings = eva.readData(file_path, '::')
    fold_count = 8   # number of data splits
    top_n = 10       # recommendation list length
    k_values = [5, 10, 20, 40, 80, 120, 160]
    # k_values = [5, 10, 20, 40]
    train, test, train_items_list, item_popularity, items_user = SplitData(
        ratings, fold_count, 0)  # last arg: seed
    # sim_matrix = ItemCF.ItemSimilarityVersion1(train, item_popularity, items_user)
    sim_matrix = ItemCF.ItemSimilarityVersion2(train, item_popularity, items_user)
    columns = [
        'Precision-I', 'Precision-II', 'Recall-I', 'Recall-II',
        'Coverage-I', 'Coverage-II', 'Popularity-I', 'Popularity-II'
    ]
    cols_v1 = ['Precision-I', 'Recall-I', 'Coverage-I', 'Popularity-I']
    cols_v2 = ['Precision-II', 'Recall-II', 'Coverage-II', 'Popularity-II']
    scores = pd.DataFrame(np.zeros([len(k_values), len(columns)]),
                          index=k_values, columns=columns)
    # One async Evaluation per (K, method) pair.
    pool = Pool(4)
    pending_v1 = {}
    pending_v2 = {}
    for k in k_values:
        pending_v1[k] = pool.apply_async(
            Evaluation, args=(k, train, test, item_popularity, sim_matrix, top_n, 1))
        pending_v2[k] = pool.apply_async(
            Evaluation, args=(k, train, test, item_popularity, sim_matrix, top_n, 2))
    pool.close()
    pool.join()  # wait for every worker to finish
    for k, res in pending_v1.items():
        scores.loc[k, cols_v1] += res.get()
    for k, res in pending_v2.items():
        scores.loc[k, cols_v2] += res.get()
    scores.to_excel('Result-ItemCF-Recommend-K.xlsx', 'ItemCF-K')
    fig, axes = plt.subplots(2, 2)
    panels = [('Precision', 0), ('Recall', 2), ('Coverage', 4), ('Popularity', 6)]
    for ax, (title, start_col) in zip(axes.flat, panels):
        ax.set_title(title)
        scores.iloc[:, start_col:start_col + 2].plot(ax=ax, style=['o-', 'o-'])
    plt.legend()
    plt.show()
def TestItemCF_Norm():
    """Compare plain ItemCF against ItemCF with normalized similarity.

    Desc:
        Builds both similarity matrices, evaluates each over several K values
        in a 4-worker process pool, saves the metrics to Excel and plots them.
    """
    file_path = '~/file/rs/dataset/ml-1m/ratings.dat'
    t0 = time.time()
    ratings = eva.readData(file_path, '::')
    fold_count = 8   # number of data splits
    top_n = 10       # recommendation list length
    k_values = [5, 10, 20, 40, 80, 120, 160]
    train, test, train_items_list, item_popularity, items_user = SplitData(
        ratings, fold_count, 0)  # last arg: seed
    sim_plain = ItemCF.ItemSimilarityVersion1(train, item_popularity, items_user)
    sim_norm = ItemCF.ItemSimilarityNorm(train, item_popularity, items_user)
    columns = [
        'Precision-ItemCF', 'Precision-Norm', 'Recall-ItemCF', 'Recall-Norm',
        'Coverage-ItemCF', 'Coverage-Norm', 'Popularity-ItemCF', 'Popularity-Norm'
    ]
    cols_plain = ['Precision-ItemCF', 'Recall-ItemCF', 'Coverage-ItemCF', 'Popularity-ItemCF']
    cols_norm = ['Precision-Norm', 'Recall-Norm', 'Coverage-Norm', 'Popularity-Norm']
    scores = pd.DataFrame(np.zeros([len(k_values), len(columns)]),
                          index=k_values, columns=columns)
    # One async Evaluation per (K, similarity-matrix) pair.
    pool = Pool(4)
    pending_plain = {}
    pending_norm = {}
    for k in k_values:
        pending_plain[k] = pool.apply_async(
            Evaluation, args=(k, train, test, item_popularity, sim_plain, top_n))
        pending_norm[k] = pool.apply_async(
            Evaluation, args=(k, train, test, item_popularity, sim_norm, top_n))
    pool.close()
    pool.join()  # wait for every worker to finish
    for k, res in pending_plain.items():
        scores.loc[k, cols_plain] += res.get()
    for k, res in pending_norm.items():
        scores.loc[k, cols_norm] += res.get()
    print('total time: %.2fs' % (time.time() - t0))
    scores.to_excel('Result-ItemCF-Norm-K.xlsx', 'ItemCF-K')
    fig, axes = plt.subplots(2, 2)
    panels = [('Precision', 0), ('Recall', 2), ('Coverage', 4), ('Popularity', 6)]
    for ax, (title, start_col) in zip(axes.flat, panels):
        ax.set_title(title)
        scores.iloc[:, start_col:start_col + 2].plot(ax=ax, style=['o-', 'o-'])
    plt.legend()
    plt.show()