def fillMatrix(F_list, step_list, LearnRating_list, penalty_list, N = 30):
    """Append randomly sampled predicted ratings for every parameter combination.

    For each (F, step, LearnRating, penalty) combination the stored model
    parameters are loaded through ``ReadParameter``. For every user, a
    prediction is computed for each item the user has not rated, and up to
    ``N`` of those predictions are drawn at random and appended to
    ``SVD++/Matrix/<F>-<step>-<LearnRating>-<penalty>/new_ratings.txt`` as
    ``user::item::rating`` lines (rating rounded with ``round``).

    :param F_list: values of ``F`` to iterate over (forwarded to ``ReadParameter``)
    :param step_list: values of ``step`` to iterate over
    :param LearnRating_list: values of ``LearnRating`` to iterate over
    :param penalty_list: values of ``penalty`` to iterate over
    :param N: maximum number of predicted ratings written per user
    :return: None; results are appended to files on disk
    """
    trainDataSet = FO.readDataSet('trainDataSet.txt')
    testDataSet = FO.readDataSet('testDataSet.txt')
    userList = FO.readUserList('data/users.txt')
    itemList = FO.readItemList('data/movies.txt')  # full item list
    dataSet = {**trainDataSet, **testDataSet}
    mu = calMu(dataSet)

    # Group the rated items per user in a single pass instead of rescanning
    # the whole data set for every user and every parameter combination.
    rated_by_user = dict()
    for user_item in dataSet:
        rated_by_user.setdefault(user_item[0], set()).add(user_item[1])

    for F in F_list:
        for step in step_list:
            for LearnRating in LearnRating_list:
                for penalty in penalty_list:
                    print(F, step, LearnRating, penalty)
                    path = 'SVD++/Matrix/' + str(F) + '-' + str(step) + '-' + str(LearnRating) + '-' + str(penalty)
                    os.makedirs(path, exist_ok=True)
                    bu, bi, p, q = ReadParameter(F, step, LearnRating, penalty)
                    for user in userList:
                        print(user)
                        UnRatedList = itemList - rated_by_user.get(user, set())
                        rank = dict()
                        for item in UnRatedList:
                            rating = Predict(user, item, p, q, bu, bi, mu)
                            if math.isnan(float(rating)):
                                continue  # no usable prediction for this item
                            rank[item] = round(rating)
                        if not rank:
                            continue
                        # Sample only from items that actually received a
                        # prediction (NaN results are absent from ``rank``), and
                        # from a list because random.sample rejects sets on
                        # Python 3.11+.
                        candidates = list(rank)
                        # Cap the sample size locally: reassigning N here would
                        # shrink it for every subsequent user as well.
                        sample_size = min(N, len(candidates))
                        chooseList = random.sample(candidates, sample_size)
                        with open(path + '/new_ratings.txt', 'a') as fileObject:
                            for choose in chooseList:
                                fileObject.write(str(user) + '::' + str(choose) + '::' + str(rank[choose]) + '\n')
def calItemSimilarity(trainDataSet):
    '''
    Compute the similarity of every item and write one result file per item.

    :param trainDataSet: training data set; iterating it yields (user, item) pairs
    :return: None. For each item the result of ``ItemSimilarity`` is written to
             'ItemCF/Similarity/<item>.txt' through ``FO.WirteSimilarty``.
    '''
    # Build the user -> rated-items index.
    user_items = {}
    for pair in trainDataSet:
        rater, rated = pair
        user_items.setdefault(rater, set()).add(rated)

    # Load the item list and the per-user rating means.
    itemList = FO.readItemList('data/movies.txt')
    UserRatingMean = FO.ReadRatingMean('ItemCF/UserMean/trainDataSet_mean.txt')

    # Compute and persist the similarities item by item.
    for current in itemList:
        similarity = ItemSimilarity(
            trainDataSet, user_items, itemList, UserRatingMean, current)
        print(current)
        target = 'ItemCF/Similarity/' + str(current) + '.txt'
        FO.WirteSimilarty(target, similarity)
def testSystem(trainDataSet, testDataSet, K=20, N=30, type='implicit'):
    """Evaluate the recommender on every user that appears in the test set.

    For each test user the items found for that user in ``trainDataSet`` are
    passed to ``Recommend``; the ``N`` highest-scored results are compared with
    the user's items in ``testDataSet`` through ``Recall`` and ``Precision``.
    Per-user values are averaged over all test users.

    :param trainDataSet: dict keyed by (user, item) holding training ratings
    :param testDataSet: dict keyed by (user, item) holding test ratings
    :param K: forwarded unchanged to ``Recommend``
    :param N: number of top-scored recommendations kept per user
    :param type: forwarded unchanged to ``Recommend``
    :return: tuple ``(recall, precision, coverage, popularity)``
    """
    # Group the items of each user once, instead of scanning both data sets
    # again for every test user.
    testUserList = set()  # users present in the test set
    test_items_by_user = dict()
    for user_item in testDataSet:
        testUserList.add(user_item[0])
        test_items_by_user.setdefault(user_item[0], []).append(user_item[1])

    train_items_by_user = dict()
    for user_item in trainDataSet:
        train_items_by_user.setdefault(user_item[0], set()).add(user_item[1])

    itemList = FO.readItemList('data/movies.txt')  # full item list
    recall = 0  # running sum of per-user recall
    precision = 0  # running sum of per-user precision
    coverage_item = set()  # passed to Coverage() for every user
    userLove = dict()
    for user in testUserList:
        userLove[user] = train_items_by_user.get(user, set())
        rank = Recommend(trainDataSet, itemList, userLove[user], user, K, type)
        print(rank)

        # Slicing with [:N] keeps everything when fewer than N entries exist,
        # so no separate length check is needed.
        topRank = sorted(rank.items(), key=operator.itemgetter(1), reverse=True)[:N]
        topItemRank = [item_rating[0] for item_rating in topRank]
        User_item_InTest = test_items_by_user.get(user, [])

        # Per-user recall and precision.
        oneRecall = Recall(User_item_InTest, topItemRank)
        onePrecision = Precision(User_item_InTest, topItemRank)
        recall += oneRecall
        precision += onePrecision
        print('recall=%.4f,precision=%.4f' % (oneRecall, onePrecision))

        # Presumably records the recommended items in coverage_item in place;
        # the helper is defined outside this block.
        Coverage(coverage_item, topItemRank)

    # Guard the averages so an empty item list or empty test set yields 0.0
    # instead of ZeroDivisionError.
    coverage = float(len(coverage_item)) / len(itemList) if len(itemList) else 0.0
    popularity = Popularity(trainDataSet, coverage_item)
    user_count = len(testUserList)
    recall = recall / float(user_count) if user_count else 0.0
    precision = precision / float(user_count) if user_count else 0.0
    print('total recall=%.4f,precision=%.4f,coverage=%.4f,popularity=%.4f'%\
          (recall,precision,coverage,popularity))
    return recall, precision, coverage, popularity
def calItemRatingMean(dataSet):
    """Return the mean rating of every item listed in 'data/movies.txt'.

    :param dataSet: dict keyed by (user, item) whose values are ratings
    :return: dict mapping each item of the item list to the mean of its
             ratings in ``dataSet``, or 0 when the item has no rating
    """
    itemList = FO.readItemList('data/movies.txt')  # full item list

    # Collect the ratings of every item in one pass over the data set rather
    # than rescanning the whole data set once per item. Lists keep the original
    # encounter order, so the sums are computed exactly as before.
    ratings_by_item = dict()
    for user_item, value in dataSet.items():
        ratings_by_item.setdefault(user_item[1], []).append(value)

    itemMeanDict = dict()
    for item in itemList:
        print(item)
        itemRating = ratings_by_item.get(item)
        if itemRating:
            itemMeanDict[item] = sum(itemRating) / len(itemRating)
        else:
            itemMeanDict[item] = 0
    return itemMeanDict
def calItemSimilarity(trainDataSet, type='implicit', simMeas=cosSim):
    '''
    Compute item similarities and write them to one file per item.

    :param trainDataSet: training data set; iterating it yields (user, item) pairs
    :param type: how the similarity is computed, 'implicit' or 'explicit'
    :param simMeas: similarity function used in 'explicit' mode; its
                    ``__name__`` is also used as the output sub-directory
    :return: None. Results go to 'ItemCF/ItemSimilarity/implicit/<item>.txt' or
             'ItemCF/ItemSimilarity/explicit/<simMeas name>/<item>.txt'.
    :raises ValueError: if ``type`` is neither 'implicit' nor 'explicit'
    '''
    # Reject unsupported modes up front; otherwise W and filename would be
    # unbound inside the loop and fail with an obscure UnboundLocalError.
    if type not in ('implicit', 'explicit'):
        raise ValueError(
            "type must be 'implicit' or 'explicit', got %r" % (type,))

    # Build the user -> items index.
    users_item = dict()
    for user, item in trainDataSet:
        users_item.setdefault(user, set()).add(item)
    for user, items in users_item.items():
        print(user, ':', items)

    # Load the item list.
    itemList = FO.readItemList('data/movies.txt')

    # Compute and store the similarities of every item.
    for item in itemList:
        print(item)
        if type == 'implicit':
            W = ItemSimilarity_implicit(users_item, item)
            filename = 'ItemCF/ItemSimilarity/implicit/' + str(item) + '.txt'
        else:  # 'explicit'
            W = ItemSimilarity_explicit(trainDataSet, users_item, itemList,
                                        item, simMeas)
            filename = 'ItemCF/ItemSimilarity/explicit/' + str(
                simMeas.__name__) + '/' + str(item) + '.txt'

        # Each output line is the key's elements joined by '::' followed by
        # the similarity value.
        with open(filename, 'w') as fileObject:
            for users, values in W.items():
                for u in users:
                    fileObject.write(str(u) + '::')
                fileObject.write(str(values) + '\n')
W = FO.readItemSimilarity(item, type) # 获取当前用户与其他用户的相似度 topItem = dict( sorted(W.items(), key=operator.itemgetter(1), reverse=True)[:K]) topSimItem = set([items[1] for items, value in topItem.items()]) topLoveItem = userLove & topSimItem if len(topLoveItem) == 0: continue # print(topLoveItem) for LoveItem in topLoveItem: wji = W[item, LoveItem] rui = dataSet[user, LoveItem] rank[item] = rank.get(item, 0) + wji * rui return dict( sorted(rank.items(), key=operator.itemgetter(1), reverse=True)[:N]) if __name__ == '__main__': trainDataSet = FO.readDataSet('ItemCF/trainDataSet.txt') testDataSet = FO.readDataSet('ItemCF/testDataSet.txt') ''' calItemSimilarity(trainDataSet, 'implicit') # 计算隐式相似度 calItemSimilarity(trainDataSet, 'explicit', cosSim) # 以余弦相似度计算显式相似度 calItemSimilarity(trainDataSet, 'explicit', ecludSim) # 以欧氏距离计算显式相似度 ''' itemList = FO.readItemList('data/movies.txt') for i in range(1, 11): rank = predictSystem(trainDataSet, testDataSet, i, itemList) print(rank)