def main():
    """Run an offline evaluation of the UserCF recommender on the ml-1m data.

    Reads the first two ``::``-separated fields of every line of the ratings
    file, splits the records into a training and a test set with
    ``SplitData``, builds the user-similarity data from the training set and
    prints precision, recall, coverage, popularity and F1 on one line.
    """
    # The data set is split uniformly into M parts: M-1 parts form the
    # training set and the remaining part is the test set.
    M = 8
    k = 0  # presumably the index of the part used as the test set; confirm in SplitData
    seed = 42  # random seed

    top_n = 30  # N: length of the TopN recommendation list
    neighbours = 90  # K: number of similar users taken into account

    # ml-1m data set on a Windows 10 machine.
    ratings_path = 'G:/master/python/PycharmProjects/RecommendationSystem/UserCF/MovieLens/data/ml-1m/ratings.dat'
    # Alternative (ml-latest-small, comma separated):
    # 'G:/Recommend/User-CF/MovieLens/ml-latest-small/ratings_test.csv'

    # Use a context manager so the file handle is closed deterministically.
    with open(ratings_path) as ratings_file:
        data = [tuple(line.split('::')[:2]) for line in ratings_file]

    train, test = SplitData(data, M, k, seed)

    # Compute the user-interest similarity from the training set.
    W = UserCF.UserSimilarity(train)

    # Offline metrics.
    precision, recall = PrecisionRecall(train, test, W, top_n, neighbours)
    coverage = Coverge(train, W, top_n, neighbours)
    novelty = Novelty(train, W, top_n, neighbours)

    # F1 is undefined when precision and recall are both zero; report 0.0
    # instead of raising ZeroDivisionError.
    pr_sum = precision + recall
    F1 = 2 * precision * recall / pr_sum if pr_sum else 0.0
    print(
        f'precision:{precision}\trecall:{recall}\tcoverage:{coverage}\tpopularity:{novelty}\tF1:{F1}'
    )
    '''
def PrecisionRecall(train, test, W, N, K):
    """Compute precision and recall of the TopN UserCF recommendations.

    Args:
        train: mapping of user -> that user's training items; a
            recommendation list is built for every user in it.
        test: mapping of user -> collection of items the user liked in the
            test set. Users missing from ``test`` count as having no items.
        W: user-similarity data, passed through unchanged to
            ``UserCF.UserCF_IIFRecommend``.
        N: length of the recommendation list (TopN).
        K: number of similar users, passed through to the recommender.

    Returns:
        ``(precision, recall)``. A component is ``0.0`` when its denominator
        is zero (no users / ``N == 0`` for precision, empty test set for
        recall) instead of raising ``ZeroDivisionError``.
    """
    hit = 0
    p_all = 0
    r_all = 0

    for user in train:
        # Items the user liked in the test set (empty if the user is absent).
        tu = test.get(user, {})

        # Scores for every item touched by the similar users, then the N
        # highest-scored (item, predicted interest) pairs.
        rank_all = UserCF.UserCF_IIFRecommend(user, train, W, K)
        rank_TopN = sorted(rank_all.items(), key=itemgetter(1),
                           reverse=True)[:N]

        hit += sum(1 for item, _ in rank_TopN if item in tu)

        # |R(u)| is counted as N by design, even when fewer than N items
        # were actually recommended.
        p_all += N
        r_all += len(tu)

    precision = hit / p_all if p_all else 0.0
    recall = hit / r_all if r_all else 0.0
    return precision, recall
Esempio n. 3
0
def TestUserCFMult():
    """Evaluate UserCF for several neighbourhood sizes, one process per K.

    For every seed the data is split, the user similarity is computed once,
    and one ``Evaluation`` worker process is started per value of ``K``. A
    separate ``WriteIntoD`` process consumes the shared queue. The elapsed
    wall-clock time is printed once all workers have finished.
    """
    start = time.time()
    file_path = '~/file/rs/dataset/ml-1m/ratings.dat'
    d_file = eva.readData(file_path, '::')
    M = 8  # number of groups
    N = 10  # number of recommended items
    K = [5, 10, 20, 40, 80, 120]
    #  K = [40, 80]
    #  seeds = np.arange(M)
    seeds = [0]
    columns_list = ['Precision', 'Recall', 'Coverage', 'Popularity']
    d = pd.DataFrame(np.zeros([len(K), len(columns_list)]),
                     index=K,
                     columns=columns_list)
    for seed in seeds:
        train, test = eva.SplitData(d_file, M, seed)
        item_popularity = eva.ItemsPopularity(d_file, M, seed)
        W_user = UserCF.UserSimilarityVersion3(train)
        q = Queue()

        # Start one producer per K and remember all of them so that every
        # worker (not only the last one started) can be joined below.
        workers = []
        for k in K:
            pw = Process(target=Evaluation,
                         args=(q, k, train, test, item_popularity, W_user, N))
            pw.start()  # start the producer
            workers.append(pw)

        # NOTE(review): ``d`` is handed to another process; changes made to
        # it there are presumably not visible in this process -- confirm how
        # WriteIntoD persists its results.
        pr = Process(target=WriteIntoD, args=(q, d))
        pr.start()  # start the consumer

        for pw in workers:
            pw.join()  # wait for every producer to finish
        end = time.time()
        print('Total Time: %.2fs' % (end - start))
        # NOTE(review): this blocks until WriteIntoD returns; whether it
        # terminates on its own depends on WriteIntoD (not shown here).
        pr.join()
Esempio n. 4
0
def TestUserCFIIF():
    """Compare two user-similarity variants over several values of K.

    Variant "I" uses ``UserCF.UserSimilarityVersion2`` and variant "II" uses
    ``UserCF.UserSimilarityVersion3``. For every seed and every entry of
    ``K`` the four metrics of both variants are accumulated into a
    DataFrame, averaged over the seeds, written to
    ``Result-UserCF-I-II.xlsx`` and plotted on a 2x2 grid.
    """
    file_path = '~/file/rs/dataset/ml-1m/ratings.dat'
    d_file = eva.readData(file_path, '::')
    M = 8  # number of groups
    N = 10  # number of recommended items
    #  K = [5, 10, 20]
    # NOTE(review): K is a list of lists here, unlike the flat lists in the
    # commented alternatives; each inner list is used both as a row label
    # and as the K argument of the evaluation -- confirm this is intended.
    K = [[5, 40, 120], [10, 20, 80]]
    #  K = [5, 10, 20, 40, 80, 120, 160]
    #  seeds = np.arange(M)
    seeds = [0]
    I_columns = ['Precision-I', 'Recall-I', 'Coverage-I', 'Popularity-I']
    II_columns = ['Precision-II', 'Recall-II', 'Coverage-II', 'Popularity-II']
    columns_list = [
        'Precision-I', 'Precision-II', 'Recall-I', 'Recall-II', 'Coverage-I',
        'Coverage-II', 'Popularity-I', 'Popularity-II'
    ]
    d = pd.DataFrame(np.zeros([len(K), len(columns_list)]),
                     index=K,
                     columns=columns_list)
    for seed in seeds:
        train, test = eva.SplitData(d_file, M, seed)
        item_popularity = eva.ItemsPopularity(d_file, M, seed)
        W_userI = UserCF.UserSimilarityVersion2(train)
        W_userII = UserCF.UserSimilarityVersion3(train)
        for k in K:
            for W_user, columns in ((W_userI, I_columns),
                                    (W_userII, II_columns)):
                precision, recall, coverage, popularity = eva.PrecisionAndRecallAndCoverageAndPopularity(
                    train, test, item_popularity, k, W_user, N, 1)
                d.loc[k, columns] += [precision, recall, coverage, popularity]

    # Average the accumulated sums over all seeds. This is done once, for
    # every row, after the loop; dividing a single row inside the seed loop
    # would rescale only the last K and compound the division per seed.
    if len(seeds) > 1:
        d /= len(seeds)

    d.to_excel('Result-UserCF-I-II.xlsx', sheet_name='UserCF-I-II-K')
    fig, axes = plt.subplots(2, 2)
    # Each metric occupies two adjacent columns (variant I, variant II).
    axes[0][0].set_title('Precision')
    d.iloc[:, 0:2].plot(ax=axes[0][0], style=['o-', 'o-'])
    axes[0][1].set_title('Recall')
    d.iloc[:, 2:4].plot(ax=axes[0][1], style=['o-', 'o-'])
    axes[1][0].set_title('Coverage')
    d.iloc[:, 4:6].plot(ax=axes[1][0], style=['o-', 'o-'])
    axes[1][1].set_title('Popularity')
    d.iloc[:, 6:8].plot(ax=axes[1][1], style=['o-', 'o-'])

    plt.show()
Esempio n. 5
0
def PrecisionAndRecallAndCoverageAndPopularity(train, test, item_popularity, K, W, N, method=1):
    """Compute precision, recall, coverage and average popularity in one pass.

    Only users present in both ``train`` and ``test`` are evaluated.

    Args:
        train: mapping of user -> indexable record; element ``[0]`` is used
            as the user's training items.
        test: mapping of user -> indexable record; element ``[0]`` is used
            as the user's test items.
        item_popularity: mapping of item -> popularity count; it is looked
            up for every recommended item.
        K: neighbourhood size, passed through to the recommender.
        W: similarity data, passed through to the recommender.
        N: number of recommendations, passed through to the recommender.
        method: ``1`` uses ``UserCF.Recommend`` (result iterated as items),
            ``2`` uses ``ItemCF.Recommend`` (result iterated as
            ``(item, score)`` pairs).

    Returns:
        ``(precision, recall, coverage, popularity)`` as floats. A metric
        whose denominator is zero is reported as ``0.0``.

    Raises:
        ValueError: if ``method`` is neither 1 nor 2 (previously the
            function silently returned ``None``).
    """
    if method not in (1, 2):
        raise ValueError('method must be 1 (UserCF) or 2 (ItemCF), got %r' % (method,))

    def ratio(numerator, denominator):
        # Float division that reports 0.0 instead of raising on a zero denominator.
        return numerator / float(denominator) if denominator else 0.0

    hit = 0
    num_rank = 0
    num_tu = 0
    recommend_items = set()
    all_items = set()
    popularity = 0.0

    for user in train:
        if user not in test:
            continue
        all_items.update(train[user][0])
        tu = test[user][0]

        # Normalise both recommender outputs to a plain list of items.
        if method == 1:
            items = list(UserCF.Recommend(user, train, W, K, N))
        else:
            items = [item for item, _ in ItemCF.Recommend(user, train, W, K, N)]

        recommend_items.update(items)
        for item in items:
            if item in tu:
                hit += 1
            popularity += math.log(1 + item_popularity[item])
        num_rank += len(items)
        num_tu += len(tu)

    return (ratio(hit, num_rank), ratio(hit, num_tu),
            ratio(len(recommend_items), len(all_items)),
            ratio(popularity, num_rank))
def Coverge(train, W, N, K):
    """Compute the coverage of the TopN UserCF recommendations.

    Coverage is the number of distinct items recommended to any user divided
    by the number of distinct items in the training set.

    Args:
        train: mapping of user -> mapping keyed by item.
        W: user-similarity data, passed through unchanged to
            ``UserCF.UserCF_IIFRecommend``.
        N: length of the recommendation list (TopN).
        K: number of similar users, passed through to the recommender.

    Returns:
        The coverage ratio, or ``0.0`` when the training set contains no
        items (instead of raising ``ZeroDivisionError``).
    """
    recommend_items = set()
    all_items = set()

    for user in train:
        all_items.update(train[user].keys())

        rank_all = UserCF.UserCF_IIFRecommend(user, train, W, K)
        # Keep the N highest-scored (item, score) pairs.
        rank_TopN = sorted(rank_all.items(), key=itemgetter(1),
                           reverse=True)[:N]
        recommend_items.update(item for item, _ in rank_TopN)

    if not all_items:
        return 0.0
    return len(recommend_items) / len(all_items)
def Novelty(train, W, N, K):
    """Compute the average log-popularity of the TopN recommendations.

    Every recommended item contributes ``log(1 + popularity)``, where the
    popularity comes from ``PopularityNums(train)``; the sum is divided by
    the total number of recommended items.

    Args:
        train: mapping of user -> training items.
        W: user-similarity data, passed through unchanged to
            ``UserCF.UserCF_IIFRecommend``.
        N: length of the recommendation list (TopN).
        K: number of similar users, passed through to the recommender.

    Returns:
        The mean log-popularity, or ``0.0`` when nothing was recommended
        (instead of raising ``ZeroDivisionError``).
    """
    item_popularity = PopularityNums(train)

    ret = 0
    n = 0  # total number of recommended items over all users

    for user in train:
        rank_all = UserCF.UserCF_IIFRecommend(user, train, W, K)
        # Keep the N highest-scored (item, score) pairs.
        rank_TopN = sorted(rank_all.items(), key=itemgetter(1),
                           reverse=True)[:N]

        for item, _ in rank_TopN:
            ret += math.log(1 + item_popularity[item])
        n += len(rank_TopN)

    return ret / n if n else 0.0
Esempio n. 8
0
def TestUserCF():
    """Evaluate UserCF for several neighbourhood sizes K and plot the result.

    For every seed and every ``k`` in ``K`` the four metrics are accumulated
    into a DataFrame indexed by ``K``, averaged over the seeds, written to
    ``Result-UserCF-K.xlsx`` and plotted on a 2x2 grid.
    """
    # Load the data set.
    file_path = '~/file/rs/dataset/ml-1m/ratings.dat'
    d_file = eva.readData(file_path, '::')
    M = 8  # number of groups
    N = 10  # number of recommended items
    #  K = [5, 10]
    K = [5, 10, 20, 40, 80, 120, 160]
    #  seeds = np.arange(M)
    seeds = [0]
    columns_list = ['Precision', 'Recall', 'Coverage', 'Popularity']
    d = pd.DataFrame(np.zeros([len(K), len(columns_list)]),
                     index=K,
                     columns=columns_list)
    for seed in seeds:
        train, test = eva.SplitData(d_file, M, seed)
        item_popularity = eva.ItemsPopularity(d_file, M, seed)
        W_user = UserCF.UserSimilarityVersion3(train)
        for k in K:
            print(k)
            precision, recall, coverage, popularity = eva.PrecisionAndRecallAndCoverageAndPopularity(
                train, test, item_popularity, k, W_user, N, 1)
            d.loc[k, columns_list] += [precision, recall, coverage, popularity]

    # Average the accumulated sums over all seeds. This is done once, for
    # every row, after the loop; dividing a single row inside the seed loop
    # would rescale only the last K and compound the division per seed.
    if len(seeds) > 1:
        d /= len(seeds)

    d.to_excel('Result-UserCF-K.xlsx', sheet_name='UserCF-K')
    fig, axes = plt.subplots(2, 2)
    axes[0][0].set_title('Precision')
    axes[0][0].plot(d.iloc[:, 0], 'o-', label='precision')
    axes[0][1].set_title('Recall')
    axes[0][1].plot(d.iloc[:, 1], 'o-', label='recall')
    axes[1][0].set_title('Coverage')
    axes[1][0].plot(d.iloc[:, 2], 'o-', label='coverage')
    axes[1][1].set_title('Popularity')
    axes[1][1].plot(d.iloc[:, 3], 'o-', label='popularity')
    plt.legend()
    plt.show()
Esempio n. 9
0
            continue

    # for i in UserBehaviorWeight.keys():
    #     print(f'用户{i}的电影评分')
    #     for j in UserBehaviorWeight[i].keys():
    #         print(f'电影{j},评分{UserBehaviorWeight[i][j]}')

    return UserBehaviorWeight


# Small test example
# User = {'1': ['a', 'b', 'd'], '2': ['b', 'c', 'e'], '3': ['c', 'd'], '4': ['b', 'c', 'd'], '5': ['a', 'd']}
# Userlist = ['a', 'b', 'c', 'd', 'e']
# Userbehavoirating = {'1': {'a': '5', 'b': '4', 'd': '2'}, '2': {'b': '2', 'c': '4', 'e': '1'}, '3': {'c': '3', 'd': '4'},
#                 '4': {'b': '1', 'c': '3', 'd': '4'}, '5': {'a': '1', 'd': '5'}}
# testIteMapId = dict()
# for i in range(0, 5):
#     testIteMapId[Userlist[i]] = i
# Matrix = ItemSimilarity(User, testIteMapId)
# Recommend('1', 3, Matrix[0], Matrix[1], User, testIteMapId, Userbehavoirating)

# Module-level script: runs at import time. It loads the ml-latest-small
# ratings and movies files, builds the item-similarity data and requests
# recommendations for user '190'.
# NOTE(review): both paths are absolute, machine-specific Windows paths.
filePath = 'E:\\迅雷下载\\ml-latest-small\\ratings.csv'
movie_path = 'E:\\迅雷下载\\ml-latest-small\\movies.csv'
# loadItemData returns an indexable result: element 0 is kept as the
# id -> name mapping, element 1 is passed on to ItemSimilarity and Recommend
# (presumably an item -> index map; confirm against loadItemData).
data = loadItemData(movie_path)
IdMapingName = data[0]
# The same ratings file is read twice: once for the per-user rating weights
# and once for the plain user-behaviour data.
userbehavior_rating = loadUserWeight(filePath)
userbehavior = UserCF.loadData(filePath)
# ItemSimilarity returns an indexable result whose first two elements are
# forwarded to Recommend.
IteMatrix = ItemSimilarity(userbehavior, data[1])
# NOTE(review): the meaning of the literal 10 here and of 20 below is not
# visible in this section -- confirm against Recommend / PrintRecommendList.
SortList = Recommend('190', 10, IteMatrix[0], IteMatrix[1], userbehavior, data[1], userbehavior_rating)
PrintRecommendList(SortList[0], SortList[1], IdMapingName, 20)
Esempio n. 10
0
        ret[user][item] = rating
    return ret
    
if __name__ == '__main__':
    data = readData()
    numFlod = 5
    precision =0
    recall = 0
    coverage = 0
    popularity =0
    for i in range(0,numFlod):
        [oriTrain,oriTest] = SplitData(data,numFlod,i,0)
        train = transform(oriTrain)
        test = transform(oriTest)
        
        W = UserCF.UserSimilarity(train)
        rank = UserCF.Recommend('1',train,W)
        result = UserCF.Recommendation(test.keys(), train, W)
    
#        W = UserCF_IIF.UserSimilarity(train)
    #    rank = UserCF_IIF.Recommend('1',train,W)
#        result = UserCF_IIF.Recommendation(test.keys(), train, W)
        
    #    W = ItemCF.ItemSimilarity(train)
    #    rank = ItemCF.Recommend('1',train,W)
#        result =  ItemCF_IUF.Recommendation(test.keys(),train, W)
        
#        W = ItemCF_IUF.ItemSimilarity(train)
    #    rank = ItemCF_IUF.Recommend('1',train,W)
#        result =  ItemCF_IUF.Recommendation(test.keys(),train, W)