Example #1
def generate_matrix(with_rating=False):
    """
    :param with_rating: whether the training set includes ratings; if True, ratings range from 1 to 5, otherwise they are 0 or 1 (only meaningful for some algorithms)
    """
    # UserCF.user_similarity_cosine(train, iif=False)  # with/without rating
    # UserCF.user_similarity_cosine(train, iif=True)  # with/without rating
    # UserCF.user_similarity_jaccard(train, iif=False)  # with/without rating
    # UserCF.user_similarity_jaccard(train, iif=True)  # with/without rating
    # UserCF.user_similarity_pearson(train, iif=False)  # with rating
    # UserCF.user_similarity_pearson(train, iif=True)  # with rating
    # UserCF.user_similarity_adjusted_cosine(train, iif=False)  # with rating
    # UserCF.user_similarity_adjusted_cosine(train, iif=True)  # with rating
    # UserCF.user_similarity_log_likelihood(train)  # without rating
    ItemCF.item_similarity_cosine(train, norm=False, iuf=False, with_rating=with_rating)  # with/without rating
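The call above delegates to the example's own ItemCF.item_similarity_cosine, whose body is not shown here. As a rough, hedged illustration of what a rating-aware cosine item similarity typically looks like (the function name and the assumption that train maps each user to an {item: rating} dict are mine, not the repo's):

import math
from collections import defaultdict

def cosine_item_similarity(train, with_rating=True):
    """Hedged sketch of a rating-aware cosine item similarity (not the repo's
    ItemCF.item_similarity_cosine). Assumes train maps user -> {item: rating};
    with with_rating=False every rating is treated as 1 (implicit feedback)."""
    dot = defaultdict(lambda: defaultdict(float))  # dot products between item pairs over co-rating users
    norm = defaultdict(float)                      # squared rating norm per item
    for user, items in train.items():
        for i, ri in items.items():
            ri = ri if with_rating else 1.0
            norm[i] += ri * ri
            for j, rj in items.items():
                if i == j:
                    continue
                rj = rj if with_rating else 1.0
                dot[i][j] += ri * rj
    W = defaultdict(dict)
    for i, related in dot.items():
        for j, dij in related.items():
            W[i][j] = dij / math.sqrt(norm[i] * norm[j])
    return W

# toy usage
toy_train = {'u1': {'a': 5, 'b': 3}, 'u2': {'a': 4, 'c': 2}, 'u3': {'b': 1, 'c': 5}}
print(cosine_item_similarity(toy_train, with_rating=True)['a'])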
Example #2
def PrecisionAndRecallAndCoverageAndPopularity(train, test, item_popularity, K, W, N, method):
    """ 计算 准确率,召回率,覆盖率,流行度

    Desc:


    Args:
        train
        test
        item_popularity
        K: 取相似度前K个
        W: 相似度矩阵
        N: TopN
        method: recommend 推荐方法

    Returns:
        precision: 准确率
        recall: 召回率
        coverage: 覆盖率
        popularity: 流行度

    """
    hit = 0
    num_rank = 0
    num_tu = 0
    recommend_items = set()
    all_items = set()
    popularity = 0.0
    for user in train:  # test / train
        if user not in test:
            continue
        all_items = all_items | set(train[user][0])
        tu = test[user][0]
        if method == 1:
            rank = ItemCF.Recommend1(user, train, W, K, N)
        elif method == 2:
            rank = ItemCF.Recommend2(user, train, W, K, N)
        for item, _ in rank:
            if item in tu:
                hit += 1
            popularity += math.log(1 + item_popularity[item])
            recommend_items.add(item)
        num_rank += len(rank)
        num_tu += len(tu)
    print(K, ': ', hit / (num_rank * 1.0), hit / (num_tu * 1.0), len(recommend_items) / (len(all_items) * 1.0), popularity / (num_rank * 1.0))

    return hit / (num_rank * 1.0), hit / (
        num_tu * 1.0), len(recommend_items) / (len(all_items) * 1.0), popularity / (num_rank * 1.0)
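For reference, a minimal self-contained sketch of how these four offline metrics fit together, with a stub recommender standing in for ItemCF.Recommend1/Recommend2 (the offline_metrics name, the set-based train/test layout, and the stub are illustrative assumptions, not the repo's API):

import math

def offline_metrics(train, test, item_popularity, recommend, N):
    """Sketch of the four offline metrics. train/test: dict user -> set of liked
    items; recommend: a callable (user, N) -> list of items."""
    hit = num_rank = num_tu = 0
    popularity = 0.0
    recommend_items, all_items = set(), set()
    for user in train:
        if user not in test:
            continue
        all_items |= set(train[user])
        rank = recommend(user, N)
        for item in rank:
            if item in test[user]:
                hit += 1
            popularity += math.log(1 + item_popularity[item])
            recommend_items.add(item)
        num_rank += len(rank)
        num_tu += len(test[user])
    precision = hit / num_rank                          # hits / number of recommended items
    recall = hit / num_tu                               # hits / number of test items
    coverage = len(recommend_items) / len(all_items)    # fraction of catalog ever recommended
    avg_popularity = popularity / num_rank              # mean log-popularity of recommendations
    return precision, recall, coverage, avg_popularity

# toy usage with a stub recommender that always suggests the same items
toy_train = {'u1': {'a', 'b'}, 'u2': {'a', 'c'}}
toy_test = {'u1': {'c'}, 'u2': {'b'}}
toy_pop = {'a': 2, 'b': 1, 'c': 1}
stub = lambda user, n: ['b', 'c'][:n]
print(offline_metrics(toy_train, toy_test, toy_pop, stub, 2))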
Example #3
def PrecisionRecall(train, test, W, N, K):  # N: size of the top-N recommendation list; K: number of most-similar items used
    hit = 0
    p_all = 0
    r_all = 0

    for user in train.keys():
        tu = test.get(user, {})  # items the user likes in the test set (a dict)

        # if tu:  # since |R(u)| = N is a preset number of items to recommend, there is no need to skip users whose tu is empty
        rank_all = ItemCF.ItemCF_IUFNormRecommend(train, user, W,
                                                  K)  # K is the number of similar items to use
        rank_TopN = sorted(rank_all.items(), key=itemgetter(1),
                           reverse=True)[0:N]
        # top-N items recommended to the user, with the predicted interest score of the user for each item

        # for item, pui in rank_all.items():
        for item, pui in rank_TopN:
            if item in tu:
                hit += 1

        p_all += N
        # p_all += len(rank_all)
        r_all += len(tu)

    return hit / p_all, hit / r_all
Example #4
def evaluate(train_set, test_set):
    print('Evaluating start ...')
    kg_sim = {}
    for line in lf.load_file_all("./data/kg_sim1.txt"):
        temp = line.split("::")
        details = temp[1].replace("{", "").replace("}", "").split(",")
        kg_sim_detail = {}  # rebuild per line so entries are not shared across keys
        for items in details:
            kv = items.split(":")
            kg_sim_detail[int(kv[0])] = kv[1]
        kg_sim[int(temp[0])] = kg_sim_detail  # use an int key consistently

    user_sim_matrix = gg.calc_user_sim(train_set)
    movie_sim_matrix = ic.calc_movie_sim(train_set)
    all_rec_movies = set()

    for i in sim_mix:
        sim_score = i[0]
        sim_kg = i[1]
        for j in range(loop_count):
            # group generate
            group_mem = gg.generateGroup(user_sim_matrix, train_set,
                                         group_count, group_sim)
            group_rank_kg = ic.kg_ic_predict(movie_sim_matrix, train_set,
                                             group_mem, kg_sim, sim_score,
                                             sim_kg)

            g_rating_kg_lm = gsp.least_strategy(group_rank_kg, N)
            precision_kg_lm, recall_kg_lm, sim_score, sim_kg = ev_detail_kg(
                group_mem, test_set, g_rating_kg_lm, all_rec_movies, sim_score,
                sim_kg)

            g_rating_kg_avg = gsp.avg_strategy(group_rank_kg, N)
            precision_kg_avg, recall_kg_avg, sim_score, sim_kg = ev_detail_kg(
                group_mem, test_set, g_rating_kg_avg, all_rec_movies,
                sim_score, sim_kg)

    print("执行完毕!")
Example #5
def PrecisionAndRecallAndCoverageAndPopularity(train, test, item_popularity, K, W, N, method=1):
    hit = 0
    num_rank = 0
    num_tu = 0
    recommend_items = set()
    all_items = set()
    popularity = 0.0
    if method == 1:
        for user in train:  # test / train
            if user not in test:
                continue
            all_items = all_items | set(train[user][0])
            tu = test[user][0]
            rank = UserCF.Recommend(user, train, W, K, N)
            recommend_items = recommend_items | set(rank)
            #  hit += len(np.intersect1d(rank, tu))
            for item in rank:
                if item in tu:
                    hit += 1
                popularity += math.log(1 + item_popularity[item])
            #  for item, value in rank.items():
            #  if item in tu:
            #  hit += 1
            num_rank += len(rank)
            num_tu += len(tu)
        #  print('Hit: ', hit)
        #  print('Rank num: ', num_rank)
        #  print('Test user\'s item num:', num_tu)
        #  print(len(all_items), len(recommend_items))
        return hit / (num_rank * 1.0), hit / (
            num_tu * 1.0), len(recommend_items) / (len(all_items) * 1.0), popularity / (num_rank * 1.0)
    elif method == 2:
        for user in train:  # test / train
            if user not in test:
                continue
            all_items = all_items | set(train[user][0])
            tu = test[user][0]
            rank = ItemCF.Recommend(user, train, W, K, N)
            for item, _ in rank:
                if item in tu:
                    hit += 1
                popularity += math.log(1 + item_popularity[item])
                recommend_items.add(item)
            num_rank += len(rank)
            num_tu += len(tu)
        #  print('Hit: ', hit)
        #  print('Rank num: ', num_rank)
        #  print('Test user\'s item num:', num_tu)
        #  print(len(all_items), len(recommend_items))
        return hit / (num_rank * 1.0), hit / (
            num_tu * 1.0), len(recommend_items) / (len(all_items) * 1.0), popularity / (num_rank * 1.0)
Example #6
def main():
    # Read the dataset
    file_path = '~/file/rs/dataset/ml-1m/ratings.dat'
    d_file = eva.readData(file_path, '::')
    M = 8  # number of folds
    N = 10  # number of recommendations (top-N)
    K = [5, 10, 20, 40, 80, 120, 160]
    #  seeds = np.arange(M)
    seeds = [0]
    columns_list = [
        'PrecisionUserCF', 'PrecisionItemCF', 'RecallUserCF', 'RecallItemCF',
        'CoverageUserCF', 'CoverageItemCF', 'PopularityUserCF',
        'PopularityItemCF'
    ]
    userCF_columns = [
        'PrecisionUserCF', 'RecallUserCF', 'CoverageUserCF', 'PopularityUserCF'
    ]
    itemCF_columns = [
        'PrecisionItemCF', 'RecallItemCF', 'CoverageItemCF', 'PopularityItemCF'
    ]
    d = pd.DataFrame(np.zeros([len(K), len(columns_list)]),
                     index=K,
                     columns=columns_list)
    for index, seed in enumerate(seeds):
        train, test = eva.SplitData(d_file, M, seed)
        item_popularity = eva.ItemsPopularity(d_file, M, seed)
        W_user = UserCF.UserSimilarityVersion3(train)
        W_item = ItemCF.ItemSimilarityVersion2(train)
        for k in K:
            precision, recall, coverage, popularity = eva.PrecisionAndRecallAndCoverageAndPopularity(
                train, test, item_popularity, k, W_user, N, 1)
            d.loc[k,
                  userCF_columns] += [precision, recall, coverage, popularity]
            precision, recall, coverage, popularity = eva.PrecisionAndRecallAndCoverageAndPopularity(
                train, test, item_popularity, k, W_item, N, 2)
            d.loc[k,
                  itemCF_columns] += [precision, recall, coverage, popularity]
    d /= len(seeds)  # average the accumulated metrics over all seeds

    d.to_excel('Result-UserCF-ItemCF-K.xlsx', 'UserCF-ItemCF-K')
    fig, axes = plt.subplots(2, 2)
    axes[0][0].set_title('Precision')
    d.iloc[:, 0:2].plot(ax=axes[0][0], style=['o-', 'o-'])
    axes[0][1].set_title('Recall')
    d.iloc[:, 2:4].plot(ax=axes[0][1], style=['o-', 'o-'])
    axes[1][0].set_title('Coverage')
    d.iloc[:, 4:6].plot(ax=axes[1][0], style=['o-', 'o-'])
    axes[1][1].set_title('Popularity')
    d.iloc[:, 6:8].plot(ax=axes[1][1], style=['o-', 'o-'])

    plt.show()
Example #7
def TestItemCF():
    """ 测试 ItemCF 算法

    Desc:


    Args:


    Returns:


    """
    start = time.time()
    path = '~/file/rs/dataset/ml-1m/ratingsTest.dat'
    # the ratings file has no header row, and a multi-character separator needs the python parser engine
    d_file = pd.read_csv(path, sep='::', header=None, engine='python', usecols=[0, 1, 2])
    M = 8  # number of folds
    N = 10  # number of recommendations (top-N)
    K = [5, 10, 20, 40, 80, 120, 160]
    train, test, train_items_list, item_popularity, items_user = SplitData(d_file, M, 1)  # third argument: random seed
    w_item = ItemCF.ItemSimilarityVersion1(train, item_popularity, items_user)
    columns_list = [
        'precision', 'recall', 'coverage', 'popularity'
    ]
    d = pd.DataFrame(
        np.zeros([len(K), len(columns_list)]), index=K, columns=columns_list)
    p = Pool(4)
    result = dict()
    for k in K:
        result[k] = p.apply_async(Evaluation, args=(k, train, test, item_popularity, w_item, N, 2))
    p.close()
    p.join()  # wait for all worker processes to finish
    for k, v in result.items():
        d.loc[k, columns_list] += v.get()
    end = time.time()
    print('total time: %.2fs' % (end - start))

    d.to_excel('Result-ItemCF-K.xlsx', 'ItemCF-K')
    fig, axes = plt.subplots(2, 2)
    axes[0][0].set_title('Precision')
    axes[0][0].plot(d.iloc[:, 0], 'o-', label='precision')
    axes[0][1].set_title('Recall')
    axes[0][1].plot(d.iloc[:, 1], 'o-', label='recall')
    axes[1][0].set_title('Coverage')
    axes[1][0].plot(d.iloc[:, 2], 'o-', label='coverage')
    axes[1][1].set_title('Popularity')
    axes[1][1].plot(d.iloc[:, 3], 'o-', label='popularity')
    plt.legend()
    plt.show()
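The parallel evaluation above uses the standard multiprocessing.Pool.apply_async pattern. A minimal self-contained sketch of that pattern follows, with a stub Evaluation function returning placeholder metrics (the stub is an assumption, not the repo's Evaluation):

from multiprocessing import Pool

def Evaluation(k):
    # stand-in for the real Evaluation(k, train, test, ...); returns dummy metrics
    return [1.0 / k, 2.0 / k, 3.0 / k, 4.0 / k]

if __name__ == '__main__':
    K = [5, 10, 20, 40]
    p = Pool(4)
    result = {k: p.apply_async(Evaluation, args=(k,)) for k in K}
    p.close()
    p.join()  # wait for all worker processes to finish
    for k, v in result.items():
        print(k, v.get())  # .get() returns the list produced by Evaluation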
Example #8
def Coverge(train, W, N, K):
    recommend_items = set()
    all_items = set()

    for user in train.keys():
        for item in train[user].keys():
            all_items.add(item)

        rank_all = ItemCF.ItemCF_IUFNormRecommend(train, user, W, K)
        rank_TopN = sorted(rank_all.items(), key=itemgetter(1),
                           reverse=True)[0:N]

        # for item, pui in rank_all.items():
        for item, pui in rank_TopN:
            recommend_items.add(item)

    return len(recommend_items) / len(all_items)
Example #9
def Novelty(train, W, N, K):
    item_popularity = PopularityNums(train)

    ret = 0
    n = 0  # n = number of users * N (total number of items across all top-N recommendation lists)

    for user in train.keys():
        rank_all = ItemCF.ItemCF_IUFNormRecommend(train, user, W, K)
        rank_TopN = sorted(rank_all.items(), key=itemgetter(1),
                           reverse=True)[0:N]

        # for item, pui in rank_all.items():
        for item, pui in rank_TopN:
            ret += math.log(1 + item_popularity[item])
            n += 1

    return ret / n
Example #10
def main(arg, f):
    # Split the dataset uniformly into M parts: M-1 parts form the training set, the remaining part is the test set
    M = 8
    k = 0
    seed = 42  # random seed

    data = [
        tuple(line.split('::')[:2]) for line in open(
            'G:/master/python/PycharmProjects/RecommendationSystem/ItemCF/MovieLens/data/ml-1m/ratings.dat'
        ).readlines()
    ]  # ml-1m dataset on a Windows 10 machine
    # data = [tuple(line.split(',')[:2]) for line in open('G:/Recommend/User-CF/MovieLens/ml-latest-small/ratings_test.csv').readlines()]  # ml-latest-small dataset

    train, test = SplitData(data, M, k, seed)

    # Compute item similarity from the training set
    W = ItemCF.ItemSimilarity(train)

    # Compute offline metrics
    precision, recall = PrecisionRecall(train, test, W, arg[1], arg[0])
    f.write(str(precision))
    f.write(',')
    f.write(str(recall))
    f.write(',')
    coverage = Coverge(train, W, arg[1], arg[0])
    f.write(str(coverage))
    f.write(',')
    novelty = Novelty(train, W, arg[1], arg[0])
    f.write(str(novelty))
    f.write(',')
    F1 = 2 * precision * recall / (precision + recall)
    f.write(str(F1))
    f.write(',')

    # print(f'precision:{precision}\trecall:{recall}\tcoverage:{coverage}\tpopularity:{novelty}\tF1:{F1}')
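main() relies on SplitData(data, M, k, seed), whose body is not shown; the comment at the top of the function describes an M-way uniform split. A minimal sketch of such a split under that description (the split_data name and body are an assumption, not the repo's SplitData):

import random

def split_data(data, M, k, seed):
    """Uniformly assign each (user, item) record to one of M buckets; bucket k
    becomes the test set and the other M-1 buckets the training set."""
    random.seed(seed)
    train, test = [], []
    for record in data:
        if random.randint(0, M - 1) == k:
            test.append(record)
        else:
            train.append(record)
    return train, test

# toy usage
records = [('u1', 'a'), ('u1', 'b'), ('u2', 'a'), ('u2', 'c')]
tr, te = split_data(records, M=4, k=0, seed=42)
print(len(tr), len(te))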
Example #11
def get_recommendation_with_rating(user):
    # return UserCF.recommend_with_rating(user, train)
    return ItemCF.recommend_with_rating(user, train)
Example #12
def get_recommendation(user):
    # return UserCF.recommend(user, train, _n, _user_k)
    return ItemCF.recommend(user, train, _n, _item_k)
Example #13
def test_recommend():
    """ 对比原版推荐算法和改进后的推荐算法

    Desc:


    Args:


    Returns:


    """
    file_path = '~/file/rs/dataset/ml-1m/ratings.dat'
    d_file = eva.readData(file_path, '::')
    M = 8  # number of folds
    N = 10  # number of recommendations (top-N)
    K = [5, 10, 20, 40, 80, 120, 160]
    #  K = [5, 10, 20, 40]
    train, test, train_items_list, item_popularity, items_user = SplitData(d_file, M, 0)  # 0: seed
    #  W = ItemCF.ItemSimilarityVersion1(train, item_popularity, items_user)
    W = ItemCF.ItemSimilarityVersion2(train, item_popularity, items_user)
    columns_list = [
        'Precision-I', 'Precision-II', 'Recall-I', 'Recall-II',
        'Coverage-I', 'Coverage-II', 'Popularity-I',
        'Popularity-II'
    ]
    I_columns = [
        'Precision-I', 'Recall-I', 'Coverage-I', 'Popularity-I'
    ]
    II_columns = [
        'Precision-II', 'Recall-II', 'Coverage-II', 'Popularity-II'
    ]
    d = pd.DataFrame(
        np.zeros([len(K), len(columns_list)]), index=K, columns=columns_list)

    # ItemCF
    p = Pool(4)
    resultI = dict()
    resultII = dict()
    for k in K:
        resultI[k] = p.apply_async(Evaluation, args=(k, train, test, item_popularity, W, N, 1))
        resultII[k] = p.apply_async(Evaluation, args=(k, train, test, item_popularity, W, N, 2))
    p.close()
    p.join()  # wait for all worker processes to finish
    for k, v in resultI.items():
        d.loc[k, I_columns] += v.get()
    for k, v in resultII.items():
        d.loc[k, II_columns] += v.get()

    d.to_excel('Result-ItemCF-Recommend-K.xlsx', 'ItemCF-K')
    fig, axes = plt.subplots(2, 2)
    axes[0][0].set_title('Precision')
    d.iloc[:, 0:2].plot(ax=axes[0][0], style=['o-', 'o-'])
    axes[0][1].set_title('Recall')
    d.iloc[:, 2:4].plot(ax=axes[0][1], style=['o-', 'o-'])
    axes[1][0].set_title('Coverage')
    d.iloc[:, 4:6].plot(ax=axes[1][0], style=['o-', 'o-'])
    axes[1][1].set_title('Popularity')
    d.iloc[:, 6:8].plot(ax=axes[1][1], style=['o-', 'o-'])
    plt.legend()
    plt.show()
Example #14
def TestItemCF_Norm():
    """ 对比 ItemCF 和 ItemCF-Norm

    Desc:


    Args:


    Returns:


    """
    file_path = '~/file/rs/dataset/ml-1m/ratings.dat'
    start = time.time()
    d_file = eva.readData(file_path, '::')
    M = 8  # number of folds
    N = 10  # number of recommendations (top-N)
    K = [5, 10, 20, 40, 80, 120, 160]
    train, test, train_items_list, item_popularity, items_user = SplitData(d_file, M, 0)  # 0: seed
    W_ItemCF = ItemCF.ItemSimilarityVersion1(train, item_popularity, items_user)
    W_Norm = ItemCF.ItemSimilarityNorm(train, item_popularity, items_user)
    columns_list = [
        'Precision-ItemCF', 'Precision-Norm', 'Recall-ItemCF', 'Recall-Norm',
        'Coverage-ItemCF', 'Coverage-Norm', 'Popularity-ItemCF',
        'Popularity-Norm'
    ]
    I_columns = [
        'Precision-ItemCF', 'Recall-ItemCF', 'Coverage-ItemCF', 'Popularity-ItemCF'
    ]
    II_columns = [
        'Precision-Norm', 'Recall-Norm', 'Coverage-Norm', 'Popularity-Norm'
    ]
    d = pd.DataFrame(
        np.zeros([len(K), len(columns_list)]), index=K, columns=columns_list)

    # ItemCF
    p = Pool(4)
    resultItemCF = dict()
    resultNorm = dict()
    for k in K:
        resultItemCF[k] = p.apply_async(Evaluation, args=(k, train, test, item_popularity, W_ItemCF, N))
        resultNorm[k] = p.apply_async(Evaluation, args=(k, train, test, item_popularity, W_Norm, N))
    p.close()
    p.join()  # wait for all worker processes to finish
    for k, v in resultItemCF.items():
        d.loc[k, I_columns] += v.get()
    for k, v in resultNorm.items():
        d.loc[k, II_columns] += v.get()

    end = time.time()
    print('total time: %.2fs' % (end - start))

    d.to_excel('Result-ItemCF-Norm-K.xlsx', 'ItemCF-K')
    fig, axes = plt.subplots(2, 2)
    axes[0][0].set_title('Precision')
    d.iloc[:, 0:2].plot(ax=axes[0][0], style=['o-', 'o-'])
    axes[0][1].set_title('Recall')
    d.iloc[:, 2:4].plot(ax=axes[0][1], style=['o-', 'o-'])
    axes[1][0].set_title('Coverage')
    d.iloc[:, 4:6].plot(ax=axes[1][0], style=['o-', 'o-'])
    axes[1][1].set_title('Popularity')
    d.iloc[:, 6:8].plot(ax=axes[1][1], style=['o-', 'o-'])
    plt.legend()
    plt.show()