Example 1
import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances

# FileUtil, ratioSplitter, Logger and Evaluate are project-local helpers and are
# assumed to be importable from the surrounding project.


class UserCF:
    DataPath = "../dataset/ml-latest-small/ratings.csv"  # path to the full ratings dataset
    Ratio = 0.8  # fraction of the data used as the training set
    K = 10  # number of nearest-neighbour users
    N = 10  # number of items recommended to each user
    R_train = None  # type:pd.DataFrame # training rating matrix
    R_test = None  # type:pd.DataFrame # test rating matrix
    users = None  # type:np.ndarray # list of all users
    items = None  # type:np.ndarray # list of all items
    W = None  # type:pd.DataFrame # user similarity matrix
    P = None  # type:pd.DataFrame # predicted rating (interest) matrix

    logger = Logger.fileAndConsoleLogger('../result/UserCF/userCF.log')

    def load_data(self):
        """
        1. Read the raw data. ratings is a list of [Rating] records.
        """
        ratings = FileUtil.readRatings(fileName=self.DataPath)

        # split into training and test sets
        train, test = ratioSplitter(ratings, self.Ratio, fixed=True)

        # collect the user and item lists; np.unique removes duplicates and returns them sorted
        users = np.unique([r.uid for r in train])
        items = np.unique([r.iid for r in train])
        self.logger.debug(f"users长度:{len(users)}")
        self.logger.debug(f"items长度:{len(items)}")

        # build the training rating matrix
        R_train = pd.DataFrame(np.zeros((len(users), len(items))), index=users, columns=items)
        for r in train:
            R_train.at[r.uid, r.iid] = 1  # implicit feedback: mark as 1 and ignore the actual rating value
        self.logger.debug(f"训练集评分矩阵:\n{R_train}")

        # build the test rating matrix
        test_users = np.unique([r.uid for r in test])
        test_items = np.unique([r.iid for r in test])
        R_test = pd.DataFrame(np.zeros((len(test_users), len(test_items))), index=test_users, columns=test_items)
        for r in test:
            R_test.at[r.uid, r.iid] = 1  # implicit feedback: mark as 1 and ignore the actual rating value
        self.logger.debug(f"测试集评分矩阵:\n{R_test}")

        self.users = users
        self.items = items
        self.R_train = R_train
        self.R_test = R_test

        return users, items, R_train, R_test

    def calc_user_sim(self):
        """
        2. Compute user similarity. Using sklearn's pairwise_distances here makes the computation much faster.
        """
        W = 1 - pairwise_distances(self.R_train.to_numpy(), metric="cosine")
        W = pd.DataFrame(W, index=self.users, columns=self.users)
        self.logger.debug(f"user相似度矩阵:\n{W}")
        self.W = W
        return W

    def rec(self):
        """
        3. Recommend. Compute the recommendation (interest) matrix P, where P[u][i] is u's interest in item i (the predicted rating of i by u).
        """
        self.logger.info(f"开始推荐,K={self.K}")
        P = pd.DataFrame(np.zeros((len(self.users), len(self.items))), index=self.users, columns=self.items)
        for u in self.users:
            K_Wu = self.W[u].nlargest(self.K + 1).iloc[1:]  # top-K most similar users, dropping u itself (self-similarity = 1)
            for v, wuv in K_Wu.items():
                Rv = self.R_train.loc[v]
                vis = Rv[Rv != 0.0].index.values
                for i in vis:
                    # skip items u has already interacted with in the training set
                    if self.R_train.at[u, i] != 0.0:
                        continue
                    P.at[u, i] += self.W.at[u, v] * self.R_train.at[v, i]

        self.logger.debug(f"预测矩阵:\n{P}")
        self.P = P
        return P

    def evaluate(self):
        """
        4. Evaluate the results
        """
        Ru_Dict = {}
        Tu_Dict = {}
        for u in self.users:
            # the N items recommended to u
            N_Pu = self.P.loc[u].nlargest(self.N).index.values
            # items u interacted with in the test set
            try:  # the test set may not contain every user from the training set
                Tu = self.R_test.loc[u]
            except KeyError:
                continue
            N_Tu = Tu[Tu != 0.0].index.values
            Ru_Dict[u] = set(N_Pu)
            Tu_Dict[u] = set(N_Tu)

        precs = Evaluate.precision(Ru_Dict, Tu_Dict)
        recl = Evaluate.recall(Ru_Dict, Tu_Dict)
        covrg = Evaluate.coverage(Ru_Dict, len(self.items))
        popu = Evaluate.popularity(self.R_train, Ru_Dict)

        self.logger.info(
            f"K={self.K},N={self.N},准确率{precs * 100:.2f}%"
            f",召回率{recl * 100:.2f}%,覆盖率{covrg * 100:.2f}%,流行度{popu}")
        return precs, recl, covrg, popu

    def run(self):
        """
        Run the four steps above; they can also be called individually from outside for step-by-step debugging.
        """
        self.load_data()
        self.calc_user_sim()
        self.rec()
        self.evaluate()
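

# Minimal usage sketch (assumes the project-local helpers FileUtil, ratioSplitter,
# Logger and Evaluate are importable, and that DataPath points at the MovieLens
# ml-latest-small ratings.csv file):
if __name__ == '__main__':
    cf = UserCF()
    cf.run()
    # or run the steps one by one while debugging:
    # cf.load_data(); cf.calc_user_sim(); cf.rec(); cf.evaluate()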