Ejemplo n.º 1
0
    def build_user_item_sim_CF(self):
        """Build user/item similarity matrices and their k-nearest-neighbour tables.

        For every pair of distinct users in ``self.rg.user`` (and likewise every
        pair of distinct items in ``self.rg.item``) the ``pearson_sp`` similarity
        of their rating rows/columns is computed, rounded to 5 decimals and
        stored in ``self.user_sim`` / ``self.item_sim``.  The
        ``self.config.user_near_num`` / ``self.config.item_near_num``
        highest-scoring neighbours of each entry are then kept in
        ``self.user_k_neibor`` / ``self.item_k_neibor``.  All four structures
        are written with ``util.save_data`` under ``../data/sim`` and
        ``../data/neibor``.
        """
        import os
        from collections import defaultdict

        self.user_sim = SimMatrix()
        self.item_sim = SimMatrix()
        self.user_k_neibor = defaultdict(dict)
        self.item_k_neibor = defaultdict(dict)

        # Create the output folders up front so the saves below also work on a
        # fresh checkout (in case util.save_data does not create them itself).
        os.makedirs('../data/sim', exist_ok=True)
        os.makedirs('../data/neibor', exist_ok=True)

        # compute user-user similarity matrix
        print('constructing user-user similarity matrix...')
        for u1 in self.rg.user:
            for u2 in self.rg.user:
                # skip self-pairs and pairs the matrix already holds
                if u1 == u2 or self.user_sim.contains(u1, u2):
                    continue
                sim = pearson_sp(self.rg.get_row(u1), self.rg.get_row(u2))
                self.user_sim.set(u1, u2, round(sim, 5))
        util.save_data(self.user_sim, '../data/sim/ft_08_uu_tricf_cv0.pkl')

        # compute the k neighbors of user (highest similarity first)
        for user in self.rg.user:
            ranked_users = sorted(self.user_sim[user].items(),
                                  key=lambda x: x[1],
                                  reverse=True)
            self.user_k_neibor[user] = dict(
                ranked_users[:self.config.user_near_num])
        util.save_data(
            self.user_k_neibor, '../data/neibor/ft_08_uu_' +
            str(self.config.user_near_num) + '_neibor_tricf_cv0.pkl')

        # compute item-item similarity matrix
        print('constructing item-item similarity matrix...')
        for i1 in self.rg.item:
            for i2 in self.rg.item:
                if i1 == i2 or self.item_sim.contains(i1, i2):
                    continue
                sim = pearson_sp(self.rg.get_col(i1), self.rg.get_col(i2))
                self.item_sim.set(i1, i2, round(sim, 5))
        util.save_data(self.item_sim, '../data/sim/ft_08_ii_tricf_cv0.pkl')

        # compute the k neighbors of item (highest similarity first)
        for item in self.rg.item:
            ranked_items = sorted(self.item_sim[item].items(),
                                  key=lambda x: x[1],
                                  reverse=True)
            self.item_k_neibor[item] = dict(
                ranked_items[:self.config.item_near_num])
        util.save_data(
            self.item_k_neibor, '../data/neibor/ft_08_ii_' +
            str(self.config.item_near_num) + '_neibor_tricf_cv0.pkl')
Ejemplo n.º 2
0
    def __init__(self):
        """Create the configuration object and empty similarity containers."""
        super(SimBase, self).__init__()

        self.config = ConfigX()
        # Similarity matrices for users and items; both start empty.
        self.user_sim, self.item_sim = SimMatrix(), SimMatrix()
        # Neighbour tables; a missing key yields a fresh empty dict.
        self.user_k_neibor, self.item_k_neibor = (defaultdict(dict),
                                                  defaultdict(dict))
Ejemplo n.º 3
0
class ItemCF(MF):
    """
    Item-based collaborative filtering (ItemCF).

    A rating is predicted from the mean-centred ratings the user gave to the
    items most similar (by ``pearson_sp``) to the target item.

    Sarwar B, Karypis G, Konstan J, et al. Item-based collaborative filtering recommendation algorithms[C]//Proceedings of the 10th international conference on World Wide Web. ACM, 2001: 285-295.
    """
    def __init__(self):
        super(ItemCF, self).__init__()
        # number of most-similar items inspected per prediction
        self.config.n = 50

    def init_model(self, k):
        """Compute the similarity of every test item to every training item.

        Args:
            k: forwarded unchanged to the parent ``init_model``.
        """
        super(ItemCF, self).init_model(k)
        self.item_sim = SimMatrix()

        for i_test in self.rg.testSet_i:
            for i_train in self.rg.item:
                # skip self-pairs and pairs the matrix already holds
                if i_test == i_train or self.item_sim.contains(
                        i_test, i_train):
                    continue
                sim = pearson_sp(self.rg.get_col(i_test),
                                 self.rg.get_col(i_train))
                self.item_sim.set(i_test, i_train, sim)

    def predict(self, u, i):
        """Predict the rating of user ``u`` for item ``i``.

        The ``self.config.n`` items with the highest similarity to ``i`` are
        inspected; those that ``u`` rated in the training set contribute their
        mean-centred rating weighted by similarity.

        Returns:
            ``itemMeans[i]`` plus the weighted deviation.  When no neighbour
            contributes, or the similarity weights cancel out, the item mean
            is returned instead (the global mean if ``i`` is unknown to the
            rating data).
        """
        neighbours = sorted(self.item_sim[i].items(),
                            key=lambda x: x[1],
                            reverse=True)[:self.config.n]

        weighted_sum, denom = 0, 0
        for similar_item, similarity in neighbours:
            if self.rg.containsUserItem(u, similar_item):
                rating = self.rg.trainSet_u[u][similar_item]
                weighted_sum += similarity * (
                    rating - self.rg.itemMeans[similar_item])
                denom += similarity

        # Similarities are signed, so the weights can sum to zero even when
        # the numerator does not; fall back instead of dividing by zero.
        if weighted_sum == 0 or denom == 0:
            if not self.rg.containsItem(i):
                return self.rg.globalMean
            return self.rg.itemMeans[i]
        return self.rg.itemMeans[i] + weighted_sum / float(denom)
Ejemplo n.º 4
0
    def init_model(self):
        """Fill ``self.item_sim`` with cosine similarities of test vs. training items."""
        self.item_sim = SimMatrix()

        for i_test in self.rg.testSet_i:
            for i_train in self.rg.item:
                # Nothing to do for an item paired with itself or for a pair
                # that already has a stored score.
                if i_test == i_train or self.item_sim.contains(
                        i_test, i_train):
                    continue
                col_test = self.rg.get_col(i_test)
                col_train = self.rg.get_col(i_train)
                self.item_sim.set(i_test, i_train,
                                  cosine_sp(col_test, col_train))
Ejemplo n.º 5
0
    def init_model(self):
        """Fill ``self.user_sim`` with Pearson similarities of test vs. training users."""
        self.user_sim = SimMatrix()

        for u_test in self.rg.testSet_u:
            for u_train in self.rg.user:
                if u_test == u_train:
                    continue  # a user is never compared with themselves
                if self.user_sim.contains(u_test, u_train):
                    continue  # pair already scored
                row_test = self.rg.get_row(u_test)
                row_train = self.rg.get_row(u_train)
                self.user_sim.set(u_test, u_train,
                                  pearson_sp(row_test, row_train))
Ejemplo n.º 6
0
    def init_model(self, k):
        """Run the parent initialisation, then score test items against training items.

        ``k`` is passed through to the parent ``init_model`` untouched.
        """
        super(ItemCF, self).init_model(k)
        self.item_sim = SimMatrix()

        for i_test in self.rg.testSet_i:
            for i_train in self.rg.item:
                already_done = (i_test == i_train
                                or self.item_sim.contains(i_test, i_train))
                if already_done:
                    continue
                score = pearson_sp(self.rg.get_col(i_test),
                                   self.rg.get_col(i_train))
                self.item_sim.set(i_test, i_train, score)
Ejemplo n.º 7
0
    def build_user_item_sim_CF(self):
        """Load cached similarity matrices and k-neighbour tables from disk.

        Populates ``self.user_sim``, ``self.user_k_neibor``, ``self.item_sim``
        and ``self.item_k_neibor`` via ``util.load_data``.  The neighbour file
        names embed ``self.config.user_near_num`` / ``self.config.item_near_num``.

        Nothing is recomputed here.  To rebuild the caches, compute
        ``pearson_sp`` (rounded to 5 decimals) for every pair of distinct
        users/items, keep the top-k entries per user/item, and write the
        results with ``util.save_data``.
        """
        from collections import defaultdict

        # Empty containers first; each is replaced by the cached object below.
        self.user_sim = SimMatrix()  # user-user similarity matrix
        self.item_sim = SimMatrix()  # item-item similarity matrix
        self.user_k_neibor = defaultdict(dict)  # k nearest neighbours per user
        self.item_k_neibor = defaultdict(dict)  # k nearest neighbours per item

        neibor_template = '../data/neibor/ft_08_{}_{}_neibor_tricf.pkl'

        # users: similarity matrix, then k nearest neighbours
        self.user_sim = util.load_data('../data/sim/ft_08_uu_tricf.pkl')
        self.user_k_neibor = util.load_data(
            neibor_template.format('uu', self.config.user_near_num))

        # items: similarity matrix, then k nearest neighbours
        self.item_sim = util.load_data('../data/sim/ft_08_ii_tricf.pkl')
        self.item_k_neibor = util.load_data(
            neibor_template.format('ii', self.config.item_near_num))
Ejemplo n.º 8
0
    def init_model(self, k):
        """Initialise the parent model, then score each user against their followees.

        ``k`` is handed to the parent ``init_model`` unchanged.  Similarities
        come from ``self.get_sim`` and are stored in ``self.user_sim``.
        """
        super(SocialReg, self).init_model(k)
        self.user_sim = SimMatrix()
        print('constructing user-user similarity matrix...')

        # A precomputed matrix could be loaded here instead, e.g.
        # self.user_sim = util.load_data('../data/sim/ft_cf_soreg08_cv1.pkl')

        for u in self.rg.user:
            for f in self.tg.get_followees(u):
                # only score pairs that are not stored yet
                if not self.user_sim.contains(u, f):
                    self.user_sim.set(u, f, self.get_sim(u, f))
Ejemplo n.º 9
0
    def __init__(self):
        """Set the walk/embedding parameters and create empty similarity containers."""
        super(SimGe, self).__init__()
        cfg = ConfigX()
        self.config = cfg

        # Parameters of the random walks and of the embedding model.
        cfg.walkCount = 30
        cfg.walkLength = 20
        cfg.walkDim = 20
        cfg.winSize = 5
        cfg.topK = 50

        # Similarity matrices start empty.
        self.user_sim, self.item_sim = SimMatrix(), SimMatrix()
        # Neighbour tables; a missing key yields a fresh empty dict.
        self.user_k_neibor, self.item_k_neibor = (defaultdict(dict),
                                                  defaultdict(dict))
Ejemplo n.º 10
0
    def init_model(self, k):
        """Build the user network, embed it, and derive top-K similarity tables.

        After the parent initialisation (``k`` is forwarded unchanged) this
        calls ``generate_cu_net`` and ``deep_walk``, then fills:

        * ``self.topKSim[user]`` - list of ``(other_user, cosine)`` tuples,
          highest score first, at most ``self.config.topK`` long;
        * ``self.topKSimBy[other_user][user]`` - the same scores indexed the
          other way round (who lists ``other_user`` among their top K).
        """
        super(CUNE, self).init_model(k)
        self.user_sim = SimMatrix()
        self.generate_cu_net()  # build the user-user network
        self.deep_walk()

        print('Constructing similarity matrix...')
        self.topKSim = defaultdict(dict)
        for done, user1 in enumerate(self.CUNet, 1):
            scores = {}
            for user2 in self.CUNet:
                if user1 == user2:
                    continue
                # embeddings are looked up by the node's string id
                emb1 = self.model[str(user1)]
                emb2 = self.model[str(user2)]
                scores[user2] = cosine(emb1, emb2)  # user-user similarity
            ranked = sorted(scores.items(), key=lambda d: d[1], reverse=True)
            self.topKSim[user1] = ranked[:self.config.topK]
            # report progress once every 200 users
            if done % 200 == 0:
                print('progress:', done, '/', len(self.CUNet))

        # Build the reverse index of the top-K lists.
        print('Constructing desimilarity matrix...')
        self.topKSimBy = defaultdict(dict)
        for user, ranked in self.topKSim.items():
            for neighbour, score in ranked:
                self.topKSimBy[neighbour][user] = score
        print('Similarity matrix finished.')
class UserCF(MF):
    """
    User-based collaborative filtering (UserCF).

    A rating is predicted from the mean-centred ratings that the users most
    similar (by ``pearson_sp``) to the target user gave to the item.

    Resnick P, Iacovou N, Suchak M, et al. GroupLens: an open architecture for collaborative filtering of netnews[C]//Proceedings of the 1994 ACM conference on Computer supported cooperative work. ACM, 1994: 175-186.
    """
    def __init__(self):
        super(UserCF, self).__init__()
        # number of most-similar users inspected per prediction
        self.config.n = 10

    def init_model(self, k):
        """Compute the similarity of every test user to every training user.

        Args:
            k: forwarded unchanged to the parent ``init_model``.
        """
        super(UserCF, self).init_model(k)
        self.user_sim = SimMatrix()

        for u_test in self.rg.testSet_u:
            for u_train in self.rg.user:
                # skip self-pairs and pairs the matrix already holds
                if u_test == u_train or self.user_sim.contains(
                        u_test, u_train):
                    continue
                sim = pearson_sp(self.rg.get_row(u_test),
                                 self.rg.get_row(u_train))
                self.user_sim.set(u_test, u_train, sim)

    def predict(self, u, i):
        """Predict the rating of user ``u`` for item ``i``.

        The ``self.config.n`` users with the highest similarity to ``u`` are
        inspected; those who rated ``i`` in the training set contribute their
        mean-centred rating weighted by similarity.

        Returns:
            ``userMeans[u]`` plus the weighted deviation.  When no neighbour
            contributes, or the similarity weights cancel out, the user mean
            is returned instead (the global mean if ``u`` is unknown to the
            rating data).
        """
        neighbours = sorted(self.user_sim[u].items(),
                            key=lambda x: x[1],
                            reverse=True)[:self.config.n]

        weighted_sum, denom = 0, 0
        for similar_user, similarity in neighbours:
            if self.rg.containsUserItem(similar_user, i):
                rating = self.rg.trainSet_u[similar_user][i]
                weighted_sum += similarity * (
                    rating - self.rg.userMeans[similar_user])
                denom += similarity

        # Similarities are signed, so the weights can sum to zero even when
        # the numerator does not; fall back instead of dividing by zero.
        if weighted_sum == 0 or denom == 0:
            if not self.rg.containsUser(u):
                return self.rg.globalMean
            return self.rg.userMeans[u]
        return self.rg.userMeans[u] + weighted_sum / float(denom)
Ejemplo n.º 12
0
 def getSimMatrix(self, sim_func=pearson_sp):
     """Build a similarity matrix over every pair of distinct items in ``self.item``.

     For each pair, ``sim_func`` is applied to the *keys* of the two item
     columns (``self.get_col``); the score is rounded to 5 decimals and only
     non-zero scores are stored.

     NOTE(review): ``sim_func`` receives key views rather than the column
     mappings themselves - confirm the default ``pearson_sp`` accepts that.

     Returns:
         The populated ``SimMatrix``.
     """
     self.log.info(
         "gettting sim matrix with '%s()' ... (will take some time) " %
         sim_func.__name__)
     sim_matrix = SimMatrix()
     for i1 in tqdm(self.item):
         for i2 in self.item:
             # skip self-pairs and pairs that already have a score
             if i1 == i2 or sim_matrix.contains(i1, i2):
                 continue
             col1, col2 = self.get_col(i1), self.get_col(i2)
             score = round(sim_func(col1.keys(), col2.keys()), 5)
             if score != 0:
                 sim_matrix.set(i1, i2, score)
     self.log.info("'%s()' get %s sims " %
                   (sim_func.__name__, sim_matrix.size()))
     return sim_matrix
Ejemplo n.º 13
0
class SocialReg(MF):
    """
    Matrix factorization with social regularization (SocialReg).

    On top of the parent model, each user's latent vector is pulled towards
    the latent vectors of the users they follow and of the users following
    them, weighted by a rating-based similarity.

    Ma H, Zhou D, Liu C, et al. Recommender systems with social regularization[C]//Proceedings of the fourth ACM international conference on Web search and data mining. ACM, 2011: 287-296.
    """
    def __init__(self):
        super(SocialReg, self).__init__()
        # self.config.lambdaP = 0.001
        # self.config.lambdaQ = 0.001
        # weight of the social regularization term
        self.config.alpha = 0.1
        # source of the follower/followee relations
        self.tg = TrustGetter()
        # self.init_model()

    def init_model(self, k):
        """Initialise the parent model and the user-followee similarity matrix.

        ``k`` is forwarded unchanged to the parent ``init_model``.
        """
        super(SocialReg, self).init_model(k)
        from collections import defaultdict
        self.user_sim = SimMatrix()
        print('constructing user-user similarity matrix...')

        # self.user_sim = util.load_data('../data/sim/ft_cf_soreg08_cv1.pkl')

        # score every (user, followee) pair that is not stored yet
        for u in self.rg.user:
            for f in self.tg.get_followees(u):
                if self.user_sim.contains(u, f):
                    continue
                sim = self.get_sim(u, f)
                self.user_sim.set(u, f, sim)

        # util.save_data(self.user_sim,'../data/sim/ft_cf_soreg08.pkl')

    def get_sim(self, u, k):
        """Return the ``pearson_sp`` similarity of users ``u`` and ``k``, shifted and halved."""
        sim = (pearson_sp(self.rg.get_row(u), self.rg.get_row(k)) +
               1.0) / 2.0  # fit the value into range [0.0,1.0]
        return sim

    def train_model(self, k):
        """Train the latent factors with per-rating gradient updates.

        Runs at most ``self.config.maxIter`` passes over ``self.rg.trainSet()``
        and stops early when ``self.isConverged`` reports convergence.
        ``k`` is forwarded unchanged to the parent ``train_model``.
        """
        super(SocialReg, self).train_model(k)
        iteration = 0
        while iteration < self.config.maxIter:
            self.loss = 0
            for index, line in enumerate(self.rg.trainSet()):
                user, item, rating = line
                # map the raw ids to row indices of P / Q
                u = self.rg.user[user]
                i = self.rg.item[item]
                error = rating - self.predict(user, item)
                self.loss += 0.5 * error**2
                # NOTE(review): if self.P / self.Q are NumPy arrays, p and q
                # are views, so the in-place P[u] update below is already
                # visible through p when Q[i] is updated - confirm intended.
                p, q = self.P[u], self.Q[i]

                # pull towards the users this user follows
                social_term_p, social_term_loss = np.zeros(
                    (self.config.factor)), 0.0
                followees = self.tg.get_followees(user)
                for followee in followees:
                    if self.rg.containsUser(followee):
                        s = self.user_sim[user][followee]
                        uf = self.P[self.rg.user[followee]]
                        social_term_p += s * (p - uf)
                        social_term_loss += s * ((p - uf).dot(p - uf))

                # pull towards the users following this user
                # (contributes to the gradient only, not to the loss)
                social_term_m = np.zeros((self.config.factor))
                followers = self.tg.get_followers(user)
                for follower in followers:
                    if self.rg.containsUser(follower):
                        s = self.user_sim[user][follower]
                        ug = self.P[self.rg.user[follower]]
                        social_term_m += s * (p - ug)

                # update latent vectors
                self.P[u] += self.config.lr * (
                    error * q - self.config.alpha *
                    (social_term_p + social_term_m) - self.config.lambdaP * p)
                self.Q[i] += self.config.lr * (error * p -
                                               self.config.lambdaQ * q)

                self.loss += 0.5 * self.config.alpha * social_term_loss

            # regularization penalty on both factor matrices, once per pass
            self.loss += 0.5 * self.config.lambdaP * (self.P * self.P).sum(
            ) + 0.5 * self.config.lambdaQ * (self.Q * self.Q).sum()

            iteration += 1
            if self.isConverged(iteration):
                break
Ejemplo n.º 14
0
class SimGe():
    """Build user/item similarity matrices and k-nearest-neighbour tables.

    Item similarity is delegated to ``MetaGetter.getSimMatrix(jaccard_sim)``.
    User similarity is the cosine similarity of Word2Vec embeddings learned
    from random walks over a network linking users who rated common items.
    Results are cached with ``util.save_data`` under ``../data/sim`` and
    ``../data/neibor``.
    """

    _SIM_DIR = '../data/sim'
    _NEIBOR_DIR = '../data/neibor'

    def __init__(self):
        super(SimGe, self).__init__()
        self.config = ConfigX()

        # random-walk / embedding parameters
        self.config.walkCount = 30
        self.config.walkLength = 20
        self.config.walkDim = 20
        self.config.winSize = 5
        self.config.topK = 50

        self.user_sim = SimMatrix()
        self.item_sim = SimMatrix()
        self.user_k_neibor = defaultdict(dict)
        self.item_k_neibor = defaultdict(dict)

    def check_dataset(self):
        """Delegate to the next ``check_dataset`` in the MRO, if there is one.

        This class has no explicit base, so an unconditional ``super()`` call
        raises ``AttributeError`` when the class is used on its own; the call
        is only made when another class in the MRO provides the method.
        """
        parent_check = getattr(super(SimGe, self), 'check_dataset', None)
        if parent_check is not None:
            parent_check()

    def build_user_item_sim(self,
                            kfold,
                            user_near_num=50,
                            item_near_num=50,
                            load_save_sim=False):
        """Compute the user and item similarities.

        Args:
            kfold: passed to ``RatingGetter`` / ``MetaGetter``.
            user_near_num: number of neighbours kept per user.
            item_near_num: number of neighbours kept per item.
            load_save_sim: load previously saved data instead of recomputing,
                to speed up testing.
        """
        # Only the CF-style similarity is used for now.
        # TODO: blend several similarity sources.
        self.build_user_item_sim_CF(kfold,
                                    user_near_num=user_near_num,
                                    item_near_num=item_near_num,
                                    load_save_sim=load_save_sim)

    def build_user_item_sim_CF(self,
                               kfold,
                               user_near_num=50,
                               item_near_num=50,
                               load_save_sim=False):
        """Build (or load) both similarity matrices and both neighbour tables.

        Args:
            kfold: passed to ``RatingGetter`` / ``MetaGetter``.
            user_near_num: number of neighbours kept per user (additionally
                capped by ``self.config.topK``).
            item_near_num: number of neighbours kept per item.
            load_save_sim: when true, the similarity matrices are loaded with
                ``util.load_data`` instead of being recomputed.
        """
        self.rg = RatingGetter(kfold)
        self.mg = MetaGetter(kfold)

        # Both folders must exist before the first save below.
        self._ensure_dir(self._SIM_DIR)
        self._ensure_dir(self._NEIBOR_DIR)

        dataset = self.config.dataset_name
        item_sim_path = '../data/sim/%s_08_ii_gemf_cv0.pkl' % dataset
        user_sim_path = '../data/sim/%s_08_uu_gemf_cv0.pkl' % dataset
        item_neibor_path = ('../data/neibor/%s_08_ii_%s_neibor_gemf_cv0.pkl' %
                            (dataset, item_near_num))
        user_neibor_path = ('../data/neibor/%s_08_uu_%s_neibor_gemf_cv0.pkl' %
                            (dataset, user_near_num))

        # compute item-item similarity matrix
        print('构建 item-item 相似度矩阵  ...')
        if load_save_sim:
            self.item_sim = util.load_data(item_sim_path)
        else:
            self.item_sim = self.mg.getSimMatrix(jaccard_sim)
            util.save_data(self.item_sim, item_sim_path)

        # compute the k neighbors of item
        # NOTE(review): the loaded table is refreshed from item_sim right
        # away, so the load looks redundant - kept as it was.
        if load_save_sim:
            self.item_k_neibor = util.load_data(item_neibor_path)
        for item in self.mg.item:
            self.item_k_neibor[item] = dict(
                self._top_k(self.item_sim[item], item_near_num))
        util.save_data(self.item_k_neibor, item_neibor_path)

        # compute user-user similarity matrix
        print('构建 user-user 相似度矩阵 ...')
        if load_save_sim:
            self.user_sim = util.load_data(user_sim_path)
        else:
            self._build_cu_net()
            self._generate_walks()
            self.model = w2v.Word2Vec(self.walks,
                                      size=self.config.walkDim,
                                      window=5,
                                      min_count=0,
                                      iter=3)
            self._fill_user_sim()
            util.save_data(self.user_sim, user_sim_path)

        # compute the k neighbors of user
        if load_save_sim:
            self.user_k_neibor = util.load_data(user_neibor_path)
        # Rank each user's own similarity row; this also works when the
        # matrix was loaded from disk rather than computed above.
        self.topKSim = defaultdict(dict)
        for user in self.rg.user:
            ranked = self._top_k(self.user_sim[user],
                                 self.config.topK)[:user_near_num]
            self.topKSim[user] = ranked
            self.user_k_neibor[user] = dict(ranked)
        util.save_data(self.user_k_neibor, user_neibor_path)

    @staticmethod
    def _ensure_dir(path):
        """Create ``path`` if it does not exist yet and report that it was created."""
        if not os.path.exists(path):
            os.makedirs(path)
            print('%s folder has been established.' % path)

    @staticmethod
    def _top_k(sim_row, k):
        """Return the ``k`` highest-scoring ``(key, similarity)`` pairs of ``sim_row``."""
        return sorted(sim_row.items(), key=lambda x: x[1], reverse=True)[:k]

    def _build_cu_net(self):
        """Fill ``self.CUNet`` with links between users sharing a positively rated item."""
        # Items rated by more than one user, grouped by user (ratings > 0 only).
        rated_items = defaultdict(list)
        for item in self.rg.trainSet_i:
            ratings = self.rg.trainSet_i[item]
            if len(ratings) > 1:
                for user in ratings:
                    if ratings[user] > 0:
                        rated_items[user].append(item)

        # Build each user's item set once instead of once per pair.
        item_sets = {user: set(items) for user, items in rated_items.items()}

        self.CUNet = defaultdict(list)
        for user1 in tqdm(item_sets):
            s1 = item_sets[user1]
            for user2, s2 in item_sets.items():
                if user1 != user2 and not s1.isdisjoint(s2):
                    self.CUNet[user1].append(user2)

    def _generate_walks(self):
        """Fill ``self.walks`` with ``walkCount`` random walks per user of ``self.CUNet``."""
        print('Generating random deep walks...')
        self.walks = []
        self.visited = defaultdict(dict)
        for user in tqdm(self.CUNet):
            for _ in range(self.config.walkCount):
                path = [str(user)]
                last_node = user
                for _step in range(1, self.config.walkLength):
                    next_node = choice(self.CUNet[last_node])
                    retries = 0
                    # NOTE(review): visits are recorded under the walk's start
                    # user but looked up under the current node - confirm.
                    while next_node in self.visited[last_node]:
                        next_node = choice(self.CUNet[last_node])
                        # bound the retries so the loop always terminates
                        retries += 1
                        if retries == self.config.walkLength:
                            break
                    path.append(str(next_node))
                    self.visited[user][next_node] = 1
                    last_node = next_node
                self.walks.append(path)

    def _fill_user_sim(self):
        """Store the embedding cosine similarity of every user pair in ``self.user_sim``."""
        for done, u1 in enumerate(tqdm(self.CUNet), 1):
            wu1 = self.model[str(u1)]
            for u2 in self.CUNet:
                # compare the users of *this* loop; skip pairs already stored
                if u1 == u2 or self.user_sim.contains(u1, u2):
                    continue
                self.user_sim.set(u1, u2, cosine(wu1, self.model[str(u2)]))
            if done % 200 == 0:
                print('progress:', done, '/', len(self.CUNet))
Ejemplo n.º 15
0
class SimBase():
    """Build user/item similarity matrices and k-nearest-neighbour tables.

    Item similarity is delegated to ``MetaGetter.getSimMatrix(jaccard_sim)``;
    user similarity is ``pearson_sp`` over the users' rating rows.  Results
    are cached with ``util.save_data`` under ``../data/sim`` and
    ``../data/neibor``.
    """

    _SIM_DIR = '../data/sim'
    _NEIBOR_DIR = '../data/neibor'

    def __init__(self):
        super(SimBase, self).__init__()

        self.config = ConfigX()
        self.user_sim = SimMatrix()
        self.item_sim = SimMatrix()
        self.user_k_neibor = defaultdict(dict)
        self.item_k_neibor = defaultdict(dict)

    def check_dataset(self):
        """Delegate to the next ``check_dataset`` in the MRO, if there is one.

        This class has no explicit base, so an unconditional ``super()`` call
        raises ``AttributeError`` when the class is used on its own; the call
        is only made when another class in the MRO provides the method.
        """
        parent_check = getattr(super(SimBase, self), 'check_dataset', None)
        if parent_check is not None:
            parent_check()

    def build_user_item_sim(self,
                            kfold,
                            user_near_num=50,
                            item_near_num=50,
                            load_save_sim=False):
        """Compute the user and item similarities.

        Args:
            kfold: passed to ``RatingGetter`` / ``MetaGetter``.
            user_near_num: number of neighbours kept per user.
            item_near_num: number of neighbours kept per item.
            load_save_sim: load previously saved data instead of recomputing,
                to speed up testing.
        """
        # Only the CF-style similarity is used for now.
        # TODO: blend several similarity sources.
        self.build_user_item_sim_CF(kfold,
                                    user_near_num=user_near_num,
                                    item_near_num=item_near_num,
                                    load_save_sim=load_save_sim)

    def build_user_item_sim_CF(self,
                               kfold,
                               user_near_num=50,
                               item_near_num=50,
                               load_save_sim=False):
        """Build (or load) both similarity matrices and both neighbour tables.

        Args:
            kfold: passed to ``RatingGetter`` / ``MetaGetter``.
            user_near_num: number of neighbours kept per user.
            item_near_num: number of neighbours kept per item.
            load_save_sim: when true, the similarity matrices are loaded with
                ``util.load_data`` instead of being recomputed.
        """
        self.rg = RatingGetter(kfold)
        self.mg = MetaGetter(kfold)

        # Both folders must exist before the first save below.
        self._ensure_dir(self._SIM_DIR)
        self._ensure_dir(self._NEIBOR_DIR)

        dataset = self.config.dataset_name
        item_sim_path = '../data/sim/%s_08_ii_cucmemf_cv0.pkl' % dataset
        user_sim_path = '../data/sim/%s_08_uu_cucmemf_cv0.pkl' % dataset
        item_neibor_path = (
            '../data/neibor/%s_08_ii_%s_neibor_cucmemf_cv0.pkl' %
            (dataset, item_near_num))
        user_neibor_path = (
            '../data/neibor/%s_08_uu_%s_neibor_cucmemf_cv0.pkl' %
            (dataset, user_near_num))

        # compute item-item similarity matrix
        print('构建 item-item 相似度矩阵  ...')
        if load_save_sim:
            self.item_sim = util.load_data(item_sim_path)
        else:
            self.item_sim = self.mg.getSimMatrix(jaccard_sim)
            util.save_data(self.item_sim, item_sim_path)

        # compute the k neighbors of item
        # NOTE(review): the loaded table is refreshed from item_sim right
        # away, so the load looks redundant - kept as it was.
        if load_save_sim:
            self.item_k_neibor = util.load_data(item_neibor_path)
        for item in self.mg.item:
            self.item_k_neibor[item] = dict(
                self._top_k(self.item_sim[item], item_near_num))
        util.save_data(self.item_k_neibor, item_neibor_path)

        # compute user-user similarity matrix
        print('构建 user-user 相似度矩阵 ...')
        if load_save_sim:
            self.user_sim = util.load_data(user_sim_path)
        else:
            for u1 in tqdm(self.rg.user):
                for u2 in self.rg.user:
                    # skip self-pairs and pairs the matrix already holds
                    if u1 == u2 or self.user_sim.contains(u1, u2):
                        continue
                    sim = pearson_sp(self.rg.get_row(u1),
                                     self.rg.get_row(u2))
                    self.user_sim.set(u1, u2, round(sim, 5))
            util.save_data(self.user_sim, user_sim_path)

        # compute the k neighbors of user
        if load_save_sim:
            self.user_k_neibor = util.load_data(user_neibor_path)
        for user in self.rg.user:
            # Always start from the best match: the fold index must not be
            # used as a slice offset, or the closest neighbours are dropped.
            self.user_k_neibor[user] = dict(
                self._top_k(self.user_sim[user], user_near_num))
        util.save_data(self.user_k_neibor, user_neibor_path)

    @staticmethod
    def _ensure_dir(path):
        """Create ``path`` if it does not exist yet and report that it was created."""
        if not os.path.exists(path):
            os.makedirs(path)
            print('%s folder has been established.' % path)

    @staticmethod
    def _top_k(sim_row, k):
        """Return the ``k`` highest-scoring ``(key, similarity)`` pairs of ``sim_row``."""
        return sorted(sim_row.items(), key=lambda x: x[1], reverse=True)[:k]
Ejemplo n.º 16
0
class TriCFBias(MF):
    """
    Biased matrix factorization with user/item neighbourhood terms.

    In addition to the latent factors ``self.P`` / ``self.Q`` (used here but
    not created in this class -- presumably set up by ``MF.init_model``), the
    model keeps one bias per user (``Bu``) and per item (``Bi``).  For every
    training rating the update also involves the k most similar users and
    items, where similarities come from ``pearson_sp`` applied to rows and
    columns of the rating data held by ``self.rg``.
    """

    def __init__(self):
        super(TriCFBias, self).__init__()
        # weights of the user / item neighbourhood terms
        self.config.lambdaU = 0.002
        self.config.lambdaI = 0.001

        # L2 weights for the user factors, item factors and biases
        self.config.lambdaP = 0.02
        self.config.lambdaQ = 0.03
        self.config.lambdaB = 0.01

        # number of neighbours kept per user / item
        self.config.user_near_num = 50
        self.config.item_near_num = 50

    def init_model(self, k):
        """Initialise the base model, the bias vectors and the neighbourhoods.

        :param k: forwarded unchanged to ``MF.init_model``.
        """
        super(TriCFBias, self).init_model(k)
        # The generator is re-seeded with the same value before each draw, so
        # both bias vectors start from the same random stream.
        np.random.seed(seed=self.config.random_state)
        self.Bu = np.random.rand(
            self.rg.get_train_size()[0])  # bias value of user
        np.random.seed(seed=self.config.random_state)  # fix the random seed
        self.Bi = np.random.rand(
            self.rg.get_train_size()[1])  # bias value of item
        self.build_user_item_sim_CF()

    @staticmethod
    def _ensure_dir(path):
        """Create ``path`` if it does not exist yet and report the creation."""
        if not os.path.exists(path):
            os.makedirs(path)
            print('%s folder has been established.' % path)

    @staticmethod
    def _fill_similarity(sim_matrix, ids, get_vector):
        """Store the rounded similarity of every pair of distinct ids that
        ``sim_matrix`` does not contain yet."""
        for first in ids:
            for second in ids:
                if first == second or sim_matrix.contains(first, second):
                    continue
                sim = pearson_sp(get_vector(first), get_vector(second))
                sim_matrix.set(first, second, round(sim, 5))

    @staticmethod
    def _fill_top_k(sim_matrix, ids, k, target):
        """Write the ``k`` highest-similarity neighbours of each id into
        ``target`` as an ``{neighbour: similarity}`` dict."""
        for entity in ids:
            ranked = sorted(sim_matrix[entity].items(),
                            key=lambda pair: pair[1],
                            reverse=True)
            target[entity] = dict(ranked[:k])

    # construct the u-u, i-i similarity matrices and their k neighbours
    def build_user_item_sim_CF(self):
        """Compute and persist the similarity matrices and k-neighbour tables.

        Fills ``self.user_sim`` / ``self.item_sim`` and
        ``self.user_k_neibor`` / ``self.item_k_neibor`` and writes each of
        them with ``util.save_data`` under ``../data/sim`` and
        ``../data/neibor``.
        """
        from collections import defaultdict
        self.user_sim = SimMatrix()
        self.item_sim = SimMatrix()
        self.user_k_neibor = defaultdict(dict)
        self.item_k_neibor = defaultdict(dict)

        # compute user-user similarity matrix
        print('constructing user-user similarity matrix...')
        self._fill_similarity(self.user_sim, self.rg.user, self.rg.get_row)
        self._ensure_dir('../data/sim')

        print("save user sims size = %s" % (self.user_sim.size()))
        util.save_data(self.user_sim, '../data/sim/ft_08_uu_tricf_cv0.pkl')

        # compute the k neighbors of user
        self._fill_top_k(self.user_sim, self.rg.user,
                         self.config.user_near_num, self.user_k_neibor)
        self._ensure_dir('../data/neibor')

        util.save_data(
            self.user_k_neibor, '../data/neibor/ft_08_uu_' +
            str(self.config.user_near_num) + '_neibor_tricf_cv0.pkl')

        # compute item-item similarity matrix
        print('constructing item-item similarity matrix...')
        self._fill_similarity(self.item_sim, self.rg.item, self.rg.get_col)
        print("save item sims size = %s" % (self.item_sim.size()))
        util.save_data(self.item_sim, '../data/sim/ft_08_ii_tricf_cv0.pkl')

        # compute the k neighbors of item
        self._fill_top_k(self.item_sim, self.rg.item,
                         self.config.item_near_num, self.item_k_neibor)
        util.save_data(
            self.item_k_neibor, '../data/neibor/ft_08_ii_' +
            str(self.config.item_near_num) + '_neibor_tricf_cv0.pkl')

    def _neighbor_terms(self, own, neighbors, factors, index_of):
        """Return the similarity-weighted neighbourhood terms for one entity.

        :param own: latent vector of the entity being updated.
        :param neighbors: ``{neighbour id: similarity}`` mapping.
        :param factors: factor matrix the neighbours live in.
        :param index_of: mapping from neighbour id to row of ``factors``.
        :return: ``(near_sum, near_total)`` where ``near_sum`` is the
            similarity-weighted sum of ``neighbour - own`` divided by the
            total similarity (left unscaled when that total is zero) and
            ``near_total`` is the similarity-weighted sum of squared
            distances.
        """
        near_sum = np.zeros((self.config.factor))
        near_total, sim_total = 0.0, 0.0
        for neighbor, sim_value in neighbors.items():
            # Zero-similarity neighbours contribute nothing; skipping them
            # entirely also keeps a non-finite difference from turning the
            # sums into NaN through 0 * inf.
            if sim_value == 0.0:
                continue
            sim_total += sim_value
            diff = factors[index_of[neighbor]] - own
            near_sum += sim_value * diff
            near_total += sim_value * (diff.dot(diff))
        if sim_total != 0.0:
            near_sum /= sim_total
        return near_sum, near_total

    def train_model(self, k):
        """Run stochastic updates over the training set until convergence.

        Stops after ``self.config.maxIter`` passes or as soon as
        ``self.isConverged`` returns a true value.

        :param k: forwarded unchanged to ``MF.train_model``.
        """
        super(TriCFBias, self).train_model(k)
        print('training model...')
        iteration = 0
        while iteration < self.config.maxIter:
            self.loss = 0
            # Plain dicts: the factory-less defaultdict used before behaved
            # exactly like one, and this avoids needing the name in scope.
            self.u_near_total_dict = {}
            self.i_near_total_dict = {}
            for user, item, rating in self.rg.trainSet():
                u = self.rg.user[user]
                i = self.rg.item[item]

                error = rating - self.predict(user, item)
                self.loss += error**2
                # p and q index into P and Q, so (assuming P/Q are NumPy
                # arrays) they track the in-place updates made below.
                p, q = self.P[u], self.Q[i]

                # neighbourhood terms from the k neighbours of user and item
                u_near_sum, u_near_total = self._neighbor_terms(
                    p, self.user_k_neibor[user], self.P, self.rg.user)
                i_near_sum, i_near_total = self._neighbor_terms(
                    q, self.item_k_neibor[item], self.Q, self.rg.item)

                # remember the first value seen for each user / item in
                # this pass
                if u not in self.u_near_total_dict:
                    self.u_near_total_dict[u] = u_near_total
                if i not in self.i_near_total_dict:
                    self.i_near_total_dict[i] = i_near_total

                self.Bu[u] += self.config.lr * (
                    error - self.config.lambdaB * self.Bu[u])
                self.Bi[i] += self.config.lr * (
                    error - self.config.lambdaB * self.Bi[i])

                # NOTE(review): the neighbourhood sums are built from
                # (neighbour - own) yet enter with a minus sign, which looks
                # opposite to descending the lambdaU / lambdaI loss terms
                # added below -- confirm this is intended.
                self.P[u] += self.config.lr * (
                    error * q - self.config.lambdaU * u_near_sum -
                    self.config.lambdaP * p)
                self.Q[i] += self.config.lr * (
                    error * p - self.config.lambdaI * i_near_sum -
                    self.config.lambdaQ * q)

                self.loss += 0.5 * (self.config.lambdaU * u_near_total +
                                    self.config.lambdaI * i_near_total)

            self.loss += self.config.lambdaP * (self.P * self.P).sum() + self.config.lambdaQ * (self.Q * self.Q).sum() \
                         + self.config.lambdaB * ((self.Bu * self.Bu).sum() + (self.Bi * self.Bi).sum())

            iteration += 1
            if self.isConverged(iteration):
                break

    # test cold start users among test set
    def predict_model_cold_users_improved(self):
        """Return the RMSE over the ratings in ``self.rg.testColdUserSet_u``.

        Each prediction from ``self.predict_improved`` is denormalised with
        the configured min/max values and passed through
        ``self.checkRatingBoundary`` before scoring.
        """
        res = []
        for user, rated in self.rg.testColdUserSet_u.items():
            for item, rating in rated.items():
                pred = self.predict_improved(user, item)
                # denormalize
                pred = denormalize(pred, self.config.min_val,
                                   self.config.max_val)
                pred = self.checkRatingBoundary(pred)
                res.append([user, item, rating, pred])
        return Metric.RMSE(res)
Ejemplo n.º 17
0
 def init_model(self, k):
     """Run the parent initialisation for ``k``, then the CUNE setup steps.

     After ``super().init_model(k)`` a fresh ``SimMatrix`` is stored in
     ``self.user_sim`` and ``generate_cu_net``, ``deep_walk`` and
     ``compute_social_sim`` are called in that order.
     """
     super(CUNE, self).init_model(k)
     self.user_sim = SimMatrix()
     self.generate_cu_net()
     self.deep_walk()
     self.compute_social_sim()