Exemple #1
0
    def computeMetaSimilarity(self):
        """Embed items from the positive walks and build top-10 item neighbourhoods.

        A Word2Vec model is trained on ``self.pWalks`` and the vector of each
        item token (``'I' + item``) is copied into the matching row of
        ``self.W``.  Every item in ``self.meta.item`` is then compared with all
        other items via ``cosine`` and its 10 most similar items are kept.

        Attributes written, keyed by item:
            pSimilarity: neighbour -> similarity for the kept neighbours.
            pTopKSim:    kept neighbour ids, most similar first.
            threshold:   similarity of the neighbour at rank ``10 // 2``
                         (the last available one when fewer were found).
            avg_sim:     mean similarity of the first ``10 // 2`` neighbours.

        Finally the neighbours of an item are appended to ``self.positive[u]``
        for every ``u`` in ``self.pItems[item]`` and to ``self.negative[u]``
        for every ``u`` in ``self.nItems[item]``, skipping entries that are
        already present.

        An item without any neighbour (single-item catalogue) gets an empty
        ``pTopKSim`` entry and no ``threshold`` / ``avg_sim`` entry.
        """
        top_n = 10          # neighbours kept per item
        half = top_n // 2   # floor division so it stays a valid list index

        # Training get top-k items
        print('Generating meta embedding...')
        self.pTopKSim = {}
        self.pSimilarity = defaultdict(dict)
        pos_model = w2v.Word2Vec(self.pWalks,
                                 size=self.walkDim,
                                 window=5,
                                 min_count=0,
                                 iter=10)
        for item in self.meta.item:
            mid = self.meta.item[item]
            try:
                self.W[mid] = pos_model.wv['I' + item]
            except KeyError:
                # The item token is not in the model vocabulary: leave its
                # existing row of W untouched.
                continue
        print('meta embedding generated.')

        print('Constructing item similarity matrix...')
        total = len(self.meta.item)
        for done, item1 in enumerate(self.meta.item, 1):
            if done % 200 == 0:
                print('%d / %d' % (done, total))

            vec1 = self.W[self.meta.item[item1]]
            scored = [(item2, cosine(vec1, self.W[self.meta.item[item2]]))
                      for item2 in self.meta.item if item2 != item1]
            neighbours = sorted(scored, key=lambda d: d[1],
                                reverse=True)[:top_n]

            for other, sim in neighbours:
                self.pSimilarity[item1][other] = sim
            self.pTopKSim[item1] = [pair[0] for pair in neighbours]

            if neighbours:
                # Clamp the rank so that short neighbour lists (fewer than
                # half + 1 entries) no longer raise IndexError.
                rank = min(half, len(neighbours) - 1)
                self.threshold[item1] = neighbours[rank][1]
                head = [pair[1] for pair in neighbours[:half]]
                self.avg_sim[item1] = sum(head) / float(len(head))

            # Propagate the neighbours to every user linked to this item,
            # first through pItems -> positive, then nItems -> negative.
            for owners, bucket in ((self.pItems, self.positive),
                                   (self.nItems, self.negative)):
                if item1 not in owners:
                    continue
                for u in owners[item1]:
                    for neighbour in self.pTopKSim[item1]:
                        if neighbour not in bucket[u]:
                            bucket[u].append(neighbour)
Exemple #2
0
    def buildModel(self):
        """Learn user embeddings from a co-rating network, then fit P and Q.

        Stages:
            1. ``self.CUNet``: users are linked when they share at least one
               item that has more than one rater and a rating >= 1; a
               neighbour is repeated once per shared item.
            2. ``self.walks``: ``self.walkCount`` random walks of
               ``self.walkLength`` nodes are started from every user.
            3. A Word2Vec model is trained on the walks; the vectors are
               copied into ``self.W`` and ``self.topKSim[user]`` receives the
               ``self.topK`` most ``cosine``-similar users as
               ``(user, similarity)`` pairs.
            4. ``self.P`` / ``self.Q`` are trained by SGD on the squared
               rating error, followed by a pull of every user row towards the
               rows of its top-K users weighted by ``self.alpha``.  Training
               stops after ``self.maxIter`` iterations or as soon as
               ``self.isConverged(iteration)`` is true.

        ``self.loss`` holds the loss of the last iteration.
        """
        print('Kind Note: This method will probably take much time.')
        # build C-U-NET
        print('Building collaborative user network...')
        # Filter isolated nodes: an item with a single rater links nobody.
        self.itemNet = {}
        for item in self.data.trainSet_i:
            if len(self.data.trainSet_i[item]) > 1:
                self.itemNet[item] = self.data.trainSet_i[item]

        self.filteredRatings = defaultdict(list)
        for item in self.itemNet:
            for user in self.itemNet[item]:
                if self.itemNet[item][user] >= 1:
                    self.filteredRatings[user].append(item)

        self.CUNet = defaultdict(list)
        # Build one set per user up front instead of one per user pair.
        itemSets = dict((user, set(items))
                        for user, items in self.filteredRatings.items())
        for user1 in self.filteredRatings:
            s1 = itemSets[user1]
            for user2 in self.filteredRatings:
                if user1 != user2:
                    weight = len(s1.intersection(itemSets[user2]))
                    if weight > 0:
                        # Repeating the neighbour makes the uniform choice()
                        # below proportional to the number of shared items.
                        self.CUNet[user1] += [user2] * weight

        print('Generating random deep walks...')
        self.walks = []
        self.visited = defaultdict(dict)
        for user in self.CUNet:
            for t in range(self.walkCount):
                path = [user]
                lastNode = user
                for step in range(1, self.walkLength):
                    nextNode = choice(self.CUNet[lastNode])
                    # Re-draw at most 10 times so the loop cannot spin forever.
                    # NOTE(review): the check reads visited[lastNode] while the
                    # bookkeeping below writes visited[user]; kept as is --
                    # confirm which key is intended.
                    count = 0
                    while nextNode in self.visited[lastNode]:
                        nextNode = choice(self.CUNet[lastNode])
                        count += 1
                        if count == 10:
                            break
                    path.append(nextNode)
                    self.visited[user][nextNode] = 1
                    lastNode = nextNode
                self.walks.append(path)
        shuffle(self.walks)

        # Training get top-k friends
        print('Generating user embedding...')
        model = w2v.Word2Vec(self.walks,
                             size=self.walkDim,
                             window=5,
                             min_count=0,
                             iter=3)
        print('User embedding generated.')

        print('Constructing similarity matrix...')
        self.W = np.random.rand(self.data.trainingSize()[0], self.walkDim) / 10
        # Copy every embedding into W exactly once; the pairwise loop below
        # then only reads W.
        for user in self.CUNet:
            self.W[self.data.user[user]] = model.wv[user]

        self.topKSim = {}
        total = len(self.CUNet)
        for done, user1 in enumerate(self.CUNet, 1):
            vec1 = self.W[self.data.user[user1]]
            sims = [(user2, cosine(vec1, self.W[self.data.user[user2]]))
                    for user2 in self.CUNet if user2 != user1]
            self.topKSim[user1] = sorted(sims,
                                         key=lambda d: d[1],
                                         reverse=True)[:self.topK]
            if done % 200 == 0:
                # Separate arguments: wrapping them in a tuple would print
                # the tuple repr instead of a progress line.
                print('progress:', done, '/', total)
        print('Similarity matrix finished.')

        # matrix decomposition
        print('Decomposing...')

        iteration = 0
        while iteration < self.maxIter:
            self.loss = 0
            for entry in self.data.trainingData:
                user, item, rating = entry
                u = self.data.user[user]  # get user id
                i = self.data.item[item]  # get item id
                error = rating - self.P[u].dot(self.Q[i])
                self.loss += error**2
                # Snapshot both rows.  Assuming P and Q are numpy arrays (they
                # are used with .dot / .sum), plain row indexing returns a
                # view, so without the copy the Q update below would read the
                # already-updated P row.
                p = self.P[u].copy()
                q = self.Q[i].copy()

                # update latent vectors
                self.P[u] += self.lRate * (error * q - self.regU * p)
                self.Q[i] += self.lRate * (error * p - self.regI * q)

            # Pull every user towards its top-K most similar users.
            for user in self.CUNet:
                u = self.data.user[user]
                for friend in self.topKSim[user]:
                    uf = self.data.user[friend[0]]
                    self.P[u] -= self.lRate * (self.P[u] -
                                               self.P[uf]) * self.alpha
                    self.loss += self.alpha * (
                        self.P[u] - self.P[uf]).dot(self.P[u] - self.P[uf])

            self.loss += self.regU * (self.P * self.P).sum() + self.regI * (
                self.Q * self.Q).sum()
            iteration += 1
            if self.isConverged(iteration):
                break
Exemple #3
0
    def buildModel(self):
        """Learn user embeddings from a co-rating network, then train a ranking model.

        Stages:
            1. ``self.CUNet``: users are linked when they share at least one
               item that has more than one rater and a rating >= 1; a
               neighbour is repeated once per shared item.
            2. ``self.walks``: ``self.walkCount`` random walks of
               ``self.walkLength`` nodes are started from every user.
            3. A Word2Vec model is trained on the walks; the vectors are
               copied into ``self.W`` and ``self.topKSim[user]`` receives the
               ``self.topK`` most ``cosine``-similar users as
               ``(user, similarity)`` pairs.
            4. ``self.PositiveSet[user]`` holds the user's own training items;
               ``self.IPositiveSet[user]`` holds items of the top-K users that
               the user does not have.
            5. Pairwise SGD on ``self.P`` / ``self.Q``: for every positive
               item ``i``, three rounds rank ``i`` above an item ``k`` drawn
               from ``IPositiveSet[user]`` and ``k`` above a random item ``j``
               found in neither set (scaled by ``1 / self.s``).  Users with an
               empty ``IPositiveSet`` rank ``i`` directly above a random
               non-positive ``j``.  Training stops after ``self.maxIter``
               iterations or once ``self.isConverged(iteration)`` is true.

        ``self.loss`` holds the loss of the last iteration.
        """
        print('Kind Note: This method will probably take much time.')
        # build C-U-NET
        print('Building collaborative user network...')
        # Filter isolated nodes: an item with a single rater links nobody.
        self.itemNet = {}
        for item in self.dao.trainSet_i:
            if len(self.dao.trainSet_i[item]) > 1:
                self.itemNet[item] = self.dao.trainSet_i[item]

        self.filteredRatings = defaultdict(list)
        for item in self.itemNet:
            for user in self.itemNet[item]:
                if self.itemNet[item][user] >= 1:
                    self.filteredRatings[user].append(item)

        self.CUNet = defaultdict(list)
        # Build one set per user up front instead of one per user pair.
        itemSets = dict((user, set(items))
                        for user, items in self.filteredRatings.items())
        for user1 in self.filteredRatings:
            s1 = itemSets[user1]
            for user2 in self.filteredRatings:
                if user1 != user2:
                    weight = len(s1.intersection(itemSets[user2]))
                    if weight > 0:
                        # Repeating the neighbour makes the uniform choice()
                        # below proportional to the number of shared items.
                        self.CUNet[user1] += [user2] * weight

        print('Generating random deep walks...')
        self.walks = []
        self.visited = defaultdict(dict)
        for user in self.CUNet:
            for t in range(self.walkCount):
                path = [user]
                lastNode = user
                for step in range(1, self.walkLength):
                    nextNode = choice(self.CUNet[lastNode])
                    # Re-draw at most 10 times so the loop cannot spin forever.
                    # NOTE(review): the check reads visited[lastNode] while the
                    # bookkeeping below writes visited[user]; kept as is --
                    # confirm which key is intended.
                    count = 0
                    while nextNode in self.visited[lastNode]:
                        nextNode = choice(self.CUNet[lastNode])
                        count += 1
                        if count == 10:
                            break
                    path.append(nextNode)
                    self.visited[user][nextNode] = 1
                    lastNode = nextNode
                self.walks.append(path)
        shuffle(self.walks)

        # Training get top-k friends
        print('Generating user embedding...')
        model = w2v.Word2Vec(self.walks,
                             size=self.walkDim,
                             window=5,
                             min_count=0,
                             iter=3)
        print('User embedding generated.')

        print('Constructing similarity matrix...')
        self.W = np.random.rand(self.dao.trainingSize()[0], self.walkDim) / 10
        # Copy every embedding into W exactly once; the pairwise loop below
        # then only reads W.
        for user in self.CUNet:
            self.W[self.dao.user[user]] = model.wv[user]

        self.topKSim = {}
        total = len(self.CUNet)
        for done, user1 in enumerate(self.CUNet, 1):
            vec1 = self.W[self.dao.user[user1]]
            sims = [(user2, cosine(vec1, self.W[self.dao.user[user2]]))
                    for user2 in self.CUNet if user2 != user1]
            self.topKSim[user1] = sorted(sims,
                                         key=lambda d: d[1],
                                         reverse=True)[:self.topK]
            if done % 200 == 0:
                print('progress: %d / %d' % (done, total))
        print('Similarity matrix finished.')

        # prepare Pu set and IPu set
        print('Preparing item sets...')
        self.PositiveSet = defaultdict(dict)
        self.IPositiveSet = defaultdict(dict)

        for user in self.topKSim:
            for item in self.dao.trainSet_u[user]:
                self.PositiveSet[user][item] = 1

            for friend in self.topKSim[user]:
                for item in self.dao.trainSet_u[friend[0]]:
                    if item not in self.PositiveSet[user]:
                        self.IPositiveSet[user][item] = 1

        def pairwiseStep(u, hi, lo, scale):
            """Sequential SGD step ranking item row `hi` above `lo` for user row `u`.

            The coefficient is recomputed before each of the three updates,
            so every update sees the result of the previous one.
            """
            def coeff():
                margin = self.P[u].dot(self.Q[hi]) - self.P[u].dot(self.Q[lo])
                return scale * self.lRate * (1 - sigmoid(scale * margin))

            self.P[u] += coeff() * (self.Q[hi] - self.Q[lo])
            self.Q[hi] += coeff() * self.P[u]
            self.Q[lo] -= coeff() * self.P[u]

        print('Training...')
        # The item catalogue does not change while training: build the
        # sampling list once (a list is required by choice()).
        itemList = list(self.dao.item.keys())
        # Float division: with an integer self.s, 1 / self.s truncates to 0
        # on Python 2 and would switch the second ranking term off.
        inv_s = 1.0 / self.s

        iteration = 0
        while iteration < self.maxIter:
            self.loss = 0
            for user in self.PositiveSet:
                u = self.dao.user[user]
                positives = self.PositiveSet[user]
                implicit = self.IPositiveSet[user]
                kItems = list(implicit.keys())
                for item in positives:
                    i = self.dao.item[item]
                    for n in range(3):  # negative sampling for 3 times
                        if len(implicit) > 0:
                            # i (own item) above k (item of a similar user)
                            item_k = choice(kItems)
                            k = self.dao.item[item_k]
                            pairwiseStep(u, i, k, 1.0)

                            # k above j, where j is in neither set of THIS
                            # user.  NOTE: loops until such an item is drawn.
                            item_j = choice(itemList)
                            while item_j in positives or item_j in implicit:
                                item_j = choice(itemList)
                            j = self.dao.item[item_j]
                            pairwiseStep(u, k, j, inv_s)

                            self.P[u] -= self.lRate * self.regU * self.P[u]
                            self.Q[i] -= self.lRate * self.regI * self.Q[i]
                            self.Q[j] -= self.lRate * self.regI * self.Q[j]
                            self.Q[k] -= self.lRate * self.regI * self.Q[k]

                            self.loss += -log(sigmoid(
                                self.P[u].dot(self.Q[i]) -
                                self.P[u].dot(self.Q[k]))) - log(sigmoid(
                                    inv_s * (self.P[u].dot(self.Q[k]) -
                                             self.P[u].dot(self.Q[j]))))
                        else:
                            # No item from similar users: rank i above a
                            # random item the user does not have.
                            item_j = choice(itemList)
                            while item_j in positives:
                                item_j = choice(itemList)
                            j = self.dao.item[item_j]
                            pairwiseStep(u, i, j, 1.0)

                            self.loss += -log(
                                sigmoid(self.P[u].dot(self.Q[i]) -
                                        self.P[u].dot(self.Q[j])))

            # Regularisation term, added once per iteration rather than once
            # per user.
            self.loss += self.regU * (self.P * self.P).sum() + self.regI * (
                self.Q * self.Q).sum()
            iteration += 1
            if self.isConverged(iteration):
                break
Exemple #4
0
    def computeSimilarity(self):
        """Embed users from the positive/negative walks and derive seeded friends.

        Two Word2Vec models (50 dimensions) are trained on ``self.pWalks`` and
        ``self.nWalks``.  The vector of each user token (``'U' + user``) is
        copied into ``self.W`` (positive model) or ``self.G`` (negative
        model); users whose token is not in the vocabulary are skipped.

        Attributes written:
            pTopKSim[user]:  the ``self.topK`` users of ``self.positive`` most
                             ``cosine``-similar to ``user`` in ``self.W``.
            nTopKSim[user]:  the same over ``self.negative`` / ``self.G``.
            nSimilarity[user][other]: similarity of each kept negative
                             neighbour.
            seededFriends[user]: users present in both top-K lists, followed
                             by the first 50 entries of ``pTopKSim[user]``.
            firend_item_set[user]: concatenated training items of all seeded
                             friends (attribute spelling kept because it is
                             visible outside this method).

        ``pSimilarity`` is only initialised here, not filled.
        """
        # Training get top-k friends
        print('Generating user embedding...')
        self.pTopKSim = {}
        self.nTopKSim = {}
        self.pSimilarity = defaultdict(dict)
        self.nSimilarity = defaultdict(dict)
        pos_model = w2v.Word2Vec(self.pWalks, size=50, window=5, min_count=0, iter=10)
        neg_model = w2v.Word2Vec(self.nWalks, size=50, window=5, min_count=0, iter=10)
        for user in self.positive:
            uid = self.data.user[user]
            try:
                self.W[uid] = pos_model.wv['U' + user]
            except KeyError:
                continue
        for user in self.negative:
            uid = self.data.user[user]
            try:
                self.G[uid] = neg_model.wv['U' + user]
            except KeyError:
                continue
        print('User embedding generated.')

        def rankUsers(users, embedding):
            """Yield (user, top-K [(other, similarity), ...]) for every user."""
            total = len(users)
            for done, user1 in enumerate(users, 1):
                if done % 200 == 0:
                    print('%d / %d' % (done, total))
                vec1 = embedding[self.data.user[user1]]
                scored = [(user2, cosine(vec1, embedding[self.data.user[user2]]))
                          for user2 in users if user2 != user1]
                yield user1, sorted(scored, key=lambda d: d[1],
                                    reverse=True)[:self.topK]

        print('Constructing similarity matrix...')
        for user1, ranked in rankUsers(self.positive, self.W):
            self.pTopKSim[user1] = [pair[0] for pair in ranked]

        for user1, ranked in rankUsers(self.negative, self.G):
            for other, sim in ranked:
                self.nSimilarity[user1][other] = sim
            self.nTopKSim[user1] = [pair[0] for pair in ranked]

        self.seededFriends = defaultdict(list)
        self.firend_item_set = defaultdict(list)
        for user in self.pTopKSim:
            # A user may have positive neighbours but no negative ones; treat
            # the missing list as empty instead of raising KeyError.
            negativeTop = set(self.nTopKSim.get(user, []))
            trueFriends = list(set(self.pTopKSim[user]).intersection(negativeTop))
            self.seededFriends[user] = trueFriends + self.pTopKSim[user][:50]

        for user in self.pTopKSim:
            for friend in self.seededFriends[user]:
                self.firend_item_set[user] += list(self.data.trainSet_u[friend].keys())
Exemple #5
0
    def buildModel(self):
        """Learn user codes on a Huffman tree from random walks, then train a ranking model.

        Stages:
            1. ``self.CUNet``: users are linked when they share at least one
               item that has more than one rater and a rating >= 1; a
               neighbour is repeated once per shared item.
            2. A Huffman tree (``self.HTree``) is built over the users,
               weighted by the length of their neighbour list, and every user
               is assigned a code.
            3. ``self.walks``: ``self.walkCount`` random walks of
               ``self.walkLength`` nodes are started from every user.
            4. For ``self.maxIter`` passes, the vector of each walk's middle
               user and the vectors stored under the proper code prefixes of
               the other users in the walk are updated.
            5. ``self.Sim`` stores the ``cosine`` similarity of every user
               pair; ``self.topKSim[user]`` keeps the ``self.topK`` best
               ``(user, similarity)`` pairs.
            6. ``PositiveSet`` (own items rated >= 1), ``NegativeSet`` (own
               items rated lower) and ``IPositiveSet`` (items of top-K users
               missing from ``PositiveSet``) are prepared per user.
            7. Pairwise SGD on ``self.P`` / ``self.Q`` ranks each positive
               item ``i`` above a random ``IPositiveSet`` item ``k`` and ``k``
               above a negative item ``j`` (scaled by ``1 / self.s``).  Users
               with an empty ``IPositiveSet`` are not updated.  Training
               stops after ``self.maxIter`` iterations or once
               ``self.isConverged(iteration)`` is true.

        ``self.loss`` holds the loss of the last iteration.
        """
        print('Kind Note: This method will probably take much time.')
        # build C-U-NET
        print('Building collaborative user network...')
        # Filter isolated nodes: an item with a single rater links nobody.
        self.itemNet = {}
        for item in self.dao.trainSet_i:
            if len(self.dao.trainSet_i[item]) > 1:
                self.itemNet[item] = self.dao.trainSet_i[item]

        self.filteredRatings = defaultdict(list)
        for item in self.itemNet:
            for user in self.itemNet[item]:
                if self.itemNet[item][user] >= 1:
                    self.filteredRatings[user].append(item)

        self.CUNet = defaultdict(list)
        # Build one set per user up front instead of two per user pair.
        itemSets = dict((user, set(items))
                        for user, items in self.filteredRatings.items())
        for user1 in self.filteredRatings:
            s1 = itemSets[user1]
            for user2 in self.filteredRatings:
                if user1 != user2:
                    weight = len(s1.intersection(itemSets[user2]))
                    if weight > 0:
                        # Repeating the neighbour makes a uniform draw from
                        # the list proportional to the number of shared items.
                        self.CUNet[user1] += [user2] * weight

        # build Huffman Tree First
        # get weight
        print('Building Huffman tree...')
        # To accelerate the method, the weight is estimated roughly
        degree = {}
        for user in self.CUNet:
            degree[user] = len(self.CUNet[user])
        ordered = sorted(degree.items(), key=lambda d: d[1])
        treeNodes = [HTreeNode(None, None, weight, user)
                     for user, weight in ordered]
        nodeList = OrderedLinkList()
        for treeNode in treeNodes:
            listNode = Node()
            listNode.val = treeNode
            try:
                nodeList.insert(listNode)
            except AttributeError:
                # NOTE(review): insertion failures are deliberately ignored
                # here; the reason is not visible in this method.
                pass
        self.HTree = HuffmanTree(vecLength=self.walkDim)
        self.HTree.buildTree(nodeList)
        print('Coding for all users...')
        self.HTree.coding(self.HTree.root, '', 0)

        print('Generating random deep walks...')
        self.walks = []
        self.visited = defaultdict(dict)
        for user in self.CUNet:
            for t in range(self.walkCount):
                currentNode = user
                path = [user]
                for step in range(1, self.walkLength):
                    # Step from the node the walk is currently on.
                    neighbours = self.CUNet[currentNode]
                    last = len(neighbours) - 1
                    # randint is inclusive on both ends: 0..last samples
                    # every entry with equal probability.
                    nextNode = neighbours[randint(0, last)]
                    # Prefer nodes not yet reached from this start user;
                    # give up after 10 re-draws to avoid an endless loop.
                    count = 0
                    while nextNode in self.visited[user]:
                        nextNode = neighbours[randint(0, last)]
                        count += 1
                        if count == 10:
                            break
                    path.append(nextNode)
                    self.visited[user][nextNode] = 1
                    currentNode = nextNode
                self.walks.append(path)
        shuffle(self.walks)

        # Training get top-k friends
        print('Generating user embedding...')
        iteration = 1
        while iteration <= self.maxIter:
            loss = 0
            for walk in self.walks:
                # Floor division keeps the index an int on Python 3 as well.
                centerUser = walk[len(walk) // 2]
                centerCode = self.HTree.code[centerUser]
                for user in walk:
                    if user != centerUser:
                        code = self.HTree.code[user]
                        x = self.HTree.vector[centerCode]
                        # Visit every proper, non-empty prefix of the code.
                        for i in range(1, len(code)):
                            prefix = code[0:i]
                            w = self.HTree.vector[prefix]
                            self.HTree.vector[prefix] += self.lRate * (
                                1 - sigmoid(w.dot(x))) * x
                            self.HTree.vector[centerCode] += self.lRate * (
                                1 - sigmoid(w.dot(x))) * w
                            loss += -log(sigmoid(w.dot(x)), 2)
            print('iteration: %s loss: %s' % (iteration, loss))
            iteration += 1
        print('User embedding generated.')

        print('Constructing similarity matrix...')
        self.Sim = SymmetricMatrix(len(self.CUNet))
        for user1 in self.CUNet:
            vec1 = self.HTree.vector[self.HTree.code[user1]]
            for user2 in self.CUNet:
                if user1 == user2 or self.Sim.contains(user1, user2):
                    continue
                vec2 = self.HTree.vector[self.HTree.code[user2]]
                self.Sim.set(user1, user2, cosine(vec1, vec2))
        self.topKSim = {}
        for user in self.CUNet:
            self.topKSim[user] = sorted(self.Sim[user].items(),
                                        key=lambda d: d[1],
                                        reverse=True)[:self.topK]
        print('Similarity matrix finished.')

        # prepare Pu set, IPu set, and Nu set
        print('Preparing item sets...')
        self.PositiveSet = defaultdict(dict)
        self.IPositiveSet = defaultdict(list)
        self.NegativeSet = defaultdict(list)

        for user in self.topKSim:
            for item in self.dao.trainSet_u[user]:
                if self.dao.trainSet_u[user][item] >= 1:
                    self.PositiveSet[user][item] = 1
                else:
                    self.NegativeSet[user].append(item)

            for friend in self.topKSim[user]:
                for item in self.dao.trainSet_u[friend[0]]:
                    if item not in self.PositiveSet[user]:
                        self.IPositiveSet[user].append(item)

        def pairwiseStep(u, hi, lo, scale):
            """Sequential SGD step ranking item row `hi` above `lo` for user row `u`.

            The coefficient is recomputed before each of the three updates,
            so every update sees the result of the previous one.
            """
            def coeff():
                margin = self.P[u].dot(self.Q[hi]) - self.P[u].dot(self.Q[lo])
                return scale * self.lRate * (1 - sigmoid(scale * margin))

            self.P[u] += coeff() * (self.Q[hi] - self.Q[lo])
            self.Q[hi] += coeff() * self.P[u]
            self.Q[lo] -= coeff() * self.P[u]

        print('Training...')
        # The item catalogue does not change while training: build the
        # sampling list once instead of on every draw.
        itemList = list(self.dao.item.keys())
        # Float division: with an integer self.s, 1 / self.s truncates to 0
        # on Python 2 and would switch the second ranking term off.
        inv_s = 1.0 / self.s

        iteration = 0
        while iteration < self.maxIter:
            self.loss = 0

            for user in self.PositiveSet:
                u = self.dao.user[user]
                implicit = self.IPositiveSet[user]
                negatives = self.NegativeSet[user]
                for item in self.PositiveSet[user]:
                    if len(implicit) > 0:
                        # i (own item) above k (item of a similar user)
                        item_k = implicit[randint(0, len(implicit) - 1)]
                        i = self.dao.item[item]
                        k = self.dao.item[item_k]
                        pairwiseStep(u, i, k, 1.0)

                        # k above j: a low-rated own item when there is one,
                        # otherwise a random item outside PositiveSet.
                        if len(negatives) > 0:
                            item_j = negatives[randint(0, len(negatives) - 1)]
                        else:
                            item_j = itemList[randint(0, len(itemList) - 1)]
                            while item_j in self.PositiveSet[user]:
                                item_j = itemList[randint(
                                    0,
                                    len(itemList) - 1)]
                        j = self.dao.item[item_j]
                        pairwiseStep(u, k, j, inv_s)

                        # L2 regularisation shrinks the rows, hence "-=".
                        self.P[u] -= self.lRate * self.regU * self.P[u]
                        self.Q[i] -= self.lRate * self.regI * self.Q[i]
                        self.Q[j] -= self.lRate * self.regI * self.Q[j]
                        self.Q[k] -= self.lRate * self.regI * self.Q[k]

                        # Negative log-likelihood of the two orderings that
                        # were just optimised: (i over k) and (k over j).
                        self.loss += -log(sigmoid(
                            self.P[u].dot(self.Q[i]) -
                            self.P[u].dot(self.Q[k]))) - log(sigmoid(
                                inv_s * (self.P[u].dot(self.Q[k]) -
                                         self.P[u].dot(self.Q[j]))))

            self.loss += self.regU * (self.P * self.P).sum() + self.regI * (
                self.Q * self.Q).sum()
            iteration += 1
            if self.isConverged(iteration):
                break
Exemple #6
0
    def computeSimilarity(self):
        """Embed users with Word2Vec and rank every user's most similar peers.

        Two Word2Vec models are trained, one on ``self.pWalks`` and one on
        ``self.nWalks``.  The vector of token ``'U' + user`` is copied into
        ``self.W`` (users of ``self.positive``) or ``self.G`` (users of
        ``self.negative``).  Every user is then compared by ``cosine`` with
        all other users of the same group and the ``self.topK`` highest
        scoring ones are kept.

        Attributes written:
            pSimilarity / nSimilarity: user -> {neighbour: similarity}.
            pTopKSim / nTopKSim: user -> list of neighbour names.
            threshold: user -> similarity found at rank ``topK // 2`` of the
                positive neighbour list (clamped to the last available rank
                when the list is shorter; left unset when it is empty).
            avg_sim: user -> mean similarity of the first ``topK // 2``
                positive neighbours (0.0 when there are none).
            trueTopKFriends: user -> neighbours present in both the positive
                and the negative top-K list; those neighbours are removed
                from ``pTopKSim[user]``.
        """
        print('Generating user embedding...')
        self.pTopKSim = {}
        self.nTopKSim = {}
        self.pSimilarity = defaultdict(dict)
        self.nSimilarity = defaultdict(dict)
        pos_model = w2v.Word2Vec(self.pWalks,
                                 size=self.walkDim,
                                 window=5,
                                 min_count=0,
                                 iter=10)
        neg_model = w2v.Word2Vec(self.nWalks,
                                 size=self.walkDim,
                                 window=5,
                                 min_count=0,
                                 iter=10)
        for users, model, embedding in ((self.positive, pos_model, self.W),
                                        (self.negative, neg_model, self.G)):
            for user in users:
                uid = self.data.user[user]
                try:
                    embedding[uid] = model.wv['U' + user]
                except KeyError:
                    # Token missing from the model vocabulary: the row keeps
                    # whatever value it already had.
                    continue
        print('User embedding generated.')

        def rank_neighbours(user1, users, embedding):
            # Up to topK (user, similarity) pairs, highest similarity first.
            vec1 = embedding[self.data.user[user1]]
            scored = [(user2, cosine(vec1, embedding[self.data.user[user2]]))
                      for user2 in users if user1 != user2]
            scored.sort(key=lambda pair: pair[1], reverse=True)
            return scored[:self.topK]

        print('Constructing similarity matrix...')
        half = self.topK // 2
        for count, user1 in enumerate(self.positive, 1):
            if count % 200 == 0:
                print(count, '/', len(self.positive))
            fList = rank_neighbours(user1, self.positive, self.W)
            if fList:
                # Clamp the rank so that fewer than half + 1 neighbours no
                # longer raises IndexError.
                self.threshold[user1] = fList[min(half, len(fList) - 1)][1]
            for friend, sim in fList:
                self.pSimilarity[user1][friend] = sim
            self.pTopKSim[user1] = [friend for friend, _ in fList]
            # Divide by the number of values actually summed; the previous
            # divisor (topK / 2) disagreed with the slice for odd topK.
            top_half = [sim for _, sim in fList[:half]]
            self.avg_sim[user1] = (sum(top_half) / len(top_half)
                                   if top_half else 0.0)

        for count, user1 in enumerate(self.negative, 1):
            if count % 200 == 0:
                print(count, '/', len(self.negative))
            fList = rank_neighbours(user1, self.negative, self.G)
            for friend, sim in fList:
                self.nSimilarity[user1][friend] = sim
            self.nTopKSim[user1] = [friend for friend, _ in fList]

        self.trueTopKFriends = defaultdict(list)
        for user in self.pTopKSim:
            # A user without an entry in nTopKSim simply has no overlap.
            negative_top = set(self.nTopKSim.get(user, ()))
            trueFriends = list(
                set(self.pTopKSim[user]).intersection(negative_top))
            self.trueTopKFriends[user] = trueFriends
            self.pTopKSim[user] = list(
                set(self.pTopKSim[user]).difference(set(trueFriends)))
Exemple #7
0
    def buildModel(self):
        """Learn user embeddings from meta-path walks, then train a ranking model.

        Stages, in the order they run:

        1. Build adjacency lists between users (U), tracks (T), artists (A)
           and albums (Z) from ``self.data``; album structures are only
           filled when ``self.data.columns`` has an ``'album'`` key.
        2. For every user and every meta-path, generate ``self.walkCount``
           random walks and train Word2Vec on them; each user's vector is
           copied into ``self.W``.
        3. For every user keep the ``self.topK`` users with the highest
           ``cosine`` value in ``self.topKSim``.
        4. Collect each user's own tracks (``pSet``) and the tracks of the
           top-K users that the user does not have (``ipSet``).
        5. Run SGD on ``self.P`` / ``self.Q`` with pairwise sigmoid losses
           and pull each user's factors towards those of the top-K users,
           for at most ``self.maxIter`` iterations or until
           ``self.isConverged`` returns a true value.

        This block uses Python 2 syntax (``print`` statement, ``<>``,
        ``has_key``).
        """

        #data clean

        # li = self.sao.followees.keys()
        #
        print 'Kind Note: This method will probably take much time.'
        # build U-F-NET
        print 'Building weighted user-friend network...'
        # filter isolated nodes and low ratings
        # Definition of Meta-Path

        # One row of length walkDim per user, random values in [0, 0.1).
        self.W = np.random.rand(self.data.getSize('user'), self.walkDim) / 10

        # Adjacency structures used as transitions by the random walks.
        # The list-valued ones are sampled with choice(); the plain dicts
        # below map a node to exactly one parent.
        self.user2track = defaultdict(list)
        self.user2artist = defaultdict(list)
        self.user2album = defaultdict(list)
        self.track2user = defaultdict(list)
        self.artist2user = defaultdict(list)
        self.album2user = defaultdict(list)
        self.artist2track = defaultdict(list)
        self.artist2album = defaultdict(list)
        self.album2track = defaultdict(list)
        self.album2artist = {}
        self.track2artst = {}
        self.track2album = {}

        # user -> items / artists / albums, one entry per record.
        for user in self.data.userRecord:
            for item in self.data.userRecord[user]:
                self.user2track[user].append(item[self.recType])
                self.user2artist[user].append(item['artist'])
                if self.data.columns.has_key('album'):
                    self.user2album[user].append(item['album'])

        # Reverse edges: each user is repeated by the value stored in
        # self.data.listened, so a later uniform choice() is weighted by it.
        for artist in self.data.listened['artist']:
            for user in self.data.listened['artist'][artist]:
                self.artist2user[artist] += [
                    user
                ] * self.data.listened['artist'][artist][user]

        for track in self.data.listened['track']:
            for user in self.data.listened['track'][track]:
                self.track2user[track] += [
                    user
                ] * self.data.listened['track'][track][user]

        if self.data.columns.has_key('album'):
            for album in self.data.listened['album']:
                for user in self.data.listened['album'][album]:
                    self.album2user[album] += [
                        user
                    ] * self.data.listened['album'][album][user]

        # Catalogue edges: artist -> tracks and the inverse track -> artist.
        for artist in self.data.artist2Track:
            self.artist2track[artist] = self.data.artist2Track[artist].keys()
            for key in self.data.artist2Track[artist]:
                self.track2artst[key] = artist
        if self.data.columns.has_key('album'):
            for album in self.data.album2Track:
                self.album2track[album] = self.data.album2Track[album].keys()
                for key in self.data.album2Track[album]:
                    self.track2album[key] = album

            for artist in self.data.artist2Album:
                self.artist2album[artist] = self.data.artist2Album[
                    artist].keys()
                for key in self.data.artist2Album[artist]:
                    self.album2artist[key] = artist

        print 'Generating random meta-path random walks...'
        self.walks = []
        #self.usercovered = {}
        # Meta-paths: each letter is the node type visited at that step
        # (U user, T track, A artist, Z album).
        p1 = 'UTU'
        p2 = 'UAU'
        p3 = 'UZU'
        p4 = 'UTATU'
        p5 = 'UTZTU'
        p6 = 'UTZAZTU'

        # Paths that pass through an album are only used when album data exists.
        mPaths = []
        if self.data.columns.has_key('album'):
            mPaths = [p1, p2, p3, p4, p5, p6]
        else:
            mPaths = [p1, p2, p4]

        for user in self.data.userRecord:

            for mp in mPaths:
                for t in range(self.walkCount):

                    path = [user]
                    lastNode = user
                    nextNode = user
                    lastType = 'U'
                    # The meta-path (minus its leading 'U') is repeated
                    # walkLength / len(mp[1:]) times.
                    # NOTE(review): relies on Python 2 integer division --
                    # confirm walkLength is an int.
                    for i in range(self.walkLength / len(mp[1:])):
                        for tp in mp[1:]:
                            try:
                                # Pick the next node according to the
                                # (target type, current type) pair.
                                if tp == 'T' and lastType == 'U':
                                    nextNode = choice(
                                        self.user2track[lastNode])
                                elif tp == 'T' and lastType == 'A':
                                    nextNode = choice(
                                        self.artist2track[lastNode])
                                elif tp == 'T' and lastType == 'Z':
                                    nextNode = choice(
                                        self.album2track[lastNode])
                                elif tp == 'A' and lastType == 'T':
                                    nextNode = self.track2artst[lastNode]
                                elif tp == 'A' and lastType == 'Z':
                                    nextNode = self.album2artist[lastNode]
                                elif tp == 'A' and lastType == 'U':
                                    nextNode = choice(
                                        self.user2artist[lastNode])

                                elif tp == 'Z' and lastType == 'U':
                                    nextNode = choice(
                                        self.user2album[lastNode])
                                elif tp == 'Z' and lastType == 'A':
                                    nextNode = choice(
                                        self.artist2album[lastNode])
                                elif tp == 'Z' and lastType == 'T':
                                    nextNode = self.track2album[lastNode]

                                elif tp == 'U':
                                    if lastType == 'T':
                                        nextNode = choice(
                                            self.track2user[lastNode])
                                    elif lastType == 'Z':
                                        nextNode = choice(
                                            self.album2user[lastNode])
                                    elif lastType == 'A':
                                        nextNode = choice(
                                            self.artist2user[lastNode])

                                path.append(nextNode)
                                lastNode = nextNode
                                lastType = tp

                            except (KeyError, IndexError):
                                # A missing mapping or an empty candidate
                                # list discards the walk built so far.
                                # NOTE(review): break only leaves the inner
                                # `for tp` loop; the outer repetition loop
                                # keeps running and may append to the
                                # emptied path, yielding a walk that no
                                # longer starts at `user` -- confirm intent.
                                path = []
                                break

                    if path:
                        self.walks.append(path)
                        # for node in path:
                        #     if node[1] == 'U' or node[1] == 'F':
                        #         self.usercovered[node[0]] = 1
                        # print path
                        # if mp == 'UFIU':
                        # pass
        shuffle(self.walks)
        print 'walks:', len(self.walks)
        # Training get top-k friends
        print 'Generating user embedding...'

        self.topKSim = {}

        model = w2v.Word2Vec(self.walks,
                             size=self.walkDim,
                             window=5,
                             min_count=0,
                             iter=self.epoch)

        # Copy the learned vector of every user into its row of self.W.
        # NOTE(review): presumably raises KeyError for a user that appears
        # in no stored walk -- verify against the Word2Vec version in use.
        for user in self.data.userRecord:
            uid = self.data.getId(user, 'user')
            self.W[uid] = model.wv[user]
        print 'User embedding generated.'

        print 'Constructing similarity matrix...'
        i = 0

        # All-pairs comparison: for each user keep the topK (user, sim)
        # pairs with the highest cosine value.
        for user1 in self.data.userRecord:
            uSim = []
            i += 1
            if i % 200 == 0:
                print i, '/', len(self.data.userRecord)
            vec1 = self.W[self.data.getId(user1, 'user')]
            for user2 in self.data.userRecord:
                if user1 <> user2:
                    vec2 = self.W[self.data.getId(user2, 'user')]
                    sim = cosine(vec1, vec2)
                    uSim.append((user2, sim))

            self.topKSim[user1] = sorted(uSim,
                                         key=lambda d: d[1],
                                         reverse=True)[:self.topK]

        # print 'Similarity matrix finished.'
        # # # #print self.topKSim
        #import pickle
        # # # #
        # # # #recordTime = strftime("%Y-%m-%d %H-%M-%S", localtime(time()))
        # similarity = open('SocialMR-lastfm-sim'+self.foldInfo+'.pkl', 'wb')
        # vectors = open('SocialMR-lastfm-vec'+self.foldInfo+'.pkl', 'wb')
        # #Pickle dictionary using protocol 0.
        #
        # pickle.dump(self.topKSim, similarity)
        # pickle.dump((self.W,self.G),vectors)
        # similarity.close()
        # vectors.close()

        # matrix decomposition
        #pkl_file = open('SocialMR-lastfm-sim' + self.foldInfo + '.pkl', 'rb')

        #self.topKSim = pickle.load(pkl_file)

        # self.F = np.random.rand(self.data.trainingSize()[0], self.k) / 10
        # prepare Pu set, IPu set, and Nu set

        print 'Preparing item sets...'
        # PositiveSet / pSet: tracks found in the user's own records.
        # IPositiveSet / ipSet: tracks found in the records of the user's
        # top-K similar users that are not in the user's PositiveSet.
        # The dicts give O(1) membership tests; the lists feed choice().
        self.PositiveSet = defaultdict(dict)
        self.pSet = defaultdict(list)
        self.IPositiveSet = defaultdict(dict)
        self.ipSet = defaultdict(list)
        # self.NegativeSet = defaultdict(list)

        # NOTE(review): this section indexes records with 'track' while the
        # adjacency lists above use self.recType -- confirm they agree.
        for user in self.data.userRecord:
            for item in self.data.userRecord[user]:
                self.PositiveSet[user][item['track']] = 1
                self.pSet[user].append(item['track'])

            for friend, sim in self.topKSim[user]:
                for item in self.data.userRecord[friend]:
                    if not self.PositiveSet[user].has_key(item['track']):
                        self.IPositiveSet[user][item['track']] = 1
                        self.ipSet[user].append(item['track'])

        # Constant used to scale the first pairwise term by 1 / (Suk + 1).
        Suk = 0.5
        print 'Training...'
        iteration = 0
        while iteration < self.maxIter:
            self.loss = 0
            itemList = self.data.name2id['track'].keys()
            for user in self.pSet:
                u = self.data.getId(user, 'user')
                kItems = self.ipSet[user]
                for item in self.pSet[user]:
                    i = self.data.getId(item, 'track')
                    if len(self.ipSet[user]) > 0:
                        # First pair: own track i against a track k drawn
                        # from ipSet, with the score gap scaled by
                        # 1 / (Suk + 1).
                        item_k = choice(kItems)
                        k = self.data.getId(item_k, 'track')
                        s1 = sigmoid((self.P[u].dot(self.Q[i]) -
                                      self.P[u].dot(self.Q[k])) / (Suk + 1))
                        # P[u] is updated in place first, so the Q updates
                        # below already see the new P[u].
                        self.P[u] += 1 / (Suk + 1) * self.lRate * (1 - s1) * (
                            self.Q[i] - self.Q[k])
                        self.Q[i] += 1 / (Suk + 1) * self.lRate * (
                            1 - s1) * self.P[u]
                        self.Q[k] -= 1 / (Suk + 1) * self.lRate * (
                            1 - s1) * self.P[u]
                        item_j = ''
                        # if len(self.NegativeSet[user])>0:
                        #     item_j = choice(self.NegativeSet[user])
                        # else:
                        # Second pair: track k against a track j that is in
                        # neither PositiveSet nor IPositiveSet of the user.
                        # NOTE(review): the resampling loop cannot end if
                        # every track is in one of the two sets.
                        item_j = choice(itemList)
                        while (self.PositiveSet[user].has_key(item_j)
                               or self.IPositiveSet[user].has_key(item_j)):
                            item_j = choice(itemList)
                        j = self.data.getId(item_j, 'track')
                        s2 = sigmoid(self.P[u].dot(self.Q[k]) -
                                     self.P[u].dot(self.Q[j]))
                        self.P[u] += self.lRate * (1 - s2) * (self.Q[k] -
                                                              self.Q[j])
                        self.Q[k] += self.lRate * (1 - s2) * self.P[u]
                        self.Q[j] -= self.lRate * (1 - s2) * self.P[u]

                        # Shrink all touched vectors (regularisation step).
                        self.P[u] -= self.lRate * self.regU * self.P[u]
                        self.Q[i] -= self.lRate * self.regI * self.Q[i]
                        self.Q[j] -= self.lRate * self.regI * self.Q[j]
                        self.Q[k] -= self.lRate * self.regI * self.Q[k]

                        self.loss += -log(s1) - log(s2)

                    else:
                        # No ipSet tracks for this user: a single pair of
                        # own track i against a track j outside PositiveSet.
                        # NOTE(review): same non-termination risk as above
                        # when the user has every track.
                        item_j = choice(itemList)
                        while (self.PositiveSet[user].has_key(item_j)):
                            item_j = choice(itemList)
                        j = self.data.getId(item_j, 'track')
                        s = sigmoid(self.P[u].dot(self.Q[i]) -
                                    self.P[u].dot(self.Q[j]))
                        self.P[u] += self.lRate * (1 - s) * (self.Q[i] -
                                                             self.Q[j])
                        self.Q[i] += self.lRate * (1 - s) * self.P[u]
                        self.Q[j] -= self.lRate * (1 - s) * self.P[u]
                        self.P[u] -= self.lRate * self.regU * self.P[u]
                        self.Q[i] -= self.lRate * self.regI * self.Q[i]
                        self.Q[j] -= self.lRate * self.regI * self.Q[j]

                        self.loss += -log(s)
            # Move every user's factors towards those of each of the user's
            # top-K similar users, with step size alpha * lRate.
            for user in self.topKSim:
                for friend in self.topKSim[user]:
                    u = self.data.getId(user, 'user')
                    f = self.data.getId(friend[0], 'user')
                    self.P[u] -= self.alpha * self.lRate * (self.P[u] -
                                                            self.P[f])
            # Add the squared-norm penalty of P and Q to the reported loss.
            self.loss += self.regU * (self.P * self.P).sum() + self.regI * (
                self.Q * self.Q).sum()
            iteration += 1
            if self.isConverged(iteration):
                break
Exemple #8
0
    def buildModel(self):
        """Fit a biased factor model regularised by track-embedding similarity.

        Stages, in the order they run:

        1. Train Word2Vec on the play lists of users that have more than 10
           records and copy the vector of every track in those play lists
           into ``self.T``.
        2. For each such track store its ``self.topK`` highest ``cosine``
           matches as ``(track, similarity)`` pairs in ``self.topKSim``.
        3. Count, per retained user, how often each ``self.recType`` item
           occurs in the user's records.
        4. For ``self.maxIter`` passes, run SGD on ``self.X``, ``self.Y``,
           ``self.Bu`` and ``self.Bi`` so that the predicted value
           approaches the count, then nudge ``Y[t1].dot(Y[t2])`` towards
           the stored similarity of every top-K track pair.

        ``self.loss`` is reset at the start of each pass and printed at the
        end of it.
        """
        self.T = np.random.rand(self.data.getSize('track'), self.k)
        sentences = []
        self.listenTrack = set()
        self.user = defaultdict(list)
        for user in self.data.userRecord:
            records = self.data.userRecord[user]
            # Users with 10 records or fewer contribute nothing.
            if len(records) <= 10:
                continue
            self.user[user] = records
            playList = [item['track'] for item in records]
            self.listenTrack.update(playList)
            sentences.append(playList)
        model = w2v.Word2Vec(sentences, size=self.k, window=5, min_count=0, iter=10)
        for track in self.listenTrack:
            self.T[self.data.getId(track, 'track')] = model.wv[track]
        print ('song embedding generated.')

        print ('Constructing similarity matrix...')
        # Resolve every track's row once instead of once per compared pair.
        trackVecs = dict((track, self.T[self.data.getId(track, 'track')])
                         for track in self.listenTrack)
        self.topKSim = {}
        for count, track1 in enumerate(self.listenTrack, 1):
            if count % 200 == 0:
                print (count, '/', len(self.listenTrack))
            vec1 = trackVecs[track1]
            tSim = [(track2, cosine(vec1, trackVecs[track2]))
                    for track2 in self.listenTrack if track1 != track2]
            self.topKSim[track1] = sorted(tSim, key=lambda d: d[1], reverse=True)[:self.topK]

        # user -> {item: number of occurrences in the user's records}
        userListen = defaultdict(dict)
        for user in self.user:
            counts = userListen[user]
            for item in self.user[user]:
                key = item[self.recType]
                counts[key] = counts.get(key, 0) + 1
        print ('training...')

        iteration = 0
        while iteration < self.maxIter:
            self.loss = 0
            for user in self.data.name2id['user']:
                u = self.data.getId(user, 'user')
                # Users that were filtered out above have no counts.
                listened = userListen.get(user, {})
                for item in listened:
                    i = self.data.getId(item, self.recType)
                    # Read both biases for this sample before updating them.
                    # The user bias used to be read once per user, so its
                    # regularisation term went stale after the first item.
                    bu = self.Bu[u]
                    bi = self.Bi[i]
                    rating = self.Y[i].dot(self.X[u]) + self.data.globalMean + bu + bi
                    error = listened[item] - rating
                    self.loss += error**2
                    # X[u] is updated in place first; the Y[i] step uses it.
                    self.X[u] += self.lRate * (error*self.Y[i] - self.regU*self.X[u])
                    self.Y[i] += self.lRate * (error*self.X[u] - self.regI*self.Y[i])

                    self.Bu[u] += self.lRate * (error - self.regB * bu)
                    self.Bi[i] += self.lRate * (error - self.regB * bi)

            # Similarity term: move the inner product of each top-K track
            # pair towards the similarity stored for that pair.
            for t1 in self.topKSim:
                tid1 = self.data.getId(t1, 'track')
                for t2 in self.topKSim[t1]:
                    tid2 = self.data.getId(t2[0], 'track')
                    sim = t2[1]
                    error2 = (sim - self.Y[tid1].dot(self.Y[tid2]))
                    self.loss += error2**2
                    self.Y[tid1] += 0.5*self.alpha*self.lRate*(error2)*self.Y[tid2]
                    self.Y[tid2] += 0.5*self.alpha*self.lRate*(error2)*self.Y[tid1]
            self.loss += self.regB*(self.Bu * self.Bu).sum() + self.regB*(self.Bi*self.Bi).sum() + (self.X * self.X).sum() + (self.Y * self.Y).sum()
            iteration += 1
            print ('iteration:',iteration,'loss:',self.loss)
Exemple #9
0
    def buildModel(self):
        """Embed users from a co-rating network, then factorise the ratings.

        Stages, in the order they run:

        1. Build ``self.CUNet``: two users are linked once per item that
           both of them have in ``self.filteredRatings``.
        2. Build a Huffman tree over the users (weight = length of the
           user's ``CUNet`` list) and assign every user a code.
        3. Generate ``self.walkCount`` walks of ``self.walkLength`` nodes
           per user and train the tree vectors on them for ``self.epoch``
           passes.
        4. Store the ``cosine`` value of every user pair in ``self.Sim``
           and keep each user's ``self.topK`` best pairs in ``self.topKSim``.
        5. Run SGD on ``self.P`` / ``self.Q`` against the training ratings,
           pulling each user's factors towards those of the top-K users,
           for at most ``self.maxIter`` iterations or until
           ``self.isConverged`` returns a true value.

        This block uses Python 2 syntax (``print`` statement, ``<>``,
        ``has_key``, ``iteritems``).
        """
        print 'Kind Note: This method will probably take much time.'
        #build C-U-NET
        print 'Building collaborative user network...'
        #filter isolated nodes and low ratings

        # Keep only items whose training entry has more than one element.
        self.itemNet = {}
        for item in self.dao.trainSet_i:
            if len(self.dao.trainSet_i[item]) > 1:
                self.itemNet[item] = self.dao.trainSet_i[item]

        # user -> items whose stored value exceeds 0.75.
        # NOTE(review): presumably ratings are normalised to [0, 1] --
        # confirm against the data loader.
        self.filteredRatings = defaultdict(list)
        for item in self.itemNet:
            for user in self.itemNet[item]:
                if self.itemNet[item][user] > 0.75:
                    self.filteredRatings[user].append(item)

        # user1 -> list in which user2 appears once per shared item, so a
        # uniform choice() over the list is weighted by the overlap size.
        self.CUNet = defaultdict(list)

        for user1 in self.filteredRatings:
            s1 = set(self.filteredRatings[user1])
            for user2 in self.filteredRatings:
                if user1 <> user2:
                    s2 = set(self.filteredRatings[user2])
                    weight = len(s1.intersection(s2))
                    if weight > 0:
                        self.CUNet[user1] += [user2] * weight

        #build Huffman Tree First
        #get weight
        print 'Building Huffman tree...'
        #To accelerate the method, the weight is estimated roughly
        # Node weight = length of the user's CUNet list; nodes are created
        # in ascending weight order.
        nodes = {}
        for user in self.CUNet:
            nodes[user] = len(self.CUNet[user])
        nodes = sorted(nodes.iteritems(), key=lambda d: d[1])
        nodes = [HTreeNode(None, None, user[1], user[0]) for user in nodes]
        nodeList = OrderedLinkList()
        for node in nodes:
            listNode = Node()
            listNode.val = node
            try:
                nodeList.insert(listNode)
            except AttributeError:
                # NOTE(review): an AttributeError from insert() is silently
                # ignored, so the node may be missing from the list --
                # confirm this is deliberate.
                pass
        self.HTree = HuffmanTree(vecLength=self.walkDim)
        self.HTree.buildTree(nodeList)
        print 'Coding for all users...'
        self.HTree.coding(self.HTree.root, '', 0)

        print 'Generating random deep walks...'
        self.walks = []
        # visited[user] is filled across all walks of that user and is
        # never cleared between walks.
        self.visited = defaultdict(dict)
        for user in self.CUNet:
            for t in range(self.walkCount):
                path = [user]
                for i in range(1, self.walkLength):
                    # NOTE(review): every step samples from the neighbours
                    # of the start user, not of the last node on the path
                    # -- confirm this is intended.
                    nextNode = choice(self.CUNet[user])
                    count = 0
                    # Resample an already visited node at most 10 times;
                    # after that the last candidate is accepted anyway.
                    while (self.visited[user].has_key(nextNode)):
                        nextNode = choice(self.CUNet[user])
                        #break infinite loop
                        count += 1
                        if count == 10:
                            break
                    path.append(nextNode)
                    self.visited[user][nextNode] = 1
                self.walks.append(path)
                #print path
        shuffle(self.walks)

        #Training get top-k friends
        print 'Generating user embedding...'
        iteration = 1
        while iteration <= self.epoch:
            loss = 0
            for walk in self.walks:
                for user in walk:
                    # The middle element of the walk is the centre user;
                    # every other element is paired with it.
                    centerUser = walk[len(walk) / 2]
                    if user <> centerUser:
                        code = self.HTree.code[user]
                        centerCode = self.HTree.code[centerUser]
                        x = self.HTree.vector[centerCode]
                        # Visit every proper, non-empty prefix of the
                        # user's code and update both the prefix vector
                        # and the centre user's vector.
                        for i in range(1, len(code)):
                            prefix = code[0:i]
                            w = self.HTree.vector[prefix]
                            self.HTree.vector[prefix] += self.lRate * (
                                1 - sigmoid(w.dot(x))) * x
                            self.HTree.vector[centerCode] += self.lRate * (
                                1 - sigmoid(w.dot(x))) * w
                            loss += -log(sigmoid(w.dot(x)))
            print 'iteration:', iteration, 'loss:', loss
            iteration += 1
        print 'User embedding generated.'

        print 'Constructing similarity matrix...'
        # Store the cosine value of each user pair once; pairs that
        # self.Sim already contains are skipped.
        self.Sim = SymmetricMatrix(len(self.CUNet))
        for user1 in self.CUNet:
            for user2 in self.CUNet:
                if user1 <> user2:
                    prefix1 = self.HTree.code[user1]
                    vec1 = self.HTree.vector[prefix1]
                    prefix2 = self.HTree.code[user2]
                    vec2 = self.HTree.vector[prefix2]
                    if self.Sim.contains(user1, user2):
                        continue
                    sim = cosine(vec1, vec2)
                    self.Sim.set(user1, user2, sim)
        # user -> topK (other user, similarity) pairs, highest value first.
        self.topKSim = {}
        for user in self.CUNet:
            self.topKSim[user] = sorted(self.Sim[user].iteritems(),
                                        key=lambda d: d[1],
                                        reverse=True)[:self.topK]
        print 'Similarity matrix finished.'
        #print self.topKSim

        #matrix decomposition
        print 'Decomposing...'

        iteration = 0
        while iteration < self.maxIter:
            self.loss = 0
            # Squared-error SGD over every (user, item, rating) entry.
            for entry in self.dao.trainingData:
                user, item, rating = entry
                u = self.dao.user[user]  #get user id
                i = self.dao.item[item]  #get item id
                error = rating - self.P[u].dot(self.Q[i])
                self.loss += error**2
                p = self.P[u]
                q = self.Q[i]

                #update latent vectors
                self.P[u] += self.lRate * (error * q - self.regU * p)
                self.Q[i] += self.lRate * (error * p - self.regI * q)

            # Pull each user's factors towards those of the top-K users;
            # the loss contribution is measured after the step is applied.
            for user in self.CUNet:

                u = self.dao.user[user]
                friends = self.topKSim[user]
                for friend in friends:
                    uf = self.dao.user[friend[0]]
                    self.P[u] -= self.lRate * (self.P[u] -
                                               self.P[uf]) * self.alpha
                    self.loss += self.alpha * (
                        self.P[u] - self.P[uf]).dot(self.P[u] - self.P[uf])

            # Add the squared-norm penalty of P and Q to the reported loss.
            self.loss += self.regU * (self.P * self.P).sum() + self.regI * (
                self.Q * self.Q).sum()
            iteration += 1
            if self.isConverged(iteration):
                break
Exemple #10
0
    def buildModel(self):
        #build a list for weighted negative sampling
        negCandidate = []
        # for track in self.data.trackListened:
        #     count = sum(self.data.trackListened[track].values())
        #     id = self.data.getId(track,'track')
        #     negCandidate+=[id]*count
        # print 'learning music embedding...'
        # iteration = 0
        # while iteration < self.epoch:
        #     loss = 0
        #     for user in self.data.userRecord:
        #         u = self.data.getId(user, 'user')
        #         #global user preference
        #         global_uv = np.zeros(self.k)
        #         for event in self.data.userRecord[user]:
        #             id = self.data.getId(event['track'], 'track')
        #             global_uv += self.Q[id]
        #         global_uv /= len(self.data.userRecord[user])
        #
        #         #song embedding
        #         for i in range(len(self.data.userRecord[user])):
        #             start = max(0,i-self.winSize/2)
        #             end = min(i+self.winSize/2,len(self.data.userRecord[user])-1)
        #             local = self.data.userRecord[user][start:i]+self.data.userRecord[user][i+1:end+1]
        #             local_v = np.zeros(self.k)
        #             for event in local:
        #                 id = self.data.getId(event['track'],'track')
        #                 local_v+=self.Q[id]
        #             v_hat = (global_uv+local_v)/(end-start+1)
        #             center_id = self.data.getId(self.data.userRecord[user][i]['track'],'track')
        #             center_v = self.Q[center_id]
        #             gradient = self.lRate*(1-sigmoid(v_hat.dot(center_v)))*v_hat
        #             gradient2 = self.lRate*(1-sigmoid(v_hat.dot(center_v)))*center_v
        #             self.Q[center_id]+=gradient
        #             global_uv+=gradient/len(self.data.userRecord[user])
        #             global_uv+=gradient2/len(self.data.userRecord[user])*(end-start)
        #             for event in local:
        #                 id = self.data.getId(event['track'],'track')
        #                 self.Q[id]+=gradient2/(end-start+1)
        #             loss+= -log(sigmoid(v_hat.dot(center_v)))
        #             #negative sampling
        #             for j in range(self.negCount):
        #                 neg_id = choice(negCandidate)
        #                 while neg_id==center_id:
        #                     neg_id = choice(negCandidate)
        #                 neg_v = self.Q[neg_id]
        #                 gradient = -self.lRate * (1 - sigmoid(v_hat.dot(neg_v))) * v_hat
        #                 gradient2 = -self.lRate * (1 - sigmoid(v_hat.dot(neg_v))) * neg_v
        #                 self.Q[center_id]+=gradient
        #                 for event in local:
        #                     id = self.data.getId(event['track'], 'track')
        #                     self.Q[id] += gradient2 / (end - start + 1)
        #                 loss+=-(log(1-sigmoid(neg_v.dot(v_hat))))

        sentences = []
        for user in self.data.userRecord:
            playList = []
            for item in self.data.userRecord[user]:
                playList.append(item['track'])
            sentences.append(playList)
        model = w2v.Word2Vec(sentences,
                             size=self.k,
                             window=5,
                             min_count=0,
                             iter=10,
                             sg=1)
        for track in self.data.trackListened:
            tid = self.data.getId(track, 'track')
            self.Q[tid] = model.wv[track]

        # #regularization
        # for album in self.data.album2Track:
        #     for track1 in self.data.album2Track[album]:
        #         for track2 in self.data.album2Track[album]:
        #             t1 = self.data.getId(track1,'track')
        #             t2 = self.data.getId(track2,'track')
        #             v1 = self.Q[t1]
        #             v2 = self.Q[t2]
        #             self.Q[t1]+=self.lRate*(exp(v1.dot(v2))*v2)
        #             self.Q[t2] += self.lRate*(exp(v1.dot(v2))*v1)

        #
        #         #print 'window %d finished' %(i)
        #     #print 'user %s finished.' %(user)
        # iteration+=1
        # print 'iteration %d, loss %.4f' %(iteration,loss)

        #preference embedding
        self.R = np.zeros((self.data.getSize('user'), self.k))
        for user in self.data.userRecord['user']:
            uid = self.data.getId(user, 'user')
            global_uv = np.zeros(self.k)
            local_uv = np.zeros(self.k)
            for event in self.data.userRecord[user]:
                tid = self.data.getId(event['track'], 'track')
                global_uv += self.Q[tid]
            self.P[uid] = global_uv / len(self.data.userRecord['user'])
            recent = max(0, len(self.data.userRecord[user]) - 20)
            for event in self.data.userRecord[user][recent:]:
                tid = self.data.getId(event['track'], 'track')
                local_uv += self.Q[tid]
            self.R[uid] = local_uv / recent

        for t1 in self.data.trackListened:
            if len(self.data.trackListened[t1]) < 200:
                continue
            xiangsi = ''
            m = 0
            s = ''
            n = ''
            mi = 10000
            s1 = set(self.data.trackListened[t1].keys())
            for t2 in self.data.trackListened:
                if t1 != t2:
                    s2 = set(self.data.trackListened[t2].keys())
                    l = len(s1.intersection(s2))
                    if l > m and l > 50:
                        m = l
                        s = t2
                    if l < mi:
                        mi = l
                        n = t2

            print t1, s, cosine(self.Q[self.data.getId(t1, 'track')],
                                self.Q[self.data.getId(s, 'track')]), m
            print t1, n, cosine(self.Q[self.data.getId(t1, 'track')],
                                self.Q[self.data.getId(n, 'track')]), mi
            break
Exemple #11
0
    def buildModel(self):
        """Embed users with meta-path random walks, then train the factors.

        Steps, in order:

        1. Remove social links that mention users missing from
           ``self.dao.user``.
        2. Build the weighted neighbour lists ``self.UFNet`` (from
           ``self.sao.followees``) and ``self.UTNet`` (from
           ``self.sao.followers``).
        3. Generate "positive" and "negative" meta-path walks, train one
           Word2Vec model on each and copy the ``'U' + user`` vectors into
           ``self.W`` and ``self.G`` respectively.
        4. For every user keep the ``self.topK`` neighbours with the highest
           ``cosine`` value in each space; fill ``self.threshold``,
           ``self.avg_sim``, ``self.pTopKSim``, ``self.nTopKSim``,
           ``self.pSimilarity``, ``self.nSimilarity`` and
           ``self.trueTopKFriends``.
        5. Loop up to ``self.maxIter`` times, calling ``self.optimization`` /
           ``self.optimization_thres`` on sampled items, until
           ``self.isConverged`` returns a true value.

        The syntax is valid on both Python 2 and Python 3 (``print(...)``
        with a single argument, ``in`` instead of ``has_key``, ``//`` for
        integer division, ``list(d.keys())`` before ``choice``).
        """
        # Social links may reference users that self.dao.user does not know.
        self._pruneUnknownUsers(self.sao.followees)
        self._pruneUnknownUsers(self.sao.followers)

        print('Kind Note: This method will probably take much time.')
        # build U-F-NET
        print('Building weighted user-friend network...')
        # Definition of the meta-paths and of the number of walks started
        # per user for each of them.
        p1 = 'UIU'
        p2 = 'UFU'
        p3 = 'UTU'
        p4 = 'UFIU'
        p5 = 'UFUIU'
        mPaths = [p1, p2, p3, p4, p5]
        walkCounts = {p1: 10, p2: 8, p3: 8, p4: 5, p5: 5}

        self.G = np.random.rand(self.dao.trainingSize()[0], self.walkDim) / 10
        self.W = np.random.rand(self.dao.trainingSize()[0], self.walkDim) / 10

        self.UFNet = self._buildWeightedNet(self.sao.followees)
        self.UTNet = self._buildWeightedNet(self.sao.followers)

        print('Generating random meta-path random walks... (Positive)')
        self.pWalks = self._generateWalks(mPaths, walkCounts,
                                          self.positive, self.pItems)
        self.nWalks = self._generateWalks(mPaths, walkCounts,
                                          self.negative, self.nItems)

        shuffle(self.pWalks)
        print('pwalks: %d' % len(self.pWalks))
        print('nwalks: %d' % len(self.nWalks))
        # Training get top-k friends
        print('Generating user embedding...')

        self.pTopKSim = {}
        self.nTopKSim = {}
        self.pSimilarity = defaultdict(dict)
        self.nSimilarity = defaultdict(dict)
        model = w2v.Word2Vec(self.pWalks,
                             size=self.walkDim,
                             window=5,
                             min_count=0,
                             iter=10)
        model2 = w2v.Word2Vec(self.nWalks,
                              size=self.walkDim,
                              window=5,
                              min_count=0,
                              iter=10)

        # Users that never appeared in a walk keep their random initial row.
        for user in self.positive:
            uid = self.dao.user[user]
            try:
                self.W[uid] = model.wv['U' + user]
            except KeyError:
                continue

        for user in self.negative:
            uid = self.dao.user[user]
            try:
                self.G[uid] = model2.wv['U' + user]
            except KeyError:
                continue
        print('User embedding generated.')

        print('Constructing similarity matrix...')
        half = self.topK // 2
        for user1, fList in self._topKNeighbours(self.positive, self.W):
            # The similarity at the middle rank is the initial threshold.
            self.threshold[user1] = fList[half][1]
            for friend, sim in fList:
                self.pSimilarity[user1][friend] = sim
            self.pTopKSim[user1] = [pair[0] for pair in fList]
            self.avg_sim[user1] = sum([pair[1] for pair in fList][:half]) / half

        for user1, fList in self._topKNeighbours(self.negative, self.G):
            for friend, sim in fList:
                self.nSimilarity[user1][friend] = sim
            self.nTopKSim[user1] = [pair[0] for pair in fList]

        # "True" friends are neighbours in BOTH spaces; they are removed from
        # the positive-only list so the two lists stay disjoint.
        self.trueTopKFriends = defaultdict(list)
        for user in self.pTopKSim:
            trueFriends = list(
                set(self.pTopKSim[user]).intersection(set(
                    self.nTopKSim[user])))
            self.trueTopKFriends[user] = trueFriends
            self.pTopKSim[user] = list(
                set(self.pTopKSim[user]).difference(set(trueFriends)))

        print('Decomposing...')
        self.F = np.random.rand(self.dao.trainingSize()[0], self.k) / 10
        # prepare Pu set, IPu set, and Nu set
        print('Preparing item sets...')
        self.PositiveSet = defaultdict(dict)
        self.NegSets = defaultdict(dict)

        for user in self.dao.user:
            for item in self.dao.trainSet_u[user]:
                self.PositiveSet[user][item] = 1

        for user in self.dao.user:
            for item in self.negative[user]:
                if item in self.dao.item:
                    self.NegSets[user][item] = 1

        iteration = 0
        while iteration < self.maxIter:
            self.loss = 0
            self._collectCandidateItems()

            # choice() needs a sequence, hence the explicit lists.
            itemList = list(self.dao.item.keys())
            for user in self.PositiveSet:
                kItems = list(self.IPositiveSet[user].keys())
                okItems = list(self.OKSet[user].keys())
                nItems = list(self.NegSets[user].keys())

                u = self.dao.user[user]

                for item in self.PositiveSet[user]:
                    i = self.dao.item[item]

                    if kItems and okItems:
                        item_k = choice(kItems)
                        uf = self.IPositiveSet[user][item_k]
                        k = self.dao.item[item_k]
                        self.optimization_thres(u, i, k, user, uf)

                        item_ok = choice(okItems)
                        ok = self.dao.item[item_ok]
                        self.optimization(u, k, ok)

                        j = self._sampleUnobserved(user, itemList)
                        self.optimization(u, ok, j)

                    elif okItems:
                        item_ok = choice(okItems)
                        ok = self.dao.item[item_ok]
                        uf = self.OKSet[user][item_ok]
                        self.optimization_thres(u, i, ok, user, uf)

                        j = self._sampleUnobserved(user, itemList)
                        self.optimization(u, ok, j)

                    elif kItems:
                        item_k = choice(kItems)
                        uf = self.IPositiveSet[user][item_k]
                        k = self.dao.item[item_k]
                        self.optimization_thres(u, i, k, user, uf)

                        j = self._sampleUnobserved(user, itemList)
                        self.optimization(u, k, j)

                    else:
                        j = self._sampleUnobserved(user, itemList)
                        self.optimization(u, i, j)

                    if nItems:
                        item_n = choice(nItems)
                        n = self.dao.item[item_n]
                        self.optimization(u, j, n)

                if self.thres_count[user] > 0:
                    # Apply the averaged threshold gradient, reset the
                    # accumulators and refresh the mean similarity of the
                    # neighbours still above the threshold.
                    self.threshold[user] -= self.lRate * self.thres_d[
                        user] / self.thres_count[user]
                    self.thres_d[user] = 0
                    self.thres_count[user] = 0
                    li = [
                        sim for sim in self.pSimilarity[user].values()
                        if sim >= self.threshold[user]
                    ]
                    if len(li) == 0:
                        self.avg_sim[user] = self.threshold[user]
                    else:
                        self.avg_sim[user] = sum(li) / (len(li) + 0.0)

                # Move P[u] toward the factors of true friends above the
                # threshold (two passes). This used to run after the user
                # loop, where the leaked loop variable ``user`` meant only
                # the last user was ever updated; it now runs per user.
                for _ in range(2):
                    for friend in self.trueTopKFriends[user]:
                        if self.pSimilarity[user][friend] > self.threshold[user]:
                            f = self.dao.user[friend]
                            self.P[u] -= self.alpha * self.lRate * (self.P[u] -
                                                                    self.P[f])

            self.loss += self.regU * (self.P * self.P).sum() + self.regI * (
                self.Q * self.Q).sum()
            iteration += 1
            if self.isConverged(iteration):
                break

    def _pruneUnknownUsers(self, relation):
        """Delete, in place, users and link targets absent from ``self.dao.user``.

        ``relation`` maps a user to a container of linked users that supports
        ``del container[other]``.
        """
        known = self.dao.user
        staleUsers = []
        stalePairs = []
        for user in relation:
            if user not in known:
                staleUsers.append(user)
            for other in relation[user]:
                if other not in known:
                    stalePairs.append((user, other))
        # Deletions are deferred so the mapping is not mutated while iterated.
        for user in staleUsers:
            del relation[user]
        for user, other in stalePairs:
            if user in relation:
                del relation[user][other]

    def _buildWeightedNet(self, relation):
        """Return ``{user: [neighbour, ...]}`` with neighbours repeated by weight.

        A neighbour that is itself a key of ``relation`` is listed
        ``shared + 1`` times, where ``shared`` is the number of link targets
        both users have in common, so ``choice`` samples it proportionally.
        """
        net = defaultdict(list)
        for user1 in relation:
            s1 = set(relation[user1])
            for user2 in relation[user1]:
                if user2 in relation and user1 != user2:
                    s2 = set(relation[user2])
                    weight = len(s1.intersection(s2))
                    net[user1] += [user2] * (weight + 1)
        return net

    def _chooseKnownUser(self, neighbours):
        """Sample from ``neighbours`` until the pick is a key of ``self.dao.user``.

        Raises IndexError (from ``choice``) when ``neighbours`` is empty.
        """
        node = choice(neighbours)
        while node not in self.dao.user:
            node = choice(neighbours)
        return node

    def _nextWalkNode(self, tp, lastType, lastNode, userItems, itemUsers):
        """Pick the node that follows ``lastNode`` for a step of type ``tp``.

        ``userItems`` maps a user to candidate items and ``itemUsers`` maps an
        item to candidate users. KeyError / IndexError propagate to the caller
        when no candidate exists.
        """
        if tp == 'I':
            return choice(userItems[lastNode])
        if tp == 'U':
            if lastType == 'I':
                return choice(itemUsers[lastNode])
            if lastType == 'F':
                return self._chooseKnownUser(self.UFNet[lastNode])
            if lastType == 'T':
                return self._chooseKnownUser(self.UTNet[lastNode])
            return lastNode
        if tp == 'F':
            return self._chooseKnownUser(self.UFNet[lastNode])
        if tp == 'T':
            # NOTE(review): the 'T' step samples UFNet, not UTNet, exactly as
            # the original code did. Confirm whether this is intentional.
            return self._chooseKnownUser(self.UFNet[lastNode])
        return lastNode

    def _generateWalks(self, mPaths, walkCounts, userItems, itemUsers):
        """Return the list of meta-path walks started from every known user.

        For each user and each meta-path in ``mPaths``, ``walkCounts[path]``
        walks are attempted. Every walk starts with ``'U' + user`` and each
        further entry is the step's type letter followed by the chosen node.
        ``self.walkCount`` is updated as a side effect, as before.
        """
        walks = []
        for user in self.dao.user:
            for mp in mPaths:
                self.walkCount = walkCounts[mp]
                hops = mp[1:]
                for _ in range(self.walkCount):
                    path = ['U' + user]
                    lastNode = user
                    lastType = 'U'
                    for _ in range(self.walkLength // len(hops)):
                        for tp in hops:
                            try:
                                nextNode = self._nextWalkNode(
                                    tp, lastType, lastNode, userItems,
                                    itemUsers)
                            except (KeyError, IndexError):
                                # NOTE(review): a dead end clears the path and
                                # aborts only this pass over the meta-path;
                                # later passes can append to the emptied path.
                                # Behaviour preserved from the original.
                                path = []
                                break
                            path.append(tp + nextNode)
                            lastNode = nextNode
                            lastType = tp
                    if path:
                        walks.append(path)
        return walks

    def _topKNeighbours(self, users, embedding):
        """Yield ``(user, [(other, similarity), ...])`` for every user.

        Similarities come from ``cosine`` on rows of ``embedding`` (indexed by
        ``self.dao.user[name]``); the list is sorted by similarity, highest
        first, and cut to ``self.topK`` entries. Progress is printed every
        200 users.
        """
        total = len(users)
        for count, user1 in enumerate(users, 1):
            if count % 200 == 0:
                print('%d / %d' % (count, total))
            vec1 = embedding[self.dao.user[user1]]
            sims = [(user2, cosine(vec1, embedding[self.dao.user[user2]]))
                    for user2 in users if user1 != user2]
            yield user1, sorted(sims, key=lambda d: d[1],
                                reverse=True)[:self.topK]

    def _collectCandidateItems(self):
        """Rebuild ``IPositiveSet`` / ``OKSet`` and extend ``NegSets``.

        * ``IPositiveSet[user][item] = friend``: items in
          ``self.positive[friend]`` for true friends at or above the user's
          threshold, not already positive or negative for the user.
        * ``OKSet[user][item] = friend``: same rule for the remaining
          positive-space neighbours, additionally excluding ``IPositiveSet``.
        * ``NegSets[user][item]``: occurrence count of items in
          ``self.negative[friend]`` for negative-space neighbours, excluding
          the three sets above.
        """
        self.IPositiveSet = defaultdict(dict)
        self.OKSet = defaultdict(dict)
        for user in self.dao.user:
            if user in self.trueTopKFriends:
                for friend in self.trueTopKFriends[user][:self.topK]:
                    if friend in self.dao.user and self.pSimilarity[
                            user][friend] >= self.threshold[user]:
                        for item in self.positive[friend]:
                            if (item not in self.PositiveSet[user]
                                    and item not in self.NegSets[user]):
                                self.IPositiveSet[user][item] = friend

            if user in self.pTopKSim:
                for friend in self.pTopKSim[user][:self.topK]:
                    if friend in self.dao.user and self.pSimilarity[
                            user][friend] >= self.threshold[user]:
                        for item in self.positive[friend]:
                            if (item not in self.PositiveSet[user]
                                    and item not in self.IPositiveSet[user]
                                    and item not in self.NegSets[user]):
                                self.OKSet[user][item] = friend

            if user in self.nTopKSim:
                for friend in self.nTopKSim[user][:self.topK]:
                    if friend not in self.dao.user:
                        continue
                    for item in self.negative[friend]:
                        if item not in self.dao.item:
                            continue
                        # OKSet is keyed by user; the previous code tested
                        # ``self.OKSet.has_key(item)``, which never matched.
                        if (item not in self.PositiveSet[user]
                                and item not in self.IPositiveSet[user]
                                and item not in self.OKSet[user]):
                            self.NegSets[user][item] = \
                                self.NegSets[user].get(item, 0) + 1

    def _sampleUnobserved(self, user, itemList):
        """Return the inner id of a random item outside the user's candidate sets.

        Items in ``PositiveSet``, ``IPositiveSet`` or ``OKSet`` of ``user``
        are rejected and a new one is drawn from ``itemList``.
        """
        item_j = choice(itemList)
        while (item_j in self.PositiveSet[user]
               or item_j in self.IPositiveSet[user]
               or item_j in self.OKSet[user]):
            item_j = choice(itemList)
        return self.dao.item[item_j]
Exemple #12
0
    def buildModel(self):
        print('Kind Note: This method will probably take much time.')
        #build C-U-NET
        print('Building collaborative user network...')

        userListen = defaultdict(dict)
        for user in self.data.userRecord:
            for item in self.data.userRecord[user]:
                userListen[user][item[self.recType]] = 1
        self.CUNet = defaultdict(list)

        for user1 in userListen:
            s1 = set(userListen[user1].keys())
            for user2 in userListen:
                if user1 != user2:
                    s2 = set(userListen[user2].keys())
                    weight = len(s1.intersection(s2))
                    if weight > 0:
                        self.CUNet[user1] += [user2] * weight

        print('Generating random deep walks...')
        self.walks = []
        self.visited = defaultdict(dict)
        for user in self.CUNet:
            for t in range(self.walkCount):
                path = [user]
                lastNode = user
                for i in range(1, self.walkLength):
                    nextNode = choice(self.CUNet[lastNode])
                    count = 0
                    while (nextNode in self.visited[lastNode]):
                        nextNode = choice(self.CUNet[lastNode])
                        #break infinite loop
                        count += 1
                        if count == 10:
                            break
                    path.append(nextNode)
                    self.visited[user][nextNode] = 1
                    lastNode = nextNode
                self.walks.append(path)
        shuffle(self.walks)

        #Training get top-k friends
        print('Generating user embedding...')
        model = w2v.Word2Vec(self.walks,
                             size=self.walkDim,
                             window=self.winSize,
                             min_count=0,
                             iter=self.epoch)
        print('User embedding generated.')

        print('Constructing similarity matrix...')
        self.W = np.random.rand(self.data.getSize('user'),
                                self.k) / 10  # global user preference
        self.topKSim = {}
        i = 0
        for user in self.CUNet:
            u = self.data.getId(user, 'user')
            self.W[u] = model.wv[user]
        for user1 in self.CUNet:
            sims = []
            u1 = self.data.getId(user1, 'user')
            for user2 in self.CUNet:
                if user1 != user2:
                    u2 = self.data.getId(user2, 'user')
                    sims.append((user2, cosine(self.W[u1], self.W[u2])))
            self.topKSim[user1] = sorted(sims,
                                         key=lambda d: d[1],
                                         reverse=True)[:self.topK]
            i += 1
            if i % 200 == 0:
                print('progress:', i, '/', len(self.CUNet))
        print('Similarity matrix finished.')
        #print self.topKSim

        #prepare Pu set, IPu set, and Nu set
        print('Preparing item sets...')
        self.PositiveSet = defaultdict(list)
        self.IPositiveSet = defaultdict(list)
        #self.NegativeSet = defaultdict(list)
        for user in self.data.userRecord:
            for event in self.data.userRecord[user]:
                self.PositiveSet[user].append(event[self.recType])

        for user in self.CUNet:
            for friend in self.topKSim[user]:
                self.IPositiveSet[user] += list(
                    set(self.PositiveSet[friend[0]]).difference(
                        self.PositiveSet[user]))

        print('Training...')
        iteration = 0
        while iteration < self.maxIter:
            self.loss = 0
            itemList = list(self.data.name2id[self.recType].keys())
            for user in self.PositiveSet:
                u = self.data.getId(user, 'user')

                for item in self.PositiveSet[user]:
                    i = self.data.getId(item, self.recType)
                    for n in range(3):
                        if len(self.IPositiveSet[user]) > 0:
                            item_k = choice(self.IPositiveSet[user])

                            k = self.data.getId(item_k, self.recType)
                            self.P[u] += self.lRate * (
                                1 - sigmoid(self.P[u].dot(self.Q[i]) -
                                            self.P[u].dot(self.Q[k]))) * (
                                                self.Q[i] - self.Q[k])
                            self.Q[i] += self.lRate * (1 - sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[k]))) * \
                                        self.P[u]
                            self.Q[k] -= self.lRate * (1 - sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[k]))) * \
                                        self.P[u]

                            item_j = ''
                            # if len(self.NegativeSet[user])>0:
                            #     item_j = choice(self.NegativeSet[user])
                            # else:
                            item_j = choice(itemList)
                            while (user
                                   in self.data.listened[self.recType][item_j]
                                   ):
                                item_j = choice(itemList)
                            j = self.data.getId(item_j, self.recType)
                            self.P[u] += (1 / self.s) * self.lRate * (
                                1 - sigmoid(
                                    (1 / self.s) *
                                    (self.P[u].dot(self.Q[k]) - self.P[u].dot(
                                        self.Q[j])))) * (self.Q[k] - self.Q[j])
                            self.Q[k] += (1 / self.s) * self.lRate * (
                                1 - sigmoid(
                                    (1 / self.s) *
                                    (self.P[u].dot(self.Q[k]) -
                                     self.P[u].dot(self.Q[j])))) * self.P[u]
                            self.Q[j] -= (1 / self.s) * self.lRate * (
                                1 - sigmoid(
                                    (1 / self.s) *
                                    (self.P[u].dot(self.Q[k]) -
                                     self.P[u].dot(self.Q[j])))) * self.P[u]

                            self.P[u] -= self.lRate * self.regU * self.P[u]
                            self.Q[i] -= self.lRate * self.regI * self.Q[i]
                            self.Q[j] -= self.lRate * self.regI * self.Q[j]
                            self.Q[k] -= self.lRate * self.regI * self.Q[k]

                            self.loss += -log(sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[k]))) - \
                                        log(sigmoid((1 / self.s) * (self.P[u].dot(self.Q[k]) - self.P[u].dot(self.Q[j]))))
                        else:
                            item_j = choice(itemList)
                            while (user
                                   in self.data.listened[self.recType][item_j]
                                   ):
                                item_j = choice(itemList)
                            j = self.data.getId(item_j, self.recType)
                            self.P[u] += self.lRate * (
                                1 - sigmoid(self.P[u].dot(self.Q[i]) -
                                            self.P[u].dot(self.Q[j]))) * (
                                                self.Q[i] - self.Q[j])
                            self.Q[i] += self.lRate * (
                                1 -
                                sigmoid(self.P[u].dot(self.Q[i]) -
                                        self.P[u].dot(self.Q[j]))) * self.P[u]
                            self.Q[j] -= self.lRate * (
                                1 -
                                sigmoid(self.P[u].dot(self.Q[i]) -
                                        self.P[u].dot(self.Q[j]))) * self.P[u]

                            self.loss += -log(
                                sigmoid(self.P[u].dot(self.Q[i]) -
                                        self.P[u].dot(self.Q[j])))

                self.loss += self.regU * (self.P * self.P).sum(
                ) + self.regI * (self.Q * self.Q).sum()
            iteration += 1
            if self.isConverged(iteration):
                break
Exemple #13
0
    def buildNetwork(self):
        """Build the collaborative track network and per-track neighbour lists.

        Steps performed by the code below, in order:

        1. Collect every track with at least one record in
           ``self.data.trackRecord`` and keep the records whose
           ``self.userListen[uid][tid]`` value is ``>= 0``; those records
           are also accumulated in ``self.trainingData``.
        2. Link every pair of distinct tracks that share at least one
           listener (``self.CTNet``); a neighbour is repeated once per
           shared listener.
        3. Generate 10 walks of 10 nodes from every track in the network
           (``self.T_walks``) and shuffle them.
        4. Train a Word2Vec model on the walks.
        5. For every track keep the 20 other tracks with the highest
           ``cosine`` value between embeddings (``self.nSim``, keyed by
           track id) and pickle that dict to ``'nsim.txt'``.

        Steps 2 and 5 both loop over all ordered pairs of tracks, so the
        running time grows quadratically with the number of tracks.
        """
        self.trainingData = []
        print('Kind Note: This method will take much time')
        # build C-T-NET
        print('Building collaborative track network')
        self.trackNet = {}
        # track -> list of users whose listen record passed the filter below
        self.filteredListen = defaultdict(list)

        # Keep only tracks that have at least one listen record.
        for track in self.data.trackRecord:
            if len(self.data.trackRecord[track]) > 0:
                self.trackNet[track] = self.data.trackRecord[track]
        for track in self.trackNet:
            tid = self.data.getId(track, 'track')
            for item in self.trackNet[track]:
                uid = self.data.getId(item['user'], 'user')
                # NOTE(review): self.userListen is built elsewhere; presumably
                # a per-user/per-track listen statistic -- confirm what a
                # negative value means before changing this threshold.
                if self.userListen[uid][tid] >= 0:
                    self.filteredListen[track].append(item['user'])
                    self.trainingData.append(item)

        # track -> list of neighbouring tracks (with repetitions, see below)
        self.CTNet = defaultdict(list)
        i = 0
        for track1 in self.filteredListen:
            i += 1
            if i % 200 == 0:
                print(i, '/', len(self.filteredListen))
            s1 = set(self.filteredListen[track1])
            for track2 in self.filteredListen:
                if track1 != track2:
                    s2 = set(self.filteredListen[track2])
                    # weight = number of distinct users in both lists
                    weight = len(s1.intersection(s2))
                    if weight > 0:
                        # Repeating track2 `weight` times means a draw with
                        # choice() below picks it more often the more
                        # listeners the two tracks share.
                        self.CTNet[track1] += [track2] * weight
        ########################    end of track C-T-Net construction    ############################

        print('Genrerating random deep walks...')
        self.T_walks = []
        self.T_visited = defaultdict(dict)
        for track in self.CTNet:
            # 10 walks per start track, each with the start node plus 9 steps.
            for t in range(10):
                path = [track]
                lastNode = track
                # This reuses the name `i`; it is reset to 0 before the
                # similarity loop further down.
                for i in range(1, 10):
                    nextNode = choice(self.CTNet[lastNode])
                    count = 0
                    #while(nextNode in self.T_visited[lastNode] or nextNode not in self.aSim[lastNode]):
                    # Redraw while the candidate is marked in
                    # T_visited[lastNode]; give up after 10 redraws and
                    # accept whatever was drawn last.
                    while (nextNode in self.T_visited[lastNode]):
                        nextNode = choice(self.CTNet[lastNode])
                        count += 1
                        if count == 10:
                            break
                    path.append(nextNode)
                    # NOTE(review): this marks T_visited[track][lastNode]
                    # (keyed by the walk's start track), whereas the check
                    # above reads T_visited[lastNode][nextNode] -- confirm
                    # the key mismatch is intended.
                    self.T_visited[track][lastNode] = 1
                    lastNode = nextNode
                self.T_walks.append(path)
        shuffle(self.T_walks)
        ##del self.aSim

        print('Generating track embedding')
        # min_count=0 keeps every track that appears in a walk.
        # NOTE(review): w2v is presumably gensim's word2vec module; `size`
        # and `iter` are the keyword names used by gensim releases before
        # 4.0 -- confirm the installed version.
        model = w2v.Word2Vec(self.T_walks,
                             size=self.k,
                             window=5,
                             min_count=0,
                             iter=3)
        print('Track embedding generated')

        # One row of self.k random values per track id.
        self.T = np.random.rand(self.data.getSize('track'), self.k)

        print('Constructing similarity matrix...')
        i = 0
        # track id -> up to 20 (other track id, sim) pairs, highest sim first
        self.nSim = {}
        for track1 in self.CTNet:
            tSim = []
            i += 1
            if i % 1000 == 0:
                print(i, '/', len(self.CTNet))
            vec1 = model.wv[track1]
            tid1 = self.data.getId(track1, 'track')
            for track2 in self.CTNet:
                if track1 != track2:
                    tid2 = self.data.getId(track2, 'track')
                    vec2 = model.wv[track2]
                    # Floor the value at 1e-6 so stored values are always
                    # positive.
                    # NOTE(review): `cosine` is defined/imported elsewhere;
                    # assumed to return a similarity (higher = closer), not
                    # a distance -- confirm.
                    sim = max(1e-6, cosine(vec1, vec2))
                    tSim.append((tid2, sim))
                    #self.nSim[t1][t2] = sim
            self.nSim[tid1] = sorted(tSim, key=lambda d: d[1],
                                     reverse=True)[:20]

        # Persist the neighbour lists; the file holds binary pickle data
        # despite the .txt name.
        # NOTE(review): df1 is not closed in the lines shown here -- confirm
        # it is closed afterwards or switch to a `with` block.
        file1 = 'nsim.txt'
        df1 = open(file1, 'wb')
        #df1 = open(file1, 'rb')
        pickle.dump(self.nSim, df1)