def computeMetaSimilarity(self):
    """Embed items via word2vec over meta-path walks and build the
    item top-k similarity structures.

    Side effects (attributes written):
      - self.W[mid]       : embedding vector per item (items absent from
                            the walk corpus keep their previous vector)
      - self.pSimilarity  : item -> {neighbour item -> cosine similarity}
      - self.pTopKSim     : item -> ordered top-k neighbour ids
      - self.threshold    : item -> similarity of the middle neighbour
      - self.avg_sim      : item -> mean similarity of the top half
      - self.positive / self.negative : per-user item pools extended with
                            neighbours of items the user interacted with
    """
    top_k = 10          # number of neighbours kept per item
    half = top_k // 2   # integer index; the original `10 / 2` is a float on Py3
    print('Generating meta embedding...')
    self.pTopKSim = {}
    self.pSimilarity = defaultdict(dict)
    pos_model = w2v.Word2Vec(self.pWalks, size=self.walkDim, window=5,
                             min_count=0, iter=10)
    for item in self.meta.item:
        mid = self.meta.item[item]
        try:
            # walk tokens for items are prefixed with 'I'
            self.W[mid] = pos_model.wv['I' + item]
        except KeyError:
            # item never appeared in a walk -> keep its existing vector
            continue
    print('meta embedding generated.')
    print('Constructing item similarity matrix...')
    progress = 0
    for item1 in self.meta.item:
        mSim = []
        progress += 1
        if progress % 200 == 0:
            print(progress, '/', len(self.meta.item))
        vec1 = self.W[self.meta.item[item1]]
        for item2 in self.meta.item:
            if item1 != item2:
                vec2 = self.W[self.meta.item[item2]]
                mSim.append((item2, cosine(vec1, vec2)))
        fList = sorted(mSim, key=lambda d: d[1], reverse=True)[:top_k]
        # similarity of the middle-ranked neighbour acts as the threshold
        self.threshold[item1] = fList[half][1]
        for neighbour, sim in fList:
            self.pSimilarity[item1][neighbour] = sim
        self.pTopKSim[item1] = [pair[0] for pair in fList]
        self.avg_sim[item1] = sum(pair[1] for pair in fList[:half]) / half
        # propagate item1's neighbours into the positive/negative pools of
        # every user who interacted with item1
        if item1 in self.pItems:
            for u in self.pItems[item1]:
                for neighbour in self.pTopKSim[item1]:
                    if neighbour not in self.positive[u]:
                        self.positive[u].append(neighbour)
        if item1 in self.nItems:
            for u in self.nItems[item1]:
                for neighbour in self.pTopKSim[item1]:
                    if neighbour not in self.negative[u]:
                        self.negative[u].append(neighbour)
def buildModel(self):
    """CUNE-MF: collaborative user network embedding + social-regularized MF.

    Pipeline: build the C-U-Net from co-rated items -> truncated random
    walks -> word2vec user embedding -> per-user top-k similar users ->
    matrix factorization on ratings with a regularizer pulling each user's
    latent vector towards those of their embedded neighbours.
    """
    print('Kind Note: This method will probably take much time.')
    # build C-U-NET
    print('Building collaborative user network...')
    # filter isolated nodes: only items rated by at least two users
    self.itemNet = {}
    for item in self.data.trainSet_i:
        if len(self.data.trainSet_i[item]) > 1:
            self.itemNet[item] = self.data.trainSet_i[item]
    self.filteredRatings = defaultdict(list)
    for item in self.itemNet:
        for user in self.itemNet[item]:
            if self.itemNet[item][user] >= 1:
                self.filteredRatings[user].append(item)
    # edge weight = number of co-rated items; the neighbour is repeated
    # `weight` times so a uniform random choice is weight-proportional
    self.CUNet = defaultdict(list)
    for user1 in self.filteredRatings:
        s1 = set(self.filteredRatings[user1])
        for user2 in self.filteredRatings:
            if user1 != user2:
                weight = len(s1.intersection(self.filteredRatings[user2]))
                if weight > 0:
                    self.CUNet[user1] += [user2] * weight
    print('Generating random deep walks...')
    self.walks = []
    self.visited = defaultdict(dict)
    for user in self.CUNet:
        for t in range(self.walkCount):
            path = [user]
            lastNode = user
            for i in range(1, self.walkLength):
                nextNode = choice(self.CUNet[lastNode])
                count = 0
                while nextNode in self.visited[lastNode]:
                    nextNode = choice(self.CUNet[lastNode])
                    # break infinite loop
                    count += 1
                    if count == 10:
                        break
                path.append(nextNode)
                # NOTE(review): marks the visit under the walk's START user
                # while the re-draw check above keys on lastNode -- looks
                # inconsistent but is kept as-is; confirm intended behaviour
                self.visited[user][nextNode] = 1
                lastNode = nextNode
            self.walks.append(path)
    shuffle(self.walks)
    # train word2vec over the walks to get user embeddings
    print('Generating user embedding...')
    model = w2v.Word2Vec(self.walks, size=self.walkDim, window=5,
                         min_count=0, iter=3)
    print('User embedding generated.')
    print('Constructing similarity matrix...')
    self.W = np.random.rand(self.data.trainingSize()[0], self.walkDim) / 10
    self.topKSim = {}
    progress = 0
    for user1 in self.CUNet:
        sims = []
        u1 = self.data.user[user1]
        self.W[u1] = model.wv[user1]
        for user2 in self.CUNet:
            if user1 != user2:
                u2 = self.data.user[user2]
                self.W[u2] = model.wv[user2]
                sims.append((user2, cosine(self.W[u1], self.W[u2])))
        self.topKSim[user1] = sorted(sims, key=lambda d: d[1],
                                     reverse=True)[:self.topK]
        progress += 1
        if progress % 200 == 0:
            # BUGFIX: the original print(('progress:', ...)) printed a tuple
            print('progress:', progress, '/', len(self.CUNet))
    print('Similarity matrix finished.')
    # matrix decomposition
    print('Decomposing...')
    iteration = 0
    while iteration < self.maxIter:
        self.loss = 0
        for entry in self.data.trainingData:
            user, item, rating = entry
            u = self.data.user[user]   # get user id
            i = self.data.item[item]   # get item id
            error = rating - self.P[u].dot(self.Q[i])
            self.loss += error ** 2
            p = self.P[u]
            q = self.Q[i]
            # update latent vectors
            self.P[u] += self.lRate * (error * q - self.regU * p)
            self.Q[i] += self.lRate * (error * p - self.regI * q)
        # social regularizer: pull each user towards their top-k neighbours
        for user in self.CUNet:
            u = self.data.user[user]
            for friend in self.topKSim[user]:
                uf = self.data.user[friend[0]]
                self.P[u] -= self.lRate * (self.P[u] - self.P[uf]) * self.alpha
                self.loss += self.alpha * (
                    self.P[u] - self.P[uf]).dot(self.P[u] - self.P[uf])
        self.loss += self.regU * (self.P * self.P).sum() + self.regI * (
            self.Q * self.Q).sum()
        iteration += 1
        if self.isConverged(iteration):
            break
def buildModel(self):
    """CUNE-BPR: collaborative user network embedding + three-level BPR.

    1. Build the collaborative user network (C-U-Net) from co-rated items.
    2. Embed users with word2vec over truncated random walks.
    3. Keep each user's top-k most similar users.
    4. Optimize a BPR objective where friends' unseen items (IPu) rank
       between the user's own items (Pu) and sampled negatives.
    """
    print('Kind Note: This method will probably take much time.')
    # build C-U-NET; filter isolated nodes
    print('Building collaborative user network...')
    self.itemNet = {}
    for item in self.dao.trainSet_i:
        if len(self.dao.trainSet_i[item]) > 1:
            self.itemNet[item] = self.dao.trainSet_i[item]
    self.filteredRatings = defaultdict(list)
    for item in self.itemNet:
        for user in self.itemNet[item]:
            if self.itemNet[item][user] >= 1:
                self.filteredRatings[user].append(item)
    # edge weight = #co-rated items, encoded by repeating the neighbour
    self.CUNet = defaultdict(list)
    for user1 in self.filteredRatings:
        s1 = set(self.filteredRatings[user1])
        for user2 in self.filteredRatings:
            if user1 != user2:
                weight = len(s1.intersection(self.filteredRatings[user2]))
                if weight > 0:
                    self.CUNet[user1] += [user2] * weight
    print('Generating random deep walks...')
    self.walks = []
    self.visited = defaultdict(dict)
    for user in self.CUNet:
        for t in range(self.walkCount):
            path = [user]
            lastNode = user
            for i in range(1, self.walkLength):
                nextNode = choice(self.CUNet[lastNode])
                count = 0
                while nextNode in self.visited[lastNode]:
                    nextNode = choice(self.CUNet[lastNode])
                    # break infinite loop
                    count += 1
                    if count == 10:
                        break
                path.append(nextNode)
                # NOTE(review): visit recorded under the walk's start user
                # while the check keys on lastNode -- kept as in original
                self.visited[user][nextNode] = 1
                lastNode = nextNode
            self.walks.append(path)
    shuffle(self.walks)
    print('Generating user embedding...')
    model = w2v.Word2Vec(self.walks, size=self.walkDim, window=5,
                         min_count=0, iter=3)
    print('User embedding generated.')
    print('Constructing similarity matrix...')
    self.W = np.random.rand(self.dao.trainingSize()[0], self.walkDim) / 10
    self.topKSim = {}
    progress = 0
    for user1 in self.CUNet:
        sims = []
        u1 = self.dao.user[user1]
        self.W[u1] = model.wv[user1]
        for user2 in self.CUNet:
            if user1 != user2:
                u2 = self.dao.user[user2]
                self.W[u2] = model.wv[user2]
                sims.append((user2, cosine(self.W[u1], self.W[u2])))
        self.topKSim[user1] = sorted(sims, key=lambda d: d[1],
                                     reverse=True)[:self.topK]
        progress += 1
        if progress % 200 == 0:
            print('progress:', progress, '/', len(self.CUNet))
    print('Similarity matrix finished.')
    # prepare Pu set (own items) and IPu set (friends' unseen items)
    print('Preparing item sets...')
    self.PositiveSet = defaultdict(dict)
    self.IPositiveSet = defaultdict(dict)
    for user in self.topKSim:
        for item in self.dao.trainSet_u[user]:
            self.PositiveSet[user][item] = 1
        for friend in self.topKSim[user]:
            for item in self.dao.trainSet_u[friend[0]]:
                if item not in self.PositiveSet[user]:
                    self.IPositiveSet[user][item] = 1
    print('Training...')
    iteration = 0
    itemList = list(self.dao.item.keys())
    while iteration < self.maxIter:
        self.loss = 0
        for user in self.PositiveSet:
            u = self.dao.user[user]
            kItems = list(self.IPositiveSet[user].keys())
            for item in self.PositiveSet[user]:
                i = self.dao.item[item]
                for n in range(3):  # negative sampling, 3 times
                    if len(self.IPositiveSet[user]) > 0:
                        item_k = choice(kItems)
                        k = self.dao.item[item_k]
                        # level 1: own item i should outrank friend item k.
                        # The sigmoid is deliberately re-evaluated after each
                        # update, preserving the original SGD ordering.
                        self.P[u] += self.lRate * (
                            1 - sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[k]))) * (
                            self.Q[i] - self.Q[k])
                        self.Q[i] += self.lRate * (
                            1 - sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[k]))) * self.P[u]
                        self.Q[k] -= self.lRate * (
                            1 - sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[k]))) * self.P[u]
                        item_j = choice(itemList)
                        # BUGFIX: the original tested `self.IPositiveSet.has_key(item_j)`
                        # -- the OUTER dict is keyed by users, so the test never
                        # matched an item; the per-user set must be excluded
                        while (item_j in self.PositiveSet[user]
                               or item_j in self.IPositiveSet[user]):
                            item_j = choice(itemList)
                        j = self.dao.item[item_j]
                        # level 2: friend item k should outrank negative j,
                        # damped by 1/self.s
                        self.P[u] += (1 / self.s) * self.lRate * (
                            1 - sigmoid((1 / self.s) * (self.P[u].dot(self.Q[k]) - self.P[u].dot(
                                self.Q[j])))) * (self.Q[k] - self.Q[j])
                        self.Q[k] += (1 / self.s) * self.lRate * (
                            1 - sigmoid((1 / self.s) * (self.P[u].dot(self.Q[k]) -
                                                        self.P[u].dot(self.Q[j])))) * self.P[u]
                        self.Q[j] -= (1 / self.s) * self.lRate * (
                            1 - sigmoid((1 / self.s) * (self.P[u].dot(self.Q[k]) -
                                                        self.P[u].dot(self.Q[j])))) * self.P[u]
                        self.P[u] -= self.lRate * self.regU * self.P[u]
                        self.Q[i] -= self.lRate * self.regI * self.Q[i]
                        self.Q[j] -= self.lRate * self.regI * self.Q[j]
                        self.Q[k] -= self.lRate * self.regI * self.Q[k]
                        self.loss += -log(sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[k]))) - \
                            log(sigmoid((1 / self.s) * (self.P[u].dot(self.Q[k]) - self.P[u].dot(self.Q[j]))))
                    else:
                        # no friend items available: plain BPR pair (i, j)
                        item_j = choice(itemList)
                        while item_j in self.PositiveSet[user]:
                            item_j = choice(itemList)
                        j = self.dao.item[item_j]
                        self.P[u] += self.lRate * (
                            1 - sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[j]))) * (
                            self.Q[i] - self.Q[j])
                        self.Q[i] += self.lRate * (
                            1 - sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[j]))) * self.P[u]
                        self.Q[j] -= self.lRate * (
                            1 - sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[j]))) * self.P[u]
                        self.loss += -log(
                            sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[j])))
        self.loss += self.regU * (self.P * self.P).sum(
        ) + self.regI * (self.Q * self.Q).sum()
        iteration += 1
        if self.isConverged(iteration):
            break
def computeSimilarity(self):
    """Embed users from the positive/negative walk corpora and derive the
    top-k friend lists plus the seeded friends' item pool.

    Writes: self.W / self.G (embeddings), self.pSimilarity / self.nSimilarity
    (pairwise cosines), self.pTopKSim / self.nTopKSim (ranked neighbour ids),
    self.seededFriends and self.firend_item_set.
    """
    print('Generating user embedding...')
    self.pTopKSim = {}
    self.nTopKSim = {}
    self.pSimilarity = defaultdict(dict)
    self.nSimilarity = defaultdict(dict)
    # TODO(review): vector size is hard-coded to 50 here while the sibling
    # implementation uses self.walkDim -- confirm self.W/self.G column
    # counts agree before changing
    pos_model = w2v.Word2Vec(self.pWalks, size=50, window=5, min_count=0, iter=10)
    neg_model = w2v.Word2Vec(self.nWalks, size=50, window=5, min_count=0, iter=10)
    for user in self.positive:
        uid = self.data.user[user]
        try:
            # walk tokens for users are prefixed with 'U'
            self.W[uid] = pos_model.wv['U' + user]
        except KeyError:
            continue
    for user in self.negative:
        uid = self.data.user[user]
        try:
            self.G[uid] = neg_model.wv['U' + user]
        except KeyError:
            continue
    print('User embedding generated.')
    print('Constructing similarity matrix...')
    progress = 0
    for user1 in self.positive:
        uSim = []
        progress += 1
        if progress % 200 == 0:
            print(progress, '/', len(self.positive))
        vec1 = self.W[self.data.user[user1]]
        for user2 in self.positive:
            if user1 != user2:
                vec2 = self.W[self.data.user[user2]]
                uSim.append((user2, cosine(vec1, vec2)))
        fList = sorted(uSim, key=lambda d: d[1], reverse=True)[:self.topK]
        # BUGFIX: pSimilarity was initialized but never populated; record
        # pairwise similarities exactly like the negative view below
        for pair in fList:
            self.pSimilarity[user1][pair[0]] = pair[1]
        self.pTopKSim[user1] = [pair[0] for pair in fList]
    progress = 0
    for user1 in self.negative:
        uSim = []
        progress += 1
        if progress % 200 == 0:
            print(progress, '/', len(self.negative))
        vec1 = self.G[self.data.user[user1]]
        for user2 in self.negative:
            if user1 != user2:
                vec2 = self.G[self.data.user[user2]]
                uSim.append((user2, cosine(vec1, vec2)))
        fList = sorted(uSim, key=lambda d: d[1], reverse=True)[:self.topK]
        for pair in fList:
            self.nSimilarity[user1][pair[0]] = pair[1]
        self.nTopKSim[user1] = [pair[0] for pair in fList]
    # users ranked highly in BOTH views are trusted ("true") friends; the
    # seed list adds the top-50 positive-view neighbours on top of them
    self.seededFriends = defaultdict(list)
    self.firend_item_set = defaultdict(list)  # attribute name kept as-is (typo) for external readers
    for user in self.pTopKSim:
        trueFriends = list(
            set(self.pTopKSim[user]).intersection(set(self.nTopKSim[user])))
        self.seededFriends[user] = trueFriends + self.pTopKSim[user][:50]
    for user in self.pTopKSim:
        for friend in self.seededFriends[user]:
            self.firend_item_set[user] += self.data.trainSet_u[friend].keys()
def buildModel(self):
    """CUNE with a hand-rolled hierarchical-softmax embedding (BPR variant).

    Builds the collaborative user network, codes users on a Huffman tree,
    trains a skip-gram embedding by hand, ranks top-k similar users, then
    optimizes a three-level BPR objective where friends' items rank between
    a user's own items and negatives.
    """
    print('Kind Note: This method will probably take much time.')
    # build C-U-NET; filter isolated nodes
    print('Building collaborative user network...')
    self.itemNet = {}
    for item in self.dao.trainSet_i:
        if len(self.dao.trainSet_i[item]) > 1:
            self.itemNet[item] = self.dao.trainSet_i[item]
    self.filteredRatings = defaultdict(list)
    for item in self.itemNet:
        for user in self.itemNet[item]:
            if self.itemNet[item][user] >= 1:
                self.filteredRatings[user].append(item)
    # edge weight = #co-rated items; repeated neighbours encode the weight
    self.CUNet = defaultdict(list)
    for user1 in self.filteredRatings:
        s1 = set(self.filteredRatings[user1])
        for user2 in self.filteredRatings:
            if user1 != user2:
                weight = len(s1.intersection(self.filteredRatings[user2]))
                if weight > 0:
                    self.CUNet[user1] += [user2] * weight
    # build Huffman tree; node weight roughly estimated by network degree
    print('Building Huffman tree...')
    nodes = {}
    for user in self.CUNet:
        nodes[user] = len(self.CUNet[user])
    nodes = sorted(nodes.items(), key=lambda d: d[1])
    nodes = [HTreeNode(None, None, pair[1], pair[0]) for pair in nodes]
    nodeList = OrderedLinkList()
    for node in nodes:
        listNode = Node()
        listNode.val = node
        try:
            nodeList.insert(listNode)
        except AttributeError:
            pass
    self.HTree = HuffmanTree(vecLength=self.walkDim)
    self.HTree.buildTree(nodeList)
    print('Coding for all users...')
    self.HTree.coding(self.HTree.root, '', 0)
    print('Generating random deep walks...')
    self.walks = []
    self.visited = defaultdict(dict)
    for user in self.CUNet:
        for t in range(self.walkCount):
            path = [user]
            for i in range(1, self.walkLength):
                # BUGFIX: uniform choice replaces randint(0, len) - 1,
                # which double-weighted the last neighbour and could yield -1
                nextNode = choice(self.CUNet[user])
                count = 0
                while nextNode in self.visited[user]:
                    # NOTE(review): visited is never populated in this
                    # variant, so this re-draw loop is effectively dead; the
                    # original also dropped the [user] index here, which
                    # would have raised a KeyError if ever executed
                    nextNode = choice(self.CUNet[user])
                    # break infinite loop
                    count += 1
                    if count == 10:
                        break
                path.append(nextNode)
            self.walks.append(path)
    shuffle(self.walks)
    # hand-written skip-gram with hierarchical softmax over the Huffman tree
    print('Generating user embedding...')
    iteration = 1
    while iteration <= self.maxIter:
        loss = 0
        for walk in self.walks:
            # // keeps an integer index under Python 3 (was len(walk) / 2)
            centerUser = walk[len(walk) // 2]
            for user in walk:
                if user != centerUser:
                    code = self.HTree.code[user]
                    centerCode = self.HTree.code[centerUser]
                    x = self.HTree.vector[centerCode]
                    for i in range(1, len(code)):
                        prefix = code[0:i]
                        w = self.HTree.vector[prefix]
                        self.HTree.vector[prefix] += self.lRate * (
                            1 - sigmoid(w.dot(x))) * x
                        self.HTree.vector[centerCode] += self.lRate * (
                            1 - sigmoid(w.dot(x))) * w
                        loss += -log(sigmoid(w.dot(x)), 2)
        print('iteration:', iteration, 'loss:', loss)
        iteration += 1
    print('User embedding generated.')
    print('Constructing similarity matrix...')
    self.Sim = SymmetricMatrix(len(self.CUNet))
    for user1 in self.CUNet:
        vec1 = self.HTree.vector[self.HTree.code[user1]]
        for user2 in self.CUNet:
            if user1 != user2:
                if self.Sim.contains(user1, user2):
                    continue
                vec2 = self.HTree.vector[self.HTree.code[user2]]
                self.Sim.set(user1, user2, cosine(vec1, vec2))
    self.topKSim = {}
    for user in self.CUNet:
        self.topKSim[user] = sorted(self.Sim[user].items(),
                                    key=lambda d: d[1],
                                    reverse=True)[:self.topK]
    print('Similarity matrix finished.')
    # prepare Pu set, IPu set, and Nu set
    print('Preparing item sets...')
    self.PositiveSet = defaultdict(dict)
    self.IPositiveSet = defaultdict(list)
    self.NegativeSet = defaultdict(list)
    for user in self.topKSim:
        for item in self.dao.trainSet_u[user]:
            if self.dao.trainSet_u[user][item] >= 1:
                self.PositiveSet[user][item] = 1
            else:
                self.NegativeSet[user].append(item)
        for friend in self.topKSim[user]:
            for item in self.dao.trainSet_u[friend[0]]:
                if item not in self.PositiveSet[user]:
                    self.IPositiveSet[user].append(item)
    print('Training...')
    itemList = list(self.dao.item.keys())
    iteration = 0
    while iteration < self.maxIter:
        self.loss = 0
        for user in self.PositiveSet:
            u = self.dao.user[user]
            for item in self.PositiveSet[user]:
                if len(self.IPositiveSet[user]) > 0:
                    item_k = choice(self.IPositiveSet[user])
                    i = self.dao.item[item]
                    k = self.dao.item[item_k]
                    # own item i should outrank friend item k
                    self.P[u] += self.lRate * (
                        1 - sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[k]))) * (
                        self.Q[i] - self.Q[k])
                    self.Q[i] += self.lRate * (
                        1 - sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[k]))) * self.P[u]
                    self.Q[k] -= self.lRate * (
                        1 - sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[k]))) * self.P[u]
                    if len(self.NegativeSet[user]) > 0:
                        item_j = choice(self.NegativeSet[user])
                    else:
                        item_j = choice(itemList)
                    while item_j in self.PositiveSet[user]:
                        item_j = choice(itemList)
                    j = self.dao.item[item_j]
                    # friend item k should outrank negative j, damped by 1/self.s
                    self.P[u] += (1 / self.s) * self.lRate * (1 - sigmoid(
                        (1 / self.s) * (self.P[u].dot(self.Q[k]) - self.P[u].dot(
                            self.Q[j])))) * (self.Q[k] - self.Q[j])
                    self.Q[k] += (1 / self.s) * self.lRate * (1 - sigmoid(
                        (1 / self.s) * (self.P[u].dot(self.Q[k]) -
                                        self.P[u].dot(self.Q[j])))) * self.P[u]
                    self.Q[j] -= (1 / self.s) * self.lRate * (1 - sigmoid(
                        (1 / self.s) * (self.P[u].dot(self.Q[k]) -
                                        self.P[u].dot(self.Q[j])))) * self.P[u]
                    # BUGFIX: regularization must SHRINK parameters (-=);
                    # the original grew them with +=, diverging from the
                    # sibling implementation in this file
                    self.P[u] -= self.lRate * self.regU * self.P[u]
                    self.Q[i] -= self.lRate * self.regI * self.Q[i]
                    self.Q[j] -= self.lRate * self.regI * self.Q[j]
                    self.Q[k] -= self.lRate * self.regI * self.Q[k]
                    # BUGFIX: negative log-likelihood (the original added
                    # +log, making loss negative), and the second term must
                    # compare k against j -- matching the gradients above
                    self.loss += -log(sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[k]))) - \
                        log(sigmoid((1 / self.s) * (self.P[u].dot(self.Q[k]) - self.P[u].dot(self.Q[j]))))
        self.loss += self.regU * (self.P * self.P).sum() + self.regI * (
            self.Q * self.Q).sum()
        iteration += 1
        if self.isConverged(iteration):
            break
def computeSimilarity(self):
    """Derive user embeddings from the positive and negative walk corpora
    and build the top-k similarity tables used for friend selection.

    Writes W/G (embeddings), pSimilarity/nSimilarity, pTopKSim/nTopKSim,
    threshold, avg_sim, and trueTopKFriends (users similar in both views).
    """
    print('Generating user embedding...')
    self.pTopKSim = {}
    self.nTopKSim = {}
    self.pSimilarity = defaultdict(dict)
    self.nSimilarity = defaultdict(dict)
    pos_model = w2v.Word2Vec(self.pWalks, size=self.walkDim, window=5,
                             min_count=0, iter=10)
    neg_model = w2v.Word2Vec(self.nWalks, size=self.walkDim, window=5,
                             min_count=0, iter=10)
    # copy learned vectors into W (positive view) and G (negative view);
    # users absent from a corpus keep their previous vectors
    for model, store, pool in ((pos_model, self.W, self.positive),
                               (neg_model, self.G, self.negative)):
        for user in pool:
            uid = self.data.user[user]
            try:
                store[uid] = model.wv['U' + user]
            except KeyError:
                continue
    print('User embedding generated.')
    print('Constructing similarity matrix...')
    half = self.topK // 2
    for count, user1 in enumerate(self.positive, 1):
        if count % 200 == 0:
            print(count, '/', len(self.positive))
        anchor = self.W[self.data.user[user1]]
        scored = [(user2, cosine(anchor, self.W[self.data.user[user2]]))
                  for user2 in self.positive if user2 != user1]
        top = sorted(scored, key=lambda d: d[1], reverse=True)[:self.topK]
        # similarity of the middle-ranked neighbour is the acceptance bar
        self.threshold[user1] = top[half][1]
        for other, sim in top:
            self.pSimilarity[user1][other] = sim
        self.pTopKSim[user1] = [other for other, _ in top]
        self.avg_sim[user1] = sum(
            sim for _, sim in top[:half]) / (self.topK / 2)
    for count, user1 in enumerate(self.negative, 1):
        if count % 200 == 0:
            print(count, '/', len(self.negative))
        anchor = self.G[self.data.user[user1]]
        scored = [(user2, cosine(anchor, self.G[self.data.user[user2]]))
                  for user2 in self.negative if user2 != user1]
        top = sorted(scored, key=lambda d: d[1], reverse=True)[:self.topK]
        for other, sim in top:
            self.nSimilarity[user1][other] = sim
        self.nTopKSim[user1] = [other for other, _ in top]
    # users ranked highly under BOTH views are treated as "true" friends;
    # they are removed from the plain positive top-k list
    self.trueTopKFriends = defaultdict(list)
    for user in self.pTopKSim:
        shared = list(set(self.pTopKSim[user]) & set(self.nTopKSim[user]))
        self.trueTopKFriends[user] = shared
        self.pTopKSim[user] = list(set(self.pTopKSim[user]) - set(shared))
def buildModel(self):
    """Meta-path based music recommender training.

    1. Build adjacency lists between users (U), tracks (T), artists (A)
       and optionally albums (Z) from the listening records; repeated
       entries encode listening counts so uniform sampling is
       frequency-weighted.
    2. Generate meta-path constrained random walks and embed users with
       word2vec.
    3. Rank the top-k most similar users per user.
    4. Train a BPR-style ranker where items listened to by similar users
       form an intermediate preference level, plus a social regularizer.
    """
    # build U-F-NET
    print('Building weighted user-friend network...')
    self.W = np.random.rand(self.data.getSize('user'), self.walkDim) / 10
    self.user2track = defaultdict(list)
    self.user2artist = defaultdict(list)
    self.user2album = defaultdict(list)
    self.track2user = defaultdict(list)
    self.artist2user = defaultdict(list)
    self.album2user = defaultdict(list)
    self.artist2track = defaultdict(list)
    self.artist2album = defaultdict(list)
    self.album2track = defaultdict(list)
    self.album2artist = {}
    self.track2artst = {}  # attribute name kept as-is (typo) for external readers
    self.track2album = {}
    has_album = 'album' in self.data.columns
    for user in self.data.userRecord:
        for item in self.data.userRecord[user]:
            self.user2track[user].append(item[self.recType])
            self.user2artist[user].append(item['artist'])
            if has_album:
                self.user2album[user].append(item['album'])
    for artist in self.data.listened['artist']:
        for user in self.data.listened['artist'][artist]:
            self.artist2user[artist] += [
                user
            ] * self.data.listened['artist'][artist][user]
    for track in self.data.listened['track']:
        for user in self.data.listened['track'][track]:
            self.track2user[track] += [
                user
            ] * self.data.listened['track'][track][user]
    if has_album:
        for album in self.data.listened['album']:
            for user in self.data.listened['album'][album]:
                self.album2user[album] += [
                    user
                ] * self.data.listened['album'][album][user]
    for artist in self.data.artist2Track:
        # list(...) so random.choice works on Python 3 (dict views are
        # not indexable)
        self.artist2track[artist] = list(self.data.artist2Track[artist].keys())
        for key in self.data.artist2Track[artist]:
            self.track2artst[key] = artist
    if has_album:
        for album in self.data.album2Track:
            self.album2track[album] = list(self.data.album2Track[album].keys())
            for key in self.data.album2Track[album]:
                self.track2album[key] = album
        for artist in self.data.artist2Album:
            self.artist2album[artist] = list(self.data.artist2Album[artist].keys())
            for key in self.data.artist2Album[artist]:
                self.album2artist[key] = artist
    print('Generating random meta-path random walks...')
    self.walks = []
    # meta-path patterns; U=user, T=track, A=artist, Z=album
    p1 = 'UTU'
    p2 = 'UAU'
    p3 = 'UZU'
    p4 = 'UTATU'
    p5 = 'UTZTU'
    p6 = 'UTZAZTU'
    mPaths = [p1, p2, p3, p4, p5, p6] if has_album else [p1, p2, p4]
    for user in self.data.userRecord:
        for mp in mPaths:
            for t in range(self.walkCount):
                path = [user]
                lastNode = user
                nextNode = user
                lastType = 'U'
                # repeat the pattern until roughly walkLength nodes
                # (// keeps an integer count under Python 3)
                for i in range(self.walkLength // len(mp[1:])):
                    for tp in mp[1:]:
                        try:
                            if tp == 'T' and lastType == 'U':
                                nextNode = choice(self.user2track[lastNode])
                            elif tp == 'T' and lastType == 'A':
                                nextNode = choice(self.artist2track[lastNode])
                            elif tp == 'T' and lastType == 'Z':
                                nextNode = choice(self.album2track[lastNode])
                            elif tp == 'A' and lastType == 'T':
                                nextNode = self.track2artst[lastNode]
                            elif tp == 'A' and lastType == 'Z':
                                nextNode = self.album2artist[lastNode]
                            elif tp == 'A' and lastType == 'U':
                                nextNode = choice(self.user2artist[lastNode])
                            elif tp == 'Z' and lastType == 'U':
                                nextNode = choice(self.user2album[lastNode])
                            elif tp == 'Z' and lastType == 'A':
                                nextNode = choice(self.artist2album[lastNode])
                            elif tp == 'Z' and lastType == 'T':
                                nextNode = self.track2album[lastNode]
                            elif tp == 'U':
                                if lastType == 'T':
                                    nextNode = choice(self.track2user[lastNode])
                                elif lastType == 'Z':
                                    nextNode = choice(self.album2user[lastNode])
                                elif lastType == 'A':
                                    nextNode = choice(self.artist2user[lastNode])
                            path.append(nextNode)
                            lastNode = nextNode
                            lastType = tp
                        except (KeyError, IndexError):
                            # dead end on the meta-path: drop collected nodes
                            path = []
                            break
                if path:
                    self.walks.append(path)
    shuffle(self.walks)
    print('walks:', len(self.walks))
    # Training: get top-k friends
    print('Generating user embedding...')
    self.topKSim = {}
    model = w2v.Word2Vec(self.walks, size=self.walkDim, window=5,
                         min_count=0, iter=self.epoch)
    for user in self.data.userRecord:
        uid = self.data.getId(user, 'user')
        self.W[uid] = model.wv[user]
    print('User embedding generated.')
    print('Constructing similarity matrix...')
    progress = 0
    for user1 in self.data.userRecord:
        uSim = []
        progress += 1
        if progress % 200 == 0:
            print(progress, '/', len(self.data.userRecord))
        vec1 = self.W[self.data.getId(user1, 'user')]
        for user2 in self.data.userRecord:
            if user1 != user2:
                vec2 = self.W[self.data.getId(user2, 'user')]
                uSim.append((user2, cosine(vec1, vec2)))
        self.topKSim[user1] = sorted(uSim, key=lambda d: d[1],
                                     reverse=True)[:self.topK]
    # prepare Pu set (own tracks) and IPu set (friends' unseen tracks)
    print('Preparing item sets...')
    self.PositiveSet = defaultdict(dict)
    self.pSet = defaultdict(list)
    self.IPositiveSet = defaultdict(dict)
    self.ipSet = defaultdict(list)
    for user in self.data.userRecord:
        for item in self.data.userRecord[user]:
            self.PositiveSet[user][item['track']] = 1
            self.pSet[user].append(item['track'])
        for friend, sim in self.topKSim[user]:
            for item in self.data.userRecord[friend]:
                if item['track'] not in self.PositiveSet[user]:
                    self.IPositiveSet[user][item['track']] = 1
                    self.ipSet[user].append(item['track'])
    Suk = 0.5  # fixed damping coefficient for the friend-item level
    print('Training...')
    iteration = 0
    itemList = list(self.data.name2id['track'].keys())
    while iteration < self.maxIter:
        self.loss = 0
        for user in self.pSet:
            u = self.data.getId(user, 'user')
            kItems = self.ipSet[user]
            for item in self.pSet[user]:
                i = self.data.getId(item, 'track')
                if len(self.ipSet[user]) > 0:
                    item_k = choice(kItems)
                    k = self.data.getId(item_k, 'track')
                    # level 1: own track i should outrank friend track k
                    s1 = sigmoid((self.P[u].dot(self.Q[i]) -
                                  self.P[u].dot(self.Q[k])) / (Suk + 1))
                    self.P[u] += 1 / (Suk + 1) * self.lRate * (1 - s1) * (
                        self.Q[i] - self.Q[k])
                    self.Q[i] += 1 / (Suk + 1) * self.lRate * (1 - s1) * self.P[u]
                    self.Q[k] -= 1 / (Suk + 1) * self.lRate * (1 - s1) * self.P[u]
                    item_j = choice(itemList)
                    while (item_j in self.PositiveSet[user]
                           or item_j in self.IPositiveSet[user]):
                        item_j = choice(itemList)
                    j = self.data.getId(item_j, 'track')
                    # level 2: friend track k should outrank negative j
                    s2 = sigmoid(self.P[u].dot(self.Q[k]) -
                                 self.P[u].dot(self.Q[j]))
                    self.P[u] += self.lRate * (1 - s2) * (self.Q[k] - self.Q[j])
                    self.Q[k] += self.lRate * (1 - s2) * self.P[u]
                    self.Q[j] -= self.lRate * (1 - s2) * self.P[u]
                    self.P[u] -= self.lRate * self.regU * self.P[u]
                    self.Q[i] -= self.lRate * self.regI * self.Q[i]
                    self.Q[j] -= self.lRate * self.regI * self.Q[j]
                    self.Q[k] -= self.lRate * self.regI * self.Q[k]
                    self.loss += -log(s1) - log(s2)
                else:
                    # no friend tracks: plain BPR pair (i, j)
                    item_j = choice(itemList)
                    while item_j in self.PositiveSet[user]:
                        item_j = choice(itemList)
                    j = self.data.getId(item_j, 'track')
                    s = sigmoid(self.P[u].dot(self.Q[i]) -
                                self.P[u].dot(self.Q[j]))
                    self.P[u] += self.lRate * (1 - s) * (self.Q[i] - self.Q[j])
                    self.Q[i] += self.lRate * (1 - s) * self.P[u]
                    self.Q[j] -= self.lRate * (1 - s) * self.P[u]
                    self.P[u] -= self.lRate * self.regU * self.P[u]
                    self.Q[i] -= self.lRate * self.regI * self.Q[i]
                    self.Q[j] -= self.lRate * self.regI * self.Q[j]
                    self.loss += -log(s)
        # social regularizer: pull users towards their top-k friends
        for user in self.topKSim:
            u = self.data.getId(user, 'user')
            for friend in self.topKSim[user]:
                f = self.data.getId(friend[0], 'user')
                self.P[u] -= self.alpha * self.lRate * (self.P[u] - self.P[f])
        self.loss += self.regU * (self.P * self.P).sum() + self.regI * (
            self.Q * self.Q).sum()
        iteration += 1
        if self.isConverged(iteration):
            break
def buildModel(self):
    """Song-embedding regularized biased MF for track recommendation.

    Tracks are embedded with word2vec over per-user play sequences; the
    resulting top-k track-track similarities then regularize the item
    factors of a biased matrix-factorization model trained on listening
    counts.
    """
    self.T = np.random.rand(self.data.getSize('track'), self.k)
    sentences = []
    self.listenTrack = set()
    # only users with more than 10 records take part in training
    self.user = defaultdict(list)
    for user in self.data.userRecord:
        playList = []
        if len(self.data.userRecord[user]) > 10:
            self.user[user] = self.data.userRecord[user]
            for item in self.data.userRecord[user]:
                playList.append(item['track'])
                self.listenTrack.add(item['track'])
            sentences.append(playList)
    model = w2v.Word2Vec(sentences, size=self.k, window=5, min_count=0, iter=10)
    for track in self.listenTrack:
        tid = self.data.getId(track, 'track')
        self.T[tid] = model.wv[track]
    print('song embedding generated.')
    print('Constructing similarity matrix...')
    progress = 0
    self.topKSim = {}
    for track1 in self.listenTrack:
        tSim = []
        progress += 1
        if progress % 200 == 0:
            print(progress, '/', len(self.listenTrack))
        vec1 = self.T[self.data.getId(track1, 'track')]
        for track2 in self.listenTrack:
            if track1 != track2:
                vec2 = self.T[self.data.getId(track2, 'track')]
                tSim.append((track2, cosine(vec1, vec2)))
        self.topKSim[track1] = sorted(tSim, key=lambda d: d[1],
                                      reverse=True)[:self.topK]
    # aggregate per-user listening counts for the selected record type
    userListen = defaultdict(dict)
    for user in self.user:
        for item in self.user[user]:
            if item[self.recType] not in userListen[user]:
                userListen[user][item[self.recType]] = 0
            userListen[user][item[self.recType]] += 1
    print('training...')
    # NOTE: a large disabled ALS/BPR variant that previously sat here as an
    # inert triple-quoted string was removed as dead code.
    iteration = 0
    while iteration < self.maxIter:
        self.loss = 0
        for user in self.data.name2id['user']:
            u = self.data.getId(user, 'user')
            # NOTE(review): bu is snapshotted once per user (stale across
            # this user's items) while bi is refreshed per item -- kept as
            # in the original
            bu = self.Bu[u]
            for item in userListen[user]:
                i = self.data.getId(item, self.recType)
                bi = self.Bi[i]
                rating = self.Y[i].dot(self.X[u]) + self.data.globalMean \
                    + self.Bu[u] + self.Bi[i]
                error = userListen[user][item] - rating
                self.loss += error ** 2
                self.X[u] += self.lRate * (error * self.Y[i] - self.regU * self.X[u])
                self.Y[i] += self.lRate * (error * self.X[u] - self.regI * self.Y[i])
                self.Bu[u] += self.lRate * (error - self.regB * bu)
                self.Bi[i] += self.lRate * (error - self.regB * bi)
        # pull similar tracks' latent vectors towards each other
        for t1 in self.topKSim:
            tid1 = self.data.getId(t1, 'track')
            for t2 in self.topKSim[t1]:
                tid2 = self.data.getId(t2[0], 'track')
                sim = t2[1]
                error2 = sim - self.Y[tid1].dot(self.Y[tid2])
                self.loss += error2 ** 2
                self.Y[tid1] += 0.5 * self.alpha * self.lRate * error2 * self.Y[tid2]
                self.Y[tid2] += 0.5 * self.alpha * self.lRate * error2 * self.Y[tid1]
        self.loss += self.regB * (self.Bu * self.Bu).sum() \
            + self.regB * (self.Bi * self.Bi).sum() \
            + (self.X * self.X).sum() + (self.Y * self.Y).sum()
        iteration += 1
        print('iteration:', iteration, 'loss:', self.loss)
def buildModel(self):
    """Learn user embeddings via Huffman-tree hierarchical softmax over random
    walks on a collaborative user network (C-U-NET), then run a SocialMF-style
    matrix factorization regularized toward each user's top-K similar users.

    Side effects (attributes written): itemNet, filteredRatings, CUNet, HTree,
    walks, visited, Sim, topKSim, and the latent matrices self.P / self.Q.
    """
    print 'Kind Note: This method will probably take much time.'
    #build C-U-NET
    print 'Building collaborative user network...'
    #filter isolated nodes and low ratings
    self.itemNet = {}
    for item in self.dao.trainSet_i:
        # keep only items rated by more than one user
        if len(self.dao.trainSet_i[item]) > 1:
            self.itemNet[item] = self.dao.trainSet_i[item]
    self.filteredRatings = defaultdict(list)
    for item in self.itemNet:
        for user in self.itemNet[item]:
            # keep only high ratings (> 0.75 on the normalized scale)
            if self.itemNet[item][user] > 0.75:
                self.filteredRatings[user].append(item)
    # edge multiplicity = number of co-liked items, so random walks are
    # weighted by overlap
    self.CUNet = defaultdict(list)
    for user1 in self.filteredRatings:
        s1 = set(self.filteredRatings[user1])
        for user2 in self.filteredRatings:
            if user1 <> user2:
                s2 = set(self.filteredRatings[user2])
                weight = len(s1.intersection(s2))
                if weight > 0:
                    self.CUNet[user1] += [user2] * weight
    #build Huffman Tree First
    #get weight
    print 'Building Huffman tree...'
    #To accelerate the method, the weight is estimated roughly (node degree)
    nodes = {}
    for user in self.CUNet:
        nodes[user] = len(self.CUNet[user])
    nodes = sorted(nodes.iteritems(), key=lambda d: d[1])
    nodes = [HTreeNode(None, None, user[1], user[0]) for user in nodes]
    nodeList = OrderedLinkList()
    for node in nodes:
        listNode = Node()
        listNode.val = node
        try:
            nodeList.insert(listNode)
        except AttributeError:
            # tolerated: insert may fail on an empty/odd list state
            pass
    self.HTree = HuffmanTree(vecLength=self.walkDim)
    self.HTree.buildTree(nodeList)
    print 'Coding for all users...'
    # assign a binary Huffman code to every user (leaf)
    self.HTree.coding(self.HTree.root, '', 0)
    print 'Generating random deep walks...'
    self.walks = []
    self.visited = defaultdict(dict)
    for user in self.CUNet:
        for t in range(self.walkCount):
            path = [user]
            for i in range(1, self.walkLength):
                # NOTE(review): samples neighbours of the walk's *start* user
                # at every step (not of the last node) — verify intent.
                nextNode = choice(self.CUNet[user])
                count = 0
                while (self.visited[user].has_key(nextNode)):
                    nextNode = choice(self.CUNet[user])
                    #break infinite loop
                    count += 1
                    if count == 10:
                        break
                path.append(nextNode)
                self.visited[user][nextNode] = 1
            self.walks.append(path)
            #print path
    shuffle(self.walks)
    #Training get top-k friends
    print 'Generating user embedding...'
    iteration = 1
    while iteration <= self.epoch:
        loss = 0
        for walk in self.walks:
            for user in walk:
                # hierarchical softmax: predict each walk member from the
                # walk's center user, updating vectors along the code path
                centerUser = walk[len(walk) / 2]
                if user <> centerUser:
                    code = self.HTree.code[user]
                    centerCode = self.HTree.code[centerUser]
                    x = self.HTree.vector[centerCode]
                    for i in range(1, len(code)):
                        prefix = code[0:i]
                        w = self.HTree.vector[prefix]
                        self.HTree.vector[prefix] += self.lRate * (1 - sigmoid(w.dot(x))) * x
                        self.HTree.vector[centerCode] += self.lRate * (1 - sigmoid(w.dot(x))) * w
                        loss += -log(sigmoid(w.dot(x)))
        print 'iteration:', iteration, 'loss:', loss
        iteration += 1
    print 'User embedding generated.'
    print 'Constructing similarity matrix...'
    self.Sim = SymmetricMatrix(len(self.CUNet))
    for user1 in self.CUNet:
        for user2 in self.CUNet:
            if user1 <> user2:
                prefix1 = self.HTree.code[user1]
                vec1 = self.HTree.vector[prefix1]
                prefix2 = self.HTree.code[user2]
                vec2 = self.HTree.vector[prefix2]
                if self.Sim.contains(user1, user2):
                    continue
                sim = cosine(vec1, vec2)
                self.Sim.set(user1, user2, sim)
    self.topKSim = {}
    for user in self.CUNet:
        self.topKSim[user] = sorted(self.Sim[user].iteritems(), key=lambda d: d[1], reverse=True)[:self.topK]
    print 'Similarity matrix finished.'
    #print self.topKSim
    #matrix decomposition
    print 'Decomposing...'
    iteration = 0
    while iteration < self.maxIter:
        self.loss = 0
        for entry in self.dao.trainingData:
            user, item, rating = entry
            u = self.dao.user[user]  #get user id
            i = self.dao.item[item]  #get item id
            error = rating - self.P[u].dot(self.Q[i])
            self.loss += error**2
            p = self.P[u]
            q = self.Q[i]
            #update latent vectors
            self.P[u] += self.lRate * (error * q - self.regU * p)
            self.Q[i] += self.lRate * (error * p - self.regI * q)
        # social regularization: pull each user toward top-K similar users
        for user in self.CUNet:
            u = self.dao.user[user]
            friends = self.topKSim[user]
            for friend in friends:
                uf = self.dao.user[friend[0]]
                self.P[u] -= self.lRate * (self.P[u] - self.P[uf]) * self.alpha
                self.loss += self.alpha * (self.P[u] - self.P[uf]).dot(self.P[u] - self.P[uf])
        self.loss += self.regU * (self.P * self.P).sum() + self.regI * (self.Q * self.Q).sum()
        iteration += 1
        if self.isConverged(iteration):
            break
def buildModel(self): #build a list for weighted negative sampling negCandidate = [] # for track in self.data.trackListened: # count = sum(self.data.trackListened[track].values()) # id = self.data.getId(track,'track') # negCandidate+=[id]*count # print 'learning music embedding...' # iteration = 0 # while iteration < self.epoch: # loss = 0 # for user in self.data.userRecord: # u = self.data.getId(user, 'user') # #global user preference # global_uv = np.zeros(self.k) # for event in self.data.userRecord[user]: # id = self.data.getId(event['track'], 'track') # global_uv += self.Q[id] # global_uv /= len(self.data.userRecord[user]) # # #song embedding # for i in range(len(self.data.userRecord[user])): # start = max(0,i-self.winSize/2) # end = min(i+self.winSize/2,len(self.data.userRecord[user])-1) # local = self.data.userRecord[user][start:i]+self.data.userRecord[user][i+1:end+1] # local_v = np.zeros(self.k) # for event in local: # id = self.data.getId(event['track'],'track') # local_v+=self.Q[id] # v_hat = (global_uv+local_v)/(end-start+1) # center_id = self.data.getId(self.data.userRecord[user][i]['track'],'track') # center_v = self.Q[center_id] # gradient = self.lRate*(1-sigmoid(v_hat.dot(center_v)))*v_hat # gradient2 = self.lRate*(1-sigmoid(v_hat.dot(center_v)))*center_v # self.Q[center_id]+=gradient # global_uv+=gradient/len(self.data.userRecord[user]) # global_uv+=gradient2/len(self.data.userRecord[user])*(end-start) # for event in local: # id = self.data.getId(event['track'],'track') # self.Q[id]+=gradient2/(end-start+1) # loss+= -log(sigmoid(v_hat.dot(center_v))) # #negative sampling # for j in range(self.negCount): # neg_id = choice(negCandidate) # while neg_id==center_id: # neg_id = choice(negCandidate) # neg_v = self.Q[neg_id] # gradient = -self.lRate * (1 - sigmoid(v_hat.dot(neg_v))) * v_hat # gradient2 = -self.lRate * (1 - sigmoid(v_hat.dot(neg_v))) * neg_v # self.Q[center_id]+=gradient # for event in local: # id = self.data.getId(event['track'], 'track') # 
self.Q[id] += gradient2 / (end - start + 1) # loss+=-(log(1-sigmoid(neg_v.dot(v_hat)))) sentences = [] for user in self.data.userRecord: playList = [] for item in self.data.userRecord[user]: playList.append(item['track']) sentences.append(playList) model = w2v.Word2Vec(sentences, size=self.k, window=5, min_count=0, iter=10, sg=1) for track in self.data.trackListened: tid = self.data.getId(track, 'track') self.Q[tid] = model.wv[track] # #regularization # for album in self.data.album2Track: # for track1 in self.data.album2Track[album]: # for track2 in self.data.album2Track[album]: # t1 = self.data.getId(track1,'track') # t2 = self.data.getId(track2,'track') # v1 = self.Q[t1] # v2 = self.Q[t2] # self.Q[t1]+=self.lRate*(exp(v1.dot(v2))*v2) # self.Q[t2] += self.lRate*(exp(v1.dot(v2))*v1) # # #print 'window %d finished' %(i) # #print 'user %s finished.' %(user) # iteration+=1 # print 'iteration %d, loss %.4f' %(iteration,loss) #preference embedding self.R = np.zeros((self.data.getSize('user'), self.k)) for user in self.data.userRecord['user']: uid = self.data.getId(user, 'user') global_uv = np.zeros(self.k) local_uv = np.zeros(self.k) for event in self.data.userRecord[user]: tid = self.data.getId(event['track'], 'track') global_uv += self.Q[tid] self.P[uid] = global_uv / len(self.data.userRecord['user']) recent = max(0, len(self.data.userRecord[user]) - 20) for event in self.data.userRecord[user][recent:]: tid = self.data.getId(event['track'], 'track') local_uv += self.Q[tid] self.R[uid] = local_uv / recent for t1 in self.data.trackListened: if len(self.data.trackListened[t1]) < 200: continue xiangsi = '' m = 0 s = '' n = '' mi = 10000 s1 = set(self.data.trackListened[t1].keys()) for t2 in self.data.trackListened: if t1 != t2: s2 = set(self.data.trackListened[t2].keys()) l = len(s1.intersection(s2)) if l > m and l > 50: m = l s = t2 if l < mi: mi = l n = t2 print t1, s, cosine(self.Q[self.data.getId(t1, 'track')], self.Q[self.data.getId(s, 'track')]), m print t1, n, 
cosine(self.Q[self.data.getId(t1, 'track')], self.Q[self.data.getId(n, 'track')]), mi break
def buildModel(self):
    """IF-BPR-style training: clean the social graph, generate meta-path
    guided random walks over a user-friend network, learn positive/negative
    user embeddings with word2vec, derive top-K similar users, and finally
    optimize ranking with friend-transferred item sets and an adaptive
    per-user similarity threshold.
    """
    # (commented out here in the original: alternative latent-matrix init
    #  and an optional pickle-based reload of precomputed p/n similarities)
    #data clean: drop followees/followers unknown to the rating data
    cleanList = []
    cleanPair = []
    for user in self.sao.followees:
        if not self.dao.user.has_key(user):
            cleanList.append(user)
        for u2 in self.sao.followees[user]:
            if not self.dao.user.has_key(u2):
                cleanPair.append((user, u2))
    for u in cleanList:
        del self.sao.followees[u]
    for pair in cleanPair:
        if self.sao.followees.has_key(pair[0]):
            del self.sao.followees[pair[0]][pair[1]]
    cleanList = []
    cleanPair = []
    for user in self.sao.followers:
        if not self.dao.user.has_key(user):
            cleanList.append(user)
        for u2 in self.sao.followers[user]:
            if not self.dao.user.has_key(u2):
                cleanPair.append((user, u2))
    for u in cleanList:
        del self.sao.followers[u]
    for pair in cleanPair:
        if self.sao.followers.has_key(pair[0]):
            del self.sao.followers[pair[0]][pair[1]]
    print 'Kind Note: This method will probably take much time.'
    # build U-F-NET
    print 'Building weighted user-friend network...'
    # filter isolated nodes and low ratings
    # Definition of Meta-Path (U=user, I=item, F=followee, T=follower)
    p1 = 'UIU'
    p2 = 'UFU'
    p3 = 'UTU'
    p4 = 'UFIU'
    p5 = 'UFUIU'
    mPaths = [p1, p2, p3, p4, p5]
    # G: embedding matrix learned from negative walks; W: from positive walks
    self.G = np.random.rand(self.dao.trainingSize()[0], self.walkDim) / 10
    self.W = np.random.rand(self.dao.trainingSize()[0], self.walkDim) / 10
    # followee network; edge multiplicity = shared-followee count + 1
    self.UFNet = defaultdict(list)
    for user1 in self.sao.followees:
        s1 = set(self.sao.followees[user1])
        for user2 in self.sao.followees[user1]:
            if self.sao.followees.has_key(user2):
                if user1 <> user2:
                    s2 = set(self.sao.followees[user2])
                    weight = len(s1.intersection(s2))
                    self.UFNet[user1] += [user2] * (weight + 1)
    # follower network, built symmetrically
    self.UTNet = defaultdict(list)
    for user1 in self.sao.followers:
        s1 = set(self.sao.followers[user1])
        for user2 in self.sao.followers[user1]:
            if self.sao.followers.has_key(user2):
                if user1 <> user2:
                    s2 = set(self.sao.followers[user2])
                    weight = len(s1.intersection(s2))
                    self.UTNet[user1] += [user2] * (weight + 1)
    print 'Generating random meta-path random walks... (Positive)'
    self.pWalks = []
    #self.usercovered = {}
    # positive walks: 'I' steps sample from self.positive
    for user in self.dao.user:
        for mp in mPaths:
            # walk budget per meta-path pattern
            if mp == p1:
                self.walkCount = 10
            if mp == p2:
                self.walkCount = 8
            if mp == p3:
                self.walkCount = 8
            if mp == p4:
                self.walkCount = 5
            if mp == p5:
                self.walkCount = 5
            for t in range(self.walkCount):
                path = ['U' + user]
                lastNode = user
                nextNode = user
                lastType = 'U'
                for i in range(self.walkLength / len(mp[1:])):
                    for tp in mp[1:]:
                        try:
                            if tp == 'I':
                                nextNode = choice(self.positive[lastNode])
                            if tp == 'U':
                                if lastType == 'I':
                                    nextNode = choice(self.pItems[lastNode])
                                elif lastType == 'F':
                                    nextNode = choice(self.UFNet[lastNode])
                                    while not self.dao.user.has_key(nextNode):
                                        nextNode = choice(self.UFNet[lastNode])
                                elif lastType == 'T':
                                    nextNode = choice(self.UTNet[lastNode])
                                    while not self.dao.user.has_key(nextNode):
                                        nextNode = choice(self.UTNet[lastNode])
                            if tp == 'F':
                                nextNode = choice(self.UFNet[lastNode])
                                while not self.dao.user.has_key(nextNode):
                                    nextNode = choice(self.UFNet[lastNode])
                            if tp == 'T':
                                # NOTE(review): samples UFNet for a 'T' step
                                # (UTNet might be intended) — verify.
                                nextNode = choice(self.UFNet[lastNode])
                                while not self.dao.user.has_key(nextNode):
                                    nextNode = choice(self.UFNet[lastNode])
                            path.append(tp + nextNode)
                            lastNode = nextNode
                            lastType = tp
                        except (KeyError, IndexError):
                            # dead end: discard this walk
                            path = []
                            break
                if path:
                    self.pWalks.append(path)
    self.nWalks = []
    # self.usercovered = {}
    #negative walks: identical procedure but 'I' steps sample from self.negative
    for user in self.dao.user:
        for mp in mPaths:
            if mp == p1:
                self.walkCount = 10
            if mp == p2:
                self.walkCount = 8
            if mp == p3:
                self.walkCount = 8
            if mp == p4:
                self.walkCount = 5
            if mp == p5:
                self.walkCount = 5
            for t in range(self.walkCount):
                path = ['U' + user]
                lastNode = user
                nextNode = user
                lastType = 'U'
                for i in range(self.walkLength / len(mp[1:])):
                    for tp in mp[1:]:
                        try:
                            if tp == 'I':
                                nextNode = choice(self.negative[lastNode])
                            if tp == 'U':
                                if lastType == 'I':
                                    nextNode = choice(self.nItems[lastNode])
                                elif lastType == 'F':
                                    nextNode = choice(self.UFNet[lastNode])
                                    while not self.dao.user.has_key(nextNode):
                                        nextNode = choice(self.UFNet[lastNode])
                                elif lastType == 'T':
                                    nextNode = choice(self.UTNet[lastNode])
                                    while not self.dao.user.has_key(nextNode):
                                        nextNode = choice(self.UTNet[lastNode])
                            if tp == 'F':
                                nextNode = choice(self.UFNet[lastNode])
                                while not self.dao.user.has_key(nextNode):
                                    nextNode = choice(self.UFNet[lastNode])
                            if tp == 'T':
                                nextNode = choice(self.UFNet[lastNode])
                                while not self.dao.user.has_key(nextNode):
                                    nextNode = choice(self.UFNet[lastNode])
                            path.append(tp + nextNode)
                            lastNode = nextNode
                            lastType = tp
                        except (KeyError, IndexError):
                            path = []
                            break
                if path:
                    self.nWalks.append(path)
    shuffle(self.pWalks)  # (nWalks are left unshuffled in the original)
    print 'pwalks:', len(self.pWalks)
    print 'nwalks:', len(self.nWalks)
    # Training get top-k friends
    print 'Generating user embedding...'
    self.pTopKSim = {}
    self.nTopKSim = {}
    self.pSimilarity = defaultdict(dict)
    self.nSimilarity = defaultdict(dict)
    model = w2v.Word2Vec(self.pWalks, size=self.walkDim, window=5, min_count=0, iter=10)
    model2 = w2v.Word2Vec(self.nWalks, size=self.walkDim, window=5, min_count=0, iter=10)
    for user in self.positive:
        uid = self.dao.user[user]
        try:
            self.W[uid] = model.wv['U' + user]
        except KeyError:
            # user never appeared in a kept walk; keep random init
            continue
    for user in self.negative:
        uid = self.dao.user[user]
        try:
            self.G[uid] = model2.wv['U' + user]
        except KeyError:
            continue
    print 'User embedding generated.'
    print 'Constructing similarity matrix...'
    i = 0
    # top-K similar users from the positive embedding
    for user1 in self.positive:
        uSim = []
        i += 1
        if i % 200 == 0:
            print i, '/', len(self.positive)
        vec1 = self.W[self.dao.user[user1]]
        for user2 in self.positive:
            if user1 <> user2:
                vec2 = self.W[self.dao.user[user2]]
                sim = cosine(vec1, vec2)
                uSim.append((user2, sim))
        fList = sorted(uSim, key=lambda d: d[1], reverse=True)[:self.topK]
        # per-user threshold initialized at the median of the top-K sims
        self.threshold[user1] = fList[self.topK / 2][1]
        for pair in fList:
            self.pSimilarity[user1][pair[0]] = pair[1]
        self.pTopKSim[user1] = [item[0] for item in fList]
        self.avg_sim[user1] = sum([item[1] for item in fList][:self.topK / 2]) / (self.topK / 2)
    # (commented out here in the original: pickle dumps of pSimilarity,
    #  avg_sim and threshold)
    i = 0
    # top-K similar users from the negative embedding
    for user1 in self.negative:
        uSim = []
        i += 1
        if i % 200 == 0:
            print i, '/', len(self.negative)
        vec1 = self.G[self.dao.user[user1]]
        for user2 in self.negative:
            if user1 <> user2:
                vec2 = self.G[self.dao.user[user2]]
                sim = cosine(vec1, vec2)
                uSim.append((user2, sim))
        fList = sorted(uSim, key=lambda d: d[1], reverse=True)[:self.topK]
        for pair in fList:
            self.nSimilarity[user1][pair[0]] = pair[1]
        self.nTopKSim[user1] = [item[0] for item in fList]
    # "true" friends agree in both positive and negative views;
    # they are removed from the plain positive top-K list
    self.trueTopKFriends = defaultdict(list)
    for user in self.pTopKSim:
        trueFriends = list(set(self.pTopKSim[user]).intersection(set(self.nTopKSim[user])))
        self.trueTopKFriends[user] = trueFriends
        # if len(trueFriends)>0:
        #     print trueFriends
        self.pTopKSim[user] = list(set(self.pTopKSim[user]).difference(set(trueFriends)))
    # print 'Similarity matrix finished.'
    # (commented out here in the original: pickle dumps of the similarity
    #  structures and embedding matrices)
    # matrix decomposition
    print 'Decomposing...'
    self.F = np.random.rand(self.dao.trainingSize()[0], self.k) / 10
    # prepare Pu set, IPu set, and Nu set
    print 'Preparing item sets...'
    self.PositiveSet = defaultdict(dict)
    self.NegSets = defaultdict(dict)
    for user in self.dao.user:
        for item in self.dao.trainSet_u[user]:
            self.PositiveSet[user][item] = 1
    for user in self.dao.user:
        for item in self.negative[user]:
            if self.dao.item.has_key(item):
                self.NegSets[user][item] = 1
    iteration = 0
    while iteration < self.maxIter:
        self.loss = 0
        # rebuild friend-transferred item sets every epoch (threshold adapts)
        self.IPositiveSet = defaultdict(dict)
        self.OKSet = defaultdict(dict)
        for user in self.dao.user:
            if self.trueTopKFriends.has_key(user):
                for friend in self.trueTopKFriends[user][:self.topK]:
                    if self.dao.user.has_key(friend) and self.pSimilarity[user][friend] >= self.threshold[user]:
                        for item in self.positive[friend]:
                            if not self.PositiveSet[user].has_key(item) and not self.NegSets[user].has_key(item):
                                self.IPositiveSet[user][item] = friend
            if self.pTopKSim.has_key(user):
                for friend in self.pTopKSim[user][:self.topK]:
                    if self.dao.user.has_key(friend) and self.pSimilarity[user][friend] >= self.threshold[user]:
                        for item in self.positive[friend]:
                            if not self.PositiveSet[user].has_key(item) and not self.IPositiveSet[user].has_key(item) and not self.NegSets[user].has_key(item):
                                self.OKSet[user][item] = friend
            if self.nTopKSim.has_key(user):
                for friend in self.nTopKSim[user][:self.topK]:
                    if self.dao.user.has_key(friend):  #and self.nSimilarity[user][friend]>=self.threshold[user]:
                        for item in self.negative[friend]:
                            if self.dao.item.has_key(item):
                                # NOTE(review): `self.OKSet.has_key(item)` checks
                                # an item against the *user-keyed* OKSet —
                                # probably `self.OKSet[user].has_key(item)` was
                                # intended; verify before changing.
                                if not self.PositiveSet[user].has_key(item) and not self.IPositiveSet[user].has_key(item) \
                                        and not self.OKSet.has_key(item):
                                    if not self.NegSets[user].has_key(item):
                                        self.NegSets[user][item] = 1
                                    else:
                                        self.NegSets[user][item] += 1
        itemList = self.dao.item.keys()
        for user in self.PositiveSet:
            #itemList = self.NegSets[user].keys()
            kItems = self.IPositiveSet[user].keys()
            okItems = self.OKSet[user].keys()
            nItems = self.NegSets[user].keys()
            u = self.dao.user[user]
            for item in self.PositiveSet[user]:
                i = self.dao.item[item]
                for ind in range(1):
                    # ranking order: observed > true-friend item > ok-friend
                    # item > unobserved > negative-friend item
                    if len(kItems) > 0 and len(okItems) > 0:
                        item_k = choice(kItems)
                        uf = self.IPositiveSet[user][item_k]
                        k = self.dao.item[item_k]
                        self.optimization_thres(u, i, k, user, uf)
                        item_ok = choice(okItems)
                        ok = self.dao.item[item_ok]
                        self.optimization(u, k, ok)
                        item_j = choice(itemList)
                        while (self.PositiveSet[user].has_key(item_j) or self.IPositiveSet[user].has_key(item_j)
                               or self.OKSet[user].has_key(item_j)):
                            item_j = choice(itemList)
                        j = self.dao.item[item_j]
                        self.optimization(u, ok, j)
                    elif len(kItems) == 0 and len(okItems) > 0:
                        item_ok = choice(okItems)
                        ok = self.dao.item[item_ok]
                        uf = self.OKSet[user][item_ok]
                        self.optimization_thres(u, i, ok, user, uf)
                        item_j = choice(itemList)
                        while (self.PositiveSet[user].has_key(item_j) or self.IPositiveSet[user].has_key(item_j)
                               or self.OKSet[user].has_key(item_j)):
                            item_j = choice(itemList)
                        j = self.dao.item[item_j]
                        self.optimization(u, ok, j)
                    elif len(kItems) > 0 and len(okItems) == 0:
                        item_k = choice(kItems)
                        uf = self.IPositiveSet[user][item_k]
                        k = self.dao.item[item_k]
                        self.optimization_thres(u, i, k, user, uf)
                        item_j = choice(itemList)
                        while (self.PositiveSet[user].has_key(item_j) or self.IPositiveSet[user].has_key(item_j)
                               or self.OKSet[user].has_key(item_j)):
                            item_j = choice(itemList)
                        j = self.dao.item[item_j]
                        self.optimization(u, k, j)
                    else:
                        # plain BPR pair: observed item vs. random unobserved
                        item_j = choice(itemList)
                        while (self.PositiveSet[user].has_key(item_j) or self.IPositiveSet[user].has_key(item_j)
                               or self.OKSet[user].has_key(item_j)):
                            item_j = choice(itemList)
                        j = self.dao.item[item_j]
                        self.optimization(u, i, j)
                    if len(nItems) > 0:
                        item_n = choice(nItems)
                        n = self.dao.item[item_n]
                        self.optimization(u, j, n)
            # adapt the per-user similarity threshold from accumulated
            # gradients (thres_d / thres_count filled by optimization_thres)
            if self.thres_count[user] > 0:
                self.threshold[user] -= self.lRate * self.thres_d[user] / self.thres_count[user]
                self.thres_d[user] = 0
                self.thres_count[user] = 0
                li = [sim for sim in self.pSimilarity[user].values() if sim >= self.threshold[user]]
                if len(li) == 0:
                    self.avg_sim[user] = self.threshold[user]
                else:
                    self.avg_sim[user] = sum(li) / (len(li) + 0.0)
            # pull the user toward true friends above the threshold (x2)
            for abc in range(2):
                for friend in self.trueTopKFriends[user]:
                    if self.pSimilarity[user][friend] > self.threshold[user]:
                        u = self.dao.user[user]
                        f = self.dao.user[friend]
                        self.P[u] -= self.alpha * self.lRate * (self.P[u] - self.P[f])
        self.loss += self.regU * (self.P * self.P).sum() + self.regI * (self.Q * self.Q).sum()
        iteration += 1
        if self.isConverged(iteration):
            break
def buildModel(self):
    """Build a collaborative user network from listening records, learn user
    embeddings via word2vec over random walks, derive each user's top-K
    similar users, and train a BPR-style ranker with a friend-transferred
    intermediate item set (observed > friends' items > unobserved, the second
    margin scaled by 1/self.s).
    """
    print('Kind Note: This method will probably take much time.')
    #build C-U-NET
    print('Building collaborative user network...')
    userListen = defaultdict(dict)
    for user in self.data.userRecord:
        for item in self.data.userRecord[user]:
            userListen[user][item[self.recType]] = 1
    # edge multiplicity = number of co-listened items
    self.CUNet = defaultdict(list)
    for user1 in userListen:
        s1 = set(userListen[user1].keys())
        for user2 in userListen:
            if user1 != user2:
                s2 = set(userListen[user2].keys())
                weight = len(s1.intersection(s2))
                if weight > 0:
                    self.CUNet[user1] += [user2] * weight
    print('Generating random deep walks...')
    self.walks = []
    self.visited = defaultdict(dict)
    for user in self.CUNet:
        for t in range(self.walkCount):
            path = [user]
            lastNode = user
            for i in range(1, self.walkLength):
                nextNode = choice(self.CUNet[lastNode])
                count = 0
                while (nextNode in self.visited[lastNode]):
                    nextNode = choice(self.CUNet[lastNode])
                    #break infinite loop
                    count += 1
                    if count == 10:
                        break
                path.append(nextNode)
                # NOTE(review): marks visited under the walk's *start* user but
                # tests membership under lastNode above — verify intent.
                self.visited[user][nextNode] = 1
                lastNode = nextNode
            self.walks.append(path)
    shuffle(self.walks)
    #Training get top-k friends
    print('Generating user embedding...')
    model = w2v.Word2Vec(self.walks, size=self.walkDim, window=self.winSize, min_count=0, iter=self.epoch)
    print('User embedding generated.')
    print('Constructing similarity matrix...')
    self.W = np.random.rand(self.data.getSize('user'), self.k) / 10  # global user preference
    self.topKSim = {}
    i = 0
    for user in self.CUNet:
        u = self.data.getId(user, 'user')
        self.W[u] = model.wv[user]
    # pairwise cosine similarities -> per-user top-K list of (user, sim)
    for user1 in self.CUNet:
        sims = []
        u1 = self.data.getId(user1, 'user')
        for user2 in self.CUNet:
            if user1 != user2:
                u2 = self.data.getId(user2, 'user')
                sims.append((user2, cosine(self.W[u1], self.W[u2])))
        self.topKSim[user1] = sorted(sims, key=lambda d: d[1], reverse=True)[:self.topK]
        i += 1
        if i % 200 == 0:
            print('progress:', i, '/', len(self.CUNet))
    print('Similarity matrix finished.')
    #print self.topKSim
    #prepare Pu set, IPu set, and Nu set
    print('Preparing item sets...')
    self.PositiveSet = defaultdict(list)
    self.IPositiveSet = defaultdict(list)
    #self.NegativeSet = defaultdict(list)
    for user in self.data.userRecord:
        for event in self.data.userRecord[user]:
            self.PositiveSet[user].append(event[self.recType])
    # IPu: items liked by top-K similar users but not by the user
    for user in self.CUNet:
        for friend in self.topKSim[user]:
            self.IPositiveSet[user] += list(set(self.PositiveSet[friend[0]]).difference(self.PositiveSet[user]))
    print('Training...')
    iteration = 0
    while iteration < self.maxIter:
        self.loss = 0
        itemList = list(self.data.name2id[self.recType].keys())
        for user in self.PositiveSet:
            u = self.data.getId(user, 'user')
            for item in self.PositiveSet[user]:
                i = self.data.getId(item, self.recType)
                for n in range(3):  # 3 sampled comparisons per positive item
                    if len(self.IPositiveSet[user]) > 0:
                        # observed item i should outrank friend item k ...
                        item_k = choice(self.IPositiveSet[user])
                        k = self.data.getId(item_k, self.recType)
                        self.P[u] += self.lRate * (
                            1 - sigmoid(self.P[u].dot(self.Q[i]) -
                                        self.P[u].dot(self.Q[k]))) * (
                                            self.Q[i] - self.Q[k])
                        self.Q[i] += self.lRate * (1 - sigmoid(self.P[u].dot(self.Q[i]) -
                                                               self.P[u].dot(self.Q[k]))) * \
                                     self.P[u]
                        self.Q[k] -= self.lRate * (1 - sigmoid(self.P[u].dot(self.Q[i]) -
                                                               self.P[u].dot(self.Q[k]))) * \
                                     self.P[u]
                        item_j = ''
                        # if len(self.NegativeSet[user])>0:
                        #     item_j = choice(self.NegativeSet[user])
                        # else:
                        item_j = choice(itemList)
                        while (user in self.data.listened[self.recType][item_j]):
                            item_j = choice(itemList)
                        j = self.data.getId(item_j, self.recType)
                        # ... and k should outrank random unobserved j,
                        # with the margin softened by 1/self.s
                        self.P[u] += (1 / self.s) * self.lRate * (
                            1 - sigmoid(
                                (1 / self.s) *
                                (self.P[u].dot(self.Q[k]) - self.P[u].dot(
                                    self.Q[j])))) * (self.Q[k] - self.Q[j])
                        self.Q[k] += (1 / self.s) * self.lRate * (
                            1 - sigmoid(
                                (1 / self.s) *
                                (self.P[u].dot(self.Q[k]) -
                                 self.P[u].dot(self.Q[j])))) * self.P[u]
                        self.Q[j] -= (1 / self.s) * self.lRate * (
                            1 - sigmoid(
                                (1 / self.s) *
                                (self.P[u].dot(self.Q[k]) -
                                 self.P[u].dot(self.Q[j])))) * self.P[u]
                        self.P[u] -= self.lRate * self.regU * self.P[u]
                        self.Q[i] -= self.lRate * self.regI * self.Q[i]
                        self.Q[j] -= self.lRate * self.regI * self.Q[j]
                        self.Q[k] -= self.lRate * self.regI * self.Q[k]
                        self.loss += -log(sigmoid(self.P[u].dot(self.Q[i]) - self.P[u].dot(self.Q[k]))) - \
                                     log(sigmoid((1 / self.s) * (self.P[u].dot(self.Q[k]) - self.P[u].dot(self.Q[j]))))
                    else:
                        # no friend items: plain BPR pair (i vs. random j)
                        item_j = choice(itemList)
                        while (user in self.data.listened[self.recType][item_j]):
                            item_j = choice(itemList)
                        j = self.data.getId(item_j, self.recType)
                        self.P[u] += self.lRate * (
                            1 - sigmoid(self.P[u].dot(self.Q[i]) -
                                        self.P[u].dot(self.Q[j]))) * (
                                            self.Q[i] - self.Q[j])
                        self.Q[i] += self.lRate * (
                            1 - sigmoid(self.P[u].dot(self.Q[i]) -
                                        self.P[u].dot(self.Q[j]))) * self.P[u]
                        self.Q[j] -= self.lRate * (
                            1 - sigmoid(self.P[u].dot(self.Q[i]) -
                                        self.P[u].dot(self.Q[j]))) * self.P[u]
                        self.loss += -log(
                            sigmoid(self.P[u].dot(self.Q[i]) -
                                    self.P[u].dot(self.Q[j])))
        self.loss += self.regU * (self.P * self.P).sum() + self.regI * (self.Q * self.Q).sum()
        iteration += 1
        if self.isConverged(iteration):
            break
def buildNetwork(self):
    """Build the collaborative track network (C-T-NET), run random walks over
    it, learn track embeddings with word2vec, and persist each track's top-20
    most similar tracks (by cosine) to 'nsim.txt' via pickle.

    Side effects: fills self.trainingData, self.trackNet, self.filteredListen,
    self.CTNet, self.T_walks, self.T_visited, self.T and self.nSim, and writes
    the pickle file.

    Fixes vs. the previous revision:
      - the pickle output file is now closed deterministically (``with``);
      - typo in a progress message ('Genrerating' -> 'Generating').
    """
    self.trainingData = []
    print('Kind Note: This method will take much time')
    # build C-T-NET
    print('Building collaborative track network')
    self.trackNet = {}
    self.filteredListen = defaultdict(list)
    for track in self.data.trackRecord:
        if len(self.data.trackRecord[track]) > 0:
            self.trackNet[track] = self.data.trackRecord[track]
    for track in self.trackNet:
        tid = self.data.getId(track, 'track')
        for item in self.trackNet[track]:
            uid = self.data.getId(item['user'], 'user')
            # NOTE(review): '>= 0' accepts every listen count, so no event is
            # filtered here; a stricter threshold may have been intended —
            # confirm before tightening.
            if self.userListen[uid][tid] >= 0:
                self.filteredListen[track].append(item['user'])
                self.trainingData.append(item)
    # two tracks are linked once per shared listener (edge multiplicity
    # encodes co-listening weight for the walks below)
    self.CTNet = defaultdict(list)
    i = 0
    for track1 in self.filteredListen:
        i += 1
        if i % 200 == 0:
            print(i, '/', len(self.filteredListen))
        s1 = set(self.filteredListen[track1])
        for track2 in self.filteredListen:
            if track1 != track2:
                s2 = set(self.filteredListen[track2])
                weight = len(s1.intersection(s2))
                if weight > 0:
                    self.CTNet[track1] += [track2] * weight
    ######################## track C-T-NET construction finished ############################
    print('Generating random deep walks...')
    self.T_walks = []
    self.T_visited = defaultdict(dict)
    for track in self.CTNet:
        for t in range(10):  # 10 walks per start track
            path = [track]
            lastNode = track
            for i in range(1, 10):  # walk length 10
                nextNode = choice(self.CTNet[lastNode])
                count = 0
                #while(nextNode in self.T_visited[lastNode] or nextNode not in self.aSim[lastNode]):
                while (nextNode in self.T_visited[lastNode]):
                    nextNode = choice(self.CTNet[lastNode])
                    #break infinite loop
                    count += 1
                    if count == 10:
                        break
                path.append(nextNode)
                # NOTE(review): marks lastNode (not nextNode) as visited,
                # unlike the sibling user-walk code — verify intent before
                # changing.
                self.T_visited[track][lastNode] = 1
                lastNode = nextNode
            self.T_walks.append(path)
    shuffle(self.T_walks)
    ##del self.aSim
    print('Generating track embedding')
    model = w2v.Word2Vec(self.T_walks, size=self.k, window=5, min_count=0, iter=3)
    print('Track embedding generated')
    self.T = np.random.rand(self.data.getSize('track'), self.k)
    print('Constructing similarity matrix...')
    i = 0
    self.nSim = {}
    for track1 in self.CTNet:
        tSim = []
        i += 1
        if i % 1000 == 0:
            print(i, '/', len(self.CTNet))
        vec1 = model.wv[track1]
        tid1 = self.data.getId(track1, 'track')
        for track2 in self.CTNet:
            if track1 != track2:
                tid2 = self.data.getId(track2, 'track')
                vec2 = model.wv[track2]
                # floor at 1e-6 so downstream code never sees sim <= 0
                sim = max(1e-6, cosine(vec1, vec2))
                tSim.append((tid2, sim))
                #self.nSim[t1][t2] = sim
        self.nSim[tid1] = sorted(tSim, key=lambda d: d[1], reverse=True)[:20]
    # persist the similarity table; the context manager guarantees the file
    # handle is flushed and closed (previously it was leaked)
    file1 = 'nsim.txt'
    with open(file1, 'wb') as df1:
        pickle.dump(self.nSim, df1)