import time

import numpy as np
from scipy import sparse
from scipy.sparse.linalg import spsolve

# MongoConnector is the project's own MongoDB wrapper; this import path is assumed
from mongo_connector import MongoConnector


class ImplicitMF():

    def __init__(self):
        # Initialisation of model parameters and id <-> index mappings
        self.num_factors = 40
        self.num_iterations = 30
        self.reg_param = 5.0
        self.alpha = 10.0
        self.client = MongoConnector()
        # Maps user ids to matrix indexes
        self.user_index = {}
        # Maps item ids to matrix indexes
        self.item_index = {}
        # Maps matrix indexes to item ids
        self.index_item = {}
        # Maps matrix indexes to user ids
        self.index_user = {}
        self.errors = []

    def process(self):
        self.users = self.client.current_db.users.find()
        # Matrix factorisation only applies to media items in the system, not FB or Twitter
        self.items = self.client.current_db.user_media.find({'$or': [
            {'link': {'$regex': ".*spotify.*"}},
            {'link': {'$regex': ".*soundcloud.*"}},
            {'link': {'$regex': ".*youtube.*"}},
            {'link': {'$regex': ".*vimeo.*"}}]})
        self.num_users = self.users.count()
        self.num_items = self.items.count()
        self.counts = np.zeros((self.num_users, self.num_items))
        self.construct_matrix(self.counts)
        # Scaling by alpha stores the confidence values minus 1 (i.e. alpha * r_ui),
        # which allows Cu to be recovered as (Cu - I) + I later in the code
        self.counts *= self.alpha
        self.countsCopy = self.counts.copy()
        self.counts = sparse.csr_matrix(self.counts)
        self.train_model()
        predictions = self.predict(self.user_vectors, self.item_vectors)
        self.print_prediction(predictions)
        self.store_predictions(predictions)

    def predict(self, user_vectors, item_vectors):
        # For each index in the reconstructed matrix, calculate the prediction with a dot product
        predictions = np.zeros((self.num_users, self.num_items))
        for i in range(self.num_users):
            for j in range(self.num_items):
                predictions[i][j] = self.user_vectors[i].T.dot(self.item_vectors[j])
        return predictions

    def construct_matrix(self, counts):
        # Give all users an index into the matrix
        for i, user in enumerate(self.users):
            self.user_index[user['_id']] = i
            self.index_user[i] = user['_id']
        # Give all items an index into the matrix
        for i, item in enumerate(self.items):
            self.item_index[item['_id']] = i
            self.index_item[i] = item['_id']
            n = self.item_index[item['_id']]
            # For each user rating of this item, find the user's index and put a 1 in the matrix
            for user in item['user_ratings']:
                m = self.user_index[user['user']]
                counts[m][n] = 1

    def getItemName(self, objectid):
        item = self.client.current_db.media.find_one({"_id": objectid})
        return item['name'] + ' - ' + item['link']

    def getUserName(self, userid):
        user = self.client.current_db.users.find_one({"_id": userid})
        return user['name']

    # Recalculates the user factor and item factor vectors for a fixed number of iterations using ALS
    def train_model(self):
        # Initialise to random noise
        self.user_vectors = np.random.normal(size=(self.num_users, self.num_factors))
        self.item_vectors = np.random.normal(size=(self.num_items, self.num_factors))
        # For each iteration
        for i in xrange(self.num_iterations):
            t0 = time.time()
            # Fix the item vectors and solve for the user vectors
            print 'Solving for user vectors...'
            self.user_vectors = self.iteration(True, sparse.csr_matrix(self.item_vectors))
            # Fix the user vectors and solve for the item vectors
            print 'Solving for item vectors...'
            self.item_vectors = self.iteration(False, sparse.csr_matrix(self.user_vectors))
            t1 = time.time()
            print 'iteration %i finished in %f seconds' % (i + 1, t1 - t0)

    def iteration(self, user, fixed_vecs):
        # Number of user / item vectors being solved for
        num_solve = self.num_users if user else self.num_items
        # Size of the fixed matrix
        num_fixed = fixed_vecs.shape[0]
        # Precalculate the matrices that do not depend on u
        # Y^T Y
        YTY = fixed_vecs.T.dot(fixed_vecs)
        # Identity matrix
        eye = sparse.eye(num_fixed)
        # lambda * I
        lambda_eye = self.reg_param * sparse.eye(self.num_factors)
        # Initialise a matrix to hold the recomputed factor vectors
        solve_vecs = np.zeros((num_solve, self.num_factors))
        t = time.time()
        # For each user / item that needs recalculating
        for i in xrange(num_solve):
            if user:
                # If recomputing user vectors, retrieve that user's ratings
                counts_i = self.counts[i].toarray()
            else:
                # Otherwise, if recomputing item vectors, get all ratings for item i
                counts_i = self.counts[:, i].T.toarray()
            # CuI holds Cu - I on its diagonal (the stored counts are alpha * r_ui)
            CuI = sparse.diags(counts_i, [0])
            # Set the preferences p_u from the confidence values
            pu = counts_i.copy()
            pu[np.where(pu != 0)] = 1.0
            # Solve (Y^T Y + Y^T (Cu - I) Y + lambda I) x_u = Y^T Cu p_u
            YTCuIY = fixed_vecs.T.dot(CuI).dot(fixed_vecs)
            YTCupu = fixed_vecs.T.dot(CuI + eye).dot(sparse.csr_matrix(pu).T)
            xu = spsolve(YTY + YTCuIY + lambda_eye, YTCupu)
            solve_vecs[i] = xu
        return solve_vecs

    def print_prediction(self, predictions):
        for i in range(0, len(predictions)):
            for j, x in enumerate(predictions[i]):
                # If we know they already like it, set the prediction to 0 so it is not suggested
                if self.countsCopy[i][j] != 0:
                    predictions[i][j] = 0
            # Take the top tenth of the positively-predicted items for user i and look up their names
            n = len(np.where(predictions[i] > 0)[0]) / 10
            topn = np.argpartition(predictions[i], -n)[-n:]
            topn[:] = topn[::-1]
            items = []
            for k in topn:
                items.append(self.getItemName(self.index_item[k]))
            print self.getUserName(self.index_user[i]) + '\n' + str(items)
            print

    def store_predictions(self, predictions):
        for i in range(0, len(predictions)):
            for j, x in enumerate(predictions[i]):
                # If we know they already like it, set the prediction to -1 so it is not suggested
                # (countsCopy holds the alpha-scaled counts, so test against != 0 rather than == 1)
                if self.countsCopy[i][j] != 0:
                    predictions[i][j] = -1
            user = self.client.current_db.users.find_one({'_id': self.index_user[i]})
            media = []
            n = len(np.where(predictions[i] > 0)[0]) / 10
            topn = np.argpartition(predictions[i], -n)[-n:]
            topn[:] = topn[::-1]
            for index in topn:
                item_id = self.index_item[index]
                media.append({'user': user['_id'], 'media': item_id})
            self.client.store_prioritised_media(user, media, 'implicit')
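
# Sketch (not part of the original listing): the nested loops in predict() above are
# equivalent to a single matrix product P = U V^T. This standalone helper shows the
# vectorised form; it assumes user_vectors and item_vectors are the dense numpy arrays
# produced by train_model().
def predict_vectorised(user_vectors, item_vectors):
    # P[i][j] == user_vectors[i].dot(item_vectors[j]) for every user i and item j
    return user_vectors.dot(item_vectors.T)
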
class Stager:

    def __init__(self):
        self.client = MongoConnector()

    def process(self):
        relation_collection = self.client.current_db.relations
        user_collection = self.client.current_db.users
        user_graphs_collection = self.client.current_db.user_graphs
        for relation in relation_collection.find():
            self.stage_user(relation)
        self.prioritise_media()

    def stage_user(self, relation):
        ranks = self.rank_order(relation)
        self.client.store_rankings(relation, ranks)

    def rank_order(self, relation):
        # Create a list of all users who are related to this user in any way
        ranks = [{'user': x} for x in set([y['user'] for y in (relation['similar'] + relation['direct'])])]
        for user_links in relation['similar']:
            for rank in ranks:
                if rank['user'] == user_links['user']:
                    rank['similar'] = len(user_links['links'])
                    break
        for user_links in relation['direct']:
            for rank in ranks:
                if rank['user'] == user_links['user']:
                    rank['direct'] = len(user_links['links'])
                    break
        # Direct links are weighted ten times more heavily than similar links
        for rank in ranks:
            if 'similar' in rank.keys():
                if 'direct' in rank.keys():
                    rank['total'] = int(rank['similar']) + (int(rank['direct']) * 10)
                else:
                    rank['total'] = int(rank['similar'])
            elif 'direct' in rank.keys():
                rank['total'] = int(rank['direct']) * 10
            else:
                rank['total'] = 0
        return sorted(ranks, key=lambda rank: rank['total'], reverse=True)

    def prioritise_media(self):
        user_media_collection = self.client.current_db.user_media
        user_graphs_collection = self.client.current_db.user_graphs
        for user in user_graphs_collection.find():
            media = []
            # For each user here, put a load of media in the staged_media table
            if len(user[u'ranks']) > 0:
                # For each media item, store the user and then their media with a priority attached
                for user_media_item in user_media_collection.find():
                    for rating in user_media_item['user_ratings']:
                        for u in user[u'ranks']:
                            if rating[u'user'] == u[u'user']:
                                # Here we have a media item with a rating by a user with a priority
                                media.append({'user': user[u'_id'],
                                              'media': user_media_item[u'_id'],
                                              'similarity': u['total'],
                                              'similar_user': u['user']})
            if len(media) > 0:
                self.client.store_prioritised_media(user, media)
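
# Minimal driver sketch (assumed, not part of the original listing): the order of the two
# steps is an assumption. It simply runs the graph-based staging and then the implicit
# feedback factorisation, using the process() method each class already defines.
if __name__ == '__main__':
    Stager().process()
    ImplicitMF().process()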