Example no. 1
0
class ImplicitMF():
    """Implicit-feedback matrix factorisation via ALS.

    Follows the Hu/Koren/Volinsky "Collaborative Filtering for Implicit
    Feedback Datasets" scheme: binary listen/like events are turned into
    confidence-weighted counts, then user and item factor vectors are
    alternately re-solved for a fixed number of iterations.
    """

    def __init__(self):
        # ALS hyper-parameters.
        self.num_factors = 40      # latent factor dimensionality
        self.num_iterations = 30   # number of ALS sweeps
        self.reg_param = 5.0       # L2 regularisation (lambda)
        self.alpha = 10.0          # confidence scaling for implicit counts
        self.client = MongoConnector()

        # Maps user ids to matrix indexes
        self.user_index = {}
        # Maps item ids to matrix indexes
        self.item_index = {}
        # Maps matrix indexes to item ids
        self.index_item = {}
        # Maps matrix indexes to user ids
        self.index_user = {}
        self.errors = []

    def process(self):
        """End-to-end run: load data, factorise, predict, print and store."""
        self.users = self.client.current_db.users.find()
        # Matrix factorization only applies to media items in the system, not FB or Twitter
        self.items = self.client.current_db.user_media.find({'$or' : [{'link': {'$regex' : ".*spotify.*"}},
                                      {'link': {'$regex' : ".*soundcloud.*"}},
                                      {'link': {'$regex' : ".*youtube.*"}},
                                      {'link': {'$regex' : ".*vimeo.*"}}] })
        self.num_users = self.users.count()
        self.num_items = self.items.count()
        self.counts = np.zeros((self.num_users, self.num_items))
        self.construct_matrix(self.counts)
        # Scale the 0/1 ratings by alpha: these are the confidence values
        # minus 1 (the r_ui values), which allows Cu to be notated as
        # Cu + I later on in the code.
        self.counts *= self.alpha
        self.countsCopy = self.counts.copy()
        self.counts = sparse.csr_matrix(self.counts)
        self.train_model()
        predictions = self.predict(self.user_vectors, self.item_vectors)
        self.print_prediction(predictions)
        self.store_predictions(predictions)

    def predict(self, user_vectors, item_vectors):
        """Reconstruct the full (num_users x num_items) score matrix.

        Each entry is the dot product of a user factor vector with an
        item factor vector.  BUGFIX: the file previously contained two
        definitions of this method; the dead first copy dereferenced an
        undefined global ``mf``.  This single version uses the vectors
        actually passed in.
        """
        predictions = np.zeros((self.num_users, self.num_items))
        for i in range(self.num_users):
            for j in range(self.num_items):
                predictions[i][j] = user_vectors[i].T.dot(item_vectors[j])

        return predictions

    def construct_matrix(self, counts):
        """Fill ``counts`` with 1s where a user has rated an item.

        Also builds the four id<->index maps used everywhere else.
        """
        # Give all users an index for the matrix.
        for i, user in enumerate(self.users):
            self.user_index[user['_id']] = i
            self.index_user[i] = user['_id']

        # Give all items an index for the matrix.
        for i, item in enumerate(self.items):
            self.item_index[item['_id']] = i
            self.index_item[i] = item['_id']

            n = self.item_index[item['_id']]

            # For each user rating for this item, find the user's index
            # and put a 1 in the matrix.
            for user in (item['user_ratings']):
                m = self.user_index[user['user']]
                counts[m][n] = 1

    def getItemName(self, objectid):
        """Return a human-readable "name - link" label for a media item."""
        item = self.client.current_db.media.find_one({"_id" : objectid})
        return item['name'] + ' - ' + item['link']

    def getUserName(self, userid):
        """Return the display name for a user id."""
        user = self.client.current_db.users.find_one({"_id" : userid})
        return user['name']

    # Recalculates user factor and item factor vectors for a fixed number of iterations using ALS
    def train_model(self):
        # Initialise to random noise.
        self.user_vectors = np.random.normal(size=(self.num_users,
                                                   self.num_factors))
        self.item_vectors = np.random.normal(size=(self.num_items,
                                                   self.num_factors))

        # Alternate: fix one side, solve the other, for each iteration.
        for i in range(self.num_iterations):
            t0 = time.time()
            # Fix item vectors and solve for user vectors.
            print('Solving for user vectors...')
            self.user_vectors = self.iteration(True, sparse.csr_matrix(self.item_vectors))
            # Fix user vectors and solve for item vectors.
            print('Solving for item vectors...')
            self.item_vectors = self.iteration(False, sparse.csr_matrix(self.user_vectors))
            t1 = time.time()

            print('iteration %i finished in %f seconds' % (i + 1, t1 - t0))

    def iteration(self, user, fixed_vecs):
        """One half-sweep of ALS.

        If ``user`` is True, solves every user vector against the fixed
        item vectors; otherwise solves every item vector against the
        fixed user vectors.  Returns the re-solved factor matrix.
        """
        # Number of user / item vectors we are solving for.
        num_solve = self.num_users if user else self.num_items
        # Size of the fixed matrix.
        num_fixed = fixed_vecs.shape[0]

        # Precalculate matrices that don't depend on u.

        # Y^t Y calculated once per sweep.
        YTY = fixed_vecs.T.dot(fixed_vecs)

        # Identity matrix
        eye = sparse.eye(num_fixed)

        # lambda * I regularisation term.
        lambda_eye = self.reg_param * sparse.eye(self.num_factors)
        # Initialise a vector to store results of the recomputed factor vectors.
        solve_vecs = np.zeros((num_solve, self.num_factors))

        t = time.time()

        # For each item / user we need to recalculate for.
        for i in range(num_solve):
            if user:
                # If recomputing user vectors, retrieve their ratings.
                counts_i = self.counts[i].toarray()
            else:
                # Else, if recomputing item vectors, get all ratings for item i.
                counts_i = self.counts[:, i].T.toarray()
            CuI = sparse.diags(counts_i, [0])
            pu = counts_i.copy()
            # Setting preferences from c values: any non-zero count means
            # the preference p_ui is 1.
            pu[np.where(pu != 0)] = 1.0
            # Solve (Y^T Y + Y^T (Cu - I) Y + lambda I) x_u = Y^T Cu p_u.
            YTCuIY = fixed_vecs.T.dot(CuI).dot(fixed_vecs)
            YTCupu = fixed_vecs.T.dot(CuI + eye).dot(sparse.csr_matrix(pu).T)
            xu = spsolve(YTY + YTCuIY + lambda_eye, YTCupu)
            solve_vecs[i] = xu

        return solve_vecs

    def _top_indices(self, row):
        """Return the indices of the top 10% positively-scored entries of
        ``row``, best first; empty when nothing scores above zero.

        BUGFIX: with n == 0 the old inline code did
        ``argpartition(row, -0)[-0:]`` which returns *every* index; we
        now recommend nothing in that case.  ``//`` keeps the division
        integral on both Python 2 and 3.
        """
        n = len(np.where(row > 0)[0]) // 10
        if n <= 0:
            return np.array([], dtype=int)
        topn = np.argpartition(row, -n)[-n:]
        return topn[::-1]

    def print_prediction(self, predictions):
        """Print each user's top recommended item names.

        NOTE: mutates ``predictions`` in place (zeroes already-rated items
        so they are never suggested back).
        """
        for i in range(len(predictions)):
            for j in range(len(predictions[i])):
                # If we know they already like it, set the prediction to 0
                # so it's not suggested again.
                if self.countsCopy[i][j] != 0:
                    predictions[i][j] = 0

            # Find the top items for user i, translating ids to names.
            items = []
            for k in self._top_indices(predictions[i]):
                items.append(self.getItemName(self.index_item[k]))

            print(self.getUserName(self.index_user[i]) + '\n' + str(items))
            print('')

    def store_predictions(self, predictions):
        """Persist each user's top recommendations via the Mongo client.

        NOTE: mutates ``predictions`` in place.
        """
        for i in range(len(predictions)):
            for j in range(len(predictions[i])):
                # BUGFIX: countsCopy holds 0 or alpha (10.0) after the
                # `*= alpha` in process(), never 1, so the old `== 1`
                # test never masked known items.  Use != 0 to match
                # print_prediction.
                if self.countsCopy[i][j] != 0:
                    predictions[i][j] = -1

            user = self.client.current_db.users.find_one({'_id' : self.index_user[i]})

            media = []
            for index in self._top_indices(predictions[i]):
                item_id = self.index_item[index]
                media.append({'user' : user['_id'], 'media' : item_id})

            self.client.store_prioritised_media(user, media, 'implicit')
Example no. 2
0
class Stager:
    """Ranks each user's related users and stages their media items,
    weighted by how similar the rating user is."""

    def __init__(self):
        # Mongo access wrapper shared by every stage of the pipeline.
        self.client = MongoConnector()

    def process(self):
        """Rank every stored relation, then prioritise media for all users."""
        relation_collection = self.client.current_db.relations

        for relation in relation_collection.find():
            self.stage_user(relation)

        self.prioritise_media()

    def stage_user(self, relation):
        """Compute and persist the similarity ranking for one relation."""
        ranks = self.rank_order(relation)
        self.client.store_rankings(relation, ranks)

    def rank_order(self, relation):
        """Score every user related to this relation's owner.

        A direct link is worth 10x a similar link.  Returns the rank
        dicts sorted by descending total score.
        """
        # One rank entry per distinct related user.
        # BUGFIX/perf: the old code scanned the whole ranks list for
        # every link entry (with a no-op `continue` where `break` was
        # intended); a dict keyed on user id makes each lookup O(1).
        ranks = {}
        for entry in relation['similar'] + relation['direct']:
            ranks.setdefault(entry['user'], {'user': entry['user']})

        for user_links in relation['similar']:
            ranks[user_links['user']]['similar'] = len(user_links['links'])

        for user_links in relation['direct']:
            ranks[user_links['user']]['direct'] = len(user_links['links'])

        # Direct connections weigh ten times a similar connection.
        for rank in ranks.values():
            rank['total'] = rank.get('similar', 0) + rank.get('direct', 0) * 10

        return sorted(ranks.values(), key=lambda rank: rank['total'], reverse=True)

    def prioritise_media(self):
        """Stage every rated media item for each user, tagged with the
        similarity score of the user who rated it."""
        user_media_collection = self.client.current_db.user_media
        user_graphs_collection = self.client.current_db.user_graphs

        for user in user_graphs_collection.find():
            # Index this user's rankings once: rater id -> total score.
            # (Perf: the old code re-scanned the ranks list per rating.)
            totals = {r[u'user']: r['total'] for r in user[u'ranks']}
            if not totals:
                continue

            media = []
            # For each media item, store the user and then the item with
            # a priority attached when it was rated by a ranked user.
            for user_media_item in user_media_collection.find():
                for rating in user_media_item['user_ratings']:
                    rater = rating[u'user']
                    if rater in totals:
                        # A media item rated by a user with a priority.
                        media.append({'user' : user[u'_id'], 'media' : user_media_item[u'_id'], 'similarity' : totals[rater], 'similar_user' : rater})

            if len(media) > 0:
                self.client.store_prioritised_media(user, media)