def top_recommendations_poisson(top_k=1000):
    """Score the Poisson model's per-user top-k recommendation lists.

    A movie's predicted score for a user is the dot product of the user's
    ``theta`` row with the movie's ``beta`` row.  The ``top_k`` best-scoring
    movies form the user's recommendation list, and the user's score is the
    fraction of their test-set movies that appear in that list.

    Args:
        top_k: Length of each recommendation list.  Defaults to 1000, the
            value that was previously hard-coded.

    Returns:
        The mean per-user fraction, taken over users with at least one test
        review; 0.0 when no user has any test review.

    Raises:
        ValueError: If ``top_k`` is not positive.
    """
    if top_k <= 0:
        # A ``[-0:]`` slice would silently select every movie.
        raise ValueError("top_k must be a positive integer")

    params = get_best_params("poisson")
    info = getMeta()
    # assumes theta is (users, factors) and beta is (movies, factors)
    # -- TODO confirm against get_best_params
    theta = np.asarray(params["theta"])
    beta = np.asarray(params["beta"])[:info["movies"]]
    reviews = get_test_reviews()

    precision = 0.0
    num_users = 0
    for user in range(info["users"]):
        test_movies = reviews[user, :].nonzero()[0]
        if len(test_movies) == 0:
            # Users without test reviews never contribute to the average,
            # so there is no point in ranking movies for them.
            continue

        # One matrix-vector product replaces a Python-level dot per movie.
        scores = np.dot(beta, theta[user, :])
        # mergesort is stable, so equal scores keep ascending movie order,
        # the same tie-breaking as sorting (movie, score) pairs by score.
        ranked = np.argsort(scores, kind="mergesort")
        recommended = set(ranked[-top_k:].tolist())

        hits = sum(1 for movie in test_movies if int(movie) in recommended)
        precision += float(hits) / len(test_movies)
        num_users += 1

    if num_users == 0:
        # Nothing to average over; avoid a ZeroDivisionError.
        return 0.0
    return precision / num_users
def top_recommendations_lda(top_k=1000):
    """Score the LDA model's per-user top-k recommendation lists.

    Each movie is assigned its highest-weight topic in ``phi``.  A user's
    predicted rating for a movie is the expectation of the rating values
    0..5 under ``kappa[:, user, topic]`` normalised to sum to one.  The
    ``top_k`` best-rated movies form the recommendation list, and the user's
    score is the fraction of their test-set movies that appear in it.

    Args:
        top_k: Length of each recommendation list.  Defaults to 1000, the
            value that was previously hard-coded.

    Returns:
        The mean per-user fraction, taken over users with at least one test
        review; 0.0 when no user has any test review.

    Raises:
        ValueError: If ``top_k`` is not positive.
    """
    if top_k <= 0:
        # A ``[-0:]`` slice would silently select every movie.
        raise ValueError("top_k must be a positive integer")

    params = get_best_params("lda")
    info = getMeta()
    # assumes phi is (movies, topics) and kappa is (6, users, topics)
    # -- TODO confirm against get_best_params
    phi = params["phi"]
    kappa = params["kappa"]
    reviews = get_test_reviews()
    rating_values = np.asarray([0, 1.0, 2.0, 3.0, 4.0, 5.0])

    # A movie's dominant topic does not depend on the user: compute it once
    # instead of once per (user, movie) pair.
    movie_topics = np.argmax(np.asarray(phi)[:info["movies"]], axis=1)
    used_topics, topic_slot = np.unique(movie_topics, return_inverse=True)

    precision = 0.0
    num_users = 0
    for user in range(info["users"]):
        test_movies = reviews[user, :].nonzero()[0]
        if len(test_movies) == 0:
            # Users without test reviews never contribute to the average.
            continue

        # All movies sharing a topic get the same expected rating for this
        # user, so evaluate the expectation once per topic actually in use.
        # NOTE(review): if kappa holds integer counts, `/` floors under
        # Python 2 without `from __future__ import division` -- confirm.
        topic_scores = np.asarray([
            np.dot(kappa[:, user, topic] / np.sum(kappa[:, user, topic]),
                   rating_values)
            for topic in used_topics
        ])
        scores = topic_scores[topic_slot]

        # mergesort is stable, so equal scores keep ascending movie order,
        # the same tie-breaking as sorting (movie, score) pairs by score.
        ranked = np.argsort(scores, kind="mergesort")
        recommended = set(ranked[-top_k:].tolist())

        hits = sum(1 for movie in test_movies if int(movie) in recommended)
        precision += float(hits) / len(test_movies)
        num_users += 1

    if num_users == 0:
        # Nothing to average over; avoid a ZeroDivisionError.
        return 0.0
    return precision / num_users
def top_recommendations_lda(top_k=1000):
    """Score the LDA model's per-user top-k recommendation lists.

    Each movie is assigned its highest-weight topic in ``phi``.  A user's
    predicted rating for a movie is the expectation of the rating values
    0..5 under ``kappa[:, user, topic]`` normalised to sum to one.  The
    ``top_k`` best-rated movies form the recommendation list, and the user's
    score is the fraction of their test-set movies that appear in it.

    Args:
        top_k: Length of each recommendation list.  Defaults to 1000, the
            value that was previously hard-coded.

    Returns:
        The mean per-user fraction, taken over users with at least one test
        review; 0.0 when no user has any test review.

    Raises:
        ValueError: If ``top_k`` is not positive.
    """
    if top_k <= 0:
        # A ``[-0:]`` slice would silently select every movie.
        raise ValueError("top_k must be a positive integer")

    params = get_best_params("lda")
    info = getMeta()
    # assumes phi is (movies, topics) and kappa is (6, users, topics)
    # -- TODO confirm against get_best_params
    phi = params["phi"]
    kappa = params["kappa"]
    reviews = get_test_reviews()
    rating_values = np.asarray([0, 1.0, 2.0, 3.0, 4.0, 5.0])

    # A movie's dominant topic does not depend on the user: compute it once
    # instead of once per (user, movie) pair.
    movie_topics = np.argmax(np.asarray(phi)[:info["movies"]], axis=1)
    used_topics, topic_slot = np.unique(movie_topics, return_inverse=True)

    precision = 0.0
    num_users = 0
    for user in range(info["users"]):
        test_movies = reviews[user, :].nonzero()[0]
        if len(test_movies) == 0:
            # Users without test reviews never contribute to the average.
            continue

        # All movies sharing a topic get the same expected rating for this
        # user, so evaluate the expectation once per topic actually in use.
        # NOTE(review): if kappa holds integer counts, `/` floors under
        # Python 2 without `from __future__ import division` -- confirm.
        topic_scores = np.asarray([
            np.dot(kappa[:, user, topic] / np.sum(kappa[:, user, topic]),
                   rating_values)
            for topic in used_topics
        ])
        scores = topic_scores[topic_slot]

        # mergesort is stable, so equal scores keep ascending movie order,
        # the same tie-breaking as sorting (movie, score) pairs by score.
        ranked = np.argsort(scores, kind="mergesort")
        recommended = set(ranked[-top_k:].tolist())

        hits = sum(1 for movie in test_movies if int(movie) in recommended)
        precision += float(hits) / len(test_movies)
        num_users += 1

    if num_users == 0:
        # Nothing to average over; avoid a ZeroDivisionError.
        return 0.0
    return precision / num_users
def __init__(self, numTopics, alpha, beta, gamma):
    """Configure logging, store the settings and randomly initialise counts.

    Every observed (user, movie) pair in the training matrix is given a
    random topic, and all count tables are updated to reflect that
    assignment.

    Args:
        numTopics: Number of topics; sizes the topic axis of every table.
        alpha: Stored unchanged as ``self.alpha`` (presumably a prior
            hyper-parameter -- verify against the sampler code).
        beta: Stored unchanged as ``self.beta`` (same caveat).
        gamma: Stored unchanged as ``self.gamma`` (same caveat).
    """
    # Setup logger
    self.log = logging.getLogger("Gibbs")
    self.log.setLevel(logging.DEBUG)
    if not self.log.handlers:
        # getLogger returns the same logger object on every call, so
        # attaching handlers unconditionally would make each additional
        # instance duplicate every log line.
        formatter = logging.Formatter("%(asctime)s %(message)s",
                                      datefmt="%m/%d/%Y %I:%M:%S %p")
        fh = logging.handlers.TimedRotatingFileHandler(
            "logs/gibbs.log", when="D", interval=1, backupCount=10)
        ch = logging.StreamHandler()
        for handler in (fh, ch):
            handler.setFormatter(formatter)
            self.log.addHandler(handler)

    self.numTopics = numTopics
    self.alpha = alpha
    self.beta = beta
    self.gamma = gamma

    self.info = getMeta()
    num_users = self.info["users"]
    num_movies = self.info["movies"]

    self.user_movies, _ = get_split_review_mats()
    user_indices, movie_indices = self.user_movies.nonzero()
    # Materialise the pairs: on Python 3 a bare zip() is a one-shot iterator
    # that would be empty after the initialisation loop below.
    self.user_movie_indices = list(zip(user_indices, movie_indices))

    # The builtin int is what the removed np.int alias pointed to.
    self.CountMT = np.zeros((num_movies, numTopics), dtype=int)
    # ratings 1-5 and 0
    self.CountRUT = np.zeros((6, num_users, numTopics), dtype=int)
    self.CountUT = np.zeros((num_users, numTopics), dtype=int)
    self.topic_assignments = np.zeros((num_users, num_movies), dtype=int)

    # Normalization factors
    self.CountT = np.zeros(numTopics, dtype=int)
    self.CountU = np.zeros(num_users, dtype=int)
    self.CountRU = np.zeros((6, num_users), dtype=int)

    for userid, movieid in self.user_movie_indices:
        # NOTE(review): covers every topic only if randint is
        # random.randint (inclusive upper bound) -- confirm the import.
        topic = randint(0, numTopics - 1)
        # The rating is used as an array index, so it must be an integer
        # even if the review matrix stores floats.
        rating = int(self.user_movies[userid, movieid])

        self.topic_assignments[userid, movieid] = topic
        self.CountMT[movieid, topic] += 1
        self.CountRUT[rating, userid, topic] += 1
        self.CountUT[userid, topic] += 1
        self.CountT[topic] += 1
        self.CountU[userid] += 1
        self.CountRU[rating, userid] += 1
def __init__(self, numTopics, alpha, beta, gamma):
    """Configure logging, store the settings and randomly initialise counts.

    Every observed (user, movie) pair in the training matrix is given a
    random topic, and all count tables are updated to reflect that
    assignment.

    Args:
        numTopics: Number of topics; sizes the topic axis of every table.
        alpha: Stored unchanged as ``self.alpha`` (presumably a prior
            hyper-parameter -- verify against the sampler code).
        beta: Stored unchanged as ``self.beta`` (same caveat).
        gamma: Stored unchanged as ``self.gamma`` (same caveat).
    """
    # Setup logger
    self.log = logging.getLogger("Gibbs")
    self.log.setLevel(logging.DEBUG)
    if not self.log.handlers:
        # getLogger returns the same logger object on every call, so
        # attaching handlers unconditionally would make each additional
        # instance duplicate every log line.
        formatter = logging.Formatter("%(asctime)s %(message)s",
                                      datefmt="%m/%d/%Y %I:%M:%S %p")
        fh = logging.handlers.TimedRotatingFileHandler(
            "logs/gibbs.log", when="D", interval=1, backupCount=10)
        ch = logging.StreamHandler()
        for handler in (fh, ch):
            handler.setFormatter(formatter)
            self.log.addHandler(handler)

    self.numTopics = numTopics
    self.alpha = alpha
    self.beta = beta
    self.gamma = gamma

    self.info = getMeta()
    num_users = self.info["users"]
    num_movies = self.info["movies"]

    self.user_movies, _ = get_split_review_mats()
    user_indices, movie_indices = self.user_movies.nonzero()
    # Materialise the pairs: on Python 3 a bare zip() is a one-shot iterator
    # that would be empty after the initialisation loop below.
    self.user_movie_indices = list(zip(user_indices, movie_indices))

    # The builtin int is what the removed np.int alias pointed to.
    self.CountMT = np.zeros((num_movies, numTopics), dtype=int)
    # ratings 1-5 and 0
    self.CountRUT = np.zeros((6, num_users, numTopics), dtype=int)
    self.CountUT = np.zeros((num_users, numTopics), dtype=int)
    self.topic_assignments = np.zeros((num_users, num_movies), dtype=int)

    # Normalization factors
    self.CountT = np.zeros(numTopics, dtype=int)
    self.CountU = np.zeros(num_users, dtype=int)
    self.CountRU = np.zeros((6, num_users), dtype=int)

    for userid, movieid in self.user_movie_indices:
        # NOTE(review): covers every topic only if randint is
        # random.randint (inclusive upper bound) -- confirm the import.
        topic = randint(0, numTopics - 1)
        # The rating is used as an array index, so it must be an integer
        # even if the review matrix stores floats.
        rating = int(self.user_movies[userid, movieid])

        self.topic_assignments[userid, movieid] = topic
        self.CountMT[movieid, topic] += 1
        self.CountRUT[rating, userid, topic] += 1
        self.CountUT[userid, topic] += 1
        self.CountT[topic] += 1
        self.CountU[userid] += 1
        self.CountRU[rating, userid] += 1
def test_lda():
    """Return the RMSE of the LDA model's predictions on the test reviews.

    For each non-zero test entry the movie is mapped to its highest-weight
    topic in ``phi``; the predicted rating is the expectation of the rating
    values 0..5 under ``kappa[:, user, topic]`` normalised to sum to one.

    Returns:
        The root-mean-square error over all non-zero test entries, or NaN
        when the test matrix has no non-zero entry.
    """
    params = get_best_params("lda")
    # assumes phi is (movies, topics) and kappa is (6, users, topics)
    # -- TODO confirm against get_best_params
    phi = params["phi"]
    kappa = params["kappa"]
    reviews = get_test_reviews()
    rating_values = np.asarray([0, 1.0, 2.0, 3.0, 4.0, 5.0])

    squared_error = 0.0
    count = 0
    # zip() is available on both Python 2 and 3, unlike itertools.izip.
    for user, movie in zip(*reviews.nonzero()):
        topic = np.argmax(phi[movie, :])
        rating_dist = kappa[:, user, topic]
        estimated_rating = np.dot(rating_dist / np.sum(rating_dist),
                                  rating_values)
        squared_error += (reviews[user, movie] - estimated_rating) ** 2
        count += 1

    if count == 0:
        # An RMSE over zero samples is undefined; report NaN instead of
        # failing with a ZeroDivisionError.
        return float("nan")
    return math.sqrt(squared_error / count)
def test_lda():
    """Return the RMSE of the LDA model's predictions on the test reviews.

    For each non-zero test entry the movie is mapped to its highest-weight
    topic in ``phi``; the predicted rating is the expectation of the rating
    values 0..5 under ``kappa[:, user, topic]`` normalised to sum to one.

    Returns:
        The root-mean-square error over all non-zero test entries, or NaN
        when the test matrix has no non-zero entry.
    """
    params = get_best_params("lda")
    # assumes phi is (movies, topics) and kappa is (6, users, topics)
    # -- TODO confirm against get_best_params
    phi = params["phi"]
    kappa = params["kappa"]
    reviews = get_test_reviews()
    rating_values = np.asarray([0, 1.0, 2.0, 3.0, 4.0, 5.0])

    squared_error = 0.0
    count = 0
    # zip() is available on both Python 2 and 3, unlike itertools.izip.
    for user, movie in zip(*reviews.nonzero()):
        topic = np.argmax(phi[movie, :])
        rating_dist = kappa[:, user, topic]
        estimated_rating = np.dot(rating_dist / np.sum(rating_dist),
                                  rating_values)
        squared_error += (reviews[user, movie] - estimated_rating) ** 2
        count += 1

    if count == 0:
        # An RMSE over zero samples is undefined; report NaN instead of
        # failing with a ZeroDivisionError.
        return float("nan")
    return math.sqrt(squared_error / count)
def top_recommendations_iid(top_k=1000):
    """Score a single, user-independent top-k list of best-rated movies.

    Movies are ranked by their mean non-zero training rating.  The ``top_k``
    best are recommended to everybody, and each user's score is the
    fraction of their test-set movies that appear in that list.

    Args:
        top_k: Length of the recommendation list.  Defaults to 1000, the
            value that was previously hard-coded.

    Returns:
        The mean per-user fraction, taken over users with at least one test
        review; 0.0 when no user has any test review.

    Raises:
        ValueError: If ``top_k`` is not positive.
    """
    if top_k <= 0:
        # A ``[-0:]`` slice would silently select every movie.
        raise ValueError("top_k must be a positive integer")

    info = getMeta()
    # assumes both matrices are dense (users, movies) ndarrays -- TODO confirm
    train, reviews = get_split_review_mats()

    rating_sums = train.sum(axis=0).astype(float)
    rating_counts = (train != 0).sum(axis=0)
    # Movies without any training rating would average to 0/0 = NaN, which
    # makes a comparison sort ill-defined.  Rank them last instead.
    avg_ratings = np.full(rating_sums.shape, -np.inf)
    np.divide(rating_sums, rating_counts, out=avg_ratings,
              where=rating_counts > 0)

    # Stable ascending order: ties keep ascending movie index.
    ranked = np.argsort(avg_ratings, kind="mergesort")
    top_movies = set(ranked[-top_k:].tolist())

    precision = 0.0
    num_users = 0
    for user in range(info["users"]):
        test_movies = reviews[user, :].nonzero()[0]
        if len(test_movies) == 0:
            # Users without test reviews never contribute to the average.
            continue
        hits = sum(1 for movie in test_movies if int(movie) in top_movies)
        precision += float(hits) / len(test_movies)
        num_users += 1

    if num_users == 0:
        # Nothing to average over; avoid a ZeroDivisionError.
        return 0.0
    return precision / num_users
def top_recommendations_iid(top_k=1000):
    """Score a single, user-independent top-k list of best-rated movies.

    Movies are ranked by their mean non-zero training rating.  The ``top_k``
    best are recommended to everybody, and each user's score is the
    fraction of their test-set movies that appear in that list.

    Args:
        top_k: Length of the recommendation list.  Defaults to 1000, the
            value that was previously hard-coded.

    Returns:
        The mean per-user fraction, taken over users with at least one test
        review; 0.0 when no user has any test review.

    Raises:
        ValueError: If ``top_k`` is not positive.
    """
    if top_k <= 0:
        # A ``[-0:]`` slice would silently select every movie.
        raise ValueError("top_k must be a positive integer")

    info = getMeta()
    # assumes both matrices are dense (users, movies) ndarrays -- TODO confirm
    train, reviews = get_split_review_mats()

    rating_sums = train.sum(axis=0).astype(float)
    rating_counts = (train != 0).sum(axis=0)
    # Movies without any training rating would average to 0/0 = NaN, which
    # makes a comparison sort ill-defined.  Rank them last instead.
    avg_ratings = np.full(rating_sums.shape, -np.inf)
    np.divide(rating_sums, rating_counts, out=avg_ratings,
              where=rating_counts > 0)

    # Stable ascending order: ties keep ascending movie index.
    ranked = np.argsort(avg_ratings, kind="mergesort")
    top_movies = set(ranked[-top_k:].tolist())

    precision = 0.0
    num_users = 0
    for user in range(info["users"]):
        test_movies = reviews[user, :].nonzero()[0]
        if len(test_movies) == 0:
            # Users without test reviews never contribute to the average.
            continue
        hits = sum(1 for movie in test_movies if int(movie) in top_movies)
        precision += float(hits) / len(test_movies)
        num_users += 1

    if num_users == 0:
        # Nothing to average over; avoid a ZeroDivisionError.
        return 0.0
    return precision / num_users