def context_users_similarity(self):
    """Compute the similarity between users using context features.

    The per-user feature matrices are cached on disk as a pickle under
    OUTPUT_PATH/pickle/ so later runs skip the (expensive) extraction step.
    Draws the document-similarity matrix for the first user ("alexis").

    NOTE(review): this method was defined twice in the original file (the
    second definition shadowed the first); the duplicate has been merged.
    Leftover ipdb set_trace() debugging code and the dead `reduced_features`
    local it inspected were removed.
    """
    filename = os.path.join(OUTPUT_PATH, "pickle",
                            "context_users_features.pickle")
    if os.path.isfile(filename):
        # pickle streams are binary data: open in "rb", not text mode
        with open(filename, "rb") as f:
            features = pickle.load(f)
    else:
        self._processor = ContextProcessor()
        features = []
        # get all the features for each user
        for user, docs in self._processor.iterate():
            features.append(self._processor.get_features(docs, user))
        # "wb" (not "w+"): binary write, and no need for read access
        with open(filename, "wb") as f:
            pickle.dump(features, f)
    # it is possible to cluster each user's documents
    #
    # for alexis, let's print the similarity matrix of his documents
    draw_matrix(euclidean_distances(features[0], features[0]),
                "context_alexis", OUTPUT_PATH)
def text_users_similarity(self): """Compute the similarity between users using text features""" processor = self._processor = TextProcessor() features = [] for user, docs in processor.iterate(): features.append(processor.get_features(docs, user)) # draw the matrix for alexis draw_matrix(euclidean_distances(features[0], features[0]), "text_alexis", OUTPUT_PATH)
def compute_similarities(self): print "generated %s" % draw_matrix( self.text_users_similarity(), "users", self.output_path) print "generated %s" % draw_matrix( self.text_profiles_similarity(), "profiles", self.output_path)
def collaborative_filtering(usernames, similarity, rankings, urls, N):
    """Do the collaborative filtering for the given usernames, rankings and
    similarity between profiles.

    :usernames: the list of usernames in the system
    :similarity: The similarity matrix for all the profiles
                 (row i = similarity of profile i to every other profile;
                 profiles i in [u*N, (u+1)*N) belong to user u)
    :rankings: An array of [username][url] = score
    :urls: per-profile iterables of urls, indexed by profile id
    :N: the number of profiles used per user
    :returns: dict mapping username -> list of (url, weighted score) pairs,
              sorted by score, best first

    Fixes over the original:
    - the own-profile exclusion used ``i > N * user_id``, an off-by-one that
      also excluded the NEXT user's first profile; own profiles are indices
      ``N*(user_id-1) .. N*user_id - 1`` so the keep-condition is
      ``i >= N * user_id``
    - ``sorted(...)`` returns a new list; the original discarded it, so the
      recommendations came back unsorted — the result is now assigned
    """
    # XXX Eventually split here.
    # draw the matrix for later analysis
    draw_matrix(similarity, "final_kmeans", OUTPUT_PATH)

    # For each profile, get the best matches.
    user_id = 0
    weighted_ranks = defaultdict(dict)
    # p_sim is for "profile similarity"
    for idx, p_sim in enumerate(similarity):
        if idx % N == 0:
            # we iterated over all the profiles for this user
            user_id = user_id + 1
        # ignore the profiles from the same user, keep the 10 most similar
        matching_profiles = [i for i in p_sim.argsort()[::-1]
                             if i < N * (user_id - 1)
                             or i >= N * user_id][:10]
        # for all the matching profiles, get the related urls, and construct
        # a list of weighted ranks, in the form url, rank
        for profile_id in matching_profiles:
            # // is explicit floor division (identical to / on py2 ints)
            username = usernames[profile_id // N]
            # use the collaborative filtering technique to weight all the
            # urls with the similarity scores between profiles
            for url in urls[profile_id]:
                weighted_ranks[idx][url] = \
                    rankings[username][url] * p_sim[profile_id]

    recommendations = defaultdict(dict)
    # All the urls have been ranked, now merge each user's N profiles and
    # keep the totals
    for starting_profile in range(len(weighted_ranks))[::N]:
        user_rankings = {}
        for profile_id in range(starting_profile, starting_profile + N):
            profile_rankings = weighted_ranks[profile_id]
            # sum the scores of urls seen by more than one profile
            for url in (user_rankings.viewkeys()
                        & profile_rankings.viewkeys()):
                profile_rankings[url] = \
                    profile_rankings[url] + user_rankings[url]
            user_rankings.update(profile_rankings)
        # fix: keep the sorted list (sorted() does not sort in place)
        user_rankings = sorted(user_rankings.items(),
                               key=itemgetter(1), reverse=True)
        recommendations[usernames[starting_profile // N]] = user_rankings
    return recommendations
def compute_similarities(self): print "generated %s" % draw_matrix(self.text_users_similarity(), "users", self.output_path) print "generated %s" % draw_matrix(self.text_profiles_similarity(), "profiles", self.output_path)