Beispiel #1
0
    def context_users_similarity(self):
        """Draw the distance matrix of the first user's context features.

        The per-user context features are read from the pickle cache
        ``<OUTPUT_PATH>/pickle/context_users_features.pickle`` when that file
        exists. Otherwise they are computed with a ``ContextProcessor`` (one
        entry per user, in the order the processor yields them) and written
        to the cache for the next run.

        Nothing is returned; the only result is the matrix handed to
        ``draw_matrix`` under the name ``"context_alexis"``.
        """
        filename = os.path.join(OUTPUT_PATH, "pickle",
                                "context_users_features.pickle")

        if os.path.isfile(filename):
            # Pickle streams are bytes, so the cache must be opened in binary
            # mode (text mode corrupts it on some platforms).
            # NOTE(review): pickle.load can execute code embedded in the file;
            # this is only safe as long as the cache is produced by this code.
            with open(filename, "rb") as f:
                features = pickle.load(f)
        else:
            self._processor = ContextProcessor()
            # get all the features for each user
            features = [self._processor.get_features(docs, user)
                        for user, docs in self._processor.iterate()]
            with open(filename, "wb") as f:
                pickle.dump(features, f)

        # it is possible to cluster each user's documents
        #
        # For the first user (named "alexis" in the output), draw the matrix
        # of pairwise distances between the rows of his feature set.
        draw_matrix(euclidean_distances(features[0], features[0]),
                    "context_alexis", OUTPUT_PATH)
Beispiel #2
0
    def context_users_similarity(self):
        """Draw the distance matrix of the first user's context features.

        Per-user context features come from a pickle cache stored at
        ``<OUTPUT_PATH>/pickle/context_users_features.pickle``. When the
        cache file is missing, the features are computed with a
        ``ContextProcessor`` (one entry per user yielded by ``iterate()``)
        and then saved to that file.

        The method returns nothing; it only calls ``draw_matrix`` with the
        name ``"context_alexis"``.
        """
        filename = os.path.join(OUTPUT_PATH,
                                "pickle", "context_users_features.pickle")

        if os.path.isfile(filename):
            # Binary mode is required for pickle data.
            # NOTE(review): unpickling runs code stored in the file, so the
            # cache must never come from an untrusted source.
            with open(filename, "rb") as cache:
                features = pickle.load(cache)
        else:
            self._processor = ContextProcessor()
            features = []
            # get all the features for each user
            for user, docs in self._processor.iterate():
                features.append(self._processor.get_features(docs, user))
            with open(filename, "wb") as cache:
                pickle.dump(features, cache)

        # it is possible to cluster each user's documents
        #
        # The first entry is the user called "alexis" in the output name:
        # draw the distances between every pair of rows of his features.
        first_user = features[0]
        draw_matrix(euclidean_distances(first_user, first_user),
                    "context_alexis", OUTPUT_PATH)
Beispiel #3
0
    def text_users_similarity(self):
        """Draw the distance matrix of the first user's text features.

        A fresh ``TextProcessor`` is created and also stored on
        ``self._processor``. Features are collected for every
        ``(user, docs)`` pair the processor yields, then the pairwise
        distances within the first user's features are passed to
        ``draw_matrix`` under the name ``"text_alexis"``. Returns nothing.
        """
        text_processor = TextProcessor()
        self._processor = text_processor

        per_user_features = [text_processor.get_features(docs, user)
                             for user, docs in text_processor.iterate()]

        # Only the first user (named "alexis" in the output) is drawn.
        first_user = per_user_features[0]
        distances = euclidean_distances(first_user, first_user)
        draw_matrix(distances, "text_alexis", OUTPUT_PATH)
Beispiel #4
0
    def text_users_similarity(self):
        """Plot the distances between the first user's text features.

        Creates a ``TextProcessor`` (kept on ``self._processor``), gathers
        the features of each user it iterates over, and sends the pairwise
        distance matrix of the first user's features to ``draw_matrix`` with
        the name ``"text_alexis"``. There is no return value.
        """
        worker = self._processor = TextProcessor()

        collected = []
        for owner, documents in worker.iterate():
            user_features = worker.get_features(documents, owner)
            collected.append(user_features)

        # The first collected entry is the one labelled "alexis" on output.
        target = collected[0]
        draw_matrix(euclidean_distances(target, target),
                    "text_alexis",
                    OUTPUT_PATH)
Beispiel #5
0
 def compute_similarities(self):
     """Draw the users and profiles similarity matrices and report each one.

     For each matrix, whatever ``draw_matrix`` returns is printed after the
     text "generated ".
     """
     users_output = draw_matrix(
         self.text_users_similarity(), "users", self.output_path)
     print("generated %s" % users_output)

     profiles_output = draw_matrix(
         self.text_profiles_similarity(), "profiles", self.output_path)
     print("generated %s" % profiles_output)
Beispiel #6
0
def collaborative_filtering(usernames, similarity, rankings, urls, N,
                            max_matches=10):
    """Do the collaborative filtering for the given usernames, rankings and
    similarity between profiles.

    Profiles are expected to be stored as consecutive runs of N per user, so
    that profile ``i`` belongs to ``usernames[i // N]``.

    :usernames:
        the list of usernames in the system

    :similarity:
        The similarity matrix for all the profiles. Each row must provide
        ``argsort()`` (e.g. a numpy array).

    :rankings:
        An array of [username][url] = score

    :urls:
        For each profile index, the urls related to that profile

    :N:
        the number of profiles used per user

    :max_matches:
        the number of most similar profiles (taken from other users) that
        contribute urls for each profile. Defaults to 10.

    Returns a mapping of username to a list of ``(url, weighted score)``
    pairs, sorted by decreasing score.
    """

    # XXX Eventually split here.

    # draw the matrix for later analysis
    draw_matrix(similarity, "final_kmeans", OUTPUT_PATH)

    # For each profile, get the best matches.
    weighted_ranks = defaultdict(dict)
    # p_sim is for "profile similarity"
    for idx, p_sim in enumerate(similarity):
        # Half-open range [own_start, own_end) of the profiles that belong to
        # the same user as the current profile.
        own_start = (idx // N) * N
        own_end = own_start + N

        # ignore the profiles from the same user; `>=` keeps the first
        # profile of the following user, which a strict `>` would drop
        matching_profiles = [i for i in p_sim.argsort()[::-1]
                             if i < own_start or i >= own_end][:max_matches]

        # for all the matching profiles, get the related urls, and construct a
        # list of weighted ranks, in the form url, rank
        for profile_id in matching_profiles:
            # floor division keeps the index an integer on every Python version
            username = usernames[profile_id // N]

            # use the collaborative filtering technique to weight all the urls
            # with the similarity scores between profiles
            # NOTE(review): when a url appears in several matching profiles,
            # the value from the last (least similar) one is kept -- confirm
            # this is intended.
            for url in urls[profile_id]:
                weighted_ranks[idx][url] = \
                    rankings[username][url] * p_sim[profile_id]

    recommendations = defaultdict(dict)
    # All the urls have been ranked, now merge them per user. The loop is
    # driven by the number of profiles rather than len(weighted_ranks), which
    # is smaller whenever a profile ended up without any weighted url.
    for starting_profile in range(0, len(similarity), N):
        user_rankings = {}
        for profile_id in range(starting_profile, starting_profile + N):
            # .get() avoids creating empty entries in the defaultdict
            for url, score in weighted_ranks.get(profile_id, {}).items():
                # a url recommended through several profiles sums its scores
                user_rankings[url] = user_rankings.get(url, 0) + score

        # sorted() returns a new list, so its result has to be stored
        recommendations[usernames[starting_profile // N]] = sorted(
            user_rankings.items(), key=itemgetter(1), reverse=True)

    return recommendations
Beispiel #7
0
 def compute_similarities(self):
     """Draw the "users" and "profiles" similarity matrices in turn.

     Each matrix is obtained from the matching ``text_*_similarity`` method,
     handed to ``draw_matrix`` together with ``self.output_path``, and the
     value ``draw_matrix`` returns is printed after the text "generated ".
     """
     steps = (("text_users_similarity", "users"),
              ("text_profiles_similarity", "profiles"))
     for method_name, label in steps:
         # Look the method up only when it is needed, so the calls happen in
         # the same order as if they were written out one after the other.
         matrix = getattr(self, method_name)()
         print("generated %s" % draw_matrix(matrix, label, self.output_path))