Example #1

import math

import numpy as np
import pandas as pd

# MovieData is provided by the accompanying project code (module name
# assumed); it loads '../data/movies.dat' and exposes get_movies() and
# get_title():
# from movie_data import MovieData

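The two classes below implement item-item recommenders over the same ratings
DataFrame (indexed by userID, with movieID and rating columns): a Slope One
predictor and an item-based predictor with adjusted-cosine similarity.
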
class SlopeOnePredictor:
    """Slope One recommender.

    fit() precomputes, for every movie pair, the average rating deviation
    and the number of users who rated both; predict() combines a user's
    existing ratings with those deviations, weighted by that support.
    """

    def __init__(self):
        self.uim = None
        self.md = MovieData('../data/movies.dat')
        self.movies = self.md.get_movies()
        self.p_movies = None
        self.unique_rated_movies = None

    def fit(self, X):
        self.uim = X.get_df()
        self.unique_rated_movies = pd.DataFrame(self.uim["movieID"].unique(),
                                                columns=["movieID"])
        self.unique_rated_movies = self.unique_rated_movies.set_index(
            "movieID", drop=False)
        # Cartesian product of all movie pairs via a constant merge key,
        # with the diagonal (a movie paired with itself) dropped.
        self.unique_rated_movies["key"] = 1
        cartesian_p = pd.merge(self.unique_rated_movies,
                               self.unique_rated_movies,
                               on="key")[["movieID_x", "movieID_y"]]
        cartesian_p = cartesian_p[
            cartesian_p["movieID_x"] != cartesian_p["movieID_y"]]
        # Precompute (negated mean deviation, support) for every pair.
        cartesian_p["similarity_d"] = cartesian_p.apply(
            lambda x: self.similar_d(x["movieID_x"], x["movieID_y"]), axis=1)
        self.p_movies = cartesian_p

    def predict(self, user_id):
        pred = {}

        # Map each user (the DataFrame index) to the movies they rated.
        user_movies = dict(
            self.uim.groupby(self.uim.index)['movieID'].apply(list))

        if user_id not in user_movies:
            return pred

        rating_movies_from_user = pd.DataFrame(
            {'rated_movies': user_movies[user_id]})

        # Candidate movies: everything this user has not rated yet.
        non_rating_movies = self.uim[
            ~self.uim["movieID"].isin(user_movies[user_id])].drop_duplicates(
                subset=["movieID"])

        for _, value_non_rating in non_rating_movies.iterrows():
            s = []  # per-pair estimates: user's rating + mean deviation
            p = []  # per-pair weights: number of common raters
            for _, value_rating in rating_movies_from_user.iterrows():
                p1 = self.uim.loc[self.uim["movieID"] ==
                                  value_rating["rated_movies"], "rating"]
                calculated_d = self.p_movies.loc[
                    (self.p_movies["movieID_x"] == value_non_rating["movieID"])
                    &
                    (self.p_movies["movieID_y"] == value_rating["rated_movies"]
                     ), "similarity_d"].iat[0]
                # similar_d stores the negated deviation, so subtracting it
                # adds dev(non-rated, rated) to the user's rating.
                s += [p1[user_id] - calculated_d[0]]
                p += [calculated_d[1]]

            # Support-weighted average; 0.0 when no pair has common raters.
            if np.sum(np.array(p)) == 0:
                pred[int(value_non_rating["movieID"])] = 0.0
            else:
                pred[int(value_non_rating["movieID"])] = (
                    np.sum(np.array(s) * np.array(p)) / np.sum(np.array(p)))

        return pred

    def similar_d(self, p1, p2):
        """Return (negated mean deviation, support) for a movie pair.

        The mean deviation is sum(r_p1 - r_p2) / n over the n users who
        rated both movies.
        """
        # Users who rated both movies.
        p_intersection = set(
            self.uim.loc[self.uim["movieID"] == p1]["userID"].unique()
        ).intersection(
            set(self.uim.loc[self.uim["movieID"] == p2]["userID"].unique()))

        all_p = self.uim[self.uim["userID"].isin(p_intersection)]
        ratings_p1 = all_p.loc[all_p["movieID"] == p1, ["movieID", "rating"]]
        ratings_p2 = all_p.loc[all_p["movieID"] == p2, ["movieID", "rating"]]

        # Align the two rating columns on the shared user index.
        concated_movies_rating = pd.concat([ratings_p1, ratings_p2], axis=1)
        concated_movies_rating.columns = [
            "movieID_x", "rating_x", "movieID_y", "rating_y"
        ]
        concated_movies_rating["calculated_d"] = (
            concated_movies_rating["rating_x"] -
            concated_movies_rating["rating_y"])

        count = concated_movies_rating["calculated_d"].count()
        if count == 0:  # no common raters: avoid dividing by zero
            return (0.0, 0)
        return (-(concated_movies_rating["calculated_d"].sum() / count),
                count)
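
A minimal usage sketch for SlopeOnePredictor. The UserItemData wrapper and
the '../data/user_ratedmovies.dat' path are assumptions inferred from how
fit() consumes X.get_df(); substitute whatever your project actually uses.

uim = UserItemData('../data/user_ratedmovies.dat')  # hypothetical wrapper
slope_one = SlopeOnePredictor()
slope_one.fit(uim)
predictions = slope_one.predict(78)  # {movieID: predicted rating}
for movie_id, value in sorted(predictions.items(),
                              key=lambda x: x[1], reverse=True)[:5]:
    print(slope_one.md.get_title(movie_id), value)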

class ItemBasedPredictor:
    """Item-based recommender using adjusted-cosine similarity.

    Pairs with fewer than min_values common raters, or with similarity
    below threshold, are treated as dissimilar (similarity 0.0).
    """

    def __init__(self, min_values=0, threshold=0):
        self.min_values = min_values
        self.threshold = threshold
        self.uim = None
        self.md = MovieData('../data/movies.dat')
        self.movies = self.md.get_movies()
        self.p_movies = None
        self.unique_rated_movies = None

    def fit(self, X):
        # Same pair precomputation as SlopeOnePredictor.fit, but storing an
        # adjusted-cosine similarity for each movie pair.
        self.uim = X.get_df()
        self.unique_rated_movies = pd.DataFrame(self.uim["movieID"].unique(),
                                                columns=["movieID"])
        self.unique_rated_movies = self.unique_rated_movies.set_index(
            "movieID", drop=False)
        self.unique_rated_movies["key"] = 1
        cartesian_p = pd.merge(self.unique_rated_movies,
                               self.unique_rated_movies,
                               on="key")[["movieID_x", "movieID_y"]]
        cartesian_p = cartesian_p[
            cartesian_p["movieID_x"] != cartesian_p["movieID_y"]]
        cartesian_p["similarity"] = cartesian_p.apply(
            lambda x: self.similarity(x["movieID_x"], x["movieID_y"]), axis=1)
        self.p_movies = cartesian_p

    def predict(self, user_id):
        pred = {}
        user_movies = dict(
            self.uim.groupby(self.uim.index)['movieID'].apply(list))

        if user_id not in user_movies:
            return pred

        rating_movies_from_user = pd.DataFrame(
            {'rated_movies': user_movies[user_id]})

        non_rating_movies = self.uim[
            ~self.uim["movieID"].isin(user_movies[user_id])].drop_duplicates(
                subset=["movieID"])

        for _, value_non_rating in non_rating_movies.iterrows():
            s = []  # similarities to the user's rated movies
            p = []  # the user's ratings of those movies
            for _, value_rating in rating_movies_from_user.iterrows():
                p1 = self.uim.loc[self.uim["movieID"] ==
                                  value_rating["rated_movies"], "rating"]
                s += [self.p_movies.loc[
                    (self.p_movies["movieID_x"] == value_non_rating["movieID"])
                    &
                    (self.p_movies["movieID_y"] ==
                     value_rating["rated_movies"]), "similarity"].iat[0]]
                p += [p1[user_id]]

            # Similarity-weighted average; 0.0 when nothing rated is similar.
            if np.sum(np.array(s)) == 0:
                pred[int(value_non_rating["movieID"])] = 0.0
            else:
                pred[int(value_non_rating["movieID"])] = (
                    np.sum(np.array(s) * np.array(p)) / np.sum(np.array(s)))

        return pred

    def similarity(self, p1, p2):
        if p1 == p2:
            return 0.0

        # Users who rated both movies.
        p_intersection = set(
            self.uim.loc[self.uim["movieID"] == p1]["userID"].unique()
        ).intersection(
            set(self.uim.loc[self.uim["movieID"] == p2]["userID"].unique()))

        all_p = self.uim[self.uim["userID"].isin(p_intersection)]
        x_p1 = all_p.loc[all_p["movieID"] == p1]
        x_p2 = all_p.loc[all_p["movieID"] == p2]
        # Sort by the user index so zip() below pairs each user's values.
        p1 = x_p1["rating"].sort_index()
        p2 = x_p2["rating"].sort_index()
        # Each common rater's mean rating over all of their ratings.
        avg = all_p.groupby(all_p.index)["rating"].mean()

        if max(len(p1), len(p2)) < self.min_values:
            return 0.0

        # Adjusted cosine: center each rating on the user's own average.
        c = 0
        iml = 0
        imr = 0
        for ocena_1, ocena_2, user_avg in zip(p1, p2, avg):  # ocena = rating
            c += (ocena_1 - user_avg) * (ocena_2 - user_avg)
            iml += (ocena_1 - user_avg)**2
            imr += (ocena_2 - user_avg)**2

        if c <= 0:
            return 0.0
        result = c / (math.sqrt(iml) * math.sqrt(imr))

        if result < self.threshold:
            return 0.0

        return result

    def mostSimilarFilms(self):
        # Print the 20 most similar precomputed movie pairs.
        most_similar = self.p_movies.sort_values(
            by='similarity', ascending=False).head(n=20)
        for _, value in most_similar.iterrows():
            print("Film1: {}, Film2: {}, similarity: {}".format(
                self.md.get_title(value["movieID_x"]),
                self.md.get_title(value["movieID_y"]),
                value["similarity"]))

    def similarItems(self, item, n):
        # Score every movie against the given item and return the top n
        # (movieID, similarity) pairs.
        films = self.uim.groupby("movieID")[["movieID"]].apply(
            lambda x: self.similarity(item, x.name))
        return sorted(films.items(), key=lambda x: x[1], reverse=True)[:n]
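
A similar sketch for ItemBasedPredictor, under the same UserItemData
assumption as above; the constructor arguments and the movie ID 4993 are
illustrative values only.

item_based = ItemBasedPredictor(min_values=20, threshold=0)
item_based.fit(uim)
item_based.mostSimilarFilms()             # the 20 most similar pairs
print(item_based.similarItems(4993, 10))  # 10 movies most similar to 4993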