def __init__(self, min_values=0, threshold=0):
    """Set up similarity knobs and lazily-filled model state."""
    # Filtering parameters for the pairwise similarity computation.
    self.min_values = min_values
    self.threshold = threshold
    # Populated by fit(); None until then.
    self.uim = None
    self.p_movies = None
    self.unique_rated_movies = None
    # Movie metadata is loaded eagerly from the data file.
    self.md = MovieData('../data/movies.dat')
    self.movies = self.md.get_movies()
class RandomPredictor:
    """Baseline predictor: rates every movie in the catalogue with a
    uniformly random integer score in [min_ocena, max_ocena]."""

    def __init__(self, min_ocena, max_ocena):
        # Inclusive lower/upper bounds for the random ratings.
        self.min_ocena = min_ocena
        self.max_ocena = max_ocena
        # User-item data; stored by fit() but not used by predictions.
        self.uim = None
        # Movies table; rows are iterated by predict() and must carry an
        # "id" column (presumably one row per movie — TODO confirm).
        self.md = MovieData('../data/movies.dat').get_movies()

    def fit(self, X):
        """Store the user-item data (kept only for interface parity)."""
        self.uim = X

    def predict(self, user_id):
        """Return {movie id: random rating} for every movie in ``self.md``.

        ``user_id`` is ignored; it exists to match the common predictor
        interface.  The original implementation also fetched the user from
        ``self.uim`` without using the result — that dead call, and a
        commented-out alternative loop, have been removed.
        """
        return {
            movie["id"]: random.randint(self.min_ocena, self.max_ocena)
            for _, movie in self.md.iterrows()
        }
def __init__(self):
    """Initialize empty model state and load the movie metadata."""
    # Set by fit(); None until then.
    self.uim = None
    self.p_movies = None
    self.unique_rated_movies = None
    # Movie metadata loaded from disk at construction time.
    self.md = MovieData('../data/movies.dat')
    self.movies = self.md.get_movies()
class SlopeOnePredictor:
    """Slope One collaborative-filtering predictor.

    fit() precomputes, for every ordered pair of rated movies (x, y), the
    average rating difference between them plus the number of supporting
    users; predict() scores each unseen movie as the support-weighted
    average of (user's rating of a seen movie + that difference).
    """

    def __init__(self):
        # User-item ratings DataFrame; set in fit().
        self.uim = None
        self.md = MovieData('../data/movies.dat')
        self.movies = self.md.get_movies()
        # DataFrame of movie pairs with their (deviation, support) tuples.
        self.p_movies = None
        self.unique_rated_movies = None

    def fit(self, X):
        """Build the pairwise deviation table from the user-item data X."""
        self.uim = X.get_df()
        self.unique_rated_movies = pd.DataFrame(self.uim["movieID"].unique(),
                                                columns=["movieID"])
        self.unique_rated_movies = self.unique_rated_movies.set_index(
            "movieID", drop=False)
        # Constant join key: merging the table with itself on "key"
        # produces the Cartesian product of all rated movies.
        self.unique_rated_movies["key"] = 1
        cartesian_p = pd.merge(self.unique_rated_movies,
                               self.unique_rated_movies,
                               on="key")[["movieID_x", "movieID_y"]]
        # Drop self-pairs (x, x); they carry no deviation information.
        cartesian_p = cartesian_p[
            cartesian_p["movieID_x"] != cartesian_p["movieID_y"]][[
                "movieID_x", "movieID_y"
            ]]
        # Each cell is a (average deviation, common-rater count) tuple.
        cartesian_p["similarity_d"] = cartesian_p.apply(
            lambda x: self.similar_d(x["movieID_x"], x["movieID_y"]), axis=1)
        self.p_movies = cartesian_p

    def predict(self, user_id):
        """Return {movieID: predicted rating} for movies user_id has not rated.

        Returns an empty dict when user_id is absent from the fitted data.
        Assumes ``self.uim`` is indexed by userID (the groupby below and the
        ``p1[user_id]`` label lookup both rely on it) — TODO confirm against
        UserItemData.get_df().
        """
        pred = {}
        # Map each user (index value) to the list of movies they rated.
        user_movies = dict(
            self.uim.groupby(self.uim.index)['movieID'].apply(list))
        if user_id not in user_movies.keys():
            return pred
        rating_movies_from_user = pd.DataFrame(
            {'rated_movies': user_movies[user_id]})
        # One representative row per movie the user has NOT rated.
        non_rating_movies = self.uim[
            ~self.uim["movieID"].isin(user_movies[user_id])].drop_duplicates(
                subset=["movieID"])
        for key_non_rating, value_non_rating in non_rating_movies.iterrows():
            s = []  # per-pair predicted ratings (rating + deviation)
            p = []  # per-pair support counts (weights)
            for key_rating, value_rating in rating_movies_from_user.iterrows():
                # All ratings of the seen movie, indexed by userID.
                p1 = self.uim.loc[self.uim["movieID"] ==
                                  value_rating["rated_movies"], "rating"]
                # (deviation, support) for the (unseen, seen) movie pair.
                calculated_d = self.p_movies.loc[
                    (self.p_movies["movieID_x"] == value_non_rating["movieID"])
                    & (self.p_movies["movieID_y"] == value_rating["rated_movies"]
                       ), "similarity_d"].iat[0]
                # calculated_d[0] stores -avg(r_unseen - r_seen), so
                # subtracting it adds the Slope One deviation to the rating.
                s += [(p1[user_id] - calculated_d[0])]
                p += [calculated_d[1]]
            if np.sum(np.array(p)) == 0:
                # No pair has any common raters: no basis for a prediction.
                pred[int(value_non_rating["movieID"])] = 0.0
            else:
                # Support-weighted average of the per-pair predictions.
                pred[int(value_non_rating["movieID"])] = (
                    (np.sum(np.array(s) * np.array(p))) / np.sum(np.array(p)))
        return pred

    def similar_d(self, p1, p2):
        """Return (-average(r_p1 - r_p2), common-rater count) for two movies.

        Only users who rated both movies contribute.  NOTE(review): the
        axis=1 concat below aligns the two rating columns on the DataFrame
        index, which assumes one rating per (user, movie) — verify.
        """
        # Users who rated both p1 and p2.
        p_intersection = set(
            self.uim.loc[self.uim["movieID"] == p1]["userID"].unique()
        ).intersection(
            set(self.uim.loc[self.uim["movieID"] == p2]["userID"].unique()))
        all_p = self.uim[self.uim["userID"].isin(p_intersection)]
        x_p1 = (all_p.loc[all_p["movieID"] == p1])
        x_p2 = (all_p.loc[all_p["movieID"] == p2])
        # The isin() filters below are redundant (all_p is already
        # restricted to the intersection) but kept for byte-identity.
        p1 = x_p1[x_p1["userID"].isin(p_intersection)][["movieID", "rating"]]
        p2 = x_p2[x_p2["userID"].isin(p_intersection)][["movieID", "rating"]]
        # Align the two movies' ratings side by side on the shared index.
        concated_movies_rating = pd.concat([p1, p2], axis=1)
        concated_movies_rating.columns = [
            "movieID_x", "rating_x", "movieID_y", "rating_y"
        ]
        concated_movies_rating["calculated_d"] = concated_movies_rating.apply(
            lambda x: x["rating_x"] - x["rating_y"], axis=1)
        # Negated mean difference plus the number of contributing users.
        return tuple((-(concated_movies_rating["calculated_d"].sum() /
                        concated_movies_rating["calculated_d"].count()),
                      concated_movies_rating["calculated_d"].count()))
def __init__(self, min_ocena, max_ocena):
    """Remember the rating bounds and load the movies table."""
    # Bounds used when generating ratings.
    self.min_ocena, self.max_ocena = min_ocena, max_ocena
    # Set later by fit().
    self.uim = None
    self.md = MovieData('../data/movies.dat').get_movies()
# Demo script: run the random predictor and print a few predictions.
from code.UserItemData import UserItemData
from code.MovieData import MovieData
from code.RandomPredictor import RandomPredictor
import pandas as pd

movie_data = MovieData('../data/movies.dat')
ratings = UserItemData('../data/user_ratedmovies.dat')
predictor = RandomPredictor(1, 5)
predictor.fit(ratings)
pred = predictor.predict(78)
print(type(pred))
for movie_id in (1, 3, 20, 50, 100):
    print("Film: {}, ocena: {}".format(movie_data.get_title(movie_id),
                                       pred[movie_id]))
class ItemBasedPredictor:
    """Item-based collaborative filtering with adjusted-cosine similarity.

    fit() precomputes a similarity for every ordered pair of rated movies;
    predict() scores each movie the user has not rated as the
    similarity-weighted average of the user's existing ratings.
    """

    def __init__(self, min_values=0, threshold=0):
        # min_values: minimum number of common raters for a non-zero
        # similarity; threshold: similarities below this become 0.
        self.min_values = min_values
        self.threshold = threshold
        self.uim = None
        self.md = MovieData('../data/movies.dat')
        self.movies = self.md.get_movies()
        # DataFrame of (movieID_x, movieID_y, similarity); set in fit().
        self.p_movies = None
        self.unique_rated_movies = None

    def fit(self, X):
        """Build the pairwise similarity table from the user-item data X."""
        self.uim = X.get_df()
        self.unique_rated_movies = pd.DataFrame(self.uim["movieID"].unique(), columns=["movieID"])
        self.unique_rated_movies = self.unique_rated_movies.set_index("movieID", drop=False)
        # Constant join key so a self-merge yields the Cartesian product.
        self.unique_rated_movies["key"] = 1
        cartesian_p = pd.merge(self.unique_rated_movies, self.unique_rated_movies, on="key")[["movieID_x", "movieID_y"]]
        # Drop self-pairs; similarity(x, x) is defined as 0 anyway.
        cartesian_p = cartesian_p[cartesian_p["movieID_x"] != cartesian_p["movieID_y"]][["movieID_x", "movieID_y"]]
        cartesian_p["similarity"] = cartesian_p.apply(lambda x: self.similarity(x["movieID_x"], x["movieID_y"]), axis=1)
        self.p_movies = cartesian_p

    def predict(self, user_id):
        """Return {movieID: predicted rating} for movies user_id has not rated.

        Returns an empty dict for a user absent from the fitted data
        (previously this raised KeyError; now consistent with
        SlopeOnePredictor.predict).  Assumes ``self.uim`` is indexed by
        userID — the groupby and the ``p1[user_id]`` lookup rely on it.
        """
        pred = {}
        # Map each user (index value) to the list of movies they rated.
        user_movies = dict(self.uim.groupby(self.uim.index)['movieID'].apply(list))
        # Fix: guard against users with no ratings in the fitted data.
        if user_id not in user_movies:
            return pred
        rating_movies_from_user = pd.DataFrame({'rated_movies': user_movies[user_id]})
        # One representative row per movie the user has NOT rated.
        non_rating_movies = self.uim[~self.uim["movieID"].isin(user_movies[user_id])].drop_duplicates(subset=["movieID"])
        for key_non_rating, value_non_rating in non_rating_movies.iterrows():
            s = []  # similarities between the unseen movie and each seen one
            p = []  # the user's ratings of the corresponding seen movies
            for key_rating, value_rating in rating_movies_from_user.iterrows():
                p1 = self.uim.loc[self.uim["movieID"] == value_rating["rated_movies"], "rating"]
                s += [self.p_movies.loc[(self.p_movies["movieID_x"] == value_non_rating["movieID"]) & (self.p_movies["movieID_y"] == value_rating["rated_movies"]), "similarity"].iat[0]]
                p += [p1[user_id]]
            if np.sum(np.array(s)) == 0:
                # No similar seen movie: no basis for a prediction.
                pred[int(value_non_rating["movieID"])] = 0.0
            else:
                # Similarity-weighted average of the user's own ratings.
                pred[int(value_non_rating["movieID"])] = ((np.sum(np.array(s) * np.array(p))) / np.sum(np.array(s)))
        return pred

    def similarity(self, p1, p2):
        """Adjusted-cosine similarity of two movies, clamped to [0, 1].

        Ratings are centred on each user's mean rating; non-positive
        correlations, pairs with fewer than ``min_values`` common raters,
        and results below ``threshold`` all yield 0.0.
        """
        if p1 == p2:
            return 0.0
        # Users who rated both movies.
        p_intersection = set(self.uim.loc[self.uim["movieID"] == p1]["userID"].unique()).intersection(set(self.uim.loc[self.uim["movieID"] == p2]["userID"].unique()))
        all_p = self.uim[self.uim["userID"].isin(p_intersection)]
        x_p1 = (all_p.loc[all_p["movieID"] == p1])
        x_p2 = (all_p.loc[all_p["movieID"] == p2])
        p1 = x_p1[x_p1["userID"].isin(p_intersection)]["rating"]
        p2 = x_p2[x_p2["userID"].isin(p_intersection)]["rating"]
        # Per-user mean rating over ALL of that user's ratings.
        avg = (all_p.groupby(all_p.index).mean()["rating"])
        if max(len(p1.keys()), len(p2.keys())) < self.min_values:
            return 0.0
        c = 0
        iml = 0
        imr = 0
        # NOTE(review): zip aligns p1, p2 and avg positionally; this assumes
        # all three iterate users in the same order — verify for unsorted data.
        for ocena_1, ocena_2, user_avg in zip(p1, p2, avg):
            c += (ocena_1 - user_avg) * (ocena_2 - user_avg)
            iml += (ocena_1 - user_avg)**2
            imr += (ocena_2 - user_avg)**2
        if c <= 0:
            return 0.0
        result = c / (math.sqrt(iml) * math.sqrt(imr))
        if result < self.threshold:
            return 0.0
        return result

    def mostSimilarFilms(self):
        """Print the 20 most similar movie pairs by precomputed similarity."""
        most_similar = self.p_movies.sort_values(by='similarity', ascending=False).head(n=20)
        for key, value in most_similar.iterrows():
            print("Film1: {}, Film2: {}, podobnost: {}".format(self.md.get_title(value["movieID_x"]), self.md.get_title(value["movieID_y"]), value["similarity"]))

    def similarItems(self, item, n):
        """Return the n movies most similar to ``item`` as (movieID, sim) pairs."""
        films = self.uim.groupby("movieID")[["movieID"]].apply(lambda x: self.similarity(item, x.name))
        return sorted(list(dict(films).items()), key=lambda x: x[1], reverse=True)[0:n]
from code.MovieData import MovieData
from code.Recommender import Recommender
from code.SlopeOnePredictor import SlopeOnePredictor
from code.UserItemData import UserItemData

# Evaluation of the Slope One recommender on a temporal train/test split.
separator = "------------------------------"
print(separator)
print("Evalvacija priporočilnega sistema")
print(separator)

movies = MovieData('../data/movies.dat')
# Train on ratings up to 1.1.2008 for movies with at least 1000 ratings.
train_data = UserItemData('../data/user_ratedmovies.dat',
                          min_ratings=1000, end_date='1.1.2008')
recommender = Recommender(SlopeOnePredictor())
recommender.fit(train_data)
# Test on later ratings for movies with at least 200 ratings.
test_data = UserItemData('../data/user_ratedmovies.dat',
                         min_ratings=200, start_date='2.1.2008')
mse, mae, precision, recall, f = recommender.evaluate(test_data, 20)
print(mse, mae, precision, recall, f)
from code.MovieData import MovieData
import pandas as pd

# Smoke test: load the movies file and print the title of movie id 1.
movie_data = MovieData('../data/movies.dat')
print(movie_data.get_title(1))
from code.AveragePredictor import AveragePredictor
from code.ItemBasedPredictor import ItemBasedPredictor
from code.MovieData import MovieData
from code.RandomPredictor import RandomPredictor
from code.Recommender import Recommender
from code.SlopeOnePredictor import SlopeOnePredictor
from code.UserItemData import UserItemData
from code.ViewsPredictor import ViewsPredictor
import pandas as pd
import time

start = time.time()
movie_data = MovieData('../data/movies.dat')
ratings = UserItemData('../data/user_ratedmovies.dat')

# Recommendation with the random predictor.
print("------------------------------")
print("Priporočanje z naključnim prediktorjem")
print("------------------------------")
recommender = Recommender(RandomPredictor(1, 5))
recommender.fit(ratings)
for idmovie, val in recommender.recommend(78, n=5, rec_seen=False):
    print("Film: {}, ocena: {}".format(movie_data.get_title(idmovie), val))

# Recommendation with the average predictor.
print("------------------------------")
print("Priporočanje s povprečnim prediktorjem")
print("------------------------------")
def __init__(self, b):
    """Store the parameter ``b`` and prepare empty fit state."""
    self.b = b
    # Filled in later (by fit, presumably — confirm against caller).
    self.uim = None
    self.fitted = {}
    self.md = MovieData('../data/movies.dat').get_movies()
def __init__(self, predictor):
    """Wrap the given predictor; user-item data is attached later."""
    self.predictor = predictor
    # Set later (by fit, presumably — confirm against caller).
    self.uim = None
    self.md = MovieData('../data/movies.dat').get_movies()