Example no. 1
def ComputeCollaborativeFiltering_Item_Item(recipe_df,
                                            train_rating_df,
                                            pd,
                                            benchmark,
                                            knnmeans=False):
    print("\n###### Compute CollaborativeFiltering_Item_Item ######")
    df = pd.merge(recipe_df, train_rating_df, on='recipe_id', how='inner')
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'recipe_id', 'rating']], reader)
    trainSet, testSet = train_test_split(data, test_size=.2, random_state=0)

    # compute  similarities between items
    sim_options = {'name': 'cosine', 'user_based': False}

    if knnmeans:
        algo = KNNWithMeans(sim_options=sim_options, verbose=False)
    else:
        algo = KNNBasic(sim_options=sim_options, verbose=False)
    algo.fit(trainSet)
    predictions = algo.test(testSet)

    Evaluators.RunAllEvals(predictions, benchmark)
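Evaluators.RunAllEvals is an external helper that is not shown in this snippet; a minimal sketch of the kind of evaluation it might run with surprise's accuracy module (treating the benchmark argument as a dict that collects scores is an assumption) could look like this:
# Sketch only: the real Evaluators module is not part of this snippet, and
# the role of `benchmark` below is assumed, not taken from the original code.
from surprise import accuracy

def run_all_evals(predictions, benchmark):
    benchmark['RMSE'] = accuracy.rmse(predictions, verbose=False)
    benchmark['MAE'] = accuracy.mae(predictions, verbose=False)
    return benchmark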
Example no. 2
def user_collaborative_filtering(trainset, testset):

    # Use user_based true/false to switch between user-based or item-based collaborative filtering
    algo = KNNWithMeans(k=50,
                        sim_options={
                            'name': 'pearson_baseline',
                            'user_based': True
                        })
    algo.fit(trainset)

    # we can now query for specific predictions
    uid = str(196)  # raw user id
    iid = str(302)  # raw item id

    # get a prediction for specific users and items.
    pred = algo.predict(uid, iid, r_ui=4, verbose=True)

    # run the trained model against the testset
    test_pred = algo.test(testset)

    # get RMSE
    print("User-based Model : Test Set")
    accuracy.rmse(test_pred, verbose=True)
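The comment above points out that user_based switches between user-based and item-based filtering; a minimal item-based counterpart of the same setup (a sketch, not part of the original snippet, reusing the same KNNWithMeans and accuracy imports as the function above) could look like this:
def item_collaborative_filtering(trainset, testset):
    # item-item similarities instead of user-user
    algo = KNNWithMeans(k=50,
                        sim_options={
                            'name': 'pearson_baseline',
                            'user_based': False
                        })
    algo.fit(trainset)

    # run the trained model against the testset
    test_pred = algo.test(testset)

    print("Item-based Model : Test Set")
    accuracy.rmse(test_pred, verbose=True)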
Example no. 3
class KNNMean:
    def __init__(self, data, rating_scale, k=50, min_k=1, sim_options=None):
        self.data = data
        self.rating_scale = rating_scale
        self.k = k
        self.min_k = min_k
        self.reader = Reader(rating_scale=self.rating_scale)
        if not sim_options:
            sim_options = {
                "name": "cosine",
                'min_support': 3,
                "user_based": False
            }  # Compute  similarities between items
        self.model_data = Dataset.load_from_df(
            data.loc[:, ["userId", "movieId", "rating"]], self.reader)
        self.trainset = self.model_data.build_full_trainset()
        self.model = KNNWithMeans(self.k, self.min_k, sim_options=sim_options)
        print('fitting KNNWithMeans model...')
        self.model.fit(self.trainset)
        self.grid_search_ = None

    def set_model_params(self, model_params):
        print('updating model parameters...')
        self.model = KNNWithMeans(**model_params)
        print('fitting KNNWithMeans model...')
        self.model.fit(self.trainset)

    def update_grid_search(self, gs):
        self.grid_search_ = gs

    def fit(self, data):
        self.data = data
        self.model_data = Dataset.load_from_df(
            data.loc[:, ["userId", "movieId", "rating"]], self.reader)
        self.trainset = self.model_data.build_full_trainset()
        self.model.fit(self.trainset)

    def grid_search(self):
        print('grid search...')
        sim_options = {
            "name": ["msd", "cosine"],
            "min_support": [3, 4],
            "user_based": [False]
        }
        param_grid = {
            "sim_options": sim_options,
            "k": [50, 100, 200],
            "min_k": [1]
        }
        gs = GridSearchCV(KNNWithMeans,
                          param_grid,
                          measures=["rmse", "mae"],
                          cv=3)
        gs.fit(self.model_data)
        best_params, best_score = gs.best_params["rmse"], gs.best_score["rmse"]
        print(f'Best score (RMSE): {best_score}')
        print(f'Best params (RMSE): {best_params}')

        print(f'Best score (MAE): {gs.best_score["mae"]}')
        print(f'Best params (MAE): {gs.best_params["mae"]}')

        self.set_model_params(best_params)

        return best_params

    def predict(self, test_data):
        ratings = test_data.apply(
            lambda x: self.model.predict(x['userId'], x['movieId']).est,
            axis=1)
        return ratings
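A minimal usage sketch for the KNNMean wrapper above; the file path and the userId/movieId/rating column names are assumptions based on the columns the class selects:
if __name__ == '__main__':
    import pandas as pd

    # assumed MovieLens-style ratings file with userId, movieId, rating columns
    ratings = pd.read_csv('ratings.csv')
    rec = KNNMean(ratings, rating_scale=(0.5, 5.0))
    best_params = rec.grid_search()      # tunes k and sim_options, then refits the model
    sample = ratings.sample(5, random_state=0)
    print(rec.predict(sample))           # predicted ratings for the sampled user/movie pairs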
Example no. 4
evaluator.AddAlgorithm(UserKNN1, "User KNNBasic")
# Item-based KNN
ItemKNN1 = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
evaluator.AddAlgorithm(ItemKNN1, "Item KNNBasic")
###############
###### KNNWithZScore
# User-based KNN
UserKNN2 = KNNWithZScore(sim_options={'name': 'cosine', 'user_based': True})
evaluator.AddAlgorithm(UserKNN2, "User KNNWithZScore")
# Item-based KNN
ItemKNN2 = KNNWithZScore(sim_options={'name': 'cosine', 'user_based': False})
evaluator.AddAlgorithm(ItemKNN2, "Item KNNWithZScore")
###############
###### KNNWithMeans
# User-based KNN
UserKNN3 = KNNWithMeans(sim_options={'name': 'cosine', 'user_based': True})
evaluator.AddAlgorithm(UserKNN3, "User KNNWithMeans")
# Item-based KNN
ItemKNN3 = KNNWithMeans(sim_options={'name': 'cosine', 'user_based': False})
evaluator.AddAlgorithm(ItemKNN3, "Item KNNWithMeans")
###############
###### KNNBaseline
# User-based KNN
UserKNN4 = KNNBaseline(sim_options={'name': 'cosine', 'user_based': True})
evaluator.AddAlgorithm(UserKNN4, "User KNNBaseline")
# Item-based KNN
ItemKNN4 = KNNBaseline(sim_options={'name': 'cosine', 'user_based': False})
evaluator.AddAlgorithm(ItemKNN4, "Item KNNBaseline")
###############

# Just make random recommendations
Example no. 5
import pandas as pd
from surprise import accuracy, Dataset, Reader, KNNWithMeans
from surprise.model_selection import KFold
from collections import defaultdict
import pprint
# Read the data
path = './movielens_sample.txt'
df = pd.read_csv(path, usecols=[0, 1, 2], skiprows=1)
df.columns = ['user', 'item', 'rating']
reader = Reader(line_format='user item rating', sep=',')
data = Dataset.load_from_df(df, reader=reader)
trainset = data.build_full_trainset()

# ItemCF: compute the scores
# Only the k most similar neighbours are used in the computation

kf = KFold(n_splits=5)
algo = KNNWithMeans(k=50, sim_options={'user_based': False}, verbose=True)

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=True)
    print(rmse, rmse * rmse)

predictions = []
for row in df.itertuples():
    user, item = getattr(row, 'user'), getattr(row, 'item')
    predictions.append([user, item, algo.predict(user, item).est])

print("*" * 100)
print("user\titem\tpredict\n")
pprint.pprint(predictions)
Example no. 6
class KNNWithMeansRecommender(SurpriseRecommender):
    """Generates recommendations via KNNWithMeans, see
    https://surprise.readthedocs.io/en/stable/knn_inspired.html
    """
    algo = KNNWithMeans()
Example no. 7
do_cross_validation = False

list_reviews = read_datafile(data_file)

df = pd.DataFrame(list_reviews, columns=['UserId', 'ItemId', 'Playtime'])
#filter_dataset(df)
#normalize_playtime(df)

reader = Reader(rating_scale=(0, max(df.Playtime)))

sim_options = {
    "name": "cosine",
    "user_based": False,  # Compute  similarities between items
}
algo = KNNWithMeans(sim_options=sim_options)

if do_cross_validation:
    data = Dataset.load_from_df(df, reader)

    cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
else:
    train_df, test_df = train_test_split(df, test_size=0.2)
    train_data = Dataset.load_from_df(train_df, reader)
    training_set = train_data.build_full_trainset()
    algo.fit(training_set)

    for index, row in test_df.iterrows():
        user = row['UserId']
        item = row['ItemId']
        playtime = row['Playtime']
Example no. 8
algo_svd.fit(trainset)

predictions = algo_svd.test(trainset.build_anti_testset())

predictions_svd = algo_svd.test(testset)
pred_svd = pd.DataFrame(predictions_svd)

r.loc[(r['user_id'] == 27523) & (r['book_id'] == 2203)]

SVD().fit(trainset)
SVDpp().fit(trainset)
KNNBasic(sim_options={'name': 'cosine', 'user_based': True}).fit(trainset)
KNNWithMeans(sim_options={'name': 'cosine', 'user_based': True}).fit(trainset)
KNNWithZScore(sim_options={'name': 'cosine', 'user_based': True}).fit(trainset)
KNNBasic(sim_options={'name': 'cosine', 'user_based': False}).fit(trainset)
KNNWithMeans(sim_options={'name': 'cosine', 'user_based': False}).fit(trainset)
KNNWithZScore(sim_options={
    'name': 'cosine',
    'user_based': False
}).fit(trainset)
SlopeOne().fit(trainset)
BaselineOnly().fit(trainset)
NormalPredictor().fit(trainset)

SVD().fit(trainset)
SVDpp().fit(trainset)
KNNBasic(sim_options={'name': 'cosine', 'user_based': True}).fit(trainset)
KNNWithMeans(sim_options={'name': 'cosine', 'user_based': True}).fit(trainset)
Example no. 9
def selfmade_approach():
    # import reduced dataset:
    df = import_reduced_reviews(
        'C:/Users/lukas/OneDrive/Desktop/Reviews_Reduced.csv')
    df = df[['user_key', 'game_key', 'rating']]

    # drop duplicates:
    df = df.drop_duplicates(subset=['game_key', 'user_key'])

    ### Modelling part with Surprise:
    # get data in a format surprise can work with:
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # Build trainset from the whole dataset:
    trainsetfull = data.build_full_trainset()
    print('Number of users: ', trainsetfull.n_users, '\n')
    print('Number of items: ', trainsetfull.n_items, '\n')

    # Parameters:
    sim_option = {'name': 'cosine', 'user_based': False}
    k = 10
    min_k = 5

    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)

    # Run fit:
    start_time = time.time()
    algo.fit(trainsetfull)
    print("--- %s seconds ---" % (time.time() - start_time))

    # 1st approach: Calculate for a single user contained in dataset:
    target_user_key = 286189
    target_user_info = df[df['user_key'] == target_user_key]

    # Estimate single game:
    target_game_key = 100098

    # data structures:
    # sim_matrix = ndarray(312,312)
    # xr = defaultdict: 312
    # yr = defaultdict 8787

    # later on replace these by self-written structures
    xr = algo.xr
    yr = algo.yr
    sim_matrix = algo.sim
    item_means = algo.means

    inner_target_uid = algo.trainset.to_inner_uid(target_user_key)
    inner_target_iid = algo.trainset.to_inner_iid(target_game_key)

    # switch: uid and iid:
    x = inner_target_uid
    y = inner_target_iid

    # pred2:
    inner_2_raw_item_ids = algo.trainset._raw2inner_id_items
    # swap keys and values:
    inner_2_raw_item_ids = dict(
        (v, k) for k, v in inner_2_raw_item_ids.items())

    # similarity matrix with raw ids instead of inner surprise ids:
    sim_matrix_df = pd.DataFrame(sim_matrix)
    sim_matrix_df = sim_matrix_df.rename(
        columns=lambda x: inner_2_raw_item_ids[x])
    sim_matrix_df = sim_matrix_df.rename(
        index=lambda x: inner_2_raw_item_ids[x])

    target_user_ratings = yr[x]

    # convert from inner to raw:
    target_user_ratings2 = []
    for (inner_iid, rating) in target_user_ratings:
        target_user_ratings2.append((inner_2_raw_item_ids[inner_iid], rating))

    # convert item means from inner to raw:
    item_means2 = {}
    for i, mean in enumerate(item_means):
        item_means2[inner_2_raw_item_ids[i]] = mean

    myKNN = MyKnnWithMeans(sim_matrix=sim_matrix_df,
                           target_user_ratings=target_user_ratings2,
                           item_means=item_means2,
                           k=k,
                           min_k=min_k)
    pred = myKNN.predict_single_game(user_key=target_user_key,
                                     game_key=target_game_key)
    pred_surprise = algo.predict(uid=target_user_key, iid=target_game_key)  # predict() expects raw, not inner, ids

    estimate = pred
    print("Estimate for user %s for game %s is %s" %
          (target_user_key, target_game_key, estimate))

    # Estimate for user not contained in dataset:
    target_user_key = 123456789
    target_game_key = 100098

    user_ratings = [
        (100284, 7),
        (100311, 8),
        (105154, 2),
        (100020, 4),
        (100001, 9),
        (100277, 7),
    ]

    myKNN2 = MyKnnWithMeans(sim_matrix_df, user_ratings, item_means2, k, min_k)
    prediction = myKNN2.predict_single_game(target_user_key, target_game_key)

    # export similarity matrix:
    sim_matrix_df.to_csv(
        '../Data/Recommender/item-item-sim-matrix-surprise.csv')

    # export item means:
    export_path = '../Data/Recommender/item-means.json'
    with open(export_path, 'w') as fp:
        json.dump(item_means2, fp, sort_keys=False, indent=4)

    test = sim_matrix_df.loc[100516, 100284]

    pass
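MyKnnWithMeans is used above but its definition is not part of this snippet; the sketch below shows what such a class might look like, assuming it reproduces the mean-centered item-based kNN rule that surprise's KNNWithMeans uses (similarity DataFrame indexed by raw item ids, as built above):
import heapq

class MyKnnWithMeans:
    """Sketch: est = mean(i) + sum_j sim(i, j) * (r_uj - mean(j)) / sum_j sim(i, j),
    taken over the k most similar items j that the user has rated."""

    def __init__(self, sim_matrix, target_user_ratings, item_means, k=40, min_k=1):
        self.sim_matrix = sim_matrix        # pd.DataFrame, raw item ids as index/columns
        self.ratings = target_user_ratings  # list of (item_key, rating) for one user
        self.item_means = item_means        # dict: item_key -> mean rating
        self.k = k
        self.min_k = min_k

    def predict_single_game(self, user_key, game_key):
        # user_key kept only for interface parity with the calls above
        # similarities between the target item and every item the user has rated
        neighbors = [(self.sim_matrix.loc[game_key, other_key], rating, other_key)
                     for other_key, rating in self.ratings]
        k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[0])

        est = self.item_means[game_key]
        sum_sim = sum_dev = actual_k = 0
        for sim, rating, other_key in k_neighbors:
            if sim > 0:
                sum_sim += sim
                sum_dev += sim * (rating - self.item_means[other_key])
                actual_k += 1
        if actual_k >= self.min_k and sum_sim > 0:
            est += sum_dev / sum_sim
        return est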
Example no. 10
        predictions = algo.test(testset)
        Prec, Reca = metrics(predictions, t)
        pr = pr + Prec
        re = re + Reca

    return pr / 10.0, re / 10.0


if __name__ == '__main__':
    data = retrieve_data()
    G_max = ret_mod_user_dict(data)

    algo_NMF = NMF(NMF_no_of_LF, verbose=False)
    algo_SVD = SVD(n_factors=MF_no_of_LF)
    algo_KNN = KNNWithMeans(k=KNN_no_of_LF,
                            sim_options=sim_options,
                            verbose=False)

    # Q36
    Pr1 = []
    Re1 = []
    t = list(range(1, 26))
    for l in t:
        Precision, Recall = cross_val_(data, G_max, l, algo_KNN)
        Pr1.append(Precision)
        Re1.append(Recall)

    plotgraphs(t, Pr1, "Number of Suggestions", "Precision",
               "Precision Curve for KNN")
    plotgraphs(t, Re1, "Number of Suggestions", "Recall",
               "Recall Curve for KNN")
Example no. 11
def browse(uid):

    #Step 1: data import and prep

    #establish connection
    cnx = sql.connect(user='******',
                      password='******',
                      host='localhost',
                      database='moviesurprise')
    cursor = cnx.cursor()

    #execute query
    query = ("SELECT User_ID, Movie_ID, rating FROM ratings")
    query_sur = ("SELECT User_ID, Movie_ID, surpriseRating FROM ratings")
    query_baseline = ("SELECT Movie_ID from movies where Movie_ID < 6")
    query_baseline_2 = (
        "SELECT * from movies where Movie_ID > 6 AND Movie_ID < 12")
    query_baseline_3 = (
        "SELECT * from movies where Movie_ID > 12 AND Movie_ID < 18")

    cursor.execute(query)
    #convert cursor data to list
    l = list()
    for x in cursor:
        l.insert(len(l), x)

    cursor.execute(query_sur)
    l_sur = list()
    for x in cursor:
        l_sur.insert(len(l_sur), x)

    cursor.execute(query_baseline)
    l_baseline = dict()
    l_baseline = cursor.fetchall()

    cursor.execute(query_baseline_2)
    l_baseline_2 = dict()
    l_baseline_2 = cursor.fetchall()

    cursor.execute(query_baseline_3)
    l_baseline_3 = dict()
    l_baseline_3 = cursor.fetchall()

    with open('baseline_recs_pickle.pkl', 'wb') as pickle_file:
        pickle.dump(l_baseline, pickle_file)
    with open('baseline_recs2_pickle.pkl', 'wb') as pickle_file:
        pickle.dump(l_baseline_2, pickle_file)
    with open('baseline_recs3_pickle.pkl', 'wb') as pickle_file:
        pickle.dump(l_baseline_3, pickle_file)

    #close connection
    cursor.close()
    cnx.close()

    df = pd.DataFrame(l, columns=["User_ID", "Movie_ID", "rating"])
    df_sur = pd.DataFrame(l_sur,
                          columns=["User_ID", "Movie_ID", "surpriseRating"])

    reader = Reader(rating_scale=(1.0, 5.0))
    reader_sur = Reader(rating_scale=(-2.0, 2.0))

    data = Dataset.load_from_df(df, reader=reader)
    data_sur = Dataset.load_from_df(df_sur, reader_sur)

    trainsetfull = data.build_full_trainset()
    trainsetfull_sur = data_sur.build_full_trainset()

    #print("Number of users: ", trainsetfull.n_users,'\n')
    #print("Number of items: ", trainsetfull.n_items, '\n')

    #Step 2: cross-validate

    my_k = 300
    my_min_k = 5
    my_sim_options = {'name': 'pearson', 'user_based': False}

    algo = KNNWithMeans(k=my_k,
                        min_k=my_min_k,
                        sim_options=my_sim_options,
                        verbose=False)
    algo_sur = KNNWithMeans(k=my_k,
                            min_k=my_min_k,
                            sim_options=my_sim_options,
                            verbose=False)
    #results = cross_validate(algo = algo, data = data, measures = ['RMSE'], cv=5, return_train_measures = True)

    #print(results['test_rmse'].mean())

    #Step 3: fit the model

    algo.fit(trainsetfull)
    algo_sur.fit(trainsetfull_sur)

    #Step 4: prediction

    m_ids = get_movie_ids()

    #dict where key is the movie id and the value is the predicted rating
    d_med = dict()
    d_best = dict()
    d_sur = dict()

    for m_id in m_ids:
        get_med_recs(algo, uid, m_id, d_med)
        get_best_recs(algo, uid, m_id, d_best)
        get_sur_recs(algo, uid, m_id, d_sur)

    #convert d to a sorted list of (movie id, predicted rating) tuples, in ascending order of the rating estimate
    sort = sorted(d_best.items(), key=lambda x: x[1])
    top_recs = sort

    with open('top_recs_pickle' + str(uid) + '.pkl', 'wb') as pickle_file:
        pickle.dump(top_recs, pickle_file)
    #returns top 5 movies
    #print("Movies you will definitely like: ")
    #for item in top_recs:
    #print(item)

    #print("")

    sort = sorted(d_med.items(), key=lambda x: x[1])
    med_recs = sort

    with open('med_recs_pickle' + str(uid) + '.pkl', 'wb') as pickle_file:
        pickle.dump(med_recs, pickle_file)
    #print("Movies you might like: ")
    #for item in med_recs:
    #print(item)

    #print("")

    #step 5: repeat 3 and 4 for surprise data

    sort = sorted(d_sur.items(), key=lambda x: x[1])
    sur_recs = sort

    with open('sur_recs_pickle' + str(uid) + '.pkl', 'wb') as pickle_file:
        pickle.dump(sur_recs, pickle_file)
    #print("Movies you might be surprised to like: ")
    #for item in sur_recs:
    #print(item)

    #step 6: grab movieId info
    definintely_like = [[0 for x in range(5)]
                        for y in range(5)]  #change range when DB has links
    somewhat_like = [[0 for x in range(5)]
                     for y in range(5)]  #change range when DB has links
    surprisingly_like = [[0 for x in range(5)]
                         for y in range(5)]  #change range when DB has links

    with open('definintely_like_pickle' + str(uid) + '.pkl',
              'wb') as pickle_file:
        pickle.dump(definintely_like, pickle_file)
    with open('somewhat_like_pickle' + str(uid) + '.pkl', 'wb') as pickle_file:
        pickle.dump(somewhat_like, pickle_file)
    with open('surprisingly_like_pickle' + str(uid) + '.pkl',
              'wb') as pickle_file:
        pickle.dump(surprisingly_like, pickle_file)

    for i in range(len(definintely_like)):
        try:
            movie = random.randint(0, top_recs.__len__() - 1)
            definintely_like[i][0] = top_recs[movie][0]  #id
            definintely_like[i][1] = top_recs[movie][1]  #est
            definintely_like[i][2] = id_to_title(top_recs[movie][0])  #title
            definintely_like[i][3] = id_to_avg_rating(
                top_recs[movie][0])  #avg rating
            definintely_like[i][4] = id_to_posterlink(
                top_recs[movie][0])  #poster link
        except ValueError:
            print("Unable to pull recs for this user. Using baseline recs.")
            definintely_like[i][0] = l_baseline[i][0]  #id
            definintely_like[i][1] = l_baseline[i][0]  #est
            definintely_like[i][2] = id_to_title(l_baseline[i][0])  #title
            definintely_like[i][3] = id_to_avg_rating(
                l_baseline[i][0])  #avg rating
            definintely_like[i][4] = id_to_posterlink(
                l_baseline[i][0])  #poster link

    for i in range(len(somewhat_like)):
        try:
            movie = random.randint(0, med_recs.__len__() - 1)
            somewhat_like[i][0] = med_recs[movie][0]  #id
            somewhat_like[i][1] = med_recs[movie][1]  #est
            somewhat_like[i][2] = id_to_title(med_recs[movie][0])  #title
            somewhat_like[i][3] = id_to_avg_rating(
                med_recs[movie][0])  #avg rating
            somewhat_like[i][4] = id_to_posterlink(
                med_recs[movie][0])  #poster link
        except ValueError:
            #print("Unable to pull recs for this user. Using baseline recs.")
            somewhat_like[i][0] = l_baseline_2[i][0]  #id
            somewhat_like[i][1] = l_baseline_2[i][0]  #est
            somewhat_like[i][2] = id_to_title(l_baseline_2[i][0])  #title
            somewhat_like[i][3] = id_to_avg_rating(
                l_baseline_2[i][0])  #avg rating
            somewhat_like[i][4] = id_to_posterlink(
                l_baseline_2[i][0])  #poster link

    for i in range(len(surprisingly_like)):
        try:
            movie = random.randint(0, sur_recs.__len__() - 1)
            surprisingly_like[i][0] = sur_recs[movie][0]  #id
            surprisingly_like[i][1] = sur_recs[movie][1]  #est
            surprisingly_like[i][2] = id_to_title(sur_recs[movie][0])  #title
            surprisingly_like[i][3] = id_to_avg_rating(
                sur_recs[movie][0])  #avg rating
            surprisingly_like[i][4] = id_to_posterlink(
                sur_recs[movie][0])  #poster link
        except ValueError:
            #print("Unable to pull recs for this user. Using baseline recs.")
            surprisingly_like[i][0] = l_baseline_3[i][0]  #id
            surprisingly_like[i][1] = l_baseline_3[i][0]  #est
            surprisingly_like[i][2] = id_to_title(l_baseline_3[i][0])  #title
            surprisingly_like[i][3] = id_to_avg_rating(
                l_baseline_3[i][0])  #avg rating
            surprisingly_like[i][4] = id_to_posterlink(
                l_baseline_3[i][0])  #poster link

    #print("Movies you will definitely like:\n")

    #for x in definintely_like:
    #print(x)

    #print("Movies you might like like:\n")

    #for x in somewhat_like:
    #print(x)

    #print("Movies you may be surprised by:\n")

    #for x in surprisingly_like:
    #print(x)

    #each list has the following format: id, estimated rating, title, average rating, link
    print("Recommendation generation for user " + str(uid) + "  complete\n")
    return definintely_like, somewhat_like, surprisingly_like
Example no. 12
        movieID = info[1]
        if int(userID) in movies_watched:
            movies_watched[int(userID)].append(int(movieID))
        else:
            movies_watched[int(userID)] = [int(movieID)]

indications = dict()
moviesIds = set(movies.keys())
for i in movies_watched:
    moviesUserWatched = set(movies_watched[i])
    indications[i] = moviesIds.difference(moviesUserWatched)

data = Dataset.load_builtin('ml-100k')
trainset, testset = train_test_split(data, test_size=.2)

algo = KNNWithMeans(k=4, sim_options={'name': 'cosine', 'user_based': True})

algo.fit(trainset)
predictions = algo.test(testset)
rmse_knn = accuracy.rmse(predictions, verbose=False)


def top5Movies(userId):
    indicationsByRating = dict()
    print(userId)
    indicationsByUser = list(indications[userId])

    for i in indicationsByUser:
        indicationsByRating[i] = algo.predict(uid=str(userId), iid=str(i)).est
    indicationsByRating = sorted(indicationsByRating.items(),
                                 key=lambda x: x[1],
Example no. 13
anime_info = pd.read_csv(anime_info_path, sep="\t")
# print(anime_info.head())
anime_ratings = pd.read_csv(anime_ratings_path, sep='\t')

reader = Reader(rating_scale=(1, 10))

data = Dataset.load_from_df(anime_ratings[['User_ID', "Anime_ID", "Feedback"]], reader)

# To use item-based cosine similarity
sim_options = {
    "name": "cosine",
    "user_based": False,  # Compute  similarities between items
}

model = KNNWithMeans(sim_options=sim_options)

trainingSet = data.build_full_trainset()

model.fit(trainingSet)

# print ("ok")

def top_5():

    top_movies = anime_info.sort_values(
        by='rating', ascending=False)

    return top_movies[:5]

def top_5_recommendations(uid):
Example no. 14

reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(dataset, reader)


trainset, testset = train_test_split(data, test_size=.15)

algo = KNNWithMeans(k=50, sim_options={'name': 'pearson_baseline', 'user_based': True})
algo.fit(trainset)

test_pred = algo.test(testset)

accuracy.rmse(test_pred, verbose=True)

Example no. 15
from surprise import KNNWithMeans
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate
import os

# Path to the data file
file_path = os.path.expanduser('mydata.csv')
# Tell the reader what format the file is in
reader = Reader(line_format='user item rating', sep=',')
# Load the data
data = Dataset.load_from_file(file_path, reader=reader)
trainset = data.build_full_trainset()

# Use user_based true/false to switch between user-based or item-based collaborative filtering
algo = KNNWithMeans(k=50, sim_options={'user_based': False})  # only the k most similar neighbours are used
algo.fit(trainset)

# we can now query for specific predictions
uid = str(5)  # raw user id
iid = str(1)  # raw item id

# get a prediction for specific users and items.
pred = algo.predict(uid, iid)
print('rating of user-{0} to item-{1} is '.format(uid, iid), pred.est)# rating of user-5 to item-1

#----------------------------
uid = str(5)  # raw user id
iid = str(5)  # raw item id
# get a prediction for specific users and items.
pred = algo.predict(uid, iid)
print('rating of user-{0} to item-{1} is '.format(uid, iid), pred.est)
Example no. 16
def benchmark_different_algorithms():
    # import reduced dataset:
    df = import_reduced_reviews(
        'C:/Users/lukas/OneDrive/Desktop/Reviews_Reduced.csv')

    # check for duplicates:
    duplicates = len(df) - len(
        df.drop_duplicates(subset=['game_key', 'user_key']))

    # drop duplicates:
    df = df.drop_duplicates(subset=['game_key', 'user_key'])
    print('duplicates removed: ' + str(duplicates))

    ## Surprise:
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    results = []
    algorithms = [
        'SVD\t\t\t\t\t\t', 'SlopeOne\t\t\t\t', 'CoClustering\t\t\t',
        'NMF\t\t\t\t\t\t', 'KNN_Basic Item-Item\t\t',
        'KNN_WithMeans Item-Item\t', 'KNN_WithZScore Item-Item',
        'KNN_Basic User-User\t\t', 'KNN_WithMeans User-User\t',
        'KNN_WithZScore User-User'
    ]

    # 1) SVD
    algo = SVD()
    results.append(
        cross_validate(algo,
                       data,
                       measures=['RMSE'],
                       cv=3,
                       return_train_measures=True,
                       n_jobs=-3,
                       verbose=True))

    # 2) Slope One
    algo = SlopeOne()
    results.append(
        cross_validate(algo,
                       data,
                       measures=['RMSE'],
                       cv=3,
                       return_train_measures=True,
                       n_jobs=-3,
                       verbose=True))

    # 3) CoClustering
    algo = CoClustering()
    results.append(
        cross_validate(algo,
                       data,
                       measures=['RMSE'],
                       cv=3,
                       return_train_measures=True,
                       n_jobs=-3,
                       verbose=True))

    # 4) NMF
    algo = NMF()
    results.append(
        cross_validate(algo,
                       data,
                       measures=['RMSE'],
                       cv=3,
                       return_train_measures=True,
                       n_jobs=-3,
                       verbose=True))

    ## K-Nearest Neighbors - Item-Item
    sim_option = {'name': 'cosine', 'user_based': False}
    k = 40
    min_k = 5

    # 5) KNNBasic
    algo = KNNBasic(k=k, min_k=min_k, sim_options=sim_option)
    results.append(
        cross_validate(algo,
                       data,
                       measures=['RMSE'],
                       cv=3,
                       return_train_measures=True,
                       n_jobs=-3,
                       verbose=True))

    # 6) KNNWithMeans
    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)
    results.append(
        cross_validate(algo,
                       data,
                       measures=['RMSE'],
                       cv=3,
                       return_train_measures=True,
                       n_jobs=-3,
                       verbose=True))

    # 7) KNNWithZScore
    algo = KNNWithZScore(k=k, min_k=min_k, sim_options=sim_option)
    results.append(
        cross_validate(algo,
                       data,
                       measures=['RMSE'],
                       cv=3,
                       return_train_measures=True,
                       n_jobs=-3,
                       verbose=True))

    ## K-Nearest Neighbors - User - User
    sim_option = {'name': 'cosine', 'user_based': True}
    k = 100
    min_k = 2

    # 8) KNNBasic
    algo = KNNBasic(k=k, min_k=min_k, sim_options=sim_option)
    results.append(
        cross_validate(algo,
                       data,
                       measures=['RMSE'],
                       cv=3,
                       return_train_measures=True,
                       n_jobs=-3,
                       verbose=True))

    # 9) KNNWithMeans
    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)
    results.append(
        cross_validate(algo,
                       data,
                       measures=['RMSE'],
                       cv=3,
                       return_train_measures=True,
                       n_jobs=-3,
                       verbose=True))

    # 10) KNNWithZScore
    algo = KNNWithZScore(k=k, min_k=min_k, sim_options=sim_option)
    results.append(
        cross_validate(algo,
                       data,
                       measures=['RMSE'],
                       cv=3,
                       return_train_measures=True,
                       n_jobs=-3,
                       verbose=True))

    for algorithm, result in zip(algorithms, results):
        print(algorithm + '\t \t RMSE Score: \t' +
              str(result['test_rmse'].mean()) + '\t\t Fit-Time: ' +
              str(result['fit_time']) + '\t\t Train-Time: ' +
              str(result['test_time']))
Example no. 17
    rmse = accuracy.rmse(predictions, verbose=True)
    if rmse < best_rmse:
        best_algo = algo
        best_rmse = rmse
        best_pred = predictions
    pass
pass

print("ok")
print(f"best RMSE {best_rmse}")

print("KNNWithMeans")

kf = KFold(n_splits=5)
sim_options = {'name': 'cosine'}
algo = KNNWithMeans(sim_options=sim_options)
best_algo = None
best_rmse = 1000.0
best_pred = None
for trainset, testset in kf.split(data):
    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute and print Root Mean Squared Error
    rmse = accuracy.rmse(predictions, verbose=True)
    if rmse < best_rmse:
        best_algo = algo
        best_rmse = rmse
        best_pred = predictions
    pass
pass
Example no. 18
def create_similarity_matrix():
    start_time = time.time()

    # import reviews:
    import_path = '../Data/Joined/Results/Reviews_Reduced.csv'
    df = pd.read_csv(import_path)

    # keep only important columns:
    df = df[['game_key', 'user_key', 'rating']]

    # create surprise algorithm object
    sim_option = {'name': 'pearson', 'user_based': False}
    algo = KNNWithMeans(sim_options=sim_option)

    # get data in a format surprise can work with:
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # Build trainset from the whole dataset:
    trainset_full = data.build_full_trainset()
    print('Number of users: ', trainset_full.n_users, '\n')
    print('Number of items: ', trainset_full.n_items, '\n')

    # fit similarity matrix and calculate item means:
    algo.fit(trainset_full)
    print("--- %s seconds ---" % (time.time() - start_time))

    # save similarity matrix and means from algo object to variable
    sim_matrix = algo.sim
    item_means = algo.means

    # convert numpy array to pd df:
    sim_matrix = pd.DataFrame(sim_matrix)

    # replace inner ids with raw ids:
    raw_2_inner_ids = trainset_full._raw2inner_id_items
    # swap keys and values:
    inner_2_raw_item_ids = dict((v, k) for k, v in raw_2_inner_ids.items())

    # replace inner ids in sim_matrix index and columns by game_keys:
    sim_matrix = sim_matrix.rename(index=inner_2_raw_item_ids)
    sim_matrix = sim_matrix.rename(columns=inner_2_raw_item_ids)

    # export sim_matrix:
    sim_matrix.to_csv(
        '../Data/Recommender/item-item-sim-matrix-surprise-Reduced_dataset.csv'
    )

    # convert item means from inner to raw:
    item_means_raw_ids = {}
    for i, mean in enumerate(item_means):
        item_means_raw_ids[inner_2_raw_item_ids[i]] = mean

    # export item means:
    export_path = '../Data/Recommender/item-means-Reduced_dataset.json'
    with open(export_path, 'w') as fp:
        json.dump(item_means_raw_ids, fp, sort_keys=False, indent=4)

    ## create sim matrix in long format:
    # get index as column:
    column_names = list(sim_matrix.columns.values)
    sim_matrix.reset_index(level=0, inplace=True)

    # convert df from wide to long:
    sim_matrix_long = pd.melt(sim_matrix,
                              id_vars='index',
                              value_vars=column_names,
                              var_name='game_key_2')
    sim_matrix_long = sim_matrix_long.rename(columns={'index': 'game_key'})

    # export long sim matrix:
    sim_matrix_long.to_csv(
        '../Data/Recommender/item-item-sim-matrix-surprise-Reduced_dataset-LONG_FORMAT.csv'
    )

    print("--- %s seconds ---" % (time.time() - start_time))
    print('function end reached')
Example no. 19
from surprise import KNNWithMeans
from surprise import Dataset, Reader
from surprise import accuracy
from surprise.model_selection import KFold
import time
startTime = time.time()
# Read the data
reader = Reader(line_format='user item rating timestamp',
                sep=',',
                skip_lines=1)
data = Dataset.load_from_file('./ratings.csv', reader=reader)
trainset = data.build_full_trainset()

# ItemCF: compute the scores
# Only the k most similar neighbours are used in the computation
algo = KNNWithMeans(k=50, sim_options={'user_based': False}, verbose=True)

# Define the K-fold cross-validation iterator, K=3
kf = KFold(n_splits=3)
for trainset, testset in kf.split(data):
    # train and predict
    algo.fit(trainset)
    predictions = algo.test(testset)
    # compute RMSE
    accuracy.rmse(predictions, verbose=True)
    # compute MAE
    accuracy.mae(predictions, verbose=True)
# algo.fit(trainset)

uid = str(196)
iid = str(302)
Example no. 20
    # Gridsearch KNNBasic
    param_grid = {'k': [22, 24, 26, 28, 30]}
    print(surprise_gridsearch(param_grid, KNNBasic, data))

    # Cross-Validate KNNBasic
    sim_options = {'name': 'MSD', 'user_based': False}
    algo = KNNBasic(k=26, sim_options=sim_options)
    surprise_cross_validate(algo, data, sim_options)

    # Gridsearch KNNWithMeans
    param_grid = {'k': [37, 38, 39, 40, 41, 42, 43]}
    print(surprise_gridsearch(param_grid, KNNWithMeans, data))

    # Cross-Validate KNNWithMeans
    sim_options = {'name': 'MSD', 'user_based': False}
    algo = KNNWithMeans(k=42, sim_options=sim_options)
    surprise_cross_validate(algo, data, sim_options)

    # Gridsearch KNNBaseline
    param_grid = {'k': [18, 19, 20, 21, 22]}
    print(surprise_gridsearch(param_grid, KNNBaseline, data))

    # Cross-Validate KNNBaseline
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    algo = KNNBaseline(k=19, sim_options=sim_options)
    surprise_cross_validate(algo, data, sim_options)

    # Predictions
    trainset = data.build_full_trainset()
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    algo = KNNBaseline(k=19, sim_options=sim_options)
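surprise_gridsearch and surprise_cross_validate are project-specific helpers that are not shown here; a rough sketch of what a helper like surprise_gridsearch might do, assuming it simply wraps surprise's GridSearchCV, is:
from surprise.model_selection import GridSearchCV

def surprise_gridsearch(param_grid, algo_class, data):
    # hypothetical stand-in for the helper used above
    gs = GridSearchCV(algo_class, param_grid, measures=['rmse', 'mae'], cv=3)
    gs.fit(data)
    return gs.best_params['rmse'], gs.best_score['rmse']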
Example no. 21
def make_alg_and_test(trainset, testset):
    """
    This function for: create the algorithm and run the algorithm on test dataset.
    Args: 
        trainset, testset        
    Return:     

    Try other config in sim_options:
        name : contains the similarity metric to use. Options are cosine, msd, pearson, or pearson_baseline. The default is msd.
        user_based : a boolean that tells whether the approach will be user-based or item-based. The default is True, which means the user-based approach will be used.
        min_support: the minimum number of common items needed between users to consider them for similarity. 
                        For the item-based approach, this corresponds to the minimum number of common users for two items.
    """

    cfg = []
    sim_options0 = {'name': 'pearson_baseline', 'user_based': False}
    cfg.append(sim_options0)

    # To use item-based cosine similarity
    sim_options1 = {
        "name": "cosine",
        "user_based": False,  # Compute  similarities between items
        "min_support": 3,
    }
    cfg.append(sim_options1)

    sim_options2 = {
        "name": "msd",
        "user_based": False,
    }
    cfg.append(sim_options2)

    sim_options3 = {
        "name": "cosine",
        "user_based": False,
        "min_support": 4,
    }
    cfg.append(sim_options3)

    sim_options4 = {
        "name": "msd",
        "user_based": False,
        "min_support": 5,
    }
    cfg.append(sim_options4)

    sim_options5 = {
        "name": "cosine",
        "user_based": False,
        "min_support": 5,
    }
    cfg.append(sim_options5)

    for index in range(len(cfg)):
        algo = KNNWithMeans(k=5, sim_options=cfg[index])
        algo.fit(trainset)

        # run the trained model against the testset
        test_pred = algo.test(testset)

        logging.info(test_pred[20])
        # get RMSE
        logging.info(
            f"With index config : {index} , rmse on Test Set = {accuracy.rmse(test_pred, verbose=True)}"
        )
Example no. 22
'''
table = []
for klass in classes:
    start = time.time()
    if klass == 'SVD':
        algo = SVD()
    elif klass == 'SVDpp':
        algo = SVDpp()
    elif klass == 'NMF':
        algo = NMF()
    elif klass == 'SlopeOne':
        algo = SlopeOne()
    elif klass == 'KNNBasic':
        algo = KNNBasic()
    elif klass == 'KNNWithMeans':
        algo = KNNWithMeans()
    elif klass == 'KNNBaseline':
        algo = KNNBaseline()
    elif klass == 'CoClustering':
        algo = CoClustering()
    elif klass == 'BaselineOnly':
        algo = BaselineOnly()
    else:
        algo = NormalPredictor()
    #cv_time = str(datetime.timedelta(seconds=int(time.time() - start)))
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

    # Precision and recall can then be averaged over all users
    prec = sum(p for p in precisions.values()) / len(precisions)
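precision_recall_at_k is not defined in this fragment; a common implementation (along the lines of the example in the surprise FAQ) is sketched below:
from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return precision and recall at k for each user."""
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions, recalls = dict(), dict()
    for uid, user_ratings in user_est_true.items():
        # rank this user's predictions by estimated rating
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum(true_r >= threshold for (_, true_r) in user_ratings)
        n_rec_k = sum(est >= threshold for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum((true_r >= threshold) and (est >= threshold)
                              for (est, true_r) in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
    return precisions, recalls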
Example no. 23
# Ratings
rcols = ['userId', 'movieId', 'rating']
ml_ratings_training = pd.read_csv('../data/final_py_data_training.csv',
                                  usecols=rcols)

# Convert to Surprise Ratings
reader = Reader(rating_scale=(0.5, 5))
surprise_training = Dataset.load_from_df(ml_ratings_training,
                                         reader=reader).build_full_trainset()

# Train algorithm
i_min_k = 5
i_max_k = 100
sim_options_item = {'name': 'pearson', 'user_based': False}
algo_item = KNNWithMeans(k=i_max_k,
                         min_k=i_min_k,
                         sim_options=sim_options_item)
algo_item.fit(surprise_training)


class item_CF_model(ccobra.CCobraModel):
    def __init__(self, name='Item_CF'):
        super(item_CF_model, self).__init__(name, ["recommendation"],
                                            ["single-choice"])

    def predict(self, item, **kwargs):

        user_id = item.identifier
        movie_id = int(eval(item.task[0][0]))
        # Prediction form
        predict_form = [[user_id, movie_id, 1]]
Example no. 24
    unrated_df = unrated_df.sort_values('Rating', ascending=False)
    unrated_df = unrated_df.head()
    rated_df = rated_df.sort_values('Rating', ascending=False)
    rated_df = rated_df.head()

    # stores top 5 movies predicted
    finalu = []

    # stores top 5 movies watched already
    finalr = []
    for i in range(0, 5):
        finalu.append(movies.iloc[int(unrated_df.iloc[i][0])][1])
        finalr.append(movies.iloc[int(rated_df.iloc[i][0])][1])
    table = {
        'Test User Id': userInput,
        ('Predicted movies', 'Movies'): finalu,
        ('Predicted movies', 'Ratings'): unrated_df['Rating'].tolist(),
        ('Movies seen in past', 'Movies'): finalr,
        ('Movies seen in past', 'Ratings'): rated_df['Rating'].tolist()
    }
    return table


# loading dataset
data = Dataset.load_builtin('ml-1m')

algo = KNNWithMeans(k=10)

# Run 5-fold cross-validation and print results.
cross_validate(algo, data, measures=['MAE'], cv=5, verbose=True)
Example no. 25
file_path = os.path.expanduser('./data/163_music_suprise_format.txt')
# Specify the file format
reader = Reader(line_format='user item rating timestamp', sep=',')
# Load the data from the file
music_data = Dataset.load_from_file(file_path, reader=reader)
# Compute song-to-song similarities
print("Building the dataset...")
trainset = music_data.build_full_trainset()
#sim_options = {'name': 'pearson_baseline', 'user_based': False}

# Find the nearest users
print("Training the model...")
#sim_options = {'user_based': False}
#algo = KNNBaseline(sim_options=sim_options)
algo = KNNWithMeans()
algo.fit(trainset)  # train() was replaced by fit() in newer versions of surprise

current_playlist = list(name_id_dic.keys())[39]
print "歌单名称", current_playlist

# Retrieve the nearest neighbours
# Map the playlist name to its id
playlist_id = name_id_dic[current_playlist]
print("Playlist id:", playlist_id)
# Get the corresponding inner user id => to_inner_uid
playlist_inner_id = algo.trainset.to_inner_uid(playlist_id)
print("Inner id:", playlist_inner_id)

playlist_neighbors = algo.get_neighbors(playlist_inner_id, k=10)
Example no. 26
def compare_model_algorithms(data, Nrep=2, Nfolds=5):
    """
	Prints out model performances and run times for standard algorithms in 
	Surprise.
	Input:
		data = surprise data object
		Nrep = number of iterations with different folds
		Nfolds = number of cross validation folds
	Output: 
		performance_list = list of performance matrices with 
			rows: RMSE, MAE, time(min); and
			cols: Algorithm (SVD, SVDpp, NMF, SlopeOne, KNNBasic, KNNWithMeans,
					KNNBaseline, CoClustering, BaselineOnly, NormalPredictor)
		performance = average over lists in performance_list
	"""

    # set RNG
    np.random.seed(0)
    random.seed(0)

    # set KNN algorithm options
    user_opt_cos = {"name": "cosine", "user_based": True}
    item_opt_cos = {"name": "cosine", "user_based": False}

    # The algorithms to cross-validate
    s_SVD = SVD()
    s_SVDpp = SVDpp()
    s_NMF = NMF()
    s_SlopeOne = SlopeOne()
    u_KNNBasic = KNNBasic(sim_options=user_opt_cos)
    u_KNNWithMeans = KNNWithMeans(sim_options=user_opt_cos)
    u_KNNBaseline = KNNBaseline(sim_options=user_opt_cos)
    i_KNNBasic = KNNBasic(sim_options=item_opt_cos)
    i_KNNWithMeans = KNNWithMeans(sim_options=item_opt_cos)
    i_KNNBaseline = KNNBaseline(sim_options=item_opt_cos)
    s_CoClustering = CoClustering()
    s_BaselineOnly = BaselineOnly()
    s_NormalPredictor = NormalPredictor()

    classes = [
        s_SVD, s_SVDpp, s_NMF, s_SlopeOne, u_KNNBasic, u_KNNWithMeans,
        u_KNNBaseline, i_KNNBasic, i_KNNWithMeans, i_KNNBaseline,
        s_CoClustering, s_BaselineOnly, s_NormalPredictor
    ]

    class_names = [
        "SVD", "SVDpp", "NMF", "SlopeOne", "user-KNNBasic",
        "user-KNNWithMeans", "user-KNNBaseline", "item-KNNBasic",
        "item-KNNWithMeans", "item-KNNBaseline", "CoClustering",
        "BaselineOnly", "NormalPredictor"
    ]

    # repeat cross validation for different kfold splits for higher reliability
    performance_list = []
    headers = ['RMSE', 'MAE', 'Time (min)']
    for irep in range(0, Nrep):

        # cross validation folds will be the same for all algorithms.
        kf = KFold(n_splits=Nfolds, random_state=0)

        # cross validate for each algorithm
        table = np.zeros((len(classes), len(headers)))
        for ik, klass in enumerate(classes):
            start = time.time()
            out = cross_validate(klass, data, ['rmse', 'mae'], kf)
            cv_time = (time.time() - start) / 60
            mean_rmse = np.mean(out['test_rmse'])
            mean_mae = np.mean(out['test_mae'])
            table[ik, :] = np.array([mean_rmse, mean_mae, cv_time])

        # Accumulate results for each cross-validation
        performance_list.append(table)

    # Show averaged results over cross validation iterations
    performance = sum(performance_list) / len(performance_list)
    print(
        tabulate(performance.tolist(), headers=headers, showindex=class_names))

    return performance_list, performance
Example no. 27
reader = Reader(rating_scale=(1, 5))

from surprise import KNNWithMeans
from surprise import KNNBasic

import heapq
from collections import defaultdict
from operator import itemgetter

# To use user-based cosine similarity
sim_options = {
    "name": "cosine",
    "user_based": True,  # Compute similarities between users
}
model = KNNWithMeans(sim_options=sim_options)

#profiledata = pd.read_csv("C:\\Users\\juyee\\Envs\\sih2020\\candidate_recommender\\Test Profiles\\profile_data.csv")
df = pd.read_csv(r"./JVR_CandidatesInfo2.csv")
df = pd.read_csv(r"JVR_CandidatesInfo2.csv")

df = df.replace(np.nan, '', regex=True)
df = df.rename(columns={'Unnamed: 0': 'ind'})

#pdf_files = glob.glob("C:\\Users\\juyee\\Desktop\\Web scraping\\Test Profiles\\*.csv")

app = Flask(__name__)


def getCandidateName(candidateid):
    name = df[df["ind"] == candidateid]["Name"]
Example no. 28
    def set_model_params(self, model_params):
        print('updating model parameters...')
        self.model = KNNWithMeans(**model_params)
        print('fitting KNNWithMeans model...')
        self.model.fit(self.trainset)
Example no. 29
def make_prediction(test_data_imdb):
    train_data = pd.read_csv('../data/modeling/train/ratings_clean_std_0.csv',
                             sep=',').drop(columns={'Unnamed: 0'})
    omdb = pd.read_csv('../data/modeling/train/omdb_cleaned.csv')

    # build a reader, define the rating scale (minimum and maximum value)
    reader = Reader(rating_scale=(0.5, 5))
    # convert data to surprise format
    train_surprise = Dataset.load_from_df(train_data,
                                          reader).build_full_trainset()

    # Collaborative Filtering Models
    knn_collaborative = KNNWithMeans(k=115,
                                     min_k=5,
                                     sim_options={
                                         'name': 'msd',
                                         'user_based': False
                                     })
    knn_collaborative.fit(train_surprise)
    svd = SVD(lr_all=0.01, reg_all=0.05, n_epochs=23)
    svd.fit(train_surprise)
    preds = [[
        knn_collaborative.predict(test[1], test[3]).est
        for test in test_data_imdb.itertuples()
    ],
             [
                 svd.predict(test[1], test[3]).est
                 for test in test_data_imdb.itertuples()
             ]]

    # Content-Based Models
    # define features for content-based models
    params_features = {
        'threshold_actors': 0,
        'ts_languages': 0,
        'year': True,
        'runtime': True,
        'imdbvotes': True,
        'series': False,
        'awards': False,
        'genres': True,
        'imdb_rating': True,
        'roto_rating': True,
        'pg_rating': True,
        'threshold_newkeywords': 0,
        'threshold_plots': 0,
        'threshold_directors': 0
    }
    # load features
    features, names = preprocessing.features(**params_features)

    # add imdbID and set as index
    features = omdb[['imdbID'
                     ]].join(pd.DataFrame(features)).set_index('imdbID')

    # predict ratings
    pred_content = []
    no_of_ratings = []
    train_data = train_data[train_data['imdbID'] != 'tt0720339']
    for row in test_data_imdb.itertuples():
        # select user and movie

        imdbID = row.imdbID
        userID = row.user_id

        # compute predictions
        if imdbID == 'tt0720339':
            # exclude outlier movie without information
            pred_content.append(svd.predict(userID, imdbID).est)
        else:
            # select ratings of the user
            ratings_user = train_data.loc[train_data['user_id'] == userID]
            ratings_user.reset_index(inplace=True, drop=True)
            # select features of corresponding movies and convert to array
            features_user = np.array(features.loc[ratings_user['imdbID']])
            features_movie = np.array(features.loc[imdbID])

            pred_content.append(
                predict_movie_rating(ratings_user, features_user,
                                     features_movie))
        # store the number of predictions of a user:
        no_of_ratings.append(ratings_user.shape[0])

    # predictions of the models
    predictions = weighted_prediction(preds[0], preds[1], pred_content,
                                      no_of_ratings)
    test_data_with_rating = test_data_imdb.join(predictions)

    return test_data_with_rating[['user_id', 'movieID', 'rating']]
Example no. 30
reader = Reader(rating_scale=(0, 5))

data1 = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader)

data2 = Dataset.load_from_df(ratingsByUser[['timestamp', 'movieId', 'rating']],
                             reader)

data3 = Dataset.load_from_df(ratingsByMovie[['timestamp', 'userId', 'rating']],
                             reader)

PMF = SVD()

kval = 5

knn = KNNWithMeans(k=kval, min_k=kval, verbose=False)

cross_validate(PMF, data1, measures=['MAE', 'RMSE'], cv=5, verbose=True)

cross_validate(knn, data2, measures=['MAE', 'RMSE'], cv=3, verbose=True)

cross_validate(knn, data3, measures=['MAE', 'RMSE'], cv=5, verbose=True)

# knn1 = KNNWithMeans(k=2, min_k=1, verbose=False)
#
# knn2 = KNNWithMeans(k=5, min_k=5, verbose=False)
#
# knn3 = KNNWithMeans(k=9, min_k=9, verbose=False)
#
# knn4 = KNNWithMeans(k=15, min_k=15, verbose=False)
#
Example no. 31
def collaborative_filtering_using_surprise():
    """
    https://towardsdatascience.com/how-to-build-a-memory-based-recommendation-system-using-python-surprise-55f3257b2cf4
    Predict games for user with user_key = 93681
    """
    target_user_key = 93681

    # import reduced dataset:
    df = import_reduced_reviews()

    # check for duplicates:
    duplicates = len(df) - len(
        df.drop_duplicates(subset=['game_key', 'user_key']))

    # drop duplicates:
    df = df.drop_duplicates(subset=['game_key', 'user_key'])
    print('duplicates removed: ' + str(duplicates))

    # check out our user:
    df_target_user = df[df['user_key'] == target_user_key]

    # build utility matrix:
    # data_pivot = df.pivot(index='user_key', columns='game_key', values='rating')

    # calculate sparsity
    # sparsity = data_pivot.isnull().sum().sum() / data_pivot.size
    # print('Sparcity of utility matrix: ' + str(sparsity))

    ### Modelling part with Surprise:
    # get data in a format surprise can work with:
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # Split in trainset and testset
    trainset, testset = train_test_split(data, test_size=0.2)

    print('Number of users: ', trainset.n_users, '\n')
    print('Number of items: ', trainset.n_items, '\n')

    # When surprise creates a Trainset or Testset object, it takes the raw_id’s (the ones that you used in the file
    # you imported), and converts them to so-called inner_id’s (basically a series of integers, starting from 0). You
    # might need to trace back to the original names. Using the items as an example (you can do the same approach
    # with users, just swap iid's with uid's in the code), to get the list of inner_iids, you can use the all_items
    # method. To convert from raw to inner id you can use the to_inner_iid method, and the to_raw_iid to convert back.

    # An example on how to save a list of inner and raw item id’s:
    trainset_iids = list(trainset.all_items())
    iid_converter = lambda x: trainset.to_raw_iid(x)
    trainset_raw_iids = list(map(iid_converter, trainset_iids))
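    # (sketch, not in the original snippet) the same conversion for users,
    # as the comment above suggests:
    trainset_uids = list(trainset.all_users())
    trainset_raw_uids = [trainset.to_raw_uid(uid) for uid in trainset_uids]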

    ## Model parameters: of kNN:
    # Two hyperparameters we can tune:
    # 1. k parameter
    # 2. similarity option
    #   a) user-user vs item-item
    #   b) similarity function (cosine, pearson, msd)

    sim_option = {'name': 'pearson', 'user_based': False}

    # 3 different KNN Models: KNNBasic, KNNWithMeans, KNNWithZScore
    k = 40
    min_k = 5

    algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option)

    algo.fit(trainset)

    ## Testing:
    predictions = algo.test(testset)

    accuracy.rmse(predictions)

    # Own similarity matrix:
    sim_matrix_imported = pd.read_csv(
        '../Data/Recommender/selfmade_item-item-similarity-matrix.csv',
        index_col=0)
    sim_matrix_imported.columns = sim_matrix_imported.columns.astype(int)
    sim_matrix_imported = sim_matrix_imported.to_numpy()

    algo.sim = sim_matrix_imported

    predictions = algo.test(testset)

    accuracy.rmse(predictions)

    # Cross validation:
    skip = True
    if not skip:
        results = cross_validate(algo=algo,
                                 data=data,
                                 measures=['RMSE'],
                                 cv=5,
                                 return_train_measures=True)
        results_mean = results['test_rmse'].mean()

    ## Predictions
    # Lets assume we are happy with the method and now want to apply it to the entire data set.

    # Estimate for a specific user a specific item:
    single_item_single_user_prediction = algo.predict(uid=target_user_key,
                                                      iid=100010,
                                                      verbose=True)

    # Estimate all items for a specific user:
    list_of_all_items = trainset_raw_iids
    target_predictions = []

    for item in list_of_all_items:
        single_prediction = algo.predict(uid=target_user_key, iid=item)
        target_predictions.append(
            (single_prediction.uid, single_prediction.iid,
             single_prediction.est))

    # Then sort the predictions for each user and retrieve the k highest ones:
    target_predictions.sort(key=lambda x: x[2], reverse=True)
    n = 20
    top_n = target_predictions[:n]
    top_n = [row[1] for row in top_n]

    print('end')