-
Notifications
You must be signed in to change notification settings - Fork 0
/
recommender.py
275 lines (214 loc) · 13.3 KB
/
recommender.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
import pandas as pd
import numpy as np
import warnings
import random
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from scipy.optimize import minimize
warnings.filterwarnings('ignore')
# userID --> ID of the user
# placeID --> ID of the restaurant
# rating --> average total rating given by the user to restaurant
# food_rating --> rating given by user to restaurant's food
# service_rating --> rating given by user based on restaurant's service
input_ratings = pd.read_csv('rating.csv')

# ----------- Data preprocessing ---------- #
# Distinct restaurant and user IDs, in the order they first appear in the CSV.
total_restaurants = input_ratings.placeID.unique()
total_users = input_ratings.userID.unique()

# Sorted IDs label the rows (restaurants) and columns (users) of every
# rating matrix built below.
restuarant_id_sort = np.sort(total_restaurants)
user_id_sort = np.sort(total_users)

# One restaurant x user frame per rating type, pre-filled with -1 so that
# pairs without a rating can be told apart from a genuine rating of 0.
rating_columns = ("rating", "food_rating", "service_rating")
matrix_shape = (len(total_restaurants), len(total_users))
rating_frames = {
    column: pd.DataFrame(
        np.full(matrix_shape, -1.0),
        columns=user_id_sort,
        index=restuarant_id_sort,
    )
    for column in rating_columns
}

# Copy every CSV row into its (restaurant, user) cell of each frame.
id_and_scores = input_ratings[["placeID", "userID", *rating_columns]]
for pid, uid, *scores in id_and_scores.itertuples(index=False, name=None):
    for column, score in zip(rating_columns, scores):
        rating_frames[column].loc[pid, uid] = score

# Downstream code works on plain NumPy arrays.
overall_rating = rating_frames["rating"].values
food_rating = rating_frames["food_rating"].values
service_rating = rating_frames["service_rating"].values

# 1.0 where the restaurant-user pair has an overall rating (>= 0), else 0.0.
checkMatrix = (overall_rating >= 0).astype(float)
# ---- Train/Test Split ---- #
# Hold out about 30% of each user's existing ratings for testing and keep the
# rest for training. The split is redrawn until every restaurant (row) keeps
# more than one rating in the training mask.
random.seed(0)
check = True
while check:
    training_set = checkMatrix.copy()
    for user_col in range(training_set.shape[1]):
        # Rows (restaurants) this user has actually rated.
        rated_rows = np.flatnonzero(training_set[:, user_col] == 1).tolist()
        # Number of this user's ratings moved to the test set.
        held_out_count = int(round(len(rated_rows) * 0.3))
        held_out_rows = random.sample(rated_rows, held_out_count)
        training_set[held_out_rows, user_col] = 0
    # Keep looping while some restaurant has at most one training rating.
    check = not (np.sum(training_set, axis=1).min() > 1)

# Whatever was rated but removed from training becomes the test mask.
testing_set = checkMatrix - training_set
print(f"Training set: {training_set.sum()}\nTesting set: {testing_set.sum()}\n")
# ---- Use evaluation metrics to get the quantified values of model performances --- #
# Use root-mean-squared-error method for model performance employing linear regression
def root_mean_squared(y_true, y_predicte, R):
    """Return the root-mean-squared error over the entries selected by ``R``.

    Args:
        y_true: Array of true ratings.
        y_predicte: Array of predicted ratings, same shape as ``y_true``.
        R: Array of the same shape; only positions where ``R == 1`` are scored.

    Returns:
        The square root of the mean squared error on the selected entries.
    """
    selected = R == 1
    return np.sqrt(mean_squared_error(y_true[selected], y_predicte[selected]))
# --- Create a benchmark model which returns average predicted rating and use it to compare ratings ----- #
# Function for benchmark model mean
def benchmark_mean(y_true, checkMatrix):
    """Return each row's mean observed rating, broadcast to the full matrix.

    Args:
        y_true: 2-D array of ratings.
        checkMatrix: 2-D array of the same shape. Ratings are multiplied by
            it before summing, and entries equal to 1.0 are counted as
            observed.

    Returns:
        Array shaped like ``y_true`` in which every entry of a row holds that
        row's mean over its observed ratings. Rows without any observed
        rating get 0.0 instead of the NaN a 0/0 division would produce.
    """
    observed = y_true * checkMatrix
    rating_sums = np.sum(observed, axis=1)
    rating_counts = np.sum(checkMatrix == 1.0, axis=1)

    # Divide only where there is something to average; the rest stay at 0.0.
    row_means = np.zeros(observed.shape[0])
    has_ratings = rating_counts > 0
    row_means[has_ratings] = rating_sums[has_ratings] / rating_counts[has_ratings]

    return row_means.reshape(observed.shape[0], 1) * np.ones(observed.shape)
# Function to get the benchmark root-mean-squared error to compare with the optimized results later
def get_benchmark_RSME(rating, setMatrix, setMatrix2):
    """Print the benchmark model's errors and return its predictions.

    The benchmark predicts, for every entry of a row, the value returned by
    ``benchmark_mean(rating, setMatrix)``.

    Args:
        rating: 2-D array of true ratings.
        setMatrix: Mask used both to compute the means and to score the
            "training" error.
        setMatrix2: Mask used to score the "testing" error.

    Returns:
        Array shaped like ``rating`` holding the benchmark predictions.
    """
    ratings_pred = np.zeros(rating.shape) + benchmark_mean(rating, setMatrix)

    # Score both masks before printing anything.
    train_error, test_error = (
        root_mean_squared(rating, ratings_pred, mask)
        for mask in (setMatrix, setMatrix2)
    )
    print(f"RSME of training set is: {train_error}")
    print(f"RSME of testing set is: {test_error}\n\n")
    return ratings_pred
# Plot a boxplot to see the result of optimization when compared with benchmark model
def plot(y_predicte, y_true, setMatrix, title, filename):
    """Save a box plot of predicted ratings grouped by true rating 0, 1 and 2.

    Args:
        y_predicte: Array of predicted ratings.
        y_true: Array of true ratings, same shape as ``y_predicte``.
        setMatrix: Array of the same shape; only positions where
            ``setMatrix == 1`` are plotted.
        title: Title of the figure.
        filename: Destination passed to ``plt.savefig``. When it is a path,
            its parent directory is created if missing so that saving does
            not fail on a fresh checkout.
    """
    import os

    selected = setMatrix == 1
    predicted = y_predicte[selected]
    actual = y_true[selected]
    # One box per true-rating level; only exact matches of 0, 1 or 2 are used.
    data = [predicted[actual == level] for level in (0, 1, 2)]

    plt.boxplot(data)
    plt.xticks([1, 2, 3], [0, 1, 2])
    plt.xlabel('True Rating')
    plt.ylabel('Predicted Rating')
    plt.title(title)

    # savefig does not create directories; do it here for path-like targets.
    if isinstance(filename, (str, os.PathLike)):
        target_dir = os.path.dirname(os.fspath(filename))
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)
    plt.savefig(filename, dpi=500)

    # Reset pyplot state so the next figure starts clean.
    plt.clf()
    plt.cla()
    plt.close()
# ----- Benchmark model: report errors and box plots per rating type ----- #
print("-------BENCHMARK MODEL----------")

# (banner, rating matrix, (title, file) for training, (title, file) for testing)
benchmark_runs = (
    (
        "-----RSME rating------",
        overall_rating,
        ("Benchmark model for 'rating' with training set", "benchmark_images/bench_overall_training.png"),
        ("Benchmark model for 'rating' with testing set", "benchmark_images/bench_overall_testing.png"),
    ),
    (
        "-----RSME food_rating------",
        food_rating,
        ("Benchmark model for 'food_rating' with training set", "benchmark_images/bench_food_training.png"),
        ("Benchmark model for 'food_rating' with testing set", "benchmark_images/bench_food_testing.png"),
    ),
    (
        "-----RSME service_rating------",
        service_rating,
        ("Benchmark model for 'service_rating' with training set", "benchmark_images/bench_service_training.png"),
        ("Benchmark model for 'service_rating' with testing set", "benchmark_images/bench_service_testing.png"),
    ),
)
for banner, rating_matrix, training_plot, testing_plot in benchmark_runs:
    print(banner)
    ratings_pred = get_benchmark_RSME(rating_matrix, training_set, testing_set)
    plot(ratings_pred, rating_matrix, training_set, *training_plot)
    plot(ratings_pred, rating_matrix, testing_set, *testing_plot)

# Same report for the element-wise average of the three rating types.
print("-----RSME total average ratings-----")
total_ratings = (overall_rating + food_rating + service_rating) / 3
avg_ratings_pred = get_benchmark_RSME(total_ratings, training_set, testing_set)
plot(avg_ratings_pred, total_ratings, training_set, "Benchmark model for 'total_avg_ratings' with training set", "benchmark_images/bench_totalavg_training.png")
plot(avg_ratings_pred, total_ratings, testing_set, "Benchmark model for 'total_avg_ratings' with testing set", "benchmark_images/bench_totalavg_testing.png")
# ------- We employ a matrix factorization technique to calculate the optimized predicted ratings ------ #
# Function to get the cost function which is later minimized for optimization
def cost(parameters, true_values, setMatrix):
    """Return the regularized squared-error cost of the factorization.

    ``parameters`` is a flat vector laid out as the restaurant feature matrix
    (``num_restaurant x num_features``, row-major) followed by the user
    weight matrix (``num_user x (num_features + 1)``, row-major). A column of
    ones is prepended to the features, so column 0 of the weights acts as a
    per-user bias.

    Args:
        parameters: 1-D array with the layout described above.
        true_values: 2-D array (restaurants x users) of target values.
        setMatrix: 2-D array of the same shape, multiplied element-wise with
            the predictions (callers in this file pass a 0/1 mask).

    Returns:
        ``0.5 * sum((prediction * setMatrix - true_values) ** 2)`` plus the
        sum of squares of all non-bias weights and all features.
    """
    num_restaurant, num_user = setMatrix.shape
    # len(parameters) = num_restaurant * F + num_user * (F + 1), hence
    # F = (len - num_user) / (num_restaurant + num_user). Subtracting
    # num_restaurant instead truncates to F - 1 when users < restaurants.
    num_features = (len(parameters) - num_user) // (num_restaurant + num_user)
    split = num_restaurant * num_features

    # Restaurant features with the constant bias column in front.
    param_features = parameters[:split].reshape(num_restaurant, num_features)
    param_features = np.append(np.ones((num_restaurant, 1)), param_features, axis=1)
    # User weights, one row per user (bias weight first).
    user_weights = parameters[split:].reshape(num_user, num_features + 1)

    residual = np.dot(param_features, user_weights.T) * setMatrix - true_values
    squared_error = 0.5 * np.sum(residual ** 2)
    # The bias column and bias weights are not regularized.
    penalty = np.sum(user_weights[:, 1:] ** 2) + np.sum(param_features[:, 1:] ** 2)
    return squared_error + penalty
# Compute the gradient of the parameters
def Gradient(parameters, true_values, setMatrix):
    """Return the gradient of the factorization cost w.r.t. ``parameters``.

    The objective differentiated here is the one evaluated by ``cost``:
    ``0.5 * sum((X W^T * setMatrix - true_values) ** 2)`` plus the sum of
    squares of the non-bias entries of ``X`` and ``W``, where ``X`` is the
    restaurant feature matrix with a leading column of ones and ``W`` is the
    user weight matrix.

    Args:
        parameters: 1-D array holding the restaurant features
            (``num_restaurant x num_features``) followed by the user weights
            (``num_user x (num_features + 1)``), both row-major.
        true_values: 2-D array (restaurants x users) of target values.
        setMatrix: 2-D array of the same shape, multiplied element-wise with
            the predictions (callers in this file pass a 0/1 mask).

    Returns:
        1-D array with the same layout and length as ``parameters``.
    """
    num_restaurant = int(setMatrix.shape[0])
    num_user = int(setMatrix.shape[1])
    # len(parameters) = num_restaurant * F + num_user * (F + 1), hence
    # F = (len - num_user) / (num_restaurant + num_user). Subtracting
    # num_restaurant instead truncates to F - 1 when users < restaurants.
    num_features = (len(parameters) - num_user) // (num_restaurant + num_user)
    split = num_restaurant * num_features

    # Restaurant features with the constant bias column in front.
    param_features = parameters[:split].reshape(num_restaurant, num_features)
    param_features = np.append(np.ones((num_restaurant, 1)), param_features, axis=1)
    # User weights, one row per user (bias weight first).
    user_weights = parameters[split:].reshape(num_user, num_features + 1)

    # Chain rule through the element-wise multiplication by setMatrix. This is
    # a no-op when setMatrix is 0/1 and true_values is already masked.
    residual = (np.dot(param_features, user_weights.T) * setMatrix - true_values) * setMatrix

    # The bias column of the features is a constant and has no gradient.
    # The penalty sum(x ** 2) differentiates to 2 * x.
    param_gradient = np.dot(residual, user_weights)[:, 1:] + 2.0 * param_features[:, 1:]

    weight_gradient = np.dot(residual.T, param_features)
    # Bias weights (column 0) are not regularized.
    weight_gradient[:, 1:] += 2.0 * user_weights[:, 1:]

    return np.append(param_gradient.reshape(-1), weight_gradient.reshape(-1))
np.random.seed(99)
num_features = 3

# Random starting point for the optimizer. The draw order (restaurant
# features first, then user weights) determines the seeded values.
param_features = np.random.normal(
    loc=0, scale=1, size=(len(total_restaurants), num_features)
)
# One extra weight per user for the bias feature.
user_weights = np.random.normal(
    loc=0, scale=1, size=(len(total_users), num_features + 1)
)

# Flatten both matrices into the single vector the optimizer works on:
# all restaurant features, then all user weights.
initial = np.concatenate((param_features.ravel(), user_weights.ravel()))
# Function that fits the matrix-factorization model and prints its RMSE on the training and testing sets, so the optimized model can be compared with the benchmark
def getRSME(rating, setMatrix, total_restaurants, num_features, total_users, initial, setMatrix2, cost, Gradient):
    """Fit the factorization with ``minimize`` and return predicted ratings.

    The ratings are centered on ``benchmark_mean(rating, setMatrix)`` and
    masked by ``setMatrix`` before optimization; the mean is added back to
    the factorized prediction afterwards.

    Args:
        rating: 2-D array (restaurants x users) of true ratings.
        setMatrix: Mask used for fitting and for the "training" error.
        total_restaurants: Sequence whose length is the number of restaurants.
        num_features: Number of learned features per restaurant.
        total_users: Sequence whose length is the number of users.
        initial: Flat starting vector (restaurant features, then user weights).
        setMatrix2: Mask used for the "testing" error.
        cost: Objective passed to ``minimize``.
        Gradient: Jacobian of ``cost`` passed to ``minimize``.

    Returns:
        Array shaped like ``rating`` with the model's predicted ratings.
    """
    restaurant_count = len(total_restaurants)
    user_count = len(total_users)

    baseline = benchmark_mean(rating, setMatrix)
    centered_targets = ((rating * setMatrix) - baseline) * setMatrix
    solution = minimize(cost, initial, jac=Gradient, args=(centered_targets, setMatrix))

    # Unpack the flat solution: restaurant features first, user weights after.
    split = restaurant_count * num_features
    learned_features = solution.x[:split].reshape(restaurant_count, num_features)
    bias_column = np.ones((restaurant_count, 1))
    learned_features = np.append(bias_column, learned_features, axis=1)
    learned_weights = solution.x[split:].reshape(user_count, num_features + 1)

    ratings_pred = np.dot(learned_features, learned_weights.T) + baseline

    # Score both masks before printing anything.
    train_error, test_error = (
        root_mean_squared(rating, ratings_pred, mask)
        for mask in (setMatrix, setMatrix2)
    )
    print(f"RSME of training set is: {train_error}")
    print(f"RSME of testing set is: {test_error}\n")
    return ratings_pred
# Function that generates a dictionary of top 10 recommended restaurants with key being the restaurant id and value being the predicted rating
def top_recommend(ratings_pred, restaurant_id, top_n=10):
    """Return the restaurants with the highest mean predicted rating.

    Args:
        ratings_pred: 2-D array of predicted ratings, one row per restaurant.
        restaurant_id: Restaurant IDs, in the same order as the rows of
            ``ratings_pred``.
        top_n: Maximum number of restaurants to return (default 10).

    Returns:
        Dict mapping restaurant ID to its mean predicted rating, ordered from
        highest to lowest. It holds fewer than ``top_n`` entries when fewer
        restaurants are available, rather than raising.
    """
    mean_by_restaurant = dict(
        zip(restaurant_id, (np.mean(row) for row in ratings_pred))
    )
    # sorted() is stable even with reverse=True, so tied restaurants keep
    # their original relative order.
    ranked = sorted(
        mean_by_restaurant.items(), key=lambda item: item[1], reverse=True
    )
    return dict(ranked[:max(top_n, 0)])
# ----- Optimized (matrix factorization) model: errors, box plots and top 10 ----- #
# Each rating type is plotted against its own true ratings, and the
# recommendations use restuarant_id_sort because the rows of every prediction
# matrix follow the sorted restaurant IDs, not the order of appearance in the CSV.

# Overall rating
print("---------OPTIMIZED MODEL AFTER MATRIX FACTORIZATION------------")
print("----RMSE optimized rating------")
ratings_pred = getRSME(overall_rating, training_set, total_restaurants, num_features, total_users, initial, testing_set, cost, Gradient)
plot(ratings_pred, overall_rating, training_set, "Optimized model for 'rating' with training set", "optimized_images/opt_overall_training.png")
plot(ratings_pred, overall_rating, testing_set, "Optimized model for 'rating' with testing set", "optimized_images/opt_overall_testing.png")
recommended = top_recommend(ratings_pred, restuarant_id_sort)
print("-----Top 10 recommended restaurant based on overall rating-----")
print(recommended)
print("\n\n")

# Food rating
print("----RMSE optimized food rating------")
ratings_pred = getRSME(food_rating, training_set, total_restaurants, num_features, total_users, initial, testing_set, cost, Gradient)
plot(ratings_pred, food_rating, training_set, "Optimized model for 'food_rating' with training set", "optimized_images/opt_food_training.png")
plot(ratings_pred, food_rating, testing_set, "Optimized model for 'food_rating' with testing set", "optimized_images/opt_food_testing.png")
recommended = top_recommend(ratings_pred, restuarant_id_sort)
print("-----Top 10 recommended restaurant based on food rating-----")
print(recommended)
print("\n\n")

# Service rating
print("----RMSE optimized service rating------")
ratings_pred = getRSME(service_rating, training_set, total_restaurants, num_features, total_users, initial, testing_set, cost, Gradient)
plot(ratings_pred, service_rating, training_set, "Optimized model for 'service_rating' with training set", "optimized_images/opt_service_training.png")
plot(ratings_pred, service_rating, testing_set, "Optimized model for 'service_rating' with testing set", "optimized_images/opt_service_testing.png")
recommended = top_recommend(ratings_pred, restuarant_id_sort)
print("-----Top 10 recommended restaurant based on service rating-----")
print(recommended)
print("\n\n")

# Average of the three rating types
print("----RMSE optimized total average rating------")
ratings_pred = getRSME(total_ratings, training_set, total_restaurants, num_features, total_users, initial, testing_set, cost, Gradient)
plot(ratings_pred, total_ratings, training_set, "Optimized model for 'total_avg_rating' with training set", "optimized_images/opt_totalavg_training.png")
plot(ratings_pred, total_ratings, testing_set, "Optimized model for 'total_avg_rating' with testing set", "optimized_images/opt_totalavg_testing.png")
recommended = top_recommend(ratings_pred, restuarant_id_sort)
print("-----Top 10 recommended restaurant based on total average rating-----")
print(recommended)
print("\n\n")