def nmf(data, training, testing):
    '''
    Grid-search NMF hyper-parameters, then fit the best model and score it.

    Args:
        data(Dataset): the whole dataset divided into 5 folds
        training(Dataset): training dataset
        testing(Dataset): test dataset

    Returns:
        rmse: RMSE of NMF with optimized parameters
        top_n: whatever get_top_n returns for the test predictions with n=5
    '''
    # hyper-parameter values explored by the search
    candidates = {
        'n_factors': [45, 50, 55, 60],
        'n_epochs': [45, 50, 55],
    }

    # keep the combination with the best RMSE
    search = GridSearch(NMF, candidates, measures=['RMSE'], verbose=False)
    search.evaluate(data)
    best = search.best_params['RMSE']
    print('NMF:', best)

    # refit on the training data with the selected values
    model = NMF(n_factors=best['n_factors'], n_epochs=best['n_epochs'])
    model.train(training)

    # score on the held-out data
    predictions = model.test(testing)
    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)

    return rmse, top_n
def nmf_running_time(data, tuning_index=3):
    '''
    Calculates the running times for training and predictions for NMF

    The hyper-parameters are tuned once, on ``data[tuning_index]``, and then
    reused for every dataset in ``data``.

    Args:
        data(Dataset): a list of datasets with different numbers of users
        tuning_index(int): position in ``data`` of the dataset used for the
            grid search (default 3, the value that used to be hard-coded)

    Returns:
        elapsedtime_NMFtrain: running time for training, one entry per dataset
        elapsedtime_NMFtest: running time for predictions on testset,
            one entry per dataset
    '''
    elapsedtime_NMFtrain = []
    elapsedtime_NMFtest = []

    # tune the parameters once, on the selected dataset
    param_grid = {'n_factors': [45, 50, 55, 60], 'n_epochs': [45, 50, 55]}
    grid_search = GridSearch(NMF, param_grid, measures=['RMSE'], verbose=False)
    grid_search.evaluate(data[tuning_index])
    param = grid_search.best_params['RMSE']
    n_factors = param['n_factors']
    n_epochs = param['n_epochs']

    # using the tuned parameters calculate running times
    for dataset in data:
        # Build the train/test sets BEFORE starting the clock: constructing
        # the anti-testset is not part of training and used to inflate the
        # reported training time.
        training = dataset.build_full_trainset()
        testing = training.build_anti_testset()
        model = NMF(n_factors=n_factors, n_epochs=n_epochs)

        # training running time
        training_start = time.time()
        model.train(training)
        elapsedtime_NMFtrain.append(time.time() - training_start)

        # prediction running time
        test_start = time.time()
        model.test(testing)
        elapsedtime_NMFtest.append(time.time() - test_start)

    return elapsedtime_NMFtrain, elapsedtime_NMFtest
def _top_values_as_row(values, prefix, user_id, width=10):
    """Lay out up to *width* values as one row: id, <prefix>_1 .. <prefix>_<width>.

    Missing trailing positions are filled with None so the row always has the
    same columns, even when fewer than *width* values are available.
    """
    columns = ['{}_{}'.format(prefix, rank) for rank in range(1, width + 1)]
    values = list(values)[:width]
    padded = values + [None] * (width - len(values))
    return pd.DataFrame([[int(user_id)] + padded], columns=['id'] + columns)


def compute_recommendations(user_id, prediction_table, numeric_prediction_table):
    """Train NMF on all known ratings and store the top predictions for one user.

    Ratings are read from the ``ratings`` database table and from
    ``data/ratings.csv`` (whose ``movie_id`` column is treated as ``item_id``),
    an NMF model is trained on their union, and a rating is predicted for every
    item in the ``items`` table that the user has not rated yet.

    Three things are appended to the database:

    * ``prediction_table``: one row ``id, pred_1 .. pred_10`` holding the ten
      item ids with the highest predicted rating.
    * ``numeric_predictions``: the twenty best predictions in long form
      (``user_id, item_id, predicted_rating, algorithm``).
    * ``numeric_prediction_table``: one row ``id, num_1 .. num_10`` holding the
      predicted ratings matching ``pred_1 .. pred_10``.

    Args:
        user_id: id of the user to recommend for; must be convertible to int.
        prediction_table: name of the table receiving the item-id row.
        numeric_prediction_table: name of the table receiving the rating row.

    Returns:
        None. If the user has no unrated items nothing is written.
    """
    algo = 'NMF'
    algorithm = NMF()
    rating_columns = ['user_id', 'item_id', 'rating']

    engine = create_engine(config.DB_URI, echo=True)
    session = scoped_session(
        sessionmaker(bind=engine, autocommit=False, autoflush=False))
    try:
        # ratings stored in the database
        df_ratings = pd.read_sql('SELECT * FROM ratings;', con=engine)
        df_ratings = df_ratings[rating_columns].dropna().drop_duplicates()

        # ratings shipped as a CSV file
        df_ratings2 = pd.read_csv('data/ratings.csv', low_memory=False)
        df_ratings2 = df_ratings2.rename(columns={'movie_id': 'item_id'})
        df_ratings2 = df_ratings2[rating_columns].dropna().drop_duplicates()

        df_ratings = pd.concat([df_ratings, df_ratings2], axis=0)

        reader = Reader(line_format='user item rating', sep=',',
                        rating_scale=(1, 10))
        data = Dataset.load_from_df(df_ratings, reader=reader)
        algorithm.train(data.build_full_trainset())

        # candidate items = every known item the user has not rated yet
        items = pd.read_sql('SELECT distinct id FROM items;', con=engine)
        df_user_items = df_ratings.loc[df_ratings['user_id'] == user_id]
        rated_items = set(df_user_items.item_id.unique())
        prediction_items = [item for item in items.id.unique()
                            if item not in rated_items]
        if not prediction_items:
            # nothing left to recommend for this user
            return

        predicted_ratings = [algorithm.predict(user_id, item).est
                             for item in prediction_items]
        predictions = pd.DataFrame(
            {
                'user_id': [user_id] * len(prediction_items),
                'item_id': prediction_items,
                'prediction': predicted_ratings,
            },
            columns=['user_id', 'item_id', 'prediction'],
        )
        ranked = predictions.sort_values('prediction', ascending=False)
        top_10 = ranked.head(n=10)

        # wide row of the ten best item ids
        df_pred = _top_values_as_row(top_10['item_id'], 'pred', user_id)
        df_pred.to_sql(prediction_table, engine, if_exists='append',
                       index=False)
        session.commit()

        # long-form log of the twenty best predictions; copy so the new
        # column is not assigned onto a slice of `ranked`
        df_num_ratings = ranked.head(n=20).copy()
        df_num_ratings['algorithm'] = algo
        df_num_ratings = df_num_ratings.rename(
            columns={'prediction': 'predicted_rating'})
        df_num_ratings.to_sql('numeric_predictions', engine,
                              if_exists='append', index=False)
        session.commit()

        # wide row of the predicted ratings matching pred_1 .. pred_10
        df_num_ratings_transpose = _top_values_as_row(
            top_10['prediction'], 'num', user_id)
        df_num_ratings_transpose.to_sql(numeric_prediction_table, engine,
                                        if_exists='append', index=False)
        session.commit()
    finally:
        # a new engine is created on every call; release its connections
        session.remove()
        engine.dispose()
# Per-item estimate built from values computed outside this excerpt
# (bu, bi and global_mean are not defined here -- presumably user biases,
# item biases and the global mean rating; verify against the earlier code).
best_item_est_oma = np.mean(bu) + bi + global_mean

# Same quantity taken from a BaselineOnly model trained on data_full.
# NOTE(review): upstream Surprise configures BaselineOnly through a
# `bsl_options` dict -- confirm that reg_u/reg_i are accepted as keyword
# arguments by the version in use.
algo_baseline = BaselineOnly(reg_u=0, reg_i=0)
algo_baseline.train(data_full)
# Reads the trainset's private _global_mean attribute directly.
best_item_est = algo_baseline.trainset._global_mean + np.mean(
    algo_baseline.bu) + algo_baseline.bi

# NOTE: despite the "SVD" in the variable names, the model built here is a
# biased NMF.
algo_SVD = NMF(verbose=True, n_factors=5, n_epochs=50, reg_bu=0, reg_bi=0, reg_pu=0.1, reg_qi=0.1, biased=True)
algo_SVD.train(data_full)
best_item_est_svd = algo_SVD.trainset._global_mean + np.mean(
    algo_SVD.bu) + algo_SVD.bi

# Top panel: the rating matrix as an image, with a colour bar.
f, axarr = plt.subplots(nrows=2)
im1 = axarr[0].imshow(datamat_missing)
plt.colorbar(im1, ax=axarr[0])

# Bottom panel: NaN-ignoring mean along axis 1 of the matrix, overlaid with
# the three estimates computed above.
mean_rating = np.nanmean(datamat_missing, axis=1)
axarr[1].plot(mean_rating, marker='s')
axarr[1].plot(best_item_est_oma, marker='o')
axarr[1].plot(best_item_est, marker='<')
axarr[1].plot(best_item_est_svd, marker='>')
f.show()
# Build the train set from rating_train2 and turn every rating of
# rating_test2 into a test triple (both objects come from outside this excerpt).
trainset = rating_train2.build_full_trainset()
testset = rating_test2.build_full_trainset().build_testset()

#NMF Model
# Candidate values for each hyper-parameter; every combination is trained
# and evaluated below. Each list currently holds a single value.
n_factors=[15] # where default = 15
n_epochs=[50] # where default = 50
reg_pu=[0.06] # where default = 0.06
reg_qi=[0.06] # where default = 0.06
count=1  # running number of the combination being evaluated
for i in n_factors:
    for j in n_epochs:
        for k in reg_pu:
            for m in reg_qi:
                # time one full train + evaluate cycle
                start = dt.datetime.today()
                print("================================================")
                algo = NMF(n_factors=i, n_epochs=j, reg_pu=k, reg_qi=m, biased=True)
                algo.train(trainset)
                print("This is the #" + str(count) + " parameter combination")
                predictions=algo.test(testset)
                print("n_factors="+str(i)+", n_epochs="+str(j)+", reg_pu="+str(k)+", reg_qi="+str(m))
                # report RMSE, FCP and MAE for this combination
                accuracy.rmse(predictions, verbose=True)
                accuracy.fcp(predictions, verbose=True)
                accuracy.mae(predictions, verbose=True)
                count=count+1
                end = dt.datetime.today()
                print("Runtime: "+str(end - start))
# Drop the columns that are not needed for prediction.
del dfTest['date']
del dfTest['test_id']

# Set the rating scale and create the data for Surprise to use
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    dfRatings[['user_id', 'business_id', 'rating']], reader)

# Cross validation for tuning
# Split in 5 folds
data.split(5)

# This part is to use all the data to train and get the output
train_set = data.build_full_trainset()

# Use NMF with surprise
algo = NMF()
algo.train(train_set)

# Write one "test_id,rating" line per test row. The context manager closes
# the file even if a prediction raises; rows are walked positionally so the
# loop does not depend on dfTest having a default 0..n-1 index.
with open('PMFOutput.csv', 'w') as f:
    f.write("test_id,rating\n")
    for i, (user, business) in enumerate(
            zip(dfTest['user_id'], dfTest['business_id'])):
        # r_ui is only echoed in the verbose output; it does not affect est
        prediction = algo.predict(user, business, r_ui=4, verbose=True)
        predRating = prediction.est
        f.write(str(i) + "," + str(predRating) + '\n')
def compute_recommendations():
    """Train NMF on every stored rating and save top-10 lists for all users.

    Reads the ``ratings`` table, trains an NMF model on it, predicts every
    (user, item) pair that has no rating yet and keeps the ten highest
    estimates per user. The result is appended to two tables:

    * ``recommendations``: ``id, pred_1 .. pred_10``
    * ``predictionlogs``: ``user_id, pred_1 .. pred_10, algorithm``

    Users with fewer than ten candidate items get NULL in the unused columns.

    Side effects:
        Calls ``blockPrint()`` and sets the module globals ``mae2`` and
        ``rmse2`` to the MAE / RMSE reported by ``accuracy`` for the
        predictions.

        NOTE(review): the predictions come from ``build_anti_testset()``,
        whose "true" ratings are a fill value (the global mean by default in
        Surprise), so these two numbers measure distance from that fill value
        rather than real prediction error -- confirm this is intended.
    """
    global mae2
    global rmse2

    pred_columns = ['pred_%d' % rank for rank in range(1, 11)]

    # connecting to the database
    engine = create_engine(config.DB_URI, echo=True)
    session = scoped_session(
        sessionmaker(bind=engine, autocommit=False, autoflush=False))
    try:
        blockPrint()

        # reading in the database
        df_ratings = pd.read_sql('SELECT * FROM ratings;', con=engine)
        df_ratings = df_ratings[['user_id', 'item_id', 'rating']]
        df_ratings = df_ratings.dropna().drop_duplicates()

        # formatting the dataset using the surprise library
        reader = Reader(line_format='user item rating', sep=',',
                        rating_scale=(1, 5))
        data = Dataset.load_from_df(df_ratings, reader=reader)
        training_set = data.build_full_trainset()

        algorithm = NMF()  # non-negative matrix factorization
        algorithm.train(training_set)  # fit the data to the model

        # predict every (user, item) pair without a known rating
        testing_set = training_set.build_anti_testset()
        predictions = algorithm.test(testing_set)

        def get_top_n(predictions, n=10):
            # Return the top-N recommendation for each user from a set of
            # predictions.
            # Args:
            #     predictions(list of Prediction objects): The list of
            #         predictions, as returned by the test method of an
            #         algorithm.
            #     n(int): The number of recommendation to output for each
            #         user. Default is 10.
            # Returns:
            #     A dict where keys are user (raw) ids and values are lists
            #     of tuples: [(raw item id, rating estimation), ...] of
            #     size n.

            # First map the predictions to each user.
            top_n = defaultdict(list)
            for uid, iid, true_r, est, _ in predictions:
                top_n[uid].append((iid, est))

            # Then sort the predictions for each user and keep the n
            # highest ones.
            for uid, user_ratings in top_n.items():
                user_ratings.sort(key=lambda x: x[1], reverse=True)
                top_n[uid] = user_ratings[:n]

            return top_n

        # getting the top 10 predictions
        top_n = get_top_n(predictions, n=10)

        # one row per user: id followed by the recommended item ids, padded
        # with None so every row has exactly ten prediction columns
        rows = []
        for uid, user_ratings in top_n.items():
            item_ids = [iid for (iid, _) in user_ratings]
            padding = [None] * (len(pred_columns) - len(item_ids))
            rows.append([int(uid)] + item_ids + padding)
        df_pred = pd.DataFrame(rows, columns=['id'] + pred_columns)

        # Append recommendations
        df_pred.to_sql('recommendations', engine, if_exists='append',
                       index=False)
        session.commit()

        # logging the predictions (rename returns a copy, so df_pred itself
        # is left untouched)
        df_log = df_pred.rename(columns={'id': 'user_id'})
        df_log['algorithm'] = 'NMF'
        df_log = df_log[['user_id'] + pred_columns + ['algorithm']]
        df_log.to_sql('predictionlogs', engine, if_exists='append',
                      index=False)
        session.commit()

        mae2 = float(accuracy.mae(predictions))
        rmse2 = float(accuracy.rmse(predictions))
    finally:
        # a new engine is created on every call; release its connections
        session.remove()
        engine.dispose()
class MovieRecommender:
    """Movie recommender combining an item-based KNN model with NMF.

    ``initialize`` loads the ratings and starts training on a background
    thread; until training has finished (``initialized`` is True) every query
    method returns an empty result and ``update_user_ratings`` does nothing.
    """

    def __init__(self):
        self._data = None
        self._knn = None
        self._nmf = None
        self._trainset = None
        self._predictions = None
        self.initialized = False

    def initialize(self, data_filepath):
        """Load ratings from *data_filepath* and start training in the background.

        Args:
            data_filepath: path of a ratings file in the 'ml-100k' format.
        """
        self._data = Dataset.load_from_file(data_filepath,
                                            reader=Reader('ml-100k'))
        self._trainset = self._data.build_full_trainset()
        sim_options = {'name': 'pearson_baseline', 'user_based': False}
        self._knn = KNNBaseline(sim_options=sim_options)
        self._nmf = NMF()
        # start_new_thread requires the argument tuple, even when it is empty
        start_new_thread(self._train, ())

    def get_similar_movies(self, movie_id, k=10):
        """Return the *k* nearest neighbours of *movie_id* from the KNN model.

        Args:
            movie_id: raw movie id as known to the trainset.
            k: number of neighbours to return.

        Returns:
            The result of ``movie_dataset.get_movies`` for the neighbours, or
            an empty list while the models are not trained yet.
        """
        if not self.initialized:
            return []

        model = self._knn
        trainset = model.trainset
        neighbor_inner_ids = model.get_neighbors(
            trainset.to_inner_iid(movie_id), k=k)
        movie_ids = [
            trainset.to_raw_iid(inner_id).encode('ascii')
            for inner_id in neighbor_inner_ids
        ]
        return movie_dataset.get_movies(movie_ids)

    def get_similar_movies_for_user(self, user_id, num_movies=10):
        """Return the movies with the highest NMF estimate for *user_id*.

        Args:
            user_id: user id; compared to the predictions as a string.
            num_movies: maximum number of movies to return.

        Returns:
            The result of ``movie_dataset.get_movies`` for the best predicted
            movies, or an empty list while the models are not trained yet.
        """
        if not self.initialized:
            return []

        user_id = str(user_id)
        user_predictions = [
            prediction for prediction in self._predictions
            if prediction.uid == user_id
        ]
        best_first = sorted(user_predictions, key=lambda p: p.est,
                            reverse=True)
        movie_ids = [
            prediction.iid.encode('ascii')
            for prediction in best_first[:num_movies]
        ]
        return movie_dataset.get_movies(movie_ids)

    def update_user_ratings(self, user_id, movie_id, rating):
        """Record a rating (replacing any earlier one) and retrain the models.

        Does nothing while the models are not trained yet.

        Args:
            user_id: id of the rating user (stored as a string).
            movie_id: id of the rated movie (stored as a string).
            rating: the rating; must be convertible to float.
        """
        if not self.initialized:
            return

        user_id = str(user_id)
        movie_id = str(movie_id)
        new_rating = (user_id, movie_id, float(rating), time())

        # Look for an earlier rating directly in the raw ratings, which are
        # keyed by the same raw string ids. (The trainset's `ur` mapping uses
        # inner ids, so checking raw ids against it could drop or duplicate
        # ratings.)
        raw_ratings = self._data.raw_ratings
        for index, existing in enumerate(raw_ratings):
            if existing[0] == user_id and existing[1] == movie_id:
                raw_ratings[index] = new_rating
                break
        else:
            raw_ratings.append(new_rating)

        self._trainset = self._data.build_full_trainset()
        self._train()

    def _train(self):
        """Train both models on the current trainset and refresh predictions."""
        self._nmf.train(self._trainset)
        self._knn.train(self._trainset)
        self._predictions = self._nmf.test(
            self._trainset.build_anti_testset())
        # only now may the public methods serve results
        self.initialized = True
from surprise import NMF, Dataset, Reader, evaluate

# Reader for tab-separated lines laid out as: user, item, rating, timestamp.
data_reader = Reader(line_format="user item rating timestamp", sep="\t")

# Load the ratings file (u.data) through that reader.
data = Dataset.load_from_file("./ml-100k/u.data", reader=data_reader)

# Prepare 5 folds for cross validation.
data.split(n_folds=5)

# Cross-validate NMF, reporting RMSE and MAE.
algorithm = NMF()
evaluate(algorithm, data, measures=["RMSE", "MAE"])

# Refit on the complete data set.
training_set = data.build_full_trainset()
algorithm.train(training_set)

# Predict one specific user/movie pair and print it next to the known rating.
user_id, item_id = "200", "222"
actual_rating = 5

print(algorithm.predict(user_id, item_id, actual_rating))
grid_search = GridSearch(NMF, param_grid_NMF, measures=['RMSE']) # Evaluate performances of our algorithm on the dataset. grid_search.evaluate(data) print('best: ' + str(grid_search.best_score['RMSE'])) # combination of parameters that gave the best FCP score print('best params: ' + str(grid_search.best_params['RMSE'])) params = grid_search.best_params['RMSE'] algo_NMF = NMF(verbose=True, n_factors=params['n_factors'], n_epochs=params['n_epochs'], biased=params['biased']) algo_NMF.train(data_full) #%% SVD param_grid_SVD = { 'n_factors': [5, 10, 20], 'n_epochs': [70], 'lr_all': [0.005, 0.003, 0.001], 'reg_all': [0.005, 0.01, 0.02] } #grid_search = GridSearch(SVDpp, param_grid, measures=['RMSE', 'MAE']) grid_search = GridSearch(SVD, param_grid_SVD, measures=['RMSE']) # Evaluate performances of our algorithm on the dataset. grid_search.evaluate(data)