def svd_model(df, city='Scottsdale'):
    """Fit an SVD recommender on one city's reviews and report error metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'user_id', 'business_id', 'average_stars' and 'city'
        columns.
    city : str, optional
        City to filter the reviews to (default 'Scottsdale'; previously
        hard-coded).

    Returns
    -------
    tuple
        (surprise Dataset, held-out test RMSE, array of 5-fold CV RMSE scores)

    NOTE(review): Reader, Dataset, SVD, accuracy and cross_validate are
    assumed to be imported at module level elsewhere in this file -- confirm.
    """
    from surprise.model_selection.split import train_test_split

    # Restrict to a single city to keep the rating matrix dense enough
    # for matrix factorization.
    data = df[['user_id', 'business_id', 'average_stars']].loc[df.city == city]
    reader = Reader()
    data = Dataset.load_from_df(data, reader)

    # Hold out 25% of the ratings for an out-of-sample RMSE estimate.
    trainset, testset = train_test_split(data, test_size=0.25)
    algo = SVD()
    algo.fit(trainset)
    predictions = algo.test(testset)
    acc = accuracy.rmse(predictions)

    # Separately, cross-validate a fresh SVD (5 folds) on the full dataset.
    svd_cv = cross_validate(SVD(), data, cv=5)
    return data, acc, svd_cv['test_rmse']
def surprise_bench(df, cv=7):
    """Benchmark SVD, NMF, NormalPredictor and BaselineOnly with k-fold CV.

    Runs `cv`-fold cross-validation (default 7, matching the original code;
    the old docstring incorrectly said 5) over Scottsdale reviews and returns
    a DataFrame of mean RMSE/MAE and timing per algorithm, sorted by RMSE.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'user_id', 'business_id', 'average_stars' and 'city'.
    cv : int, optional
        Number of cross-validation folds (default 7).

    Returns
    -------
    pandas.DataFrame
        Indexed by algorithm name, sorted ascending by 'test_rmse'.
    """
    from surprise import SVD, NMF, NormalPredictor, BaselineOnly
    from surprise import Dataset
    from surprise import Reader
    from surprise.model_selection.validation import cross_validate

    data = df[['user_id', 'business_id', 'average_stars']].loc[df.city ==
                                                               'Scottsdale']
    reader = Reader()
    data = Dataset.load_from_df(data, reader)

    benchmark = []
    # Iterate over all algorithms
    for algorithm in [
            SVD(n_factors=10),
            NMF(n_factors=10),
            NormalPredictor(),
            BaselineOnly()
    ]:
        # Perform cross validation
        results = cross_validate(algorithm,
                                 data,
                                 measures=['RMSE', 'MAE'],
                                 cv=cv,
                                 verbose=False)
        # Get results & append algorithm name.
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        # Series.append was removed in pandas 2.0 -- use pd.concat instead.
        # type(...).__name__ replaces the fragile str()-parsing of the repr.
        name_row = pd.Series([type(algorithm).__name__], index=['Algorithm'])
        benchmark.append(pd.concat([tmp, name_row]))
    return pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
        'test_rmse')
def NMF_filter(ratings, dims):
    """Cross-validate unbiased NMF over candidate factor counts, plot errors.

    For each latent dimensionality in `dims`, runs 10-fold CV, records the
    mean RMSE and MAE, plots both curves, and prints the optimal factor
    counts.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Must contain 'userId', 'movieId' and 'rating' columns.
    dims : sequence of int
        Candidate numbers of latent factors to try.

    Returns
    -------
    tuple
        (best_dim_by_rmse, min_rmse, best_dim_by_mae, min_mae).
        (The original returned None; this is backward-compatible.)

    NOTE(review): Reader, Dataset, NMF, cross_validate and plt are assumed
    to be imported at module level elsewhere in this file -- confirm.
    """
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                                reader)
    RMSE = np.empty([len(dims)])
    MAE = np.empty([len(dims)])
    # Use None sentinels: the original used False, which would misbehave if a
    # mean error of exactly 0.0 were recorded (0.0 is falsy, so the "unset"
    # branch would re-trigger).
    min_RMSE = None
    min_MAE = None
    fac_num_RMSE = 0
    fac_num_MAE = 0
    for k in range(len(dims)):
        nmf = NMF(n_factors=dims[k], biased=False)
        cv = cross_validate(algo=nmf,
                            data=data,
                            measures=['RMSE', 'MAE'],
                            cv=10,
                            verbose=True)
        RMSE[k] = np.mean(cv['test_rmse'])
        if min_RMSE is None or RMSE[k] < min_RMSE:
            min_RMSE = RMSE[k]
            fac_num_RMSE = dims[k]
        MAE[k] = np.mean(cv['test_mae'])
        if min_MAE is None or MAE[k] < min_MAE:
            min_MAE = MAE[k]
            fac_num_MAE = dims[k]
    plt.plot(dims, RMSE)
    plt.plot(dims, MAE)
    plt.legend(['RMSE', 'MAE'])
    plt.show()
    print('Finishing Plotting...')
    print('For RMSE:')
    print('\t---Optimal number of latent factors is ', fac_num_RMSE)
    # Typo fix: "Minumun" -> "Minimum" in the two messages below.
    print('\t---Minimum Average RMSE is ', min_RMSE)
    print('\nFor MAE:')
    print('\t---Optimal number of latent factors is ', fac_num_MAE)
    print('\t---Minimum Average MAE is ', min_MAE)
    # Return the optima so callers can use them programmatically.
    return fac_num_RMSE, min_RMSE, fac_num_MAE, min_MAE
def nmf(dataName, data, biased=True):
    """Sweep NMF over the factor counts in `ks` and plot/record the errors.

    Parameters
    ----------
    dataName : str
        Label used in log messages and plot titles.
    data : surprise Dataset
        Ratings data to cross-validate on.
    biased : bool, optional
        Whether the NMF model uses baselines (default True).

    NOTE(review): this function reads the module-level sequence ``ks`` and
    writes results into the module-level arrays ``mae`` and ``rmse``, all of
    which must be defined elsewhere in this file -- confirm before reuse.
    It draws two subplots but does not call plt.show(); presumably the
    caller displays or saves the figure.
    """
    print('Start building NMF with ' + dataName + '!')
    for i, k in enumerate(ks):
        nmf = NMF(n_factors=k, biased=biased)
        # 10-fold cross-validation; mean test MAE/RMSE stored per factor count.
        scores = cross_validate(nmf, data, cv=10)
        mae[i] = scores['test_mae'].mean()
        rmse[i] = scores['test_rmse'].mean()
        print('k = ' + str(k) + ' finished!')
    # Two stacked subplots: MAE on top, RMSE below.
    plt.figure()
    plt.subplot(211)
    plt.plot(ks, mae)
    plt.xlabel('k')
    plt.ylabel('mean absolute error')
    plt.title('Mean absolute error vs. k of ' + dataName)
    plt.subplot(212)
    plt.plot(ks, rmse)
    plt.xlabel('k')
    plt.ylabel('root mean squared error')
    plt.title('Root mean squared error vs. k of ' + dataName)
    print('mae:')
    print(mae)
    print('rmse:')
    print(rmse)
    print('Finish building NMF with ' + dataName + '!')
def estimate(self, u, i):
    """Return the mean-of-means rating estimate for user `u` and item `i`.

    Averages the global mean with the user's and item's mean ratings,
    falling back gracefully when either (or both) is unseen.

    Parameters
    ----------
    u, i : hashable
        User id and item id as used in self.user_means / self.item_means.

    Returns
    -------
    float
        The averaged rating estimate.
    """
    known_user = u in self.user_means
    known_item = i in self.item_means
    if not known_user and not known_item:
        # Fix: the original fell through to self.item_means[i] here and
        # raised KeyError when both ids were unseen; the global mean is the
        # only information available in that case.
        return self.global_mean
    if not known_user:
        return np.mean([self.global_mean, self.item_means[i]])
    if not known_item:
        return np.mean([self.global_mean, self.user_means[u]])
    return np.mean(
        [self.global_mean, self.user_means[u], self.item_means[i]])


if __name__ == "__main__":
    # NOTE(review): in the original the guard line above was commented out
    # while the script body below was live; the guard is restored so that
    # importing this module does not read files or train as a side effect.
    df = pd.read_csv('ratings.csv')
    df.drop('timestamp', axis=1, inplace=True)
    reader = Reader()
    data = Dataset.load_from_df(df, reader)
    print("\nGlobal Mean...")
    algo = GlobalMean()
    cross_validate(algo, data)
    print("\nMeanOfMeans...")
    algo = MeanofMeans()
    cross_validate(algo, data)
    # Persist the last-fitted model (MeanofMeans) for later reuse.
    with open('model.sav', 'wb') as file:
        pickle.dump(algo, file)
if __name__ == "__main__":
    # Read data
    df = pd.read_csv(os.path.join(DATA_DIRECTORY, 'ratings.csv'))
    # Drop unneeded column 'timestamp'
    df.drop('timestamp', axis=1, inplace=True)
    # Load the data into the surprise format
    reader = Reader()
    data = Dataset.load_from_df(df, reader=reader)
    # Train ALS model
    print('Using ALS')
    # Baseline options: alternating least squares, 5 epochs, with separate
    # user (reg_u) and item (reg_i) regularization strengths.
    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
    # Hold out 25% of the ratings for an out-of-sample error estimate.
    trainset, testset = train_test_split(data, test_size=0.25)
    algo = BaselineOnly(bsl_options=bsl_options)
    predictions = algo.fit(trainset).test(testset)
    # Get the RMSE of our predictions
    rmse = accuracy.rmse(predictions)
    # Get the cross-validated RMSE of our predictions
    # (cross_validate refits the algorithm on each fold).
    cv_results = cross_validate(algo, data)
    cv_rmse = cv_results['test_rmse'].mean()
    print(f'CV RMSE: {cv_rmse}')
    # Get true values and predicted values for our test set
    # (r_ui is the true rating, est is the model's estimate).
    y_true = [x.r_ui for x in predictions]
    y_pred = [x.est for x in predictions]
# In order to fit surprise file_path = os.path.expanduser('ratings.csv') reader = Reader(line_format='user item rating', sep=',', skip_lines=1, rating_scale=(0.5, 5)) data = Dataset.load_from_file(file_path, reader=reader) acc_cv = np.zeros((2, 50)) sim_options = {'name': 'pearson'} i = 0 for k in range(2, 101, 2): algo = KNNWithMeans(k=k, sim_options=sim_options) cv1 = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=10, verbose=False) acc_cv[0, i] = np.mean(cv1['test_rmse']) acc_cv[1, i] = np.mean(cv1['test_mae']) print('test_rmse = %f, test_mae = %f' % (acc_cv[0, i], acc_cv[1, i])) i = i + 1 pass ks = np.arange(2, 101, 2) plt.xlabel('k') plt.ylabel('Error value') plt.title('Test RMSE and MAE vs k in KNN with 10 Validation') plt.plot(ks, acc_cv[0, :]) plt.plot(ks, acc_cv[1, :]) plt.legend(['RMSE', 'MAE'], loc='upper right')
restaurants_and_food = restoran[mask_restaurants & mask_food] # number of businesses that have food and restaurant in their category restaurants_and_food.drop_duplicates(subset='name', keep=False, inplace=True) review = reviews[['review_id', 'business_id', 'user_id']] combined_business_data = pd.merge(restaurants_and_food, review, on='business_id') print(combined_business_data.shape) print(combined_business_data[['name', 'categories', 'user_id']].head(50)) from surprise import Reader, Dataset, SVD from surprise.model_selection.validation import cross_validate reader = Reader() data = Dataset.load_from_df( combined_business_data[['user_id', 'business_id', 'stars']], reader) svd = SVD() # Run 5-fold cross-validation and print results cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True) trainset = data.build_full_trainset() svd.fit(trainset) r = restaurants_and_food.copy() r['Estimate_Score'] = r['business_id'].apply( lambda x: svd.predict('xIm6CP6pAqS3XQ7QF3Z89g', x).est) r = r.sort_values(by=['Estimate_Score'], ascending=False) print(r[['name', 'categories', 'stars', 'Estimate_Score']].head(10))
# Fit the SVD model on the training split and score the held-out split.
svd.fit(x_train)
prediction = svd.test(x_test)
accuracy.rmse(predictions=prediction)
# Top-10 recommendations per user, built from the hold-out predictions.
top_n = get_top_n(prediction, n=10)
# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])
# Cross-validation
from surprise.model_selection.validation import cross_validate
# Unbiased SVD (no baseline terms), 3-fold CV on RMSE and MAE.
algo = SVD(biased=False)
res = cross_validate(algo, data, measures=['rmse', 'mae'], cv=3,
                     return_train_measures=False, verbose=False)
print(res)
# Grid search
# GridSearchCV
from surprise.model_selection.search import GridSearchCV
# Search over epochs, learning rate and regularization; biased fixed False.
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6], 'biased': [False]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)
# best RMSE score
# --- tail of a mean-of-means ``estimate`` method; its ``def`` line (and the
# enclosing class) lie outside this chunk, so indentation is reconstructed.
    # Unknown user: average the global mean with the item's mean.
    if u not in self.user_means:
        return (np.mean([self.global_mean, self.item_means[i]]))
    # Unknown item: average the global mean with the user's mean.
    if i not in self.item_means:
        return (np.mean([self.global_mean, self.user_means[u]]))
    # Both known: average all three means.
    # NOTE(review): if BOTH u and i are unseen, the first branch raises
    # KeyError on item_means[i] -- confirm callers guarantee one is known.
    return (np.mean(
        [self.global_mean, self.user_means[u], self.item_means[i]]))


if __name__ == "__main__":
    # Benchmark both baselines on the bundled MovieLens 100k dataset.
    data = Dataset.load_builtin('ml-100k')
    print(data)
    print("\nGlobal Mean...")
    algo = GlobalMean()
    # Mean test RMSE across the cross-validation folds.
    print(np.mean(cross_validate(algo, data)['test_rmse']))
    print("\nMeanOfMeans...")
    algo = MeanofMeans()
    print(np.mean(cross_validate(algo, data)['test_rmse']))
    # print("\nGlobal Mean...")
    # algo = GlobalMean()
    # print('RMSE', {np.mean(cross_validate(algo, data)['test_rmse'])})
    # print("\nMeanOfMeans...")
    # algo = MeanofMeans()
    # print('RMSE'.format(), {np.mean(cross_validate(algo, data)['test_rmse'])})
    # NOTE(review): ``df`` is not defined anywhere in this chunk -- the line
    # below would raise NameError at runtime; confirm where df comes from.
    print(df.head())
for uid, iid, true_r, est, _ in predictions: top_n[uid].append((iid, est)) # Then sort the predictions for each user and retrieve the k highest ones. for uid, user_ratings in top_n.items(): user_ratings.sort(key=lambda x: x[1], reverse=True) user_cut = [i for i in user_ratings[:n] if i[1] > threshold] top_n[uid] = user_cut return top_n from surprise.model_selection.validation import cross_validate score = cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=False) scoredf = pd.DataFrame(score) mean_score = scoredf.mean() trainset = data.build_full_trainset() algo = SVD(n_factors=5, n_epochs=5, lr_all=0.005, reg_all=0.002) algo.fit(trainset) algo.pu # Than predict ratings for all pairs (u, i) that are NOT in the training set. testset = trainset.build_anti_testset() predictions = algo.test(testset) top_n_svd = get_top_n(predictions, n=3, threshold=2.5)
# In[15]: recommend('Iron Man') # In[16]: reader = Reader() df = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader) kf = KFold(n_splits=5) kf.split(df) # In[ ]: svd = SVD() cross_validate(svd, df, measures=['RMSE', 'MAE']) trainset = df.build_full_trainset() svd.fit(trainset) # In[ ]: ratings[ratings['userId'] == 10] # In[ ]: # smaller link file reload links_df = pd.read_csv('Dataset/links_small.csv') col = np.array(links_df['tmdbId'], np.int64) links_df['tmdbId'] = col
# --- tail of a fitting routine (e.g. ``fit``); its ``def`` line and the
# enclosing class lie outside this chunk, so indentation is reconstructed.
    # Precompute per-user and per-item mean ratings plus the global mean.
    user_means, item_means = {}, {}
    for user in np.unique(users):
        user_means[user] = ratings[users == user].mean()
    for item in np.unique(items):
        item_means[item] = ratings[items == item].mean()
    self.global_mean = ratings.mean()
    self.user_means = user_means
    self.item_means = item_means

    def estimate(self, u, i):
        """ return the mean of means estimate """
        # Unknown user: average the global mean with the item's mean.
        # NOTE(review): if BOTH u and i are unseen this raises KeyError on
        # item_means[i] -- confirm callers guarantee at least one is known.
        if u not in self.user_means:
            return (np.mean([self.global_mean, self.item_means[i]]))
        # Unknown item: average the global mean with the user's mean.
        if i not in self.item_means:
            return (np.mean([self.global_mean, self.user_means[u]]))
        # Both known: average all three means.
        return (np.mean(
            [self.global_mean, self.user_means[u], self.item_means[i]]))


if __name__ == "__main__":
    # Compare the two baselines on MovieLens 100k via cross-validation.
    data = Dataset.load_builtin('ml-100k')
    print("\nGlobal Mean...")
    algo = GlobalMean()
    glob = cross_validate(algo, data)
    print('RMSE: ', np.mean(glob['test_rmse']))
    print("\nMeanOfMeans...")
    algo = MeanofMeans()
    mom = cross_validate(algo, data)
    print('RMSE: ', np.mean(mom['test_rmse']))