def coClustering(trainset, testset):
    """Fit a default CoClustering model and score it on a held-out set.

    Args:
        trainset: Training data handed to ``CoClustering.fit``.
        testset: Test data handed to ``CoClustering.test``.

    Returns:
        A ``(rmse, mae, predictions)`` tuple, where ``predictions`` is
        whatever ``CoClustering.test`` returned for ``testset``.
    """
    # CoClustering
    rule = "-" * 5
    print(f"\n{rule} CoClustering algorithm using surprise package {rule}")

    model = CoClustering()
    model.fit(trainset)
    preds = model.test(testset)

    # RMSE is evaluated before MAE, as both helpers report to stdout.
    rmse_score = accuracy.rmse(preds)
    mae_score = accuracy.mae(preds)
    return rmse_score, mae_score, preds
def Coclustering(recipe_df, train_rating_df, pd, benchmark):
    """Train CoClustering on recipe ratings and run the evaluation suite.

    Args:
        recipe_df: Recipe table; joined to the ratings on ``recipe_id``.
        train_rating_df: Ratings table; joined to the recipes on
            ``recipe_id``.
        pd: Object providing ``merge`` (the caller passes it in explicitly).
        benchmark: Forwarded unchanged to ``Evaluators.RunAllEvals``.

    Returns:
        None. Results are reported by ``Evaluators.RunAllEvals``.
    """
    print("\n###### Compute CoClustering ######")

    # Inner join: only ratings whose recipe is present in recipe_df survive.
    merged = pd.merge(recipe_df, train_rating_df, on='recipe_id', how='inner')
    rating_columns = ['user_id', 'recipe_id', 'rating']
    dataset = Dataset.load_from_df(
        merged[rating_columns],
        Reader(rating_scale=(1, 5)),
    )

    # Fixed seed so the 80/20 split is the same on every run.
    train_part, test_part = train_test_split(
        dataset, test_size=.2, random_state=0
    )

    model = CoClustering()
    model.fit(train_part)
    Evaluators.RunAllEvals(model.test(test_part), benchmark)
def co_clustering(train, test, ids, Xtest, Xids):
    """Run the co-clustering model and collect its predictions for blending.

    The model is built with a single user cluster and a single item cluster,
    50 epochs and a fixed random state.

    Args:
        train: The trainset the model is fitted on.
        test: The testset the model is scored on.
        ids: Pair of indexable sequences; ``ids[0][k]`` and ``ids[1][k]``
            are the user and item identifiers of the k-th unknown rating.
        Xtest: List that receives this model's testset predictions
            (appended in place).
        Xids: List that receives this model's predictions for the unknown
            ratings (appended in place).

    Returns:
        ``(rmse, Xtest, Xids, preds_test, preds_ids)`` where ``rmse`` is the
        test RMSE, ``preds_test`` is a NumPy array of testset estimates and
        ``preds_ids`` is a list of estimates for the unknown ratings.
    """
    print('Co-clustering')
    model = CoClustering(n_cltr_u=1, n_cltr_i=1, n_epochs=50, random_state=15)

    # Fit on the training set.
    model.fit(train)

    # Training error: score the model on the ratings it was fitted on.
    train_predictions = model.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(train_predictions, verbose=False))

    # Test error.
    test_predictions = model.test(test)
    rmse = accuracy.rmse(test_predictions, verbose=False)
    print('   Test RMSE: ', rmse)

    preds_test = np.fromiter(
        (p.est for p in test_predictions),
        dtype=float,
        count=len(test_predictions),
    )

    # Estimates for the unknown ratings; raw ids are passed as strings.
    preds_ids = [
        model.predict(str(ids[0][k]), str(ids[1][k])).est
        for k in range(len(ids[0]))
    ]

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
rating_test[['userID', 'itemID', 'rating']], reader) trainset = rating_train2.build_full_trainset() testset = rating_test2.build_full_trainset().build_testset() #Co Clustering Model n_cltr_u = [10] # where default = 3 n_cltr_i = [10] # where default = 3 n_epochs = [20] # where default = 20 count = 1 for i in n_cltr_u: for j in n_cltr_i: for k in n_epochs: start = dt.datetime.today() print("================================================") algo = CoClustering(n_cltr_u=i, n_cltr_i=j, n_epochs=k) algo.train(trainset) print("This is the #" + str(count) + " parameter combination") predictions = algo.test(testset) print("n_cltr_u=" + str(i) + ", n_cltr_i=" + str(j) + ", n_epochs=" + str(k)) accuracy.rmse(predictions, verbose=True) accuracy.fcp(predictions, verbose=True) accuracy.mae(predictions, verbose=True) count = count + 1 end = dt.datetime.today() print("Runtime: " + str(end - start))
movieRecc2 = topMovies2[i] movieRawID2 = movieRecc2[0] movieName2 = movie[movieRawID2] print(str(i+1) + '. ' + movieName2 ) #############predictions using Co-Clustering print('') print('Making more recommendations...') algo3 = CoClustering() algo3.fit(trainset) predictions3 = algo3.test(testset) dictMovies3 = get_top_n(predictions3) topMovies3 = dictMovies3.get(672) print('') print('Here are the top 5 recommendations based on Co-Clustering! ') for i in range(5): movieRecc3 = topMovies3[i] movieRawID3 = movieRecc3[0] movieName3 = movie[movieRawID3] print(str(i+1) + '. ' + movieName3 ) ##################Evaluations of Algorithms: Precision and Recall
class Surprise():
    """Train a `surprise` recommender on recipe reviews and recommend recipes.

    After ``train`` has run, the instance holds:
        recipes, reviews: the loaded data frames.
        target: name of the rating column ('liked' or 'rating').
        s_min, s_max: bounds of the rating scale for ``target``.
        relevant_data: the ``user_id`` / ``recipe_id`` / ``target`` columns.
        algorithm: the fitted `surprise` algorithm.
    """

    # SVD hyper-parameters (tuned with the 'svd_grid' mode of train()).
    _SVD_PARAMS = {
        'n_epochs': 20,
        'n_factors': 100,
        'lr_all': 0.002,
        'reg_all': 0.02,
    }

    def _build_algorithm(self, algo):
        """Return a fresh, unfitted algorithm instance for the name ``algo``.

        Raises:
            ValueError: if ``algo`` is not one of the supported names.
        """
        if algo == 'NormalPredictor':
            return NormalPredictor()
        if algo == 'BaselineOnly':
            return BaselineOnly()
        if algo == 'KNNBasic':
            return KNNBasic()
        if algo == 'KNNWithMeans':
            return KNNWithMeans()
        if algo == 'KNNWithZScore':
            return KNNWithZScore()
        if algo == 'KNNBaseline':
            return KNNBaseline()
        if algo == 'SVD':
            # Unpack as keyword arguments: a positional dict would be taken
            # as the first constructor parameter (n_factors).
            return SVD(**self._SVD_PARAMS)
        if algo == 'SVDpp':
            return SVDpp()
        if algo == 'NMF':
            return NMF()
        if algo == 'SlopeOne':
            return SlopeOne()
        if algo == 'CoClustering':
            return CoClustering()
        raise ValueError(f"Unknown algorithm: {algo!r}")

    def train(self, algo='SVD', like=True, test='cv', local=False):
        """Load the data, then evaluate and fit the chosen algorithm.

        Args:
            algo: Name of the `surprise` algorithm to use (for example
                'SVD', 'KNNBasic', 'CoClustering').
            like: If True, learn the binary 'liked' column (scale 0-1);
                otherwise learn the 'rating' column (scale 1-5).
            test: Evaluation mode. 'cv' runs 5-fold cross-validation and then
                fits on all data; 'svd_grid' grid-searches SVD
                hyper-parameters and fits the best estimator on all data;
                any other value uses a 70/30 hold-out split and leaves the
                model fitted on the 70% part.
            local: If True, read the CSV files from ``data/preprocessed``
                next to this module; otherwise fetch them through
                ``storage.import_file``.

        Returns:
            ``(rmse, mae)`` for the selected evaluation mode.

        Raises:
            ValueError: if ``algo`` is not a supported algorithm name.
        """
        # Resolve the algorithm first so a misspelt name fails before any
        # data is loaded.
        self.algorithm = self._build_algorithm(algo)

        if local:
            csv_path = os.path.join(os.path.dirname(__file__), "data/preprocessed")
            self.recipes = pd.read_csv(f"{csv_path}/recipe_pp.csv")
            self.reviews = pd.read_csv(f"{csv_path}/review_pp.csv")
        else:
            self.recipes = storage.import_file('data/preprocessed', 'recipe_pp.csv')
            self.reviews = storage.import_file('data/preprocessed', 'review_pp.csv')

        if like:
            self.target = 'liked'
            self.s_min = 0
            self.s_max = 1
        else:
            self.target = 'rating'
            self.s_min = 1
            self.s_max = 5

        reader = Reader(rating_scale=(self.s_min, self.s_max))
        self.relevant_data = self.reviews[['user_id', 'recipe_id', self.target]]
        model_data = Dataset.load_from_df(self.relevant_data, reader)

        if test == 'cv':
            cv_results = cross_validate(self.algorithm,
                                        model_data,
                                        measures=['RMSE', 'MAE'],
                                        cv=5,
                                        verbose=True)
            rmse = np.round(cv_results['test_rmse'].mean(), 3)
            mae = np.round(cv_results['test_mae'].mean(), 3)

            # Cross-validation only scores the model; refit on everything.
            self.algorithm.fit(model_data.build_full_trainset())

        elif test == 'svd_grid':
            param_grid = {
                'n_epochs': [10, 20],
                'n_factors': [100, 200],
                'lr_all': [0.001, 0.002],
                'reg_all': [0.01, 0.02],
            }
            gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
            gs.fit(model_data)
            rmse = gs.best_score['rmse']
            mae = gs.best_score['mae']
            print(gs.best_params['rmse'], gs.best_params['mae'])

            # The grid search always tunes SVD, so its best estimator
            # replaces whatever `algo` selected.
            self.algorithm = gs.best_estimator['rmse']
            self.algorithm.fit(model_data.build_full_trainset())

        else:
            # Distinct local names so the `test` parameter is not shadowed.
            train_split, test_split = train_test_split(
                model_data, test_size=0.3, random_state=42
            )
            self.algorithm.fit(train_split)
            predictions = self.algorithm.test(test_split)
            rmse = np.round(accuracy.rmse(predictions), 3)
            mae = np.round(accuracy.mae(predictions), 3)

        return rmse, mae

    def predict(self, user_id):
        """Rank the recipes ``user_id`` has not reviewed by predicted score.

        Displays the user's existing reviews and the top 10 recommendations.

        Args:
            user_id: Identifier matched against the ``user_id`` column.

        Returns:
            A data frame with ``recipe_id``, ``name``, ``rec_<target>`` (the
            estimated rating, sorted descending) and ``rec_score`` (the
            estimate rescaled to 0-1 using ``s_min`` / ``s_max``).
        """
        is_user = self.relevant_data['user_id'] == user_id

        inputs = self.relevant_data[is_user] \
            .merge(self.recipes, on="recipe_id", how="left")[['recipe_id', 'name', self.target]]
        display(inputs)

        # A set gives O(1) membership tests while filtering the candidates.
        user_recipes = set(self.relevant_data[is_user].recipe_id.unique())
        recipe_list = self.relevant_data[~is_user].recipe_id.unique()

        predictions = [
            self.algorithm.predict(user_id, rec)
            for rec in recipe_list
            if rec not in user_recipes
        ]

        pdf = pd.DataFrame(predictions,
                           columns=[
                               'user_id', 'recipe_id', self.target,
                               f'rec_{self.target}', 'details'
                           ])
        pdf = pdf.drop(columns=[self.target, 'details'])
        pdf = pdf.sort_values(f'rec_{self.target}', ascending=False)

        rec_target = pdf[f'rec_{self.target}']
        pdf['rec_score'] = (rec_target - self.s_min) / (self.s_max - self.s_min)

        outputs = pdf.merge(self.recipes, on="recipe_id", how="left")[[
            'recipe_id', 'name', f'rec_{self.target}', 'rec_score'
        ]]
        display(outputs.head(10))

        return outputs
# We'll use the CoClustering algorithm.
from surprise import CoClustering

df_CoClustering = df_final_user_repo_star_v3.copy(deep=True)
dataCoClustering = Dataset.load_from_df(df_CoClustering, reader)

coClustering = CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=20)

# Train the algorithm on the full trainset, and predict ratings for every
# (user, item) pair that is absent from it.
trainsetcoClustering = dataCoClustering.build_full_trainset()
coClustering.fit(trainsetcoClustering)
testcoClustering = trainsetcoClustering.build_anti_testset()
predictionscoClustering = coClustering.test(testcoClustering)

# NOTE(review): an anti-testset contains only pairs with no known rating, so
# this RMSE is not measured against real ratings -- confirm this is the
# metric intended for the model comparison.
# Compute (and print) the RMSE once and reuse the value.
rmseCoClustering = accuracy.rmse(predictionscoClustering)
listOfRMSE.append(rmseCoClustering)
models.append('CoClustering')


# ## SlopeOne Implementation

# In[89]:


from surprise import SlopeOne

slopeOne = SlopeOne()