def use_als():
    """Train an ALS-optimised ``BaselineOnly`` model on ml-100k and time it.

    The model is fitted on the full built-in ``ml-100k`` trainset and then
    asked to predict the anti-testset derived from that same trainset.

    Returns:
        list: ``[rmse, mae, elapsed_seconds]``; the elapsed time covers
        loading, fitting, predicting and computing both metrics.
    """
    started_at = time.time()

    full_trainset = Dataset.load_builtin('ml-100k').build_full_trainset()
    print('Using ALS')

    als_model = BaselineOnly(
        bsl_options={'method': 'als', 'n_epochs': 20, 'reg_u': 12, 'reg_i': 5}
    )
    als_model.fit(full_trainset)

    # Every (user, item) pair that has no rating in the trainset.
    als_predictions = als_model.test(full_trainset.build_anti_testset())
    rmse_score = accuracy.rmse(als_predictions)
    mae_score = accuracy.mae(als_predictions)

    return [rmse_score, mae_score, time.time() - started_at]
def use_sgd():
    """Train an SGD-optimised ``BaselineOnly`` model on ml-100k and time it.

    The model is fitted on the full built-in ``ml-100k`` trainset and then
    asked to predict the anti-testset derived from that same trainset.

    Returns:
        list: ``[rmse, mae, elapsed_seconds]``; the elapsed time covers
        loading, fitting, predicting and computing both metrics.
    """
    started_at = time.time()

    full_trainset = Dataset.load_builtin('ml-100k').build_full_trainset()
    print('Using SGD')

    sgd_model = BaselineOnly(
        bsl_options={'method': 'sgd', 'learning_rate': .005}
    )
    sgd_model.fit(full_trainset)

    # Every (user, item) pair that has no rating in the trainset.
    sgd_predictions = sgd_model.test(full_trainset.build_anti_testset())
    rmse_score = accuracy.rmse(sgd_predictions)
    mae_score = accuracy.mae(sgd_predictions)

    return [rmse_score, mae_score, time.time() - started_at]
def test_dump():
    """Round-trip predictions and a fitted algorithm through ``dump``.

    A ``BaselineOnly`` model is trained on the predefined u1 fold, its
    predictions and the model itself are dumped to a temporary file, and both
    the reloaded predictions and the predictions made by the reloaded model
    must equal the original predictions.
    """
    random.seed(0)

    here = os.path.dirname(__file__)
    folds = [(os.path.join(here, './u1_ml100k_train'),
              os.path.join(here, './u1_ml100k_test'))]
    dataset = Dataset.load_from_folds(folds, Reader('ml-100k'))
    trainset, testset = next(PredefinedKFold().split(dataset))

    model = BaselineOnly()
    model.fit(trainset)
    expected = model.test(testset)

    with tempfile.NamedTemporaryFile() as handle:
        dump.dump(handle.name, expected, model)
        reloaded_predictions, reloaded_model = dump.load(handle.name)

        predictions_from_reloaded = reloaded_model.test(testset)
        assert expected == reloaded_predictions
        assert expected == predictions_from_reloaded
def surprise_baseline(train_file, test_file):
    """
    Baseline with Surprise library.

    Compute the predictions on a test_set after training on a train_set
    using the method Baseline from Surprise.

    Args:
        train_file (string): path to created train file
        test_file (string): path to created test file
    Hyperparameters:
        -
    Returns:
        numpy array: predictions
    """
    print("baseline")
    algo = BaselineOnly()
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()

    # Only one (train, test) pair is registered above, so this loop body is
    # expected to run once.
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)
        # Predict
        predictions = algo.test(testset)

    # Keep only the estimated rating of each prediction, as a float array.
    pred = np.array([prediction.est for prediction in predictions],
                    dtype=float)
    return pred
def normalize_affinity_scores_by_user_item_bs(user_item_affinities: List[Tuple[str, str, float]], rating_scale=(1, 5)) \
        -> Tuple[float, Dict[str, float], Dict[str, float], float, List[Tuple[str, str, float]]]:
    """Remove the baseline (global mean + user bias + item bias) from affinities.

    A Surprise ``BaselineOnly`` model (SGD) is fitted on all the given
    affinities and then used to predict those same affinities; the residual
    ``actual - estimated`` is what is returned for each triple.

    Args:
        user_item_affinities: ``(user, item, affinity)`` triples.
        rating_scale: ``(low, high)`` scale handed to the Surprise ``Reader``.

    Returns:
        A 5-tuple ``(mean, bu, bi, spread, residuals)``:
        ``mean`` is the global mean of the fitted trainset; ``bu`` / ``bi``
        map raw user / item ids to their fitted bias and yield ``0.0`` for
        unknown ids; ``spread`` is ``max(max residual, |min residual|)``;
        ``residuals`` is the list of ``(user, item, residual)`` triples.
    """
    train = pd.DataFrame(user_item_affinities)
    reader = Reader(rating_scale=rating_scale)
    trainset = Dataset.load_from_df(train, reader).build_full_trainset()
    trainset_for_testing = trainset.build_testset()
    algo = BaselineOnly(bsl_options={'method': 'sgd'})
    algo.fit(trainset)
    predictions = algo.test(trainset_for_testing)

    mean = algo.trainset.global_mean
    # Biases are stored by inner id; re-key them by raw id.
    bu = {
        u: algo.bu[algo.trainset.to_inner_uid(u)]
        for u in {u for u, _, _ in user_item_affinities}
    }
    bi = {
        i: algo.bi[algo.trainset.to_inner_iid(i)]
        for i in {i for _, i, _ in user_item_affinities}
    }

    residuals = pd.DataFrame(
        [[p.uid, p.iid, p.r_ui - p.est] for p in predictions],
        columns=["user", "item", "rating"])
    spread = max(residuals["rating"].max(), np.abs(residuals["rating"].min()))
    uid = list(zip(residuals['user'], residuals['item'], residuals['rating']))

    # Unknown users / items fall back to a zero bias.
    bu = defaultdict(float, bu)
    bi = defaultdict(float, bi)
    return mean, bu, bi, spread, uid
def baseline_bias_model(df):
    """Fit a bias-only (``BaselineOnly``) model on a random train/test split.

    The ``date`` and ``text`` columns are dropped from ``df`` before the
    remaining columns are loaded into Surprise with a 1-5 rating scale.

    Returns:
        tuple: ``(trainset, testset, predictions, dusers, ditems)`` where
        ``dusers`` / ``ditems`` are the raw-id -> inner-id mappings of the
        *full* trainset (built from all rows, not from the split).
    """
    ratings_only = df.drop(columns=['date', 'text'])
    rating_reader = Reader(rating_scale=(1, 5))
    dataset = surprise.dataset.Dataset.load_from_df(df=ratings_only,
                                                    reader=rating_reader)

    # Id mappings come from the trainset built over every rating.
    full_trainset = dataset.build_full_trainset()
    dusers = full_trainset._raw2inner_id_users
    ditems = full_trainset._raw2inner_id_items

    trainset, testset = train_test_split(dataset)
    model = BaselineOnly()
    model.fit(trainset)
    predictions = model.test(testset)

    print('\n')
    return trainset, testset, predictions, dusers, ditems
def baseline(trainset, testset):
    """Fit ``BaselineOnly`` on ``trainset`` and score it on ``testset``.

    RMSE and MAE are computed through ``accuracy`` (their return values are
    discarded here).

    Returns:
        The predictions produced by ``algo.test(testset)``.
    """
    model = BaselineOnly()
    model.fit(trainset)

    print("Predictions")
    test_predictions = model.test(testset)
    for metric in (accuracy.rmse, accuracy.mae):
        metric(test_predictions)
    return test_predictions
class ALSModelSurprise(ALSModel):
    """ALS baseline model backed by Surprise's ``BaselineOnly`` algorithm.

    ``self.params`` (presumably set by the parent ``ALSModel`` constructor --
    confirm there) is handed to Surprise as the ``bsl_options`` dictionary.
    """

    def __init__(self, params):
        super().__init__(params)
        self.algo = BaselineOnly(bsl_options=self.params)

    def parse_data(self, ratings):
        """Load a ratings DataFrame into a Surprise dataset (scale 1-5)."""
        reader = Reader(rating_scale=(1, 5))
        self.data = Dataset.load_from_df(ratings, reader)

    def update_parameters(self):
        """Point the underlying algorithm at the current ``self.params``."""
        self.algo.bsl_options = self.params

    def fit(self):
        """Fit the algorithm on the full trainset built from ``self.data``."""
        self.train = self.data.build_full_trainset()
        self.algo.fit(self.train)

    def predict(self, uid, iid):
        '''
        Return the estimated rating for the raw ids ``uid`` and ``iid``.

        uid, iid should be consistent with ratings['UID','IID']
        '''
        return self.algo.predict(uid, iid).est

    def top_n_recommendations(self, uid, n=5):
        '''
        Obtain the top n recommendation for any user. Method for the
        surprise library

        Every item of the fitted trainset is scored for ``uid``.

        Returns:
            tuple: ``(item_ids, estimates)``, both ordered by decreasing
            estimate and at most ``n`` long.
        '''
        scores = []
        for inner_iid in range(self.train.n_items):
            iid = self.train.to_raw_iid(inner_iid)
            scores.append((iid, self.predict(uid, iid)))
        scores.sort(key=lambda pair: pair[1], reverse=True)

        best = scores[:n]
        top_n_iid = [iid for iid, _ in best]
        pred = [estimate for _, estimate in best]
        return top_n_iid, pred

    def cross_validate(self, cv=5, verbose=False):
        """Return the mean test RMSE of a ``cv``-fold cross-validation."""
        # Inside the method body the bare name resolves to the module-level
        # ``cross_validate`` function, not to this method.
        cv_result = cross_validate(self.algo, self.data,
                                   cv=cv, verbose=verbose)
        rmse = cv_result['test_rmse'].mean()
        return rmse

    def grid_search(self):
        """Search ``n_epochs`` x ``reg_u`` x ``reg_i`` by 5-fold CV RMSE.

        The best RMSE and a snapshot of the parameters that produced it are
        stored in ``self._best_rmse`` and ``self._best_params``.
        ``self.params`` is left at the last combination tried.
        """
        import copy

        # Snapshot rather than alias: if ``set_params`` updates
        # ``self.params`` in place, an alias would always show the *last*
        # combination tried instead of the best one.
        self._best_params = copy.deepcopy(self.params)
        self._best_rmse = self.cross_validate(cv=5)
        for n_epochs in [5, 10, 15, 20, 25]:
            for reg_u in [5, 10, 15, 20]:
                for reg_i in [5, 10, 15]:
                    self.set_params(n_epochs=n_epochs, reg_u=reg_u,
                                    reg_i=reg_i)
                    rmse = self.cross_validate(cv=5)
                    print(n_epochs, reg_u, reg_i, rmse)
                    if rmse < self._best_rmse:
                        self._best_rmse = rmse
                        self._best_params = copy.deepcopy(self.params)
def baseline(trainset, testset):
    """Fit ``BaselineOnly`` on ``trainset`` and evaluate it on ``testset``.

    Returns:
        tuple: ``(rmse, mae, predictions)``.
    """
    banner = "-" * 5
    print("\n" + banner + " Baseline algorithm using surprise package " + banner)

    model = BaselineOnly()
    model.fit(trainset)
    test_predictions = model.test(testset)

    rmse_score = accuracy.rmse(test_predictions)
    mae_score = accuracy.mae(test_predictions)
    return rmse_score, mae_score, test_predictions
def fit(self, train):
    """ Fit an ALS-optimised ``BaselineOnly`` on ``train``; keep it in ``self.model`` """
    als_options = {
        'method': 'als',
        'n_epochs': 25,
        'reg_u': 5,
        'reg_i': 3
    }
    fitted = BaselineOnly(bsl_options=als_options)
    fitted.fit(train)
    # Only publish the model once fitting has succeeded.
    self.model = fitted
def predict(trainset):
    """Predict ratings for every (user, item) pair absent from ``trainset``.

    A ``BaselineOnly`` model configured by the module-level ``bsl_options``
    is fitted on ``trainset`` and tested on its anti-testset.

    Returns:
        The predictions returned by ``algo.test``.
    """
    print("Training the model for prediction .....")
    model = BaselineOnly(bsl_options=bsl_options)
    # Pairs (u, i) that are NOT in the training set.
    unseen_pairs = trainset.build_anti_testset()
    return model.fit(trainset).test(unseen_pairs)
def predict(path):
    """Recommend with a baseline model and an SVD model, then pickle both.

    The ratings file at ``path`` (comma separated ``user,item,rating`` with
    one header line) is loaded, both models are fitted on all of it, and each
    model predicts the anti-testset. The top 100 per user (via ``get_top_n``)
    of each model are written as ``[bsl_pred, svd_pred]`` to
    ``baseline_predictions.pickle`` using pickle protocol 2.

    Args:
        path: location of the ratings file.
    """
    # Read the data and turn it into a full Surprise trainset.
    reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
    data = Dataset.load_from_file("{}".format(path), reader=reader)
    all_train = data.build_full_trainset()

    bsl = BaselineOnly()
    svd = SVD()
    bsl.fit(all_train)
    svd.fit(all_train)

    all_test = all_train.build_anti_testset()
    bsl_predictions = bsl.test(all_test)
    bsl_pred = get_top_n(bsl_predictions, 100)
    # Use the SVD model here; previously the baseline model was tested twice,
    # so the "SVD" recommendations were a copy of the baseline ones.
    svd_predictions = svd.test(all_test)
    svd_pred = get_top_n(svd_predictions, 100)

    # The ``with`` block closes the file; no explicit close() is needed.
    with open("baseline_predictions.pickle", "wb") as f:
        pickle.dump([bsl_pred, svd_pred], f, protocol=2)
    print("Done recommending using baseline model and SVD model.")
def baseline_only(train, test, ids, Xtest, Xids):
    """
    Fit an ALS ``BaselineOnly`` model and collect its predictions for blending.

    Argument : train, the trainset
               test, the testset
               ids, unknown ratings; ``ids[0][k]`` / ``ids[1][k]`` are passed
                    (as strings) as the user / item of the k-th pair
               Xtest, list that receives the testset predictions (mutated)
               Xids, list that receives the unknown-rating predictions
                     (mutated)

    Returns ``(rmse, Xtest, Xids, preds_test, preds_ids)`` where ``rmse`` is
    the test RMSE.
    """
    print('Baseline Only')
    model = BaselineOnly(
        bsl_options={
            'method': 'als',
            'n_epochs': 100,
            'reg_u': 15,
            'reg_i': 0.01
        },
        verbose=False)

    # Train algorithm on training set
    model.fit(train)

    # Training error: predict the very ratings the model was fitted on
    train_predictions = model.test(train.build_testset())
    print(' Training RMSE: ', accuracy.rmse(train_predictions, verbose=False))

    # Test error
    test_predictions = model.test(test)
    rmse = accuracy.rmse(test_predictions, verbose=False)
    print(' Test RMSE: ', rmse)
    preds_test = np.array([p.est for p in test_predictions], dtype=float)

    # Predict unknown ratings
    preds_ids = [
        model.predict(str(ids[0][k]), str(ids[1][k])).est
        for k in range(len(ids[0]))
    ]

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
def test_dump(u1_ml100k):
    """Round-trip predictions and a fitted algorithm through ``dump``.

    A ``BaselineOnly`` model is trained on the first predefined fold of the
    ``u1_ml100k`` dataset, its predictions and the model itself are dumped to
    a temporary file, and both the reloaded predictions and the predictions
    made by the reloaded model must equal the original predictions.
    """
    random.seed(0)

    splits = PredefinedKFold().split(u1_ml100k)
    trainset, testset = next(splits)

    model = BaselineOnly()
    model.fit(trainset)
    expected = model.test(testset)

    with tempfile.NamedTemporaryFile() as handle:
        dump.dump(handle.name, expected, model)
        reloaded_predictions, reloaded_model = dump.load(handle.name)

        predictions_from_reloaded = reloaded_model.test(testset)
        assert expected == reloaded_predictions
        assert expected == predictions_from_reloaded
class BaseLineRecommender(object):
    """
    Thin wrapper around surprise's BaselineOnly algorithm, used as the
    baseline of prediction. ``model`` is ``None`` until ``fit`` is called.
    """

    def __init__(self):
        self.model = None

    def fit(self, train):
        """ Create an SGD BaselineOnly model and fit it on ``train`` """
        sgd_options = {
            'method': 'sgd',
            'n_epochs': 30,
            'reg': 0.01,
            'learning_rate': 0.01
        }
        self.model = BaselineOnly(bsl_options=sgd_options)
        self.model.fit(train)

    def predict(self, user_id, item_id):
        """ Return the model's prediction for one (user, item) pair """
        prediction = self.model.predict(user_id, item_id)
        return prediction

    def rmse(self, test):
        """ RMSE of the model's predictions on ``test`` """
        return accuracy.rmse(self.model.test(test))

    def mae(self, test):
        """ MAE of the model's predictions on ``test`` """
        return accuracy.mae(self.model.test(test))
def train(trainset, testset):
    """
    Fit a ``BaselineOnly`` model and predict the ratings of ``testset``.

    The algorithm is configured by the module-level ``bsl_options``.

    :param trainset: the train set the model is fitted on
    :param testset: the test set whose ratings are predicted
    :return: the predictions returned by the fitted model's ``test`` call
    """
    print("Training the model for prediction ....")

    # BaselineOnly was selected because it gave the best rmse.
    model = BaselineOnly(bsl_options=bsl_options)
    return model.fit(trainset).test(testset)
def baseline_only(self):
    """
    Predict the test data with a default ``BaselineOnly`` model.

    The model is fitted on ``self.train_data`` and its estimates for
    ``self.test_data`` are written to the ``Rating`` column of a copy of
    ``self.data.test_df``. When ``self.test_purpose`` is truthy the estimates
    are also passed to ``self.evalueate_model``.

    Returns:
        predictions_df: The predictions of the model on the test data in
            Pandas Data Frame format
    """
    estimates = [
        prediction.est
        for prediction in BaselineOnly().fit(self.train_data).test(self.test_data)
    ]

    predictions_df = self.data.test_df.copy()
    predictions_df['Rating'] = estimates

    if not self.test_purpose:
        return predictions_df
    self.evalueate_model(predictions_df['Rating'], 'Surprise baseline_only')
    return predictions_df
def als_predictions(trainset, dataset_test):
    """Predict each row of ``dataset_test`` with an ALS ``BaselineOnly`` model.

    For every row ``x`` the model predicts ``(x[0], x[1])``; unless the
    module-level ``mode`` equals ``'test'``, ``x[2]`` is also passed as the
    third argument of ``predict``.

    Returns:
        list: one tuple per row holding the first four fields of the
        prediction.
    """
    als_options = {
        'method': 'als',
        'n_epochs': 30,
        'reg_u': 6,
        'reg_i': 4
    }
    fitted = BaselineOnly(bsl_options=als_options).fit(trainset)

    rows = []
    for record in dataset_test:
        if mode == 'test':
            prediction = fitted.predict(record[0], record[1])
        else:
            prediction = fitted.predict(record[0], record[1], record[2])
        rows.append(tuple(prediction[:4]))
    return rows
Train_CV.cv_results['mean_test_rmse'], '.k') plt.xscale('log') plt.xlabel('Regularization Parameter ($\lambda$) - bu') plt.ylabel('RMSE') plt.grid() plt.title('3-Fold CV - Regularization Parameter ($\lambda$) - bu') plt.savefig('3_fold_CV_Reg_Param_Baseline_bu.png') # %% Best Hyper-parameters Training alg = BaselineOnly(bsl_options= Grid_Search_Result.best_params['rmse']['bsl_options']) start = time.time() alg.fit(data_train.build_full_trainset()) end = time.time() print("***********************************************") print("Exe time:") print(end - start) # %% Loading Test Data file_path = "Data/sample_submission.csv" data_test = utils.load_data_desired(file_path) # %% Prediction Predict_Test = [] for line in data_test: Predict_Test.append(alg.predict(str(line[1]), str(line[0])).est)
if __name__ == "__main__":
    # Load the ratings and discard the 'timestamp' column, which is not used
    df = pd.read_csv(os.path.join(DATA_DIRECTORY, 'ratings.csv'))
    df.drop(columns='timestamp', inplace=True)

    # Wrap the DataFrame in a surprise dataset (default reader settings)
    reader = Reader()
    data = Dataset.load_from_df(df, reader=reader)

    # ALS-optimised baseline, scored on a 25% hold-out split
    print('Using ALS')
    bsl_options = dict(method='als', n_epochs=5, reg_u=12, reg_i=5)
    trainset, testset = train_test_split(data, test_size=0.25)
    algo = BaselineOnly(bsl_options=bsl_options)
    predictions = algo.fit(trainset).test(testset)

    # Hold-out RMSE
    rmse = accuracy.rmse(predictions)

    # Cross-validated RMSE, averaged over the folds
    cv_results = cross_validate(algo, data)
    cv_rmse = cv_results['test_rmse'].mean()
    print('CV RMSE: {}'.format(cv_rmse))

    # True and estimated ratings of the hold-out predictions
    y_true = [prediction.r_ui for prediction in predictions]
    y_pred = [prediction.est for prediction in predictions]
# --- NormalPredictor: average the per-user precision / recall and record ---
avg_precision = sum(precisions.values()) / len(precisions)
avg_recall = sum(recalls.values()) / len(recalls)
metrics = dict(rmse=rmse, avg_precision=avg_precision, avg_recall=avg_recall)
results['NormalPredictor'] = metrics
top_n['NormalPredictor'] = get_top_n(norm_pred, n=10)

# --- BaselineOnly: choose ALS vs SGD by 5-fold cross-validated RMSE ---
param_grid = {'bsl_options': {'method': ['als', 'sgd']}}
gs = GridSearchCV(BaselineOnly, param_grid, measures=['rmse'], cv=5)
gs.fit(data)
params = gs.best_params['rmse']

# Refit with the winning options and evaluate on the test set
algo = BaselineOnly(bsl_options=params['bsl_options'])
algo.fit(trainset)
base_pred = algo.test(testset)
rmse = accuracy.rmse(base_pred)
precisions, recalls = precision_recall_at_k(base_pred, k=10, threshold=4)
avg_precision = sum(precisions.values()) / len(precisions)
avg_recall = sum(recalls.values()) / len(recalls)
metrics = dict(rmse=rmse, avg_precision=avg_precision,
               avg_recall=avg_recall, best_parameters=params)
results['BaselineOnly'] = metrics
top_n['BaselineOnly'] = get_top_n(base_pred, n=10)
# Turn every row of the test DataFrame into a plain tuple.
dataset_test = dataset_test.apply(tuple, axis=1).tolist()
trainset = trainset.build_full_trainset()
# The bare string below is a no-op note; it appears to record previously
# tried hyper-parameter settings -- TODO confirm with the author.
""" 'method': 'als', 'n_epochs': 92, 'reg_u': 6, 'reg_i': 4 n_epochs=75, lr_all=0.005, reg_all=0.3 72,0.005,0.23,145 (55,0.005,0.21,105) (35,0.004,0.18,73) """
# --- Model 1: BaselineOnly optimised with ALS ---
bsl_options = {'method': 'als', 'n_epochs': 94, 'reg_u': 6, 'reg_i': 4}
algo1 = BaselineOnly(bsl_options=bsl_options)
# ``predictions1`` first holds the return value of fit(), which is then used
# to call test(); only after the next line does it hold predictions.
predictions1 = algo1.fit(trainset)
predictions1 = predictions1.test(dataset_test)
predList1 = []
# Keep the first four fields of every prediction.
for i in predictions1:
    predList1.append((i[0], i[1], i[2], i[3]))
resdf1 = pd.DataFrame(
    list(predList1),
    columns=['user_id', 'business_id', 'stars', 'est_stars1'])
# --- Model 2: SVD ---
algo2 = SVD(n_factors=105, n_epochs=55, lr_all=0.005, reg_all=0.22)
# Same pattern as above: fit() result first, predictions afterwards.
predictions2 = algo2.fit(trainset)
# Timestamp taken right after the SVD fit, before predicting.
end = time.time()
predictions2 = predictions2.test(dataset_test)
predList2 = []
for i in predictions2:
    predList2.append((i[0], i[1], i[2], i[3]))
resdf2 = pd.DataFrame(
# NOTE(review): the loop extents below are reconstructed (the single-rating
# prediction is placed after each loop) -- confirm against the original file.
for trainset, testset in kf.split(data):
    # Train and predict
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Compute RMSE
    accuracy.rmse(predictions, verbose=True)  # RMSE: 0.8653
# Predict the rating for a specific user and item
uid = str(196)
iid = str(302)
pred = algo.predict(uid, iid, r_ui=4, verbose=True)
"""方法2:使用baseline算法"""
# Baseline algorithm optimised with ALS: 5 epochs, user regularisation
# coefficient reg_u = 12, item regularisation coefficient reg_i = 5
bsl_options = {'method': 'als','n_epochs': 5,'reg_u': 12,'reg_i': 5}
algo2 = BaselineOnly(bsl_options = bsl_options)
kf = KFold(n_splits=5)
for trainset, testset in kf.split(data):
    # Train and predict
    algo2.fit(trainset)
    predictions = algo2.test(testset)
    # Compute RMSE
    accuracy.rmse(predictions, verbose=True)
uid = str(196)
iid = str(302)
# Print the predicted rating of uid for iid
pred = algo2.predict(uid, iid, r_ui=4, verbose=True)
# Per-fold result accumulators for the algorithms compared below.
knnmean_results = []
nmf_results = []
# The rating scale has no upper bound: (0, inf).
reader = Reader(rating_scale=(0, np.inf))
data = Dataset.load_from_df(
    usergroups_df[["user_id", "item_id", "rating"]], reader)
folds_it = KFold(n_splits=5).split(data)
# 1-based fold counter, used only for the progress message.
i = 1
pl_fit = []
for trainset, testset in folds_it:
    print("Fold: %d" % i)
    i += 1
    # --- BaselineOnly with default options ---
    print("Baseline")
    baseline = BaselineOnly()
    baseline.fit(trainset)
    baseline_predictions = baseline.test(testset)
    results = get_group_measures(preds_all=baseline_predictions, U1=U1_users, U2=U2_users, U3=U3_users, U4=U4_users)
    baseline_results.append(results)
    # --- KNNBasic with Pearson similarity ---
    print("KNN")
    knn = KNNBasic(sim_options={"name": "pearson"})
    #knn = KNNBasic(sim_options={"name": "cosine"})
    knn.fit(trainset)
    knn_predictions = knn.test(testset)
    results = get_group_measures(preds_all=knn_predictions, U1=U1_users,
from surprise import Dataset
from surprise import Reader
from surprise import BaselineOnly
from surprise import accuracy
from surprise.model_selection import KFold

# Load the data: comma-separated "user item rating timestamp" rows, with one
# header line skipped
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file('./ratings.csv', reader=reader)
# NOTE(review): train_set is not used anywhere in the visible code
train_set = data.build_full_trainset()

# ALS optimisation; another method (e.g. 'SGD') can be chosen instead
# Set the regularisation terms for users and items
bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
model = BaselineOnly(bsl_options=bsl_options)

# k-fold cross-validation
# NOTE(review): the loop extent is reconstructed (the single-rating
# prediction is placed after the loop) -- confirm against the original file.
kf = KFold(n_splits=5)
for trainset, testset in kf.split(data):
    model.fit(trainset)
    pred = model.test(testset)
    # Compute RMSE
    accuracy.rmse(pred)

uid = str(300)
iid = str(180)

# Print the predicted rating of uid for iid
pred = model.predict(uid, iid, r_ui=4, verbose=True)
"""
from surprise import Dataset, Reader, BaselineOnly, KNNBasic, NormalPredictor, accuracy
from surprise.model_selection import KFold

# 1. Load the data: comma-separated "user item rating timestamp" rows, with
#    one header line skipped
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file('data/ratings.csv', reader=reader)

# 2. Choose the model
# ALS optimisation
# bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
# SGD optimisation
bsl_options = {'method': 'sgd', 'n_epochs': 5}
algo = BaselineOnly(bsl_options=bsl_options)

# 3. Define the K-fold cross-validation iterator, K=3
# NOTE(review): the loop extent is reconstructed (the single-rating
# prediction is placed after the loop) -- confirm against the original file.
kf = KFold(n_splits=3)
for train_set, test_set in kf.split(data):
    # 4. Train and predict
    algo.fit(train_set)
    predictions = algo.test(test_set)
    # 5. Compute RMSE
    accuracy.rmse(predictions, verbose=True)

uid = str(196)
iid = str(302)
# Print the predicted rating of uid for iid
pred = algo.predict(uid, iid, r_ui=4, verbose=True)
def _recommended_item_ids(top_n):
    """Return the set of item ids found in a ``get_top_n`` result."""
    item_ids = set()
    for recommendations in top_n.values():
        for iid, _rating, _true_score in recommendations:
            item_ids.add(iid)
    return item_ids


def _export_ratings(file_path):
    """Write all ``user_rating`` rows to ``file_path``; return ratings by song."""
    # Keyword arguments: recent PyMySQL releases no longer accept the
    # connection settings positionally.
    db = pymysql.connect(host="localhost", user="root", password="password",
                         database="music_recommender", charset='utf8')
    try:
        cursor = db.cursor()
        try:
            cursor.execute(
                """SELECT uid, song_id, rating FROM user_rating WHERE 1""")
            rows = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        # Always release the connection, even if the query fails.
        db.close()

    song_ratings = defaultdict(list)
    with open(file_path, "w+") as data_f:
        for uid, song_id, rating in rows:
            song_ratings[song_id].append(rating)
            data_f.write("{}\t{}\t{}\n".format(uid, song_id, rating))
    return song_ratings


def collaborative_filtering(raw_uid):
    """Recommend songs to ``raw_uid`` by voting across three algorithms.

    All ratings are exported from the database to a tab-separated file,
    loaded into Surprise, and used to fit ``BaselineOnly`` (SGD), ``KNNBasic``
    (user-based, pearson) and ``KNNBaseline`` (user-based, pearson_baseline).
    Each model predicts the user's anti-testset; an item's rank is the number
    of models whose top-N list contains it.

    Args:
        raw_uid: raw id of the user to recommend for.

    Returns:
        dict: the (at most) 10 highest ranked item ids mapped to their rank.
        When no item is recommended by more than one model, the set of item
        ids recommended by ``BaselineOnly`` is returned instead.

    Raises:
        IOError: if the exported dataset file does not exist.
    """
    # =============== data pre-processing ===========================
    # Export every rating stored in the database to a text file.
    dir_data = './collaborative_filtering/cf_data'
    file_path = '{}/dataset_user_5.txt'.format(dir_data)
    os.makedirs(dir_data, exist_ok=True)

    songData = _export_ratings(file_path)

    if not os.path.exists(file_path):
        raise IOError("Dataset file is not exists!")

    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)
    # Build the training set
    trainset = data.build_full_trainset()

    # BaselineOnly
    bsl_options = {'method': 'sgd', 'learning_rate': 0.0005, }
    algo_BaselineOnly = BaselineOnly(bsl_options=bsl_options)
    algo_BaselineOnly.fit(trainset)
    rset = user_build_anti_testset(trainset, raw_uid)
    # Each prediction carries: uid (raw user id), iid (raw item id),
    # r_ui (true rating), est (estimated rating), details (extra info).
    predictions = algo_BaselineOnly.test(rset)
    top_n_baselineonly = get_top_n(predictions, n=10)

    # KNNBasic
    sim_options = {'name': 'pearson', 'user_based': True}
    algo_KNNBasic = KNNBasic(sim_options=sim_options)
    algo_KNNBasic.fit(trainset)
    predictor = PredictionSet(algo_KNNBasic, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBasic.test(knn_anti_set)
    top_n_knnbasic = get_top_n(predictions, n=1000)

    # KNNBaseline
    sim_options = {'name': 'pearson_baseline', 'user_based': True}
    algo_KNNBaseline = KNNBaseline(sim_options=sim_options)
    algo_KNNBaseline.fit(trainset)
    predictor = PredictionSet(algo_KNNBaseline, trainset, raw_uid)
    knn_anti_set = predictor.user_build_anti_testset()
    predictions = algo_KNNBaseline.test(knn_anti_set)
    top_n_knnbaseline = get_top_n(predictions, n=1000)

    evaluationMSEResult = evaluationMSE(
        [top_n_baselineonly, top_n_knnbasic, top_n_knnbaseline], raw_uid)

    items_baselineonly = _recommended_item_ids(top_n_baselineonly)
    items_knnbasic = _recommended_item_ids(top_n_knnbasic)
    items_knnbaseline = _recommended_item_ids(top_n_knnbaseline)
    item_sets = (items_baselineonly, items_knnbasic, items_knnbaseline)

    # rank[item] = number of algorithms that recommended the item.
    rank = {
        item: sum(item in item_set for item_set in item_sets)
        for item in set().union(*item_sets)
    }

    # Highest vote count. The previous code took the arg-max *key* here and
    # compared an item id with 1, so the fallback below never triggered.
    max_rank = max(rank.values(), default=0)
    if max_rank <= 1:
        # No agreement between the algorithms: fall back to BaselineOnly.
        return items_baselineonly

    resultAll = {item: rank[item] for item in nlargest(10, rank, key=rank.get)}
    evaluation(songData, resultAll)

    evaluationMSEResult1 = {
        key: evaluationMSEResult[key]
        for key in evaluationMSEResult
        if key in resultAll
    }
    # Final evaluation
    print(evaluationMSEResult1,'evaluationMSEResult1==')
    return resultAll
def make_predictions(user_id):
    """Recommend movies to ``user_id`` with the best of SVD, KNN and ALS.

    The three algorithms are fitted on the full built-in ``ml-100k`` trainset
    and scored by RMSE on its anti-testset; the top 10 of the algorithm with
    the lowest RMSE (first one wins on ties) are looked up in ``u.item``.

    Args:
        user_id: id of the user; compared to the raw user ids as an integer.

    Returns:
        list: one string per recommended item, the ``str()`` of the pandas
        selection of matching ``movie_title`` values.
    """
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()

    # Then predict ratings for all pairs (u, i) that are NOT in the
    # training set.
    algo_SVD = SVD()
    algo_SVD.fit(trainset)
    testset = trainset.build_anti_testset()
    predictions_SVD = algo_SVD.test(testset)

    algo_KNN = KNNBasic()
    algo_KNN.fit(trainset)
    # Test the KNN model itself; previously the SVD model was tested again,
    # so KNN was never actually evaluated.
    predictions_KNN = algo_KNN.test(testset)

    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
    algo_ALS = BaselineOnly(bsl_options=bsl_options)
    algo_ALS.fit(trainset)
    predictions_ALS = algo_ALS.test(testset)

    # comparing algorithms by performance (lowest RMSE wins)
    candidates = [predictions_SVD, predictions_KNN, predictions_ALS]
    performance = [accuracy.rmse(candidate) for candidate in candidates]
    best_predictions = candidates[performance.index(min(performance))]
    top_n = get_top_n(best_predictions, n=10)

    i_cols = [
        'movie_id', 'movie_title', 'release_date', 'video_release_date',
        'IMDb_URL', 'unknown', 'Action', 'Adventure', 'Animation',
        'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
        'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi',
        'Thriller', 'War', 'Western'
    ]
    items = pd.read_csv('../../ml-100k/u.item', sep='|', names=i_cols,
                        encoding='latin-1')

    predictions = []
    wanted_user = int(user_id)
    # Collect the recommended items for the user
    for uid, user_ratings in top_n.items():
        if int(uid) != wanted_user:
            continue
        for (iid, _) in user_ratings:
            # NOTE(review): the "+ 1" offset looks suspicious if raw item ids
            # already match ``movie_id`` -- kept as-is, please confirm.
            title = items[items['movie_id'] == int(iid) + 1]['movie_title']
            title_t = str(title)
            title_split = title_t.split()
            print(title_split)
            predictions.append(title_t)
    return predictions
def test_trainset_testset():
    """Test the construct_trainset and construct_testset methods.

    Loads the custom train/test fold that sits next to this file (using the
    module-level ``reader``) and checks the trainset's rating containers,
    counts, (empty) feature containers, id conversions, and the testsets
    produced by ``build_testset`` / ``build_anti_testset``.
    """

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    # data.folds() is expected to emit a UserWarning here.
    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4, None)]
    assert ur[1] == [(0, 4, None), (1, 2, None)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4, None), (1, 4, None), (2, 1, None)]
    assert ir[1] == [(1, 2, None), (2, 1, None), (3, 5, None)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test user features
    u_features = trainset.u_features
    assert u_features[0] == []  # no u_features_df added
    assert u_features[1] == []  # no u_features_df added
    assert u_features[3] == []  # no u_features_df added
    assert u_features[40] == []  # not in trainset and no u_features_df
    assert trainset.user_features_labels == []
    assert trainset.n_user_features == 0

    # test item features
    i_features = trainset.i_features
    assert i_features[0] == []  # no i_features_df added
    assert i_features[1] == []  # no i_features_df added
    assert i_features[20000] == []  # not in trainset and no i_features_df
    assert trainset.item_features_labels == []
    assert trainset.n_item_features == 0

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    # The inner->raw maps must still be None here and must be populated once
    # to_raw_uid / to_raw_iid have been called, so the order of the
    # assertions below matters.
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', [], [], 4) in testset
    assert ('user3', 'item1', [], [], 5) in testset
    assert ('user3', 'item1', [], [], 0) not in testset

    # Test the build_anti_testset() method
    # Pairs rated in the trainset must be absent; unrated pairs are present
    # with the global mean as their rating.
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', [], [], trainset.global_mean) not in testset
    assert ('user3', 'item1', [], [], trainset.global_mean) not in testset
    assert ('user0', 'item1', [], [], trainset.global_mean) in testset
    assert ('user3', 'item0', [], [], trainset.global_mean) in testset
def test_trainset_testset_ui_features():
    """Test the construct_trainset and construct_testset methods with user and
    item features.

    Same checks as the feature-less test, but a one-column user features
    frame and a three-column item features frame are attached to the dataset
    first, so the feature containers and the built testsets carry values.
    """

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    # User features: a single 'isMale' column, keyed by raw user id.
    u_features_df = pd.DataFrame(
        {
            'urid': ['user0', 'user2', 'user3', 'user1'],
            'isMale': [False, True, False, True]
        },
        columns=['urid', 'isMale'])
    data = data.load_features_df(u_features_df, user_features=True)

    # Item features: three columns, keyed by raw item id.
    i_features_df = pd.DataFrame(
        {
            'irid': ['item0', 'item1'],
            'isNew': [False, True],
            'webRating': [4, 3],
            'isComedy': [True, False]
        },
        columns=['irid', 'isNew', 'webRating', 'isComedy'])
    data = data.load_features_df(i_features_df, user_features=False)

    # data.folds() is expected to emit a UserWarning here.
    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4, None)]
    assert ur[1] == [(0, 4, None), (1, 2, None)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4, None), (1, 4, None), (2, 1, None)]
    assert ir[1] == [(1, 2, None), (2, 1, None), (3, 5, None)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test user features
    u_features = trainset.u_features
    assert u_features[0] == [False]
    assert u_features[40] == []  # not in trainset and u_features_df
    assert trainset.user_features_labels == ['isMale']
    assert trainset.n_user_features == 1

    # test item features
    i_features = trainset.i_features
    assert i_features[0] == [False, 4, True]
    assert i_features[20000] == []  # not in trainset and i_features_df
    assert trainset.item_features_labels == ['isNew', 'webRating', 'isComedy']
    assert trainset.n_item_features == 3

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    # The inner->raw maps must still be None here and must be populated once
    # to_raw_uid / to_raw_iid have been called, so the order of the
    # assertions below matters.
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', [False], [False, 4, True], 4) in testset
    assert ('user2', 'item1', [True], [True, 3, False], 1) in testset
    assert ('user3', 'item1', [False], [True, 3, False], 5) in testset
    assert ('user3', 'item1', [False], [True, 3, False], 0) not in testset

    # Test the build_anti_testset() method
    # Pairs rated in the trainset must be absent; unrated pairs are present
    # with the global mean as their rating.
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert (('user0', 'item0', [False], [False, 4, True], trainset.global_mean)
            not in testset)
    assert (('user3', 'item1', [False], [True, 3, False], trainset.global_mean)
            not in testset)
    assert (('user0', 'item1', [False], [True, 3, False], trainset.global_mean)
            in testset)
    assert (('user3', 'item0', [False], [False, 4, True], trainset.global_mean)
            in testset)
def test_trainset_testset(toy_data_reader):
    """Test the construct_trainset and construct_testset methods.

    Loads the custom train/test fold that sits next to this file with the
    ``toy_data_reader`` argument and a (1, 5) rating scale, then checks the
    trainset's rating containers, counts, id conversions, and the testsets
    produced by ``build_testset`` / ``build_anti_testset``.
    """

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files,
                                   reader=toy_data_reader, rating_scale=(1, 5))

    # data.folds() is expected to emit a UserWarning here.
    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4)]
    assert ur[1] == [(0, 4), (1, 2)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4), (1, 4), (2, 1)]
    assert ir[1] == [(1, 2), (2, 1), (3, 5)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unkown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unkown_item')

    # test inner2raw
    # The inner->raw maps must still be None here and must be populated once
    # to_raw_uid / to_raw_iid have been called, so the order of the
    # assertions below matters.
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # Test the build_anti_testset() method
    # Pairs rated in the trainset must be absent; unrated pairs are present
    # with the global mean as their rating.
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', trainset.global_mean) not in testset
    assert ('user3', 'item1', trainset.global_mean) not in testset
    assert ('user0', 'item1', trainset.global_mean) in testset
    assert ('user3', 'item0', trainset.global_mean) in testset