def test_wrong_file_name():
    """Ensure file names are checked when creating a (custom) Dataset."""
    wrong_files = [('does_not_exist', 'does_not_either')]

    with pytest.raises(ValueError):
        Dataset.load_from_folds(folds_files=wrong_files, reader=Reader(),
                                rating_scale=(1, 5))
def test_deprecated_way():
    """Test all Dataset constructors without passing rating_scale as a
    parameter. Make sure we revert back to the Reader object, with a warning
    message.

    Also, make sure ValueError is raised if reader has no rating_scale in
    this context.

    Not using dataset fixtures here for more control.
    """

    # test load_from_file
    toy_data_path = (os.path.dirname(os.path.realpath(__file__)) +
                     '/custom_dataset')
    with pytest.warns(UserWarning):
        reader = Reader(line_format='user item rating', sep=' ',
                        skip_lines=3, rating_scale=(1, 5))
        data = Dataset.load_from_file(file_path=toy_data_path, reader=reader)

    with pytest.raises(ValueError):
        reader = Reader(line_format='user item rating', sep=' ',
                        skip_lines=3, rating_scale=None)
        data = Dataset.load_from_file(file_path=toy_data_path, reader=reader)

    # test load_from_folds
    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    with pytest.warns(UserWarning):
        reader = Reader(line_format='user item rating timestamp', sep='\t',
                        rating_scale=(1, 5))
        data = Dataset.load_from_folds([(train_file, test_file)],
                                       reader=reader)
    with pytest.raises(ValueError):
        reader = Reader(line_format='user item rating timestamp', sep='\t',
                        rating_scale=None)
        data = Dataset.load_from_folds([(train_file, test_file)],
                                       reader=reader)

    # test load_from_df
    ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                    'userID': [9, 32, 2, 45, '10000'],
                    'rating': [3, 2, 4, 3, 1]}
    df = pd.DataFrame(ratings_dict)

    with pytest.warns(UserWarning):
        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                                    reader=reader)
    with pytest.raises(ValueError):
        reader = Reader(rating_scale=None)
        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],  # noqa
                                    reader=reader)
def test_cross_validate(toy_data):

    # First test with a specified CV iterator.
    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3)
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader,
                                   rating_scale=(1, 5))

    algo = NormalPredictor()
    pkf = ms.PredefinedKFold()
    ret = ms.cross_validate(algo, data, measures=['rmse', 'mae'], cv=pkf,
                            verbose=1)
    # Basically just test that keys (don't) exist as they should
    assert len(ret['test_rmse']) == 1
    assert len(ret['test_mae']) == 1
    assert len(ret['fit_time']) == 1
    assert len(ret['test_time']) == 1
    assert 'test_fcp' not in ret
    assert 'train_rmse' not in ret
    assert 'train_mae' not in ret

    # Test that 5 fold CV is used when cv=None.
    # Also check that train_* keys exist when return_train_measures is True.
    ret = ms.cross_validate(algo, toy_data, measures=['rmse', 'mae'], cv=None,
                            return_train_measures=True, verbose=True)
    assert len(ret['test_rmse']) == 5
    assert len(ret['test_mae']) == 5
    assert len(ret['fit_time']) == 5
    assert len(ret['test_time']) == 5
    assert len(ret['train_rmse']) == 5
    assert len(ret['train_mae']) == 5
def u1_ml100k():
    """Return a Dataset object that contains 10% of the u1 fold from
    movielens 100k. Trainset has 8000 ratings and testset has 2000."""

    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'), rating_scale=(1, 5))
    return data
def get_rating_predictions(self, test_set, cluster_user_mapping=None):
    self.test_set = test_set
    test_path_tmp = "..\\resources\\tmp\\test_file.csv"
    train_path_tmp = "..\\resources\\tmp\\train_file.csv"
    self.train_set.to_csv(train_path_tmp, index=False, header=False)
    self.test_set.to_csv(test_path_tmp, index=False, header=False)
    fold_files = [(train_path_tmp, test_path_tmp)]
    reader = Reader(rating_scale=(1, 10), line_format='user item rating',
                    sep=',')
    data = Dataset.load_from_folds(fold_files, reader=reader)
    for trainset, testset in PredefinedKFold().split(data):
        if cluster_user_mapping is None:
            self.method.fit(trainset)
        else:
            # TODO: fit one model per cluster of users. The original code
            # here was incomplete (a groupby on an empty column name and a
            # commented-out filter); see the sketch below this function.
            raise NotImplementedError(
                "per-cluster fitting is not implemented yet")

    rows = []
    pbar = tqdm(total=len(self.test_set.index))
    for _, val in self.test_set.iterrows():
        prediction = self.method.predict(str(val.userID), str(val.itemID),
                                         clip=False)
        rows.append({"userID": int(val.userID),
                     "itemID": int(val.itemID),
                     "real": int(val.rating),
                     "est": int(prediction.est)})
        pbar.update(1)
    pbar.close()
    return pd.DataFrame(rows, columns=['userID', 'itemID', 'real', 'est'])
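# A minimal sketch of the per-cluster fitting that get_rating_predictions()
# leaves unfinished above. Everything here is an assumption, not part of the
# original code: cluster_user_mapping maps raw userID -> cluster label,
# self.train_set has userID/itemID/rating columns on a 1-10 scale, and
# type(self.method)() builds a fresh, identically-configured model.
def fit_per_cluster(self, cluster_user_mapping):
    models = {}
    for cluster in set(cluster_user_mapping.values()):
        users = {u for u, c in cluster_user_mapping.items() if c == cluster}
        # Keep only the training rows whose user belongs to this cluster.
        cluster_train = self.train_set[self.train_set.userID.isin(users)]
        data = Dataset.load_from_df(
            cluster_train[['userID', 'itemID', 'rating']],
            Reader(rating_scale=(1, 10)))
        model = type(self.method)()  # fresh model per cluster
        model.fit(data.build_full_trainset())
        models[cluster] = model
    return models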
def predict_rating_split_by_time(self, files_pair, algo_test):
    algo = algo_test[0]
    use_auto_parse = algo_test[1]
    if use_auto_parse:
        fold_files = [files_pair]
        reader = Reader(rating_scale=(1, 10), line_format='user item rating',
                        sep=',')
        data = Dataset.load_from_folds(fold_files, reader=reader)
        for trainset, testset in PredefinedKFold().split(data):
            algo.fit(trainset)
            predictions = algo.test(testset)
            rmse = accuracy.rmse(predictions, verbose=False)
            return rmse
    else:
        # Prepare dataset: map raw user/item ids to contiguous integer ids.
        train_set = pd.read_csv(files_pair[0], parse_dates=[3])
        test_set = pd.read_csv(files_pair[1], parse_dates=[3])
        item_to_id_mapping = {}
        user_to_id_mapping = {}
        item_index = 0
        user_index = 0
        all_sets = pd.concat([train_set, test_set])
        for item in all_sets['itemID']:
            if item not in item_to_id_mapping:
                item_to_id_mapping[item] = item_index
                item_index += 1
        for user in all_sets['userID']:
            if user not in user_to_id_mapping:
                user_to_id_mapping[user] = user_index
                user_index += 1
        train_set['itemID'] = train_set['itemID'].map(item_to_id_mapping)
        test_set['itemID'] = test_set['itemID'].map(item_to_id_mapping)
        train_set['userID'] = train_set['userID'].map(user_to_id_mapping)
        test_set['userID'] = test_set['userID'].map(user_to_id_mapping)
        algo.fit(train_set)
        rec_list = algo.get_top_n_recommendations(test_set)
        # TODO: this branch never scores rec_list; it still needs an
        # evaluation step and a return value comparable to the RMSE above.
        return None
def get_top_n_recommendations(self, test_set, top_n):
    self.test_set = test_set
    test_path_tmp = "..\\resources\\tmp\\test_file.csv"
    train_path_tmp = "..\\resources\\tmp\\train_file.csv"
    self.train_set.to_csv(train_path_tmp, index=False, header=False)
    self.test_set.to_csv(test_path_tmp, index=False, header=False)
    fold_files = [(train_path_tmp, test_path_tmp)]
    reader = Reader(rating_scale=(1, 10), line_format='user item rating',
                    sep=',')
    data = Dataset.load_from_folds(fold_files, reader=reader)
    for trainset, testset in PredefinedKFold().split(data):
        self.method.fit(trainset)

    already_ranked_items_by_users = self.train_set.groupby(
        'userID')['itemID'].apply(list)
    recommendations = {}
    pbar = tqdm(total=len(self.test_set.userID.unique()))
    for userID in self.test_set.userID.unique():
        pbar.update(1)
        if userID not in self.train_set.userID.unique():
            recommendations[str(userID)] = []
            continue
        items_expected_ranking = {}
        for itemID in self.train_set.itemID.unique():
            if itemID in already_ranked_items_by_users[userID]:
                continue
            # Calc prediction for item for user
            predicted = self.method.predict(str(userID), str(itemID),
                                            clip=False)
            items_expected_ranking[itemID] = predicted.est
        # Sort by estimated rating, highest first.
        sorted_predictions = sorted(items_expected_ranking.items(),
                                    key=operator.itemgetter(1), reverse=True)
        sorted_predictions = [str(x[0]) for x in sorted_predictions]
        recommendations[str(userID)] = sorted_predictions[:top_n]
    pbar.close()
    return recommendations
def test_cross_validate():

    # First test with a specified CV iterator.
    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3,
                    rating_scale=(1, 5))
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    algo = NormalPredictor()
    pkf = ms.PredefinedKFold()
    ret = ms.cross_validate(algo, data, measures=[['neg_rmse', neg_rmse],
                                                  ['neg_mae', neg_mae]],
                            cv=pkf, verbose=1)
    # Basically just test that keys (don't) exist as they should
    assert len(ret['test_neg_rmse']) == 1
    assert len(ret['test_neg_mae']) == 1
    assert len(ret['fit_time']) == 1
    assert len(ret['test_time']) == 1
    assert 'test_fcp' not in ret
    assert 'train_neg_rmse' not in ret
    assert 'train_neg_mae' not in ret

    # Test that 5 fold CV is used when cv=None.
    # Also check that train_* keys exist when return_train_measures is True.
    data = Dataset.load_from_file(current_dir + '/custom_dataset', reader)
    ret = ms.cross_validate(algo, data, measures=[['neg_rmse', neg_rmse],
                                                  ['neg_mae', neg_mae]],
                            cv=None, return_train_measures=True, verbose=True)
    assert len(ret['test_neg_rmse']) == 5
    assert len(ret['test_neg_mae']) == 5
    assert len(ret['fit_time']) == 5
    assert len(ret['test_time']) == 5
    assert len(ret['train_neg_rmse']) == 5
    assert len(ret['train_neg_mae']) == 5
def test_trainset_testset():
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    for trainset, testset in data.folds():
        pass  # just need trainset and testset to be set

    # test rm:
    rm = trainset.rm
    assert rm[0, 0] == 4
    assert rm[1, 0] == 4
    assert rm[3, 1] == 5
    assert rm[40, 20000] == 0  # not in the trainset

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4)]
    assert ur[1] == [(0, 4), (1, 2)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4), (1, 4), (2, 1)]
    assert ir[1] == [(1, 2), (2, 1), (3, 5)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, r_min, r_max
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.r_min == 1
    assert trainset.r_max == 5

    # test raw2inner: ensure inner ids are given in proper order
    raw2inner_id_users = trainset._raw2inner_id_users
    for i in range(4):
        assert raw2inner_id_users['user' + str(i)] == i

    raw2inner_id_items = trainset._raw2inner_id_items
    for i in range(2):
        assert raw2inner_id_items['item' + str(i)] == i
def test_PredefinedKFold():

    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3,
                    rating_scale=(1, 5))

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    # Make sure rating files are read correctly
    pkf = PredefinedKFold()
    trainset, testset = next(pkf.split(data))
    assert trainset.n_ratings == 6
    assert len(testset) == 3
def test_PredefinedKFold(toy_data_reader):

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files,
                                   reader=toy_data_reader,
                                   rating_scale=(1, 5))

    # Make sure rating files are read correctly
    pkf = PredefinedKFold()
    trainset, testset = next(pkf.split(data))
    assert trainset.n_ratings == 6
    assert len(testset) == 3

    # Make sure pkf returns the same folds as the deprecated data.folds()
    with pytest.warns(UserWarning):
        trainset_, testset_ = next(data.folds())
    assert testset_ == testset
def surprise_SVDpp(train_file, test_file):
    """SVD++ with the Surprise library.

    Compute the predictions on a test set after training on a train set,
    using the SVD++ method from Surprise.

    Args:
        train_file (string): path to the created train file
        test_file (string): path to the created test file
    Hyperparameters:
        n_factors: The number of factors.
        n_epochs: The number of iterations of the SGD procedure.
        lr_'x': The learning rate for 'x'.
        reg_'x': The regularization term for 'x'.
        'x':
            bi: The item biases.
            bu: The user biases.
            qi: The item factors.
            yj: The (implicit) item factors.
            pu: The user factors.
    Returns:
        numpy array: predictions
    """
    print("SVDpp")
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()
    # Algorithm
    algo = SVDpp(n_epochs=40, n_factors=100, lr_all=0.01, reg_all=0.01)
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)

        # Predict
        predictions = algo.test(testset)
        pred = np.zeros(len(predictions))
        for i in range(len(predictions)):
            pred[i] = predictions[i].est
        return pred
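# Hypothetical call to surprise_SVDpp() above; the CSV paths are
# placeholders, not from the original project. Each file must contain
# 'user,item,rating' lines, matching the Reader defined in the function.
pred = surprise_SVDpp('train_ratings.csv', 'test_ratings.csv')
print(pred[:10])  # first ten estimated ratings, aligned with the test file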
def test_performances():
    """Test the returned dict. Also do dumping."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3)
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader,
                                   rating_scale=(1, 5))

    algo = NormalPredictor()
    tmp_dir = tempfile.mkdtemp()  # create tmp dir
    with pytest.warns(UserWarning):
        performances = evaluate(algo, data, measures=['RmSe', 'Mae'],
                                with_dump=True, dump_dir=tmp_dir, verbose=2)
    shutil.rmtree(tmp_dir)  # remove tmp dir

    assert performances['RMSE'] is performances['rmse']
    assert performances['MaE'] is performances['mae']
def test_gridsearchcv_best_estimator():
    """Ensure that the best estimator is the one giving the best score (by
    re-running it)."""

    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'))

    param_grid = {'n_epochs': [5], 'lr_all': [0.002, 0.005],
                  'reg_all': [0.4, 0.6], 'n_factors': [1],
                  'init_std_dev': [0]}
    gs = GridSearchCV(SVD, param_grid, measures=['mae'],
                      cv=PredefinedKFold(), joblib_verbose=100)
    gs.fit(data)
    best_estimator = gs.best_estimator['mae']

    # recompute MAE of best_estimator
    mae = cross_validate(best_estimator, data, measures=['MAE'],
                         cv=PredefinedKFold())['test_mae']

    assert mae == gs.best_score['mae']
def load_data(file_dict, dataformat):
    # Load the data according to its format.
    if dataformat == "builtin":
        data = Dataset.load_builtin(name=file_dict["name"], prompt=True)
    elif dataformat == "file":
        reader = Reader(line_format=file_dict["line_format"],
                        sep=file_dict.get("sep", None),
                        rating_scale=file_dict.get("rating_scale", (1, 5)),
                        skip_lines=file_dict.get("skip_lines", 0))
        data = Dataset.load_from_file(file_path=file_dict["file_path"],
                                      reader=reader)
    elif dataformat == "dataframe":
        reader = Reader(rating_scale=file_dict.get("rating_scale", (1, 5)))
        data = Dataset.load_from_df(df=file_dict["df"][file_dict["header"]],
                                    reader=reader)
    elif dataformat == "folds":
        # The k-fold split has already been materialized on disk.
        files_dir = os.path.expanduser(file_dict["file_dir"])
        reader = Reader(name=file_dict["name"])
        train_file = files_dir + file_dict["train_name"]
        test_file = files_dir + file_dict["test_name"]
        folds_files = [(train_file % i, test_file % i)
                       for i in file_dict["file_num"]]
        print(folds_files)
        data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)
    else:
        raise ValueError("dataformat is not one of the supported file types")
    return data
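# A hypothetical call exercising the "folds" branch of load_data() above
# (the directory and '%d' patterns are placeholders): the list comprehension
# expands them to [(u1.base, u1.test), ..., (u5.base, u5.test)] before they
# are handed to Dataset.load_from_folds.
file_dict = {"file_dir": "~/.surprise_data/ml-100k/ml-100k/",
             "name": "ml-100k",
             "train_name": "u%d.base",
             "test_name": "u%d.test",
             "file_num": (1, 2, 3, 4, 5)}
data = load_data(file_dict, dataformat="folds")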
def test_knns():
    """Ensure the k and min_k parameters are effective for knn algorithms."""

    # the test and train files are from the ml-100k dataset (10% of u1.base
    # and 10% of u1.test)
    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'))

    # Actually, as KNNWithMeans and KNNBaseline have back up solutions for
    # when there are not enough neighbors, we can't really test them...
    klasses = (KNNBasic, )  # KNNWithMeans, KNNBaseline)

    k, min_k = 20, 5
    for klass in klasses:
        algo = klass(k=k, min_k=min_k)
        for trainset, testset in data.folds():
            algo.fit(trainset)
            predictions = algo.test(testset)
            for pred in predictions:
                if not pred.details['was_impossible']:
                    assert min_k <= pred.details['actual_k'] <= k
def func7():
    import os

    from surprise import SVD
    from surprise import Dataset
    from surprise import Reader
    from surprise import accuracy
    from surprise.model_selection import PredefinedKFold

    files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')
    reader = Reader('ml-100k')

    train_file = files_dir + 'u%d.base'
    test_file = files_dir + 'u%d.test'
    folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

    data = Dataset.load_from_folds(folds_files, reader=reader)
    pkf = PredefinedKFold()

    algo = SVD()
    for trainset, testset in pkf.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        accuracy.rmse(predictions, verbose=True)
def basic_rec(model_name, train_path, test_path, target_id):
    # build data
    # TODO check float and min_r
    reader = Reader(line_format='user item rating', sep='\t',
                    rating_scale=(1, 5))
    data = Dataset.load_from_folds([(train_path, test_path)], reader=reader)
    trainset, testset = None, None
    pkf = PredefinedKFold()
    for trainset_, testset_ in pkf.split(data):
        trainset, testset = trainset_, testset_

    # train model
    rec_algo = get_model(model_name)
    rec_algo.fit(trainset)

    # eval
    preds = rec_algo.test(testset)
    rmse = accuracy.rmse(preds, verbose=True)

    # predict for target
    def fn_pred(uid):
        return rec_algo.predict(str(uid), str(target_id), r_ui=0).est
    target_predictions = list(map(fn_pred, range(trainset.n_users)))

    # topn
    testset = trainset.build_anti_testset()
    predictions = rec_algo.test(testset)
    top_n = get_top_n(predictions, n=50)
    hit_ratios = {}
    for uid, user_ratings in top_n.items():
        topN = [int(iid) for (iid, _) in user_ratings]
        hits = [1 if target_id in topN[:i] else 0
                for i in [1, 3, 5, 10, 20, 50]]
        hit_ratios[int(uid)] = hits
    return target_predictions, hit_ratios
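# basic_rec() above assumes a get_top_n() helper. A minimal version in the
# spirit of the example from the Surprise FAQ (a sketch, not necessarily the
# original project's implementation): group predictions by user and keep the
# n items with the highest estimated ratings.
from collections import defaultdict


def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n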
for x in ratingRDD.collect():
    ratingMap[(x[0], x[1])] = x[2]

reader = Reader(line_format='user item rating', sep=",", skip_lines=1)
folds_files = [(trainingFilePath, validationFilePath)]
data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()
algo = SVD()
predictionList = list()
for trainset, testset in pkf.split(data):
    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)
    for uid, iid, true_r, est, _ in predictions:
        predictionList.append((uid, iid, est))

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os

import pytest

from surprise import NMF
from surprise import Dataset
from surprise import Reader
from surprise import evaluate

# the test and train files are from the ml-100k dataset (10% of u1.base and
# 10% of u1.test)
train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))


def test_NMF_parameters():
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = NMF(n_factors=1, n_epochs=1)
    rmse_default = evaluate(algo, data, measures=['rmse'])['rmse']

    # n_factors
    algo = NMF(n_factors=2, n_epochs=1)
    rmse_factors = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_factors

    # n_epochs
def test_trainset_testset():
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    for trainset, testset in data.folds():
        pass  # just need trainset and testset to be set

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4)]
    assert ur[1] == [(0, 4), (1, 2)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4), (1, 4), (2, 1)]
    assert ir[1] == [(1, 2), (2, 1), (3, 5)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.train(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.train(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', trainset.global_mean) not in testset
    assert ('user3', 'item1', trainset.global_mean) not in testset
    assert ('user0', 'item1', trainset.global_mean) in testset
    assert ('user3', 'item0', trainset.global_mean) in testset
def test_wrong_file_name():
    """Ensure file names are checked when creating a (custom) Dataset."""
    wrong_files = [('does_not_exist', 'does_not_either')]

    with pytest.raises(ValueError):
        Dataset.load_from_folds(folds_files=wrong_files, reader=reader)
import os

from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

# path to dataset folder
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')

# This time, we'll use the built-in reader.
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

data = Dataset.load_from_folds(folds_files, reader=reader,
                               rating_scale=(1, 5))
pkf = PredefinedKFold()

algo = SVD()

for trainset, testset in pkf.split(data):

    # train and test algorithm.
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
def k_recommend(model, k, testset):
    reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
    fold_files = [('~/Desktop/Tufts/Fall2018/COMP135/Project3/trainset.csv',
                   '~/Desktop/Tufts/Fall2018/COMP135/Project3/testset.csv')]
    pdkfold = sp.model_selection.split.PredefinedKFold()
    clf = model.best_estimator['mae']
    data = Dataset.load_from_folds(fold_files, reader=reader)
    for train, test in pdkfold.split(data):
        clf.fit(train)
        test1 = train.build_anti_testset()
        preds = clf.test(test1)

    # Collect the k highest-rated predictions per user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in preds:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:k]

    # Replace each estimated rating with the true rating from the test set
    # when one exists, and a default rating of 2 otherwise.
    for uid in top_n:
        i = 0
        for iid in top_n[uid]:
            found = False
            for iid2 in testset[uid]:
                if iid[0] == str(iid2[0]):
                    a = iid[0]
                    top_n[uid].remove(top_n[uid][i])
                    top_n[uid].insert(i, (a, iid2[1]))
                    found = True
                    i += 1
                    break
            if not found:
                a = iid[0]
                top_n[uid].remove(top_n[uid][i])
                top_n[uid].insert(i, (a, 2))
                i += 1

    # Average the per-user mean ratings of the recommended items.
    total_sum = 0.0
    user_sum = 0.0
    us_rec = []
    for uid in top_n:
        i = 0.0
        for iid in top_n[uid]:
            i += 1.0
            user_sum += iid[1]
        total_sum += float(user_sum / i)
        us_rec.append(user_sum / i)
        user_sum = 0.0
    # print(us_rec)
    print("Average rating: ", total_sum / float(len(top_n)))
def test_refit():

    data_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_file(data_file, Reader('ml-100k'))
    param_grid = {'n_epochs': [5], 'lr_all': [0.002, 0.005],
                  'reg_all': [0.4, 0.6], 'n_factors': [2]}

    # assert gs.fit() and gs.test will use best estimator for mae (first
    # appearing in measures)
    gs = GridSearchCV(SVD, param_grid, measures=['mae', 'rmse'], cv=2,
                      refit=True)
    gs.fit(data)
    gs_preds = gs.test(data.construct_testset(data.raw_ratings))
    mae_preds = gs.best_estimator['mae'].test(
        data.construct_testset(data.raw_ratings))
    assert gs_preds == mae_preds

    # assert gs.fit() and gs.test will use best estimator for rmse
    gs = GridSearchCV(SVD, param_grid, measures=['mae', 'rmse'], cv=2,
                      refit='rmse')
    gs.fit(data)
    gs_preds = gs.test(data.construct_testset(data.raw_ratings))
    rmse_preds = gs.best_estimator['rmse'].test(
        data.construct_testset(data.raw_ratings))
    assert gs_preds == rmse_preds

    # test that predict() can be called
    gs.predict(2, 4)

    # assert test() and predict() cannot be used when refit is false
    gs = GridSearchCV(SVD, param_grid, measures=['mae', 'rmse'], cv=2,
                      refit=False)
    gs.fit(data)
    with pytest.raises(ValueError):
        gs_preds = gs.test(data.construct_testset(data.raw_ratings))
    with pytest.raises(ValueError):
        gs.predict('1', '2')

    # test that error is raised if used with load_from_folds
    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'))
    gs = GridSearchCV(SVD, param_grid, measures=['mae', 'rmse'], cv=2,
                      refit=True)
    with pytest.raises(ValueError):
        gs.fit(data)
""" Created on Fri Nov 6 09:40:38 2020 @author: sasha """ from surprise import Dataset, Reader from surprise.model_selection import PredefinedKFold from surprise import accuracy from lsh_jaccard import lsh_jaccard train_file_path = "train.csv" test_file_path = "test.csv" reader = Reader(line_format='user item rating timestamp', sep=',') #data = Dataset.load_from_file(train_file_path, reader=reader) data = Dataset.load_from_folds([(train_file_path, test_file_path)], reader=reader) pkf = PredefinedKFold() algo = lsh_jaccard(threshold = 0.1) for trainset, testset in pkf.split(data): # train and test algorithm. algo.fit(trainset) predictions = algo.test(testset) # Compute and print Root Mean Squared Error accuracy.rmse(predictions, verbose=True)
def test_trainset_testset():
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)

    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4, None)]
    assert ur[1] == [(0, 4, None), (1, 2, None)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4, None), (1, 4, None), (2, 1, None)]
    assert ir[1] == [(1, 2, None), (2, 1, None), (3, 5, None)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test user features
    u_features = trainset.u_features
    assert u_features[0] == []  # no u_features_df added
    assert u_features[1] == []  # no u_features_df added
    assert u_features[3] == []  # no u_features_df added
    assert u_features[40] == []  # not in trainset and no u_features_df
    assert trainset.user_features_labels == []
    assert trainset.n_user_features == 0

    # test item features
    i_features = trainset.i_features
    assert i_features[0] == []  # no i_features_df added
    assert i_features[1] == []  # no i_features_df added
    assert i_features[20000] == []  # not in trainset and no i_features_df
    assert trainset.item_features_labels == []
    assert trainset.n_item_features == 0

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', [], [], 4) in testset
    assert ('user3', 'item1', [], [], 5) in testset
    assert ('user3', 'item1', [], [], 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', [], [], trainset.global_mean) not in testset
    assert ('user3', 'item1', [], [], trainset.global_mean) not in testset
    assert ('user0', 'item1', [], [], trainset.global_mean) in testset
    assert ('user3', 'item0', [], [], trainset.global_mean) in testset
def run_knn_baseline(sparse_data):
    # filename = "test.json"
    prefix = "knn_baseline_"
    trainFile = prefix + "train.txt"
    testFile = prefix + "test.txt"
    raw_data, userPurchasedSet, userTrueTestSet = preprocess(
        sparse_data, trainFile, testFile)
    folds_files = [(trainFile, testFile)]
    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_folds(folds_files, reader=reader)
    pkf = PredefinedKFold()
    bsl_options = {'method': 'sgd',
                   'n_epochs': 20,
                   'learning_rate': 0.005,
                   }
    # sim name: cosine msd pearson pearson_baseline
    # user_based: True ---- similarity will be computed based on users
    #             False ---- similarity will be computed based on items.
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    predictions = {}
    top_n = {}
    testsSet = None
    total_precisions = 0.0
    total_recalls = 0.0
    total_hit = 0.0
    total_nDCG = 0.0
    total_ffeature = 0.0
    result_file = prefix + "result.txt"
    result_f = open(result_file, "w")
    for trainset, testset in pkf.split(data):
        testsSet = testset
        # algo = SVD(n_factors=5)
        algo = KNNBaseline(bsl_options=bsl_options, sim_options=sim_options)
        algo.fit(trainset)
        pre = algo.test(testset)
        accuracy.rmse(pre)
        accuracy.mae(pre)

    # test: build top-10 recommendations per user and score them
    rowNum = raw_data.get_row_size()
    colNum = raw_data.get_col_size()
    cur_time = time.time()
    time_cost = 0
    for i in range(rowNum):
        user = raw_data.get_userID(i)
        predictions[user] = set()
        pq = []
        heapq.heapify(pq)
        for j in range(colNum):
            item = raw_data.get_itemID(j)
            if user not in userPurchasedSet or item in userPurchasedSet[user]:
                continue
            value = raw_data.get_val(user, item, 'rating')
            predict = algo.predict(user, item, r_ui=0, verbose=False)[3]
            if len(pq) >= 10:
                heapq.heappop(pq)
            heapq.heappush(pq, (predict, item))
        top_n[user] = set()
        for items in pq:
            top_n[user].add(items[1])
        if user in userTrueTestSet:
            curPrecisions = calculate_precision(top_n[user],
                                                userTrueTestSet[user])
            curRecalls = calculate_recall(top_n[user], userTrueTestSet[user])
            ffeature = calculate_f_feature(curPrecisions, curRecalls)
            curHit = isHit(top_n[user], userTrueTestSet[user])
            cur_nDCG = calculate_NDCG(top_n[user], userTrueTestSet[user])
            total_precisions += curPrecisions
            total_recalls += curRecalls
            total_hit += curHit
            total_nDCG += cur_nDCG
            total_ffeature += ffeature
            result_f.write(user + "\t" + str(curPrecisions) + "\t" +
                           str(curRecalls) + "\t" + str(ffeature) + "\t" +
                           str(curHit) + '\t' + str(cur_nDCG) + "\n")
        if i != 0 and i % 1000 == 0:
            duration = (time.time() - cur_time) / 60
            time_cost += duration
            remaining_time = ((rowNum - i) / 1000) * duration
            cur_time = time.time()
            print('i:', i, "/", rowNum,
                  'remaining time:', remaining_time, 'min')
            print('precisions', total_precisions, ' recalls', total_recalls,
                  ' hit', total_hit, 'nDCG:', total_nDCG)
    rowNum = raw_data.get_row_size()
    print('avg_precisions:', total_precisions / rowNum,
          'avg_recalls:', total_recalls / rowNum,
          'avg_ffeature', str(total_ffeature / rowNum),
          'avg_hit:', total_hit / rowNum,
          'avg_nDCG:', total_nDCG / rowNum)
    result_f.write("avg:\t" + str(total_precisions / rowNum) + "\t" +
                   str(total_recalls / rowNum) + "\t" +
                   str(total_ffeature / rowNum) + "\t" +
                   str(total_hit / rowNum) + '\t' +
                   str(total_nDCG / rowNum) + "\n")
    result_f.close()
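# run_knn_baseline() above relies on helpers defined elsewhere (preprocess,
# calculate_precision, calculate_recall, calculate_f_feature, isHit,
# calculate_NDCG). Minimal sketches of the two set-based ranking metrics,
# assuming both arguments are sets of item ids (an assumption; the originals
# are not shown):
def calculate_precision(recommended, relevant):
    # Fraction of recommended items that are actually relevant.
    return len(recommended & relevant) / len(recommended) if recommended \
        else 0.0


def calculate_recall(recommended, relevant):
    # Fraction of relevant items that were recommended.
    return len(recommended & relevant) / len(relevant) if relevant else 0.0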
def test_randomizedsearchcv_refit():
    """Test refit method of RandomizedSearchCV class."""

    data_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_file(data_file, Reader('ml-100k'))
    param_distributions = {'n_epochs': [5], 'lr_all': uniform(0.002, 0.003),
                           'reg_all': uniform(0.4, 0.2), 'n_factors': [2]}

    # assert rs.fit() and rs.test will use best estimator for mae (first
    # appearing in measures)
    rs = RandomizedSearchCV(SVD, param_distributions,
                            measures=['mae', 'rmse'], cv=2, refit=True)
    rs.fit(data)
    rs_preds = rs.test(data.construct_testset(data.raw_ratings))
    mae_preds = rs.best_estimator['mae'].test(
        data.construct_testset(data.raw_ratings))
    assert rs_preds == mae_preds

    # assert rs.fit() and rs.test will use best estimator for rmse
    rs = RandomizedSearchCV(SVD, param_distributions,
                            measures=['mae', 'rmse'], cv=2, refit='rmse')
    rs.fit(data)
    rs_preds = rs.test(data.construct_testset(data.raw_ratings))
    rmse_preds = rs.best_estimator['rmse'].test(
        data.construct_testset(data.raw_ratings))
    assert rs_preds == rmse_preds

    # test that predict() can be called
    rs.predict(2, 4)

    # assert test() and predict() cannot be used when refit is false
    rs = RandomizedSearchCV(SVD, param_distributions,
                            measures=['mae', 'rmse'], cv=2, refit=False)
    rs.fit(data)
    with pytest.raises(ValueError):
        rs.test(data.construct_testset(data.raw_ratings))
    with pytest.raises(ValueError):
        rs.predict('1', '2')

    # test that error is raised if used with load_from_folds
    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    data = Dataset.load_from_folds([(train_file, test_file)],
                                   Reader('ml-100k'))
    rs = RandomizedSearchCV(SVD, param_distributions,
                            measures=['mae', 'rmse'], cv=2, refit=True)
    with pytest.raises(ValueError):
        rs.fit(data)
import numpy as np
import pandas as pd

from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import PredefinedKFold

from matrix_vis import visualization

u_cols = ['movie_id', 'movie_title', 'unknown', 'action', 'adventure',
          'animation', 'childrens', 'comedy', 'crime', 'documentary',
          'drama', 'fantasy', 'film-noir', 'horror', 'musical', 'mystery',
          'romance', 'sci-fi', 'thriller', 'war', 'western']
# users = pd.read_csv('data/movies.txt', sep='\t', names=u_cols,
#                     encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating']

reader = Reader(line_format='user item rating', sep='\t')
data = Dataset.load_from_folds([('data/train.txt', 'data/test.txt')],
                               reader=reader)
pkf = PredefinedKFold()
for trainset, testset in pkf.split(data):
    algo = SVD()
    algo.fit(trainset)
    u = algo.pu
    v = algo.qi
    v = np.transpose(v)
    a, _, _ = np.linalg.svd(v)
    a = a[:2]
    vplot = np.dot(a, v)
    predictions = algo.test(testset)
    print('method 2 error: %f' % accuracy.rmse(predictions))

movie_ratings = np.genfromtxt("data/summary.txt", names=True)
movie_titles = []
movie_data = []
with open("data/movies.txt", mode="r", encoding="ISO-8859-1") as f:
    for line in f:
def test_trainset_testset_ui_features():
    """Test the construct_trainset and construct_testset methods with user
    and item features."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader)
    u_features_df = pd.DataFrame(
        {'urid': ['user0', 'user2', 'user3', 'user1'],
         'isMale': [False, True, False, True]},
        columns=['urid', 'isMale'])
    data = data.load_features_df(u_features_df, user_features=True)
    i_features_df = pd.DataFrame(
        {'irid': ['item0', 'item1'],
         'isNew': [False, True],
         'webRating': [4, 3],
         'isComedy': [True, False]},
        columns=['irid', 'isNew', 'webRating', 'isComedy'])
    data = data.load_features_df(i_features_df, user_features=False)

    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4, None)]
    assert ur[1] == [(0, 4, None), (1, 2, None)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4, None), (1, 4, None), (2, 1, None)]
    assert ir[1] == [(1, 2, None), (2, 1, None), (3, 5, None)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test user features
    u_features = trainset.u_features
    assert u_features[0] == [False]
    assert u_features[40] == []  # not in trainset and u_features_df
    assert trainset.user_features_labels == ['isMale']
    assert trainset.n_user_features == 1

    # test item features
    i_features = trainset.i_features
    assert i_features[0] == [False, 4, True]
    assert i_features[20000] == []  # not in trainset and i_features_df
    assert trainset.item_features_labels == ['isNew', 'webRating', 'isComedy']
    assert trainset.n_item_features == 3

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', [False], [False, 4, True], 4) in testset
    assert ('user2', 'item1', [True], [True, 3, False], 1) in testset
    assert ('user3', 'item1', [False], [True, 3, False], 5) in testset
    assert ('user3', 'item1', [False], [True, 3, False], 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert (('user0', 'item0', [False], [False, 4, True],
             trainset.global_mean) not in testset)
    assert (('user3', 'item1', [False], [True, 3, False],
             trainset.global_mean) not in testset)
    assert (('user0', 'item1', [False], [True, 3, False],
             trainset.global_mean) in testset)
    assert (('user3', 'item0', [False], [False, 4, True],
             trainset.global_mean) in testset)
def test_trainset_testset(toy_data_reader):
    """Test the construct_trainset and construct_testset methods."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    data = Dataset.load_from_folds(folds_files=folds_files,
                                   reader=toy_data_reader,
                                   rating_scale=(1, 5))

    with pytest.warns(UserWarning):
        trainset, testset = next(data.folds())

    # test ur
    ur = trainset.ur
    assert ur[0] == [(0, 4)]
    assert ur[1] == [(0, 4), (1, 2)]
    assert ur[40] == []  # not in the trainset

    # test ir
    ir = trainset.ir
    assert ir[0] == [(0, 4), (1, 4), (2, 1)]
    assert ir[1] == [(1, 2), (2, 1), (3, 5)]
    assert ir[20000] == []  # not in the trainset

    # test n_users, n_items, n_ratings, rating_scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # test raw2inner
    for i in range(4):
        assert trainset.to_inner_uid('user' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unknown_user')

    for i in range(2):
        assert trainset.to_inner_iid('item' + str(i)) == i
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unknown_item')

    # test inner2raw
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for i in range(4):
        assert trainset.to_raw_uid(i) == 'user' + str(i)
    for i in range(2):
        assert trainset.to_raw_iid(i) == 'item' + str(i)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # Test the build_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # Test the build_anti_testset() method
    algo = BaselineOnly()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)  # ensure an algorithm can manage the data
    assert ('user0', 'item0', trainset.global_mean) not in testset
    assert ('user3', 'item1', trainset.global_mean) not in testset
    assert ('user0', 'item1', trainset.global_mean) in testset
    assert ('user3', 'item0', trainset.global_mean) in testset