def collaborative_filter(id, new_words):
    ratings_dict = calc_collaborative_param(new_words, id)
    df = pd.DataFrame(ratings_dict)

    # A reader is still needed but only the rating_scale param is required.
    reader = Reader(rating_scale=(0.0, 5.0))
    # The columns must correspond to user id, item id and ratings (in that order).
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

    # Define a cross-validation iterator.
    kf = KFold(n_splits=3)
    algo = KNNBasic()
    for trainset, testset in kf.split(data):
        # Train and test the algorithm.
        algo.fit(trainset)
        kf_predictions = algo.test(testset)
        # Compute and print the Root Mean Squared Error.
        accuracy.rmse(kf_predictions, verbose=True)

    # Refit on the full trainset before predicting the unknown ratings;
    # the original reused the model fit on the last fold only.
    trainset = data.build_full_trainset()
    algo.fit(trainset)
    new_data = trainset.build_anti_testset()
    predictions = algo.test(new_data)

    top_n = get_top_n(predictions, n=3)
    with open('top_n.json', 'w') as fp:
        dump(top_n, fp, indent=4)
    return top_n
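# Several snippets in this section call a get_top_n(predictions, n) helper
# without defining it. A minimal sketch is given below, modeled on the
# well-known example from the Surprise FAQ; the versions actually used by
# these snippets may differ in detail.
from collections import defaultdict

def get_top_n(predictions, n=10):
    """Map each user id to their n highest-estimated (item id, rating) pairs."""
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        # Sort each user's predictions by estimated rating, best first.
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n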
def knn_running_time(data):
    '''
    Calculates the running times for training and predictions for Basic KNN

    Args:
        data (list of Dataset): a list of datasets with different numbers of users

    Returns:
        elapsedtime_KnnBasictrain: running time for training
        elapsedtime_KnnBasictest: running time for predictions on testset
    '''
    elapsedtime_KnnBasictrain = []
    elapsedtime_KnnBasictest = []

    # Tune the parameters on the entire data. The original used the old
    # GridSearch/evaluate API, which has been removed from Surprise; the
    # current equivalent is GridSearchCV with lowercase measure keys.
    param_grid = {
        'k': [5, 10, 20],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson'],
            'min_support': [1, 5],
            'user_based': [False]
        }
    }
    grid_search = GridSearchCV(KNNBasic, param_grid, measures=['rmse'])
    grid_search.fit(data[3])
    param = grid_search.best_params['rmse']
    k = param['k']
    sim_options = param['sim_options']

    # Using the tuned parameters, calculate running times. Note that similarity
    # settings must be passed through sim_options; the original passed them as
    # bare keyword arguments, which Surprise silently ignores.
    for i in range(len(data)):
        # Training running time.
        training_start = time.time()
        training = data[i].build_full_trainset()
        testing = training.build_anti_testset()
        knn = KNNBasic(k=k, sim_options=sim_options)
        knn.fit(training)
        elapsedtime_KnnBasictrain.append(time.time() - training_start)

        # Prediction running time.
        test_start = time.time()
        knn.test(testing)
        elapsedtime_KnnBasictest.append(time.time() - test_start)
    return elapsedtime_KnnBasictrain, elapsedtime_KnnBasictest
def use_cosine_similarity():
    start = time.time()
    performance = []
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    print('Using cosine similarity')
    sim_options = {
        'name': 'cosine',
        'user_based': False  # compute similarities between items
    }
    algo_cosine = KNNBasic(sim_options=sim_options)
    algo_cosine.fit(trainset)
    testset = trainset.build_anti_testset()
    predictions_KNN = algo_cosine.test(testset)
    accuracy_rmse = accuracy.rmse(predictions_KNN)
    accuracy_mae = accuracy.mae(predictions_KNN)
    performance.append(accuracy_rmse)
    performance.append(accuracy_mae)
    end = time.time()
    performance.append(end - start)
    return performance
def rodar_modelo(data, teste_tamanho, sim_opcoes, k):
    # Split the data, train a basic KNN model, and report RMSE on the test set.
    treina, testa = train_test_split(data, test_size=teste_tamanho)
    knn = KNNBasic(k=k, sim_options=sim_opcoes)
    knn.fit(treina)
    knn_predicoes = knn.test(testa)
    accuracy.rmse(knn_predicoes)
    return knn
def use_pearson_baseline():
    start = time.time()
    performance = []
    data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    print('Using Pearson baseline')
    sim_options = {
        'name': 'pearson_baseline',
        'shrinkage': 0  # no shrinkage
    }
    algo_pearson = KNNBasic(sim_options=sim_options)
    algo_pearson.fit(trainset)
    testset = trainset.build_anti_testset()
    predictions_KNN = algo_pearson.test(testset)
    accuracy_rmse = accuracy.rmse(predictions_KNN)
    accuracy_mae = accuracy.mae(predictions_KNN)
    performance.append(accuracy_rmse)
    performance.append(accuracy_mae)
    end = time.time()
    performance.append(end - start)
    return performance
def KNN_Tester(data, trainset, testset, algo):
    # The grid search needs the full Dataset object, which the original
    # referenced without defining; it is now passed in explicitly alongside
    # the pre-built trainset/testset.
    param_grid = {
        'k': [50, 100],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson']
        }
    }
    gs = GridSearchCV(algo, param_grid, measures=['rmse'], cv=5)
    gs.fit(data)
    params = gs.best_params['rmse']
    algo = KNNBasic(k=params['k'], sim_options=params['sim_options'])
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=4)
    avg_precision = sum(prec for prec in precisions.values()) / len(precisions)
    avg_recall = sum(rec for rec in recalls.values()) / len(recalls)
    metrics = {
        'rmse': rmse,
        'avg_precision': avg_precision,
        'avg_recall': avg_recall,
        'best_parameters': params
    }
    return metrics
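# KNN_Tester above (and several snippets below) rely on an undefined
# precision_recall_at_k helper. A minimal sketch follows, modeled on the
# Surprise FAQ example; the threshold default here is an assumption.
from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Return dicts mapping each user id to precision@k and recall@k."""
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    precisions, recalls = {}, {}
    for uid, user_ratings in user_est_true.items():
        # Rank this user's items by estimated rating.
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])
        # Guard against users with no recommended or no relevant items.
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0
    return precisions, recalls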
def content(self):
    # Content-based recommendation using item-item cosine similarity.
    surprise_data = self.prepare_Data()
    if surprise_data == []:
        print("No data provided")
        return
    sim_options = {
        'name': 'cosine',
        'user_based': False  # compute similarities between items
    }
    algo = KNNBasic(sim_options=sim_options)
    trainset = surprise_data.build_full_trainset()
    algo.fit(trainset)
    testset = trainset.build_testset()
    predictions = algo.test(testset)
    recommendation = self.get_top_n(predictions)
    new_list = []
    k = 0
    for i, j in recommendation[self.user_id]:
        new_list.append({'id': k, 'business id': i})
        k += 1
    recommend = {item['id']: item for item in new_list}
    return recommend
def run_KNN(x_train, x_test, k):
    reader = Reader(rating_scale=(1, 5))
    data_train_df = Dataset.load_from_df(
        x_train[['userId', 'movieId', 'rating']], reader)
    data_test_df = Dataset.load_from_df(
        x_test[['userId', 'movieId', 'rating']], reader)
    data_train = data_train_df.build_full_trainset()
    data_test = data_test_df.build_full_trainset()
    data_testset = data_test.build_testset()
    algo = KNNBasic()
    algo.fit(data_train)
    pr = algo.test(data_testset)
    rec = format_baselines(pr)
    seen = format_baselines_apk(pr, x_test)
    predicted, actual = format_baselines_third(pr, x_test)
    print(predicted)
    print(actual)
    print(f'Alternative Precision {recommender_precision(predicted, actual)}')
    print(f'Alternative Recall {recommender_recall(predicted, actual)}')
    print(f'APK: {yallah(seen, k)}')
    precisions, recalls = precision_recall_at_k(rec, k)
    print(f'|KNN : Precision| = '
          f'{sum(prec for prec in precisions.values()) / len(precisions)}')
    print(f'|KNN : Recall| = '
          f'{sum(rec for rec in recalls.values()) / len(recalls)}')
class KNN_Basic(BaseSurpriseSTLEstimator):
    """
    Args:
        :attr:`k` (int): number of neighbors
        :attr:`sim_options` (dict, optional): Surprise option dict selecting
            the similarity metric
    """

    def __init__(self, k, name='KNN_Basic', sim_options=None):
        super().__init__(name, 'non_feature_based')
        self.k = k
        if sim_options is not None:
            self.model = KNNBasic(k=self.k, verbose=False,
                                  sim_options=sim_options)
        else:
            self.model = KNNBasic(k=self.k, verbose=False)

    def _fit(self, x):
        self.model.fit(x)

    def _predict(self, x):
        return self.model.test(x)

    def get_hyper_params(self):
        hparams = {'k': {'type': 'integer', 'values': [2, 13]}}
        return hparams

    def set_hyper_params(self, **kwargs):
        self.k = kwargs['k']

    def similarity_matrix(self):
        return self.model.compute_similarities()
def results():
    names = ['userID', 'itemID', 'rating']
    df = pd.read_csv('~/.surprise_data/ratings.csv', names=names)
    names1 = ['itemID', 'Profession', 'City']
    df1 = pd.read_csv('~/.surprise_data/workers1.csv', names=names1)
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)
    trainset = data.build_full_trainset()
    sim_options = {'name': 'cosine', 'user_based': False}
    # Pass the similarity options through; the original passed an empty dict,
    # which silently fell back to the default MSD user-based similarity.
    algo = KNNBasic(k=40, min_k=1, sim_options=sim_options)
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    top_n = get_top_n(predictions, n=10)
    myArray = []
    for uid, user_ratings in top_n.items():
        abcd = []
        for w in user_ratings:
            abcd.append(w)
        myArray.append([uid, abcd])
    print(myArray)
    return render_template('secondpage.html', returned={'data': myArray})
def get_accuracy(df, genre, neighbors=30, min_neighbors=5, seed=12345,
                 kfolds=5, k=5, threshold=4):
    """
    Gets the precision and recall of the model for each genre using
    cross-validation.

    Args:
        df (pandas.DataFrame): the dataset of actual ratings
        genre (str): the genre for the model
        neighbors (int): the number of neighbors to take into account when
            training the model. Default is 30.
        min_neighbors (int): the number of neighbors a user must have in order
            to get a prediction. Default is 5.
        seed (int): setting the random state. Default is 12345.
        kfolds (int): the number of folds for cross-validation. Default is 5.
        k (int): number of recommendations for each user. Default is 5.
        threshold (int): the cutoff rating at which an item will be considered
            'enjoyed.' Default is 4.

    Returns:
        prec (float): the average precision across the k-fold cross-validation
        rec (float): the average recall across the k-fold cross-validation
    """
    data = df[df['genre'] == genre]
    data = data[['user_id', 'book_id', 'rating']]
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(data[['user_id', 'book_id', 'rating']], reader)
    # KNNBasic is deterministic; the original passed random_state=seed, which
    # Surprise silently ignores for this algorithm.
    algo_KNNbasic = KNNBasic(k=neighbors, min_k=min_neighbors)
    kf = KFold(n_splits=kfolds, random_state=seed)
    prec_list = []
    recalls_list = []
    for trainset, testset in kf.split(data):
        algo_KNNbasic.fit(trainset)
        predictions = algo_KNNbasic.test(testset)
        precisions, recalls = precision_recall_at_k(predictions, k=k,
                                                    threshold=threshold)
        # Precision and recall can then be averaged over all users.
        precision = sum(prec for prec in precisions.values()) / len(precisions)
        recall = sum(rec for rec in recalls.values()) / len(recalls)
        logger.info("Precision:")
        logger.info(precision)
        logger.info("Recall:")
        logger.info(recall)
        prec_list.append(precision)
        recalls_list.append(recall)
    prec = sum(prec_list) / len(prec_list)
    rec = sum(recalls_list) / len(recalls_list)
    return prec, rec
def executeTraining(modelFileName, simOptions):
    # Note: trainingSet is assumed to be defined at module scope.
    # The original passed sim_options (undefined) instead of the simOptions
    # parameter, and used the removed knn.train() API.
    knn = KNNBasic(sim_options=simOptions, k=3)
    knn.fit(trainingSet)
    testSet = trainingSet.build_anti_testset()
    predictions = knn.test(testSet)
    os.makedirs('./outputs', exist_ok=True)
    joblib.dump(knn, os.path.join('./outputs', modelFileName))
def main():
    data = Dataset.load_builtin('ml-100k')
    trainset, testset = train_test_split(data, test_size=.25)
    algo = KNNBasic()
    # Train the algorithm on the trainset, and predict ratings for the testset.
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Then compute RMSE.
    score = accuracy.rmse(predictions)
    print('rmse: ', score)
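# The same experiment can be run in one call with Surprise's built-in
# cross_validate helper. This is an alternative sketch, not part of the
# original script:
from surprise import Dataset, KNNBasic
from surprise.model_selection import cross_validate

def main_cv():
    data = Dataset.load_builtin('ml-100k')
    # 5-fold cross-validation, printing RMSE and MAE per fold.
    cross_validate(KNNBasic(), data, measures=['RMSE', 'MAE'], cv=5,
                   verbose=True)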
def knn_basic_movie(train, test, ids, Xtest, Xids):
    """
    kNN basic approach on movies

    Argument : train, the trainset
               test, the testset
               ids, unknown ratings
               Xtest, predicted ratings for testset, to be used for final blending
               Xids, predicted ratings for unknown ratings, to be used for final blending
    """
    print('kNN Basic Movie')
    # Similarity settings belong in the sim_options dict; the original passed
    # them as bare keyword arguments, which Surprise silently ignores.
    sim_options = {'name': 'msd', 'min_support': 2, 'user_based': False}
    algo = KNNBasic(k=21, sim_options=sim_options, verbose=False)

    # Train algorithm on training set
    algo.fit(train)

    # Predict on train and compute RMSE
    predictions = algo.test(train.build_testset())
    print('   Training RMSE: ', accuracy.rmse(predictions, verbose=False))

    # Predict on test and compute RMSE
    predictions = algo.test(test)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('   Test RMSE: ', rmse)

    preds_test = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds_test[j] = pred.est

    # Predict unknown ratings
    preds_ids = []
    for i in range(len(ids[0])):
        pred = algo.predict(str(ids[0][i]), str(ids[1][i]))
        preds_ids.append(pred.est)

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
def run_collaborative_filtering():
    global top_recommendations
    global knn
    data = Dataset.load_builtin("ml-100k")
    training_set = data.build_full_trainset()
    sim_options = {'name': 'pearson_baseline', 'user_based': True}
    knn = KNNBasic(sim_options=sim_options)
    knn.fit(training_set)
    test_set = training_set.build_anti_testset()
    predictions = knn.test(test_set)
    top_recommendations = get_top_recommendations(predictions)
    return 'OK'
def do_knn(trainingSet, start_time):
    # Note: sim_options is assumed to be defined at module scope.
    knn = KNNBasic(sim_options=sim_options)
    knn.fit(trainingSet)
    testSet = trainingSet.build_anti_testset()
    print("Training complete")
    predictions = knn.test(testSet)
    print("Predictions ready")
    LOGGER.info("0;Data prediction completed in '%s' minutes",
                str((time.time() - start_time) / 60))
    print("RMSE for model-based recommendation on the MovieLens data is " +
          str(accuracy.rmse(predictions)))
    return predictions
def algoFunc(train_data, test_data):
    print("Singular Value Decomposition :\n")
    SVD_var = SVD()
    SVD_var.fit(train_data)
    predict_var = SVD_var.test(test_data)
    SVD_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    SVD_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nProbabilistic Matrix Factorization :\n")
    PMF_var = SVD(biased=False)
    PMF_var.fit(train_data)
    predict_var = PMF_var.test(test_data)
    PMF_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    PMF_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nNon-negative Matrix Factorization :\n")
    NMF_var = NMF()
    NMF_var.fit(train_data)
    predict_var = NMF_var.test(test_data)
    NMF_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    NMF_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nUser based Collaborative Filtering algorithm :\n")
    UB_var = KNNBasic(sim_options={'user_based': True})
    UB_var.fit(train_data)
    predict_var = UB_var.test(test_data)
    user_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    user_MAE_var = accuracy.mae(predict_var, verbose=True)

    print("\nItem based Collaborative Filtering algorithm :\n")
    IB_var = KNNBasic(sim_options={'user_based': False})
    IB_var.fit(train_data)
    predict_var = IB_var.test(test_data)
    item_RMSE_var = accuracy.rmse(predict_var, verbose=True)
    item_MAE_var = accuracy.mae(predict_var, verbose=True)
    print("\n")

    return (SVD_RMSE_var, SVD_MAE_var, PMF_RMSE_var, PMF_MAE_var,
            NMF_RMSE_var, NMF_MAE_var, user_RMSE_var, user_MAE_var,
            item_RMSE_var, item_MAE_var)
def user_based_cf(co_pe):
    # INITIALIZE REQUIRED PARAMETERS
    # path = 'ml-100k/u.user'
    prnt = "USER"
    sim_op = {'name': co_pe, 'user_based': True}
    algo = KNNBasic(sim_options=sim_op)

    # ml-100k's u.data has four tab-separated fields, so the timestamp must be
    # declared in line_format even though it is not used.
    reader = Reader(line_format="user item rating timestamp", sep='\t',
                    rating_scale=(1, 5))
    df = Dataset.load_from_file('ml-100k/u.data', reader=reader)

    # START TRAINING
    trainset = df.build_full_trainset()

    # APPLYING ALGORITHM KNN Basic (fit() replaces the removed train() API)
    algo.fit(trainset)
    print("ALGORITHM USED", co_pe)

    # --------------------------------------------- MARKERS
    with io.open("_AlgoHist_ub.txt", "w") as f:
        f.write(repr(co_pe))
    # --------------------------------------------- MARKERS END

    print("CF Type:", prnt, "BASED")

    # PEEKING PREDICTED VALUES
    search_key = input("Enter User ID:")
    item_id = input("Enter Item ID:")
    actual_rating = float(input("Enter actual Rating:"))
    print(algo.predict(str(search_key), item_id, actual_rating))

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset=testset)
    top_n = get_top_n(predictions, 5)
    result_u = True

    k = int(input("Enter size of Neighborhood (Min:1, Max:40)"))
    # For a user-based model the similarity matrix is user-user, so the raw
    # user id must be mapped with to_inner_uid (the original used to_inner_iid).
    inner_id = algo.trainset.to_inner_uid(search_key)
    neighbors = algo.get_neighbors(inner_id, k=k)
    print("Nearest Matching users are:")
    for i in neighbors:
        print("\t " * 6, i)
    return top_n, result_u
def CosineAlgorithmSurprise(self):
    sim_options = {'name': 'cosine', 'user_based': True, 'min_support': 1}
    model = KNNBasic(sim_options=sim_options)
    model.fit(self.Train)
    # testset = self.Train.build_anti_testset()
    # predictions = model.test(testset)
    predictions = model.test(self.Test)
    df = pd.DataFrame(predictions, columns=[
        'user_id', 'song_id', 'listen_count', 'prediction', 'details'
    ])
    return model, df
class MusicRecommend():

    def __init__(self):
        self.current = 0
        self.updateTimeStamp = [(self.current, time.time())]
        self.top_n = defaultdict(list)
        reader = Reader(line_format=READER_OPT["line_format"],
                        sep=READER_OPT["sep"],
                        rating_scale=READER_OPT["rating_scale"],
                        skip_lines=READER_OPT["skip_lines"])
        self.data = Dataset.load_from_file(RATE_PATH, reader=reader)
        if os.path.isfile(RECOMMEND_PATH):
            self.predictions, self.algo = dump.load(RECOMMEND_PATH)
        else:
            sim_opt = {
                "name": ALGO_OPT["similarity"],
                "user_based": ALGO_OPT["user_based"]
            }
            self.algo = KNNBasic(sim_options=sim_opt)
            self.predictions = []

    def __del__(self):
        # dump.dump(RECOMMEND_PATH, predictions=self.predictions, algo=self.algo, verbose=0)
        StdError.info(
            'The dump has been saved as file {}'.format(RECOMMEND_PATH))

    def calculate(self, n=100):
        trainset = self.data.build_full_trainset()
        self.algo.fit(trainset)
        testset = trainset.build_anti_testset()
        self.predictions = self.algo.test(testset)
        self.current += 1
        self.updateTimeStamp.append((self.current, time.time()))
        self.top_n = defaultdict(list)
        for uid, iid, t_rating, est, _ in self.predictions:
            self.top_n[uid].append((iid, est))
        for uid, user_ratings in self.top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            self.top_n[uid] = user_ratings[:n]
        return (self.predictions, self.top_n)

    def get_top_n(self, uid, start=0, end=RECOMMEND_NUM):
        tmplist = self.top_n[str(uid)][start:end]
        return [iid for iid, _ in tmplist]

    def show(self):
        if self.current > 0:
            StdError.info("recommend current version={}".format(self.current))
            for uid, user_ratings in self.top_n.items():
                StdError.info(
                    str(uid) + ":" + str([iid for iid, _ in user_ratings]))
def gen_pred_matrix_ibcf(co_pe):
    # ---------------------------------------------------- IBCF as is
    # INITIALIZE REQUIRED PARAMETERS
    path = '/home/mister-t/Projects/PycharmProjects/RecommendationSys/ml-100k/u.item'
    prnt = "ITEM"
    sim_op = {'name': co_pe, 'user_based': False}
    algo = KNNBasic(sim_options=sim_op)

    # ml-100k's u.data has four tab-separated fields, so the timestamp must be
    # declared in line_format even though it is not used.
    reader = Reader(line_format="user item rating timestamp", sep='\t',
                    rating_scale=(1, 5))
    df = Dataset.load_from_file('ml-100k/u.data', reader=reader)

    # START TRAINING
    trainset = df.build_full_trainset()

    # APPLYING ALGORITHM KNN Basic; fit() replaces the removed train() API,
    # which in any case returned nothing printable.
    algo.fit(trainset)

    # Read the mappings raw id <-> movie name
    # rid_to_name, name_to_rid = read_item_names(path)

    print("CF Type:", prnt, "BASED")
    print("Please be Patient while 'pred_matrix-full_ibcf.csv' is being Generated")
    for i in range(5):
        print(".")
        time.sleep(0.5)

    # --------------------------------------------------------- EXPERIMENTAL
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset=testset)
    top_n = get_top_n(predictions, 5)
    # --------------------------------------------------------- EXPERIMENTAL

    # ---------------------------------------------------- IBCF as is
    csvfile = 'pred_matrix-full_ibcf.csv'
    with open(csvfile, "w") as output:
        writer = csv.writer(output, delimiter=',', lineterminator='\n')
        writer.writerow(['uid', 'iid', 'rat'])
        for uid, user_ratings in top_n.items():
            for (iid, r) in user_ratings:
                writer.writerow((uid, iid, r))
    print("Done! You may now check the file in same Dir. as of Program")
def loadTrainPredict():
    data = Dataset.load_builtin("ml-100k")
    trainingSet = data.build_full_trainset()
    sim_options = {'name': 'cosine', 'user_based': False}
    knn = KNNBasic(sim_options=sim_options)
    knn.fit(trainingSet)
    testSet = trainingSet.build_anti_testset()
    predictions = knn.test(testSet)
    return predictions
def main():
    data = Dataset.load_builtin("ml-100k")
    trainingSet = data.build_full_trainset()
    sim_options = {
        'name': 'cosine',
        'user_based': True
    }
    knn = KNNBasic(sim_options=sim_options)
    knn.fit(trainingSet)
    testSet = trainingSet.build_anti_testset()
    predictions = knn.test(testSet)
    top3_recommendations = get_top3_recommendations(predictions)
    rid_to_name = read_item_names()
    for uid, user_ratings in top3_recommendations.items():
        print(uid, [rid_to_name[iid] for (iid, _) in user_ratings])
def main():
    row_num = 5000
    # Read the ratings file into a pandas DataFrame so Surprise can use it.
    ratings_data = pd.read_csv('datasets/song_dataset_ranking.txt', sep="\t",
                               header=None, nrows=row_num)
    # Define the document's columns.
    ratings_data.columns = ['userId', 'songId', 'rating']

    # Read the CSV holding the song metadata and build a lookup, e.g.
    # keysonisonioiaofnai: ['Smoke on the water', 'Deep purple']
    song_dict = {}
    with open('datasets/song_data.csv', 'rt') as song_data:
        c_reader = csv.reader(song_data, delimiter=',', quotechar='|')
        for row in c_reader:
            song_dict.update({row[0]: [row[1], row[3]]})

    # Surprise reader: define the rating scale to use.
    reader = Reader(rating_scale=(1, 100))
    # Transform the DataFrame into a Surprise dataset.
    data = Dataset.load_from_df(ratings_data, reader)
    # Split the data into a training set and a test set.
    training_set, testSet = train_test_split(data, test_size=.25)

    # Define the algorithm to use; similarity settings must go through
    # sim_options (the original passed them as bare keyword arguments,
    # which Surprise silently ignores).
    knn = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
    # Train the algorithm.
    knn.fit(training_set)
    print("Done training")
    print("Test set length", len(testSet))
    print("testing")
    # Make predictions.
    predictions = knn.test(testSet)
    print("getting recommendations")
    # Measure accuracy: compute FCP (Fraction of Concordant Pairs).
    accuracy.fcp(predictions)
    # Get the top-n predictions.
    top_n = get_top_n(predictions, 4)

    with open('predictions.txt', 'w') as file:
        for uid, user_ratings in top_n.items():
            file.write("prediction for " + str(uid) + ":\n")
            result_array = [find_song_info_in_data(iid, song_dict)
                            for (iid, _) in user_ratings]
            for item in result_array:
                file.write("\t")
                file.write('-'.join(item))
                file.write("\n")
def kNNBasic(trainset, testset):
    # KNN basic
    print("\n" + "-" * 5 + " KNNBasic algorithm using surprise package " + "-" * 5)
    sim_options = {
        'name': 'MSD',  # the MSD similarity measure gives the best result
        # 'user_based': True   # similarities between users: MAE = 0.7744112391896695
        'user_based': False  # similarities between items: MAE = 0.7685376263051
    }
    algo = KNNBasic(sim_options=sim_options)
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae, predictions
def KNN_top_n(data):
    # First train a KNNBasic algorithm on the movielens dataset (the original
    # comment said SVD, but the model used here is KNNBasic).
    # data = Dataset.load_builtin('ml-100k')
    trainset = data.build_full_trainset()
    algo = KNNBasic()
    algo.fit(trainset)

    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    top_n = get_top_n(predictions, n=10)

    # Dump the algorithm so it can be reloaded later.
    file_name = os.path.expanduser('./KNNBasic_model_couchDB')
    dump.dump(file_name, algo=algo)
    print("file dumped")
    return top_n
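# A minimal sketch of reloading the model dumped by KNN_top_n above; dump.load
# returns a (predictions, algorithm) pair, and predictions is None here because
# only the algorithm was dumped.
def load_KNN_model(file_name='./KNNBasic_model_couchDB'):
    _, loaded_algo = dump.load(os.path.expanduser(file_name))
    # The reloaded model can serve predictions without retraining, e.g.:
    # loaded_algo.predict(uid, iid)
    return loaded_algo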
class RecommenderItemBased(Recommender):

    def __init__(self, recommendation_dataset: RecommendationDataSet,
                 similarity='cosine'):
        super(RecommenderItemBased, self).__init__(recommendation_dataset.movies)
        self.recommendation_dataset = recommendation_dataset
        sim_options = {'name': similarity, 'user_based': False}
        self.algorithm = KNNBasic(sim_options=sim_options)

    def get_recommendation(self, watched, k=20, k_inner_item=100):
        similar_items = self.get_similar_movie_ids(watched, k=k,
                                                   k_inner_item=k_inner_item)
        return self.movies.get_movie_by_movie_ids(similar_items)

    def fit(self, dataset):
        return self.algorithm.fit(dataset)

    def test(self, test_set):
        return self.algorithm.test(test_set)

    def get_similar_movie_ids(self, watched, k=20, k_inner_item=100):
        """Based on similar item movies, find the movies nearest to the watched ones.

        :param watched: dict mapping raw item ids to the user's ratings
        :param k: number of movies to return
        :param k_inner_item: number of top-rated watched movies to seed from
        """
        full_dataset = self.algorithm.trainset

        # Convert the watched movies to inner ids.
        watched = {full_dataset.to_inner_iid(key): value
                   for key, value in watched.items()}

        # Take the most-liked watched movies as seeds.
        most_liked = heapq.nlargest(k_inner_item, watched, key=watched.get)

        # Add up similarity scores for every candidate item, weighted by the
        # user's rating of the seed movie.
        candidates = defaultdict(float)
        for most_liked_inner_id in most_liked:
            rating = watched[most_liked_inner_id]
            similarity_row = self.algorithm.sim[most_liked_inner_id]
            for inner_id, score in enumerate(similarity_row):
                if inner_id != most_liked_inner_id:
                    candidates[inner_id] += score * (rating / 5.0)

        # Return the top-k movies, converted back to raw ids.
        similar_items = [full_dataset.to_raw_iid(i)
                         for i in heapq.nlargest(k, candidates,
                                                 key=candidates.get)]
        return similar_items
def Basic_CF(self):
    kf = KFold(n_splits=5)
    sim_options = {'name': 'cosine', 'user_based': True}
    algo = KNNBasic(k=40, min_k=1, sim_options=sim_options)
    for trainset, testset in kf.split(self.data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        precisions, recalls = self.precision_recall_at_k(predictions)
        P = sum(prec for prec in precisions.values()) / len(precisions)
        R = sum(rec for rec in recalls.values()) / len(recalls)
        F1 = 2 * P * R / (P + R)
        print("Precision : ", P)
        print("Recall : ", R)
        print("F1 : ", F1)
def GetAccuracy():
    d = Data()
    data = d.loadData()
    # Fit on the train split only; the original fit on the full trainset and
    # then evaluated on a random subset of the same data, which leaks the test
    # ratings into training and inflates the scores.
    trainSet, testSet = train_test_split(data, test_size=.25, random_state=1)
    # Note: sim_options is assumed to be defined at module scope.
    model = KNNBasic(sim_options=sim_options, verbose=False)
    model.fit(trainSet)
    predictions = model.test(testSet)
    mae = accuracy.mae(predictions, verbose=False)
    rmse = accuracy.rmse(predictions, verbose=False)
    return mae, rmse
def recommendation_base_on_itemCF(train_data, user_item_matrix, user_ID, N):
    # Reader
    reader = Reader(line_format='user item rating', sep=',')
    # Load the data
    raw_data = Dataset.load_from_df(user_item_matrix, reader=reader)

    # Build the model; the old Dataset.split()/folds() API was removed from
    # Surprise, so a KFold iterator is used instead.
    kf = KFold(n_splits=5)
    knn_item = KNNBasic(k=40, sim_options={'user_based': False})

    # Train on each fold and report the RMSE.
    for train_set, test_set in kf.split(raw_data):
        knn_item.fit(train_set)
        predictions = knn_item.test(test_set)
        accuracy.rmse(predictions, verbose=True)

    # The set of songs each user has listened to.
    user_songs = {}
    for user, group in user_item_matrix.groupby('user'):
        user_songs[user] = group['item'].values.tolist()

    # The full set of songs.
    songs = user_item_matrix['item'].unique().tolist()

    # Mapping from song ID to song title.
    songID_titles = {}
    for index in train_data.index:
        songID_titles[train_data.loc[index, 'song']] = train_data.loc[index, 'title']

    # itemCF
    # Songs the user has already listened to.
    user_items = user_songs[user_ID]

    # Predicted ratings for the songs the user has not listened to.
    item_rating = {}
    for item in songs:
        if item not in user_items:
            item_rating[item] = knn_item.predict(user_ID, item).est

    # Take the N songs with the highest predicted rating.
    song_id = dict(
        sorted(item_rating.items(), key=lambda x: x[1], reverse=True)[:N])
    song_topN = [songID_titles[s] for s in song_id.keys()]
    return song_topN