def knn_m(data, training, testing):
    """Tune KNNWithMeans by grid search, then score it on a held-out test set.

    Args:
        data (Dataset): the whole dataset, already split into folds, used for
            the cross-validated grid search.
        training (Trainset): training set the final model is fitted on.
        testing (list): test set the final model is evaluated on.

    Returns:
        tuple: ``(rmse, top_n)`` where ``rmse`` is the test RMSE of the tuned
        model and ``top_n`` is whatever ``get_top_n(predictions, n=5)``
        returns for the test predictions.
    """
    # Candidate parameters (item-based similarities only).
    knn_param_grid = {
        'k': [5, 10, 20],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson'],
            'min_support': [1, 5],
            'user_based': [False],
        },
    }

    # Pick the combination with the best cross-validated RMSE.
    knnm_grid_search = GridSearch(KNNWithMeans, knn_param_grid,
                                  measures=['RMSE'], verbose=False)
    knnm_grid_search.evaluate(data)
    param = knnm_grid_search.best_params['RMSE']
    print('KNNWithMeans:', param)

    # The similarity settings have to be handed over as one ``sim_options``
    # dict. ``name``/``min_support``/``user_based`` are not constructor
    # arguments of KNNWithMeans, so passing them individually left the model
    # on the library defaults instead of the tuned configuration.
    knnm = KNNWithMeans(k=param['k'], sim_options=param['sim_options'])
    knnm.train(training)

    # Evaluate the tuned model on the test data.
    predictions = knnm.test(testing)
    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)
    return rmse, top_n
def knnm_running_time(data):
    """Measure KNNWithMeans training and prediction times on several datasets.

    Hyper-parameters are tuned once by grid search on ``data[3]`` and then
    reused for every dataset in ``data``.

    Args:
        data (list): datasets with different numbers of users. Must contain at
            least four entries because ``data[3]`` is used for tuning.

    Returns:
        tuple: ``(elapsedtime_KnnMeanstrain, elapsedtime_KnnMeanstest)``, two
        lists of wall-clock seconds with one entry per dataset. The first
        covers building the full trainset and fitting; the second covers
        predicting the anti-testset.
    """
    elapsedtime_KnnMeanstrain = []
    elapsedtime_KnnMeanstest = []

    # Tune the parameters once.
    # NOTE(review): data[3] is presumably the complete dataset - confirm.
    param_grid = {
        'k': [5, 10, 20],
        'sim_options': {
            'name': ['msd', 'cosine', 'pearson'],
            'min_support': [1, 5],
            'user_based': [False],
        },
    }
    grid_search = GridSearch(KNNWithMeans, param_grid, measures=['RMSE'],
                             verbose=False)
    grid_search.evaluate(data[3])
    param = grid_search.best_params['RMSE']

    for dataset in data:
        # Training running time: trainset construction plus fitting.
        training_start = time.time()
        training = dataset.build_full_trainset()
        # Similarity settings must travel inside ``sim_options``; as separate
        # keyword arguments they are not applied to the model.
        knnm = KNNWithMeans(k=param['k'], sim_options=param['sim_options'])
        knnm.train(training)
        elapsedtime_KnnMeanstrain.append(time.time() - training_start)

        # Building the anti-testset is neither training nor prediction, so it
        # is kept outside both timed sections.
        testing = training.build_anti_testset()

        # Prediction running time.
        test_start = time.time()
        knnm.test(testing)
        elapsedtime_KnnMeanstest.append(time.time() - test_start)

    return elapsedtime_KnnMeanstrain, elapsedtime_KnnMeanstest
def model_training_and_evalution():
    """Train an item-based KNNWithMeans model and print 10 nearest neighbours.

    Loads tab-separated ``user item rating`` triples from a hard-coded file,
    fits the model on the full trainset, then looks up one hard-coded item
    name via ``read_item_names()`` and prints the names of its ten nearest
    neighbours.

    Returns:
        None. All results are printed.
    """
    print("欢迎来到 训练阶段")
    file_path = os.path.expanduser(
        r'E:\JiangIntellijWorkingSpace\tools\music_recommendation\transform_playlist_song_rating.txt')
    reader = Reader(line_format='user item rating', sep='\t')
    music_data = Dataset.load_from_file(file_path, reader=reader)

    print("构建数据集")
    trainset = music_data.build_full_trainset()

    print("开始训练模型.....")
    sim_options = {'name': 'pearson_baseline', 'user_based': False}
    # Must be passed by keyword: the first positional parameter of
    # KNNWithMeans is ``k``, so a positional dict never reaches sim_options
    # and the item-based configuration would be ignored.
    algo = KNNWithMeans(sim_options=sim_options)
    algo.train(trainset)

    rid_to_name, name_to_rid = read_item_names()
    target_raw_id = name_to_rid[u'Over The Horizon-SAMSUNG GALAXY THEME']
    print(target_raw_id)

    # Neighbours are computed on inner ids; map raw -> inner -> raw -> name.
    target_inner_id = algo.trainset.to_inner_iid(target_raw_id)
    neighbor_inner_ids = algo.get_neighbors(target_inner_id, k=10)

    print('the 10 nearest neighbors of it are(为你推荐最相近的10首歌单):')
    for inner_id in neighbor_inner_ids:
        print(rid_to_name[algo.trainset.to_raw_iid(inner_id)])
file_path = os.path.expanduser('./data/163_music_suprise_format.txt') # 指定文件格式 reader = Reader(line_format='user item rating timestamp', sep=',') # 从文件读取数据 music_data = Dataset.load_from_file(file_path, reader=reader) # 计算歌曲和歌曲之间的相似度 print "构建数据集..." trainset = music_data.build_full_trainset() #sim_options = {'name': 'pearson_baseline', 'user_based': False} #查找最近的user print "开始训练模型..." #sim_options = {'user_based': False} #algo = KNNBaseline(sim_options=sim_options) algo = KNNWithMeans() algo.train(trainset) current_playlist = list(name_id_dic.keys())[39] print "歌单名称", current_playlist # 取出近邻 # 映射名字到id playlist_id = name_id_dic[current_playlist] print "歌单id", playlist_id # 取出来对应的内部user id => to_inner_uid playlist_inner_id = algo.trainset.to_inner_uid(playlist_id) print "内部id", playlist_inner_id playlist_neighbors = algo.get_neighbors(playlist_inner_id, k=10) # 把歌曲id转成歌曲名字
class recommender:
    """Facade over Surprise models and the sequence-based FISM predictor.

    The algorithm is selected by name at construction time. ``fit`` trains it,
    the ``*_predictions`` methods expose Surprise predictions as pandas data
    frames, and ``get_top_n`` turns them into per-user item rankings.
    """

    def __init__(self, algorithm):
        """Store the lower-cased algorithm name and reset all cached results.

        Args:
            algorithm (str): case-insensitive name: ``'svd'``, ``'nmf'``,
                ``'knnbasic'``, ``'knnmeans'`` or ``'fism'``.
        """
        self.name = algorithm.lower()
        self.surprise_algorithms = ['svd', 'nmf', 'knnbasic', 'knnmeans']
        self.devooght_algorithms = ['fism']

        # Roadmap (not implemented yet).
        # With Surprise: SVDpp, CoClustering, KNNWithZScore, KNNBaseline,
        #   NormalPredictor, BaselineOnly, SlopeOne.
        # With RNNs: LSTM, GRU and GRU with clustering (Devooght, Bersini).
        # Latent-factor extraction: stacked autoencoders, CNN, CNN with
        #   stacked autoencoders.

        self.model = None  # set by fit(); lets get_model() work before fit
        self.df_known_predictions = None
        self.df_unknown_predictions = None
        self.known_sequence_dict = None
        self.unknown_sequence_dict = None
        self.k = None
        self.k_min = None
        self.metrics = None

    def get_name(self, verbose=False):
        """Return the lower-cased algorithm name (``verbose`` is unused)."""
        return self.name

    def fit(self, df_ratings=None, columns=('userId', 'itemId', 'rating'),
            verbose=False, **kwargs):
        """Train the configured algorithm.

        Args:
            df_ratings (pandas.DataFrame): ratings; a copy is stored on the
                instance. Required by every algorithm currently wired in.
            columns (sequence of str): user, item and rating column names in
                that order, optionally followed by ``'timestamp'``.
            verbose (bool): print the defaults being used and failures.
            **kwargs: model options: ``n_factors``/``n_epochs`` for svd and
                nmf, ``k``/``k_min`` for the KNN models.

        Returns:
            int: 0 on success, -1 when the algorithm name is unknown or not
            configured.
        """
        self.columns = np.array(columns)
        if df_ratings is not None:
            self.df_ratings = df_ratings.copy()

        if self.name in self.surprise_algorithms:
            return self._fit_surprise(columns, verbose, kwargs)
        if self.name in self.devooght_algorithms:
            return self._fit_devooght(df_ratings, verbose)

        if verbose:
            print("Invalid algorithm: {}".format(self.name))
        # Same failure code as the "not configured" paths.
        return -1

    @staticmethod
    def _option_or_default(options, key, default, label, verbose):
        """Return ``options[key]``, or ``default`` (announced when verbose)."""
        if key in options:
            return options[key]
        if verbose:
            print("Using default {}: {}".format(label, default))
        return default

    def _fit_surprise(self, columns, verbose, options):
        """Build the Surprise trainset and model, then train. Returns 0 or -1."""
        from surprise import Dataset
        from surprise import Reader

        # Only the rating scale matters when loading from a data frame.
        reader = Reader(rating_scale=(0.5, 5.0))

        # Surprise wants exactly (user, item, rating); park the timestamp.
        if 'timestamp' in columns:
            self.df_timestamp = self.df_ratings['timestamp'].copy()
            self.df_ratings.drop(labels='timestamp', inplace=True, axis=1)

        rating_columns = self.columns[np.where(self.columns != 'timestamp')]
        data = Dataset.load_from_df(self.df_ratings[rating_columns], reader)
        # Kept on the instance: the prediction helpers derive testsets from it.
        self.trainset = data.build_full_trainset()

        if self.name == 'svd':
            from surprise import SVD
            self.n_factors = self._option_or_default(
                options, 'n_factors', 100, 'number of factors', verbose)
            self.n_epochs = self._option_or_default(
                options, 'n_epochs', 20, 'number of epochs', verbose)
            self.model = SVD(n_factors=self.n_factors,
                             n_epochs=self.n_epochs,
                             verbose=verbose)
        elif self.name == 'nmf':
            from surprise import NMF
            self.n_factors = self._option_or_default(
                options, 'n_factors', 15, 'number of factors', verbose)
            self.n_epochs = self._option_or_default(
                options, 'n_epochs', 50, 'number of epochs', verbose)
            self.model = NMF(n_factors=self.n_factors,
                             n_epochs=self.n_epochs,
                             verbose=verbose)
        elif self.name == 'knnbasic':
            from surprise import KNNBasic
            self.k = self._option_or_default(options, 'k', 40, 'k', verbose)
            self.k_min = self._option_or_default(
                options, 'k_min', 1, 'k_min', verbose)
            self.model = KNNBasic(k=self.k, min_k=self.k_min, verbose=verbose)
        elif self.name == 'knnmeans':
            # Was compared against 'kmeans', which is not an accepted name,
            # so the advertised 'knnmeans' could never be trained.
            from surprise import KNNWithMeans
            self.k = self._option_or_default(options, 'k', 40, 'k', verbose)
            self.k_min = self._option_or_default(
                options, 'k_min', 1, 'k_min', verbose)
            self.model = KNNWithMeans(k=self.k, min_k=self.k_min,
                                      verbose=verbose)
        else:
            if verbose:
                print("Algorithm not configured: {}".format(self.name))
            return -1

        self.model.train(self.trainset)
        return 0

    def _fit_devooght(self, df_ratings, verbose):
        """Preprocess the ratings and train a sequence-based model. Returns 0 or -1."""
        directory_path = os.path.join(
            '.', 'Sequence_based_recommendation_files', self.name)
        preprocess.create_dirs(dirname=directory_path, verbose=verbose)
        data = preprocess.remove_rare_elements(data=df_ratings,
                                               min_user_activity=1,
                                               min_item_popularity=1,
                                               verbose=verbose)
        data = preprocess.save_index_mapping(data=data,
                                             dirname=directory_path,
                                             separator=',')
        train_set, val_set, test_set = preprocess.split_data(
            data=data,
            nb_val_users=0.1,  # val_size
            nb_test_users=0.1,  # test_size
            dirname=directory_path,
            verbose=verbose)
        preprocess.make_sequence_format(train_set=train_set,
                                        val_set=val_set,
                                        test_set=test_set,
                                        dirname=directory_path,
                                        verbose=verbose)
        preprocess.save_data_stats(data=data,
                                   train_set=train_set,
                                   val_set=val_set,
                                   test_set=test_set,
                                   dirname=directory_path,
                                   verbose=verbose)

        parser = parse.command_parser(parse.predictor_command_parser,
                                      train.training_command_parser,
                                      parse.early_stopping_command_parser)
        if self.name != 'fism':
            if verbose:
                print("Algorithm not configured: {}".format(self.name))
            return -1

        args = parser.parse_args([
            '--dir', os.path.join(directory_path, 'models'),
            '-d', directory_path,
            '-b', '20',  # batch size
            '--max_iter', '50',  # maximum number of iterations
            '--progress', '10',  # how often progress is reported
            '-m', self.name.upper(),  # method
            '--loss', 'RMSE',
            '--save', 'Best',
        ])
        self.model = parse.get_predictor(args)
        dataset = handler.DataHandler(
            dirname=args.dataset,
            extended_training_set=args.extended_set,
            shuffle_training=args.tshuffle)
        self.model.prepare_model(dataset)
        self.metrics = self.model.train(
            dataset,
            save_dir=args.dir,
            time_based_progress=args.time_based_progress,
            progress=float(args.progress),
            autosave=args.save,
            max_progress_interval=args.mpi,
            max_iter=args.max_iter,
            min_iterations=args.min_iter,
            max_time=args.max_time,
            early_stopping=parse.get_early_stopper(args),
            load_last_model=args.load_last_model,
            validation_metrics=args.metrics.split(','))
        return 0

    def get_model(self):
        """Return the underlying model (``None`` before ``fit``)."""
        return self.model

    def get_metrics(self):
        """Return the metrics stored by the sequence-based training, if any."""
        return self.metrics

    @staticmethod
    def _predictions_to_frame(predictions, keep_rating):
        """Convert Surprise predictions to a userId/itemId/rating/prediction frame.

        Rows are collected in a list and converted once; raw ids are cast with
        ``int`` as before. An empty prediction list gives an empty frame.
        """
        rows = [(int(pred.uid), int(pred.iid),
                 pred.r_ui if keep_rating else 0, pred.est)
                for pred in predictions]
        table = np.array(rows).reshape(-1, 4)
        return pd.DataFrame({
            'userId': table[:, 0],
            'itemId': table[:, 1],
            'rating': table[:, 2],
            'prediction': table[:, 3]
        })

    def calculate_known_predictions(self):
        """Predict every rating present in the trainset (Surprise models only).

        Stores the result in ``self.df_known_predictions`` with integer
        ``userId``/``itemId`` columns. When ``'timestamp'`` was among the
        fitted columns, the timestamp and any other non-rating columns of the
        stored ratings are joined back in.
        """
        if self.name not in self.surprise_algorithms:
            return

        known_predictions = self.model.test(self.trainset.build_testset())
        frame = self._predictions_to_frame(known_predictions, keep_rating=True)
        # The numeric table is float; restore integer ids before joining.
        frame['userId'] = frame['userId'].astype(int)
        frame['itemId'] = frame['itemId'].astype(int)

        if 'timestamp' in self.columns:
            # fit() moved the timestamp out of df_ratings into df_timestamp,
            # so it has to be re-attached here. The previous code also read a
            # non-existent local ``df_ratings``.
            extra = self.df_ratings.drop('rating', axis=1).assign(
                timestamp=self.df_timestamp)
            frame = frame.set_index(keys=['userId', 'itemId']).join(
                extra.set_index(keys=['userId', 'itemId'])).reset_index()

        self.df_known_predictions = frame

    def get_known_predictions(self, calculate_predictions=False):
        """Return the known-rating predictions, computing them on first use.

        Args:
            calculate_predictions (bool): force a recomputation.
        """
        if self.df_known_predictions is None or calculate_predictions:
            self.calculate_known_predictions()
        return self.df_known_predictions

    def calculate_unknown_predictions(self):
        """Predict every (user, item) pair absent from the trainset.

        Surprise models only. The ``rating`` column is filled with 0 because
        no true rating exists. Stored in ``self.df_unknown_predictions``.
        """
        if self.name not in self.surprise_algorithms:
            return
        unknown_predictions = self.model.test(
            self.trainset.build_anti_testset())
        self.df_unknown_predictions = self._predictions_to_frame(
            unknown_predictions, keep_rating=False)

    def get_unknown_predictions(self, calculate_predictions=False):
        """Return the unknown-rating predictions, computing them on first use.

        Args:
            calculate_predictions (bool): force a recomputation.
        """
        if self.df_unknown_predictions is None or calculate_predictions:
            self.calculate_unknown_predictions()
        return self.df_unknown_predictions

    def predict(self, userId, itemId, verbose=False):
        """Predict a single rating from raw user and item ids.

        Returns:
            The estimated rating; 0 when Surprise flags the prediction as
            impossible; ``None`` for non-Surprise algorithms.
        """
        if self.name not in self.surprise_algorithms:
            return None

        # Surprise's predict takes the raw user id and item id.
        prediction = self.model.predict(uid=int(userId), iid=int(itemId))
        if prediction.details['was_impossible']:
            if verbose:
                print(
                    "Impossible to predict item {} rating for user {} (one of them may not have been in training step)"
                    .format(itemId, userId))
            return 0
        return prediction.est

    @staticmethod
    def _rank_items_per_user(df_predictions, n):
        """Map each userId to an array of its ``n`` best-predicted itemIds."""
        if df_predictions is None:
            return dict()
        sequences = dict()
        # sort=False keeps users in order of first appearance.
        for user_id, user_rows in df_predictions.groupby('userId', sort=False):
            ranked = user_rows.sort_values(by=['prediction'], ascending=False)
            sequences[user_id] = np.array(ranked['itemId'].head(n))
        return sequences

    def get_top_n(self, n=10, source='unknown', calculate_sequence=False):
        """Return the top-N recommended items for each user.

        Args:
            n (int): number of items to keep per user. Default is 10.
            source (str): ``'known'`` ranks items the user already rated,
                ``'unknown'`` (default) ranks items the user has not rated.
            calculate_sequence (bool): rebuild the ranking even when one is
                cached. The cache ignores ``n``, so pass True when changing it.

        Returns:
            dict: user id -> numpy array of at most ``n`` item ids, best
            prediction first. ``None`` for any other ``source`` value.
        """
        source = source.lower()

        if source == 'known':
            # Previously this triggered the *unknown* predictions by mistake.
            if self.df_known_predictions is None:
                self.get_known_predictions(calculate_predictions=True)
            if calculate_sequence or self.known_sequence_dict is None:
                self.known_sequence_dict = self._rank_items_per_user(
                    self.df_known_predictions, n)
            return self.known_sequence_dict

        if source == 'unknown':
            if self.df_unknown_predictions is None:
                self.get_unknown_predictions(calculate_predictions=True)
            if calculate_sequence or self.unknown_sequence_dict is None:
                self.unknown_sequence_dict = self._rank_items_per_user(
                    self.df_unknown_predictions, n)
            return self.unknown_sequence_dict

        return None
class Surprise_recommender:
    """Tune, train and evaluate Surprise models built from one ``Reader``."""

    def __init__(self, reader):
        """Store the reader used to build Surprise datasets.

        Args:
            reader: a Surprise ``Reader`` describing the rating data.
        """
        self.reader = reader

    def create_test_set(self, test_data):
        """Strip everything after the rating from each test record.

        Args:
            test_data: iterable of records whose first three fields are
                user, item and rating (e.g. followed by a timestamp).

        Returns:
            list: ``[user, item, rating]`` lists, one per record.
        """
        return [[record[0], record[1], record[2]] for record in test_data]

    def create_train_set(self, train_data):
        """Build a Surprise trainset from raw training records.

        Args:
            train_data: list of training records
                (user, item, rating, timestamp).

        Returns:
            The trainset produced by ``Dataset.construct_trainset``.
        """
        ds = Dataset(self.reader)
        return ds.construct_trainset(train_data)

    @staticmethod
    def _run_grid_search(algo_class, param_grid, validation_set):
        """Grid-search ``algo_class`` on the validation data for RMSE and MAE."""
        grid_search = GridSearch(algo_class, param_grid,
                                 measures=['RMSE', 'MAE'])
        grid_search.evaluate(validation_set)
        return grid_search

    @staticmethod
    def _dump(payload, path):
        """Pickle ``payload`` to ``path``, closing the file even on error."""
        with open(path, 'wb') as handle:
            p.dump(payload, handle)

    def _fit_and_report(self, algo, train_set, test_set):
        """Train ``algo``, keep it as ``self.algo`` and print test RMSE/MAE."""
        self.algo = algo
        self.algo.train(train_set)
        predictions = self.algo.test(test_set)
        test_rmse = accuracy.rmse(predictions, verbose=True)
        test_mae = accuracy.mae(predictions, verbose=True)
        print("RMSE of predictions", test_rmse)
        print("MAE of predictions", test_mae)

    def train_test_model(self, validation_set, train_set, test_set, algorithm,
                         task):
        """Tune one algorithm by grid search, retrain it and report test error.

        The grid-search output is pickled under ``../stats/`` for later
        analysis, and the retrained model is kept in ``self.algo``.

        Args:
            validation_set: dataset used for hyper-parameter optimisation.
            train_set: trainset the final model is fitted on.
            test_set: test data for the final RMSE and MAE.
            algorithm (str): ``'SVD'``, ``'NMF'`` or ``'KNNWithMeans'``; any
                other value does nothing.
            task (str): label appended to the dump file name (rating,
                sentiment score or combined rating).

        Returns:
            None
        """
        if algorithm == 'SVD':
            param_grid = {
                'n_epochs': np.arange(1, 101, 10).tolist(),
                'n_factors': [10, 50, 100]
            }
            grid_search = self._run_grid_search(SVD, param_grid,
                                                validation_set)
            self._dump(grid_search.cv_results,
                       '../stats/svd_results_' + task + '.p')
            print(type(grid_search.cv_results))
            print(grid_search.cv_results)
            # Retrain with the parameters that gave the best validation RMSE.
            best = grid_search.best_params['RMSE']
            self._fit_and_report(
                SVD(n_epochs=best['n_epochs'], n_factors=best['n_factors']),
                train_set, test_set)

        elif algorithm == 'NMF':
            # NOTE(review): this grid starts at n_epochs=0, unlike the SVD
            # grid which starts at 1 - confirm that is intended.
            param_grid = {
                'n_epochs': np.arange(0, 100, 10).tolist(),
                'n_factors': [10, 100]
            }
            grid_search = self._run_grid_search(NMF, param_grid,
                                                validation_set)
            self._dump(grid_search, '../stats/nmf_results_' + task + '.p')
            print(grid_search.best_score['RMSE'])
            print(grid_search.best_score['MAE'])
            best = grid_search.best_params['RMSE']
            self._fit_and_report(
                NMF(n_epochs=best['n_epochs'], n_factors=best['n_factors']),
                train_set, test_set)

        elif algorithm == 'KNNWithMeans':
            # sim_options must be a dict of candidate lists; the grid search
            # expands it itself. A list of dicts is not a valid grid.
            param_grid = {
                'k': np.arange(1, 20).tolist(),
                'sim_options': {
                    'name': ['cosine', 'msd', 'pearson'],
                    'user_based': [True]
                }
            }
            grid_search = self._run_grid_search(KNNWithMeans, param_grid,
                                                validation_set)
            self._dump(grid_search,
                       '../stats/knn_means_results' + task + '.p')
            best = grid_search.best_params['RMSE']
            self._fit_and_report(
                KNNWithMeans(k=best['k'], sim_options=best['sim_options']),
                train_set, test_set)

    def generate_top_n_recommendation(self, test_set, train_set,
                                      n_negatives=1000, top_n=10):
        """Print mean precision, recall and F-score of top-N recommendations.

        For every training user with at least one test item, the user's test
        items plus ``n_negatives`` randomly drawn unseen items (with
        replacement) are scored with ``self.algo`` and ranked by estimate.
        Precision and recall are measured on the first ``top_n`` entries.

        Args:
            test_set: test records as a list (user, item, ...).
            train_set: training records as a list (user, item, ...).
            n_negatives (int): number of negative items drawn per user.
            top_n (int): ranking cut-off for precision and recall.

        Returns:
            None. Results are printed.
        """
        # Group items per user in one pass instead of rescanning both lists
        # for every user.
        train_items_by_user = {}
        for record in train_set:
            train_items_by_user.setdefault(record[0], set()).add(record[1])
        test_items_by_user = {}
        for record in test_set:
            test_items_by_user.setdefault(record[0], set()).add(record[1])

        # The item universe does not depend on the user.
        item_all = ({record[1] for record in train_set}
                    | {record[1] for record in test_set})

        print("Number of users = ", len(train_items_by_user))
        precision_list = []
        recall_list = []

        for j, (user, item_train) in enumerate(train_items_by_user.items(),
                                               start=1):
            if j % 1000 == 0:
                print("Touchdown, j = ", j)

            item_test = test_items_by_user.get(user)
            if not item_test:
                # Without held-out positives, recall is undefined for this
                # user (it used to raise ZeroDivisionError).
                continue

            negative_items = [
                x for x in item_all
                if x not in item_train and x not in item_test
            ]
            negative_subset = []
            if negative_items:  # randint rejects an empty range
                negative_indices = np.random.randint(0, len(negative_items),
                                                     size=n_negatives)
                negative_subset = [negative_items[x] for x in negative_indices]

            # All of the user's test items are used as positives.
            positive_subset = list(item_test)
            np.random.shuffle(positive_subset)

            pred_list = [
                self.algo.predict(user, item, r_ui=1, verbose=False)
                for item in positive_subset + negative_subset
            ]
            predictions = sorted(pred_list, key=lambda x: x.est, reverse=True)

            precision_list.append(
                self.calculate_precision(predictions, positive_subset, top_n))
            recall_list.append(
                self.calculate_recall(predictions, positive_subset, top_n))

        precision = np.mean(precision_list) if precision_list else 0.0
        recall = np.mean(recall_list) if recall_list else 0.0
        print("Mean precision = ", precision)
        print("Mean recall = ", recall)
        print("fscore=", self.calculate_f_measure(precision, recall))
        return

    def calculate_precision(self, predictions, positive_items, N):
        """Return the share of the top ``N`` slots filled by positive items.

        The denominator is always ``N``; having fewer than ``N`` predictions
        leaves the missing slots counted as misses instead of raising.
        """
        positives = set(positive_items)
        hits = sum(1 for pred in predictions[:N] if pred.iid in positives)
        return float(hits) / N

    def calculate_recall(self, predictions, positive_items, N):
        """Return the share of positive items found in the top ``N`` predictions.

        Returns 0.0 when ``positive_items`` is empty.
        """
        if len(positive_items) == 0:
            return 0.0
        top_ids = {pred.iid for pred in predictions[:N]}
        hits = sum(1 for item in positive_items if item in top_ids)
        return float(hits) / len(positive_items)

    def calculate_f_measure(self, precision, recall):
        """Return the F1 score (harmonic mean) of precision and recall.

        Returns 0 when both are zero. The check is explicit because numpy
        floats produce NaN on 0/0 rather than raising an exception.
        """
        denominator = precision + recall
        if denominator == 0:
            return 0
        return 2.0 * precision * recall / denominator
# Load the test ratings and drop the columns that are not used below.
dftest = pd.read_csv(test_file_path)
dftest = dftest.drop(['test_id', 'date'], axis=1)

# Create a trainset object from the training frame (built outside this
# fragment).
reader = Reader()
data = Dataset.load_from_df(dftrain, reader)
trainingSet = data.build_full_trainset()

# Create a user-based K-nearest-neighbours algorithm
# - uses the Pearson correlation to measure user similarities
# - takes user bias into account
sim_options = {'name': 'pearson'}
algo = KNNWithMeans(sim_options=sim_options)

# Train the algorithm using the training set.
# NOTE: the original author reported a MemoryError here on the full set.
algo.train(trainingSet)

# Use the trained algorithm to predict ratings for the test set and write one
# "row_number, estimate" line per test row. The context manager closes the
# file even if a prediction raises.
with open('ub_testOutput.csv', 'w') as f:
    for i, (user_id, business_id) in enumerate(
            zip(dftest['user_id'], dftest['business_id'])):
        pred = algo.predict(user_id, business_id, r_ui=4, verbose=True)
        predRating = pred.est
        f.write(str(i) + ", " + str(predRating) + '\n')