def svdpp(trainset, testset, predset):
    """Train an SVD++ model, print train/test RMSE and save its predictions.

    Returns immediately (``None``) when ``is_already_predicted('svdpp')`` is
    truthy. Otherwise the estimates for ``testset`` are saved with the
    ``'test'`` tag and the estimates for ``predset`` are saved without a tag,
    both together with the test RMSE.

    Args:
        trainset: training set handed to ``algo.train``; must provide
            ``build_testset()``.
        testset: held-out ratings handed to ``algo.test``.
        predset: entries to predict, handed to ``algo.test``.
    """
    modelname = 'svdpp'

    # Nothing to do when predictions for this model already exist.
    if is_already_predicted(modelname):
        return

    baseline_options = {
        'method': 'als',
        'reg_i': 1.e-5,
        'reg_u': 14.6,
        'n_epochs': 10,
    }
    hyper_params = dict(
        n_epochs=40, n_factors=100, bsl_options=baseline_options,
        lr_bu=0.01, lr_bi=0.01, lr_pu=0.1, lr_qi=0.1, lr_yj=0.01,
        reg_bu=0.05, reg_bi=0.05, reg_pu=0.09, reg_qi=0.1, reg_yj=0.01,
    )
    algo = SVDpp(**hyper_params)

    print('SVDpp Model')
    # NOTE(review): `train` looks like the legacy Surprise spelling of `fit`;
    # confirm against the pinned library version.
    algo.train(trainset)

    train_predictions = algo.test(trainset.build_testset())
    print(' RMSE on Train: ', accuracy.rmse(train_predictions, verbose=False))

    test_predictions = algo.test(testset)
    rmse = accuracy.rmse(test_predictions, verbose=False)
    print(' RMSE on Test: ', rmse)
    test_estimates = np.array([p.est for p in test_predictions], dtype=float)
    save_predictions(modelname, rmse, test_estimates, 'test')

    print(' Evaluate predicted ratings...')
    final_predictions = algo.test(predset)
    final_estimates = np.array([p.est for p in final_predictions], dtype=float)
    save_predictions(modelname, rmse, final_estimates)
def svdPP(data):
    """Fit a tuned and a default SVD++ model and predict the global test pairs.

    Reads the module-level ``x_test``, ``y_test``, ``testlen``,
    ``model_params``, ``avg_rat`` and ``cold_itm``. Both models are fitted on
    the full trainset built from ``data``. For test rows whose (zero-based)
    item id is in ``cold_itm`` the estimate of both models is replaced by
    ``avg_rat`` of the (zero-based) user id.

    Args:
        data: object providing ``build_full_trainset()``.

    Returns:
        ``[tuned_estimates, default_estimates, tuned_model, default_model]``.
    """
    global x_test, y_test, testlen, model_params, avg_rat, cold_itm

    print("\nTraining SVDPP model..\n")
    tuned_cfg = model_params[1]
    svdModel = SVDpp(n_epochs=tuned_cfg['n_epochs'],
                     lr_all=tuned_cfg['lr_all'],
                     reg_all=tuned_cfg['reg_all'])
    svdModel.fit(data.build_full_trainset())
    print("\nTraining done..\nPrediction started..")

    # (user, item, true rating) triples for the first `testlen` test rows.
    test = [(x_test[row][0], x_test[row][1], y_test[row])
            for row in range(testlen)]
    tuned_predictions = svdModel.test(test)

    # Second model with library-default hyper-parameters.
    svdModel_1 = SVDpp()
    svdModel_1.fit(data.build_full_trainset())
    default_predictions = svdModel_1.test(test)

    # Raw ids are shifted down by one before being used as indices.
    user_idx = [int(p[0]) - 1 for p in tuned_predictions]
    item_idx = [int(p[1]) - 1 for p in tuned_predictions]
    res = [p[3] for p in tuned_predictions]
    res1 = [p[3] for p in default_predictions]

    for row in range(testlen):
        if item_idx[row] in cold_itm:
            fallback = avg_rat[user_idx[row]]
            res[row] = fallback
            res1[row] = fallback

    print("\nPrediction done..\n")
    return [res, res1, svdModel, svdModel_1]
def final_model(data):
    """Fit the repeat-customer SVD++ model and pickle it with its predictions.

    Args:
        data -- a dataframe containing user id, item id, and ratings
                columns in that order (passed to ``surprise_df``).
    """
    # Hyper-parameters; the original author reports them as GridSearchCV's best.
    params = {'n_factors': 10, 'n_epochs': 50, 'lr_all': 0.01, 'reg_all': 0.1}

    # 80/20 split of the Surprise data set, with a fixed seed.
    train_set, test_set = train_test_split(surprise_df(data),
                                           test_size=0.2,
                                           random_state=19)

    svdpp = SVDpp(**params)
    svdpp.fit(train_set)

    # Surprise's dump helper stores the fitted model and the test predictions.
    dump.dump('repeat_customer_model',
              predictions=svdpp.test(test_set),
              algo=svdpp,
              verbose=0)
def time_location_model(df):
    """Fit a default SVD++ model on ``df`` and return split, predictions and id maps.

    The rating scale is the min/max of the ``date_dist_rating`` column. The
    ``rating``, ``dist_rating`` and ``date_rating`` columns are dropped before
    the frame is loaded into Surprise.

    Returns:
        ``(trainset, testset, predictions, dusers, ditems)`` where ``dusers``
        and ``ditems`` are the raw-to-inner id mappings of the *full* trainset.
    """
    target = df['date_dist_rating']
    rating_scale = (target.min(), target.max())

    frame = df.drop(columns=["rating", "dist_rating", "date_rating"], axis=1)
    data = surprise.dataset.Dataset.load_from_df(
        df=frame, reader=Reader(rating_scale=rating_scale))

    # Id mappings are taken from the full trainset, not from the split below.
    full_trainset = data.build_full_trainset()
    dusers = full_trainset._raw2inner_id_users
    ditems = full_trainset._raw2inner_id_items

    trainset, testset = train_test_split(data)
    algo = SVDpp()
    algo.fit(trainset)
    predictions = algo.test(testset)

    print('\n')
    return (trainset, testset, predictions, dusers, ditems)
class RecommenderSVDpp(Recommender):
    """SVD++ recommender that refits on a data set extended with the new user."""

    def __init__(self, recommendation_dataset: RecommendationDataSet):
        super().__init__(recommendation_dataset.movies)
        self.algorithm = SVDpp()
        self.recommendation_dataset = recommendation_dataset

    def fit(self, dataset):
        """Fit the wrapped algorithm on ``dataset`` and return its result."""
        return self.algorithm.fit(dataset)

    def test(self, test_set):
        """Return the wrapped algorithm's predictions for ``test_set``."""
        return self.algorithm.test(test_set)

    def get_recommendation(self, watched, k=20):
        """Recommend up to ``k`` movies for a user described by ``watched``.

        Args:
            watched: mapping of raw item id -> value for the new user.
            k: number of items requested from ``get_top_n``.

        Returns:
            Whatever ``self.movies.get_movie_by_movie_ids`` returns for the
            selected item ids.
        """
        new_user_id, full_dataset = (
            self.recommendation_dataset.get_dataset_with_extended_user(watched))
        # Result unused; the lookup itself is kept from the original.
        full_dataset.to_inner_uid(new_user_id)

        # The model is retrained on the whole data set including the new user.
        self.algorithm.fit(full_dataset)

        watched_inner = {
            full_dataset.to_inner_iid(raw_iid): value
            for raw_iid, value in watched.items()
        }

        # Predict every item the user has not watched yet.
        predictions = []
        for inner_iid in range(full_dataset.n_items):
            if inner_iid in watched_inner:
                continue
            raw_iid = full_dataset.to_raw_iid(inner_iid)
            predictions.append(self.algorithm.predict(new_user_id, raw_iid))

        ranked = get_top_n(predictions, n=k, minimum_rating=1.0)[new_user_id]
        top_ids = [entry[0] for entry in ranked]
        return self.movies.get_movie_by_movie_ids(top_ids)
def top_ten_df(df):
    '''
    Build a frame with the top ten predicted items per user.

    inputs:
        df (Pandas DF): the dataframe to train on. NOTE: use
            f.df_samp_unique_vals() to get a smaller DF if there is not
            enough memory to run the full DF.
    outputs:
        (Pandas DataFrame) ``pd.DataFrame`` built from the dictionary
        returned by ``f.get_top_n(predictions, n=10)``.
    '''
    full_trainset = f.read_data_surprise(df).build_full_trainset()

    # Default SVD++ fitted on the entire data set.
    model = SVDpp()
    model.fit(full_trainset)

    # Every (user, item) pair that is NOT in the training set; this is the
    # memory-heavy step according to the original author.
    unseen_pairs = full_trainset.build_anti_testset()
    unseen_predictions = model.test(unseen_pairs)

    return pd.DataFrame(f.get_top_n(unseen_predictions, n=10))
def svdpp(trainset, testset):
    """Fit a default SVD++ model and score it on ``testset``.

    Returns:
        ``(rmse, mae, predictions)`` as computed by ``accuracy.rmse`` and
        ``accuracy.mae`` on the test predictions.
    """
    rule = "-" * 5
    print("\n" + rule + " SVD++ algorithm using surprise package " + rule)

    model = SVDpp()
    model.fit(trainset)
    predictions = model.test(testset)

    # RMSE is computed before MAE, as in the original.
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae, predictions
def SVDpp_calculation(data, trainset, testset, time, cv):
    """Time a default SVD++ fit, test and RMSE cross-validation.

    Args:
        data: data set passed to ``cross_validate``.
        trainset: trainset the model is first fitted on.
        testset: testset the fitted model is run against.
        time: object whose ``time()`` method returns the current timestamp
            (it shadows any module of the same name inside this function).
        cv: value forwarded as ``cross_validate``'s ``cv`` argument.

    Returns:
        ``(elapsed, cv_results)``: the difference of the two timestamps and
        the dictionary returned by ``cross_validate``.
    """
    started = time.time()

    model = SVDpp()
    model.fit(trainset)
    # The predictions are discarded; the call only counts towards the timing.
    model.test(testset)
    cv_results = cross_validate(model, data, measures=['RMSE'], cv=cv,
                                verbose=True)

    elapsed = time.time() - started
    return elapsed, cv_results
class TrainModel:
    """Wrapper around an SVD++ model: training, persistence and top-n lookup."""

    # def __init__(self, method='als', n_epochs=20, sim_option='pearson_baseline'):
    #
    #     self.algo = KNNBasic(bsl_options={'method': method,'n_epochs': n_epochs},
    #                          sim_options={'name': sim_option, 'user_based': False})

    def __init__(self, lr_all=0.006, n_epochs=40):
        self.algo = SVDpp(lr_all=lr_all, n_epochs=n_epochs)
        self.reader = Reader(rating_scale=(0, 1))
        self.filename = 'trained_model.pkl'

    @staticmethod
    def _rank_for_user(predictions, user_id, n):
        """Return the ``n`` best ``(iid, est)`` pairs of ``user_id``.

        A later prediction for the same item replaces an earlier one; the
        pairs are ordered by estimate, highest first.
        """
        estimates = {
            iid: est
            for uid, iid, _, est, _ in predictions
            if uid == user_id
        }
        ranked = sorted(estimates.items(), key=lambda kv: kv[1], reverse=True)
        return ranked[:n]

    def _load(self, dataframe, user_col, item_col, rating_col):
        """Load the three named columns of ``dataframe`` with ``self.reader``."""
        columns = [user_col, item_col, rating_col]
        return Dataset.load_from_df(dataframe[columns], self.reader)

    def read_from_df(self, dataframe, user_col, item_col, rating_col):
        """Return the full trainset built from the three named columns."""
        return self._load(dataframe, user_col, item_col,
                          rating_col).build_full_trainset()

    def train_mod(self, dataframe, user_col, item_col, rating_col):
        """Fit the model on the full trainset built from ``dataframe``."""
        trainset = self.read_from_df(dataframe, user_col, item_col, rating_col)
        self.algo.fit(trainset)

    def dump_model(self, predictions):
        """Dump model and ``predictions`` to ``self.filename``; return dump's result."""
        return dump.dump(self.filename, algo=self.algo,
                         predictions=predictions)

    def load_model(self):
        """Return the ``(predictions, algo)`` pair loaded from ``self.filename``."""
        predictions, loaded_ent = dump.load(self.filename)
        return predictions, loaded_ent

    def get_user_pred(self, user_id, dataframe, user_col, item_col,
                      rating_col, n=2):
        """Predict all unseen pairs of ``dataframe`` and rank ``user_id``'s items.

        Returns:
            ``(predictions, top_n)`` where ``top_n`` holds at most ``n``
            ``(iid, est)`` pairs, best first.
        """
        full_trainset = self._load(dataframe, user_col, item_col,
                                   rating_col).build_full_trainset()
        predictions = self.algo.test(full_trainset.build_anti_testset())
        return predictions, self._rank_for_user(predictions, user_id, n)

    def get_user_pred_stable(self, user_id, predictions, n=2):
        """Rank ``user_id``'s items from already computed ``predictions``."""
        return self._rank_for_user(predictions, user_id, n)
def svdpp(dataset):
    """Run 5-fold cross-validation of a default SVD++ model on ``dataset``.

    The RMSE of every fold is printed by ``accuracy.rmse``; the elapsed time
    in minutes is printed at the end.

    Returns:
        The RMSE of the last fold only (earlier folds are overwritten).
    """
    started = time.time()
    model = SVDpp()

    for trainset, testset in KFold(n_splits=5).split(dataset):
        model.fit(trainset)
        acc = accuracy.rmse(model.test(testset), verbose=True)

    elapsed_minutes = (time.time() - started) / 60
    print('svdpp花分钟数为:', elapsed_minutes)
    return acc
def svdpp(train, test, ids, Xtest, Xids):
    """Fit SVD++ and collect its predictions for later blending.

    Arguments:
        train: the trainset (must provide ``build_testset()``).
        test: the testset.
        ids: pair of indexable sequences; ``ids[0][i]`` / ``ids[1][i]`` are
            the user / item of the i-th unknown rating (converted with
            ``str`` before prediction).
        Xtest: list the test-set estimates are appended to.
        Xids: list the unknown-rating estimates are appended to.

    Returns:
        ``(rmse, Xtest, Xids, preds_test, preds_ids)``.
    """
    print('SVD++')
    model = SVDpp(n_factors=100, n_epochs=10, lr_all=0.0015, reg_all=0.05,
                  random_state=15)
    model.fit(train)

    # RMSE on the data the model was trained on.
    train_rmse = accuracy.rmse(model.test(train.build_testset()),
                               verbose=False)
    print(' Training RMSE: ', train_rmse)

    # RMSE on the held-out test set.
    test_predictions = model.test(test)
    rmse = accuracy.rmse(test_predictions, verbose=False)
    print(' Test RMSE: ', rmse)
    preds_test = np.array([p.est for p in test_predictions], dtype=float)

    # Estimates for the ratings that are still unknown.
    preds_ids = [
        model.predict(str(ids[0][i]), str(ids[1][i])).est
        for i in range(len(ids[0]))
    ]

    Xtest.append(preds_test)
    Xids.append(preds_ids)
    return rmse, Xtest, Xids, preds_test, preds_ids
def svdpp_running_time(data):
    '''
    Measure SVD++ training and prediction running times.

    Args:
        data(Dataset): a list of datasets with different numbers of users;
            ``data[3]`` is the one used for the parameter search.
    Returns:
        elapsedtime_SVDpptrain: per-dataset time for building the trainset
            and anti-testset plus training
        elapsedtime_SVDpptest: per-dataset time for predicting the anti-testset
    '''
    # Parameter search over factors and epochs.
    # NOTE(review): the search is run with `SVD`, not `SVDpp` -- confirm
    # whether that is intentional.
    param_grid = {
        'n_factors': [25, 50, 100, 250],
        'n_epochs': [10, 20, 30, 40, 50],
    }
    grid_search = GridSearch(SVD, param_grid, measures=['RMSE'],
                             verbose=False)
    grid_search.evaluate(data[3])
    best = grid_search.best_params['RMSE']
    n_factors, n_epochs = best['n_factors'], best['n_epochs']

    elapsedtime_SVDpptrain, elapsedtime_SVDpptest = [], []
    for i in range(len(data)):
        # The training clock also covers building both sets.
        training_start = time.time()
        training = data[i].build_full_trainset()
        testing = training.build_anti_testset()
        model = SVDpp(n_factors=n_factors, n_epochs=n_epochs)
        model.train(training)
        elapsedtime_SVDpptrain.append(time.time() - training_start)

        test_start = time.time()
        model.test(testing)
        elapsedtime_SVDpptest.append(time.time() - test_start)

    return elapsedtime_SVDpptrain, elapsedtime_SVDpptest
def model(train_set, test_set):
    """Fit SVD++ with fixed hyper-parameters and score it on ``test_set``.

    Returns:
        ``(predictions, rmse)`` for the test set.
    """
    hyper_params = dict(n_factors=3, n_epochs=50, lr_all=0.01, reg_all=0.1)
    algo = SVDpp(**hyper_params)
    algo.fit(train_set)

    predictions = algo.test(test_set)
    return predictions, accuracy.rmse(predictions, verbose=False)
def RecommendPredictions():
    """Train SVD++ on the train CSV and write test-set predictions to a CSV.

    Reads ``data_source/train_count_norm_1_10.csv`` and
    ``data_source/test_count_norm_1_10.csv`` (no header; columns 0, 1, 2 are
    used as user, item and rating, missing values are filled with 10.0),
    trains for a single epoch and writes one ``(uid, iid, r_ui, est)`` row per
    test prediction to ``data_source/predictions_results_svdpp.csv``.

    The ``print`` calls use the parenthesised single-argument form so the
    function runs on both Python 2 and Python 3.
    """
    results_path = "data_source/predictions_results_svdpp.csv"

    ## Load train and test data into Dataframes
    trainDF = pan.read_csv("data_source/train_count_norm_1_10.csv",
                           header=None, dtype={2: np.float16})
    trainDF = trainDF.fillna(10.0)
    reader = Reader(rating_scale=(1, 10))

    print("Load train set....")
    dataTrain = Dataset.load_from_df(trainDF[[0, 1, 2]], reader=reader)
    trainset = dataTrain.build_full_trainset()

    print("Initiate Training .....")
    algo = SVDpp(n_epochs=1, lr_all=0.01, reg_all=0.02, verbose=True)
    # NOTE(review): `train` looks like the legacy Surprise spelling of `fit`;
    # confirm against the pinned library version.
    algo.train(trainset)

    ## Predictions for test set with ground truth present
    print(" Load test set...")
    testDF = pan.read_csv("data_source/test_count_norm_1_10.csv",
                          header=None, dtype={2: np.float16})
    testDF = testDF.fillna(10.0)
    dataTest = Dataset.load_from_df(testDF[[0, 1, 2]], reader=reader)
    testset = dataTest.build_full_trainset().build_testset()

    print("Start predictions")
    predictions = algo.test(testset)

    # Best effort: a missing previous result file is not an error.
    try:
        os.remove(results_path)
    except OSError:
        pass

    print("Saving Prediction results in File")
    # The context manager closes the file even when a row fails to serialise.
    with open(results_path, "a") as resultFile:
        csv_writer = csv.writer(resultFile)
        for item in predictions:
            csv_writer.writerow((item.uid, item.iid, item.r_ui, item.est))

    ## Predictions for test set with random products present
    ## LEFT
    #rmse = accuracy.rmse(predictions, verbose=True)
def run_colab_filter(self, user_id=106540415, n_top=20):
    """Fit SVD++ on ``self.df20`` and store the top recommended routes.

    The model is trained on the ``user_id`` / ``route_id`` / ``rating``
    columns of ``self.df20``. Every route the user has not rated yet is
    scored, model and predictions are dumped to ``SVD_tuned.p`` and the best
    ``n_top`` routes are stored in ``self.df_top_climbs_mf`` with the columns
    ``'predicted rating'`` and ``'route id'`` (highest rating first).

    Args:
        user_id: raw id of the user to recommend for. The default is the
            id that used to be hard-coded (labelled "mixed climber, alpine
            climber, advanced" by the original author).
        n_top: maximum number of routes to keep; fewer are kept when fewer
            candidates exist.
    """
    # A reader is still needed but only the rating_scale param is required.
    reader = Reader(rating_scale=(1, 4))
    # The columns must correspond to user id, item id and ratings (in that order).
    data = Dataset.load_from_df(
        self.df20[['user_id', 'route_id', 'rating']], reader)
    trainset = data.build_full_trainset()

    algo_tuned = SVDpp(n_factors=20)
    algo_tuned.fit(trainset)

    # Candidate routes: all routes minus the ones this user already rated.
    # (The original selected the 'user_id' column here, which excluded nothing.)
    all_routes = self.df20['route_id'].unique()
    rated_routes = self.df20.loc[self.df20['user_id'] == user_id, 'route_id']
    iids_to_pred = np.setdiff1d(all_routes, rated_routes)

    # The third field is a placeholder for the unknown true rating.
    testset = [[user_id, iid, 2] for iid in iids_to_pred]
    predictions_tuned = algo_tuned.test(testset)
    dump.dump(file_name='SVD_tuned.p', predictions=predictions_tuned,
              algo=algo_tuned)

    pred_ratings_tuned = np.array([pred.est for pred in predictions_tuned])
    # Sorting then slicing also works when fewer than n_top candidates exist.
    best = np.argsort(-pred_ratings_tuned, kind='stable')[:n_top]

    # Ratings become the index, then a regular column via reset_index().
    self.df_top_climbs_mf = pd.DataFrame(iids_to_pred[best],
                                         pred_ratings_tuned[best])
    self.df_top_climbs_mf = self.df_top_climbs_mf.reset_index()
    self.df_top_climbs_mf.columns = ['predicted rating', 'route id']
def SVD_pp():
    """Cross-validate a default SVD++ model on the module-level ``data``.

    Prints the RMSE of each of the 3 folds, a single verbose prediction for
    user ``'196'`` and item ``'302'``, and the time elapsed since the
    module-level ``time1``.
    """
    model = SVDpp()
    # K-fold cross-validation iterator, k=3.
    folds = KFold(n_splits=3)
    for trainset, testset in folds.split(data):
        # Train, predict and report the fold's RMSE.
        model.fit(trainset)
        accuracy.rmse(model.test(testset), verbose=True)

    # Print the model's prediction for this (uid, iid) pair.
    uid, iid = str(196), str(302)
    model.predict(uid, iid, r_ui=4, verbose=True)

    print(time.time() - time1)
def surprise_SVDpp(train_file, test_file):
    """
    SVD++ with the Surprise library.

    Train on ``train_file`` and return the estimates for ``test_file``.

    Args:
        train_file (string): path to the train file
        test_file (string): path to the test file
            (both are read as ``user,item,rating`` lines)

    Hyperparameters used:
        n_epochs=40, n_factors=100, lr_all=0.01, reg_all=0.01

    Returns:
        numpy array: estimated rating for every test entry
    """
    print("SVDpp")

    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds([(train_file, test_file)], reader=reader)
    splitter = PredefinedKFold()

    algo = SVDpp(n_epochs=40, n_factors=100, lr_all=0.01, reg_all=0.01)
    for trainset, testset in splitter.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)

    return np.array([p.est for p in predictions], dtype=float)
def evaluate_on_test(self, train_set, test_set):
    """
    Train a default SVD++ model on the train set and score it on the test set

    :param train_set: trainset the model is fitted on
    :param test_set: testset the fitted model is evaluated on
    :return: RMSE value on test set, or None when either argument is None
    """
    if train_set is None or test_set is None:
        return None

    message = "Evaluate RMSE on test data"
    print(message)
    self.LOG_HANDLE.info(message)

    # http://surprise.readthedocs.io/en/stable/matrix_factorization.html#surprise.prediction_algorithms.matrix_factorization.SVDpp
    model = SVDpp()
    model.fit(train_set)
    return accuracy.rmse(model.test(test_set))
def train_best_model_generate_ratings_test(self, ratings_set, test_set):
    """
    Train SVD++ with ``model_params.svdpp_best_params`` on the complete
    ratings set and predict the test set

    :param ratings_set: The complete ratings data set
    :param test_set: The entries for which ratings are to be predicted
    :return: The predictions returned by ``algo.test``, or None when either
             argument is falsy
    """
    if not (ratings_set and test_set):
        return None

    message = ("Training the best model and generating the ratings "
               "for the test data set")
    print(message)
    self.LOG_HANDLE.info(message)

    model = SVDpp(**model_params.svdpp_best_params)
    model.fit(ratings_set)
    return model.test(test_set)
class RecommenderSVDppSimilarUsers(Recommender):
    """
    SVD++ recommender that does not rebuild the data set for a new user:
    it looks up similar users and scores movies through their predictions.
    """

    def __init__(self, movies):
        super().__init__(movies)
        self.algorithm = SVDpp()

    def fit(self, dataset):
        """Fit the wrapped algorithm on ``dataset`` and return its result."""
        return self.algorithm.fit(dataset)

    def test(self, test_set):
        """Return the wrapped algorithm's predictions for ``test_set``."""
        return self.algorithm.test(test_set)

    def get_recommendation(self, watched, k=20, k_inner_item=10):
        """Recommend ``k`` movies from similarity-weighted predictions.

        Args:
            watched: mapping of raw item id -> value for the new user.
            k: number of movie ids to return.
            k_inner_item: forwarded as ``k`` to ``get_similar_user_ids``.
        """
        trainset = self.algorithm.trainset
        watched_inner = {
            trainset.to_inner_iid(raw_iid): value
            for raw_iid, value in watched.items()
        }
        similar_users = self.get_similar_user_ids(watched_inner,
                                                  k=k_inner_item)

        # Score of a movie = sum over similar users of similarity * estimate.
        scores = defaultdict(float)
        for inner_iid in range(trainset.n_items):
            if inner_iid in watched_inner:
                continue
            movie_id = trainset.to_raw_iid(inner_iid)
            for inner_uid, similarity in similar_users.items():
                raw_uid = trainset.to_raw_uid(inner_uid)
                estimate = self.algorithm.predict(raw_uid, movie_id).est
                scores[movie_id] += similarity * estimate

        best_ids = heapq.nlargest(k, scores, key=scores.get)
        return self.movies.get_movie_by_movie_ids(best_ids)
class SurpriseRecommender(Recommender):
    """Recommender backed by Surprise's SVD++ with a (0, 1) rating scale."""

    name = 'surprise-svdpp'

    def train(self, data):
        """Fit SVD++ on ``data``; columns 0, 1, 2 are user, item and rating."""
        frame = pd.DataFrame({
            'itemID': data[:, 1],
            'userID': data[:, 0],
            'rating': data[:, 2],
        })
        dataset = Dataset.load_from_df(
            frame[['userID', 'itemID', 'rating']],
            Reader(rating_scale=(0, 1)))

        # self.algo = KNNBasic(verbose=False)
        self.algo = SVDpp(verbose=True)
        self.algo.fit(dataset.build_full_trainset())

    def rate(self, user, movie):
        """Return the estimated rating of ``movie`` for ``user``."""
        # The 0 only fills the true-rating slot of the test triple.
        prediction = self.algo.test([[user, movie, 0]])[0]
        return prediction.est

    def rate_bool(self, user, movie):
        """Return True when the estimated rating is above 0.5."""
        return self.rate(user, movie) > 0.5
def svd_model(df):
    """
    Predict the missing target values of ``df`` with SVD++.

    ``df`` is melted to long form (``smiles``, ``Target``, ``TargetValue``;
    every column after the first is a target). The known rows are used for a
    3-fold cross-validation whose per-fold RMSE is printed. The model as left
    by the last fold then estimates every row whose value is missing.

    Returns:
        A copy of the unknown rows with a ``ToxicProb`` column holding the
        estimates and without the ``TargetValue`` column.
    """
    long_df = df.melt(id_vars='smiles',
                      value_vars=list(df.columns[1:]),
                      var_name='Target',
                      value_name='TargetValue')
    is_missing = long_df.TargetValue.isna()
    unknown = long_df.loc[is_missing]
    known = long_df.loc[~is_missing]

    data = Dataset.load_from_df(known[['smiles', 'Target', 'TargetValue']],
                                Reader(rating_scale=(0, 1)))
    folds = KFold(n_splits=3, random_state=57)
    algo = SVDpp(n_factors=12, reg_all=0.003, lr_all=0.006, random_state=132)

    for trainset, testset in folds.split(data):
        algo.fit(trainset)
        fold_rmse = accuracy.rmse(algo.test(testset), verbose=True)
        rmse = round(fold_rmse, 3)
        print('RMSE of SVD model for cross validation' + str(rmse))

    def estimate(row):
        return algo.predict(row.smiles, row.Target).est

    result = unknown.copy()
    result['ToxicProb'] = result.apply(estimate, axis=1)
    return result.drop(columns='TargetValue')
def svdpp(data, training, testing):
    '''
    Tune SVD++ with a grid search, then evaluate the tuned model.

    Args:
        data(Dataset): the whole dataset, used by the grid search
        training(Dataset): training dataset
        testing(Dataset): test dataset
    Returns:
        rmse: RMSE of SVD++ with the best parameters on the test data
        top_n: result of ``get_top_n(predictions, n=5)``
    Side effects:
        prints the best parameters and writes the grid-search results to
        'data/svdpp_rmse_against_param.csv'
    '''
    # Candidate parameters.
    param_grid = {
        'n_factors': [25, 50, 100, 250],
        'n_epochs': [10, 20, 30, 40, 50],
    }

    grid_search = GridSearch(SVDpp, param_grid, measures=['RMSE'],
                             verbose=False)
    grid_search.evaluate(data)
    best = grid_search.best_params['RMSE']
    print('SVDpp:', best)

    # RMSE against parameters, one row per grid point.
    pd.DataFrame.from_dict(grid_search.cv_results).to_csv(
        'data/svdpp_rmse_against_param.csv')

    # Refit with the best parameters and score on the test data.
    model = SVDpp(n_factors=best['n_factors'], n_epochs=best['n_epochs'])
    model.train(training)
    predictions = model.test(testing)

    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)
    return rmse, top_n
def surpriseSVDpp(mode,
                  DataPath='../data/data_clean.txt',
                  TrainPath='../data/train_clean.txt',
                  TestPath='../data/test_clean.txt',
                  n_factors=20,
                  n_epochs=20,
                  lr_all=0.007,
                  reg_all=0.02,
                  verbose=True):
    """Fit SVD++ on tab-separated rating files and return factors and errors.

    Args:
        mode: ``'evaluation'`` or ``'visualization'``; any other value makes
            the function return ``None``.
        DataPath: ratings file used in visualization mode.
        TrainPath, TestPath: ratings files used in evaluation mode.
        n_factors, n_epochs, lr_all, reg_all, verbose: forwarded to ``SVDpp``.

    Returns:
        evaluation: ``(algo.qi, algo.pu, train_err, test_err)`` (RMSE errors).
        visualization: ``(Vproj, Uproj, train_err)`` -- the item and user
        factors projected with the first two columns of the transposed left
        singular vectors of ``V.T``.
    """
    columns = ["User Id", "Movie Id", "Rating"]
    # We need the rating scale.
    reader = Reader(rating_scale=(1, 5))

    def load_full_trainset(path):
        """Read a header-less tab-separated file into a full trainset."""
        frame = pd.read_csv(path, sep="\t", header=None)
        frame.columns = columns
        return Dataset.load_from_df(frame[columns], reader).build_full_trainset()

    def fit_and_score(trainset):
        """Fit SVD++ on ``trainset``; return the model and its train RMSE."""
        fitted = SVDpp(n_factors=n_factors, n_epochs=n_epochs, init_mean=0,
                       init_std_dev=0.1, lr_all=lr_all, reg_all=reg_all,
                       verbose=verbose)
        fitted.fit(trainset)
        fitted_predictions = fitted.test(trainset.build_testset())
        return fitted, accuracy.rmse(fitted_predictions, verbose=False)

    if mode == 'evaluation':
        algo, train_err = fit_and_score(load_full_trainset(TrainPath))

        # Test error on the separate test file.
        held_out = load_full_trainset(TestPath).build_testset()
        test_err = accuracy.rmse(algo.test(held_out), verbose=False)

        # Return V (qi), U (pu), train_err (RMSE), test_err (RMSE)
        return algo.qi, algo.pu, train_err, test_err

    if mode == 'visualization':
        algo, train_err = fit_and_score(load_full_trainset(DataPath))
        U, V = algo.pu, algo.qi

        left_vectors, _, _ = np.linalg.svd(V.T)
        # Use the first 2 cols of the transposed matrix for the projection.
        Asub = left_vectors.T[:, :2]
        Uproj = np.dot(Asub.T, U.T)
        Vproj = np.dot(Asub.T, V.T)

        # Return Vproj, Uproj, train_err (RMSE of Y = U^T V)
        return Vproj, Uproj, train_err
# Train SVD++ on the full data set, predict every unseen (user, item) pair
# and store the predictions in MySQL.
trainset = data.build_full_trainset()

# Build an algorithm, and train it.
algo = SVDpp()
algo.fit(trainset)

# Example of a single prediction, kept for reference:
# uid = str(196)  # raw user id (as in the ratings file). They are **strings**!
# iid = str(302)  # raw item id (as in the ratings file). They are **strings**!
# # get a prediction for specific users and items.
# pred = algo.predict(uid, iid, verbose=True)
# # print(pred.est)

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

# rank_n = get_rank_n_for_one_user(predictions, str(603))
# print(rank_n)

# Keyword arguments: PyMySQL 1.0+ no longer accepts positional ones.
db = pymysql.connect(host=host_ip, user=user, password=password,
                     database=database_name)
try:
    # presumably used by store_predictions_to_sql through module scope -- verify
    cursor = db.cursor()
    store_predictions_to_sql(predictions)
finally:
    # Close the connection even when storing the predictions fails.
    db.close()

# top_n = get_top_n(predictions, n=10)
#
# # Print the recommended items for each user
# for uid, user_ratings in top_n.items():
#     print(uid, [iid for (iid, _) in user_ratings])
svd.fit(train) svd_fu = pd.concat([ ratings['user_id'].drop_duplicates().reset_index(drop=True), pd.DataFrame(svd.pu.tolist()) ], axis=1) svd_fi = pd.concat([ ratings['movie_id'].drop_duplicates().reset_index(drop=True), pd.DataFrame(svd.qi.tolist()) ], axis=1) train, test = surprise_train_test_split(data, train_size=0.9, test_size=0.1, shuffle=False) svd_pred = svd.test(test) svd_pred = pd.DataFrame([i.r_ui for i in svd_pred]) with open('feature/svd_pp_fu.pkl', 'wb') as f: pickle.dump(svd_fu, f) with open('feature/svd_pp_fi.pkl', 'wb') as f: pickle.dump(svd_fi, f) with open('feature/svd_pp_pred.pkl', 'wb') as f: pickle.dump(svd_pred, f) else: with open('feature/svd_pp_fu.pkl', 'rb') as f: svd_fu = pickle.load(f) with open('feature/svd_pp_fi.pkl', 'rb') as f: svd_fi = pickle.load(f) with open('feature/svd_pp_pred.pkl', 'rb') as f: svd_pred = pickle.load(f) ratings = pd.merge(ratings, users, how='left', on='user_id')
def generate_svd_recommendation_df() -> pd.DataFrame:
    """Benchmark SVD, SVD++ and NMF on the score data and build top-5 lists.

    For each algorithm the function runs a 5-fold cross-validation, fits the
    model on the full trainset, predicts the anti-testset, prints accuracy
    measures and (except for the modified SVD++) merges the top-5
    recommendations with the module-level ``df_reports_id`` on ``report_id``.

    Fixes over the previous version: the SVD++ and NMF factors/biases are now
    read from their own fitted models instead of ``algo_svd``, and the
    "modified" SVD++ / NMF models (50 factors, 50 epochs) are the ones that
    are actually cross-validated and fitted in their sections.

    NOTE(review): the signature promises a DataFrame but no ``return``
    statement is visible in this function -- confirm the intended result.
    """
    # Prepare input DataFrame and algorithm
    score_df = genearte_score_df()
    svd_data = MyDataSet(score_df)
    print(score_df)
    print(svd_data.raw_ratings)

    # Try SVD
    algo_svd = SVD()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo_svd, svd_data, measures=['RMSE', 'MAE'],
                           cv=5, verbose=True)
    # Fitting the SVD
    algo_svd.fit(full_train_set)
    predictions = algo_svd.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svd = get_top_n(predictions, n=5)
    latent_usr_factor = algo_svd.pu
    latent_item_factor = algo_svd.qi
    user_bias = algo_svd.bu
    item_bias = algo_svd.bi
    recomendation_reportname_df_svd = pd.merge(
        recommendation_df_svd, df_reports_id, how='left', on='report_id')

    # Try SVD++
    algo_svdpp = SVDpp()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo_svdpp, svd_data, measures=['RMSE', 'MAE'],
                           cv=5, verbose=True)
    # Fitting the SVD++
    algo_svdpp.fit(full_train_set)
    predictions = algo_svdpp.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svdpp = get_top_n(predictions, n=5)
    # Read the parameters of the SVD++ model (was: algo_svd).
    latent_usr_factor_pp = algo_svdpp.pu
    latent_item_factor_pp = algo_svdpp.qi
    user_bias_pp = algo_svdpp.bu
    item_bias_pp = algo_svdpp.bi
    recomendation_reportname_df_svdpp = pd.merge(
        recommendation_df_svdpp, df_reports_id, how='left', on='report_id')

    # Try SVD++ with more factors as default is 20
    algo_svdpp_mod = SVDpp(n_factors=50, n_epochs=50)
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation of the modified model (was: algo_svdpp)
    score = cross_validate(algo_svdpp_mod, svd_data,
                           measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fitting the modified SVD++
    algo_svdpp_mod.fit(full_train_set)
    predictions = algo_svdpp_mod.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    print(score)
    #print (recommendation_df)

    # Try the NMF
    #nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False)
    algo_nmf = NMF()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation
    score = cross_validate(algo_nmf, svd_data, measures=['RMSE', 'MAE'],
                           cv=5, verbose=True)
    # Fitting the NMF
    algo_nmf.fit(full_train_set)
    predictions = algo_nmf.test(test_set)
    # Then compute RMSE and MAE
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    # Generate recommendation DataFrame
    recommendation_df_nmf = get_top_n(predictions, n=5)
    #print (recommendation_df)
    # Read the parameters of the NMF model (was: algo_svd).
    latent_usr_factor_nmf = algo_nmf.pu
    latent_item_factor_nmf = algo_nmf.qi
    user_bias_nmf = algo_nmf.bu
    item_bias_nmf = algo_nmf.bi
    recomendation_reportname_df_mmf = pd.merge(
        recommendation_df_nmf, df_reports_id, how='left', on='report_id')
    # NOTE(review): `recomendation_reportname_df` is not assigned anywhere in
    # this function; presumably one of the merged frames above (or a
    # module-level frame) is meant -- verify.
    sidd_recmidation = recomendation_reportname_df.loc[
        recomendation_reportname_df['user_sso'] == 212568816]

    # Try the NMF without default
    #nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False)
    algo_nmf_mod = NMF(n_factors=50, n_epochs=50)
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5 fold validation of the modified model (was: algo_nmf)
    score = cross_validate(algo_nmf_mod, svd_data, measures=['RMSE', 'MAE'],
                           cv=5, verbose=True)
    # Fitting the modified NMF
    algo_nmf_mod.fit(full_train_set)
    predictions = algo_nmf_mod.test(test_set)
    # Then compute RMSE and MAE
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    # Generate recommendation DataFrame
    recommendation_df_nmf = get_top_n(predictions, n=5)
    #print (recommendation_df)
    latent_usr_factor_nmf = algo_nmf_mod.pu
    latent_item_factor_nmf = algo_nmf_mod.qi
    user_bias_nmf = algo_nmf_mod.bu
    item_bias_nmf = algo_nmf_mod.bi
    recomendation_reportname_df_mmf = pd.merge(
        recommendation_df_nmf, df_reports_id, how='left', on='report_id')
    # NOTE(review): same unresolved name as in the NMF section above.
    sidd_recmidation = recomendation_reportname_df.loc[
        recomendation_reportname_df['user_sso'] == 212568816]

    #---------------------------------------------------
    # as per - https://bmanohar16.github.io/blog/recsys-evaluation-in-surprise
    # Matrix Factorization Based Algorithms
    svd_cv = cross_validate(algo_svd, svd_data, cv=5, n_jobs=5, verbose=False)
    svdpp_cv = cross_validate(algo_svdpp, svd_data, cv=5, n_jobs=5,
                              verbose=False)
    nmf_cv = cross_validate(algo_nmf, svd_data, cv=5, n_jobs=5, verbose=False)
    svdpp_cv_mod = cross_validate(algo_svdpp_mod, svd_data, cv=5, n_jobs=5,
                                  verbose=False)
    nmf_cv_mod = cross_validate(algo_nmf_mod, svd_data, cv=5, n_jobs=5,
                                verbose=False)
# Load the movielens-100k dataset UserID::MovieID::Rating::Timestamp data = Dataset.load_builtin('ml-1m') trainset, testset = train_test_split(data, test_size=.15) # Configura o algoritmo. K = número de vizinhos. Name = Tipo de medida de similiradade. User based = filtragem por usuário ou item. algoritmo = SVDpp(n_epochs=5) algoritmo.fit(trainset) # Selecionamos o usuário e o filme que será analisado # User 49. Tem entre 18 e 24 anos. É programador e mora em Huston, Texas uid = str(49) # Filme visto e avaliado: Negotiator, The (1998)::Action|Thriller. Avaliação 4 iid = str(2058) # raw item id # get a prediction for specific users and items. pred = algoritmo.predict(uid, iid, r_ui=4, verbose=True) # run the trained model against the testset test_pred = algoritmo.test(testset) # Avalia RMSE print("Avaliação RMSE: ") accuracy.rmse(test_pred, verbose=True) # Avalia MAE print("Avaliação MAE: ") accuracy.mae(test_pred, verbose=True)
# First map the predictions to each user. top_n = defaultdict(list) for uid, iid, true_r, est, _ in predictions: top_n[uid].append((iid, est)) # Then sort the predictions for each user and retrieve the k highest ones. for uid, user_ratings in top_n.items(): user_ratings.sort(key=lambda x: x[1], reverse=True) top_n[uid] = user_ratings[:n] return top_n algo = SVDpp() algo.fit(train_set) predictions = algo.test(testsets.values) top_n = get_top_n(predictions, n=10) # Print the recommended items for each user for uid, user_ratings in top_n.items(): print(uid, [iid for (iid, _) in user_ratings]) kf = KFold(n_splits=3) algo = SVDpp() for trainset, testset in kf.split(data): # train and test algorithm. algo.fit(trainset) predictions = algo.test(testset) # Compute and print Root Mean Squared Error accuracy.rmse(predictions, verbose=True)
# Precision@K: Proportion of recommended items that are relevant precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1 # Recall@K: Proportion of relevant items that are recommended recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1 return precisions, recalls ## Check precision and recall for K predictions for each user kf = KFold(n_splits=2) algo = SVDpp() for trainset, testset in kf.split(data): algo.fit(trainset) predictions = algo.test(testset) precisions, recalls = precision_recall_at_k(predictions, k=15, threshold=3.5) # Precision and recall can then be averaged over all users print(sum(prec for prec in precisions.values()) / len(precisions)) print(sum(rec for rec in recalls.values()) / len(recalls)) ## An item is considered relevant if its true rating rui is greater than a ## given threshold. An item is considered recommended if its estimated rating ## r^ui is greater than the threshold, and if it is among the k highest ## estimated ratings. ## Remember that: ## Recall = Sensitivity = TP / (TP + FN)