def making_model(id_purify_data, skin_type):
    """Fit a default SVD model on the evaluation data for one skin type.

    Both arguments are forwarded unchanged to ``making_evaluate_data``. The
    dataset it returns is turned into a full trainset (no hold-out split)
    and used to fit an ``SVD`` created with default hyper-parameters.

    Returns:
        The fitted ``SVD`` instance.
    """
    full_trainset = making_evaluate_data(
        id_purify_data, skin_type).build_full_trainset()

    model = SVD()
    model.fit(full_trainset)
    return model
# Copy every (user, item, rating) triple of the trainset into ``testset``,
# translating Surprise's inner ids back to the raw ids of the input data.
# ``enumerate`` supplies the slot index, so no hand-maintained counter is
# needed (the previous counter was named ``iter`` and shadowed the builtin).
# NOTE(review): ``testset`` is assigned by index, so it is presumably
# pre-allocated by the caller -- confirm.
for idx, (uid, iid, ratings) in enumerate(trainset.all_ratings()):
    ruid = trainset.to_raw_uid(uid)
    riid = trainset.to_raw_iid(iid)
    testset[idx] = [ruid, riid, ratings]

# Output testset to a csv file
PM = pd.DataFrame(testset)
PM.to_csv("TestSet.csv")

# Initializing algorithm with predefined options.
# Alternatives that were tried: NMF(biased=True), KNNBaseline().
algo = SVD(biased=True)

# Initializing sizes for Adaboost parameter matrices.  ``m`` is used as the
# leading dimension of the per-model matrices and of the RMSE vector below.
size_ui = (trainset.n_users + 1, trainset.n_items + 1)
size_mui = (m, trainset.n_users + 1, trainset.n_items + 1)
size_wmui = (m, WholeSet.n_users + 1, WholeSet.n_items + 1)

# Initializing weight matrix
W = np.ones(size_ui)

# Initializing Adaboost Prediction matrix from ABtestset
ABPredictM = np.zeros(size_wmui)

# Initializing weight-update Prediction matrix from T_train
PredictM = np.zeros(size_mui)

# Initializing RMSE vector to store RMSE of ABtestset from each model in
# Adaboost iteration
ABRMSE = np.zeros(m, dtype=float)
data = ml.loadMovieLensLatestSmall() print("\nComputing movie popularity ranks so we can measure novelty later...") rankings = ml.getPopularityRanks() return (ml, data, rankings) np.random.seed(0) random.seed(0) # Load up common data set for the recommender algorithms (ml, evaluationData, rankings) = LoadMovieLensData() # Construct an Evaluator to, you know, evaluate them evaluator = Evaluator(evaluationData, rankings) # SVD SVD = SVD() evaluator.AddAlgorithm(SVD, "SVD") # SVD++ SVDPlusPlus = SVDpp() evaluator.AddAlgorithm(SVDPlusPlus, "SVD++") # Just make random recommendations Random = NormalPredictor() evaluator.AddAlgorithm(Random, "Random") # Fight! evaluator.Evaluate(False) evaluator.SampleTopNRecs(ml)
# First map the predictions to each user. top_n = defaultdict(list) for uid, iid, true_r, est, _ in predictions: top_n[uid].append((iid, est)) # Then sort the predictions for each user and retrieve the k highest ones. for uid, user_ratings in top_n.items(): user_ratings.sort(key=lambda x: x[1], reverse=True) top_n[uid] = user_ratings[:n] return top_n # First train an SVD algorithm on the movielens dataset. data = Dataset.load_builtin('ml-1m') trainset = data.build_full_trainset() algo = SVD() algo.train(trainset) # Then predict ratings for all pairs (u, i) that are NOT in the training set testset = trainset.build_anti_testset() predictions = algo.test(testset) top_n = get_top_n(predictions, n=10) movies = pd.read_csv('movies.csv', index_col='id') rec = top_n['196'][0][0] print('Top movie recommendation for user_id 196: {}'.format( \ movies[movies.index==int(rec)]))
# Hyper-parameter grid explored for SVD: two candidate values for each of
# the three parameters.
param_grid = {
    'n_epochs': [20, 30],
    'lr_all': [0.005, 0.010],
    'n_factors': [50, 100],
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(evaluationData)

# Report the best RMSE and the parameter combination that produced it.
print("Best RMSE score attained: ", gs.best_score['rmse'])
print(gs.best_params['rmse'])

# The evaluator compares the tuned model, an untuned baseline and a random
# recommender on the same data.
evaluator = Evaluator(evaluationData, rankings)

params = gs.best_params['rmse']
SVDtuned = SVD(**{
    option: params[option]
    for option in ('n_epochs', 'lr_all', 'n_factors')
})
SVDUntuned = SVD()
Random = NormalPredictor()  # just makes random recommendations

for contender, label in (
        (SVDtuned, "SVD - Tuned"),
        (SVDUntuned, "SVD - Untuned"),
        (Random, "Random"),
):
    evaluator.AddAlgorithm(contender, label)

# Run the comparison, then print sample top-N recommendations.
evaluator.Evaluate(False)
evaluator.SampleTopNRecs(gb)
# One figure holding the ROC curve of each algorithm, all trained and
# tested on the same 90/10 split.
fig = plt.figure(figsize=[12, 10])
fig.set_tight_layout(True)

trainset, testset = train_test_split(data, test_size=0.1)

# (algorithm class, constructor kwargs, legend label, line colour)
roc_specs = [
    (KNNWithMeans, {'k': 30, 'sim_options': {'name': 'pearson'}},
     'K-NN', 'darkorange'),  # k found in Q11
    (NMF, {'n_factors': 20}, 'NNMF', 'cyan'),
    (SVD, {'n_factors': 8, 'biased': True}, 'MF with bias', 'lime'),
]
for algo_cls, algo_kwargs, curvelabel, color in roc_specs:
    # Instantiate right before plotting, one algorithm at a time.
    algo = algo_cls(**algo_kwargs)
    plot_ROC_of_algo(algo=algo,
                     curvelabel=curvelabel,
                     color=color,
                     trainset=trainset,
                     testset=testset)

# Diagonal reference line (FPR == TPR) and axis decoration.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc="lower right")
from surprise import Dataset, SVD, Reader
import pandas as pd

# Train an SVD model on the labelled ratings and write predictions for the
# unlabelled test pairs to ``submission.csv``.
train_rating_df = pd.read_csv("train_rating.txt", header=0, index_col=0)
test = pd.read_csv('test_rating.txt', header=0, index_col=0)

# Surprise needs a rating column even for rows we only want to predict;
# the placeholder value is never used for training.
test['dummy_rating'] = '-1'

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    train_rating_df[['user_id', 'business_id', 'rating']], reader)
trainset = data.build_full_trainset()

algo = SVD(lr_all=0.0035,
           reg_all=0.04,
           n_factors=200,
           lr_bu=0.01,
           lr_bi=0.01)
# ``train()`` was deprecated and later removed from Surprise; ``fit()`` is
# the supported way to train on a trainset.
algo.fit(trainset)

testdata = Dataset.load_from_df(
    test[['user_id', 'business_id', 'dummy_rating']], reader)
predictions = algo.test(
    testdata.construct_testset(raw_testset=testdata.raw_ratings))

# Keep only the estimated rating of each prediction, renamed to the column
# name used in the submission file.
df = pd.DataFrame(predictions)
newdf = df['est']
newdf.rename('rating', inplace=True)
newdf.to_csv('submission.csv', header='rating', index_label='test_id')
inplace=True) # In[39]: predictions_df.groupby('userid').head(10).reset_index(drop=True) # ## SVD Based Recommendation # In[40]: from surprise import SVD from surprise import accuracy # In[41]: svd_model = SVD(n_factors=50, biased=False) svd_model.fit(trainset) test_pred_svd = svd_model.test(testset) # ### RMSE for SVD # In[42]: accuracy.rmse(test_pred_svd) accuracy.mae(test_pred_svd) # In[43]: test_pred_svd[20] # ### Parameter tuning for SVD
b = datetime.now()
# ``timedelta.seconds`` only holds the sub-day remainder and silently wraps
# after 24 hours; ``total_seconds()`` reports the whole elapsed time.
print("共", int((b - a).total_seconds()), "秒")

# Transfer dataCombed into the tab-separated file layout that Surprise's
# file loader expects.
data = shuffle(dataCombed)
del dataCombed
data.to_csv("dataForDump/trainingData.data",
            sep='\t',
            header=False,
            index=False)
reader = Reader(line_format='user item rating', sep='\t')
file_path = os.path.expanduser('dataForDump/trainingData.data')
dataForTraining = Dataset.load_from_file(file_path, reader=reader)
dataForTraining = dataForTraining.build_full_trainset()

# Fit SVD on the full training set.
algo = SVD(n_factors=30, n_epochs=30, lr_all=0.009, reg_all=0.08)
algo.fit(dataForTraining)

# Dump the fitted algorithm for later usage (no predictions are stored).
# To reload: prediction, algor = dump.load("interDump/svd-predictions")
dump.dump("interDump/svd-predictions",
          predictions=None,
          algo=algo,
          verbose=False)

# ----------------Insert supplierId-bidId-score into database----------------
# Get ids of all suppliers who have at least one operation.
supplierId = data[['sid']].copy()
supplierId.drop_duplicates(inplace=True)

a = datetime.now()
# supplierData_dict: suppliers' primary/secondary materials as a dict.
supplierData_dict = {}
dataframe["itemID"] = items dataframe["userID"] = users dataframe["ratings"] = ratings return dataframe # ========================================================================= # ######################################################################### # Tests against Scikit-Surprise # ######################################################################### reader = Reader(rating_scale=(0, 1)) algo = SVD(n_factors=K, n_epochs=100, biased=False, reg_all=0, lr_all=alph, verbose=False) data = Dataset.load_from_df(mlong1, reader) trainset = data.build_full_trainset() algo.train(trainset) testset = trainset.build_anti_testset() predictions = algo.test(testset) dfpred1 = predictions_df(predictions) df1 = pd.concat([mlong1, dfpred1]) df1 = pd.DataFrame(df1) df1 = df1.pivot(index="userID", columns="itemID", values="ratings") num1 = np.array(df1) data = Dataset.load_from_df(mlong2, reader)
alg1 = surprise.SVD()
alg2 = surprise.KNNBasic()
alg3 = surprise.NMF()

# Individual runs, kept for reference:
# cross_validate(alg1, data, measures=['RMSE', 'MAE', "MSE"], cv=5, verbose=True)
# cross_validate(alg2, data, measures=['RMSE', 'MAE', "MSE"], cv=5, verbose=True)
# cross_validate(alg3, data, measures=['RMSE', 'MAE', "MSE"], cv=5, verbose=True)

##############
# EVALUATION #
##############

benchmark = []
# Iterate over all algorithms.  With cv=5, each fold is used once for
# testing while the remaining four folds are used for training.
for algorithm in [SVD(), NMF(), KNNBasic()]:
    # Perform cross validation
    results = cross_validate(algorithm,
                             data,
                             measures=['RMSE', 'MAE', "MSE"],
                             cv=5,
                             verbose=False)

    # Average every reported measure over the folds and tag the row with
    # the algorithm's class name.  ``Series.append`` was removed in
    # pandas 2.0, so the name is attached with ``pd.concat``.
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = pd.concat(
        [tmp, pd.Series([type(algorithm).__name__], index=['Algorithm'])])
    benchmark.append(tmp)

# One row per algorithm, best (lowest) test RMSE first.
pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
print( "\nComputing movie popularity ranks so we can measure novelty later..." ) rankings = ml.get_popularity_ranks() print("\nComputing item similarities so we can measure diversity later...") full_trainset = data.build_full_trainset() options = {'name': 'pearson_baseline', 'user_based': False} knn_model = KNNBaseline(sim_options=options) knn_model.fit(full_trainset) print("\nBuilding recommendation model...") train, test = train_test_split(data, test_size=.25, random_state=1) svd_model = SVD(random_state=10) svd_model.fit(train) print("\nComputing recommendations...") predictions = svd_model.test(test) print("\nEvaluating accuracy of model...") print("RMSE: ", metrics.rmse(predictions)) print("MAE: ", metrics.mae(predictions)) print("\nEvaluating top-10 recommendations...") # Set aside one rating per user for testing LOOCV = LeaveOneOut(n_splits=1, random_state=1) for train, test in LOOCV.split(data):
def _movie_ids_per_rating(header_rows, total_rows):
    """Return the movie id belonging to every rating row, in file order.

    Args:
        header_rows: Ascending row positions of the movie header rows (the
            rows whose rating is missing).  The n-th header introduces
            movie id ``n`` (1-based).
        total_rows: Total number of rows in the file, headers included.

    Returns:
        A 1-D integer array with one entry per non-header row.

    Raises:
        ValueError: If there are no header rows at all.
    """
    header_rows = np.asarray(header_rows, dtype=np.int64)
    if header_rows.size == 0:
        raise ValueError('no movie header rows found in the ratings file')

    # Each movie's block ends where the next header starts; the last block
    # ends at the end of the file.
    block_ends = np.append(header_rows[1:], total_rows)
    ratings_per_movie = block_ends - header_rows - 1
    movie_ids = np.arange(1, header_rows.size + 1)
    return np.repeat(movie_ids, ratings_per_movie)


def main():
    """Recommend movies for one customer using Surprise's SVD.

    Reads ``input/combined_data_1.txt`` and ``input/movie_titles.csv``,
    removes movies and customers below the 70th percentile of review
    counts, cross-validates SVD on the first 100,000 rows, fits it on the
    whole filtered dataset, prints the ten highest estimated titles for the
    chosen customer and writes the full ranking to
    ``output/recommendation_results.csv``.
    """
    start_time = time.perf_counter()

    # Customer ids are compared as strings below, so the same string must
    # be used for prediction: passing an int would not match any raw user
    # id known to the model.
    target_user = '785314'

    # Load dataset
    df = pd.read_csv('input/combined_data_1.txt',
                     names=['Cust-Id', 'Ratings'],
                     usecols=[0, 1],
                     header=None)
    df.index = np.arange(0, len(df))

    # Rows with a missing rating are the movie headers separating the
    # per-movie blocks.  Built in one vectorised step instead of repeatedly
    # re-allocating an array with ``np.append``.
    header_rows = np.flatnonzero(pd.isnull(df['Ratings']).to_numpy())
    movie_ids = _movie_ids_per_rating(header_rows, len(df))

    # Keep only the rating rows and label each with its movie id.
    df = df[pd.notnull(df['Ratings'])].copy()
    df['Movie_Id'] = movie_ids

    summary_stats = ['count', 'mean']

    # Benchmark movies
    df_movie_summary = df.groupby('Movie_Id')['Ratings'].agg(summary_stats)
    df_movie_summary.index = df_movie_summary.index.map(int)
    movie_benchmark = round(df_movie_summary['count'].quantile(0.7), 0)
    movie_list = df_movie_summary[
        df_movie_summary['count'] < movie_benchmark].index
    print(f'Movie minimum times of review: {movie_benchmark}')

    # Benchmark users
    df_customer_summary = df.groupby('Cust-Id')['Ratings'].agg(summary_stats)
    customer_benchmark = round(df_customer_summary['count'].quantile(0.7), 0)
    customer_list = df_customer_summary[
        df_customer_summary['count'] < customer_benchmark].index
    print(f'Customer minimum times of review: {customer_benchmark}')

    # Drop the rarely reviewed movies and the rarely reviewing customers.
    df = df[~df['Movie_Id'].isin(movie_list)]
    df = df[~df['Cust-Id'].isin(customer_list)]
    df = df.reset_index(drop=True)

    # Load movie titles into dataframe.  Forward slashes work on every
    # platform, unlike the previous Windows-only backslash paths.
    df_title = pd.read_csv('input/movie_titles.csv',
                           encoding="ISO-8859-1",
                           names=['Movie_Id', 'Year', 'Name'])
    df_title.set_index('Movie_Id', inplace=True)

    # Top 100K rows for faster evaluating
    reader = Reader()
    data = Dataset.load_from_df(
        df[['Cust-Id', 'Movie_Id', 'Ratings']][:100000], reader)

    # Choose and evaluate the algorithm
    algorithm = SVD()
    cross_validate(algorithm,
                   data,
                   measures=['RMSE', 'MAE'],
                   cv=3,
                   verbose=True)

    # 5-star movies of the chosen user, joined to their titles through the
    # Movie_Id column (``df``'s own index is positional, not a movie id).
    df_chosen_user = df[(df['Cust-Id'] == target_user) & (df['Ratings'] == 5)]
    df_chosen_user = df_chosen_user.join(df_title, on='Movie_Id')

    # Candidates: every title that survived the movie filter and that the
    # user has not already rated 5 stars.
    chosen_user = df_title.reset_index()
    is_filtered_out = chosen_user['Movie_Id'].isin(movie_list)
    is_already_loved = chosen_user['Movie_Id'].isin(
        df_chosen_user['Movie_Id'])
    chosen_user = chosen_user[~is_filtered_out & ~is_already_loved].copy()

    # Fit on the complete filtered dataset
    data = Dataset.load_from_df(df[['Cust-Id', 'Movie_Id', 'Ratings']],
                                reader)
    trainset = data.build_full_trainset()
    algorithm.fit(trainset)

    # Predict
    chosen_user['Estimate_Score'] = chosen_user['Movie_Id'].apply(
        lambda movie_id: algorithm.predict(target_user, movie_id).est)

    # Sort and clean prediction to print on console
    chosen_user = chosen_user.sort_values(['Estimate_Score'], ascending=False)
    chosen_user["Year"] = chosen_user["Year"].fillna(0.0).astype(int)
    print(chosen_user.head(n=10).to_string(index=False))

    # ``perf_counter`` has an arbitrary origin, so only the difference to
    # the start reading is a meaningful duration.
    elapsed = time.perf_counter() - start_time
    print(f"Total prediction time {int(elapsed)} seconds")

    # Print complete results to csv
    chosen_user.to_csv("output/recommendation_results.csv", index=False)
def make_predictions(self):
    """
    Predict ratings of un-rated wines based on past ratings and SVD.

    Cross-validates an un-tuned SVD (and, when ``self.tune`` is set, the
    model returned by ``self.hyper_tune()``) on ``self.n_splits`` folds of
    ``self.data_ml`` and prints a summary for each.  One model -- the tuned
    one if tuning is enabled, otherwise the un-tuned one -- is then fitted
    on the full data set and used to predict every user/wine pair that has
    no rating.

    Returns
    -------
    top_k_items : defaultdict
        Top k recommended wines.
    top_k_items_pd : DataFrame
        Top k recommended wines. Columns: Username, Wine, est.
    predictions : list of surprise.prediction_algorithms.predictions.Prediction objects
        All rating predictions for all users and all wines.
    """
    measures = ['rmse', 'mae', 'preci.@k', 'recall@k']

    def _new_stats():
        # Per-fold results in the shape print_summary expects:
        # a dict of lists for the measures plus two timing lists.
        return {
            'scores': {measure: [] for measure in measures},
            'train_time': [],
            'test_time': [],
        }

    def _mean_over_users(per_user):
        # Average a {user: value} mapping over all users.
        return sum(per_user.values()) / len(per_user)

    def _run_fold(model, stats, fold_trainset, fold_testset):
        # Train and test ``model`` on one fold and record time and metrics.
        start_time = time.time()
        model.fit(fold_trainset)
        stats['train_time'].append(time.time() - start_time)

        start_time = time.time()
        fold_predictions = model.test(fold_testset)
        stats['test_time'].append(time.time() - start_time)

        scores = stats['scores']
        scores['rmse'].append(accuracy.rmse(fold_predictions, verbose=False))
        scores['mae'].append(accuracy.mae(fold_predictions, verbose=False))
        precisions, recalls = self.precision_recall_at_k(fold_predictions,
                                                         threshold=3.5)
        scores['preci.@k'].append(_mean_over_users(precisions))
        scores['recall@k'].append(_mean_over_users(recalls))

    # Tuning
    # if tune, always compare tuned and un-tuned cross-validation results
    candidates = []  # (report title, model, per-fold stats), tuned first
    if self.tune:
        tuned_algo = self.hyper_tune()
        candidates.append(('Tuned', tuned_algo, _new_stats()))
    algo = SVD()
    candidates.append(('Un-tuned', algo, _new_stats()))

    # Cross-validation
    # 5 folds (default) corresponds to a 80/20 split.  Every candidate is
    # evaluated on the same folds so the summaries are comparable.
    kf = KFold(n_splits=self.n_splits)
    for fold_trainset, fold_testset in kf.split(self.data_ml):
        for _, model, stats in candidates:
            _run_fold(model, stats, fold_trainset, fold_testset)

    # print metrics
    # take advantage of surprise.model_selection.validation.print_summary
    for title, model, stats in candidates:
        print('{} Cross-Validation Results:'.format(title))
        surprise.model_selection.validation.print_summary(
            model, measures, stats['scores'], None, stats['train_time'],
            stats['test_time'], self.n_splits)

    # Make recommendations
    # only recommend using tuned OR un-tuned algorithm
    if self.tune:
        final_algo, label = tuned_algo, 'tuned'
    else:
        final_algo, label = algo, 'un-tuned'

    # train on the full data set
    full_trainset = self.data_ml.build_full_trainset()
    start_time = time.time()
    final_algo.fit(full_trainset)
    train_time = time.time() - start_time
    print("Took {} seconds for {} full training.".format(train_time, label))

    # All user-item pairs with no rating (don't recommend already rated
    # wines).  This must be derived from the FULL trainset: building it
    # from the last cross-validation fold would treat that fold's held-out
    # ratings as "unrated" and allow already rated wines to be recommended.
    anti_testset = full_trainset.build_anti_testset()

    start_time = time.time()
    predictions = final_algo.test(anti_testset)
    test_time = time.time() - start_time
    print("Took {} seconds for {} predictions.".format(test_time, label))

    # Get top-k predictions for all users
    top_k_items, top_k_items_pd = self.get_top_k(predictions)

    return top_k_items, top_k_items_pd, predictions
def main(rec='SVD', threshold=4, topK=10, data_path=None):
    """Train a recommender on MovieLens-100k and report ranking metrics.

    Args:
        rec: Algorithm to use: ``'NMF'``, ``'SVD'``, or anything else for
            an item-based ``KNNBaseline`` with cosine similarity.
        threshold: Forwarded to ``get_top_n``.
        topK: Forwarded to ``evaluate_model_new``.
        data_path: Tab-separated ``user item rating`` file used for both
            training and evaluation.  Defaults to the ml-100k ``u.data``
            location used previously.

    Returns:
        The first value returned by ``get_top_n`` (documented by the
        original author as a list of (item id, estimated rating) tuples).
    """
    print("load data...")

    if data_path is None:
        data_path = r'C:\Users\abc\.surprise_data\ml-100k\ml-100k\u.data'
    # The same file serves as training data and as evaluation data.
    test_data_path = data_path
    file_path = os.path.expanduser(data_path)

    reader = Reader(line_format='user item rating', sep='\t')
    data = Dataset.load_from_file(file_path, reader=reader)
    trainset = data.build_full_trainset()

    # Users, items and ratings of the evaluation file.
    test_user, test_item, test_rate = read_data(test_data_path)

    print("training...")

    # Choose the algorithm.
    if rec == 'NMF':
        algo = NMF()
    elif rec == 'SVD':
        algo = SVD()
    else:
        # Item-item similarity.
        algo = KNNBaseline(sim_options={'name': 'cosine',
                                        'user_based': False})

    train_start = time.time()
    algo.fit(trainset)
    train_end = time.time()
    print('train time:%.1f s' % (train_end - train_start))

    # Then predict ratings for all pairs (u, i) that are NOT in the
    # training set, i.e. fill in the missing entries.
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    test_end = time.time()
    print('test time:%.1f s' % (test_end - train_end))

    top_n_est, true_ratings = get_top_n(predictions,
                                        n=10,
                                        threshold=threshold)

    # Model evaluation (``mean_ap`` avoids shadowing the builtin ``map``).
    f1, mean_ap, mrr, mndcg = evaluate_model_new(algo, test_user, test_item,
                                                 test_rate, topK)
    eval_end = time.time()
    print('evaluate time:%.1f s' % (eval_end - test_end))
    print("algorithm : %s" % rec)
    print(
        'recommendation metrics: F1 : %0.4f, NDCG : %0.4f, MAP : %0.4f, MRR : %0.4f'
        % (f1, mndcg, mean_ap, mrr))

    # Report the number of users and items from the trainset.  The previous
    # code formatted ``algo.pu.shape`` (a 2-tuple) with a single
    # placeholder, which raises TypeError, and ``pu``/``qi`` do not exist
    # on KNNBaseline at all.
    print('%d个用户' % trainset.n_users)
    print('%d个物品' % trainset.n_items)
    return top_n_est
qualified = qualified.sort_values('wr', ascending = False).head(10) return qualified improved_recommendations('The Dark Knight') #---------------------------------------------- Collborative Filtering Based Recommender ---------------------------------------------- reader = Reader() ratings = pd.read_csv('ratings_small.csv') ratings.head() data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader) data.split(n_folds = 5) svd = SVD() evaluate(svd, data, measures = ['RMSE', 'MAE']) trainset = data.build_fill_trainset() svd.train(trainset) ratings[ratings['userId'] == 1] svd.predict(1, 302, 3) #---------------------------------------------- Hybrid Recommender ---------------------------------------------- def convert_int(x): try: return int(x) except:
random.seed(0)
data = GetBookData(density_filter=False)
trainset, testset = train_test_split(data, test_size=0.25)

# Tuning parameters.  ``n_epochs`` previously listed 30 twice, which only
# made the grid search evaluate every combination two times.
# NOTE(review): the search runs on the full ``data``, which includes the
# ratings later used as ``testset`` -- the reported metrics are therefore
# optimistic.
param_grid = {
    'n_epochs': [30],
    'lr_all': [0.001, 0.15],
    'reg_all': [0.01, 0.1],
    'n_factors': [10, 200],
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5)
gs.fit(data)

# Build the final model from ALL tuned parameters.  ``reg_all`` used to be
# searched but then dropped here, so the model silently fell back to the
# default regularisation.
params = gs.best_params['rmse']
SVD_TUNED = SVD(**params)
SVD_TUNED.fit(trainset)
gs_predictions = SVD_TUNED.test(testset)

rmse = accuracy.rmse(gs_predictions)

# Precision/recall at k per user, then averaged over all users.
precisions, recalls = precision_recall_at_k(gs_predictions,
                                            k=10,
                                            threshold=4.9)
avg_precision = sum(precisions.values()) / len(precisions)
avg_recall = sum(recalls.values()) / len(recalls)

metrics = {
    'rmse': rmse,
    'avg_precision': avg_precision,
    'avg_recall': avg_recall,
    'best_parameters': params,
}
results['SVD'] = metrics
def Cal_Svd(filepath, user_id, n_places=2106, top_n=5):
    """Recommend the highest-scoring places for one user with SVD.

    Fits an SVD model on every rating in the CSV at ``filepath``, predicts
    the user's rating for each place id in ``range(0, n_places)`` and keeps
    the ``top_n`` predictions with the highest estimate.

    Args:
        filepath: CSV of ratings passed whole to ``Dataset.load_from_df``.
            NOTE(review): that loader expects exactly the user, item and
            rating columns, in that order -- confirm the file's layout.
        user_id: Raw id of the user to recommend for.
        n_places: Exclusive upper bound of the place ids to score.
        top_n: Number of recommendations to keep.

    Returns:
        DataFrame of the ``top_n`` predictions sorted by ``est``
        descending.  The same frame is also appended to the module-level
        ``results`` list, as before.
    """
    # 1. raw dataset
    rating = pd.read_csv(filepath)

    # 2. build the Surprise dataset; ratings range from 1 to 5
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df=rating, reader=reader)

    # 3. train on every available rating
    train = data.build_full_trainset()

    # 4. fit the model
    model = SVD(n_factors=100, n_epochs=20, random_state=123)
    model.fit(train)

    # 5. score every candidate place for the user.  The true rating is
    # unknown, so 0 is passed as a placeholder; it does not influence the
    # estimate.  (The exploratory crosstab/groupby tables and the no-op
    # ``actual_rating`` check that used to live here were dead code.)
    predict_result = [
        model.predict(user_id, item_id, 0) for item_id in range(0, n_places)
    ]
    ddff = pd.DataFrame(predict_result)

    # Top recommendations for the user.
    result = ddff.sort_values(by='est', ascending=False)[:top_n]
    results.append(result)
    return result


# Example usage:
# if __name__ == '__main__':
#     Cal_Svd(filepath, user_id)
#     print(results[0])          # a DataFrame
#     print(results[0]['iid'])   # the recommended placeIds
user = prediction[0] book = prediction[1] actual_rating = prediction[2] recc_rating = prediction[3] if actual_rating == 0: write_str = str(user) + "," + str(book) + "," + str( actual_rating) + "," + str(recc_rating) + "\n" pred_file.write(write_str) pred_file.close() print("done") # In[ ]: from surprise import SVD algo = SVD(n_factors=20, n_epochs=500, random_state=1) trainSet = data.build_full_trainset() algo.fit(trainSet) cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=10, verbose=True) # In[ ]: testset = trainSet.build_testset() pred = algo.test(testset) accuracy.rmse(pred, verbose=True), accuracy.mae(pred, verbose=True) # In[ ]: from surprise import KNNBasic algo = KNNBasic(n_factors=20, n_epochs=500, random_state=1)
def build_model():
    """Build and pickle the content, collaborative and SVD models.

    Reads the MovieLens files from ``./ml-20m/``, keeps the ratings of
    users with at least 55 ratings, derives a content latent matrix
    (TF-IDF of tags and genres reduced by truncated SVD), a collaborative
    latent matrix (movie x user rating matrix reduced by truncated SVD)
    and a Surprise ``SVD`` model, then pickles everything into
    ``./Files/``.  Returns ``None``.
    """
    import os

    # Make sure the output directory exists before the expensive work.
    os.makedirs('./Files', exist_ok=True)

    # Load movies data from ./ml-20m/
    movies = pd.read_csv('ml-20m/movies.csv')
    tags = pd.read_csv('ml-20m/tags.csv')
    ratings = pd.read_csv('ml-20m/ratings.csv')

    # Limit ratings to users that have rated at least 55 movies.  This also
    # limits the number of movies kept, to fit limited hardware.
    ratings_f = ratings.groupby('userId').filter(lambda x: len(x) >= 55)
    movie_list_rating = ratings_f.movieId.unique().tolist()

    # filter the movies data frame
    movies = movies[movies.movieId.isin(movie_list_rating)]

    # map movie title to id
    Mapping_file = dict(zip(movies.title.tolist(), movies.movieId.tolist()))

    # Remove unnecessary timestamps.  ``columns=`` replaces the positional
    # axis argument that pandas 2.0 no longer accepts, and reassigning
    # avoids an in-place drop on a filtered frame.
    tags = tags.drop(columns=['timestamp'])
    ratings_f = ratings_f.drop(columns=['timestamp'])

    # make a useful dataframe from tags and movies
    mixed = pd.merge(movies, tags, on='movieId', how='left')

    # create metadata from all tags and genres
    mixed.fillna("", inplace=True)
    mixed = pd.DataFrame(mixed.groupby('movieId')['tag'].apply(' '.join))
    Final = pd.merge(movies, mixed, on='movieId', how='left')
    Final['metadata'] = Final['tag'] + ' ' + Final['genres']

    # Text transformation and truncated SVD to create a content latent
    # matrix.  TruncatedSVD accepts the sparse TF-IDF matrix directly, so
    # it is not expanded into a dense frame first.
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(Final['metadata'])
    svd = TruncatedSVD(n_components=200)
    latent_matrix_1 = svd.fit_transform(tfidf_matrix)
    latent_matrix_1_df = pd.DataFrame(latent_matrix_1,
                                      index=Final.title.tolist())

    # Movie x user rating matrix and truncated SVD to create a
    # collaborative latent matrix.
    ratings_f1 = pd.merge(movies['movieId'],
                          ratings_f,
                          on="movieId",
                          how="right")
    ratings_f2 = ratings_f1.pivot(index='movieId',
                                  columns='userId',
                                  values='rating').fillna(0)
    svd = TruncatedSVD(n_components=200)
    latent_matrix_2 = svd.fit_transform(ratings_f2)
    # Label each row with the title of the movie id it actually holds
    # rather than relying on ``movies`` being sorted like the pivot index.
    title_by_id = movies.set_index('movieId')['title']
    latent_matrix_2_df = pd.DataFrame(
        latent_matrix_2,
        index=title_by_id.reindex(ratings_f2.index).tolist())

    # now a user collaborative model using Surprise
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(ratings_f1[['userId', 'movieId', 'rating']],
                                reader)
    trainset, testset = train_test_split(data, test_size=.25)
    algorithm = SVD()

    # Train the algorithm on the trainset, and report RMSE on the testset
    algorithm.fit(trainset)
    accuracy.rmse(algorithm.test(testset))

    # pickle all necessary files in ./Files/
    ratings_f.to_pickle('./Files/rating.pkl')
    latent_matrix_1_df.to_pickle('./Files/latent_content.pkl')
    latent_matrix_2_df.to_pickle('./Files/latent_collaborative.pkl')
    with open('./Files/map.pkl', 'wb') as f:
        pickle.dump(Mapping_file, f, pickle.HIGHEST_PROTOCOL)
    with open('./Files/model_svd.pkl', 'wb') as f:
        pickle.dump(algorithm, f, pickle.HIGHEST_PROTOCOL)
    return
print(s)

# Load data from a tab-separated file: user, item, rating, timestamp.
file_path = os.path.expanduser('restaurant_ratings.txt')
reader = Reader(line_format='user item rating timestamp',
                sep='\t',
                skip_lines=0)
data = Dataset.load_from_file(file_path, reader=reader)

# ``Dataset.split``/``Dataset.folds``, ``evaluate`` and ``print_perf`` were
# removed from Surprise; ``cross_validate`` with an explicit KFold is the
# supported replacement.
from surprise.model_selection import KFold, cross_validate

# PMF algorithm (SVD without bias terms)
algo = SVD(biased=False)

# Evaluate on 3 unshuffled folds.  ``verbose=True`` prints the per-fold
# table that ``print_perf`` used to produce.
cv_results = cross_validate(algo,
                            data,
                            measures=['RMSE', 'MAE'],
                            cv=KFold(n_splits=3, shuffle=False),
                            verbose=True)

# Keep ``perf`` available under the measure names the old API exposed, so
# code that reads ``perf['rmse']`` / ``perf['mae']`` keeps working.
perf = {
    'rmse': list(cv_results['test_rmse']),
    'mae': list(cv_results['test_mae']),
}

# Visualization
trainset = rating_train2.build_full_trainset()
testset = rating_test2.build_full_trainset().build_testset()

# SVD model: candidate values for a manual grid search.
n_factors = [100]  # where default = 100
n_epochs = [5]  # where default = 20
lr_all = [0.05, 0.005]  # where default = 0.005
reg_all = [0.2, 0.02]  # where default = 0.02

# Train and score one model per parameter combination, timing each run.
count = 1
for i in n_factors:
    for j in n_epochs:
        for k in lr_all:
            for m in reg_all:
                start = dt.datetime.today()
                print("================================================")
                algo = SVD(n_factors=i, n_epochs=j, lr_all=k, reg_all=m)

                # ``train()`` was deprecated and later removed from
                # Surprise; ``fit()`` is the supported equivalent.
                algo.fit(trainset)
                print("This is the #" + str(count) + " parameter combination")

                predictions = algo.test(testset)

                print("n_factors=" + str(i) + ", n_epochs=" + str(j) +
                      ", lr_all=" + str(k) + ", reg_all=" + str(m))

                accuracy.rmse(predictions, verbose=True)
                accuracy.fcp(predictions, verbose=True)
                accuracy.mae(predictions, verbose=True)
                count = count + 1
                end = dt.datetime.today()
                print("Runtime: " + str(end - start))
def build_model(self, data):
    """Create a default SVD model and cross-validate it on ``data``.

    Runs 5-fold cross-validation measuring RMSE and MAE; the scores are
    discarded and only the model object is returned.

    NOTE(review): the model is never fitted on the complete dataset here,
    only by whatever fitting ``cross_validate`` performs per fold --
    confirm callers do not expect a model trained on all of ``data``.
    """
    model = SVD()
    cross_validate(model, data, cv=5, measures=['RMSE', 'MAE'])
    return model
model_user_results = cross_validate(model_user, data, measures=['RMSE'], cv=5, verbose=True) print('\n\nModel training successful!') # Create model object model_item = KNNBasic(sim_options={'user_based': False}) print('Model creation successful!') # Train on data using cross-validation with k=5 folds, measuring the RMSE # Note, this may have a lot of print output # You can set verbose=False to prevent this from happening model_item_results = cross_validate(model_item, data, measures=['RMSE'], cv=5, verbose=True) print('\n\nModel training successful!') # Create model object model_matrix = SVD() print('Model creation successful!') # Train on data using cross-validation with k=5 folds, measuring the RMSE # Note, this may take some time (2-3 minutes) to train, so please be patient model_matrix_results = cross_validate(model_matrix, data, measures=['RMSE'], cv=5, verbose=True) print('\n\nModel training successful!') def precision_recall_at_k(predictions, k=10, threshold=3.5): '''Return precision and recall at k metrics for each user.''' # First map the predictions to each user. user_est_true = dict() for uid, _, true_r, est, _ in predictions: current = user_est_true.get(uid, list())