def step5_surprise2():
    # Built-in MovieLens 100k data set (user, item, rating).
    data = surprise.Dataset.load_builtin('ml-100k')  # user item rate
    # print(data)
    # print(data.raw_ratings)  # raw_ratings exposes every column.
    # df = pd.DataFrame(data.raw_ratings, columns=['user', 'item', 'rate', 'id'])

    # Similarity options (only option3 is used below).
    option1 = {'name': 'msd'}
    option2 = {'name': 'cosine'}
    option3 = {'name': 'pearson'}

    # Object that builds the recommendation list.
    # Don't blur the essentials; being vague is the worst.
    # Build the training data.
    trainset = data.build_full_trainset()
    algo = surprise.KNNBasic(sim_options=option3)
    print('training started')
    algo.fit(trainset)

    # Fetch the neighbour list.
    # get_neighbors() expects an *inner* id, so convert the raw user id '196' first.
    inner_uid = trainset.to_inner_uid('196')
    result = algo.get_neighbors(inner_uid, k=3)  # nearest-neighbour users (inner ids)
    print('result type:', type(result))
    for r1 in result:
        print(r1)
def step5_surprise2():
    # Movie data set to use.
    data = surprise.Dataset.load_builtin('ml-100k')
    # print(data)
    # print(data.raw_ratings)
    # df = pd.DataFrame(data.raw_ratings, columns=['user', 'item', 'rate', 'id'])
    # del df['id']
    # print(df)

    # Similarity measure to use.
    # option1 = {'name': 'msd'}
    # option2 = {'name': 'cosine'}
    option3 = {'name': 'pearson'}

    # Object that builds the recommendation list.
    algo = surprise.KNNBasic(sim_options=option3)

    # Train on the full data set.
    trainset = data.build_full_trainset()
    algo.fit(trainset)

    # Fetch the neighbours.
    # k = number of neighbours to return (users here, since KNNBasic is user-based by default).
    # get_neighbors() takes an inner id, so convert the raw id '196' first.
    result = algo.get_neighbors(trainset.to_inner_uid('196'), k=5)
    for r1 in result:
        print(r1)
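# The two snippets above print inner ids. A minimal sketch (assuming the same
# ml-100k trainset/algo as above) of mapping neighbours back to raw user ids
# with Surprise's Trainset conversion helpers; the function name is illustrative:
def print_raw_neighbors(algo, trainset, raw_uid='196', k=5):
    inner_uid = trainset.to_inner_uid(raw_uid)        # raw id -> inner id
    for inner_nbr in algo.get_neighbors(inner_uid, k=k):
        print(trainset.to_raw_uid(inner_nbr))         # inner id -> raw id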
def learn(id):
    print(id)
    dataset = getData()

    # Build the data set.
    df = pd.DataFrame(dataset)

    # Reader turns the raw data into the format Surprise expects;
    # rating_scale=(min, max) is the rating range.
    reader = sp.Reader(rating_scale=(0.0, 5))

    # Column names inside the dictionary. When the data set is built, the first
    # name is treated as the user id, the second as the item id and the third
    # as the rating.
    col_list = ['user_id', 'wine_id', 'points']
    data = sp.Dataset.load_from_df(df[col_list], reader)

    # Model to train.
    model = sp.KNNBasic(sim_options={'name': 'pearson'})

    # Train on the full data set.
    trainset = data.build_full_trainset()
    model.fit(trainset)

    # Note: get_neighbors() expects an inner user id, not a raw one.
    result = model.get_neighbors(id, k=5)
    print(result)

    rec_list = list()
    for r in result:
        rec_list.append(str(dataset['wine_id'][r]))
        print(dataset['wine_id'][r])

    winelist = ','.join(rec_list)
    return winelist
def step5_surprise():
    name_list, movie_list, rating_dic = data_to_dic()
    # print(rating_dic)

    # Build the data set.
    df = pd.DataFrame(rating_dic)
    # rating_scale: the range of the ratings in the data.
    reader = surprise.Reader(rating_scale=(0.0, 5.0))
    # Names of the lists held in the dictionary.
    col_list = ['user_id', 'item_id', 'rating']
    data = surprise.Dataset.load_from_df(df[col_list], reader)
    trainset = data.build_full_trainset()

    # Similarity measure to use.
    # option1 = {'name': 'msd'}
    # option2 = {'name': 'cosine'}
    option3 = {'name': 'pearson'}
    # Object that builds the recommendation list.
    algo = surprise.KNNBasic(sim_options=option3)
    algo.fit(trainset)

    # Recommend movies for the user '소이현'.
    index = name_list.index('소이현')
    result = algo.get_neighbors(index, k=3)
    for r1 in result:
        # r1 starts at 1, so subtract 1.
        print(movie_list[r1 - 1])
def surprise_basicKNN(trainset, finalset):
    """Basic K Nearest Neighbours model."""
    algo = spr.KNNBasic()
    algo.fit(trainset)
    predictions_final = algo.test(finalset)
    return spr_estimate_to_vect(predictions_final)
def train(where, k):
    # df_to_dict = recur_dictify(pd.read_pickle('../../../data/over_10review_stores.pkl'))
    # store_list = []   # list of stores
    # user_set = set()  # set of users
    #
    # # Loop over every store.
    # for store_key in df_to_dict:
    #     store_list.append(store_key)
    #
    #     for user_key in df_to_dict[store_key]:
    #         user_set.add(user_key)
    #
    # user_list = list(user_set)

    df = pd.read_pickle("../../data/dic_to_train_stores.pkl")
    reader = surprise.Reader(rating_scale=(1, 5))
    col_list = ['store_id', 'user_id', 'score']
    data = surprise.Dataset.load_from_df(df[col_list], reader)

    # Train
    trainset = data.build_full_trainset()
    option = {'name': 'pearson'}
    algo = surprise.KNNBasic(sim_options=option)
    algo.fit(trainset)

    # Recommend restaurants similar to the given one.
    # where = input('store id : ')
    print("\n")
    user_list = pd.read_pickle("../../data/Item_based_user_list.pkl")[0].tolist()
    store_list = pd.read_pickle("../../data/Item_based_store_list.pkl")[0].tolist()
    # user_list = dff.user.unique().tolist()
    # store_list = dff.store.unique().tolist()

    index = store_list.index(int(where))
    print('store_idx : ', index)
    print("\n")

    result = algo.get_neighbors(index, k=k)  # e.g. k=10
    print(where, "- similar restaurants:")
    print(result)
    print("\n")

    # Recommend users based on those restaurants.
    print(where, "- friends recommended to someone who rated this place:", "\n")
    recommend_user_list = []
    for r1 in result:
        max_rating = data.df[data.df["store_id"] == r1]["score"].max()
        user_id = data.df[(data.df["score"] == max_rating)
                          & (data.df["store_id"] == r1)]["user_id"].values
        for user in user_id:
            recommend_user_list.append(user_list[user])
            # print(user_list[user])
    return recommend_user_list
def train(dataframe, k):
    # df_to_dict = recur_dictify(pd.read_pickle('../data/over_10review_peoples.pkl'))
    # name_list = []    # list of users
    # store_set = set() # set of restaurants
    #
    # # Loop over every user.
    # for user_key in df_to_dict:
    #     name_list.append(user_key)
    #
    #     for sto_key in df_to_dict[user_key]:
    #         store_set.add(sto_key)
    #
    # store_list = list(store_set)

    df = dataframe
    reader = surprise.Reader(rating_scale=(1, 5))
    col_list = ['user_id', 'store_id', 'score']
    data = surprise.Dataset.load_from_df(df[col_list], reader)

    # Train
    trainset = data.build_full_trainset()
    option = {'name': 'pearson'}
    algo = surprise.KNNBasic(sim_options=option)
    algo.fit(trainset)

    user_id = input('user id: ')

    # Recommend restaurants for the user.
    who = user_id
    print("\n")
    name_list = pd.read_pickle("../data/user_based_name_list.pkl")[0].tolist()
    store_list = pd.read_pickle("../data/user_based_store_list.pkl")[0].tolist()
    # name_list = dff.user.unique().tolist()
    # store_list = dff.store.unique().tolist()

    index = name_list.index(int(who))
    print('user_idx : ', index)
    print("\n")

    result = algo.get_neighbors(index, k=k)  # e.g. k=5
    print(who, "- similar users:")
    print(result)
    print("\n")

    # Recommend restaurants based on those neighbouring users.
    print(who, "- recommended restaurants:", "\n")
    for r1 in result:
        max_rating = data.df[data.df["user_id"] == r1]["score"].max()
        sto_id = data.df[(data.df["score"] == max_rating)
                         & (data.df["user_id"] == r1)]["store_id"].values
        for sto in sto_id:
            print(store_list[sto])
def trainModels(self):
    # When importing from a DataFrame, the rating scale must be specified
    # in order to get the best performance.
    reader = surprise.Reader(rating_scale=(self.scale_low, self.scale_high))
    data = surprise.Dataset.load_from_df(self.rawdata, reader)
    trainset = data.build_full_trainset()
    testset = trainset.build_anti_testset()
    self.rmse = []
    self.predictions = []

    print("=== Training with Collaborative KNN ===")
    sim_options = {
        'name': 'cosine',
        'user_based': False  # compute similarities between items
    }
    self.cKNN = surprise.KNNBasic(k=40, sim_options=sim_options)
    self.cKNN.fit(trainset)
    self.predictions.append(self.cKNN.test(testset))
    self.rmse.append(surprise.accuracy.rmse(self.predictions[0], verbose=True))
    minR = self.rmse[0]
    self.algoIndex = 0

    print("=== Matrix Factorization ===")
    self.SVD = surprise.prediction_algorithms.matrix_factorization.SVD(
        n_factors=30, n_epochs=10, biased=True)
    self.SVD.fit(trainset)
    self.predictions.append(self.SVD.test(testset))
    self.rmse.append(surprise.accuracy.rmse(self.predictions[1], verbose=True))
    if minR > self.rmse[1]:
        self.algoIndex = 1
        minR = self.rmse[1]

    print("=== Co-clustering ===")
    self.Co = surprise.prediction_algorithms.co_clustering.CoClustering(
        n_cltr_u=4, n_cltr_i=4, n_epochs=25)
    self.Co.fit(trainset)
    self.predictions.append(self.Co.test(testset))
    self.rmse.append(surprise.accuracy.rmse(self.predictions[2], verbose=True))
    if minR > self.rmse[2]:
        self.algoIndex = 2
        minR = self.rmse[2]

    print("=== Slope One Collaborative Filtering ===")
    self.slope = surprise.prediction_algorithms.slope_one.SlopeOne()
    self.slope.fit(trainset)
    self.predictions.append(self.slope.test(testset))
    self.rmse.append(surprise.accuracy.rmse(self.predictions[3], verbose=True))
    if minR > self.rmse[3]:
        self.algoIndex = 3
def train(who, k):
    df = pd.read_pickle("../data/dic_to_train.pkl")
    reader = surprise.Reader(rating_scale=(1, 5))
    col_list = ['user_id', 'store_id', 'score']
    data = surprise.Dataset.load_from_df(df[col_list], reader)

    # benchmark = []
    # from surprise import SVD, SVDpp, SlopeOne, NMF, NormalPredictor, KNNBasic, KNNBaseline, KNNWithMeans, KNNWithZScore, BaselineOnly, CoClustering
    # # Use Surprise's cross_validate, not scikit-learn's.
    # from surprise.model_selection import cross_validate
    # benchmark = []
    # # Iterate over all the algorithms.
    # for algorithm in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), KNNBaseline(), KNNBasic(), KNNWithMeans(), KNNWithZScore(), BaselineOnly(), CoClustering()]:
    #     # Cross-validation step.
    #     results = cross_validate(algorithm, data, measures=['RMSE'], cv=3, verbose=False)
    #     # Store the result and append the algorithm name.
    #     tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    #     tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    #     benchmark.append(tmp)
    # print(pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse'))

    # Train
    trainset = data.build_full_trainset()
    option = {'name': 'pearson'}
    algo = surprise.KNNBasic(sim_options=option)
    algo.fit(trainset)

    name_list = pd.read_pickle("../data/user_based_name_list.pkl")[0].tolist()
    store_list = pd.read_pickle("../data/user_based_store_list.pkl")[0].tolist()
    # name_list = dff.user.unique().tolist()
    # store_list = dff.store.unique().tolist()

    index = name_list.index(int(who))
    neighbors = algo.get_neighbors(index, k=k)  # e.g. k=5

    # Recommend stores to the user.
    recommend_store_list = []
    for i in neighbors:
        max_rating = data.df[data.df["user_id"] == i]["score"].max()
        store = data.df[(data.df["score"] == max_rating)
                        & (data.df["user_id"] == i)]["store_id"].values
        for idx in store:
            recommend_store_list.append(store_list[idx])
    return recommend_store_list
def basicKNN(train, test):
    """
    Run the basic KNN model from the Surprise library.

    @param train: the training set in the Surprise format.
    @param test: the test set in the Surprise format.
    @return: the predictions in a numpy array.
    """
    algo = spr.KNNBasic()
    algo.fit(train)
    predictions = algo.test(test)
    return get_predictions(predictions)
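# get_predictions() is defined elsewhere in the original project; a minimal
# sketch of what such a helper might look like, assuming it only extracts the
# estimated ratings from Surprise's Prediction objects:
import numpy as np

def get_predictions(predictions):
    # Each Prediction is a namedtuple (uid, iid, r_ui, est, details); keep .est.
    return np.array([pred.est for pred in predictions])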
def main(args):
    user_item_based = 'item_based' if args.item_based else 'user_based'
    filename = '_'.join([
        args.exp_name, args.algorithm, args.sim_name, user_item_based,
        str(args.num_rows)
    ]) + '.pkl'
    output_file = Path(filename)
    if output_file.exists():
        print(f'ERROR! Output file {output_file} already exists. Exiting!')
        sys.exit(1)
    print(f'Saving scores in {output_file}\n')

    reader = surprise.Reader(rating_scale=(1, 5))
    df = pq.read_table('all_ratings_with_indices.parquet',
                       columns=['user_idx', 'movie_idx', 'rating']).to_pandas()
    df.user_idx = df.user_idx.astype(np.uint32)
    df.movie_idx = df.movie_idx.astype(np.uint16)
    df.rating = df.rating.astype(np.uint8)
    print(df.dtypes)

    data = surprise.Dataset.load_from_df(df[:args.num_rows], reader=reader)
    del df

    sim_options = {
        'name': args.sim_name,
        'user_based': False if args.item_based else True
    }
    if args.algorithm == 'knn':
        algo = surprise.KNNBasic(sim_options=sim_options)
    elif args.algorithm == 'baseline':
        algo = surprise.BaselineOnly()
    elif args.algorithm == 'normal':
        algo = surprise.NormalPredictor()
    elif args.algorithm == 'knn_zscore':
        algo = surprise.KNNWithZScore(sim_options=sim_options)
    elif args.algorithm == 'svd':
        algo = surprise.SVD()
    elif args.algorithm == 'nmf':
        algo = surprise.NMF()
    else:
        # Exit instead of falling through with `algo` undefined.
        print(f'Algorithm {args.algorithm} is not a valid choice.')
        sys.exit(1)

    scores = surprise.model_selection.cross_validate(algo,
                                                     data,
                                                     cv=args.cv_folds,
                                                     verbose=True,
                                                     n_jobs=-1)
    pickle.dump(scores, open(output_file, 'wb'))
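# main() above assumes an argparse namespace with the attributes it reads; a
# minimal sketch of a matching parser (the defaults are assumptions):
import argparse

def parse_args():
    p = argparse.ArgumentParser(description='Cross-validate a Surprise algorithm.')
    p.add_argument('--exp_name', default='exp')
    p.add_argument('--algorithm', default='knn',
                   choices=['knn', 'baseline', 'normal', 'knn_zscore', 'svd', 'nmf'])
    p.add_argument('--sim_name', default='pearson')
    p.add_argument('--item_based', action='store_true')
    p.add_argument('--num_rows', type=int, default=100000)
    p.add_argument('--cv_folds', type=int, default=5)
    return p.parse_args()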
def algo_tester(data_object):
    '''
    Produces a dataframe displaying the RMSE and the test & train times of the
    different Surprise algorithms.

    ---Parameters---
    data_object (variable) created from the read_data_surprise function

    ---Returns---
    a dataframe where you can compare the performance of the different algorithms
    '''
    benchmark = []
    algos = [
        sp.SVDpp(), sp.SVD(), sp.SlopeOne(), sp.NMF(), sp.NormalPredictor(),
        sp.KNNBaseline(), sp.KNNBasic(), sp.KNNWithMeans(), sp.KNNWithZScore(),
        sp.BaselineOnly(), sp.CoClustering()
    ]

    # Iterate over all algorithms
    for algorithm in algos:
        # Perform cross validation
        results = cross_validate(algorithm, data_object, measures=['RMSE'], cv=3, verbose=False)

        # Get results & append algorithm name
        # (pd.concat replaces Series.append, which was removed in pandas 2.0)
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = pd.concat([tmp, pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                                        index=['Algorithm'])])
        benchmark.append(tmp)

    benchmark = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse')
    return benchmark
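# A minimal usage sketch for algo_tester(), with the built-in ml-100k data set
# standing in for whatever read_data_surprise() returns in the original code:
import surprise as sp

data = sp.Dataset.load_builtin('ml-100k')
benchmark_df = algo_tester(data)
print(benchmark_df)  # one row per algorithm, sorted by test RMSE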
def run(self):
    df = pd.read_csv(self.input_csv, delimiter=";")
    lower_rating = df['score'].min()
    upper_rating = df['score'].max()
    print('Score range: {0} to {1}'.format(lower_rating, upper_rating))

    reader = surprise.Reader(rating_scale=(df['score'].min(), df['score'].max()))
    data = surprise.Dataset.load_from_df(df, reader)
    trainset = data.build_full_trainset()

    # Rule of thumb used here: k is roughly sqrt(number of users) + 1.
    chosen_k = math.ceil(math.sqrt(len(df['userId'].unique())) + 1)
    algo = surprise.KNNBasic(k=chosen_k,
                             sim_options={
                                 'name': 'pearson_baseline',
                                 'user_based': True
                             })
    algo.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    del df
    del reader

    top_n = self.get_top_n(predictions, n=10)
    df_top_rated = pd.DataFrame(columns=['userId', 'itemId', 'est'])
    for uid, user_ratings in top_n.items():
        for iid, est in user_ratings:
            df_top_rated.loc[len(df_top_rated)] = [uid, iid, est]

    df_top_rated.to_csv(self.output_csv, sep=';', encoding='utf-8', index=False)
    return df_top_rated
    # validate = surprise.model_selection.cross_validate(alg, data, verbose=True)
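# self.get_top_n() is defined elsewhere in the original class; a minimal sketch
# of the usual "top-N from predictions" helper (adapted from the Surprise FAQ),
# assuming it groups predictions per user and keeps the n highest estimates:
from collections import defaultdict

def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n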
def __init__(self):
    warnings.filterwarnings('ignore')
    with open('./LogDic.pickle', 'rb') as f:
        df_to_dict = pickle.load(f)
    print("----------Let's dictionary----------", df_to_dict)

    cos_set = set()
    self.name_list = []
    for user_key in df_to_dict:
        self.name_list.append(user_key)
        for cos_key in df_to_dict[user_key]:
            cos_set.add(cos_key)
    self.cos_list = list(cos_set)

    rating_dic = {'Nickname': [], 'ProductIdx': [], 'rating': []}
    for name_key in df_to_dict:
        for cos_key in df_to_dict[name_key]:
            a1 = self.name_list.index(name_key)
            a2 = self.cos_list.index(cos_key)
            a3 = df_to_dict[name_key][cos_key]
            rating_dic['Nickname'].append(a1)
            rating_dic['ProductIdx'].append(a2)
            rating_dic['rating'].append(a3)

    df = pd.DataFrame(rating_dic)
    reader = surprise.Reader(rating_scale=(1, 5))
    col_list = ['Nickname', 'ProductIdx', 'rating']
    self.data = surprise.Dataset.load_from_df(df[col_list], reader)

    # print("----------Let's training----------")
    trainset = self.data.build_full_trainset()
    option = {'name': 'pearson'}
    self.algo = surprise.KNNBasic(sim_options=option)
    self.algo.fit(trainset)
def model_fit(self):
    '''
    Train the recommender using the selected Surprise algorithm.
    '''
    self.build_trainset()
    algo = self._algo_choise
    if algo == 'SVD':
        self.algorithm = surprise.SVD()
    elif algo == 'Baseline':
        self.algorithm = surprise.BaselineOnly()
    elif algo == 'SlopeOne':
        self.algorithm = surprise.SlopeOne()
    elif algo == 'CoClustering':
        self.algorithm = surprise.CoClustering()
    else:
        self.algorithm = surprise.KNNBasic()

    print('Training Recommender System using %s...' % algo)
    self.algorithm.fit(self.trainset)
    self.ratings_changed = False
    print('Done')
def step5_surprise():
    name_list, movie_list, rating_dic = data_to_dic()
    print(rating_dic)

    # Build the data set.
    df = pd.DataFrame(rating_dic)
    # rating_scale: the range of the ratings in the data.
    reader = surprise.Reader(rating_scale=(0.0, 5.0))  # ratings range from 0 to 5
    # print(rating_dic.keys())
    col_list = [key for key in rating_dic.keys()]
    data = surprise.Dataset.load_from_df(df[col_list], reader)
    print(data)

    trainset = data.build_full_trainset()
    option1 = {'name': 'pearson'}
    algo = surprise.KNNBasic(sim_options=option1)
    algo.fit(trainset)

    # Recommend movies for the user '소이현'.
    index = name_list.index('소이현')
    result = algo.get_neighbors(index, k=3)  # the iid argument is the target user
    for r1 in result:
        print(movie_list[r1 - 1])  # r1 apparently starts from 1, hence the -1
def step5_surprise():
    # Fetch the data.
    name_list, movie_list, rating_dic = data_to_dic()
    # print(rating_dic)

    # Build the data set.
    df = pd.DataFrame(rating_dic)
    # rating_scale: the range of the ratings in the data.
    reader = surprise.Reader(rating_scale=(0.0, 5.0))
    # Names of the lists held in the dictionary.
    col_list = ['user_id', 'item_id', 'rating']
    data = surprise.Dataset.load_from_df(df[col_list], reader)

    # Train.
    trainset = data.build_full_trainset()
    option1 = {'name': 'pearson'}
    algo = surprise.KNNBasic(sim_options=option1)
    algo.fit(trainset)

    # Recommend movies for the user '소이현'.
    index = name_list.index('소이현')
    result = algo.get_neighbors(index, k=3)
    for r1 in result:
        print(movie_list[r1 - 1])
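# data_to_dic() is defined elsewhere; judging from the three step5_surprise()
# variants above, it returns a user list, a movie list and an index-encoded
# rating dictionary. A minimal sketch of that assumed shape (values illustrative):
def data_to_dic():
    name_list = ['소이현', 'userB', 'userC']      # raw user names
    movie_list = ['movie1', 'movie2', 'movie3']   # raw movie titles
    rating_dic = {
        'user_id': [0, 0, 1, 2],                  # indices into name_list
        'item_id': [0, 1, 1, 2],                  # indices into movie_list
        'rating': [4.0, 3.5, 5.0, 2.0],
    }
    return name_list, movie_list, rating_dic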
    'user_based': False  # compute similarities between items
}

mean_ap = []
precision = []
recall = []
fscore = []
normalized_DCG = []
mean_ap_train = []
precision_train = []
recall_train = []
fscore_train = []
normalized_DCG_train = []

# Surprise >= 1.1 needs an explicit KFold split and algo.fit();
# data.folds() and algo.train() were removed. n_splits=5 is an assumption,
# since the fold count was set elsewhere in the original script.
from surprise.model_selection import KFold

for k_val in ks:
    print(k_val)
    algo = surprise.KNNBasic(k=k_val, sim_options=sim_options)
    pr = 0
    re = 0
    fs = 0
    ap = 0
    nd = 0
    pr_train = 0
    re_train = 0
    fs_train = 0
    ap_train = 0
    nd_train = 0
    for trainset, testset in KFold(n_splits=5).split(data):
        algo.fit(trainset)
        predictions_on_test = algo.test(testset)
        precisions_test, recalls_test = precision_recall_at_k(
import surprise as sp
from surprise import Dataset
from surprise.model_selection import cross_validate
import NetflixDataLoad

# Use only 100000 rows for fast processing.
# load_from_df() also needs a Reader giving the rating scale (1-5 here).
reader = sp.Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    NetflixDataLoad.df_filterd[['Cust_Id', 'Movie_Id', 'Rating']][:100000], reader)

n_folds = 5
for algo in [sp.SVD(), sp.SVDpp(), sp.KNNBasic(), sp.KNNWithMeans()]:
    print(
        cross_validate(algo,
                       data,
                       measures=['RMSE', 'MAE'],
                       cv=n_folds,
                       verbose=True))

# Output Example
# Evaluating RMSE, MAE of algorithm SVD on 5 split(s).
#
#            Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std
# RMSE       0.9311  0.9370  0.9320  0.9317  0.9391  0.9342  0.0032
# MAE        0.7350  0.7375  0.7341  0.7342  0.7375  0.7357  0.0015
# Fit time   6.53    7.11    7.23    7.15    3.99    6.40    1.23
# Test time  0.26    0.26    0.25    0.15    0.13    0.21    0.06
    return results


np.random.seed(0)

file_path = 'data/user_artists_log.dat'
reader = Reader(line_format='user item rating', sep='\t')
data = Dataset.load_from_file(file_path, reader=reader)
trainset = data.build_full_trainset()
testset = trainset.build_anti_testset()

# 2 - User-based Recommendation
uid_list = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11]

# TODO - 2-1-1. KNNBasic, cosine
sim_options = {'name': 'cosine'}
algo = surprise.KNNBasic(sim_options=sim_options)
algo.fit(trainset)
results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
with open('2-1-1_results.txt', 'w') as f:
    for uid, ratings in sorted(results.items(), key=lambda x: int(x[0])):
        f.write('User ID %s top-10 results\n' % uid)
        for iid, score in ratings:
            f.write('Item ID %s\tscore %s\n' % (iid, str(score)))
        f.write('\n')

# TODO - 2-1-2. KNNWithMeans, pearson
sim_options2 = {'name': 'pearson'}
algo = surprise.KNNWithMeans(sim_options=sim_options2)
algo.fit(trainset)
results = get_top_n(algo, testset, uid_list, n=10, user_based=True)
with open('2-1-2_results.txt', 'w') as f:
def KNN_train(BASE_DIR):
    print('KNN training started: ', str(datetime.now())[10:19])
    conn = pymysql.connect(host=config('HOST'),
                           port=3306,
                           user=config('USER'),
                           password=config('PASSWORD'),
                           db=config('DB'))
    sql = 'SELECT * FROM wouldyouci.accounts_rating'
    data = pd.read_sql_query(sql, conn)
    conn.close()

    df = data[['user_id', 'movie_id', 'score']]

    # Keep only movies with at least n1 reviews.
    n1 = 5
    filter_movies = df['movie_id'].value_counts() >= n1
    filter_movies = filter_movies[filter_movies].index.tolist()

    # Keep only users with at least n2 ratings.
    n2 = 5
    filter_users = df['user_id'].value_counts() >= n2
    filter_users = filter_users[filter_users].index.tolist()

    df_new = df[df['movie_id'].isin(filter_movies)
                & df['user_id'].isin(filter_users)]
    df_to_dict = recur_dictify(df_new)

    user_list = []
    movie_set = set()

    # Loop over every user.
    for user in df_to_dict:
        user_list.append(user)
        # Collect the movies this user has seen.
        for movie in df_to_dict[user]:
            movie_set.add(movie)
    movie_list = list(movie_set)

    # Prepare the data to train on.
    rating_dic = {'user_id': [], 'movie_id': [], 'score': []}

    # For every user ...
    for user in df_to_dict:
        # ... and every movie that user has seen ...
        for movie in df_to_dict[user]:
            # user index
            u_index = user_list.index(user)
            # movie index
            m_index = movie_list.index(movie)
            # rating
            score = df_to_dict[user][movie]
            # store them in the dictionary
            rating_dic['user_id'].append(u_index)
            rating_dic['movie_id'].append(m_index)
            rating_dic['score'].append(score)

    # Build the data set (note: the Surprise data set below is built from
    # df_new, i.e. from the raw ids, so this index-encoded frame is unused).
    df = pd.DataFrame(rating_dic)

    # Train.
    reader = surprise.Reader(rating_scale=(0.5, 5.0))
    # Column names needed to build the Surprise data set.
    col_list = ['user_id', 'movie_id', 'score']
    data = surprise.Dataset.load_from_df(df_new[col_list], reader)

    trainset = data.build_full_trainset()

    # Use Pearson similarity.
    option = {'name': 'pearson'}
    algo = surprise.KNNBasic(sim_options=option)
    algo.fit(trainset)

    recommand_dic = {
        'user_id': [],
        'movie_id': [],
    }

    for user_key in df_new['user_id'].unique():
        # get_neighbors() works with inner ids, so convert the raw id first
        # and convert the neighbours back to raw ids afterwards.
        inner_uid = trainset.to_inner_uid(user_key)
        result = algo.get_neighbors(inner_uid, k=5)

        recom_set = set()
        for i in result:
            neighbor_uid = trainset.to_raw_uid(i)
            max_rating = data.df[data.df['user_id'] == neighbor_uid]['score'].max()
            recom_movies = data.df[(data.df['score'] == max_rating) & (
                data.df['user_id'] == neighbor_uid)]['movie_id'].values
            for item in recom_movies:
                recom_set.add(item)

        for item in recom_set:
            recommand_dic['user_id'].append(user_key)
            recommand_dic['movie_id'].append(item)

    knn_df = pd.DataFrame(recommand_dic)
    path = os.path.join(BASE_DIR, 'KNN.p')
    pd.to_pickle(knn_df, path)
    print('finished: ', str(datetime.now())[10:19])
y = []
y_plot = []
counter = 0
x_plot = []
time_plot = []
mem_plot = []

for i in range(1, 10):
    reader = surprise.Reader(name=None,
                             line_format='user item rating',
                             sep=',',
                             skip_lines=1)
    data = surprise.Dataset.load_from_file(
        '/Users/keyadesai/Desktop/Recommendation Engine/ratings_woheader.csv',
        reader=reader)
    # data.split(5)

    algo = surprise.KNNBasic()  # note: unused by the grid search below, which tunes SVD

    param_grid = {'n_epochs': [i], 'lr_all': [0.005], 'reg_all': [0.02]}
    gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
    gs.fit(data)
    results_df = pd.DataFrame.from_dict(gs.cv_results)

    y.append(results_df.mean_test_rmse)
    y_plot.append(y[counter][0])
    counter += 1  # advance so each iteration reads its own result
    x_plot.append(i)
    print(y_plot)

plt.plot(x_plot, y_plot, 'ro')
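# The loop above tunes SVD even though a KNNBasic instance is created; a
# minimal sketch of what a grid search over KNNBasic's own parameters could
# look like instead (the parameter values are illustrative assumptions):
from surprise import KNNBasic
from surprise.model_selection import GridSearchCV

param_grid = {
    'k': [20, 40, 60],
    'sim_options': {'name': ['msd', 'cosine', 'pearson'],
                    'user_based': [True, False]},
}
gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)
print(gs.best_score['rmse'], gs.best_params['rmse'])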
          epochs=2,
          validation_split=0.1,
          shuffle=True)

y_pred = model.predict([df_hybrid_test['User'], df_hybrid_test['Movie'], test_tfidf])
y_true = df_hybrid_test['Rating'].values

rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Hybrid Deep Learning: {:.4f} RMSE'.format(rmse))

# Load dataset into the Surprise-specific data structure
data = sp.Dataset.load_from_df(df_filterd[['User', 'Movie', 'Rating']].sample(20000), sp.Reader())

benchmark = []
# Iterate over all algorithms
for algorithm in [sp.SVD(), sp.SVDpp(), sp.SlopeOne(), sp.NMF(), sp.NormalPredictor(),
                  sp.KNNBaseline(), sp.KNNBasic(), sp.KNNWithMeans(), sp.KNNWithZScore(),
                  sp.BaselineOnly(), sp.CoClustering()]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=3, verbose=False)

    # Get results & append algorithm name
    # (pd.concat replaces Series.append, which was removed in pandas 2.0)
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = pd.concat([tmp, pd.Series([str(algorithm).split(' ')[0].split('.')[-1]],
                                    index=['Algorithm'])])

    # Store data
    benchmark.append(tmp)

# Store results
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
    'test_rmse', ascending=False)

# Get data
data = surprise_results[['test_rmse', 'test_mae']]
def get_queryset(self):
    # user_reviews = pd.DataFrame(list(models.Review.objects.all().values()))
    # df = user_reviews[['userid','storeid','bornyear']].head(100)
    # userid = self.request.query_params.get("user", "")
    # if userid is not None:
    user_reviews = pd.DataFrame(list(models.Review.objects.all().values()))
    df = user_reviews[['userid', 'storeid', 'bornyear']].head(1000)
    df_to_dict = recur_dictify(df)
    print(df_to_dict)

    name_list = []   # user ids (no duplicates)
    cos_set = set()  # restaurant ids (duplicates collapse in the set)

    # user_key iterates over the user ids.
    for user_key in df_to_dict:
        name_list.append(user_key)
        # e.g. name_list: [1070] -> [1070, 6757] -> [1070, 6757, 8272]
        # cos_key iterates over the restaurant ids this user reviewed (216, 58, 149, ...).
        for cos_key in df_to_dict[user_key]:
            cos_set.add(cos_key)
            # e.g. cos_set: {216} -> {216, 58}

    # for user_score in a[score]:
    #     user_gender.append[user_score]

    # Data to train on.
    rating_dic = {
        'user': [],
        'store': [],
        'born_year': []
        # 'gender': [],
        # 'bornyear': []
    }
    cos_list = list(cos_set)

    # For every user ...
    for name_key in df_to_dict:
        # ... and every restaurant that user reviewed ...
        for cos_key in df_to_dict[name_key]:
            # user index
            a1 = name_list.index(name_key)
            # restaurant index
            a2 = cos_list.index(cos_key)
            # "rating" value (here the birth year)
            a3 = df_to_dict[name_key][cos_key]
            rating_dic['user'].append(a1)
            rating_dic['store'].append(a2)
            rating_dic['born_year'].append(a3)
            # rating_dic['gender'].append(a4)
            # rating_dic['bornyear'].append(a5)

    # print(rating_dic['user'])
    # print(rating_dic['store_name'])
    # print(len(rating_dic['score']))

    df = pd.DataFrame(rating_dic)
    # Note: born_year is used as the rating, so values fall outside this 1-5 scale.
    reader = surprise.Reader(rating_scale=(1, 5))
    cos_list2 = ['user', 'store', 'born_year']
    data = surprise.Dataset.load_from_df(df[cos_list2], reader)
    trainset = data.build_full_trainset()

    option = {'name': 'pearson'}
    algo = surprise.KNNBasic(sim_options=option)
    algo.fit(trainset)

    index = name_list.index(1070)
    print('user_index: ', index)
    print("\n")
    result = algo.get_neighbors(index, k=5)
    print("users similar to you:", result)
    print("\n")
    print("restaurants worth recommending to you:", "\n")

    a = []
    for r1 in result:
        max_rating = data.df[data.df["user"] == r1]["born_year"].max()
        cos_id = data.df[(data.df["born_year"] == max_rating)
                         & (data.df["user"] == r1)]["store"].values
        for cos_item in cos_id:
            a.append(cos_list[cos_item])
            # item_list = cos_list[a]
    print(a)

    queryset = models.Store.objects.all().filter(id=a[0])
    print(a[0])
    for i in a:
        select = models.Store.objects.all().filter(id=i)
        print(i)
        queryset = queryset | select
    print(queryset)
    return queryset
rating_matrix = df.pivot_table(index="userId", columns="movieId", values="rating")
sparse_matrix = sparse.csr_matrix(rating_matrix)
'''

movies = pd.read_csv("exportedData/moviesExp.csv", usecols=["title", "movieId"])
ratings = pd.read_csv("exportedData/ratingsExp.csv")
evalData = pd.merge(movies, ratings, on="movieId", how="inner")

df = pd.read_table("exportedData/ratingsExp.csv", sep=',').drop("timestamp", axis=1)
df.head()

recoms = pd.DataFrame()
reader = surprise.Reader(rating_scale=(0.5, 5.0))
data = surprise.Dataset.load_from_df(df, reader)
trainset, testset = train_test_split(data, test_size=0.25)

# alg = surprise.SVD()
alg = surprise.KNNBasic()
# Note: training on the full trainset here; the train/test split above is unused.
out = alg.fit(data.build_full_trainset())


def recom(uid, recomms_count):
    movieIds = df["movieId"].unique()
    rated_movies = df.loc[df["userId"] == uid, "movieId"]
    iid_to_pred = np.setdiff1d(movieIds, rated_movies)

    test_data = [[uid, iid, 5.0] for iid in iid_to_pred]
    predictions = alg.test(test_data)
    pred_ratings = np.array([pred.est for pred in predictions])

    indice_max = np.argpartition(pred_ratings, -recomms_count)[-recomms_count:]
    iid = iid_to_pred[indice_max]
    iid_to_title = [i for i in range(0, recomms_count)]
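# The recom() definition above is cut off; a minimal sketch of how the selected
# iids could be mapped back to titles with the `movies` frame loaded earlier
# (the helper name and mapping are assumptions, not the original code):
def titles_for(iids, movies_df):
    return movies_df.set_index("movieId").loc[iids, "title"].tolist()

# e.g. print(titles_for(iid, movies)) for the `iid` array computed in recom()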
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True)
print('SVD--------------')
print(svd_temp)

normalPredictor = surprise.NormalPredictor()
normalPredictor_temp = surprise.model_selection.cross_validate(
    normalPredictor, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('normalPredictor--------------')
print(normalPredictor_temp)

baselineOnly = surprise.BaselineOnly()
baselineOnly_temp = surprise.model_selection.cross_validate(
    baselineOnly, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('baselineOnly-----------------')
print(baselineOnly_temp)

knnBasic = surprise.KNNBasic()
knnBasic_temp = surprise.model_selection.cross_validate(
    knnBasic, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnBasic-----------------')
print(knnBasic_temp)

knnWithMeans = surprise.KNNWithMeans()
knnWithMeans_temp = surprise.model_selection.cross_validate(
    knnWithMeans, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnWithMeans-----------------')
print(knnWithMeans_temp)

knnBaseline = surprise.KNNBaseline()
knnBaseline_temp = surprise.model_selection.cross_validate(
    knnBaseline, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnBaseline-----------------')
print(knnBaseline_temp)

svdpp = surprise.SVDpp()
        idcgs[uid] = sum(rel_true / discount_true)

    dcg = sum(dcgu for (_, dcgu) in dcgs.items())
    idcg = sum(idcgu for (_, idcgu) in idcgs.items())
    return dcg / idcg


data = pd.read_csv('sampled.csv')
print("Users: " + str(len(np.unique(data['User-ID']))) + " items: " + str(len(np.unique(data['ISBN']))))
print("No. of ratings: " + str(len(data)))

sim_options = {'name': 'pearson',
               'user_based': False}
algo_knn = surprise.KNNBasic(k=5, sim_options=sim_options)
algo_svd = surprise.SVD(n_factors=10, lr_all=0.001, reg_all=1)

# Around 80% train data for each of these splits.
sample_sizes = [0.4, 0.2, 0.1, 0.05, 0.01]
time_knn = []
time_svd = []
for s in sample_sizes:
    a = data.sample(frac=s, random_state=111)
    print("s= " + str(len(a)))
    print("Removing users with less than 20 ratings....")
    b = a.groupby('User-ID').filter(lambda x: len(x) >= 20)
    densityu = (float(len(b)) / (len(np.unique(b['User-ID'])) * len(np.unique(b['ISBN'])))) * 100
sparsityNew
img2 = plt.spy(imgpivotNew, markersize=0.1)

####################################################################################################

data = surprise.Dataset.load_from_df(df_ratings, reader)

user_based = {"name": "cosine", "user_based": True}
item_based = {"name": "cosine", "user_based": False}

#############
# User Based
#############
trainset = data.build_full_trainset()

# Using the KNNBasic algorithm
alg = surprise.KNNBasic(sim_options=user_based)
# Training the model
alg.fit(trainset)


def recom(uid, recomms_count):
    movieIds = df["movieId"].unique()
    rated_movies = df.loc[df["userId"] == uid, "movieId"]
    iid_to_pred = np.setdiff1d(movieIds, rated_movies)

    test_data = [[uid, iid, 5.0] for iid in iid_to_pred]
    predictions = alg.test(test_data)
    pred_ratings = np.array([pred.est for pred in predictions])

    indice_max = np.argpartition(pred_ratings, -recomms_count)[-recomms_count:]
    iid = iid_to_pred[indice_max]
def main(train_df, target_df, cache_name="test", force_recompute=[]):
    """Train multiple models on train_df and predict target_df.

    Predictions are cached. If the indices don't match the indices of
    target_df, the cache is discarded.

    By default, if a method was already computed it is not recomputed again
    (except if the method name is listed in force_recompute). cache_name is
    the name to use to read and write the cache.

    Arguments:
        train_df {dataframe} -- Training dataframe
        target_df {dataframe} -- Testing dataframe

    Keyword Arguments:
        cache_name {str} -- Name to use for caching (default: {"test"})
        force_recompute {list} -- Name(s) of methods to recompute, whether or
            not they were already computed. Useful to recompute single methods
            without discarding the rest. (default: {[]})

    Returns:
        Dataframe -- Dataframe with predictions for each method as columns,
            IDs as indices
    """
    global algo_in_use
    CACHED_DF_FILENAME = os.path.dirname(
        os.path.abspath(__file__)) +\
        "/cache/cached_predictions_{}.pkl".format(cache_name)
    train_df = preprocess_df(train_df)
    trainset = pandas_to_data(train_df)
    ids_to_predict = target_df["Id"].to_list()

    # Try to retrieve the cached dataframe.
    try:
        print("Retrieving cached predictions")
        all_algos_preds_df = pd.read_pickle(CACHED_DF_FILENAME)
        print("Ensuring cached IDs match given IDs")
        assert sorted(ids_to_predict) == sorted(all_algos_preds_df.index.values)
        print("Indices match, continuing")
    except (FileNotFoundError, AssertionError):
        print("No valid cached predictions found")
        all_algos_preds_df = pd.DataFrame(ids_to_predict, columns=["Id"])
        all_algos_preds_df.set_index("Id", inplace=True)

    all_algos = {
        "SVD": spr.SVD(n_factors=200, n_epochs=100),
        "Baseline": spr.BaselineOnly(),
        "NMF": spr.NMF(n_factors=30, n_epochs=100),
        "Slope One": spr.SlopeOne(),
        "KNN Basic": spr.KNNBasic(k=60),
        "KNN Means": spr.KNNWithMeans(k=60),
        "KNN Baseline": spr.KNNBaseline(),
        "KNN Zscore": spr.KNNWithZScore(k=60),
        "SVD ++": spr.SVDpp(n_factors=40, n_epochs=100),
        "Co Clustering": spr.CoClustering()
    }

    for name in all_algos:
        print("##### {} ####".format(name))
        if name in force_recompute and name in all_algos_preds_df.columns:
            all_algos_preds_df.drop(name, axis=1, inplace=True)
        if name in all_algos_preds_df.columns:
            print("Already computed {}, skipping".format(name))
            continue
        algo = all_algos[name]
        time.sleep(1)
        algo.fit(trainset)
        time.sleep(1)
        algo_in_use = algo
        print("Generating predictions...")
        predictions = parallelize_predictions(ids_to_predict, 80)
        print("Done. Merging with previous results")
        this_algo_preds_df = pd.DataFrame(predictions, columns=["Id", name])
        this_algo_preds_df.set_index("Id", inplace=True)
        all_algos_preds_df = pd.merge(all_algos_preds_df,
                                      this_algo_preds_df,
                                      left_index=True,
                                      right_index=True)
        all_algos_preds_df.to_pickle(CACHED_DF_FILENAME)

    print("DONE computing surprize")
    return all_algos_preds_df
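# parallelize_predictions() is defined elsewhere in the original project; a
# minimal serial sketch of the same idea, assuming each Id can be split into a
# (user, item) pair by a project-specific parse_id() helper (hypothetical):
def predict_ids(ids_to_predict, algo):
    preds = []
    for id_ in ids_to_predict:
        user, item = parse_id(id_)  # hypothetical helper, not part of Surprise
        preds.append((id_, algo.predict(user, item).est))
    return preds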
def recommend_new(request):
    test = Test()
    test.name = request.POST.get('name')
    test.local = request.POST.get('local')
    test.rating = request.POST.get('rating')
    test.save()

    if request.method == 'POST':
        # Pick the CSV matching transport (q1), party size (q2) and trip type (q3);
        # the lookup tables build the same file names as the twelve near-identical
        # branches they replace.
        transport = {'1': '뚜벅', '2': '자차'}.get(request.POST.get('q1'))
        party = {'1': '혼자', '2': '2인', '3': '3인'}.get(request.POST.get('q2'))
        theme = {'1': '관광', '2': '휴양'}.get(request.POST.get('q3'))

        if transport and party and theme:
            filename = transport + party + theme + '.csv'

            # Append the new rating to the matching CSV file ...
            f = open(filename, 'a', newline='', encoding='utf-8')
            wr = csv.writer(f)
            wr.writerow([test.name, test.local, test.rating])
            f.close()

            # ... and reload it.
            warnings.filterwarnings('ignore')
            data = pd.read_csv(filename, encoding="utf-8", sep=",",
                               error_bad_lines=False)

    df = data[['id', '여행지', 'rating']]
    df = df.drop_duplicates(['id', '여행지'], keep="last")

    def recur_dictify(frame):
        if len(frame.columns) == 1:
            if frame.values.size == 1:
                return frame.values[0][0]
            return frame.values.squeeze()
        grouped = frame.groupby(frame.columns[0])
        d = {k: recur_dictify(g.iloc[:, 1:]) for k, g in grouped}
        return d

    df_to_dict = recur_dictify(df)

    name_list = []
    local_set = set()
    for user_key in df_to_dict:
        name_list.append(user_key)
        for local_key in df_to_dict[user_key]:
            local_set.add(local_key)
    local_list = list(local_set)

    rating_dic = {'id': [], '여행지': [], 'rating': []}
    for name_key in df_to_dict:
        for cos_key in df_to_dict[name_key]:
            a1 = name_list.index(name_key)
            a2 = local_list.index(cos_key)
            a3 = df_to_dict[name_key][cos_key]
            rating_dic['id'].append(a1)
            rating_dic['여행지'].append(a2)
            rating_dic['rating'].append(a3)

    df = pd.DataFrame(rating_dic)
    reader = surprise.Reader(rating_scale=(1, 5))
    data = surprise.Dataset.load_from_df(df[['id', '여행지', 'rating']], reader)
    trainset = data.build_full_trainset()

    option = {'name': 'pearson'}
    algo = surprise.KNNBasic(sim_options=option)
    algo.fit(trainset)

    index = name_list.index(test.name)
    result = algo.get_neighbors(index, k=3)

    def localtest():
        for r1 in result:
            max_rating = data.df[data.df["id"] == r1]["rating"].max()
            local_id = data.df[(data.df["rating"] == max_rating)
                               & (data.df["id"] == r1)]["여행지"].values
            for local_item in local_id:
                return local_list[local_item]

    return render(request, 'recommend_result.html', {'localtest': localtest})