def temp1():
    import numpy as np
    import pandas as pd
    import matplotlib.pyplot as plt
    import sys
    import pickle
    from surprise import Dataset, Reader, SVD, accuracy
    from surprise.model_selection import train_test_split

    movies = pd.read_csv('../data/movies.csv')
    # genome_scores = pd.read_csv('../data/genome-scores.csv')
    # tags = pd.read_csv('../data/tags.csv')
    # genome_tags = pd.read_csv('../data/genome-tags.csv')

    # Use ratings data to downsample tags data to only movies with ratings
    ratings = pd.read_csv('../data/ratings.csv')
    # print(ratings)
    print("+++++++++++==")
    ratings = ratings.drop_duplicates('movieId')
    # print(ratings)

    # Add a new user
    temp_df = pd.DataFrame()
    rating_ser = [4.0, 5.0, 2.0]
    movie_ser = [1, 2, 3]
    user_id = ['138494', '138494', '138494']
    # userId,movieId,rating,timestamp
    # temp_df['userId'] = pd.Series(user_id)
    # temp_df['movieId'] = pd.Series(movie_ser)
    # temp_df['rating'] = pd.Series(rating_ser)
    # pd.concat([ratings, temp_df])

    # Instantiate a reader and read in our rating data
    reader = Reader(rating_scale=(1, 5))
    ratings_f = ratings.groupby('userId').filter(lambda x: len(x) >= 55)
    movie_list_rating = ratings_f.movieId.unique().tolist()
    Mapping_file = dict(zip(movies.title.tolist(), movies.movieId.tolist()))
    data = Dataset.load_from_df(ratings_f[['userId', 'movieId', 'rating']], reader)

    # Train SVD on 75% of the known ratings
    trainset, testset = train_test_split(data, test_size=.25)
    algorithm = SVD()
    algorithm.fit(trainset)
    predictions = algorithm.test(testset)
    # Check the accuracy using Root Mean Square Error
    accuracy.rmse(predictions)
    print("+++++++++++++1+++++++++++++")

    def pred_user_rating(ui):
        if ui in ratings_f.userId.unique():
            ui_list = ratings_f[ratings_f.userId == ui].movieId.tolist()
            # Candidate movies the user has not rated yet
            d = {k: v for k, v in Mapping_file.items() if v not in ui_list}
            predictedL = []
            for i, j in d.items():
                predicted = algorithm.predict(ui, j)
                predictedL.append((i, predicted.est))
            pdf = pd.DataFrame(predictedL, columns=['movies', 'ratings'])
            pdf.sort_values('ratings', ascending=False, inplace=True)
            pdf.set_index('movies', inplace=True)
            return pdf.head(10)
        else:
            print("User Id does not exist in the list!")
            return None

    user_id = 1
    print(pred_user_rating(user_id))
# data = surprise.Dataset.load_builtin('ml-100k')
# print(data)
# #data.split(n_folds=2)  # split data for 2-folds cross validation
# algo = SVD_SGD(learning_rate=.01, n_epochs=10, n_factors=10)
# cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)
import pandas as pd
import numpy as np
import surprise  # run 'pip install scikit-surprise' to install surprise
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import Reader

total_review_df = pd.read_csv("../data/total_review_df.csv")

# Load the dataset (download it if needed)
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(total_review_df[["user_name", "res_id", "rating"]], reader)

# Use the famous SVD algorithm
algo = SVD()
print("---------------------------SVD--------------------------------")
# Run 10-fold cross-validation and then print results
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=10, verbose=True)
print()
print("---------------------------Test--------------------------------")
uid = 1
iid = "형석"
pred = algo.predict(uid, iid, 5)
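# Note (added sketch): cross_validate() above only fits the model fold by fold, so
# the predict() call at the end is not backed by a model trained on all the data.
# A minimal sketch of the usual pattern — fit on the full trainset first; the
# uid/iid values are just the illustrative placeholders reused from above.
trainset_full = data.build_full_trainset()
algo.fit(trainset_full)
pred = algo.predict(uid, iid, r_ui=5, verbose=True)  # pred.est holds the estimated rating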
# !/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author: lishuang
@description: Neighborhood-based collaborative filtering on the MovieLens data,
evaluated with K-fold cross-validation.
"""
from surprise import KNNWithZScore, Reader, Dataset
from surprise import accuracy
from surprise.model_selection import KFold

# Load the data
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file('data/ratings.csv', reader)

# Item-based CF scoring
# Only the k most similar items are used when computing a prediction
algo = KNNWithZScore(k=40, sim_options={'user_based': False}, verbose=True)

kf = KFold(n_splits=3)
for train_set, test_set in kf.split(data):
    algo.fit(train_set)
    pred = algo.test(test_set)
    rmse = accuracy.rmse(pred, verbose=True)
import os

from surprise import Dataset, Reader

# Path to the data file
file_path = os.path.expanduser('ml-100k/u.data')
# Tell the reader what the text format looks like
reader = Reader(line_format='user item rating timestamp', sep='\t')
# Load the data
data2 = Dataset.load_from_file(file_path, reader=reader)
# The dataset must be converted to a Trainset to access detailed statistics
trainset = data2.build_full_trainset()
print('user count: ', trainset.n_users)   # number of users
print('item count', trainset.n_items)     # number of movies
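# A small follow-on sketch (added): a few more statistics the Trainset object
# exposes, handy for sanity-checking a newly loaded dataset.
print('rating count: ', trainset.n_ratings)         # number of ratings
print('global mean rating: ', trainset.global_mean)
density = trainset.n_ratings / (trainset.n_users * trainset.n_items)
print('density of the rating matrix: {:.4f}'.format(density))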
def svd_impute(order_with_na):
    """
    order_with_na: per-minute order data with NaN during temporary sold-out intervals
    train_set: per-minute sales recorded while the assortment sold normally - used to train the SVD
    test_set: missing values in the train set caused by temporary sold-outs - predicted by the SVD
    """
    random.seed(SEED)
    np.random.seed(SEED)

    """split all zero attributes"""
    all_zero_attributes = (order_with_na == 0).all(axis=1).loc[lambda x: x == True].index
    order_with_na_ = order_with_na.drop(all_zero_attributes)

    """svd requires long shape"""
    sold_per_minute_long_reshape = (order_with_na_
                                    .reset_index()
                                    .melt(id_vars=ATTR_KEY,
                                          value_name=ORD_AMT_KEY,
                                          var_name=TIME_KEY))

    """check if acceptable"""
    predictable = sold_per_minute_long_reshape[ORD_AMT_KEY].isna().sum() >= 1
    if not predictable:  # return the order data itself if there is nothing to impute
        return order_with_na

    train_set = sold_per_minute_long_reshape.dropna().copy()
    long_enough = train_set.shape[0] >= 5
    if not long_enough:
        return order_with_na

    """train svd"""
    sold_out_filter = sold_per_minute_long_reshape[ORD_AMT_KEY].isna()
    test_set = sold_per_minute_long_reshape[sold_out_filter].copy()

    # convert to surprise data type
    max_value = train_set[ORD_AMT_KEY].max()
    train_set_svd_object = Dataset.load_from_df(train_set, Reader(rating_scale=(0, max_value)))

    # find appropriate parameters via grid search
    num_of_factor_candidates = 5
    num_of_attr = train_set[ATTR_KEY].unique().shape[0]
    n_factor_min = (num_of_attr // num_of_factor_candidates) + 1
    n_factor_max = num_of_attr - 1
    factor_candidates = np.linspace(start=n_factor_min, stop=n_factor_max,
                                    num=num_of_factor_candidates)
    # numbers of latent factors to try for the SVD n_factors parameter
    n_factors_grid_search_pool = np.unique(np.floor(factor_candidates)).astype(int)

    grid_search_pool = {'reg_all': [0],
                        'lr_all': [0.003, 0.001],
                        'n_factors': n_factors_grid_search_pool,
                        'n_epochs': [15, 22, 30],
                        'biased': [False]}
    error_measure = 'mae'
    grid_searcher = GridSearchCV(SVD, grid_search_pool, measures=[error_measure],
                                 cv=5, n_jobs=N_CORE)  # n_jobs: parallel compute
    try:
        grid_searcher.fit(train_set_svd_object)
    except Exception:  # fall back to fewer folds when the data is too small for cv=5
        grid_searcher = GridSearchCV(SVD, grid_search_pool, measures=[error_measure],
                                     cv=2, n_jobs=N_CORE)
        grid_searcher.fit(train_set_svd_object)
    best_error = grid_searcher.best_score[error_measure]
    best_param = grid_searcher.best_params[error_measure]

    """svd predict"""
    svd = SVD(**best_param).fit(train_set_svd_object.build_full_trainset())

    def _predict(test_set):
        return svd.predict(test_set[ATTR_KEY], test_set[TIME_KEY]).est

    for predict_row_idx, each_blank in test_set.iterrows():
        predicted_value = _predict(each_blank)
        test_set.loc[predict_row_idx, ORD_AMT_KEY] = predicted_value

    filled_sold_per_minute_long_shape = pd.concat([train_set, test_set]).round(0)
    imputed = filled_sold_per_minute_long_shape.pivot(index=ATTR_KEY, columns=TIME_KEY,
                                                      values=ORD_AMT_KEY)

    """merge all zeros"""
    for attribute in all_zero_attributes:
        imputed.loc[attribute, :] = 0
    return imputed
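# A minimal usage sketch for svd_impute (added). The constants and imports below are
# assumptions standing in for the module-level definitions the function references
# (SEED, N_CORE, ATTR_KEY, TIME_KEY, ORD_AMT_KEY); the real module defines its own
# values. The toy frame has one all-zero row and two NaN cells, enough to exercise
# the drop / impute / merge path.
import random
import numpy as np
import pandas as pd
from surprise import Dataset, Reader, SVD
from surprise.model_selection import GridSearchCV

SEED, N_CORE = 0, 1                                     # hypothetical values
ATTR_KEY, TIME_KEY, ORD_AMT_KEY = 'attr', 'minute', 'ord_amt'

minutes = ['09:00', '09:01', '09:02', '09:03', '09:04']
toy = pd.DataFrame(
    [[3.0, 2.0, np.nan, 2.0, 1.0],
     [1.0, 4.0, 3.0, np.nan, 2.0],
     [2.0, 2.0, 1.0, 1.0, 3.0],
     [0.0, 0.0, 0.0, 0.0, 0.0]],
    index=pd.Index(['A', 'B', 'C', 'D'], name=ATTR_KEY),
    columns=minutes,
)
imputed = svd_impute(toy)   # NaN cells come back filled with rounded SVD estimates
print(imputed)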
def train_collaborative_filtering(self, grid_search=False, gs_params=None):
    # Transform the page list into one row per (visit, page) with a single ranking value
    analytics_df_SVD = self.analytics_df.copy()
    analytics_df_SVD['ranking'] = analytics_df_SVD[[
        'totals.pageviews', 'totals.timeOnSite'
    ]].apply(lambda x: self.__generate_ranking(x), axis=1)
    analytics_df_SVD = analytics_df_SVD['pages_visited'].apply(lambda x: pd.Series(eval(x)))\
        .stack()\
        .reset_index(level=1, drop=True)\
        .to_frame('pageId')\
        .join(analytics_df_SVD[['visitId', 'ranking']], how='left')
    analytics_df_SVD = analytics_df_SVD.dropna()
    analytics_df_SVD = analytics_df_SVD[['visitId', 'ranking', 'pageId']]
    analytics_df_SVD['pageId'] = analytics_df_SVD['pageId'].apply(lambda x: int(x))

    # Save the matrix for later use
    analytics_df_SVD.to_csv('state/visit_user_ranking.csv')

    # A reader is still needed, but only the rating_scale param is required.
    reader = Reader(rating_scale=(1, 4))
    # The columns must correspond to user id, item id and ratings (in that order).
    data = Dataset.load_from_df(
        analytics_df_SVD[['visitId', 'pageId', 'ranking']], reader)
    trainset, testset = train_test_split(data, test_size=.1)

    # If the user wants to use GridSearch to find the best params and algo
    if grid_search:
        if not gs_params:
            param_grid = {'n_factors': [110, 120, 140, 160],
                          'n_epochs': [90, 100, 110],
                          'lr_all': [0.001, 0.003, 0.005, 0.008],
                          'reg_all': [0.08, 0.1, 0.15]}
        else:
            param_grid = gs_params
        gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
        gs.fit(data)
        algo = gs.best_estimator['rmse']
        print(gs.best_score['rmse'])
        print(gs.best_params['rmse'])
        ## Comment the next lines if you are searching for the best params
        # We can now use this dataset as we please, e.g. calling cross_validate
    else:
        algo = SVD(n_factors=110, n_epochs=110, lr_all=0.008, reg_all=0.15)
        cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    algo.fit(trainset)
    test_pred = algo.test(testset)
    print("SVD : Test Set")
    accuracy.rmse(test_pred, verbose=True)

    # Dump the trained algorithm
    print('Saving trained algo...', end=" ")
    algo_list = glob.glob('state/algo_*')
    file_name = 'state/algo_' + datetime.datetime.now().strftime("%Y_%B_%d__%Ih%M%p")
    dump.dump(file_name, algo=algo)
    for file in algo_list:
        os.remove(file)
    print('Done.')
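# A small follow-on sketch (added): a model dumped with surprise.dump.dump, as above,
# can be restored later with dump.load, which returns a (predictions, algo) tuple.
# The path below is a hypothetical example of a file name produced by this method.
from surprise import dump

_, restored_algo = dump.load('state/algo_2020_January_01__01h00AM')
# restored_algo can now serve predictions without retraining, e.g.
# restored_algo.predict(some_visit_id, some_page_id)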
        # * Number of relevant and recommended items in top k
        # (the head of this helper — collecting (est, true_r) pairs per user and
        #  computing n_rel and n_rec_k — is truncated in this excerpt)
        n_rel_and_rec_k = sum(
            ((true_r >= threshold) and (est >= threshold))
            for (est, true_r) in user_ratings[:k]
        )

        # * Precision at K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # * Recall at K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls


# * using reader to be able to deal with the imported CSV
reader = Reader(
    line_format="user item rating timestamp", sep=",", rating_scale=(1, 5), skip_lines=1
)

# * loading the csv
data = Dataset.load_from_file(
    file_path="../../ML_Dataset/ml-latest-small/ratings.csv", reader=reader
)

# * dividing in train and test sets
trainset, testset = train_test_split(data, test_size=0.25)

# * define a cross-validation iterator
kf = KFold(n_splits=5)

# * Choosing Slope One as algorithm
algo = SlopeOne()

# * Train the algorithm on the trainset, and predict ratings for the testset
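# A continuation sketch (added, not in the original excerpt): the comment above
# promises training and testing, so this shows the usual pattern — run the KFold
# iterator, fit SlopeOne on each fold, and feed the predictions to the precision /
# recall helper defined above (its name is truncated in the excerpt;
# precision_recall_at_k is assumed here, as are k=10 and threshold=4).
for trainset_cv, testset_cv in kf.split(data):
    algo.fit(trainset_cv)
    predictions = algo.test(testset_cv)
    precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=4)
    # Average the per-user metrics over all users in this fold
    print("Precision@10:", sum(precisions.values()) / len(precisions))
    print("Recall@10:   ", sum(recalls.values()) / len(recalls))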
import pandas as pd
import surprise
from surprise import Reader
from surprise import Dataset
import time
import matplotlib.pyplot as plt
import psutil

timex = []
mem = []
m1 = psutil.virtual_memory().percent

# For 100 record dataset
start = time.time()
df1 = pd.read_csv('C:/Users/dell pc/Desktop/Project/ratings_1million1.csv',
                  dtype={'rating': float})
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df1[['user_id', 'book_id', 'rating']], reader)
# Note: data.split() and surprise.evaluate() belong to the legacy (pre-1.1) API;
# on current Surprise versions use model_selection.cross_validate(algo, data, cv=2).
data.split(2)
algo = surprise.KNNBasic()
result1 = surprise.evaluate(algo, data, measures=['RMSE'])
end = time.time()
print("Time1", end - start)
timex.append(end - start)
m2 = psutil.virtual_memory().percent
# print(m2)
mem.append(m2)

# For 1000 record dataset
start = time.time()
df2 = pd.read_csv('C:/Users/dell pc/Desktop/Project/ratings_1million2.csv',
                  dtype={'rating': float})
""" import numpy as np import pprint as pp import pandas as pd from surprise import Reader from surprise import Dataset from surprise.model_selection import KFold from surprise.model_selection import cross_validate from surprise import NormalPredictor, BaselineOnly, KNNBasic, KNNWithMeans, KNNWithZScore, KNNBaseline, SVD, SVDpp, NMF, SlopeOne, CoClustering from surprise.model_selection import RandomizedSearchCV import datetime start = datetime.datetime.now( ) #We want to record the time required to select the estimators reader = Reader(sep=',', skip_lines=1) data = Dataset.load_from_file( './DMT_2020__HW_2/DMT_2020__HW_2/Part_1/dataset/ratings.csv', reader=reader) #Import dataset #KNNBASELINE TUNING similarity_options = { 'name': ['pearson_baseline'], 'user_based': [True, False] } #We set the similarity options for the RandomizedSearch.We set Pearson Baselina as suggested in the Surprise documentation baseline_predictor_options = { 'method': ['sgd'], 'learning_rate': [0.002, 0.005, 0.01], 'n_epochs': [50, 100, 150], 'reg': [0.01, 0.02, 0.05] } #We set the baseline predictor options options for the RandomizedSearch grid_of_parameters = {
parser = argparse.ArgumentParser()
parser.add_argument('--model', '-m', required=True,
                    choices=[
                        'NormalPredictor', 'BaselineOnly', 'KNNBasic',
                        'KNNWithMeans', 'KNNWithZScore', 'KNNBaseline', 'SVD',
                        'SVDpp', 'NMF', 'SlopeOne', 'CoClustering'
                    ])
args = parser.parse_args()

train_path = path + '/Data/train_format.txt'
train_reader = Reader(line_format='user item rating timestamp', sep=',',
                      rating_scale=(0, 5))
trainset = Dataset.load_from_file(train_path, reader=train_reader)
trainset = trainset.build_full_trainset()

if args.model == 'NormalPredictor':
    model = surprise.NormalPredictor()
elif args.model == 'BaselineOnly':
    model = surprise.BaselineOnly()
elif args.model == 'KNNBasic':
    model = surprise.KNNBasic()
elif args.model == 'KNNWithMeans':
    model = surprise.KNNWithMeans()
elif args.model == 'KNNWithZScore':
    model = surprise.KNNWithZScore()
elif args.model == 'KNNBaseline':
def train_model():
    # Send request to Nodejs server for authentication.
    if not is_good_request(request):
        return abort(400)

    # Extract data from request.
    data = request.get_json()
    dataset, data_header, model_name, params, train_type, save_on_server, save_on_local = data.values()

    # if not is_header_valid(data_header):
    #     return jsonify({'message': '[ERROR] Incorrect dataset format.'})

    # Use the data uploaded or data on server.
    df = pd.DataFrame(dataset, columns=data_header) if dataset else pd.read_csv('./data/data-new.csv', header=0)
    try:
        train_set, test_set = build_train_test(df, Reader(), full=train_type == 'full')
    except ValueError:
        return jsonify({'error': 'Incorrect dataset format.'})

    if model_name == 'insvd':
        n_factors, n_epochs, lr_all, reg_all, random_state = params.values()
        ALLOWED_EXTENSIONS = {'txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif'}  # unused here
        # Parse data types.
        n_factors = int(n_factors)
        n_epochs = int(n_epochs)
        lr_all = float(lr_all)
        reg_all = float(reg_all)
        random_state = int(random_state)
        model = InSVD(n_factors=n_factors, n_epochs=n_epochs, lr_all=lr_all,
                      reg_all=reg_all, random_state=random_state)
    else:
        k, sim_options, random_state = params.values()
        model = KNNBasic(k=int(k),
                         sim_options={'name': sim_options, 'user_based': False},
                         random_state=int(random_state))

    # Fitting and testing.
    model.fit(train_set)
    predictions = model.test(test_set)

    # Add suffix if not saved on server.
    model_path = f'./model/{model_name}' if save_on_server else f'./model/{model_name}-temp'
    # Save.
    dump.dump(model_path, algo=model, predictions=predictions)

    model_info = {
        'rmse': rmse(predictions),
        'mae': mae(predictions),
    }

    # Zip the trained model.
    try:
        zip_obj = ZipFile(f'{model_path}.zip', 'w')
        zip_obj.write(model_path)
        zip_obj.close()
    except FileNotFoundError:
        return abort(404)

    @after_this_request
    def remove_dump_files(response):
        # If the model is not saved on the server, delete the model dump file.
        if not save_on_server:
            os.remove(model_path)
        # Always delete the .zip file.
        os.remove(f'{model_path}.zip')
        return response

    if save_on_local:
        with open(f'{model_path}.zip', 'rb') as f:
            model_zip = f.readlines()
        resp = Response(model_zip)
        resp.headers['X-Model-Info'] = json.dumps(model_info)
        resp.headers['Content-Type'] = 'application/zip'
        resp.headers['Content-Disposition'] = 'attachment; filename=%s;' % 'model.zip'
        return resp
        # return Response(model_zip, headers={
        #     'X-Info': json.dumps(model_info),
        #     'Content-Type': 'application/zip',
        #     'Content-Disposition': 'attachment; filename=%s;' % 'model.zip',
        # })
        # return send_from_directory('./model', model_file), 200

    return jsonify(model_info)
                        on=[itemID_column])
merged_data = result[[
    userID_column, itemID_column, itemName_column, ratings_column
]]
# -

len(ratings['user_id'].unique())

# # process data

# +
from surprise import Dataset
from surprise import Reader

data = merged_data[['user_id', 'book_id', 'rating']]
reader = Reader(rating_scale=(0.5, 4.5))
data = Dataset.load_from_df(data, reader)

popularity_rankings = merged_data['book_id'].value_counts()
rankings = pd.Series(range(1, len(popularity_rankings) + 1, 1),
                     index=popularity_rankings.index)
processed_data = ProcessData(data, rankings)

# Train on leave-One-Out train set
trainSet = processed_data.GetLOOCVTrainSet()
testSet = processed_data.GetLOOCVTestSet()
# -

# # run test
'''Testing renaming of train() into fit()'''

import os

import pytest

from surprise import Dataset
from surprise import Reader
from surprise import AlgoBase
from surprise.model_selection import KFold

data_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
data = Dataset.load_from_file(data_file, Reader('ml-100k'))
kf = KFold(n_splits=2)


def test_new_style_algo():
    '''Test that new-style algorithms (i.e. algorithms that only define fit())
    can support both calls to fit() and to train()

    - algo.fit() is the new way of doing things
    - supporting algo.train() is needed for the (unlikely?) case where a user
      has defined custom tools that use algo.train().
    '''

    class CustomAlgoFit(AlgoBase):

        def __init__(self):
            AlgoBase.__init__(self)
            self.cnt = -1

        def fit(self, trainset):
            AlgoBase.fit(self, trainset)
            self.est = 3
def main():
    # There are 4 datasets that can be used for the test: the basic one, and the ones
    # restricted to users and movies that have more than 10, 20 and 50 ratings
    elem_to_import = 4
    for i in range(elem_to_import):
        if i == 0:
            # Import the basic data
            data_import = np.genfromtxt(DATA_PATH, delimiter=",", skip_header=1, dtype=str)
            # Construct User, Movie and rating
            userId, movieId, rating = construct_data(data_import)
        elif i == 1:
            # Import the data with users and movies that have more than 10 ratings
            data_import = pd.read_excel(DATA_PATH_10, index_col=0, header=0)
            userId = data_import["userId"]
            movieId = data_import["movieId"]
            rating = data_import["rating"]
        elif i == 2:
            # Import the data with users and movies that have more than 20 ratings
            data_import = pd.read_excel(DATA_PATH_20, index_col=0, header=0)
            userId = data_import["userId"]
            movieId = data_import["movieId"]
            rating = data_import["rating"]
        else:
            # Import the data with users and movies that have more than 50 ratings
            data_import = pd.read_excel(DATA_PATH_50, index_col=0, header=0)
            userId = data_import["userId"]
            movieId = data_import["movieId"]
            rating = data_import["rating"]

        # Take the indices that will be shuffled
        indices_to_shuffle = np.array(range(len(userId)))
        test_ratio = 70
        # Create the train and test data (70% of the data goes to train).
        # Indices are shuffled in split_data
        X_train_userId, X_train_movieId, X_train_rating, X_test_userId, X_test_movieId, X_test_rating = split_data(
            indices_to_shuffle, userId, movieId, rating, test_ratio)

        ratings_dict_train = {'itemID': X_train_movieId,
                              'userID': X_train_userId,
                              'rating': X_train_rating}
        # Create the dataframe for the surprise train set
        df_train = pd.DataFrame(ratings_dict_train)
        reader_train = Reader(rating_scale=(1, 5))
        data_train = Dataset.load_from_df(df_train[['userID', 'itemID', 'rating']], reader_train)
        # The data has to be split because the algo tests with the split folds
        # (note: Dataset.split() is the legacy pre-1.1 Surprise API)
        split = 3
        data_train.split(split)

        ratings_dict_test = {'itemID': X_test_movieId,
                             'userID': X_test_userId,
                             'rating': X_test_rating}
        # Create the dataframe for the surprise test set
        df_test = pd.DataFrame(ratings_dict_test)
        reader_test = Reader(rating_scale=(1, 5))
        data_test = Dataset.load_from_df(df_test[['userID', 'itemID', 'rating']], reader_test)
        data_test.split(split)

        # KNN best params
        n_epochs = [5]
        reg_us = [5]
        reg_is = [5]
        '''
        # KNN (takes too long to test all of them, but you can check)
        n_epochs = [5,10,15]
        reg_us = [5,10,15,20]
        reg_is = [5,10,20]
        '''
        # Apply the grid search for KNN
        perf_knn_grid = grid_search_knn_surprise(data_train, n_epochs, reg_us, reg_is)

        if i == 0:
            # Manual grid search so we can see the values (only the result is given with the GridSearch of surprise)
            grid_search_knn(data_train, data_test, n_epochs, reg_us, reg_is,
                            'surpise_manualGS_KNN.xlsx')
            # KNN with the best params from GridSearch surprise
            knn_surprise(data_train, perf_knn_grid["n_epochs"], perf_knn_grid["reg_u"],
                         perf_knn_grid["reg_i"], "surprise_bestKNN.csv")
        elif i == 1:
            # Manual grid search so we can see the values (only the result is given with the GridSearch of surprise)
            grid_search_knn(data_train, data_test, n_epochs, reg_us, reg_is,
                            'surpise_manualGS_KNN10.xlsx')
            # KNN with the best params from GridSearch surprise
            knn_surprise(data_train, perf_knn_grid["n_epochs"], perf_knn_grid["reg_u"],
                         perf_knn_grid["reg_i"], "surprise_bestKNN10.csv")
        elif i == 2:
            # Manual grid search so we can see the values (only the result is given with the GridSearch of surprise)
            grid_search_knn(data_train, data_test, n_epochs, reg_us, reg_is,
                            'surpise_manualGS_KNN20.xlsx')
            # KNN with the best params from GridSearch surprise
            knn_surprise(data_train, perf_knn_grid["n_epochs"], perf_knn_grid["reg_u"],
                         perf_knn_grid["reg_i"], "surprise_bestKNN20.csv")
        else:
            # Manual grid search so we can see the values (only the result is given with the GridSearch of surprise)
            grid_search_knn(data_train, data_test, n_epochs, reg_us, reg_is,
                            'surpise_manualGS_KNN50.xlsx')
            # KNN with the best params from GridSearch surprise
            knn_surprise(data_train, perf_knn_grid["n_epochs"], perf_knn_grid["reg_u"],
                         perf_knn_grid["reg_i"], "surprise_bestKNN50.csv")

        # SVD best params
        n_epochs = [10]
        lr_alls = [0.00147]
        reg_alls = [0.2]
        init_mean = [0.2]
        n_factors = [80]
        '''
        # SVD (takes too long to test all of them, but you can check)
        n_epochs = [5,10]
        lr_alls = [0.00145, 0.00146, 0.00147]
        reg_alls = [0.2,0.3]
        init_mean = [0, 0.2]
        n_factors = [80,100,120]
        '''
        # Apply the grid search for SVD
        perf_svd_grid = grid_search_svd_surprise(data_train, n_epochs, lr_alls, reg_alls,
                                                 init_mean, n_factors)

        if i == 0:
            # Manual grid search so we can see the values (only the result is given with the GridSearch of surprise)
            grid_search_svd(data_train, data_test, n_epochs, lr_alls, reg_alls, init_mean,
                            n_factors, 'surpise_manualGS_SVD.xlsx')
            # SVD with the best params from GridSearch surprise
            svd_surprise(data_train, perf_svd_grid["reg_all"], perf_svd_grid["init_mean"],
                         perf_svd_grid["n_epochs"], perf_svd_grid["lr_all"],
                         perf_svd_grid["n_factors"], "surprise_bestSVD.csv")
        elif i == 1:
            # Manual grid search so we can see the values (only the result is given with the GridSearch of surprise)
            grid_search_svd(data_train, data_test, n_epochs, lr_alls, reg_alls, init_mean,
                            n_factors, 'surpise_manualGS_SVD10.xlsx')
            # SVD with the best params from GridSearch surprise
            svd_surprise(data_train, perf_svd_grid["reg_all"], perf_svd_grid["init_mean"],
                         perf_svd_grid["n_epochs"], perf_svd_grid["lr_all"],
                         perf_svd_grid["n_factors"], "surprise_bestSVD10.csv")
        elif i == 2:
            # Manual grid search so we can see the values (only the result is given with the GridSearch of surprise)
            grid_search_svd(data_train, data_test, n_epochs, lr_alls, reg_alls, init_mean,
                            n_factors, 'surpise_manualGS_SVD20.xlsx')
            # SVD with the best params from GridSearch surprise
            svd_surprise(data_train, perf_svd_grid["reg_all"], perf_svd_grid["init_mean"],
                         perf_svd_grid["n_epochs"], perf_svd_grid["lr_all"],
                         perf_svd_grid["n_factors"], "surprise_bestSVD20.csv")
        else:
            # Manual grid search so we can see the values (only the result is given with the GridSearch of surprise)
            grid_search_svd(data_train, data_test, n_epochs, lr_alls, reg_alls, init_mean,
                            n_factors, 'surpise_manualGS_SVD50.xlsx')
            # SVD with the best params from GridSearch surprise
            svd_surprise(data_train, perf_svd_grid["reg_all"], perf_svd_grid["init_mean"],
                         perf_svd_grid["n_epochs"], perf_svd_grid["lr_all"],
                         perf_svd_grid["n_factors"], "surprise_bestSVD50.csv")
    `top_n:` a dictionary of the top k recommendations for a given user
    `user:` internal user id (uid) used in the datasets, as in ml-latest-parsed.csv

    Returns:
        a list containing the top k recommendations for the given user
    """
    matches = [[iid for (iid, _) in user_ratings]
               for uid, user_ratings in top_n.items() if uid == user]
    return matches


# Load the movielens-100k dataset (download it if needed),
reader = Reader(line_format='user item rating timestamp', sep=',',
                rating_scale=(0.5, 5), skip_lines=1)
data = Dataset.load_from_file('./ml-latest-parsed.csv', reader=reader)

trainset = data.build_full_trainset()
testset = trainset.build_anti_testset()
# testset = trainset.build_anti_testset()
# trainset, testset = train_test_split(data, test_size=.3)

# We'll use a KNN algorithm with cosine similarity.
print("Creating Model")
sim_options = {'name': 'cosine', 'user_based': True, 'min_support': 2}
algo = KNNBasic(k=40, min_k=2, sim_options=sim_options)
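# A continuation sketch (added, not in the original excerpt): fit the model, predict
# on the anti-testset built above, and collect a top-N dict in the shape the helper
# above expects (uid -> list of (iid, est)). get_top_n is a name introduced here.
from collections import defaultdict

def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    # Sort each user's candidate items by estimated rating and keep the n best
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n

algo.fit(trainset)
predictions = algo.test(testset)
top_n = get_top_n(predictions, n=10)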
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Feb 4 00:08:44 2019

@author: abhijithneilabraham
"""
from surprise import KNNBasic
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise import Reader
from surprise.model_selection import train_test_split
import pandas as pd

customer = pd.read_csv('names.csv')
reader = Reader(line_format='user item rating', rating_scale=(1, 5), sep=',')
fieldnames = ['id', 'male_or_female']
for i in range(25):
    fieldnames.insert(2, 'question' + str(i + 1))
# Note: Dataset.load_from_df expects a dataframe with exactly three columns in
# (user, item, rating) order; passing all 27 columns here will not work as intended.
data = Dataset.load_from_df(customer[fieldnames], reader)
del fieldnames[2]
trainset = data.build_full_trainset()
algo = KNNBasic()
algo.fit(trainset)
mtarix_toGO['Norm_Tot_Amnt'] = (mtarix_toGO['Mean_amount'] - min_amt) / max_amt

# lower_bound = min(mtarix_toGO['Log_Mean_Amount'])
# upper_bound = max(mtarix_toGO['Log_Mean_Amount'])
# print(lower_bound)
# print(upper_bound)

# Remove the outliers
dfx = mtarix_toGO[mtarix_toGO['Norm_Tot_Amnt'] <= 0.4]
lower_bound = min(dfx['Norm_Tot_Amnt'])
upper_bound = max(dfx['Norm_Tot_Amnt'])
print('Lower Bound normalized spending =', lower_bound)
print('Upper Bound normalized spending =', upper_bound)
print('Number of Transactions remaining after removing Outliers::', mtarix_toGO.shape[0])

# Define the reader with upper and lower bounds; we are now predicting the
# normalized total amount column
reader_x = Reader(rating_scale=(lower_bound, upper_bound))
data = Dataset.load_from_df(
    df=dfx[['CustomerID', 'StockCode', 'Norm_Tot_Amnt']], reader=reader_x)

# for i in range(9):
#     print(data.raw_ratings[0][2] - data.df['Log_Mean_amount'][0])
print('difference in processed and pre-processed dataset = ',
      (data.raw_ratings[0][2] - data.df['Norm_Tot_Amnt'][0]))

import time
start_time = time.time()

param_grid = {
    'n_factors': [2, 5, 10, 50],
    'n_epochs': [10, 50, 100],
from surprise import Reader, Dataset
from surprise import KNNBasic
import csv

reader = Reader(line_format='user item rating', sep=';', rating_scale=(-1, 3))
print("Loading data...")
data = Dataset.load_from_file('./tost/u.data', reader=reader)
# data.split(n_folds=3)  # legacy (pre-1.1) API; not needed since we train on the full trainset below
print(">OK\n")

# Build the trainset
trainset = data.build_full_trainset()

# Define similarity options
# sim_options = {
#     'name': 'cosine',
#     'user_based': True  # compute similarities between users
# }

print("Training data...")
# Build an algo, and train it
algo = KNNBasic()
algo.fit(trainset)  # fit() replaces the deprecated train()
print("> OK\n")

# Get predictions
print("Predictions :")
ratings = vars(data).get('raw_ratings')
with open('tost/u.results', "wb") as csv_file:
def predict(user_id, topk):
    # Assumes: import numpy as np, pandas as pd, matplotlib.pyplot as plt,
    # and from surprise import Reader, Dataset, SVD
    # Read the data
    data = pd.read_csv("../data/data.dat", sep="\t",
                       names=['userid', 'itemid', 'rating', 'timestamp'])
    # Convert data types
    data['rating'] = data['rating'].astype(float)
    # Inspect the data size
    print('Dataset 1 shape: {}'.format(data.shape))
    # Rebuild the index
    data.index = np.arange(0, len(data))
    print('Full dataset shape: {}'.format(data.shape))
    # Count ratings per value
    p = data.groupby('rating')['rating'].agg(['count'])

    # Total number of movies
    movie_count = data.isnull().sum()[1]
    # Total number of users
    cust_count = data['userid'].nunique() - movie_count
    # Total number of ratings
    rating_count = data['userid'].count() - movie_count

    # Plot the overall rating distribution
    ax = p.plot(kind='barh', legend=False, figsize=(15, 10))
    plt.title(
        'Total pool: {:,} Movies, {:,} customers, {:,} ratings given'.format(
            movie_count, cust_count, rating_count),
        fontsize=20)
    plt.axis('off')
    for i in range(1, 6):
        ax.text(p.iloc[i - 1][0] / 4, i - 1,
                'Rating {}: {:.0f}%'.format(
                    i, p.iloc[i - 1][0] * 100 / p.sum()[0]),
                color='white', weight='bold')

    '''
    The movie IDs are a messy import! Looping over the dataframe to add a movie ID
    column would exhaust the kernel's memory because it is far too inefficient.
    So first create a numpy array of the correct length, then add the whole array
    to the main dataframe as a column.
    '''
    df_nan = pd.DataFrame(pd.isnull(data.rating))
    df_nan = df_nan[df_nan['rating'] == True]
    df_nan = df_nan.reset_index()
    movie_np = []
    movie_id = 1
    for i, j in zip(df_nan['index'][1:], df_nan['index'][:-1]):
        temp = np.full((1, i - j - 1), movie_id)
        movie_np = np.append(movie_np, temp)
        movie_id += 1

    '''
    The dataset is now very large, so it needs preprocessing:
    - drop movies with too few reviews (they are relatively unpopular)
    - drop customers with too few reviews (they are relatively inactive)
    From the matrix point of view, unpopular movies and inactive customers take up
    as much room as popular movies and active customers.
    '''
    # Filter out movies with very few ratings
    f = ['count', 'mean']
    # Group by itemid, then aggregate the ratings with count and mean
    df_movie_summary = data.groupby('itemid')['rating'].agg(f)
    # Set the index dtype of the aggregated data
    df_movie_summary.index = df_movie_summary.index.map(int)
    movie_benchmark = round(df_movie_summary['count'].quantile(0.8), 0)
    # Movies below the benchmark are dropped
    drop_movie_list = df_movie_summary[
        df_movie_summary['count'] < movie_benchmark].index
    print('Movie minimum times of review: {}'.format(movie_benchmark))

    # Filter out inactive users
    df_cust_summary = data.groupby('userid')['rating'].agg(f)
    df_cust_summary.index = df_cust_summary.index.map(int)
    cust_benchmark = round(df_cust_summary['count'].quantile(0.8), 0)
    drop_cust_list = df_cust_summary[
        df_cust_summary['count'] < cust_benchmark].index
    print('Customer minimum times of review: {}'.format(cust_benchmark))

    print('Original Shape: {}'.format(data.shape))
    df = data[~data['itemid'].isin(drop_movie_list)]
    df = df[~df['userid'].isin(drop_cust_list)]
    print('After Trim Shape: {}'.format(df.shape))
    df_p = pd.pivot_table(df, values='rating', index='userid', columns='itemid')

    # Set up the recommendation model
    reader = Reader()
    # Define the algorithm
    svd = SVD()
    # Select the requested user's highly-rated items
    user_some = data[(data['userid'] == user_id) & (data['rating'] > 3)]
    user_some = user_some.set_index('itemid')
    user_some = user_some.reset_index()
    user_some = user_some[~user_some['itemid'].isin(drop_movie_list)]
    # Load the full filtered dataset
    data = Dataset.load_from_df(df[['userid', 'itemid', 'rating']], reader)
    # Convert to the full trainset
    trainset = data.build_full_trainset()
    svd.fit(trainset)  # fit() replaces the deprecated train()
    # Estimate a score for every candidate item
    user_some['Estimate_Score'] = user_some['itemid'].apply(
        lambda x: svd.predict(user_id, x).est)
    user_some = user_some.sort_values('Estimate_Score', ascending=False)
    return user_some.head(topk)
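# A minimal usage sketch (added): assuming ../data/data.dat exists in the layout the
# function expects, this prints the top-10 estimated scores for user 1.
if __name__ == '__main__':
    recommendations = predict(user_id=1, topk=10)
    print(recommendations)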
from surprise import SlopeOne
from surprise import CoClustering
from surprise.model_selection import cross_validate

sys.path.insert(1, './')
from auto_surprise.engine import Engine

if __name__ == '__main__':
    print("Starting benchmark")

    # Surprise algorithms to evaluate
    algorithms = (SVD, SVDpp, NMF, SlopeOne, KNNBasic, KNNWithMeans, KNNBaseline,
                  CoClustering, BaselineOnly, NormalPredictor)

    # Load dataset
    file_path = os.path.expanduser('../datasets/libimseti/ratings.dat')
    reader = Reader(line_format='user item rating timestamp', sep=',',
                    rating_scale=(1, 10))
    data = Dataset.load_from_file(file_path, reader=reader)

    benchmark_results = {
        'Algorithm': [],
        'RMSE': [],
        'MAE': [],
        'Best params': [],
        'Time': []
    }

    # Evaluate Surprise Algorithms
    for algo in algorithms:
        algo_name = algo.__name__
        print("Running algorithm : %s" % algo_name)
from surprise import SVDpp
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate
import os

# Load the movielens-100k dataset (download it if needed).
# data = Dataset.load_builtin('ml-100k')
file_path = os.path.expanduser("../train1.csv")
# reader = Reader(line_format="user item rating timestamp", sep=',')
reader = Reader(line_format="user item rating timestamp", sep=',',
                rating_scale=(0, 5))
data = Dataset.load_from_file(file_path, reader=reader)
'''
file_path1 = os.path.expanduser("../test1.csv")
reader1 = Reader(line_format="user item rating", sep=',')
data = Dataset.load_from_file(file_path, reader=reader)
data1 = Dataset.load_from_file(file_path1, reader=reader1)
'''
trainset = data.build_full_trainset()
# testset = data1.build_full_trainset()

# Use the SVD++ algorithm.
algo = SVDpp()

# Run 5-fold cross-validation and print results.
# cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
algo.fit(trainset)
# url = st.text_input('Enter the path for the data')
st.write('The data is loaded')
# data_load_state = st.text('Loading the data')
data = ds.get_data(option1)
st.write(data)
# data = ds.get_data(_file_path, 'data/data_subset.csv', 0.99)
# data = ds.get_data('/Users/lalitharahul/Desktop/AutoRecommender/RecServe/sample_us.tsv')
# data = ds.get_data(url)
# st.write(data)
# data_load_state.text('Data is preprocessed')

data_surprise = (data[['customer_id', 'product_id', 'star_rating']]
                 .rename(columns={'customer_id': 'userID',
                                  'product_id': 'itemID',
                                  'star_rating': 'rating'}))
reader = Reader(rating_scale=(1.0, 5.0))
df_loaded = Dataset.load_from_df(data_surprise, reader)
# trainset = df_loaded.build_full_trainset()
results_list = []

# features
reviews = data.shape[0]
n_users = data.customer_id.nunique()
n_products = data.product_id.nunique()
mean_rating = data.star_rating.mean()
rating_std = data.star_rating.std()
sparsity = reviews * 100 / (n_users * n_products)

url = st.sidebar.text_input('Enter CustomerID')
# if st.button('Enter'):
#     st.write('Entered customer id')
# st.write('The Entered Customer Id is', url)
""" from __future__ import (absolute_import, division, print_function, unicode_literals) import os from surprise import BaselineOnly from surprise import Dataset from surprise import evaluate from surprise import Reader # path to dataset folder files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/') # This time, we'll use the built-in reader. reader = Reader('ml-100k') # folds_files is a list of tuples containing file paths: # [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)] train_file = files_dir + 'u%d.base' test_file = files_dir + 'u%d.test' folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)] data = Dataset.load_from_folds(folds_files, reader=reader) # We'll use an algorithm that predicts baseline estimates. algo = BaselineOnly() # Evaluate performances of our algorithm on the dataset. evaluate(algo, data)
def main():
    # Load full training data
    with open('data/train_users_04.p', 'rb') as f:
        users = pickle.load(f)
    f = open('data/train_users.csv', 'w')
    for user in users:
        if users[user] != {}:
            # zscores = stats.zscore(list(users[user].values()))
            # items = [(a[0], zscores[i]) for i, a
            #          in enumerate(users[user].items())]
            for i in users[user].items():
                t = float(i[1])
                if t > 6:
                    t = 6
                elif t < 1:
                    t = 1
                f.write('%s\t%s\t%.03f\n' % (user, i[0], t))
    f.close()
    print("finished train data")

    # Load full test data
    with open('data/test_users_04.p', 'rb') as f:
        users = pickle.load(f)
    f = open('data/test_users.csv', 'w')
    for user in users:
        if users[user] != {}:
            # zscores = stats.zscore(list(users[user].values()))
            # items = [(a[0], zscores[i]) for i,
            #          a in enumerate(users[user].items())]
            for i in users[user].items():
                t = float(i[1])
                if t > 6:
                    t = 6
                elif t < 1:
                    t = 1
                f.write('%s\t%s\t%.03f\n' % (user, i[0], t))
    f.close()
    print("finished test data")

    ######### START OF TRAINING #########
    reader = Reader(line_format='user item rating', sep='\t', rating_scale=(1, 6))
    train_data = Dataset.load_from_file('data/train_users.csv', reader=reader)
    # .build_full_trainset()

    with open('data/test_users.csv', 'r') as f:
        s = list(map(lambda x: tuple(x.split('\t')), f.read().split('\n')))
    test_data = []
    for x in s:
        if len(x) > 2:
            test_data.append((x[0], x[1], float(x[2])))
    # print(data.ur)

    # algo = KNNBasic(sim_options={'name': 'cosine'})
    # algo = NMF(verbose=True)
    algo = SVD(verbose=True)
    # algo = NormalPredictor()
    algo.fit(train_data.build_full_trainset())
    # cross_validate(algo, train_data, verbose=True)

    # print(algo.predict('76561197960675902', '70', r_ui=63, verbose=True))
    # print(algo.predict('76561197960675902', '4540', r_ui=22, verbose=True))
    # print(algo.predict('76561197960675902', '550', r_ui=791, verbose=True))
    # print(algo.predict('76561197960675902', '10190', r_ui=1253, verbose=True))
    # print(algo.predict('76561197960675902', '10', r_ui=1037, verbose=True))

    predictions = algo.test(test_data)
    print(accuracy.rmse(predictions))
import pandas as pd
from surprise import Dataset
from surprise import BaselineOnly
from surprise import Reader
import sys

input_path = sys.argv[1] + "yelp_train.csv"
test_file_name = sys.argv[2]
result_file_name = sys.argv[3]

reader = Reader(rating_scale=(0, 5))
train_read = pd.read_csv(input_path)
test_read = pd.read_csv(test_file_name)
train_load = Dataset.load_from_df(train_read, reader=reader).build_full_trainset()
test_load = Dataset.load_from_df(test_read, reader=reader).build_full_trainset()

bsl_options = {'method': 'als', 'n_epochs': 9, 'reg_u': 7.6, 'reg_i': 3.5}
algorithm = BaselineOnly(bsl_options=bsl_options)
results = algorithm.fit(train_load).test(test_load.build_testset())

with open(result_file_name, "w") as fout:
    fout.write("user_id, business_id, prediction\n")
    for p in results:
        fout.write(str(p.uid) + "," + str(p.iid) + "," + str(p.est) + "\n")
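# A small follow-on sketch (added): since the test CSV includes the true ratings,
# the same predictions can be scored directly with Surprise's accuracy module.
from surprise import accuracy

accuracy.rmse(results, verbose=True)
accuracy.mae(results, verbose=True)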
import os
import random

from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import KNNBaseline
from surprise import evaluate
from surprise import GridSearch

# the test and train files are from the ml-100k dataset (10% of u1.base and
# 10% of u1.test)
train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
data = Dataset.load_from_folds([(train_file, test_file)], Reader('ml-100k'))

random.seed(0)


def test_grid_search_cv_results():
    """Ensure that the number of parameter combinations is correct."""
    param_grid = {
        'n_epochs': [1, 2],
        'lr_all': [0.002, 0.005],
        'reg_all': [0.4, 0.6],
        'n_factors': [1],
        'init_std_dev': [0]
    }
    grid_search = GridSearch(SVD, param_grid)
    grid_search.evaluate(data)
    df_movie_summary['count'] < movie_benchmark].index

df_cust_summary = df.groupby('Cust_Id')['Rating'].agg(['count', 'mean'])
df_cust_summary.index = df_cust_summary.index.map(int)
cust_benchmark = round(df_cust_summary['count'].quantile(0.8))
drop_cust_list = df_cust_summary[
    df_cust_summary['count'] < cust_benchmark].index

# pivot the data set to make a matrix form
df_pivot = pd.pivot_table(df, values='Rating', index='Cust_Id', columns='Movie_Id')

# Now start collaborative filtering
reader = Reader()
algo = SVD()

# Take input for Customer ID
customerID = input("Customer ID Please: ")
customerID = int(customerID)
print("We are recommending Movies for Customer ID: ", customerID)
number = input("Please input the total number of movies to be recommended: ")
number = int(number)
print("Number of Movies Being Recommended: ", number)

sdata = df[(df['Cust_Id'] == customerID)]
print(sdata)
sdata = sdata.set_index('Movie_Id')
def train(params):
    # Load the training data and create a df for training
    temp_df = read_dataframe(params.client_id, params.source_bucket, 'buy.csv')
    raw_df = pd.DataFrame(
        data={
            'entity_id': temp_df['entity_id'],
            'target_entity_id': temp_df['target_entity_id']
        })
    raw_df['rating'] = MAX_RATING

    # create the training set
    reader = Reader(rating_scale=(0, MAX_RATING))
    data = Dataset.load_from_df(
        raw_df[['entity_id', 'target_entity_id', 'rating']], reader)
    training_data = data.build_full_trainset()

    # Find optimal parameters
    print(' --> fitting the model')
    param_grid = {
        'n_epochs': [10, 30],
        'lr_all': [0.002, 0.005],
        'reg_all': [0.2, 0.6]
    }
    gs = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3)
    gs.fit(data)
    print(gs.best_score['rmse'])
    print(gs.best_params['rmse'])

    # Build a model, and train it
    print(' --> build the model')
    # model = SVD()
    model = gs.best_estimator['rmse']
    model.fit(training_data)

    # Batch predictions
    print(' --> batch predictions')
    unique_entity = np.unique(raw_df.entity_id.values)
    unique_target_entity = np.unique(raw_df.target_entity_id.values)
    px = pd.DataFrame(-1.0, index=unique_entity, columns=unique_target_entity,
                      dtype=np.float64)
    predx = training_data.build_anti_testset(fill=0)
    for p in predx:
        # build_anti_testset() yields raw ids, and predict() expects raw ids,
        # so they are passed through directly (no to_inner_* conversion).
        pred = model.predict(p[0], p[1])
        px.at[p[0], p[1]] = round(pred.est, PRECISION)

    # create the export
    print(' --> create export')
    ex = pd.DataFrame(index=unique_entity,
                      columns=['entity_type', 'target_entity_type', 'values'])
    for id in unique_entity:
        p1 = px.loc[id, :]
        p2 = p1.sort_values(ascending=False)
        p3a = p2[p2 > FILTER_THRESHOLD]
        p3 = p3a[p3a < 1.0].head(MAX_PREDICTION)
        t = zip(p3.index.tolist(), p3.values)
        tf = [item for sublist in t for item in sublist]
        ex.at[id, 'entity_type'] = 'user'
        ex.at[id, 'target_entity_type'] = 'item'
        ex.at[id, 'values'] = tf

    write_dataframe(params.job_id, params.job_dir, 'pred_user.csv',
                    ['entity_type', 'target_entity_type', 'values'],
                    'entity_id', ex)
from surprise import NMF
from surprise import Dataset, Reader
from surprise.model_selection import cross_validate

# Load the sushi preference dataset (converted to user-item-rating format)
reader = Reader(line_format='user item rating', sep=' ')
dataset = Dataset.load_from_file(
    './sushi3-2016/sushi3b.5000.10.score_converted', reader=reader)
trainset = dataset.build_full_trainset()

# We'll use the famous NMF algorithm.
algo = NMF()
algo.fit(trainset)

# Run 5-fold cross-validation and print results
cross_validate(algo, dataset, measures=['RMSE', 'MAE'], cv=5, verbose=True)
import os
import pickle

from surprise import KNNBaseline, Reader
from surprise import Dataset

# Rebuild the playlist-id -> playlist-name mapping dictionary
id_name_dic = pickle.load(open("popular_playlist.pk1", "rb"))
print("Loaded the playlist-id -> playlist-name mapping dictionary....")
# Rebuild the playlist-name -> playlist-id mapping dictionary
name_id_dic = {}
for playlist_id in id_name_dic:
    name_id_dic[id_name_dic[playlist_id]] = playlist_id
print("Loaded the playlist-name -> playlist-id mapping dictionary.....")

file_path = os.path.expanduser("./popular_music_suprise_format.txt")
# Specify the file format
reader = Reader(line_format="user item rating timestamp", sep=',')
# Read the data from the file
music_data = Dataset.load_from_file(file_path, reader=reader)

# Compute similarities between playlists
print("Building the dataset......")
trainset = music_data.build_full_trainset()

# Template: find the nearest users (playlists)
print("Training the model.....")
algo = KNNBaseline()
algo.fit(trainset)  # fit() replaces the deprecated train()

current_playlist = list(name_id_dic.keys())[39]
print("Playlist name:", current_playlist)

# Get the nearest neighbours
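# A continuation sketch (added, not in the original excerpt): the comment above says
# "get the nearest neighbours", so this shows the usual KNNBaseline pattern — convert
# the raw playlist id to an inner id, query get_neighbors, and map back to names.
# The choice of k=10 is illustrative.
playlist_raw_id = name_id_dic[current_playlist]
playlist_inner_id = trainset.to_inner_uid(playlist_raw_id)
neighbor_inner_ids = algo.get_neighbors(playlist_inner_id, k=10)
neighbor_names = [id_name_dic[trainset.to_raw_uid(inner_id)]
                  for inner_id in neighbor_inner_ids]
print("The 10 playlists most similar to", current_playlist, ":")
for name in neighbor_names:
    print(name)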