def test_deprecated_way():
    """Test all Dataset constructors without passing rating_scale as a
    parameter. Make sure we revert back to the Reader object, with a warning
    message.

    Also, make sure ValueError is raised if reader has no rating_scale in
    this context.

    Not using dataset fixtures here for more control.
    """

    # test load_from_file
    toy_data_path = (os.path.dirname(os.path.realpath(__file__)) +
                     '/custom_dataset')
    with pytest.warns(UserWarning):
        reader = Reader(line_format='user item rating', sep=' ',
                        skip_lines=3, rating_scale=(1, 5))
        data = Dataset.load_from_file(file_path=toy_data_path, reader=reader)

    with pytest.raises(ValueError):
        reader = Reader(line_format='user item rating', sep=' ',
                        skip_lines=3, rating_scale=None)
        data = Dataset.load_from_file(file_path=toy_data_path, reader=reader)

    # test load_from_folds
    train_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_train')
    test_file = os.path.join(os.path.dirname(__file__), './u1_ml100k_test')
    with pytest.warns(UserWarning):
        reader = Reader(line_format='user item rating timestamp', sep='\t',
                        rating_scale=(1, 5))
        data = Dataset.load_from_folds([(train_file, test_file)],
                                       reader=reader)

    with pytest.raises(ValueError):
        reader = Reader(line_format='user item rating timestamp', sep='\t',
                        rating_scale=None)
        data = Dataset.load_from_folds([(train_file, test_file)],
                                       reader=reader)

    # test load_from_df
    ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                    'userID': [9, 32, 2, 45, '10000'],
                    'rating': [3, 2, 4, 3, 1]}
    df = pd.DataFrame(ratings_dict)

    with pytest.warns(UserWarning):
        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                                    reader=reader)

    with pytest.raises(ValueError):
        reader = Reader(rating_scale=None)
        data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],  # noqa
                                    reader=reader)
def test_zero_rating_canary():

    ratings_dict = {'itemID': [0, 0, 0, 0, 1, 1],
                    'userID': [0, 1, 2, 3, 3, 4],
                    'rating': [-10, 10, 0, -5, 0, 5]}
    df = pd.DataFrame(ratings_dict)
    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                                rating_scale=(-10, 10))
    trainset = data.build_full_trainset()

    # test ur and ir fields. Kind of OK, but the purpose of the test is
    # precisely to test what would happen if we removed them...
    assert trainset.ir[0] == [(0, -10), (1, 10), (2, 0), (3, -5)]
    assert trainset.ir[1] == [(3, 0), (4, 5)]

    assert trainset.ur[0] == [(0, -10)]
    assert trainset.ur[1] == [(0, 10)]
    assert trainset.ur[2] == [(0, 0)]
    assert trainset.ur[3] == [(0, -5), (1, 0)]
    assert trainset.ur[4] == [(1, 5)]
    print(trainset.ur)

    # ... so also test all_ratings which should be more reliable.
    all_ratings = list(trainset.all_ratings())
    assert (0, 0, -10) in all_ratings
    assert (1, 0, 10) in all_ratings
    assert (2, 0, 0) in all_ratings
    assert (3, 0, -5) in all_ratings
    assert (3, 1, 0) in all_ratings
    assert (4, 1, 5) in all_ratings
def test_load_from_df():
    """Ensure reading dataset from pandas dataframe is OK."""

    # DF creation.
    ratings_dict = {'itemID': [1, 1, 1, 2, 2],
                    'userID': [9, 32, 2, 45, '10000'],
                    'rating': [3, 2, 4, 3, 1]}
    df = pd.DataFrame(ratings_dict)

    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                                rating_scale=(1, 5))

    # Assert split and folds can be used without problems
    with pytest.warns(UserWarning):
        data.split(2)
        assert sum(1 for _ in data.folds()) == 2

    # assert users and items are correctly mapped
    trainset = data.build_full_trainset()
    assert trainset.knows_user(trainset.to_inner_uid(9))
    assert trainset.knows_user(trainset.to_inner_uid('10000'))
    assert trainset.knows_item(trainset.to_inner_iid(2))

    # assert r(9, 1) = 3 and r(2, 1) = 4
    uid9 = trainset.to_inner_uid(9)
    uid2 = trainset.to_inner_uid(2)
    iid1 = trainset.to_inner_iid(1)
    assert trainset.ur[uid9] == [(iid1, 3)]
    assert trainset.ur[uid2] == [(iid1, 4)]

    # mess up the column ordering and assert that users are not correctly
    # mapped
    data = Dataset.load_from_df(df[['rating', 'itemID', 'userID']],
                                rating_scale=(1, 5))
    trainset = data.build_full_trainset()
    with pytest.raises(ValueError):
        trainset.to_inner_uid('10000')
def collaborative(self, ratings, user_id):
    reader = Reader()
    temp_ratings = ratings
    data = Dataset.load_from_df(
        temp_ratings[['user_id', 'book_id', 'rating']], reader)
    data.split(n_folds=2)

    ## Training the data ##
    # Note: data.split() and evaluate() belong to the pre-1.1 Surprise API
    # and are deprecated in newer releases.
    svd = SVD()
    evaluate(svd, data, measures=['RMSE', 'MAE'])

    trainset = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)

    ## Testing the data ##
    # Predict every (user, item) pair absent from the trainset and append the
    # estimates for the target user to the ratings frame.
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    count = 0
    for uid, iid, true_r, est, _ in predictions:
        if uid == user_id:
            count = count + 1
            temp_ratings.loc[len(temp_ratings) + 1] = [uid, iid, est]

    cb = temp_ratings[temp_ratings['user_id'] == user_id][['book_id', 'rating']]
    return cb
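# The evaluate() helper and Dataset.split() used above were removed from
# Surprise in the 1.1 release.  A minimal sketch of the equivalent check with
# the current model_selection API, using a small made-up ratings frame as a
# stand-in for the real data:
import pandas as pd
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate

ratings_df = pd.DataFrame({'user_id': [1, 1, 2, 2, 3, 3],
                           'book_id': [10, 11, 10, 12, 11, 12],
                           'rating': [4, 3, 5, 2, 4, 5]})
data = Dataset.load_from_df(ratings_df[['user_id', 'book_id', 'rating']],
                            Reader(rating_scale=(1, 5)))

# 2-fold cross-validation, mirroring data.split(n_folds=2) + evaluate(...).
cross_validate(SVD(), data, measures=['RMSE', 'MAE'], cv=2, verbose=True)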
def test_build_anti_testset():
    ratings_dict = {'itemID': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                    'userID': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                    'rating': [1, 2, 3, 4, 5, 6, 7, 8, 9]}
    df = pd.DataFrame(ratings_dict)

    data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']],
                                rating_scale=(1, 5))
    with pytest.warns(UserWarning):
        data.split(2)
        trainset, __testset = next(data.folds())

    # fill with some specific value
    for fillvalue in (0, 42., -1):
        anti = trainset.build_anti_testset(fill=fillvalue)
        for (u, i, r) in anti:
            assert r == fillvalue

    # fill with global_mean
    anti = trainset.build_anti_testset(fill=None)
    for (u, i, r) in anti:
        assert r == trainset.global_mean

    expect = trainset.n_users * trainset.n_items
    assert trainset.n_ratings + len(anti) == expect
def Cal_Knn(user_id):
    user = Usert.objects.all()
    travel = Travel.objects.all()
    travels = travel.values('tourid', 'city', 'town', 'site',
                            'genre1', 'genre2', 'genre3')

    # Example of converting a queryset to a DataFrame:
    # qs = SomeModel.objects.select_related().filter(date__year=2012)
    # q = qs.values('date', 'OtherField')
    # df = pd.DataFrame.from_records(q)

    # 1. raw dataset
    rate = Treview.objects.all()
    rates = rate.values('treview_no', 'user_no', 'placeid', 'rating', 'udate')
    rating = pd.DataFrame.from_records(rates)
    rating.drop('treview_no', axis=1, inplace=True)
    rating.drop('udate', axis=1, inplace=True)

    # critic (user), title (item), rating
    rating['user_no'].value_counts()
    rating['placeid'].value_counts()

    # visited vs. not visited
    tab = pd.crosstab(rating['user_no'], rating['placeid'])

    # group the remaining ratings by the two grouping variables
    rating_g = rating.groupby(['user_no', 'placeid'])
    tab = rating_g.sum().unstack()  # reshape into a user x place matrix
    # e.g. places user 2 has not visited: 1, 15, 39, ...

    # 2. build the rating dataset
    reader = Reader(rating_scale=(0.5, 5))  # rating range
    # the `rating` dataframe is interpreted with the reader's rating scale
    data = Dataset.load_from_df(df=rating, reader=reader)

    # 3. train / test set
    train = data.build_full_trainset()  # training set
    test = train.build_testset()        # test set

    # 4. build the model
    option = {'name': 'pearson'}
    model = surprise.KNNBaseline(sim_options=option)
    model.fit(train)

    # 5. target user id (input)
    # user_id = 1  # user to recommend for
    item_ids = range(0, 2106)  # range of placeid values
    actual_rating = 0  # rating
    predict_result = []
    for item_id in item_ids:
        if actual_rating not in tab:
            actual_rating = 0
        a = model.predict(user_id, item_id, actual_rating)
        predict_result.append(a)

    ddff = pd.DataFrame(predict_result)

    # top 5 recommended destinations for the user
    result = ddff.sort_values(by='est', ascending=False)[:5]
    return result
import pandas as pd

from surprise import accuracy
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate

# Use movielens-100K
# data = Dataset.load_builtin('ml-100k')

df = pd.read_csv("tr_mini_1.csv")
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['user_id', 'business_id', 'rating']], reader)
trainset, testset = train_test_split(data, test_size=.15)

# NOTE: load_from_df expects three columns (user, item, rating); this call
# will fail with only two columns selected.
pred = Dataset.load_from_df(
    pd.read_csv("tr_mini_2.csv")[['user_id', 'business_id']], reader)
# reader = Reader(line_format='user item rating', sep=',',
#                 rating_scale=(0, 5), skip_lines=1)
# data = Dataset.load_from_file('tr_mini_2.csv', reader=reader)

print("About to start")

# ----- SVD ----- #
param_grid = {'n_factors': [160, 200, 250],
              'n_epochs': [70, 90, 110],
              'lr_all': [0.003, 0.005],
              'reg_all': [0.2]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3,
                  joblib_verbose=2, n_jobs=7)
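# The grid search above is constructed but never run in this snippet.  A
# minimal sketch of the remaining steps, assuming the `gs`, `data`,
# `trainset` and `testset` objects defined above:
gs.fit(data)

print(gs.best_score['rmse'])   # best RMSE over the 3 CV folds
print(gs.best_params['rmse'])  # parameter combination that achieved it

# Refit the best estimator on the training split and score the hold-out set.
best_algo = gs.best_estimator['rmse']
best_algo.fit(trainset)
accuracy.rmse(best_algo.test(testset))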
from steven.steven_baselines import MeanOfMeans

FILE_DIRECTORY = os.path.split(os.path.realpath(__file__))[0]
DATA_DIRECTORY = os.path.join(
    os.path.split(FILE_DIRECTORY)[0], 'data', 'movies')

if __name__ == "__main__":
    # Read data
    df = pd.read_csv(os.path.join(DATA_DIRECTORY, 'ratings.csv'))

    # Drop unneeded column 'timestamp'
    df.drop('timestamp', axis=1, inplace=True)

    # Load the data into the surprise format
    reader = Reader()
    data = Dataset.load_from_df(df, reader=reader)

    # Train ALS model
    print('Using ALS')
    bsl_options = {'method': 'als',
                   'n_epochs': 5,
                   'reg_u': 12,
                   'reg_i': 5}
    trainset, testset = train_test_split(data, test_size=0.25)
    algo = BaselineOnly(bsl_options=bsl_options)
    predictions = algo.fit(trainset).test(testset)

    # Get the RMSE of our predictions
    rmse = accuracy.rmse(predictions)

    # Get the cross-validated RMSE of our predictions
    cv_results = cross_validate(algo, data)
    cv_rmse = cv_results['test_rmse'].mean()
    print(f'CV RMSE: {cv_rmse}')
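    # For comparison, the baseline coefficients can also be fit with SGD
    # instead of ALS -- a minimal sketch reusing the train/test split above;
    # the option values are illustrative, not tuned:
    print('Using SGD')
    sgd_options = {'method': 'sgd',
                   'learning_rate': 0.005,
                   'n_epochs': 20}
    sgd_algo = BaselineOnly(bsl_options=sgd_options)
    sgd_predictions = sgd_algo.fit(trainset).test(testset)
    accuracy.rmse(sgd_predictions)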
def get_EDA_page(): if not os.path.isfile("Data/NetflixRatings.csv"): startTime = datetime.now() data = open("Data/NetflixRatings.csv", mode="w") files = ['Data/combined_data_4.txt'] for file in files: print("Reading from file: " + str(file) + "...") with open(file) as f: for line in f: line = line.strip() if line.endswith(":"): movieID = line.replace(":", "") else: row = [] row = [ x for x in line.split(",") ] #custID, rating and date are separated by comma row.insert(0, movieID) data.write(",".join(row)) data.write("\n") print("Reading of file: " + str(file) + " is completed\n") data.close() print("Total time taken for execution of this code = " + str(datetime.now() - startTime)) else: print("data is already loaded") # creating data frame from our output csv file. if not os.path.isfile("Data/NetflixData.pkl"): startTime = datetime.now() Final_Data = pd.read_csv( "Data/NetflixRatings.csv", sep=",", names=["MovieID", "CustID", "Ratings", "Date"]) Final_Data["Date"] = pd.to_datetime(Final_Data["Date"]) Final_Data.sort_values(by="Date", inplace=True) print("Time taken for execution of above code = " + str(datetime.now() - startTime)) st.write("data frame created") else: print("data frame already present") # storing pandas dataframe as a picklefile for later use if not os.path.isfile("Data/NetflixData.pkl"): Final_Data.to_pickle("Data/NetflixData.pkl") st.write("pkl created") else: Final_Data = pd.read_pickle("Data/NetflixData.pkl") print("pkl already present") if st.checkbox("Show Final_Data"): st.write(Final_Data) if st.checkbox("Show all the column Names"): st.write(Final_Data.columns) ######## if st.checkbox("Show size of dataset"): if st.checkbox("Show row size"): st.write(Final_Data.shape[0]) if st.checkbox("Show column size"): st.write(Final_Data.shape[1]) if st.checkbox("Show complete dataset size"): st.write(Final_Data.shape) if st.checkbox("Show desc of Ratings in final data"): Final_Data.describe()["Ratings"] st.write("**displaying final dataset header lines using area chart**") st.area_chart(Final_Data) print("Number of NaN values = " + str(Final_Data.isnull().sum())) duplicates = Final_Data.duplicated(["MovieID", "CustID", "Ratings"]) print("Number of duplicate rows = " + str(duplicates.sum())) ##### if st.checkbox("Show unique customer & movieId in Total Data:"): st.write("Total number of movie ratings = ", str(Final_Data.shape[0])) st.write("Number of unique users = ", str(len(np.unique(Final_Data["CustID"])))) st.write("Number of unique movies = ", str(len(np.unique(Final_Data["MovieID"])))) ######### creating pkl file if not os.path.isfile("Data/TrainData.pkl"): Final_Data.iloc[:int(Final_Data.shape[0] * 0.80)].to_pickle("Data/TrainData.pkl") Train_Data = pd.read_pickle("Data/TrainData.pkl") Train_Data.reset_index(drop=True, inplace=True) else: Train_Data = pd.read_pickle("Data/TrainData.pkl") Train_Data.reset_index(drop=True, inplace=True) if not os.path.isfile("Data/TestData.pkl"): Final_Data.iloc[int(Final_Data.shape[0] * 0.80):].to_pickle("Data/TestData.pkl") Test_Data = pd.read_pickle("Data/TestData.pkl") Test_Data.reset_index(drop=True, inplace=True) else: Test_Data = pd.read_pickle("Data/TestData.pkl") Test_Data.reset_index(drop=True, inplace=True) ######### if st.checkbox("Showing dataset of Train_Data & Test_Data"): st.area_chart(Train_Data) st.area_chart(Test_Data) if st.checkbox("Show unique customer & movieId in Train DataSet:"): st.write("Total number of movie ratings in train data = ", str(Train_Data.shape[0])) st.write("Number of unique users in train data = ", 
str(len(np.unique(Train_Data["CustID"])))) st.write("Number of unique movies in train data = ", str(len(np.unique(Train_Data["MovieID"])))) st.write("Highest value of a User ID = ", str(max(Train_Data["CustID"].values))) st.write("Highest value of a Movie ID = ", str(max(Train_Data["MovieID"].values))) if st.checkbox("Show unique customer & movieId in Test DataSet:"): st.write("Total number of movie ratings in Test data = ", str(Test_Data.shape[0])) st.write("Number of unique users in Test data = ", str(len(np.unique(Test_Data["CustID"])))) st.write("Number of unique movies in trTestain data = ", str(len(np.unique(Test_Data["MovieID"])))) st.write("Highest value of a User ID = ", str(max(Test_Data["CustID"].values))) st.write("Highest value of a Movie ID = ", str(max(Test_Data["MovieID"].values))) ########## def changingLabels(number): return str(number / 10**6) + "M" plt.figure(figsize=(12, 8)) ax = sns.countplot(x="Ratings", data=Train_Data) ax.set_yticklabels([changingLabels(num) for num in ax.get_yticks()]) plt.tick_params(labelsize=15) plt.title("Distribution of Ratings in train data", fontsize=20) plt.xlabel("Ratings", fontsize=20) plt.ylabel("Number of Ratings(Millions)", fontsize=20) st.pyplot() st.write( "This graph will show how **Distribution of Ratings** which shows the overall maturity level of the whole series and is provided by the audience :smile: " ) Train_Data["DayOfWeek"] = Train_Data.Date.dt.weekday_name plt.figure(figsize=(10, 8)) ax = Train_Data.resample("M", on="Date")["Ratings"].count().plot() ax.set_yticklabels([changingLabels(num) for num in ax.get_yticks()]) ax.set_title("Number of Ratings per Month", fontsize=20) ax.set_xlabel("Date", fontsize=20) ax.set_ylabel("Number of Ratings Per Month(Millions)", fontsize=20) plt.tick_params(labelsize=15) st.pyplot() st.write( "This Graph will represents the **Number of Ratings Per Month** means counts of ratings grouped by months :smile:" ) st.write("**Analysis of Ratings given by user**") no_of_rated_movies_per_user = Train_Data.groupby( by="CustID")["Ratings"].count().sort_values(ascending=False) fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 7)) sns.kdeplot(no_of_rated_movies_per_user.values, shade=True, ax=axes[0]) axes[0].set_title("Fig1", fontsize=18) axes[0].set_xlabel("Number of Ratings by user", fontsize=18) axes[0].tick_params(labelsize=15) sns.kdeplot(no_of_rated_movies_per_user.values, shade=True, cumulative=True, ax=axes[1]) axes[1].set_title("Fig2", fontsize=18) axes[1].set_xlabel("Number of Ratings by user", fontsize=18) axes[1].tick_params(labelsize=15) fig.subplots_adjust(wspace=2) plt.tight_layout() st.pyplot() #### st.write( "Above fig1 graph shows that almost all of the users give very few ratings. 
There are very **few users who's ratings count is high** .Similarly, above fig2 graph shows that **almost 99% of users give very few ratings**" ) quantiles = no_of_rated_movies_per_user.quantile(np.arange(0, 1.01, 0.01)) fig = plt.figure(figsize=(10, 6)) axes = fig.add_axes([0.1, 0.1, 1, 1]) axes.set_title("Quantile values of Ratings Per User", fontsize=20) axes.set_xlabel("Quantiles", fontsize=20) axes.set_ylabel("Ratings Per User", fontsize=20) axes.plot(quantiles) plt.scatter(x=quantiles.index[::5], y=quantiles.values[::5], c="blue", s=70, label="quantiles with 0.05 intervals") plt.scatter(x=quantiles.index[::25], y=quantiles.values[::25], c="red", s=70, label="quantiles with 0.25 intervals") plt.legend(loc='upper left', fontsize=20) for x, y in zip(quantiles.index[::25], quantiles.values[::25]): plt.annotate(s='({},{})'.format(x, y), xy=(x, y), fontweight='bold', fontsize=16, xytext=(x - 0.05, y + 180)) axes.tick_params(labelsize=15) st.pyplot() st.write("this graph shows the Quantile values of Ratings Per User") st.write("**Analysis of Ratings Per Movie** :smile:") no_of_ratings_per_movie = Train_Data.groupby( by="MovieID")["Ratings"].count().sort_values(ascending=False) fig = plt.figure(figsize=(12, 6)) axes = fig.add_axes([0.1, 0.1, 1, 1]) plt.title("Number of Ratings Per Movie", fontsize=20) plt.xlabel("Movie", fontsize=20) plt.ylabel("Count of Ratings", fontsize=20) plt.plot(no_of_ratings_per_movie.values) plt.tick_params(labelsize=15) axes.set_xticklabels([]) st.pyplot() st.write( "This graph shows the number of rating(in count) each movie achieved by the audience, which clearly shows that there are some movies which are very popular and were rated by many users as comapared to other movies " ) st.write("**Analysis of Movie Ratings on Day of Week** :smile:") fig = plt.figure(figsize=(12, 8)) axes = sns.countplot(x="DayOfWeek", data=Train_Data) axes.set_title("Day of week VS Number of Ratings", fontsize=20) axes.set_xlabel("Day of Week", fontsize=20) axes.set_ylabel("Number of Ratings", fontsize=20) axes.set_yticklabels([changingLabels(num) for num in ax.get_yticks()]) axes.tick_params(labelsize=15) st.pyplot() st.write( "This graph will show Analysis of Movie Ratings on Day of Week in bar graph format ,here clearly visible that on sturday & sunday users are least interested in providing ratings " ) fig = plt.figure(figsize=(12, 8)) axes = sns.boxplot(x="DayOfWeek", y="Ratings", data=Train_Data) axes.set_title("Day of week VS Number of Ratings", fontsize=20) axes.set_xlabel("Day of Week", fontsize=20) axes.set_ylabel("Number of Ratings", fontsize=20) axes.tick_params(labelsize=15) st.pyplot() st.write( "This graph will show Analysis of Movie Ratings on Day of Week in box plot format ,here clearly visible that on sturday & sunday users are least interested in providing ratings " ) average_ratings_dayofweek = Train_Data.groupby( by="DayOfWeek")["Ratings"].mean() st.write("**Average Ratings on Day of Weeks**") st.write(average_ratings_dayofweek) st.write( "**This Average Ratings on Day of Weeks will represented in graphical format** " ) st.area_chart(average_ratings_dayofweek) st.write( "this graph represents that average rating is mostly lies between 3 to 4." 
) st.write("**Distribution of Movie ratings amoung Users**") plt.scatter(Test_Data["CustID"], Test_Data["MovieID"]) st.pyplot() ####################Creating USER-ITEM sparse matrix from data frame startTime = datetime.now() print("Creating USER_ITEM sparse matrix for train Data") if os.path.isfile("Data/TrainUISparseData.npz"): print( "Sparse Data is already present in your disk, no need to create further. Loading Sparse Matrix" ) TrainUISparseData = sparse.load_npz("Data/TrainUISparseData.npz") print("Shape of Train Sparse matrix = " + str(TrainUISparseData.shape)) else: print("We are creating sparse data") TrainUISparseData = sparse.csr_matrix( (Train_Data.Ratings, (Train_Data.CustID, Train_Data.MovieID))) print("Creation done. Shape of sparse matrix = " + str(TrainUISparseData.shape)) print("Saving it into disk for furthur usage.") sparse.save_npz("Data/TrainUISparseData.npz", TrainUISparseData) print("Done\n") print(datetime.now() - startTime) ###############Creating USER-ITEM sparse matrix from data frame for test data startTime = datetime.now() print("Creating USER_ITEM sparse matrix for test Data") if os.path.isfile("Data/TestUISparseData.npz"): print( "Sparse Data is already present in your disk, no need to create further. Loading Sparse Matrix" ) TestUISparseData = sparse.load_npz("Data/TestUISparseData.npz") print("Shape of Test Sparse Matrix = " + str(TestUISparseData.shape)) else: print("We are creating sparse data") TestUISparseData = sparse.csr_matrix( (Test_Data.Ratings, (Test_Data.CustID, Test_Data.MovieID))) print("Creation done. Shape of sparse matrix = " + str(TestUISparseData.shape)) print("Saving it into disk for furthur usage.") sparse.save_npz("Data/TestUISparseData.npz", TestUISparseData) print("Done\n") print(datetime.now() - startTime) rows, cols = TrainUISparseData.shape presentElements = TrainUISparseData.count_nonzero() print("Sparsity Of Train matrix : {}% ".format( (1 - (presentElements / (rows * cols))) * 100)) rows, cols = TestUISparseData.shape presentElements = TestUISparseData.count_nonzero() print("Sparsity Of Test matrix : {}% ".format( (1 - (presentElements / (rows * cols))) * 100)) #################Finding Global average of all movie ratings, Average rating per user, and Average rating per movie def getAverageRatings(sparseMatrix, if_user): ax = 1 if if_user else 0 #axis = 1 means rows and axis = 0 means columns sumOfRatings = sparseMatrix.sum( axis=ax ).A1 #this will give an array of sum of all the ratings of user if axis = 1 else #sum of all the ratings of movies if axis = 0 noOfRatings = (sparseMatrix != 0).sum( axis=ax ).A1 #this will give a boolean True or False array, and True means 1 and False #means 0, and further we are summing it to get the count of all the non-zero cells means length of non-zero cells rows, cols = sparseMatrix.shape averageRatings = { i: sumOfRatings[i] / noOfRatings[i] for i in range(rows if if_user else cols) if noOfRatings[i] != 0 } return averageRatings Global_Average_Rating = TrainUISparseData.sum( ) / TrainUISparseData.count_nonzero() print("Global Average Rating {}".format(Global_Average_Rating)) AvgRatingUser = getAverageRatings(TrainUISparseData, True) #############Machine Learning Models def get_sample_sparse_matrix(sparseMatrix, n_users, n_movies): startTime = datetime.now() users, movies, ratings = sparse.find(sparseMatrix) uniq_users = np.unique(users) uniq_movies = np.unique(movies) np.random.seed( 15 ) #this will give same random number everytime, without replacement userS = np.random.choice(uniq_users, 
n_users, replace=True) movieS = np.random.choice(uniq_movies, n_movies, replace=True) mask = np.logical_and(np.isin(users, userS), np.isin(movies, movieS)) sparse_sample = sparse.csr_matrix( (ratings[mask], (users[mask], movies[mask])), shape=(max(userS) + 1, max(movieS) + 1)) print("Sparse Matrix creation done. Saving it for later use.") sparse.save_npz(path, sparse_sample) print("Done") print("Shape of Sparse Sampled Matrix = " + str(sparse_sample.shape)) print(datetime.now() - startTime) return sparse_sample ####Creating Sample Sparse Matrix for Train Data path = "Data/TrainUISparseData_Sample.npz" if not os.path.isfile(path): print( "Sample sparse matrix is not present in the disk. We are creating it..." ) train_sample_sparse = get_sample_sparse_matrix(TrainUISparseData, 4000, 400) else: print("File is already present in the disk. Loading the file...") train_sample_sparse = sparse.load_npz(path) print("File loading done.") print("Shape of Train Sample Sparse Matrix = " + str(train_sample_sparse.shape)) ##########Creating Sample Sparse Matrix for Test Data path = "Data/TestUISparseData_Sample.npz" if not os.path.isfile(path): print( "Sample sparse matrix is not present in the disk. We are creating it..." ) test_sample_sparse = get_sample_sparse_matrix(TestUISparseData, 2000, 200) else: print("File is already present in the disk. Loading the file...") test_sample_sparse = sparse.load_npz(path) print("File loading done.") print("Shape of Test Sample Sparse Matrix = " + str(test_sample_sparse.shape)) #####print("Global average of all movies ratings in Train Sample Sparse is {}".format(np.round((train_sample_sparse.sum()/train_sample_sparse.count_nonzero()), 2))) globalAvgMovies = getAverageRatings(train_sample_sparse, False) globalAvgUsers = getAverageRatings(train_sample_sparse, True) ####### Featurizing data for regression problem ###### Featurizing Train Data sample_train_users, sample_train_movies, sample_train_ratings = sparse.find( train_sample_sparse) if os.path.isfile("Data/Train_Regression.csv"): print( "File is already present in your disk. You do not have to prepare it again." ) else: startTime = datetime.now() print("Preparing Train csv file for {} rows".format( len(sample_train_ratings))) with open("Data/Train_Regression.csv", mode="w") as data: count = 0 for user, movie, rating in zip(sample_train_users, sample_train_movies, sample_train_ratings): row = list() row.append(user) #appending user ID row.append(movie) #appending movie ID row.append(train_sample_sparse.sum() / train_sample_sparse.count_nonzero() ) #appending global average rating #----------------------------------Ratings given to "movie" by top 5 similar users with "user"--------------------# similar_users = cosine_similarity(train_sample_sparse[user], train_sample_sparse).ravel() similar_users_indices = np.argsort(-similar_users)[1:] similar_users_ratings = train_sample_sparse[ similar_users_indices, movie].toarray().ravel() top_similar_user_ratings = list( similar_users_ratings[similar_users_ratings != 0][:5]) top_similar_user_ratings.extend( [globalAvgMovies[movie]] * (5 - len(top_similar_user_ratings))) #above line means that if top 5 ratings are not available then rest of the ratings will be filled by "movie" average #rating. Let say only 3 out of 5 ratings are available then rest 2 will be "movie" average rating. 
row.extend(top_similar_user_ratings) #----------------------------------Ratings given by "user" to top 5 similar movies with "movie"------------------#similar_movies = cosine_similarity(train_sample_sparse[:,movie].T, train_sample_sparse.T).ravel() similar_movies_indices = np.argsort(-similar_movies)[1:] similar_movies_ratings = train_sample_sparse[ user, similar_movies_indices].toarray().ravel() top_similar_movie_ratings = list( similar_movies_ratings[similar_movies_ratings != 0][:5]) top_similar_movie_ratings.extend( [globalAvgUsers[user]] * (5 - len(top_similar_movie_ratings))) #above line means that if top 5 ratings are not available then rest of the ratings will be filled by "user" average #rating. Let say only 3 out of 5 ratings are available then rest 2 will be "user" average rating. row.extend(top_similar_movie_ratings) #----------------------------------Appending "user" average, "movie" average & rating of "user""movie"-----------# row.append(globalAvgUsers[user]) row.append(globalAvgMovies[movie]) row.append(rating) #-----------------------------------Converting rows and appending them as comma separated values to csv file------# data.write(",".join(map(str, row))) data.write("\n") count += 1 if count % 2000 == 0: print("Done for {}. Time elapsed: {}".format( count, (datetime.now() - startTime))) print("Total Time for {} rows = {}".format( len(sample_train_ratings), (datetime.now() - startTime))) ################ Train_Reg = pd.read_csv("Data/Train_Regression.csv", names=[ "User_ID", "Movie_ID", "Global_Average", "SUR1", "SUR2", "SUR3", "SUR4", "SUR5", "SMR1", "SMR2", "SMR3", "SMR4", "SMR5", "User_Average", "Movie_Average", "Rating" ]) #Train_Reg.head() ######## Featurizing Test Data #####################3 sample_test_users, sample_test_movies, sample_test_ratings = sparse.find( test_sample_sparse) if os.path.isfile("Data/Test_Regression.csv"): print( "File is already present in your disk. You do not have to prepare it again." ) else: startTime = datetime.now() print("Preparing Test csv file for {} rows".format( len(sample_test_ratings))) with open("Data/Test_Regression.csv", mode="w") as data: count = 0 for user, movie, rating in zip(sample_test_users, sample_test_movies, sample_test_ratings): row = list() row.append(user) #appending user ID row.append(movie) #appending movie ID row.append( train_sample_sparse.sum() / train_sample_sparse.count_nonzero() ) #appending global average rating#-----------------------------Ratings given to "movie" by top 5 similar users with "user"-------------------------# try: similar_users = cosine_similarity( train_sample_sparse[user], train_sample_sparse).ravel() similar_users_indices = np.argsort(-similar_users)[1:] similar_users_ratings = train_sample_sparse[ similar_users_indices, movie].toarray().ravel() top_similar_user_ratings = list( similar_users_ratings[similar_users_ratings != 0][:5]) top_similar_user_ratings.extend( [globalAvgMovies[movie]] * (5 - len(top_similar_user_ratings))) #above line means that if top 5 ratings are not available then rest of the ratings will be filled by "movie" #average rating. Let say only 3 out of 5 ratings are available then rest 2 will be "movie" average rating. 
row.extend(top_similar_user_ratings) #########Cold Start Problem, for a new user or a new movie######### except (IndexError, KeyError): global_average_train_rating = [ train_sample_sparse.sum() / train_sample_sparse.count_nonzero() ] * 5 row.extend(global_average_train_rating) except: raise #-----------------------------Ratings given by "user" to top 5 similar movies with "movie"-----------------------# try: similar_movies = cosine_similarity( train_sample_sparse[:, movie].T, train_sample_sparse.T).ravel() similar_movies_indices = np.argsort(-similar_movies)[1:] similar_movies_ratings = train_sample_sparse[ user, similar_movies_indices].toarray().ravel() top_similar_movie_ratings = list(similar_movies_ratings[ similar_movies_ratings != 0][:5]) top_similar_movie_ratings.extend( [globalAvgUsers[user]] * (5 - len(top_similar_movie_ratings))) #above line means that if top 5 ratings are not available then rest of the ratings will be filled by "user" #average rating. Let say only 3 out of 5 ratings are available then rest 2 will be "user" average rating. row.extend(top_similar_movie_ratings) #########Cold Start Problem, for a new user or a new movie######### except (IndexError, KeyError): global_average_train_rating = [ train_sample_sparse.sum() / train_sample_sparse.count_nonzero() ] * 5 row.extend(global_average_train_rating) except: raise #-----------------------------Appending "user" average, "movie" average & rating of "user""movie"----------------#try: try: row.append(globalAvgUsers[user]) except (KeyError): global_average_train_rating = train_sample_sparse.sum( ) / train_sample_sparse.count_nonzero() row.append(global_average_train_rating) except: raise try: row.append(globalAvgMovies[movie]) except (KeyError): global_average_train_rating = train_sample_sparse.sum( ) / train_sample_sparse.count_nonzero() row.append(global_average_train_rating) except: raise row.append(rating) #------------------------------Converting rows and appending them as comma separated values to csv file-----------# data.write(",".join(map(str, row))) data.write("\n") count += 1 if count % 100 == 0: print("Done for {}. Time elapsed: {}".format( count, (datetime.now() - startTime))) print("Total Time for {} rows = {}".format( len(sample_test_ratings), (datetime.now() - startTime))) Test_Reg = pd.read_csv("Data/Test_Regression.csv", names=[ "User_ID", "Movie_ID", "Global_Average", "SUR1", "SUR2", "SUR3", "SUR4", "SUR5", "SMR1", "SMR2", "SMR3", "SMR4", "SMR5", "User_Average", "Movie_Average", "Rating" ]) #Test_Reg.head() ## ###### Transforming Data for Surprise Models Train_Reg[['User_ID', 'Movie_ID', 'Rating']].head(5) reader = Reader(rating_scale=(1, 5)) data = Dataset.load_from_df(Train_Reg[['User_ID', 'Movie_ID', 'Rating']], reader) trainset = data.build_full_trainset() testset = list( zip(Test_Reg["User_ID"].values, Test_Reg["Movie_ID"].values, Test_Reg["Rating"].values)) error_table = pd.DataFrame(columns=[ "Model", "Train RMSE", "Train MAPE", "Test RMSE", "Test MAPE" ]) model_train_evaluation = dict() model_test_evaluation = dict() def make_table(model_name, rmse_train, mape_train, rmse_test, mape_test): global error_table #All variable assignments in a function store the value in the local symbol table; whereas variable references first look #in the local symbol table, then in the global symbol table, and then in the table of built-in names. Thus, global variables #cannot be directly assigned a value within a function (unless named in a global statement), #although they may be referenced. 
error_table = error_table.append( pd.DataFrame( [[model_name, rmse_train, mape_train, rmse_test, mape_test]], columns=[ "Model", "Train RMSE", "Train MAPE", "Test RMSE", "Test MAPE" ])) error_table.reset_index(drop=True, inplace=True) ###### Utility Functions for Regression Models def error_metrics(y_true, y_pred): rmse = np.sqrt(mean_squared_error(y_true, y_pred)) mape = np.mean(abs((y_true - y_pred) / y_true)) * 100 return rmse, mape def train_test_xgboost(x_train, x_test, y_train, y_test, model_name): startTime = datetime.now() train_result = dict() test_result = dict() clf = xgb.XGBRegressor(n_estimators=100, silent=False, n_jobs=10) clf.fit(x_train, y_train) print("-" * 50) print("TRAIN DATA") y_pred_train = clf.predict(x_train) rmse_train, mape_train = error_metrics(y_train, y_pred_train) print("RMSE = {}".format(rmse_train)) print("MAPE = {}".format(mape_train)) print("-" * 50) train_result = { "RMSE": rmse_train, "MAPE": mape_train, "Prediction": y_pred_train } print("TEST DATA") y_pred_test = clf.predict(x_test) rmse_test, mape_test = error_metrics(y_test, y_pred_test) print("RMSE = {}".format(rmse_test)) print("MAPE = {}".format(mape_test)) print("-" * 50) test_result = { "RMSE": rmse_test, "MAPE": mape_test, "Prediction": y_pred_test } print("Time Taken = " + str(datetime.now() - startTime)) plot_importance(xgb, clf) make_table(model_name, rmse_train, mape_train, rmse_test, mape_test) return train_result, test_result ####################### def plot_importance(model, clf): fig = plt.figure(figsize=(4, 3)) ax = fig.add_axes([0, 0, 1, 1]) model.plot_importance(clf, ax=ax, height=0.3) ax.set_xlabel("F Score", fontsize=20) ax.set_ylabel("Features", fontsize=20) ax.set_title("Feature Importance", fontsize=20) #ax.set_tick_params(labelsize = 15) st.pyplot(fig=fig) #plt.show() #st.plotly_chart(fig,use_container_width=True) ###### Utility Functions for Surprise Models def get_ratings(predictions): actual = np.array([pred.r_ui for pred in predictions]) predicted = np.array([pred.est for pred in predictions]) return actual, predicted #in surprise prediction of every data point is returned as dictionary like this: #"user: 196 item: 302 r_ui = 4.00 est = 4.06 {'actual_k': 40, 'was_impossible': False}" #In this dictionary, "r_ui" is a key for actual rating and "est" is a key for predicted rating def get_error(predictions): actual, predicted = get_ratings(predictions) rmse = np.sqrt(mean_squared_error(actual, predicted)) mape = np.mean(abs((actual - predicted) / actual)) * 100 return rmse, mape my_seed = 15 random.seed(my_seed) np.random.seed(my_seed) def run_surprise(algo, trainset, testset, model_name): startTime = datetime.now() train = dict() test = dict() algo.fit(trainset) #You can check out above function at "https://surprise.readthedocs.io/en/stable/getting_started.html" in #"Train-test split and the fit() method" section #-----------------Evaluating Train Data------------------# print("-" * 50) print("TRAIN DATA") train_pred = algo.test(trainset.build_testset()) #You can check out "algo.test()" function at "https://surprise.readthedocs.io/en/stable/getting_started.html" in #"Train-test split and the fit() method" section #You can check out "trainset.build_testset()" function at "https://surprise.readthedocs.io/en/stable/FAQ.html#can-i-use-my-own-dataset-with-surprise-and-can-it-be-a-pandas-dataframe" in #"How to get accuracy measures on the training set" section train_actual, train_predicted = get_ratings(train_pred) train_rmse, train_mape = get_error(train_pred) print("RMSE = 
{}".format(train_rmse)) print("MAPE = {}".format(train_mape)) print("-" * 50) train = { "RMSE": train_rmse, "MAPE": train_mape, "Prediction": train_predicted } #-----------------Evaluating Test Data------------------# print("TEST DATA") test_pred = algo.test(testset) #You can check out "algo.test()" function at "https://surprise.readthedocs.io/en/stable/getting_started.html" in #"Train-test split and the fit() method" section test_actual, test_predicted = get_ratings(test_pred) test_rmse, test_mape = get_error(test_pred) print("RMSE = {}".format(test_rmse)) print("MAPE = {}".format(test_mape)) print("-" * 50) test = { "RMSE": test_rmse, "MAPE": test_mape, "Prediction": test_predicted } print("Time Taken = " + str(datetime.now() - startTime)) make_table(model_name, train_rmse, train_mape, test_rmse, test_mape) return train, test ## ################## XGBoost 13 Features################### x_train = Train_Reg.drop(["User_ID", "Movie_ID", "Rating"], axis=1) x_test = Test_Reg.drop(["User_ID", "Movie_ID", "Rating"], axis=1) y_train = Train_Reg["Rating"] y_test = Test_Reg["Rating"] train_result, test_result = train_test_xgboost(x_train, x_test, y_train, y_test, "XGBoost_13") model_train_evaluation["XGBoost_13"] = train_result model_test_evaluation["XGBoost_13"] = test_result #################################################### ################### 2. Surprise BaselineOnly Model ################################# bsl_options = {"method": "sgd", "learning_rate": 0.01, "n_epochs": 25} algo = BaselineOnly(bsl_options=bsl_options) #You can check the docs of above used functions at:https://surprise.readthedocs.io/en/stable/prediction_algorithms.html#baseline-estimates-configuration #at section "Baselines estimates configuration". train_result, test_result = run_surprise(algo, trainset, testset, "BaselineOnly") model_train_evaluation["BaselineOnly"] = train_result model_test_evaluation["BaselineOnly"] = test_result ############# 3. XGBoost 13 Features + Surprise BaselineOnly Model #################### Train_Reg["BaselineOnly"] = model_train_evaluation["BaselineOnly"][ "Prediction"] Test_Reg["BaselineOnly"] = model_test_evaluation["BaselineOnly"][ "Prediction"] x_train = Train_Reg.drop(["User_ID", "Movie_ID", "Rating"], axis=1) x_test = Test_Reg.drop(["User_ID", "Movie_ID", "Rating"], axis=1) y_train = Train_Reg["Rating"] y_test = Test_Reg["Rating"] train_result, test_result = train_test_xgboost(x_train, x_test, y_train, y_test, "XGB_BSL") model_train_evaluation["XGB_BSL"] = train_result model_test_evaluation["XGB_BSL"] = test_result ################### 4. 
Surprise KNN-Baseline with User-User and Item-Item Similarity ######### param_grid = { 'sim_options': { 'name': ["pearson_baseline"], "user_based": [True], "min_support": [2], "shrinkage": [60, 80, 80, 140] }, 'k': [5, 20, 40, 80] } gs = GridSearchCV(KNNBaseline, param_grid, measures=['rmse', 'mae'], cv=3) gs.fit(data) # best RMSE score #print(gs.best_score['rmse']) # combination of parameters that gave the best RMSE score #print(gs.best_params['rmse']) ####### Applying KNNBaseline User-User with best parameters ######## sim_options = { 'name': 'pearson_baseline', 'user_based': True, 'min_support': 2, 'shrinkage': gs.best_params['rmse']['sim_options']['shrinkage'] } bsl_options = {'method': 'sgd'} algo = KNNBaseline(k=gs.best_params['rmse']['k'], sim_options=sim_options, bsl_options=bsl_options) train_result, test_result = run_surprise(algo, trainset, testset, "KNNBaseline_User") model_train_evaluation["KNNBaseline_User"] = train_result model_test_evaluation["KNNBaseline_User"] = test_result ########## 4.2 Surprise KNN-Baseline with Item-Item ############# param_grid = { 'sim_options': { 'name': ["pearson_baseline"], "user_based": [False], "min_support": [2], "shrinkage": [60, 80, 80, 140] }, 'k': [5, 20, 40, 80] } gs = GridSearchCV(KNNBaseline, param_grid, measures=['rmse', 'mae'], cv=3) gs.fit(data) # best RMSE score #print(gs.best_score['rmse']) # combination of parameters that gave the best RMSE score #print(gs.best_params['rmse']) ############### Applying KNNBaseline Item-Item with best parameters ###### sim_options = { 'name': 'pearson_baseline', 'user_based': False, 'min_support': 2, 'shrinkage': gs.best_params['rmse']['sim_options']['shrinkage'] } bsl_options = {'method': 'sgd'} algo = KNNBaseline(k=gs.best_params['rmse']['k'], sim_options=sim_options, bsl_options=bsl_options) train_result, test_result = run_surprise(algo, trainset, testset, "KNNBaseline_Item") model_train_evaluation["KNNBaseline_Item"] = train_result model_test_evaluation["KNNBaseline_Item"] = test_result ########### 5. XGBoost 13 Features + Surprise BaselineOnly + Surprise KNN Baseline ############### Train_Reg["KNNBaseline_User"] = model_train_evaluation["KNNBaseline_User"][ "Prediction"] Train_Reg["KNNBaseline_Item"] = model_train_evaluation["KNNBaseline_Item"][ "Prediction"] Test_Reg["KNNBaseline_User"] = model_test_evaluation["KNNBaseline_User"][ "Prediction"] Test_Reg["KNNBaseline_Item"] = model_test_evaluation["KNNBaseline_Item"][ "Prediction"] #st.write(Train_Reg.head()) x_train = Train_Reg.drop(["User_ID", "Movie_ID", "Rating"], axis=1) x_test = Test_Reg.drop(["User_ID", "Movie_ID", "Rating"], axis=1) y_train = Train_Reg["Rating"] y_test = Test_Reg["Rating"] train_result, test_result = train_test_xgboost(x_train, x_test, y_train, y_test, "XGB_BSL_KNN") model_train_evaluation["XGB_BSL_KNN"] = train_result model_test_evaluation["XGB_BSL_KNN"] = test_result ## ######################################################################################################### ################# 6. Matrix Factorization SVD ################################ param_grid = { 'n_factors': [5, 7, 10, 15, 20, 25, 35, 50, 70, 90] } #here, n_factors is the equivalent to dimension 'd' when matrix 'A' #is broken into 'b' and 'c'. So, matrix 'A' will be of dimension n*m. So, matrices 'b' and 'c' will be of dimension n*d and m*d. 
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3) gs.fit(data) # best RMSE score #print(gs.best_score['rmse']) # combination of parameters that gave the best RMSE score #print(gs.best_params['rmse']) ############# Applying SVD with best parameters ################# algo = SVD(n_factors=gs.best_params['rmse']['n_factors'], biased=True, verbose=True) train_result, test_result = run_surprise(algo, trainset, testset, "SVD") model_train_evaluation["SVD"] = train_result model_test_evaluation["SVD"] = test_result ############# 7. Matrix Factorization SVDpp with implicit feedback ############ param_grid = { 'n_factors': [10, 30, 50, 80, 100], 'lr_all': [0.002, 0.006, 0.018, 0.054, 0.10] } gs = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3) gs.fit(data) # best RMSE score #print(gs.best_score['rmse']) # combination of parameters that gave the best RMSE score #print(gs.best_params['rmse']) ########## algo = SVDpp(n_factors=gs.best_params['rmse']['n_factors'], lr_all=gs.best_params['rmse']["lr_all"], verbose=True) train_result, test_result = run_surprise(algo, trainset, testset, "SVDpp") model_train_evaluation["SVDpp"] = train_result model_test_evaluation["SVDpp"] = test_result ############## 8. XGBoost 13 Features + Surprise BaselineOnly + Surprise KNN Baseline + SVD + SVDpp Train_Reg["SVD"] = model_train_evaluation["SVD"]["Prediction"] Train_Reg["SVDpp"] = model_train_evaluation["SVDpp"]["Prediction"] Test_Reg["SVD"] = model_test_evaluation["SVD"]["Prediction"] Test_Reg["SVDpp"] = model_test_evaluation["SVDpp"]["Prediction"] ####### x_train = Train_Reg.drop(["User_ID", "Movie_ID", "Rating"], axis=1) x_test = Test_Reg.drop(["User_ID", "Movie_ID", "Rating"], axis=1) y_train = Train_Reg["Rating"] y_test = Test_Reg["Rating"] train_result, test_result = train_test_xgboost(x_train, x_test, y_train, y_test, "XGB_BSL_KNN_MF") model_train_evaluation["XGB_BSL_KNN_MF"] = train_result model_test_evaluation["XGB_BSL_KNN_MF"] = test_result ########## 9. Surprise KNN Baseline + SVD + SVDpp ################### x_train = Train_Reg[[ "KNNBaseline_User", "KNNBaseline_Item", "SVD", "SVDpp" ]] x_test = Test_Reg[["KNNBaseline_User", "KNNBaseline_Item", "SVD", "SVDpp"]] y_train = Train_Reg["Rating"] y_test = Test_Reg["Rating"] train_result, test_result = train_test_xgboost(x_train, x_test, y_train, y_test, "XGB_KNN_MF") model_train_evaluation["XGB_KNN_MF"] = train_result model_test_evaluation["XGB_KNN_MF"] = test_result ########################### error_table2 = error_table.drop(["Train MAPE", "Test MAPE"], axis=1) error_table2.plot(x="Model", kind="bar", figsize=(14, 8), grid=True, fontsize=15) plt.title("Train and Test RMSE and MAPE of all Models", fontsize=20) plt.ylabel("Error Values", fontsize=20) plt.legend(bbox_to_anchor=(1, 1), fontsize=20) st.pyplot() #plt.show() ######### error_table.drop(["Train MAPE", "Test MAPE"], axis=1).style.highlight_min(axis=0)
group_cols=['userCode', 'project_id'])

train = train.merge(gp, on=['userCode', 'project_id'], how='left')
# print(name_col, train.head(), (1, train[name_col].max()))

# drop duplicates
train = train.drop_duplicates(['userCode', 'project_id'], keep='last')
print(len(train), len(test))

# scale the target column to [0, 1]
scale = train[name_col].max()
train[name_col] = train[name_col].apply(lambda x: x / scale)
print('max: ', train[name_col].max())

reader = Reader(rating_scale=(0, 1))
trainset = Dataset.load_from_df(train[["userCode", "project_id", name_col]],
                                reader)
trainset = trainset.build_full_trainset()

algo = SVD()
algo.fit(trainset)

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)
accuracy.rmse(predictions, verbose=True)


def get_top_n(predictions, n=10):
    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the n highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n
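# Example use of the helper above -- a minimal sketch, assuming the
# `predictions` list produced by algo.test(testset) earlier in this script:
top_n = get_top_n(predictions, n=10)
for user, recs in list(top_n.items())[:3]:
    print(user, [project for (project, _) in recs])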
cursor = conn.cursor()
query = "SELECT [UserId],[RecipeId],[Rating] FROM [Licenta].[dbo].[Favorites]"

cursor.execute("Delete from Recommendations")
conn.commit()

import pandas as pd
from surprise import Dataset
from surprise import Reader

db_data = pd.read_sql(query, conn)
reader = Reader(rating_scale=(1, 5))
ratings = Dataset.load_from_df(db_data[["UserId", "RecipeId", "Rating"]],
                               reader)

import math


def distance(u1, u2, d):
    """Euclidean distance between two users over their co-rated items."""
    ssum = 0
    for r in d[u1]:
        if r in d[u2]:
            ssum += pow(d[u1][r] - d[u2][r], 2)
    if ssum == 0:
        return 0
    return math.sqrt(ssum)


def most_near(u, d, n=10):
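    # most_near presumably ranks the n users closest to u using the
    # distance() helper above; its body is truncated here.  For reference,
    # `d` is a nested dict mapping user -> {recipe: rating}, e.g.
    # d = {1: {10: 4, 11: 5}, 2: {10: 5, 12: 2}}, so that
    # distance(1, 2, d) == math.sqrt((4 - 5) ** 2) == 1.0.
    pass  # placeholder for the truncated function body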
import pandas as pd

path = '../Datasets/BookCrossings'
os.chdir(path)

trans = pd.read_csv('BX-Book-Ratings.csv', sep=';',
                    error_bad_lines=False, encoding="latin-1")
trans.columns = ['user', 'item', 'rating']
trans = trans[trans.rating != 0]

min_item_ratings = 10
popular_items = trans['item'].value_counts() >= min_item_ratings
popular_items = popular_items[popular_items].index.tolist()

min_user_ratings = 10
active_users = trans['user'].value_counts() >= min_user_ratings
active_users = active_users[active_users].index.tolist()

trans = trans[(trans['item'].isin(popular_items)) &
              (trans['user'].isin(active_users))]

reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(trans, reader)
trainset, testset = train_test_split(data, test_size=0.002)

sim_options = {'name': 'pearson', 'user_based': False}
algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)
preds = algo.test(testset)
accuracy.mae(preds)
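# Single predictions use the raw ids from the dataframe -- a minimal sketch
# with made-up user / ISBN values:
pred = algo.predict(uid=276725, iid='034545104X')
print(pred.est)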
from surprise import (SVD, SVDpp, KNNBasic, KNNWithMeans, KNNWithZScore,
                      SlopeOne, BaselineOnly, NormalPredictor)

r = pd.read_csv('ratings.csv')
tr = pd.read_csv('to_read.csv')
b = pd.read_csv('books.csv')
t = pd.read_csv('tags.csv')
bt = pd.read_csv('book_tags.csv')

r.head()
b.head()

# create a reader that takes the rating scale as a parameter
reader = Reader(rating_scale=(1, 5))

# use the load_from_df function to load our book ratings dataframe
data = Dataset.load_from_df(r[['user_id', 'book_id', 'rating']], reader)

# split data into a training set and a test set with an 80/20 ratio
trainset, testset = train_test_split(data, test_size=0.2)

algo_svd = SVD()
algo_svd.fit(trainset)

predictions = algo_svd.test(trainset.build_anti_testset())
predictions_svd = algo_svd.test(testset)
pred_svd = pd.DataFrame(predictions_svd)

r.loc[(r['user_id'] == 27523) & (r['book_id'] == 2203)]
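# A quick check of test-set error for the SVD model above -- a minimal
# sketch, assuming the `accuracy` module is imported alongside the other
# surprise imports:
from surprise import accuracy
accuracy.rmse(predictions_svd)
accuracy.mae(predictions_svd)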
def collaborative_filtering_using_surprise(): """ https://towardsdatascience.com/how-to-build-a-memory-based-recommendation-system-using-python-surprise-55f3257b2cf4 Predict games for user with user_key = 93681 """ target_user_key = 93681 # import reduced dataset: df = import_reduced_reviews() # check for duplicates: duplicates = len(df) - len( df.drop_duplicates(subset=['game_key', 'user_key'])) # drop duplicates: df = df.drop_duplicates(subset=['game_key', 'user_key']) print('duplicates removed: ' + str(duplicates)) # check out our user: df_target_user = df[df['user_key'] == target_user_key] # build utility matrix: # data_pivot = df.pivot(index='user_key', columns='game_key', values='rating') # calculate sparsity # sparsity = data_pivot.isnull().sum().sum() / data_pivot.size # print('Sparcity of utility matrix: ' + str(sparsity)) ### Modelling part with Surprise: # get data in a format surprise can work with: reader = Reader(rating_scale=(1, 10)) data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader) # Split in trainset and testset trainset, testset = train_test_split(data, test_size=0.2) print('Number of users: ', trainset.n_users, '\n') print('Number of items: ', trainset.n_items, '\n') # When surprise creates a Trainset or Testset object, it takes the raw_id’s (the ones that you used in the file # you imported), and converts them to so-called inner_id’s (basically a series of integers, starting from 0). You # might need to trace back to the original names. Using the items as an example (you can do the same approach # with users, just swap iid's with uid's in the code), to get the list of inner_iids, you can use the all_items # method. To convert from raw to inner id you can use the to_inner_iid method, and the to_raw_iid to convert back. # An example on how to save a list of inner and raw item id’s: trainset_iids = list(trainset.all_items()) iid_converter = lambda x: trainset.to_raw_iid(x) trainset_raw_iids = list(map(iid_converter, trainset_iids)) ## Model parameters: of kNN: # Two hyperparameters we can tune: # 1. k parameter # 2. similarity option # a) user-user vs item-item # b) similarity function (cosine, pearson, msd) sim_option = {'name': 'pearson', 'user_based': False} # 3 different KNN Models: KNNBasic, KNNWithMeans, KNNWithZScore k = 40 min_k = 5 algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option) algo.fit(trainset) ## Testing: predictions = algo.test(testset) accuracy.rmse(predictions) # Own similarity matrix: sim_matrix_imported = pd.read_csv( '../Data/Recommender/selfmade_item-item-similarity-matrix.csv', index_col=0) sim_matrix_imported.columns = sim_matrix_imported.columns.astype(int) sim_matrix_imported = sim_matrix_imported.to_numpy() algo.sim = sim_matrix_imported predictions = algo.test(testset) accuracy.rmse(predictions) # Cross validation: skip = True if not skip: results = cross_validate(algo=algo, data=data, measures=['RMSE'], cv=5, return_train_measures=True) results_mean = results['test_rmse'].mean() ## Predictions # Lets assume we are happy with the method and now want to apply it to the entire data set. 
# Estimate for a specific user a specific item: single_item_single_user_prediction = algo.predict(uid=target_user_key, iid=100010, verbose=True) # Estimate all items for a specific user: list_of_all_items = trainset_raw_iids target_predictions = [] for item in list_of_all_items: single_prediction = algo.predict(uid=target_user_key, iid=item) target_predictions.append( (single_prediction.uid, single_prediction.iid, single_prediction.est)) # Then sort the predictions for each user and retrieve the k highest ones: target_predictions.sort(key=lambda x: x[2], reverse=True) n = 20 top_n = target_predictions[:n] top_n = [row[1] for row in top_n] print('end')
def selfmade_approach(): # import reduced dataset: df = import_reduced_reviews( 'C:/Users/lukas/OneDrive/Desktop/Reviews_Reduced.csv') df = df[['user_key', 'game_key', 'rating']] # drop duplicates: df = df.drop_duplicates(subset=['game_key', 'user_key']) ### Modelling part with Surprise: # get data in a format surprise can work with: reader = Reader(rating_scale=(1, 10)) data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader) # Build trainset from the whole dataset: trainsetfull = data.build_full_trainset() print('Number of users: ', trainsetfull.n_users, '\n') print('Number of items: ', trainsetfull.n_items, '\n') # Parameters: sim_option = {'name': 'cosine', 'user_based': False} k = 10 min_k = 5 algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option) # Run fit: start_time = time.time() algo.fit(trainsetfull) print("--- %s seconds ---" % (time.time() - start_time)) # 1st approach: Calculate for a single user contained in dataset: target_user_key = 286189 target_user_info = df[df['user_key'] == target_user_key] # Estimate single game: target_game_key = 100098 # data structures: # sim_matrix = ndarray(312,312) # xr = defaultdict: 312 # yr = defaultdict 8787 # later on replace these by self-written structures xr = algo.xr yr = algo.yr sim_matrix = algo.sim item_means = algo.means inner_target_uid = algo.trainset.to_inner_uid(target_user_key) inner_target_iid = algo.trainset.to_inner_iid(target_game_key) # switch: uid and idd: x = inner_target_uid y = inner_target_iid # pred2: inner_2_raw_item_ids = algo.trainset._raw2inner_id_items # swap keys and values: inner_2_raw_item_ids = dict( (v, k) for k, v in inner_2_raw_item_ids.items()) # similarity matrix with raw ids instead of inner surprise ids: sim_matrix_df = pd.DataFrame(sim_matrix) sim_matrix_df = sim_matrix_df.rename( columns=lambda x: inner_2_raw_item_ids[x]) sim_matrix_df = sim_matrix_df.rename( index=lambda x: inner_2_raw_item_ids[x]) target_user_ratings = yr[x] # convert from inner to raw: target_user_ratings2 = [] for (inner_iid, rating) in target_user_ratings: target_user_ratings2.append((inner_2_raw_item_ids[inner_iid], rating)) # convert item means from inner to raw: item_means2 = {} for i, mean in enumerate(item_means): item_means2[inner_2_raw_item_ids[i]] = mean myKNN = MyKnnWithMeans(sim_matrix=sim_matrix_df, target_user_ratings=target_user_ratings2, item_means=item_means2, k=k, min_k=min_k) pred = myKNN.predict_single_game(user_key=target_user_key, game_key=target_game_key) pred_surprise = algo.predict(uid=inner_target_uid, iid=inner_target_iid) estimate = pred print("Estimate for user %s for game %s is %s" % (target_user_key, target_game_key, estimate)) # Estimate for user not contained in dataset: target_user_key = 123456789 target_game_key = 100098 user_ratings = [ (100284, 7), (100311, 8), (105154, 2), (100020, 4), (100001, 9), (100277, 7), ] myKNN2 = MyKnnWithMeans(sim_matrix_df, user_ratings, item_means2, k, min_k) prediction = myKNN2.predict_single_game(target_user_key, target_game_key) # export similarity matrix: sim_matrix_df.to_csv( '../Data/Recommender/item-item-sim-matrix-surprise.csv') # export item means: export_path = '../Data/Recommender/item-means.json' with open(export_path, 'w') as fp: json.dump(item_means2, fp, sort_keys=False, indent=4) test = sim_matrix_df.loc[100516, 100284] pass
def __init__(self, dataframe=None):
    self.dataframe = dataframe
    self.reader = Reader(rating_scale=(1, 5))
    self.data = Dataset.load_from_df(
        self.dataframe[['user', 'trail_id', 'rating']], self.reader)
    self.fit_model = None
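# A minimal sketch of companion methods such a wrapper might add to populate
# `fit_model`; the method names and the choice of SVD are assumptions, not
# part of the original class:
def fit(self):
    trainset = self.data.build_full_trainset()
    self.fit_model = SVD()  # assumes `from surprise import SVD`
    self.fit_model.fit(trainset)
    return self

def predict_rating(self, user, trail_id):
    # Estimated rating for a (user, trail) pair, using raw ids.
    return self.fit_model.predict(user, trail_id).est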
# IMPORTS
import ccobra
import pandas as pd
from surprise import Dataset, Reader
from surprise import KNNWithMeans

# Ratings
rcols = ['userId', 'movieId', 'rating']
ml_ratings_training = pd.read_csv('../data/final_py_data_training.csv',
                                  usecols=rcols)

# Convert to Surprise Ratings
reader = Reader(rating_scale=(0.5, 5))
surprise_training = Dataset.load_from_df(
    ml_ratings_training, reader=reader).build_full_trainset()

# Train algorithm
i_min_k = 5
i_max_k = 100
sim_options_item = {'name': 'pearson', 'user_based': False}
algo_item = KNNWithMeans(k=i_max_k, min_k=i_min_k,
                         sim_options=sim_options_item)
algo_item.fit(surprise_training)


class item_CF_model(ccobra.CCobraModel):
    def __init__(self, name='Item_CF'):
        super(item_CF_model, self).__init__(
            name, ["recommendation"], ["single-choice"])
def hybrid(userId,train_rd): #get_ipython().magic('matplotlib inline') import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from scipy import stats from ast import literal_eval from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer from sklearn.metrics.pairwise import linear_kernel, cosine_similarity from nltk.stem.snowball import SnowballStemmer from nltk.stem.wordnet import WordNetLemmatizer from nltk.corpus import wordnet from surprise import Reader, Dataset, SVD, evaluate import warnings; warnings.simplefilter('ignore') # In[2]: #Popularity# md = pd.read_csv('CustomData/FinalData.csv') fd = pd.read_csv('avg_ratings1.csv') fd[fd['rating'].notnull()]['rating'] = fd[fd['rating'].notnull()]['rating'].astype('float') vote_averages= fd[fd['rating'].notnull()]['rating'] C = vote_averages.mean() fd1 = pd.read_csv('ratings_count.csv') fd1[fd1['rating'].notnull()]['rating'] = fd1[fd1['rating'].notnull()]['rating'].astype('float') vote_counts = fd1[fd1['rating'].notnull()]['rating'] # In[3]: m = vote_counts.quantile(0.75) # In[4]: md['ratings_count'] = fd1['rating'] md['average_rating'] = fd['rating'] # In[28]: #print(md.shape) qualified = md[(md['ratings_count'].notnull())][['book_id','title', 'authors', 'ratings_count', 'average_rating']] qualified['ratings_count'] = qualified['ratings_count'].astype('float') qualified['average_rating'] = qualified['average_rating'].astype('float') #qualified.shape # In[29]: def weighted_rating(x): v = x['ratings_count'] R = x['average_rating'] return (v/(v+m) * R) + (m/(m+v) * C) # In[30]: qualified['popularity_rating'] = qualified.apply(weighted_rating, axis=1) #qualified['wr'] #qualified = qualified.sort_values('popularity_rating', ascending=False).head(250) pop = qualified[['book_id','popularity_rating']] #print(qualified.shape) #print(pop.shape) # In[11]: ### Collaborative ## reader = Reader() ratings=train_rd #ratings = pd.read_csv('ratings.csv') #ratings.head() temp_ratings = ratings[0:1000] #print(temp_ratings) data = Dataset.load_from_df(temp_ratings[['user_id', 'book_id', 'rating']], reader) data.split(n_folds=2) # In[12]: svd = SVD() evaluate(svd, data, measures=['RMSE', 'MAE']) # In[13]: trainset = data.build_full_trainset() #svd.train(trainset) algo = SVD() algo.fit(trainset) ## usefule = temp_rating[rating] # In[14]: #print(len(temp_ratings[temp_ratings['user_id']==userId])) # In[ ]: def get_top_n(predictions, n=10): '''Return the top-N recommendation for each user from a set of predictions. Args: predictions(list of Prediction objects): The list of predictions, as returned by the test method of an algorithm. n(int): The number of recommendation to output for each user. Default is 10. Returns: A dict where keys are user (raw) ids and values are lists of tuples: [(raw item id, rating estimation), ...] of size n. ''' # First map the predictions to each user. top_n = defaultdict(list) for uid, iid, true_r, est, _ in predictions: top_n[uid].append((iid, est)) # Then sort the predictions for each user and retrieve the k highest ones. 
for uid, user_ratings in top_n.items(): #user_ratings.sort(key=lambda x: x[1], reverse=True) top_n[uid] = user_ratings[:n] return top_n # In[15]: from collections import defaultdict testset = trainset.build_anti_testset() predictions = algo.test(testset) ''' top_n = get_top_n(predictions, n=10000) #print(top_n) #result = pd.DataFrame(top_n) #print(result) for uid, user_ratings in top_n.items(): #print(uid, [iid for (iid , _) in user_ratings]) for uid, iid, true_r, est, _ in predictions: temp_ratings.loc[uid]= [uid,iid,est] #temp_ratings[i]['cf'] = temp_ratings[(temp_ratings['user_id'] == uid)][['book_id']] ''' count = 0 for uid, iid, true_r, est, _ in predictions: if uid == userId: count = count+1 temp_ratings.loc[len(temp_ratings)+1]= [uid,iid,est] #print('here') #print(uid) #temp_ratings.append([uid,iid,est],ignore_index=True) #print(count) #print(temp_ratings) # In[16]: #print(len(temp_ratings[temp_ratings['user_id']==2])) # In[ ]: # In[46]: ##### CONTENT ###### import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from scipy import stats from ast import literal_eval from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer from sklearn.metrics.pairwise import linear_kernel, cosine_similarity from nltk.stem.snowball import SnowballStemmer from nltk.stem.wordnet import WordNetLemmatizer from nltk.corpus import wordnet from surprise import Reader, Dataset, SVD, evaluate import csv import warnings; warnings.simplefilter('ignore') # In[48]: md=pd.read_csv('CustomData/FinalData.csv') rd=train_rd #rd=pd.read_csv('ratings.csv') md['book_id'] = md['book_id'].astype('int') rd['book_id'] = rd['book_id'].astype('int') rd['user_id'] = rd['user_id'].astype('int') rd['rating'] = rd['rating'].astype('int') #print(md.head()) md['authors'] = md['authors'].str.replace(' ','') md['authors'] = md['authors'].str.lower() md['authors'] = md['authors'].str.replace(',',' ') #print(md.head()) md['authors'] = md['authors'].apply(lambda x: [x,x]) #print(md['authors']) md['Genres']=md['Genres'].str.split(';') #print(md['Genres']) md['soup'] = md['authors'] + md['Genres'] #print(md['soup']) md['soup'] = md['soup'].str.join(' ') #md['soup'].fillna({}) #print(md['soup']) count = CountVectorizer(analyzer='word',ngram_range=(1,1),min_df=0, stop_words='english') count_matrix = count.fit_transform(md['soup']) #print (count_matrix.shape) #print np.array(count.get_feature_names()) #print(count_matrix.shape) cosine_sim = cosine_similarity(count_matrix, count_matrix) # In[91]: def build_user_profiles(): user_profiles=np.zeros((53421,999)) #print(rd.iloc[0]['user_id']) #len(rd['book_id']) for i in range(0,1000): u=rd.iloc[i]['user_id'] b=rd.iloc[i]['book_id'] #print(u,b) #print(i) #if b<999: #print("match at "+str(b)) user_profiles[u][b-1]=rd.iloc[i]['rating'] #print(user_profiles) return user_profiles user_profiles=build_user_profiles() def _get_similar_items_to_user_profile(person_id): #Computes the cosine similarity between the user profile and all item profiles #print(user_profiles[person_id]) #print("\n---------\n") #print(cosine_sim[0]) user_ratings = np.empty((999,1)) cnt=0 for i in range(0,998): book_sim=cosine_sim[i] user_sim=user_profiles[person_id] user_ratings[i]=(book_sim.dot(user_sim))/sum(cosine_sim[i]) maxval = max(user_ratings) #print(maxval) for i in range(0,998): user_ratings[i]=((user_ratings[i]*5.0)/(maxval)) #print(user_ratings[i]) if(user_ratings[i]>3): #print("MILA KUCCHHH") cnt+=1 #print(max(user_ratings)) #print (cnt) 
#print(cosine_similarities) #return similar_items return user_ratings content_ratings = _get_similar_items_to_user_profile(userId) # In[100]: num = md[['book_id']] #print(num) num1 = pd.DataFrame(data=content_ratings[0:,0:]) frames = [num, num1] #result = pd.concat([df1, df4], axis=1, join_axes=[df1.index]) mer = pd.concat(frames, axis =1,join_axes=[num.index]) mer.columns=['book_id', 'content_rating'] #print(mer.shape) #print('here') #print(mer) # In[102]: ## for user 2 # #print(temp_ratings.shape) cb = temp_ratings[(temp_ratings['user_id'] == userId)][['book_id', 'rating']] # print(cb.shape) # print(pop.shape) hyb = md[['book_id']] hyb = hyb.merge(cb,on = 'book_id') hyb = hyb.merge(pop, on='book_id') hyb = hyb.merge(mer, on='book_id') #hyb.shape # In[106]: def weighted_rating(x): v = x['rating'] R = x['popularity_rating'] c = x['content_rating'] return 0.4*v + 0.2*R + 0.4 * c # In[107]: print(hyb) hyb['final'] = hyb.apply(weighted_rating, axis=1) hyb = hyb.sort_values('final', ascending=False).head(999) #print(hyb['final']) print(hyb) return hyb
""" This module descibes how to load a dataset from a pandas dataframe. """ from __future__ import (absolute_import, division, print_function, unicode_literals) import pandas as pd from surprise import NormalPredictor from surprise import Dataset from surprise.model_selection import cross_validate # Creation of the dataframe. Column names are irrelevant. ratings_dict = {'itemID': [1, 1, 1, 2, 2], 'userID': [9, 32, 2, 45, 'user_foo'], 'rating': [3, 2, 4, 3, 1]} df = pd.DataFrame(ratings_dict) # The columns must correspond to user id, item id and ratings (in that order). data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], rating_scale=(1, 5)) # We can now use this dataset as we please, e.g. calling cross_validate cross_validate(NormalPredictor(), data, cv=2)
from util import *

user, book, user_test, book_test, rate, user_all, book_all, user_dict, book_dict = read_data()

# Creation of the dataframe. Column names are irrelevant.
ratings_dict = {'itemID': book,
                'userID': user,
                'rating': rate}
df = pd.DataFrame(ratings_dict)

# A reader is still needed but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 10))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

# Models
algos = []
algos_name = []

algos_name.append('BS_ALS')
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 1,
               'reg_i': 5
               }
algos.append(BaselineOnly(bsl_options=bsl_options))

algos_name.append('BS_SGD')
def benchmark_different_algorithms(): # import reduced dataset: df = import_reduced_reviews( 'C:/Users/lukas/OneDrive/Desktop/Reviews_Reduced.csv') # check for duplicates: duplicates = len(df) - len( df.drop_duplicates(subset=['game_key', 'user_key'])) # drop duplicates: df = df.drop_duplicates(subset=['game_key', 'user_key']) print('duplicates removed: ' + str(duplicates)) ## Surprise: reader = Reader(rating_scale=(1, 10)) data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader) results = [] algorithms = [ 'SVD\t\t\t\t\t\t', 'SlopeOne\t\t\t\t', 'CoClustering\t\t\t', 'NMF\t\t\t\t\t\t', 'KNN_Basic Item-Item\t\t', 'KNN_WithMeans Item-Item\t', 'KNN_WithZScore Item-Item', 'KNN_Basic User-User\t\t', 'KNN_WithMeans User-User\t', 'KNN_WithZScore User-User' ] # 1) SVD algo = SVD() results.append( cross_validate(algo, data, measures=['RMSE'], cv=3, return_train_measures=True, n_jobs=-3, verbose=True)) # 2) Slope One algo = SlopeOne() results.append( cross_validate(algo, data, measures=['RMSE'], cv=3, return_train_measures=True, n_jobs=-3, verbose=True)) # 3) CoClustering algo = CoClustering() results.append( cross_validate(algo, data, measures=['RMSE'], cv=3, return_train_measures=True, n_jobs=-3, verbose=True)) # 4) NMF algo = NMF() results.append( cross_validate(algo, data, measures=['RMSE'], cv=3, return_train_measures=True, n_jobs=-3, verbose=True)) ## K-Nearest Neighbors - Item-Item sim_option = {'name': 'cosine', 'user_based': False} k = 40 min_k = 5 # 5) KNNBasic algo = KNNBasic(k=k, min_k=min_k, sim_options=sim_option) results.append( cross_validate(algo, data, measures=['RMSE'], cv=3, return_train_measures=True, n_jobs=-3, verbose=True)) # 6) KNNWithMeans algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option) results.append( cross_validate(algo, data, measures=['RMSE'], cv=3, return_train_measures=True, n_jobs=-3, verbose=True)) # 7) KNNWithZScore algo = KNNWithZScore(k=k, min_k=min_k, sim_options=sim_option) results.append( cross_validate(algo, data, measures=['RMSE'], cv=3, return_train_measures=True, n_jobs=-3, verbose=True)) ## K-Nearest Neighbors - User - User sim_option = {'name': 'cosine', 'user_based': True} k = 100 min_k = 2 # 8) KNNBasic algo = KNNBasic(k=k, min_k=min_k, sim_options=sim_option) results.append( cross_validate(algo, data, measures=['RMSE'], cv=3, return_train_measures=True, n_jobs=-3, verbose=True)) # 9) KNNWithMeans algo = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_option) results.append( cross_validate(algo, data, measures=['RMSE'], cv=3, return_train_measures=True, n_jobs=-3, verbose=True)) # 10) KNNWithZScore algo = KNNWithZScore(k=k, min_k=min_k, sim_options=sim_option) results.append( cross_validate(algo, data, measures=['RMSE'], cv=3, return_train_measures=True, n_jobs=-3, verbose=True)) for algorithm, result in zip(algorithms, results): print(algorithm + '\t \t RMSE Score: \t' + str(result['test_rmse'].mean()) + '\t\t Fit-Time: ' + str(result['fit_time']) + '\t\t Train-Time: ' + str(result['test_time']))
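A small follow-up sketch that tabulates the collected cross-validation results instead of relying on the tab-padded algorithm names above; it assumes the results and algorithms lists exactly as defined in the function.

import pandas as pd

summary = pd.DataFrame({
    'algorithm': [name.strip() for name in algorithms],
    'test_rmse': [res['test_rmse'].mean() for res in results],
    'mean_fit_time_s': [sum(res['fit_time']) / len(res['fit_time']) for res in results],
    'mean_test_time_s': [sum(res['test_time']) / len(res['test_time']) for res in results],
})
print(summary.sort_values('test_rmse'))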
count1[count1 >= 20].index)] #print(ratings_explicit['UserID'].value_counts()) print(ratings_explicit.shape) # #### split the ratings table into taining and testing dataset ratings_train, ratings_test = train_test_split( ratings_explicit, stratify=ratings_explicit['UserID'], test_size=0.30, random_state=0) # # reader = Reader(rating_scale=(1, 5)) data = Dataset.load_from_df(ratings_train[['UserID', 'ISBN', 'Rating']], reader) parameter_grid = {'n_factors': [50, 100, 150, 200, 250, 300]} grid_search = GridSearch(SVD, parameter_grid, measures=['RMSE', 'MAE']) grid_search.evaluate(data) best_parameters = grid_search.best_params print(best_parameters) # best RMSE and MAE score best_result = grid_search.best_score print(best_result) # In[ ]:
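The GridSearch class used above belongs to older Surprise releases; assuming a recent scikit-surprise, a sketch of the equivalent search with surprise.model_selection.GridSearchCV would look like this.

from surprise import SVD
from surprise.model_selection import GridSearchCV

param_grid = {'n_factors': [50, 100, 150, 200, 250, 300]}
grid_search = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
grid_search.fit(data)

print(grid_search.best_params['rmse'])  # best n_factors by RMSE
print(grid_search.best_score['rmse'])   # corresponding RMSE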
def create_similarity_matrix():
    start_time = time.time()

    # import reviews:
    import_path = '../Data/Joined/Results/Reviews_Reduced.csv'
    df = pd.read_csv(import_path)

    # keep only important columns:
    df = df[['game_key', 'user_key', 'rating']]

    # create surprise algorithm object
    sim_option = {'name': 'pearson', 'user_based': False}
    algo = KNNWithMeans(sim_options=sim_option)

    # get data in a format surprise can work with:
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # Build trainset from the whole dataset:
    trainset_full = data.build_full_trainset()
    print('Number of users: ', trainset_full.n_users, '\n')
    print('Number of items: ', trainset_full.n_items, '\n')

    # fit similarity matrix and calculate item means:
    algo.fit(trainset_full)
    print("--- %s seconds ---" % (time.time() - start_time))

    # save similarity matrix and means from algo object to variable
    sim_matrix = algo.sim
    item_means = algo.means

    # convert numpy array to pd df:
    sim_matrix = pd.DataFrame(sim_matrix)

    # replace inner ids with raw ids:
    raw_2_inner_ids = trainset_full._raw2inner_id_items
    # swap keys and values:
    inner_2_raw_item_ids = dict((v, k) for k, v in raw_2_inner_ids.items())

    # replace inner ids in sim_matrix index and columns by game_keys:
    sim_matrix = sim_matrix.rename(index=inner_2_raw_item_ids)
    sim_matrix = sim_matrix.rename(columns=inner_2_raw_item_ids)

    # export sim_matrix:
    sim_matrix.to_csv(
        '../Data/Recommender/item-item-sim-matrix-surprise-Reduced_dataset.csv')

    # convert item means from inner to raw:
    item_means_raw_ids = {}
    for i, mean in enumerate(item_means):
        item_means_raw_ids[inner_2_raw_item_ids[i]] = mean

    # export item means:
    export_path = '../Data/Recommender/item-means-Reduced_dataset.json'
    with open(export_path, 'w') as fp:
        json.dump(item_means_raw_ids, fp, sort_keys=False, indent=4)

    ## create sim matrix in long format:
    # get index as column:
    column_names = list(sim_matrix.columns.values)
    sim_matrix.reset_index(level=0, inplace=True)

    # convert df from wide to long:
    sim_matrix_long = pd.melt(sim_matrix, id_vars='index',
                              value_vars=column_names, var_name='game_key_2')
    # rename returns a new frame, so assign the result back:
    sim_matrix_long = sim_matrix_long.rename(columns={'index': 'game_key'})

    # export long sim matrix:
    sim_matrix_long.to_csv(
        '../Data/Recommender/item-item-sim-matrix-surprise-Reduced_dataset-LONG_FORMAT.csv')

    print("--- %s seconds ---" % (time.time() - start_time))
    print('function end reached')
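If relying on the private _raw2inner_id_items mapping feels fragile, the same inner-to-raw dictionary can be built from the public Trainset API; a short sketch:

# trainset_full.all_items() yields inner item ids; to_raw_iid maps them back
# to the original game_key values.
inner_2_raw_item_ids = {inner_iid: trainset_full.to_raw_iid(inner_iid)
                        for inner_iid in trainset_full.all_items()}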
df_model = df_model[~df_model['userId'].isin(drop_user_list)]
print('After Trim Shape: {}'.format(df_model.shape))
print('-Data Examples-')
df_model.head(5)

"""COLLABORATIVE FILTERING"""

!pip install surprise

from surprise import Reader
from surprise import Dataset
from surprise import SVD
from surprise.model_selection import cross_validate

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_model[['userId', 'movieId', 'rating']], reader)

"""# Matrix factorization CF using scikit-surprise SVD"""

svd = SVD()
cross_validate(svd, data, measures=['RMSE', 'MAE'])

df1.set_index('movieId', inplace=True)
df1

data_596 = df_model[(df_model['userId'] == 596) & (df_model['rating'] == 5)]
data_596 = data_596.set_index('movieId')
data_596 = data_596.join(df1)['title']
print(data_596)

data_596 = df1.copy()
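The snippet above only cross-validates the SVD model; a short sketch of the natural next step is to fit on the full trainset and score a single (user, movie) pair. The movieId used here is hypothetical.

trainset = data.build_full_trainset()
svd.fit(trainset)

# Prediction for user 596 on a hypothetical movieId; .est is the estimated rating.
print(svd.predict(uid=596, iid=1).est)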
def splitTrainSetTestSet(odatas, frac):
    reader = Reader(rating_scale=(0, 5))
    data = Dataset.load_from_df(odatas[['userId', 'movieId', 'rating']], reader)
    trainset, testset = train_test_split(data, test_size=frac)
    return trainset, testset
""" SVD 알고리즘 적용 - 추천시스템 """ import pandas as pd # csv file from surprise import SVD, accuracy # model 생성/평가 from surprise import Reader, Dataset # dataset 생성 # 1. 데이터 가져오기 ratings = pd.read_csv('C:/ITWILL/4_Python-II/data/movie_rating.csv') print(ratings) # 평가자[critic] 영화[title] 평점[rating] # 2. rating dataset 생성 reader = Reader(rating_scale=(1, 5)) data = Dataset(reader) dataset = data.load_from_df(ratings[['critic','title','rating']], reader) # train/test train = dataset.build_full_trainset() test = train.build_anti_testset() svd = SVD() model = svd.fit(train) # 3. 전체 사용자 대상 예측치 pred = model.test(test) pred # uid='Jack', iid='Just My', r_ui=3.225806451612903, est=3.046417620945913, # uid : 사용자, iid : 영화, r_ui : 실제 평점, est : 예측치 평점 # uid='Toby'
    return samplingDF

samplingDF = NegativeSampling(remark)

#%%
remark = pd.read_pickle('recordsForSurprise.pkl')
samplingDF = pd.read_pickle('negativeSampling.pkl')
merge = pd.concat([remark, samplingDF])
merge.reset_index(inplace=True)
del merge['index'], remark, samplingDF
merge = merge[['User', 'Item', 'rate']]

#%%
reader = surprise.Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(merge, reader)
del merge
train, test = train_test_split(data, random_state=123, test_size=0.1)

#%% train the model (no hyperparameter tuning)
algo = SVDpp()  # instantiate the model
algo.biased = False
algo.fit(train)
predictions = algo.test(test)
accuracy.mae(predictions)
a = algo.predict('15cbc496d67626ad90514b4243e7c045', '2204590')
print(a)
dump.dump(file_name='SVDmodel.pkl', algo=algo)
#%%
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.kernel_ridge import KernelRidge
import numpy as np
from surprise import Reader, Dataset, SVD  # needed for the rating model below

testdf = pd.read_csv("../testset.csv")
df = pd.read_csv("../trainset.csv")
combined = pd.concat([testdf, df])
years = pd.read_csv("../release-year.csv")

reader = Reader(rating_scale=(1, 5))  # for pandas only
data = Dataset.load_from_df(combined[['user', 'item', 'rating']], reader)
algo = SVD(reg_all=0.02)
trainset = data.build_full_trainset()
algo.fit(trainset)

moviematrix = algo.qi
y = years.values

# KERNEL RIDGE regression for release year
# best mean test MSE: 214.434
# best test MSE for a single split: 116.92
parameters = {"gamma": [1e0, 0.1, 1e-2, 1e-3, 1e-4, 1e-6]}
kr = KernelRidge(kernel='rbf')
clf = GridSearchCV(kr, parameters, cv=5, scoring='neg_mean_squared_error')
list_reviews = read_datafile(data_file)
df = pd.DataFrame(list_reviews, columns=['UserId', 'ItemId', 'Playtime'])
#filter_dataset(df)
#normalize_playtime(df)

reader = Reader(rating_scale=(0, max(df.Playtime)))
sim_options = {
    "name": "cosine",
    "user_based": False,  # Compute similarities between items
}
algo = KNNWithMeans(sim_options=sim_options)

# NOTE: this flag shares its name with surprise.model_selection.cross_validate;
# if the imported function is what is in scope, the condition is always truthy
# and the else-branch never runs. A separately named boolean flag would be clearer.
if cross_validate:
    data = Dataset.load_from_df(df, reader)
    cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
else:
    # train_test_split here is assumed to be sklearn's, operating on the DataFrame
    train_df, test_df = train_test_split(df, test_size=0.2)
    train_data = Dataset.load_from_df(train_df, reader)
    training_set = train_data.build_full_trainset()
    algo.fit(training_set)
    for index, row in test_df.iterrows():
        user = row['UserId']
        item = row['ItemId']
        playtime = row['Playtime']
        prediction = algo.predict(user, item)
        print('{}:{} - {} / {}'.format(user, item, prediction, playtime))
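For the manual hold-out branch above, the same loop can feed Surprise's accuracy helpers by passing the true playtime as r_ui; a sketch that assumes the else-branch ran and test_df exists.

from surprise import accuracy

hold_out_preds = [algo.predict(row['UserId'], row['ItemId'], r_ui=row['Playtime'])
                  for _, row in test_df.iterrows()]
accuracy.rmse(hold_out_preds)
accuracy.mae(hold_out_preds)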
def make_prediction(test_data_imdb): train_data = pd.read_csv('../data/modeling/train/ratings_clean_std_0.csv', sep=',').drop(columns={'Unnamed: 0'}) omdb = pd.read_csv('../data/modeling/train/omdb_cleaned.csv') # build a reader, define the rating scale (minimum and maximum value) reader = Reader(rating_scale=(0.5, 5)) # convert data to surprise format train_surprise = Dataset.load_from_df(train_data, reader).build_full_trainset() # Collaborative Filtering Models knn_collaborative = KNNWithMeans(k=115, min_k=5, sim_options={ 'name': 'msd', 'user_based': False }) knn_collaborative.fit(train_surprise) svd = SVD(lr_all=0.01, reg_all=0.05, n_epochs=23) svd.fit(train_surprise) preds = [[ knn_collaborative.predict(test[1], test[3]).est for test in test_data_imdb.itertuples() ], [ svd.predict(test[1], test[3]).est for test in test_data_imdb.itertuples() ]] # Content-Based Models # define features for content-based models params_features = { 'threshold_actors': 0, 'ts_languages': 0, 'year': True, 'runtime': True, 'imdbvotes': True, 'series': False, 'awards': False, 'genres': True, 'imdb_rating': True, 'roto_rating': True, 'pg_rating': True, 'threshold_newkeywords': 0, 'threshold_plots': 0, 'threshold_directors': 0 } # load features features, names = preprocessing.features(**params_features) # add imdbID and set as index features = omdb[['imdbID' ]].join(pd.DataFrame(features)).set_index('imdbID') # predict ratings pred_content = [] no_of_ratings = [] train_data = train_data[train_data['imdbID'] != 'tt0720339'] for row in test_data_imdb.itertuples(): # select user and movie imdbID = row.imdbID userID = row.user_id # compute predictions if imdbID == 'tt0720339': # exclude outlier movie without information pred_content.append(svd.predict(userID, imdbID).est) else: # select ratings of the user ratings_user = train_data.loc[train_data['user_id'] == userID] ratings_user.reset_index(inplace=True, drop=True) # select features of corresponding movies and convert to array features_user = np.array(features.loc[ratings_user['imdbID']]) features_movie = np.array(features.loc[imdbID]) pred_content.append( predict_movie_rating(ratings_user, features_user, features_movie)) # store the number of predictions of a user: no_of_ratings.append(ratings_user.shape[0]) # predictions of the models predictions = weighted_prediction(preds[0], preds[1], pred_content, no_of_ratings) test_data_with_rating = test_data_imdb.join(predictions) return test_data_with_rating[['user_id', 'movieID', 'rating']]
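weighted_prediction and predict_movie_rating are defined elsewhere and are not shown here. Purely as an illustration of the kind of blend the last step might perform, here is a hypothetical weighting that leans on the content-based score for users with few training ratings; the weights and the 20-rating cap are invented for the sketch and are not the author's actual scheme.

import numpy as np
import pandas as pd

def weighted_prediction_sketch(knn_preds, svd_preds, content_preds, no_of_ratings):
    knn_preds = np.asarray(knn_preds, dtype=float)
    svd_preds = np.asarray(svd_preds, dtype=float)
    content_preds = np.asarray(content_preds, dtype=float)
    # collaborative weight grows with the number of known ratings (hypothetical cap of 20)
    w_cf = np.clip(np.asarray(no_of_ratings, dtype=float) / 20.0, 0.0, 1.0)
    cf = 0.5 * knn_preds + 0.5 * svd_preds
    return pd.Series(w_cf * cf + (1.0 - w_cf) * content_preds, name='rating')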
rating_train, rating_test = train_test_split(rating,
                                             train_size=0.1,
                                             test_size=0.01,
                                             random_state=12345)

print("================================================")
print("Training sample:")
print(rating_train.describe())
print("================================================")
print("Validation sample:")
print(rating_test.describe())

# A reader is still needed but only the rating_scale param is required.
reader = Reader(rating_scale=(0.5, 5))

# The columns must correspond to user id, item id and ratings (in that order).
rating_train2 = Dataset.load_from_df(
    rating_train[['userID', 'itemID', 'rating']], reader)
rating_test2 = Dataset.load_from_df(
    rating_test[['userID', 'itemID', 'rating']], reader)

trainset = rating_train2.build_full_trainset()
testset = rating_test2.build_full_trainset().build_testset()

# SlopeOne Model
count = 1
start = dt.datetime.today()
print("================================================")
algo = SlopeOne()
algo.fit(trainset)  # train() was renamed to fit() in recent Surprise versions
#print("This is the #" + str(count) + " parameter combination")
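The snippet stops right after fitting; a sketch of the obvious next step, scoring the fitted SlopeOne model on the held-out testset built above.

from surprise import accuracy

predictions = algo.test(testset)
accuracy.rmse(predictions)
print("Elapsed:", dt.datetime.today() - start)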
import surprise as sp
from surprise import Dataset
from surprise.model_selection import cross_validate
import NetflixDataLoad

# for 100000 rows for fast processing
# A Reader is needed to tell Surprise the rating scale (Netflix ratings are 1-5).
reader = sp.Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(
    NetflixDataLoad.df_filterd[['Cust_Id', 'Movie_Id', 'Rating']][:100000],
    reader)

n_folds = 5

for algo in [sp.SVD(), sp.SVDpp(), sp.KNNBasic(), sp.KNNWithMeans()]:
    print(
        cross_validate(algo,
                       data,
                       measures=['RMSE', 'MAE'],
                       cv=n_folds,
                       verbose=True))

# Output Example
# Evaluating RMSE, MAE of algorithm SVD on 5 split(s).
#
#             Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std
# RMSE        0.9311  0.9370  0.9320  0.9317  0.9391  0.9342  0.0032
# MAE         0.7350  0.7375  0.7341  0.7342  0.7375  0.7357  0.0015
# Fit time    6.53    7.11    7.23    7.15    3.99    6.40    1.23
# Test time   0.26    0.26    0.25    0.15    0.13    0.21    0.06
qualified['vote_average'] = qualified['vote_average'].astype('int') qualified['wr'] = qualified.apply(weighted_rating, axis=1) qualified = qualified.sort_values('wr', ascending=False).head(10) return qualified # print("Improved Meta Data Recommender") # print(improved_recommendations('The Dark Knight')) # print("\n") # print(improved_recommendations('Mean Girls')) reader = Reader() ratings = pd.read_csv('ratings_small.csv') ratings.head() data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader) #data.split(n_folds=5) svd = SVD() cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True) trainset = data.build_full_trainset() svd.fit(trainset) ratings[ratings['userId'] == 1] svd.predict(1, 302, 3) def convert_int(x): try: return int(x)
amazon.describe().T["count"].sort_values(ascending = False)[:10] amazond = amazon.drop('user_id', axis = 1) amazond.head() amazond.sum().sort_values(ascending = False).to_frame()[:20] !pip install scikit-surprise from surprise import Reader from surprise import accuracy from surprise.model_selection import train_test_split amazon.columns melt_amazon = amazon.melt(id_vars = amazon.columns[0], value_vars = amazon.columns[1:], var_name="movie name", value_name="ratings") melt_amazon from surprise import Dataset reader = Reader(rating_scale=(-1,10)) data = Dataset.load_from_df(melt_amazon.fillna(0), reader = reader) trainset, testset = train_test_split(data, test_size = 0.25) from surprise import SVD algo = SVD() algo.fit(trainset) prediction = algo.test(testset) accuracy.rmse(prediction) user_id = 'A3R5OBKS7OM2IR' movie_id = 'Movie1' rating = 5.0 algo.predict(user_id, movie_id, r_ui=rating, verbose = True) # here it says the accuracy (estimated value as per the actual value , which is not good #though the rmse value is also not good)
import pandas as pd

from surprise import NormalPredictor
from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

# Creation of the dataframe. Column names are irrelevant.
ratings_dict = {
    'itemID': [1, 1, 1, 2, 2],
    'userID': [9, 32, 2, 45, 'user_foo'],
    'rating': [3, 2, 4, 3, 1]
}
df = pd.DataFrame(ratings_dict)

# A reader is still needed but only the rating_scale param is required.
reader = Reader(rating_scale=(1, 5))

# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(df[['userID', 'itemID', 'rating']], reader)

trainset, testset = train_test_split(data, test_size=.25)

# We'll use the famous SVD algorithm.
algo = SVD()

# Train the algorithm on the trainset, and predict ratings for the testset
algo.fit(trainset)
predictions = algo.test(testset)
print(str(predictions))
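Instead of printing the raw Prediction list, the testset performance can be summarized with the accuracy module; a one-line follow-up.

from surprise import accuracy

accuracy.rmse(predictions)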
for i in movies['genres'].values.tolist(): [genres.add(ii) for ii in i.strip().split('|')] genres_length = len(genres) genres = dict(zip(list(genres), [i for i in range(len(genres))])) movies_genres = pd.DataFrame(movies['genres'].map( lambda x: trans_genres(x, genres_length, genres)).values.tolist()) movies_genres.columns = list(genres.keys()) movies['publish_years'] = movies['title'].map(lambda x: trans_publish_years(x)) movies = pd.concat([movies, movies_genres], axis=1, ignore_index=False).drop(columns=['genres']) users['age'] = users['age'].map(lambda x: 0 if x <= 6 else x) ratings = ratings[['user_id', 'movie_id', 'rating']] # ratings['rating'] = ratings['rating'].map(lambda x: 0 if x < 4 else 1) if not os.path.exists('feature/svd_pp_fi.pkl'): reader = Reader() data = Dataset.load_from_df(ratings, reader=reader) train, test = surprise_train_test_split(data, train_size=0.9, test_size=0.1, shuffle=False) svd = SVDpp(n_factors=20, n_epochs=5, random_state=321) svd.fit(train) svd_fu = pd.concat([ ratings['user_id'].drop_duplicates().reset_index(drop=True), pd.DataFrame(svd.pu.tolist()) ], axis=1) svd_fi = pd.concat([ ratings['movie_id'].drop_duplicates().reset_index(drop=True), pd.DataFrame(svd.qi.tolist()) ],
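The snippet above is truncated mid-statement; note that svd.pu and svd.qi are indexed by the trainset's inner ids (and only cover users and items present in the 90% training split), so aligning them with drop_duplicates() on the raw ratings can silently mismatch. A safer alignment sketch via the public Trainset API:

# `train` is the Trainset returned by surprise_train_test_split above.
svd_fu = pd.DataFrame(svd.pu)
svd_fu.insert(0, 'user_id', [train.to_raw_uid(inner) for inner in train.all_users()])

svd_fi = pd.DataFrame(svd.qi)
svd_fi.insert(0, 'movie_id', [train.to_raw_iid(inner) for inner in train.all_items()])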
algo = SVD() algoran = SVD() test = df.sample(n=20000, random_state=1) print(test) trainact1 = pd.concat([test, trainact]).drop_duplicates(keep=False) trainact1 = trainact1.head(i) print(trainact1) train = pd.concat([df, test]).drop_duplicates(keep=False) train = train.sample(n=i) print(train) trainsetact = Dataset.load_from_df(trainact1[['user', 'item', 'rating']], reader).build_full_trainset() trainset = Dataset.load_from_df(train[['user', 'item', 'rating']], reader).build_full_trainset() testset = Dataset.load_from_df( test[['user', 'item', 'rating']], reader).build_full_trainset().build_testset() algo.fit(trainsetact) predictions = algo.test(testset) rmse_al.append(accuracy.rmse(predictions, verbose=False)) algoran.fit(trainset) predictionsran = algoran.test(testset) rmse_ran.append(accuracy.rmse(predictionsran, verbose=False))
from surprise import Reader
import time
import psutil
import matplotlib.pyplot as plt

x = []
timex = []
mem = []

m1 = psutil.virtual_memory().percent
#print(m1)

start = time.time()
df1 = pd.read_csv('C:/Users/Foram/Desktop/Project/ratings_1million1.csv',
                  dtype={'rating': float})
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df1[['user_id', 'book_id', 'rating']], reader)
algo = SVD()
result1 = cross_validate(algo, data, measures=['RMSE'], cv=5, verbose=True)
#print(result1)
x.append(np.mean(result1['test_rmse']))
end = time.time()
#print("Time1", end - start)
timex.append(end - start)
#process=psutil.Process(os.getpid())
m2 = psutil.virtual_memory().percent
#print(m2)
mem.append(m2)

start = time.time()
df2 = pd.read_csv('C:/Users/Foram/Desktop/Project/ratings_1million2.csv',
                  dtype={'rating': float})
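Once the block above has been repeated for each 1-million-row chunk, the collected lists can be plotted to compare accuracy against runtime and memory; a minimal sketch.

fig, axes = plt.subplots(1, 3, figsize=(12, 3))
chunks = range(1, len(x) + 1)
for ax, series, label in zip(axes, [x, timex, mem],
                             ['mean 5-fold RMSE', 'runtime (s)', 'memory used (%)']):
    ax.plot(chunks, series, marker='o')
    ax.set_xlabel('dataset chunk')
    ax.set_ylabel(label)
plt.tight_layout()
plt.show()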