def collaborative(self, ratings, user_id):
    # Assumes: from surprise import Reader, Dataset, SVD, evaluate; import pandas as pd
    reader = Reader()
    # ratings.head()
    temp_ratings = ratings
    data = Dataset.load_from_df(temp_ratings[['user_id', 'book_id', 'rating']], reader)
    data.split(n_folds=2)

    ## Training the data ##
    svd = SVD()
    evaluate(svd, data, measures=['RMSE', 'MAE'])

    trainset = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)
    # svd.train(trainset)

    ## Testing the data ##
    from collections import defaultdict
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    # Append the estimated ratings for the requested user to temp_ratings.
    count = 0
    for uid, iid, true_r, est, _ in predictions:
        if uid == user_id:
            count = count + 1
            temp_ratings.loc[len(temp_ratings) + 1] = [uid, iid, est]
    # print(count)
    # print(temp_ratings)

    cb = temp_ratings[temp_ratings['user_id'] == user_id][['book_id', 'rating']]
    # print(cb)
    return cb
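# A minimal usage sketch for collaborative() above. Everything here is
# illustrative, not from the original project: 'ratings.csv' and `Recommender`
# are hypothetical, and the DataFrame is assumed to have user_id, book_id and
# rating columns.
import pandas as pd

ratings = pd.read_csv('ratings.csv')        # hypothetical ratings file
recommender = Recommender()                 # hypothetical class exposing collaborative()
cf_scores = recommender.collaborative(ratings, user_id=2)
# Highest predicted books for user 2.
print(cf_scores.sort_values('rating', ascending=False).head(10))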
def test_performances():
    """Test the returned dict. Also do dumping."""

    current_dir = os.path.dirname(os.path.realpath(__file__))
    folds_files = [(current_dir + '/custom_train',
                    current_dir + '/custom_test')]

    reader = Reader(line_format='user item rating', sep=' ', skip_lines=3)
    data = Dataset.load_from_folds(folds_files=folds_files, reader=reader,
                                   rating_scale=(1, 5))

    algo = NormalPredictor()
    tmp_dir = tempfile.mkdtemp()  # create tmp dir
    with pytest.warns(UserWarning):
        performances = evaluate(algo, data, measures=['RmSe', 'Mae'],
                                with_dump=True, dump_dir=tmp_dir, verbose=2)
    shutil.rmtree(tmp_dir)  # remove tmp dir

    assert performances['RMSE'] is performances['rmse']
    assert performances['MaE'] is performances['mae']
data = Dataset.load_builtin('ml-100k')

u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')
# print(users.head())
# data.split(n_folds=2)

algo1 = SVD()
algo2 = KNNBasic()
algo3 = KNNBaseline()
algo4 = KNNWithMeans()
algo5 = NormalPredictor()

# Time and evaluate each algorithm on RMSE and MAE.
start = timeit.default_timer()
perf1 = evaluate(algo1, data, measures=['RMSE', 'MAE'])
stop = timeit.default_timer()
print("--- %s seconds ---" % (stop - start))

start1 = timeit.default_timer()
perf2 = evaluate(algo2, data, measures=['RMSE', 'MAE'])
stop1 = timeit.default_timer()
print("...%s seconds..." % (stop1 - start1))

perf3 = evaluate(algo3, data, measures=['RMSE', 'MAE'])
perf4 = evaluate(algo4, data, measures=['RMSE', 'MAE'])
perf5 = evaluate(algo5, data, measures=['RMSE', 'MAE'])
def hybrid(userId, train_rd):
    # Hybrid recommender: blends popularity, collaborative (SVD) and
    # content-based scores for one user.
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import stats
    from ast import literal_eval
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
    from nltk.stem.snowball import SnowballStemmer
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import wordnet
    from surprise import Reader, Dataset, SVD, evaluate
    from collections import defaultdict
    import warnings; warnings.simplefilter('ignore')

    ## Popularity ##
    md = pd.read_csv('CustomData/FinalData.csv')
    fd = pd.read_csv('avg_ratings1.csv')

    fd[fd['rating'].notnull()]['rating'] = fd[fd['rating'].notnull()]['rating'].astype('float')
    vote_averages = fd[fd['rating'].notnull()]['rating']
    C = vote_averages.mean()

    fd1 = pd.read_csv('ratings_count.csv')
    fd1[fd1['rating'].notnull()]['rating'] = fd1[fd1['rating'].notnull()]['rating'].astype('float')
    vote_counts = fd1[fd1['rating'].notnull()]['rating']

    m = vote_counts.quantile(0.75)

    md['ratings_count'] = fd1['rating']
    md['average_rating'] = fd['rating']

    qualified = md[md['ratings_count'].notnull()][['book_id', 'title', 'authors',
                                                   'ratings_count', 'average_rating']]
    qualified['ratings_count'] = qualified['ratings_count'].astype('float')
    qualified['average_rating'] = qualified['average_rating'].astype('float')

    def weighted_rating(x):
        # IMDB-style weighted rating: shrink a book's average rating R towards
        # the global mean C according to its vote count v relative to m.
        v = x['ratings_count']
        R = x['average_rating']
        return (v / (v + m) * R) + (m / (m + v) * C)

    qualified['popularity_rating'] = qualified.apply(weighted_rating, axis=1)
    pop = qualified[['book_id', 'popularity_rating']]

    ## Collaborative ##
    reader = Reader()
    ratings = train_rd
    temp_ratings = ratings[0:1000]

    data = Dataset.load_from_df(temp_ratings[['user_id', 'book_id', 'rating']], reader)
    data.split(n_folds=2)

    svd = SVD()
    evaluate(svd, data, measures=['RMSE', 'MAE'])

    trainset = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)

    def get_top_n(predictions, n=10):
        '''Return the top-N recommendation for each user from a set of predictions.

        Args:
            predictions (list of Prediction objects): The list of predictions,
                as returned by the test method of an algorithm.
            n (int): The number of recommendations to output for each user.
                Default is 10.

        Returns:
            A dict where keys are user (raw) ids and values are lists of
            tuples [(raw item id, rating estimation), ...] of size n.
        '''
        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            top_n[uid].append((iid, est))

        # Then keep the first n predictions for each user.
        for uid, user_ratings in top_n.items():
            # user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]

        return top_n

    # Predict ratings for all user/item pairs that are *not* in the trainset
    # and append the estimates for the requested user to temp_ratings.
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    count = 0
    for uid, iid, true_r, est, _ in predictions:
        if uid == userId:
            count = count + 1
            temp_ratings.loc[len(temp_ratings) + 1] = [uid, iid, est]

    ##### Content #####
    md = pd.read_csv('CustomData/FinalData.csv')
    rd = train_rd

    md['book_id'] = md['book_id'].astype('int')
    rd['book_id'] = rd['book_id'].astype('int')
    rd['user_id'] = rd['user_id'].astype('int')
    rd['rating'] = rd['rating'].astype('int')

    # Build a "soup" of author and genre tokens for each book.
    md['authors'] = md['authors'].str.replace(' ', '')
    md['authors'] = md['authors'].str.lower()
    md['authors'] = md['authors'].str.replace(',', ' ')
    md['authors'] = md['authors'].apply(lambda x: [x, x])
    md['Genres'] = md['Genres'].str.split(';')
    md['soup'] = md['authors'] + md['Genres']
    md['soup'] = md['soup'].str.join(' ')

    count_vec = CountVectorizer(analyzer='word', ngram_range=(1, 1), min_df=0,
                                stop_words='english')
    count_matrix = count_vec.fit_transform(md['soup'])
    cosine_sim = cosine_similarity(count_matrix, count_matrix)

    def build_user_profiles():
        # One row per user, one column per book; entries are the known ratings.
        user_profiles = np.zeros((53421, 999))
        for i in range(0, 1000):
            u = rd.iloc[i]['user_id']
            b = rd.iloc[i]['book_id']
            user_profiles[u][b - 1] = rd.iloc[i]['rating']
        return user_profiles

    user_profiles = build_user_profiles()

    def _get_similar_items_to_user_profile(person_id):
        # Computes the similarity between the user profile and all item
        # profiles, then rescales the scores to the 0-5 rating range.
        user_ratings = np.empty((999, 1))
        cnt = 0
        for i in range(0, 998):
            book_sim = cosine_sim[i]
            user_sim = user_profiles[person_id]
            user_ratings[i] = (book_sim.dot(user_sim)) / sum(cosine_sim[i])

        maxval = max(user_ratings)
        for i in range(0, 998):
            user_ratings[i] = (user_ratings[i] * 5.0) / maxval
            if user_ratings[i] > 3:
                cnt += 1

        return user_ratings

    content_ratings = _get_similar_items_to_user_profile(userId)

    # Align the content scores with the book ids.
    num = md[['book_id']]
    num1 = pd.DataFrame(data=content_ratings[0:, 0:])
    frames = [num, num1]
    mer = pd.concat(frames, axis=1, join_axes=[num.index])
    mer.columns = ['book_id', 'content_rating']

    ## Blend the three scores for the requested user ##
    cb = temp_ratings[temp_ratings['user_id'] == userId][['book_id', 'rating']]

    hyb = md[['book_id']]
    hyb = hyb.merge(cb, on='book_id')
    hyb = hyb.merge(pop, on='book_id')
    hyb = hyb.merge(mer, on='book_id')

    def weighted_rating(x):
        # Final hybrid score: 40% collaborative, 20% popularity, 40% content.
        v = x['rating']
        R = x['popularity_rating']
        c = x['content_rating']
        return 0.4 * v + 0.2 * R + 0.4 * c

    print(hyb)
    hyb['final'] = hyb.apply(weighted_rating, axis=1)
    hyb = hyb.sort_values('final', ascending=False).head(999)
    print(hyb)

    return hyb
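# A minimal usage sketch for hybrid() above. The file name and user id are
# illustrative; train_rd is assumed to have user_id, book_id and rating columns.
import pandas as pd

train_rd = pd.read_csv('ratings.csv')   # hypothetical ratings file
top_books = hybrid(userId=2, train_rd=train_rd)
# Highest blended scores first (0.4 * collaborative + 0.2 * popularity + 0.4 * content).
print(top_books[['book_id', 'final']].head(10))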
n_users = ratings.userId.unique().shape[0]
n_movies = ratings.movieId.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_movies))

data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader=reader)
data.split(n_folds=5)

svd = SVD()
perf1 = evaluate(svd, data, measures=['RMSE', 'MAE'])

slp = SlopeOne()
perf2 = evaluate(slp, data, measures=['RMSE', 'MAE'])

knn = KNNBaseline()
perf3 = evaluate(knn, data, measures=['RMSE', 'MAE'])

trainset = data.build_full_trainset()
svd.train(trainset)
file_path = os.path.expanduser('~/PycharmProjects/aashay/shuffled_ratings.csv')
reader = Reader(line_format='user item rating', sep=',')
data = surprise.Dataset.load_from_file(file_path, reader=reader)

# dataset = 'ml-1m'
# data = Dataset.load_builtin(dataset)
data.split(2)  # split data for 2-folds cross validation
kf = KFold(n_splits=2, random_state=0, shuffle=False)  # folds will be the same for all algorithms.

table = []
for klass in classes:
    start = time.time()
    # out = cross_validate(klass(), data, ['rmse', 'mae'], kf)
    out = surprise.evaluate(klass(), data, measures=['RMSE', 'MAE'], with_dump=False)
    cv_time = str(datetime.timedelta(seconds=int(time.time() - start)))
    link = LINK[klass.__name__]
    # evaluate() returns one value per fold under the 'rmse'/'mae' keys.
    mean_rmse = '{:.3f}'.format(np.mean(out['rmse']))
    mean_mae = '{:.3f}'.format(np.mean(out['mae']))

    new_line = [link, mean_rmse, mean_mae, cv_time]
    print(tabulate([new_line], tablefmt="pipe"))  # print current algo perf
    table.append(new_line)

header = [LINK[data], 'RMSE', 'MAE', 'Time']  # note: LINK is keyed by name strings above, so this lookup may need the dataset name instead
print(tabulate(table, header, tablefmt="pipe"))
# =============================== load data ===================================
# ml-latest-small
# file_path = 'input/ml-latest-small/ratings.csv'
# reader = env.Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
# ------------------------------------------------------------------------------
# ml-100k
file_path = 'input/ml-100k/u.data'
reader = env.Reader(line_format='user item rating timestamp', sep='\t', skip_lines=1)
# ------------------------------------------------------------------------------
# ml-20m
# file_path = 'input/ml-20m/ratings.csv'
# reader = env.Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
# ==============================================================================

# data = env.Dataset.load_from_file(file_path, reader=reader)
# data.split(n_folds=5)

# file_path = 'input/ml-100k/u.data'
# reader = myDataset.Reader(line_format='user item rating timestamp', sep='\t', skip_lines=1,
#                           implicit=True, threshold=4.5)

data = myDataset.Dataset.load_from_file(file_path, reader=reader)
data.split(n_folds=5)

# define algorithm
algo = SLIM3(l1_reg=0.001, l2_reg=0.01, max_iter=200, tol=1e-3)

# evaluate
env.evaluate(algo, data, measures=['rmse', 'mae', 'fcp'])
# myEvaluate.evaluate(algo, data, measures=['fcp', 'hr', 'arhr'], topN=10, leave_out_num=1, verbose=2)
        self.trainset = trainset

    def estimate(self, u, i):
        '''Return the estimated rating of user u for item i.'''
        # Return the dot product between p_u and q_i if the user and the item
        # are known; otherwise return the mean of all ratings.
        if self.trainset.knows_user(u) and self.trainset.knows_item(i):
            return numpi.dot(self.p[u], self.q[i])
        else:
            return self.trainset.global_mean


# Load the dataset; we use the MovieLens dataset
# (https://grouplens.org/datasets/movielens/100k/).
# Thanks to the Surprise library we can download it automatically.
data = surprise.Dataset.load_builtin('ml-100k')
data.split(2)  # split the data for 2-fold cross-validation

algo = RecAlgo(learning_rate=.01, n_epochs=10, n_factors=10)
surprise.evaluate(algo, data, measures=['RMSE'])
surprise.evaluate(algo, data, measures=['mae'])

# Use a neighborhood algorithm on the same data as a comparison.
algo = surprise.KNNBasic()
surprise.evaluate(algo, data, measures=['RMSE'])
surprise.evaluate(algo, data, measures=['mae'])

# Use a more sophisticated matrix factorization method on the same data.
algo = surprise.SVD()
surprise.evaluate(algo, data, measures=['RMSE'])
surprise.evaluate(algo, data, measures=['mae'])
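# The snippet above only shows the tail of train() and the estimate() method.
# Below is a minimal sketch of what the missing SGD training step could look
# like. It is an assumption, not the original author's code: it presumes the
# class stores learning_rate/n_epochs/n_factors in __init__ and that numpy is
# imported under the alias `numpi`, as used in estimate().
import numpy as numpi  # matching the alias used in estimate() above
import surprise


class RecAlgoSketch(surprise.AlgoBase):

    def __init__(self, learning_rate, n_epochs, n_factors):
        surprise.AlgoBase.__init__(self)
        self.lr = learning_rate
        self.n_epochs = n_epochs
        self.n_factors = n_factors

    def train(self, trainset):
        surprise.AlgoBase.train(self, trainset)
        # Randomly initialize the user and item factor matrices.
        p = numpi.random.normal(0, .1, (trainset.n_users, self.n_factors))
        q = numpi.random.normal(0, .1, (trainset.n_items, self.n_factors))
        # Plain SGD on the squared error, one pass per epoch over all ratings.
        for _ in range(self.n_epochs):
            for u, i, r_ui in trainset.all_ratings():
                err = r_ui - numpi.dot(p[u], q[i])
                p[u] += self.lr * err * q[i]
                q[i] += self.lr * err * p[u]
        self.p, self.q = p, q
        self.trainset = trainset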
import time
import matplotlib.pyplot as plt
import psutil

timex = []
mem = []

m1 = psutil.virtual_memory().percent

# RMSE / time / memory check on the first 1M-ratings file.
start = time.time()
df1 = pd.read_csv('C:/Users/Mausamee Patel/Desktop/Project/A5/Ratings_1Million1.csv',
                  dtype={'rating': float})
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df1[['user_id', 'book_id', 'rating']], reader)
data.split(2)
algo = surprise.KNNBasic()
result1 = surprise.evaluate(algo, data, measures=['RMSE'])
end = time.time()
print("Time1", end - start)
timex.append(end - start)
m2 = psutil.virtual_memory().percent
# print(m2)
mem.append(m2)

# Same check on the second 1M-ratings file.
start = time.time()
df2 = pd.read_csv('C:/Users/Mausamee Patel/Desktop/Project/A5/Ratings_1Million2.csv',
                  dtype={'rating': float})
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df2[['user_id', 'book_id', 'rating']], reader)
data.split(2)
algo = surprise.KNNBasic()
result2 = surprise.evaluate(algo, data, measures=['RMSE'])
end = time.time()
gen_occ_mean = merged_df[['sex', 'rating', 'movie_id', 'occupation']].pivot_table(
    values='rating', index='movie_id', columns=['occupation', 'sex'], aggfunc='mean')
print(gen_occ_mean.head())

print(score(cf_gen_occ))

# Define a Reader object.
# The Reader object helps in parsing the file or dataframe containing ratings.
reader = Reader()

# Create the dataset to be used for building the filter.
data = Dataset.load_from_df(ratings, reader)

# Define the algorithm object; in this case kNN.
knn = KNNBasic()

# Evaluate the performance in terms of RMSE.
evaluate(knn, data, measures=['RMSE'])

# Import SVD.
from surprise import SVD

# Define the SVD algorithm object.
svd = SVD()

# Evaluate the performance in terms of RMSE.
evaluate(svd, data, measures=['RMSE'])
def test_SVD_parameters():
    """Ensure that all parameters are taken into account."""

    # The baseline against which to compare.
    algo = SVD(n_factors=1, n_epochs=1)
    rmse_default = evaluate(algo, data, measures=['rmse'])['rmse']

    # n_factors
    algo = SVD(n_factors=2, n_epochs=1)
    rmse_factors = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_factors

    # n_epochs
    algo = SVD(n_factors=1, n_epochs=2)
    rmse_n_epochs = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_n_epochs

    # biased
    algo = SVD(n_factors=1, n_epochs=1, biased=False)
    rmse_biased = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_biased

    # lr_all
    algo = SVD(n_factors=1, n_epochs=1, lr_all=5)
    rmse_lr_all = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_lr_all

    # reg_all
    algo = SVD(n_factors=1, n_epochs=1, reg_all=5)
    rmse_reg_all = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_reg_all

    # lr_bu
    algo = SVD(n_factors=1, n_epochs=1, lr_bu=5)
    rmse_lr_bu = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_lr_bu

    # lr_bi
    algo = SVD(n_factors=1, n_epochs=1, lr_bi=5)
    rmse_lr_bi = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_lr_bi

    # lr_pu
    algo = SVD(n_factors=1, n_epochs=1, lr_pu=5)
    rmse_lr_pu = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_lr_pu

    # lr_qi
    algo = SVD(n_factors=1, n_epochs=1, lr_qi=5)
    rmse_lr_qi = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_lr_qi

    # reg_bu
    algo = SVD(n_factors=1, n_epochs=1, reg_bu=5)
    rmse_reg_bu = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_reg_bu

    # reg_bi
    algo = SVD(n_factors=1, n_epochs=1, reg_bi=5)
    rmse_reg_bi = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_reg_bi

    # reg_pu
    algo = SVD(n_factors=1, n_epochs=1, reg_pu=5)
    rmse_reg_pu = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_reg_pu

    # reg_qi
    algo = SVD(n_factors=1, n_epochs=1, reg_qi=5)
    rmse_reg_qi = evaluate(algo, data, measures=['rmse'])['rmse']
    assert rmse_default != rmse_reg_qi
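# The test above repeats the same three-line pattern for every hyperparameter.
# A more compact sketch of the same check, assuming the same `data` fixture and
# the old evaluate() API: each keyword override should change the RMSE relative
# to the SVD(n_factors=1, n_epochs=1) baseline.
def test_SVD_parameters_compact():
    baseline_kwargs = dict(n_factors=1, n_epochs=1)
    rmse_default = evaluate(SVD(**baseline_kwargs), data,
                            measures=['rmse'])['rmse']
    overrides = [dict(n_factors=2), dict(n_epochs=2), dict(biased=False),
                 dict(lr_all=5), dict(reg_all=5), dict(lr_bu=5), dict(lr_bi=5),
                 dict(lr_pu=5), dict(lr_qi=5), dict(reg_bu=5), dict(reg_bi=5),
                 dict(reg_pu=5), dict(reg_qi=5)]
    for override in overrides:
        kwargs = dict(baseline_kwargs, **override)
        rmse = evaluate(SVD(**kwargs), data, measures=['rmse'])['rmse']
        assert rmse != rmse_default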
cust_benchmark = round(df_cust_summary['count'].quantile(0.7), 0)
drop_cust_list = df_cust_summary[df_cust_summary['count'] < cust_benchmark].index

df = df[~df['Movie_id'].isin(drop_movie_list)]
df = df[~df['Cust_id'].isin(drop_cust_list)]

# Pivot data
df_p = pd.pivot_table(df, index="Cust_id", columns="Movie_id", values="Rating")

# See which algorithm gives the lowest RMSE value.
reader = Reader()
data = Dataset.load_from_df(df[['Cust_id', 'Movie_id', 'Rating']][:100000], reader)

benchmark = []
for algo in [SVD(), SVDpp(), SlopeOne(), NMF(), NormalPredictor(), BaselineOnly(), CoClustering()]:
    data.split(n_folds=3)
    results = evaluate(algo, data, measures=["RMSE"])
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = tmp.append(pd.Series([str(algo).split(' ')[0].split('.')[-1]], index=['Algorithm']))
    benchmark.append(tmp)

print(pd.DataFrame(benchmark).set_index('Algorithm').sort_values('rmse'))

## Train and test split
# reader = Reader()
# data = Dataset.load_from_df(df[['Cust_id', 'Movie_id', 'Rating']], reader)
# trainset, testset = train_test_split(data, test_size=0.25)
# blo = BaselineOnly()
# blo.fit(trainset)
# predictions = blo.test(testset[:10000])
# accuracy.rmse(predictions)
from surprise import Reader, Dataset
from surprise import NMF, evaluate

# Creating the format for the dataset when given the user, item, rating and timestamp.
data_reader = Reader(line_format="user item rating timestamp", sep="\t")

# Store the data in the specific format created above; u.data is the data we want.
data = Dataset.load_from_file("./ml-100k/u.data", reader=data_reader)

# We will be splitting the data into 5 folds for cross-validation.
data.split(n_folds=5)

# For this project I will be using the NMF algorithm.
algorithm = NMF()
evaluate(algorithm, data, measures=["RMSE", "MAE"])

# Train on the whole data set now.
training_set = data.build_full_trainset()
algorithm.train(training_set)

# Set the specific user and movie I want to predict.
user_id = str(200)
item_id = str(222)
actual_rating = 5

# See how it works!
print(algorithm.predict(user_id, item_id, actual_rating))
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


reader = Reader(rating_scale=(0, 5))
# Note: Dataset.load_from_df expects three columns (user id, item id, rating);
# only two are selected here, so a rating column appears to be missing.
dataset = Dataset.load_from_df(actions[["user_id", "context_product"]], reader)

algo = SVD()
evaluate(algo, dataset, measures=["RMSE", "MAE"])

trainset = dataset.build_full_trainset()
algo.fit(trainset)

algo.predict("53ff5739aebb450829000074", "affect-health-drinking-chocolate", 15)
algo.predict("53ff5739aebb450829000074", "affect-health-drinking-chocolate", 0)

testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n = get_top_n(predictions, n=10)
recommendations = {}

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
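# The loop above is cut off in the snippet. A hedged guess at its body,
# assuming the goal is to fill the `recommendations` dict with the recommended
# item ids per user (this is an assumption, not the original code):
#
#     for uid, user_ratings in top_n.items():
#         recommendations[uid] = [iid for (iid, _) in user_ratings]
#         print(uid, recommendations[uid])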
data = users.assign(key=1).merge(hotels.assign(key=1), on='key', how='inner').drop('key', axis=1)
data = data.merge(activity_count, on=['user', 'hotel'], how='left')
data['browse'] = data.browse.fillna(0)
data = data[['user', 'hotel', 'browse']]

# Tentatively cross-validate some algorithms.
reader = Reader(rating_scale=(0, 1))
data = Dataset.load_from_df(data, reader)
data_cv = data
data_cv.split(n_folds=5)

# SVD test
svd = SVD()
perf = evaluate(svd, data, measures=['RMSE'])
print_perf(perf)  # MSE 0.052

param_svd = {'n_factors': [50, 100], 'lr_all': [0.003, 0.005], 'reg_all': [0.05, 0.1, 0.5]}
gs = GridSearch(SVD, param_svd, measures=['RMSE'])
gs.evaluate(data_cv)
# RMSE 0.2272 ~ 0.2284; after many tests, 0.2272 is a benchmark (100, 0.003, 0.1)

# Co-clustering test
coc = CoClustering()
perf = evaluate(coc, data, measures=['RMSE'])
print_perf(perf)  # MSE 0.053

param_svd = {'n_cltr_u': [3, 5, 7], 'n_cltr_i': [3, 5, 7], 'n_epochs': [10, 20]}
gs = GridSearch(CoClustering, param_svd, measures=['RMSE'])
from surprise import Dataset
from surprise import SVD
from surprise import evaluate, print_perf

# Load the movielens-100k dataset (download it if needed),
# and split it into 3 folds for cross-validation.
data = Dataset.load_builtin('ml-100k')
data.split(n_folds=3)

# We'll use the famous SVD algorithm.
algo = SVD()

# Evaluate performances of our algorithm on the dataset.
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])

print_perf(perf)
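# Note: evaluate() and print_perf() were deprecated in later Surprise releases.
# A minimal equivalent sketch with the model_selection API, assuming a Surprise
# version that ships surprise.model_selection:
from surprise import SVD, Dataset
from surprise.model_selection import cross_validate

data = Dataset.load_builtin('ml-100k')
algo = SVD()
# 3-fold cross-validation, reporting RMSE and MAE per fold and on average.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=3, verbose=True)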
train[['id_usuario', 'id_restaurante', 'rating_ambiente', 'fecha']].to_csv('surprise_format.csv', index=False)

file_path = 'surprise_format.csv'
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
data = Dataset.load_from_file(file_path, reader=reader)
data.split(n_folds=5)

# We'll use the famous SVD++ algorithm.
algo = SVDpp()

# Evaluate performances of our algorithm on the dataset.
perf = evaluate(algo, data, measures=['RMSE', 'MAE'])
print_perf(perf)

# Grid search (GridSearch expects the algorithm class, not an instance).
param_grid = {
    'n_epochs': [50, 100, 150],
    'lr_all': [0.002, 0.003, 0.004],
    'reg_all': [0.2, 0.3],
    'n_factors': [10, 20, 30, 40]
}
grid_search = GridSearch(SVDpp, param_grid, measures=['RMSE', 'FCP'])
def eval(self):
    # Evaluate performances of our algorithm on the dataset.
    perf = evaluate(self.svd, self.data, measures=['RMSE'])
    print_perf(perf)
import os
from surprise import Reader, Dataset

# Specify the file path.
file_path = os.path.expanduser('./popular_music_suprise_format.txt')
# Specify the file format.
reader = Reader(line_format='user item rating timestamp', sep=',')
# Read the data from the file.
music_data = Dataset.load_from_file(file_path, reader=reader)
# Split into 5 folds.
music_data.split(n_folds=5)

# Use NormalPredictor.
from surprise import NormalPredictor, evaluate
algo1 = NormalPredictor()
perf1 = evaluate(algo1, music_data, measures=['RMSE', 'MAE'])

# Use BaselineOnly.
from surprise import BaselineOnly, evaluate
algo2 = BaselineOnly()
perf2 = evaluate(algo2, music_data, measures=['RMSE', 'MAE'])

# Use basic collaborative filtering.
from surprise import KNNBasic, evaluate
algo3 = KNNBasic()
perf3 = evaluate(algo3, music_data, measures=['RMSE', 'MAE'])

# Use means-based collaborative filtering (KNNWithMeans).
from surprise import KNNWithMeans, evaluate
algo4 = KNNWithMeans()
perf4 = evaluate(algo4, music_data, measures=['RMSE', 'MAE'])
# Three candidate grids were tried; only the last assignment takes effect.
param_grid = {'n_factors': range(10, 30, 2), 'n_epochs': [10, 15, 20],
              'lr_all': [0.002, 0.005, 0.1], 'reg_all': [0.4, 0.6, 0.8]}
param_grid = {'n_factors': range(2, 22, 2), 'n_epochs': [10],
              'lr_all': [0.1], 'reg_all': [0.4]}
param_grid = {'n_factors': [2], 'n_epochs': range(11),
              'lr_all': [0.1], 'reg_all': [0.4]}

grid_search = GridSearch(SVDpp, param_grid, measures=['RMSE', 'MAE'])
grid_search.evaluate(music_data)
print(grid_search.best_params['RMSE'])
print(grid_search.best_params['MAE'])

# Start training the model.
print('Start training the model...')
# algo = KNNBaseline()
algo = SVDpp(n_factors=grid_search.best_params['RMSE']['n_factors'],
             n_epochs=grid_search.best_params['RMSE']['n_epochs'],
             lr_all=grid_search.best_params['RMSE']['lr_all'],
             reg_all=grid_search.best_params['RMSE']['reg_all'],
             verbose=2)
algo = SVDpp()  # note: this overrides the tuned parameters above
# algo = SVD()
# algo = SVDpp()

perf = evaluate(algo, music_data, measures=['RMSE', 'MAE'], verbose=1)
print_perf(perf)

# print()
# print('Predicting for a playlist:')
# current_playlist_name = list(name_id_dict.keys())[3]
# print('Playlist name', current_playlist_name)
# playlist_rid = name_id_dict[current_playlist_name]
# print('Playlist rid', playlist_rid)
# playlist_inner_id = algo.trainset.to_inner_uid(playlist_rid)
# print('Playlist inner id', playlist_inner_id)
# algo.compute_similarities()
# Initialising the Reader, which is used to parse the file containing the ratings.
reader = Reader()

# Making the dataset containing the columns userId, itemId, rating.
# The order is very specific and we have to follow the same order.
dataset = Dataset.load_from_df(ratings_dataset[['userId', 'movieId', 'rating']], reader)

# Using the split function to perform cross-validation.
dataset.split(n_folds=6)

# Initialising the SVD model and specifying the number of latent features.
# We can tune this parameter according to our requirements.
svd = SVD(n_factors=25)

# Evaluating the model on root mean square error and mean absolute error.
evaluate(svd, dataset, measures=['rmse', 'mae'])

# Making the dataset to train our model.
train = dataset.build_full_trainset()

# Training our model.
svd.train(train)

# Making a new series which has two columns in it: movie name and movie id.
movies_dataset = movies_dataset.reset_index()
titles = movies_dataset['movie_name']
indices = pd.Series(movies_dataset.index, index=movies_dataset['movie_name'])

# Function to make recommendations to the user.
def recommendataion(user_id, movie):
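    # The original body is not included in the snippet; what follows is a
    # hypothetical sketch, not the author's code. It assumes movies_dataset
    # has a 'movieId' column and simply ranks every title by the rating the
    # trained SVD model predicts for this user.
    idx = indices[movie]  # position of the query movie (kept for a content-based step)
    candidates = movies_dataset[['movieId', 'movie_name']].copy()
    candidates['est'] = candidates['movieId'].apply(
        lambda mid: svd.predict(user_id, mid).est)
    return candidates.sort_values('est', ascending=False).head(10)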
def metrics(self, measures):
    return evaluate(self.algo, self.data, measures=measures)
from surprise import Reader, Dataset, SVD, evaluate

sns.set_style("darkgrid")

df1 = pd.read_csv('../../Data/combined_data_1.txt', header=None, usecols=[0, 1],
                  names=['uid', 'rating'])
df1['rating'] = df1['rating'].astype(float).fillna(1.0)
df1['iid'] = pd.DataFrame(list(range(len(df1))))

df = df1.head(100000)
df = df[['uid', 'iid', 'rating']]

df_title = pd.read_csv('../../Data/movie_titles.csv', encoding="ISO-8859-1", header=None,
                       names=['Movie_Id', 'Year', 'Name'])

USERID = '822109'

reader = Reader()
data = Dataset.load_from_df(df, reader)

alg = SVD()
output = alg.fit(data.build_full_trainset())
evaluate(alg, data)

pickle.dump([alg, df, df_title], open('../../Evaluations/matrix-data.p', "wb"))

print(df[df['rating'] == 5]['uid'])
def surprise_algorithms_print_perf():
    print('Surprise Algorithms (final results table)...')
    print('Which data do you want to use?')
    print('(1) Android')
    print('(2) WordPress')
    data_utilizar = input()

    # Encoding workaround to avoid read errors on the file.
    reload(sys)
    sys.setdefaultencoding('utf8')

    if data_utilizar == 1:
        file_path = configuration.FILE_PATH_ANDROID
        reader = Reader(line_format='user item rating', sep='\t')
    else:
        file_path = configuration.FILE_PATH_WORDPRESS
        reader = Reader(line_format='user item rating', sep=',')

    # Dataset
    data = Dataset.load_from_file(file_path, reader=reader)
    data.split(n_folds=5)

    # NormalPredictor
    algo_normal_predictor = NormalPredictor()
    perf_normal_predictor = evaluate(algo_normal_predictor, data, measures=['RMSE', 'MAE'], verbose=False)

    # SVD
    algo_svd = SVD()
    perf_svd = evaluate(algo_svd, data, measures=['RMSE', 'MAE'], verbose=False)

    # BaselineOnly
    algo_baseline_only = BaselineOnly()
    perf_baseline_only = evaluate(algo_baseline_only, data, measures=['RMSE', 'MAE'], verbose=False)

    # SVDpp
    algo_svdpp = SVDpp()
    perf_svdpp = evaluate(algo_svdpp, data, measures=['RMSE', 'MAE'], verbose=False)

    # NMF
    algo_nmf = NMF()
    perf_nmf = evaluate(algo_nmf, data, measures=['RMSE', 'MAE'], verbose=False)

    # SlopeOne
    algo_slope_one = SlopeOne()
    perf_slope_one = evaluate(algo_slope_one, data, measures=['RMSE', 'MAE'], verbose=False)

    # CoClustering
    algo_coclustering = CoClustering()
    perf_coclustering = evaluate(algo_coclustering, data, measures=['RMSE', 'MAE'], verbose=False)

    """Section that uses KNN for the analysis:
    'k' is the maximum number of neighbors taken into account for aggregation.
    'min_k' is the minimum number of neighbors taken into account for aggregation;
        if there are not enough neighbors, the prediction is set to the global mean
        of all ratings.
    'sim_options' are the similarity options used by the KNN algorithms.
    'bsl_options' is the configuration of the baseline estimates."""
    k = 40
    min_k = 1
    sim_options = {
        'name': 'pearson_baseline',
        'user_based': 0  # item-based similarity
    }
    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}

    algo_knn_basic = KNNBasic(k=k, min_k=min_k, sim_options=sim_options)
    perf_knn_basic = evaluate(algo_knn_basic, data, measures=['RMSE', 'MAE'], verbose=False)

    algo_knn_with_means = KNNWithMeans(k=k, min_k=min_k, sim_options=sim_options)
    perf_knn_with_means = evaluate(algo_knn_with_means, data, measures=['RMSE', 'MAE'], verbose=False)

    algo_knn_base_line = KNNBaseline(k=k, min_k=min_k, sim_options=sim_options, bsl_options=bsl_options)
    perf_knn_base_line = evaluate(algo_knn_base_line, data, measures=['RMSE', 'MAE'], verbose=False)

    """Print the results of all algorithms."""
    print('')
    print('Printing results from algorithms...')
    print('- Normal predictor')
    print_perf(perf_normal_predictor)
    print('')
    print('- SVD')
    print_perf(perf_svd)
    print('')
    print('- Baseline Only')
    print_perf(perf_baseline_only)
    print('')
    print('- SVD++')
    print_perf(perf_svdpp)
    print('')
    print('- NMF')
    print_perf(perf_nmf)
    print('')
    print('- Slope One')
    print_perf(perf_slope_one)
    print('')
    print('- Co-Clustering')
    print_perf(perf_coclustering)
    print('')
    print('- KNN Basic')
    print_perf(perf_knn_basic)
    print('')
    print('- KNN With Means')
    print_perf(perf_knn_with_means)
    print('')
    print('- KNN Base Line')
    print_perf(perf_knn_base_line)
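# A compact alternative sketch for reporting: collect the mean scores from the
# perf dicts above into a single table instead of printing each one separately
# (assumes numpy and pandas are available in this module).
import numpy as np
import pandas as pd

def summarize_performances(perfs):
    """perfs: mapping of algorithm name -> dict returned by evaluate()."""
    rows = {name: {'RMSE': np.mean(perf['rmse']), 'MAE': np.mean(perf['mae'])}
            for name, perf in perfs.items()}
    return pd.DataFrame(rows).T.sort_values('RMSE')

# Example (inside surprise_algorithms_print_perf, after the evaluations):
# print(summarize_performances({'SVD': perf_svd, 'SVD++': perf_svdpp, 'NMF': perf_nmf}))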
def run_svd(n_epochs, reg_all, init_mean):
    start_time = time.time()
    algo = SVD(n_epochs=n_epochs, reg_all=reg_all, init_mean=init_mean)
    evaluate(algo, data)
    running_time = time.time() - start_time
    print("SVD:", running_time, " s")
from surprise import evaluate, print_perf
from surprise import GridSearch
import pandas as pd
import io

# Load the dataset: a movie-rating dataset with records of the form uid, iid, score, time.
data = Dataset.load_builtin('ml-100k')
# Split the dataset evenly into 3 folds for cross-validation.
data.split(n_folds=3)

'''Collaborative filtering'''
# Use the collaborative filtering algorithm.
algo = KNNBasic()
# Evaluate the algorithm; RMSE and MAE are used here.
perf = evaluate(algo, data, measures=['rmse', 'mae'])
print_perf(perf)

'''SVD'''
# Specify the parameter grid.
param_grid = {
    'n_epochs': [5, 10],
    'lr_all': [0.002, 0.005],
    'reg_all': [0.4, 0.6]
}
# Use Surprise's built-in GridSearch to find the best parameters.
grid_search = GridSearch(SVD, param_grid, measures=['rmse', 'fcp'])
grid_search.evaluate(data)
# Report the best parameters and scores.
print(grid_search.best_score['rmse'])
print(grid_search.best_params['rmse'])
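# GridSearch was deprecated in later Surprise releases. A minimal equivalent
# sketch with the model_selection API, assuming a Surprise version that ships
# surprise.model_selection:
from surprise import SVD, Dataset
from surprise.model_selection import GridSearchCV

data = Dataset.load_builtin('ml-100k')
param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005], 'reg_all': [0.4, 0.6]}

gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'fcp'], cv=3)
gs.fit(data)
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])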
from surprise import SVD, evaluate
from surprise import NMF
from surprise import KNNBasic

# SVD
algo = SVD()
evaluate(algo, data, measures=['RMSE'])

# NMF
algo = NMF()
evaluate(algo, data, measures=['RMSE'])

# KNN
algo = KNNBasic()
import surprise as env

path = "input/ml-latest-small/ratings.csv"
reader = env.Reader(line_format="user item rating timestamp", sep=",", skip_lines=1)
data = env.Dataset.load_from_file(path, reader=reader)
data.split(n_folds=3)

algo = env.SVD()
env.evaluate(algo, data)
                 for i in open('C:\\Users\\Dimple Shah\\Desktop\\mtech\\reco\\3l.csv', 'r').readlines()]

ratings_df1 = pd.DataFrame(ratings_list1, columns=['UserID', 'BookID', 'Rating'], dtype=float)
ratings_df1.loc[:, 'Rating'] = sk.minmax_scale(ratings_df1.loc[:, 'Rating'])

data1 = Dataset.load_from_df(ratings_df1[['UserID', 'BookID', 'Rating']], reader)
data1.split(2)  # split data for 2-folds cross validation

algo1 = MatrixFacto(learning_rate=.01, n_epochs=10, n_factors=10)
# print(algo)
# test_rms =
result1 = surprise.evaluate(algo1, data1, measures=['RMSE'])
# print(test_rms)
x.append(np.mean(result1['RMSE']))

end = time.time()
# print("Time1", end - start)
timex.append(end - start)

process = psutil.Process(os.getpid())
m2 = process.memory_full_info().uss
# m2 = m2 - m1
print(m2)
mem.append(m2)

# Checking RMSE with 500k data records
start = time.time()
ratings_list2 = [
    i.strip().split(",")
    print('-' * 12)
    print('-' * 12)
    return hr, arhr


if __name__ == '__main__':
    # builtin dataset
    # data = env.Dataset.load_builtin('ml-100k')

    # =============================== load data ============================
    # ml-latest-small
    # file_path = 'input/ml-latest-small/ratings.csv'
    # reader = env.Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
    # ------------------------------------------------------------------------------
    # ml-100k
    file_path = 'input/ml-100k/u.data'
    reader = env.Reader(line_format='user item rating timestamp', sep='\t', skip_lines=1)
    # ------------------------------------------------------------------------------
    # ml-20m
    # file_path = 'input/ml-20m/ratings.csv'
    # reader = env.Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
    # ==============================================================================

    data = env.Dataset.load_from_file(file_path, reader=reader)
    data.split(n_folds=5)

    algo = env.SVDpp()
    # evaluate_topn(algo, data, top_n=100, threshold=3, verbose=1)
    env.evaluate(algo, data, measures=['rmse', 'mae', 'fcp'], verbose=1)
from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise import evaluate
import pandas as pd
from pandas import plotting
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')

reader = Reader()
ratings = pd.read_csv('./tmdb-5000-movie-dataset/ratings.csv')
ratings.head()

data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
data.split(n_folds=5)

svd = SVD()
evaluate(svd, data, measures=['RMSE', 'MAE'])

trainset = data.build_full_trainset()
svd.fit(trainset)

X = raw_input("Select a UserId")
Y = raw_input("Select a MovieId")
print(svd.predict(int(X), int(Y), 3))
# ml-latest-small
# file_path = 'input/ml-latest-small/ratings.csv'
# reader = env.Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
# ------------------------------------------------------------------------------
# ml-100k
file_path = 'input/ml-100k/u.data'
reader = env.Reader(line_format='user item rating timestamp', sep='\t', skip_lines=1)
# ------------------------------------------------------------------------------
# ml-20m
# file_path = 'input/ml-20m/ratings.csv'
# reader = env.Reader(line_format='user item rating timestamp', sep=',', skip_lines=1)
# ==============================================================================

data = env.Dataset.load_from_file(file_path, reader=reader)
data.split(n_folds=5)

# define algorithm
algo = WAPR(learning_rate=0.01, factor_num=20, epoch_num=1, batch_num=512,
            alpha=0.01, eps=1e-2, random=False)

# evaluate
# topn.evaluate_topn(algo, data, top_n=100, threshold=4.5)
env.evaluate(algo, data, measures=['fcp'])
        AlgoBase.train(self, trainset)

        # Compute baselines and similarities
        self.bu, self.bi = self.compute_baselines()
        self.sim = self.compute_similarities()

    def estimate(self, u, i):

        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible('User and/or item is unknown.')

        # Compute similarities between u and v, where v describes all other
        # users that have also rated item i.
        neighbors = [(v, self.sim[u, v]) for (v, r) in self.trainset.ir[i]]
        # Sort these neighbors by similarity
        neighbors = sorted(neighbors, key=lambda x: x[1], reverse=True)

        print('The 3 nearest neighbors of user', str(u), 'are:')
        for v, sim_uv in neighbors[:3]:
            print('user {0:} with sim {1:1.2f}'.format(v, sim_uv))

        # ... Aaaaand return the baseline estimate anyway ;)
        bsl = self.trainset.global_mean + self.bu[u] + self.bi[i]
        return bsl


data = Dataset.load_builtin('ml-100k')
algo = MyOwnAlgorithm()

evaluate(algo, data)
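# A small follow-up sketch: instead of (or after) evaluate(), train on the full
# trainset and query a single raw user/item pair. The ids are illustrative
# ml-100k raw ids, which are strings.
trainset = data.build_full_trainset()
algo.train(trainset)
pred = algo.predict(uid=str(196), iid=str(302))
print(pred.est)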
data = Dataset.load_from_file(file_path, reader=reader)
data.split(n_folds=5)

sim_options = {
    'name': 'pearson_baseline',
    'shrinkage': 0  # no shrinkage
}

# Try several neighborhood sizes and keep the one with the lowest mean RMSE.
knnbasic_ambiente = KNNBasic()
k_neig = np.array([40, 45, 50, 60])
for i in range(0, len(k_neig)):
    knnbasic_ambiente = KNNBasic(k=k_neig[i])
    perf = evaluate(knnbasic_ambiente, data, measures=['RMSE', 'MAE'], verbose=0)
    print('k =', k_neig[i], 'mean RMSE', np.array(perf['rmse']).mean())

# The best k for 'ambiente' is 40.
knnbasic_ambiente = KNNBasic(k=40)

# Retrieve the trainset.
trainset = data.build_full_trainset()
knnbasic_ambiente.train(trainset)

from sklearn.externals import joblib
joblib.dump(knnbasic_ambiente, 'knnbasic_ambiente.pkl')

#### KNN for 'comida' (food) ####