def trainer(self):
    # Set the random seed that numpy (used internally by Surprise) will use.
    # numpy.random.seed() only accepts values up to 2**32 - 1.
    my_seed = random.randint(0, 2**32 - 1)
    random.seed(my_seed)
    numpy.random.seed(my_seed)

    # Reassurance that the script is actually running.
    self.printer(
        "\nNow training on the MovieLens latest small dataset. (8 folds used)")
    self.printer("Please wait...\n")

    # Define the file's format.
    reader = Reader(line_format='user item rating timestamp', sep=',')

    # Load the data from the ratings.csv file.
    data = Dataset.load_from_file('./ml-latest-small/ratings.csv', reader=reader)

    # Use the SVD algorithm for prediction.
    method = SVD()

    start = time.time()

    # Use 8-fold cross validation and evaluate the results with RMSE and MAE.
    measurements = cross_validate(method, data, measures=['RMSE', 'MAE'], cv=8,
                                  verbose=False, n_jobs=-2,
                                  return_train_measures=True)

    # Print the random seed used for fold assignments.
    self.printer("Random seed used for fold assignment: {}\n".format(my_seed))

    # Show the stats.
    meanFitTime = numpy.mean(measurements["fit_time"])
    meanTestTime = numpy.mean(measurements["test_time"])
    meanTestMAE = numpy.mean(measurements["test_mae"])
    meanTestRMSE = numpy.mean(measurements["test_rmse"])
    meanTrainMAE = numpy.mean(measurements["train_mae"])
    meanTrainRMSE = numpy.mean(measurements["train_rmse"])
    self.printer("Mean fit time per fold: {:0.5f} seconds".format(meanFitTime))
    self.printer("Mean test time per fold: {:0.5f} seconds".format(meanTestTime))
    self.printer("Mean train MAE per fold: {:0.5f}".format(meanTrainMAE))
    self.printer("Mean train RMSE per fold: {:0.5f}".format(meanTrainRMSE))
    self.printer("Mean test MAE per fold: {:0.5f}".format(meanTestMAE))
    self.printer("Mean test RMSE per fold: {:0.5f}\n".format(meanTestRMSE))

    # Train with the full dataset.
    trainset = data.build_full_trainset()
    method.fit(trainset)

    end = time.time()
    spent = end - start
    self.printer("Training and testing time: {:0.3f} seconds\n".format(spent))

    process = psutil.Process(os.getpid())
    self.printer("Memory used:")
    self.printer("{:0.5f}".format(process.memory_info().rss / 1048576.0) +
                 " MB Physical")
    self.printer("{:0.5f}".format(process.memory_info().vms / 1048576.0) +
                 " MB Virtual")

    return method, trainset
def main():
    # Initialize dataset (from old code).
    Y_train = np.loadtxt('data/train.txt').astype(int)
    Y_test = np.loadtxt('data/test.txt').astype(int)

    M = max(max(Y_train[:, 0]), max(Y_test[:, 0])).astype(int)  # users
    N = max(max(Y_train[:, 1]), max(Y_test[:, 1])).astype(int)  # movies
    print("Factorizing with M: ", M, " users, N: ", N, " movies.")

    # Load data with Surprise.
    reader = Reader(line_format='user item rating', sep='\t')
    Y_train = Dataset.load_from_file('data/train.txt', reader=reader)
    Y_test = Dataset.load_from_file('data/test.txt', reader=reader)
    trainset = Y_train.build_full_trainset()
    testset = Y_test.build_full_trainset().build_testset()

    K = 20
    reg = 0.1
    lr = 0.01

    # PART 5-3: INTRODUCE MEAN AND REGULARIZED BIAS TERMS
    # (based off of Step 1c in the guide)

    # Create the model and fit it.
    algo = SVD(n_factors=K, lr_all=lr, reg_all=reg, n_epochs=30, biased=True)
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Evaluate error using the err function from the problem set.
    E_out = get_err(predictions)
    print('E_out (MSE): ', E_out)

    # Try GridSearchCV
    '''
    param_grid = {'n_epochs': [10, 15, 20, 25, 30],
                  'lr_all': [0.002, 0.005, 0.01, 0.02, 0.03],
                  'reg_all': [0.005, 0.01, 0.05, 0.1, 0.2, 0.3]}
    gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
    gs.fit(Y_train)
    print('Grid Search:')
    print(0.5 * gs.best_score['rmse'] ** 2)
    print(gs.best_params['rmse'])
    '''
    # Results: best params were n_epochs=30, reg=0.1, lr=0.01

    # Apply SVD to V.
    V = algo.qi.T
    U = algo.pu
    A, s, B = np.linalg.svd(V)

    # Use the first 2 columns of A.
    A2 = A[:, :2]
    U_projected = np.dot(A2.T, U.T)
    V_projected = np.dot(A2.T, V).T
    X = V_projected[:, 0]
    Y = V_projected[:, 1]
    visualize(X, Y, '5-3')
def Liked(id):
    testSubject = id
    ml = MovieLens()

    print("Loading movie ratings...")
    data = ml.loadMovieLensLatestSmall()

    userRatings = ml.getUserRatings(testSubject)
    loved = []
    hated = []
    for ratings in userRatings:
        if (float(ratings[1]) > 4.0):
            loved.append(ratings)
        if (float(ratings[1]) < 3.0):
            hated.append(ratings)

    print("\nUser ", testSubject, " loved these movies:")
    for ratings in loved:
        print(ml.getMovieName(ratings[0]))
    print("\n...and didn't like these movies:")
    for ratings in hated:
        print(ml.getMovieName(ratings[0]))

    print("\nBuilding recommendation model...")
    trainSet = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainSet)

    print("Computing recommendations...")
    testSet = BuildAntiTestSetForUser(testSubject, trainSet)
    predictions = algo.test(testSet)

    recommendations = []
    print("\nWe recommend:")
    for userID, movieID, actualRating, estimatedRating, _ in predictions:
        intMovieID = int(movieID)
        recommendations.append((intMovieID, estimatedRating))

    recommendations.sort(key=lambda x: x[1], reverse=True)

    s = "\n" + str(id)
    for ratings in recommendations[:10]:
        s += "," + ml.getMovieName(ratings[0])

    # Rewrite the likes file, dropping any previous entry for this user
    # before appending the new recommendation line.
    file = open("E:\\Neeraj\\LikhedBase.txt", "r")
    alld = file.readlines()
    file.close()

    file1 = open("E:\\Neeraj\\LikhedBase.txt", "w")
    for r1 in alld:
        print(r1)
        u = r1.find(",")
        if (r1[0:u] == str(id)):
            pass
        else:
            file1.write(r1)
    file1.write(s)
    file1.close()

    print("\nDone")
def recommender_testing(file_name):
    '''Perform testing on the recommender, main function

    :param file_name: dataset file name
    :return:
    '''
    print('began testing')
    listening_data = pd.read_table(file_name)
    raw_data = listening_data.drop(listening_data.columns[1], axis=1)
    raw_data.columns = ['user', 'artist', 'plays']

    # Drop NaN rows
    data = raw_data.dropna()
    data = data.copy()

    # Create numeric user_id and artist_id columns
    data['user'] = data['user'].astype("category")
    data['artist'] = data['artist'].astype("category")
    data['user_id'] = data['user'].cat.codes
    data['artist_id'] = data['artist'].cat.codes

    # from Surprise documentation
    algo = SVD()
    reader = Reader(rating_scale=(1, 50))
    sampled_data = data.sample(500000)
    surprise_data = Dataset.load_from_df(
        sampled_data[['user_id', 'artist_id', 'plays']], reader)
    trainset = surprise_data.build_full_trainset()
    algo.fit(trainset=trainset)
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    top_n = get_top_n(predictions, n=10)

    # Print the recommended items for each user
    for uid, user_ratings in top_n.items():
        print(uid, [iid for (iid, _) in user_ratings])

    cross_validate(algo, surprise_data, ['RMSE', 'MAE'], cv=4, verbose=True)

    # The implicit library expects data as an item-user matrix, so we
    # create two matrices: one for fitting the model (item-user)
    # and one for recommendations (user-item)
    sparse_item_user = scipy.sparse.csr_matrix(
        (data['plays'].astype(float), (data['artist_id'], data['user_id'])))
    sparse_user_item = scipy.sparse.csr_matrix(
        (data['plays'].astype(float), (data['user_id'], data['artist_id'])))

    # Initialize the ALS model and fit it using the sparse item-user matrix
    model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1,
                                                 iterations=20)
    data_conf = (sparse_item_user).astype('double')
    model.fit(data_conf)

    surprise_testing(surprise_data)
def train_user_base(movies):
    reader = Reader()
    ratings = pd.read_csv('ratings_small.csv')
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
    data.split(n_folds=5)
    svd = SVD()
    trainset = data.build_full_trainset()
    svd.fit(trainset)
    return svd
def recomendacion(usuario):
    array = []
    for rate in Calificacion.objects.all():
        array.append([rate.usuario_id, rate.asignatura_id, rate.calificacion])
    df = pd.DataFrame(data=array)

    reader = Reader(rating_scale=(0, 10))
    data = Dataset.load_from_df(df, reader)
    trainingSet = data.build_full_trainset()

    param_grid = {
        'n_factors': [50, 100, 150],
        "n_epochs": [40, 50, 60],
        "lr_all": [0.002, 0.005],
        "reg_all": [0.4, 0.6]
    }
    gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)
    gs.fit(data)

    # Optimal parameters
    params = gs.best_params["rmse"]
    SVDoptimized = SVD(n_factors=params['n_factors'],
                       n_epochs=params['n_epochs'],
                       lr_all=params['lr_all'],
                       reg_all=params['reg_all'])
    SVDoptimized.fit(trainingSet)

    asig = Asignatura.objects.all()
    asig_user = Calificacion.objects.all().filter(usuario_id=usuario.id)

    # Subjects the user has not rated yet
    asignaturas_SinC = []
    for asignatura in asig:
        encontrado = False
        for asignatura_usuario in asig_user:
            if (asignatura_usuario.asignatura_id == asignatura.codigo):
                encontrado = True
        if (not encontrado):
            asignaturas_SinC.append(asignatura)

    # Recommended subjects
    asignaturas_rec = []
    for asignatura in asignaturas_SinC:
        asignaturas_rec.append({
            'asignatura': asignatura,
            'svd': SVDoptimized.predict(usuario.id, asignatura.codigo).est
        })

    # A function that returns the predicted 'svd' score, used as the sort key:
    def ordenador(e):
        return e['svd']

    asignaturas_rec.sort(reverse=True, key=ordenador)
    return asignaturas_rec
def get_svd(df_ratings):
    reader = Reader()

    # training
    data = Dataset.load_from_df(df_ratings, reader)
    data.split(n_folds=5)
    svd = SVD()
    trainset = data.build_full_trainset()
    svd.fit(trainset)
    return svd
def train(data):
    reader = Reader()
    svd = SVD()
    data_sp = Dataset.load_from_df(data[['user_id', 'movie_id', 'rating']], reader)
    train = data_sp.build_full_trainset()
    svd.fit(train)
    return svd
def train_model(data):
    """
    Accepts dataset and returns trained model
    """
    trainsetfull = data.build_full_trainset()
    algo = SVD()
    cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    algo.fit(trainsetfull)
    return algo
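# A minimal usage sketch for train_model() above, assuming the Surprise built-in
# ml-100k dataset; the dataset choice and the (user, item) ids below are
# illustrative assumptions, not part of the original snippet.
from surprise import Dataset

data = Dataset.load_builtin('ml-100k')
algo = train_model(data)

# Raw ids in ml-100k are strings; .est holds the predicted rating.
print(algo.predict('196', '302').est)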
def svdalgorithm(trainset, testset):
    print("\n" + "-" * 5 + " SVD algorithm using surprise package " + "-" * 5)
    algo = SVD()
    algo.fit(trainset)
    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return rmse, mae, predictions
class SVDCollaborativeFiltering:
    # Based on the Singular Value Decomposition (SVD) implementation built into the surprise library.
    # Uses a matrix factorization method to reduce a matrix into lower-dimension parts, simplifying the calculations.

    def __init__(self, ratings):
        # The Surprise library does not allow using data frames directly as training and test set values.
        reader = Reader(rating_scale=(1, 5))
        data = Dataset.load_from_df(ratings[['user_id', 'book_id', 'rating']], reader)
        self.train, self.test = train_test_split(data, test_size=.20)
        self.model = SVD()

    def test_model(self):
        # Checks the predicted values against the test set.
        # Returns Mean Absolute Error (MAE) and Root Mean Square Error (RMSE) accuracy.
        predictions = self.model.test(self.test)
        return accuracy.mae(predictions, verbose=False), accuracy.rmse(predictions, verbose=False)

    def train_model(self):
        # Trains the model on the training set (80% of the total ratings data).
        self.model.fit(self.train)

    def predict(self, user_id, books, ratings, already_read=None):
        # Predicts recommended books for a given user.

        # Gets all unread books.
        if already_read is None:
            already_read = ratings[ratings['user_id'] == user_id]['book_id'].unique()
        prediction = books[[
            'book_id', 'title', 'authors', 'average_rating', 'image_url'
        ]].copy()
        prediction = prediction[~prediction['book_id'].isin(already_read)]

        # Predicts a rating for each book and sorts them.
        prediction['predict'] = prediction['book_id'].apply(
            lambda x: self.model.predict(user_id, x).est)
        prediction = prediction.sort_values('predict', ascending=False)
        return convert(prediction)

    def save(self, location):
        # Fully saves the model.
        pickle.dump(self, open(location, 'wb'))

    @staticmethod
    def load(location):
        # Loads the model.
        infile = open(location, 'rb')
        obj = pickle.load(infile)
        infile.close()
        return obj
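# A minimal usage sketch for SVDCollaborativeFiltering, assuming pandas
# DataFrames with the column names the class expects ('user_id', 'book_id',
# 'rating' for ratings; 'book_id', 'title', 'authors', 'average_rating',
# 'image_url' for books); the file paths are assumptions and convert() is
# defined elsewhere in this code.
import pandas as pd

ratings = pd.read_csv('ratings.csv')   # assumed path
books = pd.read_csv('books.csv')       # assumed path

cf = SVDCollaborativeFiltering(ratings)
cf.train_model()
mae, rmse = cf.test_model()
print('MAE: {:.4f}  RMSE: {:.4f}'.format(mae, rmse))

# Ranked, unread-book recommendations for one user.
recommended = cf.predict(user_id=1, books=books, ratings=ratings)

# Persist and reload the whole recommender object.
cf.save('svd_cf.pkl')
loaded = SVDCollaborativeFiltering.load('svd_cf.pkl')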
def create_utility_matrix(self):
    trainsetfull = self.data.build_full_trainset()
    testsetfull = trainsetfull.build_anti_testset()
    algo = SVD(n_factors=85)  # n_factors=80, reg_all=0.05
    algo.fit(trainsetfull)
    self.predictions = algo.test(testsetfull)
def SVDAlgorithmPredict(self, k):
    model = SVD(n_factors=k)
    model.fit(self.Train)
    predictions = model.test(self.Test)
    df = pd.DataFrame(predictions, columns=[
        'user_id', 'song_id', 'listen_count', 'prediction', 'details'
    ])
    return model, df
class GlobalProportionAlgo(AlgoBase):

    def __init__(self, cat_products, cat_target):
        """
        This method recommends items one at a time, each time taking the item
        with the best similarity from the category that is currently furthest
        below its target proportion among the results obtained so far.
        """
        AlgoBase.__init__(self)

        # The model that gives us the \hat{r}_ij.
        self.SVD = SVD()

        # The information for the partnership.
        self.cat_products = cat_products
        self.cat_target = cat_target

    def fit(self, trainset):
        AlgoBase.fit(self, trainset)
        self.SVD.fit(trainset)
        return self

    def preprocess(self, test_set):
        C = len(self.cat_target)
        self.predicted = dict()
        heaps = [[] for _ in range(C)]
        n = 0
        current_prop = np.zeros(C)

        # We use C heaps to gather the similarities.
        for u, i, _ in test_set:
            heapq.heappush(heaps[self.cat_products[i]],
                           (self.SVD.estimate(u, i), u, i))

        while 1:
            if n == 0:
                selected_category = np.argmax(
                    np.array([heap == [] for heap in heaps]))
            else:
                status = current_prop - self.cat_target * (n + 1)
                status = np.abs(np.clip(status, a_min=None, a_max=0))
                for c in range(C):
                    if heaps[c] == []:
                        status[c] = -1
                selected_category = np.argmax(status)

            continu = True
            while heaps[selected_category] != [] and continu:
                est, u, i = heapq.heappop(heaps[selected_category])
                if not (int(u) in self.predicted):
                    self.predicted[int(u)] = int(i)
                    current_prop[selected_category] += 1
                    n += 1
                    continu = False

            if heaps == [[] for _ in range(C)]:
                return

    def estimate(self, u, i):
        return -1
def main():
    train_y = np.loadtxt('data/train.txt').astype(int)
    test_y = np.loadtxt('data/test.txt').astype(int)

    reader = Reader()
    Y_train = Dataset.load_from_file('data/train.txt', reader)
    Y_train = Y_train.build_full_trainset()

    # regularization factor (0.1 was the best)
    regs = [10**-4, 10**-3, 10**-2, 10**-1, 1]
    # learning rates (0.01 was the best)
    eta = [0.005, 0.01, 0.03, 0.05, 0.07]
    # number of latent factors (5 is the best)
    Ks = [5, 10, 15, 20, 30, 40]

    E_ins = []
    E_outs = []

    # Used to compute E_in and E_out
    for reg in regs:
        E_ins_for_lambda = []
        E_outs_for_lambda = []
        for k in Ks:
            print('MODEL')
            algo = SVD(n_factors=k, n_epochs=300, biased=True, lr_all=0.01,
                       reg_all=reg)
            algo.fit(Y_train)
            e_in = error(train_y, algo)
            E_ins_for_lambda.append(e_in)
            eout = error(test_y, algo)
            E_outs_for_lambda.append(eout)
        E_ins.append(E_ins_for_lambda)
        E_outs.append(E_outs_for_lambda)

    for i in range(len(regs)):
        plt.plot(Ks, E_ins[i], label='$E_{in}, \lambda=$' + str(regs[i]))
    plt.title('$E_{in}$ vs. Number of Latent Factors (K)')
    plt.xlabel('K')
    plt.ylabel('Error')
    plt.legend()
    plt.savefig('E_in_SURPRISE(Latent Factors).png')
    plt.clf()

    for i in range(len(regs)):
        plt.plot(Ks, E_outs[i], label='$E_{out}, \lambda=$' + str(regs[i]))
    plt.title('$E_{out}$ vs. Number of Latent Factors (K)')
    plt.xlabel('K')
    plt.ylabel('Error')
    plt.legend()
    plt.savefig('E_out_SURPRISE(Latent Factors).png')
def SVD_surprise_only(Trainset, N=30):
    reader = Reader()
    Trainset_changetype = Dataset.load_from_df(
        Trainset[['Member_encoding', 'Game_encoding', 'score']], reader)
    Trainset_changetype_result = Trainset_changetype.build_full_trainset()

    svd = SVD(
        n_factors=20,
        n_epochs=20,
        lr_all=0.01,  # 0.0001,
        random_state=1234)
    svd.fit(Trainset_changetype_result)

    # Get our unique games that were purchased
    games = list(Trainset.Game_encoding.unique())

    # model SVD_New
    data = np.transpose(np.dot(svd.pu, np.transpose(svd.qi)))
    x = cosine_similarity(data, data)
    cosine_sim_x = pd.DataFrame(data=x, index=games, columns=games)

    gamesplayed = Trainset.groupby(
        ['Member_encoding'])['Game_encoding'].apply(list).reset_index(name='games')
    gamesmax = np.array(
        gamesplayed.games.map(
            lambda x: ((cosine_sim_x.loc[x, :].values).max(axis=0))))
    gamelist = np.array(cosine_sim_x.columns)

    def Get_neighbor_30(x):
        # x[x>0.99] = 0.0
        return (gamelist[np.flip(np.argsort(x, axis=0))[0:N, ]])

    filtered = list(map(Get_neighbor_30, gamesmax))
    filtered_array = np.array(filtered)
    filtered_array = filtered_array.reshape(
        filtered_array.shape[0] * filtered_array.shape[1], -1)
    filtered_array = filtered_array.reshape(-1, )

    SVD_Neighbor = pd.DataFrame({
        'Member_encoding':
        np.repeat(np.array(np.unique(Trainset.Member_encoding)), N, axis=0),
        'Game_encoding':
        filtered_array
    })

    # SVD_Neighbor_result = SVD_Neighbor.groupby('member_id').head(12)
    SVD_Neighbor_result = SVD_Neighbor.merge(
        Trainset[['Member_encoding', 'Game_encoding', 'score']],
        how='left',
        on=['Member_encoding', 'Game_encoding'])
    SVD_Neighbor_result.score = np.where(SVD_Neighbor_result.score.isna(), 0,
                                         SVD_Neighbor_result.score)
    SVD_Neighbor_result = SVD_Neighbor_result.sort_values(
        by=['Member_encoding', 'score'], ascending=False)
    SVD_Neighbor_result = SVD_Neighbor_result.groupby('Member_encoding').head(12)

    return SVD_Neighbor, SVD_Neighbor_result
def hybrid_rec(userid, favemovie, n):
    '''Takes in a userid, favemovie and n number of recs,
    and outputs those in a sorted list.'''
    rec_hybrid = content_recommendations(favemovie, n)
    svd = SVD(n_factors=50, reg_all=0.05, random_state=150)
    trainset = data.build_full_trainset()
    svd.fit(trainset)
    for index, row in rec_hybrid.iterrows():
        pred = svd.predict(userid, index)
        rec_hybrid.at[index, 'score'] = pred.est
    rec_hybrid = rec_hybrid.sort_values('score', ascending=False)
    return rec_hybrid
def get_predictions(data):
    # First train an SVD algorithm on the movielens dataset.
    trainset = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)

    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    return predictions
def biassvd(dataset):
    start = time.time()
    algo = SVD(biased=True)
    kf = KFold(n_splits=5)
    for trainset, testset in kf.split(dataset):
        algo.fit(trainset)
        predictions = algo.test(testset)
        acc = accuracy.rmse(predictions, verbose=True)
    end = time.time()
    print('biassvd took (minutes):', (end - start) / 60)
    return acc
class SVDRS(AbstractRS):

    def __init__(self, path):
        self.path = path
        self.algo = SVD()

    def train(self):
        try:
            trainset = self.data.build_full_trainset()
            self.algo.fit(trainset)
        except Exception as ex:
            Logger('error').get_log().error(ex)
def create_model(self):
    n = 1000000
    raw_data = self.get_ratings()[:n].fillna(0)[["userId", "id", "rating"]]
    reader = Reader()
    data = Dataset.load_from_df(raw_data, reader)
    data.split(n_folds=5)
    svd = SVD()
    trainset = data.build_full_trainset()
    svd.fit(trainset)
    filename = "C:/datasets/the-movies-dataset/models/collaborative_based/coll_svd.sav"
    joblib.dump(svd, filename)
def SVD_calculation(data, trainset, testset, elapsed, cv):
    start = time.time()
    algo = SVD()
    algo.fit(trainset)
    predictions = algo.test(testset)
    # svd_accuracy = accuracy.rmse(predictions)
    cross_validate_svd_dict = cross_validate(algo, data, measures=['RMSE'],
                                             cv=cv, verbose=True)
    end = time.time()
    elapsed = end - start
    return elapsed, cross_validate_svd_dict
def fit_model(data):
    train, test = train_test_split(data, test_size=0.25)
    svd = SVD(n_epochs=25, lr_all=0.01, reg_all=0.4)
    svd.fit(train)
    pred = svd.test(test)
    print('RMSE for test set: {}'.format(accuracy.rmse(pred)))
    print('MAE for test set: {}'.format(accuracy.mae(pred)))

    # save model
    path = '../Models/Collaborative_filtering2.model'
    pickle.dump(svd, open(path, 'wb'))
    print("Model is saved to: {}".format(path))
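# A minimal sketch of reusing the model saved by fit_model() above; the path
# mirrors the one used in fit_model(), and the (user, item) ids are placeholder
# assumptions.
import pickle

with open('../Models/Collaborative_filtering2.model', 'rb') as f:
    svd = pickle.load(f)

# Estimate a rating for an arbitrary (user, item) pair; .est is the prediction.
print(svd.predict(uid=1, iid=31).est)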
def func2():
    from surprise import SVD
    from surprise import Dataset
    from surprise import accuracy
    from surprise.model_selection import train_test_split

    data = Dataset.load_builtin('ml-100k')
    trainset, testset = train_test_split(data, test_size=.25)
    algo = SVD()
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions)
def generate_svd_recommendation_df() -> pd.DataFrame:
    # Prepare input DataFrame and algorithm
    score_df = genearte_score_df()
    svd_data = MyDataSet(score_df)

    # Try SVD
    algo = SVD()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5-fold validation
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5,
                           verbose=True)
    # Fitting the SVD
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svd = get_top_n(predictions, n=5)
    # print(recommendation_df)

    # Try the NMF
    nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False)
    algo = NMF()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5-fold validation
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5,
                           verbose=True)
    # Fitting the NMF
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    # Then compute RMSE and MAE
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svd = get_top_n(predictions, n=5)
    # print(recommendation_df)

    # ---------------------------------------------------
    # as per - https://bmanohar16.github.io/blog/recsys-evaluation-in-surprise
    knnbasic_cv = cross_validate(KNNBasic(), svd_data, cv=5, n_jobs=5,
                                 verbose=False)
    knnmeans_cv = cross_validate(KNNWithMeans(), svd_data, cv=5, n_jobs=5,
                                 verbose=False)
    knnz_cv = cross_validate(KNNWithZScore(), svd_data, cv=5, n_jobs=5,
                             verbose=False)

    # Matrix factorization based algorithms
    svd_cv = cross_validate(SVD(), svd_data, cv=5, n_jobs=5, verbose=False)
    svdpp_cv = cross_validate(SVDpp(), svd_data, cv=5, n_jobs=5, verbose=False)
    nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False)

    # Other collaborative filtering algorithms
    slope_cv = cross_validate(SlopeOne(), svd_data, cv=5, n_jobs=5, verbose=False)
    coclus_cv = cross_validate(CoClustering(), svd_data, cv=5, n_jobs=5,
                               verbose=False)

    return recommendation_df_svd
def setCFItemBased(self):
    reader = Reader(line_format='item user rating', sep=',', rating_scale=(1, 5))
    data_folds = Dataset.load_from_df(
        self.ratings[['shop_id', 'ch_id', 'rating']], reader)
    trainset = data_folds.build_full_trainset()
    # Matrix factorization; parameters set to the best values found by tuning
    algo = SVD(n_factors=50, n_epochs=20)
    algo.fit(trainset)
    dump.dump('./model/cf_itembase_ForShop.py', algo=algo)
def get(self, algorithm, user_id):
    # SQL query
    conn = mysql.connect()
    cursor = conn.cursor()
    df = pd.read_sql_query("SELECT * FROM story_reviews", conn)

    # Data and Model
    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'story_id', 'star']], reader)

    if algorithm == 'svd':
        print('Using SVD')
        model = SVD()
    elif algorithm == 'svdpp':
        print('Using SVD++')
        model = SVDpp()
    elif (algorithm == 'nmf'):
        print('Using NMF')
        model = NMF()
    elif (algorithm == 'slopeone'):
        print('Using Slope One')
        model = SlopeOne()
    elif (algorithm == 'coclustering'):
        print('Using Co-Clustering')
        model = CoClustering()
    else:
        print('Using SVD')
        model = SVD()

    # Training
    training_set = data.build_full_trainset()
    model.fit(training_set)

    # Prediction
    anti_training_set = training_set.build_anti_testset()
    prediction_set = [x for x in anti_training_set if x[0] == user_id]
    predictions = model.test(prediction_set)

    # TESTING: Run 5-fold Cross Validation using Root Mean Square Error and Mean Absolute Error
    # cross_validate(model, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    # Return Top N Recommendations
    n = 10
    predictions.sort(key=lambda x: x.est, reverse=True)
    top_n_predictions = predictions[:n]

    story_recommendations = []
    for predictionItem in top_n_predictions:
        story_recommendations.append(predictionItem.iid)

    return jsonify(recommendations=story_recommendations)
def get_top_n(teaid, score, qq, n=10):
    # A reader is still needed but only the rating_scale param is required.
    qq = pd.concat([
        qq,
        pd.DataFrame([[score, teaid, 'user1']],
                     columns=['Score', 'Tea Name', 'User Name'])
    ], ignore_index=True)
    reader = Reader(rating_scale=(0, 100))
    algo = SVD()

    # The columns must correspond to user id, item id and ratings (in that order).
    data = Dataset.load_from_df(qq[['User Name', 'Tea Name', 'Score']], reader)
    trainset = data.build_full_trainset()
    algo.fit(trainset)
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
        A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''
    want = []
    for i in predictions:
        if i[0] == 'user1':
            want.append(i)

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in want:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    # Re-rank the recommended teas by Euclidean distance to the query tea.
    teadist = []
    mindist = []
    for i in top_n['user1']:
        eudist = euclidean_distances(
            teadf.iloc[itemdf[itemdf['Tea Name'] == teaid].index, :],
            teadf.iloc[(itemdf[itemdf['Tea Name'] == i[0]].index), :])
        teadist.append((i[0], eudist))
    mindist = sorted(teadist, key=lambda x: x[1])
    return mindist[0], mindist[1], mindist[2]
def do_svd(trainingSet, start_time):
    svd = SVD()
    # evaluate(svd, Dataset.load_builtin("ml-100k"), measures=['RMSE', 'MAE'])
    svd.fit(trainingSet)
    testSet = trainingSet.build_anti_testset()
    print("Training complete")
    predictions = svd.test(testSet)
    print("Predictions ready")
    LOGGER.info("0;Data prediction completed in '%s' minutes",
                str((time.time() - start_time) / 60))
    print("Rmse values for doing svd based recomm on movielens data is " +
          str(accuracy.rmse(predictions)))
    return predictions
def func6():
    from surprise import SVD
    from surprise import Dataset
    from surprise import accuracy
    from surprise.model_selection import KFold

    data = Dataset.load_builtin('ml-100k')
    kf = KFold(n_splits=3)
    algo = SVD()
    for trainset, testset in kf.split(data):
        algo.fit(trainset)
        predictions = algo.test(testset)
        accuracy.rmse(predictions, verbose=True)
def collaborative(self, ratings, user_id):
    reader = Reader()
    # ratings.head()
    temp_ratings = ratings
    data = Dataset.load_from_df(
        temp_ratings[['user_id', 'book_id', 'rating']], reader)
    data.split(n_folds=2)

    ## Training the data ##
    svd = SVD()
    evaluate(svd, data, measures=['RMSE', 'MAE'])

    trainset = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)
    # svd.train(trainset)

    ## Testing the data ##
    from collections import defaultdict

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    count = 0
    for uid, iid, true_r, est, _ in predictions:
        if uid == user_id:
            count = count + 1
            temp_ratings.loc[len(temp_ratings) + 1] = [uid, iid, est]

    # print("count\n")
    # print(count)
    # print("\n--------here-------\n")
    # print(temp_ratings)

    cb = temp_ratings[(temp_ratings['user_id'] == user_id)][['book_id', 'rating']]
    # print("\n--------here-------\n")
    # print(cb)

    return (cb)
from __future__ import (absolute_import, division, print_function,
                        unicode_literals)

from surprise import Dataset
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import KFold

data = Dataset.load_builtin('ml-100k')

algo = SVD()

trainset = data.build_full_trainset()
algo.fit(trainset)
testset = trainset.build_testset()
predictions = algo.test(testset)
# RMSE should be low as we are biased
accuracy.rmse(predictions, verbose=True)  # ~ 0.68 (which is low)

# We can also do this during a cross-validation procedure!
print('CV procedure:')

kf = KFold(n_splits=3)
for i, (trainset_cv, testset_cv) in enumerate(kf.split(data)):
    print('fold number', i + 1)
    algo.fit(trainset_cv)

    print('On testset,', end=' ')
"""This module illustrates how an algorithm can be trained, dumped to a file,
then reloaded and can be used again for making predictions.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os

from surprise import SVD
from surprise import Dataset
from surprise import dump

data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()

algo = SVD()
algo.fit(trainset)

# Compute predictions of the 'original' algorithm.
predictions = algo.test(trainset.build_testset())

# Dump algorithm and reload it.
file_name = os.path.expanduser('~/dump_file')
dump.dump(file_name, algo=algo)
_, loaded_algo = dump.load(file_name)

# We now ensure that the algo is still the same by checking the predictions.
predictions_loaded_algo = loaded_algo.test(trainset.build_testset())
assert predictions == predictions_loaded_algo
print('Predictions are the same')
def hybrid(userId, train_rd):
    # get_ipython().magic('matplotlib inline')
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import stats
    from ast import literal_eval
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
    from nltk.stem.snowball import SnowballStemmer
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import wordnet
    from surprise import Reader, Dataset, SVD, evaluate
    import warnings
    warnings.simplefilter('ignore')

    ## Popularity ##
    md = pd.read_csv('CustomData/FinalData.csv')
    fd = pd.read_csv('avg_ratings1.csv')

    fd[fd['rating'].notnull()]['rating'] = fd[fd['rating'].notnull()]['rating'].astype('float')
    vote_averages = fd[fd['rating'].notnull()]['rating']
    C = vote_averages.mean()

    fd1 = pd.read_csv('ratings_count.csv')
    fd1[fd1['rating'].notnull()]['rating'] = fd1[fd1['rating'].notnull()]['rating'].astype('float')
    vote_counts = fd1[fd1['rating'].notnull()]['rating']

    m = vote_counts.quantile(0.75)

    md['ratings_count'] = fd1['rating']
    md['average_rating'] = fd['rating']

    # print(md.shape)
    qualified = md[(md['ratings_count'].notnull())][[
        'book_id', 'title', 'authors', 'ratings_count', 'average_rating']]
    qualified['ratings_count'] = qualified['ratings_count'].astype('float')
    qualified['average_rating'] = qualified['average_rating'].astype('float')
    # qualified.shape

    def weighted_rating(x):
        v = x['ratings_count']
        R = x['average_rating']
        return (v / (v + m) * R) + (m / (m + v) * C)

    qualified['popularity_rating'] = qualified.apply(weighted_rating, axis=1)
    # qualified['wr']
    # qualified = qualified.sort_values('popularity_rating', ascending=False).head(250)
    pop = qualified[['book_id', 'popularity_rating']]
    # print(qualified.shape)
    # print(pop.shape)

    ## Collaborative ##
    reader = Reader()
    ratings = train_rd
    # ratings = pd.read_csv('ratings.csv')
    # ratings.head()
    temp_ratings = ratings[0:1000]
    # print(temp_ratings)
    data = Dataset.load_from_df(
        temp_ratings[['user_id', 'book_id', 'rating']], reader)
    data.split(n_folds=2)

    svd = SVD()
    evaluate(svd, data, measures=['RMSE', 'MAE'])

    trainset = data.build_full_trainset()
    # svd.train(trainset)
    algo = SVD()
    algo.fit(trainset)

    ## usefule = temp_rating[rating]

    # print(len(temp_ratings[temp_ratings['user_id']==userId]))

    def get_top_n(predictions, n=10):
        '''Return the top-N recommendation for each user from a set of predictions.

        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            n(int): The number of recommendation to output for each user. Default
                is 10.

        Returns:
            A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n.
        '''
        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            top_n[uid].append((iid, est))

        # Then sort the predictions for each user and retrieve the k highest ones.
        for uid, user_ratings in top_n.items():
            # user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]

        return top_n

    from collections import defaultdict

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    '''
    top_n = get_top_n(predictions, n=10000)
    #print(top_n)
    #result = pd.DataFrame(top_n)
    #print(result)
    for uid, user_ratings in top_n.items():
        #print(uid, [iid for (iid , _) in user_ratings])
        for uid, iid, true_r, est, _ in predictions:
            temp_ratings.loc[uid] = [uid, iid, est]
            #temp_ratings[i]['cf'] = temp_ratings[(temp_ratings['user_id'] == uid)][['book_id']]
    '''

    count = 0
    for uid, iid, true_r, est, _ in predictions:
        if uid == userId:
            count = count + 1
            temp_ratings.loc[len(temp_ratings) + 1] = [uid, iid, est]
            # print('here')
            # print(uid)
            # temp_ratings.append([uid,iid,est], ignore_index=True)

    # print(count)
    # print(temp_ratings)

    # print(len(temp_ratings[temp_ratings['user_id']==2]))

    ##### CONTENT ######
    md = pd.read_csv('CustomData/FinalData.csv')
    rd = train_rd
    # rd = pd.read_csv('ratings.csv')
    md['book_id'] = md['book_id'].astype('int')
    rd['book_id'] = rd['book_id'].astype('int')
    rd['user_id'] = rd['user_id'].astype('int')
    rd['rating'] = rd['rating'].astype('int')
    # print(md.head())

    md['authors'] = md['authors'].str.replace(' ', '')
    md['authors'] = md['authors'].str.lower()
    md['authors'] = md['authors'].str.replace(',', ' ')
    # print(md.head())

    md['authors'] = md['authors'].apply(lambda x: [x, x])
    # print(md['authors'])

    md['Genres'] = md['Genres'].str.split(';')
    # print(md['Genres'])

    md['soup'] = md['authors'] + md['Genres']
    # print(md['soup'])

    md['soup'] = md['soup'].str.join(' ')
    # md['soup'].fillna({})
    # print(md['soup'])

    count = CountVectorizer(analyzer='word', ngram_range=(1, 1), min_df=0,
                            stop_words='english')
    count_matrix = count.fit_transform(md['soup'])
    # print(count_matrix.shape)
    # print np.array(count.get_feature_names())
    # print(count_matrix.shape)
    cosine_sim = cosine_similarity(count_matrix, count_matrix)

    def build_user_profiles():
        user_profiles = np.zeros((53421, 999))
        # print(rd.iloc[0]['user_id'])
        # len(rd['book_id'])
        for i in range(0, 1000):
            u = rd.iloc[i]['user_id']
            b = rd.iloc[i]['book_id']
            # print(u, b)
            # print(i)
            # if b < 999:
            #     print("match at " + str(b))
            user_profiles[u][b - 1] = rd.iloc[i]['rating']
        # print(user_profiles)
        return user_profiles

    user_profiles = build_user_profiles()

    def _get_similar_items_to_user_profile(person_id):
        # Computes the cosine similarity between the user profile and all item profiles
        # print(user_profiles[person_id])
        # print("\n---------\n")
        # print(cosine_sim[0])
        user_ratings = np.empty((999, 1))
        cnt = 0
        for i in range(0, 998):
            book_sim = cosine_sim[i]
            user_sim = user_profiles[person_id]
            user_ratings[i] = (book_sim.dot(user_sim)) / sum(cosine_sim[i])

        maxval = max(user_ratings)
        # print(maxval)

        for i in range(0, 998):
            user_ratings[i] = ((user_ratings[i] * 5.0) / (maxval))
            # print(user_ratings[i])
            if (user_ratings[i] > 3):
                # print("MILA KUCCHHH")
                cnt += 1
        # print(max(user_ratings))
        # print(cnt)
        # print(cosine_similarities)
        # return similar_items
        return user_ratings

    content_ratings = _get_similar_items_to_user_profile(userId)

    num = md[['book_id']]
    # print(num)
    num1 = pd.DataFrame(data=content_ratings[0:, 0:])
    frames = [num, num1]
    # result = pd.concat([df1, df4], axis=1, join_axes=[df1.index])
    mer = pd.concat(frames, axis=1, join_axes=[num.index])
    mer.columns = ['book_id', 'content_rating']
    # print(mer.shape)
    # print('here')
    # print(mer)

    ## for user 2 ##
    # print(temp_ratings.shape)
    cb = temp_ratings[(temp_ratings['user_id'] == userId)][['book_id', 'rating']]
    # print(cb.shape)
    # print(pop.shape)

    hyb = md[['book_id']]
    hyb = hyb.merge(cb, on='book_id')
    hyb = hyb.merge(pop, on='book_id')
    hyb = hyb.merge(mer, on='book_id')
    # hyb.shape

    def weighted_rating(x):
        v = x['rating']
        R = x['popularity_rating']
        c = x['content_rating']
        return 0.4 * v + 0.2 * R + 0.4 * c

    print(hyb)
    hyb['final'] = hyb.apply(weighted_rating, axis=1)
    hyb = hyb.sort_values('final', ascending=False).head(999)
    # print(hyb['final'])
    print(hyb)

    return hyb