def svd(trainset, testset, predset):
    modelname = 'svd'
    # Check if predictions already exist
    if is_already_predicted(modelname):
        return

    algo = SVD(n_factors=100, n_epochs=40, lr_bu=0.01, lr_bi=0.01, lr_pu=0.1,
               lr_qi=0.1, reg_bu=0.05, reg_bi=0.05, reg_pu=0.09, reg_qi=0.1)

    print('SVD Model')
    algo.fit(trainset)

    predictions = algo.test(trainset.build_testset())
    print('  RMSE on Train: ', accuracy.rmse(predictions, verbose=False))

    predictions = algo.test(testset)
    rmse = accuracy.rmse(predictions, verbose=False)
    print('  RMSE on Test: ', rmse)

    preds = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds[j] = pred.est
    save_predictions(modelname, rmse, preds, 'test')

    print('  Evaluate predicted ratings...')
    predictions = algo.test(predset)
    preds = np.zeros(len(predictions))
    for j, pred in enumerate(predictions):
        preds[j] = pred.est
    save_predictions(modelname, rmse, preds)
def svd_factorization():
    """Predict games for the user with user_key = 158123."""
    target_user_key = 158123
    run_reduce_dataset = True

    # reduce dataset:
    if run_reduce_dataset:
        df = import_all_reviews()
        df_reduced = reduce_reviews(df)
        export_reviews(df_reduced)

    # import reduced dataset:
    df = import_reduced_reviews()

    # check for and drop duplicates:
    duplicates = len(df) - len(df.drop_duplicates(subset=['game_key', 'user_key']))
    df = df.drop_duplicates(subset=['game_key', 'user_key'])
    print('duplicates removed: ' + str(duplicates))

    # check out our user:
    df_target_user = df[df['user_key'] == target_user_key]

    # build utility matrix:
    data_pivot = df.pivot(index='user_key', columns='game_key', values='rating')

    # calculate sparsity
    sparsity = data_pivot.isnull().sum().sum() / data_pivot.size
    print('Sparsity of utility matrix: ' + str(sparsity))

    # the Reader belongs to scikit-surprise
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(df[['user_key', 'game_key', 'rating']], reader)

    # split into training and test set
    trainset, testset = train_test_split(data, test_size=0.2)

    # apply the SVD algorithm:
    algo = SVD()
    algo.fit(trainset)
    predictions = algo.test(testset)

    # evaluation:
    rmse = accuracy.rmse(predictions)
    print('RMSE of: ' + str(rmse))

    ### Prediction for the target user:
    # predict ratings for the (user, game) pairs of the target user
    target_data = Dataset.load_from_df(
        df_target_user[['user_key', 'game_key', 'rating']], reader)
    target_testset = target_data.build_full_trainset().build_testset()
    predictions = algo.test(target_testset)
def generate_svd_recommendation_df() -> pd.DataFrame:
    # Prepare input DataFrame and algorithm
    score_df = genearte_score_df()
    svd_data = MyDataSet(score_df)

    # Try SVD
    algo = SVD()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5-fold cross-validation
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fit the SVD
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    # Then compute RMSE
    accuracy.rmse(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svd = get_top_n(predictions, n=5)
    # print(recommendation_df)

    # Try NMF
    nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False)
    algo = NMF()
    full_train_set = svd_data.build_full_trainset()
    test_set = full_train_set.build_anti_testset()
    # 5-fold cross-validation
    score = cross_validate(algo, svd_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # Fit the NMF
    algo.fit(full_train_set)
    predictions = algo.test(test_set)
    # Then compute RMSE and MAE
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    # Generate recommendation DataFrame
    recommendation_df_svd = get_top_n(predictions, n=5)
    # print(recommendation_df)

    # ---------------------------------------------------
    # as per https://bmanohar16.github.io/blog/recsys-evaluation-in-surprise
    # Neighborhood-based algorithms
    knnbasic_cv = cross_validate(KNNBasic(), svd_data, cv=5, n_jobs=5, verbose=False)
    knnmeans_cv = cross_validate(KNNWithMeans(), svd_data, cv=5, n_jobs=5, verbose=False)
    knnz_cv = cross_validate(KNNWithZScore(), svd_data, cv=5, n_jobs=5, verbose=False)

    # Matrix factorization based algorithms
    svd_cv = cross_validate(SVD(), svd_data, cv=5, n_jobs=5, verbose=False)
    svdpp_cv = cross_validate(SVDpp(), svd_data, cv=5, n_jobs=5, verbose=False)
    nmf_cv = cross_validate(NMF(), svd_data, cv=5, n_jobs=5, verbose=False)

    # Other collaborative filtering algorithms
    slope_cv = cross_validate(SlopeOne(), svd_data, cv=5, n_jobs=5, verbose=False)
    coclus_cv = cross_validate(CoClustering(), svd_data, cv=5, n_jobs=5, verbose=False)
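# Note: several snippets in this section call a get_top_n() helper without defining it.
# The sketch below is a minimal, assumed implementation modeled on the example in the
# Surprise documentation; the original repos may use their own variants (e.g. returning
# a DataFrame or filtering to a single user).
from collections import defaultdict


def get_top_n(predictions, n=10):
    """Map each user id to their n highest-estimated (item id, rating) pairs."""
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    # Sort each user's candidate items by estimated rating and keep the best n.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n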
class MangakiSSVD(RecommendationAlgorithm):
    def __init__(self, rank=10, nb_iterations=20, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model = SVD(n_factors=rank, n_epochs=nb_iterations)

    def fit(self, X, y):
        self.reader = Reader(rating_scale=(y.min(), y.max()))
        data = Dataset.load_from_df(pd.DataFrame(np.column_stack((X, y))), self.reader)
        train = data.build_full_trainset()
        self.chrono.save('prepare data')
        self.model.fit(train)
        self.chrono.save('fit')

    def predict(self, X):
        y = np.repeat(0, len(X))
        data = Dataset.load_from_df(pd.DataFrame(np.column_stack((X, y))), self.reader)
        train = data.build_full_trainset()
        test = train.build_testset()
        pred = self.model.test(test)
        return np.array([rating.est for rating in pred])

    def get_shortname(self):
        return 'ssvd'
def fit_and_predict():
    try:
        db = DbCursor()
    except Exception as ex:
        return []
    else:
        sql = 'select * from user_video'
        user_videos = db.get(sql)
        if len(user_videos) > 0:
            df = pd.DataFrame(user_videos)
            reader = Reader(rating_scale=(0, 100))
            data = Dataset.load_from_df(df[['user_id', 'video_id', 'percent']], reader)
            train_set = data.build_full_trainset()
            algo = SVD()
            algo.fit(train_set)
            test_set = train_set.build_anti_testset()
            predictions = algo.test(test_set)
            top_n = get_top_n(predictions, n=10)
            return top_n
        else:
            return []
def modelo_svd_best_n(data):
    reader = Reader(rating_scale=(1, 5))
    # 'lr_all': [0.01, 0.002, 0.005], 'reg_all': [0.01, 0.02, 0.04],
    data = Dataset.load_from_df(
        data[['userid', 'businessid', 'mean_by_business']], reader)

    param_grid = {
        'n_factors': [5, 20, 50, 100],
        'n_epochs': [100, 200, 300],
    }
    gs = Gridsearch_svd(SVD, param_grid, measures=['rmse'], cv=5, n_jobs=5)
    gs.fit(data)

    # combination of parameters that gave the best RMSE score
    k = gs.best_params['rmse']['n_factors']
    n_epochs = gs.best_params['rmse']['n_epochs']

    # predictions with the best parameters
    data_ = data.build_full_trainset()
    algo = SVD(n_factors=k, n_epochs=n_epochs)
    algo.fit(data_)
    prediciones = algo.test(data_.build_anti_testset())
    return prediciones
def run_svd(dataset):
    # Load the dataset (download it if needed)
    data = Dataset.load_builtin(dataset)

    # sample a random trainset and testset;
    # the test set is made of 33% of the ratings
    trainset, testset = train_test_split(data, test_size=.33)

    # We'll use the famous SVD algorithm.
    algo = SVD()

    # Train the algorithm on the trainset, and predict ratings for the testset
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Then compute RMSE
    accuracy.rmse(predictions)

    y_test = [item[2] for item in testset]
    preds = [pred[3] for pred in predictions]
    preds_round = np.rint(preds)
    rmse_round = np.sqrt(np.mean(np.square(preds_round - np.array(y_test))))
    print(f'rmse_round {rmse_round}')
    utils.hist_plot(y_test, preds, preds_round)
def get_recommendation(user):
    conn = pymysql.connect(Account.link, Account.user, Account.password,
                           Account.db, charset="utf8mb4")
    df = pd.read_sql_query('SELECT * FROM USERS', conn)
    if df.empty:
        return "Error - empty DF"
    conn.close()

    # Anime can be rated from 1 - 10
    data = Dataset.load_from_df(df, Reader(rating_scale=(1, 10)))
    algo = SVD()
    trainset = data.build_full_trainset()
    algo.fit(trainset)

    # predict ratings for all pairs (user, anime) that are NOT in the train set
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    # Get top 15 predictions
    top_n = get_top_n(predictions, n=15)
    if top_n.get(user) is None:
        return "Error - cannot find User"
    return [iid for (iid, _) in top_n.get(user)]
def svd(data, training, testing):
    '''
    Tune SVD parameters, then calculate the RMSE and top-n predictions of SVD.

    Args:
        data(Dataset): the whole dataset, used for cross-validated tuning
        training(Trainset): training dataset
        testing(list): test dataset

    Returns:
        rmse: RMSE of SVD with the optimized parameters
        top_n: top-n item predictions for each user
    '''
    # candidate parameters
    param_grid = {'n_factors': [25, 50, 100, 250], 'n_epochs': [10, 20, 30, 40, 50]}

    # optimize parameters
    grid_search = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=5)
    grid_search.fit(data)
    param = grid_search.best_params['rmse']
    print('SVD:', param)

    # fit a model using the optimized parameters
    svd = SVD(n_factors=param['n_factors'], n_epochs=param['n_epochs'])
    svd.fit(training)

    # evaluate the model using the test data
    predictions = svd.test(testing)
    top_n = get_top_n(predictions, n=5)
    rmse = accuracy.rmse(predictions, verbose=True)
    return rmse, top_n
def predict_VSD(userid):
    df = pd.read_csv('ratings_small.csv').drop(['timestamp'], axis=1)

    reader = Reader(rating_scale=(1, 5))
    # read the data from the DataFrame using the Reader format
    data = Dataset.load_from_df(df[['userId', 'movieId', 'rating']], reader=reader)

    # split into train and test sets: 75% of the samples for training, 25% for testing
    trainset, testset = train_test_split(data, test_size=.25)

    model = SVD(n_factors=100)
    model.fit(trainset)
    predictions = model.test(testset)
    top_n = get_top_n(predictions, n=30)

    movie_titles = pd.read_csv('movies_metadata.csv', usecols=['id', 'title'])
    movie_titles = movie_titles.rename(columns={'id': 'movieId'})
    movie_titles['movieId'] = pd.to_numeric(movie_titles['movieId'],
                                            errors='coerce').fillna(0)
    movie_titles['movieId'] = movie_titles['movieId'].astype('int')
    movie_titles = movie_titles.drop_duplicates()

    for uid, user_ratings in top_n.items():
        if uid == userid:
            title_list = [iid for (iid, _) in user_ratings]
            # print(uid, [iid for (iid, _) in user_ratings])
            titles = movie_titles[movie_titles.movieId.isin(title_list)]
            print(titles[2:])
            return titles[2:]
class RecipeRecommender:
    def __init__(self, n_factors=100, n_epochs=20, lr_all=0.005, reg_all=0.02):
        self.model = SVD(n_factors=n_factors, n_epochs=n_epochs, lr_all=lr_all,
                         reg_all=reg_all, random_state=2020)

    def fit(self, reviews):
        # Surprise supports only a pandas DataFrame or folds as data input
        data = Dataset.load_from_df(DataFrame(reviews), Reader(rating_scale=(1, 5)))
        self.trainset = data.build_full_trainset()
        self.testset = self.trainset.build_anti_testset()
        return self.model.fit(self.trainset)

    def predict(self, n=20):
        self.predictions = self.model.test(self.testset)
        recommended_dict = RecipeRecommender.get_top_n(self.predictions, n=n)
        return [id_tuple[0] for id_tuple in recommended_dict[1]]

    @staticmethod
    def get_top_n(predictions, n):
        top_n = defaultdict(list)
        for uid, iid, _, est, _ in predictions:
            top_n[uid].append((iid, est))
        for uid, user_ratings in top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]
        return top_n
def main():
    """ ... """
    # get data from surprise
    data = Dataset.load_builtin('ml-100k')
    trainset, testset = train_test_split(data, test_size=.25)

    algo = SVD()
    # Train the algorithm on the trainset, and predict ratings for the testset
    algo.fit(trainset)
    predictions = algo.test(testset)

    # calculate the delta between true and estimated ratings
    x = [elem[2] - elem[3] for elem in predictions]

    # number of bins in the histogram
    clmnNb = 69
    plt.hist(x, clmnNb, facecolor='b', alpha=0.75)
    plt.xlabel('Delta values')
    plt.ylabel('Number of same delta')
    plt.title('Delta of rating')
    plt.show()
def surpriseSVD(movieLensDataPath='data_clean.txt'):
    '''
    Basic use of the Surprise SVD algorithm.

    Params: movieLensDataPath is the path to the MovieLens data we're looking at.
    Note: replace with cleaned data.
    We want to return U and V where, for a matrix Y of movie ratings, Y ~= U^T V.
    '''
    # Load the data as a pandas DataFrame, as reading from text didn't quite work at first.
    df = pd.read_csv(movieLensDataPath, sep="\t", header=None)
    df.columns = ["User Id", "Movie Id", "Rating"]

    # We need the rating scale.
    reader = Reader(rating_scale=(1, 5))

    # The columns are User Id, Movie Id, and Rating.
    data = Dataset.load_from_df(df[["User Id", "Movie Id", "Rating"]], reader)

    # To fit the SVD algorithm, we have to convert the data to a trainset.
    algo = SVD()
    trainset = data.build_full_trainset()
    algo.fit(trainset)

    # U and V!
    algop = algo.pu
    algoq = algo.qi

    # Simple cross-validation
    kf = KFold(n_splits=3)
    algo = SVD()
    for trainset, testset in kf.split(data):
        # train and test the algorithm
        algo.fit(trainset)
        predictions = algo.test(testset)
        # compute and print Root Mean Squared Error
        accuracy.rmse(predictions, verbose=True)

    # Return U (pu) and V (qi)
    return algop, algoq
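# The factors returned above (algo.pu and algo.qi) relate to predictions through
# Surprise's (biased) SVD model: r_hat(u, i) = mu + b_u + b_i + q_i . p_u.
# A minimal sketch of recomputing an estimate by hand, assuming `algo` was fit on
# `trainset` and that the raw ids are known to that trainset:
import numpy as np


def manual_svd_estimate(algo, trainset, raw_uid, raw_iid):
    """Recompute the SVD estimate mu + b_u + b_i + q_i . p_u from the learned parameters."""
    u = trainset.to_inner_uid(raw_uid)
    i = trainset.to_inner_iid(raw_iid)
    return (trainset.global_mean
            + algo.bu[u]
            + algo.bi[i]
            + np.dot(algo.qi[i], algo.pu[u]))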
def getPrediction(UserId):
    ratings_dict = {
        "userID": [1, 1, 3, 4, 4, 6],
        "POIID": [1, 2, 1, 4, 2, 6],
        "rating": [5, 5, 1, 4, 5, 3],
    }
    # users = User.objects()
    # for user in users:
    #     print(user)
    frame = pd.DataFrame(ratings_dict)
    print(frame)

    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(frame[['userID', 'POIID', 'rating']], reader)
    cross_validate(NormalPredictor(), data, cv=2)

    trainset = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    top_n = Predictions.get_top_n(predictions, n=10)
    for uid, user_ratings in top_n.items():
        if uid == UserId:
            return [iid for (iid, _) in user_ratings]
class SVDModel:
    def __init__(self):
        self.model = SVD()
        self.name = 'Singular Value Decomposition'

    def best_estimator_gridsearchCV(self, data, n_epochs=[5, 10],
                                    lr_all=[0.002, 0.005], reg_all=[0.4, 0.5], cv=3):
        param_grid = {
            'n_epochs': n_epochs,
            'lr_all': lr_all,
            'reg_all': reg_all
        }
        # GridSearchCV expects the algorithm class, not an instance
        gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=cv)
        gs.fit(data)
        params = gs.best_params['rmse']
        return params

    def train(self, *args, **kwargs):
        self.model.fit(*args, **kwargs)

    def predict(self, *args, **kwargs):
        return self.model.predict(*args, **kwargs)

    def test(self, *args, **kwargs):
        return self.model.test(*args, **kwargs)
def collaborative():
    conn = sqlite3.connect("mf.sqlite3")
    movies = pd.read_sql_query(
        "select title, poster_path, runtime, genres, vote_average, vote_count from movies",
        conn)
    ratings = pd.read_sql_query("select * from ratings", conn)

    reader = Reader()
    data = Dataset.load_from_df(ratings[['userid', 'movieid', 'rating']], reader=reader)

    svd = SVD()
    cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    trainset = data.build_full_trainset()
    print(trainset)
    svd.fit(trainset)  # fit on the full trainset before predicting
    testset = trainset.build_anti_testset()
    predictions = svd.test(testset)
    top_n = get_top_n(predictions, n=10)

    # Collect the recommended items for each user
    recommendations = {}
    for uid, user_ratings in top_n.items():
        recommendations[uid] = [iid for (iid, _) in user_ratings]

    with open('catalog/output.py', 'w') as filehandle:
        filehandle.write('recommendations=')
        filehandle.write(json.dumps(recommendations))
    return recommendations
def recommend_place(user_id):
    try:
        find_user_rating = 'SELECT * FROM rating_place where user_id=%(user_id)s;'
        params = {"user_id": int(user_id)}
        user_rating = read_data_from_db(find_user_rating, params)

        sql = 'SELECT user_id, place_id, rating FROM rating_place'
        ds = read_data_from_db(sql, None)

        if len(ds) > 0 and len(user_rating) > 0:
            reader = Reader()
            data = Dataset.load_from_df(ds[['user_id', 'place_id', 'rating']], reader=reader)

            alg = SVD()
            alg.fit(data.build_full_trainset())

            iids = ds['place_id'].unique()
            rated_iids = ds.loc[ds['user_id'] == user_id, 'place_id']
            iids_to_pred = np.setdiff1d(iids, rated_iids)

            testset = [[user_id, iid, 4.] for iid in iids_to_pred]
            predictions = alg.test(testset)
            evaluate_surprise_alg(predictions)

            predictions.sort(key=lambda x: x.est, reverse=True)
            list_of_ids = []
            for i in range(50 if len(predictions) >= 50 else len(predictions)):
                list_of_ids.append(int(predictions[i].iid))

            similar_places = get_list_db_objects_from_ids(tuple(list_of_ids))
            return Response(similar_places.to_json(orient="records"),
                            status=200, mimetype='application/json')
        return "not found", 404
    except Exception as e:
        print(str(e))
        return "", 500
def testreview():
    df1 = pd.DataFrame(my_client['mimi']['review'].find())
    df2 = pd.DataFrame(my_client['mimi']['appReview'].find())
    df = pd.concat([df1, df2]).reset_index()

    store_df = pd.DataFrame(my_client['mimi']['store'].find())
    store_addr = {}
    store = store_df.values.tolist()
    for s in store:
        store_addr[s[0]] = s[1:]

    # Load the dataset (download it if needed)
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(df[["userName", "resId", "rating"]], reader)

    trainset = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)

    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    # testset = trainset.build_testset()
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    top_n = get_top_n(predictions, store_addr)

    # Store the recommended items for each user
    # recom_qs = pd.DataFrame.my_client['mimi']['recommand'].find("Uid" : mid)
    x = my_client['mimi']['recommand'].insert_many(top_n)
    print(len(x.inserted_ids))
def train_benchmark():
    # Load the movielens-100k dataset (download it if needed)
    data = Dataset.load_builtin('ml-100k')

    # sample a random trainset and testset;
    # the test set is made of 25% of the ratings
    trainset, testset = train_test_split(data, test_size=.25)

    # We'll use the famous SVD algorithm.
    algo = SVD()

    # Train the algorithm on the trainset, and predict ratings for the testset
    algo.fit(trainset)

    # print benchmark:
    predictions = algo.test(testset)
    print(accuracy.rmse(predictions))

    algo_filename = 'rec_algo.pkl'
    testset_filename = 'testset.pkl'
    with open(algo_filename, 'wb') as f:
        print("saving model to disk")
        pickle.dump(algo, f)
    with open(testset_filename, 'wb') as f:
        print("saving testset to disk")
        pickle.dump(testset, f)
def surprise_SVD(train_file, test_file):
    """
    SVD with the Surprise library.
    Compute the predictions on a test set after training on a train set
    using the SVD method from Surprise.

    Args:
        train_file (string): path to the created train file
        test_file (string): path to the created test file
    Hyperparameters:
        n_factors: the number of factors
        n_epochs: the number of iterations of the SGD procedure
        lr_all: the learning rate for all parameters
        reg_all: the regularization term for all parameters
    Returns:
        numpy array: predictions
    """
    print("SVD")
    fold = [(train_file, test_file)]
    reader = Reader(line_format='user item rating', sep=',')
    data = Dataset.load_from_folds(fold, reader=reader)
    pkf = PredefinedKFold()

    # Algorithm
    algo = SVD(n_epochs=30, lr_all=0.01, reg_all=0.1)
    for trainset, testset in pkf.split(data):
        # Train
        algo.fit(trainset)
        # Predict
        predictions = algo.test(testset)

    pred = np.zeros(len(predictions))
    for i in range(len(predictions)):
        pred[i] = predictions[i].est
    return pred
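# The hyperparameters named in the docstring above (n_factors, n_epochs, lr_all,
# reg_all) are usually tuned rather than hard-coded. A minimal sketch using
# Surprise's GridSearchCV, assuming `data` is a surprise Dataset built from the
# training ratings; the grid values are illustrative only:
from surprise import SVD
from surprise.model_selection import GridSearchCV

param_grid = {
    'n_factors': [50, 100, 200],
    'n_epochs': [20, 30],
    'lr_all': [0.005, 0.01],
    'reg_all': [0.05, 0.1],
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3)
gs.fit(data)
print(gs.best_score['rmse'], gs.best_params['rmse'])
best_algo = gs.best_estimator['rmse']  # an SVD configured with the best params, still to be fitted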
def get_start(user=85):
    ml = MovieLens()
    print("Loading movie ratings...")
    data = ml.loadMovieLensLatestSmall()
    testSubject = user
    user_preference(testSubject, ml)

    print("\nBuilding SVD recommendation model using the WHOLE dataset as trainSet (only for test)...")
    # Do not split the dataset into folds; just return a trainset built from the whole dataset.
    trainSet = data.build_full_trainset()

    algo = SVD()
    algo.fit(trainSet)

    print("Computing recommendations...")
    testSet = BuildAntiTestSetForUser(testSubject, trainSet)
    predictions = algo.test(testSet)

    recommendations = []
    print("\nWe recommend:")
    for userID, movieID, actualRating, estimatedRating, _ in predictions:
        intMovieID = int(movieID)
        recommendations.append((intMovieID, estimatedRating))

    recommendations.sort(key=lambda x: x[1], reverse=True)
    for ratings in recommendations[:10]:
        print(ml.getMovieName(ratings[0]))
def solve_matrix_factorisation(pathw):
    reader = Reader(line_format='user item rating timestamp', sep=',')
    data = Dataset.load_from_file(pathw, reader=reader)

    # param_grid = {'n_factors': [110, 120, 140, 160], 'n_epochs': [90, 100, 110],
    #               'lr_all': [0.001, 0.003, 0.005, 0.008], 'reg_all': [0.08, 0.1, 0.15]}
    # gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
    # gs.fit(data)
    # algo = gs.best_estimator['rmse']
    # print(gs.best_score['rmse'])
    # print(gs.best_params['rmse'])
    # cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
    # print("reached")

    # Use the tuned parameters with the train data
    algo = SVD(n_factors=160, n_epochs=100, lr_all=0.005, reg_all=0.1)
    trainset = data.build_full_trainset()
    algo.fit(trainset)
    print("fitting crossed")

    # Then predict ratings for all pairs (u, i) that are NOT in the training set.
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    top_n = get_top_n(predictions, n=10)

    # Return the recommended items for the target user
    for uid, user_ratings in top_n.items():
        if uid == '615':
            # print(uid, [iid for (iid, _) in user_ratings])
            return [iid for (iid, _) in user_ratings]
def collaborative(self, ratings, user_id):
    reader = Reader()
    # ratings.head()
    temp_ratings = ratings
    data = Dataset.load_from_df(
        temp_ratings[['user_id', 'book_id', 'rating']], reader)

    ## Cross-validating the model ##
    svd = SVD()
    cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=2)

    ## Training on the full dataset ##
    trainset = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)

    ## Predicting unseen (user, book) pairs ##
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    count = 0
    for uid, iid, true_r, est, _ in predictions:
        if uid == user_id:
            count = count + 1
            temp_ratings.loc[len(temp_ratings) + 1] = [uid, iid, est]

    cb = temp_ratings[(temp_ratings['user_id'] == user_id)][['book_id', 'rating']]
    return cb
def get_svd_recommender(df, test_size=0.25, path="", exists=False):
    """
    Builds and trains an SVD recommender.

    :param df: a dataframe containing user IDs, beer IDs and ratings
    :param test_size: the fraction of samples that should be reserved for testing
    :param path: the path to an existing SVD recommender that was saved to a file
    :param exists: whether or not to load the algo from a saved file
    :return: trained recommender, list of predictions, and the root mean square error of the recommender
    """
    if exists:
        # dump.load returns (predictions, algo); [1] is the algorithm
        return dump.load(path)[1]

    # allows surprise to read the df
    reader = Reader(rating_scale=(1, 5))
    # must load in this particular column order
    data = Dataset.load_from_df(df[['user_id', 'beer_id', 'user_score']], reader)
    trainset, testset = train_test_split(data, test_size=test_size)

    algo = SVD()
    # Train the algorithm on the trainset
    algo.fit(trainset)
    # and predict ratings for the testset. test() returns a list of Prediction objects
    # which have several attributes such as est (the prediction) and r_ui (the true rating)
    predictions = algo.test(testset)
    # an RMSE below 1 is considered low
    rmse = accuracy.rmse(predictions)
    mae = accuracy.mae(predictions)
    return algo, predictions, rmse
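# The `exists` branch above relies on surprise.dump. A minimal sketch of the
# save/load round trip it assumes (the file name here is hypothetical); dump.load
# returns a (predictions, algo) tuple, which is why the function indexes [1]:
from surprise import dump

dump.dump('svd_recommender.pkl', predictions=predictions, algo=algo)
loaded_predictions, loaded_algo = dump.load('svd_recommender.pkl')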
def retrain():
    file = os.path.join(cwd, 'src', 'rec_sys', 'rec_methods', 'data', 'custom_dataset.data')

    # 1. Load the dataset
    data = Dataset.load_from_file(file, reader=reader)
    logger.info("> dataset OK")

    # 2. Creating train dataset...
    trainset = data.build_full_trainset()
    logger.info("> train dataset OK")

    # 3. Training...
    algo = SVD()
    algo.fit(trainset)
    logger.info("> Training OK")

    # 4. Predict ratings for all pairs (u, i) that are NOT in the training set.
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    logger.info("> Predictions OK")

    top_n = get_top_n(predictions, n=5)
    logger.info("Top N retrieved > OK")
    return top_n
def recommend(given_user_id):
    # given_user_id = int(get_object_or_404(User, username=given_user_id).id)
    print(given_user_id, "recommend function printing given_user_id")

    queryset = Rate.objects.all()
    query, params = queryset.query.as_sql(
        compiler='django.db.backends.sqlite3.compiler.SQLCompiler',
        connection=connections['default'])
    df = pd.read_sql_query(query, con=connections['default'], params=params)
    print("load df")

    reader = Reader(rating_scale=(1, 5))
    data = Dataset.load_from_df(df[['user_id', 'item_id', 'rate']], reader)
    trainset = data.build_full_trainset()
    testset = trainset.build_anti_testset()

    algo = SVD()
    algo.fit(trainset)
    print("fit done")

    predictions = algo.test(testset)
    print("prediction done")

    top_10_items = get_top_n(predictions, 10, given_user_id)
    print("top 10 selected, length: %s" % len(list(top_10_items.keys())))
    print(top_10_items[given_user_id])

    for item_prediction in top_10_items[given_user_id]:
        if Prediction.objects.filter(item_id=item_prediction[0], user_id=given_user_id):
            pass
        else:
            obj = Prediction(user_id=given_user_id,
                             item_id=item_prediction[0],
                             prediction=round(item_prediction[1], 1))
            obj.save()
    print("predictions saved for user %s" % given_user_id)

    # return [item_prediction[0] for item_prediction in top_10_items[given_user_id]]
    return top_10_items[given_user_id]
def predict_ratings(data):
    """
    Instead of running cross-validation, the algorithm can simply be fit on the
    whole dataset. This is done with build_full_trainset(), which creates a
    trainset object from all of the data; ratings can then be predicted by
    calling predict() directly.
    :return: predictions
    """
    trainset = data.build_full_trainset()
    svd = SVD()
    svd.fit(trainset)
    testset = trainset.build_anti_testset()
    predictions = svd.test(testset)

    algo = KNNBasic()
    algo.fit(trainset)
    # Rating prediction: say we are interested in user 196 and item 302
    # (make sure they are in the trainset!) and we know the true rating r_ui = 4
    uid = str(196)
    iid = str(302)
    # algo.predict(uid, iid, r_ui=4, verbose=True)
    return predictions
def SVDTopNRecs(self, ml, userId, n):
    # Using the SVD recommender
    SVDAlgorithm = SVD(n_factors=100, random_state=10)

    # Building the recommendation model...
    trainSet = self.dataset.GetFullTrainSet()
    SVDAlgorithm.fit(trainSet)

    # Computing recommendations...
    testSet = self.dataset.GetAntiTestSetForUser(userId)
    predictions = SVDAlgorithm.test(testSet)

    recommendations = []
    # Filtering movie id and estimated rating from predictions into recommendations
    for userID, movieID, actualRating, estimatedRating, _ in predictions:
        intMovieID = int(movieID)
        recommendations.append((intMovieID, estimatedRating))

    # Sorting the recommendations by rating in descending order to return the top n recs
    recommendations.sort(key=lambda x: x[1], reverse=True)
    recommendations = recommendations[:n]
    return recommendations
def train(self):
    # ratings range from 1 to 10
    self.df = pd.read_csv(csv_name)
    reader = Reader(rating_scale=(1, 10))
    data = Dataset.load_from_df(self.df[['user_id', 'item_id', 'rating']], reader)

    # TrainSet
    trainset = data.build_full_trainset()
    # algo = self.checkBestAlgorithm()[0]
    algo = SVD()
    algo.fit(trainset)

    # TestSet
    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)
    self.predictions = predictions
    self.algo = algo

    # Validate the algorithm
    cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

    # Save dump
    dump.dump(file_name, predictions=predictions, algo=algo)
def svd_model(df):
    """
    Creates an SVD model for predictions and cross-validation.

    Returns:
        data, test-set RMSE, and cross-validated RMSE scores
    """
    from surprise.model_selection.split import train_test_split

    data = df[['user_id', 'business_id', 'average_stars']].loc[df.city == 'Scottsdale']
    reader = Reader()
    data = Dataset.load_from_df(data, reader)

    trainset, testset = train_test_split(data, test_size=0.25)
    algo = SVD()
    algo.fit(trainset)
    predictions = algo.test(testset)
    acc = accuracy.rmse(predictions)

    svd_cv = cross_validate(SVD(), data, cv=5)
    return data, acc, svd_cv['test_rmse']
from surprise import Dataset
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import KFold

data = Dataset.load_builtin('ml-100k')

algo = SVD()

trainset = data.build_full_trainset()
algo.fit(trainset)

testset = trainset.build_testset()
predictions = algo.test(testset)
# RMSE should be low as we are biased
accuracy.rmse(predictions, verbose=True)  # ~ 0.68 (which is low)

# We can also do this during a cross-validation procedure!
print('CV procedure:')

kf = KFold(n_splits=3)
for i, (trainset_cv, testset_cv) in enumerate(kf.split(data)):
    print('fold number', i + 1)
    algo.fit(trainset_cv)

    print('On testset,', end=' ')
    predictions = algo.test(testset_cv)
    accuracy.rmse(predictions, verbose=True)
import os

from surprise import Dataset
from surprise import Reader
from surprise import SVD
from surprise import accuracy
from surprise.model_selection import PredefinedKFold

# path to dataset folder
files_dir = os.path.expanduser('~/.surprise_data/ml-100k/ml-100k/')

# This time, we'll use the built-in reader (it already defines the 1-5 rating scale).
reader = Reader('ml-100k')

# folds_files is a list of tuples containing file paths:
# [(u1.base, u1.test), (u2.base, u2.test), ... (u5.base, u5.test)]
train_file = files_dir + 'u%d.base'
test_file = files_dir + 'u%d.test'
folds_files = [(train_file % i, test_file % i) for i in (1, 2, 3, 4, 5)]

data = Dataset.load_from_folds(folds_files, reader=reader)
pkf = PredefinedKFold()

algo = SVD()

for trainset, testset in pkf.split(data):
    # train and test the algorithm
    algo.fit(trainset)
    predictions = algo.test(testset)

    # Compute and print Root Mean Squared Error
    accuracy.rmse(predictions, verbose=True)
def hybrid(userId, train_rd):
    # get_ipython().magic('matplotlib inline')
    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt
    import seaborn as sns
    import csv
    from collections import defaultdict
    from scipy import stats
    from ast import literal_eval
    from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
    from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
    from nltk.stem.snowball import SnowballStemmer
    from nltk.stem.wordnet import WordNetLemmatizer
    from nltk.corpus import wordnet
    from surprise import Reader, Dataset, SVD
    from surprise.model_selection import cross_validate
    import warnings; warnings.simplefilter('ignore')

    ### Popularity ###
    md = pd.read_csv('CustomData/FinalData.csv')
    fd = pd.read_csv('avg_ratings1.csv')

    fd[fd['rating'].notnull()]['rating'] = fd[fd['rating'].notnull()]['rating'].astype('float')
    vote_averages = fd[fd['rating'].notnull()]['rating']
    C = vote_averages.mean()

    fd1 = pd.read_csv('ratings_count.csv')
    fd1[fd1['rating'].notnull()]['rating'] = fd1[fd1['rating'].notnull()]['rating'].astype('float')
    vote_counts = fd1[fd1['rating'].notnull()]['rating']

    m = vote_counts.quantile(0.75)

    md['ratings_count'] = fd1['rating']
    md['average_rating'] = fd['rating']

    # print(md.shape)
    qualified = md[md['ratings_count'].notnull()][
        ['book_id', 'title', 'authors', 'ratings_count', 'average_rating']]
    qualified['ratings_count'] = qualified['ratings_count'].astype('float')
    qualified['average_rating'] = qualified['average_rating'].astype('float')
    # qualified.shape

    def weighted_rating(x):
        v = x['ratings_count']
        R = x['average_rating']
        return (v / (v + m) * R) + (m / (m + v) * C)

    qualified['popularity_rating'] = qualified.apply(weighted_rating, axis=1)
    # qualified = qualified.sort_values('popularity_rating', ascending=False).head(250)
    pop = qualified[['book_id', 'popularity_rating']]
    # print(qualified.shape)
    # print(pop.shape)

    ### Collaborative ###
    reader = Reader()
    ratings = train_rd
    # ratings = pd.read_csv('ratings.csv')
    # ratings.head()
    temp_ratings = ratings[0:1000]
    # print(temp_ratings)
    data = Dataset.load_from_df(temp_ratings[['user_id', 'book_id', 'rating']], reader)

    svd = SVD()
    cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=2)

    trainset = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)

    # print(len(temp_ratings[temp_ratings['user_id'] == userId]))

    def get_top_n(predictions, n=10):
        '''Return the top-N recommendation for each user from a set of predictions.

        Args:
            predictions(list of Prediction objects): The list of predictions, as
                returned by the test method of an algorithm.
            n(int): The number of recommendations to output for each user. Default is 10.

        Returns:
            A dict where keys are user (raw) ids and values are lists of tuples:
            [(raw item id, rating estimation), ...] of size n.
        '''
        # First map the predictions to each user.
        top_n = defaultdict(list)
        for uid, iid, true_r, est, _ in predictions:
            top_n[uid].append((iid, est))

        # Then sort the predictions for each user and retrieve the k highest ones.
        for uid, user_ratings in top_n.items():
            # user_ratings.sort(key=lambda x: x[1], reverse=True)
            top_n[uid] = user_ratings[:n]
        return top_n

    testset = trainset.build_anti_testset()
    predictions = algo.test(testset)

    '''
    top_n = get_top_n(predictions, n=10000)
    # print(top_n)
    # result = pd.DataFrame(top_n)
    # print(result)
    for uid, user_ratings in top_n.items():
        # print(uid, [iid for (iid, _) in user_ratings])
        for uid, iid, true_r, est, _ in predictions:
            temp_ratings.loc[uid] = [uid, iid, est]
            # temp_ratings[i]['cf'] = temp_ratings[(temp_ratings['user_id'] == uid)][['book_id']]
    '''

    count = 0
    for uid, iid, true_r, est, _ in predictions:
        if uid == userId:
            count = count + 1
            temp_ratings.loc[len(temp_ratings) + 1] = [uid, iid, est]
            # print('here')
            # print(uid)
            # temp_ratings.append([uid, iid, est], ignore_index=True)
    # print(count)
    # print(temp_ratings)
    # print(len(temp_ratings[temp_ratings['user_id'] == 2]))

    ### Content ###
    md = pd.read_csv('CustomData/FinalData.csv')
    rd = train_rd
    # rd = pd.read_csv('ratings.csv')

    md['book_id'] = md['book_id'].astype('int')
    rd['book_id'] = rd['book_id'].astype('int')
    rd['user_id'] = rd['user_id'].astype('int')
    rd['rating'] = rd['rating'].astype('int')
    # print(md.head())

    md['authors'] = md['authors'].str.replace(' ', '')
    md['authors'] = md['authors'].str.lower()
    md['authors'] = md['authors'].str.replace(',', ' ')
    # print(md.head())
    md['authors'] = md['authors'].apply(lambda x: [x, x])
    # print(md['authors'])

    md['Genres'] = md['Genres'].str.split(';')
    # print(md['Genres'])

    md['soup'] = md['authors'] + md['Genres']
    # print(md['soup'])
    md['soup'] = md['soup'].str.join(' ')
    # md['soup'].fillna({})
    # print(md['soup'])

    count_vec = CountVectorizer(analyzer='word', ngram_range=(1, 1), min_df=0,
                                stop_words='english')
    count_matrix = count_vec.fit_transform(md['soup'])
    # print(count_matrix.shape)
    # print(np.array(count_vec.get_feature_names()))
    cosine_sim = cosine_similarity(count_matrix, count_matrix)

    def build_user_profiles():
        user_profiles = np.zeros((53421, 999))
        # print(rd.iloc[0]['user_id'])
        # len(rd['book_id'])
        for i in range(0, 1000):
            u = rd.iloc[i]['user_id']
            b = rd.iloc[i]['book_id']
            # print(u, b)
            # if b < 999:
            #     print("match at " + str(b))
            user_profiles[u][b - 1] = rd.iloc[i]['rating']
        # print(user_profiles)
        return user_profiles

    user_profiles = build_user_profiles()

    def _get_similar_items_to_user_profile(person_id):
        # Computes the cosine similarity between the user profile and all item profiles
        # print(user_profiles[person_id])
        # print(cosine_sim[0])
        user_ratings = np.empty((999, 1))
        cnt = 0
        for i in range(0, 998):
            book_sim = cosine_sim[i]
            user_sim = user_profiles[person_id]
            user_ratings[i] = (book_sim.dot(user_sim)) / sum(cosine_sim[i])
        maxval = max(user_ratings)
        # print(maxval)
        for i in range(0, 998):
            user_ratings[i] = (user_ratings[i] * 5.0) / maxval
            # print(user_ratings[i])
            if user_ratings[i] > 3:
                cnt += 1
        # print(max(user_ratings))
        # print(cnt)
        # return similar_items
        return user_ratings

    content_ratings = _get_similar_items_to_user_profile(userId)

    num = md[['book_id']]
    # print(num)
    num1 = pd.DataFrame(data=content_ratings[0:, 0:])
    frames = [num, num1]
    # join_axes was removed from pd.concat; reindex to num.index instead
    mer = pd.concat(frames, axis=1).reindex(num.index)
    mer.columns = ['book_id', 'content_rating']
    # print(mer.shape)
    # print(mer)

    ## combine the three scores for the target user ##
    # print(temp_ratings.shape)
    cb = temp_ratings[(temp_ratings['user_id'] == userId)][['book_id', 'rating']]
    # print(cb.shape)
    # print(pop.shape)

    hyb = md[['book_id']]
    hyb = hyb.merge(cb, on='book_id')
    hyb = hyb.merge(pop, on='book_id')
    hyb = hyb.merge(mer, on='book_id')
    # hyb.shape

    def weighted_hybrid_rating(x):
        v = x['rating']
        R = x['popularity_rating']
        c = x['content_rating']
        return 0.4 * v + 0.2 * R + 0.4 * c

    print(hyb)
    hyb['final'] = hyb.apply(weighted_hybrid_rating, axis=1)
    hyb = hyb.sort_values('final', ascending=False).head(999)
    # print(hyb['final'])
    print(hyb)
    return hyb
"""
This module illustrates how an algorithm can be trained, dumped to a file,
then reloaded and can be used again for making predictions.
"""

from __future__ import (absolute_import, division, print_function,
                        unicode_literals)
import os

from surprise import SVD
from surprise import Dataset
from surprise import dump

data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()

algo = SVD()
algo.fit(trainset)

# Compute predictions of the 'original' algorithm.
predictions = algo.test(trainset.build_testset())

# Dump algorithm and reload it.
file_name = os.path.expanduser('~/dump_file')
dump.dump(file_name, algo=algo)
_, loaded_algo = dump.load(file_name)

# We now ensure that the algo is still the same by checking the predictions.
predictions_loaded_algo = loaded_algo.test(trainset.build_testset())
assert predictions == predictions_loaded_algo
print('Predictions are the same')
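# A reloaded model can also score a single (user, item) pair directly; for the
# built-in ml-100k dataset the raw ids are strings. This usage sketch assumes the
# `loaded_algo` from the example above:
pred = loaded_algo.predict(uid='196', iid='302')
print(pred.est)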