def test_predict(rating_true):
    """Check the column names, dtypes and values returned by ``predict``.

    An SVD model is fitted on ``rating_true`` and then scored on that same
    frame, once with the default column names and once with renamed
    user/item/prediction columns.
    """
    reader = surprise.Reader()
    svd = surprise.SVD()
    full_trainset = surprise.Dataset.load_from_df(
        rating_true, reader=reader
    ).build_full_trainset()
    svd.fit(full_trainset)

    # --- default column names -------------------------------------------
    preds = predict(svd, rating_true)
    assert set(preds.columns) == {"userID", "itemID", "prediction"}
    for col in ("userID", "itemID"):
        assert preds[col].dtypes == rating_true[col].dtypes

    first = rating_true.iloc[0]
    user, item = first["userID"], first["itemID"]
    row_mask = (preds["userID"] == user) & (preds["itemID"] == item)
    # The frame-level prediction must agree with the model's own estimate.
    assert preds[row_mask]["prediction"].values == pytest.approx(
        svd.predict(user, item).est, rel=TOL
    )

    # --- custom column names --------------------------------------------
    renamed = rating_true.rename(columns={"userID": "uid", "itemID": "iid"})
    preds = predict(svd, renamed, usercol="uid", itemcol="iid", predcol="pred")
    assert set(preds.columns) == {"uid", "iid", "pred"}
    assert preds["uid"].dtypes == rating_true["userID"].dtypes
    assert preds["iid"].dtypes == rating_true["itemID"].dtypes

    second = rating_true.iloc[1]
    user, item = second["userID"], second["itemID"]
    row_mask = (preds["uid"] == user) & (preds["iid"] == item)
    assert preds[row_mask]["pred"].values == pytest.approx(
        svd.predict(user, item).est, rel=TOL
    )
def __init__(self, hyper_params, user_count, item_count):
    """Create the Surprise model selected by ``hyper_params['model_type']``.

    Recognised model types are 'kNN', 'NMF', 'SVD', 'SVD++' and 'baseline'.
    Any other value leaves ``self.model`` unset.

    Args:
        hyper_params: mapping that must contain 'latent_size' and
            'model_type'; stored as ``self.hyper_params``.
        user_count: stored as ``self.user_count``.
        item_count: stored as ``self.item_count``.
    """
    n_factors = hyper_params['latent_size']
    kind = hyper_params['model_type']

    # Constructors are wrapped in lambdas so only the selected one runs.
    builders = {
        'kNN': lambda: surprise.prediction_algorithms.knns.KNNBasic(
            k=10, verbose=True),
        'NMF': lambda: surprise.NMF(
            n_factors=n_factors, biased=False, n_epochs=50, verbose=True),
        'SVD': lambda: surprise.SVD(n_factors=n_factors, verbose=True),
        'SVD++': lambda: surprise.SVDpp(n_factors=n_factors, verbose=True),
        'baseline': lambda: (
            surprise.prediction_algorithms.baseline_only.BaselineOnly(
                bsl_options={'method': 'sgd', 'n_epochs': 20},
                verbose=True)),
    }
    build = builders.get(kind)
    if build is not None:
        self.model = build()

    self.hyper_params = hyper_params
    self.user_count = user_count
    self.item_count = item_count
def get_matrix_factorization(ratings, meta_data, n_user, n_movies):
    """Factorise the rating table with an unbiased 50-factor SVD.

    The model is fitted on every row of ``ratings`` and then evaluated on
    those same rows; the resulting MSE/RMSE and a few summary statistics of
    the learned factors are printed.

    Args:
        ratings: DataFrame with "userId", "movieId" and "rating" columns.
        meta_data: not used by this function.
        n_user: not used by this function.
        n_movies: number of inner item ids (0 .. n_movies - 1) to map back
            to raw movie ids.

    Returns:
        Tuple ``(ranking_matrix, user_factors, movie_idx_to_id)`` where
        ``ranking_matrix`` is the product of the user factors and the
        transposed item factors.
    """
    # Matrix factorisation on the full data set
    rating_columns = ratings[["userId", "movieId", "rating"]]
    model = surprise.SVD(n_factors=50, biased=False)
    scale = surprise.Reader(rating_scale=(0.5, 5))
    trainset = surprise.Dataset.load_from_df(
        rating_columns, scale).build_full_trainset()
    model.fit(trainset)

    # Error on the ratings the model was trained on.
    fitted_predictions = model.test(trainset.build_testset())
    print("MSE: ", surprise.accuracy.mse(fitted_predictions))
    print("RMSE: ", surprise.accuracy.rmse(fitted_predictions))

    ranking_matrix = np.dot(model.pu, model.qi.T)
    movie_idx_to_id = list(map(trainset.to_raw_iid, range(n_movies)))
    user_factors = model.pu

    print("Means: ", np.mean(user_factors), np.mean(model.qi.T))
    print("Feature STD:", np.std(user_factors), np.std(model.qi))
    full_shape = np.shape(ranking_matrix)
    print("Full Matrix Shape", full_shape,
          "rankinG_shape", np.shape(ranking_matrix))

    return ranking_matrix, user_factors, movie_idx_to_id
def svd_surprise(k=20, epochs=20, learning_rate=0.005, bias=True,
                 test_fraction=0.0):
    """Fit a Surprise SVD on 'data/data.txt' and print its test RMSE.

    When ``test_fraction`` is 0.0 the model is trained on the complete data
    set and scored on a random 25% sample of it; otherwise the data is split
    into disjoint train/test parts with ``test_fraction`` held out.

    Args:
        k: number of latent factors.
        epochs: number of SGD epochs.
        learning_rate: learning rate applied to all parameters.
        bias: whether bias terms are learned.
        test_fraction: share of the data held out for testing.

    Returns:
        ``(pu, qi, bu, bi)``: user factors, item factors, user biases and
        item biases of the fitted model.
    """
    # Tab-separated ratings on a 1-5 scale.
    rating_reader = surprise.Reader(rating_scale=(1, 5), sep='\t')
    data = surprise.Dataset.load_from_file('data/data.txt', rating_reader)

    use_all_for_training = test_fraction == 0.0
    held_out = 0.25 if use_all_for_training else test_fraction
    train_set, test_set = surprise.model_selection.train_test_split(
        data, test_size=held_out)
    if use_all_for_training:
        # Discard the split's training part and train on everything.
        train_set = data.build_full_trainset()

    model = surprise.SVD(n_factors=k, n_epochs=epochs,
                         lr_all=learning_rate, biased=bias)
    model.fit(train_set)

    rmse = surprise.accuracy.rmse(model.test(test_set), verbose=False)
    print(f"SVD Test RMSE: {rmse!s}")

    return model.pu, model.qi, model.bu, model.bi
def getUserBaseData(user, addr, raw, count):
    """Recommend up to ``count`` places that ``user`` has no rating for.

    An SVD model is fitted on all of ``raw``; every 'around' value the user
    has no row for is scored, and the ids with the highest estimated rating
    are printed and returned, best first.

    Args:
        user: value looked up in ``raw['user']``.
        addr: not used by this function.
        raw: ratings DataFrame handed to Surprise as is; it must expose
            'user' and 'around' columns.
        count: maximum number of ids to return.

    Returns:
        List of 'around' ids ordered by descending estimated rating.
    """
    # Loading from a DataFrame only needs the rating scale.
    scale = surprise.Reader(rating_scale=(1, 4))
    trainset = surprise.Dataset.load_from_df(raw, scale).build_full_trainset()
    model = surprise.SVD()
    model.fit(trainset)

    all_places = raw['around'].unique()
    places_of_user = raw.loc[raw['user'] == user, 'around']
    # Places the user has not been to (set difference).
    candidates = np.setdiff1d(all_places, places_of_user)

    # Test set made of the places the user has not visited; the 4.0 is only
    # a placeholder for the "true" rating slot.
    scored = model.test([[user, place, 4.] for place in candidates])
    estimates = np.array([p.est for p in scored])

    # Indices of the estimates from best to worst, cut to ``count`` when
    # there are at least that many candidates.
    best_first = estimates.argsort()[::-1]
    if not len(estimates) < count:
        best_first = best_first[:count]

    results_ids = list(candidates[best_first])
    print(results_ids)
    return results_ids
def train(args):
    """Cross-validate and train an SVD model, then dump it to disk.

    Every file found in ``args.train`` is read as CSV and concatenated into
    one ratings table. The model is cross-validated with
    ``args.cross_validation_folds`` folds (RMSE and MAE are printed), refitted
    on the full data and written to ``args.model_dir`` under ALGO_FILE_NAME.

    Args:
        args: parsed command line / SageMaker arguments providing ``train``,
            ``model_dir`` and ``cross_validation_folds``.

    Raises:
        ValueError: if ``args.train`` contains no files.
    """
    channel_dir = args.train
    input_files = [os.path.join(channel_dir, name)
                   for name in os.listdir(channel_dir)]
    if not input_files:
        message = (
            "There are no files in {}.\n"
            "This usually indicates that the channel ({}) was incorrectly specified,\n"
            "the data specification in S3 was incorrectly specified or the role specified\n"
            "does not have permission to access the data."
        )
        raise ValueError(message.format(channel_dir, "train"))

    frames = [pd.read_csv(path, engine="python") for path in input_files]
    train_df = pd.concat(frames)

    reader = surprise.Reader(line_format=u"user item rating",
                             rating_scale=(1, 5))
    train_data = surprise.Dataset.load_from_df(train_df, reader)

    algo = surprise.SVD()

    # Note: quality metrics like these can be exposed to SageMaker, see
    # https://sagemaker.readthedocs.io/en/stable/overview.html#training-metrics
    surprise.model_selection.cross_validate(
        algo,
        train_data,
        measures=("RMSE", "MAE"),
        verbose=True,
        cv=args.cross_validation_folds,
    )

    # The main job of the script: train on everything and save the model.
    algo.fit(train_data.build_full_trainset())
    model_path = os.path.join(args.model_dir, ALGO_FILE_NAME)
    surprise.dump.dump(model_path, algo=algo)
def train_model(M, N, K, eta, reg, Y, eps=0.0001, max_epochs=300):
    """Fit a default Surprise SVD on the rating triples in ``Y``.

    ``Y`` holds rows ``(i, j, Y_ij)`` where ``Y_ij`` is user ``i``'s rating
    of movie ``j``. The rows are sorted by the movie column and loaded with
    a 1-5 rating scale before fitting.

    ``M``, ``N``, ``K``, ``eta``, ``reg``, ``eps`` and ``max_epochs`` are
    accepted but not used here: the SVD runs with the library's default
    hyper-parameters.

    Returns:
        Tuple ``(model, err, trainset)``: the fitted SVD, the value of
        ``get_err(model, Y)``, and the Surprise trainset the model was
        fitted on.
    """
    triples = pd.DataFrame(Y).sort_values(1)

    scale = surprise.Reader(rating_scale=(1, 5))
    trainset = surprise.Dataset.load_from_df(
        triples[[0, 1, 2]], scale).build_full_trainset()

    model = surprise.SVD()
    model.fit(trainset)

    return (model, get_err(model, Y), trainset)
def test_compute_rating_predictions(python_data):
    """Check names, dtypes and values from ``compute_rating_predictions``.

    An SVD model is fitted on the non-binary ratings produced by the
    ``python_data`` fixture and scored on that same frame, with default and
    with custom column names.
    """
    rating_true, _, _ = python_data(binary_rating=False)

    reader = surprise.Reader()
    svd = surprise.SVD()
    full_trainset = surprise.Dataset.load_from_df(
        rating_true, reader=reader).build_full_trainset()
    svd.fit(full_trainset)

    # --- default column names -------------------------------------------
    preds = compute_rating_predictions(svd, rating_true)
    assert set(preds.columns) == {'userID', 'itemID', 'prediction'}
    for col in ('userID', 'itemID'):
        assert preds[col].dtypes == rating_true[col].dtypes

    first = rating_true.iloc[0]
    user, item = first['userID'], first['itemID']
    row_mask = (preds['userID'] == user) & (preds['itemID'] == item)
    # The frame-level prediction must agree with the model's own estimate.
    assert preds[row_mask]['prediction'].values == pytest.approx(
        svd.predict(user, item).est, rel=TOL)

    # --- custom column names --------------------------------------------
    renamed = rating_true.rename(columns={'userID': 'uid', 'itemID': 'iid'})
    preds = compute_rating_predictions(
        svd, renamed, usercol='uid', itemcol='iid', predcol='pred')
    assert set(preds.columns) == {'uid', 'iid', 'pred'}
    assert preds['uid'].dtypes == rating_true['userID'].dtypes
    assert preds['iid'].dtypes == rating_true['itemID'].dtypes

    second = rating_true.iloc[1]
    user, item = second['userID'], second['itemID']
    row_mask = (preds['uid'] == user) & (preds['iid'] == item)
    assert preds[row_mask]['pred'].values == pytest.approx(
        svd.predict(user, item).est, rel=TOL)
def train(
    dataset: surprise.dataset.Dataset
) -> surprise.prediction_algorithms.AlgoBase:
    """Cross-validate an SVD on ``dataset``, then fit it on all ratings.

    RMSE and MAE (train and test) are reported for a 5-way cross-validation
    run on all available cores. The model is then fitted on the full
    trainset and its RMSE on those same ratings is printed.

    Returns:
        The SVD instance fitted on the full trainset.
    """
    algo = surprise.SVD()

    # ``cv`` could also be a splitter object, e.g.
    # surprise.model_selection.ShuffleSplit(n_splits=10, test_size=0.2).
    # 'fcp' is another measure that could be added to ``measures``.
    surprise.model_selection.cross_validate(
        algo,
        dataset,
        cv=5,
        n_jobs=-1,
        measures=['rmse', 'mae'],
        return_train_measures=True,
        verbose=True,
    )

    full_trainset = dataset.build_full_trainset()
    algo.fit(full_trainset)

    # TODO: verify -- the "test" set below is built from the training data.
    print('running test')
    predictions = algo.test(full_trainset.build_testset())
    print('test done')

    surprise.accuracy.rmse(predictions)
    return algo
def surprise_SVD(trainset, finalset):
    """Fit an SVD model and return its estimates for ``finalset``.

    The model uses 40 factors, 20 epochs and a learning rate of 0.001 for
    all parameters. The predictions are converted with
    ``spr_estimate_to_vect`` before being returned.
    """
    model = spr.SVD(n_factors=40, n_epochs=20, lr_all=0.001)
    model.fit(trainset)
    return spr_estimate_to_vect(model.test(finalset))
def train(self, dataset: RecommendationDataset) -> None:
    """Fit ``self.svd`` on the (user, item, score) triples of ``dataset``.

    Args:
        dataset: provides the ratings frame ``data`` and the names of its
            user, item and score columns.

    The fitted model is stored on ``self.svd``; nothing is returned.
    """
    # Surprise reads the three columns positionally as (user, item, rating).
    # The previous code selected ``user_col`` twice, which trained the model
    # on user-by-user pairs instead of user-by-item pairs.
    # NOTE(review): ``item_col`` is assumed to exist next to ``user_col`` and
    # ``score_col`` on RecommendationDataset -- confirm the attribute name.
    ratings = dataset.data[[
        dataset.user_col, dataset.item_col, dataset.score_col
    ]]
    train_set = surprise.Dataset.load_from_df(
        ratings, reader=surprise.Reader()).build_full_trainset()

    # Fixed seed so repeated training runs give the same factors.
    self.svd = surprise.SVD(random_state=0,
                            n_factors=200,
                            n_epochs=self.epochs,
                            verbose=True)
    self.svd.fit(train_set)
def SVD(train, test, rate):
    """Run the Surprise SVD model with 40 factors.

    The number of epochs is not set here, so the library default applies.

    @param train: the training set in the Surprise format.
    @param test: the test set in the Surprise format.
    @param rate: the learning rate used for all parameters.
    @return: the output of ``get_predictions`` applied to the Surprise
        predictions for ``test``.
    """
    model = spr.SVD(n_factors=40, lr_all=rate)
    model.fit(train)
    return get_predictions(model.test(test))
def main(args):
    """Cross-validate one Surprise algorithm and pickle the scores.

    The ratings are read from 'all_ratings_with_indices.parquet', truncated
    to ``args.num_rows`` rows, and cross-validated with ``args.cv_folds``
    folds on all cores. The score dictionary is pickled to a file whose name
    is derived from the experiment settings.

    Args:
        args: namespace providing ``exp_name``, ``algorithm``, ``sim_name``,
            ``item_based``, ``num_rows`` and ``cv_folds``.

    Exits with status 1 when ``args.algorithm`` is not a known choice or the
    output file already exists.
    """
    # Factories take the similarity options so the table can be built (and
    # the choice validated) before any data is loaded.
    algo_factories = {
        'knn': lambda opts: surprise.KNNBasic(sim_options=opts),
        'baseline': lambda opts: surprise.BaselineOnly(),
        'normal': lambda opts: surprise.NormalPredictor(),
        'knn_zscore': lambda opts: surprise.KNNWithZScore(sim_options=opts),
        'svd': lambda opts: surprise.SVD(),
        'nmf': lambda opts: surprise.NMF(),
    }
    # Fail fast: previously an unknown algorithm only printed this message
    # and then crashed later because ``algo`` was never assigned.
    if args.algorithm not in algo_factories:
        print(f'Algorithm {args.algorithm} is not a valid choice.')
        sys.exit(1)

    user_item_based = 'item_based' if args.item_based else 'user_based'
    filename = '_'.join([
        args.exp_name, args.algorithm, args.sim_name, user_item_based,
        str(args.num_rows)
    ]) + '.pkl'

    # Never overwrite the results of an earlier run.
    output_file = Path(filename)
    if output_file.exists():
        print(f'ERROR! Output file {output_file} already exists. Exiting!')
        sys.exit(1)
    print(f'Saving scores in {output_file}\n')

    reader = surprise.Reader(rating_scale=(1, 5))
    df = pq.read_table('all_ratings_with_indices.parquet',
                       columns=['user_idx', 'movie_idx',
                                'rating']).to_pandas()
    # Narrow the integer columns to reduce the memory footprint.
    df.user_idx = df.user_idx.astype(np.uint32)
    df.movie_idx = df.movie_idx.astype(np.uint16)
    df.rating = df.rating.astype(np.uint8)
    print(df.dtypes)

    data = surprise.Dataset.load_from_df(df[:args.num_rows], reader=reader)
    del df  # the Surprise dataset holds its own copy of the ratings

    sim_options = {
        'name': args.sim_name,
        'user_based': not args.item_based,
    }
    algo = algo_factories[args.algorithm](sim_options)

    scores = surprise.model_selection.cross_validate(algo,
                                                     data,
                                                     cv=args.cv_folds,
                                                     verbose=True,
                                                     n_jobs=-1)
    # Context manager so the file is flushed and closed deterministically.
    with open(output_file, 'wb') as out:
        pickle.dump(scores, out)
def factorisation(self, n_user, n_item):
    """Return the dense matrix of SVD-estimated ratings.

    A 3-fold cross-validation (MSE) is printed for information, then the
    model is fitted on all of ``self.data`` and asked for an estimate for
    every pair ``(u, i)`` with ``u`` in ``range(n_user)`` and ``i`` in
    ``range(n_item)``.

    Args:
        n_user: number of users (rows of the result).
        n_item: number of items (columns of the result).

    Returns:
        ``numpy`` array whose entry ``[u][i]`` is the estimated rating.
    """
    reader = Reader()
    data = Dataset.load_from_df(self.data, reader)
    model = surprise.SVD(n_factors=10, n_epochs=10, lr_all=.01, reg_all=.01)

    surprise.model_selection.validation.cross_validate(
        model, data, measures=['MSE'], cv=3, verbose=True)

    # Fit explicitly on every rating. The predictions used to depend on
    # whatever state cross-validation happened to leave in the estimator,
    # i.e. at best a model trained on the last fold only.
    model.fit(data.build_full_trainset())

    # Now fill the matrix.
    print("temps d'attente estimé : ", round(n_user * n_item / 105000),
          "secondes.")
    return np.array([
        [model.predict(u, i).est for i in range(n_item)]
        for u in range(n_user)
    ])
def test_recommend_k_items(rating_true):
    """Check ``compute_ranking_predictions`` with and without seen items.

    An SVD model is fitted on ``rating_true``. With ``remove_seen=True`` the
    result must contain no (user, item) pair from the training data; with
    ``remove_seen=False`` it must contain every user/item combination.
    """
    n_users = len(rating_true["userID"].unique())
    n_items = len(rating_true["itemID"].unique())
    n_seen = rating_true.shape[0]

    reader = surprise.Reader()
    svd = surprise.SVD()
    full_trainset = surprise.Dataset.load_from_df(
        rating_true, reader=reader).build_full_trainset()
    svd.fit(full_trainset)

    # --- default column names, seen pairs removed -----------------------
    preds = compute_ranking_predictions(svd, rating_true, remove_seen=True)
    assert set(preds.columns) == {"userID", "itemID", "prediction"}
    for col in ("userID", "itemID"):
        assert preds[col].dtypes == rating_true[col].dtypes

    first = preds.iloc[0]
    user, item = first["userID"], first["itemID"]
    row_mask = (preds["userID"] == user) & (preds["itemID"] == item)
    assert preds[row_mask]["prediction"].values == pytest.approx(
        svd.predict(user, item).est, rel=TOL)

    # Test default remove_seen=True
    overlap = pd.merge(rating_true, preds, on=["userID", "itemID"])
    assert overlap.shape[0] == 0
    assert preds.shape[0] == (n_users * n_items - n_seen)

    # --- custom column names, seen pairs kept ---------------------------
    renamed = rating_true.rename(
        columns={"userID": "uid", "itemID": "iid", "rating": "r"})
    preds = compute_ranking_predictions(
        svd,
        renamed,
        usercol="uid",
        itemcol="iid",
        predcol="pred",
        remove_seen=False,
    )
    assert set(preds.columns) == {"uid", "iid", "pred"}
    assert preds["uid"].dtypes == rating_true["userID"].dtypes
    assert preds["iid"].dtypes == rating_true["itemID"].dtypes

    second = preds.iloc[1]
    user, item = second["uid"], second["iid"]
    row_mask = (preds["uid"] == user) & (preds["iid"] == item)
    assert preds[row_mask]["pred"].values == pytest.approx(
        svd.predict(user, item).est, rel=TOL)

    # Test remove_seen=False
    overlap = pd.merge(rating_true,
                       preds,
                       left_on=["userID", "itemID"],
                       right_on=["uid", "iid"])
    assert overlap.shape[0] == n_seen
    assert preds.shape[0] == n_users * n_items
def __init__(self, arm_list, cf_params=None):
    """Set up the collaborative-filtering model and per-arm bookkeeping.

    Args:
        arm_list: the available actions; stored as ``self.action_list``.
        cf_params: optional mapping with an 'algo' entry. 'svd' selects a
            Surprise SVD, anything else a KNNBaseline. When ``cf_params``
            is falsy no ``self.algo`` attribute is created.
    """
    n_arms = len(arm_list)

    if cf_params:
        if cf_params['algo'] == 'svd':
            self.algo = surprise.SVD()
        else:
            # Item-based Pearson similarity (similarities are computed
            # between items, not between users).
            item_pearson = {"name": "pearson", "user_based": False}
            self.algo = surprise.KNNBaseline(sim_options=item_pearson,
                                             verbose=False)

    self.first_train = False
    self.data = []
    self.num_rounds = 0
    self.action_list = arm_list

    # One slot per arm, all starting at zero.
    self.action_num_played_arr = np.zeros(n_arms)
    self.is_action_initialized_arr = np.zeros(n_arms)
    self.action_rewards = np.zeros(n_arms)
def algo_tester(data_object):
    '''
    Compare the Surprise algorithms by 3-fold cross-validated RMSE.

    Each algorithm is cross-validated on ``data_object`` and the per-fold
    results (test RMSE, fit time, test time) are averaged.

    ---Parameters---
    data_object(variable) created from the read_data_surprise function

    ---Returns---
    DataFrame indexed by algorithm name, sorted by ascending 'test_rmse',
    so the best-performing algorithm comes first.
    '''
    algos = [
        sp.SVDpp(),
        sp.SVD(),
        sp.SlopeOne(),
        sp.NMF(),
        sp.NormalPredictor(),
        sp.KNNBaseline(),
        sp.KNNBasic(),
        sp.KNNWithMeans(),
        sp.KNNWithZScore(),
        sp.BaselineOnly(),
        sp.CoClustering()
    ]

    benchmark = []
    for algorithm in algos:
        # Perform cross validation
        results = cross_validate(algorithm,
                                 data_object,
                                 measures=['RMSE'],
                                 cv=3,
                                 verbose=False)

        # Average over the folds and tag the row with the algorithm's class
        # name. ``Series.append`` was removed in pandas 2.0, so the label is
        # attached with ``pd.concat``.
        fold_means = pd.DataFrame.from_dict(results).mean(axis=0)
        label = pd.Series([type(algorithm).__name__], index=['Algorithm'])
        benchmark.append(pd.concat([fold_means, label]))

    return pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
        'test_rmse')
def svd_training(args):
    """Train a Surprise SVD with the given hyper-parameters and evaluate it.

    Training and validation frames are unpickled from ``args.datastore``.
    Every metric named in ``args.rating_metrics`` is computed on rating
    predictions for the validation data, every metric named in
    ``args.ranking_metrics`` on ranking predictions for the training users.
    Results are printed and, when ``HAS_AML`` is true, logged to ``run``.

    Args:
        args: parsed arguments carrying the data paths, the SVD
            hyper-parameters, the column names and the metric lists.

    Returns:
        The fitted SVD model.

    Raises:
        ValueError: if neither rating nor ranking metrics were specified.
    """
    # Validate first: this check used to run only after the whole training
    # and evaluation pass, wasting the run when no metric was requested.
    rating_metrics = args.rating_metrics
    ranking_metrics = args.ranking_metrics
    if len(ranking_metrics) == 0 and len(rating_metrics) == 0:
        raise ValueError("No metrics were specified.")

    print("Start training...")
    train_data = pd.read_pickle(
        path=os.path.join(args.datastore, args.train_datapath))
    validation_data = pd.read_pickle(
        path=os.path.join(args.datastore, args.validation_datapath))

    svd = surprise.SVD(random_state=args.random_state,
                       n_epochs=args.epochs,
                       verbose=args.verbose,
                       biased=args.biased,
                       n_factors=args.n_factors,
                       init_mean=args.init_mean,
                       init_std_dev=args.init_std_dev,
                       lr_all=args.lr_all,
                       reg_all=args.reg_all,
                       lr_bu=args.lr_bu,
                       lr_bi=args.lr_bi,
                       lr_pu=args.lr_pu,
                       lr_qi=args.lr_qi,
                       reg_bu=args.reg_bu,
                       reg_bi=args.reg_bi,
                       reg_pu=args.reg_pu,
                       reg_qi=args.reg_qi)

    train_set = surprise.Dataset.load_from_df(
        train_data,
        reader=surprise.Reader(args.surprise_reader)).build_full_trainset()
    svd.fit(train_set)

    print("Evaluating...")

    # SECURITY NOTE(review): metric names are resolved with eval(), so they
    # must come from a trusted source. Consider an explicit name -> function
    # mapping instead.
    if len(rating_metrics) > 0:
        predictions = compute_rating_predictions(svd,
                                                 validation_data,
                                                 usercol=args.usercol,
                                                 itemcol=args.itemcol)
        for metric in rating_metrics:
            result = eval(metric)(validation_data, predictions)
            print(metric, result)
            if HAS_AML:
                run.log(metric, result)

    if len(ranking_metrics) > 0:
        all_predictions = compute_ranking_predictions(
            svd,
            train_data,
            usercol=args.usercol,
            itemcol=args.itemcol,
            remove_seen=args.remove_seen)
        k = args.k
        for metric in ranking_metrics:
            result = eval(metric)(validation_data,
                                  all_predictions,
                                  col_prediction='prediction',
                                  k=k)
            print("{}@{}".format(metric, k), result)
            if HAS_AML:
                run.log(metric, result)

    return svd
def algo_metrics(df):
    '''
    Return the RMSE of an out-of-the-box SVD on a random hold-out of df.

    ---Parameters---
    df (Pandas DataFrame) ratings table handed to Surprise as is

    ---Returns---
    RMSE of the predictions on the 20% test split
    '''
    reader = sp.Reader(line_format='user item rating', sep=',', skip_lines=1)
    dataset = sp.Dataset.load_from_df(df, reader=reader)

    # 80/20 random split; the model only ever sees the 80%.
    fit_part, eval_part = train_test_split(dataset, test_size=.2)

    model = sp.SVD()
    model.fit(fit_part)

    return sp.accuracy.rmse(model.test(eval_part))
def test_compute_ranking_predictions(python_data):
    """Check ``compute_ranking_predictions`` with and without seen items.

    An SVD model is fitted on the non-binary ratings from the ``python_data``
    fixture. By default the result must contain no (user, item) pair from
    the training data; with ``recommend_seen=True`` it must contain every
    user/item combination.
    """
    rating_true, _, _ = python_data(binary_rating=False)
    n_users = len(rating_true['userID'].unique())
    n_items = len(rating_true['itemID'].unique())
    n_seen = rating_true.shape[0]

    reader = surprise.Reader()
    svd = surprise.SVD()
    full_trainset = surprise.Dataset.load_from_df(
        rating_true, reader=reader).build_full_trainset()
    svd.fit(full_trainset)

    # --- default column names, seen pairs excluded ----------------------
    preds = compute_ranking_predictions(svd, rating_true)
    assert set(preds.columns) == {'userID', 'itemID', 'prediction'}
    for col in ('userID', 'itemID'):
        assert preds[col].dtypes == rating_true[col].dtypes

    first = preds.iloc[0]
    user, item = first['userID'], first['itemID']
    row_mask = (preds['userID'] == user) & (preds['itemID'] == item)
    assert preds[row_mask]['prediction'].values == pytest.approx(
        svd.predict(user, item).est, rel=TOL)

    # Test default recommend_seen=False
    overlap = pd.merge(rating_true, preds, on=['userID', 'itemID'])
    assert overlap.shape[0] == 0
    assert preds.shape[0] == (n_users * n_items - n_seen)

    # --- custom column names, seen pairs included -----------------------
    renamed = rating_true.rename(
        columns={'userID': 'uid', 'itemID': 'iid', 'rating': 'r'})
    preds = compute_ranking_predictions(svd,
                                        renamed,
                                        usercol='uid',
                                        itemcol='iid',
                                        predcol='pred',
                                        recommend_seen=True)
    assert set(preds.columns) == {'uid', 'iid', 'pred'}
    assert preds['uid'].dtypes == rating_true['userID'].dtypes
    assert preds['iid'].dtypes == rating_true['itemID'].dtypes

    second = preds.iloc[1]
    user, item = second['uid'], second['iid']
    row_mask = (preds['uid'] == user) & (preds['iid'] == item)
    assert preds[row_mask]['pred'].values == pytest.approx(
        svd.predict(user, item).est, rel=TOL)

    # Test recommend_seen=True
    overlap = pd.merge(rating_true,
                       preds,
                       left_on=['userID', 'itemID'],
                       right_on=['uid', 'iid'])
    assert overlap.shape[0] == n_seen
    assert preds.shape[0] == n_users * n_items
def find_recommendations(data, usr_pref, user_data, user_id):
    """Retrain the SVD with the user's preferences and score ``user_data``.

    Args:
        data: ratings DataFrame with 'userId', 'movieId' and 'rating'.
        usr_pref: DataFrame of the user's own ratings; appended to ``data``
            before training.
        user_data: DataFrame whose ('userId', 'movieId') pairs are scored.
        user_id: not used by this function.

    Returns:
        List with one Surprise prediction per row of ``user_data``.
    """
    # Local import keeps this block self-contained.
    import pandas as pd

    # Add the user's preferences to the training data. ``DataFrame.append``
    # was removed in pandas 2.0; ``concat`` with ``ignore_index`` also
    # renumbers the rows, which the reset_index call used to do.
    data = pd.concat([data, usr_pref], ignore_index=True)

    # Make the ratings compatible with the model through a reader object.
    reader = Reader(rating_scale=(1, 5))
    data_dr = Dataset.load_from_df(data[['userId', 'movieId', 'rating']],
                                   reader)

    # Parameters chosen for the lowest RMSE at an acceptable running time.
    # n_factors=15 drastically shortens training while keeping accuracy
    # high: the user and movie latent vectors are shorter, and the most
    # significant factors still explain the majority of user behaviour.
    algo = surprise.SVD(n_factors=15,
                        n_epochs=10,
                        lr_all=0.03,
                        reg_all=0.04,
                        verbose=True)

    # Train on the data set that now includes the user's preferences.
    algo.fit(data_dr.build_full_trainset())

    # One prediction per (user id, movie id) pair, in row order.
    uids = user_data['userId'].to_numpy()
    iids = user_data['movieId'].to_numpy()
    return [algo.predict(uid=uid, iid=iid) for uid, iid in zip(uids, iids)]
def model_fit(self):
    '''
    Build the trainset and fit the algorithm named by self._algo_choise.

    'SVD', 'Baseline', 'SlopeOne' and 'CoClustering' select the matching
    Surprise algorithm; any other value falls back to KNNBasic. After
    fitting, self.ratings_changed is reset to False.
    '''
    self.build_trainset()
    algo = self._algo_choise

    algorithm_classes = {
        'SVD': surprise.SVD,
        'Baseline': surprise.BaselineOnly,
        'SlopeOne': surprise.SlopeOne,
        'CoClustering': surprise.CoClustering,
    }
    # Unknown names fall back to the basic k-NN model.
    self.algorithm = algorithm_classes.get(algo, surprise.KNNBasic)()

    print('Training Recommender System using %s...' % algo)
    self.algorithm.fit(self.trainset)
    self.ratings_changed = False
    print('Done')
def train_matrix(ratings, factor, k_folds):
    """
    Train an SVD model and return it together with a held-out test set,
    so the model can be evaluated elsewhere.

    @param ratings pandas dataframe to train on, with columns
                   UserId, MovieId, Ratings
    @param factor  number of latent factors of the SVD
    @param k_folds number of folds the training part is split into; the
                   model is trained on each fold's training part in turn
    @returns tuple (algo, testset): the trained model and the Surprise
             test set built from the 20% of `ratings` held out up front

    NOTE(review): split()/folds()/train() look like the legacy Surprise
    API -- confirm the pinned library version before modernising.
    """
    train_part, test_part = cv.train_test_split(ratings, test_size=0.20)

    scale = sp.Reader(rating_scale=(1, 5))
    train_dataset = sp.Dataset.load_from_df(train_part, scale)
    test_dataset = sp.Dataset.load_from_df(test_part, scale)

    train_dataset.split(n_folds=k_folds)
    algo = sp.SVD(n_factors=factor)
    # Only the training half of every fold is used.
    for fold_trainset, _ in train_dataset.folds():
        algo.train(fold_trainset)

    # Turn every held-out rating into a test triple.
    held_out = test_dataset.build_full_trainset().build_testset()
    return (algo, held_out)
def Initialize_q_models(q, r, dat):
    """Initialise ``q`` local models with SVD factors and random anchors.

    For every ``t`` in ``1..q`` an SVD with ``r`` factors and seed
    ``123 + t`` is fitted on ``dat``, and one observed (user, article) pair
    is sampled with that same seed as the model's anchor point.

    Args:
        q: number of local models.
        r: number of latent factors per model.
        dat: ratings DataFrame handed to Surprise as is; it must expose
            'click', 'userId' and 'articleId' columns.

    Returns:
        ``(U, V, anchors)``: dictionaries keyed by ``t`` holding the user
        factors, the item factors, and the anchor as an index pair obtained
        through ``user_IdtoInd`` / ``article_IdtoInd``.
    """
    U = {}  # each U_t: m-by-r
    V = {}  # each V_t: n-by-r
    anchors = {}  # (u_t, i_t) index

    # The reader, dataset and trainset do not depend on t, so they are
    # built once, not on every iteration.
    reader = surprise.Reader(
        rating_scale=(dat.click.min(), dat.click.max()))
    trainset = surprise.Dataset.load_from_df(dat, reader) \
        .build_full_trainset()
    observed_pairs = dat[['userId', 'articleId']]

    for t in range(1, q + 1):
        # Step 5: initialise U_t and V_t by using SVD
        svd = surprise.SVD(random_state=123 + t, n_factors=r)
        svd.fit(trainset)
        # Read the factors from the estimator itself, so this does not
        # rely on fit() returning the fitted model.
        U[t] = svd.pu  # user factors (m, r)
        V[t] = svd.qi  # item factors (n, r)

        # Step 6: pick an observed pair (u_t, i_t) from M at random
        user_id, article_id = observed_pairs.sample(
            1, random_state=123 + t).values[0]
        anchors[t] = (user_IdtoInd[user_id], article_IdtoInd[article_id])

    return U, V, anchors
def svd_training(params):
    """Train a Surprise SVD with the given hyper-parameters and report to NNI.

    Training and validation frames are unpickled from
    ``params["datastore"]``. Every metric in ``params["rating_metrics"]``
    and ``params["ranking_metrics"]`` is looked up by name on the
    ``evaluation`` module and computed. The primary metric is reported to
    NNI under the key "default", and all metrics are written to
    ``metrics.json`` in the directory named by ``NNI_OUTPUT_DIR``.

    Args:
        params: dictionary with the data paths, SVD hyper-parameters,
            column names, metric lists, ``primary_metric``, ``k`` and
            ``remove_seen``.

    Returns:
        The fitted SVD model.

    Raises:
        ValueError: if neither rating nor ranking metrics were specified.
    """
    # Validate first: this check used to run only after the whole training
    # and evaluation pass, wasting the trial when no metric was requested.
    rating_metrics = params["rating_metrics"]
    ranking_metrics = params["ranking_metrics"]
    if len(ranking_metrics) == 0 and len(rating_metrics) == 0:
        raise ValueError("No metrics were specified.")

    logger.debug("Start training...")
    train_data = pd.read_pickle(
        os.path.join(params["datastore"], params["train_datapath"]))
    validation_data = pd.read_pickle(
        os.path.join(params["datastore"], params["validation_datapath"]))

    # Forward only the keys that surprise.SVD understands.
    svd_params = {
        p: params[p]
        for p in [
            "random_state",
            "n_epochs",
            "verbose",
            "biased",
            "n_factors",
            "init_mean",
            "init_std_dev",
            "lr_all",
            "reg_all",
            "lr_bu",
            "lr_bi",
            "lr_pu",
            "lr_qi",
            "reg_bu",
            "reg_bi",
            "reg_pu",
            "reg_qi",
        ]
    }
    svd = surprise.SVD(**svd_params)

    train_set = surprise.Dataset.load_from_df(
        train_data, reader=surprise.Reader(
            params["surprise_reader"])).build_full_trainset()
    svd.fit(train_set)

    logger.debug("Evaluating...")

    metrics_dict = {}
    primary_metric = params["primary_metric"]

    def record(metric, result):
        # The primary metric is stored under "default" for NNI.
        key = "default" if metric == primary_metric else metric
        metrics_dict[key] = result

    if len(rating_metrics) > 0:
        predictions = predict(svd,
                              validation_data,
                              usercol=params["usercol"],
                              itemcol=params["itemcol"])
        for metric in rating_metrics:
            result = getattr(evaluation, metric)(validation_data,
                                                 predictions)
            logger.debug("%s = %g", metric, result)
            record(metric, result)

    if len(ranking_metrics) > 0:
        all_predictions = compute_ranking_predictions(
            svd,
            train_data,
            usercol=params["usercol"],
            itemcol=params["itemcol"],
            remove_seen=params["remove_seen"],
        )
        k = params["k"]
        for metric in ranking_metrics:
            result = getattr(evaluation, metric)(validation_data,
                                                 all_predictions,
                                                 col_prediction="prediction",
                                                 k=k)
            logger.debug("%s@%d = %g", metric, k, result)
            record(metric, result)

    # Report the metrics
    nni.report_final_result(metrics_dict)

    # Save the metrics in a JSON file, with the primary metric stored under
    # its real name again.
    # NOTE(review): NNI_OUTPUT_DIR is assumed to be set by the NNI runtime;
    # os.path.join fails if it is missing.
    output_dir = os.environ.get("NNI_OUTPUT_DIR")
    with open(os.path.join(output_dir, "metrics.json"), "w") as fp:
        temp_dict = metrics_dict.copy()
        temp_dict[params["primary_metric"]] = temp_dict.pop("default")
        json.dump(temp_dict, fp)

    return svd
epochs=2, validation_split=0.1, shuffle=True) y_pred = model.predict([df_hybrid_test['User'], df_hybrid_test['Movie'], test_tfidf]) y_true = df_hybrid_test['Rating'].values rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true)) print('\n\nTesting Result With Keras Hybrid Deep Learning: {:.4f} RMSE'.format(rmse)) # Load dataset into surprise specific data-structure data = sp.Dataset.load_from_df(df_filterd[['User', 'Movie', 'Rating']].sample(20000), sp.Reader()) benchmark = [] # Iterate over all algorithms for algorithm in [sp.SVD(), sp.SVDpp(), sp.SlopeOne(), sp.NMF(), sp.NormalPredictor(), sp.KNNBaseline(), sp.KNNBasic(), sp.KNNWithMeans(), sp.KNNWithZScore(), sp.BaselineOnly(), sp.CoClustering()]: # Perform cross validation results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=3, verbose=False) # Get results & append algorithm name tmp = pd.DataFrame.from_dict(results).mean(axis=0) tmp = tmp.append(pd.Series([str(algorithm).split(' ')[0].split('.')[-1]], index=['Algorithm'])) # Store data benchmark.append(tmp) # Store results surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse', ascending=False) # Get data data = surprise_results[['test_rmse', 'test_mae']]
idcgs[uid] = sum(rel_true/discount_true) dcg = sum(dcgu for (_,dcgu) in dcgs.items()) idcg = sum(idcgu for (_,idcgu) in idcgs.items()) return dcg/idcg data = pd.read_csv('sampled.csv') print "Users: "+str(len(np.unique(data['User-ID'])))+ " items: "+str(len(np.unique(data['ISBN']))) print "No. of ratings: "+str(len(data)) sim_options = {'name': 'pearson', 'user_based': False } algo_knn = surprise.KNNBasic(k=5, sim_options=sim_options) algo_svd = surprise.SVD(n_factors = 10, lr_all= 0.001, reg_all =1) #Around 80% train data for each of these splits sample_sizes = [0.4, 0.2, 0.1,0.05, 0.01] time_knn = [] time_svd = [] for s in sample_sizes: a = data.sample(frac = s, random_state = 111) print "s= "+str(len(a)) print("Removing users with less than 20 ratings....") b = a.groupby('User-ID').filter(lambda x: len(x) >= 20) densityu = (float(len(b))/(len(np.unique(b['User-ID']))*len(np.unique(b['ISBN']))))*100 print "Density after filtering users: "+str(densityu) #0.061
# In[2]: pandas_df.head() # In[3]: u_matrix = (pandas_df.pivot(index="UserID", columns="MovieID", values="Rating").fillna(0)) # In[4]: import time import surprise svd = surprise.SVD(random_state=2, n_factors=200, n_epochs=1000, verbose=True) df_train = pandas_df.drop(columns='Timestamp') # In[5]: from surprise import Reader reader = Reader() df_set_train = surprise.Dataset.load_from_df( df_train[['MovieID', 'UserID', 'Rating']], reader) # In[6]: from surprise.model_selection import cross_validate cross_validate(svd, df_set_train, measures=['RMSE', 'MAE'], cv=5, verbose=True)
songs_ex.drop(['isrc', 'name'], axis=1, inplace=True) #song_ex中的name暂不处理 train = train.merge(songs_ex, on='song_id', how='left') test = test.merge(songs_ex, on='song_id', how='left') del members, songs, songs_ex gc.collect() #2.4:特征提取-统计类特征 #2.5:特征提取-信息类特征(爬虫) ###################################################################### #3:训练user对物品特征的偏好 song_area\song_year\language\genre_ids\song_length\artist_name\composer\lyricist reader = surprise.Reader(rating_scale=(0, 1)) #3.1:msno<->song_area mtrain = train[['msno', 'song_area', 'target']].dropna().drop_duplicates() #去空值去重 algo_area = surprise.SVD() data = surprise.Dataset.load_from_df(mtrain, reader) algo_area.train(data.build_full_trainset()) print('完成msno<->song_area训练') #3.2:msno<->song_year mtrain = train[['msno', 'song_year', 'target']].dropna().drop_duplicates() #去空值去重 algo_year = surprise.SVD() data = surprise.Dataset.load_from_df(mtrain, reader) algo_year.train(data.build_full_trainset()) print('完成msno<->song_year训练') #3.3:msno<->language mtrain = train[['msno', 'language', 'target']].dropna().drop_duplicates() #去空值去重 algo_lang = surprise.SVD() data = surprise.Dataset.load_from_df(mtrain, reader)
import surprise
import pandas as pd

# Load the MovieLens ratings (semicolon-separated) and keep only the three
# columns Surprise needs, in (user, item, rating) order.
rating_df = pd.read_csv(
    '/Users/mac/Desktop/推荐系统/RecommendedSystemCallPackage/data_set/MovieLens/ratings.csv',
    sep=';')
rating_df = rating_df[['UserID', 'MovieID', 'Rating']]
reader = surprise.Reader(rating_scale=(1, 5))
rating_data = surprise.Dataset.load_from_df(rating_df, reader=reader)

# Each algorithm below is evaluated the same way: 5-fold cross-validation
# reporting RMSE and MAE, followed by a dump of the raw result dictionary.

# Matrix factorisation (SVD)
svd = surprise.SVD()
svd_temp = surprise.model_selection.cross_validate(svd,
                                                   rating_data,
                                                   measures=['RMSE', 'MAE'],
                                                   cv=5,
                                                   verbose=True)
print('SVD--------------')
print(svd_temp)

# NormalPredictor
normalPredictor = surprise.NormalPredictor()
normalPredictor_temp = surprise.model_selection.cross_validate(
    normalPredictor, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('normalPredictor--------------')
print(normalPredictor_temp)

# BaselineOnly
baselineOnly = surprise.BaselineOnly()
baselineOnly_temp = surprise.model_selection.cross_validate(
    baselineOnly, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('baselineOnly-----------------')
print(baselineOnly_temp)

# KNNBasic
knnBasic = surprise.KNNBasic()
knnBasic_temp = surprise.model_selection.cross_validate(
    knnBasic, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnBasic-----------------')