def base_running_time(data):
    '''Calculates the running times for training and predictions for the
    Baseline algorithm.

    Args:
        data(list of Dataset): datasets with different numbers of users

    Returns:
        elapsedtime_Basetrain: running times (seconds) for training, one per dataset
        elapsedtime_Basetest: running times (seconds) for predictions on the
            anti-testset, one per dataset
    '''
    elapsedtime_Basetrain = []
    elapsedtime_Basetest = []

    for dataset in data:
        # Training running time. NOTE: trainset/anti-testset construction is
        # included in the training timing, matching the original scheme.
        training_start = time.time()
        training = dataset.build_full_trainset()
        testing = training.build_anti_testset()
        baseline = BaselineOnly()
        baseline.train(training)
        elapsedtime_Basetrain.append(time.time() - training_start)

        # Prediction running time.
        test_start = time.time()
        baseline.test(testing)
        elapsedtime_Basetest.append(time.time() - test_start)

    # BUG FIX: the original returned elapsedtime_Basetrain twice, so the
    # prediction timings were computed but never reported.
    return elapsedtime_Basetrain, elapsedtime_Basetest
def test_dump():
    """Dump a trained algorithm together with its predictions, reload both,
    and verify that the reloaded predictions are identical and that the
    reloaded algorithm reproduces the same predictions."""
    random.seed(0)
    folder = os.path.dirname(__file__)
    fold = (os.path.join(folder, './u1_ml100k_train'),
            os.path.join(folder, './u1_ml100k_test'))
    data = Dataset.load_from_folds([fold], Reader('ml-100k'))

    # Single fold: the loop just binds trainset and testset.
    for trainset, testset in data.folds():
        pass

    algo = BaselineOnly()
    algo.train(trainset)
    predictions = algo.test(testset)

    with tempfile.NamedTemporaryFile() as tmp_file:
        dump.dump(tmp_file.name, predictions, algo)
        loaded_predictions, loaded_algo = dump.load(tmp_file.name)
        assert predictions == loaded_predictions
        assert predictions == loaded_algo.test(testset)
def baseline(trainset, testset, predset):
    """Train a tuned BaselineOnly model, report train/test RMSE, and persist
    predictions for both the test set and the prediction set."""
    modelname = 'baseline'
    # Skip the whole run when predictions were already computed.
    if is_already_predicted(modelname):
        return

    bsl_options = {
        'method': 'als',
        'reg_i': 1.e-5,
        'reg_u': 14.6,
        'n_epochs': 10
    }
    model = BaselineOnly(bsl_options=bsl_options)
    print('Baseline Model')
    model.train(trainset)

    # RMSE on the training data itself.
    train_preds = model.test(trainset.build_testset())
    print(' RMSE on Train: ', accuracy.rmse(train_preds, verbose=False))

    # RMSE on the held-out test data; save the estimated ratings.
    test_preds = model.test(testset)
    rmse = accuracy.rmse(test_preds, verbose=False)
    print(' RMSE on Test: ', rmse)
    save_predictions(modelname, rmse,
                     np.array([p.est for p in test_preds]), 'test')

    # Estimated ratings for the entries that need predicting.
    print(' Evaluate predicted ratings...')
    final_preds = model.test(predset)
    save_predictions(modelname, rmse,
                     np.array([p.est for p in final_preds]))
def grid_search_knn(data_train, data_test, n_epochs, reg_us, reg_is, file_name):
    """Manual grid search over ALS baseline hyper-parameters.

    For every (n_epoch, reg_u, reg_i) combination, trains a BaselineOnly
    model on the full trainset and evaluates RMSE on both ``data_train`` and
    ``data_test``. Results are written to an Excel file: train results on
    'Sheet1', test results on 'Sheet2'.

    Args:
        data_train(Dataset): training dataset
        data_test(Dataset): test dataset
        n_epochs(iterable): candidate numbers of ALS epochs
        reg_us(iterable): candidate user regularization terms
        reg_is(iterable): candidate item regularization terms
        file_name(str): path of the Excel file to write
    """
    print('KNN Surprise manual grid search')
    train_rows = []
    test_rows = []

    # Hoisted out of the loops: the full trainset does not depend on the
    # hyper-parameters, so build it once.
    trainset = data_train.build_full_trainset()

    for n_epoch in n_epochs:
        for reg_u in reg_us:
            for reg_i in reg_is:
                bsl_options = {
                    'method': 'als',
                    'n_epochs': n_epoch,
                    'reg_u': reg_u,
                    'reg_i': reg_i
                }
                algo = BaselineOnly(bsl_options=bsl_options)
                algo.train(trainset)

                # Evaluate the performance on both datasets.
                perf_train = evaluate(algo, data_train, measures=['RMSE'])
                perf_test = evaluate(algo, data_test, measures=['RMSE'])

                perf_train["n_epoch"] = n_epoch
                perf_train["reg_u"] = reg_u
                perf_train["reg_i"] = reg_i
                # Store the mean RMSE over folds on train.
                perf_train["rmse"] = np.mean(perf_train['rmse'])

                perf_test["n_epoch"] = n_epoch
                perf_test["reg_u"] = reg_u
                perf_test["reg_i"] = reg_i
                # Store the mean RMSE over folds on test.
                perf_test["rmse"] = np.mean(perf_test['rmse'])

                # BUG FIX: DataFrame.append() was removed in pandas 2.0.
                # Collect rows and build the DataFrames once at the end.
                train_rows.append(dict(perf_train))
                test_rows.append(dict(perf_test))

    result_train = pd.DataFrame(train_rows)
    result_test = pd.DataFrame(test_rows)

    # BUG FIX: ExcelWriter.save() was removed in pandas 2.0; the context
    # manager closes (and writes) the file. sheet_name is keyword-only in
    # recent pandas.
    with pd.ExcelWriter(file_name, engine='xlsxwriter') as writer:
        result_train.to_excel(writer, sheet_name='Sheet1')
        result_test.to_excel(writer, sheet_name='Sheet2')
def knn_surprise(data_train, n_epoch, reg_u, reg_i, name_file):
    """Train a BaselineOnly model with the given (best) ALS parameters,
    print its RMSE on the training data, and write out the predictions."""
    print('KNN Surprise')

    # Build the algorithm with the supplied hyper-parameters.
    options = {
        'method': 'als',
        'n_epochs': n_epoch,
        'reg_u': reg_u,
        'reg_i': reg_i
    }
    model = BaselineOnly(bsl_options=options)

    # Train on the full trainset, then report RMSE.
    full_trainset = data_train.build_full_trainset()
    model.train(full_trainset)
    evaluate(model, data_train, measures=['RMSE'])

    # Persist the predictions.
    make_prediction_surprise(model, name_file)
def baseline(training, testing):
    """Fit a default BaselineOnly model and score it on the test set.

    Args:
        training(Dataset): training dataset
        testing(Dataset): test dataset

    Returns:
        rmse: RMSE of the Baseline model on ``testing``
        top_n: top-5 recommendations derived from the test predictions
    """
    model = BaselineOnly()
    model.train(training)
    test_predictions = model.test(testing)

    # Preserve evaluation order: top-n first, then the (verbose) RMSE print.
    top_recs = get_top_n(test_predictions, n=5)
    test_rmse = accuracy.rmse(test_predictions, verbose=True)
    return test_rmse, top_recs
def compute_recommendations(user_id, prediction_table, numeric_prediction_table):
    """Compute top-10 Baseline recommendations for one user and persist them.

    Reads ratings from the DB and a CSV file, trains a BaselineOnly model on
    the combined data, predicts ratings for every item the user has not rated,
    and writes three result tables back to the database:
    ``prediction_table`` (top-10 item ids, one row per user),
    ``numeric_predictions`` (top-20 rows with predicted ratings), and
    ``numeric_prediction_table`` (top-10 predicted ratings, one row per user).

    Args:
        user_id: raw id of the user to recommend for
        prediction_table(str): destination SQL table for top-10 item ids
        numeric_prediction_table(str): destination SQL table for top-10 ratings
    """
    algo = 'Baseline'
    algorithm = BaselineOnly()
    # add_pageview(user_id=user_id, item_id=None, page="Model Predictions", activity_type="Initialize Predictions - " + algo, rating=None) #pageview

    engine = create_engine(config.DB_URI, echo=True)
    session = scoped_session(
        sessionmaker(bind=engine, autocommit=False, autoflush=False))

    # Reading ratings from the database; keep only the rating triples.
    df_ratings = pd.read_sql('SELECT * FROM ratings;', con=engine)
    df_ratings = df_ratings[['user_id', 'item_id', 'rating']]
    df_ratings = df_ratings.dropna()
    df_ratings = df_ratings.drop_duplicates()

    # Additional historical ratings from a CSV dump, normalized to the same
    # column names, then merged with the DB ratings.
    df_ratings2 = pd.read_csv('data/ratings.csv', low_memory=False)
    df_ratings2 = df_ratings2.rename(columns={'movie_id': 'item_id'})
    df_ratings2 = df_ratings2[['user_id', 'item_id', 'rating']]
    df_ratings2 = df_ratings2.dropna()
    df_ratings2 = df_ratings2.drop_duplicates()
    df_ratings = pd.concat([df_ratings, df_ratings2], axis=0)

    # Train on the full combined dataset (1-10 rating scale).
    reader = Reader(line_format='user item rating', sep=',', rating_scale=(1, 10))
    data = Dataset.load_from_df(df_ratings, reader=reader)
    trainset = data.build_full_trainset()
    # algorithm = eval(algo + "()")# set the algorithm...............................................
    algorithm.train(trainset)

    # Candidate items: everything in the catalog the user has not rated yet.
    items = pd.read_sql('SELECT distinct id FROM items;', con=engine)
    df_user_items = df_ratings.loc[df_ratings['user_id'] == user_id]
    total_items = items.id.unique()
    user_items = df_user_items.item_id.unique()
    # user_id = str(user_id)
    prediction_items = [x for x in total_items if x not in user_items]

    # Predict a rating for each candidate item.
    predictions = pd.DataFrame(columns=['user_id', 'item_id', 'prediction'])
    predicted_ratings = []
    for i in prediction_items:
        a = user_id
        b = i
        est = algorithm.predict(a, b)
        # Element 3 of the prediction is the estimated rating.
        predicted_ratings.append(est[3])

    predictions['item_id'] = prediction_items
    predictions['user_id'] = pd.Series(
        [user_id for x in range(len(predictions.index))],
        index=predictions.index)
    predictions['prediction'] = predicted_ratings
    predictions = predictions.sort_values('prediction', ascending=False)
    test_prediction = predictions  # full ranked list, kept for the top-20 table
    predictions = predictions.head(n=10)

    # Top-10 item ids, one wide row per user -> prediction_table.
    cols = [
        'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6', 'pred_7',
        'pred_8', 'pred_9', 'pred_10'
    ]
    df_pred = predictions[['item_id']].T
    df_pred.columns = cols
    df_pred['id'] = user_id
    df_pred = df_pred[[
        'id', 'pred_1', 'pred_2', 'pred_3', 'pred_4', 'pred_5', 'pred_6',
        'pred_7', 'pred_8', 'pred_9', 'pred_10'
    ]]
    df_pred['id'] = df_pred['id'].astype(int)
    df_pred.to_sql(prediction_table, engine, if_exists='append',
                   index=False)  #if_exists='append'
    session.commit()

    # Top-20 (item, predicted rating) rows -> numeric_predictions table.
    df_num_ratings = test_prediction
    df_num_ratings = df_num_ratings.head(n=20)
    df_num_ratings['algorithm'] = algo
    df_num_ratings.rename(columns={'prediction': 'predicted_rating'},
                          inplace=True)
    df_num_ratings.to_sql('numeric_predictions', engine, if_exists='append',
                          index=False)  #if_exists='append'
    session.commit()

    # Top-10 predicted ratings, one wide row per user -> numeric_prediction_table.
    predcols = [
        'num_1', 'num_2', 'num_3', 'num_4', 'num_5', 'num_6', 'num_7',
        'num_8', 'num_9', 'num_10'
    ]
    df_num_ratings_transpose = predictions[['prediction']].T
    df_num_ratings_transpose.columns = predcols
    df_num_ratings_transpose['id'] = user_id
    df_num_ratings_transpose = df_num_ratings_transpose[[
        'id', 'num_1', 'num_2', 'num_3', 'num_4',
        'num_5', 'num_6', 'num_7', 'num_8', 'num_9', 'num_10'
    ]]
    df_num_ratings_transpose['id'] = df_num_ratings_transpose['id'].astype(int)
    df_num_ratings_transpose.to_sql(numeric_prediction_table, engine,
                                    if_exists='append',
                                    index=False)  #if_exists='append'
    session.commit()
def test_trainset_testset():
    """Test the construct_trainset and construct_testset methods."""
    here = os.path.dirname(os.path.realpath(__file__))
    data = Dataset.load_from_folds(
        folds_files=[(here + '/custom_train', here + '/custom_test')],
        reader=reader)

    # Single fold: the loop just binds trainset and testset.
    for trainset, testset in data.folds():
        pass

    # user ratings
    assert trainset.ur[0] == [(0, 4)]
    assert trainset.ur[1] == [(0, 4), (1, 2)]
    assert trainset.ur[40] == []  # not in the trainset

    # item ratings
    assert trainset.ir[0] == [(0, 4), (1, 4), (2, 1)]
    assert trainset.ir[1] == [(1, 2), (2, 1), (3, 5)]
    assert trainset.ir[20000] == []  # not in the trainset

    # sizes and rating scale
    assert trainset.n_users == 4
    assert trainset.n_items == 2
    assert trainset.n_ratings == 6
    assert trainset.rating_scale == (1, 5)

    # raw -> inner id mappings
    for inner in range(4):
        assert trainset.to_inner_uid('user' + str(inner)) == inner
    with pytest.raises(ValueError):
        trainset.to_inner_uid('unkown_user')
    for inner in range(2):
        assert trainset.to_inner_iid('item' + str(inner)) == inner
    with pytest.raises(ValueError):
        trainset.to_inner_iid('unkown_item')

    # inner -> raw id mappings are built lazily on first use
    assert trainset._inner2raw_id_users is None
    assert trainset._inner2raw_id_items is None
    for inner in range(4):
        assert trainset.to_raw_uid(inner) == 'user' + str(inner)
    for inner in range(2):
        assert trainset.to_raw_iid(inner) == 'item' + str(inner)
    assert trainset._inner2raw_id_users is not None
    assert trainset._inner2raw_id_items is not None

    # build_testset(): an algorithm can consume it and it contains the
    # known ratings with their true values
    algo = BaselineOnly()
    algo.train(trainset)
    testset = trainset.build_testset()
    algo.test(testset)
    assert ('user0', 'item0', 4) in testset
    assert ('user3', 'item1', 5) in testset
    assert ('user3', 'item1', 0) not in testset

    # build_anti_testset(): only the unknown ratings, filled with the mean
    algo = BaselineOnly()
    algo.train(trainset)
    testset = trainset.build_anti_testset()
    algo.test(testset)
    assert ('user0', 'item0', trainset.global_mean) not in testset
    assert ('user3', 'item1', trainset.global_mean) not in testset
    assert ('user0', 'item1', trainset.global_mean) in testset
    assert ('user3', 'item0', trainset.global_mean) in testset
#reader = Reader(line_format='user item rating', sep='\t')
# A reader is still needed but only the rating_scale param is required.
data.split(n_folds=20)  # data can now be used normally
data_full = data.build_full_trainset()

# Hand-rolled baseline estimate (own implementation) for comparison.
bu, bi, global_mean = get_baseline(datamat_missing, lr=0.01, n_epochs=50, reg=0)
best_item_est_oma = np.mean(bu) + bi + global_mean

# BUG FIX: BaselineOnly takes its regularization parameters through the
# `bsl_options` dict; passing reg_u/reg_i as direct keyword arguments raises
# a TypeError.
algo_baseline = BaselineOnly(bsl_options={'reg_u': 0, 'reg_i': 0})
algo_baseline.train(data_full)
# Use the public `global_mean` property rather than the private `_global_mean`
# attribute (the private field may still be unset until first access).
best_item_est = (algo_baseline.trainset.global_mean
                 + np.mean(algo_baseline.bu) + algo_baseline.bi)

# Matrix-factorization comparison point (biased NMF, no bias regularization).
algo_SVD = NMF(verbose=True, n_factors=5, n_epochs=50, reg_bu=0, reg_bi=0,
               reg_pu=0.1, reg_qi=0.1, biased=True)
algo_SVD.train(data_full)
best_item_est_svd = (algo_SVD.trainset.global_mean
                     + np.mean(algo_SVD.bu) + algo_SVD.bi)
# Exhaustive grid search over ALS baseline hyper-parameters: prints RMSE,
# FCP and MAE on the test set for every (n_epochs, reg_u, reg_i) combination.
count=1
for i in n_epochs:      # candidate epoch counts
    for j in reg_u:     # candidate user regularization terms
        for k in reg_i: # candidate item regularization terms
            print("================================================")
            bsl_options = {'method': 'als',
                           'n_epochs': i,
                           'reg_u': j,
                           'reg_i': k
                           }
            algo = BaselineOnly(bsl_options=bsl_options)
            algo.train(trainset)
            print("This is the #" + str(count) + " parameter combination")
            predictions=algo.test(testset)
            print("n_epochs="+str(i)+", "+"reg_u="+str(j)+", "+"reg_i="+str(k))
            accuracy.rmse(predictions, verbose=True)
            accuracy.fcp(predictions, verbose=True)
            accuracy.mae(predictions, verbose=True)
            count=count+1

## baseline model using SGD
# Candidate hyper-parameter grids for the SGD variant — presumably consumed
# by a later grid-search block; TODO confirm against the rest of the file.
n_epochs=[5, 10]
reg=[0.2, 0.02] # where reg_u>0, and default = 0.02
learning_rate=[0.05, 0.005] # where between 0 and 1, and default = 0.005
# NOTE(review): notebook-style fragment — bare `pred` / `pred[3]` expressions
# display output in a notebook but are no-ops when run as a plain script.
pred
print("Predicted Rating:")
pred[3]  # element 3 of the prediction is the estimated rating

# print('Using ALS')
# Baseline model with explicit ALS options.
bsl_options = {'method': 'als',
               'n_epochs': 5,
               'reg_u': 12,
               'reg_i': 5
               }
algo_2 = BaselineOnly(bsl_options=bsl_options)
trainset = data.build_full_trainset()
algo_2.train(trainset)

# Predict the rating that user '374' would give item '500' (raw string ids).
pred = algo_2.predict('374', '500')
print("Prediction Object:")
pred
print("Predicted Rating:")
pred[3]

#Predicting all missing entries
#First lets start by visualising our matrix of all observed entries.
#This matrix is quite sparse.
import numpy as np

n_users = trainset.n_users  # number of users in the full trainset