def test_fcp():
    """Tests for the FCP function."""

    predictions = [
        pred(0, 0, u0='u1'),
        pred(1, 1, u0='u1'),
        pred(2, 2, u0='u2'),
        pred(100, 100, u0='u2')
    ]
    assert fcp(predictions) == 1

    # A user without two distinct true ratings gives no comparable pairs,
    # so FCP is undefined and should raise.
    predictions = [pred(0, 0, u0='u1'), pred(0, 0, u0='u1')]
    with pytest.raises(ValueError):
        fcp(predictions)

    predictions = [pred(0, 0, u0='u1')]
    with pytest.raises(ValueError):
        fcp(predictions)

    predictions = [
        pred(0, 1, u0='u1'),
        pred(1, 0, u0='u1'),
        pred(2, 0.5, u0='u2'),
        pred(0, 0.6, u0='u2')
    ]
    assert fcp(predictions) == 0

    with pytest.raises(ValueError):
        fcp([])
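# The `pred` helper these tests rely on is defined elsewhere. A minimal
# sketch, assuming it wraps Surprise's Prediction namedtuple
# (uid, iid, r_ui, est, details) with only the fields fcp() looks at:
from surprise.prediction_algorithms.predictions import Prediction

def pred(true_r, est, u0=None):
    # Hypothetical helper: only user id, true rating and estimate matter here.
    return Prediction(uid=u0, iid=None, r_ui=true_r, est=est, details=None)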
def generate_test_score(test_preds, error_metric):
    if error_metric == 'rmse':
        return accuracy.rmse(test_preds)
    elif error_metric == 'mae':
        return accuracy.mae(test_preds)
    elif error_metric == 'fcp':
        return accuracy.fcp(test_preds)
    # Fail loudly instead of silently returning None on an unknown name.
    raise ValueError('Unknown error metric: {}'.format(error_metric))
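# A hypothetical usage sketch, assuming `algo` is an already-fitted Surprise
# algorithm and `testset` an existing list of (user, item, rating) tuples:
test_preds = algo.test(testset)
for name in ('rmse', 'mae', 'fcp'):
    generate_test_score(test_preds, name)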
def main():
    row_num = 5000

    # Read the ratings file into a pandas DataFrame so Surprise can use it.
    ratings_data = pd.read_csv('datasets/song_dataset_ranking.txt', sep='\t',
                               header=None, nrows=row_num)
    # Name the columns.
    ratings_data.columns = ['userId', 'songId', 'rating']

    # Read the CSV containing the song metadata.
    song_data = open('datasets/song_data.csv', 'rt')
    c_reader = csv.reader(song_data, delimiter=',', quotechar='|')

    # Build a dict mapping each song id to its important info, e.g.
    # keysonisonioiaofnai: ['Smoke on the water', 'Deep purple']
    song_dict = {}
    for row in c_reader:
        song_dict.update({row[0]: [row[1], row[3]]})
    song_data.close()

    # Surprise reader; define the rating scale to use.
    reader = Reader(rating_scale=(1, 100))
    # Turn the DataFrame into a Surprise dataset.
    data = Dataset.load_from_df(ratings_data, reader)

    # Split the data into training and test sets.
    training_set, test_set = train_test_split(data, test_size=.25)

    # Define the algorithm. Similarity options must be passed in a
    # sim_options dict; bare name=/user_based= keyword arguments are ignored.
    knn = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})

    # Train the algorithm.
    knn.fit(training_set)
    print("Done training")
    print("Test set length", len(test_set))

    print("testing")
    # Make predictions.
    predictions = knn.test(test_set)

    print("getting recommendations")
    # Measure accuracy: compute FCP (Fraction of Concordant Pairs).
    accuracy.fcp(predictions)

    # Get the top-n predictions per user and write them to a file.
    top_n = get_top_n(predictions, 4)
    with open('predictions.txt', 'w') as file:
        for uid, user_ratings in top_n.items():
            file.write("prediction for " + str(uid) + ":\n")
            result_array = [find_song_info_in_data(iid, song_dict)
                            for (iid, _) in user_ratings]
            for item in result_array:
                file.write("\t")
                file.write('-'.join(item))
                file.write("\n")
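# get_top_n and find_song_info_in_data are not defined in this script.
# Minimal sketches: get_top_n follows the well-known recipe from the
# Surprise FAQ, and the lookup's fallback for unknown ids is an assumption.
from collections import defaultdict

def get_top_n(predictions, n=10):
    # Map each user id to their n highest-estimated (song_id, rating) pairs.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n

def find_song_info_in_data(song_id, song_dict):
    # Hypothetical lookup: return [title, artist], or placeholders if unknown.
    return song_dict.get(song_id, [song_id, 'unknown'])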
def metric(predictions, verbose=True, metric_type="rmse"):
    assert metric_type in {"mse", "fcp", "mae", "rmse"}
    # Use a local name that doesn't shadow the function itself.
    if metric_type == "mse":
        score = accuracy.mse(predictions=predictions, verbose=verbose)
    elif metric_type == "fcp":
        score = accuracy.fcp(predictions=predictions, verbose=verbose)
    elif metric_type == "mae":
        score = accuracy.mae(predictions=predictions, verbose=verbose)
    else:
        score = accuracy.rmse(predictions=predictions, verbose=verbose)
    return score
def algo_metrics(df):
    '''
    Return algorithm accuracy metrics for df: (rmse, mae, fcp)

    ---Parameters---
    df (Pandas DataFrame) RUS DataFrame with (user, item, rating) columns

    ---Returns---
    metrics (tuple)
    '''
    # Note: load_from_df only uses the reader's rating_scale;
    # line_format, sep and skip_lines apply to load_from_file.
    reader = Reader(line_format='user item rating', sep=',', skip_lines=1)
    data = Dataset.load_from_df(df, reader=reader)
    trainset, testset = train_test_split(data, test_size=.2)

    # Fit an out-of-the-box SVD to the trainset and predict on the testset
    algo = SVD()
    algo.fit(trainset)
    predictions = algo.test(testset)

    return (accuracy.rmse(predictions), accuracy.mae(predictions),
            accuracy.fcp(predictions))
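# A hypothetical call, assuming the RUS DataFrame holds user, route and
# rating columns in that order (the column names here are illustrative):
rmse, mae, fcp = algo_metrics(rus_df[['userID', 'routeID', 'rating']])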
testset = rating_test2.build_full_trainset().build_testset()

# SVD++ model
n_factors = [20]   # default = 20
n_epochs = [5]     # default = 20
lr_all = [0.007]   # default = 0.007
reg_all = [0.02]   # default = 0.02

count = 1
for i in n_factors:
    for j in n_epochs:
        for k in lr_all:
            for m in reg_all:
                start = dt.datetime.today()
                print("================================================")
                algo = SVDpp(n_factors=i, n_epochs=j, lr_all=k, reg_all=m)
                # train() was removed from Surprise; fit() is the current API.
                algo.fit(trainset)
                print("This is the #" + str(count) + " parameter combination")
                predictions = algo.test(testset)
                print("n_factors=" + str(i) + ", n_epochs=" + str(j) +
                      ", lr_all=" + str(k) + ", reg_all=" + str(m))
                accuracy.rmse(predictions, verbose=True)
                accuracy.fcp(predictions, verbose=True)
                accuracy.mae(predictions, verbose=True)
                count += 1
                end = dt.datetime.today()
                print("Runtime: " + str(end - start))
reader = Reader(
    line_format="user item rating timestamp",
    sep=",",
    rating_scale=(1, 5),
    skip_lines=1
)

# * loading the csv
data = Dataset.load_from_file(
    file_path="../../ML_Dataset/ml-latest-small/ratings.csv", reader=reader
)

# * dividing in train and test sets (note: these are immediately
#   superseded by the KFold splits below)
trainset, testset = train_test_split(data, test_size=0.25)

# * define a cross-validation iterator
kf = KFold(n_splits=5)

# * Choosing KNN with Baseline as algorithm
algo = KNNBaseline()

# * Train the algorithm on each trainset, and predict ratings for each testset
for trainset, testset in kf.split(data):
    predictions = algo.fit(trainset).test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    accuracy.mse(predictions)
    accuracy.fcp(predictions)
    print("Precision: ", sum(prec for prec in precisions.values()) / len(precisions))
    print("Recall: ", sum(rec for rec in recalls.values()) / len(recalls))

# Persist the last fold's predictions for inspection.
df = pd.DataFrame(predictions, columns=["uid", "iid", "rui", "est", "details"])
df["err"] = abs(df.est - df.rui)
df.to_csv("predictions_KNNBaseline.csv")
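# precision_recall_at_k is not defined in this script. A minimal sketch,
# following the recipe from the Surprise FAQ: a true rating >= threshold
# counts as relevant, an estimate >= threshold as recommended.
from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    # Group the (estimate, true rating) pairs by user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions, recalls = {}, {}
    for uid, user_ratings in user_est_true.items():
        # Sort by estimated rating, best first.
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum(true_r >= threshold for _, true_r in user_ratings)
        n_rec_k = sum(est >= threshold for est, _ in user_ratings[:k])
        n_rel_and_rec_k = sum((true_r >= threshold) and (est >= threshold)
                              for est, true_r in user_ratings[:k])
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k else 0
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel else 0
    return precisions, recalls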
prediction_mf

# Test the recommendation
recom_svd = algo_svd.predict(uid='Jays', iid='AWMjT0WguC1rwyj_rFh3')
recom_svd

sim_options = {'name': 'pearson_baseline', 'shrinkage': 0}
algo = KNNBasic(sim_options=sim_options)  # (unused; algo_knn below is fitted)
algo_knn = KNNBasic(k=50, sim_options=sim_options)
prediction_knn = algo_knn.fit(trainset).test(testset)

# Predictions
prediction_knn

# Test the recommendation
recom_knn = algo_knn.predict(uid='Jays', iid='AWMjT0WguC1rwyj_rFh3')
recom_knn

accuracy.mae(prediction_mf)
accuracy.fcp(prediction_mf)
accuracy.rmse(prediction_mf)

accuracy.mae(prediction_knn)
accuracy.fcp(prediction_knn)
accuracy.rmse(prediction_knn)

# Dataset to be used for a train/test split with the Surprise framework
rating[['reviews.username', 'id', 'reviewsRating']]
def rec():
    reviewsPath = 'data/reviews_ssc.csv'
    df_reviews = pd.read_csv(reviewsPath, sep=',')
    df_reviews['unixReviewTime'] = pd.to_numeric(df_reviews['unixReviewTime'],
                                                 errors='coerce')

    reader = Reader(line_format='user item rating timestamp', sep=',',
                    rating_scale=(1, 5), skip_lines=1)
    reviewsData = Dataset.load_from_file(reviewsPath, reader=reader)
    trainset, testset = train_test_split(reviewsData, test_size=.25)

    """
    param_grid = {'k': [40, 50], 'min_k': [3, 7],
                  'sim_options': {'name': ['msd'],
                                  'min_support': [1, 5],
                                  'user_based': [False]}}
    gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse'], cv=5)
    gs.fit(reviewsData)
    print(gs.best_score['rmse'])
    print(gs.best_params['rmse'])
    """

    # Grid search over CoClustering's cluster counts.
    results = []
    n_cltr_u = [3, 5, 7, 9, 11]
    n_cltr_i = [3, 5, 7, 9, 11]
    for a in n_cltr_u:
        for b in n_cltr_i:
            algo = CoClustering(n_cltr_u=a, n_cltr_i=b)
            predictions = algo.fit(trainset).test(testset)
            rmse = accuracy.rmse(predictions, verbose=False)
            mae = accuracy.mae(predictions, verbose=False)
            fcp = accuracy.fcp(predictions, verbose=False)
            results.append((rmse, mae, fcp, a, b))
            print('{} {} {} {} {}'.format(rmse, mae, fcp, a, b))

    #rows = sorted(results, key=lambda x: x[0])
    # The last two columns are the cluster counts, not k/min_k.
    df = pd.DataFrame(results,
                      columns=['rmse', 'mae', 'fcp', 'n_cltr_u', 'n_cltr_i'])
    df.to_csv('co_clustering.csv', index=False)

    """
    param_grid = {'lr_pu': [0.019775, 0.019825], 'reg_bi': [0.06275, 0.06325],
                  'reg_pu': [0.20775, 0.20825], 'lr_bu': [0.01075, 0.01125],
                  'lr_bi': [0.005275, 0.005325], 'reg_bu': [0.06675, 0.06725],
                  'reg_qi': [0.14775, 0.14825], 'lr_qi': [0.014775, 0.014825]}

    results = []
    lr_bu = [0.001, 0.005, 0.01]
    lr_bi = [0.001, 0.005, 0.01]
    lr_pu = [0.001, 0.005, 0.01]
    lr_qi = [0.001, 0.005, 0.01]
    reg_bu = [0.005, 0.02, 0.05]
    reg_bi = [0.005, 0.02, 0.05]
    reg_pu = [0.005, 0.02, 0.05]
    reg_qi = [0.005, 0.02, 0.05]
    g = itt.product(lr_bu, lr_bi, lr_pu, lr_qi, reg_bu, reg_bi, reg_pu, reg_qi)
    for i in g:
        algo = SVD(n_factors=200, n_epochs=50, lr_bu=i[0], lr_bi=i[1],
                   lr_pu=i[2], lr_qi=i[3], reg_bu=i[4], reg_bi=i[5],
                   reg_pu=i[6], reg_qi=i[7])
        predictions = algo.fit(trainset).test(testset)
        acc = accuracy.rmse(predictions, verbose=False)
        results.append((acc,) + i)
    rows = sorted(results, key=lambda x: x[0])
    df = pd.DataFrame(rows, columns=['rmse', 'lr_bu', 'lr_bi', 'lr_pu', 'lr_qi',
                                     'reg_bu', 'reg_bi', 'reg_pu', 'reg_qi'])
    df.to_csv('svd.csv', index=False)
    """

    print('done')
def fcp_func(predictions):
    """Compute FCP without Surprise's default score printout."""
    return accuracy.fcp(predictions, verbose=False)
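# A hypothetical usage sketch: silencing the per-call printout is handy when
# collecting FCP across several models (svd_preds and knn_preds are assumed
# prediction lists from already-fitted algorithms):
scores = {name: fcp_func(preds)
          for name, preds in [('svd', svd_preds), ('knn', knn_preds)]}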
def hyper_tune(self):
    """
    Use Surprise's RandomizedSearchCV to tune SVD model hyperparameters.

    As recommended by https://surprise.readthedocs.io/en/stable/FAQ.html ,
    split the data set into an A and B set to allow for unbiased accuracy
    evaluation of the tuned parameters. RandomizedSearchCV is much faster
    than GridSearchCV when the data set is not small.

    Returns
    -------
    algo : Tuned Surprise algorithm object
        Can be used to train and test.
    """
    tune_method = self.tune_method
    print('Tuning...')

    # Separate data into A and B sets for unbiased accuracy evaluation
    raw_ratings = self.data_ml.raw_ratings

    # shuffle ratings
    random.shuffle(raw_ratings)

    # A = 90% of the data, B = 10% of the data
    threshold = int(.9 * len(raw_ratings))
    A_raw_ratings = raw_ratings[:threshold]
    B_raw_ratings = raw_ratings[threshold:]

    # make data_ml the set A
    data_ml = self.data_ml
    data_ml.raw_ratings = A_raw_ratings

    # search grid
    param_grid = {
        'n_factors': [50, 100, 150],
        'n_epochs': [30, 50, 70],
        'lr_all': [0.002, 0.005, 0.01],
        'reg_all': [0.02, 0.1, 0.4, 0.6]
    }
    gs = RandomizedSearchCV(SVD, param_grid, measures=['rmse', 'mae', 'fcp'],
                            cv=self.n_splits)

    # fit
    start_time = time.time()
    gs.fit(data_ml)
    search_time = time.time() - start_time
    print("Took {} seconds for search.".format(search_time))

    # best score
    print('Best score: ' + str(gs.best_score[tune_method]))
    # combination of parameters that gave the best score per tune_method
    print('Best params: ' + str(gs.best_params[tune_method]))

    # get resulting algorithm with tuned parameters
    algo = gs.best_estimator[tune_method]

    # retrain on the whole set A
    trainset = data_ml.build_full_trainset()
    algo.fit(trainset)

    # Compute biased accuracy on A
    predictions = algo.test(trainset.build_testset())
    print('Biased accuracy:')
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    accuracy.fcp(predictions)

    # Compute unbiased accuracy on B, which was held out from the search
    testset = data_ml.construct_testset(B_raw_ratings)
    predictions = algo.test(testset)
    print('Unbiased accuracy:')
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    accuracy.fcp(predictions)

    return algo