Example #1
0
def test_fcp():
    """Check fcp() on agreeing, disagreeing and degenerate prediction lists."""

    # NOTE(review): ``pred`` is defined outside this snippet; presumably its
    # two positional arguments are (true rating, estimate) - confirm.

    # Both positional arguments are equal for every prediction of both users:
    # the score must be exactly 1.
    agreeing = [pred(value, value, u0=user)
                for value, user in ((0, 'u1'), (1, 'u1'), (2, 'u2'), (100, 'u2'))]
    assert fcp(agreeing) == 1

    # Two identical predictions for one single user must be rejected.
    duplicated = [pred(0, 0, u0='u1') for _ in range(2)]
    with pytest.raises(ValueError):
        fcp(duplicated)

    # A lone prediction must be rejected as well.
    lonely = [pred(0, 0, u0='u1')]
    with pytest.raises(ValueError):
        fcp(lonely)

    # For each user the second argument moves opposite to the first one:
    # the score must be exactly 0.
    disagreeing = [pred(first, second, u0=user)
                   for first, second, user in ((0, 1, 'u1'), (1, 0, 'u1'),
                                               (2, 0.5, 'u2'), (0, 0.6, 'u2'))]
    assert fcp(disagreeing) == 0

    # No predictions at all is an error too.
    with pytest.raises(ValueError):
        fcp([])
def generate_test_score(test_preds, error_metric):
    """Return the score of ``test_preds`` for the metric named ``error_metric``.

    ``error_metric`` selects one of ``accuracy.rmse``, ``accuracy.mae`` or
    ``accuracy.fcp`` through the names 'rmse', 'mae' and 'fcp'. Any other
    value yields ``None`` without computing anything.
    """
    if error_metric == 'rmse':
        scorer = accuracy.rmse
    elif error_metric == 'mae':
        scorer = accuracy.mae
    elif error_metric == 'fcp':
        scorer = accuracy.fcp
    else:
        # Unrecognised metric name: nothing to compute.
        return None
    return scorer(test_preds)
def main():
    """Train an item-based cosine KNN on song ratings and write recommendations.

    Steps:
      1. Read the first 5000 rows of ``datasets/song_dataset_ranking.txt``
         (tab separated, no header) as userId / songId / rating.
      2. Build a lookup from column 0 of ``datasets/song_data.csv`` to its
         columns 1 and 3.
      3. Split the ratings 75/25, fit ``KNNBasic`` on the training part and
         predict the test part.
      4. Report FCP (Fraction of Concordant Pairs) through ``accuracy.fcp``.
      5. Write, for every user returned by ``get_top_n(predictions, 4)``, the
         song info of each recommended item to ``predictions.txt``.

    ``get_top_n`` and ``find_song_info_in_data`` are helpers defined outside
    this function.
    """
    row_num = 5000
    # Ratings go through a pandas DataFrame because that is what
    # Dataset.load_from_df consumes.
    ratings_data = pd.read_csv('datasets/song_dataset_ranking.txt', sep="\t",
                               header=None, nrows=row_num)
    ratings_data.columns = ['userId', 'songId', 'rating']

    # Lookup of song info keyed by the first CSV column, for example
    # 'keysonisonioiaofnai': ['Smoke on the water', 'Deep purple'].
    # The ``with`` block guarantees the CSV handle is closed again.
    with open('datasets/song_data.csv', 'rt') as song_data:
        c_reader = csv.reader(song_data, delimiter=',', quotechar='|')
        song_dict = {row[0]: [row[1], row[3]] for row in c_reader}

    # Surprise reader: only declares the rating scale in use.
    reader = Reader(rating_scale=(1, 100))
    data = Dataset.load_from_df(ratings_data, reader)
    training_set, test_set = train_test_split(data, test_size=.25)

    # Similarity settings have to be handed over in the ``sim_options`` dict;
    # given as bare keyword arguments they are not applied and the algorithm
    # silently keeps its default similarity configuration.
    knn = KNNBasic(sim_options={'name': 'cosine', 'user_based': False})
    knn.fit(training_set)
    print("Done training")
    print("Test set length", len(test_set))
    print("testing")
    predictions = knn.test(test_set)
    print("getting recommendations")
    # Measure accuracy: FCP (Fraction of Concordant Pairs).
    accuracy.fcp(predictions)
    top_n = get_top_n(predictions, 4)

    with open('predictions.txt', 'w') as out_file:
        for uid, user_ratings in top_n.items():
            out_file.write("prediction for " + str(uid) + ":\n")
            song_infos = [find_song_info_in_data(iid, song_dict)
                          for (iid, _) in user_ratings]
            # One tab-indented line per recommended song, fields joined by '-'.
            out_file.writelines("\t" + '-'.join(info) + "\n"
                                for info in song_infos)
Example #4
0
 def metric(predictions, verbose=True, metric_type="rmse"):
     """Compute one accuracy measure for ``predictions`` and return it.

     ``metric_type`` must be one of "mse", "fcp", "mae" or "rmse"; this is
     enforced with an ``assert``. ``predictions`` and ``verbose`` are passed
     on unchanged, as keyword arguments, to the matching ``accuracy``
     function.
     """
     assert metric_type in {"mse", "fcp", "mae", "rmse"}
     if metric_type == "mse":
         scorer = accuracy.mse
     elif metric_type == "fcp":
         scorer = accuracy.fcp
     elif metric_type == "mae":
         scorer = accuracy.mae
     else:
         # Only "rmse" is left once the assertion above has passed.
         scorer = accuracy.rmse
     score = scorer(predictions=predictions, verbose=verbose)
     return score
Example #5
0
def test_fcp():
    """Exercise fcp() with valid and with rejected prediction lists."""

    def assert_rejected(preds):
        # fcp() is expected to refuse this input with a ValueError.
        with pytest.raises(ValueError):
            fcp(preds)

    # NOTE(review): ``pred`` comes from outside this snippet; presumably it
    # takes (true rating, estimate, u0=user id) - confirm.

    # Both arguments always match, for two users: score is exactly 1.
    matching = [
        pred(0, 0, u0='u1'),
        pred(1, 1, u0='u1'),
        pred(2, 2, u0='u2'),
        pred(100, 100, u0='u2'),
    ]
    assert fcp(matching) == 1

    # The same prediction twice for a single user.
    assert_rejected([pred(0, 0, u0='u1'), pred(0, 0, u0='u1')])

    # One prediction only.
    assert_rejected([pred(0, 0, u0='u1')])

    # Per user, the second argument is ordered opposite to the first:
    # score is exactly 0.
    opposed = [
        pred(0, 1, u0='u1'),
        pred(1, 0, u0='u1'),
        pred(2, 0.5, u0='u2'),
        pred(0, 0.6, u0='u2'),
    ]
    assert fcp(opposed) == 0

    # Empty input.
    assert_rejected([])
Example #6
0
def algo_metrics(df):
    '''
    Fit a default SVD on 80% of df and score it on the remaining 20%.

    ---Parameters---
    df (Pandas DataFrame) RUS DataFrame, handed to Dataset.load_from_df
        (NOTE(review): presumably user, item, rating columns - confirm)

    ---Returns---
    metrics (tuple) (rmse, mae, fcp) of the predictions on the test split
    '''
    ratings = Dataset.load_from_df(
        df,
        reader=Reader(line_format='user item rating', sep=',', skip_lines=1),
    )
    train_part, test_part = train_test_split(ratings, test_size=.2)

    # Out of the box SVD: train on the train split, predict the test split
    model = SVD()
    model.fit(train_part)
    test_predictions = model.test(test_part)

    # Same evaluation order as the returned tuple: rmse, mae, fcp
    rmse_score = accuracy.rmse(test_predictions)
    mae_score = accuracy.mae(test_predictions)
    fcp_score = accuracy.fcp(test_predictions)
    return rmse_score, mae_score, fcp_score
Example #7
0
# Test on every rating of rating_test2: its full trainset is converted back
# into a testset. (rating_test2 and trainset are defined earlier in the file.)
testset = rating_test2.build_full_trainset().build_testset()

# SVD++ model (the code below instantiates SVDpp).
# Every list holds a single value, so the nested loops run one combination;
# add values to a list to widen the sweep.

n_factors = [20]  # where default = 20
n_epochs = [5]  # where default = 20
lr_all = [0.007]  # where default = 0.007
reg_all = [0.02]  # where default = 0.02

count = 1

for i in n_factors:
    for j in n_epochs:
        for k in lr_all:
            for m in reg_all:
                start = dt.datetime.today()
                print("================================================")
                algo = SVDpp(n_factors=i, n_epochs=j, lr_all=k, reg_all=m)

                # fit() is the supported training entry point; train() is
                # deprecated in its favour.
                algo.fit(trainset)
                print("This is the #{} parameter combination".format(count))
                predictions = algo.test(testset)

                print("n_factors={}, n_epochs={}, lr_all={}, reg_all={}"
                      .format(i, j, k, m))
                accuracy.rmse(predictions, verbose=True)
                accuracy.fcp(predictions, verbose=True)
                accuracy.mae(predictions, verbose=True)
                count += 1
                end = dt.datetime.today()
                print("Runtime: {}".format(end - start))
# Reader describing the ratings CSV: comma separated, one header line to skip,
# ratings on a 1..5 scale.
reader = Reader(
    line_format="user item rating timestamp",
    sep=",",
    rating_scale=(1, 5),
    skip_lines=1,
)
# Load the CSV through that reader.
data = Dataset.load_from_file(
    file_path="../../ML_Dataset/ml-latest-small/ratings.csv",
    reader=reader,
)
# Single 75/25 split.
# NOTE(review): both names are rebound by the cross-validation loop below
# before they are ever used, so this split has no visible effect here.
trainset, testset = train_test_split(data, test_size=0.25)

# 5-fold cross-validation iterator.
kf = KFold(n_splits=5)

# KNN with baseline ratings as the algorithm under test.
algo = KNNBaseline()

# For each fold: fit on the training part, predict the held-out part, then
# report the accuracy measures plus mean precision and recall
# (precision_recall_at_k is called with k=5, threshold=4).
for trainset, testset in kf.split(data):
    predictions = algo.fit(trainset).test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)
    accuracy.rmse(predictions)
    accuracy.mae(predictions)
    accuracy.mse(predictions)
    accuracy.fcp(predictions)
    print("Precision: ", sum(precisions.values()) / len(precisions))
    print("Recall: ", sum(recalls.values()) / len(recalls))

# Only the predictions of the last fold are still bound here; dump them with
# their absolute error (|est - rui|) to CSV.
df = pd.DataFrame(predictions, columns=["uid", "iid", "rui", "est", "details"])
df["err"] = (df["est"] - df["rui"]).abs()
df.to_csv("predictions_KNNBaseline.csv")
Example #9
0
# Notebook-style fragment: bare expressions such as the one below only show a
# value when run as a notebook cell; in a plain script they have no effect.
# prediction_mf, algo_svd, trainset, testset and rating are defined outside
# this fragment.
prediction_mf

# Test the recommendation: predict one rating for a single (user, item) pair
recom_svd = algo_svd.predict(uid='Jays',iid='AWMjT0WguC1rwyj_rFh3')
recom_svd

# KNN configured with the 'pearson_baseline' similarity and shrinkage 0
sim_options = {'name': 'pearson_baseline','shrinkage': 0}
# NOTE(review): `algo` is not used again in this fragment; only `algo_knn`
# (k=50) is fitted below - confirm whether `algo` is needed elsewhere.
algo = KNNBasic(sim_options=sim_options)
algo_knn = KNNBasic(k=50, sim_options=sim_options)
prediction_knn = algo_knn.fit(trainset).test(testset)

# Predictions
prediction_knn

# Test the recommendation: same (user, item) pair as for the SVD model above
recom_knn = algo_knn.predict(uid='Jays',iid='AWMjT0WguC1rwyj_rFh3')
recom_knn


# Accuracy measures (MAE, FCP, RMSE) for prediction_mf
accuracy.mae(prediction_mf)
accuracy.fcp(prediction_mf)
accuracy.rmse(prediction_mf)


# The same three measures for the KNN predictions
accuracy.mae(prediction_knn)
accuracy.fcp(prediction_knn)
accuracy.rmse(prediction_knn)


# Dataset that will be used for the train/test split with the Surprise framework
rating[['reviews.username','id','reviewsRating']]
Example #10
0
def rec():
    """Sweep CoClustering cluster counts on the reviews data and save scores.

    Loads ``data/reviews_ssc.csv`` through a Surprise ``Reader`` (comma
    separated, one header line skipped, ratings from 1 to 5), holds out 25%
    of it as a test set, and fits one ``CoClustering`` model for every
    (n_cltr_u, n_cltr_i) pair taken from 3, 5, 7, 9, 11. For each pair the
    rmse, mae and fcp on the test set are printed, and all rows are finally
    written to ``co_clustering.csv``.

    Returns nothing; the results are the printed lines and the CSV file.
    """
    reviewsPath = 'data/reviews_ssc.csv'

    reader = Reader(line_format='user item rating timestamp',
                    sep=',',
                    rating_scale=(1, 5),
                    skip_lines=1)
    reviewsData = Dataset.load_from_file(reviewsPath, reader=reader)
    trainset, testset = train_test_split(reviewsData, test_size=.25)

    results = []
    n_cltr_u = [3, 5, 7, 9, 11]
    n_cltr_i = [3, 5, 7, 9, 11]
    for a in n_cltr_u:
        for b in n_cltr_i:
            algo = CoClustering(n_cltr_u=a, n_cltr_i=b)
            predictions = algo.fit(trainset).test(testset)
            rmse = accuracy.rmse(predictions, verbose=False)
            mae = accuracy.mae(predictions, verbose=False)
            fcp = accuracy.fcp(predictions, verbose=False)
            results.append((rmse, mae, fcp, a, b))
            print('{} {} {} {} {}'.format(rmse, mae, fcp, a, b))

    # The last two columns hold the cluster counts that were swept, so they
    # are labelled after the CoClustering parameters they came from.
    df = pd.DataFrame(results,
                      columns=['rmse', 'mae', 'fcp', 'n_cltr_u', 'n_cltr_i'])
    df.to_csv('co_clustering.csv', index=False)

    print('done')
Example #11
0
def fcp_func(predictions):
    """Return ``accuracy.fcp`` of ``predictions``, called with verbose=False."""
    score = accuracy.fcp(predictions, verbose=False)
    return score
Example #12
0
    def hyper_tune(self):
        """
        Tune SVD hyperparameters with Surprise's RandomizedSearchCV.

        As recommended by https://surprise.readthedocs.io/en/stable/FAQ.html ,
        the ratings are shuffled and split into a set A (first 90%) and a
        set B (remaining 10%). The search and the final fit only see A, so B
        allows an unbiased accuracy evaluation of the tuned parameters.

        RandomizedSearchCV is much faster than GridSearchCV when data set is
        not small.

        The estimator is picked according to ``self.tune_method``, which is
        used as the key into the search results; the number of folds comes
        from ``self.n_splits``.

        Side effects: ``self.data_ml`` is modified in place. Its
        ``raw_ratings`` list is shuffled and then replaced by set A. Progress
        and accuracy figures are printed.

        Returns
        -------
        algo : Tuned Surprise algorithm object
            Can be used to train and test.

        """
        tune_method = self.tune_method
        print('Tuning...')

        # Shuffle, then cut into A (search + fit) and B (held-out evaluation)
        dataset = self.data_ml
        all_ratings = dataset.raw_ratings
        random.shuffle(all_ratings)
        cut = int(.9 * len(all_ratings))
        ratings_a, ratings_b = all_ratings[:cut], all_ratings[cut:]
        # From here on the dataset object only contains set A
        dataset.raw_ratings = ratings_a

        # Candidate values the randomized search samples from
        search_space = {
            'n_factors': [50, 100, 150],
            'n_epochs': [30, 50, 70],
            'lr_all': [0.002, 0.005, 0.01],
            'reg_all': [0.02, 0.1, 0.4, 0.6]
        }
        search = RandomizedSearchCV(SVD,
                                    search_space,
                                    measures=['rmse', 'mae', 'fcp'],
                                    cv=self.n_splits)

        # Run the search and report how long it took
        started = time.time()
        search.fit(dataset)
        elapsed = time.time() - started
        print("Took {} seconds for search.".format(elapsed))

        # Best score and the parameter combination that produced it,
        # both looked up by tune_method
        print('Best score: {}'.format(search.best_score[tune_method]))
        print('Best params: {}'.format(search.best_params[tune_method]))

        # Algorithm instance carrying the tuned parameters
        algo = search.best_estimator[tune_method]

        # Retrain on all of set A
        full_trainset = dataset.build_full_trainset()
        algo.fit(full_trainset)

        # Biased accuracy: evaluated on the very ratings used for training
        seen_predictions = algo.test(full_trainset.build_testset())
        print('Biased accuracy:')
        accuracy.rmse(seen_predictions)
        accuracy.mae(seen_predictions)
        accuracy.fcp(seen_predictions)

        # Unbiased accuracy: evaluated on a testset built from set B
        held_out = dataset.construct_testset(ratings_b)
        unseen_predictions = algo.test(held_out)
        print('Unbiased accuracy:')
        accuracy.rmse(unseen_predictions)
        accuracy.mae(unseen_predictions)
        accuracy.fcp(unseen_predictions)

        return algo