Example #1
import pandas as pd
from surprise import SVD, Dataset, Reader


def svd_ratings_predicate(observed_ratings_df,
                          truth_ratings_df,
                          fold='0',
                          phase='eval'):
    """
    svd_ratings predicates
    """
    print("SVD predicates")
    svd_model = SVD()
    reader = Reader(rating_scale=(0.2, 1))
    train_df = observed_ratings_df.reset_index()[['userId', 'movieId', 'rating']]
    train_dataset = Dataset.load_from_df(df=train_df, reader=reader)
    svd_model.fit(train_dataset.build_full_trainset())

    # make predictions
    predictions = pd.DataFrame(index=truth_ratings_df.index,
                               columns=['rating'])

    # truth_ratings_df is indexed by (userId, movieId)
    for (uid, iid), _ in truth_ratings_df.loc[:, ['rating']].iterrows():
        predictions.loc[(uid, iid), 'rating'] = svd_model.predict(uid, iid).est

    write(predictions, 'svd_rating_obs', fold, phase)
Example #2
def train_trim_svd(data, R):
    kfold = KFold(n_splits=10)
    rmse_list = [[], [], []]
    for k in range(2, 52, 2):
        print("using k = %d" % k)
        p_rmse = []
        u_rmse = []
        hv_rmse = []
        svd = SVD(n_factors=k)
        for trainset, testset in kfold.split(data):
            svd.fit(trainset)
            (p_testset, u_testset, hv_testset) = trim(testset, R)

            p_pred = svd.test(p_testset)
            u_pred = svd.test(u_testset)
            hv_pred = svd.test(hv_testset)

            p_rmse.append(accuracy.rmse(p_pred))
            u_rmse.append(accuracy.rmse(u_pred))
            hv_rmse.append(accuracy.rmse(hv_pred))
        rmse_list[0].append(np.mean(p_rmse))
        rmse_list[1].append(np.mean(u_rmse))
        rmse_list[2].append(np.mean(hv_rmse))
    print("SVD with trim is finished!!")
    return rmse_list
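
The trim helper called above is not defined in this example. A minimal sketch of what it might look like, assuming R is an iterable of (user, item, rating) tuples over the full dataset; the split thresholds (popular: more than 2 ratings; high variance: variance of at least 2 over at least 5 ratings) are assumptions, not taken from the source:

from collections import defaultdict
import numpy as np

def trim(testset, R):
    # collect every rating each movie received in the full dataset
    ratings_per_item = defaultdict(list)
    for _, iid, r in R:
        ratings_per_item[iid].append(r)

    popular, unpopular, high_var = [], [], []
    for uid, iid, r in testset:
        item_ratings = ratings_per_item.get(iid, [])
        # popular vs. unpopular split by rating count (assumed threshold)
        if len(item_ratings) > 2:
            popular.append((uid, iid, r))
        else:
            unpopular.append((uid, iid, r))
        # high-variance movies (assumed thresholds)
        if len(item_ratings) >= 5 and np.var(item_ratings) >= 2.0:
            high_var.append((uid, iid, r))
    return popular, unpopular, high_var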
Example #3
def mfb_compute_high_var_trim_rmse(k):
    # R, frequency, variance and high_variance_trimming are globals
    # defined elsewhere in the source project
    mfb = SVD(n_factors=k, random_state=42)
    rmse = []
    for trainset, testset in KFold(n_splits=10, random_state=42).split(R):
        mfb.fit(trainset)
        testset_trimmed = high_variance_trimming(testset, frequency, variance)
        pred = mfb.test(testset_trimmed)
        rmse.append(accuracy.rmse(pred, verbose=False))
    print('k: %s | RMSE: %f' % (k, np.mean(rmse)))
    return np.mean(rmse)
Example #4
def mfb_compute_prec_rec(t):
    # R, mfb_best_k, threshold and the trim/metric helpers are globals
    # defined elsewhere in the source project
    precision, recall = [], []
    for trainset, testset in KFold(n_splits=10, random_state=42).split(R):
        mfb = SVD(n_factors=mfb_best_k, random_state=42)
        mfb.fit(trainset)
        trimmed_testset = trim_unpopular_user(testset, t, threshold)
        pred = mfb.test(trimmed_testset)

        precision_dict, recall_dict = calculate_precision_recall(
            pred, t, threshold)
        precision.append(np.mean(list(precision_dict.values())))
        recall.append(np.mean(list(recall_dict.values())))
    return np.mean(precision), np.mean(recall)
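
calculate_precision_recall is likewise not shown. A sketch in the spirit of the precision/recall-at-k recipe from the Surprise FAQ, assuming t is the ranking cutoff and threshold marks a rating as relevant:

from collections import defaultdict

def calculate_precision_recall(predictions, t, threshold):
    # group (estimated, true) rating pairs by user
    user_est_true = defaultdict(list)
    for p in predictions:
        user_est_true[p.uid].append((p.est, p.r_ui))

    precisions, recalls = {}, {}
    for uid, ratings in user_est_true.items():
        # rank this user's predictions by estimated rating
        ratings.sort(key=lambda x: x[0], reverse=True)
        n_rel = sum(true_r >= threshold for _, true_r in ratings)
        n_rec_t = sum(est >= threshold for est, _ in ratings[:t])
        n_rel_and_rec_t = sum(est >= threshold and true_r >= threshold
                              for est, true_r in ratings[:t])
        precisions[uid] = n_rel_and_rec_t / n_rec_t if n_rec_t else 0
        recalls[uid] = n_rel_and_rec_t / n_rel if n_rel else 0
    return precisions, recalls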
Example #5
def MF_trim_filter(ratings, dims, func, mv_dict):
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                                reader)
    RMSE = np.empty([len(dims)])
    MAE = np.empty([len(dims)])
    min_RMSE = None
    min_MAE = None
    fac_num_RMSE = 0
    fac_num_MAE = 0
    kf = KFold(n_splits=10, random_state=42)

    for k in range(len(dims)):
        svd = SVD(n_factors=dims[k], random_state=42)
        test_rmse = np.array([])
        test_mae = np.array([])
        for trainset, testset in kf.split(data):
            svd.fit(trainset)
            full_data = trainset.build_testset() + testset  # unused below
            func(mv_dict, testset)  # assumed to trim testset in place
            pred = svd.test(testset)
            test_rmse = np.append(test_rmse, accuracy.rmse(pred,
                                                           verbose=False))
            test_mae = np.append(test_mae, accuracy.mae(pred, verbose=False))
        RMSE[k] = np.mean(test_rmse)
        if min_RMSE is None or RMSE[k] < min_RMSE:
            min_RMSE = RMSE[k]
            fac_num_RMSE = dims[k]

        MAE[k] = np.mean(test_mae)
        if min_MAE is None or MAE[k] < min_MAE:
            min_MAE = MAE[k]
            fac_num_MAE = dims[k]
        print('For k = %i :' % dims[k])
        print('RMSE: ', RMSE[k])
        print('MAE: ', MAE[k])

    plt.plot(dims, RMSE)
    plt.plot(dims, MAE)
    plt.legend(['RMSE', 'MAE'])
    plt.show()
    print('Finished plotting...')
    print('For RMSE:')
    print('\t---Optimal number of latent factors is ', fac_num_RMSE)
    print('\t---Minimum average RMSE is ', min_RMSE)
    print('\nFor MAE:')
    print('\t---Optimal number of latent factors is ', fac_num_MAE)
    print('\t---Minimum average MAE is ', min_MAE)
Example #6
def MF_bin_pre(ratings, ts, nmf_fac, thrd):
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                                reader)
    trainset, testset = train_test_split(data, test_size=ts)
    algo = SVD(n_factors=nmf_fac, random_state=42)
    algo.fit(trainset)
    pre = algo.test(testset)

    # split predictions into ground-truth and estimated ratings
    true_rating = np.array([p.r_ui for p in pre])
    pred_rating = np.array([p.est for p in pre])

    # binarize the ground truth against the threshold
    bi_rating = (true_rating >= thrd).astype(float)

    return bi_rating, pred_rating
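
MF_bin_pre returns binarized ground truth plus raw predicted scores, which is exactly the input an ROC curve needs. A possible downstream use with scikit-learn (the argument values are illustrative, not from the source):

from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

bi_rating, pred_rating = MF_bin_pre(ratings, ts=0.1, nmf_fac=16, thrd=3.0)
fpr, tpr, _ = roc_curve(bi_rating, pred_rating)
plt.plot(fpr, tpr, label='AUC = %.4f' % auc(fpr, tpr))
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()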
Example #7
def Q26To28(qNum, n_splits=10):
    data = load_data()
    kf = KFold(n_splits=n_splits)

    trimFun = {26: popularTrim, 27: unpopularTrim, 28: highVarTrim}
    RMSE = []
    for k in range(2, 52, 2):
        MF_svd = SVD(n_factors=k)
        subRMSE = []
        for trainSet, testSet in kf.split(data):
            subsubRMSE = 0
            MF_svd.fit(trainSet)
            testSet = trimFun[qNum](testSet)
            nTest = len(testSet)
            print("test set size after trimming: %d", nTest)
            for (r, c, rating) in testSet:
                predictedRating = MF_svd.predict(str(r), str(c))
                subsubRMSE += (pow(rating - predictedRating.est, 2))
            # calculate RMSE of this train-test split
            subRMSE.append(np.sqrt(subsubRMSE / nTest))
        # average RMSE over all train-test splits for this k
        RMSE.append(np.mean(subRMSE))

    return RMSE
Example #8
from surprise import Dataset, Reader, KNNBasic
from surprise.model_selection import cross_validate
from surprise.prediction_algorithms.matrix_factorization import SVD

from DataProcessing.dataprocessing import ratings

print("Training SVD Algorithm")
reader = Reader()
data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

svd = SVD()
# print(cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True))
trainset = data.build_full_trainset()
svd.fit(trainset)


def recColl(userid, movieid, gt=None):
    return svd.predict(userid, movieid, gt)


# print(recColl(1,862,3))
Example #9
def load_dataset(rating_file, fraction, random_state):
    # signature reconstructed from the call below; the CSV read is an assumption
    rating_dataset = pd.read_csv(rating_file)
    return rating_dataset.sample(frac=fraction, random_state=random_state)


rating_dataset = load_dataset(RATING_FILE, FRACTION, constants.RANDOM_SEED)
# line_format and sep apply only when loading from a file; load_from_df
# just needs the rating scale
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(rating_dataset, reader)

trainset = data.build_full_trainset()

start_time = time.time()
algo = SVD(n_factors=50,
           n_epochs=40,
           lr_all=0.005,
           reg_all=0.1,
           random_state=constants.RANDOM_SEED)
algo.fit(trainset)
end_time = time.time()

# `testset` is assumed to come from a held-out split built elsewhere in the
# source file (e.g. surprise.model_selection.train_test_split)
predictions = algo.test(testset)

accuracy.rmse(predictions)
results_df = pd.DataFrame.from_dict(gs.cv_results)
results_df.to_csv(constants.DATASET1_RESULTS_FOLDER +
                  '{}_{}.csv'.format(algo_name, rating_file_name),
                  index=False)

print('Best score: {}'.format(gs.best_score['rmse']))
print('Best params: {}'.format(gs.best_params['rmse']))
print('Time: {} sec'.format(round(end_time - start_time, 4)))
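
The gs object referenced above is not defined in this fragment; it is presumably a fitted surprise GridSearchCV. A minimal sketch of how such an object is typically set up (the parameter grid is illustrative):

from surprise.model_selection import GridSearchCV

param_grid = {'n_factors': [20, 50], 'n_epochs': [20, 40],
              'lr_all': [0.002, 0.005], 'reg_all': [0.02, 0.1]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=5)
gs.fit(data)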

print()
plt.show()
fig33.savefig(path + 'fig/Part_8_nnmf_preVSrec.png')

#define model for training
k_min_rmse = 16
mf = SVD(n_factors=k_min_rmse, random_state=1, biased=True)

#train, test and rank
top_t_list = range(1, 26)
pre_list_mf = []
rec_list_mf = []
for top_t in top_t_list:
    pre = 0
    rec = 0
    for trainset, testset in kf.split(data_raw):
        mf.fit(trainset)
        prediction = mf.test(testset)
        G = create_dict(testset)
        G_s = create_dict(prediction, if_pred=1)
        R, R_s = threshold_rank_filter(G, G_s, thre=3, top_t=top_t)
        #precision and recall for each fold
        pre_fold = 0
        rec_fold = 0
        for key in R.keys():
            pre_temp, rec_temp = precision_recall(R[key], R_s[key])
            pre_fold += pre_temp
            rec_fold += rec_temp
        pre += pre_fold / len(R)
        rec += rec_fold / len(R)

    pre_list_mf.append(pre / num_fold)
    rec_list_mf.append(rec / num_fold)
plot_curve(ks, mfb_rmse_high_var_trim, 'k', 'Root Mean Squared Error',
           'RMSE after High Variance Movie Trimming')

# In[59]:

print("Minimum average RMSE after high variance movie trimming: %.4f" %
      np.min(mfb_rmse_high_var_trim))

# **Question 29:** Plot the ROC curves for the MF with bias collaborative filter designed in Question 24 for threshold values [2.5, 3, 3.5, 4]. For the ROC plotting use the optimal number of latent factors found in Question 25. For each of the plots, also report the area under the curve (AUC) value.

# In[60]:

mfb_best_k = ks[np.argmin(mfb_rmse)]
trainset, testset = train_test_split(R, test_size=0.1, random_state=42)
mfb_best = SVD(n_factors=mfb_best_k, random_state=42)
mfb_best.fit(trainset)
mfb_best_pred = mfb_best.test(testset)

plot_roc_curves(testset, mfb_best_pred, 'MF with bias')

# # PART 3 - Naive Collaborative Filtering

# **Question 30:** Design a naive collaborative filter to predict the ratings of the movies in the MovieLens dataset and evaluate its performance using 10-fold cross validation. Compute the average RMSE by averaging the RMSE across all 10 folds. Report the average RMSE.
#
# Note that in this case, when performing the cross-validation, there is no need to calculate $\mu_i$’s for the training folds each time. You are only asked to use a single set of $\mu_i$’s calculated on the entire dataset and validate on 10 validation folds.

# In[61]:

from surprise import AlgoBase
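
A minimal sketch of the naive filter Question 30 describes: a custom AlgoBase subclass that always predicts mu_i, the mean rating of user i, with the means computed once on the entire dataset. Class and variable names are illustrative, not from the source.

from surprise import PredictionImpossible
from surprise.model_selection import cross_validate

class NaiveFilter(AlgoBase):
    def __init__(self, user_means):
        AlgoBase.__init__(self)
        self.user_means = user_means  # raw user id -> mean rating mu_i

    def fit(self, trainset):
        # nothing to learn: the means are precomputed on the full dataset
        AlgoBase.fit(self, trainset)
        return self

    def estimate(self, u, i):
        if not self.trainset.knows_user(u):
            raise PredictionImpossible('User is unknown.')
        return self.user_means[self.trainset.to_raw_uid(u)]

# e.g., with the ratings DataFrame and Dataset used earlier:
# user_means = ratings.groupby('userId')['rating'].mean().to_dict()
# cross_validate(NaiveFilter(user_means), data, measures=['RMSE'], cv=10)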