Beispiel #1
0
def svd_model(df):
    """
    Create an SVD model for predictions and cross-validation.

    Filters the input frame to Scottsdale rows, evaluates an SVD on a
    75/25 train/test split, and separately runs 5-fold cross-validation
    on a fresh SVD instance.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'user_id', 'business_id', 'average_stars' and
        'city' columns.

    Returns
    -------
    tuple
        (surprise ``Dataset``, hold-out RMSE, array of 5-fold CV RMSE
        scores).
    """
    # Import from the public surprise API instead of the private
    # ``surprise.model_selection.split`` submodule path.
    from surprise.model_selection import train_test_split

    # Restrict to a single city to keep the rating matrix manageable.
    data = df[['user_id', 'business_id',
               'average_stars']].loc[df.city == 'Scottsdale']

    reader = Reader()

    data = Dataset.load_from_df(data, reader)

    trainset, testset = train_test_split(data, test_size=0.25)

    algo = SVD()
    algo.fit(trainset)

    predictions = algo.test(testset)

    # RMSE on the 25% hold-out split (accuracy.rmse also prints it).
    acc = accuracy.rmse(predictions)

    # Independent 5-fold cross-validation on an untrained SVD.
    svd_cv = cross_validate(SVD(), data, cv=5)

    return data, acc, svd_cv['test_rmse']
Beispiel #2
0
def surprise_bench(df):
    """
    Benchmark SVD, NMF, NormalPredictor, and BaselineOnly recommenders.

    Each algorithm is scored with 7-fold cross-validation on the
    Scottsdale subset of ``df``; mean RMSE/MAE and fit/test times are
    collected into one row per algorithm.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'user_id', 'business_id', 'average_stars' and
        'city' columns.

    Returns
    -------
    pandas.DataFrame
        One row per algorithm, indexed by algorithm name and sorted by
        ascending 'test_rmse'.
    """
    from surprise import (SVD, NMF, NormalPredictor, BaselineOnly)

    from surprise import Dataset
    from surprise import Reader

    # Public API path instead of the private ``.validation`` submodule.
    from surprise.model_selection import cross_validate

    data = df[['user_id', 'business_id',
               'average_stars']].loc[df.city == 'Scottsdale']

    reader = Reader()

    data = Dataset.load_from_df(data, reader)
    benchmark = []

    # Iterate over all algorithms
    for algorithm in [
            SVD(n_factors=10),
            NMF(n_factors=10),
            NormalPredictor(),
            BaselineOnly()
    ]:
        # Perform cross validation
        results = cross_validate(algorithm,
                                 data,
                                 measures=['RMSE', 'MAE'],
                                 cv=7,
                                 verbose=False)

        # Mean of each fold metric, plus the algorithm's class name.
        # ``Series.append`` was removed in pandas 2.0 — use pd.concat,
        # and take the name from the type instead of parsing str(obj).
        tmp = pd.DataFrame.from_dict(results).mean(axis=0)
        tmp = pd.concat(
            [tmp,
             pd.Series([type(algorithm).__name__], index=['Algorithm'])])
        benchmark.append(tmp)

    return pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
        'test_rmse')
Beispiel #3
0
def NMF_filter(ratings, dims):
    """
    Sweep NMF latent-factor counts and plot 10-fold CV error curves.

    For each factor count in ``dims``, runs 10-fold cross-validation of
    an unbiased NMF model, records mean RMSE/MAE, plots both curves
    against ``dims``, and prints the best factor count for each metric.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Must contain 'userId', 'movieId', 'rating' columns with ratings
        on a 0.0-5.0 scale.
    dims : sequence of int
        Candidate numbers of latent factors to try.
    """
    reader = Reader(rating_scale=(0.0, 5.0))
    data = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']],
                                reader)
    RMSE = np.empty([len(dims)])
    MAE = np.empty([len(dims)])
    # Use None (not False) as the "no minimum yet" sentinel so that a
    # legitimate 0.0 score could never be mistaken for "unset".
    min_RMSE = None
    min_MAE = None
    fac_num_RMSE = 0
    fac_num_MAE = 0

    for k in range(len(dims)):
        nmf = NMF(n_factors=dims[k], biased=False)
        cv = cross_validate(algo=nmf,
                            data=data,
                            measures=['RMSE', 'MAE'],
                            cv=10,
                            verbose=True)
        RMSE[k] = np.mean(cv['test_rmse'])
        if min_RMSE is None or RMSE[k] < min_RMSE:
            min_RMSE = RMSE[k]
            fac_num_RMSE = dims[k]

        MAE[k] = np.mean(cv['test_mae'])
        if min_MAE is None or MAE[k] < min_MAE:
            min_MAE = MAE[k]
            fac_num_MAE = dims[k]

    plt.plot(dims, RMSE)
    plt.plot(dims, MAE)
    plt.legend(['RMSE', 'MAE'])
    plt.show()
    print('Finishing Plotting...')
    print('For RMSE:')
    print('\t---Optimal number of latent factors is ', fac_num_RMSE)
    # Fixed typo in output: "Minumun" -> "Minimum".
    print('\t---Minimum Average RMSE is ', min_RMSE)
    print('\nFor MAE:')
    print('\t---Optimal number of latent factors is ', fac_num_MAE)
    print('\t---Minimum Average MAE is ', min_MAE)
 def nmf(dataName, data, biased=True):
     """Cross-validate NMF over the module-level ``ks`` grid and plot errors.

     Results are written into the module-level ``mae`` and ``rmse``
     arrays, one entry per candidate factor count in ``ks``.
     """
     print('Start building NMF with ' + dataName + '!')
     for i, k in enumerate(ks):
         model = NMF(n_factors=k, biased=biased)
         cv_scores = cross_validate(model, data, cv=10)
         mae[i] = cv_scores['test_mae'].mean()
         rmse[i] = cv_scores['test_rmse'].mean()
         print('k = ' + str(k) + ' finished!')
     # Two stacked panels: MAE on top, RMSE below.
     panels = (
         (211, mae, 'mean absolute error',
          'Mean absolute error vs. k of ' + dataName),
         (212, rmse, 'root mean squared error',
          'Root mean squared error vs. k of ' + dataName),
     )
     plt.figure()
     for position, series, axis_label, panel_title in panels:
         plt.subplot(position)
         plt.plot(ks, series)
         plt.xlabel('k')
         plt.ylabel(axis_label)
         plt.title(panel_title)
     print('mae:')
     print(mae)
     print('rmse:')
     print(rmse)
     print('Finish building NMF with ' + dataName + '!')
    def estimate(self, u, i):
        """
        Return the mean-of-means rating estimate for user ``u``, item ``i``.

        Averages the global mean with the per-user and per-item means;
        when the user (or item) is unseen, its term is replaced by a
        two-way average of the remaining means.
        """
        known_user = u in self.user_means
        known_item = i in self.item_means

        if known_user and known_item:
            return np.mean(
                [self.global_mean, self.user_means[u], self.item_means[i]])

        if known_user:
            # Unknown item: fall back to global + user means.
            return np.mean([self.global_mean, self.user_means[u]])

        # Unknown user: fall back to global + item means.
        return np.mean([self.global_mean, self.item_means[i]])


# if __name__ == "__main__":
# NOTE(review): the main guard above is commented out, so everything
# below runs at import time — confirm that is intentional.

# Load ratings; the timestamp column is not used by the recommenders.
df = pd.read_csv('ratings.csv')
df.drop('timestamp', axis=1, inplace=True)
# Wrap the (user, item, rating) frame in surprise's Dataset format.
reader = Reader()
data = Dataset.load_from_df(df, reader)
print("\nGlobal Mean...")
algo = GlobalMean()
cross_validate(algo, data)

print("\nMeanOfMeans...")
algo = MeanofMeans()
cross_validate(algo, data)
# Persist the MeanofMeans instance (in whatever state cross_validate
# left it) for later reuse.
with open('model.sav', 'wb') as file:
    pickle.dump(algo, file)
Beispiel #6
0
if __name__ == "__main__":
    # Read data
    df = pd.read_csv(os.path.join(DATA_DIRECTORY, 'ratings.csv'))

    # Drop unneeded column 'timestamp'
    df.drop('timestamp', axis=1, inplace=True)

    # Load the data into the surprise format
    reader = Reader()
    data = Dataset.load_from_df(df, reader=reader)

    # Train ALS model
    print('Using ALS')
    # Baseline hyper-parameters: ALS method, 5 epochs, user/item
    # regularization strengths.
    bsl_options = {'method': 'als', 'n_epochs': 5, 'reg_u': 12, 'reg_i': 5}
    trainset, testset = train_test_split(data, test_size=0.25)
    algo = BaselineOnly(bsl_options=bsl_options)
    predictions = algo.fit(trainset).test(testset)

    # Get the RMSE of our predictions
    rmse = accuracy.rmse(predictions)

    # Get the cross-validated RMSE of our predictions
    # NOTE(review): cross_validate refits ``algo`` per fold, so the fit
    # above is not what is being scored here.
    cv_results = cross_validate(algo, data)
    cv_rmse = cv_results['test_rmse'].mean()
    print(f'CV RMSE: {cv_rmse}')

    # Get true values and predicted values for our test set
    # NOTE(review): y_true / y_pred are unused in this visible span —
    # presumably consumed later in the file.
    y_true = [x.r_ui for x in predictions]
    y_pred = [x.est for x in predictions]
# In order to fit surprise
file_path = os.path.expanduser('ratings.csv')
reader = Reader(line_format='user item rating',
                sep=',',
                skip_lines=1,
                rating_scale=(0.5, 5))
data = Dataset.load_from_file(file_path, reader=reader)

# Row 0 holds RMSE, row 1 MAE; one column per candidate k (2..100 step 2).
acc_cv = np.zeros((2, 50))
sim_options = {'name': 'pearson'}
# enumerate() replaces the manual ``i = i + 1`` counter; the stray
# top-level ``pass`` after the loop was removed.
for i, k in enumerate(range(2, 101, 2)):
    algo = KNNWithMeans(k=k, sim_options=sim_options)
    cv1 = cross_validate(algo,
                         data,
                         measures=['RMSE', 'MAE'],
                         cv=10,
                         verbose=False)
    acc_cv[0, i] = np.mean(cv1['test_rmse'])
    acc_cv[1, i] = np.mean(cv1['test_mae'])
    print('test_rmse = %f, test_mae = %f' % (acc_cv[0, i], acc_cv[1, i]))

# Same k grid as the loop above, for the x-axis of the plots.
ks = np.arange(2, 101, 2)

plt.xlabel('k')
plt.ylabel('Error value')
plt.title('Test RMSE and MAE vs k in KNN with 10 Validation')
plt.plot(ks, acc_cv[0, :])
plt.plot(ks, acc_cv[1, :])
plt.legend(['RMSE', 'MAE'], loc='upper right')
# Keep only businesses categorized as both restaurants and food.
restaurants_and_food = restoran[mask_restaurants & mask_food]
# number of businesses that have food and restaurant in their category
# keep=False drops ALL rows whose name is duplicated, not just the extras.
restaurants_and_food.drop_duplicates(subset='name', keep=False, inplace=True)

review = reviews[['review_id', 'business_id', 'user_id']]
combined_business_data = pd.merge(restaurants_and_food,
                                  review,
                                  on='business_id')
print(combined_business_data.shape)
print(combined_business_data[['name', 'categories', 'user_id']].head(50))

from surprise import Reader, Dataset, SVD
# Public API path instead of the private ``.validation`` submodule.
from surprise.model_selection import cross_validate
reader = Reader()

data = Dataset.load_from_df(
    combined_business_data[['user_id', 'business_id', 'stars']], reader)
svd = SVD()

# Run 5-fold cross-validation and print results
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Refit on the full data set so predictions use every rating.
trainset = data.build_full_trainset()
svd.fit(trainset)

# Score every restaurant for one hard-coded example user and show the top 10.
r = restaurants_and_food.copy()
r['Estimate_Score'] = r['business_id'].apply(
    lambda x: svd.predict('xIm6CP6pAqS3XQ7QF3Z89g', x).est)

r = r.sort_values(by=['Estimate_Score'], ascending=False)
print(r[['name', 'categories', 'stars', 'Estimate_Score']].head(10))
Beispiel #9
0
# Fit on the training split and evaluate on the held-out test split.
svd.fit(x_train)

prediction = svd.test(x_test)
accuracy.rmse(predictions=prediction)
top_n = get_top_n(prediction, n=10)

# Print the recommended items for each user
for uid, user_ratings in top_n.items():
    print(uid, [iid for (iid, _) in user_ratings])

# Cross-validation
# Public API path instead of the private ``.validation`` submodule.
from surprise.model_selection import cross_validate

algo = SVD(biased=False)
res = cross_validate(algo, data, measures=['rmse', 'mae'],
                     cv=3, return_train_measures=False,
                     verbose=False)
print(res)

# Grid search with GridSearchCV over SVD hyper-parameters.
# Public API path instead of the private ``.search`` submodule.
from surprise.model_selection import GridSearchCV

param_grid = {'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
              'reg_all': [0.4, 0.6], 'biased': [False]}


gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)
gs.fit(data)
# best RMSE score
        if u not in self.user_means:
            return (np.mean([self.global_mean, self.item_means[i]]))

        if i not in self.item_means:
            return (np.mean([self.global_mean, self.user_means[u]]))

        return (np.mean(
            [self.global_mean, self.user_means[u], self.item_means[i]]))


if __name__ == "__main__":
    # Compare two baseline recommenders on the built-in MovieLens 100k set.
    data = Dataset.load_builtin('ml-100k')

    print(data)
    print("\nGlobal Mean...")
    algo = GlobalMean()
    # Mean RMSE across the cross-validation folds.
    print(np.mean(cross_validate(algo, data)['test_rmse']))

    print("\nMeanOfMeans...")
    algo = MeanofMeans()
    print(np.mean(cross_validate(algo, data)['test_rmse']))
    # (Removed a commented-out duplicate of the two evaluations above.)

    # NOTE(review): ``df`` is not defined anywhere in this scope —
    # presumably a module-level frame defined elsewhere; verify before
    # running, otherwise this line raises NameError.
    print(df.head())
Beispiel #11
0
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        user_cut = [i for i in user_ratings[:n] if i[1] > threshold]
        top_n[uid] = user_cut

    return top_n


# Public API path instead of the private ``.validation`` submodule.
from surprise.model_selection import cross_validate

# NOTE(review): ``algo`` here must come from earlier in the file — it is
# only (re)assigned below; confirm the intended evaluation order.
score = cross_validate(algo,
                       data,
                       measures=['RMSE', 'MAE'],
                       cv=3,
                       verbose=False)
scoredf = pd.DataFrame(score)
mean_score = scoredf.mean()

# Refit a small SVD on the full training set.
trainset = data.build_full_trainset()
algo = SVD(n_factors=5, n_epochs=5, lr_all=0.005, reg_all=0.002)
algo.fit(trainset)
# Notebook-style bare expression inspecting the learned user factors;
# it has no effect when run as a script.
algo.pu

# Then predict ratings for all pairs (u, i) that are NOT in the training set.
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

top_n_svd = get_top_n(predictions, n=3, threshold=2.5)
Beispiel #12
0
# In[15]:

# Content-based recommendation demo (``recommend`` is defined elsewhere).
recommend('Iron Man')

# In[16]:

reader = Reader()
df = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)
kf = KFold(n_splits=5)
# NOTE(review): the generator returned by kf.split is discarded, and
# cross_validate below does its own splitting — confirm this call is needed.
kf.split(df)

# In[ ]:

svd = SVD()
cross_validate(svd, df, measures=['RMSE', 'MAE'])

# Refit on the full data set for making predictions.
trainset = df.build_full_trainset()
svd.fit(trainset)

# In[ ]:

# Notebook-style bare expression inspecting one user's ratings.
ratings[ratings['userId'] == 10]

# In[ ]:

# smaller link file reload
links_df = pd.read_csv('Dataset/links_small.csv')
# NOTE(review): casting to int64 raises if 'tmdbId' contains NaN —
# confirm the small links file has no missing ids.
col = np.array(links_df['tmdbId'], np.int64)
links_df['tmdbId'] = col
        user_means, item_means = {}, {}
        for user in np.unique(users):
            user_means[user] = ratings[users == user].mean()
        for item in np.unique(items):
            item_means[item] = ratings[items == item].mean()
        self.global_mean = ratings.mean()
        self.user_means = user_means
        self.item_means = item_means

    def estimate(self, u, i):
        """Mean-of-means estimate: average of global, user, and item means.

        Unseen users (or items) fall back to a two-way average of the
        remaining means.
        """
        has_user = u in self.user_means
        has_item = i in self.item_means
        if has_user and has_item:
            return np.mean(
                [self.global_mean, self.user_means[u], self.item_means[i]])
        if has_user:
            # Unknown item: average global and user means only.
            return np.mean([self.global_mean, self.user_means[u]])
        # Unknown user: average global and item means only.
        return np.mean([self.global_mean, self.item_means[i]])


if __name__ == "__main__":
    # Compare two baseline recommenders on the built-in MovieLens 100k set.
    data = Dataset.load_builtin('ml-100k')

    print("\nGlobal Mean...")
    algo = GlobalMean()
    global_cv = cross_validate(algo, data)
    # Report the mean RMSE across the cross-validation folds.
    print('RMSE: ', np.mean(global_cv['test_rmse']))

    print("\nMeanOfMeans...")
    algo = MeanofMeans()
    mom_cv = cross_validate(algo, data)
    print('RMSE: ', np.mean(mom_cv['test_rmse']))