Ejemplo n.º 1
0
def main(args):
    '''
    Cross-validate one surprise algorithm on the ratings and pickle the scores.

    ---Parameters---
    args: parsed command-line options. The attributes read here are
        exp_name, algorithm, sim_name, item_based, num_rows and cv_folds.

    ---Returns---
    None. The dictionary returned by cross_validate is pickled to
    '<exp_name>_<algorithm>_<sim_name>_<user_based|item_based>_<num_rows>.pkl'
    in the current directory.

    ---Exits---
    Calls sys.exit(1) when args.algorithm is not a known choice or when the
    output file already exists.
    '''
    sim_options = {
        'name': args.sim_name,
        'user_based': not args.item_based,
    }

    # Factories are lazy so that the algorithm name can be validated before
    # any expensive work (reading the parquet file) is done.
    algorithm_factories = {
        'knn': lambda: surprise.KNNBasic(sim_options=sim_options),
        'baseline': lambda: surprise.BaselineOnly(),
        'normal': lambda: surprise.NormalPredictor(),
        'knn_zscore': lambda: surprise.KNNWithZScore(sim_options=sim_options),
        'svd': lambda: surprise.SVD(),
        'nmf': lambda: surprise.NMF(),
    }
    make_algo = algorithm_factories.get(args.algorithm)
    if make_algo is None:
        # Previously this only printed and then failed later with an
        # unbound `algo`; stop here with a clear status instead.
        print(f'Algorithm {args.algorithm} is not a valid choice.')
        sys.exit(1)

    user_item_based = 'item_based' if args.item_based else 'user_based'
    filename = '_'.join([
        args.exp_name, args.algorithm, args.sim_name, user_item_based,
        str(args.num_rows)
    ]) + '.pkl'

    # Refuse to overwrite the results of an earlier run.
    output_file = Path(filename)
    if output_file.exists():
        print(f'ERROR! Output file {output_file} already exists. Exiting!')
        sys.exit(1)

    print(f'Saving scores in {output_file}\n')

    reader = surprise.Reader(rating_scale=(1, 5))
    df = pq.read_table('all_ratings_with_indices.parquet',
                       columns=['user_idx', 'movie_idx',
                                'rating']).to_pandas()
    # Downcast the columns to small unsigned integer types.
    df.user_idx = df.user_idx.astype(np.uint32)
    df.movie_idx = df.movie_idx.astype(np.uint16)
    df.rating = df.rating.astype(np.uint8)
    print(df.dtypes)
    data = surprise.Dataset.load_from_df(df[:args.num_rows], reader=reader)
    del df  # the full frame is no longer needed once the dataset is built

    algo = make_algo()

    scores = surprise.model_selection.cross_validate(algo,
                                                     data,
                                                     cv=args.cv_folds,
                                                     verbose=True,
                                                     n_jobs=-1)

    # Use a context manager so the file handle is flushed and closed.
    with open(output_file, 'wb') as handle:
        pickle.dump(scores, handle)
Ejemplo n.º 2
0
def algo_tester(data_object):
    '''
    Benchmark a fixed set of surprise algorithms with 3-fold cross-validation.

    ---Parameters---
    data_object(variable) created from the read_data_surprise function;
        it is passed unchanged to cross_validate.

    ---Returns---
    returns a dataframe indexed by algorithm class name, holding the
    per-algorithm mean of every entry in the cross_validate result
    (measured with RMSE only), sorted by ascending 'test_rmse'.
    '''
    algos = [
        sp.SVDpp(),
        sp.SVD(),
        sp.SlopeOne(),
        sp.NMF(),
        sp.NormalPredictor(),
        sp.KNNBaseline(),
        sp.KNNBasic(),
        sp.KNNWithMeans(),
        sp.KNNWithZScore(),
        sp.BaselineOnly(),
        sp.CoClustering()
    ]

    benchmark = []
    for algorithm in algos:
        # Perform cross validation
        results = cross_validate(algorithm,
                                 data_object,
                                 measures=['RMSE'],
                                 cv=3,
                                 verbose=False)

        # Average each result column, then tag the row with the class name.
        # pd.concat replaces Series.append, which was removed in pandas 2.0.
        averages = pd.DataFrame.from_dict(results).mean(axis=0)
        label = pd.Series([type(algorithm).__name__], index=['Algorithm'])
        benchmark.append(pd.concat([averages, label]))

    return pd.DataFrame(benchmark).set_index('Algorithm').sort_values(
        'test_rmse')
          epochs=2,
          validation_split=0.1,
          shuffle=True)

# Score the hybrid Keras model on the held-out frame.
y_pred = model.predict([df_hybrid_test['User'], df_hybrid_test['Movie'], test_tfidf])
y_true = df_hybrid_test['Rating'].values

rmse = np.sqrt(mean_squared_error(y_pred=y_pred, y_true=y_true))
print('\n\nTesting Result With Keras Hybrid Deep Learning: {:.4f} RMSE'.format(rmse))

# Load dataset into surprise specific data-structure
data = sp.Dataset.load_from_df(df_filterd[['User', 'Movie', 'Rating']].sample(20000), sp.Reader())

benchmark = []
# Iterate over all algorithms
for algorithm in [
        sp.SVD(),
        sp.SVDpp(),
        sp.SlopeOne(),
        sp.NMF(),
        sp.NormalPredictor(),
        sp.KNNBaseline(),
        sp.KNNBasic(),
        sp.KNNWithMeans(),
        sp.KNNWithZScore(),
        sp.BaselineOnly(),
        sp.CoClustering(),
]:
    # Perform cross validation
    results = cross_validate(algorithm, data, measures=['RMSE', 'MAE'], cv=3, verbose=False)

    # Average the results and append the algorithm's class name.
    # pd.concat replaces Series.append, which was removed in pandas 2.0.
    tmp = pd.DataFrame.from_dict(results).mean(axis=0)
    tmp = pd.concat([tmp, pd.Series([type(algorithm).__name__], index=['Algorithm'])])

    # Store data
    benchmark.append(tmp)

# Store results, worst (highest) test RMSE first
surprise_results = pd.DataFrame(benchmark).set_index('Algorithm').sort_values('test_rmse', ascending=False)

# Get data (note: rebinds `data`, which held the surprise dataset above)
data = surprise_results[['test_rmse', 'test_mae']]
Ejemplo n.º 4
0
# Read the ratings file (semicolon-separated) and keep only the three
# columns handed to surprise below.
rating_df = pd.read_csv(
    '/Users/mac/Desktop/推荐系统/RecommendedSystemCallPackage/data_set/MovieLens/ratings.csv',
    sep=';')
rating_df = rating_df[['UserID', 'MovieID', 'Rating']]
reader = surprise.Reader(rating_scale=(1, 5))
rating_data = surprise.Dataset.load_from_df(rating_df, reader=reader)

# Each algorithm below is run through the same 5-fold cross-validation,
# measuring RMSE and MAE, and its result is printed under a banner line.
svd = surprise.SVD()
svd_temp = surprise.model_selection.cross_validate(svd,
                                                   rating_data,
                                                   measures=['RMSE', 'MAE'],
                                                   cv=5,
                                                   verbose=True)
print('SVD--------------')
print(svd_temp)
normalPredictor = surprise.NormalPredictor()
normalPredictor_temp = surprise.model_selection.cross_validate(
    normalPredictor, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('normalPredictor--------------')
print(normalPredictor_temp)
baselineOnly = surprise.BaselineOnly()
baselineOnly_temp = surprise.model_selection.cross_validate(
    baselineOnly, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('baselineOnly-----------------')
print(baselineOnly_temp)
knnBasic = surprise.KNNBasic()
knnBasic_temp = surprise.model_selection.cross_validate(
    knnBasic, rating_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
print('knnBasic-----------------')
print(knnBasic_temp)
# Instantiated here; its evaluation is not part of this excerpt.
knnWithMeans = surprise.KNNWithMeans()
Ejemplo n.º 5
0
# Hold out 10% of the data for testing, with a fixed seed.
train, test = train_test_split(data, random_state=123, test_size=0.1)
#%% Train the model (no hyperparameter tuning)
algo = SVDpp()  # instantiate the model
# NOTE(review): this sets an attribute on the instance after construction;
# whether SVDpp consults `biased` is not visible here -- confirm.
algo.biased = False

algo.fit(train)

predictions = algo.test(test)
accuracy.mae(predictions)
# Single prediction for one hard-coded (user id, item id) pair.
a = algo.predict('15cbc496d67626ad90514b4243e7c045', '2204590')
print(a)
dump.dump(file_name='SVDmodel.pkl', algo=algo)
#%%
# NOTE(review): loads 'best_model.pkl', not the 'SVDmodel.pkl' written above,
# and rebinds `algo`; element [1] is presumably the algorithm -- confirm.
algo = dump.load('best_model.pkl')[1]
#%% Random-guess model (for comparison)
algocompare = surprise.NormalPredictor()
algocompare.fit(train)
preCompare = algocompare.test(test)
accuracy.mae(preCompare)

#%% Compute precision and recall
## code from scikit-surprise documentation FAQs
from collections import defaultdict


def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each user.'''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
# Distinct item / user counts in the training frame.
n_items_train = len(np.unique(train['ISBN']))
n_users_train = len(np.unique(train['User-ID']))

reader = surprise.Reader(rating_scale=(1, 10))
data = surprise.Dataset.load_from_df(train[['User-ID', 'ISBN', 'Book-Rating']],
                                     reader)

test = pd.read_csv('test.csv')
n_items_test = len(np.unique(test['ISBN']))
# Count users from the test frame (this previously read `train` by mistake).
n_users_test = len(np.unique(test['User-ID']))
# Test ratings as (user, item, rating) tuples.
t = [tuple(x) for x in test[['User-ID', 'ISBN', 'Book-Rating']].values]

#
# Training
#
#algo = surprise.BaselineOnly()
algo = surprise.NormalPredictor()
sim_options = {'name': 'pearson', 'user_based': False}
algo_knn = surprise.KNNBasic(k=5, sim_options=sim_options)
#algo_svd = surprise.SVD(n_factors = 5, lr_all= 0.01, reg_all =1.3)#n_factors = , lr_all =, reg_all =
algo_svd = surprise.SVD(n_factors=10, lr_all=0.001, reg_all=1)

#
# Baseline
#
# Function-call form of print works on both Python 2 and Python 3.
print("\n Baseline\n")
# retrain on the whole train set
trainset = data.build_full_trainset()
# `fit` is the current surprise API; the older `train` method was removed.
algo.fit(trainset)

# Compute biased accuracy on train set
predictions = algo.test(trainset.build_testset())
precision, recall = precision_recall_at_k(predictions, k=10, threshold=7)
Ejemplo n.º 7
0
data = sp.Dataset.load_from_file(file_name, reader=reader)
print("Done.")

# Build a 5-fold splitter with a fixed seed.
print("Performing splits...")
kf = sp.model_selection.KFold(n_splits=5, random_state=0)
print("Done.")

###
### PART 1.1
###
'''
application of all algorithms for recommendation made available by 
“Surprise” libraries, according to their default configuration.
'''
algorithms = [
    sp.NormalPredictor(),
    sp.BaselineOnly(),
    sp.KNNBasic(),
    sp.KNNWithMeans(),
    sp.KNNWithZScore(),
    sp.KNNBaseline(),
    sp.SVD(),
    sp.SVDpp(),
    sp.NMF(),
    sp.SlopeOne(),
    sp.CoClustering(),
]
# Cross-validate each algorithm on the shared folds and report wall time.
for elem in algorithms:
    start_time = time.time()
    algo = elem
    sp.model_selection.cross_validate(
        algo,
        data,
        measures=['RMSE'],
        cv=kf,
        n_jobs=2,
        verbose=True,
    )
    print("--- %s seconds ---" % (time.time() - start_time))
    print()

###
### PART 1.2
###
'''
Improvement of the quality of both KNNBaseline and SVD methods, 
Ejemplo n.º 8
0
                            'NormalPredictor', 'BaselineOnly', 'KNNBasic',
                            'KNNWithMeans', 'KNNWithZScore', 'KNNBaseline',
                            'SVD', 'SVDpp', 'NMF', 'SlopeOne', 'CoClustering'
                        ])
    args = parser.parse_args()

    train_path = path + '/Data/train_format.txt'

    train_reader = Reader(line_format='user item rating timestamp',
                          sep=',',
                          rating_scale=(0, 5))
    trainset = Dataset.load_from_file(train_path, reader=train_reader)
    trainset = trainset.build_full_trainset()

    if args.model == 'NormalPredictor':
        model = surprise.NormalPredictor()
    elif args.model == 'BaselineOnly':
        model = surprise.BaselineOnly()
    elif args.model == 'KNNBasic':
        model = surprise.KNNBasic()
    elif args.model == 'KNNWithMeans':
        model = surprise.KNNWithMeans()
    elif args.model == 'KNNWithZScore':
        model = surprise.KNNWithZScore()
    elif args.model == 'KNNBaseline':
        model = surprise.KNNBaseline()
    elif args.model == 'SVD':
        model = surprise.SVD()
    elif args.model == 'SVDpp':
        model = surprise.SVDpp(verbose=True)
    elif args.model == 'NMF':