def get_topn_algo_class(algo):
    """Return a configured top-N recommender for a short algorithm name.

    Parameters:
        algo: identifier such as 'popular', 'bias', 'itemitem', 'useruser',
            'biasedmf', 'implicitmf', 'funksvd', or 'bpr'.

    Returns:
        A ready-to-fit recommender instance, or None for an unknown name.
    """
    # (removed a dead, commented-out 'topn' branch that duplicated 'bias')
    if algo == 'popular':
        return basic.Popular()
    elif algo == 'bias':
        return basic.TopN(basic.Bias())
    elif algo == 'itemitem':
        return basic.TopN(iknn.ItemItem(center=False, aggregate='sum'))
    elif algo == 'useruser':
        return basic.TopN(uknn.UserUser(center=False, aggregate='sum'))
    elif algo == 'biasedmf':
        return basic.TopN(als.BiasedMF(50, iterations=10))
    elif algo == 'implicitmf':
        return basic.TopN(als.ImplicitMF(20, iterations=10))
    elif algo == 'funksvd':
        return basic.TopN(svd.FunkSVD(20, iterations=20))
    elif algo == 'bpr':
        return basic.TopN(BPR(25))
def test_uu_batch_accuracy():
    """Check user-user CF prediction accuracy on ML-100K over 5 folds."""
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.load_ratings()

    # user-user CF, falling back to a bias model when CF cannot predict
    algo = basic.Fallback(knn.UserUser(30), basic.Bias())

    fold_preds = [
        __batch_eval((algo, train, test))
        for (train, test) in xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
    ]
    preds = pd.concat(fold_preds)

    assert pm.mae(preds.prediction, preds.rating) == approx(0.71, abs=0.028)

    per_user_rmse = preds.groupby('user').apply(
        lambda df: pm.rmse(df.prediction, df.rating))
    assert per_user_rmse.mean() == approx(0.91, abs=0.055)
def test_fallback_save_load(tmp_path):
    """Round-trip a fitted Fallback model through binpickle and re-verify predictions."""
    original = basic.Fallback(basic.Memorized(simple_df), basic.Bias())
    original.fit(lktu.ml_test.ratings)

    path = tmp_path / 'fb.mod'
    binpickle.dump(original, path)
    algo = binpickle.load(path)

    bias = algo.algorithms[1]
    assert bias.mean_ == approx(lktu.ml_test.ratings.rating.mean())

    def exp_val(user, item):
        # expected bias-model score: global mean plus any known offsets
        value = bias.mean_
        if user is not None:
            value += bias.user_offsets_.loc[user]
        if item is not None:
            value += bias.item_offsets_.loc[item]
        return value

    # memorized rating takes precedence for the first user + item
    preds = algo.predict_for_user(10, [1])
    assert preds.loc[1] == 4.0

    # other users fall back to the bias model
    preds = algo.predict_for_user(15, [1])
    assert preds.loc[1] == approx(exp_val(15, 1))

    preds = algo.predict_for_user(12, [2])
    assert preds.loc[2] == approx(exp_val(12, 2))

    # memorized and bias predictions blended in one request
    preds = algo.predict_for_user(10, [1, 5])
    assert preds.loc[1] == 4.0
    assert preds.loc[5] == approx(exp_val(10, 5))

    # unknown item falls back to user-only bias
    preds = algo.predict_for_user(10, [5, 1, -23081])
    assert len(preds) == 3
    assert preds.loc[1] == 4.0
    assert preds.loc[5] == approx(exp_val(10, 5))
    assert preds.loc[-23081] == approx(exp_val(10, None))
def test_topn_big():
    """Recommend for 100 random users; recs must be the unrated items with top scores."""
    ratings = lktu.ml_test.ratings
    users = ratings.user.unique()
    items = ratings.item.unique()
    rated_by_user = ratings.set_index('user').item

    algo = basic.TopN(basic.Bias())
    assert algo.fit(ratings) is algo  # fit returns the algorithm itself

    # test 100 random users
    for user in np.random.choice(users, 100, False):
        recs = algo.recommend(user, 100)
        assert len(recs) == 100

        seen = rated_by_user.loc[user]
        # no already-rated item may be recommended
        assert all(~recs['item'].isin(seen))

        candidates = np.setdiff1d(items, seen)
        scores = algo.predictor.predict_for_user(user, candidates)
        expected = scores.nlargest(100)
        assert expected.values == approx(recs.score.values)
def test_alogrithms():
    """Evaluate several recommenders on MovieLens data and plot mean NDCG."""
    # NOTE(review): name has a typo ('alogrithms'); kept unchanged so that
    # any test selection by name keeps working.
    dataset = MovieLens('ml-latest-small')
    # dataset = ML1M('ml-1m')  # larger alternative dataset
    ratings = dataset.ratings
    print('Initial ratings table head:')
    print(ratings.head())

    # name -> unfitted model; names become labels in the comparison
    models = {
        'Bias': basic.Bias(damping=5),
        'Popular': basic.Popular(),
        'ItemItem': item_knn.ItemItem(20),
        'UserUser': user_knn.UserUser(20),
        'BiasedMF': als.BiasedMF(50),
        'ImplicitMF': als.ImplicitMF(50),
        'FunkSVD': funksvd.FunkSVD(50),
    }

    all_recs, test_data = eval_algos(ratings, models)
    ndcg_means = eval_ndcg(all_recs, test_data)
    print('NDCG means:')
    print(ndcg_means)
    plot_comparison(ndcg_means)
def test_bias_train_ml_ratings():
    """Fit Bias on the ML ratings; check mean, item offsets, and user-2 predictions."""
    algo = bl.Bias()
    ratings = ml_pandas.ratings.rename(
        columns={'userId': 'user', 'movieId': 'item'})
    algo.fit(ratings)

    global_mean = ratings.rating.mean()
    assert algo.mean_ == approx(global_mean)

    # item offsets + global mean should reproduce the raw per-item means
    item_means = ratings.groupby('item').rating.mean()
    fitted_means = algo.item_offsets_ + algo.mean_
    left, right = fitted_means.align(item_means)
    assert left.values == approx(right.values)

    # user 2's offset: mean residual after removing item means
    user2_ratings = ratings.set_index('user').loc[2].set_index('item').rating
    umean = (user2_ratings - item_means[user2_ratings.index]).mean()

    preds = algo.predict_for_user(2, [10, 11, -1])
    assert len(preds) == 3
    assert preds.iloc[0] == approx(item_means.loc[10] + umean)
    assert preds.iloc[1] == approx(item_means.loc[11] + umean)
    # unknown item: global mean plus the user offset
    assert preds.iloc[2] == approx(global_mean + umean)
def test_fallback_predict():
    """Fallback should use memorized ratings first, then the bias model."""
    algo = basic.Fallback(basic.Memorized(simple_df), basic.Bias())
    algo.fit(lktu.ml_test.ratings)
    assert len(algo.algorithms) == 2

    bias = algo.algorithms[1]
    assert isinstance(bias, basic.Bias)
    assert bias.mean_ == approx(lktu.ml_test.ratings.rating.mean())

    def exp_val(user, item):
        # bias-model prediction: global mean + optional user/item offsets
        score = bias.mean_
        if user is not None:
            score += bias.user_offsets_.loc[user]
        if item is not None:
            score += bias.item_offsets_.loc[item]
        return score

    # the memorized pair wins for the first user + item
    assert algo.predict_for_user(10, [1]).loc[1] == 4.0

    # non-memorized requests come from the bias model
    assert algo.predict_for_user(15, [1]).loc[1] == approx(exp_val(15, 1))
    assert algo.predict_for_user(12, [2]).loc[2] == approx(exp_val(12, 2))

    # blended memorized + bias scores
    preds = algo.predict_for_user(10, [1, 5])
    assert preds.loc[1] == 4.0
    assert preds.loc[5] == approx(exp_val(10, 5))

    # unknown item: user offset only
    preds = algo.predict_for_user(10, [5, 1, -23081])
    assert len(preds) == 3
    assert preds.loc[1] == 4.0
    assert preds.loc[5] == approx(exp_val(10, 5))
    assert preds.loc[-23081] == approx(exp_val(10, None))
def get_topn_algo_class(algo):
    """Look up a configured top-N recommender by its short name.

    Returns None when the name is not recognized.
    """
    # lazy factories so only the requested model is constructed
    factories = {
        'popular': lambda: basic.Popular(),
        'bias': lambda: basic.TopN(basic.Bias()),
        'itemitem': lambda: basic.TopN(
            iknn.ItemItem(nnbrs=-1, center=False, aggregate='sum')),
        'useruser': lambda: basic.TopN(
            uknn.UserUser(nnbrs=5, center=False, aggregate='sum')),
        'biasedmf': lambda: basic.TopN(als.BiasedMF(50, iterations=10)),
        'implicitmf': lambda: basic.TopN(als.ImplicitMF(20, iterations=10)),
        'funksvd': lambda: basic.TopN(svd.FunkSVD(20, iterations=20)),
        'bpr': lambda: basic.TopN(BPR(25)),
        'tf_bpr': lambda: basic.TopN(
            lktf.BPR(20, batch_size=1024, epochs=5, neg_count=2,
                     rng_spec=42)),
    }
    make = factories.get(algo)
    if make is not None:
        return make()
def all_movie_recommends(ratings, optionList):
    """Cross-validate the selected algorithms and collect their outputs.

    Parameters:
        ratings: DataFrame with at least user/item/rating columns.
        optionList: integer codes (1-6) selecting which models to evaluate.

    Returns:
        (all_recs, test_data) as concatenated DataFrames.
    """
    # option code -> (label, model); models are shared across all folds
    catalog = {
        1: ('BasicBias', basic.Bias()),
        2: ('ItemItem', iknn.ItemItem(20)),
        3: ('UserUser', uknn.UserUser(20)),
        4: ('ALS-Biased', als.BiasedMF(50)),
        5: ('ALS-Implicit', als.ImplicitMF(50)),
        6: ('FunkSVD', funksvd.FunkSVD(50)),
    }

    all_recs = []
    test_data = []
    folds = xf.partition_users(
        ratings[['user', 'item', 'rating']], 5, xf.SampleFrac(0.2))
    for train, test in folds:
        test_data.append(test)
        for option in optionList:
            if option in catalog:
                label, model = catalog[option]
                all_recs.append(batch_eval(label, model, train, test))

    return (pd.concat(all_recs, ignore_index=True),
            pd.concat(test_data, ignore_index=True))
def test_als_batch_accuracy():
    """Check biased-ALS prediction accuracy on ML-100K with a bias fallback."""
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.load_ratings()

    svd_algo = als.BiasedMF(25, iterations=20, damping=5)
    algo = basic.Fallback(svd_algo, basic.Bias(damping=5))

    # renamed from `eval` to stop shadowing the builtin
    def eval_fold(train, test):
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return test.assign(prediction=algo.predict(test))

    folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
    preds = pd.concat(eval_fold(train, test) for (train, test) in folds)

    mae = pm.mae(preds.prediction, preds.rating)
    assert mae == approx(0.73, abs=0.025)

    user_rmse = preds.groupby('user').apply(
        lambda df: pm.rmse(df.prediction, df.rating))
    assert user_rmse.mean() == approx(0.91, abs=0.05)
def test_bias_batch_recommend():
    """Batch-recommend with a damped Bias model and sanity-check mean DCG > 0."""
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    from lenskit import batch, topn
    import lenskit.metrics.topn as lm

    if not os.path.exists('ml-100k/u.data'):
        raise pytest.skip()

    ratings = pd.read_csv('ml-100k/u.data', sep='\t',
                          names=['user', 'item', 'rating', 'timestamp'])

    algo = basic.Bias(damping=5)

    # renamed from `eval` to stop shadowing the builtin
    def eval_fold(train, test):
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        cand_fun = topn.UnratedCandidates(train)
        recs = batch.recommend(algo, test.user.unique(), 100, cand_fun)
        # combine with test ratings for relevance data
        res = pd.merge(recs, test, how='left', on=('user', 'item'))
        # fill in missing 0s
        res.loc[res.rating.isna(), 'rating'] = 0
        return res

    recs = pd.concat(
        eval_fold(train, test)
        for (train, test) in xf.partition_users(ratings, 5,
                                                xf.SampleFrac(0.2)))

    _log.info('analyzing recommendations')
    dcg = recs.groupby('user').rating.apply(lm.dcg)
    _log.info('DCG for %d users is %f (max=%f)',
              len(dcg), dcg.mean(), dcg.max())
    assert dcg.mean() > 0
def test_fsvd_batch_accuracy():
    """Check FunkSVD prediction accuracy on ML-100K with a bias fallback."""
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    from lenskit import batch
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings

    svd_algo = svd.FunkSVD(25, 125, damping=10)
    algo = basic.Fallback(svd_algo, basic.Bias(damping=10))

    # renamed from `eval` to stop shadowing the builtin
    def eval_fold(train, test):
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.predict(algo, test)

    folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
    preds = pd.concat(eval_fold(train, test) for (train, test) in folds)

    mae = pm.mae(preds.prediction, preds.rating)
    assert mae == approx(0.74, abs=0.025)

    user_rmse = preds.groupby('user').apply(
        lambda df: pm.rmse(df.prediction, df.rating))
    assert user_rmse.mean() == approx(0.92, abs=0.05)
def test_bias_save():
    """Pickle round-trip preserves the fitted Bias model's parameters."""
    original = bl.Bias(damping=5)
    original.fit(simple_df)
    assert original.mean_ == approx(3.5)

    _log.info('saving baseline model')
    blob = pickle.dumps(original)
    _log.info('serialized to %d bytes', len(blob))

    restored = pickle.loads(blob)
    assert restored.mean_ == original.mean_

    assert restored.item_offsets_ is not None
    assert restored.item_offsets_.index.name == 'item'
    assert set(restored.item_offsets_.index) == {1, 2, 3}
    assert restored.item_offsets_.loc[1:3].values == approx(
        np.array([0, 0.25, -0.25]))

    assert restored.user_offsets_ is not None
    assert restored.user_offsets_.index.name == 'user'
    assert set(restored.user_offsets_.index) == {10, 12, 13}
    assert restored.user_offsets_.loc[[10, 12, 13]].values == approx(
        np.array([0.25, -0.08333, -0.20833]), abs=1.0e-4)
def test_bias_global_only():
    """With user and item effects disabled, only the global mean is learned."""
    algo = bl.Bias(users=False, items=False)
    algo.fit(simple_df)

    assert algo.mean_ == approx(3.5)
    # no per-item or per-user offsets should have been fitted
    assert algo.item_offsets_ is None
    assert algo.user_offsets_ is None
def test_fallback_train_one():
    """A Fallback over a single Bias model trains that one component."""
    algo = basic.Fallback(basic.Bias())
    algo.fit(lktu.ml_pandas.renamed.ratings)

    assert len(algo.algorithms) == 1
    inner = algo.algorithms[0]
    assert isinstance(inner, basic.Bias)
    assert inner.mean_ == approx(lktu.ml_pandas.ratings.rating.mean())
"""
Basic algorithm definitions as starting points.
"""

from lenskit.algorithms import item_knn, user_knn, als, funksvd
from lenskit.algorithms import basic

# Pre-configured (unfitted) algorithm instances to use as defaults.
# Damped bias baseline.
Bias = basic.Bias(damping=5)
# Most-popular-items recommender.
Pop = basic.Popular()
# Item-item CF; keeps at most 2500 saved neighbors per item.
II = item_knn.ItemItem(20, save_nbrs=2500)
# User-user CF.
UU = user_knn.UserUser(30)
# Biased matrix factorization (ALS), 50 features.
ALS = als.BiasedMF(50)
# Implicit-feedback MF (ALS), 50 features.
IALS = als.ImplicitMF(50)
# FunkSVD gradient-descent MF, 50 features.
MFSGD = funksvd.FunkSVD(50)
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd

if __name__ == '__main__':
    # Load the beer-review splits (jester splits available as an alternative).
    train = pd.read_csv('data/beer/train.csv')
    validation = pd.read_csv('data/beer/validation.csv')
    test = pd.read_csv('data/beer/test.csv')
    # train = pd.read_csv('data/jester/clean/train.csv')
    # validation = pd.read_csv('data/jester/clean/validation.csv')
    # test = pd.read_csv('data/jester/clean/test.csv')

    # NOTE(review): `basic` is not imported in this chunk as shown; this
    # will raise NameError unless `from lenskit.algorithms import basic`
    # exists elsewhere in the file — confirm.
    algo = basic.Bias()
    # algo = als.BiasedMF()

    # Keep only the (user, item, rating) columns.
    train = train[['reviewer_id', 'beer_beerid', 'review_overall']]
    validation = validation[['reviewer_id', 'beer_beerid', 'review_overall']]
    test = test[['reviewer_id', 'beer_beerid', 'review_overall']]

    # Rename to the column names LensKit expects.
    train = train.rename(columns={
        'review_overall': 'rating',
        'reviewer_id': 'user',
        'beer_beerid': 'item'
    })
    validation = validation.rename(columns={
        'review_overall': 'rating',
        'reviewer_id': 'user',
        'beer_beerid': 'item'
    })
    # NOTE(review): `test` is never renamed here — the script likely
    # continues beyond this chunk.
test = pd.read_csv("/project/naray190/ml-20m/truncated_user_ratings.csv")

# NOTE(review): `train` must be defined earlier in this script; only `test`
# is loaded here — confirm against the preceding chunk.
train = train[['userId', 'movieId', 'rating']]
test = test[['userId', 'movieId', 'rating']]
train.columns = ['user', 'item', 'rating']
test.columns = ['user', 'item', 'rating']

# Biased-ALS models sweeping the feature count from 10 to 80.
algo_30als = als.BiasedMF(features=30, iterations=50, reg=0.1)
algo_40als = als.BiasedMF(features=40, iterations=50, reg=0.1)
algo_20als = als.BiasedMF(features=20, iterations=50, reg=0.1)
algo_25als = als.BiasedMF(features=25, iterations=50, reg=0.1)
algo_15als = als.BiasedMF(features=15, iterations=50, reg=0.1)
algo_50als = als.BiasedMF(features=50, iterations=50, reg=0.1)
algo_60als = als.BiasedMF(features=60, iterations=50, reg=0.1)
algo_10als = als.BiasedMF(features=10, iterations=50, reg=0.1)
algo_70als = als.BiasedMF(features=70, iterations=50, reg=0.1)
algo_80als = als.BiasedMF(features=80, iterations=50, reg=0.1)
algo_base = basic.Bias()
algo_ii = item_knn.ItemItem(nnbrs=20)


def eval(algo, train, test):
    """Fit a fresh clone of `algo` on `train` and return its RMSE on `test`."""
    fittable = util.clone(algo)
    # Bug fix: the original cloned the algorithm but then fitted and
    # predicted with the shared `algo` instance, leaving the clone unused
    # and mutating the template across folds.
    fittable.fit(train)
    preds = fittable.predict(test)
    return predict.rmse(preds, test['rating'])


rmse_scores = pd.DataFrame(columns=['Algorithm', 'Dataset', 'RMSE'])
count = 0
def test_fallback_string():
    """The string form of a Fallback mentions its class name."""
    fallback = basic.Fallback([basic.Memorized(simple_df), basic.Bias()])
    assert 'Fallback' in str(fallback)