def test_tf_bmf_batch_accuracy(tf_session):
    """Cross-validated prediction accuracy for the TensorFlow BiasedMF model.

    Runs 5-fold user-based cross-validation on ML-100K and checks that the
    overall MAE and the mean per-user RMSE land in the expected ranges.
    """
    from lenskit.algorithms import basic
    from lenskit.algorithms import bias
    import lenskit.crossfold as xf
    from lenskit import batch
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings
    # fixed rng_spec keeps the stochastic TF training reproducible
    algo = lktf.BiasedMF(25, damping=10, batch_size=1024, epochs=20, rng_spec=42)
    # fall back to a damped bias model for users/items the MF cannot score
    algo = basic.Fallback(algo, bias.Bias(damping=10))

    def eval(train, test):
        # NOTE: shadows the `eval` builtin; re-fits the shared `algo` each fold
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.predict(algo, test)

    folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
    preds = pd.concat(eval(train, test) for (train, test) in folds)
    mae = pm.mae(preds.prediction, preds.rating)
    assert mae == approx(0.83, abs=0.025)
    user_rmse = preds.groupby('user').apply(
        lambda df: pm.rmse(df.prediction, df.rating))
    assert user_rmse.mean() == approx(1.03, abs=0.05)
def test_als_implicit_batch_accuracy():
    """Top-N accuracy for implicit-feedback ALS on ML-100K.

    Trains ImplicitMF over 5 user partitions and checks that the mean nDCG
    of 100-item recommendation lists is positive.
    """
    import lenskit.crossfold as xf
    from lenskit import batch
    from lenskit import topn

    ratings = lktu.ml100k.load_ratings()
    algo = als.ImplicitMF(25, iterations=20)

    def eval(train, test):
        _log.info('running training')
        # ALS expects float ratings; cast in place before fitting
        train['rating'] = train.rating.astype(np.float_)
        algo.fit(train)
        users = test.user.unique()
        _log.info('testing %d users', len(users))
        # restrict candidates to items each user has not rated in training
        candidates = topn.UnratedCandidates(train)
        recs = batch.recommend(algo, users, 100, candidates)
        return recs

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    test = pd.concat(te for (tr, te) in folds)
    recs = pd.concat(eval(train, test) for (train, test) in folds)
    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    _log.info('nDCG for users is %.4f', results.ndcg.mean())
    assert results.ndcg.mean() > 0
def test_implicit_als_batch_accuracy():
    """Top-N accuracy for the `implicit` library's ALS wrapper on ML-100K.

    Clones the configured algorithm per fold so each fold trains a fresh
    model, then checks that the mean nDCG is positive.
    """
    import lenskit.crossfold as xf
    from lenskit import batch, topn

    ratings = lktu.ml100k.ratings
    algo_t = ALS(25)

    def eval(train, test):
        _log.info('running training')
        # the wrapped library expects float ratings; cast in place
        train['rating'] = train.rating.astype(np.float_)
        # clone the template so folds do not share fitted state
        algo = util.clone(algo_t)
        algo.fit(train)
        users = test.user.unique()
        _log.info('testing %d users', len(users))
        recs = batch.recommend(algo, users, 100)
        return recs

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    test = pd.concat(f.test for f in folds)
    recs = pd.concat(eval(train, test) for (train, test) in folds)
    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    dcg = results.ndcg
    _log.info('nDCG for %d users is %.4f', len(dcg), dcg.mean())
    assert dcg.mean() > 0
def test_als_batch_accuracy():
    """Compare the LU and CD solvers of explicit-feedback ALS.

    Both solvers are trained on the same folds; the test checks each one's
    MAE and mean per-user RMSE, and logs how far their predictions diverge.
    """
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings
    lu_algo = als.BiasedMF(25, iterations=20, damping=5, method='lu')
    cd_algo = als.BiasedMF(25, iterations=25, damping=5, method='cd')
    # algo = basic.Fallback(svd_algo, basic.Bias(damping=5))

    def eval(train, test):
        # fits both solvers in place on the same fold, then predicts with each
        _log.info('training LU')
        lu_algo.fit(train)
        _log.info('training CD')
        cd_algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return test.assign(lu_pred=lu_algo.predict(test), cd_pred=cd_algo.predict(test))

    folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
    preds = pd.concat(eval(train, test) for (train, test) in folds)
    # per-row disagreement between the two solvers, logged for diagnostics
    preds['abs_diff'] = np.abs(preds.lu_pred - preds.cd_pred)
    _log.info('predictions:\n%s', preds.sort_values('abs_diff', ascending=False))
    _log.info('diff summary:\n%s', preds.abs_diff.describe())

    lu_mae = pm.mae(preds.lu_pred, preds.rating)
    assert lu_mae == approx(0.73, abs=0.025)
    cd_mae = pm.mae(preds.cd_pred, preds.rating)
    assert cd_mae == approx(0.73, abs=0.025)

    user_rmse = preds.groupby('user').apply(lambda df: pm.rmse(df.lu_pred, df.rating))
    assert user_rmse.mean() == approx(0.91, abs=0.05)
    user_rmse = preds.groupby('user').apply(lambda df: pm.rmse(df.cd_pred, df.rating))
    assert user_rmse.mean() == approx(0.91, abs=0.05)
def test_ii_batch_accuracy():
    """Batch-predict accuracy for item-item kNN with a bias fallback."""
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    from lenskit import batch
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings
    model = basic.Fallback(knn.ItemItem(30), basic.Bias())

    def predict_fold(train, test):
        _log.info('running training')
        model.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.predict(model, test, n_jobs=4)

    fold_preds = [predict_fold(tr, te)
                  for (tr, te) in xf.partition_users(ratings, 5, xf.SampleFrac(0.2))]
    preds = pd.concat(fold_preds)

    assert pm.mae(preds.prediction, preds.rating) == approx(0.70, abs=0.025)

    per_user_rmse = preds.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))
    assert per_user_rmse.mean() == approx(0.90, abs=0.05)
def test_uu_implicit_batch_accuracy():
    """Top-N accuracy for implicit (rating-free) user-user kNN.

    Fits on (user, item) pairs only — rating values are dropped — and
    requires a mean nDCG of at least 0.03 over 100-item lists.
    """
    from lenskit import batch, topn
    import lenskit.crossfold as xf

    ratings = lktu.ml100k.ratings
    # center=False + aggregate='sum' is the implicit-feedback configuration
    algo = knn.UserUser(30, center=False, aggregate='sum')

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    all_test = pd.concat(f.test for f in folds)

    rec_lists = []
    for train, test in folds:
        _log.info('running training')
        # wrap in a top-N recommender; a bare UserUser only scores items
        rec_algo = Recommender.adapt(algo)
        # train on user/item pairs only, discarding the rating column
        rec_algo.fit(train.loc[:, ['user', 'item']])
        _log.info('testing %d users', test.user.nunique())
        recs = batch.recommend(rec_algo, test.user.unique(), 100, n_jobs=2)
        rec_lists.append(recs)
    recs = pd.concat(rec_lists)

    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, all_test)
    user_dcg = results.ndcg
    dcg = user_dcg.mean()
    assert dcg >= 0.03
def test_ii_batch_recommend(ncpus):
    """Parallel batch recommendation with item-item kNN on ML-100K.

    Skips when the ml-100k data file is not present.  A fresh model is
    trained per fold; mean nDCG must exceed 0.03.
    """
    import lenskit.crossfold as xf
    from lenskit import batch, topn

    if not os.path.exists('ml-100k/u.data'):
        raise pytest.skip()
    ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=['user', 'item', 'rating', 'timestamp'])

    def eval(train, test):
        # NOTE: shadows the `eval` builtin; builds and fits a new model per fold
        _log.info('running training')
        algo = knn.ItemItem(30)
        algo = Recommender.adapt(algo)
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        recs = batch.recommend(algo, test.user.unique(), 100, n_jobs=ncpus)
        return recs

    test_frames = []
    recs = []
    for train, test in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)):
        test_frames.append(test)
        recs.append(eval(train, test))
    test = pd.concat(test_frames)
    recs = pd.concat(recs)

    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    dcg = results.ndcg
    _log.info('nDCG for %d users is %f', len(dcg), dcg.mean())
    assert dcg.mean() > 0.03
def split_dataset(ratings, user_fraction=.1):
    """Split a dataset in train/test data"""
    # Several sampling strategies exist for (train, test) splitting; two common ones:
    # - Row separation: test users are known to the model, and we measure its
    #   ability to predict "new" tastes for a known user.
    # - User separation: test users were never seen during training, and we
    #   measure generalization to brand-new users from the behavior of others.
    # see [lkpy documentation](https://lkpy.readthedocs.io/en/stable/crossfold.html)
    # Strategy used here:
    # - sample user_fraction * n_total users,
    # - randomly hold out half of each sampled user's listenings for the test set.
    n_users = ratings['user'].nunique()
    split = next(iter(xf.sample_users(ratings[['user', 'item', 'rating']],
                                      partitions=1,
                                      size=int(n_users * user_fraction),
                                      method=xf.SampleFrac(.5))))
    print(f'n test users: {len(split.test["user"].unique())}')
    return split.train, split.test
def test_bias_batch_predict(ncpus):
    """Check batch.predict RMSE for a damped Bias model with ncpus workers."""
    from lenskit.algorithms import bias
    import lenskit.crossfold as xf
    from lenskit import batch
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings
    model = bias.Bias(damping=5)

    def predict_fold(train, test):
        _log.info('running training')
        model.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.predict(model, test, n_jobs=ncpus)

    fold_frames = []
    for train, test in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)):
        fold_frames.append(predict_fold(train, test))
    preds = pd.concat(fold_frames)

    _log.info('analyzing predictions')
    rmse = pm.rmse(preds.prediction, preds.rating)
    _log.info('RMSE is %f', rmse)
    assert rmse == pytest.approx(0.95, abs=0.1)
def test_fsvd_batch_accuracy():
    """Prediction accuracy for FunkSVD with a damped-bias fallback."""
    from lenskit.algorithms import basic
    from lenskit.algorithms import bias
    import lenskit.crossfold as xf
    from lenskit import batch
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings
    model = basic.Fallback(svd.FunkSVD(25, 125, damping=10), bias.Bias(damping=10))

    def predict_fold(train, test):
        _log.info('running training')
        model.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.predict(model, test)

    fold_preds = [predict_fold(tr, te)
                  for (tr, te) in xf.partition_users(ratings, 5, xf.SampleFrac(0.2))]
    preds = pd.concat(fold_preds)

    assert pm.mae(preds.prediction, preds.rating) == approx(0.74, abs=0.025)

    per_user_rmse = preds.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))
    assert per_user_rmse.mean() == approx(0.92, abs=0.05)
def test_bias_batch_recommend():
    """Batch top-N recommendation with a damped Bias scorer.

    Skips when the ml-100k data file is unavailable; only requires
    nDCG > 0 since a pure bias ranker is a weak recommender.
    """
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    from lenskit import batch, topn

    if not os.path.exists('ml-100k/u.data'):
        raise pytest.skip()
    ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=['user', 'item', 'rating', 'timestamp'])

    algo = basic.Bias(damping=5)
    # TopN turns the plain scorer into a ranked-list recommender
    algo = TopN(algo)

    def eval(train, test):
        # NOTE: shadows the `eval` builtin; re-fits the shared `algo` each fold
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        recs = batch.recommend(algo, test.user.unique(), 100)
        return recs

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    test = pd.concat(y for (x, y) in folds)
    recs = pd.concat(eval(train, test) for (train, test) in folds)
    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    dcg = results.ndcg
    _log.info('nDCG for %d users is %f (max=%f)', len(dcg), dcg.mean(), dcg.max())
    assert dcg.mean() > 0
def test_hpf_batch_accuracy():
    """Top-N smoke test for hierarchical Poisson factorization (HPF).

    Uses the older candidate-selector batch API; recommendations carry the
    held-out ratings and are scored by per-user DCG.
    """
    import lenskit.crossfold as xf
    from lenskit import batch, topn
    import lenskit.metrics.topn as lm

    ratings = lktu.ml100k.load_ratings()
    algo = hpf.HPF(25)

    def eval(train, test):
        _log.info('running training')
        # HPF expects float ratings; cast in place before fitting
        train['rating'] = train.rating.astype(np.float_)
        algo.fit(train)
        users = test.user.unique()
        _log.info('testing %d users', len(users))
        candidates = topn.UnratedCandidates(train)
        # passing `test` attaches held-out ratings to the rec lists
        recs = batch.recommend(algo, users, 100, candidates, test)
        return recs

    folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
    recs = pd.concat(eval(train, test) for (train, test) in folds)
    _log.info('analyzing recommendations')
    dcg = recs.groupby('user').rating.apply(lm.dcg)
    _log.info('dcg for users is %.4f', dcg.mean())
    assert dcg.mean() > 0
def test_tf_bpr_batch_accuracy(tf_session):
    """Ranking accuracy for the TensorFlow BPR model on ML-100K.

    Cross-validates over 5 user partitions; users missing from the rec
    lists are scored as 0 (include_missing + fillna), and mean nDCG must
    exceed 0.1.
    """
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    from lenskit import batch, topn

    ratings = lktu.ml100k.ratings
    # fixed rng_spec keeps the stochastic BPR training reproducible
    algo = lktf.BPR(20, batch_size=1024, epochs=20, rng_spec=42)
    algo = Recommender.adapt(algo)

    all_recs = []
    all_test = []
    for train, test in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)):
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        recs = batch.recommend(algo, np.unique(test.user), 50)
        all_recs.append(recs)
        all_test.append(test)

    _log.info('analyzing results')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    rla.add_metric(topn.recip_rank)
    scores = rla.compute(pd.concat(all_recs, ignore_index=True),
                         pd.concat(all_test, ignore_index=True),
                         include_missing=True)
    # users with no recommendations get NaN metrics; count them as 0
    scores.fillna(0, inplace=True)
    _log.info('MRR: %f', scores['recip_rank'].mean())
    _log.info('nDCG: %f', scores['ndcg'].mean())
    assert scores['ndcg'].mean() > 0.1
def test_ii_batch_recommend(ncpus):
    """Parallel batch recommendation with item-item kNN on ML-100K.

    A fresh model is trained per fold over 5 user partitions; mean nDCG
    must exceed 0.03.
    """
    import lenskit.crossfold as xf
    # FIX: `batch` was not imported here, so batch.recommend() below raised
    # NameError unless the enclosing module happened to import it globally;
    # import it alongside topn, as the sibling variant of this test does.
    from lenskit import batch, topn

    ratings = lktu.ml100k.ratings

    def eval(train, test):
        # NOTE: shadows the `eval` builtin; builds and fits a new model per fold
        _log.info('running training')
        algo = knn.ItemItem(30)
        algo = Recommender.adapt(algo)
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        recs = batch.recommend(algo, test.user.unique(), 100, n_jobs=ncpus)
        return recs

    test_frames = []
    recs = []
    for train, test in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)):
        test_frames.append(test)
        recs.append(eval(train, test))
    test = pd.concat(test_frames)
    recs = pd.concat(recs)

    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    dcg = results.ndcg
    _log.info('nDCG for %d users is %f', len(dcg), dcg.mean())
    assert dcg.mean() > 0.03
def recommend(algo_wrappers, ratings):
    """Run do_recommend for every wrapper on each of 5 user partitions.

    Returns (all_recs, test_data): recommendation frames in
    partition-major, wrapper-minor order, plus the per-partition test frames.
    """
    rec_frames = []
    test_frames = []
    splits = xf.partition_users(ratings[['user', 'item', 'rating']], 5, xf.SampleFrac(0.2))
    for train, test in splits:
        test_frames.append(test)
        rec_frames.extend(do_recommend(wrapper, train, test) for wrapper in algo_wrappers)
    return rec_frames, test_frames
class LegMedLensKit():
    """Script-style experiment comparing ItemItem and BiasedMF recommenders.

    NOTE(review): everything below runs at class-definition (import) time —
    file I/O, model training, plotting — and fills class attributes rather
    than instance state; consider converting to module-level functions.
    """

    def loadData():
        """Read the ratings file and return it as a DataFrame."""
        # NOTE(review): '::' is a multi-character separator; pandas falls back
        # to the python engine for it — confirm it parses without warnings.
        ratings = pd.read_csv('/Users/josse/Desktop/ratings.dat', sep='::', names=['user', 'item', 'rating', 'timestamp'])
        print(ratings.head())
        return (ratings)

    ratings = loadData()
    # item x user rating matrix; missing ratings are NaN, then zero-filled
    data_matrix = np.array(ratings.pivot(index='item', columns='user', values='rating'))
    print(data_matrix)
    data_matrix_rev = np.nan_to_num(data_matrix)
    print(data_matrix_rev)

    algo_ii = knn.ItemItem(20)
    algo_als = als.BiasedMF(50)

    def eval(aname, algo, train, test):
        """Clone and fit `algo`, recommend 100 items/user, tag rows with `aname`."""
        print("test")
        fittable = util.clone(algo)
        fittable = Recommender.adapt(fittable)
        fittable.fit(train)
        users = test.user.unique()
        # now we run the recommender
        recs = batch.recommend(fittable, users, 100)
        # add the algorithm name for analyzability
        recs['Algorithm'] = aname
        print("recs")
        print(recs.head())
        return recs

    all_recs = []
    test_data = []
    for train, test in xf.partition_users(ratings[['user', 'item', 'rating']], 1, xf.SampleFrac(0.2)):
        test_data.append(test)
        all_recs.append(eval('ItemItem', algo_ii, train, test))
        all_recs.append(eval('ALS', algo_als, train, test))
    print("test2")
    # BUG FIX: the original called all_recs.head() while all_recs was still a
    # plain list (AttributeError); concatenate first, then inspect.
    all_recs = pd.concat(all_recs, ignore_index=True)
    print(all_recs.head())
    test_data = pd.concat(test_data, ignore_index=True)

    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(all_recs, test_data)
    results.head()
    results.groupby('Algorithm').ndcg.mean()
    results.groupby('Algorithm').ndcg.mean().plot.bar()
def test_sample_frac():
    """SampleFrac holds out ~frac of each user's rows and partitions them exactly."""
    ratings = lktu.ml_test.ratings
    users = np.random.choice(ratings.user.unique(), 5, replace=False)

    def check_fraction(frac):
        # verify the sampler on each chosen user's ratings
        sampler = xf.SampleFrac(frac)
        for user in users:
            u_rows = ratings[ratings.user == user]
            held = sampler(u_rows)
            kept = u_rows.loc[u_rows.index.difference(held.index), :]
            # held-out and kept rows exactly partition the user's ratings
            assert len(held) + len(kept) == len(u_rows)
            # held-out size is frac of the user's rows, within rounding
            assert len(held) >= math.floor(len(u_rows) * frac)
            assert len(held) <= math.ceil(len(u_rows) * frac)

    check_fraction(0.2)
    check_fraction(0.5)
def run(self, strategy_context: RecommenderAlgorithmStrategyContext) -> np.ndarray:
    """Train a Bias recommender on a single user-sampled split and return recommendations.

    Parameters
    ----------
    strategy_context:
        Carries the data-set source and the number of recommendations to produce.

    Returns
    -------
    np.ndarray
        Array of shape (n_test_users, number_of_recommendations) holding the
        recommended item ids, one row per test user.
    """
    data_set_source = strategy_context.data_set_source
    data_frame_reader: DataFrameReaderStrategy = self.data_frame_reader_factory.create(data_set_source)
    data_set: DataFrame = data_frame_reader.parse(DataFrameReaderStrategyContext(data_set_source))
    # single partition: hold out 20% of each sampled user's rows as the test set
    partition = list(partition_users(data=data_set, partitions=1, method=crossfold.SampleFrac(0.2)))[0]
    test, train = partition.test, partition.train
    number_of_recommendations = strategy_context.number_of_recommendations
    algorithm = Recommender.adapt(Bias())
    trained_algorithm = algorithm.fit(train)
    recommendations = lenskit.batch.recommend(trained_algorithm, test['user'].unique(), number_of_recommendations)
    # NOTE(review): the reshape assumes every user received exactly
    # number_of_recommendations items — verify upstream guarantees this,
    # otherwise the reshape raises ValueError
    return recommendations.groupby('user')['item'].apply(lambda x: x).to_numpy().reshape(
        (-1, number_of_recommendations))
def test_global_metric():
    """pm.global_metric defaults to RMSE and accepts an alternate metric function."""
    import lenskit.crossfold as xf
    import lenskit.batch as batch
    from lenskit.algorithms.bias import Bias

    splits = xf.sample_users(lktu.ml_test.ratings, 1, 200, xf.SampleFrac(0.5))
    train, test = next(splits)

    model = Bias()
    model.fit(train)
    preds = batch.predict(model, test)

    # the default metric is RMSE over the whole prediction frame
    assert pm.global_metric(preds) == pm.rmse(preds.prediction, preds.rating)
    # an explicitly supplied metric function is applied the same way
    assert pm.global_metric(preds, metric=pm.mae) == pm.mae(preds.prediction, preds.rating)
def test_tf_isvd(ml20m):
    """Prediction accuracy for the TensorFlow integrated-bias MF on ML-20M."""
    algo = lenskit_tf.IntegratedBiasMF(20)

    def eval(train, test):
        # NOTE: shadows the `eval` builtin; re-fits the shared `algo` each fold
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.predict(algo, test)

    # 2 disjoint samples of 5000 users each, 20% of each user's rows held out
    folds = xf.sample_users(ml20m, 2, 5000, xf.SampleFrac(0.2))
    preds = pd.concat(eval(train, test) for (train, test) in folds)
    mae = pm.mae(preds.prediction, preds.rating)
    assert mae == approx(0.60, abs=0.025)
    user_rmse = preds.groupby('user').apply(
        lambda df: pm.rmse(df.prediction, df.rating))
    assert user_rmse.mean() == approx(0.92, abs=0.05)
def test_uu_batch_accuracy():
    """Prediction accuracy for user-user kNN with a bias fallback.

    Each fold is evaluated through the module-level __batch_eval helper.
    """
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings
    uu_algo = knn.UserUser(30)
    algo = basic.Fallback(uu_algo, basic.Bias())

    folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
    preds = [__batch_eval((algo, train, test)) for (train, test) in folds]
    preds = pd.concat(preds)
    mae = pm.mae(preds.prediction, preds.rating)
    assert mae == approx(0.71, abs=0.028)
    user_rmse = preds.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))
    assert user_rmse.mean() == approx(0.91, abs=0.055)
def test_partition_users_frac():
    """partition_users + SampleFrac(0.2) holds out ~20% of each user's rows per split."""
    ratings = lktu.ml_test.ratings
    splits = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    assert len(splits) == 5

    per_user = ratings.groupby('user').item.count()
    expected = per_user * 0.2
    for split in splits:
        test_counts = split.test.groupby('user').item.count()
        # each user's test size is within one row of 20% of their ratings
        assert all(test_counts >= expected.loc[test_counts.index] - 1)
        assert all(test_counts <= expected.loc[test_counts.index] + 1)
        # train and test exactly partition the original frame
        assert all(split.test.index.union(split.train.index) == ratings.index)
        assert len(split.test) + len(split.train) == len(ratings)

    # the test sets together cover every user exactly
    seen = set()
    for split in splits:
        seen |= set(split.test.user)
    assert len(seen) == ratings.user.nunique()
    assert seen == set(ratings.user)
def test_user_metric():
    """pm.user_metric averages a per-user metric: RMSE by default, MAE on request."""
    import lenskit.crossfold as xf
    import lenskit.batch as batch
    from lenskit.algorithms.bias import Bias

    splits = xf.sample_users(lktu.ml_test.ratings, 1, 200, xf.SampleFrac(0.5))
    train, test = next(splits)

    model = Bias()
    model.fit(train)
    preds = batch.predict(model, test)

    by_user = preds.groupby('user')

    # default metric: mean of per-user RMSE
    expected_rmse = by_user.apply(lambda df: pm.rmse(df.prediction, df.rating)).mean()
    assert pm.user_metric(preds) == approx(expected_rmse)

    # explicit metric: mean of per-user MAE
    expected_mae = by_user.apply(lambda df: pm.mae(df.prediction, df.rating)).mean()
    assert pm.user_metric(preds, metric=pm.mae) == approx(expected_mae)
def test_als_implicit_batch_accuracy():
    """Compare the CG and LU solvers of implicit-feedback ALS.

    Both solvers must reach mean nDCG > 0.28 and agree with each other
    within 5% relative tolerance.
    """
    import lenskit.crossfold as xf
    from lenskit import batch
    from lenskit import topn

    ratings = lktu.ml100k.ratings

    def eval(train, test):
        # NOTE: shadows the `eval` builtin; trains both solvers on the same fold
        train['rating'] = train.rating.astype(np.float_)
        _log.info('training CG')
        cg_algo = als.ImplicitMF(25, iterations=20, method='cg')
        cg_algo = Recommender.adapt(cg_algo)
        cg_algo.fit(train)
        _log.info('training LU')
        lu_algo = als.ImplicitMF(25, iterations=20, method='lu')
        lu_algo = Recommender.adapt(lu_algo)
        lu_algo.fit(train)
        users = test.user.unique()
        _log.info('testing %d users', len(users))
        cg_recs = batch.recommend(cg_algo, users, 100, n_jobs=2)
        lu_recs = batch.recommend(lu_algo, users, 100, n_jobs=2)
        # tag each rec frame with its solver via the concat keys
        return pd.concat({
            'CG': cg_recs,
            'LU': lu_recs
        }, names=['Method']).reset_index('Method')

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    test = pd.concat(te for (tr, te) in folds)

    recs = pd.concat((eval(train, test) for (train, test) in folds), ignore_index=True)
    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    results = results.groupby('Method')['ndcg'].mean()
    _log.info('LU nDCG for users is %.4f', results.loc['LU'].mean())
    _log.info('CG nDCG for users is %.4f', results.loc['CG'].mean())
    assert all(results > 0.28)
    assert results.loc['LU'] == approx(results.loc['CG'], rel=0.05)
def test_pop_batch_recommend(ncpus):
    """Batch recommendation sanity check for the Popular baseline.

    Uses the older candidate-selector batch API (nprocs/test arguments) and
    only requires mean DCG > 0.
    """
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    from lenskit import batch, topn
    import lenskit.metrics.topn as lm

    if not os.path.exists('ml-100k/u.data'):
        raise pytest.skip()
    ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=['user', 'item', 'rating', 'timestamp'])

    algo = basic.Popular()

    def eval(train, test):
        # NOTE: shadows the `eval` builtin; re-fits the shared `algo` each fold
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        cand_fun = topn.UnratedCandidates(train)
        # passing `test` attaches held-out ratings to the rec lists
        recs = batch.recommend(algo, test.user.unique(), 100, cand_fun, test, nprocs=ncpus)
        return recs

    recs = pd.concat(
        (eval(train, test) for (train, test) in xf.partition_users(ratings, 5, xf.SampleFrac(0.2))))
    _log.info('analyzing recommendations')
    _log.info('have %d recs for good items', (recs.rating > 0).sum())
    dcg = recs.groupby('user').rating.agg(lm.dcg)
    _log.info('DCG for %d users is %f (max=%f)', len(dcg), dcg.mean(), dcg.max())
    assert dcg.mean() > 0
def test_ii_batch_recommend(ncpus):
    """Batch recommendation with item-item kNN via the candidate-selector API.

    Joins recommendations with the held-out test ratings and scores per-user
    DCG, counting unrated recommendations as 0.
    """
    import lenskit.crossfold as xf
    from lenskit import batch, topn
    import lenskit.metrics.topn as lm

    if not os.path.exists('ml-100k/u.data'):
        raise pytest.skip()
    ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=['user', 'item', 'rating', 'timestamp'])

    algo = knn.ItemItem(30)

    def eval(train, test):
        # NOTE: shadows the `eval` builtin; re-fits the shared `algo` each fold
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        cand_fun = topn.UnratedCandidates(train)
        recs = batch.recommend(algo, test.user.unique(), 100, cand_fun, nprocs=ncpus)
        # combine with test ratings for relevance data
        res = pd.merge(recs, test, how='left', on=('user', 'item'))
        # fill in missing 0s
        res.loc[res.rating.isna(), 'rating'] = 0
        return res

    recs = pd.concat(
        (eval(train, test) for (train, test) in xf.partition_users(ratings, 5, xf.SampleFrac(0.2))))
    _log.info('analyzing recommendations')
    dcg = recs.groupby('user').rating.apply(lm.dcg)
    _log.info('DCG for %d users is %f', len(dcg), dcg.mean())
    assert dcg.mean() > 0
def all_movie_recommends(ratings, optionList):
    """Batch-evaluate the algorithms selected by number in optionList.

    Returns the combined recommendation frame and the combined per-fold
    test data, evaluated over 5 user partitions.
    """
    # option number -> (label, model); all models are built up front and
    # shared across folds, matching the original behavior
    algorithms = {
        1: ('BasicBias', basic.Bias()),
        2: ('ItemItem', iknn.ItemItem(20)),
        3: ('UserUser', uknn.UserUser(20)),
        4: ('ALS-Biased', als.BiasedMF(50)),
        5: ('ALS-Implicit', als.ImplicitMF(50)),
        6: ('FunkSVD', funksvd.FunkSVD(50)),
    }

    all_recs = []
    test_data = []
    for train, test in xf.partition_users(ratings[['user', 'item', 'rating']], 5, xf.SampleFrac(0.2)):
        test_data.append(test)
        for option in optionList:
            if option in algorithms:
                name, model = algorithms[option]
                all_recs.append(batch_eval(name, model, train, test))

    all_recs = pd.concat(all_recs, ignore_index=True)
    test_data = pd.concat(test_data, ignore_index=True)
    return all_recs, test_data
def test_als_batch_accuracy():
    """Prediction accuracy for explicit ALS with a damped-bias fallback."""
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.load_ratings()
    svd_algo = als.BiasedMF(25, iterations=20, damping=5)
    algo = basic.Fallback(svd_algo, basic.Bias(damping=5))

    def eval(train, test):
        # NOTE: shadows the `eval` builtin; re-fits the shared `algo` each fold
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return test.assign(prediction=algo.predict(test))

    folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
    preds = pd.concat(eval(train, test) for (train, test) in folds)
    mae = pm.mae(preds.prediction, preds.rating)
    assert mae == approx(0.73, abs=0.025)
    user_rmse = preds.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))
    assert user_rmse.mean() == approx(0.91, abs=0.05)
def test_uu_implicit_batch_accuracy():
    """Top-N accuracy for implicit user-user kNN via the candidate-selector API."""
    from lenskit import batch, topn
    import lenskit.crossfold as xf
    import lenskit.metrics.topn as lm

    ratings = lktu.ml100k.load_ratings()
    # center=False + aggregate='sum' is the implicit-feedback configuration
    algo = knn.UserUser(30, center=False, aggregate='sum')

    folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
    rec_lists = []
    for train, test in folds:
        _log.info('running training')
        # fit on user/item pairs only; rating values are intentionally dropped
        algo.fit(train.loc[:, ['user', 'item']])
        cands = topn.UnratedCandidates(train)
        _log.info('testing %d users', test.user.nunique())
        # passing `test` attaches held-out ratings to the rec lists
        recs = batch.recommend(algo, test.user.unique(), 100, cands, test)
        rec_lists.append(recs)
    recs = pd.concat(rec_lists)

    user_dcg = recs.groupby('user').rating.apply(lm.dcg)
    dcg = user_dcg.mean()
    assert dcg >= 0.1
def test_sample_users_frac():
    """sample_users draws 5 disjoint 100-user samples, holding out ~20% per user."""
    ratings = lktu.ml_test.ratings
    splits = list(xf.sample_users(ratings, 5, 100, xf.SampleFrac(0.2)))
    assert len(splits) == 5

    per_user = ratings.groupby('user').item.count()
    expected = per_user * 0.2
    for split in splits:
        test_counts = split.test.groupby('user').item.count()
        # exactly 100 users sampled per split
        assert len(test_counts) == 100
        # each user's test size is within one row of 20% of their ratings
        assert all(test_counts >= expected.loc[test_counts.index] - 1)
        assert all(test_counts <= expected.loc[test_counts.index] + 1)
        # train and test exactly partition the original frame
        assert all(split.test.index.union(split.train.index) == ratings.index)
        assert len(split.test) + len(split.train) == len(ratings)

    # no user may appear in more than one sample's test set
    for s1, s2 in it.combinations(splits, 2):
        overlap = np.intersect1d(s1.test.user.unique(), s2.test.user.unique())
        assert len(overlap) == 0