def test_implicit_als_batch_accuracy():
    import lenskit.crossfold as xf
    from lenskit import batch, topn

    ratings = lktu.ml100k.ratings

    algo_t = ALS(25)

    def eval(train, test):
        _log.info('running training')
        train['rating'] = train.rating.astype(np.float_)
        algo = util.clone(algo_t)
        algo.fit(train)
        users = test.user.unique()
        _log.info('testing %d users', len(users))
        recs = batch.recommend(algo, users, 100)
        return recs

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    test = pd.concat(f.test for f in folds)

    recs = pd.concat(eval(train, test) for (train, test) in folds)

    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    dcg = results.ndcg
    _log.info('nDCG for %d users is %.4f', len(dcg), dcg.mean())
    assert dcg.mean() > 0
def test_als_implicit_batch_accuracy():
    import lenskit.crossfold as xf
    from lenskit import batch
    from lenskit import topn

    ratings = lktu.ml100k.load_ratings()

    algo = als.ImplicitMF(25, iterations=20)

    def eval(train, test):
        _log.info('running training')
        train['rating'] = train.rating.astype(np.float_)
        algo.fit(train)
        users = test.user.unique()
        _log.info('testing %d users', len(users))
        candidates = topn.UnratedCandidates(train)
        recs = batch.recommend(algo, users, 100, candidates)
        return recs

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    test = pd.concat(te for (tr, te) in folds)

    recs = pd.concat(eval(train, test) for (train, test) in folds)

    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    _log.info('nDCG for users is %.4f', results.ndcg.mean())
    assert results.ndcg.mean() > 0
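# The two tests above share a common crossfold-and-evaluate shape. Below is a
# minimal sketch of that pattern, assuming only the LensKit 0.x API used in
# this file (partition_users, batch.recommend, RecListAnalysis); the helper
# name `evaluate_ndcg` is hypothetical, not part of LensKit.
def evaluate_ndcg(algo, ratings, n=100):
    import pandas as pd
    import lenskit.crossfold as xf
    from lenskit import batch, topn, util
    from lenskit.algorithms import Recommender

    all_recs = []
    all_test = []
    for train, test in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)):
        # adapt a fresh clone to a Recommender and fit on this fold's training data
        fit = Recommender.adapt(util.clone(algo)).fit(train)
        all_recs.append(batch.recommend(fit, test.user.unique(), n))
        all_test.append(test)

    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(pd.concat(all_recs, ignore_index=True),
                          pd.concat(all_test, ignore_index=True))
    return results.ndcg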
def test_fsvd_batch_accuracy():
    from lenskit.algorithms import basic
    from lenskit.algorithms import bias
    import lenskit.crossfold as xf
    from lenskit import batch
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings

    svd_algo = svd.FunkSVD(25, 125, damping=10)
    algo = basic.Fallback(svd_algo, bias.Bias(damping=10))

    def eval(train, test):
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.predict(algo, test)

    folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
    preds = pd.concat(eval(train, test) for (train, test) in folds)
    mae = pm.mae(preds.prediction, preds.rating)
    assert mae == approx(0.74, abs=0.025)

    user_rmse = preds.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))
    assert user_rmse.mean() == approx(0.92, abs=0.05)
def test_tf_bmf_batch_accuracy(tf_session):
    from lenskit.algorithms import basic
    from lenskit.algorithms import bias
    import lenskit.crossfold as xf
    from lenskit import batch
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings

    algo = lktf.BiasedMF(25, damping=10, batch_size=1024, epochs=20, rng_spec=42)
    algo = basic.Fallback(algo, bias.Bias(damping=10))

    def eval(train, test):
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.predict(algo, test)

    folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
    preds = pd.concat(eval(train, test) for (train, test) in folds)
    mae = pm.mae(preds.prediction, preds.rating)
    assert mae == approx(0.83, abs=0.025)

    user_rmse = preds.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))
    assert user_rmse.mean() == approx(1.03, abs=0.05)
def test_sweep_oneshot(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, combine=False)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)), name='ml-small')
    sweep.add_algorithms(Bias(damping=5))

    try:
        sweep.run(3)
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert not (work / 'runs.csv').exists()
    assert not (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()
    assert (work / 'run-3.json').exists()
    assert (work / 'predictions-3.parquet').exists()
    assert (work / 'recommendations-3.parquet').exists()

    with (work / 'run-3.json').open() as f:
        run = json.load(f)
    assert run['RunId'] == 3
def test_sweep_nopreds(tmp_path):
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, eval_n_jobs=1)

    ratings = ml_test.ratings
    folds = [(train, test.drop(columns=['rating']))
             for (train, test) in xf.partition_users(ratings, 5, xf.SampleN(5))]
    sweep.add_datasets(folds, DataSet='ml-small')
    sweep.add_algorithms(Popular())
    sweep.add_algorithms(Bias(damping=0))

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 2 algorithms by 5 partitions
    assert len(runs) == 10
    assert all(np.sort(runs.AlgoClass.unique()) == ['Bias', 'Popular'])
    bias_runs = runs[runs.AlgoClass == 'Bias']

    recs = pd.read_parquet(work / 'recommendations.parquet')
    assert all(recs.RunId.isin(runs.RunId))
    assert recs['score'].dtype == np.float64
def test_tf_bpr_batch_accuracy(tf_session):
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    from lenskit import batch, topn

    ratings = lktu.ml100k.ratings

    algo = lktf.BPR(20, batch_size=1024, epochs=20, rng_spec=42)
    algo = Recommender.adapt(algo)

    all_recs = []
    all_test = []
    for train, test in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)):
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        recs = batch.recommend(algo, np.unique(test.user), 50)
        all_recs.append(recs)
        all_test.append(test)

    _log.info('analyzing results')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    rla.add_metric(topn.recip_rank)
    scores = rla.compute(pd.concat(all_recs, ignore_index=True),
                         pd.concat(all_test, ignore_index=True),
                         include_missing=True)
    scores.fillna(0, inplace=True)
    _log.info('MRR: %f', scores['recip_rank'].mean())
    _log.info('nDCG: %f', scores['ndcg'].mean())
    assert scores['ndcg'].mean() > 0.1
def test_bias_batch_predict(ncpus):
    from lenskit.algorithms import bias
    import lenskit.crossfold as xf
    from lenskit import batch
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings

    algo = bias.Bias(damping=5)

    def eval(train, test):
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        recs = batch.predict(algo, test, n_jobs=ncpus)
        return recs

    preds = pd.concat(eval(train, test) for (train, test)
                      in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))

    _log.info('analyzing predictions')
    rmse = pm.rmse(preds.prediction, preds.rating)
    _log.info('RMSE is %f', rmse)
    assert rmse == pytest.approx(0.95, abs=0.1)
def test_als_batch_accuracy():
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings

    lu_algo = als.BiasedMF(25, iterations=20, damping=5, method='lu')
    cd_algo = als.BiasedMF(25, iterations=25, damping=5, method='cd')
    # algo = basic.Fallback(svd_algo, basic.Bias(damping=5))

    def eval(train, test):
        _log.info('training LU')
        lu_algo.fit(train)
        _log.info('training CD')
        cd_algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return test.assign(lu_pred=lu_algo.predict(test), cd_pred=cd_algo.predict(test))

    folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
    preds = pd.concat(eval(train, test) for (train, test) in folds)
    preds['abs_diff'] = np.abs(preds.lu_pred - preds.cd_pred)
    _log.info('predictions:\n%s', preds.sort_values('abs_diff', ascending=False))
    _log.info('diff summary:\n%s', preds.abs_diff.describe())

    lu_mae = pm.mae(preds.lu_pred, preds.rating)
    assert lu_mae == approx(0.73, abs=0.025)
    cd_mae = pm.mae(preds.cd_pred, preds.rating)
    assert cd_mae == approx(0.73, abs=0.025)

    user_rmse = preds.groupby('user').apply(lambda df: pm.rmse(df.lu_pred, df.rating))
    assert user_rmse.mean() == approx(0.91, abs=0.05)
    user_rmse = preds.groupby('user').apply(lambda df: pm.rmse(df.cd_pred, df.rating))
    assert user_rmse.mean() == approx(0.91, abs=0.05)
def test_bias_batch_recommend():
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    from lenskit import batch, topn

    if not os.path.exists('ml-100k/u.data'):
        pytest.skip('ml-100k data not available')
    ratings = pd.read_csv('ml-100k/u.data', sep='\t',
                          names=['user', 'item', 'rating', 'timestamp'])

    algo = basic.Bias(damping=5)
    algo = TopN(algo)

    def eval(train, test):
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        recs = batch.recommend(algo, test.user.unique(), 100)
        return recs

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    test = pd.concat(y for (x, y) in folds)

    recs = pd.concat(eval(train, test) for (train, test) in folds)

    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    dcg = results.ndcg
    _log.info('nDCG for %d users is %f (max=%f)', len(dcg), dcg.mean(), dcg.max())
    assert dcg.mean() > 0
def test_algorithms():
    # data = MovieLens('ml-latest-small')
    data = ML1M('ml-1m')
    ratings = data.ratings
    print('Initial ratings table head:')
    print(ratings.head())

    algorithms = [
        basic.Bias(damping=5),
        basic.Popular(),
        item_knn.ItemItem(20),
        user_knn.UserUser(20),
        als.BiasedMF(50),
        als.ImplicitMF(50),
        funksvd.FunkSVD(50)
    ]

    pairs = list(partition_users(ratings[['user', 'item', 'rating']], 5, SampleFrac(0.2)))
    eval_algorithms(dataset=pairs, algorithms=algorithms)
    runs = display_runs()
    recs = display_recommendations()
    truth = pd.concat((p.test for p in pairs), ignore_index=True)
    ndcg_means = check_recommendations(runs, recs, truth)
    print('NDCG means:')
    print(ndcg_means)
    plot_comparison(ndcg_means)
def test_sweep_save(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)), name='ml-small')
    sweep.add_algorithms(Bias(damping=5))

    sweep.persist_data()
    pf = work / 'sweep.dat'
    with pf.open('wb') as f:
        pickle.dump(sweep, f)
    with pf.open('rb') as f:
        sweep = pickle.load(f)

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 1 algorithm by 5 partitions
    assert len(runs) == 5
def test_uu_implicit_batch_accuracy():
    from lenskit import batch, topn
    import lenskit.crossfold as xf

    ratings = lktu.ml100k.ratings

    algo = knn.UserUser(30, center=False, aggregate='sum')

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    all_test = pd.concat(f.test for f in folds)

    rec_lists = []
    for train, test in folds:
        _log.info('running training')
        rec_algo = Recommender.adapt(algo)
        rec_algo.fit(train.loc[:, ['user', 'item']])
        _log.info('testing %d users', test.user.nunique())
        recs = batch.recommend(rec_algo, test.user.unique(), 100, n_jobs=2)
        rec_lists.append(recs)
    recs = pd.concat(rec_lists)

    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, all_test)
    user_dcg = results.ndcg

    dcg = user_dcg.mean()
    assert dcg >= 0.03
def test_sweep_filenames(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    folds = []
    for part, (train, test) in enumerate(xf.partition_users(ratings, 2, xf.SampleN(5))):
        trfn = work / 'p{}-train.csv'.format(part)
        tefn = work / 'p{}-test.csv'.format(part)
        train.to_csv(trfn)
        test.to_csv(tefn)
        folds.append((trfn, tefn))
    sweep.add_datasets(folds, DataSet='ml-small')

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 2 partitions
    assert len(runs) == 8
def test_sweep_norecs(tmp_path):
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, recommend=None)

    ratings = ml_test.ratings
    folds = xf.partition_users(ratings, 5, xf.SampleN(5))
    sweep.add_datasets(folds, DataSet='ml-small')

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 5 partitions
    assert len(runs) == 20
    assert all(np.sort(runs.AlgoClass.unique()) == ['Bias', 'Popular'])

    bias_runs = runs[runs.AlgoClass == 'Bias']
    assert all(bias_runs.damping.notna())
    pop_runs = runs[runs.AlgoClass == 'Popular']
    assert all(pop_runs.damping.isna())

    preds = pd.read_parquet(work / 'predictions.parquet')
    assert all(preds.RunId.isin(bias_runs.RunId))
def test_ii_batch_recommend(ncpus):
    import lenskit.crossfold as xf
    from lenskit import topn

    ratings = lktu.ml100k.ratings

    def eval(train, test):
        _log.info('running training')
        algo = knn.ItemItem(30)
        algo = Recommender.adapt(algo)
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        recs = batch.recommend(algo, test.user.unique(), 100, n_jobs=ncpus)
        return recs

    test_frames = []
    recs = []
    for train, test in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)):
        test_frames.append(test)
        recs.append(eval(train, test))
    test = pd.concat(test_frames)
    recs = pd.concat(recs)

    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    dcg = results.ndcg
    _log.info('nDCG for %d users is %f', len(dcg), dcg.mean())
    assert dcg.mean() > 0.03
def test_batch_rmse():
    import lenskit.crossfold as xf
    import lenskit.batch as batch
    import lenskit.algorithms.basic as bl
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings
    algo = bl.Bias(damping=5)

    def eval(train, test):
        algo.fit(train)
        preds = batch.predict(algo, test)
        return preds.set_index(['user', 'item'])

    results = pd.concat(eval(train, test) for (train, test)
                        in xf.partition_users(ratings, 5, xf.SampleN(5)))

    user_rmse = results.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))

    # we should have all users
    users = ratings.user.unique()
    assert len(user_rmse) == len(users)
    missing = np.setdiff1d(users, user_rmse.index)
    assert len(missing) == 0

    # we should not have any missing values
    assert all(user_rmse.notna())

    # we should have a reasonable mean
    assert user_rmse.mean() == approx(0.93, abs=0.05)
def test_hpf_batch_accuracy():
    import lenskit.crossfold as xf
    from lenskit import batch, topn
    import lenskit.metrics.topn as lm

    ratings = lktu.ml100k.load_ratings()

    algo = hpf.HPF(25)

    def eval(train, test):
        _log.info('running training')
        train['rating'] = train.rating.astype(np.float_)
        algo.fit(train)
        users = test.user.unique()
        _log.info('testing %d users', len(users))
        candidates = topn.UnratedCandidates(train)
        recs = batch.recommend(algo, users, 100, candidates, test)
        return recs

    folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
    recs = pd.concat(eval(train, test) for (train, test) in folds)

    _log.info('analyzing recommendations')
    dcg = recs.groupby('user').rating.apply(lm.dcg)
    _log.info('dcg for users is %.4f', dcg.mean())
    assert dcg.mean() > 0
def test_ii_batch_recommend(ncpus):
    import lenskit.crossfold as xf
    from lenskit import batch, topn

    if not os.path.exists('ml-100k/u.data'):
        pytest.skip('ml-100k data not available')
    ratings = pd.read_csv('ml-100k/u.data', sep='\t',
                          names=['user', 'item', 'rating', 'timestamp'])

    def eval(train, test):
        _log.info('running training')
        algo = knn.ItemItem(30)
        algo = Recommender.adapt(algo)
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        recs = batch.recommend(algo, test.user.unique(), 100, n_jobs=ncpus)
        return recs

    test_frames = []
    recs = []
    for train, test in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)):
        test_frames.append(test)
        recs.append(eval(train, test))
    test = pd.concat(test_frames)
    recs = pd.concat(recs)

    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    dcg = results.ndcg
    _log.info('nDCG for %d users is %f', len(dcg), dcg.mean())
    assert dcg.mean() > 0.03
def test_ii_batch_accuracy():
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    from lenskit import batch
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings

    ii_algo = knn.ItemItem(30)
    algo = basic.Fallback(ii_algo, basic.Bias())

    def eval(train, test):
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.predict(algo, test, n_jobs=4)

    preds = pd.concat(eval(train, test) for (train, test)
                      in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    mae = pm.mae(preds.prediction, preds.rating)
    assert mae == approx(0.70, abs=0.025)

    user_rmse = preds.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))
    assert user_rmse.mean() == approx(0.90, abs=0.05)
def test_partition_users():
    """Partitioning ratings when dataframe has non-unique indices"""
    ratings = lktu.ml_test.ratings
    ratings = ratings.set_index('user')  # forces non-unique index
    with pytest.raises(ValueError):
        for split in xf.partition_users(ratings, 5, xf.SampleN(5)):
            pass
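# A hypothetical companion check (not in the original suite), assuming the same
# module-level imports (lktu, xf): restoring a unique index lets
# partition_users succeed again.
def check_partition_users_unique_index():
    # reset_index() moves 'user' back to a column and rebuilds a unique RangeIndex
    ratings = lktu.ml_test.ratings.set_index('user').reset_index()
    splits = list(xf.partition_users(ratings, 5, xf.SampleN(5)))
    assert len(splits) == 5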
def recommend(algo_wrappers, ratings):
    all_recs = []
    test_data = []
    for train, test in xf.partition_users(ratings[['user', 'item', 'rating']], 5, xf.SampleFrac(0.2)):
        test_data.append(test)
        for algo_wrapper in algo_wrappers:
            all_recs.append(do_recommend(algo_wrapper, train, test))
    return all_recs, test_data
class LegMedLensKit():
    def loadData():
        ratings = pd.read_csv('/Users/josse/Desktop/ratings.dat', sep='::',
                              names=['user', 'item', 'rating', 'timestamp'],
                              engine='python')  # multi-character separator needs the python engine
        print(ratings.head())
        return ratings

    ratings = loadData()
    data_matrix = np.array(ratings.pivot(index='item', columns='user', values='rating'))
    print(data_matrix)
    data_matrix_rev = np.nan_to_num(data_matrix)
    print(data_matrix_rev)

    algo_ii = knn.ItemItem(20)
    algo_als = als.BiasedMF(50)

    def eval(aname, algo, train, test):
        fittable = util.clone(algo)
        fittable = Recommender.adapt(fittable)
        fittable.fit(train)
        users = test.user.unique()
        # now we run the recommender
        recs = batch.recommend(fittable, users, 100)
        # add the algorithm name for analyzability
        recs['Algorithm'] = aname
        print(recs.head())
        return recs

    all_recs = []
    test_data = []
    for train, test in xf.partition_users(ratings[['user', 'item', 'rating']], 1, xf.SampleFrac(0.2)):
        test_data.append(test)
        all_recs.append(eval('ItemItem', algo_ii, train, test))
        all_recs.append(eval('ALS', algo_als, train, test))

    all_recs = pd.concat(all_recs, ignore_index=True)  # concatenate before inspecting with .head()
    print(all_recs.head())
    test_data = pd.concat(test_data, ignore_index=True)

    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(all_recs, test_data)
    results.head()
    results.groupby('Algorithm').ndcg.mean()
    results.groupby('Algorithm').ndcg.mean().plot.bar()
def test_sweep_combine(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, combine=False)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)), name='ml-small')

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5)], attrs=['damping'])
    sweep.add_algorithms(Popular())

    sweep.persist_data()

    for i in range(1, 6):
        assert (work / 'ds{}-train.parquet'.format(i)).exists()
        assert (work / 'ds{}-test.parquet'.format(i)).exists()

    for ds, cf, dsa in sweep.datasets:
        assert isinstance(ds, tuple)
        train, test = ds
        assert isinstance(train, pathlib.Path)
        assert isinstance(test, pathlib.Path)

    assert sweep.run_count() == 5 * 3

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert not (work / 'runs.csv').exists()
    assert not (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    for i, (ds, a) in enumerate(sweep._flat_runs()):
        run = i + 1
        assert (work / 'run-{}.json'.format(run)).exists()
        if isinstance(a.algorithm, Predictor):
            assert (work / 'predictions-{}.parquet'.format(run)).exists()
        assert (work / 'recommendations-{}.parquet'.format(run)).exists()

    sweep.collect_results()
    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()
    runs = pd.read_parquet(work / 'runs.parquet')
    assert len(runs) == 5 * 3
def eval_algos(ratings, algorithms):
    all_recs = []
    test_data = []
    for train, test in partition_users(ratings[['user', 'item', 'rating']], 5, SampleFrac(0.2)):
        test_data.append(test)
        for key, value in algorithms.items():
            all_recs.append(eval(key, value, train, test))

    all_recs = pd.concat(all_recs, ignore_index=True)
    print('Algorithms\' results table head:')
    print(all_recs.head())
    test_data = pd.concat(test_data, ignore_index=True)
    return all_recs, test_data
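# The `eval(key, value, train, test)` helper above is defined elsewhere in the
# script this function comes from. A plausible minimal stand-in, assuming it
# fits a clone of the algorithm and returns top-100 recommendations tagged
# with the algorithm name (mirroring the eval() used in LegMedLensKit above):
def eval(aname, algo, train, test):
    from lenskit import batch, util
    from lenskit.algorithms import Recommender

    # clone so repeated folds do not share fitted state, then adapt to a Recommender
    fittable = Recommender.adapt(util.clone(algo))
    fittable.fit(train)
    recs = batch.recommend(fittable, test.user.unique(), 100)
    recs['Algorithm'] = aname  # tag for per-algorithm analysis
    return recs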
def create_train_test_rec_data(self):
    # For now, no cross-validation, just split the data into 1 train and 1 test set.
    for i, tp in enumerate(xf.partition_users(data=self.data_dense, partitions=1,
                                              method=xf.SampleN(5), rng_spec=1)):
        train = tp.train
        test = tp.test
        train.to_csv(f'{conf.SYN_DATA_DIR}syn_train_{self.current_date}.csv')
        test.to_csv(f'{conf.SYN_DATA_DIR}syn_test_{self.current_date}.csv')

    return train, test
def run(self, strategy_context: RecommenderAlgorithmStrategyContext) -> np.ndarray:
    data_set_source = strategy_context.data_set_source
    data_frame_reader: DataFrameReaderStrategy = self.data_frame_reader_factory.create(data_set_source)
    data_set: DataFrame = data_frame_reader.parse(DataFrameReaderStrategyContext(data_set_source))

    partition = list(partition_users(data=data_set, partitions=1, method=crossfold.SampleFrac(0.2)))[0]
    test, train = partition.test, partition.train
    number_of_recommendations = strategy_context.number_of_recommendations
    algorithm = Recommender.adapt(Bias())
    trained_algorithm = algorithm.fit(train)

    recommendations = lenskit.batch.recommend(trained_algorithm, test['user'].unique(),
                                              number_of_recommendations)
    return recommendations.groupby('user')['item'].apply(lambda x: x).to_numpy().reshape(
        (-1, number_of_recommendations))
def create_save_train_val_test_rec_data(dense_data, fn):
    # For now, no cross-validation, just split the data into 1 train and 1 test set.
    for i, tp in enumerate(xf.partition_users(data=dense_data, partitions=1,
                                              method=xf.SampleN(5), rng_spec=1)):
        train = tp.train
        test = tp.test
        test.to_csv(f'{conf.SYN_DATA_DIR}syn_test_{fn}.csv')
    print("[INFO] Train/test split created")

    for i, tp in enumerate(xf.partition_users(data=train, partitions=1,
                                              method=xf.SampleN(5), rng_spec=1)):
        train = tp.train
        val = tp.test
        train.to_csv(f'{conf.SYN_DATA_DIR}syn_train_{fn}.csv')
        val.to_csv(f'{conf.SYN_DATA_DIR}syn_val_{fn}.csv')
    print("[INFO] Train/val split created")

    return train, val, test
def test_partition_users():
    ratings = lktu.ml_pandas.renamed.ratings
    splits = xf.partition_users(ratings, 5, xf.SampleN(5))
    splits = list(splits)
    assert len(splits) == 5

    for s in splits:
        ucounts = s.test.groupby('user').agg('count')
        assert all(ucounts == 5)
        assert all(s.test.index.union(s.train.index) == ratings.index)
        assert len(s.test) + len(s.train) == len(ratings)

    users = ft.reduce(lambda us1, us2: us1 | us2, (set(s.test.user) for s in splits))
    assert len(users) == ratings.user.nunique()
    assert users == set(ratings.user)
def test_uu_batch_accuracy():
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings

    uu_algo = knn.UserUser(30)
    algo = basic.Fallback(uu_algo, basic.Bias())

    folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
    preds = [__batch_eval((algo, train, test)) for (train, test) in folds]
    preds = pd.concat(preds)
    mae = pm.mae(preds.prediction, preds.rating)
    assert mae == approx(0.71, abs=0.028)

    user_rmse = preds.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))
    assert user_rmse.mean() == approx(0.91, abs=0.055)
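# `__batch_eval` is not shown in this file; a plausible minimal stand-in,
# assuming it unpacks an (algo, train, test) job, fits the algorithm, and
# returns batch predictions with 'prediction' and 'rating' columns:
def __batch_eval(job):
    from lenskit import batch

    algo, train, test = job
    algo.fit(train)
    return batch.predict(algo, test)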