def test_sweep_oneshot(tmp_path):
    """Running a single numbered job with combine=False writes only per-run files."""
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, combine=False)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)),
                       name='ml-small')
    sweep.add_algorithms(Bias(damping=5))

    try:
        sweep.run(3)
    finally:
        # dump the run table (if any) to help diagnose failures
        if (work / 'runs.csv').exists():
            print(pd.read_csv(work / 'runs.csv'))

    # no combined output files should have been produced
    for combined in ('runs.csv', 'runs.parquet',
                     'predictions.parquet', 'recommendations.parquet'):
        assert not (work / combined).exists()

    # only the artifacts for run #3 should exist
    assert (work / 'run-3.json').exists()
    assert (work / 'predictions-3.parquet').exists()
    assert (work / 'recommendations-3.parquet').exists()

    with (work / 'run-3.json').open() as f:
        run = json.load(f)
    assert run['RunId'] == 3
def test_sweep_filenames(tmp_path):
    """Datasets may be supplied as (train, test) CSV file-name pairs."""
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    # save each partition to CSV and hand the sweep the file names
    folds = []
    parts = xf.partition_users(ratings, 2, xf.SampleN(5))
    for part, (train, test) in enumerate(parts):
        train_file = work / 'p{}-train.csv'.format(part)
        test_file = work / 'p{}-test.csv'.format(part)
        train.to_csv(train_file)
        test.to_csv(test_file)
        folds.append((train_file, test_file))
    sweep.add_datasets(folds, DataSet='ml-small')

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            print(pd.read_csv(work / 'runs.csv'))

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms x 2 partitions
    assert len(runs) == 8
def test_sweep_save(tmp_path):
    """A sweep with persisted data survives a pickle round-trip and still runs."""
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)),
                       name='ml-small')
    sweep.add_algorithms(Bias(damping=5))
    sweep.persist_data()

    # round-trip the configured sweep through pickle before running it
    pf = work / 'sweep.dat'
    with pf.open('wb') as f:
        pickle.dump(sweep, f)
    with pf.open('rb') as f:
        sweep = pickle.load(f)

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            print(pd.read_csv(work / 'runs.csv'))

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 1 algorithm x 5 partitions
    assert len(runs) == 5
def test_sweep_nopreds(tmp_path):
    """Test data with the rating column dropped yields recommendations but no
    predictions file.

    Fix: removed the unused local ``bias_runs`` that was computed but never read.
    """
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, eval_n_jobs=1)

    ratings = ml_test.ratings
    # strip ratings from each test set so there is no prediction truth
    folds = [(train, test.drop(columns=['rating']))
             for (train, test) in xf.partition_users(ratings, 5, xf.SampleN(5))]
    sweep.add_datasets(folds, DataSet='ml-small')
    sweep.add_algorithms(Popular())
    sweep.add_algorithms(Bias(damping=0))

    try:
        sweep.run()
    finally:
        # dump the run table (if any) to help diagnose failures
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    # without a rating column, no predictions output is written
    assert not (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 2 algorithms x 5 partitions
    assert len(runs) == 10
    assert all(np.sort(runs.AlgoClass.unique()) == ['Bias', 'Popular'])

    recs = pd.read_parquet(work / 'recommendations.parquet')
    assert all(recs.RunId.isin(runs.RunId))
    assert recs['score'].dtype == np.float64
def test_sweep_norecs(tmp_path):
    """With recommend=None, predictions are written but recommendations are not."""
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, recommend=None)

    ratings = ml_test.ratings
    folds = xf.partition_users(ratings, 5, xf.SampleN(5))
    sweep.add_datasets(folds, DataSet='ml-small')

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            print(pd.read_csv(work / 'runs.csv'))

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms x 5 partitions
    assert len(runs) == 20
    assert all(np.sort(runs.AlgoClass.unique()) == ['Bias', 'Popular'])

    # the damping attribute is recorded for Bias runs only
    bias_runs = runs[runs.AlgoClass == 'Bias']
    assert all(bias_runs.damping.notna())
    pop_runs = runs[runs.AlgoClass == 'Popular']
    assert all(pop_runs.damping.isna())

    # every prediction row belongs to one of the Bias runs
    preds = pd.read_parquet(work / 'predictions.parquet')
    assert all(preds.RunId.isin(bias_runs.RunId))
def test_sweep_combine(tmp_path):
    """With combine=False, per-run outputs are written and merged only by
    an explicit collect_results() call."""
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, combine=False)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)),
                       name='ml-small')
    sweep.add_algorithms([Bias(damping=0), Bias(damping=5)], attrs=['damping'])
    sweep.add_algorithms(Popular())

    sweep.persist_data()

    # persisting materializes each partition as a pair of parquet files
    for i in range(1, 6):
        assert (work / 'ds{}-train.parquet'.format(i)).exists()
        assert (work / 'ds{}-test.parquet'.format(i)).exists()
    for ds, cf, dsa in sweep.datasets:
        assert isinstance(ds, tuple)
        train, test = ds
        assert isinstance(train, pathlib.Path)
        assert isinstance(test, pathlib.Path)

    assert sweep.run_count() == 5 * 3

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            print(pd.read_csv(work / 'runs.csv'))

    # nothing combined yet - collect_results() has not been called
    for combined in ('runs.csv', 'runs.parquet',
                     'predictions.parquet', 'recommendations.parquet'):
        assert not (work / combined).exists()

    for i, (ds, a) in enumerate(sweep._flat_runs()):
        run = i + 1
        assert (work / 'run-{}.json'.format(run)).exists()
        if isinstance(a.algorithm, Predictor):
            assert (work / 'predictions-{}.parquet'.format(run)).exists()
        assert (work / 'recommendations-{}.parquet'.format(run)).exists()

    sweep.collect_results()

    # now the combined outputs exist
    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()
    runs = pd.read_parquet(work / 'runs.parquet')
    assert len(runs) == 5 * 3
def test_save_models(tmp_path, format):
    """Models are saved in the requested `format` and reload to the right class.

    `format` is expected to be True (pickle), 'gzip', or 'joblib'.
    """
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, save_models=format)
    sweep.add_algorithms(Bias(5))
    sweep.add_algorithms(Popular())

    ratings = ml_test.ratings
    sweep.add_datasets(lambda: xf.sample_users(ratings, 2, 100, xf.SampleN(5)),
                       name='ml-small')

    sweep.run()

    runs = pd.read_parquet(fspath(tmp_path / 'runs.parquet'))
    runs = runs.set_index('RunId')

    # 2 algorithms x 2 samples -> runs 1..4, each with a saved model
    for run_id in range(1, 5):
        base = work / 'model-{}'.format(run_id)
        if format is True:
            # plain pickle file
            fn = base.with_suffix('.pkl')
            assert fn.exists()
            with fn.open('rb') as f:
                algo = pickle.load(f)
        elif format == 'gzip':
            fn = base.with_suffix('.pkl.gz')
            assert fn.exists()
            with gzip.open(fspath(fn), 'rb') as f:
                algo = pickle.load(f)
        elif format == 'joblib':
            fn = base.with_suffix('.jlpkl')
            assert fn.exists()
            algo = joblib.load(fn)
        else:
            assert False

        assert algo is not None
        algo_class = algo.__class__.__name__
        if isinstance(algo, TopN):
            # the run log records the wrapped predictor's class
            algo_class = algo.predictor.__class__.__name__
        assert algo_class == runs.loc[run_id, 'AlgoClass']
def test_sweep_persist(tmp_path):
    """persist_data() writes partitions to disk and replaces them with path pairs."""
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)),
                       name='ml-small')
    sweep.persist_data()

    # each of the 5 partitions is saved as a train/test parquet pair
    for i in range(1, 6):
        assert (work / 'ds{}-train.parquet'.format(i)).exists()
        assert (work / 'ds{}-test.parquet'.format(i)).exists()
    for ds, cf, dsa in sweep.datasets:
        assert isinstance(ds, tuple)
        train, test = ds
        assert isinstance(train, pathlib.Path)
        assert isinstance(test, pathlib.Path)

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            print(pd.read_csv(work / 'runs.csv'))

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms x 5 partitions
    assert len(runs) == 20
def algo_eval(path, algo, dataset):
    """Run a batch evaluation of `algo` on `dataset`, writing results under `path`.

    Predictions are disabled (predict=False); 100 recommendations are
    produced per user.
    """
    ev = batch.MultiEval(path=path, predict=False, recommend=100)
    ev.add_algorithms(algos=algo)
    ev.add_datasets(data=dataset)
    ev.run()
# Batch-evaluation script: sweep kNN and FunkSVD algorithms over a 5-fold
# user partition of the MovieLens ratings.
import pandas as pd
from itertools import tee  # FIX: tee() was used below but never imported

from lenskit import batch
from lenskit import crossfold as xf
from lenskit.algorithms import funksvd, item_knn, user_knn
from lenskit.metrics import topn

ratings = pd.read_csv('data/ratings.csv')
ratings.rename({'userId': 'user', 'movieId': 'item'}, axis='columns', inplace=True)
print(ratings.head())

# duplicate the crossfold generator: one copy feeds the batch runner, the
# other is consumed here to assemble the truth data for later evaluation
xf_dataset_batch, xf_dataset_test = tee(
    xf.partition_users(ratings[['user', 'item', 'rating']], 5, xf.SampleFrac(0.2)))
truth = pd.concat([test for _, test in xf_dataset_test], ignore_index=True)

runner = batch.MultiEval('result', False, nprocs=4)
# item-item kNN at several neighborhood sizes (nnbrs recorded per run)
runner.add_algorithms(
    [item_knn.ItemItem(10), item_knn.ItemItem(20), item_knn.ItemItem(30)],
    False, ['nnbrs'])
# user-user kNN at several neighborhood sizes
runner.add_algorithms(
    [user_knn.UserUser(10), user_knn.UserUser(20), user_knn.UserUser(30)],
    True, ['nnbrs'])
# FunkSVD with varying feature counts and damping
runner.add_algorithms(
    [funksvd.FunkSVD(40, damping=0), funksvd.FunkSVD(50, damping=5),
     funksvd.FunkSVD(60, damping=10)],
    False, ['features', 'damping'])
# NOTE(review): this fragment references names defined elsewhere in the file
# (als, funksvd, BPR, HPF, train_test_split, ratings, NUM_OF_RECS, random,
# popular, item_to_item_*, user_to_user_*, biased_mf_50/100) -- confirm they
# are all in scope before this point.

# Matrix-factorization recommenders at increasing factor counts.
biased_mf_200 = als.BiasedMF(200)
implicit_mf_50 = als.ImplicitMF(50)
implicit_mf_100 = als.ImplicitMF(100)
implicit_mf_200 = als.ImplicitMF(200)
funk_svd_mf_50 = funksvd.FunkSVD(50)
funk_svd_mf_100 = funksvd.FunkSVD(100)
funk_svd_mf_200 = funksvd.FunkSVD(200)
# Bayesian personalized ranking and hierarchical Poisson factorization models.
bayesian = BPR()
hierarchical_poisson_fact_50 = HPF(50)
hierarchical_poisson_fact_100 = HPF(100)
hierarchical_poisson_fact_200 = HPF(200)

# Single 80/20 hold-out split of the rating triples.
train, test = train_test_split(ratings[['user', 'item', 'rating']], test_size=0.2)

# HACK: `eval` shadows the builtin of the same name; kept as-is because code
# outside this view may refer to it.
eval = batch.MultiEval('../recs/cf', recommend=NUM_OF_RECS)
eval.add_datasets((train, test), name='ml-1m')
# Register each candidate algorithm under a readable run name.
eval.add_algorithms(random, name='random')
eval.add_algorithms(popular, name='popular')
eval.add_algorithms(item_to_item_100, name='item_to_item_100')
eval.add_algorithms(item_to_item_200, name='item_to_item_200')
eval.add_algorithms(item_to_item_500, name='item_to_item_500')
eval.add_algorithms(user_to_user_100, name='user_to_user_100')
eval.add_algorithms(user_to_user_200, name='user_to_user_200')
eval.add_algorithms(user_to_user_500, name='user_to_user_500')
eval.add_algorithms(biased_mf_50, name='biased_mf_50')
eval.add_algorithms(biased_mf_100, name='biased_mf_100')
eval.add_algorithms(biased_mf_200, name='biased_mf_200')
eval.add_algorithms(implicit_mf_50, name='implicit_mf_50')
eval.add_algorithms(implicit_mf_100, name='implicit_mf_100')
eval.add_algorithms(implicit_mf_200, name='implicit_mf_200')