# Assumed module context for these excerpted tests (reconstructed; the
# suites they are drawn from define equivalents of the following, and
# fixtures such as ml_folds, ncpus, and isolate come from their conftest):
import json
import logging
import pathlib
import pickle

import numpy as np
import pandas as pd
from pytest import approx, raises

from lenskit import batch, util as lku
import lenskit.crossfold as xf
import lenskit.metrics.predict as pm
import lenskit.util.test as lktu
from lenskit.util.test import ml_test
from lenskit.algorithms import Predictor, basic, item_knn as knn
from lenskit.algorithms.basic import Fallback, Popular, TopN
from lenskit.algorithms.bias import Bias

_log = logging.getLogger(__name__)

# Small rating frame used throughout; the values make the global mean 3.5
# and yield the item/user offsets asserted below.
simple_df = pd.DataFrame({'item': [1, 1, 2, 3],
                          'user': [10, 12, 10, 13],
                          'rating': [4.0, 3.0, 5.0, 2.0]})


def test_bias_item_predict():
    algo = Bias(users=False)
    algo.fit(simple_df)

    p = algo.predict_for_user(10, [1, 2, 3])

    assert len(p) == 3
    assert p.values == approx((algo.item_offsets_ + algo.mean_).values)
def test_bias_predict_unknown_user():
    algo = Bias()
    algo.fit(simple_df)

    p = algo.predict_for_user(15, [1, 3])

    assert len(p) == 2
    assert p.values == approx((algo.item_offsets_.loc[[1, 3]] + algo.mean_).values)
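# An unseen user (15 above) contributes no user offset, so the predictions
# reduce to the global mean plus each requested item's offset.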
def test_bias_global_predict():
    algo = Bias(items=False, users=False)
    algo.fit(simple_df)

    p = algo.predict_for_user(10, [1, 2, 3])

    assert len(p) == 3
    assert (p == algo.mean_).all()
    assert p.values == approx(algo.mean_)
def test_bias_no_item():
    algo = Bias(items=False)
    algo.fit(simple_df)

    assert algo.mean_ == approx(3.5)
    assert algo.item_offsets_ is None

    assert algo.user_offsets_ is not None
    assert algo.user_offsets_.index.name == 'user'
    assert set(algo.user_offsets_.index) == set([10, 12, 13])
    assert algo.user_offsets_.loc[[10, 12, 13]].values == approx(np.array([1.0, -0.5, -1.5]))
def test_bias_predict_unknown_item():
    algo = Bias()
    algo.fit(simple_df)

    p = algo.predict_for_user(10, [1, 3, 4])

    assert len(p) == 3
    intended = algo.item_offsets_.loc[[1, 3]] + algo.mean_ + 0.25
    assert p.loc[[1, 3]].values == approx(intended.values)
    assert p.loc[4] == approx(algo.mean_ + 0.25)
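# For items absent from training (item 4 above), Bias has no item offset to
# apply, so the prediction falls back to the global mean plus the user's
# offset (0.25 for user 10 in simple_df).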
def test_bias_no_user():
    algo = Bias(users=False)
    algo.fit(simple_df)

    assert algo.mean_ == approx(3.5)

    assert algo.item_offsets_ is not None
    assert algo.item_offsets_.index.name == 'item'
    assert set(algo.item_offsets_.index) == set([1, 2, 3])
    assert algo.item_offsets_.loc[1:3].values == approx(np.array([0, 1.5, -1.5]))

    assert algo.user_offsets_ is None
def test_bias_user_damp():
    algo = Bias(items=False, damping=5)
    algo.fit(simple_df)

    assert algo.mean_ == approx(3.5)
    assert algo.item_offsets_ is None

    assert algo.user_offsets_ is not None
    assert algo.user_offsets_.index.name == 'user'
    assert set(algo.user_offsets_.index) == set([10, 12, 13])
    assert algo.user_offsets_.loc[[10, 12, 13]].values == \
        approx(np.array([0.2857, -0.08333, -0.25]), abs=1.0e-4)
def test_transform_user_without_user_bias():
    user = 12
    algo = Bias()
    algo.fit(simple_df)

    new_ratings = pd.Series([-0.5, 1.5], index=[2, 3])  # items as index and ratings as values

    v = algo.inverse_transform_user(user, new_ratings)

    assert v[2] == new_ratings[2] + algo.user_offsets_.loc[user] + algo.item_offsets_.loc[2] + algo.mean_
    assert v[3] == new_ratings[3] + algo.user_offsets_.loc[user] + algo.item_offsets_.loc[3] + algo.mean_
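# inverse_transform_user re-adds the fitted bias terms to a normalized
# rating vector: the global mean, each item's offset, and (for a known
# user, as above) the user's own offset.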
def test_bias_check_arguments():
    # negative damping is not allowed
    with raises(ValueError):
        Bias(damping=-1)

    # negative user damping not allowed
    with raises(ValueError):
        Bias(damping=(-1, 5))

    # negative item damping not allowed
    with raises(ValueError):
        Bias(damping=(5, -1))
def test_bias_transform_indexes():
    algo = Bias()
    ratings = ml_test.ratings
    normed = algo.fit_transform(ratings, indexes=True)

    assert all(normed['user'] == ratings['user'])
    assert all(normed['item'] == ratings['item'])
    assert all(normed['uidx'] == algo.user_offsets_.index.get_indexer(ratings['user']))
    assert all(normed['iidx'] == algo.item_offsets_.index.get_indexer(ratings['item']))

    denorm = algo.inverse_transform(normed)
    assert denorm['rating'].values == approx(ratings['rating'].values, 1.0e-6)
def test_bias_clone():
    algo = Bias()
    algo.fit(simple_df)

    params = algo.get_params()
    assert sorted(params.keys()) == ['damping', 'items', 'users']

    a2 = lku.clone(algo)
    assert a2 is not algo
    assert getattr(a2, 'mean_', None) is None
    assert getattr(a2, 'item_offsets_', None) is None
    assert getattr(a2, 'user_offsets_', None) is None
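# clone() copies hyperparameters (damping, items, users) but not fitted
# state, which is why the trained mean and offsets are absent on the copy.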
def test_sweep_combine(tmp_path):
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, combine=False)

    ratings = ml_test.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)), name='ml-small')

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5)], attrs=['damping'])
    sweep.add_algorithms(Popular())

    sweep.persist_data()

    for i in range(1, 6):
        assert (work / 'ds{}-train.parquet'.format(i)).exists()
        assert (work / 'ds{}-test.parquet'.format(i)).exists()

    for ds, cf, dsa in sweep.datasets:
        assert isinstance(ds, tuple)
        train, test = ds
        assert isinstance(train, pathlib.Path)
        assert isinstance(test, pathlib.Path)

    assert sweep.run_count() == 5 * 3

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert not (work / 'runs.csv').exists()
    assert not (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    for i, (ds, a) in enumerate(sweep._flat_runs()):
        run = i + 1
        assert (work / 'run-{}.json'.format(run)).exists()
        if isinstance(a.algorithm, Predictor):
            assert (work / 'predictions-{}.parquet'.format(run)).exists()
        assert (work / 'recommendations-{}.parquet'.format(run)).exists()

    sweep.collect_results()

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    assert len(runs) == 5 * 3
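# With combine=False, MultiEval leaves the per-run outputs (run-N.json,
# predictions-N.parquet, recommendations-N.parquet) on disk and only merges
# them into the combined runs/predictions/recommendations files once
# collect_results() is called, as the assertions above trace.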
def test_bias_new_user_predict():
    algo = Bias()
    algo.fit(simple_df)

    ratings = pd.DataFrame({'item': [1, 2, 3], 'rating': [1.5, 2.5, 3.5]})
    ratings = ratings.set_index('item').rating
    p = algo.predict_for_user(None, [1, 3], ratings=ratings)

    offs = ratings - algo.mean_ - algo.item_offsets_
    umean = offs.mean()
    _log.info('user mean is %f', umean)

    assert len(p) == 2
    assert p.values == approx((algo.mean_ + algo.item_offsets_ + umean).loc[[1, 3]].values)
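# Passing ratings= lets Bias personalize for a user it was never fitted on:
# it subtracts the global mean and item offsets from the supplied ratings
# and uses the mean residual as an ad-hoc user offset, which is what the
# expected values above recompute by hand.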
def test_bias_full():
    algo = Bias()
    algo.fit(simple_df)
    assert algo.mean_ == approx(3.5)

    assert algo.item_offsets_ is not None
    assert algo.item_offsets_.index.name == 'item'
    assert set(algo.item_offsets_.index) == set([1, 2, 3])
    assert algo.item_offsets_.loc[1:3].values == approx(np.array([0, 1.5, -1.5]))

    assert algo.user_offsets_ is not None
    assert algo.user_offsets_.index.name == 'user'
    assert set(algo.user_offsets_.index) == set([10, 12, 13])
    assert algo.user_offsets_.loc[[10, 12, 13]].values == approx(np.array([0.25, -0.5, 0]))
def test_bias_transform():
    algo = Bias()
    ratings = ml_test.ratings
    normed = algo.fit_transform(ratings)

    assert all(normed['user'] == ratings['user'])
    assert all(normed['item'] == ratings['item'])
    denorm = algo.inverse_transform(normed)
    assert denorm['rating'].values == approx(ratings['rating'], 1.0e-6)

    n2 = ratings.join(algo.item_offsets_, on='item')
    n2 = n2.join(algo.user_offsets_, on='user')
    nr = n2.rating - algo.mean_ - n2.i_off - n2.u_off
    assert normed['rating'].values == approx(nr.values)
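# fit_transform returns the ratings with all three bias terms removed; the
# manual recomputation above relies on the fitted offset series being named
# i_off and u_off so the joins line up column-wise.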
def test_bias_damped():
    algo = Bias(damping=5)
    algo.fit(simple_df)
    assert algo.mean_ == approx(3.5)

    assert algo.item_offsets_ is not None
    assert algo.item_offsets_.index.name == 'item'
    assert set(algo.item_offsets_.index) == set([1, 2, 3])
    assert algo.item_offsets_.loc[1:3].values == approx(np.array([0, 0.25, -0.25]))

    assert algo.user_offsets_ is not None
    assert algo.user_offsets_.index.name == 'user'
    assert set(algo.user_offsets_.index) == set([10, 12, 13])
    assert algo.user_offsets_.loc[[10, 12, 13]].values == \
        approx(np.array([0.25, -0.08333, -0.20833]), abs=1.0e-4)
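# Where the damped numbers above come from: Bias shrinks each offset toward
# zero by adding the damping term to the denominator, so
# offset = sum(residuals) / (n + damping). A minimal sketch of that
# computation (illustrative helper, not a LensKit API):
def _damped_offset(residuals, damping):
    # e.g. item 2: one residual of 1.5 -> 1.5 / (1 + 5) = 0.25;
    # user 12: one residual of -0.5 -> -0.5 / (1 + 5) ~= -0.08333
    return sum(residuals) / (len(residuals) + damping)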
def test_sweep_oneshot(tmp_path):
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, combine=False)

    ratings = ml_test.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)), name='ml-small')
    sweep.add_algorithms(Bias(damping=5))

    try:
        sweep.run(3)
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert not (work / 'runs.csv').exists()
    assert not (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    assert (work / 'run-3.json').exists()
    assert (work / 'predictions-3.parquet').exists()
    assert (work / 'recommendations-3.parquet').exists()

    with (work / 'run-3.json').open() as f:
        run = json.load(f)
    assert run['RunId'] == 3
def test_sweep_nopreds(tmp_path):
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, eval_n_jobs=1)

    ratings = ml_test.ratings
    folds = [(train, test.drop(columns=['rating']))
             for (train, test) in xf.partition_users(ratings, 5, xf.SampleN(5))]
    sweep.add_datasets(folds, DataSet='ml-small')
    sweep.add_algorithms(Popular())
    sweep.add_algorithms(Bias(damping=0))

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 2 algorithms by 5 partitions
    assert len(runs) == 10
    assert all(np.sort(runs.AlgoClass.unique()) == ['Bias', 'Popular'])

    recs = pd.read_parquet(work / 'recommendations.parquet')
    assert all(recs.RunId.isin(runs.RunId))
    assert recs['score'].dtype == np.float64
def test_sweep_norecs(tmp_path):
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, recommend=None)

    ratings = ml_test.ratings
    folds = xf.partition_users(ratings, 5, xf.SampleN(5))
    sweep.add_datasets(folds, DataSet='ml-small')
    sweep.add_algorithms(Bias(damping=0))
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 2 algorithms by 5 partitions
    assert len(runs) == 10
    assert all(np.sort(runs.AlgoClass.unique()) == ['Bias', 'Popular'])

    bias_runs = runs[runs.AlgoClass == 'Bias']
    preds = pd.read_parquet(work / 'predictions.parquet')
    assert all(preds.RunId.isin(bias_runs.RunId))
def test_sweep_save(tmp_path):
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_test.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)), name='ml-small')
    sweep.add_algorithms(Bias(damping=5))
    sweep.persist_data()

    pf = work / 'sweep.dat'
    with pf.open('wb') as f:
        pickle.dump(sweep, f)
    with pf.open('rb') as f:
        sweep = pickle.load(f)

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 1 algorithm by 5 partitions
    assert len(runs) == 5
def test_fallback_train_one():
    algo = basic.Fallback(Bias())
    algo.fit(lktu.ml_test.ratings)

    assert len(algo.algorithms) == 1
    assert isinstance(algo.algorithms[0], Bias)
    assert algo.algorithms[0].mean_ == approx(lktu.ml_test.ratings.rating.mean())
def test_bias_separate_damping():
    algo = Bias(damping=(5, 10))
    algo.fit(simple_df)
    assert algo.mean_ == approx(3.5)

    assert algo.item_offsets_ is not None
    assert algo.item_offsets_.index.name == 'item'
    assert set(algo.item_offsets_.index) == set([1, 2, 3])
    assert algo.item_offsets_.loc[1:3].values == \
        approx(np.array([0, 0.136364, -0.13636]), abs=1.0e-4)

    assert algo.user_offsets_ is not None
    assert algo.user_offsets_.index.name == 'user'
    assert set(algo.user_offsets_.index) == set([10, 12, 13])
    assert algo.user_offsets_.loc[[10, 12, 13]].values == \
        approx(np.array([0.266234, -0.08333, -0.22727]), abs=1.0e-4)
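# In the tuple form, the first element damps user offsets and the second
# damps item offsets; e.g. item 2's single residual of 1.5 becomes
# 1.5 / (1 + 10) ~= 0.13636 under damping=(5, 10), matching the assertion above.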
def test_bias_batch_recommend(ml_folds: MLFolds, ncpus, isolate):
    algo = Bias(damping=5)
    algo = TopN(algo)
    ml_folds.isolate = isolate

    recs = ml_folds.eval_all(algo, n_jobs=ncpus)

    ml_folds.check_positive_ndcg(recs)
def test_global_metric():
    import lenskit.crossfold as xf
    import lenskit.batch as batch
    from lenskit.algorithms.bias import Bias

    train, test = next(xf.sample_users(lktu.ml_test.ratings, 1, 200, xf.SampleFrac(0.5)))
    algo = Bias()
    algo.fit(train)

    preds = batch.predict(algo, test)

    rmse = pm.global_metric(preds)
    assert rmse == pm.rmse(preds.prediction, preds.rating)

    mae = pm.global_metric(preds, metric=pm.mae)
    assert mae == pm.mae(preds.prediction, preds.rating)
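# pm.global_metric computes one score over the entire prediction frame;
# RMSE is the default, and another per-prediction metric such as pm.mae can
# be supplied via the metric= argument, as exercised above.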
def _build_predict(ratings, fold):
    algo = Fallback(knn.ItemItem(20), Bias(damping=5))
    train = ratings[ratings['partition'] != fold]
    algo.fit(train)

    test = ratings[ratings['partition'] == fold]
    preds = batch.predict(algo, test, n_jobs=1)
    return preds
def test_bias_train_ml_ratings():
    algo = Bias()
    ratings = ml_test.ratings
    algo.fit(ratings)

    assert algo.mean_ == approx(ratings.rating.mean())

    imeans_data = ratings.groupby('item').rating.mean()
    imeans_algo = algo.item_offsets_ + algo.mean_
    ares, data = imeans_algo.align(imeans_data)
    assert ares.values == approx(data.values)

    urates = ratings.set_index('user').loc[2].set_index('item').rating
    umean = (urates - imeans_data[urates.index]).mean()
    p = algo.predict_for_user(2, [10, 11, -1])
    assert len(p) == 3
    assert p.iloc[0] == approx(imeans_data.loc[10] + umean)
    assert p.iloc[1] == approx(imeans_data.loc[11] + umean)
    assert p.iloc[2] == approx(ratings.rating.mean() + umean)
def test_fallback_list():
    algo = basic.Fallback([basic.Memorized(simple_df), Bias()])
    algo.fit(lktu.ml_test.ratings)
    assert len(algo.algorithms) == 2

    params = algo.get_params()
    assert list(params.keys()) == ['algorithms']
    assert len(params['algorithms']) == 2
    assert isinstance(params['algorithms'][0], basic.Memorized)
    assert isinstance(params['algorithms'][1], Bias)
def test_fallback_clone():
    algo = basic.Fallback([basic.Memorized(simple_df), Bias()])
    algo.fit(lktu.ml_test.ratings)
    assert len(algo.algorithms) == 2

    clone = lku.clone(algo)
    assert clone is not algo
    for a1, a2 in zip(algo.algorithms, clone.algorithms):
        assert a1 is not a2
        assert type(a2) == type(a1)
def test_sweep_filenames(tmp_path):
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_test.ratings
    folds = []
    for part, (train, test) in enumerate(xf.partition_users(ratings, 2, xf.SampleN(5))):
        trfn = work / 'p{}-train.csv'.format(part)
        tefn = work / 'p{}-test.csv'.format(part)
        train.to_csv(trfn)
        test.to_csv(tefn)
        folds.append((trfn, tefn))
    sweep.add_datasets(folds, DataSet='ml-small')

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    def progress(iter, total=None):
        assert total == len(folds) * 4
        return iter

    try:
        sweep.run(progress=progress)
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 2 partitions
    assert len(runs) == 8