Example 1
def test_sample_users():
    """Sampling users when dataframe has non-unique indices"""
    ratings = lktu.ml_test.ratings
    ratings = ratings.set_index('user')  # force a non-unique index
    with pytest.raises(ValueError):
        for split in xf.sample_users(ratings, 5, 100, xf.SampleN(5)):
            pass
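
For contrast, a minimal sketch of the passing case: restoring a unique default index (here via `reset_index`) lets the same call succeed. This workaround is an assumption based on the test above, not code from the original suite.

# Assumed workaround: a fresh RangeIndex makes sample_users usable again.
ratings = lktu.ml_test.ratings.set_index('user').reset_index()
train, test = next(xf.sample_users(ratings, 1, 100, xf.SampleN(5)))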
Example 2
def test_fill_users():
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.precision)
    rla.add_metric(topn.recall)

    algo = UserUser(20, min_nbrs=10)
    algo = Recommender.adapt(algo)

    splits = xf.sample_users(ml_test.ratings, 1, 50, xf.SampleN(5))
    train, test = next(splits)
    algo.fit(train)

    rec_users = test['user'].sample(50).unique()
    recs = batch.recommend(algo, rec_users, 25)

    scores = rla.compute(recs, test, include_missing=True)
    assert len(scores) == test['user'].nunique()
    assert scores['recall'].notna().sum() == len(rec_users)
    assert all(scores['ntruth'] == 5)

    mscores = rla.compute(recs, test)
    assert len(mscores) < len(scores)

    recall = scores.loc[scores['recall'].notna(), 'recall'].copy()
    recall, mrecall = recall.align(mscores['recall'])
    assert all(recall == mrecall)
Example 3
def split_dataset(ratings, user_fraction=.1):
    """Split a dataset in train/test data"""

    n_users = len(ratings['user'].unique())

    # There are many ways to split a dataset into (train, test) data; here are two:
    #   - Row separation: the test set will contain users that the model knows.
    #     The performance of the model will be its ability to predict "new"
    #     tastes for a known user. (A sketch of this variant follows this
    #     function.)
    #   - User separation: the test set will contain users that the model has
    #     never encountered. The performance of the model will be its ability
    #     to predict new users' behaviours given the behaviour of the other,
    #     known users.
    # See the [lkpy documentation](https://lkpy.readthedocs.io/en/stable/crossfold.html).
    # Here the sampling is as follows:
    #   - Sample user_fraction * n_users users
    #   - Randomly select half of their listening events for the test set
    result = list(
        xf.sample_users(ratings[['user', 'item', 'rating']],
                        partitions=1,
                        size=int(n_users * user_fraction),
                        method=xf.SampleFrac(.5)))[0]

    print(f'n test users: {len(result.test["user"].unique())}')

    return result.train, result.test
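
As a complement to the user-separation code above, here is a minimal sketch of the row-separation strategy described in the comments, assuming lkpy's `xf.sample_rows`; the function name `split_dataset_rows` and the fraction handling are illustrative, not part of the original:

def split_dataset_rows(ratings, row_fraction=.1):
    """Hypothetical row-separation variant: test rows belong to known users."""
    n_rows = len(ratings)
    # sample_rows draws individual ratings, so every test user can also
    # appear in the training data
    result = list(
        xf.sample_rows(ratings[['user', 'item', 'rating']],
                       partitions=1,
                       size=int(n_rows * row_fraction)))[0]
    return result.train, result.test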
Example 4
def do_prepare(opts):
    name = opts['-d']
    ml = MovieLens(f'data/{name}')

    train, test = next(sample_users(ml.ratings, 1, 10000, SampleN(5)))

    test.to_parquet(f'data/{name}-test.parquet', index=False)

    _log.info('getting popular recs')
    pop = Popular()
    pop.fit(train)
    pop_recs = recommend(pop, test['user'].unique(), 100)

    _log.info('getting ALS recs')
    als = ImplicitMF(20, iterations=10)
    als = Recommender.adapt(als)
    als.fit(train.drop(columns=['rating']))
    als_recs = recommend(als, test['user'].unique(), 100)

    _log.info('merging recs')
    recs = pd.concat({
        'Popular': pop_recs,
        'ALS': als_recs
    },
                     names=['Algorithm'])
    recs.reset_index('Algorithm', inplace=True)
    recs.to_parquet(f'data/{name}-recs.parquet', index=False)
Example 5
def test_adv_fill_users():
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.precision)
    rla.add_metric(topn.recall)

    a_uu = UserUser(30, min_nbrs=10)
    a_uu = Recommender.adapt(a_uu)
    a_ii = ItemItem(20, min_nbrs=4)
    a_ii = Recommender.adapt(a_ii)

    splits = xf.sample_users(ml_test.ratings, 2, 50, xf.SampleN(5))
    all_recs = {}
    all_test = {}
    for i, (train, test) in enumerate(splits):
        a_uu.fit(train)
        rec_users = test['user'].sample(50).unique()
        all_recs[(i + 1, 'UU')] = batch.recommend(a_uu, rec_users, 25)

        a_ii.fit(train)
        rec_users = test['user'].sample(50).unique()
        all_recs[(i + 1, 'II')] = batch.recommend(a_ii, rec_users, 25)
        all_test[i + 1] = test

    recs = pd.concat(all_recs, names=['part', 'algo'])
    recs.reset_index(['part', 'algo'], inplace=True)
    recs.reset_index(drop=True, inplace=True)

    test = pd.concat(all_test, names=['part'])
    test.reset_index(['part'], inplace=True)
    test.reset_index(drop=True, inplace=True)

    scores = rla.compute(recs, test, include_missing=True)
    inames = scores.index.names
    scores.sort_index(inplace=True)
    assert len(scores) == 50 * 4
    assert all(scores['ntruth'] == 5)
    assert scores['recall'].isna().sum() > 0
    _log.info('scores:\n%s', scores)

    ucounts = scores.reset_index().groupby('algo')['user'].agg(
        ['count', 'nunique'])
    assert all(ucounts['count'] == 100)
    assert all(ucounts['nunique'] == 100)

    mscores = rla.compute(recs, test)
    mscores = mscores.reset_index().set_index(inames)
    mscores.sort_index(inplace=True)
    assert len(mscores) < len(scores)
    _log.info('mscores:\n%s', mscores)

    recall = scores.loc[scores['recall'].notna(), 'recall'].copy()
    recall, mrecall = recall.align(mscores['recall'])
    assert all(recall == mrecall)
Example 6
def test_batch_predict_preshared():
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf

    algo = basic.Bias()
    splits = xf.sample_users(lktu.ml_test.ratings, 1, 100, xf.SampleN(5))
    train, test = next(splits)

    ares = lkb.train_isolated(algo, train)
    preds = lkb.predict(ares, test)
    assert len(preds) == len(test)
    assert not any(preds['prediction'].isna())
Example 7
def test_sample_users_frac_oversize_ndj():
    ratings = lktu.ml_test.ratings
    splits = xf.sample_users(ratings, 20, 100, xf.SampleN(5), disjoint=False)
    splits = list(splits)
    assert len(splits) == 20

    for s in splits:
        ucounts = s.test.groupby('user').agg('count')
        assert len(ucounts) == 100
        assert len(s.test) == 5 * 100
        assert all(ucounts == 5)
        assert all(s.test.index.union(s.train.index) == ratings.index)
        assert len(s.test) + len(s.train) == len(ratings)
Example 8
def test_tf_isvd(ml20m):
    algo = lenskit_tf.IntegratedBiasMF(20)

    def eval(train, test):
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.predict(algo, test)

    folds = xf.sample_users(ml20m, 2, 5000, xf.SampleFrac(0.2))
    preds = pd.concat(eval(train, test) for (train, test) in folds)
    mae = pm.mae(preds.prediction, preds.rating)
    assert mae == approx(0.60, abs=0.025)

    user_rmse = preds.groupby('user').apply(
        lambda df: pm.rmse(df.prediction, df.rating))
    assert user_rmse.mean() == approx(0.92, abs=0.05)
Example 9
def test_global_metric():
    import lenskit.crossfold as xf
    import lenskit.batch as batch
    from lenskit.algorithms.bias import Bias

    train, test = next(
        xf.sample_users(lktu.ml_test.ratings, 1, 200, xf.SampleFrac(0.5)))
    algo = Bias()
    algo.fit(train)

    preds = batch.predict(algo, test)

    rmse = pm.global_metric(preds)
    assert rmse == pm.rmse(preds.prediction, preds.rating)

    mae = pm.global_metric(preds, metric=pm.mae)
    assert mae == pm.mae(preds.prediction, preds.rating)
Example 10
def test_save_models(tmp_path, format):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, save_models=format)

    sweep.add_algorithms(Bias(5))
    sweep.add_algorithms(Popular())

    ratings = ml_test.ratings
    sweep.add_datasets(lambda: xf.sample_users(ratings, 2, 100, xf.SampleN(5)),
                       name='ml-small')

    sweep.run()

    runs = pd.read_parquet(fspath(tmp_path / 'runs.parquet'))
    runs = runs.set_index('RunId')

    for i in range(4):
        run_id = i + 1
        fn = work / 'model-{}'.format(run_id)
        if format is True:
            fn = fn.with_suffix('.pkl')
            assert fn.exists()
            with fn.open('rb') as f:
                algo = pickle.load(f)

        elif format == 'gzip':
            fn = fn.with_suffix('.pkl.gz')
            assert fn.exists()
            with gzip.open(fspath(fn), 'rb') as f:
                algo = pickle.load(f)
        elif format == 'joblib':
            fn = fn.with_suffix('.jlpkl')
            assert fn.exists()
            algo = joblib.load(fn)
        else:
            assert False

        assert algo is not None
        algo_class = algo.__class__.__name__
        if isinstance(algo, TopN):
            algo_class = algo.predictor.__class__.__name__

        assert algo_class == runs.loc[run_id, 'AlgoClass']
Example 11
def test_sample_users():
    ratings = lktu.ml_test.ratings
    splits = xf.sample_users(ratings, 5, 100, xf.SampleN(5))
    splits = list(splits)
    assert len(splits) == 5

    for s in splits:
        ucounts = s.test.groupby('user').agg('count')
        assert len(s.test) == 5 * 100
        assert len(ucounts) == 100
        assert all(ucounts == 5)
        assert all(s.test.index.union(s.train.index) == ratings.index)
        assert len(s.test) + len(s.train) == len(ratings)

    # no overlapping users
    for s1, s2 in it.product(splits, splits):
        if s1 is s2:
            continue
        us1 = s1.test.user.unique()
        us2 = s2.test.user.unique()
        assert len(np.intersect1d(us1, us2)) == 0
Example 12
def test_user_metric():
    import lenskit.crossfold as xf
    import lenskit.batch as batch
    from lenskit.algorithms.bias import Bias

    train, test = next(
        xf.sample_users(lktu.ml_test.ratings, 1, 200, xf.SampleFrac(0.5)))
    algo = Bias()
    algo.fit(train)

    preds = batch.predict(algo, test)

    rmse = pm.user_metric(preds)
    u_rmse = preds.groupby('user').apply(
        lambda df: pm.rmse(df.prediction, df.rating))
    assert rmse == approx(u_rmse.mean())

    mae = pm.user_metric(preds, metric=pm.mae)
    u_mae = preds.groupby('user').apply(
        lambda df: pm.mae(df.prediction, df.rating))
    assert mae == approx(u_mae.mean())
Example 13
def test_sample_users_frac():
    ratings = lktu.ml_test.ratings
    splits = xf.sample_users(ratings, 5, 100, xf.SampleFrac(0.2))
    splits = list(splits)
    assert len(splits) == 5
    ucounts = ratings.groupby('user').item.count()
    uss = ucounts * 0.2

    for s in splits:
        tucs = s.test.groupby('user').item.count()
        assert len(tucs) == 100
        assert all(tucs >= uss.loc[tucs.index] - 1)
        assert all(tucs <= uss.loc[tucs.index] + 1)
        assert all(s.test.index.union(s.train.index) == ratings.index)
        assert len(s.test) + len(s.train) == len(ratings)

    # no overlapping users
    for s1, s2 in it.product(splits, splits):
        if s1 is s2:
            continue
        us1 = s1.test.user.unique()
        us2 = s2.test.user.unique()
        assert len(np.intersect1d(us1, us2)) == 0
Example 14
def test_sample_users_frac_oversize():
    ratings = lktu.ml_test.ratings
    splits = xf.sample_users(ratings, 20, 100, xf.SampleN(5))
    splits = list(splits)
    assert len(splits) == 20

    for s in splits:
        ucounts = s.test.groupby('user').agg('count')
        assert len(ucounts) < 100
        assert all(ucounts == 5)
        assert all(s.test.index.union(s.train.index) == ratings.index)
        assert len(s.test) + len(s.train) == len(ratings)

    users = ft.reduce(lambda us1, us2: us1 | us2,
                      (set(s.test.user) for s in splits))
    assert len(users) == ratings.user.nunique()
    assert users == set(ratings.user)
    for s1, s2 in it.product(splits, splits):
        if s1 is s2:
            continue

        us1 = s1.test.user.unique()
        us2 = s2.test.user.unique()
        assert len(np.intersect1d(us1, us2)) == 0
Example 15
def prune(df, condition):
    """Return the users whose interaction count falls below `condition`."""
    user_n = df.loc[df['count'] < condition]
    return user_n


game_count = groupby_count(result, 'user', 'item')  # per-user interaction counts; groupby_count is defined earlier in the source script

user_5 = prune(game_count, 5)

user_less_5 = user_5.index  # ids of users with fewer than 5 interactions

pruned_data_5 = result.set_index('user').drop(user_less_5)
pruned_data_5.reset_index(inplace=True)

#pairs_user = list(partition_users(pruned_data_5, 5, xf.SampleN(1)))
pairs_user = list(sample_users(pruned_data_5, 5, 12000, xf.SampleN(1)))
pickle_out = open("sample_user.pickle", "wb")
pickle.dump(pairs_user, pickle_out)
pickle_out.close()

truth = pd.concat((p.test for p in pairs_user))
#truth.to_csv(r'results/steam/pruned_5.csv')
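
A saved split can be restored in a later session; a minimal sketch using the file written above:

# Reload the pickled (train, test) pairs for reuse.
with open("sample_user.pickle", "rb") as pickle_in:
    pairs_user = pickle.load(pickle_in)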


def algo_eval(path, algo, dataset):
    evaluation = batch.MultiEval(path=path, predict=False, recommend=100)
    evaluation.add_algorithms(algos=algo)
    evaluation.add_datasets(data=dataset)
    evaluation.run()
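
For context, a hypothetical invocation of `algo_eval` on the splits prepared above; the output path and the choice of `Popular` are assumptions, not part of the original script:

from lenskit.algorithms.basic import Popular  # assumed import

# Hypothetical usage: run MultiEval over the pruned sample_users splits.
algo_eval('results/steam', [Popular()], pairs_user)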