Esempio n. 1
0
def test_fill_users():
    """Check that ``include_missing=True`` scores users who got no recommendations.

    Recommendations are generated only for a sample of the test users.  With
    ``include_missing=True`` the analysis must still produce one row per test
    user (with a non-null recall only for the recommended ones); without it
    the result must be strictly smaller, and the recall values shared by both
    results must agree.
    """
    analysis = topn.RecListAnalysis()
    for metric in (topn.precision, topn.recall):
        analysis.add_metric(metric)

    recommender = Recommender.adapt(UserUser(20, min_nbrs=10))

    partitions = xf.sample_users(ml_test.ratings, 1, 50, xf.SampleN(5))
    train, test = next(partitions)
    recommender.fit(train)

    # Sampling 50 *rows* of the user column and de-duplicating gives a subset
    # of the test users, so some users end up without recommendations.
    sampled_rows = test['user'].sample(50)
    rec_users = sampled_rows.unique()
    recs = batch.recommend(recommender, rec_users, 25)

    # With missing users filled in: one row per test user.
    filled = analysis.compute(recs, test, include_missing=True)
    n_test_users = test['user'].nunique()
    assert len(filled) == n_test_users
    n_scored = filled['recall'].notna().sum()
    assert n_scored == len(rec_users)
    assert (filled['ntruth'] == 5).all()

    # Default behaviour: fewer rows than the filled result.
    plain = analysis.compute(recs, test)
    assert len(plain) < len(filled)

    # The recall values present in the filled result must match the plain ones.
    has_recall = filled['recall'].notna()
    filled_recall = filled.loc[has_recall, 'recall'].copy()
    filled_recall, plain_recall = filled_recall.align(plain['recall'])
    assert (filled_recall == plain_recall).all()
Esempio n. 2
0
def test_adv_fill_users():
    """Check ``include_missing=True`` with several partitions and algorithms.

    Two algorithms are fit on each of two partitions, and each recommends for
    its own sample of that partition's test users.  The recommendations and
    the test data are stacked into single frames tagged with ``part`` (and
    ``algo`` for the recommendations).  The filled-in scores must hold
    ``50 * 4`` rows with some null recalls, the default scores must be
    strictly fewer, and the recall values shared by both must agree.
    """
    analysis = topn.RecListAnalysis()
    for metric in (topn.precision, topn.recall):
        analysis.add_metric(metric)

    # Label -> recommender; dict order fixes the fit/recommend order below.
    algos = {
        'UU': Recommender.adapt(UserUser(30, min_nbrs=10)),
        'II': Recommender.adapt(ItemItem(20, min_nbrs=4)),
    }

    partitions = xf.sample_users(ml_test.ratings, 2, 50, xf.SampleN(5))
    recs_by_key = {}
    test_by_part = {}
    for part, (train, test) in enumerate(partitions, start=1):
        for label, algo in algos.items():
            algo.fit(train)
            # Each algorithm draws its own sample of rows from the user
            # column; de-duplicating leaves a subset of the test users.
            rec_users = test['user'].sample(50).unique()
            recs_by_key[(part, label)] = batch.recommend(algo, rec_users, 25)
        test_by_part[part] = test

    # Turn the dict keys into ordinary 'part' / 'algo' columns and discard
    # the remaining per-frame index.
    recs = pd.concat(recs_by_key, names=['part', 'algo'])
    recs = recs.reset_index(['part', 'algo']).reset_index(drop=True)

    truth = pd.concat(test_by_part, names=['part'])
    truth = truth.reset_index(['part']).reset_index(drop=True)

    scores = analysis.compute(recs, truth, include_missing=True)
    index_names = scores.index.names
    scores = scores.sort_index()
    assert len(scores) == 50 * 4
    assert (scores['ntruth'] == 5).all()
    n_missing = scores['recall'].isna().sum()
    assert n_missing > 0
    _log.info('scores:\n%s', scores)

    users_by_algo = scores.reset_index().groupby('algo')['user']
    ucounts = users_by_algo.agg(['count', 'nunique'])
    assert (ucounts['count'] == 100).all()
    assert (ucounts['nunique'] == 100).all()

    # Default behaviour, re-indexed like the filled result so the two align.
    mscores = analysis.compute(recs, truth)
    mscores = mscores.reset_index().set_index(index_names).sort_index()
    assert len(mscores) < len(scores)
    _log.info('mscores:\n%s', mscores)

    has_recall = scores['recall'].notna()
    recall = scores.loc[has_recall, 'recall'].copy()
    recall, mrecall = recall.align(mscores['recall'])
    assert (recall == mrecall).all()
from lenskit.algorithms.user_knn import UserUser
from recsys.cf.usercf import UserCF
from recsys.utils.data import load_movielen_data
from recsys.utils.debug import Timer, LogUtil

# Ad-hoc timing comparison between the project's UserCF model and LensKit's
# UserUser model: both are fit on the same ratings, then each is asked five
# times to predict for one user, printing a timer reading after every call.

# Presumably sets up logging for the run -- confirm in recsys.utils.debug.
LogUtil.configLog()
ratings, users, movies = load_movielen_data()

# Project implementation under test.
model0 = UserCF(min_threshold=0.1, min_nn=5, max_nn=20)
model0.fit(ratings)

# LensKit reference model; the arguments look chosen to mirror the UserCF
# settings above (20 / 5 / 0.1) -- TODO confirm the parameters correspond.
model = UserUser(nnbrs=20, min_nbrs=5, min_sim=0.1, center=False)
model.fit(ratings)

user = 1
# NOTE(review): `movies` is rebound from the loaded movie table to a list of
# item ids, and then immediately overwritten with the single id [1], so the
# full list built on the first line is never used. Confirm which of the two
# is the intended benchmark input.
movies = list(movies.item.astype(int))
movies = [1]
clock = Timer()
# Time the LensKit model; only the result of the last call is kept in `df`.
for _ in range(5):
    df = model.predict_for_user(user, movies)
    # Presumably restart() returns the time elapsed since the previous
    # (re)start -- verify against Timer.
    print(clock.restart())

print("=" * 60)

# Time the project model. The clock is not restarted after the separator is
# printed, so the first reading below also spans that print call.
for _ in range(5):
    df0 = model0.predict_for_user(user, movies)
    print(clock.restart())

# Summaries of the final prediction result from each model.
print(df.describe())
print(df0.describe())