Exemple #1
0
def test_tf_bmf_batch_accuracy(tf_session):
    """End-to-end accuracy check for the TF BiasedMF on ML-100K.

    Trains/tests over 5 user partitions and checks global MAE and mean
    per-user RMSE against known-good ranges.
    """
    import lenskit.crossfold as xf
    import lenskit.metrics.predict as pm
    from lenskit import batch
    from lenskit.algorithms import basic
    from lenskit.algorithms import bias

    ratings = lktu.ml100k.ratings

    mf = lktf.BiasedMF(25, damping=10, batch_size=1024, epochs=20, rng_spec=42)
    # fall back to a damped bias model when the MF cannot score a pair
    algo = basic.Fallback(mf, bias.Bias(damping=10))

    def run_fold(train, test):
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.predict(algo, test)

    fold_preds = [run_fold(tr, te)
                  for tr, te in xf.partition_users(ratings, 5, xf.SampleFrac(0.2))]
    preds = pd.concat(fold_preds)
    assert pm.mae(preds.prediction, preds.rating) == approx(0.83, abs=0.025)

    per_user = preds.groupby('user').apply(
        lambda df: pm.rmse(df.prediction, df.rating))
    assert per_user.mean() == approx(1.03, abs=0.05)
Exemple #2
0
def test_als_implicit_batch_accuracy():
    """Recommend with ImplicitMF over 5 user folds and require positive nDCG."""
    import lenskit.crossfold as xf
    from lenskit import batch
    from lenskit import topn

    ratings = lktu.ml100k.load_ratings()

    algo = als.ImplicitMF(25, iterations=20)

    def run_fold(train, test):
        _log.info('running training')
        train['rating'] = train.rating.astype(np.float_)
        algo.fit(train)
        users = test.user.unique()
        _log.info('testing %d users', len(users))
        # only recommend items the user has not rated in training
        cands = topn.UnratedCandidates(train)
        return batch.recommend(algo, users, 100, cands)

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    test = pd.concat(te for (_tr, te) in folds)
    recs = pd.concat(run_fold(tr, te) for (tr, te) in folds)

    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    _log.info('nDCG for users is %.4f', results.ndcg.mean())
    assert results.ndcg.mean() > 0
Exemple #3
0
def test_implicit_als_batch_accuracy():
    """Cross-validate an implicit ALS recommender and require positive nDCG."""
    import lenskit.crossfold as xf
    from lenskit import batch, topn

    ratings = lktu.ml100k.ratings

    base_algo = ALS(25)

    def run_fold(train, test):
        _log.info('running training')
        train['rating'] = train.rating.astype(np.float_)
        # clone so each fold trains a fresh model
        fold_algo = util.clone(base_algo)
        fold_algo.fit(train)
        users = test.user.unique()
        _log.info('testing %d users', len(users))
        return batch.recommend(fold_algo, users, 100)

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    test = pd.concat(f.test for f in folds)

    recs = pd.concat(run_fold(tr, te) for (tr, te) in folds)

    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    scores = rla.compute(recs, test)
    ndcg = scores.ndcg
    _log.info('nDCG for %d users is %.4f', len(ndcg), ndcg.mean())
    assert ndcg.mean() > 0
Exemple #4
0
def test_als_batch_accuracy():
    """Compare LU and CD solvers for BiasedMF: both accurate and both close."""
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings

    lu_algo = als.BiasedMF(25, iterations=20, damping=5, method='lu')
    cd_algo = als.BiasedMF(25, iterations=25, damping=5, method='cd')
    # algo = basic.Fallback(svd_algo, basic.Bias(damping=5))

    def run_fold(train, test):
        _log.info('training LU')
        lu_algo.fit(train)
        _log.info('training CD')
        cd_algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return test.assign(lu_pred=lu_algo.predict(test), cd_pred=cd_algo.predict(test))

    fold_frames = [run_fold(tr, te)
                   for tr, te in xf.partition_users(ratings, 5, xf.SampleFrac(0.2))]
    preds = pd.concat(fold_frames)
    preds['abs_diff'] = np.abs(preds.lu_pred - preds.cd_pred)
    _log.info('predictions:\n%s', preds.sort_values('abs_diff', ascending=False))
    _log.info('diff summary:\n%s', preds.abs_diff.describe())

    # both solvers should hit the same global MAE target
    assert pm.mae(preds.lu_pred, preds.rating) == approx(0.73, abs=0.025)
    assert pm.mae(preds.cd_pred, preds.rating) == approx(0.73, abs=0.025)

    # and the same mean per-user RMSE target
    for col in ('lu_pred', 'cd_pred'):
        per_user = preds.groupby('user').apply(lambda df: pm.rmse(df[col], df.rating))
        assert per_user.mean() == approx(0.91, abs=0.05)
def test_ii_batch_accuracy():
    """Item-item kNN (with bias fallback) batch predictions on ML-100K."""
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    from lenskit import batch
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings

    knn_algo = knn.ItemItem(30)
    # fall back to an unpersonalized bias when kNN cannot score
    algo = basic.Fallback(knn_algo, basic.Bias())

    def run_fold(train, test):
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.predict(algo, test, n_jobs=4)

    fold_preds = [run_fold(tr, te)
                  for tr, te in xf.partition_users(ratings, 5, xf.SampleFrac(0.2))]
    preds = pd.concat(fold_preds)
    assert pm.mae(preds.prediction, preds.rating) == approx(0.70, abs=0.025)

    per_user = preds.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))
    assert per_user.mean() == approx(0.90, abs=0.05)
def test_uu_implicit_batch_accuracy():
    """User-user kNN on implicit (rating-free) data should reach nDCG >= 0.03."""
    from lenskit import batch, topn
    import lenskit.crossfold as xf

    ratings = lktu.ml100k.ratings

    algo = knn.UserUser(30, center=False, aggregate='sum')

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    all_test = pd.concat(f.test for f in folds)

    fold_recs = []
    for train, test in folds:
        _log.info('running training')
        rec_algo = Recommender.adapt(algo)
        # drop the rating column: train on implicit feedback only
        rec_algo.fit(train.loc[:, ['user', 'item']])
        _log.info('testing %d users', test.user.nunique())
        fold_recs.append(batch.recommend(rec_algo, test.user.unique(), 100, n_jobs=2))
    recs = pd.concat(fold_recs)

    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    scores = rla.compute(recs, all_test)

    assert scores.ndcg.mean() >= 0.03
def test_ii_batch_recommend(ncpus):
    """Batch top-100 item-item recommendation over 5 user folds; nDCG > 0.03.

    Skips when the ML-100K data file is not present.
    """
    import lenskit.crossfold as xf
    from lenskit import batch, topn

    if not os.path.exists('ml-100k/u.data'):
        # pytest.skip() raises Skipped by itself; the previous
        # `raise pytest.skip()` was misleading, and we now give a reason.
        pytest.skip('ml-100k/u.data not available')

    ratings = pd.read_csv('ml-100k/u.data', sep='\t', names=['user', 'item', 'rating', 'timestamp'])

    def _eval(train, test):
        """Train a fresh ItemItem recommender on one fold and produce top-100 recs."""
        # renamed from `eval` to avoid shadowing the builtin
        _log.info('running training')
        algo = knn.ItemItem(30)
        algo = Recommender.adapt(algo)
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        recs = batch.recommend(algo, test.user.unique(), 100, n_jobs=ncpus)
        return recs

    test_frames = []
    recs = []
    for train, test in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)):
        test_frames.append(test)
        recs.append(_eval(train, test))

    test = pd.concat(test_frames)
    recs = pd.concat(recs)

    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    dcg = results.ndcg
    _log.info('nDCG for %d users is %f', len(dcg), dcg.mean())
    assert dcg.mean() > 0.03
Exemple #8
0
def split_dataset(ratings, user_fraction=.1):
    """Split a dataset in train/test data.

    Samples ``user_fraction`` of the users and holds out half of each
    sampled user's ratings as the test set.
    """
    n_users = len(ratings['user'].unique())

    # There are many ways to separate a dataset into (train, test) data; two are:
    #   - Row separation: test rows belong to users the model has seen;
    #     measures the ability to predict "new" tastes for known users.
    #   - User separation: test users are entirely unseen; measures the
    #     ability to predict new users' behaviour from other known users.
    # See the lkpy docs: https://lkpy.readthedocs.io/en/stable/crossfold.html
    # Sampling used here:
    #   - sample user_fraction * n_users users
    #   - randomly hold out half of each sampled user's listenings
    split = next(iter(xf.sample_users(ratings[['user', 'item', 'rating']],
                                      partitions=1,
                                      size=int(n_users * user_fraction),
                                      method=xf.SampleFrac(.5))))

    print(f'n test users: {len(split.test["user"].unique())}')

    return split.train, split.test
def test_bias_batch_predict(ncpus):
    """Batch bias-model predictions over 5 user folds; RMSE near 0.95."""
    from lenskit.algorithms import bias
    import lenskit.crossfold as xf
    from lenskit import batch
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings

    algo = bias.Bias(damping=5)

    def run_fold(train, test):
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.predict(algo, test, n_jobs=ncpus)

    fold_iter = xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
    preds = pd.concat(run_fold(tr, te) for tr, te in fold_iter)

    _log.info('analyzing predictions')
    rmse = pm.rmse(preds.prediction, preds.rating)
    _log.info('RMSE is %f', rmse)
    assert rmse == pytest.approx(0.95, abs=0.1)
Exemple #10
0
def test_fsvd_batch_accuracy():
    """FunkSVD with bias fallback: MAE and per-user RMSE on ML-100K."""
    from lenskit.algorithms import basic
    from lenskit.algorithms import bias
    import lenskit.crossfold as xf
    from lenskit import batch
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings

    fsvd = svd.FunkSVD(25, 125, damping=10)
    # fall back to a damped bias model when FunkSVD cannot score
    algo = basic.Fallback(fsvd, bias.Bias(damping=10))

    def run_fold(train, test):
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.predict(algo, test)

    fold_preds = [run_fold(tr, te)
                  for tr, te in xf.partition_users(ratings, 5, xf.SampleFrac(0.2))]
    preds = pd.concat(fold_preds)
    assert pm.mae(preds.prediction, preds.rating) == approx(0.74, abs=0.025)

    per_user = preds.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))
    assert per_user.mean() == approx(0.92, abs=0.05)
def test_bias_batch_recommend():
    """Top-100 recommendation with a TopN-wrapped Bias model; nDCG must be positive.

    Skips when the ML-100K data file is not present.
    """
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    from lenskit import batch, topn

    if not os.path.exists('ml-100k/u.data'):
        # pytest.skip() raises Skipped by itself; the previous
        # `raise pytest.skip()` was misleading, and we now give a reason.
        pytest.skip('ml-100k/u.data not available')

    ratings = pd.read_csv('ml-100k/u.data',
                          sep='\t',
                          names=['user', 'item', 'rating', 'timestamp'])

    algo = basic.Bias(damping=5)
    algo = TopN(algo)

    def _eval(train, test):
        """Fit the shared TopN algorithm on one fold and recommend 100 items/user."""
        # renamed from `eval` to avoid shadowing the builtin
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        recs = batch.recommend(algo, test.user.unique(), 100)
        return recs

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    test = pd.concat(y for (x, y) in folds)

    recs = pd.concat(_eval(train, test) for (train, test) in folds)

    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    dcg = results.ndcg
    _log.info('nDCG for %d users is %f (max=%f)', len(dcg), dcg.mean(),
              dcg.max())
    assert dcg.mean() > 0
Exemple #12
0
def test_hpf_batch_accuracy():
    """Hierarchical Poisson factorization recs should have positive mean DCG."""
    import lenskit.crossfold as xf
    from lenskit import batch, topn
    import lenskit.metrics.topn as lm

    ratings = lktu.ml100k.load_ratings()

    algo = hpf.HPF(25)

    def run_fold(train, test):
        _log.info('running training')
        train['rating'] = train.rating.astype(np.float_)
        algo.fit(train)
        users = test.user.unique()
        _log.info('testing %d users', len(users))
        # only recommend items the user has not rated in training
        cands = topn.UnratedCandidates(train)
        return batch.recommend(algo, users, 100, cands, test)

    recs = pd.concat(run_fold(tr, te)
                     for tr, te in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))

    _log.info('analyzing recommendations')
    dcg = recs.groupby('user').rating.apply(lm.dcg)
    _log.info('dcg for users is %.4f', dcg.mean())
    assert dcg.mean() > 0
Exemple #13
0
def test_tf_bpr_batch_accuracy(tf_session):
    """TF BPR recommender over 5 folds; mean nDCG (missing users as 0) > 0.1."""
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    from lenskit import batch, topn

    ratings = lktu.ml100k.ratings

    bpr = lktf.BPR(20, batch_size=1024, epochs=20, rng_spec=42)
    algo = Recommender.adapt(bpr)

    rec_frames = []
    test_frames = []
    for train, test in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)):
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        rec_frames.append(batch.recommend(algo, np.unique(test.user), 50))
        test_frames.append(test)

    _log.info('analyzing results')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    rla.add_metric(topn.recip_rank)
    scores = rla.compute(pd.concat(rec_frames, ignore_index=True),
                         pd.concat(test_frames, ignore_index=True),
                         include_missing=True)
    # users with no recommendations contribute 0 instead of NaN
    scores.fillna(0, inplace=True)
    _log.info('MRR: %f', scores['recip_rank'].mean())
    _log.info('nDCG: %f', scores['ndcg'].mean())
    assert scores['ndcg'].mean() > 0.1
Exemple #14
0
def test_ii_batch_recommend(ncpus):
    """Item-item batch recommendation on ML-100K; mean nDCG must beat 0.03."""
    import lenskit.crossfold as xf
    from lenskit import topn

    ratings = lktu.ml100k.ratings

    def run_fold(train, test):
        _log.info('running training')
        # train a fresh recommender for each fold
        fold_algo = Recommender.adapt(knn.ItemItem(30))
        fold_algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.recommend(fold_algo, test.user.unique(), 100, n_jobs=ncpus)

    test_frames = []
    rec_frames = []
    for train, test in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)):
        test_frames.append(test)
        rec_frames.append(run_fold(train, test))

    test = pd.concat(test_frames)
    recs = pd.concat(rec_frames)

    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    dcg = results.ndcg
    _log.info('nDCG for %d users is %f', len(dcg), dcg.mean())
    assert dcg.mean() > 0.03
def recommend(algo_wrappers, ratings):
    """Run do_recommend for every wrapper on each of 5 user partitions.

    Returns a pair: (list of per-wrapper rec frames, list of per-fold test frames).
    """
    rec_frames = []
    test_frames = []
    partitions = xf.partition_users(ratings[['user', 'item', 'rating']],
                                    5, xf.SampleFrac(0.2))
    for train, test in partitions:
        test_frames.append(test)
        rec_frames.extend(do_recommend(w, train, test) for w in algo_wrappers)
    return rec_frames, test_frames
Exemple #16
0
class LegMedLensKit():
    """Script-style experiment: everything below executes at class-definition
    time, training ItemItem and ALS recommenders on a MovieLens ratings file
    and comparing their nDCG.

    NOTE(review): this class is a container for a one-shot script; nothing
    here is meant to be instantiated.
    """

    def loadData():
        # Called from the class body below at definition time, so no `self`.
        # NOTE(review): pandas treats the multi-char sep '::' as a regex and
        # uses the python engine — confirm this is intended.
        ratings = pd.read_csv('/Users/josse/Desktop/ratings.dat',
                              sep='::',
                              names=['user', 'item', 'rating', 'timestamp'])
        print(ratings.head())
        return (ratings)

    #print ("test")
    ratings = loadData()
    # item x user rating matrix; unrated pairs become NaN, then 0
    data_matrix = np.array(
        ratings.pivot(index='item', columns='user', values='rating'))
    print(data_matrix)
    data_matrix_rev = np.nan_to_num(data_matrix)
    print(data_matrix_rev)

    algo_ii = knn.ItemItem(20)
    algo_als = als.BiasedMF(50)

    def eval(aname, algo, train, test):
        """Fit a clone of `algo` on `train` and return top-100 recs tagged `aname`."""
        print("test")
        fittable = util.clone(algo)
        fittable = Recommender.adapt(fittable)
        fittable.fit(train)
        users = test.user.unique()
        # now we run the recommender
        recs = batch.recommend(fittable, users, 100)
        # add the algorithm name for analyzability
        recs['Algorithm'] = aname
        print("recs")
        print(recs.head())
        return recs

    all_recs = []
    test_data = []

    for train, test in xf.partition_users(ratings[['user', 'item', 'rating']],
                                          1, xf.SampleFrac(0.2)):
        test_data.append(test)
        #print(test.head(10))
        all_recs.append(eval('ItemItem', algo_ii, train, test))
        all_recs.append(eval('ALS', algo_als, train, test))

    print("test2")
    all_recs = pd.concat(all_recs, ignore_index=True)
    # BUG FIX: `all_recs` used to be printed with .head() while it was still
    # a plain list, raising AttributeError; print only after concatenation.
    print(all_recs.head())
    test_data = pd.concat(test_data, ignore_index=True)
    #print(test_data.head)

    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(all_recs, test_data)
    results.head()

    results.groupby('Algorithm').ndcg.mean()
    results.groupby('Algorithm').ndcg.mean().plot.bar()
Exemple #17
0
def test_sample_frac():
    """SampleFrac should hold out ~frac of each user's rows (within rounding)."""
    ratings = lktu.ml_test.ratings
    users = np.random.choice(ratings.user.unique(), 5, replace=False)

    # identical checks at two different sampling fractions
    for frac in (0.2, 0.5):
        samp = xf.SampleFrac(frac)
        for u in users:
            udf = ratings[ratings.user == u]
            tst = samp(udf)
            trn = udf.loc[udf.index.difference(tst.index), :]
            # the sample and its complement partition the user's rows
            assert len(tst) + len(trn) == len(udf)
            # sample size is frac of the user's rows, within rounding
            assert len(tst) >= math.floor(len(udf) * frac)
            assert len(tst) <= math.ceil(len(udf) * frac)
Exemple #18
0
    def run(self, strategy_context: RecommenderAlgorithmStrategyContext) -> np.ndarray:
        """Train a bias recommender on a one-partition user split and return
        its recommendations as a 2-D array of item ids.

        NOTE(review): the final reshape assumes every test user received
        exactly `number_of_recommendations` items — confirm upstream.
        """
        data_set_source = strategy_context.data_set_source
        # resolve the reader for this data source, then load the ratings frame
        data_frame_reader: DataFrameReaderStrategy = self.data_frame_reader_factory.create(data_set_source)
        data_set: DataFrame = data_frame_reader.parse(DataFrameReaderStrategyContext(data_set_source))

        # single partition: 20% of each sampled user's rows held out for test
        partition = list(partition_users(data=data_set, partitions=1, method=crossfold.SampleFrac(0.2)))[0]
        test, train = partition.test, partition.train
        number_of_recommendations = strategy_context.number_of_recommendations
        algorithm = Recommender.adapt(Bias())
        trained_algorithm = algorithm.fit(train)
        recommendations = lenskit.batch.recommend(trained_algorithm, test['user'].unique(), number_of_recommendations)
        # group the flat rec list by user and reshape to (users, n_recs)
        return recommendations.groupby('user')['item'].apply(lambda x: x).to_numpy().reshape(
            (-1, number_of_recommendations))
Exemple #19
0
def test_global_metric():
    """pm.global_metric defaults to RMSE and honors the metric argument."""
    import lenskit.crossfold as xf
    import lenskit.batch as batch
    from lenskit.algorithms.bias import Bias

    split = next(xf.sample_users(lktu.ml_test.ratings, 1, 200, xf.SampleFrac(0.5)))
    train, test = split
    algo = Bias()
    algo.fit(train)

    preds = batch.predict(algo, test)

    # default metric is RMSE
    assert pm.global_metric(preds) == pm.rmse(preds.prediction, preds.rating)

    # an explicit metric function is applied instead
    assert pm.global_metric(preds, metric=pm.mae) == pm.mae(preds.prediction, preds.rating)
Exemple #20
0
def test_tf_isvd(ml20m):
    """Integrated bias MF (TF) on two 5000-user samples of ML-20M."""
    algo = lenskit_tf.IntegratedBiasMF(20)

    def run_fold(train, test):
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return batch.predict(algo, test)

    fold_preds = [run_fold(tr, te)
                  for tr, te in xf.sample_users(ml20m, 2, 5000, xf.SampleFrac(0.2))]
    preds = pd.concat(fold_preds)
    assert pm.mae(preds.prediction, preds.rating) == approx(0.60, abs=0.025)

    per_user = preds.groupby('user').apply(
        lambda df: pm.rmse(df.prediction, df.rating))
    assert per_user.mean() == approx(0.92, abs=0.05)
Exemple #21
0
def test_uu_batch_accuracy():
    """User-user kNN with bias fallback: MAE and per-user RMSE on ML-100K."""
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.ratings

    uu = knn.UserUser(30)
    algo = basic.Fallback(uu, basic.Bias())

    fold_preds = [__batch_eval((algo, train, test))
                  for train, test in xf.partition_users(ratings, 5, xf.SampleFrac(0.2))]
    preds = pd.concat(fold_preds)
    assert pm.mae(preds.prediction, preds.rating) == approx(0.71, abs=0.028)

    per_user = preds.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))
    assert per_user.mean() == approx(0.91, abs=0.055)
Exemple #22
0
def test_partition_users_frac():
    """partition_users + SampleFrac(0.2): per-user sizes, disjoint split, full coverage."""
    ratings = lktu.ml_test.ratings
    splits = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    assert len(splits) == 5
    ucounts = ratings.groupby('user').item.count()
    expected = ucounts * 0.2

    for s in splits:
        tucs = s.test.groupby('user').item.count()
        # each user's test size is within rounding of 20% of their ratings
        assert all(tucs >= expected.loc[tucs.index] - 1)
        assert all(tucs <= expected.loc[tucs.index] + 1)
        # train + test exactly reconstruct the original frame
        assert all(s.test.index.union(s.train.index) == ratings.index)
        assert len(s.test) + len(s.train) == len(ratings)

    # the union of test users across folds covers every user
    users = set()
    for s in splits:
        users |= set(s.test.user)
    assert len(users) == ratings.user.nunique()
    assert users == set(ratings.user)
Exemple #23
0
def test_user_metric():
    """pm.user_metric averages per-user RMSE by default, per-user MAE on request."""
    import lenskit.crossfold as xf
    import lenskit.batch as batch
    from lenskit.algorithms.bias import Bias

    train, test = next(
        xf.sample_users(lktu.ml_test.ratings, 1, 200, xf.SampleFrac(0.5)))
    algo = Bias()
    algo.fit(train)

    preds = batch.predict(algo, test)

    # default: mean of per-user RMSE
    by_user_rmse = preds.groupby('user').apply(
        lambda df: pm.rmse(df.prediction, df.rating))
    assert pm.user_metric(preds) == approx(by_user_rmse.mean())

    # explicit metric: mean of per-user MAE
    by_user_mae = preds.groupby('user').apply(
        lambda df: pm.mae(df.prediction, df.rating))
    assert pm.user_metric(preds, metric=pm.mae) == approx(by_user_mae.mean())
Exemple #24
0
def test_als_implicit_batch_accuracy():
    """CG and LU solvers for ImplicitMF should both score well and agree within 5%."""
    import lenskit.crossfold as xf
    from lenskit import batch
    from lenskit import topn

    ratings = lktu.ml100k.ratings

    def run_fold(train, test):
        train['rating'] = train.rating.astype(np.float_)
        _log.info('training CG')
        cg_algo = Recommender.adapt(als.ImplicitMF(25, iterations=20, method='cg'))
        cg_algo.fit(train)
        _log.info('training LU')
        lu_algo = Recommender.adapt(als.ImplicitMF(25, iterations=20, method='lu'))
        lu_algo.fit(train)
        users = test.user.unique()
        _log.info('testing %d users', len(users))
        # tag each solver's recommendations with a 'Method' column
        frames = {
            'CG': batch.recommend(cg_algo, users, 100, n_jobs=2),
            'LU': batch.recommend(lu_algo, users, 100, n_jobs=2),
        }
        return pd.concat(frames, names=['Method']).reset_index('Method')

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    test = pd.concat(te for (tr, te) in folds)
    recs = pd.concat((run_fold(tr, te) for tr, te in folds),
                     ignore_index=True)

    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    results = results.groupby('Method')['ndcg'].mean()
    _log.info('LU nDCG for users is %.4f', results.loc['LU'].mean())
    _log.info('CG nDCG for users is %.4f', results.loc['CG'].mean())
    assert all(results > 0.28)
    assert results.loc['LU'] == approx(results.loc['CG'], rel=0.05)
def test_pop_batch_recommend(ncpus):
    """Popularity recommender: mean DCG of recommended items must be positive.

    Skips when the ML-100K data file is not present.
    """
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    from lenskit import batch, topn
    import lenskit.metrics.topn as lm

    if not os.path.exists('ml-100k/u.data'):
        # pytest.skip() raises Skipped by itself; the previous
        # `raise pytest.skip()` was misleading, and we now give a reason.
        pytest.skip('ml-100k/u.data not available')

    ratings = pd.read_csv('ml-100k/u.data',
                          sep='\t',
                          names=['user', 'item', 'rating', 'timestamp'])

    algo = basic.Popular()

    def _eval(train, test):
        """Fit on one fold and recommend 100 unrated items per test user."""
        # renamed from `eval` to avoid shadowing the builtin
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        cand_fun = topn.UnratedCandidates(train)
        recs = batch.recommend(algo,
                               test.user.unique(),
                               100,
                               cand_fun,
                               test,
                               nprocs=ncpus)
        return recs

    recs = pd.concat(
        (_eval(train, test)
         for (train,
              test) in xf.partition_users(ratings, 5, xf.SampleFrac(0.2))))

    _log.info('analyzing recommendations')
    _log.info('have %d recs for good items', (recs.rating > 0).sum())
    dcg = recs.groupby('user').rating.agg(lm.dcg)
    _log.info('DCG for %d users is %f (max=%f)', len(dcg), dcg.mean(),
              dcg.max())
    assert dcg.mean() > 0
def test_ii_batch_recommend(ncpus):
    """Item-item recommendation joined with test ratings; mean DCG positive.

    Skips when the ML-100K data file is not present.
    """
    import lenskit.crossfold as xf
    from lenskit import batch, topn
    import lenskit.metrics.topn as lm

    if not os.path.exists('ml-100k/u.data'):
        # pytest.skip() raises Skipped by itself; the previous
        # `raise pytest.skip()` was misleading, and we now give a reason.
        pytest.skip('ml-100k/u.data not available')

    ratings = pd.read_csv('ml-100k/u.data',
                          sep='\t',
                          names=['user', 'item', 'rating', 'timestamp'])

    algo = knn.ItemItem(30)

    def _eval(train, test):
        """Recommend for one fold and attach test ratings as relevance data."""
        # renamed from `eval` to avoid shadowing the builtin
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        cand_fun = topn.UnratedCandidates(train)
        recs = batch.recommend(algo,
                               test.user.unique(),
                               100,
                               cand_fun,
                               nprocs=ncpus)
        # combine with test ratings for relevance data
        res = pd.merge(recs, test, how='left', on=('user', 'item'))
        # fill in missing 0s
        res.loc[res.rating.isna(), 'rating'] = 0
        return res

    recs = pd.concat(
        (_eval(train, test)
         for (train,
              test) in xf.partition_users(ratings, 5, xf.SampleFrac(0.2))))

    _log.info('analyzing recommendations')
    dcg = recs.groupby('user').rating.apply(lm.dcg)
    _log.info('DCG for %d users is %f', len(dcg), dcg.mean())
    assert dcg.mean() > 0
Exemple #27
0
def all_movie_recommends(ratings, optionList):
    """Batch-evaluate each algorithm selected in optionList over 5 user folds.

    Returns (all_recs, test_data) as concatenated DataFrames.
    """
    all_recs = []
    test_data = []

    # menu option -> (label, algorithm instance); models are created once
    # and refit on each fold, matching the original behaviour
    algos = {
        1: ('BasicBias', basic.Bias()),
        2: ('ItemItem', iknn.ItemItem(20)),
        3: ('UserUser', uknn.UserUser(20)),
        4: ('ALS-Biased', als.BiasedMF(50)),
        5: ('ALS-Implicit', als.ImplicitMF(50)),
        6: ('FunkSVD', funksvd.FunkSVD(50)),
    }

    for train, test in xf.partition_users(ratings[['user', 'item', 'rating']],
                                          5, xf.SampleFrac(0.2)):
        test_data.append(test)

        for option in optionList:
            if option in algos:
                name, model = algos[option]
                all_recs.append(batch_eval(name, model, train, test))

    all_recs = pd.concat(all_recs, ignore_index=True)
    test_data = pd.concat(test_data, ignore_index=True)

    return all_recs, test_data
Exemple #28
0
def test_als_batch_accuracy():
    """Biased ALS with bias fallback: MAE and per-user RMSE on ML-100K."""
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf
    import lenskit.metrics.predict as pm

    ratings = lktu.ml100k.load_ratings()

    mf = als.BiasedMF(25, iterations=20, damping=5)
    # fall back to a damped bias model when the MF cannot score
    algo = basic.Fallback(mf, basic.Bias(damping=5))

    def run_fold(train, test):
        _log.info('running training')
        algo.fit(train)
        _log.info('testing %d users', test.user.nunique())
        return test.assign(prediction=algo.predict(test))

    fold_frames = [run_fold(tr, te)
                   for tr, te in xf.partition_users(ratings, 5, xf.SampleFrac(0.2))]
    preds = pd.concat(fold_frames)
    assert pm.mae(preds.prediction, preds.rating) == approx(0.73, abs=0.025)

    per_user = preds.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))
    assert per_user.mean() == approx(0.91, abs=0.05)
Exemple #29
0
def test_uu_implicit_batch_accuracy():
    """Implicit user-user kNN with unrated candidates; mean DCG >= 0.1."""
    from lenskit import batch, topn
    import lenskit.crossfold as xf
    import lenskit.metrics.topn as lm

    ratings = lktu.ml100k.load_ratings()

    algo = knn.UserUser(30, center=False, aggregate='sum')

    fold_recs = []
    for train, test in xf.partition_users(ratings, 5, xf.SampleFrac(0.2)):
        _log.info('running training')
        # implicit feedback: train on user/item pairs only, no ratings
        algo.fit(train.loc[:, ['user', 'item']])
        cands = topn.UnratedCandidates(train)
        _log.info('testing %d users', test.user.nunique())
        fold_recs.append(batch.recommend(algo, test.user.unique(), 100, cands, test))
    recs = pd.concat(fold_recs)

    user_dcg = recs.groupby('user').rating.apply(lm.dcg)
    assert user_dcg.mean() >= 0.1
Exemple #30
0
def test_sample_users_frac():
    """sample_users + SampleFrac: 100 users/fold, ~20% rows, disjoint user sets."""
    ratings = lktu.ml_test.ratings
    splits = list(xf.sample_users(ratings, 5, 100, xf.SampleFrac(0.2)))
    assert len(splits) == 5
    ucounts = ratings.groupby('user').item.count()
    expected = ucounts * 0.2

    for s in splits:
        tucs = s.test.groupby('user').item.count()
        # exactly 100 sampled users, each with ~20% of their rows in test
        assert len(tucs) == 100
        assert all(tucs >= expected.loc[tucs.index] - 1)
        assert all(tucs <= expected.loc[tucs.index] + 1)
        # train + test exactly reconstruct the original frame
        assert all(s.test.index.union(s.train.index) == ratings.index)
        assert len(s.test) + len(s.train) == len(ratings)

    # test-user sets must be pairwise disjoint across folds
    for i, s1 in enumerate(splits):
        for s2 in splits[i + 1:]:
            us1 = s1.test.user.unique()
            us2 = s2.test.user.unique()
            assert len(np.intersect1d(us1, us2)) == 0