Example #1
def test_sweep_oneshot(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, combine=False)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)), name='ml-small')
    sweep.add_algorithms(Bias(damping=5))

    try:
        sweep.run(3)
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert not (work / 'runs.csv').exists()
    assert not (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    assert (work / 'run-3.json').exists()
    assert (work / 'predictions-3.parquet').exists()
    assert (work / 'recommendations-3.parquet').exists()

    with (work / 'run-3.json').open() as f:
        run = json.load(f)
    assert run['RunId'] == 3
Example #2
def test_batch_rmse():
    import lenskit.crossfold as xf
    import lenskit.batch as batch
    import lenskit.algorithms.basic as bl

    ratings = lktu.ml100k.ratings
    algo = bl.Bias(damping=5)

    def eval(train, test):
        algo.fit(train)
        preds = batch.predict(algo, test)
        return preds.set_index(['user', 'item'])

    results = pd.concat((eval(train, test)
                         for (train, test)
                         in xf.partition_users(ratings, 5, xf.SampleN(5))))

    user_rmse = results.groupby('user').apply(lambda df: pm.rmse(df.prediction, df.rating))

    # we should have all users
    users = ratings.user.unique()
    assert len(user_rmse) == len(users)
    missing = np.setdiff1d(users, user_rmse.index)
    assert len(missing) == 0

    # we should not have any missing values
    assert all(user_rmse.notna())

    # we should have a reasonable mean
    assert user_rmse.mean() == approx(0.93, abs=0.05)
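
For comparison, the pooled (micro-averaged) RMSE over all predictions can be computed in a single call. A minimal sketch, assuming the same results frame from above and that pm is lenskit.metrics.predict:

# hedged sketch: overall RMSE across all (user, item) predictions at once
overall_rmse = pm.rmse(results['prediction'], results['rating'])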
Example #3
def test_fill_users():
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.precision)
    rla.add_metric(topn.recall)

    algo = UserUser(20, min_nbrs=10)
    algo = Recommender.adapt(algo)

    splits = xf.sample_users(ml_test.ratings, 1, 50, xf.SampleN(5))
    train, test = next(splits)
    algo.fit(train)

    rec_users = test['user'].sample(50).unique()
    recs = batch.recommend(algo, rec_users, 25)

    scores = rla.compute(recs, test, include_missing=True)
    assert len(scores) == test['user'].nunique()
    assert scores['recall'].notna().sum() == len(rec_users)
    assert all(scores['ntruth'] == 5)

    mscores = rla.compute(recs, test)
    assert len(mscores) < len(scores)

    recall = scores.loc[scores['recall'].notna(), 'recall'].copy()
    recall, mrecall = recall.align(mscores['recall'])
    assert all(recall == mrecall)
Example #4
def test_sweep_nopreds(tmp_path):
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, eval_n_jobs=1)

    ratings = ml_test.ratings
    folds = [(train, test.drop(columns=['rating']))
             for (train, test) in xf.partition_users(ratings, 5, xf.SampleN(5))
             ]
    sweep.add_datasets(folds, DataSet='ml-small')
    sweep.add_algorithms(Popular())
    sweep.add_algorithms(Bias(damping=0))

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 2 algorithms by 5 partitions
    assert len(runs) == 10
    assert all(np.sort(runs.AlgoClass.unique()) == ['Bias', 'Popular'])
    bias_runs = runs[runs.AlgoClass == 'Bias']

    recs = pd.read_parquet(work / 'recommendations.parquet')
    assert all(recs.RunId.isin(runs.RunId))
    assert recs['score'].dtype == np.float64
Example #5
def test_sweep_save(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)), name='ml-small')
    sweep.add_algorithms(Bias(damping=5))

    sweep.persist_data()
    pf = work / 'sweep.dat'
    with pf.open('wb') as f:
        pickle.dump(sweep, f)

    with pf.open('rb') as f:
        sweep = pickle.load(f)

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 1 algorithm by 5 partitions
    assert len(runs) == 5
Example #6
def test_sweep_filenames(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    folds = []
    for part, (train, test) in enumerate(xf.partition_users(ratings, 2, xf.SampleN(5))):
        trfn = work / 'p{}-train.csv'.format(part)
        tefn = work / 'p{}-test.csv'.format(part)
        train.to_csv(trfn)
        test.to_csv(tefn)
        folds.append((trfn, tefn))

    sweep.add_datasets(folds, DataSet='ml-small')
    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 2 partitions
    assert len(runs) == 8
Example #7
def test_sweep_norecs(tmp_path):
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, recommend=None)

    ratings = ml_test.ratings
    folds = xf.partition_users(ratings, 5, xf.SampleN(5))
    sweep.add_datasets(folds, DataSet='ml-small')
    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 5 partitions
    assert len(runs) == 20
    assert all(np.sort(runs.AlgoClass.unique()) == ['Bias', 'Popular'])
    bias_runs = runs[runs.AlgoClass == 'Bias']
    assert all(bias_runs.damping.notna())
    pop_runs = runs[runs.AlgoClass == 'Popular']
    assert all(pop_runs.damping.isna())

    preds = pd.read_parquet(work / 'predictions.parquet')
    assert all(preds.RunId.isin(bias_runs.RunId))
Example #8
def test_partition_users():
    """Partitioning ratings when dataframe has non-unique indices"""
    ratings = lktu.ml_test.ratings
    ratings = ratings.set_index('user')  ##forces non-unique index
    with pytest.raises(ValueError):
        for split in xf.partition_users(ratings, 5, xf.SampleN(5)):
            pass
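
The usual remedy is to restore a unique index before partitioning. A minimal sketch, assuming the same lktu test data used above:

# hedged sketch: reset_index() gives back a unique RangeIndex, so partitioning succeeds
ratings = lktu.ml_test.ratings.set_index('user').reset_index()
splits = list(xf.partition_users(ratings, 5, xf.SampleN(5)))
assert len(splits) == 5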
Example #9
def test_sweep_combine(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, combine=False)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)), name='ml-small')

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    sweep.persist_data()

    for i in range(1, 6):
        assert (work / 'ds{}-train.parquet'.format(i)).exists()
        assert (work / 'ds{}-test.parquet'.format(i)).exists()

    for ds, cf, dsa in sweep.datasets:
        assert isinstance(ds, tuple)
        train, test = ds
        assert isinstance(train, pathlib.Path)
        assert isinstance(test, pathlib.Path)

    assert sweep.run_count() == 5 * 3

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert not (work / 'runs.csv').exists()
    assert not (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    for i, (ds, a) in enumerate(sweep._flat_runs()):
        run = i + 1
        assert (work / 'run-{}.json'.format(run)).exists()
        if isinstance(a.algorithm, Predictor):
            assert (work / 'predictions-{}.parquet'.format(run)).exists()
        assert (work / 'recommendations-{}.parquet'.format(run)).exists()

    sweep.collect_results()

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    assert len(runs) == 5 * 3
Example #10
def test_sample_n():
    ratings = lktu.ml_test.ratings

    users = np.random.choice(ratings.user.unique(), 5, replace=False)

    s5 = xf.SampleN(5)
    for u in users:
        udf = ratings[ratings.user == u]
        tst = s5(udf)
        trn = udf.loc[udf.index.difference(tst.index), :]
        assert len(tst) == 5
        assert len(tst) + len(trn) == len(udf)

    s10 = xf.SampleN(10)
    for u in users:
        udf = ratings[ratings.user == u]
        tst = s10(udf)
        trn = udf.loc[udf.index.difference(tst.index), :]
        assert len(tst) == 10
        assert len(tst) + len(trn) == len(udf)
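
SampleN has a fractional counterpart in lenskit.crossfold. A brief sketch, assuming the ratings frame and users array from above:

# hedged sketch: SampleFrac holds out a fraction of each user's rows
# instead of a fixed count
s20 = xf.SampleFrac(0.2)
for u in users:
    udf = ratings[ratings.user == u]
    tst = s20(udf)
    trn = udf.loc[udf.index.difference(tst.index), :]
    # the sample and the remainder still partition the user's rows
    assert len(tst) + len(trn) == len(udf)
    assert 0 < len(tst) < len(udf)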
Example #11
def test_adv_fill_users():
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.precision)
    rla.add_metric(topn.recall)

    a_uu = UserUser(30, min_nbrs=10)
    a_uu = Recommender.adapt(a_uu)
    a_ii = ItemItem(20, min_nbrs=4)
    a_ii = Recommender.adapt(a_ii)

    splits = xf.sample_users(ml_test.ratings, 2, 50, xf.SampleN(5))
    all_recs = {}
    all_test = {}
    for i, (train, test) in enumerate(splits):
        a_uu.fit(train)
        rec_users = test['user'].sample(50).unique()
        all_recs[(i + 1, 'UU')] = batch.recommend(a_uu, rec_users, 25)

        a_ii.fit(train)
        rec_users = test['user'].sample(50).unique()
        all_recs[(i + 1, 'II')] = batch.recommend(a_ii, rec_users, 25)
        all_test[i + 1] = test

    recs = pd.concat(all_recs, names=['part', 'algo'])
    recs.reset_index(['part', 'algo'], inplace=True)
    recs.reset_index(drop=True, inplace=True)

    test = pd.concat(all_test, names=['part'])
    test.reset_index(['part'], inplace=True)
    test.reset_index(drop=True, inplace=True)

    scores = rla.compute(recs, test, include_missing=True)
    inames = scores.index.names
    scores.sort_index(inplace=True)
    assert len(scores) == 50 * 4
    assert all(scores['ntruth'] == 5)
    assert scores['recall'].isna().sum() > 0
    _log.info('scores:\n%s', scores)

    ucounts = scores.reset_index().groupby('algo')['user'].agg(
        ['count', 'nunique'])
    assert all(ucounts['count'] == 100)
    assert all(ucounts['nunique'] == 100)

    mscores = rla.compute(recs, test)
    mscores = mscores.reset_index().set_index(inames)
    mscores.sort_index(inplace=True)
    assert len(mscores) < len(scores)
    _log.info('mscores:\n%s', mscores)

    recall = scores.loc[scores['recall'].notna(), 'recall'].copy()
    recall, mrecall = recall.align(mscores['recall'])
    assert all(recall == mrecall)
Example #12
def test_batch_predict_preshared():
    from lenskit.algorithms import basic
    import lenskit.crossfold as xf

    algo = basic.Bias()
    splits = xf.sample_users(lktu.ml_test.ratings, 1, 100, xf.SampleN(5))
    train, test = next(splits)

    ares = lkb.train_isolated(algo, train)
    preds = lkb.predict(ares, test)
    assert len(preds) == len(test)
    assert not any(preds['prediction'].isna())
Example #13
def create_train_test_rec_data(self):
    # For now, no cross-validation: just split the data into one train and one test set.
    for i, tp in enumerate(
            xf.partition_users(data=self.data_dense,
                               partitions=1,
                               method=xf.SampleN(5),
                               rng_spec=1)):
        train = tp.train
        test = tp.test
        train.to_csv(
            f'{conf.SYN_DATA_DIR}syn_train_{self.current_date}.csv')
        test.to_csv(f'{conf.SYN_DATA_DIR}syn_test_{self.current_date}.csv')
    return train, test
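
A matching read-back sketch; the date variable here is an illustrative stand-in for self.current_date, and since to_csv was called without index=False, the saved index comes back as the first column:

# hypothetical reload of the splits written above
train = pd.read_csv(f'{conf.SYN_DATA_DIR}syn_train_{date}.csv', index_col=0)
test = pd.read_csv(f'{conf.SYN_DATA_DIR}syn_test_{date}.csv', index_col=0)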
Example #14
def test_sample_users_frac_oversize_ndj():
    ratings = lktu.ml_test.ratings
    splits = xf.sample_users(ratings, 20, 100, xf.SampleN(5), disjoint=False)
    splits = list(splits)
    assert len(splits) == 20

    for s in splits:
        ucounts = s.test.groupby('user').agg('count')
        assert len(ucounts) == 100
        assert len(s.test) == 5 * 100
        assert all(ucounts == 5)
        assert all(s.test.index.union(s.train.index) == ratings.index)
        assert len(s.test) + len(s.train) == len(ratings)
Example #15
def create_save_train_val_test_rec_data(dense_data, fn):
    # For now, no cross-validation: split into one train and one test set,
    # then split the resulting train set again below to carve out a validation set.
    for i, tp in enumerate(
            xf.partition_users(data=dense_data,
                               partitions=1,
                               method=xf.SampleN(5),
                               rng_spec=1)):
        train = tp.train
        test = tp.test
        test.to_csv(f'{conf.SYN_DATA_DIR}syn_test_{fn}.csv')
        print("[INFO] Train/test split created")

    for i, tp in enumerate(
            xf.partition_users(data=train,
                               partitions=1,
                               method=xf.SampleN(5),
                               rng_spec=1)):
        train = tp.train
        val = tp.test
        train.to_csv(f'{conf.SYN_DATA_DIR}syn_train_{fn}.csv')
        val.to_csv(f'{conf.SYN_DATA_DIR}syn_val_{fn}.csv')
        print("[INFO] Train/val split created")

    return train, val, test
Example #16
def test_partition_users():
    ratings = lktu.ml_pandas.renamed.ratings
    splits = xf.partition_users(ratings, 5, xf.SampleN(5))
    splits = list(splits)
    assert len(splits) == 5

    for s in splits:
        ucounts = s.test.groupby('user').agg('count')
        assert all(ucounts == 5)
        assert all(s.test.index.union(s.train.index) == ratings.index)
        assert len(s.test) + len(s.train) == len(ratings)

    users = ft.reduce(lambda us1, us2: us1 | us2,
                      (set(s.test.user) for s in splits))
    assert len(users) == ratings.user.nunique()
    assert users == set(ratings.user)
Example #17
def test_save_models(tmp_path, format):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, save_models=format)

    sweep.add_algorithms(Bias(5))
    sweep.add_algorithms(Popular())

    ratings = ml_test.ratings
    sweep.add_datasets(lambda: xf.sample_users(ratings, 2, 100, xf.SampleN(5)),
                       name='ml-small')

    sweep.run()

    runs = pd.read_parquet(fspath(tmp_path / 'runs.parquet'))
    runs = runs.set_index('RunId')

    for i in range(4):
        run_id = i + 1
        fn = work / 'model-{}'.format(run_id)
        if format is True:
            fn = fn.with_suffix('.pkl')
            assert fn.exists()
            with fn.open('rb') as f:
                algo = pickle.load(f)

        elif format == 'gzip':
            fn = fn.with_suffix('.pkl.gz')
            assert fn.exists()
            with gzip.open(fspath(fn), 'rb') as f:
                algo = pickle.load(f)
        elif format == 'joblib':
            fn = fn.with_suffix('.jlpkl')
            assert fn.exists()
            algo = joblib.load(fn)
        else:
            assert False

        assert algo is not None
        algo_class = algo.__class__.__name__
        if isinstance(algo, TopN):
            algo_class = algo.predictor.__class__.__name__

        assert algo_class == runs.loc[run_id, 'AlgoClass']
Example #18
def test_sweep_persist(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)),
                       name='ml-small')
    sweep.persist_data()

    for i in range(1, 6):
        assert (work / 'ds{}-train.parquet'.format(i)).exists()
        assert (work / 'ds{}-test.parquet'.format(i)).exists()

    for ds, cf, dsa in sweep.datasets:
        assert isinstance(ds, tuple)
        train, test = ds
        assert isinstance(train, pathlib.Path)
        assert isinstance(test, pathlib.Path)

    sweep.add_algorithms(
        [Bias(damping=0), Bias(damping=5),
         Bias(damping=10)],
        attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 5 partitions
    assert len(runs) == 20
Example #19
def test_sample_users():
    ratings = lktu.ml_test.ratings
    splits = xf.sample_users(ratings, 5, 100, xf.SampleN(5))
    splits = list(splits)
    assert len(splits) == 5

    for s in splits:
        ucounts = s.test.groupby('user').agg('count')
        assert len(s.test) == 5 * 100
        assert len(ucounts) == 100
        assert all(ucounts == 5)
        assert all(s.test.index.union(s.train.index) == ratings.index)
        assert len(s.test) + len(s.train) == len(ratings)

    # no overlapping users
    for s1, s2 in it.product(splits, splits):
        if s1 is s2:
            continue
        us1 = s1.test.user.unique()
        us2 = s2.test.user.unique()
        assert len(np.intersect1d(us1, us2)) == 0
Example #20
def test_sample_users_frac_oversize():
    ratings = lktu.ml_test.ratings
    splits = xf.sample_users(ratings, 20, 100, xf.SampleN(5))
    splits = list(splits)
    assert len(splits) == 20

    for s in splits:
        ucounts = s.test.groupby('user').agg('count')
        assert len(ucounts) < 100
        assert all(ucounts == 5)
        assert all(s.test.index.union(s.train.index) == ratings.index)
        assert len(s.test) + len(s.train) == len(ratings)

    users = ft.reduce(lambda us1, us2: us1 | us2,
                      (set(s.test.user) for s in splits))
    assert len(users) == ratings.user.nunique()
    assert users == set(ratings.user)
    for s1, s2 in it.product(splits, splits):
        if s1 is s2:
            continue

        us1 = s1.test.user.unique()
        us2 = s2.test.user.unique()
        assert len(np.intersect1d(us1, us2)) == 0
Example #21
def test_partition_may_skip_train():
    """Partitioning when users may not have enough ratings to be in the train set and test set."""
    ratings = lktu.ml_test.ratings
    # make a data set where some users only have 1 rating
    ratings = ratings.sample(frac=0.1)
    users = ratings.groupby('user')['rating'].count()
    assert users.min() == 1.0  # we should have some small users!
    users.name = 'ur_count'

    splits = xf.partition_users(ratings, 5, xf.SampleN(1))
    splits = list(splits)
    assert len(splits) == 5

    # now make sure some users are missing from train, and that there are no NaN ratings
    for train, test in splits:
        # no null ratings
        assert all(train['rating'].notna())
        # see if test users with 1 rating are missing from train
        test = test.join(users, on='user')
        assert all(~(test.loc[test['ur_count'] == 1,
                              'user'].isin(train['user'].unique())))
        # and users with more than one rating are in train
        assert all(test.loc[test['ur_count'] > 1,
                            'user'].isin(train['user'].unique()))
Example #22
def main(args):
    dsname = args.get('DATASET')
    partitions = int(args.get('-p'))
    output = args.get('-o')

    _log.info('locating data set %s', dsname)
    data = getattr(datasets, dsname)

    _log.info('loading ratings')
    ratings = data.ratings

    path = Path(output)
    path.mkdir(exist_ok=True, parents=True)

    _log.info('writing to %s', path)
    testRowsPerUsers = 5
    for i, tp in enumerate(
            xf.partition_users(ratings, partitions,
                               xf.SampleN(testRowsPerUsers)), 1):
        # _log.info('writing train set %d', i)
        # tp.train.to_csv(path / f'train-{i}.csv.gz', index=False)
        _log.info('writing test set %d', i)
        tp.test.index.name = 'index'
        tp.test.to_csv(path / f'test-{i}.csv.gz')
Example #23
from docopt import docopt
from lkdemo import datasets, log
from pathlib import Path

import lenskit.crossfold as xf

_log = log.script(__file__)

args = docopt(__doc__)

dsname = args.get('DATASET')
partitions = int(args.get('-p'))
output = args.get('-o')

_log.info('locating data set %s', dsname)
data = getattr(datasets, dsname)

_log.info('loading ratings')
ratings = data.ratings

path = Path(output)
path.mkdir(exist_ok=True, parents=True)

_log.info('writing to %s', path)
testRowsPerUsers = 5
for i, tp in enumerate(
        xf.partition_users(ratings, partitions, xf.SampleN(testRowsPerUsers)),
        1):
    tp.train.to_csv(path / f'train-{i}.csv', index=False)
    tp.test.to_csv(path / f'test-{i}.csv', index=False)
"""

from docopt import docopt
from lkdemo import datasets, log
from pathlib import Path

import lenskit.crossfold as xf

_log = log.script(__file__)

args = docopt(__doc__)

dsname = args.get('DATASET')
partitions = int(args.get('-p'))
output = args.get('-o')

_log.info('locating data set %s', dsname)
data = getattr(datasets, dsname)

_log.info('loading ratings')
ratings = data.ratings

path = Path(output)
path.mkdir(exist_ok=True, parents=True)

_log.info('writing to %s', path)
testRowsPerUsers = 5
for i, tp in enumerate(xf.partition_users(ratings, partitions, xf.SampleN(testRowsPerUsers)), 1):
    tp.train.to_csv(path / f'train-{i}.csv.gz', index=False)
    tp.test.to_csv(path / f'test-{i}.csv.gz', index=False)
    
Example #25
def prune(df, condition):
    # hypothetical signature, inferred from the prune(game_count, 5) call below;
    # the original snippet was truncated above this line
    user_n = df.loc[df['count'] < condition]
    return user_n


game_count = groupby_count(result, 'user', 'item')

user_5 = prune(game_count, 5)

user_less_5 = user_5.index
user_less_5

pruned_data_5 = result.set_index('user').drop(user_less_5)
pruned_data_5.reset_index(inplace=True)

#pairs_user = list(partition_users(pruned_data_5, 5, xf.SampleN(1)))
pairs_user = list(sample_users(pruned_data_5, 5, 12000, xf.SampleN(1)))
pickle_out = open("sample_user.pickle", "wb")
pickle.dump(pairs_user, pickle_out)
pickle_out.close()

truth = pd.concat((p.test for p in pairs_user))
#truth.to_csv(r'results/steam/pruned_5.csv')


def algo_eval(path, algo, dataset):
    evaluation = batch.MultiEval(path=path, predict=False, recommend=100)
    evaluation.add_algorithms(algos=algo)
    evaluation.add_datasets(data=dataset)
    evaluation.run()
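
A possible invocation of the helper above, as a hedged sketch; the output path and the Bias configuration are illustrative assumptions, not part of the original snippet:

from lenskit.algorithms.basic import Bias

# hypothetical call: evaluate one algorithm on the sampled user splits from above
algo_eval('results/steam', [Bias(damping=5)], pairs_user)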

Example #26
# read in the MovieLens 100k ratings with pandas
# https://grouplens.org/datasets/movielens/100k/
# (the imports below are assumed for this standalone snippet)
import pandas as pd

import lenskit.crossfold as xf
from lenskit.algorithms import als, funksvd as funk, item_knn as knn

ratings = pd.read_csv('ml-100k/u.data', sep='\t',
                      names=['user', 'item', 'rating', 'timestamp'])

algoKNN = knn.ItemItem(30)
algoFunk = funk.FunkSVD(2)
algoAls = als.BiasedMF(20)


# split the data into a training and a test set;
# for each user, leave one row out for testing
data = ratings
nb_partitions = 1
splits = xf.partition_users(data, nb_partitions, xf.SampleN(1))
for (trainSet, testSet) in splits:
    train = trainSet
    test = testSet

# train model
modelKNN = algoKNN.fit(train)
modelFunk = algoFunk.fit(train)
modelALS = algoAls.fit(train)
users = test.user.unique()



def get_recommendations_Funk_SVD(user_id, nb_recommendations=1):
    '''
    Return a recommendation