Code example #1
File: test_batch_sweep.py Project: sumitsidana/lkpy
def test_sweep_oneshot(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, combine=False)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)), name='ml-small')
    sweep.add_algorithms(Bias(damping=5))

    try:
        sweep.run(3)
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert not (work / 'runs.csv').exists()
    assert not (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    assert (work / 'run-3.json').exists()
    assert (work / 'predictions-3.parquet').exists()
    assert (work / 'recommendations-3.parquet').exists()

    with (work / 'run-3.json').open() as f:
        run = json.load(f)
    assert run['RunId'] == 3
Code example #2
File: test_batch_sweep.py Project: sumitsidana/lkpy
def test_sweep_filenames(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    folds = []
    for part, (train, test) in enumerate(xf.partition_users(ratings, 2, xf.SampleN(5))):
        trfn = work / 'p{}-train.csv'.format(part)
        tefn = work / 'p{}-test.csv'.format(part)
        train.to_csv(trfn)
        test.to_csv(tefn)
        folds.append((trfn, tefn))

    sweep.add_datasets(folds, DataSet='ml-small')
    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 2 partitions
    assert len(runs) == 8
Code example #3
File: test_batch_sweep.py Project: sumitsidana/lkpy
def test_sweep_save(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)), name='ml-small')
    sweep.add_algorithms(Bias(damping=5))

    sweep.persist_data()
    pf = work / 'sweep.dat'
    with pf.open('wb') as f:
        pickle.dump(sweep, f)

    with pf.open('rb') as f:
        sweep = pickle.load(f)

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 1 algorithm by 5 partitions
    assert len(runs) == 5
Code example #4
def test_sweep_nopreds(tmp_path):
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, eval_n_jobs=1)

    ratings = ml_test.ratings
    folds = [(train, test.drop(columns=['rating']))
             for (train, test) in xf.partition_users(ratings, 5, xf.SampleN(5))
             ]
    sweep.add_datasets(folds, DataSet='ml-small')
    sweep.add_algorithms(Popular())
    sweep.add_algorithms(Bias(damping=0))

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 2 algorithms by 5 partitions
    assert len(runs) == 10
    assert all(np.sort(runs.AlgoClass.unique()) == ['Bias', 'Popular'])
    bias_runs = runs[runs.AlgoClass == 'Bias']

    recs = pd.read_parquet(work / 'recommendations.parquet')
    assert all(recs.RunId.isin(runs.RunId))
    assert recs['score'].dtype == np.float64
Code example #5
def test_sweep_norecs(tmp_path):
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, recommend=None)

    ratings = ml_test.ratings
    folds = xf.partition_users(ratings, 5, xf.SampleN(5))
    sweep.add_datasets(folds, DataSet='ml-small')
    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 5 partitions
    assert len(runs) == 20
    assert all(np.sort(runs.AlgoClass.unique()) == ['Bias', 'Popular'])
    bias_runs = runs[runs.AlgoClass == 'Bias']
    assert all(bias_runs.damping.notna())
    pop_runs = runs[runs.AlgoClass == 'Popular']
    assert all(pop_runs.damping.isna())

    preds = pd.read_parquet(work / 'predictions.parquet')
    assert all(preds.RunId.isin(bias_runs.RunId))
Code example #6
File: test_batch_sweep.py Project: sumitsidana/lkpy
def test_sweep_combine(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, combine=False)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)), name='ml-small')

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    sweep.persist_data()

    for i in range(1, 6):
        assert (work / 'ds{}-train.parquet'.format(i)).exists()
        assert (work / 'ds{}-test.parquet'.format(i)).exists()

    for ds, cf, dsa in sweep.datasets:
        assert isinstance(ds, tuple)
        train, test = ds
        assert isinstance(train, pathlib.Path)
        assert isinstance(test, pathlib.Path)

    assert sweep.run_count() == 5 * 3

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert not (work / 'runs.csv').exists()
    assert not (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    for i, (ds, a) in enumerate(sweep._flat_runs()):
        run = i + 1
        assert (work / 'run-{}.json'.format(run)).exists()
        if isinstance(a.algorithm, Predictor):
            assert (work / 'predictions-{}.parquet'.format(run)).exists()
        assert (work / 'recommendations-{}.parquet'.format(run)).exists()

    sweep.collect_results()

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    assert len(runs) == 5 * 3
Code example #7
# 'format' is supplied by pytest parametrization over the save formats the body checks
@pytest.mark.parametrize('format', [True, 'gzip', 'joblib'])
def test_save_models(tmp_path, format):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, save_models=format)

    sweep.add_algorithms(Bias(damping=5))
    sweep.add_algorithms(Popular())

    ratings = ml_test.ratings
    sweep.add_datasets(lambda: xf.sample_users(ratings, 2, 100, xf.SampleN(5)),
                       name='ml-small')

    sweep.run()

    runs = pd.read_parquet(fspath(tmp_path / 'runs.parquet'))
    runs = runs.set_index('RunId')

    for i in range(4):
        run_id = i + 1
        fn = work / 'model-{}'.format(run_id)
        if format is True:
            fn = fn.with_suffix('.pkl')
            assert fn.exists()
            with fn.open('rb') as f:
                algo = pickle.load(f)

        elif format == 'gzip':
            fn = fn.with_suffix('.pkl.gz')
            assert fn.exists()
            with gzip.open(fspath(fn), 'rb') as f:
                algo = pickle.load(f)
        elif format == 'joblib':
            fn = fn.with_suffix('.jlpkl')
            assert fn.exists()
            algo = joblib.load(fn)
        else:
            assert False

        assert algo is not None
        algo_class = algo.__class__.__name__
        if isinstance(algo, TopN):
            algo_class = algo.predictor.__class__.__name__

        assert algo_class == runs.loc[run_id, 'AlgoClass']
Code example #8
def test_sweep_persist(tmp_path):
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)),
                       name='ml-small')
    sweep.persist_data()

    for i in range(1, 6):
        assert (work / 'ds{}-train.parquet'.format(i)).exists()
        assert (work / 'ds{}-test.parquet'.format(i)).exists()

    for ds, cf, dsa in sweep.datasets:
        assert isinstance(ds, tuple)
        train, test = ds
        assert isinstance(train, pathlib.Path)
        assert isinstance(test, pathlib.Path)

    sweep.add_algorithms(
        [Bias(damping=0), Bias(damping=5),
         Bias(damping=10)],
        attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        if (work / 'runs.csv').exists():
            runs = pd.read_csv(work / 'runs.csv')
            print(runs)

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 5 partitions
    assert len(runs) == 20
Code example #9
def algo_eval(path, algo, dataset):
    evaluation = batch.MultiEval(path=path, predict=False, recommend=100)
    evaluation.add_algorithms(algos=algo)
    evaluation.add_datasets(data=dataset)
    evaluation.run()
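
A hypothetical invocation of the helper above, reusing the crossfold and Bias imports seen in the other examples on this page (names illustrative, not from the source):

folds = list(xf.partition_users(ratings, 5, xf.SampleN(5)))
algo_eval('eval-out', Bias(damping=5), folds)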
Code example #10
from itertools import tee

import pandas as pd

from lenskit import batch
from lenskit import crossfold as xf
from lenskit.algorithms import funksvd, item_knn, user_knn
from lenskit.metrics import topn

ratings = pd.read_csv('data/ratings.csv')
ratings.rename({'userId': 'user', 'movieId': 'item'}, axis='columns', inplace=True)
print(ratings.head())

xf_dataset_batch, xf_dataset_test = tee(xf.partition_users(ratings[['user', 'item', 'rating']], 5, xf.SampleFrac(0.2)))
truth = pd.concat([test for _, test in xf_dataset_test], ignore_index=True)

runner = batch.MultiEval('result', False, nprocs=4)
runner.add_algorithms(
    [item_knn.ItemItem(10), item_knn.ItemItem(20), item_knn.ItemItem(30)],
    False,
    ['nnbrs']
)
runner.add_algorithms(
    [user_knn.UserUser(10), user_knn.UserUser(20), user_knn.UserUser(30)],
    True,
    ['nnbrs']
)
runner.add_algorithms(
    [funksvd.FunkSVD(40, damping=0), funksvd.FunkSVD(50, damping=5), funksvd.FunkSVD(60, damping=10)],
    False,
    ['features', 'damping']
)
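
The excerpt stops after registering the algorithms; the unused xf_dataset_batch iterator and the topn import suggest a continuation along these lines (hypothetical, not in the original, using the newer RecListAnalysis API):

from lenskit.topn import RecListAnalysis

# feed the tee'd folds into the sweep and run it (dataset name illustrative)
runner.add_datasets(xf_dataset_batch, name='ml')
runner.run()

# score the recommendation lists against the held-out truth with nDCG
recs = pd.read_parquet('result/recommendations.parquet')
rla = RecListAnalysis()
rla.add_metric(topn.ndcg)
scores = rla.compute(recs, truth)
print(scores['ndcg'].mean())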
Code example #11
# Excerpt from a longer script: `ratings`, NUM_OF_RECS, and the random,
# popular, item_to_item_*, user_to_user_*, biased_mf_50, and biased_mf_100
# models are defined earlier and omitted here. Imports it relies on (the
# BPR/HPF paths are indicative, as they come from LensKit's wrappers for the
# implicit and hpfrec packages and vary by version):
from sklearn.model_selection import train_test_split

from lenskit import batch
from lenskit.algorithms import als, funksvd
from lenskit.algorithms.implicit import BPR
from lenskit.algorithms.hpf import HPF
biased_mf_200 = als.BiasedMF(200)
implicit_mf_50 = als.ImplicitMF(50)
implicit_mf_100 = als.ImplicitMF(100)
implicit_mf_200 = als.ImplicitMF(200)
funk_svd_mf_50 = funksvd.FunkSVD(50)
funk_svd_mf_100 = funksvd.FunkSVD(100)
funk_svd_mf_200 = funksvd.FunkSVD(200)
bayesian = BPR()
hierarchical_poisson_fact_50 = HPF(50)
hierarchical_poisson_fact_100 = HPF(100)
hierarchical_poisson_fact_200 = HPF(200)

train, test = train_test_split(ratings[['user', 'item', 'rating']],
                               test_size=0.2)

eval = batch.MultiEval('../recs/cf', recommend=NUM_OF_RECS)
eval.add_datasets((train, test), name='ml-1m')
eval.add_algorithms(random, name='random')
eval.add_algorithms(popular, name='popular')
eval.add_algorithms(item_to_item_100, name='item_to_item_100')
eval.add_algorithms(item_to_item_200, name='item_to_item_200')
eval.add_algorithms(item_to_item_500, name='item_to_item_500')
eval.add_algorithms(user_to_user_100, name='user_to_user_100')
eval.add_algorithms(user_to_user_200, name='user_to_user_200')
eval.add_algorithms(user_to_user_500, name='user_to_user_500')
eval.add_algorithms(biased_mf_50, name='biased_mf_50')
eval.add_algorithms(biased_mf_100, name='biased_mf_100')
eval.add_algorithms(biased_mf_200, name='biased_mf_200')
eval.add_algorithms(implicit_mf_50, name='implicit_mf_50')
eval.add_algorithms(implicit_mf_100, name='implicit_mf_100')
eval.add_algorithms(implicit_mf_200, name='implicit_mf_200')
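
The registration list is cut off in the original; presumably the remaining models are added the same way before executing the sweep (sketch, not from the source):

eval.add_algorithms(funk_svd_mf_50, name='funk_svd_mf_50')
eval.add_algorithms(funk_svd_mf_100, name='funk_svd_mf_100')
eval.add_algorithms(funk_svd_mf_200, name='funk_svd_mf_200')
eval.add_algorithms(bayesian, name='bpr')
eval.add_algorithms(hierarchical_poisson_fact_50, name='hpf_50')
eval.add_algorithms(hierarchical_poisson_fact_100, name='hpf_100')
eval.add_algorithms(hierarchical_poisson_fact_200, name='hpf_200')
eval.run()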