def do_prepare(opts):
    """Prepare train/test data and baseline recommendations for a data set.

    Splits the MovieLens data set named by ``opts['-d']``, writes the held-out
    test ratings to Parquet, then generates 100-item recommendation lists from
    both a Popular baseline and an implicit-feedback ALS model and saves the
    merged lists (tagged by algorithm) to Parquet.
    """
    name = opts['-d']
    ml = MovieLens(f'data/{name}')

    # one sample of 10000 users, 5 held-out ratings each
    train, test = next(sample_users(ml.ratings, 1, 10000, SampleN(5)))
    test.to_parquet(f'data/{name}-test.parquet', index=False)
    users = test['user'].unique()

    _log.info('getting popular recs')
    pop_algo = Popular()
    pop_algo.fit(train)
    pop_recs = recommend(pop_algo, users, 100)

    _log.info('getting ALS recs')
    als_algo = Recommender.adapt(ImplicitMF(20, iterations=10))
    # implicit MF ignores rating values, so drop the column
    als_algo.fit(train.drop(columns=['rating']))
    als_recs = recommend(als_algo, users, 100)

    _log.info('merging recs')
    merged = pd.concat({'Popular': pop_recs, 'ALS': als_recs}, names=['Algorithm'])
    merged.reset_index('Algorithm', inplace=True)
    merged.to_parquet(f'data/{name}-recs.parquet', index=False)
def test_pop_recommend(ml20m, rng, n_jobs):
    """Batch-recommend with Popular on ML-20M and verify every sampled user got a list."""
    # draw 10000 distinct users to recommend for
    users = rng.choice(ml20m['user'].unique(), 10000, replace=False)

    pop = Popular()
    _log.info('training %s', pop)
    pop.fit(ml20m)

    _log.info('recommending with %s', pop)
    recommendations = batch.recommend(pop, users, 10, n_jobs=n_jobs)
    assert recommendations['user'].nunique() == 10000
def test_store_save(store_cls):
    """Round-trip a trained Popular model through a model store.

    The retrieved model must be a distinct object (and own a distinct
    item-popularity array) while carrying identical popularity values.
    """
    original = Popular()
    original.fit(lktu.ml_test.ratings)

    with store_cls() as store:
        key = store.put_model(original)
        restored = store.get_model(key)
        # a fresh object with its own data, not an alias
        assert restored is not original
        assert restored.item_pop_ is not original.item_pop_
        # but the learned popularity scores match exactly
        assert all(restored.item_pop_ == original.item_pop_)
        del restored
def test_store_client_pickle(store_cls):
    """Fetch a stored model through a pickled store client and pickled key.

    Exercises the cross-process path: both the client handle and the model
    key survive a pickle round-trip and still resolve to an equivalent model.
    """
    original = Popular()
    original.fit(lktu.ml_test.ratings)

    with store_cls() as store:
        key = store.put_model(original)
        client = store.client()
        # simulate sending the client and key to another process
        client = pickle.loads(pickle.dumps(client))
        key = pickle.loads(pickle.dumps(key))

        restored = client.get_model(key)
        assert restored is not original
        assert restored.item_pop_ is not original.item_pop_
        assert all(restored.item_pop_ == original.item_pop_)
        del restored
def test_sweep_filenames(tmp_path):
    """MultiEval should accept (train-file, test-file) path pairs as data sets."""
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    # write each partition to CSV and register the file pairs
    ratings = ml_pandas.renamed.ratings
    folds = []
    for part, (train, test) in enumerate(xf.partition_users(ratings, 2, xf.SampleN(5))):
        train_file = work / f'p{part}-train.csv'
        test_file = work / f'p{part}-test.csv'
        train.to_csv(train_file)
        test.to_csv(test_file)
        folds.append((train_file, test_file))
    sweep.add_datasets(folds, DataSet='ml-small')

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        # dump the run table for debugging even if the sweep failed
        if (work / 'runs.csv').exists():
            print(pd.read_csv(work / 'runs.csv'))

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 2 partitions
    assert len(runs) == 8
def test_sweep_norecs(tmp_path):
    """With ``recommend=None``, MultiEval writes predictions but no recommendations."""
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, recommend=None)

    ratings = ml_test.ratings
    folds = xf.partition_users(ratings, 5, xf.SampleN(5))
    sweep.add_datasets(folds, DataSet='ml-small')

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        # dump the run table for debugging even if the sweep failed
        if (work / 'runs.csv').exists():
            print(pd.read_csv(work / 'runs.csv'))

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    # recommendation phase was disabled, so no recs output
    assert not (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 5 partitions
    assert len(runs) == 20
    assert all(np.sort(runs.AlgoClass.unique()) == ['Bias', 'Popular'])

    # the damping attribute is recorded for Bias runs only
    bias_runs = runs[runs.AlgoClass == 'Bias']
    assert all(bias_runs.damping.notna())
    pop_runs = runs[runs.AlgoClass == 'Popular']
    assert all(pop_runs.damping.isna())

    # only the Bias (Predictor) runs produce predictions
    preds = pd.read_parquet(work / 'predictions.parquet')
    assert all(preds.RunId.isin(bias_runs.RunId))
def test_sweep_nopreds(tmp_path):
    """Without rating columns in the test data, MultiEval skips prediction output."""
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, eval_n_jobs=1)

    # strip ratings from test sets so prediction cannot run
    ratings = ml_test.ratings
    folds = [(train, test.drop(columns=['rating']))
             for (train, test) in xf.partition_users(ratings, 5, xf.SampleN(5))]
    sweep.add_datasets(folds, DataSet='ml-small')

    sweep.add_algorithms(Popular())
    sweep.add_algorithms(Bias(damping=0))

    try:
        sweep.run()
    finally:
        # dump the run table for debugging even if the sweep failed
        if (work / 'runs.csv').exists():
            print(pd.read_csv(work / 'runs.csv'))

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 2 algorithms by 5 partitions
    assert len(runs) == 10
    assert all(np.sort(runs.AlgoClass.unique()) == ['Bias', 'Popular'])
    bias_runs = runs[runs.AlgoClass == 'Bias']

    recs = pd.read_parquet(work / 'recommendations.parquet')
    assert all(recs.RunId.isin(runs.RunId))
    assert recs['score'].dtype == np.float64
def test_sweep_combine(tmp_path):
    """With ``combine=False``, per-run files are written and combined only on demand."""
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, combine=False)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)),
                       name='ml-small')
    sweep.add_algorithms([Bias(damping=0), Bias(damping=5)], attrs=['damping'])
    sweep.add_algorithms(Popular())

    # persisting materializes each partition as a pair of Parquet files
    sweep.persist_data()
    for i in range(1, 6):
        assert (work / f'ds{i}-train.parquet').exists()
        assert (work / f'ds{i}-test.parquet').exists()
    for ds, cf, dsa in sweep.datasets:
        assert isinstance(ds, tuple)
        train, test = ds
        assert isinstance(train, pathlib.Path)
        assert isinstance(test, pathlib.Path)
    assert sweep.run_count() == 5 * 3

    try:
        sweep.run()
    finally:
        # dump the run table for debugging even if the sweep failed
        if (work / 'runs.csv').exists():
            print(pd.read_csv(work / 'runs.csv'))

    # combined outputs must not exist before collect_results
    assert not (work / 'runs.csv').exists()
    assert not (work / 'runs.parquet').exists()
    assert not (work / 'predictions.parquet').exists()
    assert not (work / 'recommendations.parquet').exists()

    # but every individual run left its own files behind
    for i, (ds, a) in enumerate(sweep._flat_runs()):
        run = i + 1
        assert (work / f'run-{run}.json').exists()
        if isinstance(a.algorithm, Predictor):
            assert (work / f'predictions-{run}.parquet').exists()
        assert (work / f'recommendations-{run}.parquet').exists()

    sweep.collect_results()
    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()
    runs = pd.read_parquet(work / 'runs.parquet')
    assert len(runs) == 5 * 3
def run(self, strategy_context: RecommenderAlgorithmStrategyContext) -> np.ndarray:
    """Produce top-N recommendations for every test user with the Popular algorithm.

    Reads the configured data set, holds out 20% of each user's ratings as a
    single test partition, fits a Popular recommender on the rest, and returns
    the recommended item IDs as an array of shape (num_users, N).
    """
    source = strategy_context.data_set_source
    reader: DataFrameReaderStrategy = self.data_frame_reader_factory.create(source)
    data_set: DataFrame = reader.parse(DataFrameReaderStrategyContext(source))

    # single partition: 20% of each user's ratings held out for testing
    split = list(partition_users(data=data_set, partitions=1,
                                 method=crossfold.SampleFrac(0.2)))[0]
    test, train = split.test, split.train

    n_recs = strategy_context.number_of_recommendations
    model = Popular().fit(train)
    recommendations = lenskit.batch.recommend(model, test['user'].unique(), n_recs)
    # NOTE(review): apply(lambda x: x) looks like an identity pass, but it is
    # kept because the groupby ordering it produces feeds the reshape below —
    # confirm before simplifying.
    return recommendations.groupby('user')['item'].apply(
        lambda x: x).to_numpy().reshape((-1, n_recs))
def test_save_models(tmp_path, format):
    """MultiEval should save each trained model in the requested serialization format."""
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path, save_models=format)
    sweep.add_algorithms(Bias(5))
    sweep.add_algorithms(Popular())

    ratings = ml_test.ratings
    sweep.add_datasets(lambda: xf.sample_users(ratings, 2, 100, xf.SampleN(5)),
                       name='ml-small')
    sweep.run()

    runs = pd.read_parquet(fspath(tmp_path / 'runs.parquet'))
    runs = runs.set_index('RunId')

    # 2 algorithms x 2 partitions = 4 saved models
    for i in range(4):
        run_id = i + 1
        fn = work / f'model-{run_id}'
        # each format value implies a different file suffix and loader
        if format is True:
            fn = fn.with_suffix('.pkl')
            assert fn.exists()
            with fn.open('rb') as f:
                algo = pickle.load(f)
        elif format == 'gzip':
            fn = fn.with_suffix('.pkl.gz')
            assert fn.exists()
            with gzip.open(fspath(fn), 'rb') as f:
                algo = pickle.load(f)
        elif format == 'joblib':
            fn = fn.with_suffix('.jlpkl')
            assert fn.exists()
            algo = joblib.load(fn)
        else:
            assert False

        assert algo is not None
        # unwrap TopN to compare against the recorded algorithm class
        algo_class = algo.__class__.__name__
        if isinstance(algo, TopN):
            algo_class = algo.predictor.__class__.__name__
        assert algo_class == runs.loc[run_id, 'AlgoClass']
def test_sweep_persist(tmp_path):
    """persist_data should materialize lazily-declared partitions as Parquet file pairs."""
    tmp_path = norm_path(tmp_path)
    work = pathlib.Path(tmp_path)
    sweep = batch.MultiEval(tmp_path)

    ratings = ml_pandas.renamed.ratings
    sweep.add_datasets(lambda: xf.partition_users(ratings, 5, xf.SampleN(5)),
                       name='ml-small')

    # after persisting, every data set is a (train, test) pair of Parquet paths
    sweep.persist_data()
    for i in range(1, 6):
        assert (work / f'ds{i}-train.parquet').exists()
        assert (work / f'ds{i}-test.parquet').exists()
    for ds, cf, dsa in sweep.datasets:
        assert isinstance(ds, tuple)
        train, test = ds
        assert isinstance(train, pathlib.Path)
        assert isinstance(test, pathlib.Path)

    sweep.add_algorithms([Bias(damping=0), Bias(damping=5), Bias(damping=10)],
                         attrs=['damping'])
    sweep.add_algorithms(Popular())

    try:
        sweep.run()
    finally:
        # dump the run table for debugging even if the sweep failed
        if (work / 'runs.csv').exists():
            print(pd.read_csv(work / 'runs.csv'))

    assert (work / 'runs.csv').exists()
    assert (work / 'runs.parquet').exists()
    assert (work / 'predictions.parquet').exists()
    assert (work / 'recommendations.parquet').exists()

    runs = pd.read_parquet(work / 'runs.parquet')
    # 4 algorithms by 5 partitions
    assert len(runs) == 20
def test_pop_batch_recommend(ml_folds: MLFolds, ncpus):
    """Popular recommendations evaluated over the ML folds should score positive nDCG."""
    pop = Popular()
    recommendations = ml_folds.eval_all(pop, nprocs=ncpus)
    ml_folds.check_positive_ndcg(recommendations)