def test_als_predict_no_user_features_basic():
    """A model trained with ``save_user_features=False`` must still predict
    for a user when their ratings are supplied, with ``user_features_`` left
    as ``None``, and its predictions should closely match a normally-trained
    model's.
    """
    ratings = lktu.ml_test.ratings
    np.random.seed(45)
    u = np.random.choice(ratings.user.unique(), 1)[0]
    items = np.random.choice(ratings.item.unique(), 2)

    # reference model that keeps its user features
    algo = als.ImplicitMF(5, iterations=10, method="lu", use_ratings=True)
    algo.fit(ratings)
    preds = algo.predict_for_user(u, items)

    # the user's rating history, supplied explicitly at predict time
    user_data = ratings[ratings.user == u]
    new_ratings = user_data.set_index('item')['rating'].copy()

    algo_no_user_features = als.ImplicitMF(5, iterations=10, method="lu",
                                           save_user_features=False)
    algo_no_user_features.fit(ratings)
    preds_no_user_features = algo_no_user_features.predict_for_user(
        u, items, new_ratings)

    # FIX: identity comparison — user_features_ is deliberately None, and
    # `== None` would attempt element-wise comparison on an array.
    assert algo_no_user_features.user_features_ is None

    diffs = np.abs(preds.values - preds_no_user_features.values)
    assert all(diffs <= 0.1)
def test_als_method_match():
    """LU and CG solvers, seeded identically, should produce closely-matching
    predictions across a sample of users."""
    lu = als.ImplicitMF(20, iterations=15, method='lu',
                        rand=np.random.RandomState(42).randn)
    cg = als.ImplicitMF(20, iterations=15, method='cg',
                        rand=np.random.RandomState(42).randn)

    ratings = lktu.ml_test.ratings

    timer = Stopwatch()
    lu.fit(ratings)
    timer.stop()
    _log.info('fit with LU solver in %s', timer)

    timer = Stopwatch()
    cg.fit(ratings)
    timer.stop()
    _log.info('fit with CG solver in %s', timer)

    preds = []

    with lktu.rand_seed(42):
        for u in np.random.choice(ratings.user.unique(), 10, replace=False):
            items = np.random.choice(ratings.item.unique(), 15, replace=False)
            lu_preds = lu.predict_for_user(u, items)
            cg_preds = cg.predict_for_user(u, items)
            diff = lu_preds - cg_preds
            adiff = np.abs(diff)
            _log.info(
                'user %s diffs: L2 = %f, min = %f, med = %f, max = %f, 90%% = %f',
                u, np.linalg.norm(diff, 2), np.min(adiff), np.median(adiff),
                np.max(adiff), np.quantile(adiff, 0.9))

            preds.append(pd.DataFrame({
                'user': u,
                'item': items,
                'lu': lu_preds,
                'cg': cg_preds,
                'adiff': adiff
            }))
            _log.info('user %s tau: %s', u, stats.kendalltau(lu_preds, cg_preds))

    preds = pd.concat(preds, ignore_index=True)
    _log.info('LU preds:\n%s', preds.lu.describe())
    _log.info('CD preds:\n%s', preds.cg.describe())
    _log.info('overall differences:\n%s', preds.adiff.describe())
    # there are differences. our check: the 90% are reasonable
    # FIX: check the aggregated differences across ALL sampled users (the
    # frame assembled above), not just the loop-leftover `adiff` from the
    # final user.
    assert np.quantile(preds.adiff, 0.9) <= 0.3
def test_als_implicit_batch_accuracy():
    """End-to-end batch accuracy: ImplicitMF recommendations have positive DCG."""
    import lenskit.crossfold as xf
    from lenskit import batch
    import lenskit.metrics.topn as lm

    ratings = lktu.ml100k.load_ratings()
    algo = als.ImplicitMF(25, iterations=20)

    def run_fold(train, test):
        # fit the shared model on this fold and recommend for its test users
        _log.info('running training')
        train['rating'] = train.rating.astype(np.float_)
        algo.fit(train)
        users = test.user.unique()
        _log.info('testing %d users', len(users))
        candidates = topn.UnratedCandidates(train)
        return batch.recommend(algo, users, 100, candidates, test)

    folds = xf.partition_users(ratings, 5, xf.SampleFrac(0.2))
    recs = pd.concat(run_fold(train, test) for (train, test) in folds)

    _log.info('analyzing recommendations')
    dcg = recs.groupby('user').rating.apply(lm.dcg)
    _log.info('dcg for users is %.4f', dcg.mean())
    assert dcg.mean() > 0
def test_als_implicit_batch_accuracy():
    """End-to-end batch accuracy: ImplicitMF recommendations have positive nDCG."""
    import lenskit.crossfold as xf
    from lenskit import batch
    from lenskit import topn

    ratings = lktu.ml100k.load_ratings()
    algo = als.ImplicitMF(25, iterations=20)

    def fold_recs(train, test):
        # fit the shared model on this fold and recommend for its test users
        _log.info('running training')
        train['rating'] = train.rating.astype(np.float_)
        algo.fit(train)
        users = test.user.unique()
        _log.info('testing %d users', len(users))
        candidates = topn.UnratedCandidates(train)
        return batch.recommend(algo, users, 100, candidates)

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    test = pd.concat(te for (tr, te) in folds)
    recs = pd.concat(fold_recs(train, test) for (train, test) in folds)

    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    _log.info('nDCG for users is %.4f', results.ndcg.mean())
    assert results.ndcg.mean() > 0
def user_movie_recommend(ratings, optionList, userId):
    """Run per-user evaluation for each selected algorithm option and return
    the concatenated recommendation frames."""
    # option code -> (label, factory); factories keep construction lazy so
    # only the requested algorithms are instantiated, as before
    factories = {
        1: ('BasicBias', lambda: basic.Bias()),
        2: ('ItemItem', lambda: iknn.ItemItem(20)),
        3: ('UserUser', lambda: uknn.UserUser(20)),
        4: ('ALS-Biased', lambda: als.BiasedMF(50)),
        5: ('ALS-Implicit', lambda: als.ImplicitMF(50)),
        6: ('FunkSVD', lambda: funksvd.FunkSVD(50)),
    }
    frames = []
    for option in optionList:
        if option in factories:
            label, make = factories[option]
            frames.append(user_eval(label, make(), ratings, userId))
    return pd.concat(frames, ignore_index=True)
def train_model(train, n_factors=30, n_iterations=20, regularization=.1, save_training_loss=False, confidence_factor=40): """Train (and evaluate iterations if requested) model""" # Encapsulate the model into a TopN recommender model = Recommender.adapt( als.ImplicitMF(n_factors, iterations=n_iterations, weight=confidence_factor, progress=tqdm, method='cg')) # Compute the confidence values for user-item pairs train['rating'] = 1 + confidence_factor * train['rating'] if save_training_loss: loss = np.zeros(n_iterations) for i, intermediate_model in enumerate(model.fit_iters(train)): predictions = generate_predictions(intermediate_model, train) loss[i] = evaluate_model_loss(intermediate_model, predictions) else: model.fit(train) loss = None return model, loss
def test_alogrithms():
    """Compare a panel of algorithms on ML-1M and plot their nDCG means."""
    # alternative dataset: MovieLens('ml-latest-small')
    data = ML1M('ml-1m')
    ratings = data.ratings
    print('Initial ratings table head:')
    print(ratings.head())

    algorithms = [
        basic.Bias(damping=5),
        basic.Popular(),
        item_knn.ItemItem(20),
        user_knn.UserUser(20),
        als.BiasedMF(50),
        als.ImplicitMF(50),
        funksvd.FunkSVD(50),
    ]

    triples = ratings[['user', 'item', 'rating']]
    pairs = list(partition_users(triples, 5, SampleFrac(0.2)))
    eval_algorithms(dataset=pairs, algorithms=algorithms)

    runs = display_runs()
    recs = display_recommendations()
    truth = pd.concat([p.test for p in pairs], ignore_index=True)

    ndcg_means = check_recommendations(runs, recs, truth)
    print('NDCG means:')
    print(ndcg_means)
    plot_comparison(ndcg_means)
def test_als_save_load(tmp_path):
    """Round-trip an ImplicitMF model through save()/load() and compare state."""
    tmp_path = lktu.norm_path(tmp_path)
    mod_file = tmp_path / 'als.npz'

    ratings = lktu.ml_pandas.renamed.ratings
    algo = als.ImplicitMF(20, iterations=5)
    algo.fit(ratings)

    algo.save(mod_file)
    assert mod_file.exists()

    restored = als.ImplicitMF(20)
    restored.load(mod_file)
    # every learned attribute survives the round trip unchanged
    for attr in ('user_features_', 'item_features_', 'item_index_', 'user_index_'):
        assert np.all(getattr(restored, attr) == getattr(algo, attr))
def test_als_basic_build():
    """Fitting on the small frame indexes all users/items and shapes the factors."""
    algo = als.ImplicitMF(20, iterations=10)
    algo.fit(simple_df)

    assert set(algo.user_index_) == {10, 12, 13}
    assert set(algo.item_index_) == {1, 2, 3}
    assert algo.user_features_.shape == (3, 20)
    assert algo.item_features_.shape == (3, 20)
def test_als_predict_bad_user():
    """Predicting for an unknown user yields a NaN score for the requested item."""
    algo = als.ImplicitMF(20, iterations=10)
    algo.fit(simple_df)

    # user 50 is not in simple_df
    preds = algo.predict_for_user(50, [3])

    assert len(preds) == 1
    assert preds.index[0] == 3
    assert np.isnan(preds.loc[3])
def test_als_train_large(m):
    """Training (ignoring rating values) indexes every user and item."""
    ratings = lktu.ml_test.ratings
    algo = als.ImplicitMF(20, iterations=20, method=m, use_ratings=False)
    algo.fit(ratings)

    n_users = ratings.user.nunique()
    n_items = ratings.item.nunique()
    assert len(algo.user_index_) == n_users
    assert len(algo.item_index_) == n_items
    assert algo.user_features_.shape == (n_users, 20)
    assert algo.item_features_.shape == (n_items, 20)
def test_als_predict_basic():
    """A fitted model predicts a plausible score for a known user/item pair."""
    algo = als.ImplicitMF(20, iterations=10)
    algo.fit(simple_df)

    preds = algo.predict_for_user(10, [3])

    assert len(preds) == 1
    assert preds.index[0] == 3
    assert -0.1 <= preds.loc[3] <= 5
def test_als_train_large():
    """Training on the full ratings frame indexes every user and item."""
    ratings = lktu.ml_pandas.renamed.ratings
    algo = als.ImplicitMF(20, iterations=20)
    algo.fit(ratings)

    n_users = ratings.user.nunique()
    n_items = ratings.item.nunique()
    assert len(algo.user_index_) == n_users
    assert len(algo.item_index_) == n_items
    assert algo.user_features_.shape == (n_users, 20)
    assert algo.item_features_.shape == (n_items, 20)
def eval(train, test):
    """Fit CG and LU ImplicitMF recommenders on one fold and return their
    recommendations stacked under a 'Method' column.

    NOTE: the name shadows the builtin ``eval``; it is kept for callers.
    """
    train['rating'] = train.rating.astype(np.float_)

    _log.info('training CG')
    cg_rec = Recommender.adapt(als.ImplicitMF(25, iterations=20, method='cg'))
    cg_rec.fit(train)

    _log.info('training LU')
    lu_rec = Recommender.adapt(als.ImplicitMF(25, iterations=20, method='lu'))
    lu_rec.fit(train)

    users = test.user.unique()
    _log.info('testing %d users', len(users))

    frames = {
        'CG': batch.recommend(cg_rec, users, 100, n_jobs=2),
        'LU': batch.recommend(lu_rec, users, 100, n_jobs=2),
    }
    return pd.concat(frames, names=['Method']).reset_index('Method')
def test_als_train_large_noratings():
    """ImplicitMF trains from a (user, item) frame with no rating column."""
    ratings = lktu.ml_test.ratings.loc[:, ['user', 'item']]
    algo = als.ImplicitMF(20, iterations=20)
    algo.fit(ratings)

    n_users = ratings.user.nunique()
    n_items = ratings.item.nunique()
    assert len(algo.user_index_) == n_users
    assert len(algo.item_index_) == n_items
    assert algo.user_features_.shape == (n_users, 20)
    assert algo.item_features_.shape == (n_items, 20)
def test_als_implicit_batch_accuracy():
    """Compare CG and LU ImplicitMF solvers for batch top-N nDCG on ML-100K;
    the two solvers must both exceed a floor and agree within 5%."""
    import lenskit.crossfold as xf
    from lenskit import batch
    from lenskit import topn

    ratings = lktu.ml100k.ratings

    # the same two model instances are re-fit on every fold
    cg_algo = als.ImplicitMF(25, iterations=20, method='cg')
    lu_algo = als.ImplicitMF(25, iterations=20, method='lu')

    def eval(train, test):
        # NOTE(review): shadows the builtin `eval` and mutates `train` in place
        train['rating'] = train.rating.astype(np.float_)
        _log.info('training CG')
        cg_algo.fit(train)
        _log.info('training LU')
        lu_algo.fit(train)
        users = test.user.unique()
        _log.info('testing %d users', len(users))
        candidates = topn.UnratedCandidates(train)
        cg_recs = batch.recommend(cg_algo, users, 100, candidates, n_jobs=2)
        lu_recs = batch.recommend(lu_algo, users, 100, candidates, n_jobs=2)
        # stack the two rec frames with a 'Method' level naming the solver
        return pd.concat({
            'CG': cg_recs,
            'LU': lu_recs
        }, names=['Method']).reset_index('Method')

    folds = list(xf.partition_users(ratings, 5, xf.SampleFrac(0.2)))
    # combined truth across all folds for the analysis below
    test = pd.concat(te for (tr, te) in folds)

    recs = pd.concat((eval(train, test) for (train, test) in folds),
                     ignore_index=True)

    _log.info('analyzing recommendations')
    rla = topn.RecListAnalysis()
    rla.add_metric(topn.ndcg)
    results = rla.compute(recs, test)
    # mean nDCG per solver
    results = results.groupby('Method')['ndcg'].mean()
    _log.info('LU nDCG for users is %.4f', results.loc['LU'].mean())
    _log.info('CG nDCG for users is %.4f', results.loc['CG'].mean())
    assert all(results > 0.28)
    assert results.loc['LU'] == approx(results.loc['CG'], rel=0.05)
def test_als_save_load():
    """A pickle round-trip preserves all learned ALS state."""
    ratings = lktu.ml_pandas.renamed.ratings
    algo = als.ImplicitMF(20, iterations=5)
    algo.fit(ratings)

    mod = pickle.dumps(algo)
    _log.info('serialized to %d bytes', len(mod))
    restored = pickle.loads(mod)

    for attr in ('user_features_', 'item_features_', 'item_index_', 'user_index_'):
        assert np.all(getattr(restored, attr) == getattr(algo, attr))
def test_als_save_load(tmp_path):
    "Test saving and loading ALS models, and regularized training."
    ratings = lktu.ml_test.ratings
    algo = als.ImplicitMF(5, iterations=5, reg=(2, 1))
    algo.fit(ratings)

    fn = tmp_path / 'model.bpk'
    binpickle.dump(algo, fn, codec=None)
    restored = binpickle.load(fn)

    # every learned attribute survives the round trip unchanged
    for attr in ('user_features_', 'item_features_', 'item_index_', 'user_index_'):
        assert np.all(getattr(restored, attr) == getattr(algo, attr))
def test_als_save_load():
    "Test saving and loading ALS models, and regularized training."
    ratings = lktu.ml_test.ratings
    algo = als.ImplicitMF(5, iterations=5, reg=(2, 1))
    algo.fit(ratings)

    mod = pickle.dumps(algo)
    _log.info('serialized to %d bytes', len(mod))
    restored = pickle.loads(mod)

    # every learned attribute survives the round trip unchanged
    for attr in ('user_features_', 'item_features_', 'item_index_', 'user_index_'):
        assert np.all(getattr(restored, attr) == getattr(algo, attr))
def test_als_predict_basic_for_new_ratings():
    """ImplicitMF scores an item for a fresh user given only a ratings series."""
    algo = als.ImplicitMF(20, iterations=10)
    algo.fit(simple_df)

    # items as index, ratings as values
    new_ratings = pd.Series([4.0, 5.0], index=[1, 2])
    preds = algo.predict_for_user(15, [3], new_ratings)

    assert len(preds) == 1
    assert preds.index[0] == 3
    assert -0.1 <= preds.loc[3] <= 5
def test_als_recs_topn_for_new_users_with_new_ratings(rng):
    """
    Test if ImplicitMF topn recommendations using the same ratings
    for a new user is the same as a user in ml-latest-small dataset.
    The test is run for more than one user.
    """
    from lenskit.algorithms import basic
    import scipy.stats as stats

    n_users = 10
    new_u_id = -1  # sentinel id for the "new" user, never present in ratings
    ratings = lktu.ml_test.ratings

    users = rng.choice(np.unique(ratings.user), n_users)

    algo = als.ImplicitMF(20, iterations=10, method="lu")
    rec_algo = basic.TopN(algo)
    rec_algo.fit(ratings)
    # _log.debug("Items: " + str(items))

    # Kendall tau per sampled user, comparing known-user vs new-user recs
    correlations = pd.Series(np.nan, index=users)
    for u in users:
        recs = rec_algo.recommend(u, 10)
        user_data = ratings[ratings.user == u]
        upos = algo.user_index_.get_loc(u)
        _log.info('user %s: %s ratings', u, len(user_data))
        _log.debug("user_features from fit: " + str(algo.user_features_[upos, :]))

        # get the user's rating series
        new_ratings = user_data.set_index('item')['rating'].copy()
        # recommend for the sentinel user with the same rating history
        new_recs = rec_algo.recommend(new_u_id, 10, ratings=new_ratings)

        # merge new & old recs; items only one list recommends get -inf
        all_recs = pd.merge(recs.rename(columns={'score': 'old_score'}),
                            new_recs.rename(columns={'score': 'new_score'}),
                            how='outer').fillna(-np.inf)

        tau = stats.kendalltau(all_recs.old_score, all_recs.new_score)
        _log.info('correlation for user %s: %f', u, tau.correlation)
        correlations.loc[u] = tau.correlation

    _log.debug('correlations: %s', correlations)

    assert not (any(correlations.isnull()))
    assert all(correlations >= 0.5)
def get_topn_algo_class(algo):
    """Map an algorithm key to a ready-to-fit top-N recommender.

    Returns None for unrecognized keys, matching the original fall-through.
    """
    # factories keep construction lazy: only the requested algorithm is built
    builders = {
        'popular': lambda: basic.Popular(),
        'bias': lambda: basic.TopN(basic.Bias()),
        'itemitem': lambda: basic.TopN(
            iknn.ItemItem(nnbrs=-1, center=False, aggregate='sum')),
        'useruser': lambda: basic.TopN(
            uknn.UserUser(nnbrs=5, center=False, aggregate='sum')),
        'biasedmf': lambda: basic.TopN(als.BiasedMF(50, iterations=10)),
        'implicitmf': lambda: basic.TopN(als.ImplicitMF(20, iterations=10)),
        'funksvd': lambda: basic.TopN(svd.FunkSVD(20, iterations=20)),
        'bpr': lambda: basic.TopN(BPR(25)),
    }
    make = builders.get(algo)
    return make() if make is not None else None
def get_algo_class(self, algo):
    """Resolve an algorithm key to a fresh algorithm instance.

    Returns None for unrecognized keys, matching the original fall-through.
    """
    # factories keep construction lazy: only the requested algorithm is built
    builders = {
        'popular': lambda: basic.Popular(),
        'bias': lambda: basic.Bias(users=False),
        'topn': lambda: basic.TopN(basic.Bias()),
        'itemitem': lambda: iknn.ItemItem(nnbrs=-1),
        'useruser': lambda: uknn.UserUser(nnbrs=5),
        'biasedmf': lambda: als.BiasedMF(50, iterations=10),
        'implicitmf': lambda: als.ImplicitMF(20, iterations=10),
        'funksvd': lambda: svd.FunkSVD(20, iterations=20),
    }
    make = builders.get(algo)
    return make() if make is not None else None
def test_alogrithms():
    """Compare a panel of algorithms on ml-latest-small and plot nDCG means."""
    # alternative dataset: ML1M('ml-1m')
    data = MovieLens('ml-latest-small')
    ratings = data.ratings
    print('Initial ratings table head:')
    print(ratings.head())

    algorithms = {
        'Bias': basic.Bias(damping=5),
        'Popular': basic.Popular(),
        'ItemItem': item_knn.ItemItem(20),
        'UserUser': user_knn.UserUser(20),
        'BiasedMF': als.BiasedMF(50),
        'ImplicitMF': als.ImplicitMF(50),
        'FunkSVD': funksvd.FunkSVD(50),
    }

    all_recs, test_data = eval_algos(ratings, algorithms)
    ndcg_means = eval_ndcg(all_recs, test_data)
    print('NDCG means:')
    print(ndcg_means)
    plot_comparison(ndcg_means)
def test_als_predict_basic_for_new_user_with_new_ratings():
    """
    Test if ImplicitMF predictions using the same ratings for a new
    user is the same as a user in the current simple_df dataset.
    """
    known_user, item = 10, 3

    algo = als.ImplicitMF(20, iterations=10)
    algo.fit(simple_df)
    preds = algo.predict_for_user(known_user, [item])

    # a different user id carrying the same rating history
    # (items as index, ratings as values)
    new_ratings = pd.Series([4.0, 5.0], index=[1, 2])
    new_preds = algo.predict_for_user(1, [item], new_ratings)

    assert abs(preds.loc[item] - new_preds.loc[item]) <= 0.1
def test_als_predict_for_new_users_with_new_ratings():
    """
    Test if ImplicitMF predictions using the same ratings for a new
    user match those for the corresponding known user, across several users.
    """
    new_u_id = -1  # sentinel id never present in the ratings
    ratings = lktu.ml_test.ratings

    np.random.seed(45)
    users = np.random.choice(ratings.user.unique(), 3)
    items = np.random.choice(ratings.item.unique(), 2)

    algo = als.ImplicitMF(20, iterations=10, method="lu")
    algo.fit(ratings)
    _log.debug("Items: " + str(items))

    for u in users:
        _log.debug(f"user: {u}")
        preds = algo.predict_for_user(u, items)
        upos = algo.user_index_.get_loc(u)
        user_data = ratings[ratings.user == u]
        _log.debug("user_features from fit: " + str(algo.user_features_[upos, :]))

        # the user's rating history, supplied as a fresh-user profile
        new_ratings = user_data.set_index('item')['rating'].copy()
        new_preds = algo.predict_for_user(new_u_id, items, new_ratings)

        _log.debug("preds: " + str(preds.values))
        _log.debug("new_preds: " + str(new_preds.values))
        _log.debug("------------")
        assert np.all(np.abs(preds.values - new_preds.values) <= 0.1)
def all_movie_recommends(ratings, optionList):
    """Batch-evaluate the selected algorithms over 5 user partitions.

    Returns (all_recs, test_data) as concatenated DataFrames.
    """
    # option code -> (label, model); all models are created up front and the
    # same instances are re-fitted on every fold, as before
    models = {
        1: ('BasicBias', basic.Bias()),
        2: ('ItemItem', iknn.ItemItem(20)),
        3: ('UserUser', uknn.UserUser(20)),
        4: ('ALS-Biased', als.BiasedMF(50)),
        5: ('ALS-Implicit', als.ImplicitMF(50)),
        6: ('FunkSVD', funksvd.FunkSVD(50)),
    }

    all_recs = []
    test_data = []
    splits = xf.partition_users(ratings[['user', 'item', 'rating']], 5,
                                xf.SampleFrac(0.2))
    for train, test in splits:
        test_data.append(test)
        for option in optionList:
            if option in models:
                label, model = models[option]
                all_recs.append(batch_eval(label, model, train, test))

    return (pd.concat(all_recs, ignore_index=True),
            pd.concat(test_data, ignore_index=True))
def get_algo_class(algo):
    """Resolve an algorithm key to a fresh algorithm instance.

    Returns None for unrecognized keys, matching the original fall-through.
    """
    # factories keep construction lazy: only the requested algorithm is built
    builders = {
        'popular': lambda: basic.Popular(),
        'bias': lambda: basic.Bias(users=False),
        'topn': lambda: basic.TopN(basic.Bias()),
        'itemitem': lambda: iknn.ItemItem(nnbrs=-1),
        'useruser': lambda: uknn.UserUser(nnbrs=5),
        'biasedmf': lambda: als.BiasedMF(50, iterations=10),
        'implicitmf': lambda: als.ImplicitMF(20, iterations=10),
        'funksvd': lambda: svd.FunkSVD(20, iterations=20),
        'tf_bpr': lambda: lktf.BPR(20, batch_size=1024, epochs=5, neg_count=2,
                                   rng_spec=42),
    }
    make = builders.get(algo)
    return make() if make is not None else None
""" Basic algorithm definitions as starting points. """ from lenskit.algorithms import item_knn, user_knn, als, funksvd from lenskit.algorithms import basic Bias = basic.Bias(damping=5) Pop = basic.Popular() II = item_knn.ItemItem(20, save_nbrs=2500) UU = user_knn.UserUser(30) ALS = als.BiasedMF(50) IALS = als.ImplicitMF(50) MFSGD = funksvd.FunkSVD(50)
# Load the MovieLens 1M ratings from the local datasets directory.
ml1m = ML1M('../datasets/ml-1m')
ratings = ml1m.ratings

# Non-personalized baselines
random = basic.Random()
popular = basic.Popular()

# Neighborhood models at three neighborhood sizes
item_to_item_100 = item_knn.ItemItem(100)
item_to_item_200 = item_knn.ItemItem(200)
item_to_item_500 = item_knn.ItemItem(500)
user_to_user_100 = user_knn.UserUser(100)
user_to_user_200 = user_knn.UserUser(200)
user_to_user_500 = user_knn.UserUser(500)

# Matrix-factorization models at three factor counts
biased_mf_50 = als.BiasedMF(50)
biased_mf_100 = als.BiasedMF(100)
biased_mf_200 = als.BiasedMF(200)
implicit_mf_50 = als.ImplicitMF(50)
implicit_mf_100 = als.ImplicitMF(100)
implicit_mf_200 = als.ImplicitMF(200)
funk_svd_mf_50 = funksvd.FunkSVD(50)
funk_svd_mf_100 = funksvd.FunkSVD(100)
funk_svd_mf_200 = funksvd.FunkSVD(200)

# Other model families
bayesian = BPR()
hierarchical_poisson_fact_50 = HPF(50)
hierarchical_poisson_fact_100 = HPF(100)
hierarchical_poisson_fact_200 = HPF(200)

# Single random holdout split of the (user, item, rating) triples.
# NOTE(review): presumably sklearn's train_test_split — confirm import.
train, test = train_test_split(ratings[['user', 'item', 'rating']],
                               test_size=0.2)

# NOTE(review): `eval` shadows the Python builtin; consider renaming if this
# script grows.
eval = batch.MultiEval('../recs/cf', recommend=NUM_OF_RECS)
eval.add_datasets((train, test), name='ml-1m')