def test_data(self):
    """Check the prepared MovieLens CSV files and the observations summary.

    Reads the ratings, users and movies CSVs whose paths are returned by
    ``get_and_prep_data(movielens_dir)``, asserts their exact column order
    and row counts, then builds an ``ObservationsDF`` from the ratings and
    asserts selected entries of its ``data_info()`` result.
    """
    from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data

    csv_paths = get_and_prep_data(movielens_dir)
    ratings_path, users_path, movies_path = csv_paths

    # ratings file
    ratings = pd.read_csv(ratings_path)
    expected_rating_cols = ['rating', 'timestamp', 'itemid', 'userid']
    self.assertListEqual(list(ratings.columns), expected_rating_cols)
    self.assertEqual(len(ratings), 1000209)

    # users file
    users = pd.read_csv(users_path)
    expected_user_cols = [
        'user_ind', 'gender', 'age', 'occupation', 'zipcode', 'index',
        'occupation_name', 'userid',
    ]
    self.assertListEqual(list(users.columns), expected_user_cols)
    self.assertEqual(len(users), 6040)

    # movies file
    movies = pd.read_csv(movies_path)
    expected_movie_cols = ['item_ind', 'itemid', 'genres']
    self.assertListEqual(list(movies.columns), expected_movie_cols)
    self.assertEqual(len(movies), 3883)

    from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF

    observations = ObservationsDF(df_obs=ratings)
    summary = observations.data_info()

    # Checked in a fixed order so the first failing key is the same one
    # a sequence of individual assertions would report.
    expected_summary = (
        ('len', 989539),
        ('n_unique_items', 3706),
        ('n_unique_users', 5796),
        ('ratings_20_pctl', 3.0),
        ('ratings_80_pctl', 5.0),
    )
    for key, expected_value in expected_summary:
        self.assertEqual(summary[key], expected_value)
def test_data(self):
    """Check the prepared MovieLens CSV files and the observations summary.

    Reads the ratings, users and movies CSVs whose paths are returned by
    ``get_and_prep_data(movielens_dir)``. Ratings and users are checked for
    exact column order and row count; movies are checked for the set of
    columns (order-insensitive) and row count. The ratings are then wrapped
    in an ``ObservationsDF`` and selected ``data_info()`` entries asserted.
    """
    from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data

    csv_paths = get_and_prep_data(movielens_dir)
    ratings_path, users_path, movies_path = csv_paths

    # ratings file
    ratings = pd.read_csv(ratings_path)
    expected_rating_cols = ['rating', 'timestamp', 'itemid', 'userid']
    self.assertListEqual(list(ratings.columns), expected_rating_cols)
    self.assertEqual(len(ratings), 1000209)

    # users file
    users = pd.read_csv(users_path)
    expected_user_cols = [
        'gender', 'age', 'occupation', 'zipcode', 'index',
        'occupation_name', 'userid',
    ]
    self.assertListEqual(list(users.columns), expected_user_cols)
    self.assertEqual(len(users), 6040)

    # movies file: the item id plus one column per genre name
    movies = pd.read_csv(movies_path)
    expected_movie_cols = {
        'itemid',
        'Adventure', 'FilmNoir', 'Comedy', 'SciFi', 'Fantasy', 'Crime',
        'Mystery', 'Action', 'Thriller', 'Horror', 'Musical', 'Drama',
        'Western', 'War', 'Animation', 'Romance', 'Childrens',
        'Documentary',
    }
    self.assertSetEqual(set(movies.columns), expected_movie_cols)
    self.assertEqual(len(movies), 3883)

    from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF

    observations = ObservationsDF(df_obs=ratings)
    summary = observations.data_info()

    # Checked in a fixed order so the first failing key is the same one
    # a sequence of individual assertions would report.
    expected_summary = (
        ('len', 989539),
        ('n_unique_items', 3706),
        ('n_unique_users', 5796),
        ('ratings_20_pctl', 3.0),
        ('ratings_80_pctl', 5.0),
    )
    for key, expected_value in expected_summary:
        self.assertEqual(summary[key], expected_value)
""" This is an example on datasets-1M demonstrating: - More advanced fitting features: fit, evaluation, early stopping, hyper-param search """ from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data import pandas as pd from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender rating_csv_path, users_csv_path, movies_csv_path = get_and_prep_data() ratings_df = pd.read_csv(rating_csv_path) obs = ObservationsDF(ratings_df, uid_col='userid', iid_col='itemid') train_obs, test_obs = obs.split_train_test(ratio=0.2) lfm_rec = LightFMRecommender() # train LightFM with early stopping and print evaluation results lfm_rec.fit_with_early_stop(train_obs, epochs_max=30, epochs_step=1, stop_patience=1, valid_ratio=0.2, metric='n-MRR@10', refit_on_all=True) print( lfm_rec.eval_on_test_by_ranking(test_obs.df_obs, prefix='lfm early stop ')) # perform a hyperparameter search on LightFM recommender space = lfm_rec.guess_search_space() hp_results = lfm_rec.hyper_param_search(
import pandas as pd

from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data
from ml_recsys_tools.utils.testing import TestCaseWithState
from tests.test_movielens_data import movielens_dir
from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF
from ml_recsys_tools.data_handlers.interactions_with_features import ObsWithFeatures

# Evaluated once, at import time of this module.
rating_csv_path, users_csv_path, movies_csv_path = get_and_prep_data(
    movielens_dir)


class TestRecommendersBasic(TestCaseWithState):
    """Stateful test case; the helpers below validate observation splits."""

    def _obs_split_data_check(self, obs_full, obs1, obs2):
        """Assert that ``obs1`` and ``obs2`` form a clean split of ``obs_full``.

        Two conditions are checked on the ``df_obs`` frames: the row counts
        of the two parts add up to the row count of the full set, and no
        ``(userid, itemid)`` pair occurs in both parts.
        """
        # all the data is still there
        parts_total = len(obs1.df_obs) + len(obs2.df_obs)
        self.assertEqual(parts_total, len(obs_full.df_obs))

        # no intersections
        pair_keys = ['userid', 'itemid']
        shared_pairs = pd.merge(
            obs1.df_obs, obs2.df_obs, on=pair_keys, how='inner')
        self.assertEqual(len(shared_pairs), 0)

    def _split_tester(self, obs):
        """Split ``obs``, validate the split and keep both parts on ``self.state``."""
        ratio = 0.2

        # regular split
        train_obs, test_obs = obs.split_train_test(ratio=ratio)
        self._obs_split_data_check(obs, train_obs, test_obs)
        self.state.train_obs = train_obs
        self.state.test_obs = test_obs