def test_splits(self):
    """Exercise the plain, partial-users and time-based variants of split_train_test."""
    from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF

    split_ratio = 0.2
    users_fraction = 0.2

    # Build the observations handler from the ratings file and sample it down.
    full_obs = ObservationsDF(
        pd.read_csv(rating_csv_path),
        uid_col='userid',
        iid_col='itemid',
        timestamp_col='timestamp',
    ).sample_observations(n_users=1000, n_items=1000)

    # Plain split: the result is checked and also stored on self.state.
    train_part, test_part = full_obs.split_train_test(ratio=split_ratio)
    self._obs_split_data_check(full_obs, train_part, test_part)
    self.state.train_obs = train_part
    self.state.test_obs = test_part

    # Split with users_ratio: the ratio of distinct test users to distinct
    # train users is compared to the requested users_ratio (one decimal place).
    train_part, test_part = full_obs.split_train_test(
        ratio=split_ratio, users_ratio=users_fraction)
    self._obs_split_data_check(full_obs, train_part, test_part)
    n_test_users = test_part.df_obs['userid'].nunique()
    n_train_users = train_part.df_obs['userid'].nunique()
    self.assertAlmostEqual(users_fraction, n_test_users / n_train_users, places=1)

    # Split on the timestamp column: the smallest test timestamp must be
    # greater than or equal to the largest train timestamp.
    ts_col = full_obs.timestamp_col
    train_part, test_part = full_obs.split_train_test(
        ratio=split_ratio, time_split_column=ts_col)
    self._obs_split_data_check(full_obs, train_part, test_part)
    earliest_test = test_part.df_obs[ts_col].min()
    latest_train = train_part.df_obs[ts_col].max()
    self.assertGreaterEqual(earliest_test, latest_train)
def fit(self, train_obs: ObservationsDF, *args, **kwargs):
    """
    Fit the factorizer and the regressor on two separate parts of the observations.

    :param train_obs: observations to fit on; they are split with
        ``self.stacking_split`` passed as the split ratio
    :param args: accepted but not used by this method
    :param kwargs: accepted but not used by this method
    """
    factorizer_part, regressor_part = train_obs.split_train_test(
        ratio=self.stacking_split)

    # item features are set from the full observations, not from either part
    self._set_item_features_df(train_obs)

    # the first part is used for the data setup and the factorizer,
    # the second part is used for the regressor
    self._set_data(factorizer_part)
    self._fit_factorizer(factorizer_part)
    self._fit_regressor(regressor_part)
def _setup_obs_handler(self):
    """Load a sample of the ratings, split it, and keep both parts on ``self.state``."""
    sampled_obs = ObservationsDF(
        pd.read_csv(rating_csv_path),
        uid_col=self.user_id_col,
        iid_col=self.item_id_col,
    ).sample_observations(n_users=1000, n_items=1000)

    train_part, test_part = sampled_obs.split_train_test(ratio=0.2, users_ratio=1.0)
    self.state.train_obs = train_part
    self.state.test_obs = test_part

    # add some fake data for sanity tests
    self.state.train_obs.df_obs = self._add_testing_obs_data(
        self.state.train_obs.df_obs)
"""
This is an example on datasets-1M demonstrating:
    - More advanced fitting features: fit, evaluation, early stopping, hyper-param search
"""

from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data
import pandas as pd
from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF
from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender

# get the paths of the prepared CSV files; only the ratings file is read below
rating_csv_path, users_csv_path, movies_csv_path = get_and_prep_data()

# read the ratings, wrap them in an observations handler and split with ratio=0.2
ratings_df = pd.read_csv(rating_csv_path)
obs = ObservationsDF(ratings_df, uid_col='userid', iid_col='itemid')
train_obs, test_obs = obs.split_train_test(ratio=0.2)

lfm_rec = LightFMRecommender()

# train LightFM with early stopping and print evaluation results
lfm_rec.fit_with_early_stop(train_obs,
                            epochs_max=30,
                            epochs_step=1,
                            stop_patience=1,
                            valid_ratio=0.2,
                            metric='n-MRR@10',
                            refit_on_all=True)
# the evaluation is given the test observations' dataframe (test_obs.df_obs)
print(
    lfm_rec.eval_on_test_by_ranking(test_obs.df_obs, prefix='lfm early stop '))

# perform a hyperparameter search on LightFM recommender
space = lfm_rec.guess_search_space()
# NOTE(review): the arguments of the call below are not visible in this excerpt
hp_results = lfm_rec.hyper_param_search(
"""
Example on datasets-1M: fitting and evaluating recommenders from the spotlight library.
"""

from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data
import pandas as pd
from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF

# Get the paths of the prepared CSV files; only the ratings file is read here.
rating_csv_path, users_csv_path, movies_csv_path = get_and_prep_data()
ratings_df = pd.read_csv(rating_csv_path)

# Wrap the ratings (with their timestamp column) and split on that column.
obs = ObservationsDF(
    ratings_df,
    uid_col='userid',
    iid_col='itemid',
    timestamp_col='timestamp',
)
train_obs, test_obs = obs.split_train_test(
    ratio=0.2,
    time_split_column=obs.timestamp_col,
)

# --- embedding factors recommender ---
from ml_recsys_tools.recommenders.spotlight_recommenders import EmbeddingFactorsRecommender

emb_rec = EmbeddingFactorsRecommender(
    model_params={'loss': 'adaptive_hinge', 'n_iter': 1},
)
# plain fit alternative (disabled): emb_rec.fit(train_obs)
emb_rec.fit_with_early_stop(train_obs, epochs_max=5, epochs_step=1)
print(emb_rec.eval_on_test_by_ranking(test_obs, prefix='implicit embeddings '))

# --- sequence embedding recommender ---
# trying to reproduce this:
# https://github.com/maciejkula/spotlight/tree/master/examples/movielens_sequence
from ml_recsys_tools.recommenders.spotlight_recommenders import SequenceEmbeddingRecommender

seq_rec = SequenceEmbeddingRecommender(
    model_params={
        'n_iter': 15,
        'embedding_dim': 32,
        'batch_size': 32,
        'learning_rate': 0.01,
    },
    fit_params={
        'max_sequence_length': 200,
        'timestamp_col': 'timestamp',
    },
)
seq_rec.fit(train_obs)
# early-stopping alternative (disabled):
# emb_rec.fit_with_early_stop(train_obs, epochs_max=30, epochs_step=3)
print(seq_rec.eval_on_test_by_ranking(test_obs, prefix='lstm ', include_train=False))