コード例 #1
0
    def test_splits(self):
        """Exercise the three ``split_train_test`` modes of ``ObservationsDF``.

        Reads the ratings CSV, wraps it in an ``ObservationsDF`` and calls
        ``sample_observations(n_users=1000, n_items=1000)``. Then checks:

        1. a plain ratio split (its result is stored on ``self.state``),
        2. a split with ``users_ratio`` set,
        3. a split driven by the timestamp column.

        Every split is passed through ``self._obs_split_data_check``.
        """
        from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF

        obs = ObservationsDF(
            pd.read_csv(rating_csv_path),
            uid_col='userid',
            iid_col='itemid',
            timestamp_col='timestamp',
        ).sample_observations(n_users=1000, n_items=1000)

        test_fraction = 0.2

        # 1) plain split; keep the result on the shared state object
        plain_train, plain_test = obs.split_train_test(ratio=test_fraction)
        self._obs_split_data_check(obs, plain_train, plain_test)
        self.state.train_obs = plain_train
        self.state.test_obs = plain_test

        # 2) split with users_ratio: the number of distinct test users
        #    divided by the number of distinct train users should be close
        #    to the requested users ratio (one decimal place)
        users_fraction = 0.2
        part_train, part_test = obs.split_train_test(
            ratio=test_fraction, users_ratio=users_fraction)
        self._obs_split_data_check(obs, part_train, part_test)
        n_test_users = part_test.df_obs['userid'].nunique()
        n_train_users = part_train.df_obs['userid'].nunique()
        self.assertAlmostEqual(
            users_fraction, n_test_users / n_train_users, places=1)

        # 3) split on the timestamp column: the earliest test timestamp
        #    must not precede the latest train timestamp
        ts_col = obs.timestamp_col
        early_obs, late_obs = obs.split_train_test(
            ratio=test_fraction, time_split_column=ts_col)
        self._obs_split_data_check(obs, early_obs, late_obs)
        earliest_test = late_obs.df_obs[ts_col].min()
        latest_train = early_obs.df_obs[ts_col].max()
        self.assertGreaterEqual(earliest_test, latest_train)
コード例 #2
0
 def fit(self, train_obs: ObservationsDF, *args, **kwargs):
     """Fit the factorizer and the regressor on separate parts of ``train_obs``.

     ``train_obs`` is split with ``ratio=self.stacking_split``. The first
     part of the split is handed to ``_set_data`` and ``_fit_factorizer``,
     the second part to ``_fit_regressor``. Item features are set from the
     full, unsplit ``train_obs``.

     Extra positional and keyword arguments are accepted but not used in
     this method.
     """
     split_parts = train_obs.split_train_test(ratio=self.stacking_split)
     obs_for_factors, obs_for_regressor = split_parts

     # item features come from all of the training observations
     self._set_item_features_df(train_obs)

     # factorizer stage sees only the first part of the split
     self._set_data(obs_for_factors)
     self._fit_factorizer(obs_for_factors)

     # regressor stage sees only the second part of the split
     self._fit_regressor(obs_for_regressor)
コード例 #3
0
 def _setup_obs_handler(self):
     """Create train/test observations and store them on ``self.state``.

     Reads the ratings CSV, wraps it in an ``ObservationsDF`` using this
     object's user/item id column names, calls ``sample_observations`` with
     ``n_users=1000, n_items=1000`` and splits the result with
     ``ratio=0.2, users_ratio=1.0``. The train observations' ``df_obs`` is
     then replaced by the output of ``_add_testing_obs_data``.
     """
     sampled_obs = ObservationsDF(
         pd.read_csv(rating_csv_path),
         uid_col=self.user_id_col,
         iid_col=self.item_id_col,
     ).sample_observations(n_users=1000, n_items=1000)

     train_part, test_part = sampled_obs.split_train_test(
         ratio=0.2, users_ratio=1.0)
     self.state.train_obs = train_part
     self.state.test_obs = test_part

     # add some fake data for sanity tests
     train_handler = self.state.train_obs
     train_handler.df_obs = self._add_testing_obs_data(train_handler.df_obs)
コード例 #4
0
"""
This is an example on datasets-1M demonstrating:
    - More advanced fitting features: fit, evaluation, early stopping, hyper-param search
"""

from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data
import pandas as pd
from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF
from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender

# fetch the prepared CSV paths; only the ratings file is read here
data_paths = get_and_prep_data()
rating_csv_path, users_csv_path, movies_csv_path = data_paths
ratings_df = pd.read_csv(rating_csv_path)

# wrap the ratings in an observations handler and hold out part of it
# (split_train_test is called with ratio=0.2)
obs = ObservationsDF(
    ratings_df,
    uid_col='userid',
    iid_col='itemid',
)
train_obs, test_obs = obs.split_train_test(ratio=0.2)

# recommender with default constructor arguments
lfm_rec = LightFMRecommender()

# train LightFM with early stopping, then print the ranking evaluation
# computed on the held-out observations
early_stop_settings = dict(
    epochs_max=30,
    epochs_step=1,
    stop_patience=1,
    valid_ratio=0.2,
    metric='n-MRR@10',
    refit_on_all=True,
)
lfm_rec.fit_with_early_stop(train_obs, **early_stop_settings)

early_stop_report = lfm_rec.eval_on_test_by_ranking(
    test_obs.df_obs, prefix='lfm early stop ')
print(early_stop_report)

# perform a hyperparameter search on LightFM recommender
space = lfm_rec.guess_search_space()
hp_results = lfm_rec.hyper_param_search(
コード例 #5
0
"""
This is an example on datasets-1M demonstrating recommenders from spotlight library
"""

from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data
import pandas as pd
from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF

# fetch the prepared CSV paths; only the ratings file is read here
data_paths = get_and_prep_data()
rating_csv_path, users_csv_path, movies_csv_path = data_paths
ratings_df = pd.read_csv(rating_csv_path)

# the timestamp column is registered on the handler so that the split
# below can be made on it
obs = ObservationsDF(
    ratings_df,
    uid_col='userid',
    iid_col='itemid',
    timestamp_col='timestamp',
)
train_obs, test_obs = obs.split_train_test(
    ratio=0.2,
    time_split_column=obs.timestamp_col,
)


from ml_recsys_tools.recommenders.spotlight_recommenders import EmbeddingFactorsRecommender

# embedding-factors recommender configured with the 'adaptive_hinge' loss
emb_model_params = {'loss': 'adaptive_hinge', 'n_iter': 1}
emb_rec = EmbeddingFactorsRecommender(model_params=emb_model_params)

# fit with early stopping; the plain alternative is left disabled:
#   emb_rec.fit(train_obs)
emb_rec.fit_with_early_stop(train_obs, epochs_max=5, epochs_step=1)

emb_report = emb_rec.eval_on_test_by_ranking(
    test_obs, prefix='implicit embeddings ')
print(emb_report)


# Sequence model, attempting to reproduce:
# https://github.com/maciejkula/spotlight/tree/master/examples/movielens_sequence
from ml_recsys_tools.recommenders.spotlight_recommenders import SequenceEmbeddingRecommender

seq_model_params = {
    'n_iter': 15,
    'embedding_dim': 32,
    'batch_size': 32,
    'learning_rate': 0.01,
}
seq_fit_params = {
    'max_sequence_length': 200,
    'timestamp_col': 'timestamp',
}
seq_rec = SequenceEmbeddingRecommender(
    model_params=seq_model_params,
    fit_params=seq_fit_params,
)
seq_rec.fit(train_obs)
# disabled alternative (note that it refers to emb_rec, not seq_rec):
#   emb_rec.fit_with_early_stop(train_obs, epochs_max=30, epochs_step=3)

seq_report = seq_rec.eval_on_test_by_ranking(
    test_obs, prefix='lstm ', include_train=False)
print(seq_report)