Exemple #1
0
 def _setup_obs_handler(self):
     """Prepare the train/test observation handlers stored on ``self.state``.

     Reads the CSV at ``rating_csv_path``, wraps it in an ``ObservationsDF``
     using ``self.user_id_col`` / ``self.item_id_col``, samples it with
     ``n_users=1000, n_items=1000`` and splits the sample with
     ``ratio=0.2, users_ratio=1.0``. The train handler's ``df_obs`` is then
     replaced by the output of ``self._add_testing_obs_data``.
     """
     sampled = ObservationsDF(
         pd.read_csv(rating_csv_path),
         uid_col=self.user_id_col,
         iid_col=self.item_id_col,
     ).sample_observations(n_users=1000, n_items=1000)

     train_part, test_part = sampled.split_train_test(ratio=0.2, users_ratio=1.0)
     self.state.train_obs = train_part
     self.state.test_obs = test_part

     # add some fake data for sanity tests
     df_with_extras = self._add_testing_obs_data(self.state.train_obs.df_obs)
     self.state.train_obs.df_obs = df_with_extras
Exemple #2
0
    def test_data(self):
        """Check the prepared CSV files and the ``ObservationsDF`` summary.

        Asserts the column names and row counts of the ratings, users and
        movies CSVs returned by ``get_and_prep_data(movielens_dir)``, then
        checks the values reported by ``data_info()`` for a handler built
        from the ratings table.
        """
        from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data
        csv_paths = get_and_prep_data(movielens_dir)
        ratings_path, users_path, movies_path = csv_paths

        # ratings table
        ratings_df = pd.read_csv(ratings_path)
        expected_rating_cols = ['rating', 'timestamp', 'itemid', 'userid']
        self.assertListEqual(list(ratings_df.columns), expected_rating_cols)
        self.assertEqual(len(ratings_df), 1000209)

        # users table
        users_df = pd.read_csv(users_path)
        expected_user_cols = [
            'user_ind', 'gender', 'age', 'occupation', 'zipcode', 'index',
            'occupation_name', 'userid',
        ]
        self.assertListEqual(list(users_df.columns), expected_user_cols)
        self.assertEqual(len(users_df), 6040)

        # movies table
        movies_df = pd.read_csv(movies_path)
        expected_movie_cols = ['item_ind', 'itemid', 'genres']
        self.assertListEqual(list(movies_df.columns), expected_movie_cols)
        self.assertEqual(len(movies_df), 3883)

        # summary reported by the observations handler
        from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF
        info = ObservationsDF(df_obs=ratings_df).data_info()
        expected_info = {
            'len': 989539,
            'n_unique_items': 3706,
            'n_unique_users': 5796,
            'ratings_20_pctl': 3.0,
            'ratings_80_pctl': 5.0,
        }
        for key, expected_value in expected_info.items():
            self.assertEqual(info[key], expected_value)
    def test_data(self):
        """Check the prepared CSV files and the ``ObservationsDF`` summary.

        Asserts the column names and row counts of the ratings and users
        CSVs, the set of columns and row count of the movies CSV (all
        returned by ``get_and_prep_data(movielens_dir)``), then checks the
        values reported by ``data_info()`` for a handler built from the
        ratings table.
        """
        from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data
        csv_paths = get_and_prep_data(movielens_dir)
        ratings_path, users_path, movies_path = csv_paths

        # ratings table
        ratings_df = pd.read_csv(ratings_path)
        expected_rating_cols = ['rating', 'timestamp', 'itemid', 'userid']
        self.assertListEqual(list(ratings_df.columns), expected_rating_cols)
        self.assertEqual(len(ratings_df), 1000209)

        # users table
        users_df = pd.read_csv(users_path)
        expected_user_cols = [
            'gender', 'age', 'occupation', 'zipcode', 'index',
            'occupation_name', 'userid',
        ]
        self.assertListEqual(list(users_df.columns), expected_user_cols)
        self.assertEqual(len(users_df), 6040)

        # movies table: column order is not checked, only the set of names
        movies_df = pd.read_csv(movies_path)
        expected_movie_cols = {
            'itemid', 'Adventure', 'FilmNoir', 'Comedy', 'SciFi',
            'Fantasy', 'Crime', 'Mystery', 'Action', 'Thriller', 'Horror',
            'Musical', 'Drama', 'Western', 'War', 'Animation', 'Romance',
            'Childrens', 'Documentary',
        }
        self.assertSetEqual(set(movies_df.columns), expected_movie_cols)
        self.assertEqual(len(movies_df), 3883)

        # summary reported by the observations handler
        from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF
        info = ObservationsDF(df_obs=ratings_df).data_info()
        expected_info = {
            'len': 989539,
            'n_unique_items': 3706,
            'n_unique_users': 5796,
            'ratings_20_pctl': 3.0,
            'ratings_80_pctl': 5.0,
        }
        for key, expected_value in expected_info.items():
            self.assertEqual(info[key], expected_value)
Exemple #4
0
 def fit(self, train_obs: ObservationsDF, *args, **kwargs):
     """Fit the factorizer and the regressor on separate parts of ``train_obs``.

     ``train_obs`` is split with ``ratio=self.stacking_split``. The first
     part of the split is passed to ``_set_data`` and ``_fit_factorizer``,
     the second part to ``_fit_regressor``. ``_set_item_features_df``
     receives the full, unsplit ``train_obs``.

     ``*args`` and ``**kwargs`` are accepted but not used in this method.
     """
     split_parts = train_obs.split_train_test(ratio=self.stacking_split)
     factorizer_part, regressor_part = split_parts

     # item features come from the complete training handler, not a split part
     self._set_item_features_df(train_obs)

     self._set_data(factorizer_part)
     self._fit_factorizer(factorizer_part)
     self._fit_regressor(regressor_part)
Exemple #5
0
    def test_splits(self):
        """Run ``_split_tester`` on a plain handler and on one with item features.

        Both handlers are built from the ratings CSV with the same column
        settings and are sampled with ``n_users=1000, n_items=1000`` before
        being handed to ``self._split_tester``.
        """
        ratings_df = pd.read_csv(rating_csv_path)
        column_spec = {
            'uid_col': 'userid',
            'iid_col': 'itemid',
            'timestamp_col': 'timestamp',
        }

        # observations only
        plain_obs = ObservationsDF(ratings_df, **column_spec)
        plain_sample = plain_obs.sample_observations(n_users=1000, n_items=1000)
        self._split_tester(plain_sample)

        # observations together with the items table
        movies_table = pd.read_csv(movies_csv_path)
        featured_obs = ObsWithFeatures(
            df_obs=ratings_df,
            df_items=movies_table,
            item_id_col='itemid',
            **column_spec)
        featured_sample = featured_obs.sample_observations(n_users=1000, n_items=1000)
        self._split_tester(featured_sample)
Exemple #6
0
    def test_splits(self):
        """Exercise ``split_train_test`` in three modes on a sampled handler.

        Covers a regular split, a split restricted by ``users_ratio`` and a
        split by the timestamp column. Each result is passed to
        ``self._obs_split_data_check``; the regular split is also stored on
        ``self.state`` as ``train_obs`` / ``test_obs``.
        """
        from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF

        full_obs = ObservationsDF(
            pd.read_csv(rating_csv_path),
            uid_col='userid',
            iid_col='itemid',
            timestamp_col='timestamp')
        sampled = full_obs.sample_observations(n_users=1000, n_items=1000)
        test_ratio = 0.2

        # regular split
        train_part, test_part = sampled.split_train_test(ratio=test_ratio)
        self._obs_split_data_check(sampled, train_part, test_part)
        self.state.train_obs = train_part
        self.state.test_obs = test_part

        # split for only some users
        users_fraction = 0.2
        train_part, test_part = sampled.split_train_test(
            ratio=test_ratio, users_ratio=users_fraction)
        self._obs_split_data_check(sampled, train_part, test_part)
        n_test_users = test_part.df_obs['userid'].nunique()
        n_train_users = train_part.df_obs['userid'].nunique()
        self.assertAlmostEqual(
            users_fraction, n_test_users / n_train_users, places=1)

        # split by timestamp
        ts_col = sampled.timestamp_col
        train_part, test_part = sampled.split_train_test(
            ratio=test_ratio, time_split_column=ts_col)
        self._obs_split_data_check(sampled, train_part, test_part)
        # the earliest test timestamp must not precede the latest train timestamp
        earliest_test = test_part.df_obs[ts_col].min()
        latest_train = train_part.df_obs[ts_col].max()
        self.assertGreaterEqual(earliest_test, latest_train)
"""
This is an example on datasets-1M demonstrating:
    - More advanced fitting features: fit, evaluation, early stopping, hyper-param search
"""

from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data
import pandas as pd
from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF
from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender

# get the prepared dataset file paths and load the ratings table
rating_csv_path, users_csv_path, movies_csv_path = get_and_prep_data()
ratings_df = pd.read_csv(rating_csv_path)

# wrap the ratings in a data handler keyed by the 'userid' / 'itemid' columns,
# split it into train and test handlers, and create the recommender
obs = ObservationsDF(ratings_df, uid_col='userid', iid_col='itemid')
train_obs, test_obs = obs.split_train_test(ratio=0.2)
lfm_rec = LightFMRecommender()

# train LightFM with early stopping and print evaluation results
# NOTE(review): presumably valid_ratio is the share of train_obs held out for
# validation and metric is the quantity monitored for stopping - confirm
# against fit_with_early_stop
lfm_rec.fit_with_early_stop(train_obs,
                            epochs_max=30,
                            epochs_step=1,
                            stop_patience=1,
                            valid_ratio=0.2,
                            metric='n-MRR@10',
                            refit_on_all=True)
# evaluation is given the test handler's dataframe (test_obs.df_obs)
print(
    lfm_rec.eval_on_test_by_ranking(test_obs.df_obs, prefix='lfm early stop '))

# perform a hyperparameter search on LightFM recommender
space = lfm_rec.guess_search_space()
hp_results = lfm_rec.hyper_param_search(
Exemple #8
0
"""
This is an example on datasets-1M demonstrating recommenders from spotlight library
"""

from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data
import pandas as pd
from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF

# get the prepared dataset file paths and load the ratings table
rating_csv_path, users_csv_path, movies_csv_path = get_and_prep_data()
ratings_df = pd.read_csv(rating_csv_path)

# build a data handler that is also told the timestamp column, and split
# train/test by passing that column as time_split_column
obs = ObservationsDF(ratings_df, uid_col='userid', iid_col='itemid', timestamp_col='timestamp')
train_obs, test_obs = obs.split_train_test(ratio=0.2, time_split_column=obs.timestamp_col)


# embedding-factors recommender: fitted via fit_with_early_stop, then
# evaluated on the test handler
from ml_recsys_tools.recommenders.spotlight_recommenders import EmbeddingFactorsRecommender
emb_rec = EmbeddingFactorsRecommender(model_params=dict(loss='adaptive_hinge', n_iter=1))
# emb_rec.fit(train_obs)
emb_rec.fit_with_early_stop(train_obs, epochs_max=5, epochs_step=1)
print(emb_rec.eval_on_test_by_ranking(test_obs, prefix='implicit embeddings '))


# trying to reproduce this:
# https://github.com/maciejkula/spotlight/tree/master/examples/movielens_sequence
from ml_recsys_tools.recommenders.spotlight_recommenders import SequenceEmbeddingRecommender
seq_rec = SequenceEmbeddingRecommender(
    model_params=dict(n_iter=15, embedding_dim=32, batch_size=32, learning_rate=0.01),
    fit_params=dict(max_sequence_length=200, timestamp_col='timestamp'))
seq_rec.fit(train_obs)
# NOTE(review): the commented-out alternative below calls emb_rec, not
# seq_rec - confirm which recommender was intended before re-enabling it
# emb_rec.fit_with_early_stop(train_obs, epochs_max=30, epochs_step=3)
# evaluate the sequence recommender with include_train=False
print(seq_rec.eval_on_test_by_ranking(test_obs, prefix='lstm ', include_train=False))
"""
using multiple test sets
"""

# dataset: download and prepare dataframes
import pandas as pd
from ml_recsys_tools.datasets.prep_movielense_data import get_and_prep_data
from ml_recsys_tools.recommenders.lightfm_recommender import LightFMRecommender

# get the prepared dataset file paths (ratings, users, movies CSVs)
rating_csv_path, users_csv_path, movies_csv_path = get_and_prep_data()

# read the interactions dataframe, create a data handler object and split it into train and test
ratings_df = pd.read_csv(rating_csv_path)
from ml_recsys_tools.data_handlers.interaction_handlers_base import ObservationsDF

# no column names are passed here, so the handler's own defaults apply
obs = ObservationsDF(ratings_df)
# NOTE(review): presumably users_ratio=0.2 limits the test split to a subset
# of users - confirm against split_train_test
train_obs, test_obs = obs.split_train_test(ratio=0.2, users_ratio=0.2)


def construct_multiple_test_sets(test_df, train_df):
    # by user history - active and inactive users
    user_hist_counts = train_df.userid.value_counts()
    user_hist_counts.hist(bins=100, alpha=0.5)
    active_users = user_hist_counts[user_hist_counts >= 300].index.tolist()
    test_df_act_us = test_df[test_df.userid.isin(active_users)]
    test_df_nonact_us = test_df[~test_df.userid.isin(active_users)]

    # by item popularity- popular and unpopular items
    item_hist_counts = train_df.itemid.value_counts()
    item_hist_counts.hist(bins=100, alpha=0.5)
    popular_items = item_hist_counts[item_hist_counts >= 1000].index.tolist()