def main():
    """Fit a BPR LightFM model on MovieLens and print evaluation metrics.

    Prints the shapes of both splits, then recall@k, AUC and DCG for the
    train and the test interactions, and finally ``DONE``.

    Returns:
        int: always ``0``.
    """
    dataset = fetch_movielens()
    train = dataset['train']
    test = dataset['test']

    print(train.shape)
    print(test.shape)

    model = LightFM(learning_rate=0.05, loss='bpr')
    model.fit(train, epochs=5)

    # Recall among the top-k recommendations, averaged over users.
    k = 10
    train_recall = recall_at_k(model, train, k=k).mean()
    test_recall = recall_at_k(model, test, k=k).mean()
    print(f'recall_at_{k}(train): {train_recall}')
    print(f'recall_at_{k}(test) : {test_recall}')

    # AUC, averaged over users.
    train_auc = auc_score(model, train).mean()
    test_auc = auc_score(model, test).mean()
    print(f'auc_score(train): {train_auc}')
    print(f'auc_score(test) : {test_auc}')

    # DCG computed from the dense interaction matrix and the dense rank matrix.
    # NOTE(review): predict_rank output is fed to dcg_score as the "score"
    # argument; presumably dcg_score expects higher-is-better scores while
    # ranks are lower-is-better -- confirm this is intended.
    y_train_preds = model.predict_rank(train)
    y_test_preds = model.predict_rank(test)
    dense_train_ranks = y_train_preds.toarray()
    train_dcg = dcg_score(train.toarray(), dense_train_ranks)
    dense_test_ranks = y_test_preds.toarray()
    test_dcg = dcg_score(test.toarray(), dense_test_ranks)
    print(f'dcg_score(train): {train_dcg}')
    print(f'dcg_score(test) : {test_dcg}')

    print('DONE')
    return 0
def test_matrix_types():
    """Exercise fit/predict with every sparse format and dtype combination.

    For each (matrix class, dtype) pair an empty interaction matrix, matching
    sample weights and empty user/item feature matrices are built and pushed
    through ``fit_partial``, ``predict`` and ``predict_rank``. The test passes
    when none of these calls raises.
    """
    n_users, n_items, n_features = 10, 100, 20

    matrix_classes = (sp.coo_matrix, sp.lil_matrix, sp.csr_matrix, sp.csc_matrix)
    value_types = (np.int32, np.int64, np.float32, np.float64)

    # Same iteration order as nested loops: matrix class outermost.
    combinations = [(cls, dt) for cls in matrix_classes for dt in value_types]

    for make_matrix, value_type in combinations:
        interactions = make_matrix((n_users, n_items), dtype=value_type)
        sample_weights = interactions.tocoo()

        user_feats = make_matrix((n_users, n_features), dtype=value_type)
        item_feats = make_matrix((n_items, n_features), dtype=value_type)

        model = LightFM()
        model.fit_partial(interactions,
                          sample_weight=sample_weights,
                          user_features=user_feats,
                          item_features=item_feats)

        # Ten random (user, item) pairs as int32 index arrays.
        user_ids = np.random.randint(0, n_users, 10).astype(np.int32)
        item_ids = np.random.randint(0, n_items, 10).astype(np.int32)
        model.predict(user_ids,
                      item_ids,
                      user_features=user_feats,
                      item_features=item_feats)

        model.predict_rank(interactions,
                           user_features=user_feats,
                           item_features=item_feats)
def test_predict_ranks():
    """Check rank bounds, rank uniqueness, tie handling and shape validation
    of ``LightFM.predict_rank`` on a model fitted to empty interactions."""
    no_users, no_items = 10, 100

    interactions = sp.coo_matrix((no_users, no_items), dtype=np.float32)

    model = LightFM()
    model.fit_partial(interactions)

    # Ask for the rank of every (user, item) pair.
    all_pairs = sp.csr_matrix(np.ones((no_users, no_items)))
    ranks = model.predict_rank(all_pairs, num_threads=2).todense()

    assert np.all(ranks.min(axis=1) == 0)
    assert np.all(ranks.max(axis=1) == no_items - 1)

    # Every user's ranks must be a permutation of 0..no_items-1.
    for user_ranks in ranks:
        assert np.all(np.sort(user_ranks) == np.arange(no_items))

    # Zero out all parameters so every prediction ties, then re-check.
    for attr in ('user_embeddings', 'item_embeddings',
                 'user_biases', 'item_biases'):
        setattr(model, attr, np.zeros_like(getattr(model, attr)))

    ranks = model.predict_rank(all_pairs, num_threads=2).todense()

    assert np.all(ranks.min(axis=1) == 0)
    assert np.all(ranks.max(axis=1) == 0)

    # A matrix with the wrong dimensions must be rejected.
    wrong_shape = sp.csr_matrix((5, 5))
    with pytest.raises(ValueError):
        model.predict_rank(wrong_shape, num_threads=2)
def test_predict_not_fitted():
    """Every prediction/representation accessor must raise ``ValueError``
    when called on a model that has not been fitted."""
    model = LightFM()

    # Each entry is a zero-argument call expected to fail on an unfitted model.
    calls_requiring_fit = (
        lambda: model.predict(np.arange(10), np.arange(10)),
        lambda: model.predict_rank(1),
        lambda: model.get_user_representations(),
        lambda: model.get_item_representations(),
    )

    for attempt in calls_requiring_fit:
        with pytest.raises(ValueError):
            attempt()
def test_predict_ranks():
    """Check ``LightFM.predict_rank`` on a model fitted to random interactions.

    Covers: rank bounds and uniqueness when ranking every item, exclusion of
    train interactions, pessimistic ranking of ties, and rejection of an
    interaction matrix with the wrong shape.
    """
    no_users, no_items = (10, 100)

    # Seeded so the test data is identical on every run.
    train = sp.rand(no_users, no_items, format='csr', random_state=42)

    model = LightFM()
    model.fit_partial(train)

    # Compute ranks for all items
    rank_input = sp.csr_matrix(np.ones((no_users, no_items)))
    ranks = model.predict_rank(rank_input, num_threads=2).todense()

    assert np.all(ranks.min(axis=1) == 0)
    assert np.all(ranks.max(axis=1) == no_items - 1)

    for row in range(no_users):
        assert np.all(np.sort(ranks[row]) == np.arange(no_items))

    # Train set exclusions. All ranks should be zero
    # if train interactions is dense.
    ranks = model.predict_rank(rank_input,
                               train_interactions=rank_input).todense()
    assert np.all(ranks == 0)

    # Max rank should be num_items - 1 - number of positives
    # in train in that row
    ranks = model.predict_rank(rank_input,
                               train_interactions=train).todense()
    max_rank_per_user = np.squeeze(np.array(ranks.max(axis=1)))
    train_positives_per_user = np.squeeze(np.array(train.getnnz(axis=1)))
    assert np.all(max_rank_per_user == no_items - 1 - train_positives_per_user)

    # Make sure ranks are computed pessimistically when
    # there are ties (that is, equal predictions for every
    # item will assign maximum rank to each).
    model.user_embeddings = np.zeros_like(model.user_embeddings)
    model.item_embeddings = np.zeros_like(model.item_embeddings)
    model.user_biases = np.zeros_like(model.user_biases)
    model.item_biases = np.zeros_like(model.item_biases)

    ranks = model.predict_rank(rank_input, num_threads=2).todense()

    # Derived from no_items rather than hard-coded, so the assertion keeps
    # holding if the fixture size above is changed.
    worst_rank = no_items - 1
    assert np.all(ranks.min(axis=1) == worst_rank)
    assert np.all(ranks.max(axis=1) == worst_rank)

    # Wrong input dimensions
    with pytest.raises(ValueError):
        model.predict_rank(sp.csr_matrix((5, 5)), num_threads=2)
def test_predict_ranks():
    """Check ``LightFM.predict_rank`` on a model fitted to random interactions.

    Covers: rank bounds and uniqueness when ranking every item, exclusion of
    train interactions, behaviour when all predictions tie, and rejection of
    an interaction matrix with the wrong shape.
    """
    no_users, no_items = (10, 100)

    # Seeded so a failure is reproducible; the unseeded call produced
    # different interactions on every run.
    train = sp.rand(no_users, no_items, format='csr', random_state=42)

    model = LightFM()
    model.fit_partial(train)

    # Compute ranks for all items
    rank_input = sp.csr_matrix(np.ones((no_users, no_items)))
    ranks = model.predict_rank(rank_input, num_threads=2).todense()

    assert np.all(ranks.min(axis=1) == 0)
    assert np.all(ranks.max(axis=1) == no_items - 1)

    for row in range(no_users):
        assert np.all(np.sort(ranks[row]) == np.arange(no_items))

    # Train set exclusions. All ranks should be zero
    # if train interactions is dense.
    ranks = model.predict_rank(rank_input,
                               train_interactions=rank_input).todense()
    assert np.all(ranks == 0)

    # Max rank should be num_items - 1 - number of positives
    # in train in that row
    ranks = model.predict_rank(rank_input,
                               train_interactions=train).todense()
    max_rank_per_user = np.squeeze(np.array(ranks.max(axis=1)))
    train_positives_per_user = np.squeeze(np.array(train.getnnz(axis=1)))
    assert np.all(max_rank_per_user == no_items - 1 - train_positives_per_user)

    # Make sure invariants hold when there are ties
    model.user_embeddings = np.zeros_like(model.user_embeddings)
    model.item_embeddings = np.zeros_like(model.item_embeddings)
    model.user_biases = np.zeros_like(model.user_biases)
    model.item_biases = np.zeros_like(model.item_biases)

    ranks = model.predict_rank(rank_input, num_threads=2).todense()

    assert np.all(ranks.min(axis=1) == 0)
    assert np.all(ranks.max(axis=1) == 0)

    # Wrong input dimensions
    with pytest.raises(ValueError):
        model.predict_rank(sp.csr_matrix((5, 5)), num_threads=2)
def test_predict_scores(num_threads=2):
    """Check ``predict_score`` against ``predict`` and ``predict_rank``.

    Verifies that the bulk scores equal the per-user ``predict`` output, that
    thread count and representation precomputation do not change the scores,
    that ranks derived from the scores match ``predict_rank``, that train
    interactions are excluded, that tied predictions score zero, and that a
    wrongly shaped input is rejected.

    Args:
        num_threads: thread count passed to the multi-threaded calls.
    """
    no_users, no_items = (10, 100)

    # Seeded so a failure is reproducible; the unseeded call produced
    # different interactions on every run.
    train = sp.rand(no_users, no_items, format='csr', random_state=42)

    model = LightFM()
    model.fit_partial(train)

    # Compute scores and check if results equal to model.predict
    predict_input = sp.csr_matrix(np.ones((no_users, no_items)))
    scores = model.predict_score(predict_input,
                                 num_threads=num_threads).todense()
    # Converted once, outside the loop, instead of once per user.
    scores_array = np.array(scores)
    for uid in range(no_users):
        scores_arr = model.predict(np.repeat(uid, no_items),
                                   np.arange(no_items))
        score_slice = scores_array[uid, :]
        assert np.array_equal(score_slice, scores_arr)

    # check if precompute and parallelization work correctly
    scores_serial = model.predict_score(predict_input,
                                        num_threads=1).todense()
    scores_no_prec = model.predict_score(
        predict_input,
        num_threads=num_threads,
        precompute_representations=False).todense()
    scores_ser_no_prec = model.predict_score(
        predict_input,
        num_threads=1,
        precompute_representations=False).todense()
    assert np.array_equal(scores, scores_serial)
    assert np.array_equal(scores, scores_no_prec)
    assert np.array_equal(scores, scores_ser_no_prec)

    # Compute ranks and compares with ranks computed from scores
    ranks = model.predict_rank(predict_input,
                               num_threads=num_threads).todense()

    def rank_scores(s):
        # ranks from scores as in http://stackoverflow.com/a/14672797/5251962
        # Only the inverse index is needed; the unique values are discarded.
        _, v = np.unique(s, return_inverse=True)
        return len(s) - 1 - (np.cumsum(np.bincount(v)) - 1)[v]

    check_ranks = np.apply_along_axis(rank_scores, 1, scores)
    assert np.array_equal(ranks, check_ranks)

    # Train set exclusions. All scores should be zero
    # if train interactions is dense.
    scores = model.predict_score(predict_input,
                                 train_interactions=predict_input).todense()
    assert np.all(scores == 0)

    # Make sure invariants hold when there are ties
    model.user_embeddings = np.zeros_like(model.user_embeddings)
    model.item_embeddings = np.zeros_like(model.item_embeddings)
    model.user_biases = np.zeros_like(model.user_biases)
    model.item_biases = np.zeros_like(model.item_biases)

    scores = model.predict_score(predict_input,
                                 num_threads=num_threads).todense()

    assert np.all(scores.min(axis=1) == 0)
    assert np.all(scores.max(axis=1) == 0)

    # Wrong input dimensions
    with pytest.raises(ValueError):
        model.predict_score(sp.csr_matrix((5, 5)), num_threads=num_threads)
class LightFMRecommender(BaseFactorizationRecommender):
    """Factorization recommender backed by a ``LightFM`` model.

    On top of the base recommender this class supports:

    * using the training matrix as per-interaction sample weights
      (``use_sample_weight``),
    * external item features, converted to a matrix and passed to LightFM as
      ``item_features`` (``external_features``),
    * warm-starting the embeddings from another recommender
      (``initialiser_model``).

    Helpers such as ``_set_data``, ``_set_fit_params``, ``_reuse_data``,
    ``_pop_set_params`` and attributes such as ``train_mat``,
    ``sparse_mat_builder``, ``model_params`` and ``fit_params`` are not
    defined here; presumably they come from the base class.
    """

    default_model_params = {
        'loss': 'warp',
        'learning_schedule': 'adadelta',
        'no_components': 30,
        'max_sampled': 10,
        'item_alpha': 0,
        'user_alpha': 0,
    }

    default_fit_params = {
        'epochs': 100,
        'item_features': None,
        'num_threads': N_CPUS,
        'verbose': True,
    }

    default_external_features_params = dict(add_identity_mat=True)

    # Class-level default so that ``_add_external_features`` and
    # ``_get_item_factors`` can read the attribute when no external features
    # were supplied; within this class it is otherwise only assigned when
    # ``external_features`` is not None.
    external_features_mat = None

    def __init__(self, use_sample_weight=False, external_features=None,
                 external_features_params=None, initialiser_model=None,
                 initialiser_scale=0.1, **kwargs):
        """Store the LightFM-specific options and defer the rest to the base.

        Args:
            use_sample_weight: when true, the training matrix (as COO) is
                passed to LightFM as ``sample_weight``.
            external_features: optional object exposing
                ``fit_transform_ids_df_to_mat``; used to build the item
                feature matrix.
            external_features_params: keyword arguments for
                ``fit_transform_ids_df_to_mat``; defaults to a copy of
                ``default_external_features_params``.
            initialiser_model: optional recommender whose factors seed the
                LightFM embeddings.
            initialiser_scale: target mean absolute value of the transplanted
                embeddings.
            **kwargs: forwarded to the base class constructor.
        """
        self.use_sample_weight = use_sample_weight
        self.external_features = external_features
        self.external_features_params = external_features_params or \
            self.default_external_features_params.copy()
        self.initialiser_model = initialiser_model
        self.initialiser_scale = initialiser_scale
        super().__init__(**kwargs)

    def _prep_for_fit(self, train_obs, **fit_params):
        """Set data, fit params and features, then create the LightFM model."""
        # self.toggle_mkl_blas_1_thread(True)
        # assign all observation data
        self._set_data(train_obs)
        fit_params['sample_weight'] = self.train_mat.tocoo() \
            if self.use_sample_weight else None
        self._set_fit_params(fit_params)
        self._add_external_features()
        # init model and set params
        self.model = LightFM(**self.model_params)
        if self.initialiser_model is not None:
            self._initialise_from_model(train_obs)

    def _initialise_from_model(self, train_obs):
        """Fit the initialiser model and copy its rescaled factors into LightFM."""
        # fit initialiser model (this is done here to prevent any data leaks
        # from passing fitted models)
        simple_logger.info('Training %s model to initialise LightFM model.'
                           % str(self.initialiser_model))
        self.initialiser_model.fit(train_obs)
        self._reuse_data(self.initialiser_model)
        # have the internals initialised
        self.model.fit_partial(self.train_mat, epochs=0)

        # transplant factors from initialiser model
        # (index 1 of the returned pair is the representation matrix, index 0
        # the biases -- same layout as this class's _get_*_factors)
        self.model.item_embeddings = self.initialiser_model._get_item_factors()[1]
        self.model.user_embeddings = self.initialiser_model._get_user_factors()[1]

        # scale the factors to be of similar scale
        scale = self.initialiser_scale
        self.model.item_embeddings *= scale / np.mean(np.abs(self.model.item_embeddings))
        self.model.user_embeddings *= scale / np.mean(np.abs(self.model.user_embeddings))

    def _add_external_features(self):
        """Build the external item feature matrix (if any) and register it as
        the ``item_features`` fit parameter."""
        if self.external_features is not None:
            self.external_features_mat = self.external_features.\
                fit_transform_ids_df_to_mat(
                    items_encoder=self.sparse_mat_builder.iid_encoder,
                    **self.external_features_params)
            simple_logger.info('External item features matrix: %s' %
                               str(self.external_features_mat.shape))

        # add external features if specified
        self.fit_params['item_features'] = self.external_features_mat
        if self.external_features_mat is not None:
            simple_logger.info('Fitting using external features mat: %s'
                               % str(self.external_features_mat.shape))

    def fit(self, train_obs, **fit_params):
        """Prepare a fresh model and train it on ``train_obs``.

        Returns:
            self
        """
        self._prep_for_fit(train_obs, **fit_params)
        self.model.fit_partial(self.train_mat, **self.fit_params)
        return self

    def fit_partial(self, train_obs, epochs=1):
        """Continue training for ``epochs`` epochs, or fit from scratch if no
        model exists yet.

        When a model already exists, training continues on the stored
        ``train_mat``; ``train_obs`` is only used for the initial fit.

        Returns:
            self
        """
        self._set_epochs(epochs)
        if self.model is None:
            self.fit(train_obs)
        else:
            # Forward the stored fit params, exactly as ``fit`` does, so that
            # epochs, item features, sample weights and thread count are
            # honoured instead of silently falling back to LightFM defaults.
            self.model.fit_partial(self.train_mat, **self.fit_params)
        return self

    def fit_batches(self, train_obs, train_dfs, epochs_per_batch=None, **fit_params):
        """Train incrementally on a sequence of interaction dataframes.

        Args:
            train_obs: observations used to prepare the model and encoders.
            train_dfs: iterable of dataframes; each is converted to a sparse
                interaction matrix and used for one ``fit_partial`` call.
            epochs_per_batch: if given, overrides ``epochs`` for every batch.
            **fit_params: additional fit parameters applied to every batch.

        Returns:
            self
        """
        self._prep_for_fit(train_obs)
        for i, df in enumerate(train_dfs):
            batch_train_mat = self.sparse_mat_builder.build_sparse_interaction_matrix(df)

            if epochs_per_batch is not None:
                fit_params['epochs'] = epochs_per_batch

            # sample weights must match the batch matrix, not the full one
            fit_params['sample_weight'] = batch_train_mat.tocoo() \
                if self.use_sample_weight else None

            self._set_fit_params(fit_params)

            simple_logger.info('Fitting batch %d (%d interactions)' % (i, len(df)))
            self.model.fit_partial(batch_train_mat, **self.fit_params)
        # returned for consistency with fit() and fit_partial()
        return self

    def _set_epochs(self, epochs):
        """Set the ``epochs`` parameter through ``set_params``."""
        self.set_params(epochs=epochs)

    def set_params(self, **params):
        """Set this class's own options first, then pass what is returned by
        ``_pop_set_params`` on to the base class."""
        params = self._pop_set_params(
            params, ['use_sample_weight', 'external_features', 'external_features_params',
                     'initialiser_model', 'initialiser_scale'])
        super().set_params(**params)

    def _get_item_factors(self, mode=None):
        """Return ``(biases, representations)`` for items.

        Args:
            mode: ``None`` for the LightFM item representations,
                ``'external_features'`` to use the external feature matrix as
                representations, or ``'no_features'`` (only valid when item
                features were used) to use the last ``n_items`` rows of the
                item embeddings.

        Raises:
            ValueError: for any other mode, including ``'no_features'`` when
                no item features are set.
        """
        n_items = len(self.sparse_mat_builder.iid_encoder.classes_)

        biases, representations = self.model.get_item_representations(self.fit_params['item_features'])

        if mode is None:
            pass  # default mode

        elif mode == 'external_features':
            external_features_mat = self.external_features_mat

            assert external_features_mat is not None, \
                'Must define and add a feature matrix for "external_features" similarity.'

            representations = external_features_mat

        elif (mode == 'no_features') and (self.fit_params['item_features'] is not None):

            simple_logger.info('LightFM recommender: get_similar_items: "no_features" mode '
                               'assumes ID mat was added and is the last part of the feature matrix.')

            assert self.model.item_embeddings.shape[0] > n_items, \
                'Either no ID matrix was added, or no features added'

            representations = self.model.item_embeddings[-n_items:, :]

        else:
            raise ValueError('Uknown representation mode: %s' % mode)

        return biases, representations

    def _get_user_factors(self, mode=None):
        """Return the LightFM user representations; ``mode`` is ignored."""
        return self.model.get_user_representations()

    def _predict_on_inds(self, user_inds, item_inds):
        """Predict scores for the given user/item index pairs."""
        return self.model.predict(user_inds, item_inds,
                                  item_features=self.fit_params['item_features'],
                                  num_threads=N_CPUS)

    def _predict_rank(self, test_mat, train_mat=None):
        """Compute ranks for ``test_mat`` entries, passing ``train_mat`` as
        LightFM's ``train_interactions``."""
        return self.model.predict_rank(
            test_interactions=test_mat,
            train_interactions=train_mat,
            item_features=self.fit_params['item_features'],
            num_threads=N_CPUS)

    def reduce_memory_for_serving(self):
        """Drop training-only state so the fitted model takes less memory."""
        # would be best to set those to None, but then LightFM will complain, and more importantly
        # Cython code expects the right data format and will crash if its predict() will be used,
        # so I just point to the embeddings (which doesn't add memory).
        # the danger in this is that I don't know what will be the damage if someone calls one of the fit methods
        # for this reason it's in an explicit method "for_serving" and not in a __getstate__() method
        self.model.item_embedding_gradients = self.model.item_embeddings
        self.model.item_embedding_momentum = self.model.item_embeddings
        self.model.user_embedding_gradients = self.model.user_embeddings
        self.model.user_embedding_momentum = self.model.user_embeddings
        self.model.item_bias_gradients = self.model.item_biases
        self.model.item_bias_momentum = self.model.item_biases
        self.model.user_bias_gradients = self.model.user_biases
        self.model.user_bias_momentum = self.model.user_biases
        self.fit_params['sample_weight'] = None
        super().reduce_memory_for_serving()
# --- Split the interactions, fit a BPR and a WARP model, evaluate recall ---
print("Splitting the data into train/test set...\n")
train, test = cross_validation.random_train_test_split(user_items_train)

# model1 uses the BPR loss, model2 the WARP loss; same learning rate.
model1 = LightFM(learning_rate=0.05, loss='bpr')
model2 = LightFM(learning_rate=0.05, loss='warp')

print("Fitting models of BPR & WARP ranking losses...\n")
model1.fit(train, epochs=10)
model2.fit(train, epochs=10)

# Show the ranks the BPR model assigns to the held-out interactions.
res = model1.predict_rank(test)
print(res)

print("Evaluating methods...\n")
# Mean recall@k of the BPR model, on train then test, for k = 10 and 20.
train_recall1_10, test_recall1_10 = (
    recall_at_k(model1, split, k=10).mean() for split in (train, test))
train_recall1_20, test_recall1_20 = (
    recall_at_k(model1, split, k=20).mean() for split in (train, test))

# (Mean reciprocal rank evaluation via reciprocal_rank is currently disabled.)

# Mean recall@10 of the WARP model, on train then test.
train_recall2_10, test_recall2_10 = (
    recall_at_k(model2, split, k=10).mean() for split in (train, test))
def test_predict_scores(num_threads=2):
    """Check ``predict_score`` against ``predict`` and ``predict_rank``.

    Verifies that the bulk scores equal the per-user ``predict`` output, that
    thread count and representation precomputation do not change the scores,
    that ranks derived from the scores match ``predict_rank``, that train
    interactions are excluded, that tied predictions score zero, and that a
    wrongly shaped input is rejected.

    Args:
        num_threads: thread count passed to the multi-threaded calls.
    """
    no_users, no_items = (10, 100)

    # Seeded so a failure is reproducible; the unseeded call produced
    # different interactions on every run.
    train = sp.rand(no_users, no_items, format='csr', random_state=42)

    model = LightFM()
    model.fit_partial(train)

    # Compute scores and check if results equal to model.predict
    predict_input = sp.csr_matrix(np.ones((no_users, no_items)))
    scores = model.predict_score(predict_input,
                                 num_threads=num_threads).todense()
    # Converted once, outside the loop, instead of once per user.
    scores_array = np.array(scores)
    for uid in range(no_users):
        scores_arr = model.predict(np.repeat(uid, no_items),
                                   np.arange(no_items))
        score_slice = scores_array[uid, :]
        assert np.array_equal(score_slice, scores_arr)

    # check if precompute and parallelization work correctly
    scores_serial = model.predict_score(predict_input,
                                        num_threads=1).todense()
    scores_no_prec = model.predict_score(
        predict_input,
        num_threads=num_threads,
        precompute_representations=False).todense()
    scores_ser_no_prec = model.predict_score(
        predict_input,
        num_threads=1,
        precompute_representations=False).todense()
    assert np.array_equal(scores, scores_serial)
    assert np.array_equal(scores, scores_no_prec)
    assert np.array_equal(scores, scores_ser_no_prec)

    # Compute ranks and compares with ranks computed from scores
    ranks = model.predict_rank(predict_input,
                               num_threads=num_threads).todense()

    def rank_scores(s):
        # ranks from scores as in http://stackoverflow.com/a/14672797/5251962
        # Only the inverse index is needed; the unique values are discarded.
        _, v = np.unique(s, return_inverse=True)
        return len(s) - 1 - (np.cumsum(np.bincount(v)) - 1)[v]

    check_ranks = np.apply_along_axis(rank_scores, 1, scores)
    assert np.array_equal(ranks, check_ranks)

    # Train set exclusions. All scores should be zero
    # if train interactions is dense.
    scores = model.predict_score(predict_input,
                                 train_interactions=predict_input).todense()
    assert np.all(scores == 0)

    # Make sure invariants hold when there are ties
    model.user_embeddings = np.zeros_like(model.user_embeddings)
    model.item_embeddings = np.zeros_like(model.item_embeddings)
    model.user_biases = np.zeros_like(model.user_biases)
    model.item_biases = np.zeros_like(model.item_biases)

    scores = model.predict_score(predict_input,
                                 num_threads=num_threads).todense()

    assert np.all(scores.min(axis=1) == 0)
    assert np.all(scores.max(axis=1) == 0)

    # Wrong input dimensions
    with pytest.raises(ValueError):
        model.predict_score(sp.csr_matrix((5, 5)), num_threads=num_threads)
def train_model():
    """Train a WARP LightFM model with user features and rank items.

    Builds the id mappings and the interaction matrix from ``get_data()``,
    fits the model, computes ranks for the observed interactions, and then
    scores every item for one hypothetical user per user feature (row ``i``
    of an identity feature matrix, i.e. a user having only feature ``i``).

    Returns:
        tuple: ``(prediction_hypo, prediction_ws, user_id_map, item_id_map,
        user_feature_names)`` where ``prediction_hypo`` holds, per
        hypothetical user, a list of ``(score, item key)`` pairs sorted by
        descending score, and ``prediction_ws`` is the ``predict_rank``
        output for the training interactions.
    """
    # user features
    user_features, user_feature_names = get_user_features()

    # create data
    data_ws = Dataset(user_identity_features=True)  # warm start

    # create map between user_id, post_id, user_features and internal indices
    data_ws.fit((x['user_id'] for x in get_data()),
                (x['post_id'] for x in get_data()),
                user_features=user_features)

    # ---------------------------
    # Building the interactions matrix
    # ---------------------------
    # create interaction matrix to optimize (the weights matrix is not used)
    interactions_ws, _ = data_ws.build_interactions(
        (x['user_id'], x['post_id']) for x in get_data())
    print(repr(interactions_ws))

    # retrieve mapping from dataset (feature maps are not needed here)
    user_id_map, _, item_id_map, _ = data_ws.mapping()

    # ---------------------------
    # train model
    # ---------------------------
    n_user_features = len(user_feature_names)
    model_warp_ws = LightFM(learning_rate=0.05,
                            loss='warp',
                            no_components=n_user_features)
    model_warp_ws.fit(interactions_ws,
                      user_features=user_features,
                      epochs=30)

    # ---------------------------
    # make predictions
    # ---------------------------
    # make predictions for all users
    prediction_ws = model_warp_ws.predict_rank(interactions_ws,
                                               user_features=user_features)

    # identity matrix: row i represents a hypothetical user that has only
    # user feature i
    user_features_identity = sparse.csr_matrix(np.identity(n_user_features))

    # Loop-invariant inputs, built once. The internal item indices are
    # materialised as an int32 array because a dict view is not an index
    # array; the keys are kept in the same order for labelling the scores.
    item_keys = list(item_id_map)
    item_indices = np.fromiter(item_id_map.values(),
                               dtype=np.int32,
                               count=len(item_id_map))

    # make prediction for each hypothetical user
    prediction_hypo = []
    for user_irt in range(n_user_features):
        # Score all items for hypothetical user ``user_irt``. The original
        # code always passed user 0 here, which made every entry identical.
        prediction_score = model_warp_ws.predict(
            user_ids=user_irt,
            item_ids=item_indices,
            user_features=user_features_identity)

        # pair each score with its item key and sort by descending score
        prediction_sorted = sorted(zip(prediction_score, item_keys),
                                   key=lambda x: x[0],
                                   reverse=True)

        prediction_hypo.append(prediction_sorted)

    return (prediction_hypo, prediction_ws, user_id_map, item_id_map,
            user_feature_names)