def hit_rate(model, test_interactions, k=10, filter_previous=False):
    """evaluate hit rate (any match) wrt out-of-sample observed interactions

    :param model: trained RankFM model instance
    :param test_interactions: pandas dataframe of out-of-sample observed user/item interactions
    :param k: number of recommendations to generate for each user
    :param filter_previous: remove observed training items from generated recommendations
    :return: the hit rate or proportion of test users with any matching items
    """

    # ensure that the model has been fit before attempting to generate predictions
    assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"

    # transform interactions into a user -> items dictionary
    test_user_items = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
    test_user_items = test_user_items.groupby('user_id')['item_id'].apply(set).to_dict()
    test_users = list(test_user_items.keys())

    # generate topK recommendations for all test users also present in the training data
    test_recs = model.recommend(users=test_users, n_items=k, filter_previous=filter_previous, cold_start='drop')
    comm_user = test_recs.index.values

    # calculate the hit rate (percentage of users with any relevant recommendation) wrt common users
    hit_rate = np.mean([int(len(set(test_recs.loc[u]) & test_user_items[u]) > 0) for u in comm_user])
    return hit_rate
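# Usage sketch (hedged): a minimal end-to-end fit/evaluate flow. The toy interaction data
# and hyperparameter values below are made up for illustration, and the import paths follow
# the rankfm package layout (rankfm.rankfm for the model, rankfm.evaluation for the metrics).

import pandas as pd
from rankfm.rankfm import RankFM
from rankfm.evaluation import hit_rate

train_interactions = pd.DataFrame({'user_id': [1, 1, 2, 2, 3, 3], 'item_id': [10, 20, 10, 30, 20, 40]})
test_interactions = pd.DataFrame({'user_id': [1, 2, 3], 'item_id': [30, 40, 10]})

model = RankFM(factors=10, loss='bpr', learning_rate=0.1)
model.fit(train_interactions, epochs=20)

# proportion of test users whose held-out item shows up in their top-3 recommendations
print(hit_rate(model, test_interactions, k=3))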
def predict(self, pairs, cold_start='nan'):
    """calculate the predicted pointwise utilities for all (user, item) pairs

    :param pairs: dataframe of [user, item] pairs to score
    :param cold_start: whether to generate missing values ('nan') or drop ('drop') user/item pairs not found in training data
    :return: np.array of real-valued model scores
    """

    # check user data inputs and ensure the model has been fit
    assert isinstance(pairs, (np.ndarray, pd.DataFrame)), "[pairs] must be np.ndarray or pd.dataframe"
    assert pairs.shape[1] == 2, "[pairs] should be: [user_id, item_id]"
    assert self.is_fit, "you must fit the model prior to generating predictions"

    # map the raw user/item identifiers to internal index positions (unknown identifiers become NaN)
    pred_pairs = pd.DataFrame(get_data(pairs).copy(), columns=['user_id', 'item_id'])
    pred_pairs['user_id'] = pred_pairs['user_id'].map(self.user_to_index)
    pred_pairs['item_id'] = pred_pairs['item_id'].map(self.item_to_index)
    pred_pairs = np.ascontiguousarray(pred_pairs, dtype=np.float32)

    # generate the model scores for all (user, item) pairs
    scores = _predict(pred_pairs, self.x_uf, self.x_if, self.w_i, self.w_if, self.v_u, self.v_i, self.v_uf, self.v_if)

    if cold_start == 'nan':
        return scores
    elif cold_start == 'drop':
        return scores[~np.isnan(scores)]
    else:
        raise ValueError("param [cold_start] must be set to either 'nan' or 'drop'")
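# Usage sketch (hedged, continuing the fitted `model` from the evaluation sketch above):
# score explicit (user, item) pairs. With cold_start='nan' pairs containing an unseen user
# or item yield np.nan scores; with cold_start='drop' those rows are removed.

import pandas as pd

pairs = pd.DataFrame({'user_id': [1, 2, 999], 'item_id': [30, 10, 20]})
scores_nan = model.predict(pairs, cold_start='nan')    # length 3; last entry is np.nan (user 999 unseen)
scores_drop = model.predict(pairs, cold_start='drop')  # length 2; the NaN row is dropped
print(scores_nan, scores_drop)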
def _init_interactions(self, interactions, sample_weight):
    """map new interaction data to existing internal user/item indexes

    :param interactions: dataframe of observed user/item interactions: [user_id, item_id]
    :param sample_weight: vector of importance weights for each observed interaction
    :return: None
    """

    # check user data inputs
    assert isinstance(interactions, (np.ndarray, pd.DataFrame)), "[interactions] must be np.ndarray or pd.dataframe"
    assert interactions.shape[1] == 2, "[interactions] should be: [user_id, item_id]"

    # map the raw user/item identifiers to internal zero-based index positions
    # NOTE: any user/item pairs not found in the existing indexes will be dropped
    # NOTE: drop NaN rows (unknown users/items) before casting the index columns to int32
    self.interactions = pd.DataFrame(get_data(interactions).copy(), columns=['user_id', 'item_id'])
    self.interactions['user_id'] = self.interactions['user_id'].map(self.user_to_index)
    self.interactions['item_id'] = self.interactions['item_id'].map(self.item_to_index)
    self.interactions = self.interactions.rename({'user_id': 'user_idx', 'item_id': 'item_idx'}, axis=1).dropna().astype(np.int32)

    # store the sample weights internally or create a vector of ones if not passed
    if sample_weight is not None:
        assert isinstance(sample_weight, (np.ndarray, pd.Series)), "[sample_weight] must be np.ndarray or pd.series"
        assert sample_weight.ndim == 1, "[sample_weight] must be a vector (ndim=1)"
        assert len(sample_weight) == len(interactions), "[sample_weight] must have the same length as [interactions]"
        self.sample_weight = get_data(sample_weight).astype(np.float32)
    else:
        self.sample_weight = np.ones(len(self.interactions), dtype=np.float32, order='C')

    # create python/numba lookup dictionaries containing the set of observed items for each user
    # NOTE: the typed numba dictionary will be used to sample unobserved items during training
    # NOTE: the interactions data must be converted to np.ndarray prior to training to use @njit
    self.user_items_nb = nb.typed.Dict.empty(key_type=nb.types.int32, value_type=nb.types.int32[:])
    self.user_items_py = self.interactions.sort_values(['user_idx', 'item_idx']).groupby('user_idx')['item_idx'].apply(np.array, dtype=np.int32).to_dict()
    self.interactions = self.interactions.to_numpy()

    for user, items in self.user_items_py.items():
        self.user_items_nb[user] = items
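# Minimal sketch (illustrative toy data, not library code): the user -> observed-items lookup
# built above is stored as a numba typed dict so it can be read inside @njit training loops,
# e.g. to reject already-observed items while sampling negative examples.

import numpy as np
import numba as nb
from numba import njit

user_items_nb = nb.typed.Dict.empty(key_type=nb.types.int32, value_type=nb.types.int32[:])
user_items_nb[np.int32(0)] = np.array([1, 3], dtype=np.int32)
user_items_nb[np.int32(1)] = np.array([0, 2, 3], dtype=np.int32)

@njit
def is_observed(user_items, user, item):
    # linear scan over the user's (small) sorted array of observed item indexes
    for observed in user_items[user]:
        if observed == item:
            return True
    return False

print(is_observed(user_items_nb, np.int32(0), np.int32(3)))  # True
print(is_observed(user_items_nb, np.int32(0), np.int32(2)))  # False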
def _init_all(self, interactions, user_features=None, item_features=None, sample_weight=None):
    """index the raw interaction and user/item features data to numpy arrays

    :param interactions: dataframe of observed user/item interactions: [user_id, item_id]
    :param user_features: dataframe of user metadata features: [user_id, uf_1, ..., uf_n]
    :param item_features: dataframe of item metadata features: [item_id, if_1, ..., if_n]
    :param sample_weight: vector of importance weights for each observed interaction
    :return: None
    """

    # check user data inputs
    assert isinstance(interactions, (np.ndarray, pd.DataFrame)), "[interactions] must be np.ndarray or pd.dataframe"
    assert interactions.shape[1] == 2, "[interactions] should be: [user_id, item_id]"

    # save the unique lists of users/items in terms of original identifiers
    interactions_df = pd.DataFrame(get_data(interactions), columns=['user_id', 'item_id'])
    self.user_id = pd.Series(np.sort(np.unique(interactions_df['user_id'])))
    self.item_id = pd.Series(np.sort(np.unique(interactions_df['item_id'])))

    # create zero-based index position to identifier mappings
    self.index_to_user = self.user_id
    self.index_to_item = self.item_id

    # create reverse mappings from identifiers to zero-based index positions
    self.user_to_index = pd.Series(data=self.index_to_user.index, index=self.index_to_user.values)
    self.item_to_index = pd.Series(data=self.index_to_item.index, index=self.index_to_item.values)

    # store unique values of user/item indexes and observed interactions for each user
    self.user_idx = np.arange(len(self.user_id), dtype=np.int32)
    self.item_idx = np.arange(len(self.item_id), dtype=np.int32)

    # map the interactions to internal index positions
    self._init_interactions(interactions, sample_weight)

    # map the user/item features to internal index positions
    self._init_features(user_features, item_features)

    # initialize the model weights after the user/item/feature dimensions have been established
    self._init_weights(user_features, item_features)
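# Illustrative sketch of the identifier <-> index mappings built above (toy data): raw user
# IDs [101, 205, 350] become zero-based index positions [0, 1, 2], with a pandas Series in
# each direction so that both lookups can be done with vectorized .map() calls.

import numpy as np
import pandas as pd

user_id = pd.Series(np.sort(np.unique([205, 101, 350, 101])))      # index -> identifier
user_to_index = pd.Series(data=user_id.index, index=user_id.values)  # identifier -> index

print(user_id.to_dict())        # {0: 101, 1: 205, 2: 350}
print(user_to_index.to_dict())  # {101: 0, 205: 1, 350: 2}
print(pd.Series([350, 101, 999]).map(user_to_index).tolist())  # [2.0, 0.0, nan]; unknown IDs map to NaN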
def discounted_cumulative_gain(model, test_interactions, k=10, filter_previous=False):
    """evaluate discounted cumulative gain wrt out-of-sample observed interactions

    :param model: trained RankFM model instance
    :param test_interactions: pandas dataframe of out-of-sample observed user/item interactions
    :param k: number of recommendations to generate for each user
    :param filter_previous: remove observed training items from generated recommendations
    :return: mean discounted cumulative gain wrt the test users
    """

    # ensure that the model has been fit before attempting to generate predictions
    assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"

    # transform interactions into a user -> items dictionary
    test_user_items = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
    test_user_items = test_user_items.groupby('user_id')['item_id'].apply(set).to_dict()
    test_users = list(test_user_items.keys())

    # generate topK recommendations for all test users also present in the training data
    test_recs = model.recommend(users=test_users, n_items=k, filter_previous=filter_previous, cold_start='drop')
    comm_user = test_recs.index.values

    # calculate the discounted cumulative gain (sum of inverse log scaled ranks of relevant items) wrt common users
    match_indexes = [np.where(test_recs.loc[u].isin(set(test_recs.loc[u]) & test_user_items[u]))[0] for u in comm_user]
    discounted_cumulative_gain = np.mean([np.sum(1 / np.log2(index + 2)) if len(index) > 0 else 0 for index in match_indexes])
    return discounted_cumulative_gain
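# Worked example of the per-user gain term above (illustrative numbers): if a user's relevant
# items appear at zero-based recommendation positions 0 and 3, the gain for that user is
#   1/log2(0 + 2) + 1/log2(3 + 2) = 1.0 + 0.4307 ~= 1.43
# and the function returns the mean of this quantity over all common test users.

import numpy as np

index = np.array([0, 3])
print(np.sum(1 / np.log2(index + 2)))  # ~1.4307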
def diversity(model, test_interactions, k=10, filter_previous=False):
    """evaluate the diversity of the model recommendations

    :param model: trained RankFM model instance
    :param test_interactions: pandas dataframe of out-of-sample observed user/item interactions
    :param k: number of recommendations to generate for each user
    :param filter_previous: remove observed training items from generated recommendations
    :return: dataframe of cnt/pct of users recommended for each item
    """

    # ensure that the model has been fit before attempting to generate predictions
    assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"

    # get the unique set of test users
    test_user_items = pd.DataFrame(get_data(test_interactions), columns=['user_id', 'item_id'])
    test_users = test_user_items['user_id'].unique()

    # generate topK recommendations for all test users also present in the training data
    test_recs = model.recommend(users=test_users, n_items=k, filter_previous=filter_previous, cold_start='drop')
    comm_user = test_recs.index.values

    # stack the recommendations into long format for aggregation
    test_recs = test_recs.stack().reset_index().drop('level_1', axis=1)
    test_recs.columns = ['user_id', 'item_id']

    # calculate the number and percentage of users getting recommended each unique item
    user_counts = test_recs.groupby('item_id')['user_id'].count().to_frame('cnt_users')
    user_counts = user_counts.reindex(model.item_id.values, fill_value=0).sort_values('cnt_users', ascending=False).reset_index()
    user_counts['pct_users'] = user_counts['cnt_users'] / len(comm_user)
    return user_counts
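# Usage sketch (hedged, continuing the same fitted `model` and `test_interactions` from the
# earlier sketch): inspect how concentrated the recommendations are across the catalog.
# A pct_users near 1.0 means an item is recommended to nearly every test user, while rows
# with cnt_users == 0 are items the model never surfaces at this value of k.

item_counts = diversity(model, test_interactions, k=3)
print(item_counts.head())                      # most frequently recommended items first
print((item_counts['cnt_users'] == 0).mean())  # share of the training catalog never recommended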
def _init_interactions(self, interactions, sample_weight):
    """map new interaction data to existing internal user/item indexes

    :param interactions: dataframe of observed user/item interactions: [user_id, item_id]
    :param sample_weight: vector of importance weights for each observed interaction
    :return: None
    """

    # check user data inputs
    assert isinstance(interactions, (np.ndarray, pd.DataFrame)), "[interactions] must be np.ndarray or pd.dataframe"
    assert interactions.shape[1] == 2, "[interactions] should be: [user_id, item_id]"

    # map the raw user/item identifiers to internal zero-based index positions
    # NOTE: any user/item pairs not found in the existing indexes will be dropped
    # NOTE: the index columns are only cast to int32 after the NaN (unknown) rows are dropped
    self.interactions = pd.DataFrame(get_data(interactions).copy(), columns=['user_id', 'item_id'])
    self.interactions['user_id'] = self.interactions['user_id'].map(self.user_to_index)
    self.interactions['item_id'] = self.interactions['item_id'].map(self.item_to_index)
    self.interactions = self.interactions.rename({'user_id': 'user_idx', 'item_id': 'item_idx'}, axis=1).dropna()

    # store the sample weights internally or generate a vector of ones if not given
    if sample_weight is not None:
        assert isinstance(sample_weight, (np.ndarray, pd.Series)), "[sample_weight] must be np.ndarray or pd.series"
        assert sample_weight.ndim == 1, "[sample_weight] must be a vector (ndim=1)"
        assert len(sample_weight) == len(interactions), "[sample_weight] must have the same length as [interactions]"
        self.sample_weight = np.ascontiguousarray(get_data(sample_weight), dtype=np.float32)
    else:
        self.sample_weight = np.ones(len(self.interactions), dtype=np.float32)

    # create a dictionary containing the set of observed items for each user
    # NOTE: if the model has been previously fit, extend rather than replace the itemset for each user
    # NOTE: users with no new interactions keep their existing itemset (empty set as the default)
    if self.is_fit:
        new_user_items = self.interactions.groupby('user_idx')['item_idx'].apply(set).to_dict()
        self.user_items = {user: np.sort(np.array(list(set(self.user_items[user]) | new_user_items.get(user, set())), dtype=np.int32)) for user in self.user_items.keys()}
    else:
        self.user_items = self.interactions.sort_values(['user_idx', 'item_idx']).groupby('user_idx')['item_idx'].apply(np.array, dtype=np.int32).to_dict()

    # format the interactions data as a c-contiguous integer array for cython use
    self.interactions = np.ascontiguousarray(self.interactions, dtype=np.int32)
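# Illustrative sketch of the warm-start branch above (toy data, not library code): when the
# model has already been fit, each user's observed itemset becomes the sorted union of the
# old and newly observed item indexes, so repeated warm-start fits accumulate history
# instead of discarding it.

import numpy as np

old_user_items = {0: np.array([1, 3], dtype=np.int32)}  # itemset from a previous fit
new_user_items = {0: {3, 5}}                             # itemset from the new interactions

merged = {user: np.sort(np.array(list(set(old_user_items[user]) | new_user_items.get(user, set())), dtype=np.int32))
          for user in old_user_items.keys()}
print(merged)  # {0: array([1, 3, 5], dtype=int32)}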