Example #1
def hit_rate(model, test_interactions, k=10, filter_previous=False):
    """evaluate hit-rate (any match) wrt out-of-sample observed interactions

    :param model: trained RankFM model instance
    :param test_interactions: pandas dataframe of out-of-sample observed user/item interactions
    :param k: number of recommendations to generate for each user
    :param filter_previous: remove observed training items from generated recommendations
    :return: the hit rate or proportion of test users with any matching items
    """

    # ensure that the model has been fit before attempting to generate predictions
    assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"

    # transform interactions into a user -> items dictionary
    test_user_items = pd.DataFrame(get_data(test_interactions),
                                   columns=['user_id', 'item_id'])
    test_user_items = test_user_items.groupby('user_id')['item_id'].apply(
        set).to_dict()
    test_users = list(test_user_items.keys())

    # generate topK recommendations for all test users also present in the training data
    test_recs = model.recommend(users=test_users,
                                n_items=k,
                                filter_previous=filter_previous,
                                cold_start='drop')
    comm_user = test_recs.index.values

    # calculate the hit rate (percentage of users with any relevant recommendation) wrt common users
    hit_rate = np.mean([
        int(len(set(test_recs.loc[u]) & test_user_items[u]) > 0)
        for u in comm_user
    ])
    return hit_rate
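A minimal usage sketch for the metric above (not part of the original listing). It assumes the RankFM package layout (`rankfm.rankfm.RankFM`, `rankfm.evaluation.hit_rate`) and toy interaction frames; the hyperparameters are illustrative only. The excerpts in this listing otherwise rely on the library's module-level imports (numpy as np, pandas as pd, numba as nb) and internal helpers such as get_data and _predict.

import pandas as pd

from rankfm.rankfm import RankFM
from rankfm.evaluation import hit_rate

# toy train/test interaction frames with [user_id, item_id] columns (illustrative data)
train = pd.DataFrame({'user_id': [1, 1, 2, 2, 3, 3],
                      'item_id': [10, 11, 11, 12, 12, 13]})
test = pd.DataFrame({'user_id': [1, 2, 3], 'item_id': [12, 13, 10]})

model = RankFM(factors=10, loss='bpr')   # hyperparameters chosen arbitrarily for the sketch
model.fit(train, epochs=10)

# proportion of test users with at least one held-out item among their top-3 recommendations
print(hit_rate(model, test, k=3, filter_previous=False))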
Example #2
    def predict(self, pairs, cold_start='nan'):
        """calculate the predicted pointwise utilities for all (user, item) pairs

        :param pairs: dataframe of [user, item] pairs to score
        :param cold_start: whether to generate missing values ('nan') or drop ('drop') user/item pairs not found in training data
        :return: np.array of real-valued model scores
        """

        assert isinstance(pairs, (np.ndarray, pd.DataFrame)), "[pairs] must be np.ndarray or pd.dataframe"
        assert pairs.shape[1] == 2, "[pairs] should be: [user_id, item_id]"
        assert self.is_fit, "you must fit the model prior to generating predictions"

        pred_pairs = pd.DataFrame(get_data(pairs).copy(),
                                  columns=['user_id', 'item_id'])
        pred_pairs['user_id'] = pred_pairs['user_id'].map(self.user_to_index)
        pred_pairs['item_id'] = pred_pairs['item_id'].map(self.item_to_index)
        pred_pairs = np.ascontiguousarray(pred_pairs, dtype=np.float32)

        scores = _predict(pred_pairs, self.x_uf, self.x_if, self.w_i,
                          self.w_if, self.v_u, self.v_i, self.v_uf, self.v_if)

        if cold_start == 'nan':
            return scores
        elif cold_start == 'drop':
            return scores[~np.isnan(scores)]
        else:
            raise ValueError(
                "param [cold_start] must be set to either 'nan' or 'drop'")
Example #3
    def _init_interactions(self, interactions, sample_weight):
        """map new interaction data to existing internal user/item indexes

        :param interactions: dataframe of observed user/item interactions: [user_id, item_id]
        :param sample_weight: vector of importance weights for each observed interaction
        :return: None
        """

        # check user data inputs
        assert isinstance(interactions, (np.ndarray, pd.DataFrame)), "[interactions] must be np.ndarray or pd.dataframe"
        assert interactions.shape[1] == 2, "[interactions] should be: [user_id, item_id]"

        # map the raw user/item identifiers to internal zero-based index positions
        # NOTE: any user/item pairs not found in the existing indexes will be dropped

        self.interactions = pd.DataFrame(get_data(interactions).copy(), columns=['user_id', 'item_id'])
        self.interactions['user_id'] = self.interactions['user_id'].map(self.user_to_index)
        self.interactions['item_id'] = self.interactions['item_id'].map(self.item_to_index)
        self.interactions = self.interactions.rename({'user_id': 'user_idx', 'item_id': 'item_idx'}, axis=1).dropna().astype(np.int32)

        # store the sample weights internally or create a vector of ones if not passed
        if sample_weight is not None:
            assert isinstance(sample_weight, (np.ndarray, pd.Series)), "[sample_weight] must be np.ndarray or pd.series"
            assert sample_weight.ndim == 1, "[sample_weight] must be a vector (ndim=1)"
            assert len(sample_weight) == len(interactions), "[sample_weight] must have the same length as [interactions]"
            self.sample_weight = get_data(sample_weight).astype(np.float32)
        else:
            self.sample_weight = np.ones(len(self.interactions), dtype=np.float32, order='C')

        # create python/numba lookup dictionaries containing the set of observed items for each user
        # NOTE: the typed numba dictionary will be used to sample unobserved items during training
        # NOTE: the interactions data must be converted to np.ndarray prior to training to use @njit

        self.user_items_nb = nb.typed.Dict.empty(key_type=nb.types.int32, value_type=nb.types.int32[:])
        self.user_items_py = self.interactions.sort_values(['user_idx', 'item_idx']).groupby('user_idx')['item_idx'].apply(np.array, dtype=np.int32).to_dict()
        self.interactions = self.interactions.to_numpy()

        for user, items in self.user_items_py.items():
            self.user_items_nb[user] = items
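The numba typed dictionary built at the end of the method is what lets the @njit-compiled training code look up each user's observed items without dropping back into the Python interpreter. A standalone sketch of that pattern, independent of RankFM (names chosen for illustration):

import numba as nb
import numpy as np
import pandas as pd

interactions = pd.DataFrame({'user_idx': [0, 0, 1], 'item_idx': [5, 2, 3]})

# plain python dict: user index -> sorted np.int32 array of observed item indexes
user_items_py = (interactions.sort_values(['user_idx', 'item_idx'])
                 .groupby('user_idx')['item_idx']
                 .apply(np.array, dtype=np.int32).to_dict())

# numba typed dict with the same contents, usable inside @njit functions
user_items_nb = nb.typed.Dict.empty(key_type=nb.types.int32, value_type=nb.types.int32[:])
for user, items in user_items_py.items():
    user_items_nb[np.int32(user)] = items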
Example #4
    def _init_all(self,
                  interactions,
                  user_features=None,
                  item_features=None,
                  sample_weight=None):
        """index the raw interaction and user/item features data to numpy arrays

        :param interactions: dataframe of observed user/item interactions: [user_id, item_id]
        :param user_features: dataframe of user metadata features: [user_id, uf_1, ..., uf_n]
        :param item_features: dataframe of item metadata features: [item_id, if_1, ..., if_n]
        :param sample_weight: vector of importance weights for each observed interaction
        :return: None
        """

        # check user data inputs
        assert isinstance(interactions, (np.ndarray, pd.DataFrame)), "[interactions] must be np.ndarray or pd.dataframe"
        assert interactions.shape[1] == 2, "[interactions] should be: [user_id, item_id]"

        # save the unique lists of users/items in terms of original identifiers
        interactions_df = pd.DataFrame(get_data(interactions),
                                       columns=['user_id', 'item_id'])
        self.user_id = pd.Series(np.sort(np.unique(
            interactions_df['user_id'])))
        self.item_id = pd.Series(np.sort(np.unique(
            interactions_df['item_id'])))

        # create zero-based index position to identifier mappings
        self.index_to_user = self.user_id
        self.index_to_item = self.item_id

        # create reverse mappings from identifiers to zero-based index positions
        self.user_to_index = pd.Series(data=self.index_to_user.index,
                                       index=self.index_to_user.values)
        self.item_to_index = pd.Series(data=self.index_to_item.index,
                                       index=self.index_to_item.values)

        # store unique values of user/item indexes and observed interactions for each user
        self.user_idx = np.arange(len(self.user_id), dtype=np.int32)
        self.item_idx = np.arange(len(self.item_id), dtype=np.int32)

        # map the interactions to internal index positions
        self._init_interactions(interactions, sample_weight)

        # map the user/item features to internal index positions
        self._init_features(user_features, item_features)

        # initialize the model weights after the user/item/feature dimensions have been established
        self._init_weights(user_features, item_features)
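A minimal sketch of the identifier/index bookkeeping this method sets up (values are illustrative, not from the source): index_to_user is a positional Series of the sorted raw identifiers and user_to_index is its inverse, so raw ids can be translated to the zero-based positions used internally and back.

import numpy as np
import pandas as pd

user_id = pd.Series(np.sort(np.unique(['u42', 'u7', 'u42', 'u13'])))   # index -> identifier
user_to_index = pd.Series(data=user_id.index, index=user_id.values)    # identifier -> index

print(user_id.tolist())        # ['u13', 'u42', 'u7'] (lexicographic order of the string ids)
print(user_to_index['u42'])    # 1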
Example #5
def discounted_cumulative_gain(model,
                               test_interactions,
                               k=10,
                               filter_previous=False):
    """evaluate discounted cumulative gain wrt out-of-sample observed interactions

    :param model: trained RankFM model instance
    :param test_interactions: pandas dataframe of out-of-sample observed user/item interactions
    :param k: number of recommendations to generate for each user
    :param filter_previous: remove observed training items from generated recommendations
    :return: mean discounted cumulative gain wrt the test users
    """

    # ensure that the model has been fit before attempting to generate predictions
    assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"

    # transform interactions into a user -> items dictionary
    test_user_items = pd.DataFrame(get_data(test_interactions),
                                   columns=['user_id', 'item_id'])
    test_user_items = test_user_items.groupby('user_id')['item_id'].apply(
        set).to_dict()
    test_users = list(test_user_items.keys())

    # generate topK recommendations for all test users also present in the training data
    test_recs = model.recommend(users=test_users,
                                n_items=k,
                                filter_previous=filter_previous,
                                cold_start='drop')
    comm_user = test_recs.index.values

    # calculate the discounted cumulative gain (sum of inverse log scaled ranks of relevant items) wrt common users
    match_indexes = [
        np.where(
            test_recs.loc[u].isin(set(test_recs.loc[u])
                                  & test_user_items[u]))[0] for u in comm_user
    ]
    discounted_cumulative_gain = np.mean([
        np.sum(1 / np.log2(index + 2)) if len(index) > 0 else 0
        for index in match_indexes
    ])
    return discounted_cumulative_gain
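The discount term rewards relevant items that appear earlier in the top-k list: a match at zero-based position i contributes 1/log2(i + 2), so position 0 is worth 1.0 and later positions progressively less. A small worked example with made-up positions:

import numpy as np

match_index = np.array([0, 2])                 # relevant items found at positions 0 and 2
dcg = np.sum(1 / np.log2(match_index + 2))     # 1/log2(2) + 1/log2(4) = 1.0 + 0.5 = 1.5
print(dcg)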
Example #6
def diversity(model, test_interactions, k=10, filter_previous=False):
    """evaluate the diversity of the model recommendations

    :param model: trained RankFM model instance
    :param test_interactions: pandas dataframe of out-of-sample observed user/item interactions
    :param k: number of recommendations to generate for each user
    :param filter_previous: remove observed training items from generated recommendations
    :return: dataframe of cnt/pct of users recommended for each item
    """

    # ensure that the model has been fit before attempting to generate predictions
    assert model.is_fit, "you must fit the model prior to evaluating hold-out metrics"

    # get the unique set of test users
    test_user_items = pd.DataFrame(get_data(test_interactions),
                                   columns=['user_id', 'item_id'])
    test_users = test_user_items['user_id'].unique()

    # generate topK recommendations for all test users also present in the training data
    test_recs = model.recommend(users=test_users,
                                n_items=k,
                                filter_previous=filter_previous,
                                cold_start='drop')
    comm_user = test_recs.index.values

    # stack the recommendations long-format for aggregation
    test_recs = test_recs.stack().reset_index().drop('level_1', axis=1)
    test_recs.columns = ['user_id', 'item_id']

    # calculate the number and percentage of users getting recommended each unique item
    user_counts = test_recs.groupby('item_id')['user_id'].count().to_frame(
        'cnt_users')
    user_counts = user_counts.reindex(
        model.item_id.values,
        fill_value=0).sort_values('cnt_users', ascending=False).reset_index()
    user_counts['pct_users'] = user_counts['cnt_users'] / len(comm_user)
    return user_counts
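An assumed usage sketch, reusing the hypothetical model and test frame from the sketch after Example #1: the returned frame has one row per catalog item, and cnt_users/pct_users measure how many test users were shown that item, so a long tail of zeros indicates recommendations concentrated on a few items.

from rankfm.evaluation import diversity

item_exposure = diversity(model, test, k=3, filter_previous=False)
print(item_exposure.head())   # columns: item_id, cnt_users, pct_users (values depend on the fitted model)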
Example #7
    def _init_interactions(self, interactions, sample_weight):
        """map new interaction data to existing internal user/item indexes

        :param interactions: dataframe of observed user/item interactions: [user_id, item_id]
        :param sample_weight: vector of importance weights for each observed interaction
        :return: None
        """

        assert isinstance(interactions, (np.ndarray, pd.DataFrame)), "[interactions] must be np.ndarray or pd.dataframe"
        assert interactions.shape[1] == 2, "[interactions] should be: [user_id, item_id]"

        # map the raw user/item identifiers to internal zero-based index positions
        # NOTE: any user/item pairs not found in the existing indexes will be dropped
        self.interactions = pd.DataFrame(get_data(interactions).copy(),
                                         columns=['user_id', 'item_id'])
        self.interactions['user_id'] = self.interactions['user_id'].map(
            self.user_to_index).astype(np.int32)
        self.interactions['item_id'] = self.interactions['item_id'].map(
            self.item_to_index).astype(np.int32)
        self.interactions = self.interactions.rename(
            {
                'user_id': 'user_idx',
                'item_id': 'item_idx'
            }, axis=1).dropna()

        # store the sample weights internally or generate a vector of ones if not given
        if sample_weight is not None:
            assert isinstance(sample_weight, (np.ndarray, pd.Series)), "[sample_weight] must be np.ndarray or pd.series"
            assert sample_weight.ndim == 1, "[sample_weight] must be a vector (ndim=1)"
            assert len(sample_weight) == len(interactions), "[sample_weight] must have the same length as [interactions]"
            self.sample_weight = np.ascontiguousarray(get_data(sample_weight),
                                                      dtype=np.float32)
        else:
            self.sample_weight = np.ones(len(self.interactions),
                                         dtype=np.float32)

        # create a dictionary containing the set of observed items for each user
        # NOTE: if the model has been previously fit extend rather than replace the itemset for each user

        if self.is_fit:
            new_user_items = self.interactions.groupby(
                'user_idx')['item_idx'].apply(set).to_dict()
            # NOTE: .get() guards against existing users that have no interactions in the new data
            self.user_items = {
                user: np.sort(np.array(list(set(self.user_items[user]) | new_user_items.get(user, set())), dtype=np.int32))
                for user in self.user_items.keys()
            }
        else:
            self.user_items = self.interactions.sort_values([
                'user_idx', 'item_idx'
            ]).groupby('user_idx')['item_idx'].apply(np.array,
                                                     dtype=np.int32).to_dict()

        # format the interactions data as a c-contiguous integer array for cython use
        self.interactions = np.ascontiguousarray(self.interactions,
                                                 dtype=np.int32)
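The is_fit branch above supports warm-starting: when the model is refit on new interactions, each user's stored item array is extended by set union rather than replaced. A standalone sketch of that update with illustrative values:

import numpy as np

old_items = np.array([1, 4, 7], dtype=np.int32)   # items previously observed for one user
new_items = {2, 4}                                # items observed in the new interaction data

updated = np.sort(np.array(list(set(old_items) | new_items), dtype=np.int32))
print(updated)   # [1 2 4 7]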