Code Example #1
File: precision.py  Project: hugh-whitesource/jurity
import numpy as np
import pandas as pd

from jurity.utils import Constants, get_sorted_clicks  # assumed import path for these helpers


def precision(actual_results: pd.DataFrame,
              predicted_results: pd.DataFrame,
              click_column: str,
              k: int,
              user_id_column: str = Constants.user_id,
              item_id_column: str = Constants.item_id):
    # Only consider clicks
    actual_results = actual_results.astype({click_column: bool})
    actual_clicks = actual_results[actual_results[click_column]]

    # Get the users to score, i.e. those who have both clicks and predictions
    users = np.intersect1d(actual_clicks[user_id_column].unique(),
                           predicted_results[user_id_column].unique())

    # Sort and get the top predictions
    predicted_results = predicted_results.set_index(
        [user_id_column, item_id_column])
    sorted_clicks = get_sorted_clicks(predicted_results, user_id_column,
                                      click_column, k)

    # Merge the predictions and actual clicks together
    merged = sorted_clicks.join(actual_clicks.set_index([user_id_column, item_id_column]), rsuffix='_ac')
    merged = merged.fillna(False)

    # Only look at users who have both clicks and predictions
    merged = merged[merged.index.isin(users, level=0)]

    # Get the precision results
    merged_group = merged.groupby(user_id_column)[f'{click_column}_ac']
    results = merged_group.mean().values

    return results
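
A minimal usage sketch with hypothetical toy data; it assumes jurity's default column names `user_id`/`item_id` and the `precision` function above in scope:

import pandas as pd

# User 1 clicked items 10 and 11; user 2 clicked item 12.
actual = pd.DataFrame({'user_id': [1, 1, 2],
                       'item_id': [10, 11, 12],
                       'clicks': [1, 1, 1]})

# Scored recommendations; a higher score means a higher rank.
predicted = pd.DataFrame({'user_id': [1, 1, 2, 2],
                          'item_id': [10, 13, 12, 14],
                          'clicks': [0.9, 0.8, 0.7, 0.6]})

# Precision@2 per user: user 1 hits 1 of 2, user 2 hits 1 of 2.
per_user = precision(actual, predicted, click_column='clicks', k=2)
print(per_user.mean())  # 0.5
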
Code Example #2
File: recall.py  Project: hugh-whitesource/jurity
import pandas as pd

from jurity.utils import Constants, get_sorted_clicks  # assumed import path for these helpers


def recall(actual_results: pd.DataFrame, predicted_results: pd.DataFrame, click_column: str, k: int,
           user_id_column: str = Constants.user_id, item_id_column: str = Constants.item_id):
    # Only consider clicks
    actual_results = actual_results.astype({click_column: bool})
    actual_clicks = actual_results[actual_results[click_column]]

    # Sort and get the top predictions
    predicted_results = predicted_results.set_index([user_id_column, item_id_column])
    sorted_clicks = get_sorted_clicks(predicted_results, user_id_column, click_column, k)

    # Merge the predictions and actual clicks together
    merged = actual_clicks.set_index([user_id_column, item_id_column]).join(sorted_clicks, rsuffix='_pr')
    included = merged[f'{click_column}_pr'].notna()
    results = included.groupby(user_id_column).mean().values

    return results
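
The same hypothetical toy data illustrates the difference from precision: recall divides by each user's number of actual clicks rather than by k:

import pandas as pd

actual = pd.DataFrame({'user_id': [1, 1, 2],
                       'item_id': [10, 11, 12],
                       'clicks': [1, 1, 1]})
predicted = pd.DataFrame({'user_id': [1, 1, 2, 2],
                          'item_id': [10, 13, 12, 14],
                          'clicks': [0.9, 0.8, 0.7, 0.6]})

# Recall@2 per user: user 1 recovers 1 of 2 clicks, user 2 recovers 1 of 1.
per_user = recall(actual, predicted, click_column='clicks', k=2)
print(per_user.mean())  # 0.75
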
Code Example #3
    def get_score(
        self,
        actual_results: pd.DataFrame,
        predicted_results: pd.DataFrame,
        batch_accumulate: bool = False,
        return_extended_results: bool = False
    ) -> Union[float, dict, Tuple[float, float], Tuple[dict, dict]]:
        """Evaluates the current metric on the given data.

        There are 4 scenarios controlled by the ``batch_accumulate`` and ``return_extended_results`` parameters:

        1) Calculating the metric for the whole data:

        This is the default method, which assumes you are operating on the full data and you want to get the metric by
        itself. Returns ``float``.

        .. highlight:: python
        .. code-block:: python

            print(ctr.get_score(actual_responses_batch, recommendations_batch))
            >>> 0.316

        2) Calculating the extended results for the whole data:

        This assumes you are operating on the full data and you want to get the auxiliary information such as the
        support in addition to the metric. The information returned depends on the metric. Returns ``dict``.

        .. highlight:: python
        .. code-block:: python

            print(ctr.get_score(actual_responses_batch, recommendations_batch, return_extended_results=True))
            >>> {'ctr': 0.316, 'support': 122}

        3) Calculating the metric across multiple batches:

        This assumes that you are operating on batched data, and will therefore call this method multiple times for each
        batch. It also assumes that you want to get the metric by itself. Returns ``Tuple[float, float]``.

        .. highlight:: python
        .. code-block:: python

            for actual_responses_batch, recommendations_batch in ...:
                ctr_batch, ctr_acc = ctr.get_score(actual_responses_batch, recommendations_batch, batch_accumulate=True)
                print(f'CTR for this batch: {ctr_batch} Overall CTR: {ctr_acc}')
                >>> CTR for this batch: 0.453 Overall CTR: 0.316

        4) Calculating the extended results across multiple batches:

        This assumes you are operating on batched data, and will therefore call this method multiple times for each
        batch. It also assumes you want to get the auxiliary information such as the support in addition to the metric.
        The information returned depends on the metric. Returns ``Tuple[dict, dict]``.

        .. highlight:: python
        .. code-block:: python

            for actual_responses_batch, recommendations_batch in ...:
                ctr_batch, ctr_acc = ctr.get_score(actual_responses_batch, recommendations_batch, batch_accumulate=True, return_extended_results=True)
                print(f'CTR for this batch: {ctr_batch} Overall CTR: {ctr_acc}')
                >>> CTR for this batch: {'ctr': 0.453, 'support': 12} Overall CTR: {'ctr': 0.316, 'support': 122}

        Parameters
        ----------
        actual_results: pd.DataFrame
            A pandas DataFrame of the ground truth user-item interaction data, captured from historical logs.
            The DataFrame should contain at least the ``self._user_id_column`` and ``self._item_id_column``
            columns, plus anything else the metric may need. Each row contains the interaction of one user
            with one item and the scores associated with this interaction. There can be multiple interactions
            per user and multiple users per DataFrame, but the interactions for a specific user must be
            contained within a single DataFrame.
        predicted_results: pd.DataFrame
            A pandas DataFrame of the recommended user-item interaction data, captured from a recommendation
            algorithm. The DataFrame should contain at least the ``self._user_id_column`` and
            ``self._item_id_column`` columns, plus anything else the metric may need. Each row contains the
            interaction of one user with one item and the scores associated with this interaction. There can
            be multiple interactions per user and multiple users per DataFrame, but the interactions for a
            specific user must be contained within a single DataFrame.
        batch_accumulate: bool
            If True, minibatches of results can be passed in and the metric is accumulated correctly across
            the batches. This reduces the memory footprint and integrates easily with batched training. When
            set, ``get_score`` returns a tuple of the batch result and the accumulated result.
        return_extended_results: bool
            Whether the extended results such as the support should also be returned. If True, the returned
            results will be of type ``dict``. CTR currently returns ``ctr`` and the ``support`` used to
            calculate CTR.

        Returns
        -------
        metric: Union[float, dict, Tuple[float, float], Tuple[dict, dict]]
            The averaged result(s). The return type is determined by the ``batch_accumulate`` and
            ``return_extended_results`` parameters. See the examples above.
        """
        actual_results = actual_results.set_index(
            [self._user_id_column, self._item_id_column])
        predicted_results = predicted_results.set_index(
            [self._user_id_column, self._item_id_column])
        if self.k is not None:
            sorted_clicks = get_sorted_clicks(predicted_results,
                                              self._user_id_column,
                                              self.click_column, self.k)
        else:
            sorted_clicks = predicted_results

        matches = actual_results.join(sorted_clicks, how='inner', rsuffix='_r')
        clicks = matches[self.value_column].values

        return self._accumulate_and_return(clicks, batch_accumulate,
                                           return_extended_results)
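
A hedged usage sketch: assuming this method lives on jurity's `BinaryRecoMetrics.CTR` class and that `value_column` defaults to the click column (both assumptions, not confirmed by the excerpt above), scoring would look like:

import pandas as pd
from jurity.recommenders import BinaryRecoMetrics  # assumed class location

actual = pd.DataFrame({'user_id': [1, 1, 2],
                       'item_id': [10, 13, 12],
                       'clicks': [1, 0, 1]})
predicted = pd.DataFrame({'user_id': [1, 1, 2],
                          'item_id': [10, 13, 12],
                          'clicks': [0.9, 0.8, 0.7]})

ctr = BinaryRecoMetrics.CTR(click_column='clicks')
# All three recommendations match an actual interaction; two of them were clicked.
print(ctr.get_score(actual, predicted))  # ~0.667 on this toy data
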
Code Example #4
import warnings
from typing import Callable, Union

import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances

from jurity.utils import Constants, get_sorted_clicks, sample_users  # assumed import paths


def intralist_diversity(predicted_results: pd.DataFrame,
                        item_features: pd.DataFrame,
                        click_column: str,
                        k: int,
                        user_id_column: str = Constants.user_id,
                        item_id_column: str = Constants.item_id,
                        user_sample_size: Union[int, float, None] = 10000,
                        seed: int = Constants.default_seed,
                        metric: Union[str, Callable] = 'cosine',
                        n_jobs: int = 1,
                        num_runs: int = 10):
    """
    Intra-List Diversity@k measures the intra-list diversity of the recommendations when only k recommendations are
    made to the user. Given the list of items recommended to one user and the item features, the average pairwise
    cosine distance of those items is calculated. The results from all users are then averaged as the metric
    Intra-List Diversity@k.
    This metric has a range in :math:`[0, 1]`. The higher this metric is, the more diversified
    items are recommended to each user. Let :math:`U` denote the set of :math:`N` unique users, :math:`u_i` denote
    the i-th user in the user set, :math:`i \in \{1,2,\cdots,N\}`. :math:`v_p^{u_i}`, :math:`v_q^{u_i}` are the
    item features of the p-th and q-th item in the list of items recommended to :math:`u_i`,
    :math:`p, q \in \{0,1,\cdots,k-1\}`. :math:`I^{u_i}` is the set of all unique pairs of item indices for :math:`u_i`,
    :math:`\\forall~p<q, \{p, q\} \in I^{u_i}`.

    .. math::
            Intra\mbox{-} list~diversity = \\frac{1}{N}\sum_{i=1}^N \\frac{\sum_{p, q, \{p, q\} \in I^{u_i}}(cosine\_distance(v_p^{u_i}, v_q^{u_i}))}{|I^{u_i}|}

    By default, the reported metric is averaged over ``num_runs`` (default=10) evaluations, each using
    ``user_sample_size`` (default=10000) sampled users, to speed up computation while closely approximating the
    metric. When ``user_sample_size=None``, all users are used in the evaluation.

    Parameters
    ----------
    predicted_results: pd.DataFrame
        Recommendations data frame with (user_id, item_id, score) in each row.
    item_features: pd.DataFrame
        Features data frame with (item_id, feature_1, feature_2, ..., feature_n) in each row.
    click_column: str
        Recommendation score column name.
    k: int
        Top-k recommendations to consider.
    user_id_column: str
        User id column name.
    item_id_column: str
        Item id column name.
    user_sample_size: Union[int, float, None]
        When input is an integer, it defines the number of randomly sampled users. When input is float, it defines the
        proportion of users to randomly sample for evaluation. If it is None, all users are included. Default=10,000.
    seed: int
        The seed used to create random state.
    metric: Union[str, Callable]
        Default = 'cosine'. The distance metric leveraged by sklearn.metrics.pairwise_distances.
        The metric to use when calculating distance between instances in a feature array.
        If metric is a string, it must be one of the options allowed by scipy.spatial.distance.pdist for its metric
        parameter, or a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. If metric is a callable function,
        it is called on each pair of instances (rows) and the resulting value recorded.
        The callable should take two arrays from X as input and return a value indicating the distance between them.
    num_runs: int
        Number of runs (default=10) used to approximate Intra-List Diversity over smaller samples of users,
        for faster evaluation. The sampling size is defined by user_sample_size, and the final result is
        averaged over the runs.
    n_jobs: int
        Number of jobs to use for computation in parallel, leveraged by sklearn.metrics.pairwise_distances.
        -1 means using all processors. Default=1.

    Returns
    -------
    A numpy array of per-user intra-list diversity scores.
    """

    # Sample users
    if user_sample_size is not None:
        results_over_runs = []

        # Create a different seed for each run
        rng = np.random.default_rng(seed)
        seeds = rng.integers(0, num_runs * 10, num_runs)

        for i in range(num_runs):
            df = sample_users(predicted_results,
                              user_id_column,
                              user_sample_size,
                              seed=seeds[i])

            res = intralist_diversity(df,
                                      item_features,
                                      click_column,
                                      k,
                                      user_id_column=user_id_column,
                                      item_id_column=item_id_column,
                                      user_sample_size=None,
                                      metric=metric,
                                      n_jobs=n_jobs)
            results_over_runs.extend(res)

        return np.array(results_over_runs)

    if k == 1:
        warnings.warn(
            'Intra-List Diversity will be nan when only one item is provided in item lists.'
        )

    df = predicted_results

    # Sort by user and score, and take the top k scores.
    df = get_sorted_clicks(df, user_id_column, click_column, k)

    # Map user ids and item ids to indices
    unique_user_ids = list(df[user_id_column].unique())
    user_id_map = dict(zip(unique_user_ids, range(len(unique_user_ids))))

    item_features = item_features.reset_index(drop=True)
    # Map each item id to its positional row index in the features frame
    item_id_map = {item_id: idx for idx, item_id in item_features[item_id_column].to_dict().items()}

    df['item_index'] = df[item_id_column].map(item_id_map).values
    df['user_index'] = df[user_id_column].map(user_id_map).values

    # Cross join on user_id to get all 2-item combinations of items within each list
    df_merged = df.merge(df, on=user_id_column)

    # Keep each unordered item pair once and drop same-item pairs
    df_merged = df_merged[df_merged['item_index_x'] < df_merged['item_index_y']][
        [user_id_column, 'item_index_x', 'item_index_y']]

    # Compute pairwise distances for all items in the features dataframe
    item_feature_minus = item_features.set_index(item_id_column)
    cosine_distance = pairwise_distances(item_feature_minus, metric=metric, n_jobs=n_jobs)

    # Fetch the distance for every item pair
    df_merged['cosine_distance'] = cosine_distance[df_merged['item_index_x'],
                                                   df_merged['item_index_y']]
    results = df_merged[[user_id_column,
                         'cosine_distance']].groupby(user_id_column).mean()

    intra_list_diversity = results.values.flatten()

    return intra_list_diversity
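
A minimal sketch with hypothetical toy data; `user_sample_size=None` evaluates all users in a single pass:

import pandas as pd

predicted = pd.DataFrame({'user_id': [1, 1, 2, 2],
                          'item_id': [10, 11, 10, 12],
                          'clicks': [0.9, 0.8, 0.7, 0.6]})
item_features = pd.DataFrame({'item_id': [10, 11, 12],
                              'feat_1': [1.0, 0.0, 1.0],
                              'feat_2': [0.0, 1.0, 1.0]})

per_user = intralist_diversity(predicted, item_features, click_column='clicks',
                               k=2, user_sample_size=None)
# User 1's items are orthogonal (distance 1.0); user 2's items overlap (distance ~0.293).
print(per_user)  # approximately [1.0, 0.293]
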
Code Example #5
import warnings
from typing import Callable, Tuple, Union

import numpy as np
import pandas as pd
from sklearn.metrics import pairwise_distances_chunked

from jurity.utils import Constants, get_sorted_clicks, sample_users, tocsr  # assumed import paths

# ``reduce_func`` used below is assumed to be defined elsewhere in this module.


def interlist_diversity(predicted_results: pd.DataFrame,
                        click_column: str,
                        k: int,
                        user_id_column: str = Constants.user_id,
                        item_id_column: str = Constants.item_id,
                        user_sample_size: Union[int, float, None] = 10000,
                        seed: int = Constants.default_seed,
                        metric: Union[str, Callable] = 'cosine',
                        num_runs: int = 10,
                        n_jobs: int = 1,
                        working_memory: Union[int, None] = None) -> Tuple[float, int]:
    """
    Inter-List Diversity@k measures the inter-list diversity of the recommendations when only k recommendations are
    made to the user. It measures how different the lists of recommended items are across users. This metric has a
    range in :math:`[0, 1]`. The higher this metric is, the more diversified lists of items are recommended to different
    users. Let :math:`U` denote the set of :math:`N` unique users, :math:`u_i`, :math:`u_j \in U` denote the i-th and
    j-th user in the user set, :math:`i, j \in \{1,2,\cdots,N\}`. :math:`R_{u_i}` is the binary indicator vector
    representing provided recommendations for :math:`u_i`. :math:`I` is the set of all unique user pairs,
    :math:`\\forall~i<j, \{u_i, u_j\} \in I`.

    .. math::
            Inter \mbox{-} list~diversity = \\frac{\sum_{i,j, \{u_i, u_j\} \in I}(cosine\_distance(R_{u_i}, R_{u_j}))}{|I|}

    By default, the reported metric is averaged over ``num_runs`` (default=10) evaluations, each using
    ``user_sample_size`` (default=10000) sampled users, to speed up computation while closely approximating the
    metric. When ``user_sample_size=None``, all users are used in the evaluation.

    Parameters
    ----------
    predicted_results: pd.DataFrame
        Recommendations data frame with (user_id, item_id, score) in each row.
    k: int
        Top-k recommendations to consider.
    user_id_column: str
        User id column name.
    item_id_column: str
        Item id column name.
    click_column: str
        Recommendation score column name.
    user_sample_size: Union[int, float, None]
        When input is an integer, it defines the number of randomly sampled users. When input is float, it defines the
        proportion of users to randomly sample for evaluation. If it is None, all users are included. Default=10,000.
    seed: int
        The seed used to create random state.
    metric: Union[str, Callable]
        Default = 'cosine'. The distance metric leveraged by sklearn.metrics.pairwise_distances_chunked.
        The metric to use when calculating distance between instances in a feature array.
        If metric is a string, it must be one of the options allowed by scipy.spatial.distance.pdist for its metric
        parameter, or a metric listed in pairwise.PAIRWISE_DISTANCE_FUNCTIONS. If metric is a callable function,
        it is called on each pair of instances (rows) and the resulting value recorded.
        The callable should take two arrays from X as input and return a value indicating the distance between them.
    num_runs: int
        Number of runs (default=10) used to approximate Inter-List Diversity over smaller samples of users,
        for faster evaluation. The sampling size is defined by user_sample_size, and the final result is
        averaged over the runs.
    n_jobs: int
        Number of jobs to use for computation in parallel, leveraged by sklearn.metrics.pairwise_distances_chunked.
        -1 means using all processors. Default=1.
    working_memory: Union[int, None]
        Maximum memory for temporary distance matrix chunks, leveraged by sklearn.metrics.pairwise_distances_chunked.
        Example input: working_memory = 1024. When None (default), the value of sklearn.get_config()['working_memory'],
        i.e. 1024M, is used.

    Returns
    -------
    Inter-list diversity metric, number of unique users as the support to get the metric
    """

    # Sample users
    if user_sample_size is not None:
        results_over_runs = []
        supports_over_runs = []

        # Create a different seed for each run
        rng = np.random.default_rng(seed)
        seeds = rng.integers(0, num_runs * 10, num_runs)

        for i in range(num_runs):

            df = sample_users(predicted_results,
                              user_id_column,
                              user_sample_size,
                              seed=seeds[i])

            res, support = interlist_diversity(df,
                                               click_column,
                                               k,
                                               user_id_column=user_id_column,
                                               item_id_column=item_id_column,
                                               user_sample_size=None,
                                               metric=metric,
                                               n_jobs=n_jobs,
                                               working_memory=working_memory)
            results_over_runs.append(res)
            supports_over_runs.append(support)

        inter_list_diversity = np.mean(results_over_runs)
        support = int(np.mean(supports_over_runs))

        return inter_list_diversity, support

    df = predicted_results

    # Sort by user and score, and take the top k scores.
    df = get_sorted_clicks(df, user_id_column, click_column, k)

    # Given user/item id column names, create sparse matrix as the new representation of user-item interactions.
    sparse_matrix = tocsr(df, user_id_column, item_id_column)

    # Get pairwise cosine distances
    chunked_sum_cosine_distances = map(
        sum,
        pairwise_distances_chunked(sparse_matrix,
                                   reduce_func=reduce_func,
                                   metric=metric,
                                   n_jobs=n_jobs,
                                   working_memory=working_memory))

    # Sum of all cosine distances of unique pairs
    sum_cosine_distances = sum(list(chunked_sum_cosine_distances)) / 2.0

    # Get the number of unique user pairs: n * (n - 1) / 2
    num_pairs = np.sum(range(sparse_matrix.shape[0]))

    # Calculate metric
    if num_pairs == 0:
        inter_list_diversity = np.nan
        warnings.warn(
            'Inter-List Diversity will be nan when there is only a single user.'
        )
    else:
        inter_list_diversity = sum_cosine_distances / num_pairs
        if np.abs(inter_list_diversity) <= 1e-06:
            inter_list_diversity = 0.0

    # Calculate support, set it to be the number of users
    support = len(df[user_id_column].unique())

    return inter_list_diversity, support
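
A minimal sketch with hypothetical toy data; it assumes the module's `tocsr` and `reduce_func` helpers behave as the code above implies (binary user-item matrix, per-chunk distance sums):

import pandas as pd

# Two users with disjoint top-2 lists: their binary item vectors are orthogonal.
predicted = pd.DataFrame({'user_id': [1, 1, 2, 2],
                          'item_id': [10, 11, 12, 13],
                          'clicks': [0.9, 0.8, 0.7, 0.6]})

diversity, support = interlist_diversity(predicted, click_column='clicks',
                                         k=2, user_sample_size=None)
print(diversity, support)  # 1.0 2 -- disjoint lists have cosine distance 1
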
Code Example #6
    def get_score(self, actual_results: pd.DataFrame, predicted_results: pd.DataFrame, batch_accumulate: bool = False,
                  return_extended_results: bool = False) -> Union[float, dict, Tuple[float, float], Tuple[dict, dict]]:
        """Evaluates the current metric on the given data.

        There are 4 scenarios controlled by the ``batch_accumulate`` and ``return_extended_results`` parameters:

        1) Calculating the metric for the whole data:

        This is the default method, which assumes you are operating on the full data and you want to get the metric by
        itself. Returns ``float``.

        .. highlight:: python
        .. code-block:: python

            print(ctr.get_score(actual_responses_batch, recommendations_batch))
            >>> 0.316

        2) Calculating the extended results for the whole data:

        This assumes you are operating on the full data and you want to get the auxiliary information such as the
        support in addition to the metric. The information returned depends on the metric. Returns ``dict``.

        .. highlight:: python
        .. code-block:: python

            print(ctr.get_score(actual_responses_batch, recommendations_batch, return_extended_results=True))
            >>> {'ctr': 0.316, 'support': 122}

        3) Calculating the metric across multiple batches:

        This assumes that you are operating on batched data, and will therefore call this method multiple times for each
        batch. It also assumes that you want to get the metric by itself. Returns ``Tuple[float, float]``.

        .. highlight:: python
        .. code-block:: python

            for actual_responses_batch, recommendations_batch in ...:
                ctr_batch, ctr_acc = ctr.get_score(actual_responses_batch, recommendations_batch, batch_accumulate=True)
                print(f'CTR for this batch: {ctr_batch} Overall CTR: {ctr_acc}')
                >>> CTR for this batch: 0.453 Overall CTR: 0.316

        4) Calculating the extended results across multiple batches:

        This assumes you are operating on batched data, and will therefore call this method multiple times for each
        batch. It also assumes you want to get the auxiliary information such as the support in addition to the metric.
        The information returned depends on the metric. Returns ``Tuple[dict, dict]``.

        .. highlight:: python
        .. code-block:: python

            for actual_responses_batch, recommendations_batch in ...:
                ctr_batch, ctr_acc = ctr.get_score(actual_responses_batch, recommendations_batch, batch_accumulate=True, return_extended_results=True)
                print(f'CTR for this batch: {ctr_batch} Overall CTR: {ctr_acc}')
                >>> CTR for this batch: {'ctr': 0.453, 'support': 12} Overall CTR: {'ctr': 0.316, 'support': 122}

        Parameters
        ----------
        actual_results: pd.DataFrame
            A pandas DataFrame of the ground truth user-item interaction data, captured from historical logs.
            The DataFrame should contain at least the ``self._user_id_column`` and ``self._item_id_column``
            columns, plus anything else the metric may need. Each row contains the interaction of one user
            with one item and the scores associated with this interaction. There can be multiple interactions
            per user and multiple users per DataFrame, but the interactions for a specific user must be
            contained within a single DataFrame.
        predicted_results: pd.DataFrame
            A pandas DataFrame of the recommended user-item interaction data, captured from a recommendation
            algorithm. The DataFrame should contain at least the ``self._user_id_column`` and
            ``self._item_id_column`` columns, plus anything else the metric may need. Each row contains the
            interaction of one user with one item and the scores associated with this interaction. There can
            be multiple interactions per user and multiple users per DataFrame, but the interactions for a
            specific user must be contained within a single DataFrame.
        batch_accumulate: bool
            If True, minibatches of results can be passed in and the metric is accumulated correctly across
            the batches. This reduces the memory footprint and integrates easily with batched training. When
            set, ``get_score`` returns a tuple of the batch result and the accumulated result.
        return_extended_results: bool
            Whether the extended results such as the support should also be returned. If True, the returned
            results will be of type ``dict``. MAP currently returns ``map`` and the ``support`` used to
            calculate MAP.

        Returns
        -------
        metric: Union[float, dict, Tuple[float, float], Tuple[dict, dict]]
            The averaged result(s). The return type is determined by the ``batch_accumulate`` and
            ``return_extended_results`` parameters. See the examples above.
        """
        # Only consider clicks
        actual_results = actual_results.astype({self.click_column: bool})
        actual_clicks = actual_results[actual_results[self.click_column]]

        # Get the users to score, i.e. those who have both clicks and predictions
        users = np.intersect1d(actual_clicks[self._user_id_column].unique(),
                               predicted_results[self._user_id_column].unique())

        # Sort and get the top predictions
        predicted_results = predicted_results.set_index([self._user_id_column, self._item_id_column])
        sorted_clicks = get_sorted_clicks(predicted_results, self._user_id_column, self.click_column, self.k)

        # Merge the predictions and actual clicks together
        merged = sorted_clicks.join(actual_clicks.set_index([self._user_id_column, self._item_id_column]),
                                    rsuffix='_ac')
        merged = merged.fillna(False)
        merged = merged[merged.index.isin(users, level=0)]  # Only look at users who have both clicks and predictions

        # Calculate Precision@n for all n in [1, k]. If the item at rank n is not relevant for a user,
        # replace the precision value with 0. The sum of these values per user makes up the numerator of MAP@k
        merged_group = merged.groupby(self._user_id_column)[f'{self.click_column}_ac']
        sums = (merged_group.cumsum() / (merged_group.cumcount() + 1)) * merged[f'{self.click_column}_ac']
        precision_at_ks = sums.groupby(self._user_id_column).sum().values

        # Get the number of clicks per user
        clicks_per_user = (actual_clicks[actual_clicks[self._user_id_column].isin(users)]
                           .groupby(self._user_id_column)
                           .count()[self.click_column])

        # Divide by the maximum possible number of relevant items to get the mean. The maximum possible number of
        # relevant items is the minimum of k and the number of actual clicks.
        results = precision_at_ks / np.minimum(self.k, clicks_per_user)

        return self._accumulate_and_return(results, batch_accumulate, return_extended_results)
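
A hedged usage sketch: assuming this method lives on jurity's `RankingRecoMetrics.MAP` class (an assumption, not confirmed by the excerpt above), a worked MAP@3 computation would look like:

import pandas as pd
from jurity.recommenders import RankingRecoMetrics  # assumed class location

actual = pd.DataFrame({'user_id': [1, 1],
                       'item_id': [10, 11],
                       'clicks': [1, 1]})
# Ranked by score: item 10 (hit), item 12 (miss), item 11 (hit).
predicted = pd.DataFrame({'user_id': [1, 1, 1],
                          'item_id': [10, 12, 11],
                          'clicks': [0.9, 0.8, 0.7]})

map_metric = RankingRecoMetrics.MAP(click_column='clicks', k=3)
# Precision@1 = 1 and Precision@3 = 2/3 at the two hits; AP = (1 + 2/3) / min(3, 2 clicks)
print(map_metric.get_score(actual, predicted))  # ~0.833
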