if len(relevant_items) > 0:
        neval += 1
        #
        # TODO: Here you can write to file the recommendations for each user in the test split.
        # WARNING: there is a catch with the item idx!
        #
        # this will rank *all* items
        recommended_items = recommender.recommend(user_profile,
                                                  exclude_seen=True)
        # use this to have the *top-k* recommended items (warning: this can underestimate ROC-AUC for small k)
        # recommended_items = recommender.recommend(user_profile, k=at, exclude_seen=True)
        roc_auc_ += roc_auc(recommended_items, relevant_items)
        precision_ += precision(recommended_items, relevant_items, at=at)
        recall_ += recall(recommended_items, relevant_items, at=at)
        map_ += map(recommended_items, relevant_items, at=at)
        mrr_ += rr(recommended_items, relevant_items, at=at)
        ndcg_ += ndcg(recommended_items,
                      relevant_items,
                      relevance=test[test_user].data,
                      at=at)
roc_auc_ /= neval
precision_ /= neval
recall_ /= neval
map_ /= neval
mrr_ /= neval
ndcg_ /= neval

logger.info('Ranking quality')
logger.info('ROC-AUC: {:.4f}'.format(roc_auc_))
logger.info('Precision@{}: {:.4f}'.format(at, precision_))
logger.info('Recall@{}: {:.4f}'.format(at, recall_))
Example #2
0
    def _run_evaluation_on_selected_users(self,
                                          recommender_object,
                                          usersToEvaluate,
                                          block_size=1000):
        """
        Accumulate the evaluation metrics for the given users, batch by batch.

        Recommendations are requested from the recommender for at most
        ``block_size`` users per call; the metrics of every cutoff in
        ``self.cutoff_list`` are then updated one user at a time. The values
        are only accumulated here: no averaging over the users is performed.

        :param recommender_object: recommender exposing ``get_URM_train()`` and ``recommend(...)``
        :param usersToEvaluate: user ids to evaluate; must support ``len()`` and slicing
        :param block_size: maximum number of users passed to a single ``recommend`` call, must be positive
        :return: tuple ``(results_dict, n_users_evaluated)``, where ``results_dict`` maps each cutoff
                 to the metrics dictionary built by ``create_empty_metrics_dict``
        :raises ValueError: if ``block_size`` is not positive
        """

        # A non-positive block size would never advance the batch window
        if block_size <= 0:
            raise ValueError(
                "block_size must be a positive integer, got {}".format(
                    block_size))

        start_time = time.time()
        start_time_print = time.time()

        results_dict = {}

        for cutoff in self.cutoff_list:
            results_dict[cutoff] = create_empty_metrics_dict(
                self.n_items, self.n_users, recommender_object.get_URM_train(),
                self.ignore_items_ID, self.ignore_users_ID, cutoff,
                self.diversity_object)

        n_users_evaluated = 0

        # Batching and progress are driven by the users received as argument,
        # which may be fewer than self.usersToEvaluate. Using the attribute
        # length here would make the loop spin forever on empty batches.
        n_users_to_evaluate = len(usersToEvaluate)

        user_batch_start = 0

        while user_batch_start < n_users_to_evaluate:

            user_batch_end = min(user_batch_start + block_size,
                                 n_users_to_evaluate)

            test_user_batch_array = np.array(
                usersToEvaluate[user_batch_start:user_batch_end])
            user_batch_start = user_batch_end

            # Compute predictions for a batch of users using vectorization, much more efficient than computing it one at a time
            recommended_items_batch_list = recommender_object.recommend(
                test_user_batch_array,
                remove_seen_flag=self.exclude_seen,
                cutoff=self.max_cutoff,
                remove_top_pop_flag=False,
                remove_CustomItems_flag=self.ignore_items_flag)

            # Compute recommendation quality for each user in batch
            for batch_user_index, recommended_items in enumerate(
                    recommended_items_batch_list):

                user_id = test_user_batch_array[batch_user_index]

                # Being the URM CSR, the indices are the non-zero column indexes
                relevant_items = self.get_user_relevant_items(user_id)

                # assume_unique=True relies on neither array containing duplicates
                is_relevant = np.in1d(recommended_items,
                                      relevant_items,
                                      assume_unique=True)

                n_users_evaluated += 1

                for cutoff in self.cutoff_list:

                    results_current_cutoff = results_dict[cutoff]

                    is_relevant_current_cutoff = is_relevant[0:cutoff]
                    recommended_items_current_cutoff = recommended_items[
                        0:cutoff]

                    # Scalar metrics: summed over the users
                    results_current_cutoff[
                        EvaluatorMetrics.ROC_AUC.value] += roc_auc(
                            is_relevant_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.PRECISION.value] += precision(
                            is_relevant_current_cutoff, len(relevant_items))
                    results_current_cutoff[
                        EvaluatorMetrics.RECALL.value] += recall(
                            is_relevant_current_cutoff, relevant_items)
                    results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN.
                                           value] += recall_min_test_len(
                                               is_relevant_current_cutoff,
                                               relevant_items)
                    results_current_cutoff[EvaluatorMetrics.MAP.value] += map(
                        is_relevant_current_cutoff, relevant_items)
                    results_current_cutoff[EvaluatorMetrics.MRR.value] += rr(
                        is_relevant_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.NDCG.value] += ndcg(
                            recommended_items_current_cutoff,
                            relevant_items,
                            relevance=self.get_user_test_ratings(user_id),
                            at=cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.HIT_RATE.
                        value] += is_relevant_current_cutoff.sum()
                    results_current_cutoff[
                        EvaluatorMetrics.ARHR.value] += arhr(
                            is_relevant_current_cutoff)

                    # Object metrics: each one is fed the recommendation list
                    results_current_cutoff[
                        EvaluatorMetrics.NOVELTY.value].add_recommendations(
                            recommended_items_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_GINI.
                        value].add_recommendations(
                            recommended_items_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.SHANNON_ENTROPY.
                        value].add_recommendations(
                            recommended_items_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.COVERAGE_ITEM.
                        value].add_recommendations(
                            recommended_items_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.COVERAGE_USER.
                        value].add_recommendations(
                            recommended_items_current_cutoff, user_id)
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.
                        value].add_recommendations(
                            recommended_items_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_HERFINDAHL.
                        value].add_recommendations(
                            recommended_items_current_cutoff)

                    # This metric is present only in some metrics dictionaries
                    if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                        results_current_cutoff[
                            EvaluatorMetrics.DIVERSITY_SIMILARITY.
                            value].add_recommendations(
                                recommended_items_current_cutoff)

                # Report progress every 30 seconds and once more after the last user
                if time.time(
                ) - start_time_print > 30 or n_users_evaluated == n_users_to_evaluate:

                    elapsed_time = time.time() - start_time

                    # A coarse clock can report zero elapsed time on tiny runs
                    if elapsed_time > 0:
                        users_per_second = float(
                            n_users_evaluated) / elapsed_time
                    else:
                        users_per_second = float("inf")

                    print(
                        "SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}"
                        .format(
                            n_users_evaluated,
                            100.0 * float(n_users_evaluated) /
                            n_users_to_evaluate, elapsed_time,
                            users_per_second))

                    sys.stdout.flush()
                    sys.stderr.flush()

                    start_time_print = time.time()

        return results_dict, n_users_evaluated
Example #3
0
    def evaluateRecommender(self, recommender_object):
        """
        Evaluate the recommender on every user in ``self.usersToEvaluate``.

        Recommendations are requested one user at a time. For each cutoff in
        ``self.cutoff_list`` the scalar metrics are summed over the users and
        finally divided by the number of evaluated users, while the metrics
        stored as ``Metrics_Object`` are replaced by their ``get_metric_value()``.
        F1 is derived from the averaged precision and recall.

        :param recommender_object: the trained recommender object, a Recommender subclass
        :return: tuple ``(results_dict, results_run_string)``, where ``results_dict`` maps each
                 cutoff to its metrics dictionary and ``results_run_string`` is the value
                 returned by ``self.get_result_string(results_dict)``
        """

        results_dict = {}

        for cutoff in self.cutoff_list:
            results_dict[cutoff] = create_empty_metrics_dict(
                self.n_items, self.n_users, recommender_object.URM_train,
                self.ignore_items_ID, self.ignore_users_ID, cutoff,
                self.diversity_object)

        start_time = time.time()
        start_time_print = time.time()

        n_eval = 0

        # np.int was only an alias of the builtin int and has been removed
        # from NumPy (1.24), so the builtin is used directly.
        self.__all_items = np.arange(0, self.n_items, dtype=int)
        self.__all_items = set(self.__all_items)

        if self.ignore_items_flag:
            recommender_object.set_items_to_ignore(self.ignore_items_ID)

        for test_user in self.usersToEvaluate:

            # Being the URM CSR, the indices are the non-zero column indexes
            relevant_items = self.get_user_relevant_items(test_user)

            n_eval += 1

            self.user_specific_remove_items(recommender_object, test_user)

            # recommend is called with a one-element array of users, so the
            # list for this user is the first element of the result
            recommended_items = recommender_object.recommend(
                np.atleast_1d(test_user),
                remove_seen_flag=self.exclude_seen,
                cutoff=self.max_cutoff,
                remove_top_pop_flag=False,
                remove_CustomItems_flag=self.ignore_items_flag)

            recommended_items = np.array(recommended_items[0])

            recommender_object.reset_items_to_ignore()

            # assume_unique=True relies on neither array containing duplicates
            is_relevant = np.in1d(recommended_items,
                                  relevant_items,
                                  assume_unique=True)

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                is_relevant_current_cutoff = is_relevant[0:cutoff]
                recommended_items_current_cutoff = recommended_items[0:cutoff]

                # Scalar metrics: summed here, averaged after the user loop
                results_current_cutoff[
                    EvaluatorMetrics.ROC_AUC.value] += roc_auc(
                        is_relevant_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.PRECISION.value] += precision(
                        is_relevant_current_cutoff, len(relevant_items))
                results_current_cutoff[
                    EvaluatorMetrics.RECALL.value] += recall(
                        is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN.
                                       value] += recall_min_test_len(
                                           is_relevant_current_cutoff,
                                           relevant_items)
                results_current_cutoff[EvaluatorMetrics.MAP.value] += map(
                    is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.MRR.value] += rr(
                    is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(
                    recommended_items_current_cutoff,
                    relevant_items,
                    relevance=self.get_user_test_ratings(test_user),
                    at=cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.HIT_RATE.
                    value] += is_relevant_current_cutoff.sum()
                results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(
                    is_relevant_current_cutoff)

                # Object metrics: each one is fed the recommendation list
                results_current_cutoff[
                    EvaluatorMetrics.NOVELTY.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(
                        recommended_items_current_cutoff, test_user)
                results_current_cutoff[
                    EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.
                    value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)

                # This metric is present only in some metrics dictionaries
                if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_SIMILARITY.
                        value].add_recommendations(
                            recommended_items_current_cutoff)

            # Report progress every 30 seconds and once more after the last user
            if time.time() - start_time_print > 30 or n_eval == len(
                    self.usersToEvaluate):

                elapsed_time = time.time() - start_time

                # A coarse clock can report zero elapsed time on tiny runs
                if elapsed_time > 0:
                    users_per_second = float(n_eval) / elapsed_time
                else:
                    users_per_second = float("inf")

                print(
                    "SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}"
                    .format(n_eval,
                            100.0 * float(n_eval) / len(self.usersToEvaluate),
                            elapsed_time, users_per_second))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print = time.time()

        if (n_eval > 0):

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                # Only existing keys are reassigned, so iterating the dict
                # while updating it is safe
                for key in results_current_cutoff.keys():

                    value = results_current_cutoff[key]

                    if isinstance(value, Metrics_Object):
                        results_current_cutoff[key] = value.get_metric_value()
                    else:
                        results_current_cutoff[key] = value / n_eval

                precision_ = results_current_cutoff[
                    EvaluatorMetrics.PRECISION.value]
                recall_ = results_current_cutoff[EvaluatorMetrics.RECALL.value]

                # F1 is left untouched when both precision and recall are zero
                if precision_ + recall_ != 0:
                    results_current_cutoff[EvaluatorMetrics.F1.value] = 2 * (
                        precision_ * recall_) / (precision_ + recall_)

        else:
            print(
                "WARNING: No users had a sufficient number of relevant items")

        if self.ignore_items_flag:
            recommender_object.reset_items_to_ignore()

        results_run_string = self.get_result_string(results_dict)

        return (results_dict, results_run_string)
Example #4
0
    def _run_evaluation_on_selected_users(self, recommender_object,
                                          usersToEvaluate):
        """
        Accumulate the evaluation metrics for the given users, one user at a time.

        For every user the recommender is asked for ``self.max_cutoff`` items and
        the metrics of each cutoff in ``self.cutoff_list`` are updated. The values
        are only accumulated here: no averaging over the users is performed.

        NOTE(review): progress is reported against ``len(self.usersToEvaluate)``,
        not against the ``usersToEvaluate`` argument - confirm this is intended
        when a subset of the users is passed.

        :param recommender_object: recommender exposing ``URM_train`` and ``recommend(...)``
        :param usersToEvaluate: iterable of the user ids to evaluate
        :return: tuple ``(results_dict, n_users_evaluated)``, where ``results_dict`` maps each
                 cutoff to the metrics dictionary built by ``create_empty_metrics_dict``
        """

        evaluation_start = time.time()
        last_report_time = time.time()

        # One metrics dictionary per requested cutoff
        results_dict = {
            cutoff: create_empty_metrics_dict(
                self.n_items, self.n_users, recommender_object.URM_train,
                self.ignore_items_ID, self.ignore_users_ID, cutoff,
                self.diversity_object)
            for cutoff in self.cutoff_list
        }

        n_users_evaluated = 0

        for n_users_evaluated, test_user in enumerate(usersToEvaluate,
                                                      start=1):

            # Being the URM CSR, the indices are the non-zero column indexes
            relevant_items = self.get_user_relevant_items(test_user)

            recommended_items = recommender_object.recommend(
                test_user,
                remove_seen_flag=self.exclude_seen,
                cutoff=self.max_cutoff,
                remove_top_pop_flag=False,
                remove_CustomItems_flag=self.ignore_items_flag)

            # Boolean mask flagging which recommended items are relevant
            is_relevant = np.in1d(recommended_items,
                                  relevant_items,
                                  assume_unique=True)

            for cutoff in self.cutoff_list:

                metrics = results_dict[cutoff]

                hits_at_cutoff = is_relevant[:cutoff]
                items_at_cutoff = recommended_items[:cutoff]

                # Scalar metrics: summed over the users
                metrics[EvaluatorMetrics.ROC_AUC.value] += roc_auc(
                    hits_at_cutoff)
                metrics[EvaluatorMetrics.PRECISION.value] += precision(
                    hits_at_cutoff, len(relevant_items))
                metrics[EvaluatorMetrics.RECALL.value] += recall(
                    hits_at_cutoff, relevant_items)
                metrics[EvaluatorMetrics.RECALL_TEST_LEN.
                        value] += recall_min_test_len(hits_at_cutoff,
                                                      relevant_items)
                metrics[EvaluatorMetrics.MAP.value] += map(
                    hits_at_cutoff, relevant_items)
                metrics[EvaluatorMetrics.MRR.value] += rr(hits_at_cutoff)
                metrics[EvaluatorMetrics.NDCG.value] += ndcg(
                    items_at_cutoff,
                    relevant_items,
                    relevance=self.get_user_test_ratings(test_user),
                    at=cutoff)
                metrics[EvaluatorMetrics.HIT_RATE.
                        value] += hits_at_cutoff.sum()
                metrics[EvaluatorMetrics.ARHR.value] += arhr(hits_at_cutoff)

                # Object metrics: fed the recommendation list, in the same
                # order as always (user coverage also needs the user id)
                for metric in (EvaluatorMetrics.NOVELTY,
                               EvaluatorMetrics.DIVERSITY_GINI,
                               EvaluatorMetrics.SHANNON_ENTROPY,
                               EvaluatorMetrics.COVERAGE_ITEM):
                    metrics[metric.value].add_recommendations(items_at_cutoff)

                metrics[EvaluatorMetrics.COVERAGE_USER.
                        value].add_recommendations(items_at_cutoff, test_user)

                for metric in (EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST,
                               EvaluatorMetrics.DIVERSITY_HERFINDAHL):
                    metrics[metric.value].add_recommendations(items_at_cutoff)

                # This metric is present only in some metrics dictionaries
                similarity_key = EvaluatorMetrics.DIVERSITY_SIMILARITY.value
                if similarity_key in metrics:
                    metrics[similarity_key].add_recommendations(
                        items_at_cutoff)

            # Report progress every 30 seconds and once the processed count
            # reaches the size of self.usersToEvaluate
            if time.time() - last_report_time > 30 or n_users_evaluated == len(
                    self.usersToEvaluate):

                percent_done = 100.0 * float(n_users_evaluated) / len(
                    self.usersToEvaluate)
                elapsed_seconds = time.time() - evaluation_start
                users_per_second = float(n_users_evaluated) / (
                    time.time() - evaluation_start)

                print(
                    "SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}"
                    .format(n_users_evaluated, percent_done, elapsed_seconds,
                            users_per_second))

                sys.stdout.flush()
                sys.stderr.flush()

                last_report_time = time.time()

        return results_dict, n_users_evaluated