Example #1
    def test_NDCG(self):

        from Base.Evaluation.metrics import dcg, ndcg

        pos_items = np.asarray([2, 4, 5, 10])
        pos_relevances = np.asarray([5, 4, 3, 2])
        ranked_list_1 = np.asarray([1, 2, 3, 4, 5])  # rel = 0, 5, 0, 4, 3
        ranked_list_2 = np.asarray([10, 5, 2, 4, 3])  # rel = 2, 3, 5, 4, 0
        ranked_list_3 = np.asarray([1, 3, 6, 7, 8])  # rel = 0, 0, 0, 0, 0
        idcg = ((2**5 - 1) / np.log(2) + (2**4 - 1) / np.log(3) +
                (2**3 - 1) / np.log(4) + (2**2 - 1) / np.log(5))
        self.assertTrue(np.allclose(dcg(np.sort(pos_relevances)[::-1]), idcg))
        self.assertTrue(
            np.allclose(
                ndcg(ranked_list_1, pos_items, pos_relevances),
                ((2**5 - 1) / np.log(3) + (2**4 - 1) / np.log(5) +
                 (2**3 - 1) / np.log(6)) / idcg,
            ))
        self.assertTrue(
            np.allclose(
                ndcg(ranked_list_2, pos_items, pos_relevances),
                ((2**2 - 1) / np.log(2) + (2**3 - 1) / np.log(3) +
                 (2**5 - 1) / np.log(4) + (2**4 - 1) / np.log(5)) / idcg,
            ))
        self.assertTrue(
            np.allclose(ndcg(ranked_list_3, pos_items, pos_relevances), 0.0))
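For reference, a minimal sketch of dcg and ndcg consistent with the assertions above. This is an assumption-based reconstruction, not the code from Base.Evaluation.metrics; note the natural logarithm, whose constant factor versus log2 cancels in the NDCG ratio but matters for the raw dcg() assertion.

import numpy as np

def dcg(scores):
    # Exponential gain; position i (1-based) is discounted by ln(i + 1)
    return np.sum((2 ** scores - 1) / np.log(np.arange(2, len(scores) + 2)),
                  dtype=np.float64)

def ndcg(ranked_list, pos_items, relevance=None, at=None):
    if relevance is None:
        relevance = np.ones_like(pos_items)
    # Graded relevance of each recommended item (0 if not a test item)
    item_to_relevance = {item: rel for item, rel in zip(pos_items, relevance)}
    rank_scores = np.asarray(
        [item_to_relevance.get(item, 0.0) for item in ranked_list[:at]],
        dtype=np.float64)
    # Ideal DCG: relevances sorted in decreasing order, truncated at the cutoff
    ideal_dcg = dcg(np.sort(relevance)[::-1][:at])
    if ideal_dcg == 0.0:
        return 0.0
    return dcg(rank_scores) / ideal_dcg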
Example #2
    def evaluateRecommender(self, recommender_object):
        """
        :param recommender_object: the trained recommender object, a Recommender subclass
        """

        results_dict = {}

        for cutoff in self.cutoff_list:
            results_dict[cutoff] = create_empty_metrics_dict(
                self.n_items, self.n_users, recommender_object.URM_train,
                self.ignore_items_ID, self.ignore_users_ID, cutoff,
                self.diversity_object)

        start_time = time.time()
        start_time_print = time.time()

        n_eval = 0

        self.__all_items = np.arange(0, self.n_items, dtype=int)  # np.int is deprecated in NumPy >= 1.20
        self.__all_items = set(self.__all_items)

        if self.ignore_items_flag:
            recommender_object.set_items_to_ignore(self.ignore_items_ID)

        for test_user in self.usersToEvaluate:

            # Since the URM is CSR, the indices are the non-zero column indexes
            relevant_items = self.get_user_relevant_items(test_user)

            n_eval += 1

            self.user_specific_remove_items(recommender_object, test_user)

            # recommended_items = recommender_object.recommend(np.array(test_user), remove_seen_flag=self.exclude_seen,
            #                                                  cutoff = self.max_cutoff, remove_top_pop_flag=False, remove_CustomItems_flag=self.ignore_items_flag)
            recommended_items = recommender_object.recommend(
                np.atleast_1d(test_user),
                remove_seen_flag=self.exclude_seen,
                cutoff=self.max_cutoff,
                remove_top_pop_flag=False,
                remove_CustomItems_flag=self.ignore_items_flag)

            recommended_items = np.array(recommended_items[0])

            recommender_object.reset_items_to_ignore()

            is_relevant = np.in1d(recommended_items,
                                  relevant_items,
                                  assume_unique=True)

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                is_relevant_current_cutoff = is_relevant[0:cutoff]
                recommended_items_current_cutoff = recommended_items[0:cutoff]

                results_current_cutoff[
                    EvaluatorMetrics.ROC_AUC.value] += roc_auc(
                        is_relevant_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.PRECISION.value] += precision(
                        is_relevant_current_cutoff, len(relevant_items))
                results_current_cutoff[
                    EvaluatorMetrics.RECALL.value] += recall(
                        is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN.
                                       value] += recall_min_test_len(
                                           is_relevant_current_cutoff,
                                           relevant_items)
                results_current_cutoff[EvaluatorMetrics.MAP.value] += map(
                    is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.MRR.value] += rr(
                    is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(
                    recommended_items_current_cutoff,
                    relevant_items,
                    relevance=self.get_user_test_ratings(test_user),
                    at=cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.HIT_RATE.
                    value] += is_relevant_current_cutoff.sum()
                results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(
                    is_relevant_current_cutoff)

                results_current_cutoff[
                    EvaluatorMetrics.NOVELTY.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(
                        recommended_items_current_cutoff, test_user)
                results_current_cutoff[
                    EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.
                    value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)

                if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_SIMILARITY.
                        value].add_recommendations(
                            recommended_items_current_cutoff)

            if time.time() - start_time_print > 30 or n_eval == len(
                    self.usersToEvaluate):
                print(
                    "SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}"
                    .format(n_eval,
                            100.0 * float(n_eval) / len(self.usersToEvaluate),
                            time.time() - start_time,
                            float(n_eval) / (time.time() - start_time)))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print = time.time()

        if (n_eval > 0):

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                for key in results_current_cutoff.keys():

                    value = results_current_cutoff[key]

                    if isinstance(value, Metrics_Object):
                        results_current_cutoff[key] = value.get_metric_value()
                    else:
                        results_current_cutoff[key] = value / n_eval

                precision_ = results_current_cutoff[
                    EvaluatorMetrics.PRECISION.value]
                recall_ = results_current_cutoff[EvaluatorMetrics.RECALL.value]

                if precision_ + recall_ != 0:
                    results_current_cutoff[EvaluatorMetrics.F1.value] = 2 * (
                        precision_ * recall_) / (precision_ + recall_)

        else:
            print(
                "WARNING: No users had a sufficient number of relevant items")

        if self.ignore_items_flag:
            recommender_object.reset_items_to_ignore()

        results_run_string = self.get_result_string(results_dict)

        return (results_dict, results_run_string)
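The finalization loop above distinguishes plain scalar accumulators, which are averaged over n_eval, from Metrics_Object instances, which aggregate state across users and report a value once at the end. A minimal sketch of that assumed protocol follows, with a hypothetical item-coverage metric as an example (the real classes live elsewhere in the framework):

import numpy as np

class Metrics_Object(object):
    # Cumulative metrics collect recommendation lists user by user and
    # compute their final value once, instead of being averaged
    def add_recommendations(self, recommended_items_ids):
        raise NotImplementedError()

    def get_metric_value(self):
        raise NotImplementedError()

class Coverage_Item(Metrics_Object):
    # Hypothetical example: fraction of the catalogue recommended at least once
    def __init__(self, n_items, ignore_items):
        super(Coverage_Item, self).__init__()
        self.recommended_mask = np.zeros(n_items, dtype=bool)
        self.n_ignore_items = len(ignore_items)

    def add_recommendations(self, recommended_items_ids):
        self.recommended_mask[recommended_items_ids] = True

    def get_metric_value(self):
        return self.recommended_mask.sum() / (
            len(self.recommended_mask) - self.n_ignore_items)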
Example #3
    def _run_evaluation_on_selected_users(self,
                                          recommender_object,
                                          usersToEvaluate,
                                          block_size=1000):

        start_time = time.time()
        start_time_print = time.time()

        results_dict = {}

        for cutoff in self.cutoff_list:
            results_dict[cutoff] = create_empty_metrics_dict(
                self.n_items, self.n_users, recommender_object.get_URM_train(),
                self.ignore_items_ID, self.ignore_users_ID, cutoff,
                self.diversity_object)

        n_users_evaluated = 0

        # Iterate over the users to evaluate in blocks of block_size
        user_batch_start = 0
        user_batch_end = 0

        while user_batch_start < len(usersToEvaluate):

            user_batch_end = user_batch_start + block_size
            user_batch_end = min(user_batch_end, len(usersToEvaluate))

            test_user_batch_array = np.array(
                usersToEvaluate[user_batch_start:user_batch_end])
            user_batch_start = user_batch_end

            # Compute predictions for a batch of users using vectorization, much more efficient than computing them one user at a time
            recommended_items_batch_list = recommender_object.recommend(
                test_user_batch_array)

            # Compute recommendation quality for each user in batch
            for batch_user_index in range(len(recommended_items_batch_list)):

                user_id = test_user_batch_array[batch_user_index]
                recommended_items = recommended_items_batch_list[
                    batch_user_index]

                # Since the URM is CSR, the indices are the non-zero column indexes
                relevant_items = self.get_user_relevant_items(user_id)
                is_relevant = np.in1d(recommended_items,
                                      relevant_items,
                                      assume_unique=True)

                n_users_evaluated += 1

                for cutoff in self.cutoff_list:

                    results_current_cutoff = results_dict[cutoff]

                    is_relevant_current_cutoff = is_relevant[0:cutoff]
                    recommended_items_current_cutoff = recommended_items[
                        0:cutoff]

                    results_current_cutoff[
                        EvaluatorMetrics.ROC_AUC.value] += roc_auc(
                            is_relevant_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.PRECISION.value] += precision(
                            is_relevant_current_cutoff, len(relevant_items))
                    results_current_cutoff[
                        EvaluatorMetrics.RECALL.value] += recall(
                            is_relevant_current_cutoff, relevant_items)
                    results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN.
                                           value] += recall_min_test_len(
                                               is_relevant_current_cutoff,
                                               relevant_items)
                    results_current_cutoff[EvaluatorMetrics.MAP.value] += map(
                        is_relevant_current_cutoff, relevant_items)
                    results_current_cutoff[EvaluatorMetrics.MRR.value] += rr(
                        is_relevant_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.NDCG.value] += ndcg(
                            recommended_items_current_cutoff,
                            relevant_items,
                            relevance=self.get_user_test_ratings(user_id),
                            at=cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.HIT_RATE.
                        value] += is_relevant_current_cutoff.sum()
                    results_current_cutoff[
                        EvaluatorMetrics.ARHR.value] += arhr(
                            is_relevant_current_cutoff)

                    results_current_cutoff[
                        EvaluatorMetrics.NOVELTY.value].add_recommendations(
                            recommended_items_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_GINI.
                        value].add_recommendations(
                            recommended_items_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.SHANNON_ENTROPY.
                        value].add_recommendations(
                            recommended_items_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.COVERAGE_ITEM.
                        value].add_recommendations(
                            recommended_items_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.COVERAGE_USER.
                        value].add_recommendations(
                            recommended_items_current_cutoff, user_id)
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.
                        value].add_recommendations(
                            recommended_items_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_HERFINDAHL.
                        value].add_recommendations(
                            recommended_items_current_cutoff)

                    if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                        results_current_cutoff[
                            EvaluatorMetrics.DIVERSITY_SIMILARITY.
                            value].add_recommendations(
                                recommended_items_current_cutoff)

                if (time.time() - start_time_print > 30
                        or n_users_evaluated == len(usersToEvaluate)):
                    print(
                        "SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}"
                        .format(
                            n_users_evaluated,
                            100.0 * float(n_users_evaluated) /
                            len(usersToEvaluate),
                            time.time() - start_time,
                            float(n_users_evaluated) /
                            (time.time() - start_time)))

                    sys.stdout.flush()
                    sys.stderr.flush()

                    start_time_print = time.time()

        return results_dict, n_users_evaluated
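The windowing arithmetic of the batched loop above, restated in isolation: walk the user list in fixed-size blocks, clamping the last block to the list length. A self-contained sketch of the same pattern:

def iterate_user_blocks(users_to_evaluate, block_size=1000):
    # Yields consecutive slices of at most block_size users each
    batch_start = 0
    while batch_start < len(users_to_evaluate):
        batch_end = min(batch_start + block_size, len(users_to_evaluate))
        yield users_to_evaluate[batch_start:batch_end]
        batch_start = batch_end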
Example #4
    def _run_evaluation_on_selected_users(self, recommender_object,
                                          usersToEvaluate):

        start_time = time.time()
        start_time_print = time.time()

        results_dict = {}

        for cutoff in self.cutoff_list:
            results_dict[cutoff] = create_empty_metrics_dict(
                self.n_items, self.n_users, recommender_object.URM_train,
                self.ignore_items_ID, self.ignore_users_ID, cutoff,
                self.diversity_object)

        n_users_evaluated = 0

        for test_user in usersToEvaluate:

            # Since the URM is CSR, the indices are the non-zero column indexes
            relevant_items = self.get_user_relevant_items(test_user)

            n_users_evaluated += 1

            recommended_items = recommender_object.recommend(
                test_user,
                remove_seen_flag=self.exclude_seen,
                cutoff=self.max_cutoff,
                remove_top_pop_flag=False,
                remove_CustomItems_flag=self.ignore_items_flag)

            is_relevant = np.in1d(recommended_items,
                                  relevant_items,
                                  assume_unique=True)

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                is_relevant_current_cutoff = is_relevant[0:cutoff]
                recommended_items_current_cutoff = recommended_items[0:cutoff]

                results_current_cutoff[
                    EvaluatorMetrics.ROC_AUC.value] += roc_auc(
                        is_relevant_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.PRECISION.value] += precision(
                        is_relevant_current_cutoff, len(relevant_items))
                results_current_cutoff[
                    EvaluatorMetrics.RECALL.value] += recall(
                        is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN.
                                       value] += recall_min_test_len(
                                           is_relevant_current_cutoff,
                                           relevant_items)
                results_current_cutoff[EvaluatorMetrics.MAP.value] += map(
                    is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.MRR.value] += rr(
                    is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(
                    recommended_items_current_cutoff,
                    relevant_items,
                    relevance=self.get_user_test_ratings(test_user),
                    at=cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.HIT_RATE.
                    value] += is_relevant_current_cutoff.sum()
                results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(
                    is_relevant_current_cutoff)

                results_current_cutoff[
                    EvaluatorMetrics.NOVELTY.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(
                        recommended_items_current_cutoff, test_user)
                results_current_cutoff[
                    EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.
                    value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)

                if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_SIMILARITY.
                        value].add_recommendations(
                            recommended_items_current_cutoff)

            if (time.time() - start_time_print > 30
                    or n_users_evaluated == len(usersToEvaluate)):
                print(
                    "SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}"
                    .format(
                        n_users_evaluated, 100.0 * float(n_users_evaluated) /
                        len(usersToEvaluate),
                        time.time() - start_time,
                        float(n_users_evaluated) / (time.time() - start_time)))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print = time.time()

        return results_dict, n_users_evaluated
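Assumed definitions of the two recall variants accumulated above: recall divides the hits by the full test-set size, while recall_min_test_len caps the denominator at the list length, so a short recommendation list is not penalized for users with more relevant items than available positions. A sketch consistent with how they are called:

import numpy as np

def recall(is_relevant, pos_items):
    return is_relevant.sum(dtype=np.float64) / len(pos_items)

def recall_min_test_len(is_relevant, pos_items):
    return is_relevant.sum(dtype=np.float64) / min(len(pos_items),
                                                   len(is_relevant))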
Example #5
    def evaluateRecommender(self, recommender_object):
        """
        :param recommender_object: the trained recommender object, a BaseRecommender subclass
        """



        results_dict = {}

        for cutoff in self.cutoff_list:
            results_dict[cutoff] = create_empty_metrics_dict(self.n_items, self.n_users,
                                                             recommender_object.URM_train,
                                                             self.ignore_items_ID,
                                                             self.ignore_users_ID,
                                                             cutoff,
                                                             self.diversity_object)

        start_time = time.time()
        start_time_print = time.time()

        n_users_evaluated = 0

        if self.ignore_items_flag:
            recommender_object.set_items_to_ignore(self.ignore_items_ID)


        for test_user in self.usersToEvaluate:

            # Since the URM is CSR, the indices are the non-zero column indexes
            relevant_items = self.get_user_relevant_items(test_user)
            relevant_items_rating = self.get_user_test_ratings(test_user)

            n_users_evaluated += 1

            items_to_compute = self._get_user_specific_items_to_compute(test_user)

            recommended_items, all_items_predicted_ratings = recommender_object.recommend(np.atleast_1d(test_user),
                                                              remove_seen_flag=self.exclude_seen,
                                                              cutoff = self.max_cutoff,
                                                              remove_top_pop_flag=False,
                                                              items_to_compute = items_to_compute,
                                                              remove_CustomItems_flag=self.ignore_items_flag,
                                                              return_scores = True
                                                             )


            assert len(recommended_items) == 1, "{}: recommended_items contained recommendations for {} users, expected was {}".format(
                self.EVALUATOR_NAME, len(recommended_items), 1)

            assert all_items_predicted_ratings.shape[0] == 1, "{}: all_items_predicted_ratings contained scores for {} users, expected was {}".format(
                self.EVALUATOR_NAME, all_items_predicted_ratings.shape[0], 1)

            assert all_items_predicted_ratings.shape[1] == self.n_items, "{}: all_items_predicted_ratings contained scores for {} items, expected was {}".format(
                self.EVALUATOR_NAME, all_items_predicted_ratings.shape[1], self.n_items)



            recommended_items = np.array(recommended_items[0])
            user_rmse = rmse(all_items_predicted_ratings[0], relevant_items, relevant_items_rating)

            recommender_object.reset_items_to_ignore()

            is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)



            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                is_relevant_current_cutoff = is_relevant[0:cutoff]
                recommended_items_current_cutoff = recommended_items[0:cutoff]

                results_current_cutoff[EvaluatorMetrics.ROC_AUC.value]              += roc_auc(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.PRECISION.value]            += precision(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.PRECISION_RECALL_MIN_DEN.value]   += precision_recall_min_denominator(is_relevant_current_cutoff, len(relevant_items))
                results_current_cutoff[EvaluatorMetrics.RECALL.value]               += recall(is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.NDCG.value]                 += ndcg(recommended_items_current_cutoff, relevant_items, relevance=self.get_user_test_ratings(test_user), at=cutoff)
                results_current_cutoff[EvaluatorMetrics.HIT_RATE.value]             += is_relevant_current_cutoff.sum()
                results_current_cutoff[EvaluatorMetrics.ARHR.value]                 += arhr(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.RMSE.value]                 += user_rmse

                results_current_cutoff[EvaluatorMetrics.MRR.value].add_recommendations(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.MAP.value].add_recommendations(is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.NOVELTY.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.AVERAGE_POPULARITY.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(recommended_items_current_cutoff, test_user)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.value].add_recommendations(recommended_items_current_cutoff)

                if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                    results_current_cutoff[EvaluatorMetrics.DIVERSITY_SIMILARITY.value].add_recommendations(recommended_items_current_cutoff)



            if time.time() - start_time_print > 30 or n_users_evaluated==len(self.usersToEvaluate):
                elapsed_time = time.time()-start_time
                new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)

                print("{}: Processed {} ( {:.2f}% ) in {:.2f} {}. Users per second: {:.0f}".format(
                              self.EVALUATOR_NAME,
                              n_users_evaluated,
                              100.0* float(n_users_evaluated)/len(self.usersToEvaluate),
                              new_time_value, new_time_unit,
                              float(n_users_evaluated)/elapsed_time))


                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print = time.time()


        if (n_users_evaluated > 0):

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                for key in results_current_cutoff.keys():

                    value = results_current_cutoff[key]

                    if isinstance(value, Metrics_Object):
                        results_current_cutoff[key] = value.get_metric_value()
                    else:
                        results_current_cutoff[key] = value/n_users_evaluated

                precision_ = results_current_cutoff[EvaluatorMetrics.PRECISION.value]
                recall_ = results_current_cutoff[EvaluatorMetrics.RECALL.value]

                if precision_ + recall_ != 0:
                    # F1 micro averaged: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.104.8244&rep=rep1&type=pdf
                    results_current_cutoff[EvaluatorMetrics.F1.value] = 2 * (precision_ * recall_) / (precision_ + recall_)


        else:
            print("WARNING: No users had a sufficient number of relevant items")


        if self.ignore_items_flag:
            recommender_object.reset_items_to_ignore()



        results_run_string = get_result_string(results_dict)

        return (results_dict, results_run_string)
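A plausible sketch of the rmse helper called above, under the assumed signature (full predicted-score vector, test item ids, true ratings): the error is measured only on the user's test items, skipping items whose scores may have been set to -inf by the seen-item filter.

import numpy as np

def rmse(all_items_predicted_ratings, relevant_items, relevant_items_rating):
    predicted_ratings = all_items_predicted_ratings[relevant_items]
    # Items filtered out by the recommender may carry -inf scores
    finite_mask = np.isfinite(predicted_ratings)
    if not np.any(finite_mask):
        return np.nan
    squared_error = (np.asarray(relevant_items_rating)[finite_mask] -
                     predicted_ratings[finite_mask]) ** 2
    return np.sqrt(np.mean(squared_error))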
Example #6
    def _run_evaluation_on_selected_users(self, recommender_object, usersToEvaluate, block_size = None):


        if block_size is None:
            block_size = min(1000, int(1e8/self.n_items))



        start_time = time.time()
        start_time_print = time.time()


        results_dict = {}

        for cutoff in self.cutoff_list:
            results_dict[cutoff] = create_empty_metrics_dict(self.n_items, self.n_users,
                                                             recommender_object.get_URM_train(),
                                                             self.ignore_items_ID,
                                                             self.ignore_users_ID,
                                                             cutoff,
                                                             self.diversity_object)

        n_users_evaluated = 0

        # Iterate over the users to evaluate in blocks of block_size
        user_batch_start = 0
        user_batch_end = 0

        while user_batch_start < len(usersToEvaluate):

            user_batch_end = user_batch_start + block_size
            user_batch_end = min(user_batch_end, len(usersToEvaluate))

            test_user_batch_array = np.array(usersToEvaluate[user_batch_start:user_batch_end])
            user_batch_start = user_batch_end

            # Compute predictions for a batch of users using vectorization, much more efficient than computing them one user at a time
            recommended_items_batch_list, scores_batch = recommender_object.recommend(test_user_batch_array,
                                                                      remove_seen_flag=self.exclude_seen,
                                                                      cutoff = self.max_cutoff,
                                                                      remove_top_pop_flag=False,
                                                                      remove_CustomItems_flag=self.ignore_items_flag,
                                                                      return_scores = True
                                                                     )


            assert len(recommended_items_batch_list) == len(test_user_batch_array), "{}: recommended_items_batch_list contained recommendations for {} users, expected was {}".format(
                self.EVALUATOR_NAME, len(recommended_items_batch_list), len(test_user_batch_array))

            assert scores_batch.shape[0] == len(test_user_batch_array), "{}: scores_batch contained scores for {} users, expected was {}".format(
                self.EVALUATOR_NAME, scores_batch.shape[0], len(test_user_batch_array))

            assert scores_batch.shape[1] == self.n_items, "{}: scores_batch contained scores for {} items, expected was {}".format(
                self.EVALUATOR_NAME, scores_batch.shape[1], self.n_items)


            # Compute recommendation quality for each user in batch
            for batch_user_index in range(len(recommended_items_batch_list)):

                test_user = test_user_batch_array[batch_user_index]

                # Since the URM is CSR, the indices are the non-zero column indexes
                relevant_items = self.get_user_relevant_items(test_user)
                relevant_items_rating = self.get_user_test_ratings(test_user)

                all_items_predicted_ratings = scores_batch[batch_user_index]
                user_rmse = rmse(all_items_predicted_ratings, relevant_items, relevant_items_rating)

                recommended_items = recommended_items_batch_list[batch_user_index]
                is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

                n_users_evaluated += 1

                for cutoff in self.cutoff_list:

                    results_current_cutoff = results_dict[cutoff]

                    is_relevant_current_cutoff = is_relevant[0:cutoff]
                    recommended_items_current_cutoff = recommended_items[0:cutoff]

                    results_current_cutoff[EvaluatorMetrics.ROC_AUC.value]              += roc_auc(is_relevant_current_cutoff)
                    results_current_cutoff[EvaluatorMetrics.PRECISION.value]            += precision(is_relevant_current_cutoff)
                    results_current_cutoff[EvaluatorMetrics.PRECISION_RECALL_MIN_DEN.value]   += precision_recall_min_denominator(is_relevant_current_cutoff, len(relevant_items))
                    results_current_cutoff[EvaluatorMetrics.RECALL.value]               += recall(is_relevant_current_cutoff, relevant_items)
                    results_current_cutoff[EvaluatorMetrics.NDCG.value]                 += ndcg(recommended_items_current_cutoff, relevant_items, relevance=self.get_user_test_ratings(test_user), at=cutoff)
                    results_current_cutoff[EvaluatorMetrics.HIT_RATE.value]             += is_relevant_current_cutoff.sum()
                    results_current_cutoff[EvaluatorMetrics.ARHR.value]                 += arhr(is_relevant_current_cutoff)
                    results_current_cutoff[EvaluatorMetrics.RMSE.value]                 += user_rmse

                    results_current_cutoff[EvaluatorMetrics.MRR.value].add_recommendations(is_relevant_current_cutoff)
                    results_current_cutoff[EvaluatorMetrics.MAP.value].add_recommendations(is_relevant_current_cutoff, relevant_items)
                    results_current_cutoff[EvaluatorMetrics.NOVELTY.value].add_recommendations(recommended_items_current_cutoff)
                    results_current_cutoff[EvaluatorMetrics.AVERAGE_POPULARITY.value].add_recommendations(recommended_items_current_cutoff)
                    results_current_cutoff[EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(recommended_items_current_cutoff)
                    results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.value].add_recommendations(recommended_items_current_cutoff)
                    results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(recommended_items_current_cutoff)
                    results_current_cutoff[EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(recommended_items_current_cutoff, test_user)
                    results_current_cutoff[EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.value].add_recommendations(recommended_items_current_cutoff)
                    results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.value].add_recommendations(recommended_items_current_cutoff)

                    if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                        results_current_cutoff[EvaluatorMetrics.DIVERSITY_SIMILARITY.value].add_recommendations(recommended_items_current_cutoff)


                if time.time() - start_time_print > 30 or n_users_evaluated == len(usersToEvaluate):

                    elapsed_time = time.time()-start_time
                    new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)

                    print("{}: Processed {} ( {:.2f}% ) in {:.2f} {}. Users per second: {:.0f}".format(
                                  self.EVALUATOR_NAME,
                                  n_users_evaluated,
                                  100.0* float(n_users_evaluated)/len(usersToEvaluate),
                                  new_time_value, new_time_unit,
                                  float(n_users_evaluated)/elapsed_time))

                    sys.stdout.flush()
                    sys.stderr.flush()

                    start_time_print = time.time()



        return results_dict, n_users_evaluated
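seconds_to_biggest_unit is assumed to rescale an elapsed time to the largest unit that keeps the value readable, returning a (value, unit) pair for the progress message. A sketch under that assumption:

def seconds_to_biggest_unit(time_in_seconds):
    # Walk up the unit ladder while the value still exceeds the next factor
    value, unit = float(time_in_seconds), "sec"
    for next_unit, factor in [("min", 60), ("hour", 60), ("day", 24)]:
        if value >= factor:
            value /= factor
            unit = next_unit
        else:
            break
    return value, unit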
Example #7
    # topk_part = pred_val[np.arange(batch_users)[:, np.newaxis],
    #                    idx_topk_part[:, :k]]
    # idx_part = np.argsort(-topk_part, axis=1)
    # # X_pred[np.arange(batch_users)[:, np.newaxis], idx_topk] is the sorted
    # # topk predicted score
    # idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]

    recommended_items = np.argsort(-pred_val, axis=1).ravel()[:k]

    is_relevant = np.in1d(recommended_items, pos_items_array, assume_unique=True)

    # his_recall = Recall_at_k_batch(pred_val, pos_items_sparse, k=20)[0]
    # my_recall = recall(is_relevant, pos_items_array)

    his_ndcg = NDCG_binary_at_k_batch(pred_val, pos_items_sparse, k=100)[0]
    my_ndcg = ndcg(recommended_items, pos_items_array)

    if not np.allclose(my_ndcg, his_ndcg, atol=0.0001):
        pass  # debugging hook: set a breakpoint here to inspect mismatches

n100_list = np.concatenate(n100_list)
r20_list = np.concatenate(r20_list)
r50_list = np.concatenate(r50_list)

print("Test NDCG@100=%.5f (%.5f)" % (np.mean(n100_list), np.std(n100_list) / np.sqrt(len(n100_list))))
print("Test Recall@20=%.5f (%.5f)" % (np.mean(r20_list), np.std(r20_list) / np.sqrt(len(r20_list))))
print("Test Recall@50=%.5f (%.5f)" % (np.mean(r50_list), np.std(r50_list) / np.sqrt(len(r50_list))))

from Base.Evaluation.Evaluator import EvaluatorHoldout

evaluator = EvaluatorHoldout(test_data_te, cutoff_list=[20, 50, 100])
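The NDCG_binary_at_k_batch being cross-checked above follows the VAE-CF evaluation code (Liang et al.); below is a sketch consistent with the calls here, where pos_items_sparse is a user-by-item sparse matrix of held-out positives (the exact signature is an assumption):

import numpy as np

def NDCG_binary_at_k_batch(X_pred, heldout_batch, k=100):
    batch_users = X_pred.shape[0]
    # Partial sort: take the unordered top-k, then exactly sort only those k
    idx_topk_part = np.argpartition(-X_pred, k, axis=1)
    topk_part = X_pred[np.arange(batch_users)[:, np.newaxis],
                       idx_topk_part[:, :k]]
    idx_part = np.argsort(-topk_part, axis=1)
    idx_topk = idx_topk_part[np.arange(batch_users)[:, np.newaxis], idx_part]
    # Binary relevance with log2 position discount
    tp = 1.0 / np.log2(np.arange(2, k + 2))
    DCG = (heldout_batch[np.arange(batch_users)[:, np.newaxis],
                         idx_topk].toarray() * tp).sum(axis=1)
    IDCG = np.array([(tp[:min(n, k)]).sum()
                     for n in heldout_batch.getnnz(axis=1)])
    return DCG / IDCG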
Example #8
    def evaluateRecommender(self, recommender_object):
        """
        :param recommender_object: the trained recommender object, a BaseRecommender subclass
        """

        results_dict = {}

        for cutoff in self.cutoff_list:
            results_dict[cutoff] = create_empty_metrics_dict(
                self.n_items, self.n_users, recommender_object.URM_train,
                self.ignore_items_ID, self.ignore_users_ID, cutoff,
                self.diversity_object)

        start_time = time.time()
        start_time_print = time.time()

        n_users_evaluated = 0

        if self.ignore_items_flag:
            recommender_object.set_items_to_ignore(self.ignore_items_ID)

        for test_user in self.usersToEvaluate:

            # Since the URM is CSR, the indices are the non-zero column indexes
            relevant_items = self.get_user_relevant_items(test_user)
            relevant_items_rating = self.get_user_test_ratings(test_user)

            n_users_evaluated += 1

            items_to_compute = self._get_user_specific_items_to_compute(
                test_user)

            recommended_items, all_items_predicted_ratings = recommender_object.recommend(
                np.atleast_1d(test_user),
                remove_seen_flag=self.exclude_seen,
                cutoff=self.max_cutoff,
                remove_top_pop_flag=False,
                items_to_compute=items_to_compute,
                remove_CustomItems_flag=self.ignore_items_flag,
                return_scores=True)

            assert len(
                recommended_items
            ) == 1, "{}: recommended_items contained recommendations for {} users, expected was {}".format(
                self.EVALUATOR_NAME, len(recommended_items), 1)

            assert all_items_predicted_ratings.shape[
                0] == 1, "{}: all_items_predicted_ratings contained scores for {} users, expected was {}".format(
                    self.EVALUATOR_NAME, all_items_predicted_ratings.shape[0],
                    1)

            assert all_items_predicted_ratings.shape[
                1] == self.n_items, "{}: all_items_predicted_ratings contained scores for {} items, expected was {}".format(
                    self.EVALUATOR_NAME, all_items_predicted_ratings.shape[1],
                    self.n_items)

            recommended_items = np.array(recommended_items[0])
            user_rmse = rmse(all_items_predicted_ratings[0], relevant_items,
                             relevant_items_rating)

            recommender_object.reset_items_to_ignore()

            is_relevant = np.in1d(recommended_items,
                                  relevant_items,
                                  assume_unique=True)

            # (CNR) -------------------------------------------------
            weighted_hits = np.zeros(len(recommended_items))
            log_weighted_hits = np.zeros(len(recommended_items))

            pos_weighted_hits = np.zeros(len(recommended_items))
            pos_log_weighted_hits = np.zeros(len(recommended_items))
            '''
            alpha, beta, scale, pi = 100, 0.03, 1 / 15, np.pi
            percentile = get_percentile(a, 45)

            f = 1 / (beta * np.sqrt(2 * pi))

            y_a = np.tanh(alpha * a) + scale * f * np.exp(-1 / (2 * (beta ** 2)) * (a - percentile) ** 2)
            y_a = y_a / max(y_a)
            '''

            # e.g. recommended_items = [2, 7, 10, 70, 5464]
            # e.g. is_relevant       = [0, 0,  1,  0,    0]
            for i in range(len(recommended_items)):
                if is_relevant[i]:
                    weighted_hits[i] = 1 / (
                        1 + Settings.popularity[recommended_items[i]])
                    # weight is inversely proportional to the item's popularity
                    pos_weighted_hits[i] = 1 / (
                        1 + i + Settings.popularity[recommended_items[i]])

                    log_weighted_hits[i] = 1 / (1 + math.log(
                        1 + Settings.popularity[recommended_items[i]]))
                    pos_log_weighted_hits[i] = 1 / (1 + i + math.log(
                        1 + Settings.popularity[recommended_items[i]]))
                    # -------------------------------------------------------

            number_of_guessed_items = 0

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                # This array is fundamental.
                # For a given user, is_relevant_current_cutoff has the form [0, 1, 0, 0, 0].
                # Its length equals the cutoff.
                # In this example, the vector indicates that the item
                # recommended in second position is a hit, i.e. it appears in the test set.
                # Note, however, that the items considered are those of the test set plus the negatives
                # (which we know the user does not like).

                is_relevant_current_cutoff = is_relevant[0:cutoff]

                # (CNR) -------------------------------------------------------
                custom_hits = np.zeros(len(recommended_items))

                for i in range(len(recommended_items)):
                    if is_relevant[i]:
                        # print('Luciano > Computing custom weight. Parameters (pop, pos, cutoff):', Settings.popularity[recommended_items[i]], i, cutoff)
                        custom_hits[i] = y_custom(
                            Settings.popularity[recommended_items[i]], i,
                            cutoff)

                        if custom_hits[i] > 1:
                            print(
                                '=============================================================='
                            )
                            print(
                                'Luciano > WARNING! custom_hits[{}]={}'.format(
                                    i, custom_hits[i]))
                            print(
                                '=============================================================='
                            )

                weighted_hits_current_cutoff = weighted_hits[0:cutoff]
                log_weighted_hits_current_cutoff = log_weighted_hits[0:cutoff]
                pos_weighted_hits_current_cutoff = pos_weighted_hits[0:cutoff]
                pos_log_weighted_hits_current_cutoff = pos_log_weighted_hits[
                    0:cutoff]
                custom_hits_current_cutoff = custom_hits[0:cutoff]
                # -------------------------------------------------------------

                recommended_items_current_cutoff = recommended_items[0:cutoff]

                results_current_cutoff[
                    EvaluatorMetrics.ROC_AUC.value] += roc_auc(
                        is_relevant_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.PRECISION.value] += precision(
                        is_relevant_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.PRECISION_RECALL_MIN_DEN.
                    value] += precision_recall_min_denominator(
                        is_relevant_current_cutoff, len(relevant_items))
                results_current_cutoff[
                    EvaluatorMetrics.RECALL.value] += recall(
                        is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(
                    recommended_items_current_cutoff,
                    relevant_items,
                    relevance=self.get_user_test_ratings(test_user),
                    at=cutoff)
                number_of_guessed_items = is_relevant_current_cutoff.sum()
                results_current_cutoff[
                    EvaluatorMetrics.HIT_RATE.
                    value] += is_relevant_current_cutoff.sum()

                # (CNR) -------------------------------------------------
                verbose = False
                if verbose and is_relevant_current_cutoff.sum() > 1:
                    print(
                        '============================================================================='
                    )
                    print('user:', test_user)
                    print('is_relevant_current_cutoff:',
                          is_relevant_current_cutoff)
                    print('recommended_items_current_cutoff:',
                          recommended_items_current_cutoff)
                    print('Warning! is_relevant_current_cutoff.sum()>1:',
                          is_relevant_current_cutoff.sum())
                    print('relevant_items:', relevant_items)
                    print('relevant_items_rating:', relevant_items_rating)
                    print('items_to_compute:', items_to_compute)
                    print(
                        '============================================================================='
                    )

                results_current_cutoff[
                    EvaluatorMetrics.WEIGHTED_HIT_RATE.
                    value] += weighted_hits_current_cutoff.sum()
                results_current_cutoff[
                    EvaluatorMetrics.LOG_WEIGHTED_HIT_RATE.
                    value] += log_weighted_hits_current_cutoff.sum()

                results_current_cutoff[
                    EvaluatorMetrics.POS_WEIGHTED_HIT_RATE.
                    value] += pos_weighted_hits_current_cutoff.sum()
                results_current_cutoff[
                    EvaluatorMetrics.POS_LOG_WEIGHTED_HIT_RATE.
                    value] += pos_log_weighted_hits_current_cutoff.sum()

                results_current_cutoff[
                    EvaluatorMetrics.CUSTOM_HIT_RATE.
                    value] += custom_hits_current_cutoff.sum()
                # -------------------------------------------------------

                results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(
                    is_relevant_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.RMSE.value] += user_rmse

                results_current_cutoff[
                    EvaluatorMetrics.MRR.value].add_recommendations(
                        is_relevant_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.MAP.value].add_recommendations(
                        is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[
                    EvaluatorMetrics.NOVELTY.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.AVERAGE_POPULARITY.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(
                        recommended_items_current_cutoff, test_user)
                results_current_cutoff[
                    EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.
                    value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)

                if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_SIMILARITY.
                        value].add_recommendations(
                            recommended_items_current_cutoff)

            verbose = False
            if verbose and test_user % 1000 == 0:
                if number_of_guessed_items > 0:
                    print(
                        'Test ======================================================='
                    )
                    print('user:', test_user)
                    print('relevant items:', relevant_items)
                    print('relevant items rating:', relevant_items_rating)
                    print('items_to_compute:\n', items_to_compute)
                    print('len(items_to_compute):', len(items_to_compute))
                    print('recommended_items:', recommended_items)
                    print('is_relevant:', is_relevant)
                    print('number_of_guessed_items:', number_of_guessed_items)
                    print(
                        '============================================================'
                    )
                else:
                    print('.')

            if time.time() - start_time_print > 30 or n_users_evaluated == len(
                    self.usersToEvaluate):
                elapsed_time = time.time() - start_time
                new_time_value, new_time_unit = seconds_to_biggest_unit(
                    elapsed_time)

                print(
                    "{}: Processed {} ( {:.2f}% ) in {:.2f} {}. Users per second: {:.0f}"
                    .format(
                        self.EVALUATOR_NAME, n_users_evaluated, 100.0 *
                        float(n_users_evaluated) / len(self.usersToEvaluate),
                        new_time_value, new_time_unit,
                        float(n_users_evaluated) / elapsed_time))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print = time.time()

        if (n_users_evaluated > 0):

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                for key in results_current_cutoff.keys():

                    value = results_current_cutoff[key]

                    if isinstance(value, Metrics_Object):
                        results_current_cutoff[key] = value.get_metric_value()
                    else:
                        results_current_cutoff[key] = value / n_users_evaluated

                precision_ = results_current_cutoff[
                    EvaluatorMetrics.PRECISION.value]
                recall_ = results_current_cutoff[EvaluatorMetrics.RECALL.value]

                if precision_ + recall_ != 0:
                    # F1 micro averaged: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.104.8244&rep=rep1&type=pdf
                    results_current_cutoff[EvaluatorMetrics.F1.value] = 2 * (
                        precision_ * recall_) / (precision_ + recall_)

        else:
            print(
                "WARNING: No users had a sufficient number of relevant items")

        if self.ignore_items_flag:
            recommender_object.reset_items_to_ignore()

        results_run_string = get_result_string(results_dict)

        return (results_dict, results_run_string)
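y_custom is not defined in this example; the following is a purely hypothetical sketch consistent with how it is used above: a bounded weight that decays with item popularity and list position, so the custom_hits > 1 warning should never fire.

import math

def y_custom(popularity, position, cutoff):
    # Hypothetical weighting: hits on unpopular items in early positions
    # count more; the value stays in (0, 1] since both factors are >= 1
    return 1.0 / ((1.0 + position / cutoff) *
                  (1.0 + math.log(1.0 + popularity)))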
    def _compute_metrics_on_recommendation_list(self, test_user_batch_array,
                                                recommended_items_batch_list,
                                                scores_batch, results_dict):

        assert len(recommended_items_batch_list) == len(
            test_user_batch_array
        ), "{}: recommended_items_batch_list contained recommendations for {} users, expected was {}".format(
            self.EVALUATOR_NAME, len(recommended_items_batch_list),
            len(test_user_batch_array))

        assert scores_batch.shape[0] == len(
            test_user_batch_array
        ), "{}: scores_batch contained scores for {} users, expected was {}".format(
            self.EVALUATOR_NAME, scores_batch.shape[0],
            len(test_user_batch_array))

        assert scores_batch.shape[
            1] == self.n_items, "{}: scores_batch contained scores for {} items, expected was {}".format(
                self.EVALUATOR_NAME, scores_batch.shape[1], self.n_items)

        # Compute recommendation quality for each user in batch
        for batch_user_index in range(len(recommended_items_batch_list)):

            test_user = test_user_batch_array[batch_user_index]

            #print(f"Evaluating user: {test_user}")

            # Since the URM is CSR, the indices are the non-zero column indexes
            relevant_items = self.get_user_relevant_items(test_user)

            # Add the RMSE to the global object, no need to loop through the various cutoffs
            # This repository is not designed to ensure proper RMSE optimization
            # relevant_items_rating = self.get_user_test_ratings(test_user)
            #
            # all_items_predicted_ratings = scores_batch[batch_user_index]
            # global_RMSE_object = results_dict[self.cutoff_list[0]][EvaluatorMetrics.RMSE.value]
            # global_RMSE_object.add_recommendations(all_items_predicted_ratings, relevant_items, relevant_items_rating)

            recommended_items = recommended_items_batch_list[batch_user_index]

            is_relevant = np.in1d(recommended_items,
                                  relevant_items,
                                  assume_unique=True)

            #print(f"Is Relevant: {is_relevant}")

            self._n_users_evaluated += 1

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                is_relevant_current_cutoff = is_relevant[0:cutoff]
                recommended_items_current_cutoff = recommended_items[0:cutoff]

                results_current_cutoff[EvaluatorMetrics.ROC_AUC.value] += roc_auc(
                    is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.PRECISION.value] += precision(
                    is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.PRECISION_RECALL_MIN_DEN.value] += \
                    precision_recall_min_denominator(is_relevant_current_cutoff,
                                                     len(relevant_items))
                results_current_cutoff[EvaluatorMetrics.RECALL.value] += recall(
                    is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(
                    recommended_items_current_cutoff,
                    relevant_items,
                    relevance=self.get_user_test_ratings(test_user),
                    at=cutoff)
                results_current_cutoff[EvaluatorMetrics.HIT_RATE.value] += \
                    is_relevant_current_cutoff.sum()
                results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(
                    is_relevant_current_cutoff)
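                # The scalar metrics above are accumulated with += per user and are
                # presumably averaged over _n_users_evaluated once the loop completes;
                # the metrics below are stateful objects updated via add_recommendations.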

                results_current_cutoff[EvaluatorMetrics.MRR.value].add_recommendations(
                    is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.MAP.value].add_recommendations(
                    is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.NOVELTY.value].add_recommendations(
                    recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.AVERAGE_POPULARITY.value].add_recommendations(
                    recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(
                    recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.value].add_recommendations(
                    recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(
                    recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM_CORRECT.value].add_recommendations(
                    recommended_items_current_cutoff, is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(
                    recommended_items_current_cutoff, test_user)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_USER_CORRECT.value].add_recommendations(
                    is_relevant_current_cutoff, test_user)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.value].add_recommendations(
                    recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.value].add_recommendations(
                    recommended_items_current_cutoff)

                if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_SIMILARITY.value].add_recommendations(
                            recommended_items_current_cutoff)
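                # DIVERSITY_SIMILARITY presumably appears in the dictionary only when
                # the evaluator was built with a similarity-based diversity_object,
                # hence the membership check.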

        if time.time() - self._start_time_print > 30 or \
                self._n_users_evaluated == len(self.users_to_evaluate):
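            # Progress is reported at most once every 30 seconds, plus a final
            # report once the last user in users_to_evaluate has been processed.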

            elapsed_time = time.time() - self._start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(
                elapsed_time)

            self._print(
                "Processed {} ( {:.2f}% ) in {:.2f} {}. Users per second: {:.0f}"
                .format(
                    self._n_users_evaluated,
                    100.0 * float(self._n_users_evaluated) /
                    len(self.users_to_evaluate), new_time_value, new_time_unit,
                    float(self._n_users_evaluated) / elapsed_time))

            sys.stdout.flush()
            sys.stderr.flush()

            self._start_time_print = time.time()

        return results_dict
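
The per-cutoff pattern above (compute a full-length boolean relevance mask once,
then slice it to each cutoff before updating the metrics) can be reproduced in a
few lines. A minimal standalone sketch, assuming precision/recall definitions
consistent with the metric functions the evaluator imports (the helper names here
are illustrative, not the repository's API):

import numpy as np

def precision_at_k(is_relevant_at_k):
    # Fraction of the top-k recommendations that are relevant.
    return is_relevant_at_k.sum() / len(is_relevant_at_k)

def recall_at_k(is_relevant_at_k, n_relevant):
    # Fraction of the user's relevant items retrieved within the top k.
    return is_relevant_at_k.sum() / n_relevant

recommended_items = np.array([10, 5, 2, 4, 3])
relevant_items = np.array([2, 4, 5, 10])
is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

for cutoff in [2, 5]:
    mask = is_relevant[0:cutoff]
    print(cutoff, precision_at_k(mask), recall_at_k(mask, len(relevant_items)))
# cutoff 2 -> precision 1.0, recall 0.5
# cutoff 5 -> precision 0.8, recall 1.0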
Example #10
    def _parallel_compute_metrics_on_recommendation_list(
            self, test_user_batch_array, recommended_items_batch_list,
            scores_batch, results_dict):
        """
        TO RUN THIS IN PARALLEL, results_dict MUST be an empty copy of a results dictionary created by
        calling _create_empty_metrics_dict. If not, then race conditions may occur.
        """

        if len(recommended_items_batch_list) != len(test_user_batch_array):
            raise ValueError(
                f"{self.EVALUATOR_NAME}: recommended_items_batch_list contained recommendations for {len(recommended_items_batch_list)} users, expected {len(test_user_batch_array)}"
            )

        if scores_batch.shape[0] != len(test_user_batch_array):
            raise ValueError(
                f"{self.EVALUATOR_NAME}: scores_batch contained scores for {scores_batch.shape[0]} users, expected {len(test_user_batch_array)}"
            )

        if scores_batch.shape[1] != self.n_items:
            raise ValueError(
                f"{self.EVALUATOR_NAME}: scores_batch contained scores for {scores_batch.shape[1]} items, expected {self.n_items}"
            )
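        # Unlike the sequential variant, which uses assert statements, these
        # ValueError checks also run when Python is started with -O.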

        for batch_user_index in range(len(recommended_items_batch_list)):

            test_user = test_user_batch_array[batch_user_index]

            relevant_items = self.get_user_relevant_items(test_user)

            # Recommendations already computed for this user within the batch
            recommended_items = recommended_items_batch_list[batch_user_index]
            is_relevant = np.in1d(recommended_items,
                                  relevant_items,
                                  assume_unique=True)

            self._n_users_evaluated += 1

            # self.cutoff_list holds the list lengths (cut-off points) at which
            # ranking metrics such as MAP@K are computed.
            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                is_relevant_current_cutoff = is_relevant[0:cutoff]
                recommended_items_current_cutoff = recommended_items[0:cutoff]
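                # Slicing the head of the list assumes recommended_items is sorted
                # by descending score and is at least max(self.cutoff_list) long.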

                results_current_cutoff[EvaluatorMetrics.ROC_AUC.value] += roc_auc(
                    is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.PRECISION.value] += precision(
                    is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.PRECISION_RECALL_MIN_DEN.value] += \
                    precision_recall_min_denominator(is_relevant_current_cutoff,
                                                     len(relevant_items))
                results_current_cutoff[EvaluatorMetrics.RECALL.value] += recall(
                    is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(
                    recommended_items_current_cutoff,
                    relevant_items,
                    relevance=self.get_user_test_ratings(test_user),
                    at=cutoff)
                results_current_cutoff[EvaluatorMetrics.HIT_RATE.value] += \
                    is_relevant_current_cutoff.sum()
                results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(
                    is_relevant_current_cutoff)

                results_current_cutoff[EvaluatorMetrics.MRR.value].add_recommendations(
                    is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.MAP.value].add_recommendations(
                    is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.NOVELTY.value].add_recommendations(
                    recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.AVERAGE_POPULARITY.value].add_recommendations(
                    recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(
                    recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.value].add_recommendations(
                    recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(
                    recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(
                    recommended_items_current_cutoff, test_user)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.value].add_recommendations(
                    recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.value].add_recommendations(
                    recommended_items_current_cutoff)

                if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_SIMILARITY.value].add_recommendations(
                            recommended_items_current_cutoff)

        return results_dict
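
Because each parallel call mutates its own results_dict, the per-worker partial
results have to be merged afterwards. A hedged sketch of merging only the
float-valued entries (a hypothetical helper, not the repository's API; stateful
metric objects such as MRR or MAP would need their own merge logic):

from collections import defaultdict

def merge_scalar_results(partial_results, cutoff_list):
    # Sum the float-valued metric accumulators across per-worker result dicts;
    # non-float entries (stateful metric objects) are deliberately skipped.
    merged = {cutoff: defaultdict(float) for cutoff in cutoff_list}
    for partial in partial_results:
        for cutoff in cutoff_list:
            for metric_name, value in partial[cutoff].items():
                if isinstance(value, float):
                    merged[cutoff][metric_name] += value
    return merged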