def evaluate(self, method='hits', topk=None, not_rated_penalty=None, on_feedback_level=None):
    feedback = self.data.fields.feedback
    if int(topk or 0) > self.topk:
        self.topk = topk  # will also flush old recommendations

    # support rolling back scenario for @k calculations
    recommendations = self.recommendations[:, :topk]  # will recalculate if empty
    eval_data = self.data.test.holdout

    if self.switch_positive is None:
        # all recommendations are considered positive predictions;
        # this is a proper setting for binary data problems (implicit feedback)
        # in this case, all unrated items recommended by an algorithm are
        # assumed to be "honest" false positives, therefore the penalty equals 1
        not_rated_penalty = 1 if not_rated_penalty is None else not_rated_penalty
        is_positive = None
    else:
        # if data is not binary (explicit feedback), the intuition is different:
        # it becomes unclear whether unrated items are "honest" false positives,
        # as these items can include both top-rated and down-rated ones;
        # the default setting in this case is to ignore such items altogether
        # by setting the penalty to 0; however, it is adjustable
        not_rated_penalty = not_rated_penalty or 0
        is_positive = (eval_data[feedback] >= self.switch_positive).values

    scoring_data = assemble_scoring_matrices(recommendations, eval_data,
                                             self._key, self._target,
                                             is_positive, feedback=feedback)

    if method == 'relevance':  # no need for feedback
        if self.data.holdout_size == 1:
            scores = get_hr_score(scoring_data[1])
        else:
            scores = get_relevance_scores(*scoring_data,
                                          not_rated_penalty=not_rated_penalty)
    elif method == 'ranking':
        if self.data.holdout_size == 1:
            scores = get_mrr_score(scoring_data[1])
        else:
            ndcg_alternative = get_default('ndcg_alternative')
            topk = recommendations.shape[1]  # handle topk=None case
            # topk has to be passed explicitly, otherwise it's unclear how to
            # estimate ideal ranking for NDCG and NDCL metrics in get_ndcr_discounts
            scores = get_ranking_scores(*scoring_data,
                                        switch_positive=self.switch_positive,
                                        topk=topk, alternative=ndcg_alternative)
    elif method == 'hits':  # no need for feedback
        scores = get_hits(*scoring_data, not_rated_penalty=not_rated_penalty)
    else:
        raise NotImplementedError
    return scores
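# Usage sketch for the method above (hypothetical; assumes `model` is an
# already fitted instance of the class defining this method, with its test
# data prepared):
#
#     hit_scores = model.evaluate(method='hits', topk=10)
#     ranking_scores = model.evaluate(method='ranking')  # scored @ model.topk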
def evaluate(self, metric_type='all', topk=None, not_rated_penalty=None,
             switch_positive=None, ignore_feedback=False, simple_rates=False,
             on_feedback_level=None):
    if metric_type == 'all':
        metric_type = ['hits', 'relevance', 'ranking', 'experience']
    if metric_type == 'main':
        metric_type = ['relevance', 'ranking']
    if not isinstance(metric_type, (list, tuple)):
        metric_type = [metric_type]

    # support rolling back scenario for @k calculations
    if int(topk or 0) > self.topk:
        self.topk = topk  # will also flush old recommendations

    # ORDER OF CALLS MATTERS!!!
    # make sure to call holdout before getting recommendations;
    # this will ensure that the model is renewed if data has changed
    holdout = self.data.test.holdout  # <-- call before getting recs
    recommendations = self.recommendations[:, :topk]  # will recalculate if empty

    switch_positive = switch_positive or self.switch_positive
    feedback = self.data.fields.feedback
    if (switch_positive is None) or (feedback is None):
        # all recommendations are considered positive predictions;
        # this is a proper setting for binary data problems (implicit feedback)
        # in this case, all unrated items recommended by an algorithm are
        # assumed to be "honest" false positives, therefore the penalty equals 1
        not_rated_penalty = 1 if not_rated_penalty is None else not_rated_penalty
        is_positive = None
    else:
        # if data is not binary (explicit feedback), the intuition is different:
        # it becomes unclear whether unrated items are "honest" false positives,
        # as these items can include both top-rated and down-rated ones;
        # the default setting in this case is to ignore such items altogether
        # by setting the penalty to 0; however, it is adjustable
        not_rated_penalty = not_rated_penalty or 0
        is_positive = (holdout[feedback] >= switch_positive).values

    feedback = None if ignore_feedback else feedback
    scoring_data = assemble_scoring_matrices(recommendations, holdout,
                                             self._prediction_key, self._prediction_target,
                                             is_positive, feedback=feedback)

    scores = []
    if 'relevance' in metric_type:  # no need for feedback
        if (self.data.holdout_size == 1) or simple_rates:
            scores.append(get_hr_score(scoring_data[1]))
        else:
            scores.append(get_relevance_scores(*scoring_data,
                                               not_rated_penalty=not_rated_penalty))

    if 'ranking' in metric_type:
        if (self.data.holdout_size == 1) or simple_rates:
            scores.append(get_mrr_score(scoring_data[1]))
        else:
            ndcg_alternative = get_default('ndcg_alternative')
            topk = recommendations.shape[1]  # handle topk=None case
            # topk has to be passed explicitly, otherwise it's unclear how to
            # estimate ideal ranking for NDCG and NDCL metrics in get_ndcr_discounts
            scores.append(get_ranking_scores(*scoring_data,
                                             switch_positive=switch_positive,
                                             topk=topk, alternative=ndcg_alternative))

    if 'experience' in metric_type:  # no need for feedback
        fields = self.data.fields
        # support custom scenarios, e.g. coldstart
        entity_type = fields._fields[fields.index(self._prediction_target)]
        entity_index = getattr(self.data.index, entity_type)
        try:
            n_entities = entity_index.shape[0]
        except AttributeError:
            # fall back for split indices that keep a separate training part
            n_entities = entity_index.training.shape[0]
        scores.append(get_experience_scores(recommendations, n_entities))

    if 'hits' in metric_type:  # no need for feedback
        scores.append(get_hits(*scoring_data, not_rated_penalty=not_rated_penalty))

    if not scores:
        raise NotImplementedError

    if len(scores) == 1:
        scores = scores[0]
    return scores
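# Usage sketch for the extended method above (hypothetical; assumes `model` is
# a fitted instance exposing this signature):
#
#     all_scores = model.evaluate()                     # hits, relevance, ranking, experience
#     main_scores = model.evaluate(metric_type='main')  # relevance + ranking only
#     hr_at_10 = model.evaluate(metric_type='relevance', topk=10, simple_rates=True)
#
# Note: with a single metric_type the bare score object is returned; with
# several, a list in the order the checks appear above.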