# NOTE: this snippet sits inside the loop over the users in the test split;
# `user_profile`, `relevant_items`, `neval` and the metric accumulators are
# defined by the surrounding code.
if len(relevant_items) > 0:
    neval += 1

    #
    # TODO: Here you can write to file the recommendations for each user in the test split.
    # WARNING: there is a catch with the item idx!
    #

    # this will rank *all* items
    recommended_items = recommender.recommend(user_profile, exclude_seen=True)

    # use this to have the *top-k* recommended items
    # (warning: this can underestimate ROC-AUC for small k)
    # recommended_items = recommender.recommend(user_profile, k=at, exclude_seen=True)

    roc_auc_ += roc_auc(recommended_items, relevant_items)
    precision_ += precision(recommended_items, relevant_items, at=at)
    recall_ += recall(recommended_items, relevant_items, at=at)
    map_ += map(recommended_items, relevant_items, at=at)
    mrr_ += rr(recommended_items, relevant_items, at=at)
    ndcg_ += ndcg(recommended_items, relevant_items, relevance=test[test_user].data, at=at)

# Once the loop over all test users is done, average the accumulated metrics
roc_auc_ /= neval
precision_ /= neval
recall_ /= neval
map_ /= neval
mrr_ /= neval
ndcg_ /= neval

logger.info('Ranking quality')
logger.info('ROC-AUC: {:.4f}'.format(roc_auc_))
logger.info('Precision@{}: {:.4f}'.format(at, precision_))
logger.info('Recall@{}: {:.4f}'.format(at, recall_))
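# Hedged sketch (not in the original script): one possible way to address the TODO
# above. `idx_to_item_id` is a hypothetical mapping from the internal item index used
# by the recommender back to the original item id of the dataset -- that remapping is
# the "catch with the item idx" the warning refers to. It could be called inside the
# loop, e.g. write_recommendations(fout, test_user, recommended_items, idx_to_item_id, at).
def write_recommendations(fout, test_user, recommended_items, idx_to_item_id, k):
    # one "user <TAB> item" line per recommended item, truncated to the top-k
    for item_idx in recommended_items[:k]:
        fout.write('{}\t{}\n'.format(test_user, idx_to_item_id[item_idx]))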
def _run_evaluation_on_selected_users(self, recommender_object, usersToEvaluate, block_size=1000):

    start_time = time.time()
    start_time_print = time.time()

    results_dict = {}

    for cutoff in self.cutoff_list:
        results_dict[cutoff] = create_empty_metrics_dict(self.n_items, self.n_users,
                                                         recommender_object.get_URM_train(),
                                                         self.ignore_items_ID, self.ignore_users_ID,
                                                         cutoff, self.diversity_object)

    n_users_evaluated = 0

    # Both pointers start at 0; the first block covers users [0, block_size)
    user_batch_start = 0
    user_batch_end = 0

    while user_batch_start < len(self.usersToEvaluate):

        user_batch_end = user_batch_start + block_size
        user_batch_end = min(user_batch_end, len(usersToEvaluate))

        test_user_batch_array = np.array(usersToEvaluate[user_batch_start:user_batch_end])
        user_batch_start = user_batch_end

        # Compute predictions for a batch of users using vectorization,
        # much more efficient than computing them one at a time
        recommended_items_batch_list = recommender_object.recommend(test_user_batch_array,
                                                                    remove_seen_flag=self.exclude_seen,
                                                                    cutoff=self.max_cutoff,
                                                                    remove_top_pop_flag=False,
                                                                    remove_CustomItems_flag=self.ignore_items_flag)

        # Compute recommendation quality for each user in the batch
        for batch_user_index in range(len(recommended_items_batch_list)):

            user_id = test_user_batch_array[batch_user_index]
            recommended_items = recommended_items_batch_list[batch_user_index]

            # Being the URM CSR, the indices are the non-zero column indexes
            relevant_items = self.get_user_relevant_items(user_id)
            is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

            n_users_evaluated += 1

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                is_relevant_current_cutoff = is_relevant[0:cutoff]
                recommended_items_current_cutoff = recommended_items[0:cutoff]

                results_current_cutoff[EvaluatorMetrics.ROC_AUC.value] += roc_auc(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.PRECISION.value] += precision(is_relevant_current_cutoff, len(relevant_items))
                results_current_cutoff[EvaluatorMetrics.RECALL.value] += recall(is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN.value] += recall_min_test_len(is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.MAP.value] += map(is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.MRR.value] += rr(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(recommended_items_current_cutoff, relevant_items,
                                                                            relevance=self.get_user_test_ratings(user_id), at=cutoff)
                results_current_cutoff[EvaluatorMetrics.HIT_RATE.value] += is_relevant_current_cutoff.sum()
                results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(is_relevant_current_cutoff)

                results_current_cutoff[EvaluatorMetrics.NOVELTY.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(recommended_items_current_cutoff, user_id)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.value].add_recommendations(recommended_items_current_cutoff)

                if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                    results_current_cutoff[EvaluatorMetrics.DIVERSITY_SIMILARITY.value].add_recommendations(recommended_items_current_cutoff)

            if time.time() - start_time_print > 30 or n_users_evaluated == len(self.usersToEvaluate):
                print("SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}".format(
                    n_users_evaluated,
                    100.0 * float(n_users_evaluated) / len(self.usersToEvaluate),
                    time.time() - start_time,
                    float(n_users_evaluated) / (time.time() - start_time)))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print = time.time()

    return results_dict, n_users_evaluated
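# Hedged usage sketch (assumption, not part of the original module): the batched
# variant above is normally driven by the evaluator itself. `block_size` trades
# memory for speed, since recommender_object.recommend() scores a whole block of
# users per call instead of one user at a time. The names `evaluator` and
# `recommender_object` are illustrative.
def example_batched_run(evaluator, recommender_object, block_size=1000):
    results_dict, n_users_evaluated = evaluator._run_evaluation_on_selected_users(
        recommender_object, evaluator.usersToEvaluate, block_size=block_size)
    return results_dict, n_users_evaluated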
def evaluateRecommender(self, recommender_object):
    """
    Evaluate the recommender on the test data configured for this evaluator.

    :param recommender_object: the trained recommender object, a Recommender subclass
    :return: (results_dict, results_run_string), one metrics dictionary per cutoff in
             self.cutoff_list (the test URM and the cutoffs are set at construction time)
    """

    results_dict = {}

    for cutoff in self.cutoff_list:
        results_dict[cutoff] = create_empty_metrics_dict(self.n_items, self.n_users,
                                                         recommender_object.URM_train,
                                                         self.ignore_items_ID, self.ignore_users_ID,
                                                         cutoff, self.diversity_object)

    start_time = time.time()
    start_time_print = time.time()

    n_eval = 0

    self.__all_items = np.arange(0, self.n_items, dtype=int)  # np.int (removed in recent NumPy) was an alias of the builtin int
    self.__all_items = set(self.__all_items)

    if self.ignore_items_flag:
        recommender_object.set_items_to_ignore(self.ignore_items_ID)

    for test_user in self.usersToEvaluate:

        # Being the URM CSR, the indices are the non-zero column indexes
        relevant_items = self.get_user_relevant_items(test_user)
        n_eval += 1

        self.user_specific_remove_items(recommender_object, test_user)

        # recommended_items = recommender_object.recommend(np.array(test_user), remove_seen_flag=self.exclude_seen,
        #                                                  cutoff=self.max_cutoff, remove_top_pop_flag=False,
        #                                                  remove_CustomItems_flag=self.ignore_items_flag)
        recommended_items = recommender_object.recommend(np.atleast_1d(test_user),
                                                         remove_seen_flag=self.exclude_seen,
                                                         cutoff=self.max_cutoff,
                                                         remove_top_pop_flag=False,
                                                         remove_CustomItems_flag=self.ignore_items_flag)

        recommended_items = np.array(recommended_items[0])

        recommender_object.reset_items_to_ignore()

        is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

        for cutoff in self.cutoff_list:

            results_current_cutoff = results_dict[cutoff]

            is_relevant_current_cutoff = is_relevant[0:cutoff]
            recommended_items_current_cutoff = recommended_items[0:cutoff]

            results_current_cutoff[EvaluatorMetrics.ROC_AUC.value] += roc_auc(is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.PRECISION.value] += precision(is_relevant_current_cutoff, len(relevant_items))
            results_current_cutoff[EvaluatorMetrics.RECALL.value] += recall(is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN.value] += recall_min_test_len(is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.MAP.value] += map(is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.MRR.value] += rr(is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(recommended_items_current_cutoff, relevant_items,
                                                                        relevance=self.get_user_test_ratings(test_user), at=cutoff)
            results_current_cutoff[EvaluatorMetrics.HIT_RATE.value] += is_relevant_current_cutoff.sum()
            results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(is_relevant_current_cutoff)

            results_current_cutoff[EvaluatorMetrics.NOVELTY.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(recommended_items_current_cutoff, test_user)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.value].add_recommendations(recommended_items_current_cutoff)

            if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_SIMILARITY.value].add_recommendations(recommended_items_current_cutoff)

        if time.time() - start_time_print > 30 or n_eval == len(self.usersToEvaluate):
            print("SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}".format(
                n_eval,
                100.0 * float(n_eval) / len(self.usersToEvaluate),
                time.time() - start_time,
                float(n_eval) / (time.time() - start_time)))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_print = time.time()

    if n_eval > 0:

        for cutoff in self.cutoff_list:

            results_current_cutoff = results_dict[cutoff]

            for key in results_current_cutoff.keys():
                value = results_current_cutoff[key]

                if isinstance(value, Metrics_Object):
                    # Cumulative metric objects (novelty, diversity, coverage) compute their own value
                    results_current_cutoff[key] = value.get_metric_value()
                else:
                    # Scalar metrics are averaged over the evaluated users
                    results_current_cutoff[key] = value / n_eval

            precision_ = results_current_cutoff[EvaluatorMetrics.PRECISION.value]
            recall_ = results_current_cutoff[EvaluatorMetrics.RECALL.value]

            if precision_ + recall_ != 0:
                results_current_cutoff[EvaluatorMetrics.F1.value] = 2 * (precision_ * recall_) / (precision_ + recall_)

    else:
        print("WARNING: No users had a sufficient number of relevant items")

    if self.ignore_items_flag:
        recommender_object.reset_items_to_ignore()

    results_run_string = self.get_result_string(results_dict)

    return (results_dict, results_run_string)
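# Hedged usage sketch (assumption, not part of the original module): how
# evaluateRecommender is typically driven. Only the method call and its
# (results_dict, results_run_string) return value come from the code above; the
# `evaluator` and `recommender_object` instances are assumed to have been built
# elsewhere (an evaluator configured with the test URM and a cutoff list, and a
# trained Recommender subclass).
def example_evaluation(evaluator, recommender_object):
    results_dict, results_run_string = evaluator.evaluateRecommender(recommender_object)
    print(results_run_string)
    # results_dict maps each cutoff to its metrics dictionary, e.g.
    # results_dict[10][EvaluatorMetrics.PRECISION.value]
    return results_dict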
def _run_evaluation_on_selected_users(self, recommender_object, usersToEvaluate):

    start_time = time.time()
    start_time_print = time.time()

    results_dict = {}

    for cutoff in self.cutoff_list:
        results_dict[cutoff] = create_empty_metrics_dict(self.n_items, self.n_users,
                                                         recommender_object.URM_train,
                                                         self.ignore_items_ID, self.ignore_users_ID,
                                                         cutoff, self.diversity_object)

    n_users_evaluated = 0

    for test_user in usersToEvaluate:

        # Being the URM CSR, the indices are the non-zero column indexes
        relevant_items = self.get_user_relevant_items(test_user)
        n_users_evaluated += 1

        recommended_items = recommender_object.recommend(test_user,
                                                         remove_seen_flag=self.exclude_seen,
                                                         cutoff=self.max_cutoff,
                                                         remove_top_pop_flag=False,
                                                         remove_CustomItems_flag=self.ignore_items_flag)

        is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

        for cutoff in self.cutoff_list:

            results_current_cutoff = results_dict[cutoff]

            is_relevant_current_cutoff = is_relevant[0:cutoff]
            recommended_items_current_cutoff = recommended_items[0:cutoff]

            results_current_cutoff[EvaluatorMetrics.ROC_AUC.value] += roc_auc(is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.PRECISION.value] += precision(is_relevant_current_cutoff, len(relevant_items))
            results_current_cutoff[EvaluatorMetrics.RECALL.value] += recall(is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN.value] += recall_min_test_len(is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.MAP.value] += map(is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.MRR.value] += rr(is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(recommended_items_current_cutoff, relevant_items,
                                                                        relevance=self.get_user_test_ratings(test_user), at=cutoff)
            results_current_cutoff[EvaluatorMetrics.HIT_RATE.value] += is_relevant_current_cutoff.sum()
            results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(is_relevant_current_cutoff)

            results_current_cutoff[EvaluatorMetrics.NOVELTY.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(recommended_items_current_cutoff, test_user)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.value].add_recommendations(recommended_items_current_cutoff)

            if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_SIMILARITY.value].add_recommendations(recommended_items_current_cutoff)

        if time.time() - start_time_print > 30 or n_users_evaluated == len(self.usersToEvaluate):
            print("SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}".format(
                n_users_evaluated,
                100.0 * float(n_users_evaluated) / len(self.usersToEvaluate),
                time.time() - start_time,
                float(n_users_evaluated) / (time.time() - start_time)))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_print = time.time()

    return results_dict, n_users_evaluated
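# Hedged sketch (assumption): _run_evaluation_on_selected_users returns raw per-user
# sums, so the caller is expected to finalise them the same way evaluateRecommender
# does above -- Metrics_Object instances (novelty, diversity, coverage) report their
# own value, while scalar metrics are averaged over the evaluated users. The helper
# name `finalize_results` is illustrative and not part of the original module.
def finalize_results(results_dict, n_users_evaluated):
    for cutoff, results_current_cutoff in results_dict.items():
        for key, value in results_current_cutoff.items():
            if isinstance(value, Metrics_Object):
                results_current_cutoff[key] = value.get_metric_value()
            else:
                results_current_cutoff[key] = value / n_users_evaluated
    return results_dict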