def _run_evaluation_on_selected_users(self, recommender_object, usersToEvaluate, block_size=None):
    """
    Evaluate the recommender on the given list of users, scoring them in batches.

    Fix vs. original: the original mixed the ``usersToEvaluate`` argument with
    ``self.usersToEvaluate`` (while-loop condition and progress report). If the
    two lists differed in length the loop could never terminate, because
    ``user_batch_start`` stops advancing at ``len(usersToEvaluate)``. The
    argument is now used consistently throughout.

    :param recommender_object: trained recommender exposing recommend() and get_URM_train()
    :param usersToEvaluate: list/sequence of user ids to evaluate
    :param block_size: users scored per recommend() call; defaults to
        min(1000, 1e8 / n_items) to bound the dense score-matrix size
    :return: tuple (results_dict, n_users_evaluated) where results_dict maps
        each cutoff to its (still un-normalized) metrics dict
    """
    if block_size is None:
        # Keep the batch score matrix at roughly <= 1e8 elements
        block_size = min(1000, int(1e8 / self.n_items))

    start_time = time.time()
    start_time_print = time.time()

    results_dict = {}
    for cutoff in self.cutoff_list:
        results_dict[cutoff] = create_empty_metrics_dict(
            self.n_items, self.n_users, recommender_object.get_URM_train(),
            self.ignore_items_ID, self.ignore_users_ID, cutoff,
            self.diversity_object)

    n_users_evaluated = 0

    user_batch_start = 0
    user_batch_end = 0

    while user_batch_start < len(usersToEvaluate):

        user_batch_end = min(user_batch_start + block_size, len(usersToEvaluate))

        test_user_batch_array = np.array(usersToEvaluate[user_batch_start:user_batch_end])
        user_batch_start = user_batch_end

        # Compute predictions for a batch of users using vectorization,
        # much more efficient than computing it one at a time
        recommended_items_batch_list, scores_batch = recommender_object.recommend(
            test_user_batch_array,
            remove_seen_flag=self.exclude_seen,
            cutoff=self.max_cutoff,
            remove_top_pop_flag=False,
            remove_CustomItems_flag=self.ignore_items_flag,
            return_scores=True)

        assert len(recommended_items_batch_list) == len(test_user_batch_array), \
            "{}: recommended_items_batch_list contained recommendations for {} users, expected was {}".format(
                self.EVALUATOR_NAME, len(recommended_items_batch_list), len(test_user_batch_array))

        assert scores_batch.shape[0] == len(test_user_batch_array), \
            "{}: scores_batch contained scores for {} users, expected was {}".format(
                self.EVALUATOR_NAME, scores_batch.shape[0], len(test_user_batch_array))

        assert scores_batch.shape[1] == self.n_items, \
            "{}: scores_batch contained scores for {} items, expected was {}".format(
                self.EVALUATOR_NAME, scores_batch.shape[1], self.n_items)

        # Compute recommendation quality for each user in batch
        for batch_user_index in range(len(recommended_items_batch_list)):

            test_user = test_user_batch_array[batch_user_index]

            relevant_items = self.get_user_relevant_items(test_user)
            relevant_items_rating = self.get_user_test_ratings(test_user)
            all_items_predicted_ratings = scores_batch[batch_user_index]

            user_rmse = rmse(all_items_predicted_ratings, relevant_items, relevant_items_rating)

            # Being the URM CSR, the indices are the non-zero column indexes
            recommended_items = recommended_items_batch_list[batch_user_index]
            is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

            n_users_evaluated += 1

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                is_relevant_current_cutoff = is_relevant[0:cutoff]
                recommended_items_current_cutoff = recommended_items[0:cutoff]

                # Scalar metrics are accumulated; they are normalized by the caller
                results_current_cutoff[EvaluatorMetrics.ROC_AUC.value] += roc_auc(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.PRECISION.value] += precision(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.PRECISION_RECALL_MIN_DEN.value] += precision_recall_min_denominator(is_relevant_current_cutoff, len(relevant_items))
                results_current_cutoff[EvaluatorMetrics.RECALL.value] += recall(is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(recommended_items_current_cutoff, relevant_items, relevance=self.get_user_test_ratings(test_user), at=cutoff)
                results_current_cutoff[EvaluatorMetrics.HIT_RATE.value] += is_relevant_current_cutoff.sum()
                results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.RMSE.value] += user_rmse

                # Object metrics keep their own internal state
                results_current_cutoff[EvaluatorMetrics.MRR.value].add_recommendations(is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.MAP.value].add_recommendations(is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.NOVELTY.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.AVERAGE_POPULARITY.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(recommended_items_current_cutoff, test_user)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.value].add_recommendations(recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.value].add_recommendations(recommended_items_current_cutoff)

                if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                    results_current_cutoff[EvaluatorMetrics.DIVERSITY_SIMILARITY.value].add_recommendations(recommended_items_current_cutoff)

            # Progress report at most every 30 seconds, plus once at the end
            if time.time() - start_time_print > 30 or n_users_evaluated == len(usersToEvaluate):

                elapsed_time = time.time() - start_time
                new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)

                print("{}: Processed {} ( {:.2f}% ) in {:.2f} {}. Users per second: {:.0f}".format(
                    self.EVALUATOR_NAME,
                    n_users_evaluated,
                    100.0 * float(n_users_evaluated) / len(usersToEvaluate),
                    new_time_value, new_time_unit,
                    float(n_users_evaluated) / elapsed_time))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print = time.time()

    return results_dict, n_users_evaluated
def evaluateRecommender(self, recommender_object):
    """
    Evaluate the recommender one user at a time, accumulating every configured
    metric at every cutoff in self.cutoff_list.

    :param recommender_object: the trained recommender object, a BaseRecommender subclass
    :return: tuple (results_dict, results_run_string) where results_dict maps
        each cutoff to its averaged metrics and results_run_string is a
        printable summary of the same results
    """

    # One fresh metrics dict per cutoff; scalar metrics start at 0,
    # object metrics (Metrics_Object subclasses) accumulate internally.
    results_dict = {}

    for cutoff in self.cutoff_list:
        results_dict[cutoff] = create_empty_metrics_dict(self.n_items, self.n_users,
                                                         recommender_object.URM_train,
                                                         self.ignore_items_ID,
                                                         self.ignore_users_ID,
                                                         cutoff,
                                                         self.diversity_object)

    start_time = time.time()
    start_time_print = time.time()
    n_users_evaluated = 0

    if self.ignore_items_flag:
        recommender_object.set_items_to_ignore(self.ignore_items_ID)

    for test_user in self.usersToEvaluate:

        # Being the URM CSR, the indices are the non-zero column indexes
        relevant_items = self.get_user_relevant_items(test_user)
        relevant_items_rating = self.get_user_test_ratings(test_user)

        n_users_evaluated += 1

        # Candidate items for this user (presumably test positives + sampled
        # negatives -- confirm against _get_user_specific_items_to_compute)
        items_to_compute = self._get_user_specific_items_to_compute(test_user)

        # Single-user call: recommend() still returns batch-shaped outputs,
        # hence the np.atleast_1d and the [0] indexing below.
        recommended_items, all_items_predicted_ratings = recommender_object.recommend(np.atleast_1d(test_user),
                                                                                      remove_seen_flag=self.exclude_seen,
                                                                                      cutoff=self.max_cutoff,
                                                                                      remove_top_pop_flag=False,
                                                                                      items_to_compute=items_to_compute,
                                                                                      remove_CustomItems_flag=self.ignore_items_flag,
                                                                                      return_scores=True)

        assert len(recommended_items) == 1, \
            "{}: recommended_items contained recommendations for {} users, expected was {}".format(
                self.EVALUATOR_NAME, len(recommended_items), 1)

        assert all_items_predicted_ratings.shape[0] == 1, \
            "{}: all_items_predicted_ratings contained scores for {} users, expected was {}".format(
                self.EVALUATOR_NAME, all_items_predicted_ratings.shape[0], 1)

        assert all_items_predicted_ratings.shape[1] == self.n_items, \
            "{}: all_items_predicted_ratings contained scores for {} items, expected was {}".format(
                self.EVALUATOR_NAME, all_items_predicted_ratings.shape[1], self.n_items)

        # Unwrap the single-user row from the batch-shaped results
        recommended_items = np.array(recommended_items[0])

        user_rmse = rmse(all_items_predicted_ratings[0], relevant_items, relevant_items_rating)

        # NOTE(review): the ignore-list is cleared here, inside the user loop,
        # although set_items_to_ignore() is only called once before the loop --
        # after the first user recommendations are no longer filtered. Verify
        # this is intentional.
        recommender_object.reset_items_to_ignore()

        # Boolean mask, one entry per recommended item: True where the item
        # appears in the user's test positives
        is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

        for cutoff in self.cutoff_list:

            results_current_cutoff = results_dict[cutoff]

            is_relevant_current_cutoff = is_relevant[0:cutoff]
            recommended_items_current_cutoff = recommended_items[0:cutoff]

            # Scalar metrics: summed here, divided by n_users_evaluated below
            results_current_cutoff[EvaluatorMetrics.ROC_AUC.value] += roc_auc(is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.PRECISION.value] += precision(is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.PRECISION_RECALL_MIN_DEN.value] += precision_recall_min_denominator(is_relevant_current_cutoff, len(relevant_items))
            results_current_cutoff[EvaluatorMetrics.RECALL.value] += recall(is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(recommended_items_current_cutoff, relevant_items, relevance=self.get_user_test_ratings(test_user), at=cutoff)
            results_current_cutoff[EvaluatorMetrics.HIT_RATE.value] += is_relevant_current_cutoff.sum()
            results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.RMSE.value] += user_rmse

            # Stateful metric objects: fed per-user, finalized via get_metric_value()
            results_current_cutoff[EvaluatorMetrics.MRR.value].add_recommendations(is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.MAP.value].add_recommendations(is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.NOVELTY.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.AVERAGE_POPULARITY.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(recommended_items_current_cutoff, test_user)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.value].add_recommendations(recommended_items_current_cutoff)

            # Optional metric: only present when a similarity object was provided
            if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_SIMILARITY.value].add_recommendations(recommended_items_current_cutoff)

        # Progress report at most every 30 seconds, plus once at the end
        if time.time() - start_time_print > 30 or n_users_evaluated == len(self.usersToEvaluate):

            elapsed_time = time.time() - start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)

            print("{}: Processed {} ( {:.2f}% ) in {:.2f} {}. Users per second: {:.0f}".format(
                self.EVALUATOR_NAME,
                n_users_evaluated,
                100.0 * float(n_users_evaluated) / len(self.usersToEvaluate),
                new_time_value, new_time_unit,
                float(n_users_evaluated) / elapsed_time))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_print = time.time()

    if (n_users_evaluated > 0):

        # Finalize: object metrics produce their value, scalar sums become means
        for cutoff in self.cutoff_list:

            results_current_cutoff = results_dict[cutoff]

            for key in results_current_cutoff.keys():

                value = results_current_cutoff[key]

                if isinstance(value, Metrics_Object):
                    results_current_cutoff[key] = value.get_metric_value()
                else:
                    results_current_cutoff[key] = value / n_users_evaluated

            precision_ = results_current_cutoff[EvaluatorMetrics.PRECISION.value]
            recall_ = results_current_cutoff[EvaluatorMetrics.RECALL.value]

            if precision_ + recall_ != 0:
                # F1 micro averaged: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.104.8244&rep=rep1&type=pdf
                results_current_cutoff[EvaluatorMetrics.F1.value] = 2 * (precision_ * recall_) / (precision_ + recall_)

    else:
        print("WARNING: No users had a sufficient number of relevant items")

    if self.ignore_items_flag:
        recommender_object.reset_items_to_ignore()

    results_run_string = get_result_string(results_dict)

    return (results_dict, results_run_string)
def evaluateRecommender(self, recommender_object):
    """
    Evaluate the recommender one user at a time, computing the standard
    metrics plus the CNR popularity-weighted hit-rate variants
    (WEIGHTED / LOG_WEIGHTED / POS_WEIGHTED / POS_LOG_WEIGHTED / CUSTOM).

    Fix vs. original: two debug ``print`` calls had been redacted by an
    automated scrubber into invalid syntax (``print('user:'******'...')``);
    they are reconstructed below as printing the current ``test_user``
    (marked with NOTE(review) comments).

    :param recommender_object: the trained recommender object, a BaseRecommender subclass
    :return: tuple (results_dict, results_run_string) where results_dict maps
        each cutoff to its averaged metrics and results_run_string is a
        printable summary of the same results
    """

    results_dict = {}

    for cutoff in self.cutoff_list:
        results_dict[cutoff] = create_empty_metrics_dict(
            self.n_items, self.n_users, recommender_object.URM_train,
            self.ignore_items_ID, self.ignore_users_ID, cutoff,
            self.diversity_object)

    start_time = time.time()
    start_time_print = time.time()
    n_users_evaluated = 0

    if self.ignore_items_flag:
        recommender_object.set_items_to_ignore(self.ignore_items_ID)

    for test_user in self.usersToEvaluate:

        # Being the URM CSR, the indices are the non-zero column indexes
        relevant_items = self.get_user_relevant_items(test_user)
        relevant_items_rating = self.get_user_test_ratings(test_user)

        n_users_evaluated += 1

        items_to_compute = self._get_user_specific_items_to_compute(test_user)

        # Single-user call: recommend() still returns batch-shaped outputs
        recommended_items, all_items_predicted_ratings = recommender_object.recommend(
            np.atleast_1d(test_user),
            remove_seen_flag=self.exclude_seen,
            cutoff=self.max_cutoff,
            remove_top_pop_flag=False,
            items_to_compute=items_to_compute,
            remove_CustomItems_flag=self.ignore_items_flag,
            return_scores=True)

        assert len(recommended_items) == 1, \
            "{}: recommended_items contained recommendations for {} users, expected was {}".format(
                self.EVALUATOR_NAME, len(recommended_items), 1)

        assert all_items_predicted_ratings.shape[0] == 1, \
            "{}: all_items_predicted_ratings contained scores for {} users, expected was {}".format(
                self.EVALUATOR_NAME, all_items_predicted_ratings.shape[0], 1)

        assert all_items_predicted_ratings.shape[1] == self.n_items, \
            "{}: all_items_predicted_ratings contained scores for {} items, expected was {}".format(
                self.EVALUATOR_NAME, all_items_predicted_ratings.shape[1], self.n_items)

        recommended_items = np.array(recommended_items[0])

        user_rmse = rmse(all_items_predicted_ratings[0], relevant_items, relevant_items_rating)

        # NOTE(review): clears the ignore-list inside the user loop although it
        # is set once before the loop -- kept as in the original, verify intent.
        recommender_object.reset_items_to_ignore()

        is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

        # (CNR) -------------------------------------------------
        # Popularity-weighted hit vectors: each hit is discounted by the
        # popularity of the recommended item, optionally also by its rank.
        weighted_hits = np.zeros(len(recommended_items))
        log_weighted_hits = np.zeros(len(recommended_items))
        pos_weighted_hits = np.zeros(len(recommended_items))
        pos_log_weighted_hits = np.zeros(len(recommended_items))
        # Disabled experimental weighting curve, kept for reference
        # (references an undefined `a`; do not re-enable as-is):
        # alpha, beta, scale, pi = 100, 0.03, 1 / 15, np.pi
        # percentile = get_percentile(a, 45)
        # f = 1 / (beta * np.sqrt(2 * pi))
        # y_a = np.tanh(alpha * a) + scale * f * np.exp(-1 / (2 * (beta ** 2)) * (a - percentile) ** 2)
        # y_a = y_a / max(y_a)

        # e.g. recommended_items = [2, 7, 10, 70, 5464]
        # e.g. is_relevant = [0, 0, 1, 0, 0]
        for i in range(len(recommended_items)):
            if is_relevant[i]:
                weighted_hits[i] = 1 / (1 + Settings.popularity[recommended_items[i]])
                pos_weighted_hits[i] = 1 / (1 + i + Settings.popularity[recommended_items[i]])
                log_weighted_hits[i] = 1 / (1 + math.log(1 + Settings.popularity[recommended_items[i]]))
                pos_log_weighted_hits[i] = 1 / (1 + i + math.log(1 + Settings.popularity[recommended_items[i]]))
        # -------------------------------------------------------

        number_of_guessed_items = 0

        for cutoff in self.cutoff_list:

            results_current_cutoff = results_dict[cutoff]

            # This array is fundamental. For a given user it has the form
            # [0, 1, 0, 0, 0] and its length equals the cutoff. In this
            # example it tells us the item recommended in second position is
            # a hit, i.e. it is present in the test set. Note: the items
            # considered are those of the test set plus the negative ones
            # (which we know the user does not like).
            is_relevant_current_cutoff = is_relevant[0:cutoff]

            # (CNR) -------------------------------------------------------
            # Custom weighting depends on the cutoff, so it is recomputed
            # inside the per-cutoff loop.
            custom_hits = np.zeros(len(recommended_items))
            for i in range(len(recommended_items)):
                if is_relevant[i]:
                    # print('Luciano > Computing custom weight. Parameters (pop, pos, cutoff):', Settings.popularity[recommended_items[i]], i, cutoff)
                    custom_hits[i] = y_custom(Settings.popularity[recommended_items[i]], i, cutoff)
                    if custom_hits[i] > 1:
                        print('==============================================================')
                        print('Luciano > WARNING! custom_hits[{}]={}'.format(i, custom_hits[i]))
                        print('==============================================================')

            weighted_hits_current_cutoff = weighted_hits[0:cutoff]
            log_weighted_hits_current_cutoff = log_weighted_hits[0:cutoff]
            pos_weighted_hits_current_cutoff = pos_weighted_hits[0:cutoff]
            pos_log_weighted_hits_current_cutoff = pos_log_weighted_hits[0:cutoff]
            custom_hits_current_cutoff = custom_hits[0:cutoff]
            # -------------------------------------------------------------

            recommended_items_current_cutoff = recommended_items[0:cutoff]

            results_current_cutoff[EvaluatorMetrics.ROC_AUC.value] += roc_auc(is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.PRECISION.value] += precision(is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.PRECISION_RECALL_MIN_DEN.value] += precision_recall_min_denominator(is_relevant_current_cutoff, len(relevant_items))
            results_current_cutoff[EvaluatorMetrics.RECALL.value] += recall(is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(recommended_items_current_cutoff, relevant_items, relevance=self.get_user_test_ratings(test_user), at=cutoff)

            number_of_guessed_items = is_relevant_current_cutoff.sum()

            results_current_cutoff[EvaluatorMetrics.HIT_RATE.value] += is_relevant_current_cutoff.sum()

            # (CNR) -------------------------------------------------
            verbose = False
            if verbose and is_relevant_current_cutoff.sum() > 1:
                print('=============================================================================')
                # NOTE(review): this print was redacted by a scrubber in the
                # original source; reconstructed as printing test_user.
                print('user:', test_user, 'is_relevant_current_cutoff:', is_relevant_current_cutoff)
                print('recommended_items_current_cutoff:', recommended_items_current_cutoff)
                print('Warning! is_relevant_current_cutoff.sum()>1:', is_relevant_current_cutoff.sum())
                print('relevant_items:', relevant_items)
                print('relevant_items_rating:', relevant_items_rating)
                print('items_to_compute:', items_to_compute)
                print('=============================================================================')

            results_current_cutoff[EvaluatorMetrics.WEIGHTED_HIT_RATE.value] += weighted_hits_current_cutoff.sum()
            results_current_cutoff[EvaluatorMetrics.LOG_WEIGHTED_HIT_RATE.value] += log_weighted_hits_current_cutoff.sum()
            results_current_cutoff[EvaluatorMetrics.POS_WEIGHTED_HIT_RATE.value] += pos_weighted_hits_current_cutoff.sum()
            results_current_cutoff[EvaluatorMetrics.POS_LOG_WEIGHTED_HIT_RATE.value] += pos_log_weighted_hits_current_cutoff.sum()
            results_current_cutoff[EvaluatorMetrics.CUSTOM_HIT_RATE.value] += custom_hits_current_cutoff.sum()
            # -------------------------------------------------------

            results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.RMSE.value] += user_rmse

            results_current_cutoff[EvaluatorMetrics.MRR.value].add_recommendations(is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.MAP.value].add_recommendations(is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.NOVELTY.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.AVERAGE_POPULARITY.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(recommended_items_current_cutoff, test_user)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.value].add_recommendations(recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.value].add_recommendations(recommended_items_current_cutoff)

            if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_SIMILARITY.value].add_recommendations(recommended_items_current_cutoff)

        # Debug dump for a sample of users (disabled by default)
        verbose = False
        if verbose and test_user % 1000 == 0:
            if number_of_guessed_items > 0:
                print('Test =======================================================')
                # NOTE(review): this print was redacted by a scrubber in the
                # original source; reconstructed as printing test_user.
                print('user:', test_user, 'relevant items:', relevant_items)
                print('relevant items rating:', relevant_items_rating)
                print('items_to_compute:\n', items_to_compute)
                print('len(items_to_compute):', len(items_to_compute))
                print('recommended_items:', recommended_items)
                print('is_relevant:', is_relevant)
                print('number_of_guessed_items:', number_of_guessed_items)
                print('============================================================')
            else:
                print('.')

        # Progress report at most every 30 seconds, plus once at the end
        if time.time() - start_time_print > 30 or n_users_evaluated == len(self.usersToEvaluate):

            elapsed_time = time.time() - start_time
            new_time_value, new_time_unit = seconds_to_biggest_unit(elapsed_time)

            print("{}: Processed {} ( {:.2f}% ) in {:.2f} {}. Users per second: {:.0f}".format(
                self.EVALUATOR_NAME,
                n_users_evaluated,
                100.0 * float(n_users_evaluated) / len(self.usersToEvaluate),
                new_time_value, new_time_unit,
                float(n_users_evaluated) / elapsed_time))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_print = time.time()

    if (n_users_evaluated > 0):

        # Finalize: object metrics produce their value, scalar sums become means
        for cutoff in self.cutoff_list:

            results_current_cutoff = results_dict[cutoff]

            for key in results_current_cutoff.keys():

                value = results_current_cutoff[key]

                if isinstance(value, Metrics_Object):
                    results_current_cutoff[key] = value.get_metric_value()
                else:
                    results_current_cutoff[key] = value / n_users_evaluated

            precision_ = results_current_cutoff[EvaluatorMetrics.PRECISION.value]
            recall_ = results_current_cutoff[EvaluatorMetrics.RECALL.value]

            if precision_ + recall_ != 0:
                # F1 micro averaged: http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.104.8244&rep=rep1&type=pdf
                results_current_cutoff[EvaluatorMetrics.F1.value] = 2 * (precision_ * recall_) / (precision_ + recall_)

    else:
        print("WARNING: No users had a sufficient number of relevant items")

    if self.ignore_items_flag:
        recommender_object.reset_items_to_ignore()

    results_run_string = get_result_string(results_dict)

    return (results_dict, results_run_string)