def evaluate_best(self, test_dataset):
    self.load_best()
    an_scores, gt_labels = self._evaluate(test_dataset)
    # AUC
    _ = metrics.roc_auc(gt_labels, an_scores, show=True)
    # Average Precision
    _ = metrics.pre_rec_curve(gt_labels, an_scores, show=True)
def evaluate(self, test_dataset):
    ret_dict = {}
    an_scores, gt_labels = self._evaluate(test_dataset)
    # Min-max normalize the anomaly scores to [0, 1]
    an_scores = (an_scores - np.amin(an_scores)) / (np.amax(an_scores) - np.amin(an_scores))
    # AUC
    auc_dict = metrics.roc_auc(gt_labels, an_scores)
    ret_dict.update(auc_dict)
    # Average Precision
    p_r_dict = metrics.pre_rec_curve(gt_labels, an_scores)
    ret_dict.update(p_r_dict)
    return ret_dict
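# The `metrics.roc_auc` / `metrics.pre_rec_curve` helpers used above are project-specific.
# A minimal sketch of the same evaluation with scikit-learn, assuming `gt_labels` is a binary
# array (1 = anomalous) and `an_scores` are continuous anomaly scores (function name is illustrative):
import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score

def evaluate_scores_sklearn(gt_labels, an_scores):
    # Min-max scaling is monotonic, so it changes neither ROC-AUC nor Average Precision
    an_scores = (an_scores - np.amin(an_scores)) / (np.amax(an_scores) - np.amin(an_scores))
    return {"roc_auc": roc_auc_score(gt_labels, an_scores),
            "avg_precision": average_precision_score(gt_labels, an_scores)}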
def metrics_calculator(masks, preds, mode_average=True, additional=False):
    batch_size, masks, predictions = mtr.standardize_for_metrics(masks, preds)
    accuracy_score = mtr.accuracy(batch_size, masks, predictions, mode_average)
    if additional:
        roc_auc_score = mtr.roc_auc(batch_size, masks, predictions, mode_average)
        jaccard_score = mtr.jaccard(batch_size, masks, predictions, mode_average)
        sens_score, spec_score, prec_score, f1_score = mtr.confusion_matrix(
            batch_size, masks, predictions, mode_average)
        pr_auc_score = mtr.precision_recall_auc(batch_size, masks, predictions, mode_average)
        iou_score = mtr.fast_hist(predictions, masks, 2)
        return (roc_auc_score, accuracy_score, jaccard_score, sens_score, spec_score,
                prec_score, f1_score, pr_auc_score, iou_score)
    return accuracy_score
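# `mtr.fast_hist` above is called with 2 classes and appears to follow the common
# confusion-histogram pattern for IoU; a minimal NumPy sketch of that pattern
# (hypothetical helper, assuming integer masks/predictions with values in [0, num_classes)):
import numpy as np

def fast_hist_iou_sketch(predictions, masks, num_classes):
    # Build a num_classes x num_classes confusion histogram with a single bincount
    valid = (masks >= 0) & (masks < num_classes)
    hist = np.bincount(
        num_classes * masks[valid].astype(int) + predictions[valid].astype(int),
        minlength=num_classes ** 2).reshape(num_classes, num_classes)
    # Per-class IoU = TP / (TP + FP + FN) = diagonal / (row sum + column sum - diagonal)
    return np.diag(hist) / (hist.sum(axis=1) + hist.sum(axis=0) - np.diag(hist))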
neval = 0
for test_user in range(nusers):
    # Items the user has already interacted with in the training split
    user_profile = train[test_user].indices
    relevant_items = test[test_user].indices
    if len(relevant_items) > 0:
        neval += 1
        # TODO: Here you can write to file the recommendations for each user in the test split.
        # WARNING: there is a catch with the item idx!
        # This will rank *all* items
        recommended_items = recommender.recommend(user_profile, exclude_seen=True)
        # Use this to have the *top-k* recommended items
        # (warning: this can underestimate ROC-AUC for small k)
        # recommended_items = recommender.recommend(user_profile, k=at, exclude_seen=True)
        roc_auc_ += roc_auc(recommended_items, relevant_items)
        precision_ += precision(recommended_items, relevant_items, at=at)
        recall_ += recall(recommended_items, relevant_items, at=at)
        map_ += map(recommended_items, relevant_items, at=at)
        mrr_ += rr(recommended_items, relevant_items, at=at)
        ndcg_ += ndcg(recommended_items, relevant_items,
                      relevance=test[test_user].data, at=at)
roc_auc_ /= neval
precision_ /= neval
recall_ /= neval
map_ /= neval
mrr_ /= neval
ndcg_ /= neval
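# The `roc_auc(recommended_items, relevant_items)` call above comes from the surrounding
# framework and scores a ranked list rather than continuous scores. A minimal sketch of one
# common definition (fraction of (relevant, non-relevant) pairs where the relevant item is
# ranked higher), assuming plain NumPy arrays; this is an illustration, not the framework's code:
import numpy as np

def ranking_roc_auc_sketch(recommended_items, relevant_items):
    is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)
    ranks = np.arange(len(recommended_items))
    pos_ranks, neg_ranks = ranks[is_relevant], ranks[~is_relevant]
    if len(pos_ranks) == 0 or len(neg_ranks) == 0:
        return 0.0
    # Count, over all (relevant, non-relevant) pairs, how often the relevant item comes first
    wins = sum((pos < neg_ranks).sum() for pos in pos_ranks)
    return wins / (len(pos_ranks) * len(neg_ranks))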
# You might also want to plot some generalization of the
# [ROC curve](http://scikit-learn.org/stable/modules/model_evaluation.html#receiver-operating-characteristic-roc)
# for the case of multi-label classification. The provided function *roc_auc* can do this for you.
# The input parameters of this function are:
# - true labels
# - decision function scores
# - number of classes

# In[81]:

from metrics import roc_auc
get_ipython().magic('matplotlib inline')

# In[82]:

n_classes = len(tags_counts)
roc_auc(y_val, y_val_predicted_scores_mybag, n_classes)

# In[83]:

n_classes = len(tags_counts)
roc_auc(y_val, y_val_predicted_scores_tfidf, n_classes)

# **Task 4 (MultilabelClassification).** Once we have the evaluation set up, we suggest that you
# experiment a bit with training your classifiers. We will use *F1-score weighted* as an evaluation metric.
# Our recommendations:
# - compare the quality of the bag-of-words and TF-IDF approaches and choose one of them;
# - for the chosen one, try *L1*- and *L2*-regularization techniques in Logistic Regression with
#   different coefficients (e.g. C equal to 0.1, 1, 10, 100).
#
# You could also try other improvements of the preprocessing / model, if you want.

# In[84]:

######################################
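# For reference, a minimal sketch of what a multi-label `roc_auc` helper like the one imported
# above typically plots: one one-vs-rest ROC curve per class plus a micro-averaged curve.
# Assumes scikit-learn, a binary indicator matrix `y_true` and decision scores of shape
# (n_samples, n_classes); this is an illustration, not the course's implementation:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

def plot_multilabel_roc_sketch(y_true, scores, n_classes):
    plt.figure()
    for c in range(n_classes):
        fpr, tpr, _ = roc_curve(y_true[:, c], scores[:, c])
        plt.plot(fpr, tpr, alpha=0.3)  # one-vs-rest curve for class c
    # Micro-average: pool all (label, score) pairs across classes
    fpr_micro, tpr_micro, _ = roc_curve(y_true.ravel(), scores.ravel())
    plt.plot(fpr_micro, tpr_micro,
             label='micro-average (AUC = {:.3f})'.format(auc(fpr_micro, tpr_micro)))
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.legend()
    plt.show()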
top_positive_words = [index_to_words[ind] for ind in sorted_ind[-5:]]
top_negative_words = [index_to_words[ind] for ind in sorted_ind[:5]]
print('\nTag:\t{}'.format(tag))
print('Top positive words:\t{}'.format(', '.join(top_positive_words)))
print('Top negative words:\t{}\n'.format(', '.join(top_negative_words)))


if __name__ == "__main__":
    train_docs, train_labels, test_docs, test_labels, mlb_classes = load_data()
    vectorised_train_documents, vectorised_test_documents, tfidf_reversed_vocab = tfidf_features(
        train_docs, test_docs)
    predicted_scores = None
    best_model = None
    for model_name in MODEL_NAMES:
        model = train_classifier(model_name, vectorised_train_documents, train_labels, 'l2', 10)
        predicted_labels = model.predict(vectorised_test_documents)
        if model_name == 'LinearSVC':
            best_model = model
            predicted_scores = model.decision_function(vectorised_test_documents)
        print(model_name + ' scores')
        print_evaluation_scores(test_labels, predicted_labels)
    roc_auc(test_labels, predicted_scores, n_classes)
    print_words_for_tag(best_model, 'cotton-oil', mlb_classes, tfidf_reversed_vocab)
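# `print_evaluation_scores` above is defined elsewhere in this project; a plausible minimal
# sketch (an assumption, not the project's exact implementation), reporting accuracy plus the
# weighted F1 metric the task description emphasizes:
from sklearn.metrics import accuracy_score, f1_score, average_precision_score

def print_evaluation_scores_sketch(y_true, y_pred):
    print('Accuracy (exact match):', accuracy_score(y_true, y_pred))
    print('F1-score (weighted):', f1_score(y_true, y_pred, average='weighted'))
    print('Average precision (weighted):', average_precision_score(y_true, y_pred, average='weighted'))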
def evaluateRecommender(self, recommender_object):
    """
    :param recommender_object: the trained recommender object, a Recommender subclass
    :param URM_test_list: list of URMs to test the recommender against, or a single URM object
    :param cutoff_list: list of cutoffs to be used to report the scores, or a single cutoff
    """

    results_dict = {}

    for cutoff in self.cutoff_list:
        results_dict[cutoff] = create_empty_metrics_dict(
            self.n_items, self.n_users, recommender_object.URM_train,
            self.ignore_items_ID, self.ignore_users_ID, cutoff,
            self.diversity_object)

    start_time = time.time()
    start_time_print = time.time()

    n_eval = 0

    self.__all_items = np.arange(0, self.n_items, dtype=np.int)
    self.__all_items = set(self.__all_items)

    if self.ignore_items_flag:
        recommender_object.set_items_to_ignore(self.ignore_items_ID)

    for test_user in self.usersToEvaluate:

        # Being the URM CSR, the indices are the non-zero column indexes
        relevant_items = self.get_user_relevant_items(test_user)

        n_eval += 1

        self.user_specific_remove_items(recommender_object, test_user)

        # recommended_items = recommender_object.recommend(np.array(test_user), remove_seen_flag=self.exclude_seen,
        #                                                  cutoff=self.max_cutoff, remove_top_pop_flag=False,
        #                                                  remove_CustomItems_flag=self.ignore_items_flag)
        recommended_items = recommender_object.recommend(
            np.atleast_1d(test_user),
            remove_seen_flag=self.exclude_seen,
            cutoff=self.max_cutoff,
            remove_top_pop_flag=False,
            remove_CustomItems_flag=self.ignore_items_flag)

        recommended_items = np.array(recommended_items[0])

        recommender_object.reset_items_to_ignore()

        is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

        for cutoff in self.cutoff_list:

            results_current_cutoff = results_dict[cutoff]

            is_relevant_current_cutoff = is_relevant[0:cutoff]
            recommended_items_current_cutoff = recommended_items[0:cutoff]

            results_current_cutoff[EvaluatorMetrics.ROC_AUC.value] += roc_auc(
                is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.PRECISION.value] += precision(
                is_relevant_current_cutoff, len(relevant_items))
            results_current_cutoff[EvaluatorMetrics.RECALL.value] += recall(
                is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN.value] += recall_min_test_len(
                is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.MAP.value] += map(
                is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.MRR.value] += rr(
                is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(
                recommended_items_current_cutoff, relevant_items,
                relevance=self.get_user_test_ratings(test_user), at=cutoff)
            results_current_cutoff[EvaluatorMetrics.HIT_RATE.value] += is_relevant_current_cutoff.sum()
            results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(
                is_relevant_current_cutoff)

            results_current_cutoff[EvaluatorMetrics.NOVELTY.value].add_recommendations(
                recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(
                recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.value].add_recommendations(
                recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(
                recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(
                recommended_items_current_cutoff, test_user)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.value].add_recommendations(
                recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.value].add_recommendations(
                recommended_items_current_cutoff)

            if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_SIMILARITY.value].add_recommendations(
                    recommended_items_current_cutoff)

        if time.time() - start_time_print > 30 or n_eval == len(self.usersToEvaluate):
            print("SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}".format(
                n_eval,
                100.0 * float(n_eval) / len(self.usersToEvaluate),
                time.time() - start_time,
                float(n_eval) / (time.time() - start_time)))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_print = time.time()

    if n_eval > 0:

        for cutoff in self.cutoff_list:

            results_current_cutoff = results_dict[cutoff]

            for key in results_current_cutoff.keys():

                value = results_current_cutoff[key]

                if isinstance(value, Metrics_Object):
                    results_current_cutoff[key] = value.get_metric_value()
                else:
                    results_current_cutoff[key] = value / n_eval

            precision_ = results_current_cutoff[EvaluatorMetrics.PRECISION.value]
            recall_ = results_current_cutoff[EvaluatorMetrics.RECALL.value]

            if precision_ + recall_ != 0:
                results_current_cutoff[EvaluatorMetrics.F1.value] = 2 * (precision_ * recall_) / (precision_ + recall_)

    else:
        print("WARNING: No users had a sufficient number of relevant items")

    if self.ignore_items_flag:
        recommender_object.reset_items_to_ignore()

    results_run_string = self.get_result_string(results_dict)

    return (results_dict, results_run_string)
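# A hypothetical usage sketch of this evaluator. The constructor arguments below are assumptions
# (the class name SequentialEvaluator is taken from the progress message above, but the exact
# signature may differ in this repository):
evaluator = SequentialEvaluator(URM_test, cutoff_list=[5, 10, 20], exclude_seen=True)
results_dict, results_run_string = evaluator.evaluateRecommender(trained_recommender)
print(results_run_string)
# Per-cutoff metrics are averaged over the evaluated users, e.g.:
print(results_dict[10][EvaluatorMetrics.ROC_AUC.value])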
def _run_evaluation_on_selected_users(self, recommender_object, usersToEvaluate, block_size=1000):

    start_time = time.time()
    start_time_print = time.time()

    results_dict = {}

    for cutoff in self.cutoff_list:
        results_dict[cutoff] = create_empty_metrics_dict(
            self.n_items, self.n_users, recommender_object.get_URM_train(),
            self.ignore_items_ID, self.ignore_users_ID, cutoff,
            self.diversity_object)

    n_users_evaluated = 0

    # Process the users in blocks of block_size
    user_batch_start = 0
    user_batch_end = 0

    while user_batch_start < len(self.usersToEvaluate):

        user_batch_end = user_batch_start + block_size
        user_batch_end = min(user_batch_end, len(usersToEvaluate))

        test_user_batch_array = np.array(usersToEvaluate[user_batch_start:user_batch_end])
        user_batch_start = user_batch_end

        # Compute predictions for a batch of users using vectorization,
        # much more efficient than computing them one at a time
        recommended_items_batch_list = recommender_object.recommend(
            test_user_batch_array,
            remove_seen_flag=self.exclude_seen,
            cutoff=self.max_cutoff,
            remove_top_pop_flag=False,
            remove_CustomItems_flag=self.ignore_items_flag)

        # Compute recommendation quality for each user in the batch
        for batch_user_index in range(len(recommended_items_batch_list)):

            user_id = test_user_batch_array[batch_user_index]
            recommended_items = recommended_items_batch_list[batch_user_index]

            # Being the URM CSR, the indices are the non-zero column indexes
            relevant_items = self.get_user_relevant_items(user_id)
            is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

            n_users_evaluated += 1

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                is_relevant_current_cutoff = is_relevant[0:cutoff]
                recommended_items_current_cutoff = recommended_items[0:cutoff]

                results_current_cutoff[EvaluatorMetrics.ROC_AUC.value] += roc_auc(
                    is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.PRECISION.value] += precision(
                    is_relevant_current_cutoff, len(relevant_items))
                results_current_cutoff[EvaluatorMetrics.RECALL.value] += recall(
                    is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN.value] += recall_min_test_len(
                    is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.MAP.value] += map(
                    is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.MRR.value] += rr(
                    is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(
                    recommended_items_current_cutoff, relevant_items,
                    relevance=self.get_user_test_ratings(user_id), at=cutoff)
                results_current_cutoff[EvaluatorMetrics.HIT_RATE.value] += is_relevant_current_cutoff.sum()
                results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(
                    is_relevant_current_cutoff)

                results_current_cutoff[EvaluatorMetrics.NOVELTY.value].add_recommendations(
                    recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(
                    recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.value].add_recommendations(
                    recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(
                    recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(
                    recommended_items_current_cutoff, user_id)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.value].add_recommendations(
                    recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.value].add_recommendations(
                    recommended_items_current_cutoff)

                if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                    results_current_cutoff[EvaluatorMetrics.DIVERSITY_SIMILARITY.value].add_recommendations(
                        recommended_items_current_cutoff)

            if time.time() - start_time_print > 30 or n_users_evaluated == len(self.usersToEvaluate):
                print("SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}".format(
                    n_users_evaluated,
                    100.0 * float(n_users_evaluated) / len(self.usersToEvaluate),
                    time.time() - start_time,
                    float(n_users_evaluated) / (time.time() - start_time)))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print = time.time()

    return results_dict, n_users_evaluated
def _run_evaluation_on_selected_users(self, recommender_object, usersToEvaluate):

    start_time = time.time()
    start_time_print = time.time()

    results_dict = {}

    for cutoff in self.cutoff_list:
        results_dict[cutoff] = create_empty_metrics_dict(
            self.n_items, self.n_users, recommender_object.URM_train,
            self.ignore_items_ID, self.ignore_users_ID, cutoff,
            self.diversity_object)

    n_users_evaluated = 0

    for test_user in usersToEvaluate:

        # Being the URM CSR, the indices are the non-zero column indexes
        relevant_items = self.get_user_relevant_items(test_user)

        n_users_evaluated += 1

        recommended_items = recommender_object.recommend(
            test_user,
            remove_seen_flag=self.exclude_seen,
            cutoff=self.max_cutoff,
            remove_top_pop_flag=False,
            remove_CustomItems_flag=self.ignore_items_flag)

        is_relevant = np.in1d(recommended_items, relevant_items, assume_unique=True)

        for cutoff in self.cutoff_list:

            results_current_cutoff = results_dict[cutoff]

            is_relevant_current_cutoff = is_relevant[0:cutoff]
            recommended_items_current_cutoff = recommended_items[0:cutoff]

            results_current_cutoff[EvaluatorMetrics.ROC_AUC.value] += roc_auc(
                is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.PRECISION.value] += precision(
                is_relevant_current_cutoff, len(relevant_items))
            results_current_cutoff[EvaluatorMetrics.RECALL.value] += recall(
                is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN.value] += recall_min_test_len(
                is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.MAP.value] += map(
                is_relevant_current_cutoff, relevant_items)
            results_current_cutoff[EvaluatorMetrics.MRR.value] += rr(
                is_relevant_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(
                recommended_items_current_cutoff, relevant_items,
                relevance=self.get_user_test_ratings(test_user), at=cutoff)
            results_current_cutoff[EvaluatorMetrics.HIT_RATE.value] += is_relevant_current_cutoff.sum()
            results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(
                is_relevant_current_cutoff)

            results_current_cutoff[EvaluatorMetrics.NOVELTY.value].add_recommendations(
                recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(
                recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.value].add_recommendations(
                recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(
                recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(
                recommended_items_current_cutoff, test_user)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.value].add_recommendations(
                recommended_items_current_cutoff)
            results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.value].add_recommendations(
                recommended_items_current_cutoff)

            if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_SIMILARITY.value].add_recommendations(
                    recommended_items_current_cutoff)

        if time.time() - start_time_print > 30 or n_users_evaluated == len(self.usersToEvaluate):
            print("SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}".format(
                n_users_evaluated,
                100.0 * float(n_users_evaluated) / len(self.usersToEvaluate),
                time.time() - start_time,
                float(n_users_evaluated) / (time.time() - start_time)))

            sys.stdout.flush()
            sys.stderr.flush()

            start_time_print = time.time()

    return results_dict, n_users_evaluated
def main():
    # Set hyperparameters
    num_folds = 100
    label_name = "1"

    # Specify data location
    data_path = "Data/test_data.csv"

    # Load data into a table
    df = pd.read_csv(data_path, sep=";", index_col=0)

    # Check if any labels are missing
    print("Number of missing values:\n", df.isnull().sum())
    print()

    # Only keep the first instance if multiple instances have the same key
    num_instances_before = len(df)
    df = df[~df.index.duplicated(keep="first")]
    num_instances_diff = num_instances_before - len(df)
    if num_instances_diff > 0:
        print("Warning: {} instances removed due to duplicate keys - only keeping first occurrence!"
              .format(num_instances_diff))

    # Perform standardized preprocessing
    preprocessor = TabularPreprocessor()
    df = preprocessor.fit_transform(df)

    # Display bar chart with number of samples per class
    # seaborn.countplot(x=label_name, data=df)
    # plt.title("Original class frequencies")
    # plt.savefig("Results/original_class_frequencies.png")
    # plt.close()

    # Separate data into features and labels
    y = df[label_name]
    x = df.drop(label_name, axis="columns")

    # Get samples per class
    print("Samples per class")
    for (label, count) in zip(*np.unique(y, return_counts=True)):
        print("{}: {}".format(label, count))
    print()

    # Get number of classes
    num_classes = len(np.unique(df[label_name].values))

    # Set up classifiers
    knn = KNeighborsClassifier(weights="distance")
    knn_param_grid = {
        "n_neighbors": [int(val) for val in np.round(np.sqrt(x.shape[1])) + np.arange(5) + 1]
                       + [int(val) for val in np.round(np.sqrt(x.shape[1])) - np.arange(5) if val >= 1],
        "p": np.arange(1, 5)
    }

    dt = DecisionTreeClassifier()
    dt_param_grid = {
        "criterion": ["gini", "entropy"],
        "splitter": ["best", "random"],
        "max_depth": np.arange(1, 20),
        "min_samples_split": [2, 4, 6],
        "min_samples_leaf": [1, 3, 5, 6],
        "max_features": ["auto", "sqrt", "log2"]
    }

    rf = RandomForestClassifier(n_estimators=100, criterion="entropy", max_depth=5,
                                min_samples_split=5, min_samples_leaf=2)
    rf_param_grid = {}

    nn = MLPClassifier(hidden_layer_sizes=(32, 64, 32), activation="relu")
    nn_param_grid = {}

    clfs = {
        "knn": {"classifier": knn, "parameters": knn_param_grid},
        "dt": {"classifier": dt, "parameters": dt_param_grid},
        "rf": {"classifier": rf, "parameters": rf_param_grid},
        "nn": {"classifier": nn, "parameters": nn_param_grid}
    }

    clfs_performance = {"acc": [], "sns": [], "spc": [], "auc": []}

    # Initialize result table
    results = pd.DataFrame(index=list(clfs.keys()))

    # Iterate over classifiers
    for clf in clfs:
        # Initialize cumulated confusion matrix and fold-wise performance containers
        cms = np.zeros((num_classes, num_classes))
        performance_foldwise = {"acc": [], "sns": [], "spc": [], "auc": []}

        # Iterate over MCCV folds
        for fold_index in np.arange(num_folds):
            # Split into training and test data
            x_train, x_test, y_train, y_test = train_test_split(
                x, y, test_size=0.15, stratify=y, random_state=fold_index)

            # Perform standardization and feature imputation
            intra_fold_preprocessor = TabularIntraFoldPreprocessor(
                k="automated", normalization="standardize")
            intra_fold_preprocessor = intra_fold_preprocessor.fit(x_train)
            x_train = intra_fold_preprocessor.transform(x_train)
            x_test = intra_fold_preprocessor.transform(x_test)

            # Perform (ANOVA) feature selection
            selected_indices, x_train, x_test = univariate_feature_selection(
                x_train.values, y_train.values, x_test.values,
                score_func=f_classif, num_features="log2n")

            # # Random undersampling
            # rus = RandomUnderSampler(random_state=fold_index, sampling_strategy=0.3)
            # x_train, y_train = rus.fit_resample(x_train, y_train)

            # SMOTE oversampling
            smote = SMOTE(random_state=fold_index, sampling_strategy=1)
            x_train, y_train = smote.fit_resample(x_train, y_train)

            # Set up model
            model = clfs[clf]["classifier"]
            model.random_state = fold_index

            # Hyperparameter tuning; keep the model trained with the best set of hyperparameters
            optimized_model = RandomizedSearchCV(
                model, param_distributions=clfs[clf]["parameters"],
                cv=5, random_state=fold_index)
            optimized_model.fit(x_train, y_train)

            # Predict test data using the trained model
            y_pred = optimized_model.predict(x_test)

            # Compute performance
            cm = confusion_matrix(y_test, y_pred)
            acc = accuracy_score(y_test, y_pred)
            sns = metrics.sensitivity(y_test, y_pred)
            spc = metrics.specificity(y_test, y_pred)
            auc = metrics.roc_auc(y_test, y_pred)

            # Append performance to fold-wise and overall containers
            cms += cm
            performance_foldwise["acc"].append(acc)
            performance_foldwise["sns"].append(sns)
            performance_foldwise["spc"].append(spc)
            performance_foldwise["auc"].append(auc)

        # Calculate overall performance
        for metric in performance_foldwise:
            avg_metric = np.round(
                np.sum(performance_foldwise[metric]) / len(performance_foldwise[metric]), 2)
            clfs_performance[metric].append(avg_metric)

        # Display overall performances
        print("== {} ==".format(clf))
        print("Cumulative CM:\n", cms)
        for metric in clfs_performance:
            print("Avg {}: {}".format(metric, clfs_performance[metric][-1]))
        print()

        # Display confusion matrix
        # sns.heatmap(cms, annot=True, cmap="Blues", fmt="g")
        # plt.xlabel("Predicted")
        # plt.ylabel("Actual")
        # plt.title("{} - Confusion matrix".format(clf))
        # plt.savefig("Results/confusion_matrix-{}.png".format(clf))
        # plt.close()

    # Append performance to result table
    for metric in clfs_performance:
        results[metric] = clfs_performance[metric]

    # Save result table
    results.to_csv("performances.csv", sep=";")
    results.plot.bar(rot=45).legend(loc="upper right")
    plt.savefig("performance.png")
    plt.show()
    plt.close()
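# `metrics.sensitivity`, `metrics.specificity` and `metrics.roc_auc` used above are project
# helpers; a minimal binary-classification sketch of those quantities with scikit-learn
# (an assumption about their behavior, not the project's code; 0/1 labels assumed). Note that
# main() passes hard predictions: if the AUC helper wraps a standard ROC-AUC, the result on
# hard 0/1 predictions reduces to balanced accuracy, while a score-based AUC would need
# predicted probabilities instead.
import numpy as np
from sklearn.metrics import confusion_matrix, roc_auc_score

def binary_metrics_sketch(y_true, y_pred):
    # Confusion matrix layout for labels [0, 1]: [[TN, FP], [FN, TP]]
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[0, 1]).ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0.0  # true positive rate (recall)
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0  # true negative rate
    auc = roc_auc_score(y_true, y_pred)
    return sensitivity, specificity, auc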