Code example #1
    def evaluate_best(self, test_dataset):
        self.load_best()
        an_scores, gt_labels = self._evaluate(test_dataset)
        # AUC
        _ = metrics.roc_auc(gt_labels, an_scores, show=True)
        # Average Precision
        _ = metrics.pre_rec_curve(gt_labels, an_scores, show=True)
Code example #2
    def evaluate(self, test_dataset):
        ret_dict = {}
        an_scores, gt_labels = self._evaluate(test_dataset)
        # Min-max normalize anomaly scores to [0, 1]
        an_scores = (an_scores - np.amin(an_scores)) / (np.amax(an_scores) -
                                                        np.amin(an_scores))
        # AUC
        auc_dict = metrics.roc_auc(gt_labels, an_scores)
        ret_dict.update(auc_dict)
        # Average Precision
        p_r_dict = metrics.pre_rec_curve(gt_labels, an_scores)
        ret_dict.update(p_r_dict)
        return ret_dict
Code example #3
def metrics_calculator(masks, preds, mode_average=True, additional=False):
    batch_size, masks, predictions = mtr.standardize_for_metrics(masks, preds)
    accuracy_score = mtr.accuracy(batch_size, masks, predictions, mode_average)
    if additional:
        roc_auc_score = mtr.roc_auc(batch_size, masks, predictions,
                                    mode_average)
        jaccard_score = mtr.jaccard(batch_size, masks, predictions,
                                    mode_average)
        sens_score, spec_score, prec_score, f1_score = mtr.confusion_matrix(
            batch_size, masks, predictions, mode_average)
        pr_auc_score = mtr.precision_recall_auc(batch_size, masks, predictions,
                                                mode_average)
        iou_score = mtr.fast_hist(predictions, masks, 2)
        return (roc_auc_score, accuracy_score, jaccard_score, sens_score,
                spec_score, prec_score, f1_score, pr_auc_score, iou_score)
    return accuracy_score
Code example #4
# Metric accumulators, averaged over the evaluated users at the end
roc_auc_ = precision_ = recall_ = map_ = mrr_ = ndcg_ = 0.0
neval = 0
for test_user in range(nusers):
    # indices of the items this user interacted with in the training split
    user_profile = train[test_user].indices
    relevant_items = test[test_user].indices
    if len(relevant_items) > 0:
        neval += 1
        #
        # TODO: Here you can write to file the recommendations for each user in the test split.
        # WARNING: there is a catch with the item idx!
        #
        # this will rank *all* items
        recommended_items = recommender.recommend(user_profile,
                                                  exclude_seen=True)
        # use this instead to get only the *top-k* recommended items (warning: this can underestimate ROC-AUC for small k)
        # recommended_items = recommender.recommend(user_profile, k=at, exclude_seen=True)
        roc_auc_ += roc_auc(recommended_items, relevant_items)
        precision_ += precision(recommended_items, relevant_items, at=at)
        recall_ += recall(recommended_items, relevant_items, at=at)
        map_ += map(recommended_items, relevant_items, at=at)
        mrr_ += rr(recommended_items, relevant_items, at=at)
        ndcg_ += ndcg(recommended_items,
                      relevant_items,
                      relevance=test[test_user].data,
                      at=at)
roc_auc_ /= neval
precision_ /= neval
recall_ /= neval
map_ /= neval
mrr_ /= neval
ndcg_ /= neval
Code example #5
# You might also want to plot some generalization of the [ROC curve](http://scikit-learn.org/stable/modules/model_evaluation.html#receiver-operating-characteristic-roc) for the case of multi-label classification. The provided function *roc_auc* can do this for you. The input parameters of this function are:
#  - true labels
#  - decision functions scores
#  - number of classes

# In[81]:

from metrics import roc_auc

get_ipython().magic('matplotlib inline')

# In[82]:

n_classes = len(tags_counts)
roc_auc(y_val, y_val_predicted_scores_mybag, n_classes)

# In[83]:

n_classes = len(tags_counts)
roc_auc(y_val, y_val_predicted_scores_tfidf, n_classes)

# **Task 4 (MultilabelClassification).** Once we have the evaluation set up, we suggest that you experiment a bit with training your classifiers. We will use *F1-score weighted* as an evaluation metric. Our recommendations:
# - compare the quality of the bag-of-words and TF-IDF approaches and choose one of them;
# - for the chosen one, try *L1*- and *L2*-regularization in Logistic Regression with different coefficients (e.g. C equal to 0.1, 1, 10, 100).
#
# You could also try other improvements to the preprocessing / model, if you want. A rough sketch of such an experiment is given after the placeholder cell below.

# In[84]:

######################################
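
# The block below is a minimal, hypothetical sketch of the Task 4 experiment,
# not the reference solution. It assumes the usual variables of this notebook
# are in scope; X_train_tfidf, y_train, X_val_tfidf and y_val are assumed
# names, not defined in this snippet.
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.multiclass import OneVsRestClassifier

for penalty in ['l1', 'l2']:
    for C in [0.1, 1, 10, 100]:
        # liblinear supports both L1 and L2 penalties for LogisticRegression
        clf = OneVsRestClassifier(
            LogisticRegression(penalty=penalty, C=C, solver='liblinear'))
        clf.fit(X_train_tfidf, y_train)
        predicted = clf.predict(X_val_tfidf)
        print('penalty={}, C={}: weighted F1 = {:.4f}'.format(
            penalty, C, f1_score(y_val, predicted, average='weighted')))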
Code example #6
    top_positive_words = [index_to_words[ind] for ind in sorted_ind[-5:]]
    top_negative_words = [index_to_words[ind] for ind in sorted_ind[:5]]

    print('\nTag:\t{}'.format(tag))
    print('Top positive words:\t{}'.format(', '.join(top_positive_words)))
    print('Top negative words:\t{}\n'.format(', '.join(top_negative_words)))


if __name__ == "__main__":
    train_docs, train_labels, test_docs, test_labels, mlb_classes = load_data()
    vectorised_train_documents, vectorised_test_documents, tfidf_reversed_vocab = tfidf_features(
        train_docs, test_docs)
    predicted_scores = None
    best_model = None
    for model_name in MODEL_NAMES:
        model = train_classifier(model_name, vectorised_train_documents,
                                 train_labels, 'l2', 10)
        predicted_labels = model.predict(vectorised_test_documents)

        if model_name == 'LinearSVC':
            best_model = model
            predicted_scores = model.decision_function(
                vectorised_test_documents)

        print(model_name + ' scores')
        print_evaluation_scores(test_labels, predicted_labels)

    n_classes = len(mlb_classes)  # one class per label in the binarizer
    roc_auc(test_labels, predicted_scores, n_classes)
    print_words_for_tag(best_model, 'cotton-oil', mlb_classes,
                        tfidf_reversed_vocab)
Code example #7
    def evaluateRecommender(self, recommender_object):
        """
        :param recommender_object: the trained recommender object, a Recommender subclass
        :param URM_test_list: list of URMs to test the recommender against, or a single URM object
        :param cutoff_list: list of cutoffs to be use to report the scores, or a single cutoff
        """

        results_dict = {}

        for cutoff in self.cutoff_list:
            results_dict[cutoff] = create_empty_metrics_dict(
                self.n_items, self.n_users, recommender_object.URM_train,
                self.ignore_items_ID, self.ignore_users_ID, cutoff,
                self.diversity_object)

        start_time = time.time()
        start_time_print = time.time()

        n_eval = 0

        self.__all_items = np.arange(0, self.n_items, dtype=int)
        self.__all_items = set(self.__all_items)

        if self.ignore_items_flag:
            recommender_object.set_items_to_ignore(self.ignore_items_ID)

        for test_user in self.usersToEvaluate:

            # Since the URM is in CSR format, .indices holds the non-zero column indexes
            relevant_items = self.get_user_relevant_items(test_user)

            n_eval += 1

            self.user_specific_remove_items(recommender_object, test_user)

            # recommended_items = recommender_object.recommend(np.array(test_user), remove_seen_flag=self.exclude_seen,
            #                                                  cutoff = self.max_cutoff, remove_top_pop_flag=False, remove_CustomItems_flag=self.ignore_items_flag)
            recommended_items = recommender_object.recommend(
                np.atleast_1d(test_user),
                remove_seen_flag=self.exclude_seen,
                cutoff=self.max_cutoff,
                remove_top_pop_flag=False,
                remove_CustomItems_flag=self.ignore_items_flag)

            recommended_items = np.array(recommended_items[0])

            recommender_object.reset_items_to_ignore()

            is_relevant = np.in1d(recommended_items,
                                  relevant_items,
                                  assume_unique=True)

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                is_relevant_current_cutoff = is_relevant[0:cutoff]
                recommended_items_current_cutoff = recommended_items[0:cutoff]

                results_current_cutoff[
                    EvaluatorMetrics.ROC_AUC.value] += roc_auc(
                        is_relevant_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.PRECISION.value] += precision(
                        is_relevant_current_cutoff, len(relevant_items))
                results_current_cutoff[
                    EvaluatorMetrics.RECALL.value] += recall(
                        is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN.
                                       value] += recall_min_test_len(
                                           is_relevant_current_cutoff,
                                           relevant_items)
                results_current_cutoff[EvaluatorMetrics.MAP.value] += map(
                    is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.MRR.value] += rr(
                    is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(
                    recommended_items_current_cutoff,
                    relevant_items,
                    relevance=self.get_user_test_ratings(test_user),
                    at=cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.HIT_RATE.
                    value] += is_relevant_current_cutoff.sum()
                results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(
                    is_relevant_current_cutoff)

                results_current_cutoff[
                    EvaluatorMetrics.NOVELTY.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(
                        recommended_items_current_cutoff, test_user)
                results_current_cutoff[
                    EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.
                    value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)

                if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_SIMILARITY.
                        value].add_recommendations(
                            recommended_items_current_cutoff)

            if time.time() - start_time_print > 30 or n_eval == len(
                    self.usersToEvaluate):
                print(
                    "SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}"
                    .format(n_eval,
                            100.0 * float(n_eval) / len(self.usersToEvaluate),
                            time.time() - start_time,
                            float(n_eval) / (time.time() - start_time)))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print = time.time()

        if (n_eval > 0):

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                for key in results_current_cutoff.keys():

                    value = results_current_cutoff[key]

                    if isinstance(value, Metrics_Object):
                        results_current_cutoff[key] = value.get_metric_value()
                    else:
                        results_current_cutoff[key] = value / n_eval

                precision_ = results_current_cutoff[
                    EvaluatorMetrics.PRECISION.value]
                recall_ = results_current_cutoff[EvaluatorMetrics.RECALL.value]

                if precision_ + recall_ != 0:
                    results_current_cutoff[EvaluatorMetrics.F1.value] = 2 * (
                        precision_ * recall_) / (precision_ + recall_)

        else:
            print(
                "WARNING: No users had a sufficient number of relevant items")

        if self.ignore_items_flag:
            recommender_object.reset_items_to_ignore()

        results_run_string = self.get_result_string(results_dict)

        return (results_dict, results_run_string)
Code example #8
    def _run_evaluation_on_selected_users(self,
                                          recommender_object,
                                          usersToEvaluate,
                                          block_size=1000):

        start_time = time.time()
        start_time_print = time.time()

        results_dict = {}

        for cutoff in self.cutoff_list:
            results_dict[cutoff] = create_empty_metrics_dict(
                self.n_items, self.n_users, recommender_object.get_URM_train(),
                self.ignore_items_ID, self.ignore_users_ID, cutoff,
                self.diversity_object)

        n_users_evaluated = 0

        # Process the users to evaluate in blocks of block_size
        user_batch_start = 0
        user_batch_end = 0

        while user_batch_start < len(usersToEvaluate):

            user_batch_end = user_batch_start + block_size
            user_batch_end = min(user_batch_end, len(usersToEvaluate))

            test_user_batch_array = np.array(
                usersToEvaluate[user_batch_start:user_batch_end])
            user_batch_start = user_batch_end

            # Compute predictions for a batch of users using vectorization, much more efficient than computing it one at a time
            recommended_items_batch_list = recommender_object.recommend(
                test_user_batch_array,
                remove_seen_flag=self.exclude_seen,
                cutoff=self.max_cutoff,
                remove_top_pop_flag=False,
                remove_CustomItems_flag=self.ignore_items_flag)

            # Compute recommendation quality for each user in batch
            for batch_user_index in range(len(recommended_items_batch_list)):

                user_id = test_user_batch_array[batch_user_index]
                recommended_items = recommended_items_batch_list[
                    batch_user_index]

                # Since the URM is in CSR format, .indices holds the non-zero column indexes
                relevant_items = self.get_user_relevant_items(user_id)
                is_relevant = np.in1d(recommended_items,
                                      relevant_items,
                                      assume_unique=True)

                n_users_evaluated += 1

                for cutoff in self.cutoff_list:

                    results_current_cutoff = results_dict[cutoff]

                    is_relevant_current_cutoff = is_relevant[0:cutoff]
                    recommended_items_current_cutoff = recommended_items[
                        0:cutoff]

                    results_current_cutoff[
                        EvaluatorMetrics.ROC_AUC.value] += roc_auc(
                            is_relevant_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.PRECISION.value] += precision(
                            is_relevant_current_cutoff, len(relevant_items))
                    results_current_cutoff[
                        EvaluatorMetrics.RECALL.value] += recall(
                            is_relevant_current_cutoff, relevant_items)
                    results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN.
                                           value] += recall_min_test_len(
                                               is_relevant_current_cutoff,
                                               relevant_items)
                    results_current_cutoff[EvaluatorMetrics.MAP.value] += map(
                        is_relevant_current_cutoff, relevant_items)
                    results_current_cutoff[EvaluatorMetrics.MRR.value] += rr(
                        is_relevant_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.NDCG.value] += ndcg(
                            recommended_items_current_cutoff,
                            relevant_items,
                            relevance=self.get_user_test_ratings(user_id),
                            at=cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.HIT_RATE.
                        value] += is_relevant_current_cutoff.sum()
                    results_current_cutoff[
                        EvaluatorMetrics.ARHR.value] += arhr(
                            is_relevant_current_cutoff)

                    results_current_cutoff[
                        EvaluatorMetrics.NOVELTY.value].add_recommendations(
                            recommended_items_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_GINI.
                        value].add_recommendations(
                            recommended_items_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.SHANNON_ENTROPY.
                        value].add_recommendations(
                            recommended_items_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.COVERAGE_ITEM.
                        value].add_recommendations(
                            recommended_items_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.COVERAGE_USER.
                        value].add_recommendations(
                            recommended_items_current_cutoff, user_id)
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.
                        value].add_recommendations(
                            recommended_items_current_cutoff)
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_HERFINDAHL.
                        value].add_recommendations(
                            recommended_items_current_cutoff)

                    if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                        results_current_cutoff[
                            EvaluatorMetrics.DIVERSITY_SIMILARITY.
                            value].add_recommendations(
                                recommended_items_current_cutoff)

                if time.time(
                ) - start_time_print > 30 or n_users_evaluated == len(
                        self.usersToEvaluate):
                    print(
                        "SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}"
                        .format(
                            n_users_evaluated,
                            100.0 * float(n_users_evaluated) /
                            len(self.usersToEvaluate),
                            time.time() - start_time,
                            float(n_users_evaluated) /
                            (time.time() - start_time)))

                    sys.stdout.flush()
                    sys.stderr.flush()

                    start_time_print = time.time()

        return results_dict, n_users_evaluated
Code example #9
    def _run_evaluation_on_selected_users(self, recommender_object,
                                          usersToEvaluate):

        start_time = time.time()
        start_time_print = time.time()

        results_dict = {}

        for cutoff in self.cutoff_list:
            results_dict[cutoff] = create_empty_metrics_dict(
                self.n_items, self.n_users, recommender_object.URM_train,
                self.ignore_items_ID, self.ignore_users_ID, cutoff,
                self.diversity_object)

        n_users_evaluated = 0

        for test_user in usersToEvaluate:

            # Since the URM is in CSR format, .indices holds the non-zero column indexes
            relevant_items = self.get_user_relevant_items(test_user)

            n_users_evaluated += 1

            recommended_items = recommender_object.recommend(
                test_user,
                remove_seen_flag=self.exclude_seen,
                cutoff=self.max_cutoff,
                remove_top_pop_flag=False,
                remove_CustomItems_flag=self.ignore_items_flag)

            is_relevant = np.in1d(recommended_items,
                                  relevant_items,
                                  assume_unique=True)

            for cutoff in self.cutoff_list:

                results_current_cutoff = results_dict[cutoff]

                is_relevant_current_cutoff = is_relevant[0:cutoff]
                recommended_items_current_cutoff = recommended_items[0:cutoff]

                results_current_cutoff[
                    EvaluatorMetrics.ROC_AUC.value] += roc_auc(
                        is_relevant_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.PRECISION.value] += precision(
                        is_relevant_current_cutoff, len(relevant_items))
                results_current_cutoff[
                    EvaluatorMetrics.RECALL.value] += recall(
                        is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.RECALL_TEST_LEN.
                                       value] += recall_min_test_len(
                                           is_relevant_current_cutoff,
                                           relevant_items)
                results_current_cutoff[EvaluatorMetrics.MAP.value] += map(
                    is_relevant_current_cutoff, relevant_items)
                results_current_cutoff[EvaluatorMetrics.MRR.value] += rr(
                    is_relevant_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.NDCG.value] += ndcg(
                    recommended_items_current_cutoff,
                    relevant_items,
                    relevance=self.get_user_test_ratings(test_user),
                    at=cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.HIT_RATE.
                    value] += is_relevant_current_cutoff.sum()
                results_current_cutoff[EvaluatorMetrics.ARHR.value] += arhr(
                    is_relevant_current_cutoff)

                results_current_cutoff[
                    EvaluatorMetrics.NOVELTY.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.DIVERSITY_GINI.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.SHANNON_ENTROPY.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.COVERAGE_ITEM.value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[
                    EvaluatorMetrics.COVERAGE_USER.value].add_recommendations(
                        recommended_items_current_cutoff, test_user)
                results_current_cutoff[
                    EvaluatorMetrics.DIVERSITY_MEAN_INTER_LIST.
                    value].add_recommendations(
                        recommended_items_current_cutoff)
                results_current_cutoff[EvaluatorMetrics.DIVERSITY_HERFINDAHL.
                                       value].add_recommendations(
                                           recommended_items_current_cutoff)

                if EvaluatorMetrics.DIVERSITY_SIMILARITY.value in results_current_cutoff:
                    results_current_cutoff[
                        EvaluatorMetrics.DIVERSITY_SIMILARITY.
                        value].add_recommendations(
                            recommended_items_current_cutoff)

            if time.time() - start_time_print > 30 or n_users_evaluated == len(
                    self.usersToEvaluate):
                print(
                    "SequentialEvaluator: Processed {} ( {:.2f}% ) in {:.2f} seconds. Users per second: {:.0f}"
                    .format(
                        n_users_evaluated, 100.0 * float(n_users_evaluated) /
                        len(self.usersToEvaluate),
                        time.time() - start_time,
                        float(n_users_evaluated) / (time.time() - start_time)))

                sys.stdout.flush()
                sys.stderr.flush()

                start_time_print = time.time()

        return results_dict, n_users_evaluated
Code example #10
def main():
    # Set hyperparameters
    num_folds = 100
    label_name = "1"

    # Specify data location
    data_path = "Data/test_data.csv"

    # Load data to table
    df = pd.read_csv(data_path, sep=";", index_col=0)

    # Check if any labels are missing
    print("Number of missing values:\n", df.isnull().sum())
    print()

    # Only keep first instance if multiple instances have the same key
    num_instances_before = len(df)
    df = df[~df.index.duplicated(keep="first")]
    num_instances_diff = num_instances_before - len(df)
    if num_instances_diff > 0:
        print(
            "Warning: {} instances removed due to duplicate keys - only keeping first occurrence!"
            .format(num_instances_diff))

    # Perform standardized preprocessing
    preprocessor = TabularPreprocessor()
    df = preprocessor.fit_transform(df)

    # Display bar chart with number of samples per class
    # seaborn.countplot(x=label_name, data=df)
    # plt.title("Original class frequencies")
    # plt.savefig("Results/original_class_frequencies.png")
    # plt.close()

    # Separate data into training and test
    y = df[label_name]
    x = df.drop(label_name, axis="columns")

    # Get samples per class
    print("Samples per class")
    for (label, count) in zip(*np.unique(y, return_counts=True)):
        print("{}: {}".format(label, count))
    print()

    # Get number of classes
    num_classes = len(np.unique(df[label_name].values))

    # Setup classifiers
    knn = KNeighborsClassifier(weights="distance")
    knn_param_grid = {
        "n_neighbors":
        [int(val)
         for val in np.round(np.sqrt(x.shape[1])) + np.arange(5) + 1] +
        [
            int(val)
            for val in np.round(np.sqrt(x.shape[1])) - np.arange(5) if val >= 1
        ],
        "p":
        np.arange(1, 5)
    }

    dt = DecisionTreeClassifier()
    dt_param_grid = {
        "criterion": ["gini", "entropy"],
        "splitter": ["best", "random"],
        "max_depth": np.arange(1, 20),
        "min_samples_split": [2, 4, 6],
        "min_samples_leaf": [1, 3, 5, 6],
        "max_features": ["auto", "sqrt", "log2"]
    }

    rf = RandomForestClassifier(n_estimators=100,
                                criterion="entropy",
                                max_depth=5,
                                min_samples_split=5,
                                min_samples_leaf=2)
    rf_param_grid = {}

    nn = MLPClassifier(hidden_layer_sizes=(32, 64, 32), activation="relu")
    nn_param_grid = {}

    clfs = {
        "knn": {
            "classifier": knn,
            "parameters": knn_param_grid
        },
        "dt": {
            "classifier": dt,
            "parameters": dt_param_grid
        },
        "rf": {
            "classifier": rf,
            "parameters": rf_param_grid
        },
        "nn": {
            "classifier": nn,
            "parameters": nn_param_grid
        }
    }

    clfs_performance = {"acc": [], "sns": [], "spc": [], "auc": []}

    # Initialize result table
    results = pd.DataFrame(index=list(clfs.keys()))

    # Iterate over classifiers
    for clf in clfs:

        # Initialize cumulated confusion matrix and fold-wise performance containers
        cms = np.zeros((num_classes, num_classes))
        performance_foldwise = {"acc": [], "sns": [], "spc": [], "auc": []}

        # Iterate over MCCV
        for fold_index in np.arange(num_folds):

            # Split into training and test data
            x_train, x_test, y_train, y_test = train_test_split(
                x, y, test_size=0.15, stratify=y, random_state=fold_index)

            # Perform standardization and feature imputation
            intra_fold_preprocessor = TabularIntraFoldPreprocessor(
                k="automated", normalization="standardize")
            intra_fold_preprocessor = intra_fold_preprocessor.fit(x_train)
            x_train = intra_fold_preprocessor.transform(x_train)
            x_test = intra_fold_preprocessor.transform(x_test)

            # Perform (ANOVA) feature selection
            selected_indices, x_train, x_test = univariate_feature_selection(
                x_train.values,
                y_train.values,
                x_test.values,
                score_func=f_classif,
                num_features="log2n")

            # # Random undersampling
            # rus = RandomUnderSampler(random_state=fold_index, sampling_strategy=0.3)
            # x_train, y_train = rus.fit_resample(x_train, y_train)

            # SMOTE
            smote = SMOTE(random_state=fold_index, sampling_strategy=1)
            x_train, y_train = smote.fit_resample(x_train, y_train)

            # Setup model
            model = clfs[clf]["classifier"]
            model.random_state = fold_index

            # Hyperparameter tuning and keep model trained with the best set of hyperparameters
            optimized_model = RandomizedSearchCV(
                model,
                param_distributions=clfs[clf]["parameters"],
                cv=5,
                random_state=fold_index)
            optimized_model.fit(x_train, y_train)

            # Predict test data using trained model
            y_pred = optimized_model.predict(x_test)

            # Compute performance
            cm = confusion_matrix(y_test, y_pred)
            acc = accuracy_score(y_test, y_pred)
            sns = metrics.sensitivity(y_test, y_pred)
            spc = metrics.specificity(y_test, y_pred)
            auc = metrics.roc_auc(y_test, y_pred)

            # Append performance to fold-wise and overall containers
            cms += cm
            performance_foldwise["acc"].append(acc)
            performance_foldwise["sns"].append(sns)
            performance_foldwise["spc"].append(spc)
            performance_foldwise["auc"].append(auc)

        # Calculate overall performance
        for metric in performance_foldwise:
            avg_metric = np.round(
                np.sum(performance_foldwise[metric]) /
                len(performance_foldwise[metric]), 2)
            clfs_performance[metric].append(avg_metric)

        # Display overall performances
        print("== {} ==".format(clf))
        print("Cumulative CM:\n", cms)
        for metric in clfs_performance:
            print("Avg {}: {}".format(metric, clfs_performance[metric][-1]))
        print()

        # Display confusion matrix
        # sns.heatmap(cms, annot=True, cmap="Blues", fmt="g")
        # plt.xlabel("Predicted")
        # plt.ylabel("Actual")
        # plt.title("{} - Confusion matrix".format(clf))
        # plt.savefig("Results/confusion_matrix-{}.png".format(clf))
        # plt.close()

    # Append performance to result table
    for metric in clfs_performance:
        results[metric] = clfs_performance[metric]

    # Save result table
    results.to_csv("performances.csv", sep=";")
    results.plot.bar(rot=45).legend(loc="upper right")
    plt.savefig("performance.png".format(clf))
    plt.show()
    plt.close()