Example #1
def test_model(model, samples_train, categorical_labels_train, samples_test,
               categorical_labels_test):
    '''
    Test the Keras model given as input with predict_classes().

    Epochs (nb_epoch) is the number of times the model is exposed to the training dataset.
    Batch size (batch_size) is the number of training instances shown to the model before a weight update is performed.
    '''

    predictions = model.predict_classes(samples_test, verbose=2)

    # Calculate soft predictions (class probabilities)
    soft_values = model.predict(samples_test, verbose=2)
    mc.calculate_stats(predictions,
                       categorical_labels_test,
                       'mlp_confusion_matrix',
                       show_fig=False)

    training_predictions = model.predict_classes(samples_train, verbose=2)
    training_soft_values = model.predict(samples_train, verbose=2)

    # print(len(categorical_labels_test))
    # print(categorical_labels_test)

    # print(len(predictions))
    # print(predictions)

    # print(len(soft_values))
    # print(soft_values)

    # Accuracy, F-measure, and g-mean
    accuracy = accuracy_score(categorical_labels_test, predictions)
    fmeasure = f1_score(categorical_labels_test, predictions, average='macro')
    macro_gmean = mean(
        im.geometric_mean_score(categorical_labels_test,
                                predictions,
                                average=None))

    # Accuracy, F-measure, and g-mean on training set
    training_accuracy = accuracy_score(categorical_labels_train,
                                       training_predictions)
    training_fmeasure = f1_score(categorical_labels_train,
                                 training_predictions,
                                 average='macro')
    training_macro_gmean = mean(
        im.geometric_mean_score(categorical_labels_train,
                                training_predictions,
                                average=None))

    return (soft_values, predictions, training_soft_values, training_predictions,
            accuracy, fmeasure, macro_gmean,
            training_accuracy, training_fmeasure, training_macro_gmean)
Example #2
def test_geometric_mean_multiclass():
    """Test geometric mean for multiclass classification task"""
    y_true, y_pred, _ = make_prediction(binary=False)

    # Compute the geometric mean for each of the classes
    geo_mean = geometric_mean_score(y_true, y_pred, average=None)
    assert_array_almost_equal(geo_mean, [0.85, 0.29, 0.7], 2)

    # average tests
    geo_mean = geometric_mean_score(y_true, y_pred, average='macro')
    assert_almost_equal(geo_mean, 0.68, 2)

    geo_mean = geometric_mean_score(y_true, y_pred, average='weighted')
    assert_almost_equal(geo_mean, 0.65, 2)
def decisiontree(X_tr, Y_tr, X_te, Y_te):
    # X_tr, X_te = normalize_data(X_tr, X_te, "minmax")
    if Y_tr.shape[1] > 1:
        Y_tr = np.argmax(Y_tr, axis=1)
        Y_te = np.argmax(Y_te, axis=1)
    param_grid = {'max_depth': np.arange(3, 6)}

    tree = GridSearchCV(DecisionTreeClassifier(), param_grid)

    tree.fit(X_tr, Y_tr)
    start = time.time()
    y_pred = tree.predict(X_te)
    end = time.time()
    elapsed = (end - start) / float(len(X_te))
    acc = accuracy_score(Y_te, y_pred)
    fpr_vot, tpr_vot, _ = roc_curve(Y_te,
                                    y_pred,
                                    pos_label=1,
                                    drop_intermediate=False)
    roc_auc_vot = auc(fpr_vot, tpr_vot)
    cmat = classification_report_imbalanced(Y_te, y_pred)
    print("Decision tree")
    # print (cmat)

    geo = geometric_mean_score(Y_te, y_pred)
    f1 = f1_score(Y_te, y_pred, average='micro')

    print('The auc is {} '.format(roc_auc_vot))
    return roc_auc_vot, elapsed
def svm(X_tr, Y_tr, X_te, Y_te):
    # bw = (len(X_tr)/2.0)**0.5        #default value in One-class SVM
    # gamma = 1/(2*bw*bw)
    X_tr, X_te = normalize_data(X_tr, X_te, "minmax")
    if Y_tr.shape[1] > 1:
        Y_tr = np.argmax(Y_tr, axis=1)
        Y_te = np.argmax(Y_te, axis=1)
    # parameters =  [{'kernel': ['rbf'], 'gamma': [1e-3],
    #                 'C': [1]}]
    # {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
    # svc = svm.SVC()
    # clf = GridSearchCV(svc, parameters,cv= 5)
    # clf = SVC (gamma = gamma)
    clf = LinearSVC(random_state=0)
    clf.fit(X_tr, Y_tr)
    start = time.time()
    y_pred = clf.predict(X_te)
    end = time.time()
    elapsed = (end - start) / float(len(X_te))
    acc = accuracy_score(Y_te, y_pred)
    fpr_vot, tpr_vot, _ = roc_curve(Y_te,
                                    y_pred,
                                    pos_label=1,
                                    drop_intermediate=False)
    roc_auc_vot = auc(fpr_vot, tpr_vot)
    cmat = classification_report_imbalanced(Y_te, y_pred)
    print("SVM")
    geo = geometric_mean_score(Y_te, y_pred)
    f1 = f1_score(Y_te, y_pred, average='macro')
    print('The auc is {} '.format(roc_auc_vot))
    return roc_auc_vot, elapsed
Example #5
def get_output(labels, predictions, data_option = None, t=0.5, to_plot = False, pos_label = 1):
    predicted_classes = threshold(predictions, t)
    true_classes = labels
    conf_mat = confusion_matrix(y_true = true_classes, y_pred = predicted_classes)
    #report = classification_report(true_classes, predicted_classes)
    AUROC = []
    AUPR = []
    # print(np.where(labels==1))
    if np.count_nonzero(labels) > 0 and np.count_nonzero(labels) != labels.shape[0]: #Makes sure both classes present

        fpr, tpr, thresholds = roc_curve(y_true = true_classes, y_score = predictions, pos_label = pos_label)
        #auc1 = roc_auc_score(y_true = labels, y_score = predictions)
        AUROC = auc(fpr, tpr)

        precision, recall, thresholds = precision_recall_curve(true_classes, predictions)
        AUPR = auc(recall, precision)
        # if auc1<0.5:

        #     auc1 = roc_auc_score(y_true = 1-labels, y_score = predictions)
        #print('ROC AUC is ', auc1)
        if to_plot == True:
            plot_ROC_AUC(fpr,tpr, AUROC, data_option)
    else:
        print('only one class present')
        #g_mean = geometric_mean_score(labels, predicted_classes)
    g_mean = geometric_mean_score(labels, predicted_classes)
        # print(report)
        # print("\n")
        #print(conf_mat)

    return AUROC, conf_mat, g_mean, AUPR
Example #6
def compute_metrics(y_test,
                    y_pred,
                    y_proba=None,
                    average='weighted',
                    return_index=False):
    """
    Function computing metrics of interest for a sets of prediction

    :input y_test: pd.DataFrame or np.array of original label
    :input y_pred: pd.DataFrame or np.array of predicted label

    :output red: list of value for metrics, in order - Accuracy - Precision - Recall - F1 Score - Sensitivity - Specifity
    """
    if return_index:
        return [
            'accuracy', 'precision', 'recall', 'f1_score', 'sensitivity_score',
            'specificity_score', 'geometric_mean_score',
            'average_precision_score'
        ]
    else:
        res = []
        res.append(accuracy_score(y_test, y_pred))
        res.append(precision_score(y_test, y_pred, average=average))
        res.append(recall_score(y_test, y_pred, average=average))
        res.append(f1_score(y_test, y_pred, average=average))
        res.append(sensitivity_score(y_test, y_pred, average=average))
        res.append(specificity_score(y_test, y_pred, average=average))
        res.append(geometric_mean_score(y_test, y_pred, average=average))
        if y_proba is not None:
            res.append(
                average_precision_score(y_test, y_proba, average=average))
        return res
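
A minimal usage sketch for compute_metrics (the label vectors below are made up for illustration); calling it once with return_index=True yields metric names that can be paired with the computed values. It assumes the scorers imported by the snippet above (accuracy_score, sensitivity_score, geometric_mean_score, etc.) are in scope.

# Hypothetical example data for compute_metrics defined above.
y_test = [0, 1, 1, 0, 1, 0, 1, 1]
y_pred = [0, 1, 0, 0, 1, 1, 1, 1]

names = compute_metrics(None, None, return_index=True)
values = compute_metrics(y_test, y_pred)

# Without y_proba the last metric (average_precision_score) is not computed,
# so zip() simply drops that name.
print(dict(zip(names, values)))
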
def cross_validate(model, x, y, cv=5):
    kf = KFold(n_splits=cv, random_state=42, shuffle=True)

    results = {
        "recall": [],
        "accuracy": [],
        "f1": [],
        "geometric-gmean": [],
        "average_precision_score": []
    }

    for train_index, test_index in kf.split(x):
        fit = model.fit([x[index] for index in train_index],
                        [y[index] for index in train_index])

        predictions = fit.predict([x[index] for index in test_index])
        y_true = [y[index] for index in test_index]

        results["recall"].append(recall_score(y_true, predictions))
        results["accuracy"].append(accuracy_score(y_true, predictions))
        results["f1"].append(f1_score(y_true, predictions))
        results["geometric-gmean"].append(
            geometric_mean_score(y_true, predictions, average='weighted'))
        results["average_precision_score"].append(
            average_precision_score(y_true, predictions))

    return results
Example #8
def randomforest(X_tr, Y_tr, X_te, Y_te):
    if Y_tr.shape[1] > 1:
        Y_tr = np.argmax(Y_tr, axis=1)
        Y_te = np.argmax(Y_te, axis=1)
    rfc = RandomForestClassifier(n_jobs=-1,
                                 max_features='sqrt',
                                 n_estimators=40,
                                 oob_score=True)
    clf = RandomForestClassifier(n_estimators=100,
                                 max_depth=80,
                                 random_state=0)

    param_grid = {'n_estimators': [5, 10, 20, 40, 80, 150]}
    clf = GridSearchCV(estimator=rfc, param_grid=param_grid)
    clf.fit(X_tr, Y_tr)
    y_pred = clf.predict(X_te)
    acc = accuracy_score(Y_te, y_pred)
    fpr_vot, tpr_vot, _ = roc_curve(Y_te,
                                    y_pred,
                                    pos_label=1,
                                    drop_intermediate=False)
    roc_auc_vot = auc(fpr_vot, tpr_vot)
    cmat = classification_report(Y_te, y_pred)
    print(cmat)
    geo = geometric_mean_score(Y_te, y_pred)
    f1 = f1_score(Y_te, y_pred, average='micro')
    print('The geometric mean is {}'.format(geo))
    cnf_matrix = confusion_matrix(Y_te, y_pred)
    print(cnf_matrix)
    print('The auc is {}'.format(roc_auc_vot))
    print('The f1 is {}'.format(f1))

    return acc
Example #9
def calc_metrics_radar(y_true, y_prob):
    # calculate the metrics for prediction probabilities from RaDaR
    vfunc = np.vectorize(lambda x: 1 if x > 0.05 else 0)
    y_pred = vfunc(y_prob).ravel()
    precision, recall, _ = precision_recall_curve(y_true, y_prob)
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    roc_auc = auc(fpr, tpr)
    pr_auc = average_precision_score(y_true, y_prob)
    exp_pos = np.sum(y_prob)
    f_score = f1_score(y_true, y_pred)
    g_mean = geometric_mean_score(y_true, y_pred)

    loss = log_loss(y_true, y_prob)

    metrics = {
        'Loss': loss,
        'PR_AUC': pr_auc,
        'ROC_AUC': roc_auc,
        'F_score': f_score,
        'G_mean': g_mean,
        'Expected_#_D60': exp_pos,
        'Actual_#_D60': np.sum(y_true),
        'Diff_D60': abs(np.sum(y_true) - exp_pos),
        'Ratio_D60': np.sum(y_true) / exp_pos,
    }
    return metrics
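
A minimal usage sketch for calc_metrics_radar with made-up probabilities; it assumes the scorers used above (precision_recall_curve, roc_curve, average_precision_score, f1_score, geometric_mean_score, log_loss) are imported, and note the hard-coded 0.05 decision threshold.

import numpy as np

# Hypothetical ground truth and predicted probabilities.
y_true = np.array([0, 0, 0, 1, 0, 1, 0, 1, 0, 0])
y_prob = np.array([0.01, 0.03, 0.20, 0.60, 0.04, 0.07, 0.02, 0.90, 0.03, 0.05])

radar_metrics = calc_metrics_radar(y_true, y_prob)
print(radar_metrics['G_mean'], radar_metrics['ROC_AUC'])
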
Example #10
def randomforest_cross_validation(train_x, train_y):
    np.random.seed(100)
    clf = ensemble.RandomForestClassifier()
    
    clf.fit(train_x, train_y)
    
    #calculate the accuracy
    accuracy = cross_val_score(clf, train_x, train_y, cv=10, scoring='accuracy')
    print("accuracy: %f" % accuracy.mean() + '\n')
    #print(accuracy)

    #calculate the precision
    precision = cross_val_score(clf, train_x, train_y, cv=10, scoring='precision_macro')
    print("precision: %f" % precision.mean() + '\n')

    #calculate the recall score
    recall = cross_val_score(clf, train_x, train_y, cv=10, scoring='recall_macro')
    print("recall: %f" % recall.mean() + '\n')

    #calculate the f_measure
    f_measure = cross_val_score(clf, train_x, train_y, cv=10, scoring='f1_macro')
    print("f_measure: %f " % f_measure.mean() + '\n')

    #generate classification report and MCC and G-mean value
    y_pred = cross_val_predict(clf, train_x, train_y, cv=10)
    G_mean = geometric_mean_score(train_y, y_pred)
    MCC = matthews_corrcoef(train_y, y_pred)
    print("G_mean: %f" % G_mean + '\n')
    print("MCC: %f" % MCC + '\n')

    print("Classification_report:")
    print(metrics.classification_report(train_y, y_pred))
    
    return clf
Example #11
def calculate_performance(labels, predictions):
    output = dict()
    output["balanced_accuracy"] = balanced_accuracy_score(
        labels, predictions[0])
    output["gmean"] = metrics.geometric_mean_score(labels, predictions[0])
    output["accuracy"] = accuracy_score(labels, predictions[0])
    output["f1score"] = f1_score(labels, predictions[0])
    output["recall"] = recall_score(labels, predictions[0])
    output["precision"] = precision_score(labels, predictions[0])

    output["auc"] = roc_auc_score(labels, predictions[1][:, 1])
    output["prc"] = average_precision_score(labels, predictions[1][:, 1])

    tn, fp, fn, tp = confusion_matrix(labels, predictions[0]).ravel()
    output["tpr"] = float(tp) / (float(tp) + float(fn))
    output["tnr"] = float(tn) / (float(tn) + float(fp))
    output["opm"] = (output['gmean'] + output['balanced_accuracy'] +
                     output['f1score'] + output['tpr'] + output["tnr"]) / 5.

    output["opm_prc"] = (output['gmean'] + output['prc'] +
                         output['balanced_accuracy'] + output['f1score'] +
                         output['tpr'] + output["tnr"]) / 6.
    output["opm_auc"] = (output['gmean'] + output['auc'] +
                         output['balanced_accuracy'] + output['f1score'] +
                         output['tpr'] + output["tnr"]) / 6.

    return output
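
A minimal usage sketch showing the shape calculate_performance expects for predictions (hard labels in predictions[0], a probability matrix in predictions[1]); the dataset and classifier are hypothetical, and the metric imports used above are assumed to be available.

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Hypothetical imbalanced binary data and a fitted classifier.
X, y = make_classification(n_samples=500, weights=[0.9, 0.1], random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X, y)

predictions = [clf.predict(X), clf.predict_proba(X)]
output = calculate_performance(y, predictions)
print(output['gmean'], output['opm'])
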
def f_per_particle(m, alpha):
    """Computes for the objective function per particle

    Inputs
    ------
    m : numpy.ndarray
        Binary mask that can be obtained from BinaryPSO, will
        be used to mask features.
    alpha: float (default is 0.5)
        Constant weight for trading-off classifier performance
        and number of features

    Returns
    -------
    float
        Computed objective function value
    """
    total_features = 19
    # Get the subset of the features from the binary mask
    if np.count_nonzero(m) == 0:
        X_subset = X
    else:
        X_subset = X[:,m==1]
    # Perform classification and store performance in P
    classifier.fit(X_subset, y)
    from imblearn.metrics import geometric_mean_score
    P = geometric_mean_score(y, classifier.predict(X_subset))  # score against the labels used for fitting
    # Compute for the objective function
    j = (alpha * (1.0 - P) + (1.0 - alpha) * (1 - (X_subset.shape[1] / total_features)))

    return j
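
To make the trade-off above concrete, here is a small self-contained sketch that evaluates the same objective for one hand-made feature mask; the dataset, classifier, and alpha value are hypothetical stand-ins for the globals assumed by f_per_particle.

import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from imblearn.metrics import geometric_mean_score

# Hypothetical stand-ins for the globals used above (X, y, classifier).
X, y = make_classification(n_samples=200, n_features=19, weights=[0.9, 0.1],
                           random_state=0)
classifier = LogisticRegression(max_iter=1000)

m = np.zeros(19, dtype=int)
m[:5] = 1                               # keep only the first five features
X_subset = X[:, m == 1]
classifier.fit(X_subset, y)
P = geometric_mean_score(y, classifier.predict(X_subset))

alpha = 0.5
j = alpha * (1.0 - P) + (1.0 - alpha) * (1 - X_subset.shape[1] / 19)
print(j)                                # lower is better for a PSO minimizer
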
Example #13
def calc_metrics(y_test, pred, auc, i):
    sen = metrics.sensitivity_score(y_test, pred, pos_label=1)
    spe = metrics.specificity_score(y_test, pred, pos_label=1)
    geo = metrics.geometric_mean_score(y_test, pred, pos_label=1)
    index = ['sm', 'b1', 'b2', 'enn', 'tom', 'ada', 'mnd']
    metrics_list = [index[i], sen, spe, geo, auc]
    return metrics_list
Example #14
def decisiontree(X_tr, Y_tr, X_te, Y_te):
    if Y_tr.shape[1] > 1:
        Y_tr = np.argmax(Y_tr, axis=1)
        Y_te = np.argmax(Y_te, axis=1)
    param_grid = {'max_depth': [5, 6, 7, 8, 9, 10, 50, 100]}
    tree = GridSearchCV(DecisionTreeClassifier(), param_grid)

    tree.fit(X_tr, Y_tr)
    y_pred = tree.predict(X_te)
    acc = accuracy_score(Y_te, y_pred)
    fpr_vot, tpr_vot, _ = roc_curve(Y_te,
                                    y_pred,
                                    pos_label=1,
                                    drop_intermediate=False)
    roc_auc_vot = auc(fpr_vot, tpr_vot)
    cmat = classification_report_imbalanced(Y_te, y_pred)
    print("Decision tree")
    print(cmat)
    cnf_matrix = confusion_matrix(Y_te, y_pred)
    print(cnf_matrix)
    geo = geometric_mean_score(Y_te, y_pred)
    f1 = f1_score(Y_te, y_pred, average='micro')
    print('The geometric mean is {}'.format(geo))
    print('The auc is {}'.format(roc_auc_vot))
    print('The f1 is {}'.format(f1))

    return acc
    def get_reward(self, train_x, train_y, train_weights, valid_x, valid_y, test_x, test_y):
        '''Train the classifier with supervised learning

        :param train_x:
        :param train_y:
        :param train_weights:
        :param valid_x:
        :param valid_y:
        :return: The validation reward (the metric depends on the task)
        '''
        from imblearn.metrics import geometric_mean_score
        from sklearn.metrics import matthews_corrcoef
        idx = train_weights == 1
        x = train_x[idx]
        y = train_y[idx]
        self.env.fit(x, np.argmax(y, axis=1).astype('int32'))
        if self.task == 'vehicle':
            preds = self.env.predict(valid_x)
            valid_reward = geometric_mean_score(np.argmax(valid_y, axis=1).astype('int32'), preds)
        elif self.task == 'page':
            preds = self.env.predict(valid_x)
            valid_reward = matthews_corrcoef(np.argmax(valid_y, axis=1).astype('int32'), preds)
        elif self.task == 'spam':
            preds = self.env.predict(valid_x)
            valid_reward = evaluate_f2(np.argmax(valid_y, axis=1).astype('int32'), preds)  # for spam
        elif self.task == 'credit':
            preds = self.env.predict_proba(valid_x)[:, 1]
            valid_reward = evaluate_auc_prc(np.argmax(valid_y, axis=1).astype('int32'), preds)
        return valid_reward, valid_reward, valid_reward
Example #16
def svm(X_tr, Y_tr, X_te, Y_te):
    if Y_tr.shape[1] > 1:
        Y_tr = np.argmax(Y_tr, axis=1)
        Y_te = np.argmax(Y_te, axis=1)
    parameters = [{
        'kernel': ['rbf'],
        'gamma': [1e-3, 1e-2, 1e-1, 1],
        'C': [1]
    }]
    #{'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]
    from sklearn.svm import SVC  # imported locally: the enclosing function name shadows the sklearn.svm module
    svc = SVC()
    clf = GridSearchCV(svc, parameters, cv=5)

    clf.fit(X_tr, Y_tr)
    y_pred = clf.predict(X_te)
    acc = accuracy_score(Y_te, y_pred)
    fpr_vot, tpr_vot, _ = roc_curve(Y_te,
                                    y_pred,
                                    pos_label=1,
                                    drop_intermediate=False)
    roc_auc_vot = auc(fpr_vot, tpr_vot)
    cmat = classification_report_imbalanced(Y_te, y_pred)
    print("SVM")

    print(cmat)

    cnf_matrix = confusion_matrix(Y_te, y_pred)
    print(cnf_matrix)
    geo = geometric_mean_score(Y_te, y_pred)
    f1 = f1_score(Y_te, y_pred, average='micro')
    print('The geometric mean is {}'.format(geo))
    print('The auc is {}'.format(roc_auc_vot))
    print('The f1 is {}'.format(f1))

    return acc
Example #17
def validate_easy_ensemble(estimator, X, y):
    acc = []
    b_acc = []
    a_p_c = []
    roc = []
    gm = []
    for key, x_val in zip(X.keys(), X.values()):
        preds = estimator.predict(x_val)

        acc.append(accuracy_score(y[key], preds))
        b_acc.append(balanced_accuracy_score(y[key], preds))
        a_p_c.append(average_precision_score(y[key], preds))
        roc.append(roc_auc_score(y[key], preds))
        gm.append(geometric_mean_score(y[key], preds))

    scores = {
        'Accuracy Score = ': np.round(np.mean(acc), 3),
        'Accuracy Std = ': np.round(np.std(acc), 3),
        'Balanced Accuracy Score = ': np.round(np.mean(b_acc), 3),
        'Balanced Accuracy Std = ': np.round(np.std(b_acc), 3),
        'Average Precision Recall Score = ': np.round(np.mean(a_p_c), 3),
        'Average Precision Recall Std = ': np.round(np.std(a_p_c), 3),
        'Roc Auc Score = ': np.round(np.mean(roc), 3),
        'Roc Auc Std = ': np.round(np.std(roc), 3),
        'G Mean Score = ': np.round(np.mean(gm), 3),
        'G Mean Std = ': np.round(np.std(gm), 3)
    }

    return scores
Example #18
def performance_summary(
    clf: OptimalSamplingClassifier,
    X: np.ndarray,
    y: np.ndarray,
    info: Optional[Dict[str, Any]] = None,
) -> Dict[str, float]:
    predicted_proba = clf.predict_proba(X)
    predicted = clf.predict(X)
    nominal_proba = (y == clf.positive_class).mean()
    return dict(model=str(clf.estimator).replace("\n", "").replace(" ", ""),
                class_ratio=1 / nominal_proba,
                weight_ratio=clf.positive_weight / clf.negative_weight,
                sampling_probability=clf._sampling_proba,
                previous_probabilities=clf._prev_sampling_probas,
                cross_val_probabilities=clf._cross_val_sampling_probas,
                sampling_ratio=clf._sampling_proba / nominal_proba,
                iter_to_converge=clf._iter_count,
                accuracy=accuracy_score(y, predicted),
                sensitivity=sensitivity_score(y, predicted),
                specificity=specificity_score(y, predicted),
                precision=precision_score(y, predicted) if
                (predicted == clf.positive_class).sum() > 0 else None,
                recall=recall_score(y, predicted) if
                (predicted == clf.positive_class).sum() > 0 else None,
                f1_score=f1_score(y, predicted),
                geometric_mean_score=geometric_mean_score(y, predicted) if
                (predicted == clf.positive_class).sum() > 0 else None,
                roc_auc_score=roc_auc_score(y, predicted_proba),
                average_precision_score=average_precision_score(
                    y, predicted_proba),
                weighted_loss=clf.weighted_loss(X, y).mean(),
                cost=clf.cost(X, y).mean(),
                **(info if info else {}))
Example #19
def evalSampling(sampler, classifier, Xtrain, Xtest,ytrain, ytest):
    """Evaluate a sampling method with a given classifier and dataset
    
    Keyword arguments:
    sampler -- the sampling method to employ. None for no sampling
    classifer -- the classifier to use after sampling
    train -- (X, y) for training
    test -- (Xt, yt) for testing
    
    Returns:
    A tuple containing precision, recall, f1 score, AUC of ROC, Cohen's Kappa score, and 
    geometric mean score.
    """
    X = Xtrain
    y = ytrain
    Xt = Xtest
    yt = ytest
    
    if sampler is not None:
        X_resampled, y_resampled = sampler.fit_sample(X, y)
        classifier.fit(X_resampled, y_resampled)
    else:
        classifier.fit(X, y)
        
    yp = classifier.predict(Xt)
    yProb = classifier.predict_proba(Xt)[:,1] # Indicating class value 1 (not 0)

    precision = precision_score(yt, yp)
    recall    = recall_score(yt, yp)
    f1        = f1_score(yt, yp)
    rocauc    = roc_auc_score(yt, yProb)
    kappa     = cohen_kappa_score(yt, yp)
    gmean     = geometric_mean_score(yt, yp)
    
    return (precision, recall, f1, rocauc, kappa, gmean)
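
A minimal usage sketch for evalSampling, assuming imblearn's SMOTE and a scikit-learn random forest (the dataset below is synthetic and purely illustrative); note that newer imblearn releases rename fit_sample to fit_resample.

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Hypothetical imbalanced data split into train and test sets.
X, y = make_classification(n_samples=1000, weights=[0.9, 0.1], random_state=0)
Xtr, Xte, ytr, yte = train_test_split(X, y, stratify=y, random_state=0)

precision, recall, f1, rocauc, kappa, gmean = evalSampling(
    SMOTE(random_state=0), RandomForestClassifier(random_state=0),
    Xtr, Xte, ytr, yte)
print('G-mean with SMOTE: {:.3f}'.format(gmean))
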
Example #20
def test_clf(clf, X_test, Y_test):
    print(clf)
    try:
        y_prob = clf.predict_proba(X_test)[:, 1]
        roc_score = roc_auc_score(Y_test, y_prob)
        fpr, tpr, threshold = roc_curve(Y_test, y_prob)
        plt.plot(fpr, tpr, label='ROAUC')
        plt.plot(1 - fpr, tpr, 'r')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROAUC SCORE = %.2f' % roc_score)
        plt.legend(loc='best')
        plt.show()
        y_predictions = np.round(y_prob)
    except Exception:
        # fall back to hard predictions when predict_proba is unavailable
        y_pred = clf.predict(X_test)
        y_predictions = np.round(y_pred)

    print('ACCURACY = ', accuracy_score(Y_test, y_predictions))
    print('GEOMETRIC MEAN SCORE = ',
          geometric_mean_score(Y_test, y_predictions))
    print(classification_report(Y_test, y_predictions))
    tn, fp, fn, tp = confusion_matrix(Y_test, y_predictions).ravel()
    metrics = {}
    metrics['TN'] = tn
    metrics['FP'] = fp
    metrics['FN'] = fn
    metrics['TP'] = tp
    print('confusion matrix : ', metrics)
def test_geometric_mean_support_binary():
    y_true, y_pred, _ = make_prediction(binary=True)

    # compute the geometric mean for the binary problem
    geo_mean = geometric_mean_score(y_true, y_pred)

    assert_allclose(geo_mean, 0.77, rtol=R_TOL)
Example #23
def evaluate(X, y, estm):
    # Performance metrics
    y_pred = estm.predict(X)
    print(confusion_matrix(y, y_pred).ravel())
    tn, fp, fn, tp = confusion_matrix(y, y_pred).ravel()
    # ROC curve
    try:
        if "decision_function" not in dir(estm):
            y_prob = estm.predict_proba(X)[:, 1]
        else:
            y_prob = estm.decision_function(X)
        pre, rec, _ = precision_recall_curve(y, y_prob)
        fpr, tpr, _ = roc_curve(y, y_prob)
        aucroc = auc(fpr, tpr)
        aucpr = auc(rec, pre)
    except AttributeError:
        print("Classifier don't have predict_proba or decision_function, ignoring roc_curve.")
        pre, rec = None, None
        fpr, tpr = None, None
        aucroc = None
        aucpr = None
    eval_dictionary = {
        "CM": confusion_matrix(y, y_pred),  # Confusion matrix
        "ACC": (tp + tn) / (tp + fp + fn + tn),  # accuracy
        "F1": fbeta_score(y, y_pred, beta=1),
        "F2": fbeta_score(y, y_pred, beta=2),
        "GMean": geometric_mean_score(y, y_pred, average='binary'),
        "SEN": tp / (tp + fn),
        "PREC": tp / (tp + fp),
        "SPEC": tn / (tn + fp),
        "MCC": matthews_corrcoef(y, y_pred),
        "PRCURVE": {"precision": pre, "recall": rec, "aucpr": aucpr},
        "ROCCURVE": {"fpr": fpr, "tpr": tpr, "aucroc": aucroc}
    }
    return eval_dictionary
Example #24
def cros_val(X_train_folds, y_train_folds, X_test_folds, y_test_folds, models):
    metrics = {'acc':{}, 'f1':{}, 'gmean':{}, 'precision':{}, 'recall':{}}
    # Cross-validation
    for name, model in models.items():
        print(f'\nModel {name}: ')
        acc_folds = []
        f1_folds = []
        gmean_folds = []
        precision_folds = []
        recall_folds = []
        for i in range(len(X_train_folds)):
            print('.',end='')
            model.fit(X_train_folds[i], y_train_folds[i])
            y_pred = model.predict(X_test_folds[i])
            y_true = y_test_folds[i]
            # Compute the metrics:
            acc_folds.append(accuracy_score(y_true, y_pred))
            f1_folds.append(f1_score(y_true, y_pred, average='macro'))
            gmean_folds.append(geometric_mean_score(y_true, y_pred, average='macro'))
            precision_folds.append(precision_score(y_true, y_pred, average='macro', zero_division=0))
            recall_folds.append(recall_score(y_true, y_pred, average='macro'))
        metrics['acc'][name] = acc_folds
        metrics['f1'][name] = f1_folds
        metrics['gmean'][name] = gmean_folds
        metrics['precision'][name] = precision_folds
        metrics['recall'][name] = recall_folds
    return metrics
Example #25
    def on_epoch_end(self, epoch, logs=None):
        # fetch results
        targets = self.targets
        predictions = self.predictions
        # convert prediction class probabilities to class
        y_pred = np.asarray([np.argmax(line) for line in predictions])
        # calculate metrics with sklearn
        gmean = geometric_mean_score(targets, y_pred, average='macro')
        accuracy = accuracy_score(targets, y_pred)
        # save scores
        self.val_gmean.append(gmean)
        self.val_accuracy.append(accuracy)
        # reset results
        self.targets = []
        self.predictions = []
        # check results
        if gmean > self.best_score:
            self.best_score = gmean
            self.best_epoch = epoch
            self.model.save(self.save_path)
            if self.verbose is True:
                print(
                    f"{epoch} - gmean: {gmean} - accuracy: {accuracy} (best)")
        else:
            if self.verbose is True:
                print(f"{epoch} - gmean: {gmean} - accuracy: {accuracy}")
        # end if patience is overdue
        if epoch - self.patience > self.best_epoch:
            if self.verbose is True:
                print(f"Epoch {epoch}: early stopping Threshold")
            self.model.stop_training = True
Example #26
def randomforest(X_tr, Y_tr, X_te, Y_te):
    if Y_tr.shape[1] > 1:
        Y_tr = np.argmax(Y_tr, axis=1)
        Y_te = np.argmax(Y_te, axis=1)
    rfc = RandomForestClassifier(n_jobs=-1, max_features='sqrt',
                                 n_estimators=40, oob_score=True)

    param_grid = {'n_estimators': [40, 100]}

    CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid)
    CV_rfc.fit(X_tr, Y_tr)
    #print(CV_rfc.best_params_)
    #clf = RandomForestClassifier(n_estimators=150, random_state=42)
    #clf.fit(X_tr, Y_tr)
    y_pred = CV_rfc.predict(X_te)
    fpr_vot, tpr_vot, _ = roc_curve(Y_te, y_pred, pos_label=1,
                                    drop_intermediate=False)
    roc_auc_vot = auc(fpr_vot, tpr_vot)
    cmat = classification_report_imbalanced(Y_te, y_pred)
    #print(cmat.diagonal()/cmat.sum(axis=1))
    print(cmat)
    print('The geometric mean is {}'.format(geometric_mean_score(Y_te, y_pred)))
    print('The auc is {}'.format(roc_auc_vot))
    print('The f1 is {}'.format(f1_score(Y_te, y_pred, average='weighted')))
    return CV_rfc, fpr_vot, tpr_vot, roc_auc_vot
Example #27
def validacion_cruzada(modelo, X, y, cv):
    y_test_all = []
    y_prob_all = []

    for train, test in cv.split(X, y):
        modelo = modelo.fit(X[train], y[train])
        y_pred = modelo.predict(X[test])
        y_prob = modelo.predict_proba(
            X[test]
        )[:, 1]  # the second column is the positive class '1' in bank-marketing
        y_test_bin = y[test]
        #y_test_bin = le.fit_transform(y[test])  # converted to binary for AUC: 'yes' -> 1 (positive class) and 'no' -> 0 in bank-marketing

        print(
            "Accuracy: {:6.2f}%, F1-score: {:.4f}, G-mean: {:.4f}, AUC: {:.4f}"
            .format(
                accuracy_score(y[test], y_pred) * 100,
                f1_score(y[test], y_pred, average='macro'),
                geometric_mean_score(y[test], y_pred, average='macro'),
                roc_auc_score(y_test_bin, y_prob)))
        y_test_all = numpy.concatenate([y_test_all, y_test_bin])
        y_prob_all = numpy.concatenate([y_prob_all, y_prob])

    print("")

    return modelo, y_test_all, y_prob_all
Example #28
def printar_resultados(y_test, pred, ensemble, nome_modelo):
    '''
    Method to print the results of each model
    :param: y_test: data corresponding to the test output
    :param: pred: data corresponding to the model predictions
    :return: returns the metrics: accuracy, AUC, F1 score and G-mean
    '''

    # compute the metrics for the received data
    qtd_modelos = len(ensemble.estimators_)
    acuracia = metrics.accuracy_score(y_test, pred)
    auc = metrics.roc_auc_score(y_test, pred)
    f1measure = metrics.f1_score(y_test, pred, average='binary')
    gmean = geometric_mean_score(y_test, pred, average='binary')

    # report the performance
    print('\n' + nome_modelo)
    print("number of models:", qtd_modelos)
    print("accuracy:", acuracia)
    print("AUC:", auc)
    print("f-measure:", f1measure)
    print("g-mean:", gmean)

    # return the results
    return qtd_modelos, acuracia, auc, f1measure, gmean
Example #29
def classification_report_imbalanced_values(
    y_true, y_pred, labels, target_names=None, sample_weight=None, digits=2, alpha=0.1
):
    """Copy of imblearn.metrics.classification_report_imbalanced to have
    access to the raw values. The code is mostly the same except the
    formatting code and generation of the report which haven removed. Copied
    from version 0.4.3. The original code is living here:
    https://github.com/scikit-learn-contrib/imbalanced-learn/blob/b861b3a8e3414c52f40a953f2e0feca5b32e7460/imblearn/metrics/_classification.py#L790
    """
    labels = np.asarray(labels)

    if target_names is None:
        target_names = [str(label) for label in labels]

    # Compute the different metrics
    # Precision/recall/f1
    precision, recall, f1, support = precision_recall_fscore_support(
        y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight
    )
    # Specificity
    specificity = specificity_score(
        y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight
    )
    # Geometric mean
    geo_mean = geometric_mean_score(
        y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight
    )
    # Index balanced accuracy
    iba_gmean = make_index_balanced_accuracy(alpha=alpha, squared=True)(
        geometric_mean_score
    )
    iba = iba_gmean(
        y_true, y_pred, labels=labels, average=None, sample_weight=sample_weight
    )

    result = {"targets": {}}

    for i, label in enumerate(labels):
        result["targets"][target_names[i]] = {
            "precision": precision[i],
            "recall": recall[i],
            "specificity": specificity[i],
            "f1": f1[i],
            "geo_mean": geo_mean[i],
            "iba": iba[i],
            "support": support[i],
        }

    result["average"] = {
        "precision": np.average(precision, weights=support),
        "recall": np.average(recall, weights=support),
        "specificity": np.average(specificity, weights=support),
        "f1": np.average(f1, weights=support),
        "geo_mean": np.average(geo_mean, weights=support),
        "iba": np.average(iba, weights=support),
        "support": np.sum(support),
    }

    return result
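
A minimal usage sketch of the function above with made-up binary labels; it assumes the helpers it relies on (precision_recall_fscore_support, specificity_score, geometric_mean_score, make_index_balanced_accuracy, numpy as np) are importable, and it returns raw per-class values plus support-weighted averages instead of a formatted report.

# Hypothetical labels and predictions for an imbalanced binary problem.
y_true = [0, 0, 0, 0, 0, 0, 1, 1, 1, 0]
y_pred = [0, 0, 0, 1, 0, 0, 1, 1, 0, 0]

report = classification_report_imbalanced_values(y_true, y_pred, labels=[0, 1])
print(report["targets"]["1"])         # per-class metrics for the minority class
print(report["average"]["geo_mean"])  # support-weighted geometric mean
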
Example #30
    def run_research(self, parameters_dist: dict, n_params: int = 10):
        self.parameters_dist = parameters_dist
        self.__generate_permutations()

        parameter_ranges = self.__display_research_info()

        for variant in self.parameters_perm:
            self.resampler.set_params(**variant)
            X_resampled, y_resampled = self.resampler.resample_to_ndarray()

            X_train, X_test, y_train, y_test = train_test_split(X_resampled,
                                                                y_resampled,
                                                                test_size=0.3,
                                                                random_state=0)
            clf = RandomForestClassifier(random_state=0, n_jobs=-1)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

            if 'estimator' in variant.keys():
                # map estimator class names to short labels
                abbreviations = {
                    'RandomForestClassifier': 'RFC',
                    'AdaBoostClassifier': 'ABC',
                    'GradientBoostingClassifier': 'GBC',
                    'KNeighborsClassifier': 'KNN',
                    'DecisionTreeClassifier': 'DT',
                    'LogisticRegression': 'LR',
                }
                estimator_name = variant['estimator'].__class__.__name__
                variant['estimator'] = abbreviations.get(
                    estimator_name, variant['estimator'])

            index = self.parameters_perm.index(variant)
            self.logs.update({
                index: {
                    "params": variant,
                    "gmean": geometric_mean_score(y_test, y_pred),
                    "recall": recall_score(y_test, y_pred),
                    "classes_size":
                    DataController.count_classes_size(y_resampled),
                    "roc_auc": roc_auc_score(y_test, y_pred)
                }
            })

            print('{0:0=3d}'.format(index + 1), self.logs[index])

        filename = self.resampler.get_name() + str(parameter_ranges)

        # Drawing plots
        self.__draw_plots(filename)

        # Print and save top found parameters
        self.__logs_to_file(filename, n_params)
Example #31
def test_geometric_mean_support_binary():
    """Test the geometric mean for binary classification task"""
    y_true, y_pred, _ = make_prediction(binary=True)

    # compute the geometric mean for the binary problem
    geo_mean = geometric_mean_score(y_true, y_pred)

    assert_almost_equal(geo_mean, 0.77, 2)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=RANDOM_STATE)

# Train the classifier with balancing
pipeline.fit(X_train, y_train)

# Test the classifier and get the prediction
y_pred_bal = pipeline.predict(X_test)

###############################################################################
# The geometric mean corresponds to the square root of the product of the
# sensitivity and specificity. Combining the two metrics should account for
# the balancing of the dataset.

print('The geometric mean is {}'.format(geometric_mean_score(
    y_test,
    y_pred_bal)))
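
###############################################################################
# As a quick hand check (a sketch assuming imblearn's sensitivity_score and
# specificity_score), the value printed above should match
# sqrt(sensitivity * specificity) computed explicitly.

from imblearn.metrics import sensitivity_score, specificity_score

sen = sensitivity_score(y_test, y_pred_bal)
spe = specificity_score(y_test, y_pred_bal)
print('sqrt(sensitivity * specificity) = {}'.format((sen * spe) ** 0.5))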

###############################################################################
# The index balanced accuracy can transform any metric to be used in
# imbalanced learning problems.

alpha = 0.1
geo_mean = make_index_balanced_accuracy(alpha=alpha, squared=True)(
    geometric_mean_score)

print('The IBA using alpha = {} and the geometric mean: {}'.format(
    alpha, geo_mean(
        y_test,
        y_pred_bal)))
###############################################################################
# We train a decision tree classifier which will be used as a baseline for the
# rest of this example.

###############################################################################
# The results are reported in terms of balanced accuracy and geometric mean,
# which are metrics widely used in the literature to validate models trained
# on imbalanced datasets.

tree = DecisionTreeClassifier()
tree.fit(X_train, y_train)
y_pred_tree = tree.predict(X_test)
print('Decision tree classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_tree),
              geometric_mean_score(y_test, y_pred_tree)))
cm_tree = confusion_matrix(y_test, y_pred_tree)
fig, ax = plt.subplots()
plot_confusion_matrix(cm_tree, classes=np.unique(satimage.target), ax=ax,
                      title='Decision tree')

###############################################################################
# Classification using bagging classifier with and without sampling
###############################################################################
# Instead of using a single tree, we will check if an ensemble of decision
# trees can actually alleviate the issue induced by the class imbalance.
# First, we will use a bagging classifier and its counterpart, which
# internally uses random under-sampling to balance each bootstrap sample.

bagging = BaggingClassifier(n_estimators=50, random_state=0, n_jobs=-1)
balanced_bagging = BalancedBaggingClassifier(n_estimators=50, random_state=0,
                                             n_jobs=-1)
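
###############################################################################
# A plausible continuation (sketch): fit both ensembles and compare them with
# the same balanced accuracy and geometric mean used for the baseline tree.

bagging.fit(X_train, y_train)
balanced_bagging.fit(X_train, y_train)
y_pred_bc = bagging.predict(X_test)
y_pred_bbc = balanced_bagging.predict(X_test)

print('Bagging classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_bc),
              geometric_mean_score(y_test, y_pred_bc)))
print('Balanced Bagging classifier performance:')
print('Balanced accuracy: {:.2f} - Geometric mean {:.2f}'
      .format(balanced_accuracy_score(y_test, y_pred_bbc),
              geometric_mean_score(y_test, y_pred_bbc)))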
def test_geometric_mean_multiclass(y_true, y_pred, correction, expected_gmean):
    gmean = geometric_mean_score(y_true, y_pred, correction=correction)
    assert gmean == pytest.approx(expected_gmean, rel=R_TOL)
def test_geometric_mean_average(y_true, y_pred, average, expected_gmean):
    gmean = geometric_mean_score(y_true, y_pred, average=average)
    assert gmean == pytest.approx(expected_gmean, rel=R_TOL)
def test_geometric_mean_sample_weight(y_true, y_pred, sample_weight, average,
                                      expected_gmean):
    gmean = geometric_mean_score(y_true, y_pred, labels=[0, 1],
                                 sample_weight=sample_weight,
                                 average=average)
    assert gmean == pytest.approx(expected_gmean, rel=R_TOL)
def test_geometric_mean_multiclass():
    y_true = [0, 0, 1, 1]
    y_pred = [0, 0, 1, 1]
    assert_allclose(geometric_mean_score(y_true, y_pred), 1.0, rtol=R_TOL)

    y_true = [0, 0, 0, 0]
    y_pred = [1, 1, 1, 1]
    assert_allclose(geometric_mean_score(y_true, y_pred), 0.0, rtol=R_TOL)

    cor = 0.001
    y_true = [0, 0, 0, 0]
    y_pred = [0, 0, 0, 0]
    assert_allclose(
        geometric_mean_score(y_true, y_pred, correction=cor), 1.0, rtol=R_TOL)

    y_true = [0, 0, 0, 0]
    y_pred = [1, 1, 1, 1]
    assert_allclose(
        geometric_mean_score(y_true, y_pred, correction=cor), cor, rtol=R_TOL)

    y_true = [0, 0, 1, 1]
    y_pred = [0, 1, 1, 0]
    assert_allclose(
        geometric_mean_score(y_true, y_pred, correction=cor), 0.5, rtol=R_TOL)

    y_true = [0, 1, 2, 0, 1, 2]
    y_pred = [0, 2, 1, 0, 0, 1]
    assert_allclose(
        geometric_mean_score(y_true, y_pred, correction=cor),
        (1 * cor * cor) ** (1.0 / 3.0),
        rtol=R_TOL)

    y_true = [0, 1, 2, 3, 4, 5]
    y_pred = [0, 1, 2, 3, 4, 5]
    assert_allclose(
        geometric_mean_score(y_true, y_pred, correction=cor), 1, rtol=R_TOL)

    y_true = [0, 1, 1, 1, 1, 0]
    y_pred = [0, 0, 1, 1, 1, 1]
    assert_allclose(
        geometric_mean_score(y_true, y_pred, correction=cor),
        (0.5 * 0.75) ** 0.5,
        rtol=R_TOL)

    y_true = [0, 1, 2, 0, 1, 2]
    y_pred = [0, 2, 1, 0, 0, 1]
    assert_allclose(
        geometric_mean_score(y_true, y_pred, average='macro'),
        0.47140452079103168,
        rtol=R_TOL)
    assert_allclose(
        geometric_mean_score(y_true, y_pred, average='micro'),
        0.47140452079103168,
        rtol=R_TOL)
    assert_allclose(
        geometric_mean_score(y_true, y_pred, average='weighted'),
        0.47140452079103168,
        rtol=R_TOL)
    assert_allclose(
        geometric_mean_score(y_true, y_pred, average=None),
        [0.8660254, 0.0, 0.0],
        rtol=R_TOL)

    y_true = [0, 1, 2, 0, 1, 2]
    y_pred = [0, 1, 1, 0, 0, 1]
    assert_allclose(
        geometric_mean_score(y_true, y_pred, labels=[0, 1]),
        0.70710678118654752,
        rtol=R_TOL)
    assert_allclose(
        geometric_mean_score(
            y_true, y_pred, labels=[0, 1], sample_weight=[1, 2, 1, 1, 2, 1]),
        0.70710678118654752,
        rtol=R_TOL)
    assert_allclose(
        geometric_mean_score(
            y_true,
            y_pred,
            labels=[0, 1],
            sample_weight=[1, 2, 1, 1, 2, 1],
            average='weighted'),
        0.3333333333,
        rtol=R_TOL)

    y_true, y_pred, _ = make_prediction(binary=False)

    geo_mean = geometric_mean_score(y_true, y_pred)
    assert_allclose(geo_mean, 0.41, rtol=R_TOL)

    # Compute the geometric mean for each of the classes
    geo_mean = geometric_mean_score(y_true, y_pred, average=None)
    assert_allclose(geo_mean, [0.85, 0.29, 0.7], rtol=R_TOL)

    # average tests
    geo_mean = geometric_mean_score(y_true, y_pred, average='macro')
    assert_allclose(geo_mean, 0.68, rtol=R_TOL)

    geo_mean = geometric_mean_score(y_true, y_pred, average='weighted')
    assert_allclose(geo_mean, 0.65, rtol=R_TOL)
def test_geometric_mean_score_prediction(average, expected_gmean):
    y_true, y_pred, _ = make_prediction(binary=False)

    gmean = geometric_mean_score(y_true, y_pred, average=average)
    assert gmean == pytest.approx(expected_gmean, rel=R_TOL)