def geometric_mean_score(
    y_true,
    y_pred,
    labels=None,
    pos_label=1,
    average="multiclass",
    sample_weight=None,
    correction=0.0,
):
    """Compute the geometric mean.

    The geometric mean (G-mean) is the root of the product of class-wise
    sensitivity. This measure tries to maximize the accuracy on each of the
    classes while keeping these accuracies balanced. For binary classification
    G-mean is the squared root of the product of the sensitivity
    and specificity. For multi-class problems it is a higher root of the
    product of sensitivity for each class.

    For compatibility with other imbalance performance measures, G-mean can be
    calculated for each class separately on a one-vs-rest basis when
    ``average != 'multiclass'``.

    The best value is 1 and the worst value is 0. Traditionally if at least one
    class is unrecognized by the classifier, G-mean resolves to zero. To
    alleviate this property, for highly multi-class the sensitivity of
    unrecognized classes can be "corrected" to be a user specified value
    (instead of zero). This option works only if ``average == 'multiclass'``.

    Read more in the :ref:`User Guide <imbalanced_metrics>`.

    Parameters
    ----------
    y_true : ndarray of shape (n_samples,)
        Ground truth (correct) target values.

    y_pred : ndarray of shape (n_samples,)
        Estimated targets as returned by a classifier.

    labels : list, default=None
        The set of labels to include when ``average != 'binary'``, and their
        order if ``average is None``. Labels present in the data can be
        excluded, for example to calculate a multiclass average ignoring a
        majority negative class, while labels not present in the data will
        result in 0 components in a macro average.

    pos_label : str or int, default=1
        The class to report if ``average='binary'`` and the data is binary.
        If the data are multiclass, this will be ignored;
        setting ``labels=[pos_label]`` and ``average != 'binary'`` will report
        scores for that label only.

    average : str or None, default='multiclass'
        If ``None``, the scores for each class are returned. Otherwise, this
        determines the type of averaging performed on the data:

        ``'binary'``:
            Only report results for the class specified by ``pos_label``.
            This is applicable only if targets (``y_{true,pred}``) are binary.
        ``'micro'``:
            Calculate metrics globally by counting the total true positives,
            false negatives and false positives.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean.  This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average, weighted
            by support (the number of true instances for each label). This
            alters 'macro' to account for label imbalance; it can result in an
            F-score that is not between precision and recall.
        ``'samples'``:
            Calculate metrics for each instance, and find their average (only
            meaningful for multilabel classification where this differs from
            :func:`accuracy_score`).

    sample_weight : ndarray of shape (n_samples,), default=None
        Sample weights.

    correction: float, default=0.0
        Substitutes sensitivity of unrecognized classes from zero to a given
        value.

    Returns
    -------
    geometric_mean : float
        A single score, or an array with one score per class when
        ``average is None``.

    Notes
    -----
    See :ref:`sphx_glr_auto_examples_evaluation_plot_metrics.py`.

    References
    ----------
    .. [1] Kubat, M. and Matwin, S. "Addressing the curse of
       imbalanced training sets: one-sided selection" ICML (1997)

    .. [2] Barandela, R., Sánchez, J. S., Garcıa, V., & Rangel, E. "Strategies
       for learning in class imbalance problems", Pattern Recognition,
       36(3), (2003), pp 849-851.

    Examples
    --------
    >>> from imblearn.metrics import geometric_mean_score
    >>> y_true = [0, 1, 2, 0, 1, 2]
    >>> y_pred = [0, 2, 1, 0, 0, 1]
    >>> geometric_mean_score(y_true, y_pred)
    0.0
    >>> geometric_mean_score(y_true, y_pred, correction=0.001)
    0.010000000000000004
    >>> geometric_mean_score(y_true, y_pred, average='macro')
    0.47140452079103168
    >>> geometric_mean_score(y_true, y_pred, average='micro')
    0.47140452079103168
    >>> geometric_mean_score(y_true, y_pred, average='weighted')
    0.47140452079103168
    >>> geometric_mean_score(y_true, y_pred, average=None)
    array([ 0.8660254,  0.       ,  0.       ])
    """
    if average != "multiclass":
        # One-vs-rest variant (this also covers ``average is None``): the
        # score is sqrt(sensitivity * specificity) using whatever averaging
        # ``sensitivity_specificity_support`` applies.
        sen, spe, _ = sensitivity_specificity_support(
            y_true,
            y_pred,
            labels=labels,
            pos_label=pos_label,
            average=average,
            warn_for=("specificity", "specificity"),
            sample_weight=sample_weight,
        )
        return np.sqrt(sen * spe)

    # Multiclass variant: geometric mean of the per-class recalls.
    present_labels = unique_labels(y_true, y_pred)

    if labels is None:
        labels = present_labels
        n_labels = None
    else:
        # Requested labels go first; labels that only occur in the data are
        # appended so they can be encoded, then sliced off again below via
        # ``labels[:n_labels]``.
        n_labels = len(labels)
        labels = np.hstack(
            [
                labels,
                np.setdiff1d(present_labels, labels, assume_unique=True),
            ]
        )

    le = LabelEncoder()
    le.fit(labels)
    y_true = le.transform(y_true)
    y_pred = le.transform(y_pred)
    sorted_labels = le.classes_

    # labels are now from 0 to len(labels) - 1 -> use bincount
    tp = y_true == y_pred
    tp_bins = y_true[tp]

    if sample_weight is not None:
        tp_bins_weights = np.asarray(sample_weight)[tp]
    else:
        tp_bins_weights = None

    if len(tp_bins):
        tp_sum = np.bincount(
            tp_bins, weights=tp_bins_weights, minlength=len(labels)
        )
    else:
        # Pathological case: not a single correct prediction.
        true_sum = tp_sum = np.zeros(len(labels))
    if len(y_true):
        true_sum = np.bincount(
            y_true, weights=sample_weight, minlength=len(labels)
        )

    # Retain only selected labels
    indices = np.searchsorted(sorted_labels, labels[:n_labels])
    tp_sum = tp_sum[indices]
    true_sum = true_sum[indices]

    with np.errstate(divide="ignore", invalid="ignore"):
        recall = _prf_divide(
            tp_sum, true_sum, "recall", "true", None, "recall"
        )
    # A class with zero recall would force the whole product to zero;
    # ``correction`` lets the caller substitute a small positive value.
    recall[recall == 0] = correction

    with np.errstate(divide="ignore", invalid="ignore"):
        gmean = sp.stats.gmean(recall)
    # old version of scipy return MaskedConstant instead of 0.0
    if isinstance(gmean, np.ma.core.MaskedConstant):
        return 0.0
    return gmean
# Example no. 2
def analysis(model, X_train, y_train):
    """Fit ``model`` and print/plot precision-recall diagnostics.

    After fitting, the function predicts on ``X_test``, then computes the
    precision-recall curve, average precision, recall, an F-score (via an
    inlined copy of scikit-learn's precision/recall/F-score computation so
    that the TP/TN/FN/FP counts can be printed too) and the PR AUC. The
    results are printed and the precision-recall curve is drawn with
    matplotlib. Nothing is returned on the normal path.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``.
    X_train, y_train
        Training data passed to ``model.fit``.
    """
    # NOTE(review): ``X_test`` and ``y_test`` are not parameters; they are
    # resolved from the enclosing (module) scope -- confirm they exist there.
    # NOTE(review): the fit call is inferred from the otherwise unused
    # training arguments -- confirm against the original script.
    model.fit(X_train, y_train)

    # predict probabilities
    probs = model.predict_proba(X_test)
    print("probs: ", probs)
    # keep probabilities for the positive outcome only
    probs = probs[:, 1]

    # predict class values
    preds = model.predict(X_test)

    # calculate precision-recall curve
    precision, recall, thresholds = precision_recall_curve(y_test, probs)

    # calculate average precision
    average_precision = average_precision_score(y_test, probs)

    # recall score for class 1 (Predict that Biopsy is True)
    rs = recall_score(y_test, preds)

    # calculate F1 score
    # precision_recall_fscore_support return 4 things, I need just the f
    sample_weight = None
    # Must be a tuple: a bare ('f-score') is just a string, which makes the
    # ``("f-score", ) == warn_for`` check below impossible to satisfy.
    warn_for = ('f-score', )
    zero_division = "warn"
    average = 'binary'
    beta = 1.0
    labels = _check_set_wise_labels(y_test, preds, average, None, 1)

    # Calculate tp_sum, pred_sum, true_sum ###
    samplewise = average == 'samples'
    MCM = multilabel_confusion_matrix(y_test,
                                      preds,
                                      sample_weight=sample_weight,
                                      labels=labels,
                                      samplewise=samplewise)

    # In multilabel confusion matrix :math:`MCM`, the count of true negatives
    #     is :math:`MCM_{:,0,0}`, false negatives is :math:`MCM_{:,1,0}`,
    #     true positives is :math:`MCM_{:,1,1}` and false positives is
    #     :math:`MCM_{:,0,1}`.
    tn_sum = MCM[:, 0, 0]
    fn_sum = MCM[:, 1, 0]
    fp_sum = MCM[:, 0, 1]

    tp_sum = MCM[:, 1, 1]
    pred_sum = tp_sum + MCM[:, 0, 1]
    true_sum = tp_sum + MCM[:, 1, 0]

    # With ``average`` hard-coded to 'binary' above, the 'micro', 'weighted'
    # and 'samples' branches below are inactive; they are kept so the block
    # mirrors the scikit-learn routine it was copied from.
    if average == 'micro':
        tp_sum = np.array([tp_sum.sum()])
        pred_sum = np.array([pred_sum.sum()])
        true_sum = np.array([true_sum.sum()])
        # added
        tn_sum = np.array([tn_sum.sum()])
        fn_sum = np.array([fn_sum.sum()])
        fp_sum = np.array([fp_sum.sum()])

    # Finally, we have all our sufficient statistics. Divide! #
    beta2 = beta**2

    # Divide, and on zero-division, set scores and/or warn according to
    # zero_division:
    precision1 = _prf_divide(tp_sum, pred_sum, 'precision', 'predicted',
                             average, warn_for, zero_division)
    recall1 = _prf_divide(tp_sum, true_sum, 'recall', 'true', average,
                          warn_for, zero_division)

    # warn for f-score only if zero_division is warn, it is in warn_for
    # and BOTH prec and rec are ill-defined
    if zero_division == "warn" and ("f-score", ) == warn_for:
        if (pred_sum[true_sum == 0] == 0).any():
            _warn_prf(average, "true nor predicted", 'F-score is',
                      len(true_sum))

    # if tp == 0 F will be 1 only if all predictions are zero, all labels are
    # zero, and zero_division=1. In all other case, 0
    if np.isposinf(beta):
        f_score = recall1
    else:
        denom = beta2 * precision1 + recall1

        denom[denom == 0.] = 1  # avoid division by 0
        f_score = (1 + beta2) * precision1 * recall1 / denom

    # Average the results
    if average == 'weighted':
        weights = true_sum
        if weights.sum() == 0:
            zero_division_value = 0.0 if zero_division in ["warn", 0] else 1.0
            # precision is zero_division if there are no positive predictions
            # recall is zero_division if there are no positive labels
            # fscore is zero_division if all labels AND predictions are
            # negative
            return (zero_division_value if pred_sum.sum() == 0 else 0,
                    zero_division_value,
                    zero_division_value if pred_sum.sum() == 0 else 0, None)

    elif average == 'samples':
        weights = sample_weight
    else:
        weights = None

    if average is not None:
        assert average != 'binary' or len(precision1) == 1
        precision1 = np.average(precision1, weights=weights)
        recall1 = np.average(recall1, weights=weights)
        f_score = np.average(f_score, weights=weights)
        true_sum = None  # return no support

    #return precision, recall, f_score, true_sum
    print("precision: ", precision1)
    print("recall: ", recall1)
    print("tp_sum: ", tp_sum)
    print("tn_sum: ", tn_sum)
    print("fn_sum: ", fn_sum)
    print("fp_sum: ", fp_sum)

    #f1 = f1_score(y_test, preds)
    f1 = f_score

    # calculate precision-recall AUC
    auc_score = auc(recall, precision)

    # create chart
    # In matplotlib < 1.5, plt.fill_between does not have a 'step' argument
    step_kwargs = ({
        'step': 'post'
    } if 'step' in signature(plt.fill_between).parameters else {})
    plt.step(recall, precision, color='b', alpha=0.2, where='post')
    plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)

    # plot a "no skill" line
    plt.plot([0, 1], [0.5, 0.5], linestyle='--')

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall curve: Average Precision={0:0.3f}'.format(
        average_precision))
    plt.show()

    # print(confusion_matrix(y_test, preds))
    print('Classification Report:')
    print(classification_report(y_test, preds))

    print('f1=%.3f auc=%.3f recall=%.3f' % (f1, auc_score, rs))
def sensitivity_specificity_support(
        y_true,
        y_pred,
        labels=None,
        pos_label=1,
        average=None,
        warn_for=("sensitivity", "specificity"),
        sample_weight=None,
):
    """Compute sensitivity, specificity, and support for each class

    The sensitivity is the ratio ``tp / (tp + fn)`` where ``tp`` is the number
    of true positives and ``fn`` the number of false negatives. The sensitivity
    quantifies the ability to avoid false negatives_[1].

    The specificity is the ratio ``tn / (tn + fp)`` where ``tn`` is the number
    of true negatives and ``fp`` the number of false positives. The specificity
    quantifies the ability to avoid false positives_[1].

    The support is the number of occurrences of each class in ``y_true``.

    If ``pos_label is None`` and in binary classification, this function
    returns the average sensitivity and specificity if ``average``
    is one of ``'weighted'``.

    Read more in the :ref:`User Guide <sensitivity_specificity>`.

    Parameters
    ----------
    y_true : ndarray of shape (n_samples,)
        Ground truth (correct) target values.

    y_pred : ndarray of shape (n_samples,)
        Estimated targets as returned by a classifier.

    labels : list, default=None
        The set of labels to include when ``average != 'binary'``, and their
        order if ``average is None``. Labels present in the data can be
        excluded, for example to calculate a multiclass average ignoring a
        majority negative class, while labels not present in the data will
        result in 0 components in a macro average. For multilabel targets,
        labels are column indices. By default, all labels in ``y_true`` and
        ``y_pred`` are used in sorted order.

    pos_label : str or int, default=1
        The class to report if ``average='binary'`` and the data is binary.
        If the data are multiclass, this will be ignored;
        setting ``labels=[pos_label]`` and ``average != 'binary'`` will report
        scores for that label only.

    average : str, default=None
        If ``None``, the scores for each class are returned. Otherwise, this
        determines the type of averaging performed on the data:

        ``'binary'``:
            Only report results for the class specified by ``pos_label``.
            This is applicable only if targets (``y_{true,pred}``) are binary.
        ``'micro'``:
            Calculate metrics globally by counting the total true positives,
            false negatives and false positives.
        ``'macro'``:
            Calculate metrics for each label, and find their unweighted
            mean.  This does not take label imbalance into account.
        ``'weighted'``:
            Calculate metrics for each label, and find their average, weighted
            by support (the number of true instances for each label). This
            alters 'macro' to account for label imbalance; it can result in an
            F-score that is not between precision and recall.
        ``'samples'``:
            Calculate metrics for each instance, and find their average (only
            meaningful for multilabel classification where this differs from
            :func:`accuracy_score`). This function rejects it with a
            ``ValueError``.

    warn_for : tuple or set of {"sensitivity", "specificity"}, for internal use
        This determines which warnings will be made in the case that this
        function is being used to return only one of its metrics.

    sample_weight : ndarray of shape (n_samples,), default=None
        Sample weights.

    Returns
    -------
    sensitivity : float (if ``average`` is not None) or ndarray of
            shape (n_unique_labels,)
        The sensitivity metric.

    specificity : float (if ``average`` is not None) or ndarray of
            shape (n_unique_labels,)
        The specificity metric.

    support : None (if ``average`` is not None) or ndarray of
            shape (n_unique_labels,)
        The number of occurrences of each label in ``y_true``.

    Raises
    ------
    ValueError
        If ``average`` is not a recognised option, if ``average='binary'`` is
        used with non-binary targets or an invalid ``pos_label``, if the
        targets are multilabel, or if ``average='samples'``.

    References
    ----------
    .. [1] `Wikipedia entry for the Sensitivity and specificity
       <https://en.wikipedia.org/wiki/Sensitivity_and_specificity>`_

    Examples
    --------
    >>> import numpy as np
    >>> from imblearn.metrics import sensitivity_specificity_support
    >>> y_true = np.array(['cat', 'dog', 'pig', 'cat', 'dog', 'pig'])
    >>> y_pred = np.array(['cat', 'pig', 'dog', 'cat', 'cat', 'dog'])
    >>> sensitivity_specificity_support(y_true, y_pred, average='macro')
    (0.33333333333333331, 0.66666666666666663, None)
    >>> sensitivity_specificity_support(y_true, y_pred, average='micro')
    (0.33333333333333331, 0.66666666666666663, None)
    >>> sensitivity_specificity_support(y_true, y_pred, average='weighted')
    (0.33333333333333331, 0.66666666666666663, None)
    """
    average_options = (None, "micro", "macro", "weighted", "samples")
    if average not in average_options and average != "binary":
        raise ValueError("average has to be one of " + str(average_options))

    y_type, y_true, y_pred = _check_targets(y_true, y_pred)
    present_labels = unique_labels(y_true, y_pred)

    if average == "binary":
        if y_type == "binary":
            if pos_label not in present_labels:
                if len(present_labels) < 2:
                    # Only negative labels
                    return (0.0, 0.0, 0)
                else:
                    raise ValueError("pos_label=%r is not a valid label: %r" %
                                     (pos_label, present_labels))
            labels = [pos_label]
        else:
            raise ValueError("Target is %s but average='binary'. Please "
                             "choose another average setting." % y_type)
    elif pos_label not in (None, 1):
        warnings.warn(
            "Note that pos_label (set to %r) is ignored when "
            "average != 'binary' (got %r). You may use "
            "labels=[pos_label] to specify a single positive class." %
            (pos_label, average),
            UserWarning)

    if labels is None:
        labels = present_labels
        n_labels = None
    else:
        # Requested labels first, then any label seen only in the data so the
        # encoder knows it; ``labels[:n_labels]`` recovers the request later.
        n_labels = len(labels)
        labels = np.hstack(
            [labels,
             np.setdiff1d(present_labels, labels, assume_unique=True)])

    # Calculate tp_sum, pred_sum, true_sum ###

    if y_type.startswith("multilabel"):
        raise ValueError("imblearn does not support multilabel")
    elif average == "samples":
        raise ValueError("Sample-based precision, recall, fscore is "
                         "not meaningful outside multilabel "
                         "classification. See the accuracy_score instead.")
    else:
        le = LabelEncoder()
        le.fit(labels)
        y_true = le.transform(y_true)
        y_pred = le.transform(y_pred)
        sorted_labels = le.classes_

        # labels are now from 0 to len(labels) - 1 -> use bincount
        tp = y_true == y_pred
        tp_bins = y_true[tp]
        if sample_weight is not None:
            tp_bins_weights = np.asarray(sample_weight)[tp]
        else:
            tp_bins_weights = None

        if len(tp_bins):
            tp_sum = np.bincount(tp_bins,
                                 weights=tp_bins_weights,
                                 minlength=len(labels))
        else:
            # Pathological case
            true_sum = pred_sum = tp_sum = np.zeros(len(labels))
        if len(y_pred):
            pred_sum = np.bincount(y_pred,
                                   weights=sample_weight,
                                   minlength=len(labels))
        if len(y_true):
            true_sum = np.bincount(y_true,
                                   weights=sample_weight,
                                   minlength=len(labels))

        # Compute the true negative
        # NOTE(review): ``y_true.size`` is an unweighted count while the other
        # terms are weighted when ``sample_weight`` is given -- confirm this
        # is intended.
        tn_sum = y_true.size - (pred_sum + true_sum - tp_sum)

        # Retain only selected labels
        indices = np.searchsorted(sorted_labels, labels[:n_labels])
        tp_sum = tp_sum[indices]
        true_sum = true_sum[indices]
        pred_sum = pred_sum[indices]
        tn_sum = tn_sum[indices]

    if average == "micro":
        tp_sum = np.array([tp_sum.sum()])
        pred_sum = np.array([pred_sum.sum()])
        true_sum = np.array([true_sum.sum()])
        tn_sum = np.array([tn_sum.sum()])

    # Finally, we have all our sufficient statistics. Divide! #

    with np.errstate(divide="ignore", invalid="ignore"):
        # Divide, and on zero-division, set scores to 0 and warn:

        # Oddly, we may get an "invalid" rather than a "divide" error
        # here.
        # Denominator is tn + fp, since fp = pred_sum - tp_sum.
        specificity = _prf_divide(
            tn_sum,
            tn_sum + pred_sum - tp_sum,
            "specificity",
            "predicted",
            average,
            warn_for,
        )
        sensitivity = _prf_divide(tp_sum, true_sum, "sensitivity", "true",
                                  average, warn_for)

    # Average the results

    if average == "weighted":
        weights = true_sum
        if weights.sum() == 0:
            return 0, 0, None
    elif average == "samples":
        weights = sample_weight
    else:
        weights = None

    if average is not None:
        assert average != "binary" or len(specificity) == 1
        specificity = np.average(specificity, weights=weights)
        sensitivity = np.average(sensitivity, weights=weights)
        true_sum = None  # return no support

    return sensitivity, specificity, true_sum
# Example no. 4
    def _f1_from_confusion_matrix(
        self, MCM, average, beta=1, warn_for=("precision", "recall", "f-score"), zero_division="warn"
    ):
        """Compute the F-beta score from per-label 2x2 confusion matrices.

        Code borrowed from sklearn.metrics

        Parameters
        ----------
        MCM : ndarray indexed as ``MCM[:, i, j]``
            Stack of 2x2 confusion matrices; ``MCM[:, 1, 1]`` is read as true
            positives, ``MCM[:, 0, 1]`` as false positives and
            ``MCM[:, 1, 0]`` as false negatives.
        average : str or None
            ``'micro'`` pools the counts before dividing; ``'weighted'``
            averages the per-label scores weighted by support; any other
            non-``None`` value takes an unweighted mean; ``None`` returns the
            per-label scores.
        beta : float, default=1
            Weight of recall relative to precision; ``inf`` returns recall.
        warn_for : tuple, default=("precision", "recall", "f-score")
            Forwarded to ``_prf_divide``; the extra F-score warning is only
            emitted when this is exactly ``("f-score",)``.
        zero_division : "warn", 0 or 1, default="warn"
            Forwarded to ``_prf_divide`` to decide the value used on division
            by zero.

        Returns
        -------
        f_score : float or ndarray
            Averaged score, or per-label scores when ``average is None``.
        """
        tp_sum = MCM[:, 1, 1]
        pred_sum = tp_sum + MCM[:, 0, 1]
        true_sum = tp_sum + MCM[:, 1, 0]

        if average == "micro":
            tp_sum = np.array([tp_sum.sum()])
            pred_sum = np.array([pred_sum.sum()])
            true_sum = np.array([true_sum.sum()])

        # Finally, we have all our sufficient statistics. Divide! #
        beta2 = beta ** 2

        # Divide, and on zero-division, set scores and/or warn according to
        # zero_division:
        from sklearn.metrics._classification import _prf_divide, _warn_prf

        precision = _prf_divide(tp_sum, pred_sum, "precision", "predicted", average, warn_for, zero_division)
        recall = _prf_divide(tp_sum, true_sum, "recall", "true", average, warn_for, zero_division)

        # warn for f-score only if zero_division is warn, it is in warn_for
        # and BOTH prec and rec are ill-defined
        if zero_division == "warn" and ("f-score",) == warn_for:
            if (pred_sum[true_sum == 0] == 0).any():
                _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))

        # if tp == 0 F will be 1 only if all predictions are zero, all labels are
        # zero, and zero_division=1. In all other case, 0
        if np.isposinf(beta):
            f_score = recall
        else:
            denom = beta2 * precision + recall

            denom[denom == 0.0] = 1  # avoid division by 0
            f_score = (1 + beta2) * precision * recall / denom

        # Average the results
        if average == "weighted":
            weights = true_sum
            if weights.sum() == 0:
                zero_division_value = 0.0 if zero_division in ["warn", 0] else 1.0
                # fscore is zero_division if all labels AND predictions are
                # negative. Only the F-score component is returned so this
                # path yields the same kind of value as the normal return.
                return zero_division_value if pred_sum.sum() == 0 else 0
        else:
            weights = None

        if average is not None:
            assert average != "binary" or len(precision) == 1
            f_score = np.average(f_score, weights=weights)

        return f_score