# Imports assumed by the snippets below. `_daal_type_of_target` (daal4py's
# extended type_of_target) and `_binary_uninterpolated_average_precision`
# (the skew-modified binary average-precision helper) are private functions
# defined alongside these snippets in their original modules and are assumed
# to be in scope here.
import logging
from functools import partial

import numpy as np

import daal4py as d4p
from daal4py.sklearn._utils import PatchingConditionsChain, get_patch_message
from sklearn.metrics import auc, precision_recall_curve
from sklearn.metrics._base import _average_binary_score
from sklearn.metrics._ranking import _binary_roc_auc_score
from sklearn.preprocessing import label_binarize
from sklearn.utils import check_array
from sklearn.utils.multiclass import type_of_target

logger = logging.getLogger(__name__)


def _daal_roc_auc_score(y_true, y_score, *, average="macro",
                        sample_weight=None, max_fpr=None,
                        multi_class="raise", labels=None):
    y_type = _daal_type_of_target(y_true)
    y_true = check_array(y_true, ensure_2d=False, dtype=None)
    y_score = check_array(y_score, ensure_2d=False)

    if y_type[0] == "multiclass" or (
            y_type[0] == "binary" and y_score.ndim == 2 and
            y_score.shape[1] > 2):
        # Partial ROC computation (max_fpr) is not supported for multiclass.
        if max_fpr is not None and max_fpr != 1.:
            raise ValueError("Partial AUC computation not available in "
                             "multiclass setting, 'max_fpr' must be "
                             "set to `None`, received `max_fpr={0}` "
                             "instead".format(max_fpr))
        if multi_class == 'raise':
            raise ValueError("multi_class must be in ('ovo', 'ovr')")
        logging.info("sklearn.metrics.roc_auc_score: " +
                     get_patch_message("sklearn"))
        result = multiclass_roc_auc_score(y_true, y_score, labels,
                                          multi_class, average, sample_weight)
    elif y_type[0] == "binary":
        labels = y_type[1]
        # Offload to oneDAL only for plain binary problems: no partial AUC,
        # no sample weights, and exactly two labels.
        daal_use = max_fpr is None and sample_weight is None and len(labels) == 2
        if daal_use:
            logging.info("sklearn.metrics.roc_auc_score: " +
                         get_patch_message("daal"))
            if not np.array_equal(labels, [0, 1]):
                y_true = label_binarize(y_true, classes=labels)[:, 0]
            result = d4p.daal_roc_auc_score(y_true.reshape(-1, 1),
                                            y_score.reshape(-1, 1))
        if not daal_use or result == -1:
            # Fall back to scikit-learn's implementation (the oneDAL kernel
            # signals failure by returning -1).
            y_true = label_binarize(y_true, classes=labels)[:, 0]
            logging.info("sklearn.metrics.roc_auc_score: " +
                         get_patch_message("sklearn"))
            result = _average_binary_score(
                partial(_binary_roc_auc_score, max_fpr=max_fpr),
                y_true, y_score, average, sample_weight=sample_weight)
    else:
        logging.info("sklearn.metrics.roc_auc_score: " +
                     get_patch_message("sklearn"))
        result = _average_binary_score(
            partial(_binary_roc_auc_score, max_fpr=max_fpr),
            y_true, y_score, average, sample_weight=sample_weight)
    return result
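# Dispatch sketch for `_daal_roc_auc_score` (hypothetical inputs): a plain
# binary problem is offloaded to the oneDAL kernel, while unsupported
# options such as `sample_weight` route to the scikit-learn fallback.
#
#     _daal_roc_auc_score(np.array([0, 1, 1]), np.array([0.2, 0.7, 0.6]))
#     # -> d4p.daal_roc_auc_score (oneDAL)
#     _daal_roc_auc_score(np.array([0, 1, 1]), np.array([0.2, 0.7, 0.6]),
#                         sample_weight=np.ones(3))
#     # -> sklearn fallback via _average_binary_score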
def average_precision_score(y_true, y_score, average="macro",
                            sample_weight=None):
    def _binary_average_precision(y_true, y_score, sample_weight=None):
        precision, recall, thresholds = precision_recall_curve(
            y_true, y_score, sample_weight=sample_weight)
        return auc(recall, precision)

    return _average_binary_score(_binary_average_precision, y_true, y_score,
                                 average, sample_weight=sample_weight)
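# Usage sketch: note that this variant integrates the precision-recall curve
# trapezoidally via `auc`, which can differ from the step-wise sum used by
# sklearn.metrics.average_precision_score. For the classic sklearn example:
#
#     y_true = np.array([0, 0, 1, 1])
#     y_score = np.array([0.1, 0.4, 0.35, 0.8])
#     average_precision_score(y_true, y_score)  # ~0.79 (sklearn's AP: ~0.83)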
def min_aupdc(y_true, pos_label, average, sample_weight=None,
              known_skew=None, new_skew=None):
    """
    Compute the minimum possible area under the performance diagram curve.
    Essentially, a vote of NO for all predictions.
    """
    min_score = np.zeros(len(y_true))
    average_precision = partial(_binary_uninterpolated_average_precision,
                                known_skew=known_skew,
                                new_skew=new_skew,
                                pos_label=pos_label)
    ap_min = _average_binary_score(average_precision, y_true, min_score,
                                   average, sample_weight=sample_weight)
    return ap_min
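# Usage sketch (hypothetical numbers): the all-NO baseline depends only on
# the event skew. This assumes `_binary_uninterpolated_average_precision`,
# the skew-modified AP helper from the original module, is in scope.
#
#     y_true = np.random.binomial(1, 0.1, size=1_000)  # ~10% event frequency
#     ap_min = min_aupdc(y_true, pos_label=1, average="macro",
#                        known_skew=0.1, new_skew=0.1)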
def norm_aupdc(y_true, y_score, known_skew=None, *,
               average="macro", pos_label=1, sample_weight=None,
               min_method='random'):
    """
    Compute the normalized, modified average precision.

    Normalization removes the no-skill region, based either on the known
    skew or on random-classifier performance. Modification alters the
    success ratio to be consistent with a known skew.

    Parameters
    ----------
    y_true : array of shape (n_samples,)
        Binary truth labels (0, 1).
    y_score : array of shape (n_samples,)
        Model predictions (either deterministic or probabilistic).
    known_skew : float between 0 and 1
        Known or reference skew (# of 1 / n_samples) for computing the
        modified success ratio.
    min_method : 'skew' or 'random'
        If 'skew', the normalization is based on the minimum AUPDC formula
        presented in Boyd et al. (2012). If 'random', the normalization is
        based on the minimum AUPDC of a random classifier, which equals the
        known skew.

    References
    ----------
    Boyd et al. (2012): Unachievable Region in Precision-Recall Space and
    Its Effect on Empirical Evaluation, arXiv.
    """
    new_skew = np.mean(y_true)
    if known_skew is None:
        known_skew = new_skew

    y_type = type_of_target(y_true)
    if y_type == "multilabel-indicator" and pos_label != 1:
        raise ValueError("Parameter pos_label is fixed to 1 for "
                         "multilabel-indicator y_true. Do not set "
                         "pos_label or set pos_label to 1.")
    elif y_type == "binary":
        # Convert to Python primitive type to avoid NumPy type / Python str
        # comparison. See https://github.com/numpy/numpy/issues/6784
        present_labels = np.unique(y_true).tolist()
        if len(present_labels) == 2 and pos_label not in present_labels:
            raise ValueError(
                f"pos_label={pos_label} is not a valid label. It should be "
                f"one of {present_labels}")

    average_precision = partial(_binary_uninterpolated_average_precision,
                                known_skew=known_skew,
                                new_skew=new_skew,
                                pos_label=pos_label)
    ap = _average_binary_score(average_precision, y_true, y_score,
                               average, sample_weight=sample_weight)

    if min_method == 'random':
        ap_min = known_skew
    elif min_method == 'skew':
        ap_min = min_aupdc(y_true, pos_label, average,
                           sample_weight=sample_weight,
                           known_skew=known_skew, new_skew=new_skew)
    else:
        raise ValueError(
            f"min_method must be 'random' or 'skew', got {min_method!r}")

    naupdc = (ap - ap_min) / (1.0 - ap_min)
    return naupdc
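# Usage sketch (hypothetical numbers): with a known skew of 5%, a random
# classifier scores AUPDC ~= 0.05 under min_method='random', so norm_aupdc
# rescales performance to 0 (no better than random) through 1 (perfect).
#
#     y_true = np.random.binomial(1, 0.05, size=10_000)
#     y_score = np.random.uniform(size=10_000)
#     norm_aupdc(y_true, y_score, known_skew=0.05)  # ~0 for random scores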
def multiclass_roc_auc_score(
    y_true,
    y_score,
    labels,
    multi_class,
    average,
    sample_weight=None,
    invalid_proba_tolerance: float = 1e-6,
):
    """Multiclass ROC AUC score (adapted from sklearn).

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True multiclass labels.

    y_score : array-like of shape (n_samples, n_classes)
        Target scores corresponding to probability estimates of a sample
        belonging to a particular class.

    labels : array, shape = [n_classes] or None, optional (default=None)
        List of labels to index ``y_score`` used for multiclass. If ``None``,
        the lexical order of ``y_true`` is used to index ``y_score``.

    multi_class : string, 'ovr' or 'ovo'
        Determines the type of multiclass configuration to use.
        ``'ovr'``: Calculate metrics for the multiclass case using the
        one-vs-rest approach.
        ``'ovo'``: Calculate metrics for the multiclass case using the
        one-vs-one approach.

    average : 'macro' or 'weighted', optional (default='macro')
        Determines the type of averaging performed on the pairwise binary
        metric scores.
        ``'macro'``: Calculate metrics for each label, and find their
        unweighted mean. This does not take label imbalance into account.
        Classes are assumed to be uniformly distributed.
        ``'weighted'``: Calculate metrics for each label, taking into
        account the prevalence of the classes.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    invalid_proba_tolerance : float in [0, 1]
        The proportion of samples that may be ignored if their class scores
        do not sum up to 1.
    """
    # Validate the input y_score: class probabilities should sum to 1
    # over the classes in each row.
    are_close = np.isclose(1, y_score.sum(axis=1))

    # This try/except tolerates a small fraction of samples whose
    # probabilities do not sum exactly to 1, which can happen when the
    # scores are stored as float16 instead of float64.
    try:
        if not np.all(are_close):
            raise ValueError(
                "Target scores need to be probabilities for multiclass "
                "roc_auc, i.e. they should sum up to 1.0 over classes")
    except ValueError as ex:
        logger.exception(ex)

        assert 0 <= invalid_proba_tolerance <= 1, f"{invalid_proba_tolerance=}"

        nsamples_not_close = int((~are_close).sum())
        percentage_samples_not_close = nsamples_not_close / are_close.size
        logger.warning(
            f"{nsamples_not_close=} ({percentage_samples_not_close=:.7%})")

        if percentage_samples_not_close > invalid_proba_tolerance:
            raise ValueError(
                f"Too many samples are not close to 1: "
                f"{nsamples_not_close=} {percentage_samples_not_close=:.7%} "
                f"{invalid_proba_tolerance=:.7%}.")
        else:
            logger.warning(
                f"The probabilities not summing up to 1 are within "
                f"tolerance ({percentage_samples_not_close=:.7%} "
                f"{invalid_proba_tolerance=:.7%}). "
                f"The bad samples will be ignored!")

        y_true = y_true[are_close]
        y_score = y_score[are_close, :]

    # Validation of the multiclass parameter specifications.
    average_options = ("macro", "weighted")
    if average not in average_options:
        raise ValueError("average must be one of {0} for "
                         "multiclass problems".format(average_options))

    multiclass_options = ("ovo", "ovr")
    if multi_class not in multiclass_options:
        raise ValueError("multi_class='{0}' is not supported "
                         "for multiclass ROC AUC, multi_class must be "
                         "in {1}".format(multi_class, multiclass_options))

    # NOTE: relies on sklearn's private `_encode` from
    # sklearn.preprocessing._label (moved and changed in sklearn 0.24).
    from sklearn.utils import column_or_1d
    from sklearn.preprocessing import label_binarize
    from sklearn.preprocessing._label import _encode
    from sklearn.metrics._base import (_average_binary_score,
                                       _average_multiclass_ovo_score)
    from sklearn.metrics._ranking import _binary_roc_auc_score

    if labels is not None:
        labels = column_or_1d(labels)
        classes = _encode(labels)
        if len(classes) != len(labels):
            raise ValueError("Parameter 'labels' must be unique")
        if not np.array_equal(classes, labels):
            raise ValueError("Parameter 'labels' must be ordered")
        if len(classes) != y_score.shape[1]:
            raise ValueError(
                "Number of given labels, {0}, not equal to the number "
                "of columns in 'y_score', {1}".format(len(classes),
                                                      y_score.shape[1]))
        if len(np.setdiff1d(y_true, classes)):
            raise ValueError(
                "'y_true' contains labels not in parameter 'labels'")
    else:
        classes = _encode(y_true)
        if len(classes) != y_score.shape[1]:
            raise ValueError(
                "Number of classes in y_true not equal to the number of "
                "columns in 'y_score'")

    if multi_class == "ovo":
        if sample_weight is not None:
            raise ValueError("sample_weight is not supported "
                             "for multiclass one-vs-one ROC AUC, "
                             "'sample_weight' must be None in this case.")
        _, y_true_encoded = _encode(y_true, uniques=classes, encode=True)
        # Hand & Till (2001) implementation (ovo)
        return _average_multiclass_ovo_score(_binary_roc_auc_score,
                                             y_true_encoded, y_score,
                                             average=average)
    # ovr is the same as multi-label
    y_true_multilabel = label_binarize(y_true, classes=classes)
    return _average_binary_score(_binary_roc_auc_score, y_true_multilabel,
                                 y_score, average,
                                 sample_weight=sample_weight)
def _daal_roc_auc_score(
    y_true,
    y_score,
    *,
    average="macro",
    sample_weight=None,
    max_fpr=None,
    multi_class="raise",
    labels=None,
):
    y_type = _daal_type_of_target(y_true)
    y_true = check_array(y_true, ensure_2d=False, dtype=None)
    y_score = check_array(y_score, ensure_2d=False)

    _patching_status = PatchingConditionsChain(
        "sklearn.metrics.roc_auc_score")
    _dal_ready = _patching_status.and_conditions([
        (y_type[0] == "binary" and not (y_score.ndim == 2 and
                                        y_score.shape[1] > 2),
         "y_true type is not one-dimensional binary.")])
    _patching_status.write_log()

    if y_type[0] == "multiclass" or (
            y_type[0] == "binary" and y_score.ndim == 2 and
            y_score.shape[1] > 2):
        # Do not support partial ROC computation for multiclass.
        if max_fpr is not None and max_fpr != 1.:
            raise ValueError("Partial AUC computation not available in "
                             "multiclass setting, 'max_fpr' must be "
                             "set to `None`, received `max_fpr={0}` "
                             "instead".format(max_fpr))
        if multi_class == 'raise':
            raise ValueError("multi_class must be in ('ovo', 'ovr')")
        return multiclass_roc_auc_score(y_true, y_score, labels,
                                        multi_class, average, sample_weight)

    if y_type[0] == "binary":
        labels = y_type[1]
        _dal_ready = _patching_status.and_conditions([
            (len(labels) == 2, "Number of unique labels is not equal to 2."),
            (max_fpr is None, "Maximum false-positive rate is not supported."),
            (sample_weight is None, "Sample weights are not supported.")])
        if _dal_ready:
            if not np.array_equal(labels, [0, 1]) or labels.dtype == bool:
                y_true = label_binarize(y_true, classes=labels)[:, 0]
            result = d4p.daal_roc_auc_score(y_true.reshape(-1, 1),
                                            y_score.reshape(-1, 1))
            if result != -1:
                return result
            logging.info("sklearn.metrics.roc_auc_score: " +
                         get_patch_message("sklearn_after_daal"))
        # return to sklearn implementation
        y_true = label_binarize(y_true, classes=labels)[:, 0]

    return _average_binary_score(
        partial(_binary_roc_auc_score, max_fpr=max_fpr),
        y_true, y_score, average, sample_weight=sample_weight,
    )
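# Context sketch: this second variant of `_daal_roc_auc_score` replaces the
# logging-based dispatch of the first with PatchingConditionsChain
# reporting; usage is identical (hypothetical inputs).
#
#     _daal_roc_auc_score(np.array([0, 1, 1, 0]),
#                         np.array([0.1, 0.9, 0.8, 0.3]))  # oneDAL path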