def plotIncrementalProfit(Ytest, Yprobs, ax=None, xlabel=True, ylabel=True):
    """Plots incremental profit obtained by the model as a function of probability threshold"""
    if ax is None:
        ax = plt.gca()
    fps, tps, thresh = _binary_clf_curve(Ytest, Yprobs)
    # Fraction of the maximum profit: 1.0 corresponds to catching every
    # positive (tps == sum(Ytest)) with no false positives.
    profit = (9.85 * tps - 0.15 * fps) / (sum(Ytest) * 9.85)
    plt.style.use('seaborn-whitegrid')
    step_kwargs = {'step': 'post'}
    ax.fill_between(thresh, profit, alpha=0.4, color='skyblue', **step_kwargs)
    # Mark the threshold that maximizes the profit
    ix = np.nanargmax(profit)
    bestThr = thresh[ix]
    ax.scatter(thresh[ix], profit[ix], marker='o', color='red', label='Best')
    ax.set_xlabel('Threshold', fontsize=18)
    ax.set_ylabel('Incremental profit (percentage of max)', fontsize=18)
    ax.set_ylim([0, 1])
    ax.set_xlim([0, 1])
    ax.xaxis.set_tick_params(labelsize=16)
    ax.yaxis.set_tick_params(labelsize=16)
    ax.legend(fontsize=14)
    if not ylabel:
        ax.set_ylabel('')
    if not xlabel:
        ax.set_xlabel('')
    ax.set_aspect('equal')
    profit = pd.DataFrame({
        'profit': profit[:-1],
        'threshold': thresh[:-1]
    }).transpose()
    return profit, bestThr
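The constants in the profit expression appear to encode a fixed payoff per true positive (9.85) and a fixed cost per false positive (0.15), normalized by the best attainable profit; a quick arithmetic check with made-up counts:

# Made-up counts purely to illustrate the normalization used above.
tps, fps, n_pos = 80, 40, 100
profit_fraction = (9.85 * tps - 0.15 * fps) / (n_pos * 9.85)
print(round(profit_fraction, 4))  # 0.7939, i.e. ~79.4% of the best achievable profit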
def prec_rec(y_true, y_pred, method, alpha=100, plot=False):
    """
    Calculates the weighted precision metric at recall levels 0.1, 0.5 and 0.9
    as proposed in: The Deepfake Detection Challenge (DFDC) Preview Dataset
    (https://arxiv.org/abs/1910.08854)

    Parts from sklearn.metrics precision_recall_curve adapted by: Christopher Otto

    alpha = 100 as suggested in the paper.
    """
    fps, tps, thresholds = _ranking._binary_clf_curve(
        y_true, y_pred, pos_label=None, sample_weight=None)
    weighted_precision = tps / (tps + alpha * fps)
    weighted_precision[np.isnan(weighted_precision)] = 0
    # take the log of the weighted precision, as in the DFDC preview paper
    # (https://arxiv.org/abs/1910.08854)
    weighted_precision = [
        math.log(entry) if entry > 0 else 0 for entry in weighted_precision
    ]
    recall = tps / tps[-1]

    # stop when full recall is attained
    # and reverse the outputs so recall is decreasing
    last_ind = tps.searchsorted(tps[-1])
    sl = slice(last_ind, None, -1)
    prec, rec, thresh = (np.r_[weighted_precision[sl], 1],
                         np.r_[recall[sl], 0],
                         thresholds[sl])

    # first precision entry at recall level 0.9
    threshold_index_point_nine = len([entry for entry in rec if entry >= 0.9]) - 1
    weighted_precision_at_point_nine_rec = prec[threshold_index_point_nine]
    # first precision entry at recall level 0.5
    threshold_index_point_five = len([entry for entry in rec if entry >= 0.5]) - 1
    weighted_precision_at_point_five_rec = prec[threshold_index_point_five]
    # first precision entry at recall level 0.1
    threshold_index_point_one = len([entry for entry in rec if entry >= 0.1]) - 1
    weighted_precision_at_point_one_rec = prec[threshold_index_point_one]

    if plot:
        average_precision = average_precision_score(y_true, y_pred)
        viz = precision_recall_curve.PrecisionRecallDisplay(
            precision=prec, recall=rec,
            average_precision=average_precision,
            estimator_name=f"{method}")
        disp = viz.plot(ax=None, name=f"Method: {method}")
        disp.ax_.set_title('Weighted Precision-Recall curve')
        # PrecisionRecallDisplay puts recall on the x-axis
        plt.xlabel('Recall')
        plt.ylabel('Weighted Precision (Cost)')
        plt.savefig('w_prec_recall_curve.png')
        plt.show()

    return (weighted_precision_at_point_one_rec,
            weighted_precision_at_point_five_rec,
            weighted_precision_at_point_nine_rec)
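A minimal smoke test for prec_rec, assuming the module-level imports the function relies on (numpy as np, math, and sklearn.metrics._ranking as _ranking) are in place; the labels and scores are made up for illustration:

import numpy as np

y_true = np.array([0, 0, 1, 1, 0, 1, 0, 1])
y_pred = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.7, 0.6, 0.9])

# With alpha=100 every false positive is penalized heavily, so the
# (log-)weighted precision falls off quickly as recall grows.
wp_01, wp_05, wp_09 = prec_rec(y_true, y_pred, method='demo', alpha=100, plot=False)
print(wp_01, wp_05, wp_09)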
def specificity_recall_calculator(y_true, probas_pred, pos_label=None, sample_weight=None):
    fps, tps, thresholds = _binary_clf_curve(y_true, probas_pred,
                                             pos_label=pos_label,
                                             sample_weight=sample_weight)
    specificity = (fps[-1] - fps) / fps[-1]
    specificity[np.isnan(specificity)] = 0
    recall = tps / tps[-1]

    # stop when full recall attained
    # and reverse the outputs so recall is decreasing
    last_ind = tps.searchsorted(tps[-1])
    sl = slice(last_ind, None, -1)
    return np.r_[specificity[sl], 1], np.r_[recall[sl], 0], thresholds[sl]
def find(self, clf, x_train, y_train):
    proba = clf.predict_proba(x_train)
    assert isinstance(proba, np.ndarray), \
        "classifier should return numpy array"
    assert proba.shape == (x_train.shape[0], 2), \
        "classifier should return (%d,%d)-shaped array, not %s" % (
            x_train.shape[0], 2, str(proba.shape))
    fps, tps, thresholds = _binary_clf_curve(y_train, proba[:, 1])
    precision = tps / (tps + fps)
    # Walk from the lowest threshold (highest recall) toward higher thresholds
    # and return the first threshold whose precision meets the requirement.
    for k in reversed(range(len(precision))):
        if precision[k] >= self.min_precision:
            return thresholds[k]
    # No threshold is precise enough; return a value above any probability
    # so that nothing is classified as positive.
    return 2.0
def precision_curves(y_true, probas_pred, *, pos_label=None, sample_weight=None):
    """
    Minor adaptation of the corresponding scikit-learn function
    """
    fps, tps, thresholds = _binary_clf_curve(y_true, probas_pred,
                                             pos_label=pos_label,
                                             sample_weight=sample_weight)
    precision = tps / (tps + fps)
    precision[np.isnan(precision)] = 0
    recall = tps / tps[-1]
    specificity = 1 - fps / fps[-1]
    last_ind = tps.searchsorted(tps[-1])
    sl = slice(last_ind, None, -1)
    return np.r_[precision[sl], 1], np.r_[recall[sl], 0], np.r_[specificity[sl], 1]
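For context, a small check of precision_curves against scikit-learn's public precision_recall_curve (illustrative data; assumes numpy/scikit-learn are installed and _binary_clf_curve is importable as in the snippets above):

import numpy as np
from sklearn.metrics import precision_recall_curve

y_true = np.array([0, 1, 1, 0, 1])
scores = np.array([0.2, 0.8, 0.6, 0.4, 0.9])

prec, rec, spec = precision_curves(y_true, scores)
skl_prec, skl_rec, _ = precision_recall_curve(y_true, scores)

# Precision and recall follow sklearn's construction; specificity is the
# extra output, with a 1 appended to mirror the appended precision entry.
print(np.allclose(prec, skl_prec), np.allclose(rec, skl_rec))
print(spec)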
def _truncated_roc(y_df, bg_idx=-1, fp_cutoff=None):
    """ Computes truncated ROC info """
    import sklearn
    try:
        from sklearn.metrics._ranking import _binary_clf_curve
    except ImportError:
        from sklearn.metrics.ranking import _binary_clf_curve
    y_true = (y_df['true'] == y_df['pred'])
    y_score = y_df['score']
    sample_weight = y_df['weight']

    # y_true[y_true == -1] = 0

    # < TRUNCATED PART >
    # GET ROC CURVES AT A PARTICULAR FALSE POSITIVE COUNT CUTOFF
    # This will let different runs be more comparable
    realpos_total = sample_weight[(y_df['txs'] >= 0)].sum()

    fp_count, tp_count, count_thresholds = _binary_clf_curve(
        y_true, y_score, pos_label=1, sample_weight=sample_weight)

    if len(count_thresholds) > 0 and count_thresholds[-1] == 0:
        # Chop off the last entry where it will jump
        count_thresholds = count_thresholds[:-1]
        tp_count = tp_count[:-1]
        fp_count = fp_count[:-1]

    # Cut off the curves at a comparable point
    if fp_cutoff is None:
        fp_cutoff = np.inf
    idxs = np.where(fp_count > fp_cutoff)[0]
    if len(idxs) == 0:
        idx = len(fp_count)
    else:
        idx = idxs[0]

    trunc_fp_count = fp_count[:idx]
    trunc_tp_count = tp_count[:idx]
    trunc_thresholds = count_thresholds[:idx]

    # If the cutoff was not reached, horizontally extend the curve.
    # This will hurt the scores (i.e. we may be biased against small
    # scenes), but it ensures that big scenes are comparable
    if len(fp_count) == 0:
        trunc_fp_count = np.array([fp_cutoff])
        trunc_tp_count = np.array([0])
        trunc_thresholds = np.array([0])
        # THIS WILL CAUSE AUC TO RAISE AN ERROR IF IT GETS HIT
    elif fp_count[-1] < fp_cutoff and np.isfinite(fp_cutoff):
        trunc_fp_count = np.hstack([trunc_fp_count, [fp_cutoff]])
        trunc_tp_count = np.hstack([trunc_tp_count, [trunc_tp_count[-1]]])
        trunc_thresholds = np.hstack([trunc_thresholds, [0]])

    falsepos_total = trunc_fp_count[-1]  # is this right?

    trunc_tpr = trunc_tp_count / realpos_total
    trunc_fpr = trunc_fp_count / falsepos_total
    trunc_auc = sklearn.metrics.auc(trunc_fpr, trunc_tpr)
    # < /TRUNCATED PART >
    roc_info = {
        'fp_cutoff': fp_cutoff,
        'realpos_total': realpos_total,
        'tpr': trunc_tpr,
        'fpr': trunc_fpr,
        'fp_count': trunc_fp_count,
        'tp_count': trunc_tp_count,
        'thresholds': trunc_thresholds,
        'auc': trunc_auc,
    }
    return roc_info
def roc(self, fp_cutoff=None, stabalize_thresh=7, stabalize_pad=7):
    """
    Example:
        >>> self = BinaryConfusionVectors.demo(n=0)
        >>> print('roc = {}'.format(ub.repr2(self.roc())))
        >>> self = BinaryConfusionVectors.demo(n=1, p_true=0.5, p_error=0.5)
        >>> print('roc = {}'.format(ub.repr2(self.roc())))
        >>> self = BinaryConfusionVectors.demo(n=3, p_true=0.5, p_error=0.5)
        >>> print('roc = {}'.format(ub.repr2(self.roc())))
    """
    import sklearn
    import sklearn.metrics  # NOQA
    try:
        from sklearn.metrics._ranking import _binary_clf_curve
    except ImportError:
        from sklearn.metrics.ranking import _binary_clf_curve
    data = self.data
    y_true = data['is_true'].astype(np.uint8)
    y_score = data['pred_score']
    sample_weight = data._data.get('weight', None)

    npad = 0
    if len(self) > 0:
        if len(self) <= stabalize_thresh:
            # add dummy data to stabilize the computation
            if sample_weight is None:
                sample_weight = np.ones(len(self))
            npad = stabalize_pad
            y_true, y_score, sample_weight = _stabalilze_data(
                y_true, y_score, sample_weight, npad=npad)

    if sample_weight is None:
        weight = 1
        nsupport = len(y_true) - bool(npad)
    else:
        weight = sample_weight
        nsupport = sample_weight.sum() - bool(npad)

    # y_true[y_true == -1] = 0

    # < TRUNCATED PART >
    # GET ROC CURVES AT A PARTICULAR FALSE POSITIVE COUNT CUTOFF
    # This will let different runs be more comparable

    # Get the total weight (typically number of) positive and negative
    # examples of this class
    realpos_total = (y_true * weight).sum()
    realneg_total = ((1 - y_true) * weight).sum()

    if len(self) == 0:
        fp_count = np.array([np.nan])
        tp_count = np.array([np.nan])
        count_thresholds = np.array([np.nan])
    else:
        fp_count, tp_count, count_thresholds = _binary_clf_curve(
            y_true, y_score, pos_label=1, sample_weight=sample_weight)

    if len(count_thresholds) > 0 and count_thresholds[-1] == 0:
        # Chop off the last entry where it will jump
        count_thresholds = count_thresholds[:-1]
        tp_count = tp_count[:-1]
        fp_count = fp_count[:-1]

    # Cut off the curves at a comparable point
    if fp_cutoff is None:
        fp_cutoff = np.inf
    elif isinstance(fp_cutoff, str):
        if fp_cutoff == 'num_true':
            fp_cutoff = int(np.ceil(realpos_total))
        else:
            raise KeyError(fp_cutoff)

    idxs = np.where(fp_count > fp_cutoff)[0]
    if len(idxs) == 0:
        idx = len(fp_count)
    else:
        idx = idxs[0]

    trunc_fp_count = fp_count[:idx]
    trunc_tp_count = tp_count[:idx]
    trunc_thresholds = count_thresholds[:idx]

    # If the cutoff was not reached, horizontally extend the curve.
    # This will hurt the scores (i.e. we may be biased against small
    # scenes), but it ensures that big scenes are comparable
    if len(fp_count) == 0:
        trunc_fp_count = np.array([fp_cutoff])
        trunc_tp_count = np.array([0])
        trunc_thresholds = np.array([0])
        # THIS WILL CAUSE AUC TO RAISE AN ERROR IF IT GETS HIT
    elif fp_count[-1] < fp_cutoff and np.isfinite(fp_cutoff):
        trunc_fp_count = np.hstack([trunc_fp_count, [fp_cutoff]])
        trunc_tp_count = np.hstack([trunc_tp_count, [trunc_tp_count[-1]]])
        trunc_thresholds = np.hstack([trunc_thresholds, [0]])

    falsepos_total = trunc_fp_count[-1]

    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', message='invalid .* true_divide')
        trunc_tpr = trunc_tp_count / realpos_total
        trunc_fpr = trunc_fp_count / falsepos_total
        try:
            trunc_auc = sklearn.metrics.auc(trunc_fpr, trunc_tpr)
        except ValueError:
            # At least 2 points are needed to compute area under curve,
            # but x.shape = 1
            trunc_auc = np.nan
    # < /TRUNCATED PART >

    roc_info = {
        'fp_cutoff': fp_cutoff,
        'realpos_total': realpos_total,
        'realneg_total': realneg_total,
        'nsupport': nsupport,
        'tpr': trunc_tpr,
        'fpr': trunc_fpr,
        'fp_count': trunc_fp_count,
        'tp_count': trunc_tp_count,
        'thresholds': trunc_thresholds,
        'auc': trunc_auc,
    }
    if self.cx is not None:
        roc_info.update({
            'cx': self.cx,
            'node': self.classes[self.cx],
        })
    return ROC_Result(roc_info)
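The truncation logic above is easier to follow on bare arrays. A standalone sketch (illustrative names, not part of the class) of cutting a curve at a false-positive count cutoff and extending it horizontally when the cutoff is never reached:

import numpy as np

def truncate_at_fp_cutoff(fp_count, tp_count, thresholds, fp_cutoff):
    """Illustrative restatement of the truncation step used in roc() above."""
    # Keep the prefix of the curve where the FP count stays within the cutoff,
    # so runs with different numbers of negatives share the same FP budget.
    idxs = np.where(fp_count > fp_cutoff)[0]
    idx = len(fp_count) if len(idxs) == 0 else idxs[0]
    fp_t, tp_t, th_t = fp_count[:idx], tp_count[:idx], thresholds[:idx]
    # If the cutoff was never reached, extend the curve horizontally so the
    # area is still computed over the full FP budget.
    if len(fp_t) and fp_t[-1] < fp_cutoff and np.isfinite(fp_cutoff):
        fp_t = np.r_[fp_t, fp_cutoff]
        tp_t = np.r_[tp_t, tp_t[-1]]
        th_t = np.r_[th_t, 0]
    return fp_t, tp_t, th_t

# Toy counts: the point with 5 false positives is dropped by the cutoff of 3,
# and the curve is then extended out to fp_count == 3.
fp = np.array([0, 1, 2, 5])
tp = np.array([1, 2, 3, 4])
th = np.array([0.9, 0.7, 0.5, 0.2])
print(truncate_at_fp_cutoff(fp, tp, th, fp_cutoff=3))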
def precision_recall(self, stabalize_thresh=7, stabalize_pad=7, method='sklearn'):
    """
    Example:
        >>> self = BinaryConfusionVectors.demo(n=11)
        >>> print('precision_recall = {}'.format(ub.repr2(self.precision_recall())))
        >>> self = BinaryConfusionVectors.demo(n=7)
        >>> print('precision_recall = {}'.format(ub.repr2(self.precision_recall())))
        >>> self = BinaryConfusionVectors.demo(n=5)
        >>> print('precision_recall = {}'.format(ub.repr2(self.precision_recall())))
        >>> self = BinaryConfusionVectors.demo(n=3)
        >>> print('precision_recall = {}'.format(ub.repr2(self.precision_recall())))
        >>> self = BinaryConfusionVectors.demo(n=2)
        >>> print('precision_recall = {}'.format(ub.repr2(self.precision_recall())))
        >>> self = BinaryConfusionVectors.demo(n=1)
        >>> print('precision_recall = {}'.format(ub.repr2(self.precision_recall())))
        >>> self = BinaryConfusionVectors.demo(n=0)
        >>> print('precision_recall = {}'.format(ub.repr2(self.precision_recall())))
        >>> self = BinaryConfusionVectors.demo(n=1, p_true=0.5, p_error=0.5)
        >>> print('precision_recall = {}'.format(ub.repr2(self.precision_recall())))
        >>> self = BinaryConfusionVectors.demo(n=3, p_true=0.5, p_error=0.5)
        >>> print('precision_recall = {}'.format(ub.repr2(self.precision_recall())))
    """
    import sklearn
    import sklearn.metrics  # NOQA
    try:
        from sklearn.metrics._ranking import _binary_clf_curve
    except ImportError:
        from sklearn.metrics.ranking import _binary_clf_curve
    data = self.data
    y_true = data['is_true'].astype(np.uint8)
    y_score = data['pred_score']
    sample_weight = data._data.get('weight', None)

    npad = 0
    if len(self) == 0:
        ap = np.nan
        prec = [np.nan]
        rec = [np.nan]
        fps = [np.nan]
        fns = [np.nan]
        tps = [np.nan]
        thresholds = [np.nan]
        realpos_total = 0
        realneg_total = 0
        nsupport = 0
    else:
        if len(self) <= stabalize_thresh:
            # add dummy data to stabilize the computation
            if sample_weight is None:
                sample_weight = np.ones(len(self))
            npad = stabalize_pad
            y_true, y_score, sample_weight = _stabalilze_data(
                y_true, y_score, sample_weight, npad=npad)

        # Get the total weight (typically number of) positive and negative
        # examples of this class
        if sample_weight is None:
            weight = 1
            nsupport = len(y_true) - bool(npad)
        else:
            weight = sample_weight
            nsupport = sample_weight.sum() - bool(npad)

        realpos_total = (y_true * weight).sum()
        realneg_total = ((1 - y_true) * weight).sum()

        """
        Notes:
            Apparently, consistent scoring is really hard to get right.

            For detection problems, scoring via confusion_vectors + sklearn
            produces noticeably different results than the VOC method. There
            are a few reasons for this. The VOC method stops counting true
            positives after all assigned predicted boxes have been counted.
            It simply remembers the number of original true positives to
            normalize the true positive rate. On the other hand, confusion
            vectors maintain a list of these unassigned true boxes, giving
            them a predicted index of -1 and a score of zero.

            This means that this function sees them as having a y_true of 1
            and a y_score of 0, which allows the scikit-learn fps and tps
            counts to effectively reach 100% recall when the threshold is
            zero. The VOC method simply ignores these and handles them
            implicitly. The problem is that if you remove these from the
            scikit-learn inputs, it won't see the correct number of positives
            and it will incorrectly normalize the recall.

            In summary:

                VOC:
                    * remembers realpos_total
                    * doesn't count unassigned truths as TP when the
                      threshold is zero.

                CV+SKL:
                    * counts unassigned truths as TP with score=0.
                    * Always ensure tpr=1, ppv=0 and ppv=1, tpr=0 cases exist.
        """
        with warnings.catch_warnings():
            warnings.filterwarnings('ignore', message='invalid .* true_divide')

            if method.startswith('voc'):
                y_score_ = y_score[y_score > 0]
                y_true_ = y_true[y_score > 0]
                fps, tps, _thresholds = _binary_clf_curve(
                    y_true_, y_score_, pos_label=1.0,
                    sample_weight=sample_weight)
            elif method == 'sklearn':
                fps, tps, _thresholds = _binary_clf_curve(
                    y_true, y_score, pos_label=1.0,
                    sample_weight=sample_weight)
            else:
                raise KeyError(method)

            # Slight tweak to sklearn.metrics.precision_recall_curve
            fns = realpos_total - tps
            precision = tps / (tps + fps)
            precision[np.isnan(precision)] = 0
            recall = tps / realpos_total

            # stop when full recall attained
            # and reverse the outputs so recall is decreasing
            last_ind = tps.searchsorted(tps[-1])
            sl = slice(last_ind, None, -1)
            prec, rec, thresholds = (np.r_[precision[sl], 1],
                                     np.r_[recall[sl], 0],
                                     _thresholds[sl])

        if method.startswith('voc'):
            from netharn.metrics.voc_metrics import _voc_ave_precision
            ap = _voc_ave_precision(rec[::-1], prec[::-1], method=method)
        elif method == 'sklearn':
            ap = sklearn.metrics.average_precision_score(
                y_score=y_score, y_true=y_true,
                sample_weight=sample_weight)

    prs_info = {
        'ap': ap,
        'ppv': prec,   # (positive predictive value) == (precision)
        'tpr': rec,    # (true positive rate) == (recall)
        'fp_count': fps,
        'tp_count': tps,
        'fn_count': fns,
        'thresholds': thresholds,
        'nsupport': nsupport,
        'realpos_total': realpos_total,
        'realneg_total': realneg_total,
    }
    if self.cx is not None:
        prs_info.update({
            'cx': self.cx,
            'node': self.classes[self.cx],
        })
    return PR_Result(prs_info)
def _binary_clf_curves(self, stabalize_thresh=7, stabalize_pad=7):
    """
    Code common to ROC, PR, and threshold measures

    TODO: refactor ROC and PR curves to use this code, perhaps even
    memoizing it.
    """
    try:
        from sklearn.metrics._ranking import _binary_clf_curve
    except ImportError:
        from sklearn.metrics.ranking import _binary_clf_curve
    data = self.data
    y_true = data['is_true'].astype(np.uint8)
    y_score = data['pred_score']
    sample_weight = data._data.get('weight', None)

    npad = 0
    if len(self) == 0:
        fps = [np.nan]
        tns = [np.nan]  # added so the 'tn_count' entry below is defined
        fns = [np.nan]
        tps = [np.nan]
        thresholds = [np.nan]
        realpos_total = 0
        realneg_total = 0
        nsupport = 0
    else:
        if len(self) <= stabalize_thresh:
            # add dummy data to stabilize the computation
            if sample_weight is None:
                sample_weight = np.ones(len(self))
            npad = stabalize_pad
            y_true, y_score, sample_weight = _stabalilze_data(
                y_true, y_score, sample_weight, npad=npad)

        # Get the total weight (typically number of) positive and negative
        # examples of this class
        if sample_weight is None:
            weight = 1
            nsupport = len(y_true) - bool(npad)
        else:
            weight = sample_weight
            nsupport = sample_weight.sum() - bool(npad)

        realpos_total = (y_true * weight).sum()
        realneg_total = ((1 - y_true) * weight).sum()

        fps, tps, thresholds = _binary_clf_curve(
            y_true, y_score, pos_label=1.0, sample_weight=sample_weight)

        # Adjust weighted totals to be robust to floating point errors
        if np.isclose(realneg_total, fps[-1]):
            realneg_total = max(realneg_total, fps[-1])
        if np.isclose(realpos_total, tps[-1]):
            realpos_total = max(realpos_total, tps[-1])

        tns = realneg_total - fps
        fns = realpos_total - tps

    info = {
        'fp_count': fps,
        'tp_count': tps,
        'tn_count': tns,
        'fn_count': fns,
        'thresholds': thresholds,
        'realpos_total': realpos_total,
        'realneg_total': realneg_total,
        'nsupport': nsupport,
    }
    if self.cx is not None:
        info.update({
            'cx': self.cx,
            'node': self.classes[self.cx],
        })
    return info
def roc_curve(y_true, y_score, pos_label=None, sample_weight=None,
              drop_intermediate=True):
    """Compute Receiver operating characteristic (ROC)

    Note: this implementation is restricted to the binary classification task.

    Read more in the :ref:`User Guide <roc_metrics>`.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        True binary labels. If labels are not either {-1, 1} or {0, 1}, then
        pos_label should be explicitly given.

    y_score : array, shape = [n_samples]
        Target scores, can either be probability estimates of the positive
        class, confidence values, or non-thresholded measure of decisions
        (as returned by "decision_function" on some classifiers).

    pos_label : int or str, default=None
        The label of the positive class.
        When ``pos_label=None``, if y_true is in {-1, 1} or {0, 1},
        ``pos_label`` is set to 1, otherwise an error will be raised.

    sample_weight : array-like of shape (n_samples,), default=None
        Sample weights.

    drop_intermediate : boolean, optional (default=True)
        Whether to drop some suboptimal thresholds which would not appear
        on a plotted ROC curve. This is useful in order to create lighter
        ROC curves.

        .. versionadded:: 0.17
           parameter *drop_intermediate*.

    Returns
    -------
    fpr : array, shape = [>2]
        Increasing false positive rates such that element i is the false
        positive rate of predictions with score >= thresholds[i].

    tpr : array, shape = [>2]
        Increasing true positive rates such that element i is the true
        positive rate of predictions with score >= thresholds[i].

    thresholds : array, shape = [n_thresholds]
        Decreasing thresholds on the decision function used to compute
        fpr and tpr. `thresholds[0]` represents no instances being predicted
        and is arbitrarily set to `max(y_score) * 1.01`.

    See also
    --------
    roc_auc_score : Compute the area under the ROC curve

    Notes
    -----
    Since the thresholds are sorted from low to high values, they
    are reversed upon returning them to ensure they correspond to both ``fpr``
    and ``tpr``, which are sorted in reversed order during their calculation.

    References
    ----------
    .. [1] `Wikipedia entry for the Receiver operating characteristic
            <https://en.wikipedia.org/wiki/Receiver_operating_characteristic>`_

    .. [2] Fawcett T. An introduction to ROC analysis[J]. Pattern Recognition
           Letters, 2006, 27(8):861-874.

    Examples
    --------
    >>> import numpy as np
    >>> from sklearn import metrics
    >>> y = np.array([1, 1, 2, 2])
    >>> scores = np.array([0.1, 0.4, 0.35, 0.8])
    >>> fpr, tpr, thresholds = metrics.roc_curve(y, scores, pos_label=2)
    >>> fpr
    array([0. , 0. , 0.5, 0.5, 1. ])
    >>> tpr
    array([0. , 0.5, 0.5, 1. , 1. ])
    >>> thresholds
    array([1.8 , 0.8 , 0.4 , 0.35, 0.1 ])
    """
    fps, tps, thresholds = _binary_clf_curve(
        y_true, y_score, pos_label=pos_label, sample_weight=sample_weight)

    # Attempt to drop thresholds corresponding to points in between and
    # collinear with other points. These are always suboptimal and do not
    # appear on a plotted ROC curve (and thus do not affect the AUC).
    # Here np.diff(_, 2) is used as a "second derivative" to tell if there
    # is a corner at the point. Both fps and tps must be tested to handle
    # thresholds with multiple data points (which are combined in
    # _binary_clf_curve). This keeps all cases where the point should be kept,
    # but does not drop more complicated cases like fps = [1, 3, 7],
    # tps = [1, 2, 4]; there is no harm in keeping too many thresholds.
    if drop_intermediate and len(fps) > 2:
        optimal_idxs = np.where(np.r_[True,
                                      np.logical_or(np.diff(fps, 2),
                                                    np.diff(tps, 2)),
                                      True])[0]
        fps = fps[optimal_idxs]
        tps = tps[optimal_idxs]
        thresholds = thresholds[optimal_idxs]

    # Add an extra threshold position
    # to make sure that the curve starts at (0, 0)
    tps = np.r_[0, tps]
    fps = np.r_[0, fps]
    thresholds = np.r_[thresholds[0] * 1.01, thresholds]

    if fps[-1] <= 0:
        warnings.warn("No negative samples in y_true, "
                      "false positive value should be meaningless",
                      UndefinedMetricWarning)
        fpr = np.repeat(np.nan, fps.shape)
    else:
        fpr = fps / fps[-1]

    if tps[-1] <= 0:
        warnings.warn("No positive samples in y_true, "
                      "true positive value should be meaningless",
                      UndefinedMetricWarning)
        tpr = np.repeat(np.nan, tps.shape)
    else:
        tpr = tps / tps[-1]

    return fpr, tpr, thresholds
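To see what drop_intermediate removes, a small comparison using only numpy and the roc_curve above (illustrative scores; assumes the module-level names the function needs, e.g. warnings and UndefinedMetricWarning, are present):

import numpy as np

y = np.array([0, 0, 0, 0, 1, 1, 1, 1])
s = np.array([0.1, 0.2, 0.3, 0.4, 0.6, 0.7, 0.8, 0.9])

fpr_full, tpr_full, thr_full = roc_curve(y, s, drop_intermediate=False)
fpr_drop, tpr_drop, thr_drop = roc_curve(y, s, drop_intermediate=True)

# Collinear interior points are dropped; the endpoints and every corner of
# the ROC staircase are kept, so the plotted curve and the AUC are unchanged.
print(len(thr_full), len(thr_drop))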