import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.calibration import calibration_curve
from sklearn.metrics import (roc_curve, auc, brier_score_loss, average_precision_score,
                             confusion_matrix, precision_recall_curve)

# `selection`, `flatten` and `aucCompute` are helpers assumed to be defined
# elsewhere in this module / package.


def histPlot(predictions, truth, classes=None, label="Model", newFigure=None, splitPosNeg=False, kde=False):
    """
        Computes the histogram of binary predictions

        Arguments:
            predictions {Dict / List} -- Label predictions
            truth {Dict / List} -- Ground truth
            classes {Dict "+":int, "-":int} -- Classes to consider to plot
                (default: {None} ie {"+": 1, "-": 0})

        Keyword Arguments:
            label {str} -- Legend to plot (default: {"Model"})
            newFigure {str} -- Display on a given figure (default: {None} - Create new figure)
            splitPosNeg {bool} -- Split between positive and negative (default: {False})
            kde {bool} -- Computes the kde of the histogram (default: {False})
    """
    predictions, truth = selection(predictions, truth, classes)
    predictions, truth = flatten(predictions, truth)
    bins = np.linspace(0, 1, 20)

    if newFigure is not None:
        plt.figure(newFigure)
    else:
        plt.xlabel('Predicted Probability')
        plt.ylabel('Frequency')
        plt.title('Histogram Probabilities')

    if splitPosNeg:
        sns.distplot(predictions[truth == 1], label=label + " Positive", kde=kde, bins=bins)
        sns.distplot(predictions[truth == 0], label=label + " Negative", kde=kde, bins=bins)
    else:
        sns.distplot(predictions, label=label, kde=kde, bins=bins)
def computeEvolutionRoc(temporalListLabels, predictions, classes=None, percentage=0.001):
    """
        Computes the evolution of the AUC over time

        Arguments:
            temporalListLabels {List of (time, labels)*} -- Ground truth labels at each time
            predictions {Dict / List of labels} -- Predictions (same format as labels in temporalListLabels)
            classes {Dict} -- Classes to consider to plot (key: name to display, value: label)
            percentage {float} -- Evaluate the TPR and TNR at this given value of FPR and FNR

        Returns:
            pd.DataFrame -- AUC and TPR / TNR (with Wilson confidence bounds) indexed by time
    """
    aucs = {}
    for time, labels in temporalListLabels:
        pred_time, labels_time = selection(predictions, labels, classes)
        pred_time, labels_time = flatten(pred_time, labels_time)
        fpr, tpr, _ = roc_curve(labels_time, pred_time)
        fnr, tnr = (1 - tpr)[::-1], (1 - fpr)[::-1]
        auc_time = auc(fpr, tpr)

        # Half-width of the 95% confidence interval (normal approximation)
        wilson_tpr = 1.96 * np.sqrt(tpr * (1 - tpr) / len(predictions))
        wilson_tnr = 1.96 * np.sqrt(tnr * (1 - tnr) / len(predictions))

        aucs[time] = {
            "auc": auc_time,
            "lower": auc(fpr, tpr - wilson_tpr),
            "upper": auc(fpr, tpr + wilson_tpr),
            "tpr": np.interp(percentage, fpr, tpr),
            "tpr_wilson": np.interp(percentage, fpr, wilson_tpr),
            "tnr": np.interp(percentage, fnr, tnr),
            "tnr_wilson": np.interp(percentage, fnr, wilson_tnr),
        }

    return pd.DataFrame.from_dict(aucs, orient="index")
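# Illustrative sketch (not part of the original module): plotting the DataFrame
# returned by computeEvolutionRoc with its confidence band. The column names
# ("auc", "lower", "upper") match the dictionary built above; numeric time
# points are an assumption.
def _example_plot_auc_evolution(temporalListLabels, predictions):
    evolution = computeEvolutionRoc(temporalListLabels, predictions)
    plt.plot(evolution.index, evolution["auc"], label="AUC")
    plt.fill_between(evolution.index, evolution["lower"], evolution["upper"], alpha=.2)
    plt.xlabel("Time")
    plt.ylabel("AUC")
    plt.legend()
    plt.show()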
def rocPlot(predictions, truth, classes=None, label="Model", newFigure=None, reverse=False, percentage=None):
    """
        Computes the ROC curve with confidence bounds for the given model

        Arguments:
            predictions {Dict / List} -- Label predictions
            truth {Dict / List} -- Ground truth
            classes {Dict "+":int, "-":int} -- Classes to consider to plot
                (default: {None} ie {"+": 1, "-": 0})

        Keyword Arguments:
            label {str} -- Legend to plot (default: {"Model"})
            newFigure {str} -- Display on a given figure (default: {None} - Create new figure)
            reverse {bool} -- Plot the reverse ROC, useful for analyzing TNR (default: {False})
            percentage {float} -- FPR (or FNR if reverse) at which to report the TPR (or TNR)
                (default: {None} - Smallest observable value)
    """
    predictions, truth = selection(predictions, truth, classes)
    predictions, truth = flatten(predictions, truth)
    global_fpr, global_tpr, _ = roc_curve(truth, predictions)

    if reverse:
        x, y = 1 - global_tpr, 1 - global_fpr  # FNR, TNR
        x, y = x[::-1], y[::-1]
        minx = 1. / np.sum(truth == 1)
        if percentage is None:
            percentage = minx
        str_print = "TNR @{:.2f}% FNR : {:.2f}".format(percentage * 100, np.interp(percentage, x, y))
    else:
        x, y = global_fpr, global_tpr
        minx = 1. / np.sum(truth == 0)
        if percentage is None:
            percentage = minx
        str_print = "TPR @{:.2f}% FPR : {:.2f}".format(percentage * 100, np.interp(percentage, x, y))

    if newFigure is not None:
        plt.figure(newFigure)
    else:
        plt.plot(np.linspace(0, 1, 100), np.linspace(0, 1, 100), 'k--', label="Random")
        if reverse:
            plt.xlabel('False negative rate')
            plt.ylabel('True negative rate')
            plt.title('Reverse ROC curve')
        else:
            plt.xlabel('False positive rate')
            plt.ylabel('True positive rate')
            plt.title('ROC curve')

    newx = np.linspace(minx, 1, 1000)
    y = np.interp(newx, x, y)

    # Half-width of the 95% confidence interval (normal approximation)
    wilson = 1.96 * np.sqrt(y * (1 - y) / len(predictions))
    print(str_print + " +/- {:.2f}".format(np.interp(0.01, newx, wilson)))
    upper = np.minimum(y + wilson, 1)
    lower = np.maximum(y - wilson, 0)

    plRoc = plt.plot(newx, y,
                     label=label + " ({:.2f} +/- {:.2f})".format(
                         aucCompute(predictions, truth, classes),
                         (auc(newx, upper) - auc(newx, lower)) / 2.),
                     ls='--' if "train" in label.lower() else '-')
    plt.fill_between(newx, lower, upper, color=plRoc[0].get_color(), alpha=.2)
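# Usage sketch (illustrative): overlaying train and test ROC curves on the same
# figure. The flat array / dict input format is an assumption consistent with the
# "{Dict / List}" docstrings; a label containing "train" is drawn dashed by
# rocPlot, which is why the legend labels below are chosen this way.
def _example_roc_comparison(train_scores, train_labels, test_scores, test_labels):
    plt.figure()
    rocPlot(train_scores, train_labels, label="Model train")
    rocPlot(test_scores, test_labels, label="Model test")
    plt.legend(loc="lower right")
    plt.show()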
def calibrationPlot(predictions, truth, classes=None, label="Model", newFigure=None, n_bins=5):
    """
        Computes the calibration curve for the given model

        Arguments:
            predictions {Dict / List} -- Label predictions
            truth {Dict / List} -- Ground truth
            classes {Dict "+":int, "-":int} -- Classes to consider to plot
                (default: {None} ie {"+": 1, "-": 0})

        Keyword Arguments:
            label {str} -- Legend to plot (default: {"Model"})
            newFigure {str} -- Display on a given figure (default: {None} - Create new figure)
            n_bins {int} -- Number of bins for the calibration (default: {5})
    """
    predictions, truth = selection(predictions, truth, classes)
    predictions, truth = flatten(predictions, truth)

    # Rescale predictions to [0, 1] before binning
    predictions = ((predictions - predictions.min()) / (predictions.max() - predictions.min())).flatten()
    fraction_of_positives, mean_predicted_value = calibration_curve(truth, predictions, n_bins=n_bins)

    # Marker size is proportional to the number of points falling in each bin
    bins = np.linspace(0., 1. + 1e-8, n_bins + 1)
    binids = np.digitize(predictions, bins) - 1
    bin_sums = np.bincount(binids, minlength=len(bins))
    bin_sums = bin_sums[bin_sums != 0] * 500 / np.sum(bin_sums)

    if newFigure is not None:
        plt.figure(newFigure)
    else:
        plt.xlabel('Mean Predicted Value')
        plt.ylabel('Fraction Positive')
        plt.title('Calibration')

    p = plt.plot(mean_predicted_value, fraction_of_positives, alpha=0.5, ls=':')
    plt.scatter(mean_predicted_value, fraction_of_positives, s=bin_sums,
                label=label + " ({:.2f})".format(brier_score_loss(truth, predictions)),
                color=p[0].get_color(), alpha=0.5)
def averagePrecisionRecallCompute(predictions, truth, classes=None):
    """
        Computes the average precision of the given predictions

        Arguments:
            predictions {Dict / List} -- Label predictions
            truth {Dict / List} -- Ground truth

        Keyword Arguments:
            classes {Dict "+":int, "-":int} -- Classes to consider to plot
                (default: {None} ie {"+": 1, "-": 0})

        Returns:
            float -- Estimation by pooling of the average precision
    """
    predictions, truth = selection(predictions, truth, classes)
    predictions, truth = flatten(predictions, truth)
    return average_precision_score(truth, predictions)
def confusionPlot(predictions, truth, classes, percentage=True):
    """
        Computes the confusion matrix of the given model

        Arguments:
            predictions {Dict / List} -- Label predictions
            truth {Dict / List} -- Ground truth
            classes {Dict "+":int, "-":int} -- Classes to consider to plot

        Keyword Arguments:
            percentage {bool} -- Normalize each row of the confusion matrix (default: {True})
    """
    predictions, truth = selection(predictions, truth, classes)
    predictions, truth = flatten(predictions, truth)

    classes_list = np.array(list(classes.keys()))
    confusion = confusion_matrix(truth, predictions, labels=[classes[c] for c in classes_list])

    # Hide columns of classes that are never predicted
    notNull = confusion.sum(axis=0) != 0
    if percentage:
        confusion = confusion / confusion.sum(axis=1, keepdims=True)

    sns.heatmap(confusion[:, notNull], xticklabels=classes_list[notNull], yticklabels=classes_list,
                annot=True, vmin=0, vmax=1 if percentage else None)
    plt.xlabel("Predicted")
    plt.ylabel("Ground truth")
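# Usage sketch (illustrative): `classes` maps the name to display on the axes to
# the label value, e.g. a binary problem encoded as 0 / 1. Hard class labels
# (rather than probabilities) are assumed here, as required by confusion_matrix.
def _example_confusion(predicted_labels, true_labels):
    confusionPlot(predicted_labels, true_labels, classes={"Negative": 0, "Positive": 1})
    plt.show()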
def precisionRecallPlot(predictions, truth, classes=None, label="Model", newFigure=None, reverse=False, percentage=None):
    """
        Computes the precision recall curve for the given model

        Arguments:
            predictions {Dict / List} -- Label predictions
            truth {Dict / List} -- Ground truth
            classes {Dict "+":int, "-":int} -- Classes to consider to plot
                (default: {None} ie {"+": 1, "-": 0})

        Keyword Arguments:
            label {str} -- Legend to plot (default: {"Model"})
            newFigure {str} -- Display on a given figure (default: {None} - Create new figure)
            reverse {bool} -- Unused in this plot (default: {False})
            percentage {float} -- Unused in this plot (default: {None})
    """
    predictions, truth = selection(predictions, truth, classes)
    predictions, truth = flatten(predictions, truth)
    precision, recall, _ = precision_recall_curve(truth, predictions)

    if newFigure is not None:
        plt.figure(newFigure)
    else:
        plt.xlabel('Precision')
        plt.ylabel('Recall')
        plt.title('Precision Recall curve')

    plt.plot(precision, recall,
             label=label + " ({:.2f})".format(averagePrecisionRecallCompute(predictions, truth, classes)),
             ls='--' if "train" in label.lower() else '-')
def cross_validation(model, data, labels, folds, classes=None, transform=None, proba=True, nested_cv=False):
    """
        Computes the cross validation on the data,
        given the folds indicated in folds

        Arguments:
            model {model} -- Model to cross validate
            data {Dict} -- Data to split
            labels {Dict} -- Labels of the data (keys have to match)
            folds {Dict: (key: fold, values: keys of data and labels)} -- Folds used to split the data
            transform {Transform Object} -- Transformation fitted on train and applied to train and test

        Keyword Arguments:
            classes {Dict} -- Classes to consider (default: {None})
            proba {bool} -- Predict probabilities instead of labels (default: {True})
            nested_cv {bool} -- Pass group information to the model for nested cross validation (default: {False})

        Returns:
            Tuple (predictions, labels) -- Predictions by the model on the cross validated data
                and the matching labels (Dicts with the same keys as data)
    """
    predictions, labels_res = {}, labels.copy()
    for k in folds:
        # Train on every key that is not held out in the current fold
        data_fold, labels_fold = selection(
            {d: data[d].copy() for d in data if d not in folds[k]},
            {d: labels[d].copy() for d in data if d not in folds[k]},
            classes)
        data_test = {d: data[d] for d in folds[k]}

        if transform is not None:
            data_fold = transform.fit_transform_dict(data_fold, labels_fold)
            data_test = transform.transform_dict(data_test)

        # Because normalization can impact labeling
        data_fold = {d: data_fold[d] for d in data_fold if len(data_fold[d]) > 0}
        labels_fold = {d: labels[d][data_fold[d].index] for d in data_fold if len(data_fold[d]) > 0}
        data_test = {d: data_test[d] for d in data_test if len(data_test[d]) > 0}
        labels_res.update({d: labels[d][data_test[d].index] for d in folds[k] if len(data_test[d]) > 0})

        if nested_cv:
            groups = [i for i in data_fold for _ in range(len(data_fold[i]))]
            model.fit_dict(data_fold, labels_fold, groups=groups)
        else:
            model.fit_dict(data_fold, labels_fold)

        if proba:
            predictions.update(model.predict_proba_dict(data_test))
        else:
            predictions.update(model.predict_dict(data_test))

    return predictions, labels_res
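# Usage sketch (illustrative, with assumptions): `folds` maps a fold identifier to
# the keys of `data` / `labels` held out for testing in that fold, and `model` is
# assumed to expose the dictionary interface used above
# (fit_dict / predict_proba_dict / predict_dict). The keys below are hypothetical.
def _example_cross_validation(model, data, labels):
    folds = {0: ["patient_a", "patient_b"],
             1: ["patient_c", "patient_d"]}  # Hypothetical keys of `data`
    predictions, matched_labels = cross_validation(model, data, labels, folds)
    rocPlot(predictions, matched_labels, label="Cross validated model")
    plt.legend()
    plt.show()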