import numpy as np
# `lrap` is sklearn's label_ranking_average_precision_score, as in the docstring below.
from sklearn.metrics import label_ranking_average_precision_score as lrap


def calculate_overall_lwlrap_sklearn(truth, scores):
    """Calculate the overall lwlrap using sklearn.metrics.lrap."""
    # sklearn doesn't correctly apply weighting to samples with no labels,
    # so just skip them.
    sample_weight = np.sum(truth > 0, axis=1)
    nonzero_weight_sample_indices = np.flatnonzero(sample_weight > 0)
    overall_lwlrap = lrap(
        truth[nonzero_weight_sample_indices, :] > 0,
        scores[nonzero_weight_sample_indices, :],
        sample_weight=sample_weight[nonzero_weight_sample_indices])
    return overall_lwlrap
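# Minimal usage sketch (the arrays below are assumed example inputs, not from the
# original source). truth is a binary label matrix and scores are model outputs of
# the same shape; the all-zero row is dropped before the weighted lrap call.
truth_demo = np.array([[1, 0, 1],
                       [0, 1, 0],
                       [0, 0, 0]])   # last sample has no positive labels
scores_demo = np.array([[0.9, 0.2, 0.6],
                        [0.1, 0.8, 0.3],
                        [0.4, 0.4, 0.4]])
print(calculate_overall_lwlrap_sklearn(truth_demo, scores_demo))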
def update(self, y_true, y_pred):
    # Accumulate the batch LRAP weighted by the batch size so that
    # self._lrap / self._n_items gives the running mean over all samples seen.
    self._lrap += lrap(y_true, y_pred) * y_pred.shape[0]
    self._n_items += y_pred.shape[0]
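# Hedged sketch of the accumulator that update() above appears to belong to: the
# class name, __init__ and result() are assumptions for illustration; only update()
# is in the original. It keeps an item-weighted running sum of per-batch LRAPs.
from sklearn.metrics import label_ranking_average_precision_score as lrap


class StreamingLRAP:
    def __init__(self):
        self._lrap = 0.0    # sum of per-batch LRAPs, weighted by batch size
        self._n_items = 0   # total number of samples seen so far

    def update(self, y_true, y_pred):
        self._lrap += lrap(y_true, y_pred) * y_pred.shape[0]
        self._n_items += y_pred.shape[0]

    def result(self):
        # Mean LRAP over everything seen so far; 0.0 before the first update.
        return self._lrap / self._n_items if self._n_items else 0.0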
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import (accuracy_score, precision_recall_fscore_support,
                             roc_auc_score, roc_curve)
from sklearn.metrics import label_ranking_average_precision_score as lrap


def get_model_stats(y_true, model_outputs, b_thres=.5, q_thres=.5, e_thres=.5,
                    plot_roc=True):
    '''
    Gets the performance statistics of a model based on its outputs and the
    ground truth.
    '''
    b_scores, q_scores, e_scores = (model_outputs[:, 0], model_outputs[:, 1],
                                    model_outputs[:, 2])
    b_true, q_true, e_true = y_true[:, 0], y_true[:, 1], y_true[:, 2]

    # Per-label ROC-AUC scores and ROC curves.
    b_roc_auc = roc_auc_score(b_true, b_scores)
    q_roc_auc = roc_auc_score(q_true, q_scores)
    e_roc_auc = roc_auc_score(e_true, e_scores)
    b_fpr, b_tpr, _ = roc_curve(b_true, b_scores)
    q_fpr, q_tpr, _ = roc_curve(q_true, q_scores)
    e_fpr, e_tpr, _ = roc_curve(e_true, e_scores)

    if plot_roc:
        plt.figure()
        plt.title('ROC Curves \nfor Bugs, Questions and Enhancements')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.plot(b_fpr, b_tpr, color='orange',
                 label=f'Bug{" "*24}, AUC: {b_roc_auc:.3f}')
        plt.plot(q_fpr, q_tpr, color='blue',
                 label=f'Question{" "*16}, AUC: {q_roc_auc:.3f}')
        plt.plot(e_fpr, e_tpr, color='green',
                 label=f'Enhancement{" "*8}, AUC: {e_roc_auc:.3f}')
        plt.plot([0, 1], [0, 1], color='red', linestyle='--',
                 label=f'Random Guess{" "*6}, AUC: 0.5')
        plt.legend(loc="lower right")
        plt.show()

    # Binarise the scores with per-label thresholds.
    b_preds = np.where(b_scores >= b_thres, 1, 0)
    q_preds = np.where(q_scores >= q_thres, 1, 0)
    e_preds = np.where(e_scores >= e_thres, 1, 0)

    b_accuracy = accuracy_score(b_true, b_preds)
    q_accuracy = accuracy_score(q_true, q_preds)
    e_accuracy = accuracy_score(e_true, e_preds)

    b_precision, b_recall, b_f1, _ = precision_recall_fscore_support(
        b_true, b_preds, average='binary')
    q_precision, q_recall, q_f1, _ = precision_recall_fscore_support(
        q_true, q_preds, average='binary')
    e_precision, e_recall, e_f1, _ = precision_recall_fscore_support(
        e_true, e_preds, average='binary')

    # Exact-match (subset) accuracy over all three labels at once.
    y_pred = np.concatenate(
        (b_preds.reshape(-1, 1), q_preds.reshape(-1, 1), e_preds.reshape(-1, 1)),
        axis=1)
    exact_matches = 0
    for true, pred in zip(y_true, y_pred):
        if (true == pred).all():
            exact_matches += 1
    exact_accuracy = exact_matches / len(y_true)

    metrics_df = pd.DataFrame(
        [[b_accuracy, b_roc_auc, b_precision, b_recall, b_f1],
         [q_accuracy, q_roc_auc, q_precision, q_recall, q_f1],
         [e_accuracy, e_roc_auc, e_precision, e_recall, e_f1]],
        columns=['Accuracy', 'ROC-AUC', 'Precision', 'Recall', 'F1'],
        index=['Bug', 'Question', 'Enhancement'])

    # Label-ranking average precision over the raw scores (not the thresholded preds).
    lrap_score = lrap(y_true, model_outputs)

    return metrics_df, exact_accuracy, lrap_score
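# Hedged usage sketch with synthetic data (shapes, seed and the noise model are
# assumptions, not from the original source). The three columns correspond to
# Bug / Question / Enhancement; plot_roc=False keeps the call non-interactive.
rng = np.random.default_rng(0)
y_true_demo = rng.integers(0, 2, size=(200, 3))
outputs_demo = 0.6 * y_true_demo + 0.4 * rng.random((200, 3))
metrics_df, exact_acc, lrap_score = get_model_stats(
    y_true_demo, outputs_demo, plot_roc=False)
print(metrics_df)
print(f'Exact-match accuracy: {exact_acc:.3f}, LRAP: {lrap_score:.3f}')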