Example #1
import pandas as pd
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import (accuracy_score, confusion_matrix, f1_score,
                             ConfusionMatrixDisplay, RocCurveDisplay)


def binary_classifier_quality(model, X_test, Y_test):
    """
    Report quality metrics for a binary classifier (0/1 labels).
    If `model` is a GridSearchCV, predictions come from its best estimator.
    """
    if isinstance(model, GridSearchCV):
        # summarize the cross-validation results
        result = pd.DataFrame(
            {k: model.cv_results_[k]
             for k in ['params', 'mean_test_score',
                       'std_test_score', 'rank_test_score']}
        )
        print(result)
        print()
        print(f"best params: {model.best_params_}")
        print("best score: {:f}".format(model.best_score_))
        print()

    Y_hat = model.predict(X_test)
    print("Confusion matrix (true x pred):")
    print(confusion_matrix(Y_test, Y_hat))
    print("Sensitivity: {:f}".format( sum(Y_hat[Y_test==1]) / sum(Y_test) ))
    print("Specificity: {:f}".format( sum(1 - Y_hat[Y_test==0]) / sum(Y_test==0)))
    print("Accuracy score on test data: {:f}".format( accuracy_score(Y_test, Y_hat) ))
    print("F1 score on test data: {:f}".format( f1_score(Y_test, Y_hat) ))

    ConfusionMatrixDisplay.from_predictions(Y_test, Y_hat)
    RocCurveDisplay.from_estimator(model, X_test, Y_test)
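
A minimal usage sketch (a hypothetical setup, not from the original snippet; it assumes the 0/1 integer labels the sensitivity/specificity lines above require):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
binary_classifier_quality(clf, X_test, y_test)  # prints metrics and draws both plots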
Example #2
def sklearn_visualizations():
    import matplotlib.pyplot as plt
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import RocCurveDisplay
    from sklearn import datasets

    # data (rng is an integer seed so the results are reproducible)
    rng = 42
    X, y = datasets.load_wine(return_X_y=True)
    y = y == 2  # binary task: class 2 vs. the rest
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=rng)

    # svm
    svc = SVC(random_state=rng)
    svc.fit(X_train, y_train)
    svc_disp = RocCurveDisplay.from_estimator(svc, X_test, y_test)
    plt.show()

    # random forest
    rfc = RandomForestClassifier(random_state=rng)
    rfc.fit(X_train, y_train)
    ax = plt.gca()
    rfc_disp = RocCurveDisplay.from_estimator(rfc,
                                              X_test,
                                              y_test,
                                              ax=ax,
                                              alpha=0.8)
    svc_disp.plot(ax=ax, alpha=0.8)
    plt.show()
Example #3
    def plot(self, data_original_test):
        """Plot ROC-AUC and precision-recall curves of the original and
        synthetic models in a single figure."""
        # class method: assumes seaborn as `sns`, a COLOR_PALETTE list,
        # fitted `self.stats_original_` / `self.stats_synthetic_` estimators,
        # and RocCurveDisplay / PrecisionRecallDisplay from sklearn.metrics
        X_test, y_test = self._split_xy(data_original_test)

        fig, ax = plt.subplots(1, 2, figsize=(14, 6))
        sns.despine()
        # roc curve
        RocCurveDisplay.from_estimator(self.stats_original_,
                                       X_test,
                                       y_test,
                                       name=self.labels[0],
                                       color=COLOR_PALETTE[0],
                                       ax=ax[0])
        RocCurveDisplay.from_estimator(self.stats_synthetic_,
                                       X_test,
                                       y_test,
                                       name=self.labels[1],
                                       color=COLOR_PALETTE[1],
                                       ax=ax[0])

        ax[0].plot([0, 1], [0, 1],
                   linestyle="--",
                   lw=1,
                   color="black",
                   alpha=0.7)
        ax[0].set_title('ROC Curve')

        # pr curve
        PrecisionRecallDisplay.from_estimator(self.stats_original_,
                                              X_test,
                                              y_test,
                                              name=self.labels[0],
                                              color=COLOR_PALETTE[0],
                                              ax=ax[1])
        PrecisionRecallDisplay.from_estimator(self.stats_synthetic_,
                                              X_test,
                                              y_test,
                                              name=self.labels[1],
                                              color=COLOR_PALETTE[1],
                                              ax=ax[1])
        # baseline for the PR curve: the prevalence of the positive class
        no_skill = len(y_test[y_test == 1]) / len(y_test)
        ax[1].plot([0, 1], [no_skill, no_skill],
                   lw=1,
                   linestyle='--',
                   color='black',
                   alpha=0.7)
        ax[1].set_title('Precision-Recall Curve')
Example #4
def test_roc_curve_display_complex_pipeline(pyplot, data_binary, clf,
                                            constructor_name):
    """Check the behaviour with complex pipeline."""
    X, y = data_binary

    if constructor_name == "from_estimator":
        with pytest.raises(NotFittedError):
            RocCurveDisplay.from_estimator(clf, X, y)

    clf.fit(X, y)

    if constructor_name == "from_estimator":
        display = RocCurveDisplay.from_estimator(clf, X, y)
        name = clf.__class__.__name__
    else:
        display = RocCurveDisplay.from_predictions(y, y)
        name = "Classifier"

    assert name in display.line_.get_label()
    assert display.estimator_name == name
Example #5
import matplotlib.pyplot as plt
from sklearn.metrics import RocCurveDisplay

fig, ax = plt.subplots()

# `rt_model`, `random_forest`, `rf_model`, `gradient_boosting`, and
# `gbdt_model` are pipelines fitted earlier in the source example, as are
# `X_test` and `y_test`
models = [
    ("RT embedding -> LR", rt_model),
    ("RF", random_forest),
    ("RF embedding -> LR", rf_model),
    ("GBDT", gradient_boosting),
    ("GBDT embedding -> LR", gbdt_model),
]

model_displays = {}
for name, pipeline in models:
    model_displays[name] = RocCurveDisplay.from_estimator(pipeline,
                                                          X_test,
                                                          y_test,
                                                          ax=ax,
                                                          name=name)
_ = ax.set_title("ROC curve")

# %%
fig, ax = plt.subplots()
for name, pipeline in models:
    model_displays[name].plot(ax=ax)

ax.set_xlim(0, 0.2)
ax.set_ylim(0.8, 1)
_ = ax.set_title("ROC curve (zoomed in at top left)")
Example #6
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

X, y = load_wine(return_X_y=True)
y = y == 2  # binary task: class 2 vs. the rest

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
svc = SVC(random_state=42)
svc.fit(X_train, y_train)

# %%
# Plotting the ROC Curve
# ----------------------
# Next, we plot the ROC curve with a single call to
# :func:`sklearn.metrics.RocCurveDisplay.from_estimator`. The returned
# `svc_disp` object allows us to continue using the already computed ROC curve
# for the SVC in future plots.
svc_disp = RocCurveDisplay.from_estimator(svc, X_test, y_test)
plt.show()

# %%
# Training a Random Forest and Plotting the ROC Curve
# ---------------------------------------------------
# We train a random forest classifier and create a plot comparing it to the SVC
# ROC curve. Notice how `svc_disp` uses
# :func:`~sklearn.metrics.RocCurveDisplay.plot` to plot the SVC ROC curve
# without recomputing the values of the roc curve itself. Furthermore, we
# pass `alpha=0.8` to the plot functions to adjust the alpha values of the
# curves.
rfc = RandomForestClassifier(n_estimators=10, random_state=42)
rfc.fit(X_train, y_train)
ax = plt.gca()
rfc_disp = RocCurveDisplay.from_estimator(rfc,
                                          X_test,
                                          y_test,
                                          ax=ax,
                                          alpha=0.8)
svc_disp.plot(ax=ax, alpha=0.8)
plt.show()
Example #7
def test_roc_curve_display_plotting(
    pyplot,
    response_method,
    data_binary,
    with_sample_weight,
    drop_intermediate,
    with_strings,
    constructor_name,
    default_name,
):
    """Check the overall plotting behaviour."""
    X, y = data_binary

    pos_label = None
    if with_strings:
        y = np.array(["c", "b"])[y]
        pos_label = "c"

    if with_sample_weight:
        rng = np.random.RandomState(42)
        sample_weight = rng.randint(1, 4, size=(X.shape[0]))
    else:
        sample_weight = None

    lr = LogisticRegression()
    lr.fit(X, y)

    y_pred = getattr(lr, response_method)(X)
    y_pred = y_pred if y_pred.ndim == 1 else y_pred[:, 1]

    if constructor_name == "from_estimator":
        display = RocCurveDisplay.from_estimator(
            lr,
            X,
            y,
            sample_weight=sample_weight,
            drop_intermediate=drop_intermediate,
            pos_label=pos_label,
            alpha=0.8,
        )
    else:
        display = RocCurveDisplay.from_predictions(
            y,
            y_pred,
            sample_weight=sample_weight,
            drop_intermediate=drop_intermediate,
            pos_label=pos_label,
            alpha=0.8,
        )

    fpr, tpr, _ = roc_curve(
        y,
        y_pred,
        sample_weight=sample_weight,
        drop_intermediate=drop_intermediate,
        pos_label=pos_label,
    )

    assert_allclose(display.roc_auc, auc(fpr, tpr))
    assert_allclose(display.fpr, fpr)
    assert_allclose(display.tpr, tpr)

    assert display.estimator_name == default_name

    import matplotlib as mpl  # noqa

    assert isinstance(display.line_, mpl.lines.Line2D)
    assert display.line_.get_alpha() == 0.8
    assert isinstance(display.ax_, mpl.axes.Axes)
    assert isinstance(display.figure_, mpl.figure.Figure)

    expected_label = f"{default_name} (AUC = {display.roc_auc:.2f})"
    assert display.line_.get_label() == expected_label

    expected_pos_label = 1 if pos_label is None else pos_label
    expected_ylabel = f"True Positive Rate (Positive label: {expected_pos_label})"
    expected_xlabel = f"False Positive Rate (Positive label: {expected_pos_label})"

    assert display.ax_.get_ylabel() == expected_ylabel
    assert display.ax_.get_xlabel() == expected_xlabel
Example #8
def test_plot_roc_curve_pos_label(pyplot, response_method, constructor_name):
    # check that we can provide the positive label and display the proper
    # statistics
    X, y = load_breast_cancer(return_X_y=True)
    # create a highly imbalanced dataset
    idx_positive = np.flatnonzero(y == 1)
    idx_negative = np.flatnonzero(y == 0)
    idx_selected = np.hstack([idx_negative, idx_positive[:25]])
    X, y = X[idx_selected], y[idx_selected]
    X, y = shuffle(X, y, random_state=42)
    # only use 2 features to make the problem even harder
    X = X[:, :2]
    y = np.array(["cancer" if c == 1 else "not cancer" for c in y],
                 dtype=object)
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        stratify=y,
        random_state=0,
    )

    classifier = LogisticRegression()
    classifier.fit(X_train, y_train)

    # sanity check: the positive class is classes_[0], so code that naively
    # assumes the positive class comes second would be misled by this setup
    assert classifier.classes_.tolist() == ["cancer", "not cancer"]

    y_pred = getattr(classifier, response_method)(X_test)
    # select the corresponding probability column, or negate the decision
    # function otherwise
    y_pred_cancer = -1 * y_pred if y_pred.ndim == 1 else y_pred[:, 0]
    y_pred_not_cancer = y_pred if y_pred.ndim == 1 else y_pred[:, 1]

    if constructor_name == "from_estimator":
        display = RocCurveDisplay.from_estimator(
            classifier,
            X_test,
            y_test,
            pos_label="cancer",
            response_method=response_method,
        )
    else:
        display = RocCurveDisplay.from_predictions(
            y_test,
            y_pred_cancer,
            pos_label="cancer",
        )

    roc_auc_limit = 0.95679

    assert display.roc_auc == pytest.approx(roc_auc_limit)
    assert np.trapz(display.tpr, display.fpr) == pytest.approx(roc_auc_limit)

    if constructor_name == "from_estimator":
        display = RocCurveDisplay.from_estimator(
            classifier,
            X_test,
            y_test,
            response_method=response_method,
            pos_label="not cancer",
        )
    else:
        display = RocCurveDisplay.from_predictions(
            y_test,
            y_pred_not_cancer,
            pos_label="not cancer",
        )

    assert display.roc_auc == pytest.approx(roc_auc_limit)
    assert np.trapz(display.tpr, display.fpr) == pytest.approx(roc_auc_limit)
classifier = svm.SVC(kernel="linear",
                     probability=True,
                     random_state=random_state)

tprs = []
aucs = []
mean_fpr = np.linspace(0, 1, 100)

fig, ax = plt.subplots()
for i, (train, test) in enumerate(cv.split(X, y)):
    classifier.fit(X[train], y[train])
    viz = RocCurveDisplay.from_estimator(
        classifier,
        X[test],
        y[test],
        name="ROC fold {}".format(i),
        alpha=0.3,
        lw=1,
        ax=ax,
    )
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)

ax.plot([0, 1], [0, 1],
        linestyle="--",
        lw=2,
        color="r",
        label="Chance",
        alpha=0.8)
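
# a minimal sketch (an addition, following the usual cross-validation ROC
# pattern) of how the collected `tprs` feed the mean curve and `aucs` its
# spread:
from sklearn.metrics import auc

mean_tpr = np.mean(tprs, axis=0)
mean_tpr[-1] = 1.0  # force the mean curve to end at (1, 1)
mean_auc = auc(mean_fpr, mean_tpr)
std_auc = np.std(aucs)
ax.plot(mean_fpr, mean_tpr, color="b", lw=2, alpha=0.8,
        label=r"Mean ROC (AUC = %0.2f $\pm$ %0.2f)" % (mean_auc, std_auc))
ax.legend(loc="lower right")
plt.show()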
Example #10
# Precision and recall focus on the positive class; however, one might be
# interested in the compromise between accurately discriminating the
# positive class and accurately discriminating the negative class. The
# statistics used for this are sensitivity and specificity. Sensitivity is
# just another name for recall, while specificity measures the proportion of
# correctly classified samples in the negative class, defined as
# TN / (TN + FP). Similar to the precision-recall curve, sensitivity and
# specificity are generally plotted as a curve called the receiver operating
# characteristic (ROC) curve; a short sketch of both statistics follows,
# and then the curve itself.
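
# %%
# A minimal sketch (an addition, not part of the original example) reading
# sensitivity and specificity off the confusion matrix; `classifier`,
# `data_test`, and `target_test` are the objects used below, and the label
# names are assumed from the `pos_label='donated'` used in this example:
from sklearn.metrics import confusion_matrix

tn, fp, fn, tp = confusion_matrix(
    target_test, classifier.predict(data_test),
    labels=["not donated", "donated"]).ravel()
print(f"sensitivity = {tp / (tp + fn):.3f}")  # TP / (TP + FN), i.e. recall
print(f"specificity = {tn / (tn + fp):.3f}")  # TN / (TN + FP)

# %% [markdown]
# Below is such a curve: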

# %%
from sklearn.metrics import RocCurveDisplay

disp = RocCurveDisplay.from_estimator(classifier,
                                      data_test,
                                      target_test,
                                      pos_label='donated',
                                      marker="+")
disp = RocCurveDisplay.from_estimator(dummy_classifier,
                                      data_test,
                                      target_test,
                                      pos_label='donated',
                                      color="tab:orange",
                                      linestyle="--",
                                      ax=disp.ax_)
plt.legend(bbox_to_anchor=(1.05, 0.8), loc="upper left")
_ = disp.ax_.set_title("ROC AUC curve")

# %% [markdown]
# This curve was built using the same principle as the precision-recall
# curve: we vary the probability threshold used to turn scores into "hard"
# predictions and compute the metrics at each threshold.
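
# %%
# A minimal sketch of that threshold sweep (an addition, assuming
# `classifier` exposes predict_proba): roc_curve returns one (FPR, TPR)
# point per candidate threshold, which is what the display above draws.
from sklearn.metrics import roc_curve

pos_idx = list(classifier.classes_).index("donated")
scores = classifier.predict_proba(data_test)[:, pos_idx]
fpr, tpr, thresholds = roc_curve(target_test, scores, pos_label="donated")
for f, t, thr in list(zip(fpr, tpr, thresholds))[:5]:
    print(f"threshold={thr:.2f} -> FPR={f:.2f}, TPR={t:.2f}")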
Example #11
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import DetCurveDisplay, RocCurveDisplay
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

N_SAMPLES = 1000

# the snippet begins mid-file; a minimal stand-in for the `classifiers`
# dict defined earlier in the source example (names and models assumed):
classifiers = {
    "Linear SVM": make_pipeline(StandardScaler(), LinearSVC(C=0.025)),
    "Random Forest": RandomForestClassifier(max_depth=5, n_estimators=10),
}

X, y = make_classification(
    n_samples=N_SAMPLES,
    n_features=2,
    n_redundant=0,
    n_informative=2,
    random_state=1,
    n_clusters_per_class=1,
)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=0)

# prepare plots
fig, [ax_roc, ax_det] = plt.subplots(1, 2, figsize=(11, 5))

for name, clf in classifiers.items():
    clf.fit(X_train, y_train)

    RocCurveDisplay.from_estimator(clf, X_test, y_test, ax=ax_roc, name=name)
    DetCurveDisplay.from_estimator(clf, X_test, y_test, ax=ax_det, name=name)

ax_roc.set_title("Receiver Operating Characteristic (ROC) curves")
ax_det.set_title("Detection Error Tradeoff (DET) curves")

ax_roc.grid(linestyle="--")
ax_det.grid(linestyle="--")

plt.legend()
plt.show()