def test_calibration_display_default_labels(pyplot, name, expected_label):
    prob_true = np.array([0, 1, 1, 0])
    prob_pred = np.array([0.2, 0.8, 0.8, 0.4])
    y_prob = np.array([])

    viz = CalibrationDisplay(prob_true, prob_pred, y_prob, estimator_name=name)
    viz.plot()
    assert viz.line_.get_label() == expected_label
def test_calibration_display_ref_line(pyplot, iris_data_binary):
    # Check that `ref_line` only appears once
    X, y = iris_data_binary
    lr = LogisticRegression().fit(X, y)
    dt = DecisionTreeClassifier().fit(X, y)

    viz = CalibrationDisplay.from_estimator(lr, X, y)
    viz2 = CalibrationDisplay.from_estimator(dt, X, y, ax=viz.ax_)

    labels = viz2.ax_.get_legend_handles_labels()[1]
    assert labels.count("Perfectly calibrated") == 1
def test_calibration_display_label_class_plot(pyplot):
    # Checks that when instantiating `CalibrationDisplay` class then calling
    # `plot`, `self.estimator_name` is the one given in `plot`
    prob_true = np.array([0, 1, 1, 0])
    prob_pred = np.array([0.2, 0.8, 0.8, 0.4])
    y_prob = np.array([])

    name = "name one"
    viz = CalibrationDisplay(prob_true, prob_pred, y_prob, estimator_name=name)
    assert viz.estimator_name == name
    name = "name two"
    viz.plot(name=name)
    assert viz.line_.get_label() == name
def test_calibration_display_default_labels(pyplot, name, expected_label):
    prob_true = np.array([0, 1, 1, 0])
    prob_pred = np.array([0.2, 0.8, 0.8, 0.4])
    y_prob = np.array([])

    viz = CalibrationDisplay(prob_true, prob_pred, y_prob, estimator_name=name)
    viz.plot()

    expected_legend_labels = [] if name is None else [name]
    expected_legend_labels.append("Perfectly calibrated")
    legend_labels = viz.ax_.get_legend().get_texts()
    assert len(legend_labels) == len(expected_legend_labels)
    for labels in legend_labels:
        assert labels.get_text() in expected_legend_labels
def test_calibration_display_non_binary(pyplot, iris_data, constructor_name):
    X, y = iris_data
    clf = DecisionTreeClassifier()
    clf.fit(X, y)
    y_prob = clf.predict_proba(X)

    if constructor_name == "from_estimator":
        msg = "to be a binary classifier, but got"
        with pytest.raises(ValueError, match=msg):
            CalibrationDisplay.from_estimator(clf, X, y)
    else:
        msg = "y should be a 1d array, got an array of shape"
        with pytest.raises(ValueError, match=msg):
            CalibrationDisplay.from_predictions(y, y_prob)
def test_calibration_display_pos_label(
    pyplot, iris_data_binary, pos_label, expected_pos_label
):
    """Check the behaviour of `pos_label` in the `CalibrationDisplay`."""
    X, y = iris_data_binary

    lr = LogisticRegression().fit(X, y)
    viz = CalibrationDisplay.from_estimator(lr, X, y, pos_label=pos_label)

    y_prob = lr.predict_proba(X)[:, expected_pos_label]
    prob_true, prob_pred = calibration_curve(y, y_prob, pos_label=pos_label)

    assert_allclose(viz.prob_true, prob_true)
    assert_allclose(viz.prob_pred, prob_pred)
    assert_allclose(viz.y_prob, y_prob)

    assert (
        viz.ax_.get_xlabel()
        == f"Mean predicted probability (Positive class: {expected_pos_label})"
    )
    assert (
        viz.ax_.get_ylabel()
        == f"Fraction of positives (Positive class: {expected_pos_label})"
    )

    expected_legend_labels = [lr.__class__.__name__, "Perfectly calibrated"]
    legend_labels = viz.ax_.get_legend().get_texts()
    assert len(legend_labels) == len(expected_legend_labels)
    for labels in legend_labels:
        assert labels.get_text() in expected_legend_labels
def test_calibration_display_compute(pyplot, iris_data_binary, n_bins, strategy):
    # Ensure `CalibrationDisplay.from_predictions` and `calibration_curve`
    # compute the same results. Also checks attributes of the
    # CalibrationDisplay object.
    X, y = iris_data_binary

    lr = LogisticRegression().fit(X, y)

    viz = CalibrationDisplay.from_estimator(
        lr, X, y, n_bins=n_bins, strategy=strategy, alpha=0.8
    )

    y_prob = lr.predict_proba(X)[:, 1]
    prob_true, prob_pred = calibration_curve(
        y, y_prob, n_bins=n_bins, strategy=strategy
    )

    assert_allclose(viz.prob_true, prob_true)
    assert_allclose(viz.prob_pred, prob_pred)
    assert_allclose(viz.y_prob, y_prob)

    assert viz.estimator_name == "LogisticRegression"

    # cannot fail thanks to pyplot fixture
    import matplotlib as mpl  # noqa

    assert isinstance(viz.line_, mpl.lines.Line2D)
    assert viz.line_.get_alpha() == 0.8
    assert isinstance(viz.ax_, mpl.axes.Axes)
    assert isinstance(viz.figure_, mpl.figure.Figure)

    assert viz.ax_.get_xlabel() == "Mean predicted probability"
    assert viz.ax_.get_ylabel() == "Fraction of positives"
    assert viz.line_.get_label() == "LogisticRegression"
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from sklearn.calibration import CalibrationDisplay, calibration_curve


def my_calibration_plot(model, calibrated_model, X_data, Y_data):

    fig = plt.figure(figsize=(10, 10))
    gs = GridSpec(1, 1)
    ax_calibration_curve = fig.add_subplot(gs[0])
    calibration_displays = {}
    display = CalibrationDisplay.from_estimator(model,
                                                X_data,
                                                Y_data,
                                                n_bins=10,
                                                name='Not calibrated',
                                                ax=ax_calibration_curve,
                                                color='red')
    calibration_displays['Not calibrated'] = display

    # Draw the calibrated model's curve on the same axes; adding a second
    # subplot in the same GridSpec slot would create overlapping axes.
    probs_calib = calibrated_model.predict_proba(X_data)[:, 1]
    # predict_proba outputs are already in [0, 1], so no rescaling is needed.
    fop_calib, mpv_calib = calibration_curve(Y_data, probs_calib, n_bins=10)

    ax_calibration_curve.plot(mpv_calib, fop_calib, marker='.', color='blue',
                              label='Calibrated')
    plt.legend()
    plt.show()
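# Usage sketch for the helper above (illustrative names and data): compare an
# uncalibrated classifier against a CalibratedClassifierCV wrapper.
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=2000, random_state=0)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=0)

model = RandomForestClassifier(random_state=0).fit(X_tr, y_tr)
calibrated_model = CalibratedClassifierCV(RandomForestClassifier(random_state=0),
                                          method='sigmoid', cv=5).fit(X_tr, y_tr)

my_calibration_plot(model, calibrated_model, X_te, y_te)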
def test_calibration_display_pos_label(
    pyplot, iris_data_binary, pos_label, expected_pos_label
):
    """Check the behaviour of `pos_label` in the `CalibrationDisplay`."""
    X, y = iris_data_binary

    lr = LogisticRegression().fit(X, y)
    viz = CalibrationDisplay.from_estimator(lr, X, y, pos_label=pos_label)

    y_prob = lr.predict_proba(X)[:, expected_pos_label]
    prob_true, prob_pred = calibration_curve(y, y_prob, pos_label=pos_label)

    assert_allclose(viz.prob_true, prob_true)
    assert_allclose(viz.prob_pred, prob_pred)
    assert_allclose(viz.y_prob, y_prob)

    assert (
        viz.ax_.get_xlabel()
        == f"Mean predicted probability (Positive class: {expected_pos_label})"
    )
    assert (
        viz.ax_.get_ylabel()
        == f"Fraction of positives (Positive class: {expected_pos_label})"
    )
    assert viz.line_.get_label() == "LogisticRegression"
def test_plot_calibration_curve_pipeline(pyplot, iris_data_binary):
    # Ensure pipelines are supported by CalibrationDisplay.from_estimator
    X, y = iris_data_binary
    clf = make_pipeline(StandardScaler(), LogisticRegression())
    clf.fit(X, y)
    viz = CalibrationDisplay.from_estimator(clf, X, y)
    assert clf.__class__.__name__ in viz.line_.get_label()
    assert viz.estimator_name == clf.__class__.__name__
def test_calibration_display_label_class_plot(pyplot):
    # Checks that when instantiating `CalibrationDisplay` class then calling
    # `plot`, `self.estimator_name` is the one given in `plot`
    prob_true = np.array([0, 1, 1, 0])
    prob_pred = np.array([0.2, 0.8, 0.8, 0.4])
    y_prob = np.array([])

    name = "name one"
    viz = CalibrationDisplay(prob_true, prob_pred, y_prob, estimator_name=name)
    assert viz.estimator_name == name
    name = "name two"
    viz.plot(name=name)

    expected_legend_labels = [name, "Perfectly calibrated"]
    legend_labels = viz.ax_.get_legend().get_texts()
    assert len(legend_labels) == len(expected_legend_labels)
    for labels in legend_labels:
        assert labels.get_text() in expected_legend_labels
def test_plot_calibration_curve_pipeline(pyplot, iris_data_binary):
    # Ensure pipelines are supported by CalibrationDisplay.from_estimator
    X, y = iris_data_binary
    clf = make_pipeline(StandardScaler(), LogisticRegression())
    clf.fit(X, y)
    viz = CalibrationDisplay.from_estimator(clf, X, y)

    expected_legend_labels = [viz.estimator_name, "Perfectly calibrated"]
    legend_labels = viz.ax_.get_legend().get_texts()
    assert len(legend_labels) == len(expected_legend_labels)
    for labels in legend_labels:
        assert labels.get_text() in expected_legend_labels
def test_calibration_display_validation(pyplot, iris_data, iris_data_binary):
    X, y = iris_data
    X_binary, y_binary = iris_data_binary

    reg = LinearRegression().fit(X, y)
    msg = "'estimator' should be a fitted classifier"
    with pytest.raises(ValueError, match=msg):
        CalibrationDisplay.from_estimator(reg, X, y)

    clf = LinearSVC().fit(X, y)
    msg = "response method predict_proba is not defined in"
    with pytest.raises(ValueError, match=msg):
        CalibrationDisplay.from_estimator(clf, X, y)

    clf = LogisticRegression()
    with pytest.raises(NotFittedError):
        CalibrationDisplay.from_estimator(clf, X, y)
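# For reference, a minimal, self-contained sketch of the two constructors
# exercised by the tests above (synthetic data; not part of the test suite):
import matplotlib.pyplot as plt
from sklearn.calibration import CalibrationDisplay
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1000, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = LogisticRegression().fit(X_train, y_train)

# Build the display from a fitted estimator...
CalibrationDisplay.from_estimator(clf, X_test, y_test, n_bins=10)
# ...or from precomputed probabilities of the positive class.
CalibrationDisplay.from_predictions(y_test, clf.predict_proba(X_test)[:, 1],
                                    n_bins=10)
plt.show()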
Example #14
import numpy as np
from sklearn.calibration import CalibrationDisplay


def multi_class_calibration(y_true,
                            y_prob,
                            n_bins=5,
                            strategy='uniform',
                            names=None,
                            ref_line=True,
                            ax=None,
                            **kwargs):
    """
    Displays a multi-class calibration curve (one line per class, in a one-vs-rest setup)

    :param y_true: True labels; class labels must be consecutive integers starting from 0
    :param y_prob: Predicted probabilities, shape (n_samples, n_classes)
    :param n_bins: Number of bins to use (see CalibrationDisplay.from_predictions)
    :param strategy: Strategy for bins (see CalibrationDisplay.from_predictions)
    :param names: Class names to use
    :param ref_line: Whether to plot reference line (see CalibrationDisplay.from_predictions)
    :param ax: Axes to draw on (see CalibrationDisplay.from_predictions)
    :param kwargs: Keyword arguments passed on to plot
    :return: Dict of Calibration Displays (by name)
    """
    # Iterate over classes
    # `utils.default(a, b)` is a project-local helper: it returns `a` when `a`
    # is not None, otherwise the fallback `b` (here, the class indices).
    names = utils.default(names, np.arange(y_prob.shape[1]))
    displays = {}
    for cls, name in zip(range(y_prob.shape[1]), names):
        _y_true = (y_true == cls).astype(int)  # Positive class for this label
        _y_prob = y_prob[:, cls]  # Get probability assigned to this class
        displays[name] = CalibrationDisplay.from_predictions(_y_true,
                                                             _y_prob,
                                                             n_bins=n_bins,
                                                             strategy=strategy,
                                                             name=name,
                                                             ref_line=ref_line,
                                                             ax=ax,
                                                             **kwargs)
    return displays
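# Usage sketch (illustrative): one calibration curve per iris class on a shared
# axis. `utils` is a project-local module; a minimal stand-in with the assumed
# "return the first argument unless it is None" semantics is defined here so
# the sketch is self-contained.
import types

import matplotlib.pyplot as plt
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

utils = types.SimpleNamespace(
    default=lambda value, fallback: value if value is not None else fallback)

X, y = load_iris(return_X_y=True)
clf = LogisticRegression(max_iter=1000).fit(X, y)

fig, ax = plt.subplots(figsize=(6, 6))
displays = multi_class_calibration(y, clf.predict_proba(X), n_bins=5,
                                    names=["setosa", "versicolor", "virginica"],
                                    ax=ax)
plt.show()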
Example #15

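# The data and `clf_list` definitions are not shown in this excerpt; the cell
# below only needs train/test splits and a list of (estimator, name) pairs.
# A plausible stand-in, assuming the Naive Bayes comparison suggested by the
# plot title (data and models here are illustrative):
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
from sklearn.calibration import CalibratedClassifierCV, CalibrationDisplay
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

X, y = make_classification(n_samples=2000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
                                                    random_state=42)

gnb = GaussianNB()
clf_list = [
    (LogisticRegression(), "Logistic"),
    (gnb, "Naive Bayes"),
    (CalibratedClassifierCV(gnb, cv=2, method="isotonic"), "Naive Bayes + Isotonic"),
    (CalibratedClassifierCV(gnb, cv=2, method="sigmoid"), "Naive Bayes + Sigmoid"),
]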
# %%
fig = plt.figure(figsize=(10, 10))
gs = GridSpec(4, 2)
colors = plt.cm.get_cmap("Dark2")

ax_calibration_curve = fig.add_subplot(gs[:2, :2])
calibration_displays = {}
for i, (clf, name) in enumerate(clf_list):
    clf.fit(X_train, y_train)
    display = CalibrationDisplay.from_estimator(
        clf,
        X_test,
        y_test,
        n_bins=10,
        name=name,
        ax=ax_calibration_curve,
        color=colors(i),
    )
    calibration_displays[name] = display

ax_calibration_curve.grid()
ax_calibration_curve.set_title("Calibration plots (Naive Bayes)")

# Add histogram
grid_positions = [(2, 0), (2, 1), (3, 0), (3, 1)]
for i, (_, name) in enumerate(clf_list):
    row, col = grid_positions[i]
    ax = fig.add_subplot(gs[row, col])
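    # The excerpt stops here; a plausible completion of the histogram cell,
    # mirroring the per-model probability histograms used in Example #16 below:
    ax.hist(
        calibration_displays[name].y_prob,
        range=(0, 1),
        bins=10,
        label=name,
        color=colors(i),
    )
    ax.set(title=name, xlabel="Mean predicted probability", ylabel="Count")

plt.tight_layout()
plt.show()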
Example #16
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import shap
from matplotlib.gridspec import GridSpec
from sklearn import linear_model, metrics, svm
from sklearn.calibration import CalibrationDisplay
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix, roc_auc_score
from xgboost import XGBClassifier


def eiras(filename1, filename2):
    """
    Get the result of applying a model trained on one file to another.
    Arguments: two different datasets; filename1 is the training set, filename2 the test set.
    Output: parameters that measure the performance of the existing models trained on filename1.
    """

    bysy = pd.read_csv(filename1)
    szyy = pd.read_csv(filename2)
    train = bysy.values
    valid = szyy.values
    g = train.shape[1]

    X_train = train[:, :g - 1]
    y_train = train[:, g - 1]

    X_test = valid[:, :g - 1]
    y_test = valid[:, g - 1]

    # Note: only referenced by the commented-out SHAP importance call below.
    feature_name = valid[-1:0:1, :g - 1]

    models = [{
        'label': 'LR',
        'model': linear_model.LogisticRegression(penalty='l1', solver='liblinear'),
        'rank': 'A',
    }, {
        'label': 'SVM',
        'model': svm.SVC(kernel='linear', gamma=1, C=1, probability=True),
        'rank': 'B',
    }, {
        'label': 'GBDT',
        'model': GradientBoostingClassifier(),
        'rank': 'C',
    }, {
        'label': 'XGboost',
        'model': XGBClassifier(subsample=1,
                               eta=0.05,
                               eval_metric=['logloss', 'auc', 'error'],
                               use_label_encoder=False),
        'rank': 'D',
    }]

    def Find_Optimal_Cutoff(TPR, FPR, threshold):
        y = TPR - FPR
        Youden_index = np.argmax(y)  # Only the first occurrence is returned.
        optimal_threshold = threshold[Youden_index]
        point = [FPR[Youden_index], TPR[Youden_index]]
        return optimal_threshold, point

    def importance(model, X_train, feature_name):
        # SHAP-based feature importance; the plotting calls are left commented
        # out, so this helper currently computes (and discards) the SHAP values.
        try:
            shap_values = shap.TreeExplainer(model).shap_values(X_train)
            #shap.summary_plot(shap_values, X_train, feature_names=feature_name)
            #shap.plots.bar(shap_values)
        except Exception:
            return None

    n_bootstraps = 1000
    rng_seed = 42  # control reproducibility
    rng = np.random.RandomState(rng_seed)

    AUC = []
    CI_lower = []
    CI_upper = []
    Specificity = []
    Sensitivity = []
    Accuracy = []
    Optimal = []
    Point = []
    Youden = []

    for m in models:
        model = m['model']  # select the model
        label = m['label']
        model.fit(X_train, y_train)  # train the model
        y_pred = model.predict(X_test)  # hard class predictions (not used below)
        y_pro = model.predict_proba(X_test)
        y_prob = y_pro[:, 1]  # probability of the positive class
        # Compute false positive rate (fpr) and true positive rate (tpr)
        fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob)
        # Calculate area under the curve to display on the plot
        auc = roc_auc_score(y_test, y_prob)
        plt.plot(fpr, tpr, label='%s (AUC = %0.2f)' % (label, auc))

        # Use Find_Optimal_Cutoff (Youden index) to find the best
        # sensitivity/specificity trade-off
        optimal_threshold, point = Find_Optimal_Cutoff(tpr, fpr, thresholds)
        best_y_pred = (y_prob >= optimal_threshold).astype(bool)

        # compute sensitivity, specificity and accuracy
        #confusion = confusion_matrix(y_test,best_y_pred)
        #disp = ConfusionMatrixDisplay(confusion_matrix=confusion,display_labels=model.classes_)
        #disp.plot()
        #plt.show()

        tn, fp, fn, tp = confusion_matrix(y_test, best_y_pred).ravel()
        sensitivity = float(tp) / float(tp + fn)
        specificity = float(tn) / float(tn + fp)
        youden = (sensitivity + specificity) - 1
        accuracy = metrics.accuracy_score(y_test,
                                          best_y_pred,
                                          normalize=True,
                                          sample_weight=None)

        # feature importance (disabled)
        #importance(model, X_train, feature_name)

        # compute a bootstrap confidence interval for the AUC; reset the
        # scores for each model so they do not accumulate across models
        bootstrapped_scores = []
        for i in range(n_bootstraps):
            indices = rng.randint(0, len(y_prob), len(y_prob))
            if len(np.unique(y_test[indices])) < 2:
                # reject resamples with a single class; ROC AUC is undefined
                continue
            score = roc_auc_score(y_test[indices], y_prob[indices])
            bootstrapped_scores.append(score)
        sorted_scores = np.array(bootstrapped_scores)
        sorted_scores.sort()
        # 5th and 95th percentiles give a 90% bootstrap confidence interval
        confidence_lower = sorted_scores[int(0.05 * len(sorted_scores))]
        confidence_upper = sorted_scores[int(0.95 * len(sorted_scores))]
        print(
            label,
            "Confidence interval for the score: [{:0.3f} - {:0.3f}]".format(
                confidence_lower, confidence_upper))

        Sensitivity.append(sensitivity)
        Specificity.append(specificity)
        Accuracy.append(accuracy)
        AUC.append(auc)
        CI_lower.append(confidence_lower)
        CI_upper.append(confidence_upper)
        Youden.append(youden)
        Optimal.append(optimal_threshold)

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('1 - Specificity')
    plt.ylabel('Sensitivity')
    plt.legend(loc="lower right")
    #plt.show()
    plt.savefig('C:/Users/mjdee/Desktop/JI-2020/ML/the_roc_mimic.jpg',
                bbox_inches='tight',
                dpi=600)
    plt.close()

    #plot calibration_curve
    fig = plt.figure(figsize=(10, 10))
    gs = GridSpec(4, 2)
    ax_calibration_curve = fig.add_subplot(gs[:2, :2])
    calibration_displays = {}
    colors = plt.cm.get_cmap("Dark2")
    for i, m in enumerate(models):
        model = m['model']
        label = m['label']
        model.fit(X_train, y_train)
        display = CalibrationDisplay.from_estimator(
            model,
            X_test,
            y_test,
            n_bins=10,
            name=label,
            ax=ax_calibration_curve,
            color=colors(i),
        )
        calibration_displays[label] = display

    ax_calibration_curve.grid()
    ax_calibration_curve.set_title("A   Calibration plots")

    #plot calibration_histograms
    grid_positions = [(2, 0), (2, 1), (3, 0), (3, 1)]
    for i, m in enumerate(models):
        label = m['label']
        rank = m['rank']
        row, col = grid_positions[i]
        ax = fig.add_subplot(gs[row, col])

        ax.hist(calibration_displays[label].y_prob,
                range=(0, 1),
                bins=10,
                label=label,
                color=colors(i))
        ax.set(title=f"{rank}  {label}",
               xlabel="Mean predicted probability",
               ylabel="Count")

    plt.tight_layout()
    plt.savefig('C:/Users/mjdee/Desktop/JI-2020/ML/the_calibration_mimic.jpg',
                bbox_inches='tight',
                dpi=600)
    plt.show()

    return Sensitivity, Specificity, Accuracy, AUC, CI_upper, CI_lower, Youden, Optimal
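# Usage sketch (illustrative file names): each CSV is assumed to hold the
# features in its leading columns and a binary label in its last column.
if __name__ == "__main__":
    (sens, spec, acc, auc_scores,
     ci_upper, ci_lower, youden, cutoffs) = eiras("train_set.csv", "test_set.csv")
    for model_label, auc_value in zip(["LR", "SVM", "GBDT", "XGboost"], auc_scores):
        print(f"{model_label}: AUC = {auc_value:.3f}")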