def plot_decision_boundary_and_roc(X, y, clf, X_test, y_test):
    """Plots the decision boundary and ROC curve

    Args:
        X (numpy array): features
        y (numpy array): target
        clf (sklearn classifier): ML model
        X_test (numpy array): test features
        y_test (numpy array): test target
    """
    if X.shape[1] == 2:
        fig1 = plt.figure(figsize=(15, 10))
        plot_decision_regions(X, y, clf=clf, legend=2)
        # Adding axes annotations
        plt.title('Decision Boundary')
        st.pyplot(fig1)

    if len(np.unique(y)) == 2:
        st.markdown('The ROC-curve for the test data is displayed below:')
        fig2 = plt.figure(figsize=(15, 10))
        ax = fig2.add_subplot(111)
        plot_roc_curve(clf, X_test, y_test, ax=ax)
        st.pyplot(fig2)
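Note: `plot_roc_curve`, used throughout these examples, was deprecated in scikit-learn 1.0 and removed in 1.2. A minimal sketch of the newer `RocCurveDisplay.from_estimator` equivalent, using an illustrative synthetic dataset and classifier (not part of the example above), looks like this:

# Sketch only: the dataset and classifier below are illustrative assumptions.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import RocCurveDisplay
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
# replaces plot_roc_curve(clf, X_test, y_test) on scikit-learn >= 1.2
RocCurveDisplay.from_estimator(clf, X_test, y_test)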
Example no. 2
def stratified_cross_validation(model, X, y, n_folds=10):

    # Cross-validation model
    cv = StratifiedKFold(n_splits=n_folds)

    # Evaluate classifier (actual purities)
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    fig, ax = plt.subplots(figsize=(14, 11))
    for i, (train, test) in enumerate(cv.split(X, y)):
        model.fit(X[train], y[train])
        viz = plot_roc_curve(model,
                             X[test],
                             y[test],
                             name="",
                             alpha=0,
                             lw=3,
                             ax=ax,
                             color="royalblue")
        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)

    # Plot ROC curve
    ax.plot(mean_fpr,
            mean_tpr,
            color='royalblue',
            label=r'Mean AUC = %0.2f $\pm$ %0.2f' % (mean_auc, std_auc),
            lw=6,
            alpha=0.8)

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr,
                    tprs_lower,
                    tprs_upper,
                    color='steelblue',
                    alpha=.4,
                    label=r'$\pm$ 1 std. dev.')

    return fig, ax
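A possible way to call `stratified_cross_validation` (a sketch; the synthetic data and classifier below are illustrative, and X and y need to be NumPy arrays because the function indexes them with fold indices):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
fig, ax = stratified_cross_validation(LogisticRegression(max_iter=1000), X, y, n_folds=5)
ax.legend(loc="lower right")
fig.savefig("cv_roc.png")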
Example no. 3
    def plot_metrics(metrics_list):
        if 'Confusion Matrix' in metrics_list:
            st.subheader("Confusion Matrix")
            plot_confusion_matrix(model,
                                  x_test,
                                  y_test,
                                  display_labels=class_names)
            st.pyplot()

        if 'ROC Curve' in metrics_list:
            st.subheader("ROC Curve")
            plot_roc_curve(model, x_test, y_test)
            st.pyplot()
            st.set_option('deprecation.showPyplotGlobalUse', False)
            #fig, ax = matplotlib.pyplot.subplots()
            #ax.plot([0,0.5,1],[0,0.5,1])
            #st.pyplot(fig)

        if 'Precision-Recall Curve' in metrics_list:
            st.subheader('Precision-Recall Curve')
            plot_precision_recall_curve(model, x_test, y_test)
            st.pyplot()
            st.set_option('deprecation.showPyplotGlobalUse', False)
Example no. 4
def eval_model(model, X_test, y_test, thresh=None, plot=True):
    y_prob = model.predict_proba(X_test)[:, 1]
    if thresh is None:
        thresh, sens, spec, PPV, NPV, percent_pos = search_thresh(
            y_prob, y_test)
        predictions = y_prob > thresh
    else:
        predictions = y_prob > thresh
        sens, spec, PPV, NPV, percent_pos = calculate_stats(
            predictions, y_test)
    model_name = model.__class__.__name__
    print(model_name)
    print('AUPRC: {:.3f}'.format(
        metrics.average_precision_score(y_test, y_prob)))
    print('AUROC: {:.3f}'.format(metrics.roc_auc_score(y_test, y_prob)))
    print(metrics.confusion_matrix(y_test, predictions))
    print('sens: {:.3f} '.format(sens), 'spec: {:.3f} '.format(spec),
          'PPV: {:.3f} '.format(PPV), 'NPV: {:.3f} '.format(NPV),
          '%pos: {:.3f}'.format(percent_pos))
    # print(metrics.classification_report(y_test, predictions))

    if plot:
        fig, ax = plt.subplots(1, 3, figsize=(12, 3))
        # AUROC, AUPRC
        metrics.plot_roc_curve(model, X_test, y_test, ax=ax[0])
        metrics.plot_precision_recall_curve(model, X_test, y_test, ax=ax[1])
        # calibration curve
        fraction_pos, mean_predicted_value = calibration_curve(y_test,
                                                               y_prob,
                                                               n_bins=20)
        ax[2].plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
        ax[2].plot(mean_predicted_value, fraction_pos, 's-', label=model_name)
        ax[2].set_xlabel('mean predicted value')
        ax[2].set_ylabel('fraction positive')
        # save result
        plt.savefig('./result/' + model_name + '_AUC_plot.svg')
    return thresh
Example no. 5
def NonLinear_Model(X_train, Y_train, X_test, Y_test):
    ## Proceed To Prepare the RBF (Non-Linear) SVM
    svc_rbf = SVC(kernel='rbf')
    svc_rbf.fit(X_train, Y_train)

    ## Proceed to Test Performance on the Training Dataset
    Y_train_predict = svc_rbf.predict(X_train)

    ## Proceed to Compute the Training Scores
    train_accuracy = accuracy_score(Y_train, Y_train_predict)
    train_precision = precision_score(Y_train, Y_train_predict)
    train_recall = recall_score(Y_train, Y_train_predict)
    train_auc = roc_auc_score(Y_train, Y_train_predict)

    print("Training Results")
    print("Accuracy on Training:", round(train_accuracy, 3))
    print("Precision on Training:", round(train_precision, 3))
    print("Recall on Training:", round(train_recall, 3))
    print("AUC on Training:", round(train_auc, 3))

    ## Proceed to Test on the Testing Dataset
    Y_test_predict = svc_rbf.predict(X_test)

    # Proceed to Calculate Scores
    test_accuracy = accuracy_score(Y_test, Y_test_predict)
    test_precision = precision_score(Y_test, Y_test_predict)
    test_recall = recall_score(Y_test, Y_test_predict)
    test_auc = roc_auc_score(Y_test, Y_test_predict)

    print("\nTesting Results")
    print("Accuracy on Testing:", round(test_accuracy, 3))
    print("Precision on Testing:", round(test_precision, 3))
    print("Recall on Testing:", round(test_recall, 3))
    print("AUC on Testing:", round(test_auc, 3))

    ## Proceed to Graph the ROC Curve
    plot_roc_curve(svc_rbf, X_test, Y_test)
Example no. 6
def grid_search_stratified_cross_validation(clf,
                                            param_grid,
                                            Xtrain,
                                            ytrain,
                                            Xtest,
                                            ytest,
                                            n_splits=3,
                                            title=None):
    # Stratified-K-Fold Cross-Validation
    print()
    print('-' * 100)
    print('Stratified-K-Fold Cross-Validation')
    print('-' * 100)

    skf = StratifiedKFold(n_splits=n_splits)
    grid = GridSearchCV(estimator=clf,
                        param_grid=param_grid,
                        cv=skf,
                        verbose=0)

    grid.fit(Xtrain, ytrain)
    print()
    print('[*] Best Params:')
    pprint(grid.best_params_)

    print()
    print('[*] Best Estimator:')
    pprint(grid.best_estimator_)

    print()
    print('[*] Best Score:')
    pprint(grid.best_score_)

    plot_conf_matrix(grid, Xtest, ytest, title)
    # plot_roc_curve(grid, Xtest, ytest, label=title, title=title)
    plot_roc_curve(grid, Xtest, ytest)
Example no. 7
def roc_w_cross_val(X, y, classifier, plot=False):
    cv = StratifiedKFold(n_splits=6)

    X = X.to_numpy()
    y = y.to_numpy()
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    fig, ax = plt.subplots()

    for i, (train, test) in enumerate(cv.split(X, y)):
        classifier.fit(X[train], y[train])
        viz = plot_roc_curve(classifier, X[test], y[test],
                             name='ROC fold {}'.format(i),
                             alpha=0.3, lw=1, ax=ax)
        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)

    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
            label='Chance', alpha=.8)
    ax.plot(mean_fpr, mean_tpr, color='b',
            label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, float(std_auc)),
            lw=2, alpha=.8)

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)

    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                    label=r'$\pm$ 1 std. dev.')
    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
           title="Receiver operating characteristic example")
    # ax.legend(loc="lower right")
    ax.legend(bbox_to_anchor=(1, 0), loc="lower left")

    if plot:
        plt.show()
    else:
        plt.close()

    return mean_auc
Example no. 8
def roc_curve_image(
        model,
        model_name: str,
        X_test: DataFrame,
        y_test: Series,
        output_dir: str = "images/results"
):
    """
    Plots Receiver-Operating-Characteristic

    :param model: Fitted model to create plot for
    :param model_name: Used to name image file
    :param X_test: Test Dataframe of X values
    :param y_test: Test Series of y values
    :param output_dir: Output directory for plot
    :return: None
    """
    fig = plt.figure(figsize=(15, 8))
    ax = plt.gca()
    plot_roc_curve(model, X_test, y_test, ax=ax, alpha=0.8)
    plt.suptitle("Receiver Operating Characteristic Curve", fontweight="bold")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"{model_name}_ROC_Curve.png"))
    plt.close(fig)
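An illustrative call of `roc_curve_image` (a sketch; it assumes a scikit-learn version that still provides `plot_roc_curve`, and the dataset, model, and directory handling below are assumptions, not part of the original code):

import os
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# build an illustrative fitted model and a DataFrame/Series test split
data = load_breast_cancer(as_frame=True)
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target, random_state=0)
model = LogisticRegression(max_iter=5000).fit(X_train, y_train)

os.makedirs("images/results", exist_ok=True)  # the function itself does not create the directory
roc_curve_image(model, "logistic_regression", X_test, y_test)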
Example no. 9
def DTree(data, name, visualize, t):
    Data = data.loc[:, data.columns != 'class']
    target = pd.DataFrame()
    target['class'] = data['class']
    # split on the feature matrix (Data), not the full frame that still contains 'class'
    data_train, data_test, target_train, target_test = train_test_split(
        Data, target, test_size=t, random_state=42, shuffle=True)
    dt = DecisionTreeClassifier(criterion='gini', max_depth=10)
    pred = dt.fit(data_train, target_train).predict(data_test)
    print("Decision tree" + name + " accuracy: ",
          accuracy_score(target_test, pred, normalize=True))
    print(classification_report(target_test, pred))
    if (visualize):
        plot_roc_curve(dt, data_test, target_test)
        print(confusion_matrix(target_test, pred))
        plt.show()
    return {
        'DT': {
            'datatrain': data_train,
            'targettrain': target_train,
            'datatest': data_test,
            'targettest': target_test,
            'name': name
        }
    }
Example no. 10
def KNN(data, name, visualize, t):
    Data = data.loc[:, data.columns != 'class']
    target = pd.DataFrame()
    target['class'] = data['class']
    data_train, data_test, target_train, target_test = train_test_split(
        Data, target, test_size=t, random_state=42, shuffle=True)
    neigh = KNeighborsClassifier(n_neighbors=5)
    pred = neigh.fit(data_train, target_train).predict(data_test)
    print("KNN " + name + "  accuracy: ",
          accuracy_score(target_test, pred, normalize=True))
    print(classification_report(target_test, pred))
    if (visualize):
        plot_roc_curve(neigh, data_test, target_test)
        print(confusion_matrix(target_test, pred))
        plt.show()
    return {
        'KNN': {
            'datatrain': data_train,
            'targettrain': target_train,
            'datatest': data_test,
            'targettest': target_test,
            'name': name
        }
    }
Example no. 11
def LRegression(data, name, visualize, t):
    target = pd.DataFrame()
    target['class'] = data['class']
    Data = data.loc[:, data.columns != 'class']
    # split on the feature matrix (Data), not the full frame that still contains 'class'
    data_train, data_test, target_train, target_test = train_test_split(
        Data, target, test_size=t, random_state=42, shuffle=True)
    lr = LogisticRegression()
    pred = lr.fit(data_train, target_train).predict(data_test)
    print("Logistic Regression " + name + " accuracy: ",
          accuracy_score(target_test, pred, normalize=True))
    print(classification_report(target_test, pred))
    if (visualize):
        plot_roc_curve(lr, data_test, target_test)
        print(confusion_matrix(target_test, pred))
        plt.show()
    return {
        'LR': {
            'datatrain': data_train,
            'targettrain': target_train,
            'datatest': data_test,
            'targettest': target_test,
            'name': name
        }
    }
Example no. 12
    def plot_metrics(metrics_list):
        if 'Score' in metrics_list:
            st.write("Accuracy: ", accuracy.round(2))
        if 'MSE' in metrics_list:
            st.write("MSE:  ", mean_squared_error(y_test, y_pred))
            st.write("RMSE:  ", np.sqrt(mean_squared_error(y_test, y_pred)))

        # Classification Metrics
        if 'Confusion Matrix' in metrics_list:
            st.subheader("Confusion Matrix")
            plot_confusion_matrix(model,
                                  x_test,
                                  y_test_enc,
                                  display_labels=class_names)
            #plot_confusion_matrix(model, x_test, y_test_enc)
            st.pyplot()
        if 'ROC Curve' in metrics_list:
            st.subheader("ROC Curve")
            plot_roc_curve(model, x_test, y_test_enc)
            st.pyplot()
        if 'Precision-Recall Curve' in metrics_list:
            st.subheader("Precision-Recall Curve")
            plot_precision_recall_curve(model, x_test, y_test_enc)
            st.pyplot()
Example no. 13
def test_plot_roc_curve_estimator_name_multiple_calls(pyplot, data_binary):
    # non-regression test checking that the `name` used when calling
    # `plot_roc_curve` is used as well when calling `disp.plot()`
    X, y = data_binary
    clf_name = "my hand-crafted name"
    clf = LogisticRegression().fit(X, y)
    disp = plot_roc_curve(clf, X, y, name=clf_name)
    assert disp.estimator_name == clf_name
    pyplot.close("all")
    disp.plot()
    assert clf_name in disp.line_.get_label()
    pyplot.close("all")
    clf_name = "another_name"
    disp.plot(name=clf_name)
    assert clf_name in disp.line_.get_label()
Example no. 14
def plot_metrics(x_test,y_test,model,metrics_list,dataset):
    if dataset=='Mushroom Dataset':
        if 'ROC Curve' in metrics_list:
            st.subheader("ROC Curve")
            plot_roc_curve(model,x_test,y_test)
            st.pyplot()
        if 'Confusion Matrix' in metrics_list:
            st.subheader("Confusion Matrics")
            plot_confusion_matrix(model,x_test,y_test,display_labels=class_names)
            st.pyplot()
        if 'Precision Recall Curve' in metrics_list:
            st.subheader("Precision Recall Graph")
            plot_precision_recall_curve(model,x_test,y_test)
            st.pyplot()

    if dataset=='Iris':
        if 'ROC Curve' in metrics_list:
            pass
        if 'Confusion Matrix' in metrics_list:
            st.subheader("Confusion Matrix")
            plot_confusion_matrix(model,x_test,y_test)
            st.pyplot()
        if 'Precision Recall Curve' in metrics_list:
            pass
Example no. 15
def evaluate_classification(model, X_test, y_test, cmap='Greens',
                            normalize='true', classes=['No-Recid', 'Yes-Recid'], figsize=(10, 4),
                            X_train=None, y_train=None, label='Test Data',
                            return_report=False):
    """Evaluates a scikit-learn binary classification model.

    Args:
        model (classifier): any sklearn classification model.
        X_test_tf (Frame or Array): X data
        y_test (Series or Array): y data
        cmap (str, optional): Colormap for confusion matrix. Defaults to 'Greens'.
        normalize (str, optional): normalize argument for plot_confusion_matrix. 
                                    Defaults to 'true'.
        classes (list, optional): List of class names for display. Defaults to None.
        figsize (tuple, optional): figure size Defaults to (8,4).
        
        X_train (Frame or Array, optional): If provided, compare model.score 
                                for train and test. Defaults to None.
        y_train (Series or Array, optional): If provided, compare model.score 
                                for train and test. Defaults to None.
    """
    ## 
    get_report(model,X_test,y_test,as_df=False,label=label,target_names=classes)
    
    ## Plot Confusion Matrix and ROC curve
    fig,ax = plt.subplots(ncols=2, figsize=figsize)
    metrics.plot_confusion_matrix(model, X_test,y_test,cmap=cmap, 
                                  normalize=normalize,display_labels=classes,
                                 ax=ax[0])
    
    ## if the ROC curve errors, delete the second axis
    try:
        curve = metrics.plot_roc_curve(model,X_test,y_test,ax=ax[1])
        curve.ax_.grid()
        curve.ax_.plot([0,1],[0,1],ls=':')
        fig.tight_layout()
    except Exception:
        fig.delaxes(ax[1])
        
    plt.show()
    
    ## Add comparing Scores if X_train and y_train provided.
    if (X_train is not None) & (y_train is not None):
        print(f"Training Score = {model.score(X_train,y_train):.2f}")
        print(f"Test Score = {model.score(X_test,y_test):.2f}")
        
    if return_report:
        return get_report(model,X_test,y_test,as_df=True,label=label)
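`get_report` is defined elsewhere in the source project and is not shown here; a hypothetical stand-in consistent with how it is called above (label, target_names, as_df) might look like the following sketch:

import pandas as pd
from sklearn import metrics

def get_report(model, X_test, y_test, as_df=False, label='Test Data', target_names=None):
    """Hypothetical helper: print or return a classification report for the fitted model."""
    y_pred = model.predict(X_test)
    if as_df:
        report = metrics.classification_report(y_test, y_pred, target_names=target_names,
                                               output_dict=True)
        return pd.DataFrame(report).T
    print(f"Classification Report ({label}):")
    print(metrics.classification_report(y_test, y_pred, target_names=target_names))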
Example no. 16
    def roCurves(clfList, X_test, y_test):

        roCurveList = []
        plt.subplots(1, 1, figsize=(5, 5))
        styleList = ['solid', 'solid', 'dashed', 'dashed', 'dotted', 'dashed']

        for clf, sty in zip(clfList, styleList):
            ax = plt.gca()
            roc = plot_roc_curve(clf, X_test, y_test, ax=ax, alpha=0.85, lw=2, linestyle=sty)
            roCurveList.append(roc)
        plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='dotted')
        plt.title('ROC')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')

        return roCurveList
Example no. 17
def test_plot_roc_curve(pyplot, response_method, data_binary,
                        with_sample_weight, drop_intermediate,
                        with_strings):
    X, y = data_binary

    pos_label = None
    if with_strings:
        y = np.array(["c", "b"])[y]
        pos_label = "c"

    if with_sample_weight:
        rng = np.random.RandomState(42)
        sample_weight = rng.randint(1, 4, size=(X.shape[0]))
    else:
        sample_weight = None

    lr = LogisticRegression()
    lr.fit(X, y)

    viz = plot_roc_curve(lr, X, y, alpha=0.8, sample_weight=sample_weight,
                         drop_intermediate=drop_intermediate)

    y_pred = getattr(lr, response_method)(X)
    if y_pred.ndim == 2:
        y_pred = y_pred[:, 1]

    fpr, tpr, _ = roc_curve(y, y_pred, sample_weight=sample_weight,
                            drop_intermediate=drop_intermediate,
                            pos_label=pos_label)

    assert_allclose(viz.roc_auc, auc(fpr, tpr))
    assert_allclose(viz.fpr, fpr)
    assert_allclose(viz.tpr, tpr)

    assert viz.estimator_name == "LogisticRegression"

    # cannot fail thanks to pyplot fixture
    import matplotlib as mpl  # noqa
    assert isinstance(viz.line_, mpl.lines.Line2D)
    assert viz.line_.get_alpha() == 0.8
    assert isinstance(viz.ax_, mpl.axes.Axes)
    assert isinstance(viz.figure_, mpl.figure.Figure)

    expected_label = "LogisticRegression (AUC = {:0.2f})".format(viz.roc_auc)
    assert viz.line_.get_label() == expected_label
    assert viz.ax_.get_ylabel() == "True Positive Rate"
    assert viz.ax_.get_xlabel() == "False Positive Rate"
Example no. 18
def run_classification_models(features, budget_cats):
    # making test train splits
    X_train, X_test, y_train, y_test = train_test_split(features, budget_cats, test_size=0.33, random_state=0)

    # The Naive Bayes models performed poorly, so I removed them
    classifiers = [
        (LogisticRegression(random_state=0, max_iter = 1000), {
            'C': np.logspace(-2, 7, 10)
        }),
        (GradientBoostingClassifier(n_estimators=50, random_state=0), {
            'learning_rate': np.logspace(-4, 0, 10)
        }),
        (SVC(random_state=0), {
            'C': np.logspace(-2, 7, 10)
        })]

    for classifier, parameters in classifiers:
        print(classifier)

        clf = GridSearchCV(classifier, parameters, cv = 3)
        clf.fit(X_train, y_train)

        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r"
                  % (mean, std * 2, params))
        print()

        print("Detailed classification report:")
        y_true, y_pred = y_test, clf.predict(X_test)

        print("Accuracy Score: \n")
        print(accuracy_score(y_test, y_pred))

        print("F1 Score: \n")
        print(f1_score(y_true, y_pred, average = 'macro'))
        print(classification_report(y_true, y_pred))

        disp = plot_roc_curve(clf, X_test, y_test)
        plt.show()
Example no. 19
def print_classification_summary(model_name,
                                 dataset,
                                 model_instance,
                                 y,
                                 X,
                                 positive_label=1,
                                 negative_label=0):
    """Function to outlay summary information of chosen model including target distribution for dataset used, classification metric scores, confusion matrix and ROC/AUC and Precision/Recall curves."""
    from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
    from sklearn.metrics import plot_confusion_matrix, plot_roc_curve, plot_precision_recall_curve

    y_pred = model_instance.predict(X)
    class_y = np.unique(y)

    print('***RESULTS SUMMARY***')
    print(
        '------------------------------------------------------------------------------------------'
    )
    print('Model:', model_name)
    print('Dataset:', dataset)
    print('Target distribution:')
    print('\n')
    print('Class:', class_y[0], '/ Count:', sum(y == class_y[0]), '/ Pct:',
          round(sum(y == class_y[0]) / len(y) * 100, 0))
    print('Class:', class_y[1], '/ Count:', sum(y == class_y[1]), '/ Pct:',
          round(sum(y == class_y[1]) / len(y) * 100, 0))
    print(
        '------------------------------------------------------------------------------------------'
    )
    print('Metric Scores: \n')
    print('Accuracy score:', round(accuracy_score(y, y_pred), 2))
    print('Recall score:', round(recall_score(y, y_pred), 2))
    print('Precision score:', round(precision_score(y, y_pred), 2))
    print('F1 score:', round(f1_score(y, y_pred), 2))
    print(
        '------------------------------------------------------------------------------------------'
    )
    print('Plots:')
    ax = plot_confusion_matrix(model_instance, X, y, values_format='d')
    plt.title('Confusion Matrix')
    plt.show()
    ax = plot_roc_curve(model_instance, X, y)
    plt.title('ROC Curve')
    plt.show()
    ax = plot_precision_recall_curve(model_instance, X, y)
    plt.title('Precision Recall Curve')
    plt.show()
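An illustrative call of `print_classification_summary` (a sketch; it assumes a scikit-learn version that still ships the plot_* helpers imported inside the function, and that np and plt are imported at module level in the source file; the dataset and model below are assumptions):

from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier

X, y = load_breast_cancer(return_X_y=True)
rf = RandomForestClassifier(random_state=0).fit(X, y)
print_classification_summary('Random Forest', 'breast_cancer', rf, y, X)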
Example no. 20
def compare_models(classifiers: Dict[str, ClassifierMixin],
                   cv: StratifiedShuffleSplit,
                   x: np.ndarray,
                   y: np.ndarray,
                   validation_size=0.2):
    train_scores: Dict[str, List] = {}
    test_scores: Dict[str, List] = {}

    for name in classifiers.keys():
        train_scores[name] = []
        test_scores[name] = []

    validation_cv = StratifiedShuffleSplit(n_splits=1,
                                           test_size=validation_size,
                                           random_state=0)
    train_ind, validation_ind = next(validation_cv.split(x, y))

    x_validation, y_validation = x[validation_ind], y[validation_ind]
    x, y = x[train_ind], y[train_ind]

    for train_ind, test_ind in cv.split(x, y):
        x_train, y_train = x[train_ind], y[train_ind]
        x_test, y_test = x[test_ind], y[test_ind]

        for name, clf in classifiers.items():
            clf.fit(x_train, y_train)
            train_scores[name].append(clf.score(x_train, y_train))
            test_scores[name].append(clf.score(x_test, y_test))

    for name, clf in classifiers.items():
        plt.figure()
        ax = plt.subplot(2, 2, 1)
        disp = plot_precision_recall_curve(clf,
                                           x_validation,
                                           y_validation,
                                           ax=ax)
        disp.ax_.set_title('{} Precision-Recall curve'.format(name))
        ax = plt.subplot(2, 2, 2)
        disp = plot_roc_curve(clf, x_validation, y_validation, ax=ax)
        disp.ax_.set_title('{} ROC curve'.format(name))
        ax = plt.subplot(2, 2, 3)
        disp = plot_confusion_matrix(clf, x_validation, y_validation, ax=ax)
        disp.ax_.set_title('{} Confusion matrix'.format(name))
        plt.show()

    return train_scores, test_scores
Example no. 21
def lab1_6(dataset: np.ndarray, targetDataset: np.ndarray, test: np.ndarray,
           testTarget: np.ndarray) -> None:
    """Загрузите набор данных из файла bank_scoring_train.csv. Это набор финансовых данных, характеризующий
    физических лиц. Целевым столбцом является «SeriousDlqin2yrs», означающий, ухудшится ли финансовая ситуация у
    клиента. Постройте систему по принятию решения о выдаче или невыдаче кредита физическому лицу. Сделайте как
    минимум 2 варианта системы на основе различных классификаторов. Подберите подходящую метрику качества работы
    системы исходя из специфики задачи и определите, принятие решения какой системой сработало лучше на
    bank_scoring_test.csv.

    :param dataset: Набор исходных данных для обучения
    :param targetDataset: Соответствующий набор классов
    :param test: Набор данных для тестирования
    :param testTarget: Соответствующий набор классов для тестирования
    """
    x_train, x_test, y_train, y_test = train_test_split(dataset,
                                                        targetDataset,
                                                        test_size=0.33)
    # pick the best parameter for DecisionTreeClassifier
    # evaluation criteria: accuracy, ROC and AUC
    max_depths = (3, 7, 15)
    for max_depth in max_depths:
        classifier = DecisionTreeClassifier(max_depth=max_depth)
        classifier.fit(x_train, y_train)
        prediction = classifier.predict(x_test)
        accuracy = accuracy_score(y_test, prediction)
        # ROC
        plot_roc_curve(classifier, x_test, y_test)
        plt.title('ROC curve (accuracy = {:.2f}, max depth = {})'.format(
            accuracy, max_depth))
        plt.show()
    # for DecisionTreeClassifier the best parameter is max_depth = 7

    # now look at GaussianNB
    classifier = GaussianNB()
    classifier.fit(x_train, y_train)
    prediction = classifier.predict(x_test)
    accuracy = accuracy_score(y_test, prediction)
    # ROC
    plot_roc_curve(classifier, x_test, y_test)
    plt.title('ROC curve (accuracy = {:.2f})'.format(accuracy))
    plt.show()

    # check the two best variants on the test data
    bestClassifiers = (DecisionTreeClassifier(max_depth=7), GaussianNB())
    for bestClassifier in bestClassifiers:
        bestClassifier.fit(x_train, y_train)
        prediction = bestClassifier.predict(test)
        accuracy = accuracy_score(testTarget, prediction)
        # ROC
        plot_roc_curve(bestClassifier, test, testTarget)
        plt.title('ROC curve for best classifier (accuracy = {:.2f})'.format(
            accuracy))
        plt.show()
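Since the exercise asks which system ultimately worked better, a small sketch scoring both candidates by ROC AUC on the held-out test set could complement the plots. It is meant to run inside lab1_6 right after the loop above, reuses the variables defined there, and assumes binary labels and classifiers that expose predict_proba:

    from sklearn.metrics import roc_auc_score

    for candidate in (DecisionTreeClassifier(max_depth=7), GaussianNB()):
        candidate.fit(x_train, y_train)
        test_scores = candidate.predict_proba(test)[:, 1]
        print('{}: test AUC = {:.3f}'.format(type(candidate).__name__,
                                             roc_auc_score(testTarget, test_scores)))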
Example no. 22
def run(X, y, penalty='l2', run_origin='localRun'):
    solver = "saga"
    if penalty == "elasticnet":
        l1_ratio = 0.5
    else:
        l1_ratio = None

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

    with mlflow.start_run(run_name=run_origin) as run:
        lr = LogisticRegression(penalty=penalty,
                                solver=solver,
                                l1_ratio=l1_ratio)
        lr.fit(X_train, y_train)
        score_train = lr.score(X_train, y_train)
        score_test = lr.score(X_test, y_test)

        prec_test = precision_score(y_test, lr.predict(X_test))
        rec_test = recall_score(y_test, lr.predict(X_test))
        f1_test = f1_score(y_test, lr.predict(X_test))

        print("hyperparameters: ", lr.get_params())
        print("train score: ", score_train)
        print("test score: ", score_test)
        print("test precision: ", prec_test)
        print("test recall: ", rec_test)
        print("test f1 score: ", f1_test)

        disp = plot_confusion_matrix(lr, X_test, y_test)
        print(disp.confusion_matrix)
        plt.savefig("sklearn_logreg_conf_mat.png")

        disp = plot_roc_curve(lr, X_test, y_test)
        plt.savefig("sklearn_logreg_roc_curve.png")

        print("runId: ", run.info.run_id)
        print("artifact_uri: ", mlflow.get_artifact_uri())
        mlflow.log_metrics({
            "training score": score_train,
            "test score": score_test
        })
        mlflow.log_params(lr.get_params())
        mlflow.set_tags({"run_origin": run_origin})
        mlflow.log_artifact("sklearn_logreg_conf_mat.png", "figures")
        mlflow.log_artifact("sklearn_logreg_roc_curve.png", "figures")
Example no. 23
    def generate_metrics_for_production_model(pipeline, x_test, y_test):

        print("Count of label NC (id 2 in Database, 1 in CM) in y_test: {}".
              format(sum(y_test == 2)))
        print(
            "Count of label AD-MCI (id 1 in Database, 0 in CM) in y_test: {} \n"
            .format(sum(y_test == 1)))

        y_pred = pipeline.predict(x_test)
        cf_matrix = confusion_matrix(y_test, y_pred)
        sns.heatmap(cf_matrix,
                    annot=True,
                    fmt='d',
                    cmap="RdBu",
                    cbar=False,
                    annot_kws={'fontsize': 16})

        print('Explain Confusion Matrix.\n',
              explain_confusion_matrix(y_test, y_pred))
        print('Custom Accuracy    :', custom_accuracy(y_test, y_pred))
        print('Custom Sensitivity :', custom_sensitivity(y_test, y_pred))
        print('Custom Specificity :', custom_specificity(y_test, y_pred))
        print('Custom Precision   :', custom_precision(y_test, y_pred))
        print('Custom NPV         :', custom_npv(y_test, y_pred))

        # plot roc curves
        display_roc = plot_roc_curve(pipeline, x_test, y_test)
        roc_axes = display_roc.ax_
        roc_axes.plot([0, 1], [0, 1],
                      linestyle='--',
                      lw=2,
                      color='r',
                      label='Chance',
                      alpha=.8)

        # plot precision recall curves
        display_pr = plot_precision_recall_curve(pipeline, x_test, y_test)
        pr_axes = display_pr.ax_
        pr_axes.plot([0, 1], [0.5, 0.5],
                     linestyle='--',
                     lw=2,
                     color='r',
                     label='Chance',
                     alpha=.8)
Example no. 24
File: plot.py Project: jimcui3/MLSR
def plot_roc(model, X, y, filename):
    """
    画roc图,不过sklearn只支持二分类roc,三分类画不了

    Args:
        model: 模型
        X: 特征
        y: 标签
        filename: 图片保存路径

    """
    ax = plt.gca()
    dis = plot_roc_curve(model, X, y, ax=ax)
    dis.plot(ax=ax, alpha=0.8)
    plt.savefig(filename)
    try:
        plt.show()
    except Exception as e:
        print(e.args)
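The docstring above notes that this helper only covers binary ROC. For more than two classes, a hedged one-vs-rest sketch (the iris data and classifier are illustrative, and the curves are computed on the training data purely for demonstration) could look like this:

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import auc, roc_curve
from sklearn.preprocessing import label_binarize

X, y = load_iris(return_X_y=True)
clf = LogisticRegression(max_iter=1000).fit(X, y)
y_bin = label_binarize(y, classes=np.unique(y))   # shape (n_samples, n_classes)
y_score = clf.predict_proba(X)

ax = plt.gca()
for k in range(y_bin.shape[1]):
    fpr, tpr, _ = roc_curve(y_bin[:, k], y_score[:, k])
    ax.plot(fpr, tpr, label='class {} (AUC = {:.2f})'.format(k, auc(fpr, tpr)))
ax.plot([0, 1], [0, 1], linestyle='--', color='grey')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend(loc='lower right')
plt.show()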
Example no. 25
def experiment(
        X_train,  # TODO
        X_test,
        y_train,
        y_test,
        fname,
        binarize=False):
    print("#" * 60)
    print('Started Experiment ..')
    # KNN
    # knn = sklearn.neighbors.KNeighborsClassifier(n_jobs=-1)
    # knn.fit(X_train, y_train)
    # print_accuracy(knn.predict)

    # Random Forest
    rforest = RandomForestClassifier(n_estimators=100,
                                     max_depth=None,
                                     min_samples_split=2,
                                     random_state=0,
                                     n_jobs=-1)
    rforest.fit(X_train, y_train)
    # print_accuracy(rforest.predict)
    y_pred = rforest.predict(X_test)

    evaluate_exp(y_test, y_pred, binarize)
    try:
        from sklearn.metrics import plot_roc_curve
        rforest_disp = skm.plot_roc_curve(rforest, X_test, y_test)
        plt_name = 'roc_curve_{}.png'.format(fname)
        plt.savefig(plt_name, format='png')
        plt.show()
    except Exception:
        print("Not working, plot_roc_curve!")

    disp = skm.plot_precision_recall_curve(rforest, X_test, y_test)
    average_precision = skm.average_precision_score(y_test, y_pred)
    disp.ax_.set_title('2-class Precision-Recall curve: '
                       'AP={0:0.2f}'.format(average_precision))

    print("#" * 60)
Example no. 26
File: svm.py Project: jaiveerk/FML
    def plot_roc(self, verbose=False):
        """ Plot ROC Curve for model
        """
        tprs = []
        aucs = []
        mean_fpr = np.linspace(0, 1, 100)
        cv = StratifiedKFold(n_splits=6)

        fig, ax = plt.subplots()
        for i, (train, test) in enumerate(cv.split(self.data['features'], self.data['labels'])):
            self.model.fit(self.data['features'][train], self.data['labels'][train])
            viz = plot_roc_curve(self.model, self.data['features'][test], self.data['labels'][test],
                                 name='ROC fold {}'.format(i),
                                 alpha=0.3, lw=1, ax=ax)
            interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)  # np.interp; the bare interp presumably came from the removed SciPy alias
            interp_tpr[0] = 0.0
            tprs.append(interp_tpr)
            aucs.append(viz.roc_auc)

        ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
                label='Chance', alpha=.8)

        mean_tpr = np.mean(tprs, axis=0)
        mean_tpr[-1] = 1.0
        mean_auc = auc(mean_fpr, mean_tpr)
        std_auc = np.std(aucs)
        ax.plot(mean_fpr, mean_tpr, color='b',
                label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
                lw=2, alpha=.8)

        std_tpr = np.std(tprs, axis=0)
        tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
        tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
        ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                        label=r'$\pm$ 1 std. dev.')

        ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
               title="Receiver operating characteristic example")
        ax.legend(loc="lower right")
        plt.show()
Example no. 27
def visualize_roc_curve_with_cross_validation_1(clf, index_of_curve,
                                                x_with_features, y, mean_fpr,
                                                tprs, aucs, ax):

    ## train set
    # TODO follow my note documentation
    viz = plot_roc_curve(
        clf,
        x_with_features,
        y,
        name='ROC fold {}'.format(index_of_curve),
        alpha=0.3,
        lw=1,
        ax=ax
    )  # How do I know whether it uses decision_function, predict_proba, or something else?

    ### visualized roc cove of train set
    interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)
    return ax, tprs, aucs
Example no. 28
def runXGB(model,
           train_data,
           labels,
           test_data,
           index,
           n_folds=5,
           submiss_dir='./submiss'):

    fig, ax = plt.subplots()
    aucs = []
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

    for i, (train, valid) in enumerate(cv.split(train_data, labels)):
        model.fit(train_data[train],
                  labels[train],
                  early_stopping_rounds=50,
                  eval_set=[(train_data[valid], labels[valid])],
                  verbose=0)
        plot = plot_roc_curve(model,
                              train_data[valid],
                              labels[valid],
                              name=f'Fold number {i+1}',
                              ax=ax)
        aucs.append(plot.roc_auc)
        test_pred = model.predict_proba(test_data)[:, 1]

        submiss = pd.DataFrame({"id": index, "label": test_pred})
        submiss_path = os.path.join(submiss_dir,
                                    f'XGB_{plot.roc_auc:.2f}_{i+1}.csv')
        submiss.to_csv(submiss_path, index=False)

    ax.plot([0, 1], [0, 1], label='Luck', linestyle='--', color='r')
    mean_auc = np.mean(aucs)
    std_auc = np.std(aucs)
    # add a legend entry for the average AUC without drawing an extra series
    ax.plot([], [], ' ',
            label=rf'Average AUC score: {mean_auc:.2f} $\pm$ {std_auc:.2f}')
    ax.legend(loc="lower right")
    ax.set(xlim=[-.1, 1.1], ylim=[-.1, 1.1], title='XGBoost Classifier')
    plt.show()
Example no. 29
def ROC_ML(model, X_test, y_test, name, i, rf=False, xgb=False):
    if rf:
        ax = plt.gca()
        # print(len(X_test))
        # print(len(X_test[0]))
        # print(len(X_test[0][0]))
        # print(len(y_test))
        # print(len(y_test[0]))
        # print(len(y_test[0][0]))
        score = plot_roc_curve(model, X_test, y_test, ax=ax, alpha=0.8)
        plt.show()
        sr, pr = SR_maker(y_test, model.predict(X_test))
        return score.roc_auc, sr, pr
    else:
        if xgb:
            y_pred_keras_tmp = model.predict(X_test)
        else:
            y_pred_keras_tmp = model.decision_function(X_test)
        fpr_keras, tpr_keras, _ = roc_curve(y_test, y_pred_keras_tmp)
        auc_keras = auc(fpr_keras, tpr_keras)

        if i == 0 and name == "SVM":
            plt.clf()
        if i == 6 and name == "SVM":
            plt.clf()
        plt.figure(i)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.plot(fpr_keras, tpr_keras, label=name + str(i) + ' = {:.3f}'.format(auc_keras))
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        plt.title('ROC curve _ ' + name)
        plt.legend(loc='best')
        fig1 = plt.gcf()
        plt.show()
        plt.draw()
        # fig1.savefig('result/ROC_' + name + str(i) + '.png', dpi=100)

        sr, pr = SR_maker(y_test, model.predict(X_test))
        return auc_keras, sr, pr
Example no. 30
def roc_curve(classifier, cv, X, y):  # note: this name shadows sklearn.metrics.roc_curve
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)

    fig, ax = plt.subplots()
    for i, (train, test) in enumerate(cv.split(X, y)):
        classifier.fit(X[train], np.ravel(y[train]))
        viz = plot_roc_curve(classifier, X[test], y[test],
                            name='ROC fold {}'.format(i),
                            alpha=0.3, lw=1, ax=ax)
        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)

    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
            label='Chance', alpha=.8)

    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='b',
            label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
            lw=2, alpha=.8)

    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                    label=r'$\pm$ 1 std. dev.')

    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
           title="Receiver operating characteristic example")
    ax.legend(loc="lower right")
    plt.show()