def plot_decision_boundary_and_roc(X, y, clf, X_test, y_test):
    """Plots the decision boundary and ROC curve.

    Args:
        X (numpy array): features
        y (numpy array): target
        clf (sklearn classifier): ML model
        X_test (numpy array): test features
        y_test (numpy array): test target
    """
    if X.shape[1] == 2:
        fig1 = plt.figure(figsize=(15, 10))
        plot_decision_regions(X, y, clf=clf, legend=2)
        # Adding axes annotations
        plt.title('Decision Boundary')
        st.pyplot(fig1)
    if len(np.unique(y)) == 2:
        st.markdown('The ROC curve for the test data is displayed below:')
        fig2 = plt.figure(figsize=(15, 10))
        ax = fig2.add_subplot(111)
        plot_roc_curve(clf, X_test, y_test, ax=ax)
        st.pyplot(fig2)
def stratified_cross_validation(model, X, y, n_folds=10):
    # Cross-validation model
    cv = StratifiedKFold(n_splits=n_folds)
    # Evaluate classifier (actual purities)
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    fig, ax = plt.subplots(figsize=(14, 11))
    for i, (train, test) in enumerate(cv.split(X, y)):
        model.fit(X[train], y[train])
        viz = plot_roc_curve(model, X[test], y[test], name="",
                             alpha=0, lw=3, ax=ax, color="royalblue")
        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    # Plot mean ROC curve
    ax.plot(mean_fpr, mean_tpr, color='royalblue',
            label=r'Mean AUC = %0.2f $\pm$ %0.2f' % (mean_auc, std_auc),
            lw=6, alpha=0.8)
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='steelblue',
                    alpha=.4, label=r'$\pm$ 1 std. dev.')
    return fig, ax
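# A minimal usage sketch for stratified_cross_validation, assuming the same
# imports the function relies on (plt, np, StratifiedKFold, auc,
# plot_roc_curve) are in scope; the dataset and classifier here are
# illustrative, not from the original code.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=500, random_state=0)
fig, ax = stratified_cross_validation(LogisticRegression(max_iter=1000),
                                      X_demo, y_demo, n_folds=5)
ax.legend(loc="lower right")
fig.savefig("mean_roc.png")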
def plot_metrics(metrics_list):
    # Suppress Streamlit's warning about calling st.pyplot() without a
    # figure; set once, before any st.pyplot() call
    st.set_option('deprecation.showPyplotGlobalUse', False)
    if 'Confusion Matrix' in metrics_list:
        st.subheader("Confusion Matrix")
        plot_confusion_matrix(model, x_test, y_test, display_labels=class_names)
        st.pyplot()
    if 'ROC Curve' in metrics_list:
        st.subheader("ROC Curve")
        plot_roc_curve(model, x_test, y_test)
        st.pyplot()
    if 'Precision-Recall Curve' in metrics_list:
        st.subheader('Precision-Recall Curve')
        plot_precision_recall_curve(model, x_test, y_test)
        st.pyplot()
def eval_model(model, X_test, y_test, thresh=None, plot=True):
    y_prob = model.predict_proba(X_test)[:, 1]
    if thresh is None:
        thresh, sens, spec, PPV, NPV, percent_pos = search_thresh(y_prob, y_test)
        predictions = y_prob > thresh
    else:
        predictions = y_prob > thresh
        sens, spec, PPV, NPV, percent_pos = calculate_stats(predictions, y_test)
    model_name = model.__class__.__name__
    print(model_name)
    print('AUPRC: {:.3f}'.format(metrics.average_precision_score(y_test, y_prob)))
    print('AUROC: {:.3f}'.format(metrics.roc_auc_score(y_test, y_prob)))
    print(metrics.confusion_matrix(y_test, predictions))
    print('sens: {:.3f} '.format(sens),
          'spec: {:.3f} '.format(spec),
          'PPV: {:.3f} '.format(PPV),
          'NPV: {:.3f} '.format(NPV),
          '%pos: {:.3f}'.format(percent_pos))
    # print(metrics.classification_report(y_test, predictions))
    if plot:
        fig, ax = plt.subplots(1, 3, figsize=(12, 3))
        # AUROC, AUPRC
        metrics.plot_roc_curve(model, X_test, y_test, ax=ax[0])
        metrics.plot_precision_recall_curve(model, X_test, y_test, ax=ax[1])
        # calibration curve
        fraction_pos, mean_predicted_value = calibration_curve(y_test, y_prob,
                                                               n_bins=20)
        ax[2].plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
        ax[2].plot(mean_predicted_value, fraction_pos, 's-', label=model_name)
        ax[2].set_xlabel('mean predicted value')
        ax[2].set_ylabel('fraction positive')
        # save result
        plt.savefig('./result/' + model_name + '_AUC_plot.svg')
    return thresh
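# search_thresh and calculate_stats are referenced above but defined
# elsewhere; the helpers below are a hypothetical sketch of what they might
# look like, assuming the threshold is chosen to maximize Youden's J
# (sensitivity + specificity - 1).
import numpy as np
from sklearn import metrics

def calculate_stats(predictions, y_test):
    tn, fp, fn, tp = metrics.confusion_matrix(y_test, predictions).ravel()
    sens = tp / (tp + fn)                       # sensitivity / recall
    spec = tn / (tn + fp)                       # specificity
    PPV = tp / (tp + fp) if (tp + fp) else 0.0  # positive predictive value
    NPV = tn / (tn + fn) if (tn + fn) else 0.0  # negative predictive value
    percent_pos = predictions.mean()            # fraction predicted positive
    return sens, spec, PPV, NPV, percent_pos

def search_thresh(y_prob, y_test):
    fpr, tpr, thresholds = metrics.roc_curve(y_test, y_prob)
    best = np.argmax(tpr - fpr)  # index maximizing Youden's J
    thresh = thresholds[best]
    stats = calculate_stats(y_prob > thresh, y_test)
    return (thresh, *stats)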
def NonLinear_Model(X_train, Y_train, X_test, Y_test):
    ## Prepare a non-linear SVM with an RBF kernel
    rbf_svc = SVC(kernel='rbf')
    rbf_svc.fit(X_train, Y_train)

    ## Evaluate performance on the training dataset
    Y_train_predict = rbf_svc.predict(X_train)
    train_accuracy = accuracy_score(Y_train, Y_train_predict)
    train_precision = precision_score(Y_train, Y_train_predict)
    train_recall = recall_score(Y_train, Y_train_predict)
    train_auc = roc_auc_score(Y_train, Y_train_predict)
    print("Training Results")
    print("Accuracy on Training:", round(train_accuracy, 3))
    print("Precision on Training:", round(train_precision, 3))
    print("Recall on Training:", round(train_recall, 3))
    print("AUC on Training:", round(train_auc, 3))

    ## Evaluate performance on the testing dataset
    Y_test_predict = rbf_svc.predict(X_test)
    test_accuracy = accuracy_score(Y_test, Y_test_predict)
    test_precision = precision_score(Y_test, Y_test_predict)
    test_recall = recall_score(Y_test, Y_test_predict)
    test_auc = roc_auc_score(Y_test, Y_test_predict)
    print("\nTesting Results")
    print("Accuracy on Testing:", round(test_accuracy, 3))
    print("Precision on Testing:", round(test_precision, 3))
    print("Recall on Testing:", round(test_recall, 3))
    print("AUC on Testing:", round(test_auc, 3))

    ## Plot the ROC curve on the test set
    plot_roc_curve(rbf_svc, X_test, Y_test)
def grid_search_stratified_cross_validation(clf, param_grid, Xtrain, ytrain,
                                            Xtest, ytest, n_splits=3,
                                            title=None):
    # Stratified K-Fold Cross-Validation
    print()
    print('-' * 100)
    print('Stratified K-Fold Cross-Validation')
    print('-' * 100)
    skf = StratifiedKFold(n_splits=n_splits)
    grid = GridSearchCV(estimator=clf, param_grid=param_grid, cv=skf, verbose=0)
    grid.fit(Xtrain, ytrain)
    print()
    print('[*] Best Params:')
    pprint(grid.best_params_)
    print()
    print('[*] Best Estimator:')
    pprint(grid.best_estimator_)
    print()
    print('[*] Best Score:')
    pprint(grid.best_score_)
    plot_conf_matrix(grid, Xtest, ytest, title)
    # plot_roc_curve does not accept label/title kwargs, so plot with defaults
    plot_roc_curve(grid, Xtest, ytest)
def roc_w_cross_val(X, y, classifier, plot=False):
    cv = StratifiedKFold(n_splits=6)
    X = X.to_numpy()
    y = y.to_numpy()
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    fig, ax = plt.subplots()
    for i, (train, test) in enumerate(cv.split(X, y)):
        classifier.fit(X[train], y[train])
        viz = plot_roc_curve(classifier, X[test], y[test],
                             name='ROC fold {}'.format(i),
                             alpha=0.3, lw=1, ax=ax)
        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
            label='Chance', alpha=.8)
    ax.plot(mean_fpr, mean_tpr, color='b',
            label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
            lw=2, alpha=.8)
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                    label=r'$\pm$ 1 std. dev.')
    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
           title="Receiver operating characteristic example")
    ax.legend(bbox_to_anchor=(1, 0), loc="lower left")
    if plot:
        plt.show()
    else:
        plt.close()
    return mean_auc
def roc_curve_image(model, model_name: str, X_test: DataFrame, y_test: Series,
                    output_dir: str = "images/results"):
    """
    Plots the Receiver Operating Characteristic curve and saves it to disk.

    :param model: Fitted model to create the plot for
    :param model_name: Used to name the image file
    :param X_test: Test DataFrame of X values
    :param y_test: Test Series of y values
    :param output_dir: Output directory for the plot
    :return: None
    """
    fig = plt.figure(figsize=(15, 8))
    ax = plt.gca()
    plot_roc_curve(model, X_test, y_test, ax=ax, alpha=0.8)
    plt.suptitle("Receiver Operating Characteristic Curve", fontweight="bold")
    plt.tight_layout()
    plt.savefig(os.path.join(output_dir, f"{model_name}_ROC_Curve.png"))
    plt.close(fig)
def DTree(data, name, visualize, t):
    Data = data.loc[:, data.columns != 'class']
    target = pd.DataFrame()
    target['class'] = data['class']
    # Split on the feature matrix (Data), not the full frame, so the
    # 'class' column does not leak into the features
    data_train, data_test, target_train, target_test = train_test_split(
        Data, target, test_size=t, random_state=42, shuffle=True)
    dt = DecisionTreeClassifier(criterion='gini', max_depth=10)
    pred = dt.fit(data_train, target_train).predict(data_test)
    print("Decision tree " + name + " accuracy: ",
          accuracy_score(target_test, pred, normalize=True))
    print(classification_report(target_test, pred))
    if visualize:
        plot_roc_curve(dt, data_test, target_test)
        print(confusion_matrix(target_test, pred))
        plt.show()
    return {
        'DT': {
            'datatrain': data_train,
            'targettrain': target_train,
            'datatest': data_test,
            'targettest': target_test,
            'name': name
        }
    }
def KNN(data, name, visualize, t):
    Data = data.loc[:, data.columns != 'class']
    target = pd.DataFrame()
    target['class'] = data['class']
    data_train, data_test, target_train, target_test = train_test_split(
        Data, target, test_size=t, random_state=42, shuffle=True)
    neigh = KNeighborsClassifier(n_neighbors=5)
    pred = neigh.fit(data_train, target_train).predict(data_test)
    print("KNN " + name + " accuracy: ",
          accuracy_score(target_test, pred, normalize=True))
    print(classification_report(target_test, pred))
    if visualize:
        plot_roc_curve(neigh, data_test, target_test)
        print(confusion_matrix(target_test, pred))
        plt.show()
    return {
        'KNN': {
            'datatrain': data_train,
            'targettrain': target_train,
            'datatest': data_test,
            'targettest': target_test,
            'name': name
        }
    }
def LRegression(data, name, visualize, t):
    target = pd.DataFrame()
    target['class'] = data['class']
    Data = data.loc[:, data.columns != 'class']
    # Split on the feature matrix (Data), not the full frame, so the
    # 'class' column does not leak into the features
    data_train, data_test, target_train, target_test = train_test_split(
        Data, target, test_size=t, random_state=42, shuffle=True)
    lr = LogisticRegression()
    pred = lr.fit(data_train, target_train).predict(data_test)
    print("Logistic Regression " + name + " accuracy: ",
          accuracy_score(target_test, pred, normalize=True))
    print(classification_report(target_test, pred))
    if visualize:
        plot_roc_curve(lr, data_test, target_test)
        print(confusion_matrix(target_test, pred))
        plt.show()
    return {
        'LR': {
            'datatrain': data_train,
            'targettrain': target_train,
            'datatest': data_test,
            'targettest': target_test,
            'name': name
        }
    }
def plot_metrics(metrics_list):
    # Regression metrics
    if 'Score' in metrics_list:
        st.write("Accuracy: ", accuracy.round(2))
    if 'MSE' in metrics_list:
        st.write("MSE: ", mean_squared_error(y_test, y_pred))
        st.write("RMSE: ", np.sqrt(mean_squared_error(y_test, y_pred)))
    # Classification metrics
    if 'Confusion Matrix' in metrics_list:
        st.subheader("Confusion Matrix")
        plot_confusion_matrix(model, x_test, y_test_enc,
                              display_labels=class_names)
        st.pyplot()
    if 'ROC Curve' in metrics_list:
        st.subheader("ROC Curve")
        plot_roc_curve(model, x_test, y_test_enc)
        st.pyplot()
    if 'Precision-Recall Curve' in metrics_list:
        st.subheader("Precision-Recall Curve")
        plot_precision_recall_curve(model, x_test, y_test_enc)
        st.pyplot()
def test_plot_roc_curve_estimator_name_multiple_calls(pyplot, data_binary):
    # non-regression test checking that the `name` used when calling
    # `plot_roc_curve` is used as well when calling `disp.plot()`
    X, y = data_binary
    clf_name = "my hand-crafted name"
    clf = LogisticRegression().fit(X, y)
    disp = plot_roc_curve(clf, X, y, name=clf_name)
    assert disp.estimator_name == clf_name
    pyplot.close("all")
    disp.plot()
    assert clf_name in disp.line_.get_label()
    pyplot.close("all")
    clf_name = "another_name"
    disp.plot(name=clf_name)
    assert clf_name in disp.line_.get_label()
def plot_metrics(x_test, y_test, model, metrics_list, dataset):
    if dataset == 'Mushroom Dataset':
        if 'ROC Curve' in metrics_list:
            st.subheader("ROC Curve")
            plot_roc_curve(model, x_test, y_test)
            st.pyplot()
        if 'Confusion Matrix' in metrics_list:
            st.subheader("Confusion Matrix")
            plot_confusion_matrix(model, x_test, y_test,
                                  display_labels=class_names)
            st.pyplot()
        if 'Precision Recall Curve' in metrics_list:
            st.subheader("Precision-Recall Curve")
            plot_precision_recall_curve(model, x_test, y_test)
            st.pyplot()
    if dataset == 'Iris':
        if 'ROC Curve' in metrics_list:
            # ROC and PR curves are binary-only; Iris has three classes
            pass
        if 'Confusion Matrix' in metrics_list:
            st.subheader("Confusion Matrix")
            plot_confusion_matrix(model, x_test, y_test)
            st.pyplot()
        if 'Precision Recall Curve' in metrics_list:
            pass
def evaluate_classification(model, X_test, y_test, cmap='Greens',
                            normalize='true', classes=['No-Recid', 'Yes-Recid'],
                            figsize=(10, 4), X_train=None, y_train=None,
                            label='Test Data', return_report=False):
    """Evaluates a scikit-learn binary classification model.

    Args:
        model (classifier): any sklearn classification model.
        X_test (Frame or Array): X data
        y_test (Series or Array): y data
        cmap (str, optional): Colormap for confusion matrix. Defaults to 'Greens'.
        normalize (str, optional): normalize argument for plot_confusion_matrix.
            Defaults to 'true'.
        classes (list, optional): List of class names for display.
            Defaults to ['No-Recid', 'Yes-Recid'].
        figsize (tuple, optional): figure size. Defaults to (10, 4).
        X_train (Frame or Array, optional): If provided, compare model.score
            for train and test. Defaults to None.
        y_train (Series or Array, optional): If provided, compare model.score
            for train and test. Defaults to None.
        label (str, optional): label passed to get_report. Defaults to 'Test Data'.
        return_report (bool, optional): if True, return the classification
            report as a DataFrame. Defaults to False.
    """
    # get_report(model, X_test, y_test, as_df=False, label=label, target_names=classes)

    ## Plot confusion matrix and ROC curve
    fig, ax = plt.subplots(ncols=2, figsize=figsize)
    metrics.plot_confusion_matrix(model, X_test, y_test, cmap=cmap,
                                  normalize=normalize, display_labels=classes,
                                  ax=ax[0])
    ## If the ROC curve errors (e.g. multiclass target), delete the second axis
    try:
        curve = metrics.plot_roc_curve(model, X_test, y_test, ax=ax[1])
        curve.ax_.grid()
        curve.ax_.plot([0, 1], [0, 1], ls=':')
        fig.tight_layout()
    except Exception:
        fig.delaxes(ax[1])
    plt.show()

    ## Compare scores if X_train and y_train are provided
    if (X_train is not None) and (y_train is not None):
        print(f"Training Score = {model.score(X_train, y_train):.2f}")
        print(f"Test Score = {model.score(X_test, y_test):.2f}")
    if return_report:
        return get_report(model, X_test, y_test, as_df=True, label=label)
def roCurves(clfList, X_test, y_test):
    roCurveList = []
    plt.subplots(1, 1, figsize=(5, 5))
    styleList = ['solid', 'solid', 'dashed', 'dashed', 'dotted', 'dashed']
    for clf, sty in zip(clfList, styleList):
        ax = plt.gca()
        roc = plot_roc_curve(clf, X_test, y_test, ax=ax,
                             alpha=0.85, lw=2, linestyle=sty)
        roCurveList.append(roc)
    plt.plot([0, 1], [0, 1], color='black', lw=2, linestyle='dotted')
    plt.title('ROC')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    return roCurveList
def test_plot_roc_curve(pyplot, response_method, data_binary,
                        with_sample_weight, drop_intermediate, with_strings):
    X, y = data_binary
    pos_label = None
    if with_strings:
        y = np.array(["c", "b"])[y]
        pos_label = "c"
    if with_sample_weight:
        rng = np.random.RandomState(42)
        sample_weight = rng.randint(1, 4, size=(X.shape[0]))
    else:
        sample_weight = None
    lr = LogisticRegression()
    lr.fit(X, y)
    viz = plot_roc_curve(lr, X, y, alpha=0.8, sample_weight=sample_weight,
                         drop_intermediate=drop_intermediate)
    y_pred = getattr(lr, response_method)(X)
    if y_pred.ndim == 2:
        y_pred = y_pred[:, 1]
    fpr, tpr, _ = roc_curve(y, y_pred, sample_weight=sample_weight,
                            drop_intermediate=drop_intermediate,
                            pos_label=pos_label)
    assert_allclose(viz.roc_auc, auc(fpr, tpr))
    assert_allclose(viz.fpr, fpr)
    assert_allclose(viz.tpr, tpr)
    assert viz.estimator_name == "LogisticRegression"
    # cannot fail thanks to pyplot fixture
    import matplotlib as mpl  # noqa
    assert isinstance(viz.line_, mpl.lines.Line2D)
    assert viz.line_.get_alpha() == 0.8
    assert isinstance(viz.ax_, mpl.axes.Axes)
    assert isinstance(viz.figure_, mpl.figure.Figure)
    expected_label = "LogisticRegression (AUC = {:0.2f})".format(viz.roc_auc)
    assert viz.line_.get_label() == expected_label
    assert viz.ax_.get_ylabel() == "True Positive Rate"
    assert viz.ax_.get_xlabel() == "False Positive Rate"
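# Note: plot_roc_curve (used throughout this file) was deprecated in
# scikit-learn 1.0 and removed in 1.2. A minimal migration sketch using the
# replacement API; the classifier and data below are illustrative.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import RocCurveDisplay

X_mig, y_mig = make_classification(n_samples=200, random_state=0)
clf_mig = LogisticRegression().fit(X_mig, y_mig)
disp = RocCurveDisplay.from_estimator(clf_mig, X_mig, y_mig, name="LogReg")
# ...or, from precomputed scores:
# disp = RocCurveDisplay.from_predictions(y_mig, clf_mig.predict_proba(X_mig)[:, 1])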
def run_classification_models(features, budget_cats):
    # make train/test splits
    X_train, X_test, y_train, y_test = train_test_split(
        features, budget_cats, test_size=0.33, random_state=0)
    # The Naive Bayes models perform badly, so they are removed
    classifiers = [
        (LogisticRegression(random_state=0, max_iter=1000),
         {'C': np.logspace(-2, 7, 10)}),
        (GradientBoostingClassifier(n_estimators=50, random_state=0),
         {'learning_rate': np.logspace(-4, 0, 10)}),
        (SVC(random_state=0),
         {'C': np.logspace(-2, 7, 10)}),
    ]
    for classifier, parameters in classifiers:
        print(classifier)
        clf = GridSearchCV(classifier, parameters, cv=3)
        clf.fit(X_train, y_train)
        print("Best parameters set found on development set:")
        print()
        print(clf.best_params_)
        print()
        print("Grid scores on development set:")
        print()
        means = clf.cv_results_['mean_test_score']
        stds = clf.cv_results_['std_test_score']
        for mean, std, params in zip(means, stds, clf.cv_results_['params']):
            print("%0.3f (+/-%0.03f) for %r" % (mean, std * 2, params))
        print()
        print("Detailed classification report:")
        y_true, y_pred = y_test, clf.predict(X_test)
        print("Accuracy Score: \n")
        print(accuracy_score(y_test, y_pred))
        print("F1 Score: \n")
        print(f1_score(y_true, y_pred, average='macro'))
        print(classification_report(y_true, y_pred))
        disp = plot_roc_curve(clf, X_test, y_test)
        plt.show()
def print_classification_summary(model_name, dataset, model_instance, y, X,
                                 positive_label=1, negative_label=0):
    """Prints summary information for the chosen model, including the target
    distribution of the dataset used, classification metric scores, the
    confusion matrix, and ROC/AUC and Precision/Recall curves."""
    from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, confusion_matrix
    from sklearn.metrics import plot_confusion_matrix, plot_roc_curve, plot_precision_recall_curve
    y_pred = model_instance.predict(X)
    class_y = np.unique(y)
    print('***RESULTS SUMMARY***')
    print('-' * 90)
    print('Model:', model_name)
    print('Dataset:', dataset)
    print('Target distribution:')
    print('\n')
    print('Class:', class_y[0], '/ Count:', sum(y == class_y[0]),
          '/ Pct:', round(sum(y == class_y[0]) / len(y) * 100, 0))
    print('Class:', class_y[1], '/ Count:', sum(y == class_y[1]),
          '/ Pct:', round(sum(y == class_y[1]) / len(y) * 100, 0))
    print('-' * 90)
    print('Metric Scores: \n')
    print('Accuracy score:', round(accuracy_score(y, y_pred), 2))
    print('Recall score:', round(recall_score(y, y_pred), 2))
    print('Precision score:', round(precision_score(y, y_pred), 2))
    print('F1 score:', round(f1_score(y, y_pred), 2))
    print('-' * 90)
    print('Plots:')
    ax = plot_confusion_matrix(model_instance, X, y, values_format='d')
    plt.title('Confusion Matrix')
    plt.show()
    ax = plot_roc_curve(model_instance, X, y)
    plt.title('ROC Curve')
    plt.show()
    ax = plot_precision_recall_curve(model_instance, X, y)
    plt.title('Precision Recall Curve')
    plt.show()
def compare_models(classifiers: Dict[str, ClassifierMixin],
                   cv: StratifiedShuffleSplit,
                   x: np.ndarray, y: np.ndarray,
                   validation_size=0.2):
    train_scores: Dict[str, List] = {}
    test_scores: Dict[str, List] = {}
    for name in classifiers.keys():
        train_scores[name] = []
        test_scores[name] = []
    # Hold out a validation set for the final plots
    validation_cv = StratifiedShuffleSplit(n_splits=1,
                                           test_size=validation_size,
                                           random_state=0)
    train_ind, validation_ind = next(validation_cv.split(x, y))
    x_validation, y_validation = x[validation_ind], y[validation_ind]
    x, y = x[train_ind], y[train_ind]
    for train_ind, test_ind in cv.split(x, y):
        x_train, y_train = x[train_ind], y[train_ind]
        x_test, y_test = x[test_ind], y[test_ind]
        for name, clf in classifiers.items():
            clf.fit(x_train, y_train)
            train_scores[name].append(clf.score(x_train, y_train))
            test_scores[name].append(clf.score(x_test, y_test))
    for name, clf in classifiers.items():
        plt.figure()
        ax = plt.subplot(2, 2, 1)
        disp = plot_precision_recall_curve(clf, x_validation, y_validation, ax=ax)
        disp.ax_.set_title('{} Precision-Recall curve'.format(name))
        ax = plt.subplot(2, 2, 2)
        disp = plot_roc_curve(clf, x_validation, y_validation, ax=ax)
        disp.ax_.set_title('{} ROC curve'.format(name))
        ax = plt.subplot(2, 2, 3)
        disp = plot_confusion_matrix(clf, x_validation, y_validation, ax=ax)
        disp.ax_.set_title('{} Confusion matrix'.format(name))
        plt.show()
    return train_scores, test_scores
def lab1_6(dataset: np.ndarray, targetDataset: np.ndarray,
           test: np.ndarray, testTarget: np.ndarray) -> None:
    """Load the dataset from bank_scoring_train.csv. This is a set of
    financial data describing individuals. The target column is
    'SeriousDlqin2yrs', indicating whether the client's financial situation
    will deteriorate.

    Build a system that decides whether to grant a loan to an individual.
    Build at least two variants of the system based on different
    classifiers. Choose a quality metric suited to the specifics of the
    task and determine which system's decisions performed better on
    bank_scoring_test.csv.

    :param dataset: Source data for training
    :param targetDataset: Corresponding classes
    :param test: Data for testing
    :param testTarget: Corresponding classes for testing
    """
    x_train, x_test, y_train, y_test = train_test_split(dataset, targetDataset,
                                                        test_size=0.33)
    # Tune the best parameter for DecisionTreeClassifier;
    # evaluation criteria: accuracy, ROC and AUC
    max_depths = (3, 7, 15)
    for max_depth in max_depths:
        classifier = DecisionTreeClassifier(max_depth=max_depth)
        classifier.fit(x_train, y_train)
        prediction = classifier.predict(x_test)
        accuracy = accuracy_score(y_test, prediction)
        # ROC
        plot_roc_curve(classifier, x_test, y_test)
        plt.title('ROC curve (accuracy = {:.2f}, max depth = {})'.format(
            accuracy, max_depth))
        plt.show()
    # For DecisionTreeClassifier the best parameter is max_depth = 7;
    # now look at GaussianNB
    classifier = GaussianNB()
    classifier.fit(x_train, y_train)
    prediction = classifier.predict(x_test)
    accuracy = accuracy_score(y_test, prediction)
    # ROC
    plot_roc_curve(classifier, x_test, y_test)
    plt.title('ROC curve (accuracy = {:.2f})'.format(accuracy))
    plt.show()
    # Check the two best variants on the test data
    bestClassifiers = (DecisionTreeClassifier(max_depth=7), GaussianNB())
    for bestClassifier in bestClassifiers:
        bestClassifier.fit(x_train, y_train)
        prediction = bestClassifier.predict(test)
        accuracy = accuracy_score(testTarget, prediction)
        # ROC
        plot_roc_curve(bestClassifier, test, testTarget)
        plt.title('ROC curve for best classifier (accuracy = {:.2f})'.format(
            accuracy))
        plt.show()
def run(X, y, penalty='l2', run_origin='localRun'):
    solver = "saga"
    # compare strings with ==, not the identity operator `is`
    if penalty == "elasticnet":
        l1_ratio = 0.5
    else:
        l1_ratio = None
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
    with mlflow.start_run(run_name=run_origin) as run:
        lr = LogisticRegression(penalty=penalty, solver=solver,
                                l1_ratio=l1_ratio)
        lr.fit(X_train, y_train)
        score_train = lr.score(X_train, y_train)
        score_test = lr.score(X_test, y_test)
        prec_test = precision_score(y_test, lr.predict(X_test))
        rec_test = recall_score(y_test, lr.predict(X_test))
        f1_test = f1_score(y_test, lr.predict(X_test))
        print("hyperparameters: ", lr.get_params())
        print("train score: ", score_train)
        print("test score: ", score_test)
        print("test precision: ", prec_test)
        print("test recall: ", rec_test)
        print("test f1 score: ", f1_test)
        disp = plot_confusion_matrix(lr, X_test, y_test)
        print(disp.confusion_matrix)
        plt.savefig("sklearn_logreg_conf_mat.png")
        disp = plot_roc_curve(lr, X_test, y_test)
        plt.savefig("sklearn_logreg_roc_curve.png")
        print("runId: ", run.info.run_id)
        print("artifact_uri: ", mlflow.get_artifact_uri())
        mlflow.log_metrics({"training score": score_train,
                            "test score": score_test})
        mlflow.log_params(lr.get_params())
        mlflow.set_tags({"run_origin": run_origin})
        mlflow.log_artifact("sklearn_logreg_conf_mat.png", "figures")
        mlflow.log_artifact("sklearn_logreg_roc_curve.png", "figures")
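# A minimal invocation sketch for run(), assuming mlflow is installed and a
# tracking backend is configured; the synthetic dataset is illustrative.
from sklearn.datasets import make_classification

X_run, y_run = make_classification(n_samples=300, random_state=42)
run(X_run, y_run, penalty="elasticnet", run_origin="demo")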
def generate_metrics_for_production_model(pipeline, x_test, y_test):
    print("Count of label NC (id 2 in Database, 1 in CM) in y_test: {}".format(
        sum(y_test == 2)))
    print("Count of label AD-MCI (id 1 in Database, 0 in CM) in y_test: {} \n".format(
        sum(y_test == 1)))
    y_pred = pipeline.predict(x_test)
    cf_matrix = confusion_matrix(y_test, y_pred)
    sns.heatmap(cf_matrix, annot=True, fmt='d', cmap="RdBu", cbar=False,
                annot_kws={'fontsize': 16})
    print('Explain Confusion Matrix.\n', explain_confusion_matrix(y_test, y_pred))
    print('Custom Accuracy :', custom_accuracy(y_test, y_pred))
    print('Custom Sensitivity :', custom_sensitivity(y_test, y_pred))
    print('Custom Specificity :', custom_specificity(y_test, y_pred))
    print('Custom Precision :', custom_precision(y_test, y_pred))
    print('Custom NPV :', custom_npv(y_test, y_pred))
    # plot ROC curve
    display_roc = plot_roc_curve(pipeline, x_test, y_test)
    roc_axes = display_roc.ax_
    roc_axes.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
                  label='Chance', alpha=.8)
    # plot precision-recall curve
    display_pr = plot_precision_recall_curve(pipeline, x_test, y_test)
    pr_axes = display_pr.ax_
    pr_axes.plot([0, 1], [0.5, 0.5], linestyle='--', lw=2, color='r',
                 label='Chance', alpha=.8)
def plot_roc(model, X, y, filename):
    """
    Plot the ROC curve. Note that sklearn's plot_roc_curve only supports
    binary classification; a three-class ROC cannot be plotted this way.

    Args:
        model: the model
        X: features
        y: labels
        filename: path to save the figure
    """
    ax = plt.gca()
    dis = plot_roc_curve(model, X, y, ax=ax)
    dis.plot(ax=ax, alpha=0.8)
    plt.savefig(filename)
    try:
        plt.show()
    except Exception as e:
        print(e.args)
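# The docstring above notes that plot_roc_curve only handles binary targets.
# A minimal one-vs-rest sketch for a multiclass ROC, assuming a fitted model
# with predict_proba; the function name and arguments here are illustrative.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize

def plot_multiclass_roc(model, X, y, filename):
    classes = np.unique(y)
    y_bin = label_binarize(y, classes=classes)  # shape (n_samples, n_classes)
    y_score = model.predict_proba(X)
    fig, ax = plt.subplots()
    # one ROC curve per class, each class treated as positive vs the rest
    for i, cls in enumerate(classes):
        fpr, tpr, _ = roc_curve(y_bin[:, i], y_score[:, i])
        ax.plot(fpr, tpr,
                label='class {} (AUC = {:.2f})'.format(cls, auc(fpr, tpr)))
    ax.plot([0, 1], [0, 1], linestyle='--', color='grey')
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.legend(loc='lower right')
    fig.savefig(filename)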
def experiment(X_train,  # TODO
               X_test, y_train, y_test, fname, binarize=False):
    print("#" * 60)
    print('Started Experiment ..')
    # KNN
    # knn = sklearn.neighbors.KNeighborsClassifier(n_jobs=-1)
    # knn.fit(X_train, y_train)
    # print_accuracy(knn.predict)
    # Random Forest
    rforest = RandomForestClassifier(n_estimators=100, max_depth=None,
                                     min_samples_split=2, random_state=0,
                                     n_jobs=-1)
    rforest.fit(X_train, y_train)
    # print_accuracy(rforest.predict)
    y_pred = rforest.predict(X_test)
    evaluate_exp(y_test, y_pred, binarize)
    try:
        # the import doubles as a version check: plot_roc_curve is not
        # available in older sklearn releases
        from sklearn.metrics import plot_roc_curve
        rforest_disp = skm.plot_roc_curve(rforest, X_test, y_test)
        plt_name = 'roc_curve_{}.png'.format(fname)
        plt.savefig(plt_name, format='png')
        plt.show()
    except Exception:
        print("Not working, plot_roc_curve!")
    disp = skm.plot_precision_recall_curve(rforest, X_test, y_test)
    average_precision = skm.average_precision_score(y_test, y_pred)
    disp.ax_.set_title('2-class Precision-Recall curve: '
                       'AP={0:0.2f}'.format(average_precision))
    print("#" * 60)
def plot_roc(self, verbose=False):
    """Plot the ROC curve for the model using stratified 6-fold CV."""
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    cv = StratifiedKFold(n_splits=6)
    fig, ax = plt.subplots()
    for i, (train, test) in enumerate(cv.split(self.data['features'],
                                               self.data['labels'])):
        self.model.fit(self.data['features'][train],
                       self.data['labels'][train])
        viz = plot_roc_curve(self.model,
                             self.data['features'][test],
                             self.data['labels'][test],
                             name='ROC fold {}'.format(i),
                             alpha=0.3, lw=1, ax=ax)
        interp_tpr = interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)
    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
            label='Chance', alpha=.8)
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='b',
            label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
            lw=2, alpha=.8)
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                    label=r'$\pm$ 1 std. dev.')
    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
           title="Receiver operating characteristic example")
    ax.legend(loc="lower right")
    plt.show()
def visualize_roc_curve_with_cross_validation_1(clf, index_of_curve,
                                                x_with_features, y, mean_fpr,
                                                tprs, aucs, ax):
    ## train set
    # TODO: follow my note documentation
    # By default (response_method='auto'), plot_roc_curve tries
    # predict_proba first and falls back to decision_function
    viz = plot_roc_curve(clf, x_with_features, y,
                         name='ROC fold {}'.format(index_of_curve),
                         alpha=0.3, lw=1, ax=ax)
    ### accumulate the interpolated ROC curve of this fold
    interp_tpr = interp(mean_fpr, viz.fpr, viz.tpr)
    interp_tpr[0] = 0.0
    tprs.append(interp_tpr)
    aucs.append(viz.roc_auc)
    return ax, tprs, aucs
def runXGB(model, train_data, labels, test_data, index, n_folds=5,
           submiss_dir='./submiss'):
    fig, ax = plt.subplots()
    aucs = []
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)
    for i, (train, valid) in enumerate(cv.split(train_data, labels)):
        model.fit(train_data[train], labels[train],
                  early_stopping_rounds=50,
                  eval_set=[(train_data[valid], labels[valid])],
                  verbose=0)
        plot = plot_roc_curve(model, train_data[valid], labels[valid],
                              name=f'Fold number {i+1}', ax=ax)
        aucs.append(plot.roc_auc)
        # write one submission file per fold
        test_pred = model.predict_proba(test_data)[:, 1]
        submiss = pd.DataFrame({"id": index, "label": test_pred})
        submiss_path = os.path.join(submiss_dir,
                                    f'XGB_{plot.roc_auc:.2f}_{i+1}.csv')
        submiss.to_csv(submiss_path, index=False)
    ax.plot([0, 1], [0, 1], label='Luck', linestyle='--', color='r')
    mean_auc = np.mean(aucs)
    std_auc = np.std(aucs)
    # legend-only entry for the average AUC (plotting the scalar itself
    # would draw a spurious point at x=0)
    ax.plot([], [], ' ',
            label=rf'Average AUC score: {mean_auc:.2f} $\pm$ {std_auc:.2f}')
    ax.legend(loc="lower right")
    ax.set(xlim=[-.1, 1.1], ylim=[-.1, 1.1], title='XGBoost Classifier')
    plt.show()
def ROC_ML(model, X_test, y_test, name, i, rf=False, xgb=False):
    if rf:
        ax = plt.gca()
        score = plot_roc_curve(model, X_test, y_test, ax=ax, alpha=0.8)
        plt.show()
        sr, pr = SR_maker(y_test, model.predict(X_test))
        return score.roc_auc, sr, pr
    else:
        if xgb:
            y_pred_keras_tmp = model.predict(X_test)
        else:
            y_pred_keras_tmp = model.decision_function(X_test)
        fpr_keras, tpr_keras, _ = roc_curve(y_test, y_pred_keras_tmp)
        auc_keras = auc(fpr_keras, tpr_keras)
        if i == 0 and name == "SVM":
            plt.clf()
        if i == 6 and name == "SVM":
            plt.clf()
        plt.figure(i)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.plot(fpr_keras, tpr_keras,
                 label=name + str(i) + ' = {:.3f}'.format(auc_keras))
        plt.xlabel('False positive rate')
        plt.ylabel('True positive rate')
        plt.title('ROC curve _ ' + name)
        plt.legend(loc='best')
        fig1 = plt.gcf()
        plt.show()
        plt.draw()
        # fig1.savefig('result/ROC_' + name + str(i) + '.png', dpi=100)
        sr, pr = SR_maker(y_test, model.predict(X_test))
        return auc_keras, sr, pr
def roc_curve(classifier, cv, X, y):
    tprs = []
    aucs = []
    mean_fpr = np.linspace(0, 1, 100)
    fig, ax = plt.subplots()
    for i, (train, test) in enumerate(cv.split(X, y)):
        classifier.fit(X[train], np.ravel(y[train]))
        viz = plot_roc_curve(classifier, X[test], y[test],
                             name='ROC fold {}'.format(i),
                             alpha=0.3, lw=1, ax=ax)
        interp_tpr = np.interp(mean_fpr, viz.fpr, viz.tpr)
        interp_tpr[0] = 0.0
        tprs.append(interp_tpr)
        aucs.append(viz.roc_auc)
    ax.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r',
            label='Chance', alpha=.8)
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    std_auc = np.std(aucs)
    ax.plot(mean_fpr, mean_tpr, color='b',
            label=r'Mean ROC (AUC = %0.2f $\pm$ %0.2f)' % (mean_auc, std_auc),
            lw=2, alpha=.8)
    std_tpr = np.std(tprs, axis=0)
    tprs_upper = np.minimum(mean_tpr + std_tpr, 1)
    tprs_lower = np.maximum(mean_tpr - std_tpr, 0)
    # do not rebind `fig` to the PolyCollection returned by fill_between
    ax.fill_between(mean_fpr, tprs_lower, tprs_upper, color='grey', alpha=.2,
                    label=r'$\pm$ 1 std. dev.')
    ax.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05],
           title="Receiver operating characteristic example")
    ax.legend(loc="lower right")
    plt.show()