def plot_decision_threshold():
    """Plot how shifting an SVC decision threshold changes the predictions.

    Generates a two-class blob dataset with 400 and 50 samples, fits
    ``SVC(gamma=.05)`` on the training split and fills a 2x3 grid of axes:

    * top row: the training data, then the decision function with the
      separator drawn at threshold 0 and at threshold -0.8;
    * bottom row: the first cell is hidden, the other two show the decision
      value along the cross-section where the second feature equals 10,
      shaded where it exceeds 0 and -0.8 respectively.

    The function draws on a new figure and returns nothing.  It relies on
    ``plt``, ``np``, ``cm``, ``plot_2d_scores`` and ``plot_2d_separator``
    being available at module level.
    """
    from mglearn.datasets import make_blobs
    from sklearn.svm import SVC
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        # Only a missing module should trigger the fallback location; a bare
        # ``except`` would also hide unrelated failures.
        from sklearn.cross_validation import train_test_split

    X, y = make_blobs(n_samples=(400, 50), centers=2, cluster_std=[7.0, 2],
                      random_state=22)
    # The test split is not used by any of the plots.
    X_train, _, y_train, _ = train_test_split(X, y, random_state=0)

    fig, axes = plt.subplots(2, 3, figsize=(15, 8))
    plt.suptitle("decision_threshold")

    axes[0, 0].set_title("training data")
    axes[0, 0].scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm)

    svc = SVC(gamma=.05).fit(X_train, y_train)

    axes[0, 1].set_title("decision with threshold 0")
    axes[0, 1].scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm)
    plot_2d_scores(svc, X_train, function="decision_function", alpha=.7,
                   ax=axes[0, 1])
    plot_2d_separator(svc, X_train, linewidth=3, ax=axes[0, 1])

    axes[0, 2].set_title("decision with threshold -0.8")
    axes[0, 2].scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=cm)
    plot_2d_separator(svc, X_train, linewidth=3, ax=axes[0, 2],
                      threshold=-.8)
    plot_2d_scores(svc, X_train, function="decision_function", alpha=.7,
                   ax=axes[0, 2])

    axes[1, 0].set_visible(False)

    # Training points whose second feature lies within 5 of 7; they are drawn
    # along the zero line of the cross-section plots.
    mask = np.abs(X_train[:, 1] - 7) < 5
    n_selected = np.sum(mask)

    line = np.linspace(X_train.min(), X_train.max(), 100)
    # Decision values along the cross-section; computed once and shared by
    # both bottom panels instead of re-evaluating the model four times.
    dec = svc.decision_function(np.c_[line, 10 * np.ones(100)])
    band = np.linspace(-1.5, 1.5, 10)

    axes[1, 1].set_title("Cross-section with threshold 0")
    axes[1, 1].plot(line, dec, c='k')
    contour = (dec > 0).reshape(1, -1).repeat(10, axis=0)
    axes[1, 1].contourf(line, band, contour, alpha=0.2, cmap=cm)
    axes[1, 1].scatter(X_train[mask, 0], np.zeros(n_selected),
                       c=y_train[mask], cmap=cm, alpha=.1, s=100)
    axes[1, 1].set_xlim(X_train.min(), X_train.max())
    axes[1, 1].set_ylim(-1.5, 1.5)
    axes[1, 1].set_xticks(())
    axes[1, 1].set_ylabel("Decision value")

    contour2 = (dec > -.8).reshape(1, -1).repeat(10, axis=0)
    axes[1, 2].set_title("Cross-section with threshold -0.8")
    axes[1, 2].contourf(line, band, contour2, alpha=0.2, cmap=cm)
    axes[1, 2].scatter(X_train[mask, 0], np.zeros(n_selected),
                       c=y_train[mask], cmap=cm, alpha=.1, s=100)
    axes[1, 2].plot(line, dec, c='k')
    axes[1, 2].set_xlim(X_train.min(), X_train.max())
    axes[1, 2].set_ylim(-1.5, 1.5)
    axes[1, 2].set_xticks(())
    axes[1, 2].set_ylabel("Decision value")
def plot_decision_threshold():
    """Plot how shifting an SVC decision threshold changes the predictions.

    Generates a two-class blob dataset with 400 and 50 samples, fits
    ``SVC(gamma=.05)`` on the training split and fills a 2x3 grid of
    tick-less axes:

    * top row: the training data, then the decision function with the
      separator drawn at threshold 0 and at threshold -0.8;
    * bottom row: the first cell has its axis switched off, the other two
      show the decision value along the cross-section where the second
      feature equals 10, shaded where it exceeds 0 and -0.8 respectively.

    The function draws on a new figure and returns nothing.  It relies on
    ``plt``, ``np``, ``cm``, ``ReBl``, ``discrete_scatter``,
    ``plot_2d_scores`` and ``plot_2d_separator`` being available at module
    level.
    """
    from mglearn.datasets import make_blobs
    from sklearn.svm import SVC
    from sklearn.model_selection import train_test_split

    X, y = make_blobs(n_samples=(400, 50), centers=2, cluster_std=[7.0, 2],
                      random_state=22)
    # The test split is not used by any of the plots.
    X_train, _, y_train, _ = train_test_split(X, y, random_state=0)

    fig, axes = plt.subplots(2, 3, figsize=(15, 8),
                             subplot_kw={'xticks': (), 'yticks': ()})
    plt.suptitle("decision_threshold")

    axes[0, 0].set_title("training data")
    discrete_scatter(X_train[:, 0], X_train[:, 1], y_train, ax=axes[0, 0])

    svc = SVC(gamma=.05).fit(X_train, y_train)

    axes[0, 1].set_title("decision with threshold 0")
    discrete_scatter(X_train[:, 0], X_train[:, 1], y_train, ax=axes[0, 1])
    plot_2d_scores(svc, X_train, function="decision_function", alpha=.7,
                   ax=axes[0, 1], cm=ReBl)
    plot_2d_separator(svc, X_train, linewidth=3, ax=axes[0, 1])

    axes[0, 2].set_title("decision with threshold -0.8")
    discrete_scatter(X_train[:, 0], X_train[:, 1], y_train, ax=axes[0, 2])
    plot_2d_separator(svc, X_train, linewidth=3, ax=axes[0, 2],
                      threshold=-.8)
    plot_2d_scores(svc, X_train, function="decision_function", alpha=.7,
                   ax=axes[0, 2], cm=ReBl)

    axes[1, 0].set_axis_off()

    # Training points whose second feature lies within 5 of 7; they are drawn
    # along the zero line of the cross-section plots.
    mask = np.abs(X_train[:, 1] - 7) < 5
    n_selected = np.sum(mask)

    line = np.linspace(X_train.min(), X_train.max(), 100)
    # Decision values along the cross-section; evaluated once and reused for
    # both curves and both shaded regions.
    dec = svc.decision_function(np.c_[line, 10 * np.ones(100)])
    band = np.linspace(-1.5, 1.5, 10)

    # The drawing order inside each panel is kept exactly as before: the
    # curve and the scatter markers are added in a different order in the two
    # panels, and that order decides which one ends up on top.
    axes[1, 1].set_title("Cross-section with threshold 0")
    axes[1, 1].plot(line, dec, c='k')
    contour = (dec > 0).reshape(1, -1).repeat(10, axis=0)
    axes[1, 1].contourf(line, band, contour, alpha=0.4, cmap=cm)
    discrete_scatter(X_train[mask, 0], np.zeros(n_selected), y_train[mask],
                     ax=axes[1, 1])
    axes[1, 1].set_xlim(X_train.min(), X_train.max())
    axes[1, 1].set_ylim(-1.5, 1.5)
    axes[1, 1].set_xticks(())
    axes[1, 1].set_ylabel("Decision value")

    contour2 = (dec > -.8).reshape(1, -1).repeat(10, axis=0)
    axes[1, 2].set_title("Cross-section with threshold -0.8")
    axes[1, 2].contourf(line, band, contour2, alpha=0.4, cmap=cm)
    discrete_scatter(X_train[mask, 0], np.zeros(n_selected), y_train[mask],
                     alpha=.1, ax=axes[1, 2])
    axes[1, 2].plot(line, dec, c='k')
    axes[1, 2].set_xlim(X_train.min(), X_train.max())
    axes[1, 2].set_ylim(-1.5, 1.5)
    axes[1, 2].set_xticks(())
    axes[1, 2].set_ylabel("Decision value")

    # NOTE(review): the legend is requested on axes[1, 0], which had its axis
    # switched off and has nothing plotted on it in this function -- confirm
    # that it actually shows the two class entries.
    axes[1, 0].legend(['negative class', 'positive class'])
def imbalanced_two_classes():
    """Report how an SVC behaves on an imbalanced two-class blob dataset.

    Builds blobs with 350 and 50 samples, fits ``SVC(gamma=0.05)`` on the
    training split and prints, for the test split, a classification report
    and confusion matrix twice: once for the default ``predict`` output and
    once for predictions obtained by thresholding the decision function at
    -0.35.  Finally draws mglearn's decision-threshold figure and sets its
    super-title.

    Relies on ``seed``, ``show_title``, ``show_subtitle``, ``mglearn`` and
    ``plt`` being available at module level.  Returns nothing.
    """
    from mglearn.datasets import make_blobs
    from sklearn.metrics import classification_report, confusion_matrix
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC

    features, labels = make_blobs(n_samples=(350, 50), cluster_std=[7.0, 2],
                                  random_state=seed)
    show_title("不平衡数据的二分类问题")
    print("数据中87.5%是一类,12.5%的数据是另一类")

    train_features, test_features, train_labels, test_labels = train_test_split(
        features, labels, random_state=seed)
    show_subtitle("测试集的标签")
    print(test_labels)

    classifier = SVC(gamma=0.05)
    classifier.fit(train_features, train_labels)

    default_predictions = classifier.predict(test_features)
    # The decision function can be used to compensate for the class
    # imbalance, i.e. how the classes are weighted against each other.
    # Setting the threshold by hand like this is not a great approach, and
    # picking its value relies on experience rather than on an algorithm.
    lowered_predictions = classifier.decision_function(test_features) > -0.35

    reports = (
        ("SVC分类报告", default_predictions),
        ("SVC基于决策函数进行预测的分类报告:", lowered_predictions),
    )
    for report_heading, predictions in reports:
        show_subtitle(report_heading)
        print(classification_report(test_labels, predictions))
        show_subtitle("SVC分类的混淆矩阵")
        print(confusion_matrix(test_labels, predictions))

    mglearn.plots.plot_decision_threshold()
    plt.suptitle("图5-12:决策函数的热图与改变决策阈值的影响")
from mglearn.datasets import make_blobs
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Imbalanced two-class data: 400 samples in one blob, 50 in the other.
X, y = make_blobs(
    n_samples=(400, 50),
    centers=2,
    cluster_std=[7.0, 2],
    random_state=22,
)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

svc = SVC(gamma=0.05).fit(X_train, y_train)

# Predict the positive class whenever the decision value exceeds -0.8
# rather than relying on what ``predict`` returns.
y_pred_lower_threshold = svc.decision_function(X_test) > -0.8

# Print one classification report per set of predictions.
for heading, predictions in (
    ('Default:', svc.predict(X_test)),
    ('Lower Threshold:', y_pred_lower_threshold),
):
    print(heading)
    print(classification_report(y_test, predictions))
def compare_roc_curve():
    """Compare the ROC curves and AUC of an SVC and a random forest.

    Builds a two-class blob dataset with 4000 and 500 samples, fits
    ``SVC(gamma=0.05)`` and a 100-tree ``RandomForestClassifier`` on the
    training split, and then

    * plots the SVC ROC curve on the current figure, marking the point whose
      threshold is closest to 0;
    * opens a second figure with both ROC curves, additionally marking the
      random-forest point whose threshold is closest to 0.5;
    * prints the AUC of both models.

    Relies on ``seed``, ``show_title``, ``show_subtitle``, ``np`` and ``plt``
    being available at module level.  Returns nothing.
    """
    from sklearn.datasets import make_blobs
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import roc_auc_score, roc_curve
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC

    class_sizes = (4000, 500)
    X, y = make_blobs(n_samples=class_sizes, n_features=2,
                      cluster_std=[7.0, 2], random_state=22)
    show_title("使用 ROC 曲线分析不平衡的数据对模型的影响")
    # Derive the printed class split from the sizes actually used; the
    # previously hard-coded 87.5% / 12.5% belongs to a 350/50 dataset.
    total = float(sum(class_sizes))
    print("数据中{:.1f}%是一类,{:.1f}%的数据是另一类".format(
        100 * class_sizes[0] / total, 100 * class_sizes[1] / total))

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        random_state=seed)
    show_subtitle("测试集的标签")
    print(y_test)

    svc = SVC(gamma=0.05)
    svc.fit(X_train, y_train)
    # Scores are needed for both the curve and the AUC; compute them once.
    svc_scores = svc.decision_function(X_test)

    fpr, tpr, thresholds = roc_curve(y_test, svc_scores)
    # Index of the threshold closest to zero.
    close_zero = np.argmin(np.abs(thresholds))
    plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
             label="SVC的0阈值", fillstyle='none', c='k', mew=2)
    plt.plot(fpr, tpr, label="ROC Curve")
    plt.xlabel('FPR')
    plt.ylabel('TPR(recall)')
    plt.legend()
    plt.title("图5-15:SVC(gamma=0.05)的ROC曲线\n"
              "曲线越靠近左上角,则分类器越好")

    rf = RandomForestClassifier(n_estimators=100, random_state=0,
                                max_features=2)
    rf.fit(X_train, y_train)
    # RandomForestClassifier has predict_proba but no decision_function, so
    # the probability of the second class is used as the score.
    rf_scores = rf.predict_proba(X_test)[:, 1]
    fpr_rf, tpr_rf, thresholds_rf = roc_curve(y_test, rf_scores)
    close_default_rf = np.argmin(np.abs(thresholds_rf - 0.5))

    plt.figure()
    plt.plot(fpr[close_zero], tpr[close_zero], 'o', markersize=10,
             label="SVC的0阈值", fillstyle='none', c='k', mew=2)
    plt.plot(fpr, tpr, label="ROC Curve SVC")
    plt.plot(fpr_rf[close_default_rf], tpr_rf[close_default_rf], '^',
             markersize=10, label="随机森林的0.5阈值", fillstyle='none',
             c='k', mew=2)
    plt.plot(fpr_rf, tpr_rf, label="ROC Curve RF")
    plt.xlabel('FPR(假真类率)')
    plt.ylabel('TPR(真真类率)')
    plt.legend()
    plt.title("图5-16:比较 SVM 和 随机森林 的 ROC曲线\n"
              "曲线越靠近左上角,则分类器越好\n"
              "即假真类率(FPR)要低,真真类率(TPR)要高")

    # For classification on imbalanced datasets AUC works better than
    # accuracy.  It can be read as the probability that a randomly drawn
    # positive sample receives a higher score than a randomly drawn negative
    # sample.
    show_subtitle("AUC 表示曲线下的积分(即面积),解释为评估正例样本的排名")
    svc_auc = roc_auc_score(y_test, svc_scores)
    print("SVC的AUC:{:.3f}".format(svc_auc))
    rf_auc = roc_auc_score(y_test, rf_scores)
    print("随机森林的AUC:{:.3f}".format(rf_auc))
    print("对于不平衡类别的分类问题,选择模型时使用 AUC 比 精度 更有意义")
def compare_precision_recall_curve():
    """Compare precision-recall curves of an SVC and a random forest.

    Builds a two-class blob dataset with 4000 and 500 samples, fits
    ``SVC(gamma=0.05)`` and a 100-tree ``RandomForestClassifier`` on the
    training split, and then

    * plots the SVC precision-recall curve on the current figure (precision
      on the x axis, recall on the y axis), marking the point whose threshold
      is closest to 0;
    * opens a second figure with both curves, additionally marking the
      random-forest point whose threshold is closest to 0.5;
    * prints the f1 score and the average precision of both models.

    Relies on ``show_title``, ``show_subtitle``, ``np`` and ``plt`` being
    available at module level.  Returns nothing.
    """
    from sklearn.datasets import make_blobs
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import (average_precision_score, f1_score,
                                 precision_recall_curve)
    from sklearn.model_selection import train_test_split
    from sklearn.svm import SVC

    class_sizes = (4000, 500)
    X, y = make_blobs(n_samples=class_sizes, n_features=2,
                      cluster_std=[7.0, 2], random_state=22)
    show_title("使用“准确率——召回率曲线”分析不平衡的数据对模型的影响")
    # Derive the printed class split from the sizes actually used; the
    # previously hard-coded 87.5% / 12.5% belongs to a 350/50 dataset.
    total = float(sum(class_sizes))
    print("数据中{:.1f}%是一类,{:.1f}%的数据是另一类".format(
        100 * class_sizes[0] / total, 100 * class_sizes[1] / total))

    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
    show_subtitle("测试集的标签")
    print(y_test)

    svc = SVC(gamma=0.05)
    svc.fit(X_train, y_train)
    # Scores are needed for both the curve and the average precision;
    # compute them once.
    svc_scores = svc.decision_function(X_test)

    precision_svc, recall_svc, thresholds_svc = precision_recall_curve(
        y_test, svc_scores)
    # Index of the threshold closest to zero.
    close_zero = np.argmin(np.abs(thresholds_svc))
    plt.plot(precision_svc[close_zero], recall_svc[close_zero], 'o',
             markersize=10, label="SVC的0阈值", fillstyle='none', c='k',
             mew=2)
    plt.plot(precision_svc, recall_svc, label="准确率——召回率曲线")
    plt.xlabel('准确率')
    plt.ylabel('召回率')
    plt.legend()
    plt.suptitle("图5-13:SVC(gamma=0.05)的准确率--召回率曲线\n"
                 "曲线越靠近右上角,则分类器越好")

    rf = RandomForestClassifier(n_estimators=100, random_state=0,
                                max_features=2)
    rf.fit(X_train, y_train)
    # RandomForestClassifier has predict_proba but no decision_function, so
    # the probability of the second class is used as the score.
    rf_scores = rf.predict_proba(X_test)[:, 1]
    precision_rf, recall_rf, thresholds_rf = precision_recall_curve(
        y_test, rf_scores)

    plt.figure()
    plt.plot(precision_svc, recall_svc, label="SVC")
    plt.plot(precision_svc[close_zero], recall_svc[close_zero], 'o',
             markersize=10, label="SVC的0阈值", fillstyle='none', c='k',
             mew=2)
    close_default_rf = np.argmin(np.abs(thresholds_rf - 0.5))
    plt.plot(precision_rf, recall_rf, label="随机森林")
    plt.plot(precision_rf[close_default_rf], recall_rf[close_default_rf],
             '^', markersize=10, label="随机森林的0.5阈值", fillstyle='none',
             mew=2)
    plt.xlabel('准确率')
    plt.ylabel('召回率')
    plt.legend()
    plt.title("图5-14:比较 SVM 与 随机森林 的 准确率--召回率曲线\n"
              "SVM在中间位置的表现更好\n"
              "随机森林在极值处表现更好(即极值处的精度或是高准确率或是高召回率)")

    show_subtitle("f1_score表示了准确率——召回率曲线上默认阈值对应的点")
    predict_svc = svc.predict(X_test)
    print("SVC的f1_score: {:.3f}".format(f1_score(y_test, predict_svc)))
    predict_rf = rf.predict(X_test)
    print("随机森林的f1_score: {:.3f}".format(f1_score(y_test, predict_rf)))

    show_subtitle("平均准确率(Average Precision)表示曲线下的积分(即面积)")
    ap_svc = average_precision_score(y_test, svc_scores)
    print("SVC的平均准确率:{:.3f}".format(ap_svc))
    ap_rf = average_precision_score(y_test, rf_scores)
    print("随机森林的平均准确率:{:.3f}".format(ap_rf))
import matplotlib.pyplot as plt
import mglearn
import numpy as np
import pandas as pd
from mglearn.datasets import make_blobs
from mpl_toolkits.mplot3d import Axes3D, axes3d
from sklearn.svm import LinearSVC

# Four blobs folded into two classes by taking the label modulo 2.
X, y = make_blobs(centers=4, random_state=8)
y = y % 2

linear_svm = LinearSVC().fit(X, y)

# Disabled 2-D view of the linear separator, kept for reference:
# mglearn.plots.plot_2d_separator(linear_svm,X)
# mglearn.discrete_scatter(X[:,0],X[:,1],y)
# plt.xlabel("Feature 0")
# plt.ylabel("Feature 1")

# Append the square of every column after the first as an extra feature.
X_new = np.hstack([X, X[:, 1:]**2])

figure = plt.figure()
# Create the 3-D axes through the figure.  Instantiating ``Axes3D(figure)``
# directly no longer attaches the axes to the figure on current Matplotlib
# releases, which leaves the plot empty.
ax = figure.add_subplot(111, projection='3d', elev=-152, azim=-26)

# Plot the samples of class 0 in the expanded feature space.
mask = y == 0
ax.scatter(X_new[mask, 0], X_new[mask, 1], X_new[mask, 2], c='b',
           cmap=mglearn.cm2, s=60)
import numpy as np
import mglearn
import matplotlib.pyplot as plt
from mglearn.datasets import make_blobs
from sklearn.svm import LinearSVC

# Three-class blob data and a linear SVM fitted on all of it.
X, y = make_blobs(random_state=42, centers=3)
linear_svc = LinearSVC().fit(X, y)
print('Coefficient Shape', linear_svc.coef_.shape)
print('Intercept Shape', linear_svc.intercept_.shape)

mglearn.plots.plot_2d_classification(linear_svc, X, fill=True, alpha=.6)
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)

# One line per row of coef_: the points where
# coef[0] * x0 + coef[1] * x1 + intercept == 0, solved for x1.
line = np.linspace(-15, 15)
for coef, intercept, color in zip(linear_svc.coef_, linear_svc.intercept_,
                                  ['b', 'r', 'g']):
    plt.plot(line, -(line * coef[0] + intercept) / coef[1], c=color)

plt.ylim(-10, 15)
plt.xlim(-10, 8)
plt.xlabel('Feature 0')
plt.ylabel('Feature 1')

# The boundary lines are numbered like the classes they belong to (0..2);
# they were previously labelled 1..3, one off from the class entries.
class_labels = ['Class 0', 'Class 1', 'Class 2']
line_labels = ['Line ' + label for label in class_labels]
plt.legend(class_labels + line_labels, loc=(1.01, 0.3))
plt.show()
import matplotlib.pyplot as plt
import mglearn
import numpy as np
import pandas as pd
#mglearn.plots.plot_agglomerative()
from mglearn.datasets import make_blobs
from scipy.cluster.hierarchy import dendrogram, ward

# Twelve samples are enough to keep the dendrogram readable.
X, y = make_blobs(random_state=0, n_samples=12)

# Ward linkage over the samples, drawn as a dendrogram on the current axes.
linkage_array = ward(X)
dendrogram(linkage_array)

ax = plt.gca()
bounds = ax.get_xbound()
# Dashed horizontal cuts with a caption at the right-hand end of each.
# The second caption previously read 'threeclusters'.
for height, caption in ((7.25, 'two clusters'), (4, 'three clusters')):
    ax.plot(bounds, [height, height], '--', c='k')
for height, caption in ((7.25, 'two clusters'), (4, 'three clusters')):
    ax.text(bounds[1], height, caption, va='center', fontdict={'size': 15})

plt.xlabel("Sample index")
plt.ylabel("Cluster distance")
Chapter 2 Supervised Learning - Kernelized Support Vector Machines """ import matplotlib.pyplot as plt import mglearn.datasets from mpl_toolkits import mplot3d import numpy as np from sklearn import datasets from sklearn import model_selection from sklearn import svm """ Linear models and nonlinear features """ x, y = datasets.make_blobs(centers=4, random_state=8) y = y % 2 mglearn.discrete_scatter(x[:, 0], x[:, 1], y) plt.xlabel('Feature 0') plt.ylabel('Feature 1') linear_svm = svm.LinearSVC().fit(x, y) # decision boundary found by a linear SVM mglearn.plots.plot_2d_separator(linear_svm, x) mglearn.discrete_scatter(x[:, 0], x[:, 1], y) plt.xlabel('Feature 0') plt.ylabel('Feature 1') # add a third feature derived from feature 1